block/blk-iocost.c | 35 +++++++++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 6 deletions(-)
When a disk is saturated, it is common for no IOs to complete within a
timer period. Currently, in this case, rq_wait_pct and missed_ppm are
calculated as 0, and iocost incorrectly interprets this as meeting QoS
targets and resets busy_level to 0.
This reset prevents busy_level from reaching the threshold (4) needed
to reduce vrate. On certain cloud storage, such as Azure Premium SSD,
we observed that iocost may fail to reduce vrate for tens of seconds
during saturation, failing to mitigate noisy neighbor issues.
Fix this by tracking the number of IO completions (nr_done) in a period.
If nr_done is 0, we adjust the logic:
* If there are lagging IOs, the saturation status is unknown, so we try
to keep busy_level unchanged. To avoid drastic vrate oscillations, we
clamp it between -4 and 4.
* If there are shortages but no lagging IOs, the vrate might be too low
to issue any IOs. We should allow vrate to increase but not decrease.
* Otherwise, reset busy_level to 0.
Note that when nr_done is 0 and nr_lagging is 0, the adjustment logic
is nearly identical to the "QoS targets are being met with >25% margin"
state, which minimizes the risk of regressions.
The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5)
VMs with 512GB Premium SSD (P20) using the script below. It was not
observed on GCP n2d VMs (100G pd-ssd and 1.5T local-ssd), and no
regressions were found with this patch. In this script, cgA saturates
the device. The iocost is expected to throttle it so that cgB's
completion latency remains low.
BLK_DEVID="8:0"
MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566"
QOS="rpct=90.00 rlat=3500 wpct=90 wlat=3500 min=80 max=10000"
echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model
echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos
CG_A="/sys/fs/cgroup/cgA"
CG_B="/sys/fs/cgroup/cgB"
FILE_A="/data0/A.fio.testfile"
FILE_B="/data0/B.fio.testfile"
RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR"
get_result() {
local file=$1
local label=$2
local results=$(jq -r '
.jobs[0].mixed |
( .iops | tonumber | round ) as $iops |
( .bw_bytes / 1024 / 1024 ) as $bps |
( .clat_ns.mean / 1000000 ) as $avg |
( .clat_ns.max / 1000000 ) as $max |
( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 |
( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 |
( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 |
( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 |
"\($iops)|\($bps)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)"
' "$file")
IFS='|' read -r iops bps avg max p90 p99 p999 p9999 <<<"$results"
printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \
"$label" "$iops" "$bps" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999"
}
run_fio() {
local cg_path=$1
local filename=$2
local name=$3
local bs=$4
local qd=$5
local out=$6
shift 6
local extra=$@
(
pid=$(sh -c 'echo $PPID')
echo $pid >"${cg_path}/cgroup.procs"
fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \
--ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \
--time_based --group_reporting --unified_rw_reporting=mixed \
--output-format=json --output="$out" $extra >/dev/null 2>&1
) &
}
echo "Starting Test ..."
for bs_b in "4k" "32k" "256k"; do
echo "Running iteration: BS=$bs_b"
out_a="${RESULT_DIR}/cgA_1m.json"
out_b="${RESULT_DIR}/cgB_${bs_b}.json"
# cgA: Heavy background (BS 1MB, QD 128)
run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a"
# cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100)
run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100"
wait
SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n'
SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n'
done
# Final Output
echo -e "\nFinal Results Summary:\n"
printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \
"CGROUP" "IOPS" "MB/s" "Avg(ms)" "Max(ms)" "P90(ms)" "P99" "P99.9" "P99.99"
echo "$SUMMARY_DATA"
echo "Results saved in $RESULT_DIR"
Before:
CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 167 167.02 748.65 1641.43 960.50 1551.89 1635.78 1635.78
cgB-4k 5 0.02 190.57 806.84 742.39 809.50 809.50 809.50
cgA-1m 166 166.36 751.38 1744.31 994.05 1451.23 1736.44 1736.44
cgB-32k 4 0.14 225.71 1057.25 759.17 1061.16 1061.16 1061.16
cgA-1m 166 165.91 751.48 1610.94 1010.83 1417.67 1602.22 1619.00
cgB-256k 5 1.26 198.50 1046.30 742.39 1044.38 1044.38 1044.38
After:
CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 159 158.59 769.06 828.52 809.50 817.89 826.28 826.28
cgB-4k 200 0.78 2.01 26.11 2.87 6.26 12.39 26.08
cgA-1m 147 146.84 832.05 985.80 943.72 960.50 985.66 985.66
cgB-32k 200 6.25 2.82 71.05 3.42 15.40 50.07 70.78
cgA-1m 114 114.47 1044.98 1294.48 1199.57 1283.46 1300.23 1300.23
cgB-256k 200 50.00 4.01 34.49 5.08 15.66 30.54 34.34
Signed-off-by: Jialin Wang <wjl.linux@gmail.com>
---
v2:
- Handle more edge cases to prevent potential regressions.
v1: https://lore.kernel.org/all/20260318163351.394528-1-wjl.linux@gmail.com/
block/blk-iocost.c | 35 +++++++++++++++++++++++++++++------
1 file changed, 29 insertions(+), 6 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d145db61e5c3..5184c6e25a0c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
+ u32 *nr_done)
{
u32 nr_met[2] = { };
u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
ioc->period_us * NSEC_PER_USEC);
+
+ *nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
}
/* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
u64 usage_us_sum = 0;
u32 ppm_rthr;
u32 ppm_wthr;
- u32 missed_ppm[2], rq_wait_pct;
+ u32 missed_ppm[2], rq_wait_pct, nr_done;
u64 period_vtime;
int prev_busy_level;
/* how were the latencies during the period? */
- ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,29 @@ static void ioc_timer_fn(struct timer_list *timer)
* and should increase vtime rate.
*/
prev_busy_level = ioc->busy_level;
- if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
- missed_ppm[READ] > ppm_rthr ||
- missed_ppm[WRITE] > ppm_wthr) {
+ if (!nr_done) {
+ if (nr_lagging)
+ /*
+ * When there are lagging IOs but no completions, we
+ * don't know if the IO latency will meet the QoS
+ * targets. The disk might be saturated or not. We
+ * should not reset busy_level to 0 (which would
+ * prevent vrate from scaling up or down), but rather
+ * try to keep it unchanged. To avoid drastic vrate
+ * oscillations, we clamp it between -4 and 4.
+ */
+ ioc->busy_level = clamp(ioc->busy_level, -4, 4);
+ else if (nr_shortages)
+ /*
+ * The vrate might be too low to issue any IOs. We
+ * should allow vrate to increase but not decrease.
+ */
+ ioc->busy_level = min(ioc->busy_level, 0);
+ else
+ ioc->busy_level = 0;
+ } else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+ missed_ppm[READ] > ppm_rthr ||
+ missed_ppm[WRITE] > ppm_wthr) {
/* clearly missing QoS targets, slow down vrate */
ioc->busy_level = max(ioc->busy_level, 0);
ioc->busy_level++;
--
2.53.0
Hello,
On Sun, Mar 29, 2026 at 03:41:12PM +0000, Jialin Wang wrote:
...
> Before:
> CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
>
> cgA-1m 167 167.02 748.65 1641.43 960.50 1551.89 1635.78 1635.78
> cgB-4k 5 0.02 190.57 806.84 742.39 809.50 809.50 809.50
>
> cgA-1m 166 166.36 751.38 1744.31 994.05 1451.23 1736.44 1736.44
> cgB-32k 4 0.14 225.71 1057.25 759.17 1061.16 1061.16 1061.16
>
> cgA-1m 166 165.91 751.48 1610.94 1010.83 1417.67 1602.22 1619.00
> cgB-256k 5 1.26 198.50 1046.30 742.39 1044.38 1044.38 1044.38
>
> After:
> CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
>
> cgA-1m 159 158.59 769.06 828.52 809.50 817.89 826.28 826.28
> cgB-4k 200 0.78 2.01 26.11 2.87 6.26 12.39 26.08
>
> cgA-1m 147 146.84 832.05 985.80 943.72 960.50 985.66 985.66
> cgB-32k 200 6.25 2.82 71.05 3.42 15.40 50.07 70.78
>
> cgA-1m 114 114.47 1044.98 1294.48 1199.57 1283.46 1300.23 1300.23
> cgB-256k 200 50.00 4.01 34.49 5.08 15.66 30.54 34.34
Are the latency numbers end-to-end or on-device? If former, can you provide
on-device numbers? What period duration are you using?
> @@ -2397,9 +2400,29 @@ static void ioc_timer_fn(struct timer_list *timer)
> * and should increase vtime rate.
> */
> prev_busy_level = ioc->busy_level;
> - if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
> - missed_ppm[READ] > ppm_rthr ||
> - missed_ppm[WRITE] > ppm_wthr) {
> + if (!nr_done) {
> + if (nr_lagging)
Please use {} even when it's just comments that makes the bodies multi-line.
> + /*
> + * When there are lagging IOs but no completions, we
> + * don't know if the IO latency will meet the QoS
> + * targets. The disk might be saturated or not. We
> + * should not reset busy_level to 0 (which would
> + * prevent vrate from scaling up or down), but rather
> + * try to keep it unchanged. To avoid drastic vrate
> + * oscillations, we clamp it between -4 and 4.
> + */
> + ioc->busy_level = clamp(ioc->busy_level, -4, 4);
Is this from some observed behavior or just out of intuition? The
justification seems a bit flimsy. Why -4 and 4?
> + else if (nr_shortages)
> + /*
> + * The vrate might be too low to issue any IOs. We
> + * should allow vrate to increase but not decrease.
> + */
> + ioc->busy_level = min(ioc->busy_level, 0);
So, this is no completion, no lagging and shortages case. In the existing
code, this would also get busy_level-- to get things moving. Wouldn't this
path need that too? Or rather, would it make more sense to handle !nr_done
&& nr_lagging case and leave the other cases as-are?
Thanks.
--
tejun
Hi,
On Mon, Mar 30, 2026 at 09:19:49AM -1000, Tejun Heo wrote:
> Hello,
>
> On Sun, Mar 29, 2026 at 03:41:12PM +0000, Jialin Wang wrote:
> ...
> > Before:
> > CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
> >
> > cgA-1m 167 167.02 748.65 1641.43 960.50 1551.89 1635.78 1635.78
> > cgB-4k 5 0.02 190.57 806.84 742.39 809.50 809.50 809.50
> >
> > cgA-1m 166 166.36 751.38 1744.31 994.05 1451.23 1736.44 1736.44
> > cgB-32k 4 0.14 225.71 1057.25 759.17 1061.16 1061.16 1061.16
> >
> > cgA-1m 166 165.91 751.48 1610.94 1010.83 1417.67 1602.22 1619.00
> > cgB-256k 5 1.26 198.50 1046.30 742.39 1044.38 1044.38 1044.38
> >
> > After:
> > CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
> >
> > cgA-1m 159 158.59 769.06 828.52 809.50 817.89 826.28 826.28
> > cgB-4k 200 0.78 2.01 26.11 2.87 6.26 12.39 26.08
> >
> > cgA-1m 147 146.84 832.05 985.80 943.72 960.50 985.66 985.66
> > cgB-32k 200 6.25 2.82 71.05 3.42 15.40 50.07 70.78
> >
> > cgA-1m 114 114.47 1044.98 1294.48 1199.57 1283.46 1300.23 1300.23
> > cgB-256k 200 50.00 4.01 34.49 5.08 15.66 30.54 34.34
>
> Are the latency numbers end-to-end or on-device? If former, can you provide
> on-device numbers? What period duration are you using?
These latency numbers are completion latency results from fio using
ioengine=libaio. For cgB, since --iodepth=1 is used, these completion
latencies are very close to the actual on-device times.
I used the following QoS parameters:
rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000 (period: 7ms)
When switching to:
rpct=80 rlat=10000 wpct=80 wlat=10000 min=80 max=10000 (period: 40ms)
While this showed some improvement, cgB still failed to reach the
expected 200 IOPS, and the P99 latency remained high:
CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 161 160.81 758.52 1462.38 1044.38 1317.01 1451.23 1468.01
cgB-4k 125 0.49 7.18 661.39 2.70 189.79 650.12 658.51
cgA-1m 155 154.63 784.92 1234.01 1010.83 1182.79 1233.13 1233.13
cgB-32k 136 4.26 6.40 300.78 3.85 160.43 295.70 299.89
cgA-1m 138 137.91 860.32 1704.14 1317.01 1669.33 1702.89 1702.89
cgB-256k 95 23.70 9.83 394.73 5.34 206.57 396.36 396.36
I also tested several other sets of parameters and the results were similar.
Using bpftrace, it can still be frequently observed that busy_level is
reset to 0 when no IO complete, and the vrate cannot be lowered in time.
08:26:20.186950 iocost_ioc_vrate_adj: [sdb] vrate=127.50%->126.23% busy=4 missed_ppm=1000000:1000000 rq_wait_pct=0 lagging=3 shortages=1
08:26:20.220910 ioc_rqos_done
08:26:20.222616 ioc_rqos_done
08:26:20.226913 ioc_rqos_done
08:26:20.227951 iocost_ioc_vrate_adj: [sdb] vrate=126.23%->124.97% busy=5 missed_ppm=1000000:1000000 rq_wait_pct=0 lagging=3 shortages=1
-- no IO complete, busy_level was reset to 0 --
08:26:20.268945 iocost_ioc_vrate_adj: [sdb] vrate=124.97%->124.97% busy=0 missed_ppm=0:0 rq_wait_pct=0 lagging=3 shortages=1
bpftrace -e '
#define VTIME_PER_USEC 137438
kfunc:ioc_rqos_done
{
printf("%s ioc_rqos_done\n", strftime("%H:%M:%S.%f", nsecs));
}
tracepoint:iocost:iocost_ioc_vrate_adj
{
$old_vrate = args->old_vrate * 10000 / VTIME_PER_USEC;
$new_vrate = args->new_vrate * 10000 / VTIME_PER_USEC;
printf("%s iocost_ioc_vrate_adj: [%s] vrate=%d.%02d%%->%d.%02d%% busy=%d missed_ppm=%u:%u rq_wait_pct=%u lagging=%d shortages=%d\n",
strftime("%H:%M:%S.%f", nsecs), str(args->devname),
$old_vrate / 100, $old_vrate % 100, $new_vrate / 100,
$new_vrate % 100, args->busy_level, args->read_missed_ppm,
args->write_missed_ppm, args->rq_wait_pct, args->nr_lagging,
args->nr_shortages);
}'
> > @@ -2397,9 +2400,29 @@ static void ioc_timer_fn(struct timer_list *timer)
> > * and should increase vtime rate.
> > */
> > prev_busy_level = ioc->busy_level;
> > - if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
> > - missed_ppm[READ] > ppm_rthr ||
> > - missed_ppm[WRITE] > ppm_wthr) {
> > + if (!nr_done) {
> > + if (nr_lagging)
>
> Please use {} even when it's just comments that makes the bodies multi-line.
>
> > + /*
> > + * When there are lagging IOs but no completions, we
> > + * don't know if the IO latency will meet the QoS
> > + * targets. The disk might be saturated or not. We
> > + * should not reset busy_level to 0 (which would
> > + * prevent vrate from scaling up or down), but rather
> > + * try to keep it unchanged. To avoid drastic vrate
> > + * oscillations, we clamp it between -4 and 4.
> > + */
> > + ioc->busy_level = clamp(ioc->busy_level, -4, 4);
>
> Is this from some observed behavior or just out of intuition? The
> justification seems a bit flimsy. Why -4 and 4?
During my testing with the parameters rpct=90 rlat=3500 wpct=90 wlat=3500
min=10 max=10000, I noticed that vrate occasionally drops significantly
(down to 50% or lower), which adversely impacted the IOPS of cgA. So I
capped busy_level at 4 so that vrate would be reduced at the slowest rate.
CGROUP IOPS MB/s Avg(ms) Max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 137 137.11 891.21 1278.66 1082.13 1216.35 1266.68 1283.46
cgB-4k 200 0.78 2.12 62.64 2.47 7.44 49.55 62.65
I realized that raising min to 80 would effectively mitigate this issue,
so I will remove the clamp in the next v3.
> > + else if (nr_shortages)
> > + /*
> > + * The vrate might be too low to issue any IOs. We
> > + * should allow vrate to increase but not decrease.
> > + */
> > + ioc->busy_level = min(ioc->busy_level, 0);
>
> So, this is no completion, no lagging and shortages case. In the existing
> code, this would also get busy_level-- to get things moving. Wouldn't this
> path need that too? Or rather, would it make more sense to handle !nr_done
> && nr_lagging case and leave the other cases as-are?
That's a fair point. My initial thought was not to adjust busy_level
when there is no latency data, and I haven't observed this specific path
(no completions, no lagging, but with shortages) occurring in my testing
so far, so I might have been overthinking it. I will simplify the logic
in v3 to handle only the !nr_done && nr_lagging case and leave the other
cases as they are.
--
Thanks,
Jialin
When a disk is saturated, it is common for no IOs to complete within a
timer period. Currently, in this case, rq_wait_pct and missed_ppm are
calculated as 0, and iocost incorrectly interprets this as meeting QoS
targets and resets busy_level to 0.
This reset prevents busy_level from reaching the threshold (4) needed
to reduce vrate. On certain cloud storage, such as Azure Premium SSD,
we observed that iocost may fail to reduce vrate for tens of seconds
during saturation, failing to mitigate noisy neighbor issues.
Fix this by tracking the number of IO completions (nr_done) in a period.
If nr_done is 0 and there are lagging IOs, the saturation status is
unknown, so we keep busy_level unchanged.
The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5)
VMs with 512GB Premium SSD (P20) using the script below. It was not
observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no
regressions were found with this patch. In this script, cgA performs
large IOs with iodepth=128, while cgB performs small IOs with iodepth=1
rate_iops=100 rw=randrw. With iocost enabled, we expect it to throttle
cgA: the submission latency (slat) of cgA should be significantly higher,
cgB should reach 200 IOPS, and its completion latency (clat) should be low.
BLK_DEVID="8:0"
MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566"
QOS="rpct=90 rlat=3500 wpct=90 wlat=3500 min=80 max=10000"
echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model
echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos
CG_A="/sys/fs/cgroup/cgA"
CG_B="/sys/fs/cgroup/cgB"
FILE_A="/path/to/sda/A.fio.testfile"
FILE_B="/path/to/sda/B.fio.testfile"
RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR"
get_result() {
local file=$1
local label=$2
local results=$(jq -r '
.jobs[0].mixed |
( .iops | tonumber | round ) as $iops |
( .bw_bytes / 1024 / 1024 ) as $bps |
( .slat_ns.mean / 1000000 ) as $slat |
( .clat_ns.mean / 1000000 ) as $avg |
( .clat_ns.max / 1000000 ) as $max |
( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 |
( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 |
( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 |
( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 |
"\($iops)|\($bps)|\($slat)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)"
' "$file")
IFS='|' read -r iops bps slat avg max p90 p99 p999 p9999 <<<"$results"
printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \
"$label" "$iops" "$bps" "$slat" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999"
}
run_fio() {
local cg_path=$1
local filename=$2
local name=$3
local bs=$4
local qd=$5
local out=$6
shift 6
local extra=$@
(
pid=$(sh -c 'echo $PPID')
echo $pid >"${cg_path}/cgroup.procs"
fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \
--ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \
--time_based --group_reporting --unified_rw_reporting=mixed \
--output-format=json --output="$out" $extra >/dev/null 2>&1
) &
}
echo "Starting Test ..."
for bs_b in "4k" "32k" "256k"; do
echo "Running iteration: BS=$bs_b"
out_a="${RESULT_DIR}/cgA_1m.json"
out_b="${RESULT_DIR}/cgB_${bs_b}.json"
# cgA: Heavy background (BS 1MB, QD 128)
run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a"
# cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100)
run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100"
wait
SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n'
SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n'
done
echo -e "\nFinal Results Summary:\n"
printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n" \
"" "" "" "slat" "clat" "clat" "clat" "clat" "clat" "clat"
printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \
"CGROUP" "IOPS" "MB/s" "avg(ms)" "avg(ms)" "max(ms)" "P90(ms)" "P99" "P99.9" "P99.99"
echo "$SUMMARY_DATA"
echo "Results saved in $RESULT_DIR"
Before:
slat clat clat clat clat clat clat
CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 166 166.37 3.44 748.95 1298.29 977.27 1233.13 1300.23 1300.23
cgB-4k 5 0.02 0.02 181.74 761.32 742.39 759.17 759.17 759.17
cgA-1m 167 166.51 1.98 748.68 1549.41 809.50 1451.23 1551.89 1551.89
cgB-32k 6 0.18 0.02 169.98 761.76 742.39 759.17 759.17 759.17
cgA-1m 166 165.55 2.89 750.89 1540.37 851.44 1451.23 1535.12 1535.12
cgB-256k 5 1.30 0.02 191.35 759.51 750.78 759.17 759.17 759.17
After:
slat clat clat clat clat clat clat
CGROUP IOPS MB/s avg(ms) avg(ms) max(ms) P90(ms) P99 P99.9 P99.99
cgA-1m 162 162.48 6.14 749.69 850.02 826.28 834.67 843.06 851.44
cgB-4k 199 0.78 0.01 1.95 42.12 2.57 7.50 34.87 42.21
cgA-1m 146 146.20 6.83 833.04 908.68 893.39 901.78 910.16 910.16
cgB-32k 200 6.25 0.01 2.32 31.40 3.06 7.50 16.58 31.33
cgA-1m 110 110.46 9.04 1082.67 1197.91 1182.79 1199.57 1199.57 1199.57
cgB-256k 200 49.98 0.02 3.69 22.20 4.88 9.11 20.05 22.15
Signed-off-by: Jialin Wang <wjl.linux@gmail.com>
---
Changes in v3:
- Handle only the !nr_done && nr_lagging case and leave the other cases
as they are.
Changes in v2:
- Handle more edge cases to prevent potential regressions.
v2: https://lore.kernel.org/all/20260329154112.526679-1-wjl.linux@gmail.com/
v1: https://lore.kernel.org/all/20260318163351.394528-1-wjl.linux@gmail.com/
block/blk-iocost.c | 23 +++++++++++++++++------
1 file changed, 17 insertions(+), 6 deletions(-)
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d145db61e5c3..0cca88a366dc 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
-static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
+ u32 *nr_done)
{
u32 nr_met[2] = { };
u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
ioc->period_us * NSEC_PER_USEC);
+
+ *nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
}
/* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
u64 usage_us_sum = 0;
u32 ppm_rthr;
u32 ppm_wthr;
- u32 missed_ppm[2], rq_wait_pct;
+ u32 missed_ppm[2], rq_wait_pct, nr_done;
u64 period_vtime;
int prev_busy_level;
/* how were the latencies during the period? */
- ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,17 @@ static void ioc_timer_fn(struct timer_list *timer)
* and should increase vtime rate.
*/
prev_busy_level = ioc->busy_level;
- if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
- missed_ppm[READ] > ppm_rthr ||
- missed_ppm[WRITE] > ppm_wthr) {
+ if (!nr_done && nr_lagging) {
+ /*
+ * When there are lagging IOs but no completions, we don't
+ * know if the IO latency will meet the QoS targets. The
+ * disk might be saturated or not. We should not reset
+ * busy_level to 0 (which would prevent vrate from scaling
+ * up or down), but rather to keep it unchanged.
+ */
+ } else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+ missed_ppm[READ] > ppm_rthr ||
+ missed_ppm[WRITE] > ppm_wthr) {
/* clearly missing QoS targets, slow down vrate */
ioc->busy_level = max(ioc->busy_level, 0);
ioc->busy_level++;
--
2.53.0
On Tue, 31 Mar 2026 10:05:09 +0000, Jialin Wang wrote:
> When a disk is saturated, it is common for no IOs to complete within a
> timer period. Currently, in this case, rq_wait_pct and missed_ppm are
> calculated as 0, the iocost incorrectly interprets this as meeting QoS
> targets and resets busy_level to 0.
>
> This reset prevents busy_level from reaching the threshold (4) needed
> to reduce vrate. On certain cloud storage, such as Azure Premium SSD,
> we observed that iocost may fail to reduce vrate for tens of seconds
> during saturation, failing to mitigate noisy neighbor issues.
>
> [...]
Applied, thanks!
[1/1] blk-iocost: fix busy_level reset when no IOs complete
commit: f91ffe89b2016d280995a9c28d73288b02d83615
Best regards,
--
Jens Axboe
On Tue, Mar 31, 2026 at 10:05:09AM +0000, Jialin Wang wrote: > When a disk is saturated, it is common for no IOs to complete within a > timer period. Currently, in this case, rq_wait_pct and missed_ppm are > calculated as 0, the iocost incorrectly interprets this as meeting QoS > targets and resets busy_level to 0. > > This reset prevents busy_level from reaching the threshold (4) needed > to reduce vrate. On certain cloud storage, such as Azure Premium SSD, > we observed that iocost may fail to reduce vrate for tens of seconds > during saturation, failing to mitigate noisy neighbor issues. > > Fix this by tracking the number of IO completions (nr_done) in a period. > If nr_done is 0 and there are lagging IOs, the saturation status is > unknown, so we keep busy_level unchanged. > > The issue is consistently reproducible on Azure Standard_D8as_v5 (Dasv5) > VMs with 512GB Premium SSD (P20) using the script below. It was not > observed on GCP n2d VMs (with 100G pd-ssd and 1.5T local-ssd), and no > regressions were found with this patch. In this script, cgA performs > large IOs with iodepth=128, while cgB performs small IOs with iodepth=1 > rate_iops=100 rw=randrw. With iocost enabled, we expect it to throttle > cgA, the submission latency (slat) of cgA should be significantly higher, > cgB can reach 200 IOPS and the completion latency (clat) should below. ... > Signed-off-by: Jialin Wang <wjl.linux@gmail.com> Acked-by: Tejun Heo <tj@kernel.org> Thanks. -- tejun
© 2016 - 2026 Red Hat, Inc.