From: Jialin Wang <wjl.linux@gmail.com>
To: tj@kernel.org, josef@toxicpanda.com, axboe@kernel.dk
Cc: cgroups@vger.kernel.org, linux-block@vger.kernel.org,
    linux-kernel@vger.kernel.org, wjl.linux@gmail.com
Subject: [PATCH v2] blk-iocost: fix busy_level reset when no IOs complete
Date: Sun, 29 Mar 2026 15:41:12 +0000
Message-ID: <20260329154112.526679-1-wjl.linux@gmail.com>

When a disk is saturated, it is common for no IOs to complete within a
timer period. In this case, rq_wait_pct and missed_ppm are both
calculated as 0, which iocost incorrectly interprets as meeting the QoS
targets, and it resets busy_level to 0. This reset prevents busy_level
from reaching the threshold (4) needed to reduce vrate.

On certain cloud storage, such as Azure Premium SSD, we observed that
iocost may fail to reduce vrate for tens of seconds during saturation,
failing to mitigate noisy-neighbor issues.

Fix this by tracking the number of IO completions (nr_done) in a
period. If nr_done is 0, we adjust the logic:

* If there are lagging IOs, the saturation status is unknown, so we try
  to keep busy_level unchanged. To avoid drastic vrate oscillations, we
  clamp it between -4 and 4.

* If there are shortages but no lagging IOs, the vrate might be too low
  to issue any IOs. We should allow vrate to increase but not decrease.

* Otherwise, reset busy_level to 0.
Note that when nr_done is 0 and nr_lagging is 0, the adjustment logic
is nearly identical to the "QoS targets are being met with >25% margin"
state, which minimizes the risk of regressions.

The issue is consistently reproducible on Azure Standard_D8as_v5
(Dasv5) VMs with a 512GB Premium SSD (P20) using the script below. It
was not observed on GCP n2d VMs (100G pd-ssd and 1.5T local-ssd), and
no regressions were found with this patch.

In this script, cgA saturates the device. iocost is expected to
throttle it so that cgB's completion latency remains low.

BLK_DEVID="8:0"
MODEL="rbps=173471131 rseqiops=3566 rrandiops=3566 wbps=173333269 wseqiops=3566 wrandiops=3566"
QOS="rpct=90.00 rlat=3500 wpct=90 wlat=3500 min=80 max=10000"
echo "$BLK_DEVID ctrl=user model=linear $MODEL" > /sys/fs/cgroup/io.cost.model
echo "$BLK_DEVID enable=1 ctrl=user $QOS" > /sys/fs/cgroup/io.cost.qos

CG_A="/sys/fs/cgroup/cgA"
CG_B="/sys/fs/cgroup/cgB"
FILE_A="/data0/A.fio.testfile"
FILE_B="/data0/B.fio.testfile"
RESULT_DIR="./iocost_results_$(date +%Y%m%d_%H%M%S)"
mkdir -p "$CG_A" "$CG_B" "$RESULT_DIR"

get_result() {
    local file=$1
    local label=$2
    local results=$(jq -r '
        .jobs[0].mixed |
        ( .iops | tonumber | round ) as $iops |
        ( .bw_bytes / 1024 / 1024 ) as $bps |
        ( .clat_ns.mean / 1000000 ) as $avg |
        ( .clat_ns.max / 1000000 ) as $max |
        ( .clat_ns.percentile["90.000000"] / 1000000 ) as $p90 |
        ( .clat_ns.percentile["99.000000"] / 1000000 ) as $p99 |
        ( .clat_ns.percentile["99.900000"] / 1000000 ) as $p999 |
        ( .clat_ns.percentile["99.990000"] / 1000000 ) as $p9999 |
        "\($iops)|\($bps)|\($avg)|\($max)|\($p90)|\($p99)|\($p999)|\($p9999)"
    ' "$file")
    IFS='|' read -r iops bps avg max p90 p99 p999 p9999 <<<"$results"
    printf "%-8s %-6s %-7.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f %-8.2f\n" \
        "$label" "$iops" "$bps" "$avg" "$max" "$p90" "$p99" "$p999" "$p9999"
}

run_fio() {
    local cg_path=$1
    local filename=$2
    local name=$3
    local bs=$4
    local qd=$5
    local out=$6
    shift 6
    local extra=$@
    (
        pid=$(sh -c 'echo $PPID')
        echo $pid >"${cg_path}/cgroup.procs"
        fio --name="$name" --filename="$filename" --direct=1 --rw=randrw --rwmixread=50 \
            --ioengine=libaio --bs="$bs" --iodepth="$qd" --size=4G --runtime=10 \
            --time_based --group_reporting --unified_rw_reporting=mixed \
            --output-format=json --output="$out" $extra >/dev/null 2>&1
    ) &
}

echo "Starting Test ..."
for bs_b in "4k" "32k" "256k"; do
    echo "Running iteration: BS=$bs_b"
    out_a="${RESULT_DIR}/cgA_1m.json"
    out_b="${RESULT_DIR}/cgB_${bs_b}.json"

    # cgA: Heavy background (BS 1MB, QD 128)
    run_fio "$CG_A" "$FILE_A" "cgA" "1m" 128 "$out_a"
    # cgB: Latency sensitive (Variable BS, QD 1, Read/Write IOPS limit 100)
    run_fio "$CG_B" "$FILE_B" "cgB" "$bs_b" 1 "$out_b" "--rate_iops=100"

    wait
    SUMMARY_DATA+="$(get_result "$out_a" "cgA-1m")"$'\n'
    SUMMARY_DATA+="$(get_result "$out_b" "cgB-$bs_b")"$'\n\n'
done

# Final Output
echo -e "\nFinal Results Summary:\n"
printf "%-8s %-6s %-7s %-8s %-8s %-8s %-8s %-8s %-8s\n\n" \
    "CGROUP" "IOPS" "MB/s" "Avg(ms)" "Max(ms)" "P90(ms)" "P99" "P99.9" "P99.99"
echo "$SUMMARY_DATA"
echo "Results saved in $RESULT_DIR"

Before:

CGROUP   IOPS   MB/s    Avg(ms)  Max(ms)  P90(ms)  P99      P99.9    P99.99
cgA-1m   167    167.02  748.65   1641.43  960.50   1551.89  1635.78  1635.78
cgB-4k   5      0.02    190.57   806.84   742.39   809.50   809.50   809.50

cgA-1m   166    166.36  751.38   1744.31  994.05   1451.23  1736.44  1736.44
cgB-32k  4      0.14    225.71   1057.25  759.17   1061.16  1061.16  1061.16

cgA-1m   166    165.91  751.48   1610.94  1010.83  1417.67  1602.22  1619.00
cgB-256k 5      1.26    198.50   1046.30  742.39   1044.38  1044.38  1044.38

After:

CGROUP   IOPS   MB/s    Avg(ms)  Max(ms)  P90(ms)  P99      P99.9    P99.99
cgA-1m   159    158.59  769.06   828.52   809.50   817.89   826.28   826.28
cgB-4k   200    0.78    2.01     26.11    2.87     6.26     12.39    26.08

cgA-1m   147    146.84  832.05   985.80   943.72   960.50   985.66   985.66
cgB-32k  200    6.25    2.82     71.05    3.42     15.40    50.07    70.78

cgA-1m   114    114.47  1044.98  1294.48  1199.57  1283.46  1300.23  1300.23
cgB-256k 200    50.00   4.01     34.49    5.08     15.66    30.54    34.34

Signed-off-by: Jialin Wang <wjl.linux@gmail.com>
---
v2:
- Handle more edge cases to prevent potential regressions.
v1: https://lore.kernel.org/all/20260318163351.394528-1-wjl.linux@gmail.com/

 block/blk-iocost.c | 35 +++++++++++++++++++++++++++++------
 1 file changed, 29 insertions(+), 6 deletions(-)

diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index d145db61e5c3..5184c6e25a0c 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1596,7 +1596,8 @@ static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p,
+			 u32 *nr_done)
 {
 	u32 nr_met[2] = { };
 	u32 nr_missed[2] = { };
@@ -1633,6 +1634,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
 
 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
 				   ioc->period_us * NSEC_PER_USEC);
+
+	*nr_done = nr_met[READ] + nr_met[WRITE] + nr_missed[READ] + nr_missed[WRITE];
 }
 
 /* was iocg idle this period? */
@@ -2250,12 +2253,12 @@ static void ioc_timer_fn(struct timer_list *timer)
 	u64 usage_us_sum = 0;
 	u32 ppm_rthr;
 	u32 ppm_wthr;
-	u32 missed_ppm[2], rq_wait_pct;
+	u32 missed_ppm[2], rq_wait_pct, nr_done;
 	u64 period_vtime;
 	int prev_busy_level;
 
 	/* how were the latencies during the period? */
-	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct, &nr_done);
 
 	/* take care of active iocgs */
 	spin_lock_irq(&ioc->lock);
@@ -2397,9 +2400,29 @@ static void ioc_timer_fn(struct timer_list *timer)
 	 * and should increase vtime rate.
 	 */
 	prev_busy_level = ioc->busy_level;
-	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
-	    missed_ppm[READ] > ppm_rthr ||
-	    missed_ppm[WRITE] > ppm_wthr) {
+	if (!nr_done) {
+		if (nr_lagging)
+			/*
+			 * When there are lagging IOs but no completions, we
+			 * don't know if the IO latency will meet the QoS
+			 * targets. The disk might be saturated or not. We
+			 * should not reset busy_level to 0 (which would
+			 * prevent vrate from scaling up or down), but rather
+			 * try to keep it unchanged. To avoid drastic vrate
+			 * oscillations, we clamp it between -4 and 4.
+			 */
+			ioc->busy_level = clamp(ioc->busy_level, -4, 4);
+		else if (nr_shortages)
+			/*
+			 * The vrate might be too low to issue any IOs. We
+			 * should allow vrate to increase but not decrease.
+			 */
+			ioc->busy_level = min(ioc->busy_level, 0);
+		else
+			ioc->busy_level = 0;
+	} else if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+		   missed_ppm[READ] > ppm_rthr ||
+		   missed_ppm[WRITE] > ppm_wthr) {
 		/* clearly missing QoS targets, slow down vrate */
 		ioc->busy_level = max(ioc->busy_level, 0);
 		ioc->busy_level++;
-- 
2.53.0