Optimize the computation of cur_win_nsec = win_nsec / sqrt(scale_step + 1)
in blk-wbt by introducing a fast inverse square root algorithm.
This approach replaces the original use of int_sqrt and division with a
more efficient and accurate approximation method.
Signed-off-by: Meng Shao Liu <sau525@gmail.com>
---
Since this fast inverse square root algorithm now appears in three locations
(blk-wbt, sch_cake, codel), it might be worth considering refactoring
the implementation into a shared helper to reduce duplication and ensure consistency.
However, this patch focuses solely on introducing the optimization in blk-wbt.
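
For reference, a small user-space model of the Q0.32 math used below (illustrative only, not part of the patch; the names q32_newton_step() and q32_scale() are made up for this sketch): rec_inv_sqrt approximates 2^32 / sqrt(scale_step + 1), the cache covers small steps, one Newton step refines the estimate once the step drifts past the cache, and the scaled window is then just a multiply and a shift, which is what reciprocal_scale() does in the kernel.

/* Illustrative user-space model of the patch's Q0.32 inverse sqrt. */
#include <stdint.h>
#include <stdio.h>
#include <math.h>

#define REC_INV_SQRT_CACHE 16
static const uint32_t inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
	~0u, ~0u, 3037000500u, 2479700525u,
	2147483647u, 1920767767u, 1753413056u, 1623345051u,
	1518500250u, 1431655765u, 1358187914u, 1294981364u,
	1239850263u, 1191209601u, 1147878294u, 1108955788u
};

/* One Newton-Raphson step: invsqrt' = (invsqrt / 2) * (3 - count * invsqrt^2),
 * valid when invsqrt is already close to 2^32 / sqrt(count). */
static uint32_t q32_newton_step(uint32_t invsqrt, uint32_t count)
{
	uint32_t invsqrt2 = ((uint64_t)invsqrt * invsqrt) >> 32;
	uint64_t val = (3ULL << 32) - (uint64_t)count * invsqrt2;

	val >>= 2;				/* avoid overflow in the multiply below */
	return (val * invsqrt) >> (32 - 2 + 1);	/* times invsqrt/2, back to Q0.32 */
}

/* Same operation as the kernel's reciprocal_scale(): (val * frac) >> 32 */
static uint64_t q32_scale(uint64_t val, uint32_t frac)
{
	return (val * frac) >> 32;
}

int main(void)
{
	uint64_t win_nsec = 100 * 1000 * 1000ULL;	/* RWB_WINDOW_NSEC */
	uint32_t inv = ~0u;

	/* scale_step grows by one per window, as in scale_down() */
	for (unsigned int step = 1; step <= 24; step++) {
		uint32_t count = step + 1;

		if (count < REC_INV_SQRT_CACHE)
			inv = inv_sqrt_cache[count];
		else
			inv = q32_newton_step(inv, count);

		printf("step %2u: approx %9llu ns  exact %9.0f ns\n", step,
		       (unsigned long long)q32_scale(win_nsec, inv),
		       win_nsec / sqrt((double)count));
	}
	return 0;
}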
block/blk-wbt.c | 60 +++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 51 insertions(+), 9 deletions(-)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index a50d4cd55..1fd5af3ba 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -80,6 +80,8 @@ struct rq_wb {
u64 win_nsec; /* default window size */
u64 cur_win_nsec; /* current window size */
+ u32 rec_inv_sqrt; /* reciprocal value of sqrt(scaling step + 1) */
struct blk_stat_callback *cb;
u64 sync_issue;
@@ -130,6 +132,11 @@ enum {
*/
RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,
+ /*
+ * Initial reciprocal value of sqrt(scaling step + 1)
+ */
+ RWB_REC_INV_SQRT = 0,
+
/*
* Disregard stats, if we don't meet this minimum
*/
@@ -395,20 +402,55 @@ static void scale_down(struct rq_wb *rwb, bool hard_throttle)
rwb_trace_step(rwb, tracepoint_string("scale down"));
}
+#define REC_INV_SQRT_CACHE (16)
+static const u32 inv_sqrt_cache[REC_INV_SQRT_CACHE] = {
+ ~0, ~0, 3037000500, 2479700525,
+ 2147483647, 1920767767, 1753413056, 1623345051,
+ 1518500250, 1431655765, 1358187914, 1294981364,
+ 1239850263, 1191209601, 1147878294, 1108955788
+};
+
+/* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
+ * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
+ *
+ * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
+ */
+
+static void rwb_newton_step(struct rq_wb *rwb)
+{
+ struct rq_depth *rqd = &rwb->rq_depth;
+ u32 invsqrt, invsqrt2;
+ u64 val;
+
+ invsqrt = rwb->rec_inv_sqrt;
+ invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
+ val = (3LL << 32) - ((u64)(rqd->scale_step + 1) * invsqrt2);
+
+ val >>= 2; /* avoid overflow in following multiply */
+ val = (val * invsqrt) >> (32 - 2 + 1);
+
+ rwb->rec_inv_sqrt = val;
+}
+
+static void rwb_invsqrt(struct rq_wb *rwb)
+{
+ struct rq_depth *rqd = &rwb->rq_depth;
+
+ if (rqd->scale_step + 1 < REC_INV_SQRT_CACHE)
+ rwb->rec_inv_sqrt = inv_sqrt_cache[rqd->scale_step + 1];
+ else
+ rwb_newton_step(rwb);
+}
+
static void rwb_arm_timer(struct rq_wb *rwb)
{
struct rq_depth *rqd = &rwb->rq_depth;
if (rqd->scale_step > 0) {
- /*
- * We should speed this up, using some variant of a fast
- * integer inverse square root calculation. Since we only do
- * this for every window expiration, it's not a huge deal,
- * though.
- */
- rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
- int_sqrt((rqd->scale_step + 1) << 8));
- } else {
+ rwb_invsqrt(rwb);
+ rwb->cur_win_nsec = reciprocal_scale(rwb->win_nsec,
+ rwb->rec_inv_sqrt);
+ } else {
/*
* For step < 0, we don't want to increase/decrease the
* window size.
@@ -911,6 +953,7 @@ int wbt_init(struct gendisk *disk)
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
+ rwb->rec_inv_sqrt = RWB_REC_INV_SQRT;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
--
2.50.1
Hi,

On 2025/07/27 19:21, Meng Shao Liu wrote:
> Optimize the computation of cur_win_nsec = win_nsec / sqrt(scale_step + 1)
> in blk-wbt by introducing a fast inverse square root algorithm.
> This approach replaces the original use of int_sqrt and division with a
> more efficient and accurate approximation method.
>
> Signed-off-by: Meng Shao Liu <sau525@gmail.com>
> ---
> Since this fast inverse square root algorithm now appears in three locations
> (blk-wbt, sch_cake, codel), it might be worth considering refactoring
> the implementation into a shared helper to reduce duplication and ensure consistency.
> However, this patch focuses solely on introducing the optimization in blk-wbt.
>
> block/blk-wbt.c | 60 +++++++++++++++++++++++++++++++++++++++++--------
> 1 file changed, 51 insertions(+), 9 deletions(-)
>

Do you have numbers showing how this optimization helps wbt? Any
difference in IO performance or other overhead? I don't think it helps
much in this slow path.

Please consider introducing a shared helper first, then convert wbt to
use that new helper.

Thanks,
Kuai

[rest of quoted patch snipped]
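
Purely as an illustration of the shared-helper idea suggested above (the header guard and function name here are assumptions, not an existing kernel API), the Q0.32 Newton step duplicated across the three sites could live in a common header along these lines:

/*
 * Hypothetical shared helper -- sketch only, not an existing kernel API.
 * Factors out the Q0.32 inverse-square-root refinement currently
 * open-coded in sch_cake, codel and (with this patch) blk-wbt.
 */
#ifndef _LINUX_RECIPROCAL_INVSQRT_H
#define _LINUX_RECIPROCAL_INVSQRT_H

#include <linux/types.h>

/*
 * One Newton-Raphson step: invsqrt' = (invsqrt / 2) * (3 - count * invsqrt^2),
 * with invsqrt in Q0.32. Only valid when invsqrt is already close to
 * 2^32 / sqrt(count), e.g. after count changed by +/-1.
 */
static inline u32 reciprocal_invsqrt_step(u32 invsqrt, u32 count)
{
	u32 invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
	u64 val = (3LL << 32) - ((u64)count * invsqrt2);

	val >>= 2;	/* avoid overflow in the following multiply */
	return (val * invsqrt) >> (32 - 2 + 1);
}

#endif /* _LINUX_RECIPROCAL_INVSQRT_H */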
On Sun, Jul 27, 2025 at 7:22 PM Meng Shao Liu <sau525@gmail.com> wrote:
>
> Optimize the computation of cur_win_nsec = win_nsec / sqrt(scale_step + 1)
> in blk-wbt by introducing a fast inverse square root algorithm.
[...]
> + /*
> + * Initial reciprocal value of sqrt(scaling step + 1)
> + */
> + RWB_REC_INV_SQRT = 0,

Hi Meng,

Since the initial value of scale_step is 0, sqrt(0 + 1) = 1, so the
initial reciprocal value should not be 0.

[...]
> + rwb_invsqrt(rwb);
> + rwb->cur_win_nsec = reciprocal_scale(rwb->win_nsec,
> + rwb->rec_inv_sqrt);

I think placing these two lines of mathematical formulas directly in a
core wbt code path is not a good idea. I suggest encapsulating them in a
separate function and documenting its purpose.

Thanks,
Yi

[rest of quoted patch snipped]
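
A minimal sketch of the encapsulation suggested here (the helper name rwb_scale_window() is made up; the body is just the two lines from the patch):

/*
 * Sketch only: wraps the patch's two-line window scaling so the math is
 * documented in one place. The name rwb_scale_window() is hypothetical.
 *
 * cur_win_nsec = win_nsec / sqrt(scale_step + 1), computed as a Q0.32
 * fixed-point multiply: (win_nsec * rec_inv_sqrt) >> 32.
 */
static void rwb_scale_window(struct rq_wb *rwb)
{
	rwb_invsqrt(rwb);
	rwb->cur_win_nsec = reciprocal_scale(rwb->win_nsec,
					     rwb->rec_inv_sqrt);
}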
On Mon, Jul 28, 2025 at 02:06:52AM +0800, Yizhou Tang wrote:
> On Sun, Jul 27, 2025 at 7:22 PM Meng Shao Liu <sau525@gmail.com> wrote:
> >
> > Optimize the computation of cur_win_nsec = win_nsec / sqrt(scale_step + 1)
> > in blk-wbt by introducing a fast inverse square root algorithm.
[...]
> > + RWB_REC_INV_SQRT = 0,
>
> Hi Meng,
>
> Since the initial value of scale_step is 0, sqrt(0 + 1) = 1, so the
> initial reciprocal value should not be 0.
>

Thanks for pointing out the problem.

[...]
> I think placing these two lines of mathematical formulas directly in a
> core wbt code path is not a good idea. I suggest encapsulating them in a
> separate function and documenting its purpose.
>
> Thanks,
> Yi
>

Got it!

[rest of quoted patch snipped]