[PATCH] bitops: Optimize fns() for improved performance

Kuan-Wei Chiu posted 1 patch 1 week, 4 days ago
There is a newer version of this series
include/linux/bitops.h | 12 ++++--------
1 file changed, 4 insertions(+), 8 deletions(-)
[PATCH] bitops: Optimize fns() for improved performance
Posted by Kuan-Wei Chiu 1 week, 4 days ago
The current fns() repeatedly uses __ffs() to find the index of the
least significant bit and then clears the corresponding bit using
__clear_bit(). The method for clearing the least significant bit can be
optimized by using word &= word - 1 instead.

Typically, the execution time of one __ffs() plus one __clear_bit() is
longer than that of a bitwise AND operation and a subtraction. To
improve performance, the loop for clearing the least significant bit
has been replaced with word &= word - 1, followed by a single __ffs()
operation to obtain the answer. This change reduces the number of
__ffs() iterations from n to just one, enhancing overall performance.

The following microbenchmark data, conducted on my x86-64 machine,
shows the execution time (in microseconds) required for 1000000 test
data generated by get_random_u64() and executed by fns() under
different values of n:

+-----+---------------+---------------+
|  n  |   time_old    |   time_new    |
+-----+---------------+---------------+
|  0  |     29194     |     25878     |
|  1  |     25510     |     25497     |
|  2  |     27836     |     25721     |
|  3  |     30140     |     25673     |
|  4  |     32569     |     25426     |
|  5  |     34792     |     25690     |
|  6  |     37117     |     25651     |
|  7  |     39742     |     25383     |
|  8  |     42360     |     25657     |
|  9  |     44672     |     25897     |
| 10  |     47237     |     25819     |
| 11  |     49884     |     26530     |
| 12  |     51864     |     26647     |
| 13  |     54265     |     28915     |
| 14  |     56440     |     28373     |
| 15  |     58839     |     28616     |
| 16  |     62383     |     29128     |
| 17  |     64257     |     30041     |
| 18  |     66805     |     29773     |
| 19  |     69368     |     33203     |
| 20  |     72942     |     33688     |
| 21  |     77006     |     34518     |
| 22  |     80926     |     34298     |
| 23  |     85723     |     35586     |
| 24  |     90324     |     36376     |
| 25  |     95992     |     37465     |
| 26  |    101101     |     37599     |
| 27  |    106520     |     37466     |
| 28  |    113287     |     38163     |
| 29  |    120552     |     38810     |
| 30  |    128040     |     39373     |
| 31  |    135624     |     40500     |
| 32  |    142580     |     40343     |
| 33  |    148915     |     40460     |
| 34  |    154005     |     41294     |
| 35  |    157996     |     41730     |
| 36  |    160806     |     41523     |
| 37  |    162975     |     42088     |
| 38  |    163426     |     41530     |
| 39  |    164872     |     41789     |
| 40  |    164477     |     42505     |
| 41  |    164758     |     41879     |
| 42  |    164182     |     41415     |
| 43  |    164842     |     42119     |
| 44  |    164881     |     42297     |
| 45  |    164870     |     42145     |
| 46  |    164673     |     42066     |
| 47  |    164616     |     42051     |
| 48  |    165055     |     41902     |
| 49  |    164847     |     41862     |
| 50  |    165171     |     41960     |
| 51  |    164851     |     42089     |
| 52  |    164763     |     41717     |
| 53  |    164635     |     42154     |
| 54  |    164757     |     41983     |
| 55  |    165095     |     41419     |
| 56  |    164641     |     42381     |
| 57  |    164601     |     41654     |
| 58  |    164864     |     41834     |
| 59  |    164594     |     41920     |
| 60  |    165207     |     42020     |
| 61  |    165056     |     41185     |
| 62  |    165160     |     41722     |
| 63  |    164923     |     41702     |
| 64  |    164777     |     41880     |
+-----+---------------+---------------+

Signed-off-by: Kuan-Wei Chiu <visitorckw@gmail.com>
---

This patch has undergone unit testing and micro benchmarking using the
following code:

static int test(void)
{
	unsigned int i, n;
	volatile unsigned long x, y, z;
	const unsigned long round = 1000000;
	ktime_t start, end;
	s32 time_old[65], time_new[65];

	for (n = 0; n <= 64; n++) {
		start = ktime_get();
		for(i = 0; i < round; i++) {
			x = get_random_u64();
			y = fns(x, n);
		}
		end = ktime_get();
		time_old[n] = ktime_us_delta(end, start);

		start = ktime_get();
		for(i = 0; i < round; i++) {
			x = get_random_u64();
			y = fns_new(x, n);
		}
		end = ktime_get();
		time_new[n] = ktime_us_delta(end, start);
	}

	for (n = 0; n <= 64; n++) {
		for(i = 0; i < round; i++) {
			x = get_random_u64();
			y = fns(x, n);
			z = fns_new(x, n);
			if (y != z) {
				printk(KERN_INFO "test failed!\n");
				return -1;
			}
		}
	}

	printk(KERN_INFO "test passed!\n");
	for (n = 0; n <= 64; n++) {
		printk(KERN_INFO "n = %u\n", n);
		printk(KERN_INFO "time old = %d\n", time_old[n]);
		printk(KERN_INFO "time new = %d\n", time_new[n]);
	}
	return 0;
}

 include/linux/bitops.h | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/include/linux/bitops.h b/include/linux/bitops.h
index 2ba557e067fe..ffeb71e72154 100644
--- a/include/linux/bitops.h
+++ b/include/linux/bitops.h
@@ -254,16 +254,12 @@ static inline unsigned long __ffs64(u64 word)
  */
 static inline unsigned long fns(unsigned long word, unsigned int n)
 {
-	unsigned int bit;
+	unsigned int i;
 
-	while (word) {
-		bit = __ffs(word);
-		if (n-- == 0)
-			return bit;
-		__clear_bit(bit, &word);
-	}
+	for (i = 0; word && i < n; i++)
+		word &= word - 1;
 
-	return BITS_PER_LONG;
+	return word ? __ffs(word) : BITS_PER_LONG;
 }
 
 /**
-- 
2.34.1