From: Nicolas Pitre <npitre@baylibre.com>
Several years later I just realized that this code could be greatly
simplified.
First, let's formalize the need for overflow handling in
__arch_xprod64(). Assuming n = UINT64_MAX, there are 2 cases where
an overflow may occur:
1) If a bias must be added, we have m_lo * n_lo + m or
m_lo * 0xffffffff + ((m_hi << 32) + m_lo) or
((m_lo << 32) - m_lo) + ((m_hi << 32) + m_lo) or
(m_lo + m_hi) << 32 which must be < (1 << 64). So the criteria for no
overflow is m_lo + m_hi < (1 << 32).
2) The cross product m_lo * n_hi + m_hi * n_lo or
m_lo * 0xffffffff + m_hi * 0xffffffff or
((m_lo << 32) - m_lo) + ((m_hi << 32) - m_hi). Assuming the top
result from the previous step (m_lo + m_hi) that must be added to
this, we get (m_lo + m_hi) << 32 again.
So let's have a straight and simpler version when this is true.
Otherwise some reordering allows for taking care of possible overflows
without any actual conditionals. And prevent from generating both code
variants by making sure this is considered only if m is perceived as
constant by the compiler.
This, in turn, allows for greatly simplifying __div64_const32(). The
"special case" may go as well as the regular case works just fine
without needing a bias. Then reduction should be applied all the time as
minimizing m is the key.
Signed-off-by: Nicolas Pitre <npitre@baylibre.com>
---
include/asm-generic/div64.h | 114 +++++++++++-------------------------
1 file changed, 35 insertions(+), 79 deletions(-)
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 13f5aa68a4..5d59cf7e73 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -74,7 +74,8 @@
* do the trick here). \
*/ \
uint64_t ___res, ___x, ___t, ___m, ___n = (n); \
- uint32_t ___p, ___bias; \
+ uint32_t ___p; \
+ bool ___bias = false; \
\
/* determine MSB of b */ \
___p = 1 << ilog2(___b); \
@@ -87,22 +88,14 @@
___x = ~0ULL / ___b * ___b - 1; \
\
/* test our ___m with res = m * x / (p << 64) */ \
- ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \
- ___t = ___res += (___m & 0xffffffff) * (___x >> 32); \
- ___res += (___x & 0xffffffff) * (___m >> 32); \
- ___t = (___res < ___t) ? (1ULL << 32) : 0; \
- ___res = (___res >> 32) + ___t; \
- ___res += (___m >> 32) * (___x >> 32); \
- ___res /= ___p; \
+ ___res = (___m & 0xffffffff) * (___x & 0xffffffff); \
+ ___t = (___m & 0xffffffff) * (___x >> 32) + (___res >> 32); \
+ ___res = (___m >> 32) * (___x >> 32) + (___t >> 32); \
+ ___t = (___m >> 32) * (___x & 0xffffffff) + (___t & 0xffffffff);\
+ ___res = (___res + (___t >> 32)) / ___p; \
\
- /* Now sanitize and optimize what we've got. */ \
- if (~0ULL % (___b / (___b & -___b)) == 0) { \
- /* special case, can be simplified to ... */ \
- ___n /= (___b & -___b); \
- ___m = ~0ULL / (___b / (___b & -___b)); \
- ___p = 1; \
- ___bias = 1; \
- } else if (___res != ___x / ___b) { \
+ /* Now validate what we've got. */ \
+ if (___res != ___x / ___b) { \
/* \
* We can't get away without a bias to compensate \
* for bit truncation errors. To avoid it we'd need an \
@@ -111,45 +104,18 @@
* \
* Instead we do m = p / b and n / b = (n * m + m) / p. \
*/ \
- ___bias = 1; \
+ ___bias = true; \
/* Compute m = (p << 64) / b */ \
___m = (~0ULL / ___b) * ___p; \
___m += ((~0ULL % ___b + 1) * ___p) / ___b; \
- } else { \
- /* \
- * Reduce m / p, and try to clear bit 31 of m when \
- * possible, otherwise that'll need extra overflow \
- * handling later. \
- */ \
- uint32_t ___bits = -(___m & -___m); \
- ___bits |= ___m >> 32; \
- ___bits = (~___bits) << 1; \
- /* \
- * If ___bits == 0 then setting bit 31 is unavoidable. \
- * Simply apply the maximum possible reduction in that \
- * case. Otherwise the MSB of ___bits indicates the \
- * best reduction we should apply. \
- */ \
- if (!___bits) { \
- ___p /= (___m & -___m); \
- ___m /= (___m & -___m); \
- } else { \
- ___p >>= ilog2(___bits); \
- ___m >>= ilog2(___bits); \
- } \
- /* No bias needed. */ \
- ___bias = 0; \
} \
\
+ /* Reduce m / p to help avoid overflow handling later. */ \
+ ___p /= (___m & -___m); \
+ ___m /= (___m & -___m); \
+ \
/* \
- * Now we have a combination of 2 conditions: \
- * \
- * 1) whether or not we need to apply a bias, and \
- * \
- * 2) whether or not there might be an overflow in the cross \
- * product determined by (___m & ((1 << 63) | (1 << 31))). \
- * \
- * Select the best way to do (m_bias + m * n) / (1 << 64). \
+ * Perform (m_bias + m * n) / (1 << 64). \
* From now on there will be actual runtime code generated. \
*/ \
___res = __arch_xprod_64(___m, ___n, ___bias); \
@@ -165,7 +131,7 @@
* Semantic: retval = ((bias ? m : 0) + m * n) >> 64
*
* The product is a 128-bit value, scaled down to 64 bits.
- * Assuming constant propagation to optimize away unused conditional code.
+ * Hoping for compile-time optimization of conditional code.
* Architectures may provide their own optimized assembly implementation.
*/
static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
@@ -174,38 +140,28 @@ static inline uint64_t __arch_xprod_64(const uint64_t m, uint64_t n, bool bias)
uint32_t m_hi = m >> 32;
uint32_t n_lo = n;
uint32_t n_hi = n >> 32;
- uint64_t res;
- uint32_t res_lo, res_hi, tmp;
-
- if (!bias) {
- res = ((uint64_t)m_lo * n_lo) >> 32;
- } else if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
- /* there can't be any overflow here */
- res = (m + (uint64_t)m_lo * n_lo) >> 32;
- } else {
- res = m + (uint64_t)m_lo * n_lo;
- res_lo = res >> 32;
- res_hi = (res_lo < m_hi);
- res = res_lo | ((uint64_t)res_hi << 32);
- }
-
- if (!(m & ((1ULL << 63) | (1ULL << 31)))) {
- /* there can't be any overflow here */
- res += (uint64_t)m_lo * n_hi;
- res += (uint64_t)m_hi * n_lo;
- res >>= 32;
+ uint64_t x, y;
+
+ /* Determine if overflow handling can be dispensed with. */
+ bool no_ovf = __builtin_constant_p(m) &&
+ ((m >> 32) + (m & 0xffffffff) < 0x100000000);
+
+ if (no_ovf) {
+ x = (uint64_t)m_lo * n_lo + (bias ? m : 0);
+ x >>= 32;
+ x += (uint64_t)m_lo * n_hi;
+ x += (uint64_t)m_hi * n_lo;
+ x >>= 32;
+ x += (uint64_t)m_hi * n_hi;
} else {
- res += (uint64_t)m_lo * n_hi;
- tmp = res >> 32;
- res += (uint64_t)m_hi * n_lo;
- res_lo = res >> 32;
- res_hi = (res_lo < tmp);
- res = res_lo | ((uint64_t)res_hi << 32);
+ x = (uint64_t)m_lo * n_lo + (bias ? m_lo : 0);
+ y = (uint64_t)m_lo * n_hi + (uint32_t)(x >> 32) + (bias ? m_hi : 0);
+ x = (uint64_t)m_hi * n_hi + (uint32_t)(y >> 32);
+ y = (uint64_t)m_hi * n_lo + (uint32_t)y;
+ x += (uint32_t)(y >> 32);
}
- res += (uint64_t)m_hi * n_hi;
-
- return res;
+ return x;
}
#endif
--
2.46.1
Hi Nicolas, kernel test robot noticed the following build warnings: [auto build test WARNING on arnd-asm-generic/master] [also build test WARNING on soc/for-next linus/master arm/for-next arm/fixes v6.12-rc1 next-20241004] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Nicolas-Pitre/lib-math-test_div64-add-some-edge-cases-relevant-to-__div64_const32/20241004-052015 base: https://git.kernel.org/pub/scm/linux/kernel/git/arnd/asm-generic.git master patch link: https://lore.kernel.org/r/20241003211829.2750436-3-nico%40fluxnic.net patch subject: [PATCH v4 2/4] asm-generic/div64: optimize/simplify __div64_const32() config: arm-u8500_defconfig (https://download.01.org/0day-ci/archive/20241005/202410050606.G2mm4qbj-lkp@intel.com/config) compiler: arm-linux-gnueabi-gcc (GCC) 14.1.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241005/202410050606.G2mm4qbj-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202410050606.G2mm4qbj-lkp@intel.com/ All warnings (new ones prefixed by >>): security/keys/proc.c: In function 'proc_keys_show': security/keys/proc.c:217:45: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=] 217 | sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7)); | ^ security/keys/proc.c:217:25: note: 'sprintf' output between 3 and 17 bytes into a destination of size 16 217 | sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ security/keys/proc.c:215:45: warning: 'sprintf' may write a terminating nul past the end of the destination [-Wformat-overflow=] 215 | sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24)); | ^ security/keys/proc.c:215:25: note: 'sprintf' output between 3 and 17 bytes into a destination of size 16 215 | sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ security/keys/proc.c:213:44: warning: 'h' directive writing 1 byte into a region of size between 0 and 15 [-Wformat-overflow=] 213 | sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60)); | ^ security/keys/proc.c:213:25: note: 'sprintf' output between 3 and 18 bytes into a destination of size 16 213 | sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ >> security/keys/proc.c:211:40: warning: '%llu' directive writing between 1 and 18 bytes into a region of size 16 [-Wformat-overflow=] 211 | sprintf(xbuf, "%llum", div_u64(timo, 60)); | ^~~~ security/keys/proc.c:211:39: note: directive argument in the range [0, 576460752303423487] 211 | sprintf(xbuf, "%llum", div_u64(timo, 60)); | ^~~~~~~ security/keys/proc.c:211:25: note: 'sprintf' output between 3 and 20 bytes into a destination of size 16 211 | sprintf(xbuf, "%llum", div_u64(timo, 60)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ vim +211 security/keys/proc.c ^1da177e4c3f41 Linus Torvalds 2005-04-16 152 ^1da177e4c3f41 Linus Torvalds 2005-04-16 153 static int proc_keys_show(struct seq_file *m, void *v) ^1da177e4c3f41 Linus Torvalds 2005-04-16 154 { ^1da177e4c3f41 Linus Torvalds 2005-04-16 155 struct rb_node *_p = v; ^1da177e4c3f41 Linus Torvalds 2005-04-16 156 struct key *key = rb_entry(_p, struct key, serial_node); ab5c69f01313c8 Eric Biggers 2017-09-27 157 unsigned long flags; 927942aabbbe50 David Howells 2010-06-11 158 key_ref_t key_ref, skey_ref; 074d58989569b3 Baolin Wang 2017-11-15 159 time64_t now, expiry; 03dab869b7b239 David Howells 2016-10-26 160 char xbuf[16]; 363b02dab09b32 David Howells 2017-10-04 161 short state; 074d58989569b3 Baolin Wang 2017-11-15 162 u64 timo; 06ec7be557a125 Michael LeMay 2006-06-26 163 int rc; 06ec7be557a125 Michael LeMay 2006-06-26 164 4bdf0bc3003141 David Howells 2013-09-24 165 struct keyring_search_context ctx = { ede0fa98a900e6 Eric Biggers 2019-02-22 166 .index_key = key->index_key, 4aa68e07d84556 Eric Biggers 2017-09-18 167 .cred = m->file->f_cred, 462919591a1791 David Howells 2014-09-16 168 .match_data.cmp = lookup_user_key_possessed, 462919591a1791 David Howells 2014-09-16 169 .match_data.raw_data = key, 462919591a1791 David Howells 2014-09-16 170 .match_data.lookup_type = KEYRING_SEARCH_LOOKUP_DIRECT, dcf49dbc8077e2 David Howells 2019-06-26 171 .flags = (KEYRING_SEARCH_NO_STATE_CHECK | dcf49dbc8077e2 David Howells 2019-06-26 172 KEYRING_SEARCH_RECURSE), 4bdf0bc3003141 David Howells 2013-09-24 173 }; 4bdf0bc3003141 David Howells 2013-09-24 174 028db3e290f15a Linus Torvalds 2019-07-10 175 key_ref = make_key_ref(key, 0); 927942aabbbe50 David Howells 2010-06-11 176 927942aabbbe50 David Howells 2010-06-11 177 /* determine if the key is possessed by this process (a test we can 927942aabbbe50 David Howells 2010-06-11 178 * skip if the key does not indicate the possessor can view it 927942aabbbe50 David Howells 2010-06-11 179 */ 028db3e290f15a Linus Torvalds 2019-07-10 180 if (key->perm & KEY_POS_VIEW) { 028db3e290f15a Linus Torvalds 2019-07-10 181 rcu_read_lock(); e59428f721ee09 David Howells 2019-06-19 182 skey_ref = search_cred_keyrings_rcu(&ctx); 028db3e290f15a Linus Torvalds 2019-07-10 183 rcu_read_unlock(); 927942aabbbe50 David Howells 2010-06-11 184 if (!IS_ERR(skey_ref)) { 927942aabbbe50 David Howells 2010-06-11 185 key_ref_put(skey_ref); 927942aabbbe50 David Howells 2010-06-11 186 key_ref = make_key_ref(key, 1); 927942aabbbe50 David Howells 2010-06-11 187 } 927942aabbbe50 David Howells 2010-06-11 188 } 927942aabbbe50 David Howells 2010-06-11 189 4aa68e07d84556 Eric Biggers 2017-09-18 190 /* check whether the current task is allowed to view the key */ f5895943d91b41 David Howells 2014-03-14 191 rc = key_task_permission(key_ref, ctx.cred, KEY_NEED_VIEW); 06ec7be557a125 Michael LeMay 2006-06-26 192 if (rc < 0) 028db3e290f15a Linus Torvalds 2019-07-10 193 return 0; ^1da177e4c3f41 Linus Torvalds 2005-04-16 194 074d58989569b3 Baolin Wang 2017-11-15 195 now = ktime_get_real_seconds(); ^1da177e4c3f41 Linus Torvalds 2005-04-16 196 028db3e290f15a Linus Torvalds 2019-07-10 197 rcu_read_lock(); 028db3e290f15a Linus Torvalds 2019-07-10 198 ^1da177e4c3f41 Linus Torvalds 2005-04-16 199 /* come up with a suitable timeout value */ ab5c69f01313c8 Eric Biggers 2017-09-27 200 expiry = READ_ONCE(key->expiry); 39299bdd254668 David Howells 2023-12-09 201 if (expiry == TIME64_MAX) { ^1da177e4c3f41 Linus Torvalds 2005-04-16 202 memcpy(xbuf, "perm", 5); 074d58989569b3 Baolin Wang 2017-11-15 203 } else if (now >= expiry) { ^1da177e4c3f41 Linus Torvalds 2005-04-16 204 memcpy(xbuf, "expd", 5); 7b1b9164598286 David Howells 2009-09-02 205 } else { 074d58989569b3 Baolin Wang 2017-11-15 206 timo = expiry - now; ^1da177e4c3f41 Linus Torvalds 2005-04-16 207 ^1da177e4c3f41 Linus Torvalds 2005-04-16 208 if (timo < 60) 074d58989569b3 Baolin Wang 2017-11-15 209 sprintf(xbuf, "%llus", timo); ^1da177e4c3f41 Linus Torvalds 2005-04-16 210 else if (timo < 60*60) 074d58989569b3 Baolin Wang 2017-11-15 @211 sprintf(xbuf, "%llum", div_u64(timo, 60)); ^1da177e4c3f41 Linus Torvalds 2005-04-16 212 else if (timo < 60*60*24) 074d58989569b3 Baolin Wang 2017-11-15 213 sprintf(xbuf, "%lluh", div_u64(timo, 60 * 60)); ^1da177e4c3f41 Linus Torvalds 2005-04-16 214 else if (timo < 60*60*24*7) 074d58989569b3 Baolin Wang 2017-11-15 @215 sprintf(xbuf, "%llud", div_u64(timo, 60 * 60 * 24)); ^1da177e4c3f41 Linus Torvalds 2005-04-16 216 else 074d58989569b3 Baolin Wang 2017-11-15 217 sprintf(xbuf, "%lluw", div_u64(timo, 60 * 60 * 24 * 7)); ^1da177e4c3f41 Linus Torvalds 2005-04-16 218 } ^1da177e4c3f41 Linus Torvalds 2005-04-16 219 363b02dab09b32 David Howells 2017-10-04 220 state = key_read_state(key); 363b02dab09b32 David Howells 2017-10-04 221 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
© 2016 - 2024 Red Hat, Inc.