[v1] Helper to isolate least-significant bit

[RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Petr Tesarik 1 month ago

Replace the open-coded bit-twiddling trick to get the least significant bit
with a call to ffs_val().

This patch has been generated by the following coccinelle script:

  @
  depends on !(file in "include/asm-generic/bitops/ffs_val.h")
	  && !(file in "tools/")
  @
  expression E;
  @@

  - ((E) & (-(E)))
  + ffs_val(E)

Include <linux/ffs_val.h> where the result did not build.

Signed-off-by: Petr Tesarik <ptesarik@suse.com>
---
 arch/alpha/include/asm/bitops.h              |  2 +-
 arch/alpha/kernel/core_cia.c                 |  2 +-
 arch/alpha/kernel/smp.c                      |  2 +-
 arch/arc/include/asm/bitops.h                |  2 +-
 arch/m68k/include/asm/bitops.h               | 12 ++++++------
 arch/mips/dec/ecc-berr.c                     |  2 +-
 arch/mips/include/asm/bitops.h               |  4 ++--
 arch/mips/pci/pci-malta.c                    |  4 ++--
 arch/powerpc/include/asm/bitops.h            |  4 ++--
 arch/powerpc/kvm/e500_mmu_host.c             |  2 +-
 arch/powerpc/lib/sstep.c                     |  2 +-
 arch/powerpc/xmon/ppc-dis.c                  |  3 ++-
 arch/powerpc/xmon/ppc-opc.c                  |  6 +++---
 arch/s390/include/asm/bitops.h               |  2 +-
 arch/xtensa/include/asm/bitops.h             |  6 +++---
 arch/xtensa/kernel/traps.c                   |  2 +-
 drivers/gpu/drm/gma500/psb_intel_sdvo.c      |  2 +-
 drivers/iommu/dma-iommu.c                    |  2 +-
 drivers/net/ethernet/netronome/nfp/bpf/jit.c |  2 +-
 drivers/net/usb/cdc_ncm.c                    |  4 ++--
 include/asm-generic/div64.h                  |  4 ++--
 include/linux/bitfield.h                     |  3 ++-
 include/linux/log2.h                         |  2 +-
 include/linux/min_heap.h                     |  5 +++--
 lib/math/gcd.c                               |  4 ++--
 lib/sort.c                                   |  3 ++-
 net/bluetooth/mgmt.c                         |  2 +-
 net/netfilter/nft_set_pipapo.c               |  2 +-
 28 files changed, 48 insertions(+), 44 deletions(-)

diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
index 76e4343c090f7..590e25862192f 100644
--- a/arch/alpha/include/asm/bitops.h
+++ b/arch/alpha/include/asm/bitops.h
@@ -317,7 +317,7 @@ static inline unsigned long ffz_b(unsigned long x)
 {
 	unsigned long sum, x1, x2, x4;
 
-	x = ~x & -~x;		/* set first 0 bit, clear others */
+	x = ffs_val(~x);	/* set first 0 bit, clear others */
 	x1 = x & 0xAA;
 	x2 = x & 0xCC;
 	x4 = x & 0xF0;
diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c
index 6e577228e175d..b3c273c13c104 100644
--- a/arch/alpha/kernel/core_cia.c
+++ b/arch/alpha/kernel/core_cia.c
@@ -1035,7 +1035,7 @@ cia_decode_ecc_error(struct el_CIA_sysdata_mcheck *cia, const char *msg)
 	cia_decode_mem_error(cia, msg);
 
 	syn = cia->cia_syn & 0xff;
-	if (syn == (syn & -syn)) {
+	if (syn == ffs_val(syn)) {
 		fmt = KERN_CRIT "  ECC syndrome %#x -- check bit %d\n";
 		i = ffs(syn) - 1;
 	} else {
diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
index ed06367ece574..2f3809c015b99 100644
--- a/arch/alpha/kernel/smp.c
+++ b/arch/alpha/kernel/smp.c
@@ -525,7 +525,7 @@ handle_ipi(struct pt_regs *regs)
 	  do {
 		unsigned long which;
 
-		which = ops & -ops;
+		which = ffs_val(ops);
 		ops &= ~which;
 		which = __ffs(which);
 
diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
index df894235fdbc6..def870bd7e4c7 100644
--- a/arch/arc/include/asm/bitops.h
+++ b/arch/arc/include/asm/bitops.h
@@ -94,7 +94,7 @@ static inline __attribute__ ((const)) unsigned long __fls(unsigned long x)
  * ffs = Find First Set in word (LSB to MSB)
  * @result: [1-32], 0 if all 0's
  */
-#define ffs(x)	({ unsigned long __t = (x); fls(__t & -__t); })
+#define ffs(x)	({ unsigned long __t = (x); fls(ffs_val(__t)); })
 
 /*
  * __ffs: Similar to ffs, but zero based (0-31)
diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
index e9639e48c6c3c..a67f08b8a7541 100644
--- a/arch/m68k/include/asm/bitops.h
+++ b/arch/m68k/include/asm/bitops.h
@@ -368,7 +368,7 @@ static inline unsigned long find_first_zero_bit(const unsigned long *vaddr,
 	}
 
 	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
-			      : "=d" (res) : "d" (num & -num));
+			      : "=d" (res) : "d" (ffs_val(num)));
 	res ^= 31;
 out:
 	res += ((long)p - (long)vaddr - 4) * 8;
@@ -392,7 +392,7 @@ static inline unsigned long find_next_zero_bit(const unsigned long *vaddr,
 
 		/* Look for zero in first longword */
 		__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
-				      : "=d" (res) : "d" (num & -num));
+				      : "=d" (res) : "d" (ffs_val(num)));
 		if (res < 32) {
 			offset += res ^ 31;
 			return offset < size ? offset : size;
@@ -425,7 +425,7 @@ static inline unsigned long find_first_bit(const unsigned long *vaddr,
 	}
 
 	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
-			      : "=d" (res) : "d" (num & -num));
+			      : "=d" (res) : "d" (ffs_val(num)));
 	res ^= 31;
 out:
 	res += ((long)p - (long)vaddr - 4) * 8;
@@ -449,7 +449,7 @@ static inline unsigned long find_next_bit(const unsigned long *vaddr,
 
 		/* Look for one in first longword */
 		__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
-				      : "=d" (res) : "d" (num & -num));
+				      : "=d" (res) : "d" (ffs_val(num)));
 		if (res < 32) {
 			offset += res ^ 31;
 			return offset < size ? offset : size;
@@ -473,7 +473,7 @@ static inline unsigned long __attribute_const__ ffz(unsigned long word)
 	int res;
 
 	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
-			      : "=d" (res) : "d" (~word & -~word));
+			      : "=d" (res) : "d" (ffs_val(~word)));
 	return res ^ 31;
 }
 
@@ -527,7 +527,7 @@ static inline __attribute_const__ int ffs(int x)
 
 	__asm__ ("bfffo %1{#0:#0},%0"
 		: "=d" (cnt)
-		: "dm" (x & -x));
+		: "dm" (ffs_val(x)));
 	return 32 - cnt;
 }
 
diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
index 1eb356fdd8323..8934b8b1cf375 100644
--- a/arch/mips/dec/ecc-berr.c
+++ b/arch/mips/dec/ecc-berr.c
@@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
 			/* Ack now, now we've rewritten (or not). */
 			dec_ecc_be_ack();
 
-			if (syn && syn == (syn & -syn)) {
+			if (syn && syn == ffs_val(syn)) {
 				if (syn == 0x01) {
 					fmt = KERN_ALERT "%s"
 					      "%#04x -- %s bit error "
diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
index 42f88452c920f..6e71999618876 100644
--- a/arch/mips/include/asm/bitops.h
+++ b/arch/mips/include/asm/bitops.h
@@ -395,7 +395,7 @@ static __always_inline __attribute_const__ unsigned long __fls(unsigned long wor
  */
 static __always_inline __attribute_const__ unsigned long __ffs(unsigned long word)
 {
-	return __fls(word & -word);
+	return __fls(ffs_val(word));
 }
 
 /*
@@ -463,7 +463,7 @@ static inline __attribute_const__ int ffs(int word)
 	if (!word)
 		return 0;
 
-	return fls(word & -word);
+	return fls(ffs_val(word));
 }
 
 #include <asm-generic/bitops/ffz.h>
diff --git a/arch/mips/pci/pci-malta.c b/arch/mips/pci/pci-malta.c
index 2e35aeba45bc1..502f99f46eb75 100644
--- a/arch/mips/pci/pci-malta.c
+++ b/arch/mips/pci/pci-malta.c
@@ -117,7 +117,7 @@ void __init mips_pcibios_init(void)
 		mask = ~(start ^ end);
 		/* We don't support remapping with a discontiguous mask.  */
 		BUG_ON((start & GT_PCI_HD_MSK) != (map & GT_PCI_HD_MSK) &&
-		       mask != ~((mask & -mask) - 1));
+		       mask != ~(ffs_val(mask) - 1));
 		gt64120_mem_resource.start = start;
 		gt64120_mem_resource.end = end;
 		gt64120_controller.mem_offset = (start & mask) - (map & mask);
@@ -134,7 +134,7 @@ void __init mips_pcibios_init(void)
 		mask = ~(start ^ end);
 		/* We don't support remapping with a discontiguous mask.  */
 		BUG_ON((start & GT_PCI_HD_MSK) != (map & GT_PCI_HD_MSK) &&
-		       mask != ~((mask & -mask) - 1));
+		       mask != ~(ffs_val(mask) - 1));
 		gt64120_io_resource.start = map & mask;
 		gt64120_io_resource.end = (map & mask) | ~mask;
 		gt64120_controller.io_offset = 0;
diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
index 0d0470cd5ac31..571700bb41e46 100644
--- a/arch/powerpc/include/asm/bitops.h
+++ b/arch/powerpc/include/asm/bitops.h
@@ -87,8 +87,8 @@ static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
 	if (!x)
 		return false;
 	if (x & 1)
-		x = ~x;	// make the mask non-wrapping
-	x += x & -x;	// adding the low set bit results in at most one bit set
+		x = ~x;		// make the mask non-wrapping
+	x += ffs_val(x);	// adding the low set bit results in at most one bit set
 
 	return !(x & (x - 1));
 }
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 06caf8bbbe2b7..00ae88b3369fd 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -205,7 +205,7 @@ void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
 
 		local_irq_save(flags);
 		while (tmp) {
-			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
+			hw_tlb_indx = __ilog2_u64(ffs_val(tmp));
 			mtspr(SPRN_MAS0,
 			      MAS0_TLBSEL(1) |
 			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
index ac3ee19531d8a..2a8a7e007dbd2 100644
--- a/arch/powerpc/lib/sstep.c
+++ b/arch/powerpc/lib/sstep.c
@@ -236,7 +236,7 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
 static nokprobe_inline unsigned long max_align(unsigned long x)
 {
 	x |= sizeof(unsigned long);
-	return x & -x;		/* isolates rightmost bit */
+	return ffs_val(x);		/* isolates rightmost bit */
 }
 
 static nokprobe_inline unsigned long byterev_2(unsigned long x)
diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c
index af105e1bc3fca..13b31964686ad 100644
--- a/arch/powerpc/xmon/ppc-dis.c
+++ b/arch/powerpc/xmon/ppc-dis.c
@@ -9,6 +9,7 @@ This file is part of GDB, GAS, and the GNU binutils.
 
 #include <asm/cputable.h>
 #include <asm/cpu_has_feature.h>
+#include <linux/ffs_val.h>
 #include "nonstdio.h"
 #include "ansidecl.h"
 #include "ppc.h"
@@ -44,7 +45,7 @@ operand_value_powerpc (const struct powerpc_operand *operand,
 	  unsigned long top = operand->bitm;
 	  /* top & -top gives the rightmost 1 bit, so this
 	     fills in any trailing zeros.  */
-	  top |= (top & -top) - 1;
+	  top |= ffs_val(top) - 1;
 	  top &= ~(top >> 1);
 	  value = (value ^ top) - top;
 	}
diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
index de9b4236728c4..23d845439fa11 100644
--- a/arch/powerpc/xmon/ppc-opc.c
+++ b/arch/powerpc/xmon/ppc-opc.c
@@ -1417,7 +1417,7 @@ insert_fxm (unsigned long insn,
      one bit of the mask field is set.  */
   if ((insn & (1 << 20)) != 0)
     {
-      if (value == 0 || (value & -value) != value)
+      if (value == 0 || ffs_val(value) != value)
 	{
 	  *errmsg = _("invalid mask field");
 	  value = 0;
@@ -1430,7 +1430,7 @@ insert_fxm (unsigned long insn,
      new form unless -mpower4 has been given, or -many and the two
      operand form of mfcr was used.  */
   else if (value > 0
-	   && (value & -value) == value
+	   && ffs_val(value) == value
 	   && ((dialect & PPC_OPCODE_POWER4) != 0
 	       || ((dialect & PPC_OPCODE_ANY) != 0
 		   && (insn & (0x3ff << 1)) == 19 << 1)))
@@ -1460,7 +1460,7 @@ extract_fxm (unsigned long insn,
   if ((insn & (1 << 20)) != 0)
     {
       /* Exactly one bit of MASK should be set.  */
-      if (mask == 0 || (mask & -mask) != mask)
+      if (mask == 0 || ffs_val(mask) != mask)
 	*invalid = 1;
     }
 
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 5f10074665b0c..dadd8a5bd5428 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -193,7 +193,7 @@ static __always_inline __flatten __attribute_const__ int ffs(int word)
 {
 	unsigned int val = (unsigned int)word;
 
-	return BITS_PER_LONG - __flogr(-val & val);
+	return BITS_PER_LONG - __flogr(ffs_val(val));
 }
 
 #else /* CONFIG_CC_HAS_BUILTIN_FFS */
diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
index f7390b6761e1b..44e0debde1b3d 100644
--- a/arch/xtensa/include/asm/bitops.h
+++ b/arch/xtensa/include/asm/bitops.h
@@ -39,7 +39,7 @@ static inline unsigned long __cntlz (unsigned long x)
 
 static inline int __attribute_const__ ffz(unsigned long x)
 {
-	return 31 - __cntlz(~x & -~x);
+	return 31 - __cntlz(ffs_val(~x));
 }
 
 /*
@@ -48,7 +48,7 @@ static inline int __attribute_const__ ffz(unsigned long x)
 
 static inline __attribute_const__ unsigned long __ffs(unsigned long x)
 {
-	return 31 - __cntlz(x & -x);
+	return 31 - __cntlz(ffs_val(x));
 }
 
 /*
@@ -59,7 +59,7 @@ static inline __attribute_const__ unsigned long __ffs(unsigned long x)
 
 static inline __attribute_const__ int ffs(unsigned long x)
 {
-	return 32 - __cntlz(x & -x);
+	return 32 - __cntlz(ffs_val(x));
 }
 
 /*
diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
index 44c07c4e0833a..7ab71b21581f5 100644
--- a/arch/xtensa/kernel/traps.c
+++ b/arch/xtensa/kernel/traps.c
@@ -309,7 +309,7 @@ static void do_interrupt(struct pt_regs *regs)
 			break;
 
 		/* clear lowest pending irq in the unhandled mask */
-		unhandled ^= (int_at_level & -int_at_level);
+		unhandled ^= ffs_val(int_at_level);
 		do_IRQ(__ffs(int_at_level), regs);
 	}
 
diff --git a/drivers/gpu/drm/gma500/psb_intel_sdvo.c b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
index 553e7c7d9bb83..56d3fa0bd0334 100644
--- a/drivers/gpu/drm/gma500/psb_intel_sdvo.c
+++ b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
@@ -1226,7 +1226,7 @@ psb_intel_sdvo_multifunc_encoder(struct psb_intel_sdvo *psb_intel_sdvo)
 {
 	/* Is there more than one type of output? */
 	int caps = psb_intel_sdvo->caps.output_flags & 0xf;
-	return caps & -caps;
+	return ffs_val(caps);
 }
 
 static struct edid *
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c92088855450a..a0211b51d8043 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -943,7 +943,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
 	    iommu_deferred_attach(dev, domain))
 		return NULL;
 
-	min_size = alloc_sizes & -alloc_sizes;
+	min_size = ffs_val(alloc_sizes);
 	if (min_size < PAGE_SIZE) {
 		min_size = PAGE_SIZE;
 		alloc_sizes |= PAGE_SIZE;
diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
index 3a02eef58cc6d..25e0905a61721 100644
--- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
+++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
@@ -1565,7 +1565,7 @@ static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
 	rvalue = reciprocal_value_adv(imm, 32);
 	exp = rvalue.exp;
 	if (rvalue.is_wide_m && !(imm & 1)) {
-		pre_shift = fls(imm & -imm) - 1;
+		pre_shift = fls(ffs_val(imm)) - 1;
 		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
 	} else {
 		pre_shift = 0;
diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
index 5d123df0a866b..e132f222a026a 100644
--- a/drivers/net/usb/cdc_ncm.c
+++ b/drivers/net/usb/cdc_ncm.c
@@ -668,7 +668,7 @@ static void cdc_ncm_fix_modulus(struct usbnet *dev)
 	val = ctx->tx_ndp_modulus;
 
 	if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) ||
-	    (val != ((-val) & val)) || (val >= ctx->tx_max)) {
+	    (val != ffs_val(val)) || (val >= ctx->tx_max)) {
 		dev_dbg(&dev->intf->dev, "Using default alignment: 4 bytes\n");
 		ctx->tx_ndp_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE;
 	}
@@ -682,7 +682,7 @@ static void cdc_ncm_fix_modulus(struct usbnet *dev)
 	val = ctx->tx_modulus;
 
 	if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) ||
-	    (val != ((-val) & val)) || (val >= ctx->tx_max)) {
+	    (val != ffs_val(val)) || (val >= ctx->tx_max)) {
 		dev_dbg(&dev->intf->dev, "Using default transmit modulus: 4 bytes\n");
 		ctx->tx_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE;
 	}
diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
index 25e7b4b58dcf5..d56208be887de 100644
--- a/include/asm-generic/div64.h
+++ b/include/asm-generic/div64.h
@@ -111,8 +111,8 @@
 	}								\
 									\
 	/* Reduce m / p to help avoid overflow handling later. */	\
-	___p /= (___m & -___m);						\
-	___m /= (___m & -___m);						\
+	___p /= ffs_val(___m);						\
+	___m /= ffs_val(___m);						\
 									\
 	/*								\
 	 * Perform (m_bias + m * n) / (1 << 64).			\
diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
index 126dc5b380afa..26fbb6cfc0719 100644
--- a/include/linux/bitfield.h
+++ b/include/linux/bitfield.h
@@ -8,6 +8,7 @@
 #define _LINUX_BITFIELD_H
 
 #include <linux/build_bug.h>
+#include <linux/ffs_val.h>
 #include <linux/typecheck.h>
 #include <asm/byteorder.h>
 
@@ -202,7 +203,7 @@ static __always_inline u64 field_multiplier(u64 field)
 {
 	if ((field | (field - 1)) & ((field | (field - 1)) + 1))
 		__bad_mask();
-	return field & -field;
+	return ffs_val(field);
 }
 static __always_inline u64 field_mask(u64 field)
 {
diff --git a/include/linux/log2.h b/include/linux/log2.h
index 2eac3fc9303d6..7615e513885e9 100644
--- a/include/linux/log2.h
+++ b/include/linux/log2.h
@@ -266,7 +266,7 @@ int __bits_per(unsigned long n)
 static inline __attribute__((const))
 unsigned int max_pow_of_two_factor(unsigned int n)
 {
-	return n & -n;
+	return ffs_val(n);
 }
 
 #endif /* _LINUX_LOG2_H */
diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
index 79ddc0adbf2bf..e914919470c88 100644
--- a/include/linux/min_heap.h
+++ b/include/linux/min_heap.h
@@ -3,6 +3,7 @@
 #define _LINUX_MIN_HEAP_H
 
 #include <linux/bug.h>
+#include <linux/ffs_val.h>
 #include <linux/string.h>
 #include <linux/types.h>
 
@@ -257,7 +258,7 @@ static __always_inline
 void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
 				 const struct min_heap_callbacks *func, void *args)
 {
-	const unsigned long lsbit = elem_size & -elem_size;
+	const unsigned long lsbit = ffs_val(elem_size);
 	void *data = heap->data;
 	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
@@ -297,7 +298,7 @@ static __always_inline
 void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
 			       const struct min_heap_callbacks *func, void *args)
 {
-	const unsigned long lsbit = elem_size & -elem_size;
+	const unsigned long lsbit = ffs_val(elem_size);
 	void *data = heap->data;
 	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
diff --git a/lib/math/gcd.c b/lib/math/gcd.c
index 62efca6787aef..3706c84e68bc1 100644
--- a/lib/math/gcd.c
+++ b/lib/math/gcd.c
@@ -23,12 +23,12 @@ static unsigned long binary_gcd(unsigned long a, unsigned long b)
 
 	b >>= __ffs(b);
 	if (b == 1)
-		return r & -r;
+		return ffs_val(r);
 
 	for (;;) {
 		a >>= __ffs(a);
 		if (a == 1)
-			return r & -r;
+			return ffs_val(r);
 		if (a == b)
 			return a << __ffs(r);
 
diff --git a/lib/sort.c b/lib/sort.c
index 52363995ccc5c..2489aaaa21a06 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -12,6 +12,7 @@
 
 #include <linux/types.h>
 #include <linux/export.h>
+#include <linux/ffs_val.h>
 #include <linux/sort.h>
 
 /**
@@ -196,7 +197,7 @@ static void __sort_r(void *base, size_t num, size_t size,
 {
 	/* pre-scale counters for performance */
 	size_t n = num * size, a = (num/2) * size;
-	const unsigned int lsbit = size & -size;  /* Used to find parent */
+	const unsigned int lsbit = ffs_val(size);  /* Used to find parent */
 	size_t shift = 0;
 
 	if (!a)		/* num < 2 || size == 0 */
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 5be9b8c919490..2b13998fdb0fe 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -8664,7 +8664,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags)
 	supported_flags = get_supported_adv_flags(hdev);
 	phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
 	if (adv_flags & ~supported_flags ||
-	    ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
+	    ((phy_flags && (phy_flags ^ ffs_val(phy_flags)))))
 		return false;
 
 	return true;
diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
index 112fe46788b6f..f93f509cc3fbf 100644
--- a/net/netfilter/nft_set_pipapo.c
+++ b/net/netfilter/nft_set_pipapo.c
@@ -370,7 +370,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
 	for (k = 0; k < len; k++) {
 		bitset = map[k];
 		while (bitset) {
-			unsigned long t = bitset & -bitset;
+			unsigned long t = ffs_val(bitset);
 			int r = __builtin_ctzl(bitset);
 			int i = k * BITS_PER_LONG + r;
 
-- 
2.52.0

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Yury Norov 1 month ago

On Fri, Jan 09, 2026 at 05:37:57PM +0100, Petr Tesarik wrote:
> Replace the open-coded bit-twiddling trick to get the least significant bit
> with a call to ffs_val().
> 
> This patch has been generated by the following coccinelle script:
> 
>   @
>   depends on !(file in "include/asm-generic/bitops/ffs_val.h")
> 	  && !(file in "tools/")
>   @
>   expression E;
>   @@
> 
>   - ((E) & (-(E)))
>   + ffs_val(E)
> 
> Include <linux/ffs_val.h> where the result did not build.
> 
> Signed-off-by: Petr Tesarik <ptesarik@suse.com>
> ---
>  arch/alpha/include/asm/bitops.h              |  2 +-
>  arch/alpha/kernel/core_cia.c                 |  2 +-
>  arch/alpha/kernel/smp.c                      |  2 +-
>  arch/arc/include/asm/bitops.h                |  2 +-
>  arch/m68k/include/asm/bitops.h               | 12 ++++++------
>  arch/mips/dec/ecc-berr.c                     |  2 +-
>  arch/mips/include/asm/bitops.h               |  4 ++--
>  arch/mips/pci/pci-malta.c                    |  4 ++--
>  arch/powerpc/include/asm/bitops.h            |  4 ++--
>  arch/powerpc/kvm/e500_mmu_host.c             |  2 +-
>  arch/powerpc/lib/sstep.c                     |  2 +-
>  arch/powerpc/xmon/ppc-dis.c                  |  3 ++-
>  arch/powerpc/xmon/ppc-opc.c                  |  6 +++---
>  arch/s390/include/asm/bitops.h               |  2 +-
>  arch/xtensa/include/asm/bitops.h             |  6 +++---
>  arch/xtensa/kernel/traps.c                   |  2 +-
>  drivers/gpu/drm/gma500/psb_intel_sdvo.c      |  2 +-
>  drivers/iommu/dma-iommu.c                    |  2 +-
>  drivers/net/ethernet/netronome/nfp/bpf/jit.c |  2 +-
>  drivers/net/usb/cdc_ncm.c                    |  4 ++--
>  include/asm-generic/div64.h                  |  4 ++--
>  include/linux/bitfield.h                     |  3 ++-
>  include/linux/log2.h                         |  2 +-
>  include/linux/min_heap.h                     |  5 +++--
>  lib/math/gcd.c                               |  4 ++--
>  lib/sort.c                                   |  3 ++-
>  net/bluetooth/mgmt.c                         |  2 +-
>  net/netfilter/nft_set_pipapo.c               |  2 +-
>  28 files changed, 48 insertions(+), 44 deletions(-)
> 
> diff --git a/arch/alpha/include/asm/bitops.h b/arch/alpha/include/asm/bitops.h
> index 76e4343c090f7..590e25862192f 100644
> --- a/arch/alpha/include/asm/bitops.h
> +++ b/arch/alpha/include/asm/bitops.h
> @@ -317,7 +317,7 @@ static inline unsigned long ffz_b(unsigned long x)
>  {
>  	unsigned long sum, x1, x2, x4;
>  
> -	x = ~x & -~x;		/* set first 0 bit, clear others */
> +	x = ffs_val(~x);	/* set first 0 bit, clear others */
>  	x1 = x & 0xAA;
>  	x2 = x & 0xCC;
>  	x4 = x & 0xF0;
> diff --git a/arch/alpha/kernel/core_cia.c b/arch/alpha/kernel/core_cia.c
> index 6e577228e175d..b3c273c13c104 100644
> --- a/arch/alpha/kernel/core_cia.c
> +++ b/arch/alpha/kernel/core_cia.c
> @@ -1035,7 +1035,7 @@ cia_decode_ecc_error(struct el_CIA_sysdata_mcheck *cia, const char *msg)
>  	cia_decode_mem_error(cia, msg);
>  
>  	syn = cia->cia_syn & 0xff;
> -	if (syn == (syn & -syn)) {
> +	if (syn == ffs_val(syn)) {
>  		fmt = KERN_CRIT "  ECC syndrome %#x -- check bit %d\n";
>  		i = ffs(syn) - 1;
>  	} else {
> diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
> index ed06367ece574..2f3809c015b99 100644
> --- a/arch/alpha/kernel/smp.c
> +++ b/arch/alpha/kernel/smp.c
> @@ -525,7 +525,7 @@ handle_ipi(struct pt_regs *regs)
>  	  do {
>  		unsigned long which;
>  
> -		which = ops & -ops;
> +		which = ffs_val(ops);
>  		ops &= ~which;
>  		which = __ffs(which);

I guess, this should be:

        which = __ffs(ops);
        ops &= ops - 1;

Check how fns() works. And to that extend, you can make another
helper: clear_lsb().

> diff --git a/arch/arc/include/asm/bitops.h b/arch/arc/include/asm/bitops.h
> index df894235fdbc6..def870bd7e4c7 100644
> --- a/arch/arc/include/asm/bitops.h
> +++ b/arch/arc/include/asm/bitops.h
> @@ -94,7 +94,7 @@ static inline __attribute__ ((const)) unsigned long __fls(unsigned long x)
>   * ffs = Find First Set in word (LSB to MSB)
>   * @result: [1-32], 0 if all 0's
>   */
> -#define ffs(x)	({ unsigned long __t = (x); fls(__t & -__t); })
> +#define ffs(x)	({ unsigned long __t = (x); fls(ffs_val(__t)); })
>  
>  /*
>   * __ffs: Similar to ffs, but zero based (0-31)
> diff --git a/arch/m68k/include/asm/bitops.h b/arch/m68k/include/asm/bitops.h
> index e9639e48c6c3c..a67f08b8a7541 100644
> --- a/arch/m68k/include/asm/bitops.h
> +++ b/arch/m68k/include/asm/bitops.h
> @@ -368,7 +368,7 @@ static inline unsigned long find_first_zero_bit(const unsigned long *vaddr,
>  	}
>  
>  	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
> -			      : "=d" (res) : "d" (num & -num));
> +			      : "=d" (res) : "d" (ffs_val(num)));
>  	res ^= 31;
>  out:
>  	res += ((long)p - (long)vaddr - 4) * 8;
> @@ -392,7 +392,7 @@ static inline unsigned long find_next_zero_bit(const unsigned long *vaddr,
>  
>  		/* Look for zero in first longword */
>  		__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
> -				      : "=d" (res) : "d" (num & -num));
> +				      : "=d" (res) : "d" (ffs_val(num)));
>  		if (res < 32) {
>  			offset += res ^ 31;
>  			return offset < size ? offset : size;
> @@ -425,7 +425,7 @@ static inline unsigned long find_first_bit(const unsigned long *vaddr,
>  	}
>  
>  	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
> -			      : "=d" (res) : "d" (num & -num));
> +			      : "=d" (res) : "d" (ffs_val(num)));
>  	res ^= 31;
>  out:
>  	res += ((long)p - (long)vaddr - 4) * 8;
> @@ -449,7 +449,7 @@ static inline unsigned long find_next_bit(const unsigned long *vaddr,
>  
>  		/* Look for one in first longword */
>  		__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
> -				      : "=d" (res) : "d" (num & -num));
> +				      : "=d" (res) : "d" (ffs_val(num)));
>  		if (res < 32) {
>  			offset += res ^ 31;
>  			return offset < size ? offset : size;
> @@ -473,7 +473,7 @@ static inline unsigned long __attribute_const__ ffz(unsigned long word)
>  	int res;
>  
>  	__asm__ __volatile__ ("bfffo %1{#0,#0},%0"
> -			      : "=d" (res) : "d" (~word & -~word));
> +			      : "=d" (res) : "d" (ffs_val(~word)));

This should be ffz_val(word). But if you like fsb_mask(), maybe
fzb_mask()?

>  	return res ^ 31;
>  }
>  
> @@ -527,7 +527,7 @@ static inline __attribute_const__ int ffs(int x)
>  
>  	__asm__ ("bfffo %1{#0:#0},%0"
>  		: "=d" (cnt)
> -		: "dm" (x & -x));
> +		: "dm" (ffs_val(x)));
>  	return 32 - cnt;
>  }
>  
> diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> index 1eb356fdd8323..8934b8b1cf375 100644
> --- a/arch/mips/dec/ecc-berr.c
> +++ b/arch/mips/dec/ecc-berr.c
> @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
>  			/* Ack now, now we've rewritten (or not). */
>  			dec_ecc_be_ack();
>  
> -			if (syn && syn == (syn & -syn)) {
> +			if (syn && syn == ffs_val(syn)) {

It opencodes is_power_of_2().

>  				if (syn == 0x01) {
>  					fmt = KERN_ALERT "%s"
>  					      "%#04x -- %s bit error "
> diff --git a/arch/mips/include/asm/bitops.h b/arch/mips/include/asm/bitops.h
> index 42f88452c920f..6e71999618876 100644
> --- a/arch/mips/include/asm/bitops.h
> +++ b/arch/mips/include/asm/bitops.h
> @@ -395,7 +395,7 @@ static __always_inline __attribute_const__ unsigned long __fls(unsigned long wor
>   */
>  static __always_inline __attribute_const__ unsigned long __ffs(unsigned long word)
>  {
> -	return __fls(word & -word);
> +	return __fls(ffs_val(word));
>  }
>  
>  /*
> @@ -463,7 +463,7 @@ static inline __attribute_const__ int ffs(int word)
>  	if (!word)
>  		return 0;
>  
> -	return fls(word & -word);
> +	return fls(ffs_val(word));
>  }
>  
>  #include <asm-generic/bitops/ffz.h>
> diff --git a/arch/mips/pci/pci-malta.c b/arch/mips/pci/pci-malta.c
> index 2e35aeba45bc1..502f99f46eb75 100644
> --- a/arch/mips/pci/pci-malta.c
> +++ b/arch/mips/pci/pci-malta.c
> @@ -117,7 +117,7 @@ void __init mips_pcibios_init(void)
>  		mask = ~(start ^ end);
>  		/* We don't support remapping with a discontiguous mask.  */
>  		BUG_ON((start & GT_PCI_HD_MSK) != (map & GT_PCI_HD_MSK) &&
> -		       mask != ~((mask & -mask) - 1));
> +		       mask != ~(ffs_val(mask) - 1));
>  		gt64120_mem_resource.start = start;
>  		gt64120_mem_resource.end = end;
>  		gt64120_controller.mem_offset = (start & mask) - (map & mask);
> @@ -134,7 +134,7 @@ void __init mips_pcibios_init(void)
>  		mask = ~(start ^ end);
>  		/* We don't support remapping with a discontiguous mask.  */
>  		BUG_ON((start & GT_PCI_HD_MSK) != (map & GT_PCI_HD_MSK) &&
> -		       mask != ~((mask & -mask) - 1));
> +		       mask != ~(ffs_val(mask) - 1));
>  		gt64120_io_resource.start = map & mask;
>  		gt64120_io_resource.end = (map & mask) | ~mask;
>  		gt64120_controller.io_offset = 0;
> diff --git a/arch/powerpc/include/asm/bitops.h b/arch/powerpc/include/asm/bitops.h
> index 0d0470cd5ac31..571700bb41e46 100644
> --- a/arch/powerpc/include/asm/bitops.h
> +++ b/arch/powerpc/include/asm/bitops.h
> @@ -87,8 +87,8 @@ static __always_inline bool is_rlwinm_mask_valid(unsigned long x)
>  	if (!x)
>  		return false;
>  	if (x & 1)
> -		x = ~x;	// make the mask non-wrapping
> -	x += x & -x;	// adding the low set bit results in at most one bit set
> +		x = ~x;		// make the mask non-wrapping
> +	x += ffs_val(x);	// adding the low set bit results in at most one bit set
>  
>  	return !(x & (x - 1));
>  }
> diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
> index 06caf8bbbe2b7..00ae88b3369fd 100644
> --- a/arch/powerpc/kvm/e500_mmu_host.c
> +++ b/arch/powerpc/kvm/e500_mmu_host.c
> @@ -205,7 +205,7 @@ void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel,
>  
>  		local_irq_save(flags);
>  		while (tmp) {
> -			hw_tlb_indx = __ilog2_u64(tmp & -tmp);
> +			hw_tlb_indx = __ilog2_u64(ffs_val(tmp));

Is it the same as __ffs64(tmp)?

>  			mtspr(SPRN_MAS0,
>  			      MAS0_TLBSEL(1) |
>  			      MAS0_ESEL(to_htlb1_esel(hw_tlb_indx)));
> diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c
> index ac3ee19531d8a..2a8a7e007dbd2 100644
> --- a/arch/powerpc/lib/sstep.c
> +++ b/arch/powerpc/lib/sstep.c
> @@ -236,7 +236,7 @@ static nokprobe_inline unsigned long mlsd_8lsd_ea(unsigned int instr,
>  static nokprobe_inline unsigned long max_align(unsigned long x)
>  {
>  	x |= sizeof(unsigned long);
> -	return x & -x;		/* isolates rightmost bit */
> +	return ffs_val(x);		/* isolates rightmost bit */
>  }
>  
>  static nokprobe_inline unsigned long byterev_2(unsigned long x)
> diff --git a/arch/powerpc/xmon/ppc-dis.c b/arch/powerpc/xmon/ppc-dis.c
> index af105e1bc3fca..13b31964686ad 100644
> --- a/arch/powerpc/xmon/ppc-dis.c
> +++ b/arch/powerpc/xmon/ppc-dis.c
> @@ -9,6 +9,7 @@ This file is part of GDB, GAS, and the GNU binutils.
>  
>  #include <asm/cputable.h>
>  #include <asm/cpu_has_feature.h>
> +#include <linux/ffs_val.h>
>  #include "nonstdio.h"
>  #include "ansidecl.h"
>  #include "ppc.h"
> @@ -44,7 +45,7 @@ operand_value_powerpc (const struct powerpc_operand *operand,
>  	  unsigned long top = operand->bitm;
>  	  /* top & -top gives the rightmost 1 bit, so this
>  	     fills in any trailing zeros.  */
> -	  top |= (top & -top) - 1;
> +	  top |= ffs_val(top) - 1;

Yeah, to me here it would be more verbose if:
          
          top |= lsb_mask(top) - 1;

>  	  top &= ~(top >> 1);
>  	  value = (value ^ top) - top;
>  	}
> diff --git a/arch/powerpc/xmon/ppc-opc.c b/arch/powerpc/xmon/ppc-opc.c
> index de9b4236728c4..23d845439fa11 100644
> --- a/arch/powerpc/xmon/ppc-opc.c
> +++ b/arch/powerpc/xmon/ppc-opc.c
> @@ -1417,7 +1417,7 @@ insert_fxm (unsigned long insn,
>       one bit of the mask field is set.  */
>    if ((insn & (1 << 20)) != 0)
>      {
> -      if (value == 0 || (value & -value) != value)
> +      if (value == 0 || ffs_val(value) != value)

Again, opencodes is_power_of_2()

>  	{
>  	  *errmsg = _("invalid mask field");
>  	  value = 0;
> @@ -1430,7 +1430,7 @@ insert_fxm (unsigned long insn,
>       new form unless -mpower4 has been given, or -many and the two
>       operand form of mfcr was used.  */
>    else if (value > 0
> -	   && (value & -value) == value
> +	   && ffs_val(value) == value
>  	   && ((dialect & PPC_OPCODE_POWER4) != 0
>  	       || ((dialect & PPC_OPCODE_ANY) != 0
>  		   && (insn & (0x3ff << 1)) == 19 << 1)))
> @@ -1460,7 +1460,7 @@ extract_fxm (unsigned long insn,
>    if ((insn & (1 << 20)) != 0)
>      {
>        /* Exactly one bit of MASK should be set.  */
> -      if (mask == 0 || (mask & -mask) != mask)
> +      if (mask == 0 || ffs_val(mask) != mask)

And again.

--

OK, let me stop here.

Can you please instead of mechanical rework check each case
individually and find the best replacement to highlight the intention?

Can you also split this patch to a series according to subsystems, so
that corresponding maintainers will have better access to their changes?

Thanks,
Yury

>  	*invalid = 1;
>      }
>  
> diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
> index 5f10074665b0c..dadd8a5bd5428 100644
> --- a/arch/s390/include/asm/bitops.h
> +++ b/arch/s390/include/asm/bitops.h
> @@ -193,7 +193,7 @@ static __always_inline __flatten __attribute_const__ int ffs(int word)
>  {
>  	unsigned int val = (unsigned int)word;
>  
> -	return BITS_PER_LONG - __flogr(-val & val);
> +	return BITS_PER_LONG - __flogr(ffs_val(val));
>  }
>  
>  #else /* CONFIG_CC_HAS_BUILTIN_FFS */
> diff --git a/arch/xtensa/include/asm/bitops.h b/arch/xtensa/include/asm/bitops.h
> index f7390b6761e1b..44e0debde1b3d 100644
> --- a/arch/xtensa/include/asm/bitops.h
> +++ b/arch/xtensa/include/asm/bitops.h
> @@ -39,7 +39,7 @@ static inline unsigned long __cntlz (unsigned long x)
>  
>  static inline int __attribute_const__ ffz(unsigned long x)
>  {
> -	return 31 - __cntlz(~x & -~x);
> +	return 31 - __cntlz(ffs_val(~x));
>  }
>  
>  /*
> @@ -48,7 +48,7 @@ static inline int __attribute_const__ ffz(unsigned long x)
>  
>  static inline __attribute_const__ unsigned long __ffs(unsigned long x)
>  {
> -	return 31 - __cntlz(x & -x);
> +	return 31 - __cntlz(ffs_val(x));
>  }
>  
>  /*
> @@ -59,7 +59,7 @@ static inline __attribute_const__ unsigned long __ffs(unsigned long x)
>  
>  static inline __attribute_const__ int ffs(unsigned long x)
>  {
> -	return 32 - __cntlz(x & -x);
> +	return 32 - __cntlz(ffs_val(x));
>  }
>  
>  /*
> diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c
> index 44c07c4e0833a..7ab71b21581f5 100644
> --- a/arch/xtensa/kernel/traps.c
> +++ b/arch/xtensa/kernel/traps.c
> @@ -309,7 +309,7 @@ static void do_interrupt(struct pt_regs *regs)
>  			break;
>  
>  		/* clear lowest pending irq in the unhandled mask */
> -		unhandled ^= (int_at_level & -int_at_level);
> +		unhandled ^= ffs_val(int_at_level);
>  		do_IRQ(__ffs(int_at_level), regs);
>  	}
>  
> diff --git a/drivers/gpu/drm/gma500/psb_intel_sdvo.c b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
> index 553e7c7d9bb83..56d3fa0bd0334 100644
> --- a/drivers/gpu/drm/gma500/psb_intel_sdvo.c
> +++ b/drivers/gpu/drm/gma500/psb_intel_sdvo.c
> @@ -1226,7 +1226,7 @@ psb_intel_sdvo_multifunc_encoder(struct psb_intel_sdvo *psb_intel_sdvo)
>  {
>  	/* Is there more than one type of output? */
>  	int caps = psb_intel_sdvo->caps.output_flags & 0xf;
> -	return caps & -caps;
> +	return ffs_val(caps);
>  }
>  
>  static struct edid *
> diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> index c92088855450a..a0211b51d8043 100644
> --- a/drivers/iommu/dma-iommu.c
> +++ b/drivers/iommu/dma-iommu.c
> @@ -943,7 +943,7 @@ static struct page **__iommu_dma_alloc_noncontiguous(struct device *dev,
>  	    iommu_deferred_attach(dev, domain))
>  		return NULL;
>  
> -	min_size = alloc_sizes & -alloc_sizes;
> +	min_size = ffs_val(alloc_sizes);
>  	if (min_size < PAGE_SIZE) {
>  		min_size = PAGE_SIZE;
>  		alloc_sizes |= PAGE_SIZE;
> diff --git a/drivers/net/ethernet/netronome/nfp/bpf/jit.c b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
> index 3a02eef58cc6d..25e0905a61721 100644
> --- a/drivers/net/ethernet/netronome/nfp/bpf/jit.c
> +++ b/drivers/net/ethernet/netronome/nfp/bpf/jit.c
> @@ -1565,7 +1565,7 @@ static int wrp_div_imm(struct nfp_prog *nfp_prog, u8 dst, u64 imm)
>  	rvalue = reciprocal_value_adv(imm, 32);
>  	exp = rvalue.exp;
>  	if (rvalue.is_wide_m && !(imm & 1)) {
> -		pre_shift = fls(imm & -imm) - 1;
> +		pre_shift = fls(ffs_val(imm)) - 1;
>  		rvalue = reciprocal_value_adv(imm >> pre_shift, 32 - pre_shift);
>  	} else {
>  		pre_shift = 0;
> diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c
> index 5d123df0a866b..e132f222a026a 100644
> --- a/drivers/net/usb/cdc_ncm.c
> +++ b/drivers/net/usb/cdc_ncm.c
> @@ -668,7 +668,7 @@ static void cdc_ncm_fix_modulus(struct usbnet *dev)
>  	val = ctx->tx_ndp_modulus;
>  
>  	if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) ||
> -	    (val != ((-val) & val)) || (val >= ctx->tx_max)) {
> +	    (val != ffs_val(val)) || (val >= ctx->tx_max)) {
>  		dev_dbg(&dev->intf->dev, "Using default alignment: 4 bytes\n");
>  		ctx->tx_ndp_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE;
>  	}
> @@ -682,7 +682,7 @@ static void cdc_ncm_fix_modulus(struct usbnet *dev)
>  	val = ctx->tx_modulus;
>  
>  	if ((val < USB_CDC_NCM_NDP_ALIGN_MIN_SIZE) ||
> -	    (val != ((-val) & val)) || (val >= ctx->tx_max)) {
> +	    (val != ffs_val(val)) || (val >= ctx->tx_max)) {
>  		dev_dbg(&dev->intf->dev, "Using default transmit modulus: 4 bytes\n");
>  		ctx->tx_modulus = USB_CDC_NCM_NDP_ALIGN_MIN_SIZE;
>  	}
> diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h
> index 25e7b4b58dcf5..d56208be887de 100644
> --- a/include/asm-generic/div64.h
> +++ b/include/asm-generic/div64.h
> @@ -111,8 +111,8 @@
>  	}								\
>  									\
>  	/* Reduce m / p to help avoid overflow handling later. */	\
> -	___p /= (___m & -___m);						\
> -	___m /= (___m & -___m);						\
> +	___p /= ffs_val(___m);						\
> +	___m /= ffs_val(___m);						\
>  									\
>  	/*								\
>  	 * Perform (m_bias + m * n) / (1 << 64).			\
> diff --git a/include/linux/bitfield.h b/include/linux/bitfield.h
> index 126dc5b380afa..26fbb6cfc0719 100644
> --- a/include/linux/bitfield.h
> +++ b/include/linux/bitfield.h
> @@ -8,6 +8,7 @@
>  #define _LINUX_BITFIELD_H
>  
>  #include <linux/build_bug.h>
> +#include <linux/ffs_val.h>
>  #include <linux/typecheck.h>
>  #include <asm/byteorder.h>
>  
> @@ -202,7 +203,7 @@ static __always_inline u64 field_multiplier(u64 field)
>  {
>  	if ((field | (field - 1)) & ((field | (field - 1)) + 1))
>  		__bad_mask();
> -	return field & -field;
> +	return ffs_val(field);
>  }
>  static __always_inline u64 field_mask(u64 field)
>  {
> diff --git a/include/linux/log2.h b/include/linux/log2.h
> index 2eac3fc9303d6..7615e513885e9 100644
> --- a/include/linux/log2.h
> +++ b/include/linux/log2.h
> @@ -266,7 +266,7 @@ int __bits_per(unsigned long n)
>  static inline __attribute__((const))
>  unsigned int max_pow_of_two_factor(unsigned int n)
>  {
> -	return n & -n;
> +	return ffs_val(n);
>  }
>  
>  #endif /* _LINUX_LOG2_H */
> diff --git a/include/linux/min_heap.h b/include/linux/min_heap.h
> index 79ddc0adbf2bf..e914919470c88 100644
> --- a/include/linux/min_heap.h
> +++ b/include/linux/min_heap.h
> @@ -3,6 +3,7 @@
>  #define _LINUX_MIN_HEAP_H
>  
>  #include <linux/bug.h>
> +#include <linux/ffs_val.h>
>  #include <linux/string.h>
>  #include <linux/types.h>
>  
> @@ -257,7 +258,7 @@ static __always_inline
>  void __min_heap_sift_down_inline(min_heap_char *heap, size_t pos, size_t elem_size,
>  				 const struct min_heap_callbacks *func, void *args)
>  {
> -	const unsigned long lsbit = elem_size & -elem_size;
> +	const unsigned long lsbit = ffs_val(elem_size);
>  	void *data = heap->data;
>  	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
>  	/* pre-scale counters for performance */
> @@ -297,7 +298,7 @@ static __always_inline
>  void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx,
>  			       const struct min_heap_callbacks *func, void *args)
>  {
> -	const unsigned long lsbit = elem_size & -elem_size;
> +	const unsigned long lsbit = ffs_val(elem_size);
>  	void *data = heap->data;
>  	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
>  	/* pre-scale counters for performance */
> diff --git a/lib/math/gcd.c b/lib/math/gcd.c
> index 62efca6787aef..3706c84e68bc1 100644
> --- a/lib/math/gcd.c
> +++ b/lib/math/gcd.c
> @@ -23,12 +23,12 @@ static unsigned long binary_gcd(unsigned long a, unsigned long b)
>  
>  	b >>= __ffs(b);
>  	if (b == 1)
> -		return r & -r;
> +		return ffs_val(r);
>  
>  	for (;;) {
>  		a >>= __ffs(a);
>  		if (a == 1)
> -			return r & -r;
> +			return ffs_val(r);
>  		if (a == b)
>  			return a << __ffs(r);
>  
> diff --git a/lib/sort.c b/lib/sort.c
> index 52363995ccc5c..2489aaaa21a06 100644
> --- a/lib/sort.c
> +++ b/lib/sort.c
> @@ -12,6 +12,7 @@
>  
>  #include <linux/types.h>
>  #include <linux/export.h>
> +#include <linux/ffs_val.h>
>  #include <linux/sort.h>
>  
>  /**
> @@ -196,7 +197,7 @@ static void __sort_r(void *base, size_t num, size_t size,
>  {
>  	/* pre-scale counters for performance */
>  	size_t n = num * size, a = (num/2) * size;
> -	const unsigned int lsbit = size & -size;  /* Used to find parent */
> +	const unsigned int lsbit = ffs_val(size);  /* Used to find parent */
>  	size_t shift = 0;
>  
>  	if (!a)		/* num < 2 || size == 0 */
> diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
> index 5be9b8c919490..2b13998fdb0fe 100644
> --- a/net/bluetooth/mgmt.c
> +++ b/net/bluetooth/mgmt.c
> @@ -8664,7 +8664,7 @@ static bool requested_adv_flags_are_valid(struct hci_dev *hdev, u32 adv_flags)
>  	supported_flags = get_supported_adv_flags(hdev);
>  	phy_flags = adv_flags & MGMT_ADV_FLAG_SEC_MASK;
>  	if (adv_flags & ~supported_flags ||
> -	    ((phy_flags && (phy_flags ^ (phy_flags & -phy_flags)))))
> +	    ((phy_flags && (phy_flags ^ ffs_val(phy_flags)))))
>  		return false;
>  
>  	return true;
> diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c
> index 112fe46788b6f..f93f509cc3fbf 100644
> --- a/net/netfilter/nft_set_pipapo.c
> +++ b/net/netfilter/nft_set_pipapo.c
> @@ -370,7 +370,7 @@ int pipapo_refill(unsigned long *map, unsigned int len, unsigned int rules,
>  	for (k = 0; k < len; k++) {
>  		bitset = map[k];
>  		while (bitset) {
> -			unsigned long t = bitset & -bitset;
> +			unsigned long t = ffs_val(bitset);
>  			int r = __builtin_ctzl(bitset);
>  			int i = k * BITS_PER_LONG + r;
>  
> -- 
> 2.52.0

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Maciej W. Rozycki 1 month ago

On Fri, 9 Jan 2026, Yury Norov wrote:

> > diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
> > index ed06367ece574..2f3809c015b99 100644
> > --- a/arch/alpha/kernel/smp.c
> > +++ b/arch/alpha/kernel/smp.c
> > @@ -525,7 +525,7 @@ handle_ipi(struct pt_regs *regs)
> >  	  do {
> >  		unsigned long which;
> >  
> > -		which = ops & -ops;
> > +		which = ffs_val(ops);
> >  		ops &= ~which;
> >  		which = __ffs(which);
> 
> I guess, this should be:
> 
>         which = __ffs(ops);
>         ops &= ops - 1;

 I think it's the wrong operation order.

> > diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> > index 1eb356fdd8323..8934b8b1cf375 100644
> > --- a/arch/mips/dec/ecc-berr.c
> > +++ b/arch/mips/dec/ecc-berr.c
> > @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
> >  			/* Ack now, now we've rewritten (or not). */
> >  			dec_ecc_be_ack();
> >  
> > -			if (syn && syn == (syn & -syn)) {
> > +			if (syn && syn == ffs_val(syn)) {
> 
> It opencodes is_power_of_2().

 Correct and it also predates that helper's existence by ~4 years.  I'll 
be happy to see this converted as the intent will be obviously clearer.

  Maciej

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Kuan-Wei Chiu 1 month ago

On Sat, Jan 10, 2026 at 10:36:07AM +0000, Maciej W. Rozycki wrote:
> On Fri, 9 Jan 2026, Yury Norov wrote:
> 
> > > diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
> > > index ed06367ece574..2f3809c015b99 100644
> > > --- a/arch/alpha/kernel/smp.c
> > > +++ b/arch/alpha/kernel/smp.c
> > > @@ -525,7 +525,7 @@ handle_ipi(struct pt_regs *regs)
> > >  	  do {
> > >  		unsigned long which;
> > >  
> > > -		which = ops & -ops;
> > > +		which = ffs_val(ops);
> > >  		ops &= ~which;
> > >  		which = __ffs(which);
> > 
> > I guess, this should be:
> > 
> >         which = __ffs(ops);
> >         ops &= ops - 1;
> 
>  I think it's the wrong operation order.
> 
> > > diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> > > index 1eb356fdd8323..8934b8b1cf375 100644
> > > --- a/arch/mips/dec/ecc-berr.c
> > > +++ b/arch/mips/dec/ecc-berr.c
> > > @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
> > >  			/* Ack now, now we've rewritten (or not). */
> > >  			dec_ecc_be_ack();
> > >  
> > > -			if (syn && syn == (syn & -syn)) {
> > > +			if (syn && syn == ffs_val(syn)) {
> > 
> > It opencodes is_power_of_2().
> 
>  Correct and it also predates that helper's existence by ~4 years.  I'll 
> be happy to see this converted as the intent will be obviously clearer.
> 
>   Maciej
> 

Hi all,

(Apologies for the off-topic question regarding email delivery)

I noticed Maciej's reply only after seeing David's subsequent response.
For some reason, this specific email from Maciej never reached my gmail
inbox, nor did it appear in my spam folder. It seems gmail may have
silently dropped it.

I have retrieved this raw message from the lore.kernel.org web
interface to reply here.

Since there are other gmail users on this thread (e.g., David, whom I
assume received it successfully given the reply), I am wondering if
anyone else using gmail experienced similar issues with this message?

If anyone has insights on why this might be happening or how to adjust
settings to prevent this, please let me know. I want to ensure I don't
inadvertently miss any comments or appear unresponsive in the future.

Regards,
Kuan-Wei

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by David Laight 1 month ago

On Sun, 11 Jan 2026 00:42:56 +0800
Kuan-Wei Chiu <visitorckw@gmail.com> wrote:

...
> Since there are other gmail users on this thread (e.g., David, whom I
> assume received it successfully given the reply), I am wondering if
> anyone else using gmail experienced similar issues with this message?

Me 'David' ?
I'm actually subscribed to the lists on a different email provider.
I used to use gmail, but it bounces, spams and just plain discards
far too many messages to reliably receive lkml and friends.
I post from the gmail address to avoid leaking the other address to spammers.

	David

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Kuan-Wei Chiu 1 month ago

On Sat, Jan 10, 2026 at 10:23:02PM +0000, David Laight wrote:
> On Sun, 11 Jan 2026 00:42:56 +0800
> Kuan-Wei Chiu <visitorckw@gmail.com> wrote:
> 
> ...
> > Since there are other gmail users on this thread (e.g., David, whom I
> > assume received it successfully given the reply), I am wondering if
> > anyone else using gmail experienced similar issues with this message?
> 
> Me 'David' ?
> I'm actually subscribed to the lists on a different email provider.
> I used to use gmail, but it bounces, spams and just plain discards
> far too many messages to reliably receive lkml and friends.
> I post from the gmail address to avoid leaking the other address to spammers.
> 
Ah, I misunderstood. I thought you were receiving on gmail as well.

I actually just missed another reply from Maciej (again, nothing in
spam). It appears gmail has decided to silently drop messages from him
for some reason.

I'm a bit surprised to hear gmail described as generally unsuitable. I
have used this account for Linux and several other open source projects
for the past three years without issues until now.

Also, a quick check shows:

$ grep "gmail.com" MAINTAINERS | wc -l
534

Given that over 500 maintainers and reviewers use gmail, it implies it
is widely relied upon. I even recall Linus mentioning issues with gmail
spam filters regarding PRs in the past t (though I am not sure of the
exact association between linux-foundation.org and gmail), suggesting
that even at that level, gmail is part of the workflow.

TBH, if these emails were simply going to spam, it wouldn't be an issue
as I check that folder regularly. The problem here is that Maciej's
emails aren't even in spam.

That said, if direct emails continue to vanish like this, I may indeed
have to start looking for an alternative. (sigh...)

Regards,
Kuan-Wei

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Maciej W. Rozycki 1 month ago

On Sun, 11 Jan 2026, Kuan-Wei Chiu wrote:

> > > Since there are other gmail users on this thread (e.g., David, whom I
> > > assume received it successfully given the reply), I am wondering if
> > > anyone else using gmail experienced similar issues with this message?
> > 
> > Me 'David' ?
> > I'm actually subscribed to the lists on a different email provider.
> > I used to use gmail, but it bounces, spams and just plain discards
> > far too many messages to reliably receive lkml and friends.
> > I post from the gmail address to avoid leaking the other address to spammers.
> > 
> Ah, I misunderstood. I thought you were receiving on gmail as well.
> 
> I actually just missed another reply from Maciej (again, nothing in
> spam). It appears gmail has decided to silently drop messages from him
> for some reason.

 Not at all silent, GMail rejects any e-mail sent here and I get a bounce 
every time.  It has been like this for a while now.  I'm not going to do 
anything about it, my SMTP service complies with the relevant RFC AFAIK 
and GMail is the only recipient showing any issues.

 FWIW I chose to host my e-mail myself for the very reason to avoid any
odd limitations imposed by professional hosting services and have the 
choice to use my own tools to process e-mail.  I have no direct influence 
on other people's choices.  I'll appreciate your understanding.

  Maciej

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by David Laight 1 month ago

On Sat, 10 Jan 2026 10:36:07 +0000 (GMT)
"Maciej W. Rozycki" <macro@orcam.me.uk> wrote:

> On Fri, 9 Jan 2026, Yury Norov wrote:
> 
> > > diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c
> > > index ed06367ece574..2f3809c015b99 100644
> > > --- a/arch/alpha/kernel/smp.c
> > > +++ b/arch/alpha/kernel/smp.c
> > > @@ -525,7 +525,7 @@ handle_ipi(struct pt_regs *regs)
> > >  	  do {
> > >  		unsigned long which;
> > >  
> > > -		which = ops & -ops;
> > > +		which = ffs_val(ops);
> > >  		ops &= ~which;
> > >  		which = __ffs(which);  
> > 
> > I guess, this should be:
> > 
> >         which = __ffs(ops);
> >         ops &= ops - 1;  
> 
>  I think it's the wrong operation order.

If you are going to do the __ffs() do it first.
		which = __ffs(ops);
		ops &= ~(1u << which);

OTOH what follows is:
		switch (which) {
		case IPI_RESCHEDULE:
so changing to 'case 1u << IPI_RESCHEDULE: would save the ffs().
But the entire loop is pointless, three tests, the first being:
	if (op & (1u << IPI_RESCHEDULE))
		scheduler_ipi();
would be far less code.

But probably pointless churn - no one cares about alpha any more :-)

> 
> > > diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> > > index 1eb356fdd8323..8934b8b1cf375 100644
> > > --- a/arch/mips/dec/ecc-berr.c
> > > +++ b/arch/mips/dec/ecc-berr.c
> > > @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
> > >  			/* Ack now, now we've rewritten (or not). */
> > >  			dec_ecc_be_ack();
> > >  
> > > -			if (syn && syn == (syn & -syn)) {
> > > +			if (syn && syn == ffs_val(syn)) {  
> > 
> > It opencodes is_power_of_2().

Badly...

	David

> 
>  Correct and it also predates that helper's existence by ~4 years.  I'll 
> be happy to see this converted as the intent will be obviously clearer.
> 
>   Maciej
>

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Maciej W. Rozycki 1 month ago

On Sat, 10 Jan 2026, David Laight wrote:

> > > > diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> > > > index 1eb356fdd8323..8934b8b1cf375 100644
> > > > --- a/arch/mips/dec/ecc-berr.c
> > > > +++ b/arch/mips/dec/ecc-berr.c
> > > > @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
> > > >  			/* Ack now, now we've rewritten (or not). */
> > > >  			dec_ecc_be_ack();
> > > >  
> > > > -			if (syn && syn == (syn & -syn)) {
> > > > +			if (syn && syn == ffs_val(syn)) {  
> > > 
> > > It opencodes is_power_of_2().
> 
> Badly...

 It's 4 MIPS instructions and exactly the same machine code produced as 
compiler output from `is_power_of_2'.  If you know of a better alternative 
I'll be happy to get enlightened, however it's academic anyway, as it's 
the ECC memory error handler.  If you get here, then you have a bigger 
issue than a couple of extra instructions executed per kernel log entry 
produced.  If you indeed can get rid of them in the first place, that is.

  Maciej

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by David Laight 1 month ago

On Sun, 11 Jan 2026 03:15:22 +0000 (GMT)
"Maciej W. Rozycki" <macro@orcam.me.uk> wrote:

> On Sat, 10 Jan 2026, David Laight wrote:
> 
> > > > > diff --git a/arch/mips/dec/ecc-berr.c b/arch/mips/dec/ecc-berr.c
> > > > > index 1eb356fdd8323..8934b8b1cf375 100644
> > > > > --- a/arch/mips/dec/ecc-berr.c
> > > > > +++ b/arch/mips/dec/ecc-berr.c
> > > > > @@ -153,7 +153,7 @@ static int dec_ecc_be_backend(struct pt_regs *regs, int is_fixup, int invoker)
> > > > >  			/* Ack now, now we've rewritten (or not). */
> > > > >  			dec_ecc_be_ack();
> > > > >  
> > > > > -			if (syn && syn == (syn & -syn)) {
> > > > > +			if (syn && syn == ffs_val(syn)) {    
> > > > 
> > > > It opencodes is_power_of_2().  
> > 
> > Badly...  
> 
>  It's 4 MIPS instructions and exactly the same machine code produced as 
> compiler output from `is_power_of_2'.

The compiler must be detecting that x == (x & -x) is the same as
(x & (x - 1)) == 0.
Although for MIPS the former is negate, and, beq(x,y) - so 3 instructions.
On x86 it is negate, and, cmp, beq(zero flag) - one extra instruction.
(The 4th MIPS instruction will be beq(syn,0).)

So maybe s/Badly/In an unusual way/

> If you know of a better alternative

Doing is_power_of_2() with only one conditional branch would be a gain.
But I've not thought about it hard enough to find one.
I suspect there may not be one, but all these 'tricks' come from the 1970s
(or earlier) and don't allow for the behaviour of modern cpu with multiple
execution units and long pipelines.
(Although MIPS is likely to be relatively sane.)

	David

> I'll be happy to get enlightened, however it's academic anyway, as it's 
> the ECC memory error handler.  If you get here, then you have a bigger 
> issue than a couple of extra instructions executed per kernel log entry 
> produced.  If you indeed can get rid of them in the first place, that is.
> 
>   Maciej
>

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Maciej W. Rozycki 1 month ago

On Sun, 11 Jan 2026, David Laight wrote:

> >  It's 4 MIPS instructions and exactly the same machine code produced as 
> > compiler output from `is_power_of_2'.
> 
> The compiler must be detecting that x == (x & -x) is the same as
> (x & (x - 1)) == 0.

 There are 3 basic operations either way, so unless a CPU has an odd 
instruction that combines any, there's no way to reduce the instruction 
count.

> Although for MIPS the former is negate, and, beq(x,y) - so 3 instructions.
> On x86 it is negate, and, cmp, beq(zero flag) - one extra instruction.
> (The 4th MIPS instruction will be beq(syn,0).)

 This code only ever runs on R2000/R3000 and R4000/R4400 processors, on 
which the cost of an untaken branch is nil.  The cost of a taken branch on 
the former ones is nil too, with their 4-stage pipelines, and while there 
is a fixed penalty on the latter ones, the compiler should be able to 
arrange code here such that at most one branch is taken anyway.

> So maybe s/Badly/In an unusual way/

 What's unusual in this way of determining that exactly one bit is set in 
a word?  It's as good as any I suppose.

> > If you know of a better alternative
> 
> Doing is_power_of_2() with only one conditional branch would be a gain.
> But I've not thought about it hard enough to find one.

 Sure you can, either of:

	(x != 0) & (x == (x & -x))

and:

	(x != 0) & ((x & (x - 1)) == 0)

will do, but it's 5 instructions on MIPS, so no gain here.

 If you have a population count instruction such as CTPOP on EV67 Alpha, 
you can do it in two instructions, such as:

	ctpop	$16, $0
	cmpeq	$0, 1, $0

so there seems to be room for improvement here, but GCC stubbornly refuses 
to produce this instruction sequence for this code:

bool is_power_of_2(unsigned long n)
{
	return __builtin_popcountl(n) == 1;
}

apparently owing to no insn cost set for the POPCOUNT RTX in the backend 
causing the middle end to pick up some default.  Instead GCC comes up with 
what can be coded in C as:

bool is_power_of_2(unsigned long n)
{
	return n - 1 < (n ^ (n - 1));
}

which seems an interesting alternative that produces 3 instructions only 
across Alpha, MIPS and RISC-V targets.  It's 5 instructions on POWER and 
x86-64 vs 8 and 9 respectively with our current implementation.  There are 
no branches produced here, although an inline expansion will likely end 
with one.

 Overall it seems a worthwhile improvement even if still an instruction 
longer than necessary for EV67 Alpha (and also for POWER, which also has 
a population count operation, and which GCC does pick up in this case).

 FWIW on VAX, notorious for the lack of conditional set operations, it's 7 
instructions and 1 branch, down from 14 and 2 respectively.

> I suspect there may not be one, but all these 'tricks' come from the 1970s
> (or earlier) and don't allow for the behaviour of modern cpu with multiple
> execution units and long pipelines.

 As we can see you never know.  I've sent a code update proposal.

 The use of the population count operation can be considered separately, 
but we'd have to be careful here as the quality of code produced by the 
intrinsic varies, e.g. with pre-EV67 Alpha, RISC-V and VAX (!) GCC emits 
code equivalent to my proposal, but with MIPS or x86-64 it resorts to a 
libcall, so a guarding condition would be required.  So I'd leave it for 
another occasion.

 Thanks for sparking this discussion overall!

  Maciej

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by David Laight 1 month ago

On Sun, 11 Jan 2026 21:22:03 +0000 (GMT)
"Maciej W. Rozycki" <macro@orcam.me.uk> wrote:

> On Sun, 11 Jan 2026, David Laight wrote:
> 
> > >  It's 4 MIPS instructions and exactly the same machine code produced as 
> > > compiler output from `is_power_of_2'.  
> > 
> > The compiler must be detecting that x == (x & -x) is the same as
> > (x & (x - 1)) == 0.  
> 
>  There are 3 basic operations either way, so unless a CPU has an odd 
> instruction that combines any, there's no way to reduce the instruction 
> count.
> 
> > Although for MIPS the former is negate, and, beq(x,y) - so 3 instructions.
> > On x86 it is negate, and, cmp, beq(zero flag) - one extra instruction.
> > (The 4th MIPS instruction will be beq(syn,0).)  
> 
>  This code only ever runs on R2000/R3000 and R4000/R4400 processors, on 
> which the cost of an untaken branch is nil.  The cost of a taken branch on 
> the former ones is nil too, with their 4-stage pipelines, and while there 
> is a fixed penalty on the latter ones, the compiler should be able to 
> arrange code here such that at most one branch is taken anyway.
> 
> > So maybe s/Badly/In an unusual way/  
> 
>  What's unusual in this way of determining that exactly one bit is set in 
> a word?  It's as good as any I suppose.
> 
> > > If you know of a better alternative  
> > 
> > Doing is_power_of_2() with only one conditional branch would be a gain.
> > But I've not thought about it hard enough to find one.  
> 
>  Sure you can, either of:
> 
> 	(x != 0) & (x == (x & -x))
> 
> and:
> 
> 	(x != 0) & ((x & (x - 1)) == 0)
> 
> will do, but it's 5 instructions on MIPS, so no gain here.
> 
>  If you have a population count instruction such as CTPOP on EV67 Alpha, 
> you can do it in two instructions, such as:
> 
> 	ctpop	$16, $0
> 	cmpeq	$0, 1, $0
> 
> so there seems to be room for improvement here, but GCC stubbornly refuses 
> to produce this instruction sequence for this code:
> 
> bool is_power_of_2(unsigned long n)
> {
> 	return __builtin_popcountl(n) == 1;
> }
> 
> apparently owing to no insn cost set for the POPCOUNT RTX in the backend 
> causing the middle end to pick up some default.  Instead GCC comes up with 
> what can be coded in C as:
> 
> bool is_power_of_2(unsigned long n)
> {
> 	return n - 1 < (n ^ (n - 1));
> }

Not seen that one - and not thought of popcnt, far too new for me.
(I learnt asm for a pdp11 first...)

> which seems an interesting alternative that produces 3 instructions only 
> across Alpha, MIPS and RISC-V targets.  It's 5 instructions on POWER and 
> x86-64 vs 8 and 9 respectively with our current implementation.  There are 
> no branches produced here, although an inline expansion will likely end 
> with one.

I'm seeing:
is_power_of_2:
        leaq    -1(%rdi), %rax
        xorq    %rax, %rdi
        cmpq    %rdi, %rax
        setb    %al
        retq
on x86-64 - which is really 3 instructions.

While using the popcnt instruction would reduce it by one, it will only
be faster on amd zen cpu, on intel cpu popcnt has a latency of 3 and only
executes on p1 (according to Agner).

> 
>  Overall it seems a worthwhile improvement even if still an instruction 
> longer than necessary for EV67 Alpha (and also for POWER, which also has 
> a population count operation, and which GCC does pick up in this case).
> 
>  FWIW on VAX, notorious for the lack of conditional set operations, it's 7 
> instructions and 1 branch, down from 14 and 2 respectively.

I worked for ICL through the 1980s so missed out on VAX.
Picked up 8085, x86, sparc32 and m68k.

> > I suspect there may not be one, but all these 'tricks' come from the 1970s
> > (or earlier) and don't allow for the behaviour of modern cpu with multiple
> > execution units and long pipelines.  
> 
>  As we can see you never know.  I've sent a code update proposal.

I'd suggest moving is_power_of_2() to bits.h as:
#define is_power_of_2(n) ({ auto _n = n; _n - 1 < _n ^ (_n - 1); })
and add:
#define is_zero_or_power_of_2(n) ({ auto _n = n; _n & (_n - 1) != 0; })

> 
>  The use of the population count operation can be considered separately, 
> but we'd have to be careful here as the quality of code produced by the 
> intrinsic varies, e.g. with pre-EV67 Alpha, RISC-V and VAX (!) GCC emits 
> code equivalent to my proposal, but with MIPS or x86-64 it resorts to a 
> libcall, so a guarding condition would be required.  So I'd leave it for 
> another occasion.

I'd guess that many of those popcnt instructions aren't actually as fast
as the dec/xor pair.
But the libcall is just plain stupid.
I think that happens elsewhere - which is one reason __ffs() is x86 asm.

> 
>  Thanks for sparking this discussion overall!

Nice wet Sunday bikeshed...

	David

> 
>   Maciej
>

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Maciej W. Rozycki 4 weeks, 1 day ago

On Sun, 11 Jan 2026, David Laight wrote:

> >  If you have a population count instruction such as CTPOP on EV67 Alpha, 
> > you can do it in two instructions, such as:
> > 
> > 	ctpop	$16, $0
> > 	cmpeq	$0, 1, $0
> > 
> > so there seems to be room for improvement here, but GCC stubbornly refuses 
> > to produce this instruction sequence for this code:
> > 
> > bool is_power_of_2(unsigned long n)
> > {
> > 	return __builtin_popcountl(n) == 1;
> > }
> > 
> > apparently owing to no insn cost set for the POPCOUNT RTX in the backend 
> > causing the middle end to pick up some default.  Instead GCC comes up with 
> > what can be coded in C as:
> > 
> > bool is_power_of_2(unsigned long n)
> > {
> > 	return n - 1 < (n ^ (n - 1));
> > }
> 
> Not seen that one - and not thought of popcnt, far too new for me.
> (I learnt asm for a pdp11 first...)

 It does show up as an RTL operation in the compiler; it's a good way to 
discover stuff.

> > which seems an interesting alternative that produces 3 instructions only 
> > across Alpha, MIPS and RISC-V targets.  It's 5 instructions on POWER and 
> > x86-64 vs 8 and 9 respectively with our current implementation.  There are 
> > no branches produced here, although an inline expansion will likely end 
> > with one.
> 
> I'm seeing:
> is_power_of_2:
>         leaq    -1(%rdi), %rax
>         xorq    %rax, %rdi
>         cmpq    %rdi, %rax
>         setb    %al
>         retq
> on x86-64 - which is really 3 instructions.

 Ack, this is `unsigned long' vs `bool' return type.  I'm just too used to 
RISC psABIs.  FWIW I have this:

	leaq	-1(%rdi), %rax
	xorq	%rax, %rdi
	cmpq	%rdi, %rax
	setb	%al
	movzbl	%al, %eax
	ret

in the former case and I take it `bool' is 8-bit on x86.

> While using the popcnt instruction would reduce it by one, it will only
> be faster on amd zen cpu, on intel cpu popcnt has a latency of 3 and only
> executes on p1 (according to Agner).

 Ack.  I've lost track of x86 after ~Pentium Pro.

> > > I suspect there may not be one, but all these 'tricks' come from the 1970s
> > > (or earlier) and don't allow for the behaviour of modern cpu with multiple
> > > execution units and long pipelines.  
> > 
> >  As we can see you never know.  I've sent a code update proposal.
> 
> I'd suggest moving is_power_of_2() to bits.h as:
> #define is_power_of_2(n) ({ auto _n = n; _n - 1 < _n ^ (_n - 1); })
> and add:
> #define is_zero_or_power_of_2(n) ({ auto _n = n; _n & (_n - 1) != 0; })

 I'll leave it as an exercise for someone else.

> >  The use of the population count operation can be considered separately, 
> > but we'd have to be careful here as the quality of code produced by the 
> > intrinsic varies, e.g. with pre-EV67 Alpha, RISC-V and VAX (!) GCC emits 
> > code equivalent to my proposal, but with MIPS or x86-64 it resorts to a 
> > libcall, so a guarding condition would be required.  So I'd leave it for 
> > another occasion.
> 
> I'd guess that many of those popcnt instructions aren't actually as fast
> as the dec/xor pair.

 With EV67 Alpha and POWER being proper RISC architectures I'd expect the 
instruction to be implemented using a dedicated logic gate network.  After 
all the population count operation is just an n-operand addition of 1-bit 
inputs, so not a big deal.  The propagation delay is probably on the same 
order of magnitude as with a 2-operand n-bit adder, so the operation ought 
to fit in a single pipeline stage.

 I suppose it's just a matter of whether you want to dedicate a piece of
silicon just for this somewhat exotic operation, and in the old days the 
answer would be no for a general-purpose CPU owing to manufacturing cost 
and die size limitation, while nowadays with process shrinkage and power 
consumption reduction it may have become worthwhile.

> But the libcall is just plain stupid.

 It could just be that it's a recent addition as my compilation of GCC for 
x86-64 and MIPS turns out to be version 11 and 13 respectively, while I 
have version 15 for the remaining targets.  Indeed when I tried now GCC 14 
that I have at hand for another MIPS configuration the libcall is gone, so 
the optimisation must have been added in that version.

 Of course you still need a libcall for a plain `__builtin_popcountl' call 
where there's no suitable hardware instruction available.

  Maciej

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by David Laight 4 weeks, 1 day ago

On Mon, 12 Jan 2026 11:21:59 +0000 (GMT)
"Maciej W. Rozycki" <macro@orcam.me.uk> wrote:

> On Sun, 11 Jan 2026, David Laight wrote:
> 
> > >  If you have a population count instruction such as CTPOP on EV67 Alpha, 
> > > you can do it in two instructions, such as:
> > > 
> > > 	ctpop	$16, $0
> > > 	cmpeq	$0, 1, $0
> > > 
> > > so there seems to be room for improvement here, but GCC stubbornly refuses 
> > > to produce this instruction sequence for this code:
> > > 
> > > bool is_power_of_2(unsigned long n)
> > > {
> > > 	return __builtin_popcountl(n) == 1;
> > > }
> > > 
> > > apparently owing to no insn cost set for the POPCOUNT RTX in the backend 
> > > causing the middle end to pick up some default.  Instead GCC comes up with 
> > > what can be coded in C as:
> > > 
> > > bool is_power_of_2(unsigned long n)
> > > {
> > > 	return n - 1 < (n ^ (n - 1));
> > > }  
> > 
> > Not seen that one - and not thought of popcnt, far too new for me.
> > (I learnt asm for a pdp11 first...)  
> 
>  It does show up as an RTL operation in the compiler; it's a good way to 
> discover stuff.
> 
> > > which seems an interesting alternative that produces 3 instructions only 
> > > across Alpha, MIPS and RISC-V targets.  It's 5 instructions on POWER and 
> > > x86-64 vs 8 and 9 respectively with our current implementation.  There are 
> > > no branches produced here, although an inline expansion will likely end 
> > > with one.  
> > 
> > I'm seeing:
> > is_power_of_2:
> >         leaq    -1(%rdi), %rax
> >         xorq    %rax, %rdi
> >         cmpq    %rdi, %rax
> >         setb    %al
> >         retq
> > on x86-64 - which is really 3 instructions.  
> 
>  Ack, this is `unsigned long' vs `bool' return type.  I'm just too used to 
> RISC psABIs.  FWIW I have this:
> 
> 	leaq	-1(%rdi), %rax
> 	xorq	%rax, %rdi
> 	cmpq	%rdi, %rax
> 	setb	%al
> 	movzbl	%al, %eax
> 	ret
> 
> in the former case and I take it `bool' is 8-bit on x86.

That was _Bool on godbolt, it you return 'int' the extra 'zero extend'
is needed. SETcc was added for 386 (I still forget about it) and someone
assumed you wouldn't want the 'word' variant.
(It isn't as though they couldn't have allocated another row of the 0F
opcode to it.)
I sort of hate 'bool' types, something sane has to happen for 'a & b'
when the bit patterns are unexpected - and it ought to be well defined.
The traditional C where 'a > b' generates 0 or 1 is fine.

> > While using the popcnt instruction would reduce it by one, it will only
> > be faster on amd zen cpu, on intel cpu popcnt has a latency of 3 and only
> > executes on p1 (according to Agner).  
> 
>  Ack.  I've lost track of x86 after ~Pentium Pro.

Those were the days :-)
I do remember something from one of the Intel manuals of that period about
not intermixing data with code because of the possibility of the cpu
speculatively executing the data following a branch and if the data happened
to be a floating point trig function it had to wait for it to finish.
SLS isn't a modern thing.

> > > > I suspect there may not be one, but all these 'tricks' come from the 1970s
> > > > (or earlier) and don't allow for the behaviour of modern cpu with multiple
> > > > execution units and long pipelines.    
> > > 
> > >  As we can see you never know.  I've sent a code update proposal.  
> > 
> > I'd suggest moving is_power_of_2() to bits.h as:
> > #define is_power_of_2(n) ({ auto _n = n; _n - 1 < _n ^ (_n - 1); })
> > and add:
> > #define is_zero_or_power_of_2(n) ({ auto _n = n; _n & (_n - 1) != 0; })  
> 
>  I'll leave it as an exercise for someone else.

I might add it to the patchset I'm writing - mostly bits.h

> > >  The use of the population count operation can be considered separately, 
> > > but we'd have to be careful here as the quality of code produced by the 
> > > intrinsic varies, e.g. with pre-EV67 Alpha, RISC-V and VAX (!) GCC emits 
> > > code equivalent to my proposal, but with MIPS or x86-64 it resorts to a 
> > > libcall, so a guarding condition would be required.  So I'd leave it for 
> > > another occasion.  
> > 
> > I'd guess that many of those popcnt instructions aren't actually as fast
> > as the dec/xor pair.  
> 
>  With EV67 Alpha and POWER being proper RISC architectures I'd expect the 
> instruction to be implemented using a dedicated logic gate network.  After 
> all the population count operation is just an n-operand addition of 1-bit 
> inputs, so not a big deal.  The propagation delay is probably on the same 
> order of magnitude as with a 2-operand n-bit adder, so the operation ought 
> to fit in a single pipeline stage.

There are some tricks that can be done to reduce the propagation delay
of a 64bit add from ~64 (for the ripple carry) to nearer 16 (add each byte
with and without a carry input and then use a mux to select the correct
output). That may not work for popcnt.

Also I suspect that multiply isn't single clock, so multicycle instructions
have to be supported.
Throw silicon at multiply and you can reduce it to full-adders with a carry-
chain the length of the result, not sure you can do much better.

For my sins I re-implemented the Nios-II cpu a couple of years ago (because
Intel had deprecated it), the single 64bit add needed at the end of multiply
(done by fgpa DSP blocks) made it impossible to feed the product straight
back into the ALU for the next cycle.
My version is about the same size, 1 clock faster in a few places (like
predicted taken branches and the stall if a multiply result (and I think
memory reads) is needed in the next-but-one instruction.
FMax is a bit lower, but we were constrained by the PCIe interface to 67.5MHz
and it was fast enough.

	David

> 
>  I suppose it's just a matter of whether you want to dedicate a piece of
> silicon just for this somewhat exotic operation, and in the old days the 
> answer would be no for a general-purpose CPU owing to manufacturing cost 
> and die size limitation, while nowadays with process shrinkage and power 
> consumption reduction it may have become worthwhile.
> 
> > But the libcall is just plain stupid.  
> 
>  It could just be that it's a recent addition as my compilation of GCC for 
> x86-64 and MIPS turns out to be version 11 and 13 respectively, while I 
> have version 15 for the remaining targets.  Indeed when I tried now GCC 14 
> that I have at hand for another MIPS configuration the libcall is gone, so 
> the optimisation must have been added in that version.
> 
>  Of course you still need a libcall for a plain `__builtin_popcountl' call 
> where there's no suitable hardware instruction available.
> 
>   Maciej
>

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Petr Tesarik 1 month ago

On Fri, 9 Jan 2026 13:19:25 -0500
Yury Norov <ynorov@nvidia.com> wrote:

>[...]
> --
> 
> OK, let me stop here.
> 
> Can you please instead of mechanical rework check each case
> individually and find the best replacement to highlight the intention?
> 
> Can you also split this patch to a series according to subsystems, so
> that corresponding maintainers will have better access to their changes?

Well, then it makes more sense to start by identifying the intention in
every place and getting them fixed one by one. Whatever remains should
be converted to use the new macro.

In fact, I can do both in parallel, converting the places where the
result of ffs() is immediately used as a shift count.

Then no tree-wide change is necessary, as long as we can first get the
helper (under whichever name) into bitops.h. But it will be rejected
without an in-tree user. I somehow feel caught by Catch XXII.

Advice welcome. Non-ironically.

Petr T

Re: [RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded

Posted by Yury Norov 1 month ago

On Fri, Jan 09, 2026 at 08:32:50PM +0100, Petr Tesarik wrote:
> On Fri, 9 Jan 2026 13:19:25 -0500
> Yury Norov <ynorov@nvidia.com> wrote:
> 
> >[...]
> > --
> > 
> > OK, let me stop here.
> > 
> > Can you please instead of mechanical rework check each case
> > individually and find the best replacement to highlight the intention?
> > 
> > Can you also split this patch to a series according to subsystems, so
> > that corresponding maintainers will have better access to their changes?
> 
> Well, then it makes more sense to start by identifying the intention in
> every place and getting them fixed one by one. Whatever remains should
> be converted to use the new macro.
> 
> In fact, I can do both in parallel, converting the places where the
> result of ffs() is immediately used as a shift count.
> 
> Then no tree-wide change is necessary, as long as we can first get the
> helper (under whichever name) into bitops.h. But it will be rejected
> without an in-tree user. I somehow feel caught by Catch XXII.
> 
> Advice welcome. Non-ironically.

Build your series like this:

1. Switch all 'a & -a' chunks to the existing helpers where possible;
2. Introduce the new macro;
3. Convert the rest of the codebase to using the macro;

In cover-letter mention that patches from #1 may be taken individually
or together with the rest of the series at maintainers' discretion, and
#2 and 3 must be merged at once, for example with my branch.

Then depending on feedback, I'll take your series as is, or will take
only explicitly acked patches. In 2nd case, I will ask you to resend
non-acked part for better visibility, and then decide either take it
or drop.

Thanks,
Yury

[RFC PATCH 1/2] bits: introduce ffs_val()
[RFC PATCH 2/2] treewide, bits: use ffs_val() where it is open-coded