[PATCH v5 7/8] rust: percpu: Add pin-hole optimizations for numerics

Mitchell Levy posted 8 patches 5 hours ago
[PATCH v5 7/8] rust: percpu: Add pin-hole optimizations for numerics
Posted by Mitchell Levy 5 hours ago
The C implementations of `this_cpu_add`, `this_cpu_sub`, etc., are
optimized to save an instruction by avoiding having to compute
`this_cpu_ptr(&x)` for some per-CPU variable `x`. For example, rather
than

    u64 *x_ptr = this_cpu_ptr(&x);
    *x_ptr += 5;

the implementation of `this_cpu_add` exploits the fact that per-CPU
variables are implemented on x86 via segment registers, so a single
instruction suffices (assuming `&x` is already in `rax`):

    add gs:[rax], 5

Add this optimization via a `PerCpuNumeric` type to enable code-reuse
between `DynamicPerCpu` and `StaticPerCpu`.

Signed-off-by: Mitchell Levy <levymitchell0@gmail.com>
---
 rust/kernel/percpu.rs         |   1 +
 rust/kernel/percpu/dynamic.rs |  10 ++-
 rust/kernel/percpu/numeric.rs | 138 ++++++++++++++++++++++++++++++++++++++++++
 samples/rust/rust_percpu.rs   |  36 +++++++++++
 4 files changed, 184 insertions(+), 1 deletion(-)

diff --git a/rust/kernel/percpu.rs b/rust/kernel/percpu.rs
index 72c83fef68ee..ff04607ee047 100644
--- a/rust/kernel/percpu.rs
+++ b/rust/kernel/percpu.rs
@@ -6,6 +6,7 @@
 
 pub mod cpu_guard;
 mod dynamic;
+pub mod numeric;
 mod static_;
 
 #[doc(inline)]
diff --git a/rust/kernel/percpu/dynamic.rs b/rust/kernel/percpu/dynamic.rs
index 40514704b3d0..a717138b93dc 100644
--- a/rust/kernel/percpu/dynamic.rs
+++ b/rust/kernel/percpu/dynamic.rs
@@ -28,7 +28,7 @@
 /// the memory location on any particular CPU has been initialized. This means that it cannot tell
 /// whether it should drop the *contents* of the allocation when it is dropped. It is up to the
 /// user to do this via something like [`core::ptr::drop_in_place`].
-pub struct PerCpuAllocation<T>(PerCpuPtr<T>);
+pub struct PerCpuAllocation<T>(pub(super) PerCpuPtr<T>);
 
 impl<T: Zeroable> PerCpuAllocation<T> {
     /// Dynamically allocates a space in the per-CPU area suitably sized and aligned to hold a `T`,
@@ -162,6 +162,14 @@ pub fn new_from(mut initer: impl FnMut(CpuId) -> T, flags: Flags) -> Option<Self
     }
 }
 
+impl<T> DynamicPerCpu<T> {
+    /// Returns a reference to the shared (`Arc`) allocation backing this per-CPU variable.
+    pub(crate) fn alloc(&self) -> &Arc<PerCpuAllocation<T>> {
+        // SAFETY: This type's invariant ensures that `self.alloc` is `Some`.
+        unsafe { self.alloc.as_ref().unwrap_unchecked() }
+    }
+}
+
 impl<T> PerCpu<T> for DynamicPerCpu<T> {
     unsafe fn get_mut(&mut self, guard: CpuGuard) -> PerCpuToken<'_, T> {
         // SAFETY:
diff --git a/rust/kernel/percpu/numeric.rs b/rust/kernel/percpu/numeric.rs
new file mode 100644
index 000000000000..13b4ab4a794d
--- /dev/null
+++ b/rust/kernel/percpu/numeric.rs
@@ -0,0 +1,138 @@
+// SPDX-License-Identifier: GPL-2.0
+//! Peephole optimizations for [`PerCpu<T>`] where `T` is a numeric type.
+
+use super::*;
+use core::arch::asm;
+
+/// Borrowed handle to a per-CPU variable, updated in place by single `gs`-relative instructions.
+/// NOTE(review): the inline assembly in this module is x86-specific and is not `cfg`-gated.
+pub struct PerCpuNumeric<'a, T> {
+    // INVARIANT: `ptr.0` is a valid offset into the per-CPU area and is initialized on all CPUs
+    // (since we don't have a CPU guard, we have to be pessimistic and assume we could be on any
+    // CPU).
+    ptr: &'a PerCpuPtr<T>,
+}
+
+macro_rules! impl_ops {
+    ($ty:ty, $reg:tt) => { // `$reg`: operand-size modifier ("x", "e" or "r") applied to `{val}`.
+        impl DynamicPerCpu<$ty> {
+            /// Returns a [`PerCpuNumeric`] handle to the underlying per-CPU variable; `self` stays
+            /// mutably borrowed for as long as the handle is alive.
+            #[inline]
+            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
+                // INVARIANT: `DynamicPerCpu`'s invariant guarantees that this pointer is valid and
+                // initialized on all CPUs.
+                PerCpuNumeric { ptr: &self.alloc().0 }
+            }
+        }
+        impl StaticPerCpu<$ty> {
+            /// Returns a [`PerCpuNumeric`] handle to the underlying per-CPU variable; `self` stays
+            /// mutably borrowed for as long as the handle is alive.
+            #[inline]
+            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
+                // INVARIANT: `StaticPerCpu`'s invariant guarantees that this pointer is valid and
+                // initialized on all CPUs.
+                PerCpuNumeric { ptr: &self.0 }
+            }
+        }
+
+        impl PerCpuNumeric<'_, $ty> {
+            /// Adds `rhs` to the current CPU's copy of the variable, wrapping on overflow.
+            #[inline]
+            pub fn add(&mut self, rhs: $ty) {
+                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
+                // pointer relative to the `gs` segment register) by the invariants of this type.
+                unsafe {
+                    asm!(
+                        concat!("add gs:[{off}], {val:", $reg, "}"),
+                        off = in(reg) self.ptr.0.cast::<$ty>(),
+                        val = in(reg) rhs,
+                    );
+                }
+            }
+        }
+        impl PerCpuNumeric<'_, $ty> {
+            /// Subtracts `rhs` from the current CPU's copy of the variable, wrapping on overflow.
+            #[inline]
+            pub fn sub(&mut self, rhs: $ty) {
+                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
+                // pointer relative to the `gs` segment register) by the invariants of this type.
+                unsafe {
+                    asm!(
+                        concat!("sub gs:[{off}], {val:", $reg, "}"),
+                        off = in(reg) self.ptr.0.cast::<$ty>(),
+                        val = in(reg) rhs,
+                    );
+                }
+            }
+        }
+    };
+}
+
+macro_rules! impl_ops_byte {
+    ($ty:ty) => { // 8-bit types: `{val}` is allocated from `reg_byte`, with no width modifier.
+        impl DynamicPerCpu<$ty> {
+            /// Returns a [`PerCpuNumeric`] handle to the underlying per-CPU variable; `self` stays
+            /// mutably borrowed for as long as the handle is alive.
+            #[inline]
+            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
+                // INVARIANT: `DynamicPerCpu`'s invariant guarantees that this pointer is valid and
+                // initialized on all CPUs.
+                PerCpuNumeric { ptr: &self.alloc().0 }
+            }
+        }
+        impl StaticPerCpu<$ty> {
+            /// Returns a [`PerCpuNumeric`] handle to the underlying per-CPU variable; `self` stays
+            /// mutably borrowed for as long as the handle is alive.
+            #[inline]
+            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
+                // INVARIANT: `StaticPerCpu`'s invariant guarantees that this pointer is valid and
+                // initialized on all CPUs.
+                PerCpuNumeric { ptr: &self.0 }
+            }
+        }
+
+        impl PerCpuNumeric<'_, $ty> {
+            /// Adds `rhs` to the current CPU's copy of the variable, wrapping on overflow.
+            #[inline]
+            pub fn add(&mut self, rhs: $ty) {
+                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
+                // pointer relative to the `gs` segment register) by the invariants of this type.
+                unsafe {
+                    asm!(
+                        "add gs:[{off}], {val}",
+                        off = in(reg) self.ptr.0.cast::<$ty>(),
+                        val = in(reg_byte) rhs,
+                    );
+                }
+            }
+        }
+        impl PerCpuNumeric<'_, $ty> {
+            /// Subtracts `rhs` from the current CPU's copy of the variable, wrapping on overflow.
+            #[inline]
+            pub fn sub(&mut self, rhs: $ty) {
+                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
+                // pointer relative to the `gs` segment register) by the invariants of this type.
+                unsafe {
+                    asm!(
+                        "sub gs:[{off}], {val}",
+                        off = in(reg) self.ptr.0.cast::<$ty>(),
+                        val = in(reg_byte) rhs,
+                    );
+                }
+            }
+        }
+    };
+}
+
+impl_ops_byte!(i8);
+impl_ops!(i16, "x");
+impl_ops!(i32, "e");
+impl_ops!(i64, "r");
+impl_ops!(isize, "r");
+
+impl_ops_byte!(u8);
+impl_ops!(u16, "x");
+impl_ops!(u32, "e");
+impl_ops!(u64, "r");
+impl_ops!(usize, "r");
diff --git a/samples/rust/rust_percpu.rs b/samples/rust/rust_percpu.rs
index 5adb30509bd4..90f5debd3c7a 100644
--- a/samples/rust/rust_percpu.rs
+++ b/samples/rust/rust_percpu.rs
@@ -28,6 +28,26 @@
 define_per_cpu!(UPERCPU: u64 = 0);
 define_per_cpu!(CHECKED: RefCell<u64> = RefCell::new(0));
 
+macro_rules! make_optimization_test {
+    ($ty:ty) => {
+        let mut test: DynamicPerCpu<$ty> = DynamicPerCpu::new_zero(GFP_KERNEL).unwrap();
+        {
+            let _guard = CpuGuard::new(); // NOTE(review): presumably pins us to one CPU; confirm
+            // SAFETY: `test` is a local declared above and has no other users.
+            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| *val = 10);
+            test.num().add(1);
+            // SAFETY: `test` is a local declared above and has no other users.
+            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 11));
+            test.num().add(10);
+            // SAFETY: `test` is a local declared above and has no other users.
+            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 21));
+            test.num().sub(5);
+            // SAFETY: `test` is a local declared above and has no other users.
+            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 16));
+        }
+    };
+}
+
 impl kernel::Module for PerCpuMod {
     fn init(_module: &'static ThisModule) -> Result<Self, Error> {
         pr_info!("rust percpu test start\n");
@@ -228,6 +248,22 @@ fn init(_module: &'static ThisModule) -> Result<Self, Error> {
 
         pr_info!("rust dynamic percpu test done\n");
 
+        pr_info!("rust numeric optimizations test start\n");
+
+        make_optimization_test!(u8);
+        make_optimization_test!(u16);
+        make_optimization_test!(u32);
+        make_optimization_test!(u64);
+        make_optimization_test!(usize);
+
+        make_optimization_test!(i8);
+        make_optimization_test!(i16);
+        make_optimization_test!(i32);
+        make_optimization_test!(i64);
+        make_optimization_test!(isize);
+
+        pr_info!("rust numeric optimizations test done\n");
+
         // Return Err to unload the module
         Result::Err(EINVAL)
     }

-- 
2.34.1
Re: [PATCH v5 7/8] rust: percpu: Add pin-hole optimizations for numerics
Posted by Yury Norov 17 minutes ago
On Fri, Apr 10, 2026 at 02:35:37PM -0700, Mitchell Levy wrote:
> The C implementations of `this_cpu_add`, `this_cpu_sub`, etc., are
> optimized to save an instruction by avoiding having to compute
> `this_cpu_ptr(&x)` for some per-CPU variable `x`. For example, rather
> than
> 
>     u64 *x_ptr = this_cpu_ptr(&x);
>     *x_ptr += 5;
> 
> the implementation of `this_cpu_add` is clever enough to make use of the
> fact that per-CPU variables are implemented on x86 via segment
> registers, and so we can use only a single instruction (where we assume
> `&x` is already in `rax`)
> 
>     add gs:[rax], 5
> 
> Add this optimization via a `PerCpuNumeric` type to enable code-reuse
> between `DynamicPerCpu` and `StaticPerCpu`.
> 
> Signed-off-by: Mitchell Levy <levymitchell0@gmail.com>
> ---
>  rust/kernel/percpu.rs         |   1 +
>  rust/kernel/percpu/dynamic.rs |  10 ++-
>  rust/kernel/percpu/numeric.rs | 138 ++++++++++++++++++++++++++++++++++++++++++
>  samples/rust/rust_percpu.rs   |  36 +++++++++++
>  4 files changed, 184 insertions(+), 1 deletion(-)
> 
> diff --git a/rust/kernel/percpu.rs b/rust/kernel/percpu.rs
> index 72c83fef68ee..ff04607ee047 100644
> --- a/rust/kernel/percpu.rs
> +++ b/rust/kernel/percpu.rs
> @@ -6,6 +6,7 @@
>  
>  pub mod cpu_guard;
>  mod dynamic;
> +pub mod numeric;
>  mod static_;
>  
>  #[doc(inline)]
> diff --git a/rust/kernel/percpu/dynamic.rs b/rust/kernel/percpu/dynamic.rs
> index 40514704b3d0..a717138b93dc 100644
> --- a/rust/kernel/percpu/dynamic.rs
> +++ b/rust/kernel/percpu/dynamic.rs
> @@ -28,7 +28,7 @@
>  /// the memory location on any particular CPU has been initialized. This means that it cannot tell
>  /// whether it should drop the *contents* of the allocation when it is dropped. It is up to the
>  /// user to do this via something like [`core::ptr::drop_in_place`].
> -pub struct PerCpuAllocation<T>(PerCpuPtr<T>);
> +pub struct PerCpuAllocation<T>(pub(super) PerCpuPtr<T>);
>  
>  impl<T: Zeroable> PerCpuAllocation<T> {
>      /// Dynamically allocates a space in the per-CPU area suitably sized and aligned to hold a `T`,
> @@ -162,6 +162,14 @@ pub fn new_from(mut initer: impl FnMut(CpuId) -> T, flags: Flags) -> Option<Self
>      }
>  }
>  
> +impl<T> DynamicPerCpu<T> {
> +    /// Gets the allocation backing this per-CPU variable.
> +    pub(crate) fn alloc(&self) -> &Arc<PerCpuAllocation<T>> {
> +        // SAFETY: This type's invariant ensures that `self.alloc` is `Some`.
> +        unsafe { self.alloc.as_ref().unwrap_unchecked() }
> +    }
> +}
> +
>  impl<T> PerCpu<T> for DynamicPerCpu<T> {
>      unsafe fn get_mut(&mut self, guard: CpuGuard) -> PerCpuToken<'_, T> {
>          // SAFETY:
> diff --git a/rust/kernel/percpu/numeric.rs b/rust/kernel/percpu/numeric.rs
> new file mode 100644
> index 000000000000..13b4ab4a794d
> --- /dev/null
> +++ b/rust/kernel/percpu/numeric.rs
> @@ -0,0 +1,138 @@
> +// SPDX-License-Identifier: GPL-2.0
> +//! Pin-hole optimizations for [`PerCpu<T>`] where T is a numeric type.
> +
> +use super::*;
> +use core::arch::asm;
> +
> +/// Represents a per-CPU variable that can be manipulated with machine-intrinsic numeric
> +/// operations.
> +pub struct PerCpuNumeric<'a, T> {
> +    // INVARIANT: `ptr.0` is a valid offset into the per-CPU area and is initialized on all CPUs
> +    // (since we don't have a CPU guard, we have to be pessimistic and assume we could be on any
> +    // CPU).
> +    ptr: &'a PerCpuPtr<T>,
> +}
> +
> +macro_rules! impl_ops {
> +    ($ty:ty, $reg:tt) => {
> +        impl DynamicPerCpu<$ty> {
> +            /// Returns a [`PerCpuNumeric`] that can be used to manipulate the underlying per-CPU
> +            /// variable.
> +            #[inline]
> +            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
> +                // The invariant is satisfied because `DynamicPerCpu`'s invariant guarantees that
> +                // this pointer is valid and initialized on all CPUs.
> +                PerCpuNumeric { ptr: &self.alloc().0 }
> +            }
> +        }
> +        impl StaticPerCpu<$ty> {
> +            /// Returns a [`PerCpuNumeric`] that can be used to manipulate the underlying per-CPU
> +            /// variable.
> +            #[inline]
> +            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
> +                // The invariant is satisfied because `StaticPerCpu`'s invariant guarantees that
> +                // this pointer is valid and initialized on all CPUs.
> +                PerCpuNumeric { ptr: &self.0 }
> +            }
> +        }
> +
> +        impl PerCpuNumeric<'_, $ty> {
> +            /// Adds `rhs` to the per-CPU variable.
> +            #[inline]
> +            pub fn add(&mut self, rhs: $ty) {
> +                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
> +                // pointer relative to the `gs` segment register) by the invariants of this type.
> +                unsafe {
> +                    asm!(
> +                        concat!("add gs:[{off}], {val:", $reg, "}"),
> +                        off = in(reg) self.ptr.0.cast::<$ty>(),
> +                        val = in(reg) rhs,

So every user of .add() will now compile only on x86_64? I don't think
that's right. Could you do this in a more convenient way: implement a
generic version first, and then an x86_64-optimized one?

How bad does the generic version look on x86_64 compared to the
optimized one?

Thanks,
Yury

> +                    );
> +                }
> +            }
> +        }
> +        impl PerCpuNumeric<'_, $ty> {
> +            /// Subtracts `rhs` from the per-CPU variable.
> +            #[inline]
> +            pub fn sub(&mut self, rhs: $ty) {
> +                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
> +                // pointer relative to the `gs` segment register) by the invariants of this type.
> +                unsafe {
> +                    asm!(
> +                        concat!("sub gs:[{off}], {val:", $reg, "}"),
> +                        off = in(reg) self.ptr.0.cast::<$ty>(),
> +                        val = in(reg) rhs,
> +                    );
> +                }
> +            }
> +        }
> +    };
> +}
> +
> +macro_rules! impl_ops_byte {
> +    ($ty:ty) => {
> +        impl DynamicPerCpu<$ty> {
> +            /// Returns a [`PerCpuNumeric`] that can be used to manipulate the underlying per-CPU
> +            /// variable.
> +            #[inline]
> +            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
> +                // The invariant is satisfied because `DynamicPerCpu`'s invariant guarantees that
> +                // this pointer is valid and initialized on all CPUs.
> +                PerCpuNumeric { ptr: &self.alloc().0 }
> +            }
> +        }
> +        impl StaticPerCpu<$ty> {
> +            /// Returns a [`PerCpuNumeric`] that can be used to manipulate the underlying per-CPU
> +            /// variable.
> +            #[inline]
> +            pub fn num(&mut self) -> PerCpuNumeric<'_, $ty> {
> +                // The invariant is satisfied because `StaticPerCpu`'s invariant guarantees that
> +                // this pointer is valid and initialized on all CPUs.
> +                PerCpuNumeric { ptr: &self.0 }
> +            }
> +        }
> +
> +        impl PerCpuNumeric<'_, $ty> {
> +            /// Adds `rhs` to the per-CPU variable.
> +            #[inline]
> +            pub fn add(&mut self, rhs: $ty) {
> +                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
> +                // pointer relative to the `gs` segment register) by the invariants of this type.
> +                unsafe {
> +                    asm!(
> +                        "add gs:[{off}], {val}",
> +                        off = in(reg) self.ptr.0.cast::<$ty>(),
> +                        val = in(reg_byte) rhs,
> +                    );
> +                }
> +            }
> +        }
> +        impl PerCpuNumeric<'_, $ty> {
> +            /// Subtracts `rhs` from the per-CPU variable.
> +            #[inline]
> +            pub fn sub(&mut self, rhs: $ty) {
> +                // SAFETY: `self.ptr.0` is a valid offset into the per-CPU area (i.e., valid as a
> +                // pointer relative to the `gs` segment register) by the invariants of this type.
> +                unsafe {
> +                    asm!(
> +                        "sub gs:[{off}], {val}",
> +                        off = in(reg) self.ptr.0.cast::<$ty>(),
> +                        val = in(reg_byte) rhs,
> +                    );
> +                }
> +            }
> +        }
> +    };
> +}
> +
> +impl_ops_byte!(i8);
> +impl_ops!(i16, "x");
> +impl_ops!(i32, "e");
> +impl_ops!(i64, "r");
> +impl_ops!(isize, "r");
> +
> +impl_ops_byte!(u8);
> +impl_ops!(u16, "x");
> +impl_ops!(u32, "e");
> +impl_ops!(u64, "r");
> +impl_ops!(usize, "r");
> diff --git a/samples/rust/rust_percpu.rs b/samples/rust/rust_percpu.rs
> index 5adb30509bd4..90f5debd3c7a 100644
> --- a/samples/rust/rust_percpu.rs
> +++ b/samples/rust/rust_percpu.rs
> @@ -28,6 +28,26 @@
>  define_per_cpu!(UPERCPU: u64 = 0);
>  define_per_cpu!(CHECKED: RefCell<u64> = RefCell::new(0));
>  
> +macro_rules! make_optimization_test {
> +    ($ty:ty) => {
> +        let mut test: DynamicPerCpu<$ty> = DynamicPerCpu::new_zero(GFP_KERNEL).unwrap();
> +        {
> +            let _guard = CpuGuard::new();
> +            // SAFETY: No other usage of `test`
> +            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| *val = 10);
> +            test.num().add(1);
> +            // SAFETY: No other usage of `test`
> +            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 11));
> +            test.num().add(10);
> +            // SAFETY: No other usage of `test`
> +            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 21));
> +            test.num().sub(5);
> +            // SAFETY: No other usage of `test`
> +            unsafe { test.get_mut(CpuGuard::new()) }.with(|val: &mut $ty| assert_eq!(*val, 16));
> +        }
> +    };
> +}
> +
>  impl kernel::Module for PerCpuMod {
>      fn init(_module: &'static ThisModule) -> Result<Self, Error> {
>          pr_info!("rust percpu test start\n");
> @@ -228,6 +248,22 @@ fn init(_module: &'static ThisModule) -> Result<Self, Error> {
>  
>          pr_info!("rust dynamic percpu test done\n");
>  
> +        pr_info!("rust numeric optimizations test start\n");
> +
> +        make_optimization_test!(u8);
> +        make_optimization_test!(u16);
> +        make_optimization_test!(u32);
> +        make_optimization_test!(u64);
> +        make_optimization_test!(usize);
> +
> +        make_optimization_test!(i8);
> +        make_optimization_test!(i16);
> +        make_optimization_test!(i32);
> +        make_optimization_test!(i64);
> +        make_optimization_test!(isize);
> +
> +        pr_info!("rust numeric optimizations test done\n");
> +
>          // Return Err to unload the module
>          Result::Err(EINVAL)
>      }
> 
> -- 
> 2.34.1