[PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()

K Prateek Nayak posted 2 patches 3 weeks, 5 days ago
There is a newer version of this series
[PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by K Prateek Nayak 3 weeks, 5 days ago
cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
writer(s), however amd-pstate depends on fetching the cpudata via the
policy's driver data which necessitates grabbing the reference.

Since schedutil governor can call "cpufreq_driver->update_perf()"
during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
fetching the policy object using the cpufreq_cpu_get() helper in the
scheduler fast-path leads to "BUG: scheduling while atomic" on
PREEMPT_RT [1].

Pass the cached cpufreq policy object in sg_policy to the update_perf()
instead of just the CPU. The CPU can be inferred using "policy->cpu".

The lifetime of cpufreq_policy object outlasts that of the governor and
the cpufreq driver (allocated when the CPU is onlined and only reclaimed
when the CPU is offlined / the CPU device is removed) which makes it
safe to be referenced throughout the governor's lifetime.

Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Changelog rfc v1..v2:

o Updated the kdoc comment above cpufreq_driver_adjust_perf() to
  reflect that cpufreq_policy is passed as an argument now instead of
  the Target CPU. (kernel test robot)

o Added "Reported-by:" and "Closes:" tags. (Mario)

o Collected tags from v1. (Thank you Viresh)
---
 drivers/cpufreq/amd-pstate.c     | 3 +--
 drivers/cpufreq/cpufreq.c        | 6 +++---
 drivers/cpufreq/intel_pstate.c   | 4 ++--
 include/linux/cpufreq.h          | 4 ++--
 kernel/sched/cpufreq_schedutil.c | 5 +++--
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 5818a92d96b9..455e58a9b738 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -697,13 +697,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy,
 	return policy->cur;
 }
 
-static void amd_pstate_adjust_perf(unsigned int cpu,
+static void amd_pstate_adjust_perf(struct cpufreq_policy *policy,
 				   unsigned long _min_perf,
 				   unsigned long target_perf,
 				   unsigned long capacity)
 {
 	u8 max_perf, min_perf, des_perf, cap_perf;
-	struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
 	struct amd_cpudata *cpudata;
 	union perf_cached perf;
 
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 50dde2980f1b..28b82c2eb7b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2225,7 +2225,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
 
 /**
  * cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- * @cpu: Target CPU.
+ * @policy: cpufreq policy of the target CPU.
  * @min_perf: Minimum (required) performance level (units of @capacity).
  * @target_perf: Target (desired) performance level (units of @capacity).
  * @capacity: Capacity of the target CPU.
@@ -2244,12 +2244,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
  * parallel with either ->target() or ->target_index() or ->fast_switch() for
  * the same CPU.
  */
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
 				 unsigned long min_perf,
 				 unsigned long target_perf,
 				 unsigned long capacity)
 {
-	cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity);
+	cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity);
 }
 
 /**
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ec4abe374573..8d25f0f2925c 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3237,12 +3237,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
 	return target_pstate * cpu->pstate.scaling;
 }
 
-static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy,
 				      unsigned long min_perf,
 				      unsigned long target_perf,
 				      unsigned long capacity)
 {
-	struct cpudata *cpu = all_cpu_data[cpunum];
+	struct cpudata *cpu = all_cpu_data[policy->cpu];
 	u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
 	int old_pstate = cpu->pstate.current_pstate;
 	int cap_pstate, min_pstate, max_pstate, target_pstate;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 0465d1e6f72a..fd26b3a4aa28 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -367,7 +367,7 @@ struct cpufreq_driver {
 	 * conditions) scale invariance can be disabled, which causes the
 	 * schedutil governor to fall back to the latter.
 	 */
-	void		(*adjust_perf)(unsigned int cpu,
+	void		(*adjust_perf)(struct cpufreq_policy *policy,
 				       unsigned long min_perf,
 				       unsigned long target_perf,
 				       unsigned long capacity);
@@ -612,7 +612,7 @@ struct cpufreq_governor {
 /* Pass a target to the cpufreq driver */
 unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
 					unsigned int target_freq);
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
 				unsigned long min_perf,
 				unsigned long target_perf,
 				unsigned long capacity);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0ab5f9d4bc59..307f3076635e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 				     unsigned int flags)
 {
 	struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+	struct sugov_policy *sg_policy = sg_cpu->sg_policy;
 	unsigned long prev_util = sg_cpu->util;
 	unsigned long max_cap;
 
@@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
 	if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
 		sg_cpu->util = prev_util;
 
-	cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+	cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min,
 				   sg_cpu->util, max_cap);
 
-	sg_cpu->sg_policy->last_freq_update_time = time;
+	sg_policy->last_freq_update_time = time;
 }
 
 static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
-- 
2.34.1
Re: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by Viresh Kumar 3 weeks, 4 days ago
On 14-01-26, 08:51, K Prateek Nayak wrote:
> cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
> writer(s), however amd-pstate depends on fetching the cpudata via the
> policy's driver data which necessitates grabbing the reference.
> 
> Since schedutil governor can call "cpufreq_driver->update_perf()"
> during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
> fetching the policy object using the cpufreq_cpu_get() helper in the
> scheduler fast-path leads to "BUG: scheduling while atomic" on
> PREEMPT_RT [1].
> 
> Pass the cached cpufreq policy object in sg_policy to the update_perf()
> instead of just the CPU. The CPU can be inferred using "policy->cpu".
> 
> The lifetime of cpufreq_policy object outlasts that of the governor and
> the cpufreq driver (allocated when the CPU is onlined and only reclaimed
> when the CPU is offlined / the CPU device is removed) which makes it
> safe to be referenced throughout the governor's lifetime.
> 
> Reported-by: Bert Karwatzki <spasswolf@web.de>
> Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]
> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>

Ahh, you need to fix the Rust binding as well. Minor change there.

-- 
viresh
Re: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by K Prateek Nayak 3 weeks, 3 days ago
Hello Viresh,

On 1/15/2026 9:44 AM, Viresh Kumar wrote:
> On 14-01-26, 08:51, K Prateek Nayak wrote:
>> cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
>> writer(s), however amd-pstate depends on fetching the cpudata via the
>> policy's driver data which necessitates grabbing the reference.
>>
>> Since schedutil governor can call "cpufreq_driver->update_perf()"
>> during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
>> fetching the policy object using the cpufreq_cpu_get() helper in the
>> scheduler fast-path leads to "BUG: scheduling while atomic" on
>> PREEMPT_RT [1].
>>
>> Pass the cached cpufreq policy object in sg_policy to the update_perf()
>> instead of just the CPU. The CPU can be inferred using "policy->cpu".
>>
>> The lifetime of cpufreq_policy object outlasts that of the governor and
>> the cpufreq driver (allocated when the CPU is onlined and only reclaimed
>> when the CPU is offlined / the CPU device is removed) which makes it
>> safe to be referenced throughout the governor's lifetime.
>>
>> Reported-by: Bert Karwatzki <spasswolf@web.de>
>> Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]
>> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
> 
> Ahh, you need to fix the Rust binding as well. Minor change there.

Guilty of not having CONFIG_RUST enabled! I've arrived at the
following after tasking a look at cpufreq.rs (specifically the
fast_switch_callback implementation):

  (Only build tested currently)

diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
index f968fbd22890..bea109f8ff78 100644
--- a/rust/kernel/cpufreq.rs
+++ b/rust/kernel/cpufreq.rs
@@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
     /// # Safety
     ///
     /// - This function may only be called from the cpufreq C infrastructure.
+    /// - The pointer arguments must be valid pointers.
     unsafe extern "C" fn adjust_perf_callback(
-        cpu: c_uint,
+        ptr: *mut bindings::cpufreq_policy,
         min_perf: c_ulong,
         target_perf: c_ulong,
         capacity: c_ulong,
     ) {
-        // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
-        let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
-
-        if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
-            T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
-        }
+        // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
+        // lifetime of `policy`.
+        let policy = unsafe { Policy::from_raw_mut(ptr) };
+        T::adjust_perf(policy, min_perf, target_perf, capacity);
     }
 
     /// Driver's `get_intermediate` callback.
---

Let me know what you think. If you think this is alright, I'll fold it
into this same patch in the next version to preserve bisectiblity with
CONFIG_RUST (or should I keep it separate?).

Since this is my first time dealing with the rust bindings, comments are
highly appreciated.

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by Viresh Kumar 3 weeks, 3 days ago
On 16-01-26, 10:08, K Prateek Nayak wrote:
> Guilty of not having CONFIG_RUST enabled! I've arrived at the
> following after tasking a look at cpufreq.rs (specifically the
> fast_switch_callback implementation):
> 
>   (Only build tested currently)
> 
> diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
> index f968fbd22890..bea109f8ff78 100644
> --- a/rust/kernel/cpufreq.rs
> +++ b/rust/kernel/cpufreq.rs
> @@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
>      /// # Safety
>      ///
>      /// - This function may only be called from the cpufreq C infrastructure.
> +    /// - The pointer arguments must be valid pointers.
>      unsafe extern "C" fn adjust_perf_callback(
> -        cpu: c_uint,
> +        ptr: *mut bindings::cpufreq_policy,
>          min_perf: c_ulong,
>          target_perf: c_ulong,
>          capacity: c_ulong,
>      ) {
> -        // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
> -        let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
> -
> -        if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
> -            T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
> -        }
> +        // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
> +        // lifetime of `policy`.
> +        let policy = unsafe { Policy::from_raw_mut(ptr) };
> +        T::adjust_perf(policy, min_perf, target_perf, capacity);
>      }
>  
>      /// Driver's `get_intermediate` callback.
> ---
> 
> Let me know what you think. If you think this is alright, I'll fold it
> into this same patch in the next version to preserve bisectiblity with
> CONFIG_RUST (or should I keep it separate?).
> 
> Since this is my first time dealing with the rust bindings, comments are
> highly appreciated.

Yeah looks fine. Just make sure it builds fine.

-- 
viresh
Re: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by K Prateek Nayak 3 weeks, 3 days ago
Hello Viresh,

On 1/16/2026 10:14 AM, Viresh Kumar wrote:
> On 16-01-26, 10:08, K Prateek Nayak wrote:
>> Guilty of not having CONFIG_RUST enabled! I've arrived at the
>> following after tasking a look at cpufreq.rs (specifically the
>> fast_switch_callback implementation):
>>
>>   (Only build tested currently)
>>
>> diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
>> index f968fbd22890..bea109f8ff78 100644
>> --- a/rust/kernel/cpufreq.rs
>> +++ b/rust/kernel/cpufreq.rs
>> @@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
>>      /// # Safety
>>      ///
>>      /// - This function may only be called from the cpufreq C infrastructure.
>> +    /// - The pointer arguments must be valid pointers.
>>      unsafe extern "C" fn adjust_perf_callback(
>> -        cpu: c_uint,
>> +        ptr: *mut bindings::cpufreq_policy,
>>          min_perf: c_ulong,
>>          target_perf: c_ulong,
>>          capacity: c_ulong,
>>      ) {
>> -        // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
>> -        let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
>> -
>> -        if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
>> -            T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
>> -        }
>> +        // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
>> +        // lifetime of `policy`.
>> +        let policy = unsafe { Policy::from_raw_mut(ptr) };
>> +        T::adjust_perf(policy, min_perf, target_perf, capacity);
>>      }
>>  
>>      /// Driver's `get_intermediate` callback.
>> ---
>>
>> Let me know what you think. If you think this is alright, I'll fold it
>> into this same patch in the next version to preserve bisectiblity with
>> CONFIG_RUST (or should I keep it separate?).
>>
>> Since this is my first time dealing with the rust bindings, comments are
>> highly appreciated.
> 
> Yeah looks fine. Just make sure it builds fine.

Ack! Let me give it a spin before sending out the v3. Thank you again
for taking a look.

-- 
Thanks and Regards,
Prateek
Re: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
Posted by kernel test robot 3 weeks, 4 days ago
Hi Prateek,

kernel test robot noticed the following build errors:

[auto build test ERROR on 1f630297e183aa2abbafdbe8c4f916ee647e6e21]

url:    https://github.com/intel-lab-lkp/linux/commits/K-Prateek-Nayak/cpufreq-amd-pstate-Pass-the-policy-to-amd_pstate_update/20260114-165940
base:   1f630297e183aa2abbafdbe8c4f916ee647e6e21
patch link:    https://lore.kernel.org/r/20260114085113.21378-3-kprateek.nayak%40amd.com
patch subject: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
config: arm64-randconfig-001-20260114 (https://download.01.org/0day-ci/archive/20260115/202601150006.r4sCNEZq-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601150006.r4sCNEZq-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601150006.r4sCNEZq-lkp@intel.com/

All errors (new ones prefixed by >>):

>> error[E0308]: mismatched types
   --> rust/kernel/cpufreq.rs:946:18
   |
   946 |             Some(Self::adjust_perf_callback)
   |             ---- ^^^^^^^^^^^^^^^^^^^^^^^^^^ expected fn pointer, found fn item
   |             |
   |             arguments to this enum variant are incorrect
   |
   = note: expected fn pointer `unsafe extern "C" fn(*mut bindings::cpufreq_policy, _, _, _)`
   found fn item `unsafe extern "C" fn(u32, _, _, _) {cpufreq::Registration::<T>::adjust_perf_callback}`
   help: the type constructed contains `unsafe extern "C" fn(u32, usize, usize, usize) {cpufreq::Registration::<T>::adjust_perf_callback}` due to the type of the argument passed
   --> rust/kernel/cpufreq.rs:946:13
   |
   946 |             Some(Self::adjust_perf_callback)
   |             ^^^^^--------------------------^
   |                  |
   |                  this argument influences the type of `Some`
   note: tuple variant defined here
   --> /opt/cross/rustc-1.88.0-bindgen-0.72.1/rustup/toolchains/1.88.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/option.rs:597:5
   |
   597 |     Some(#[stable(feature = "rust1", since = "1.0.0")] T),
   |     ^^^^

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki