cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
writer(s), however amd-pstate depends on fetching the cpudata via the
policy's driver data which necessitates grabbing the reference.
Since schedutil governor can call "cpufreq_driver->update_perf()"
during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
fetching the policy object using the cpufreq_cpu_get() helper in the
scheduler fast-path leads to "BUG: scheduling while atomic" on
PREEMPT_RT [1].
Pass the cached cpufreq policy object in sg_policy to the update_perf()
instead of just the CPU. The CPU can be inferred using "policy->cpu".
The lifetime of cpufreq_policy object outlasts that of the governor and
the cpufreq driver (allocated when the CPU is onlined and only reclaimed
when the CPU is offlined / the CPU device is removed) which makes it
safe to be referenced throughout the governor's lifetime.
Reported-by: Bert Karwatzki <spasswolf@web.de>
Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]
Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
---
Changelog rfc v1..v2:
o Updated the kdoc comment above cpufreq_driver_adjust_perf() to
reflect that cpufreq_policy is passed as an argument now instead of
the Target CPU. (kernel test robot)
o Added "Reported-by:" and "Closes:" tags. (Mario)
o Collected tags from v1. (Thank you Viresh)
---
drivers/cpufreq/amd-pstate.c | 3 +--
drivers/cpufreq/cpufreq.c | 6 +++---
drivers/cpufreq/intel_pstate.c | 4 ++--
include/linux/cpufreq.h | 4 ++--
kernel/sched/cpufreq_schedutil.c | 5 +++--
5 files changed, 11 insertions(+), 11 deletions(-)
diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c
index 5818a92d96b9..455e58a9b738 100644
--- a/drivers/cpufreq/amd-pstate.c
+++ b/drivers/cpufreq/amd-pstate.c
@@ -697,13 +697,12 @@ static unsigned int amd_pstate_fast_switch(struct cpufreq_policy *policy,
return policy->cur;
}
-static void amd_pstate_adjust_perf(unsigned int cpu,
+static void amd_pstate_adjust_perf(struct cpufreq_policy *policy,
unsigned long _min_perf,
unsigned long target_perf,
unsigned long capacity)
{
u8 max_perf, min_perf, des_perf, cap_perf;
- struct cpufreq_policy *policy __free(put_cpufreq_policy) = cpufreq_cpu_get(cpu);
struct amd_cpudata *cpudata;
union perf_cached perf;
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index 50dde2980f1b..28b82c2eb7b3 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -2225,7 +2225,7 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
/**
* cpufreq_driver_adjust_perf - Adjust CPU performance level in one go.
- * @cpu: Target CPU.
+ * @policy: cpufreq policy of the target CPU.
* @min_perf: Minimum (required) performance level (units of @capacity).
* @target_perf: Target (desired) performance level (units of @capacity).
* @capacity: Capacity of the target CPU.
@@ -2244,12 +2244,12 @@ EXPORT_SYMBOL_GPL(cpufreq_driver_fast_switch);
* parallel with either ->target() or ->target_index() or ->fast_switch() for
* the same CPU.
*/
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity)
{
- cpufreq_driver->adjust_perf(cpu, min_perf, target_perf, capacity);
+ cpufreq_driver->adjust_perf(policy, min_perf, target_perf, capacity);
}
/**
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index ec4abe374573..8d25f0f2925c 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -3237,12 +3237,12 @@ static unsigned int intel_cpufreq_fast_switch(struct cpufreq_policy *policy,
return target_pstate * cpu->pstate.scaling;
}
-static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+static void intel_cpufreq_adjust_perf(struct cpufreq_policy *policy,
unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity)
{
- struct cpudata *cpu = all_cpu_data[cpunum];
+ struct cpudata *cpu = all_cpu_data[policy->cpu];
u64 hwp_cap = READ_ONCE(cpu->hwp_cap_cached);
int old_pstate = cpu->pstate.current_pstate;
int cap_pstate, min_pstate, max_pstate, target_pstate;
diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h
index 0465d1e6f72a..fd26b3a4aa28 100644
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -367,7 +367,7 @@ struct cpufreq_driver {
* conditions) scale invariance can be disabled, which causes the
* schedutil governor to fall back to the latter.
*/
- void (*adjust_perf)(unsigned int cpu,
+ void (*adjust_perf)(struct cpufreq_policy *policy,
unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity);
@@ -612,7 +612,7 @@ struct cpufreq_governor {
/* Pass a target to the cpufreq driver */
unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy,
unsigned int target_freq);
-void cpufreq_driver_adjust_perf(unsigned int cpu,
+void cpufreq_driver_adjust_perf(struct cpufreq_policy *policy,
unsigned long min_perf,
unsigned long target_perf,
unsigned long capacity);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 0ab5f9d4bc59..307f3076635e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -461,6 +461,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
unsigned int flags)
{
struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
unsigned long prev_util = sg_cpu->util;
unsigned long max_cap;
@@ -482,10 +483,10 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time,
if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util)
sg_cpu->util = prev_util;
- cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min,
+ cpufreq_driver_adjust_perf(sg_policy->policy, sg_cpu->bw_min,
sg_cpu->util, max_cap);
- sg_cpu->sg_policy->last_freq_update_time = time;
+ sg_policy->last_freq_update_time = time;
}
static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
--
2.34.1
On 14-01-26, 08:51, K Prateek Nayak wrote: > cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent > writer(s), however amd-pstate depends on fetching the cpudata via the > policy's driver data which necessitates grabbing the reference. > > Since schedutil governor can call "cpufreq_driver->update_perf()" > during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled, > fetching the policy object using the cpufreq_cpu_get() helper in the > scheduler fast-path leads to "BUG: scheduling while atomic" on > PREEMPT_RT [1]. > > Pass the cached cpufreq policy object in sg_policy to the update_perf() > instead of just the CPU. The CPU can be inferred using "policy->cpu". > > The lifetime of cpufreq_policy object outlasts that of the governor and > the cpufreq driver (allocated when the CPU is onlined and only reclaimed > when the CPU is offlined / the CPU device is removed) which makes it > safe to be referenced throughout the governor's lifetime. > > Reported-by: Bert Karwatzki <spasswolf@web.de> > Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1] > Acked-by: Viresh Kumar <viresh.kumar@linaro.org> > Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com> Ahh, you need to fix the Rust binding as well. Minor change there. -- viresh
Hello Viresh,
On 1/15/2026 9:44 AM, Viresh Kumar wrote:
> On 14-01-26, 08:51, K Prateek Nayak wrote:
>> cpufreq_cpu_get() can sleep on PREEMPT_RT in presence of concurrent
>> writer(s), however amd-pstate depends on fetching the cpudata via the
>> policy's driver data which necessitates grabbing the reference.
>>
>> Since schedutil governor can call "cpufreq_driver->update_perf()"
>> during sched_tick/enqueue/dequeue with rq_lock held and IRQs disabled,
>> fetching the policy object using the cpufreq_cpu_get() helper in the
>> scheduler fast-path leads to "BUG: scheduling while atomic" on
>> PREEMPT_RT [1].
>>
>> Pass the cached cpufreq policy object in sg_policy to the update_perf()
>> instead of just the CPU. The CPU can be inferred using "policy->cpu".
>>
>> The lifetime of cpufreq_policy object outlasts that of the governor and
>> the cpufreq driver (allocated when the CPU is onlined and only reclaimed
>> when the CPU is offlined / the CPU device is removed) which makes it
>> safe to be referenced throughout the governor's lifetime.
>>
>> Reported-by: Bert Karwatzki <spasswolf@web.de>
>> Closes:https://lore.kernel.org/all/20250731092316.3191-1-spasswolf@web.de/ [1]
>> Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
>> Signed-off-by: K Prateek Nayak <kprateek.nayak@amd.com>
>
> Ahh, you need to fix the Rust binding as well. Minor change there.
Guilty of not having CONFIG_RUST enabled! I've arrived at the
following after tasking a look at cpufreq.rs (specifically the
fast_switch_callback implementation):
(Only build tested currently)
diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
index f968fbd22890..bea109f8ff78 100644
--- a/rust/kernel/cpufreq.rs
+++ b/rust/kernel/cpufreq.rs
@@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
/// # Safety
///
/// - This function may only be called from the cpufreq C infrastructure.
+ /// - The pointer arguments must be valid pointers.
unsafe extern "C" fn adjust_perf_callback(
- cpu: c_uint,
+ ptr: *mut bindings::cpufreq_policy,
min_perf: c_ulong,
target_perf: c_ulong,
capacity: c_ulong,
) {
- // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
- let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
-
- if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
- T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
- }
+ // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
+ // lifetime of `policy`.
+ let policy = unsafe { Policy::from_raw_mut(ptr) };
+ T::adjust_perf(policy, min_perf, target_perf, capacity);
}
/// Driver's `get_intermediate` callback.
---
Let me know what you think. If you think this is alright, I'll fold it
into this same patch in the next version to preserve bisectiblity with
CONFIG_RUST (or should I keep it separate?).
Since this is my first time dealing with the rust bindings, comments are
highly appreciated.
--
Thanks and Regards,
Prateek
On 16-01-26, 10:08, K Prateek Nayak wrote:
> Guilty of not having CONFIG_RUST enabled! I've arrived at the
> following after tasking a look at cpufreq.rs (specifically the
> fast_switch_callback implementation):
>
> (Only build tested currently)
>
> diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
> index f968fbd22890..bea109f8ff78 100644
> --- a/rust/kernel/cpufreq.rs
> +++ b/rust/kernel/cpufreq.rs
> @@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
> /// # Safety
> ///
> /// - This function may only be called from the cpufreq C infrastructure.
> + /// - The pointer arguments must be valid pointers.
> unsafe extern "C" fn adjust_perf_callback(
> - cpu: c_uint,
> + ptr: *mut bindings::cpufreq_policy,
> min_perf: c_ulong,
> target_perf: c_ulong,
> capacity: c_ulong,
> ) {
> - // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
> - let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
> -
> - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
> - T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
> - }
> + // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
> + // lifetime of `policy`.
> + let policy = unsafe { Policy::from_raw_mut(ptr) };
> + T::adjust_perf(policy, min_perf, target_perf, capacity);
> }
>
> /// Driver's `get_intermediate` callback.
> ---
>
> Let me know what you think. If you think this is alright, I'll fold it
> into this same patch in the next version to preserve bisectiblity with
> CONFIG_RUST (or should I keep it separate?).
>
> Since this is my first time dealing with the rust bindings, comments are
> highly appreciated.
Yeah looks fine. Just make sure it builds fine.
--
viresh
Hello Viresh,
On 1/16/2026 10:14 AM, Viresh Kumar wrote:
> On 16-01-26, 10:08, K Prateek Nayak wrote:
>> Guilty of not having CONFIG_RUST enabled! I've arrived at the
>> following after tasking a look at cpufreq.rs (specifically the
>> fast_switch_callback implementation):
>>
>> (Only build tested currently)
>>
>> diff --git a/rust/kernel/cpufreq.rs b/rust/kernel/cpufreq.rs
>> index f968fbd22890..bea109f8ff78 100644
>> --- a/rust/kernel/cpufreq.rs
>> +++ b/rust/kernel/cpufreq.rs
>> @@ -1255,18 +1255,17 @@ impl<T: Driver> Registration<T> {
>> /// # Safety
>> ///
>> /// - This function may only be called from the cpufreq C infrastructure.
>> + /// - The pointer arguments must be valid pointers.
>> unsafe extern "C" fn adjust_perf_callback(
>> - cpu: c_uint,
>> + ptr: *mut bindings::cpufreq_policy,
>> min_perf: c_ulong,
>> target_perf: c_ulong,
>> capacity: c_ulong,
>> ) {
>> - // SAFETY: The C API guarantees that `cpu` refers to a valid CPU number.
>> - let cpu_id = unsafe { CpuId::from_u32_unchecked(cpu) };
>> -
>> - if let Ok(mut policy) = PolicyCpu::from_cpu(cpu_id) {
>> - T::adjust_perf(&mut policy, min_perf, target_perf, capacity);
>> - }
>> + // SAFETY: The `ptr` is guaranteed to be valid by the contract with the C code for the
>> + // lifetime of `policy`.
>> + let policy = unsafe { Policy::from_raw_mut(ptr) };
>> + T::adjust_perf(policy, min_perf, target_perf, capacity);
>> }
>>
>> /// Driver's `get_intermediate` callback.
>> ---
>>
>> Let me know what you think. If you think this is alright, I'll fold it
>> into this same patch in the next version to preserve bisectiblity with
>> CONFIG_RUST (or should I keep it separate?).
>>
>> Since this is my first time dealing with the rust bindings, comments are
>> highly appreciated.
>
> Yeah looks fine. Just make sure it builds fine.
Ack! Let me give it a spin before sending out the v3. Thank you again
for taking a look.
--
Thanks and Regards,
Prateek
Hi Prateek,
kernel test robot noticed the following build errors:
[auto build test ERROR on 1f630297e183aa2abbafdbe8c4f916ee647e6e21]
url: https://github.com/intel-lab-lkp/linux/commits/K-Prateek-Nayak/cpufreq-amd-pstate-Pass-the-policy-to-amd_pstate_update/20260114-165940
base: 1f630297e183aa2abbafdbe8c4f916ee647e6e21
patch link: https://lore.kernel.org/r/20260114085113.21378-3-kprateek.nayak%40amd.com
patch subject: [PATCH v2 2/2] cpufreq: Pass the policy to cpufreq_driver->adjust_perf()
config: arm64-randconfig-001-20260114 (https://download.01.org/0day-ci/archive/20260115/202601150006.r4sCNEZq-lkp@intel.com/config)
compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project 9b8addffa70cee5b2acc5454712d9cf78ce45710)
rustc: rustc 1.88.0 (6b00bc388 2025-06-23)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260115/202601150006.r4sCNEZq-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202601150006.r4sCNEZq-lkp@intel.com/
All errors (new ones prefixed by >>):
>> error[E0308]: mismatched types
--> rust/kernel/cpufreq.rs:946:18
|
946 | Some(Self::adjust_perf_callback)
| ---- ^^^^^^^^^^^^^^^^^^^^^^^^^^ expected fn pointer, found fn item
| |
| arguments to this enum variant are incorrect
|
= note: expected fn pointer `unsafe extern "C" fn(*mut bindings::cpufreq_policy, _, _, _)`
found fn item `unsafe extern "C" fn(u32, _, _, _) {cpufreq::Registration::<T>::adjust_perf_callback}`
help: the type constructed contains `unsafe extern "C" fn(u32, usize, usize, usize) {cpufreq::Registration::<T>::adjust_perf_callback}` due to the type of the argument passed
--> rust/kernel/cpufreq.rs:946:13
|
946 | Some(Self::adjust_perf_callback)
| ^^^^^--------------------------^
| |
| this argument influences the type of `Some`
note: tuple variant defined here
--> /opt/cross/rustc-1.88.0-bindgen-0.72.1/rustup/toolchains/1.88.0-x86_64-unknown-linux-gnu/lib/rustlib/src/rust/library/core/src/option.rs:597:5
|
597 | Some(#[stable(feature = "rust1", since = "1.0.0")] T),
| ^^^^
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
© 2016 - 2026 Red Hat, Inc.