Optimize ttwu() by pushing select_idle_siblings() up above waiting for
on_cpu(). This allows making use of the cycles otherwise spent waiting
to search for an idle CPU.
One little detail is that since the task we're looking for an idle CPU
for might still be on the CPU, that CPU won't report as running the
idle task, and thus won't find its own CPU idle, even when it is.
To compensate, remove the 'rq->curr == rq->idle' condition from
idle_cpu() -- it doesn't really make sense anyway.
Additionally, Chris found (concurrently) that perf-c2c reported that
test as being a cache-miss monster.
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
kernel/sched/core.c | 3 ++-
kernel/sched/syscalls.c | 3 ---
2 files changed, 2 insertions(+), 4 deletions(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4305,6 +4305,8 @@ int try_to_wake_up(struct task_struct *p
ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
break;
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+
/*
* If the owning (remote) CPU is still in the middle of schedule() with
* this task as prev, wait until it's done referencing the task.
@@ -4316,7 +4318,6 @@ int try_to_wake_up(struct task_struct *p
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
- cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -203,9 +203,6 @@ int idle_cpu(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- if (rq->curr != rq->idle)
- return 0;
-
if (rq->nr_running)
return 0;
Greetings,
This patch gives RT builds terminal heartburn. This particular boot
survived long/well enough to trigger it with LTP sched tests and still
be able to crash dump the hung box.
(the warning below is the is_migration_disabled() sanity check firing)
[ 44.379563] WARNING: CPU: 6 PID: 4468 at kernel/sched/core.c:3354 set_task_cpu+0x1c1/0x1d0
[ 44.379569] Modules linked in: af_packet nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_tables ebtable_nat ebtable_broute ip6table_nat ip6table_mangle ip6table_raw ip6table_security iptable_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 iptable_mangle iptable_raw iptable_security bridge stp llc iscsi_ibft iscsi_boot_sysfs rfkill ip_set nfnetlink ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter binfmt_misc intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel snd_hda_codec_realtek nls_iso8859_1 nls_cp437 snd_hda_codec_generic snd_hda_codec_hdmi snd_hda_scodec_component snd_hda_intel at24 iTCO_wdt snd_intel_dspcfg r8169 regmap_i2c snd_intel_sdw_acpi intel_pmc_bxt kvm mei_pxp mei_hdcp iTCO_vendor_support realtek snd_hda_codec i2c_i801 mdio_devres snd_hda_core ums_realtek libphy i2c_mux irqbypass pcspkr i2c_smbus snd_hwdep snd_pcm usblp mdio_bus mei_me lpc_ich mfd_core mei snd_timer
[ 44.379621] snd soundcore thermal fan joydev intel_smartconnect tiny_power_button nfsd auth_rpcgss nfs_acl lockd sch_fq_codel grace sunrpc fuse configfs dmi_sysfs ip_tables x_tables uas usb_storage hid_logitech_hidpp hid_logitech_dj hid_generic usbhid nouveau drm_ttm_helper ttm gpu_sched xhci_pci i2c_algo_bit xhci_hcd ahci drm_gpuvm ehci_pci ehci_hcd libahci drm_exec mxm_wmi libata polyval_clmulni usbcore ghash_clmulni_intel drm_display_helper sha512_ssse3 sha1_ssse3 cec rc_core video wmi button sd_mod scsi_dh_emc scsi_dh_rdac scsi_dh_alua sg scsi_mod scsi_common vfat fat ext4 crc16 mbcache jbd2 loop msr efivarfs aesni_intel
[ 44.379663] CPU: 6 UID: 0 PID: 4468 Comm: sandbox_ipc_thr Kdump: loaded Not tainted 6.15.0.ge271ed52-master-rt #19 PREEMPT_{RT,(lazy)} e4f2516a9b85ac19222adb94a538ef0c57343c1c
[ 44.379666] Hardware name: MEDION MS-7848/MS-7848, BIOS M7848W08.20C 09/23/2013
[ 44.379668] RIP: 0010:set_task_cpu+0x1c1/0x1d0
[ 44.379670] Code: 0f 0b e9 8f fe ff ff 80 8b 8c 05 00 00 04 e9 f5 fe ff ff 0f 0b e9 7c fe ff ff 0f 0b 66 83 bb 40 04 00 00 00 0f 84 8b fe ff ff <0f> 0b e9 84 fe ff ff 0f 1f 84 00 00 00 00 00 90 90 90 90 90 90 90
[ 44.379672] RSP: 0018:ffffcd844ef77668 EFLAGS: 00010002
[ 44.379673] RAX: 0000000000000200 RBX: ffff896da1e8c700 RCX: 0000000000000000
[ 44.379675] RDX: ffff896da1e8cb30 RSI: 0000000000000000 RDI: ffff896da1e8c700
[ 44.379676] RBP: 0000000000000000 R08: 0000000000000206 R09: 000000000002361d
[ 44.379677] R10: fbfffffffffff79d R11: 0000000000000004 R12: 0000000000000000
[ 44.379678] R13: 0000000000000000 R14: 0000000000000028 R15: ffff896da1e8d030
[ 44.379679] FS: 0000000000000000(0000) GS:ffff8970f15d8000(0000) knlGS:0000000000000000
[ 44.379681] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 44.379682] CR2: 00007f4c297c4688 CR3: 000000011b9de002 CR4: 00000000001726f0
[ 44.379683] Call Trace:
[ 44.379686] <TASK>
[ 44.379688] try_to_wake_up+0x245/0x810
[ 44.379692] rt_mutex_slowunlock+0x1d2/0x2d0
[ 44.379696] ? __pfx_lru_activate+0x10/0x10
[ 44.379700] folio_batch_move_lru+0xc7/0x100
[ 44.379704] ? __pfx_lru_activate+0x10/0x10
[ 44.379706] __folio_batch_add_and_move+0xf2/0x110
[ 44.379710] folio_mark_accessed+0x80/0x1b0
[ 44.379711] unmap_page_range+0x176b/0x1a60
[ 44.379717] unmap_vmas+0xae/0x1a0
[ 44.379720] exit_mmap+0xe5/0x3c0
[ 44.379725] mmput+0x6e/0x150
[ 44.379729] do_exit+0x23c/0xa20
[ 44.379732] do_group_exit+0x33/0x90
[ 44.379735] get_signal+0x85d/0x8b0
[ 44.379738] arch_do_signal_or_restart+0x2d/0x240
[ 44.379743] ? place_entity+0x1b/0x130
[ 44.379745] ? __x64_sys_poll+0x47/0x1a0
[ 44.379749] exit_to_user_mode_loop+0x86/0x150
[ 44.379753] do_syscall_64+0x1ba/0x8e0
[ 44.379756] ? wakeup_preempt+0x40/0x70
[ 44.379758] ? ttwu_do_activate+0x84/0x210
[ 44.379760] ? _raw_spin_unlock_irqrestore+0x22/0x40
[ 44.379763] ? try_to_wake_up+0xab/0x810
[ 44.379765] ? preempt_count_add+0x4b/0xa0
[ 44.379768] ? futex_hash_put+0x43/0x90
[ 44.379772] ? futex_wake+0xb2/0x1c0
[ 44.379775] ? do_futex+0x125/0x190
[ 44.379776] ? __x64_sys_futex+0x10b/0x1c0
[ 44.379779] ? do_syscall_64+0x7f/0x8e0
[ 44.379781] ? __do_sys_prctl+0xbe/0xee0
[ 44.379783] ? do_syscall_64+0x7f/0x8e0
[ 44.379786] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 44.379789] RIP: 0033:0x7f4c3571fdef
[ 44.379803] Code: Unable to access opcode bytes at 0x7f4c3571fdc5.
[ 44.379803] RSP: 002b:00007f4c0c9fe700 EFLAGS: 00000293 ORIG_RAX: 0000000000000007
[ 44.379805] RAX: fffffffffffffdfc RBX: 00007f4c0c9fe730 RCX: 00007f4c3571fdef
[ 44.379806] RDX: 00000000ffffffff RSI: 0000000000000002 RDI: 00007f4c0c9fe730
[ 44.379807] RBP: 00007f4c0c9fe920 R08: 0000000000000000 R09: 0000000000000007
[ 44.379808] R10: 00005587485bf1d0 R11: 0000000000000293 R12: 00005587485a7fc0
[ 44.379809] R13: 0000000000000000 R14: 0000000000001174 R15: 00007f4c0c1ff000
[ 44.379812] </TASK>
---
kernel/sched/core.c | 8 +++++++-
1 file changed, 7 insertions(+), 1 deletion(-)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4313,7 +4313,10 @@ int try_to_wake_up(struct task_struct *p
ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
break;
- cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+ if (is_migration_disabled(p))
+ cpu = -1;
+ else
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
/*
* If the owning (remote) CPU is still in the middle of schedule() with
@@ -4326,6 +4329,9 @@ int try_to_wake_up(struct task_struct *p
*/
smp_cond_load_acquire(&p->on_cpu, !VAL);
+ if (cpu == -1)
+ cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
+
if (task_cpu(p) != cpu) {
if (p->in_iowait) {
delayacct_blkio_end(p);
On Mon, Jun 09, 2025 at 07:01:47AM +0200, Mike Galbraith wrote:
Right; so the problem being that we can race with
migrate_disable_switch().
> kernel/sched/core.c | 8 +++++++-
> 1 file changed, 7 insertions(+), 1 deletion(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -4313,7 +4313,10 @@ int try_to_wake_up(struct task_struct *p
> ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
> break;
>
> - cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> + if (is_migration_disabled(p))
> + cpu = -1;
> + else
> + cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
>
> /*
> * If the owning (remote) CPU is still in the middle of schedule() with
> @@ -4326,6 +4329,9 @@ int try_to_wake_up(struct task_struct *p
> */
> smp_cond_load_acquire(&p->on_cpu, !VAL);
>
> + if (cpu == -1)
> + cpu = select_task_rq(p, p->wake_cpu, &wake_flags);
> +
> if (task_cpu(p) != cpu) {
> if (p->in_iowait) {
> delayacct_blkio_end(p);
>
So select_task_rq() already checks is_migration_disabled(); just not
well enough. Also, I'm thinking that if we see migration_disabled, we
don't need to call it a second time, just let it be where it was.
Does something like this help? Specifically, when nr_cpus_allowed == 1
|| is_migration_disabled(), don't change @cpu at all.
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3593,7 +3593,7 @@ int select_task_rq(struct task_struct *p
cpu = p->sched_class->select_task_rq(p, cpu, *wake_flags);
*wake_flags |= WF_RQ_SELECTED;
} else {
- cpu = cpumask_any(p->cpus_ptr);
+ cpu = task_cpu(p);
}
/*
On Fri, 2025-06-13 at 11:40 +0200, Peter Zijlstra wrote:
> On Mon, Jun 09, 2025 at 07:01:47AM +0200, Mike Galbraith wrote:
>
> Right; so the problem being that we can race with
> migrate_disable_switch().
Yeah. Most of the time the fallback path saves us, but we can and do
zip past it, and that turns the box various shades of sad.
>
> Does something like this help?
It surely will, but I'll testdrive it. No news is good news.
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -3593,7 +3593,7 @@ int select_task_rq(struct task_struct *p
> cpu = p->sched_class->select_task_rq(p, cpu,
> *wake_flags);
> *wake_flags |= WF_RQ_SELECTED;
> } else {
> - cpu = cpumask_any(p->cpus_ptr);
> + cpu = task_cpu(p);
> }
>
> /*
© 2016 - 2025 Red Hat, Inc.