When putting CPUs to sleep permanently, we should try to put them into the most power conserving state possible. For now it is unclear whether, especially in a deep C-state, the P-state also matters, so this series only arranges for the C-state side of things (plus some cleanup). 1: x86/idle: re-arrange dead-idle handling 2: x86/cpuidle: push parked CPUs into deeper sleep states when possible 3: x86/cpuidle: clean up Cx dumping Jan _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
In order to be able to wake parked CPUs from default_dead_idle() (for them to then enter a different dead-idle routine), the function should not itself loop. Move the loop into play_dead(), and use play_dead() as well on the AP boot error path. Furthermore, not the least considering the comment in play_dead(), make sure NMI raised (for now this would be a bug elsewhere, but that's about to change) against a parked or fully offline CPU won't invoke the actual, full-blown NMI handler. Note however that this doesn't make #MC any safer for fully offline CPUs. Signed-off-by: Jan Beulich <jbeulich@suse.com> --- v2: Add spec_ctrl_exit_idle() to default_dead_idle(). Add #MC related remark to description. --- Note: I had to drop the discussed acpi_dead_idle() adjustment again, as it breaks booting with "smt=0" and "maxcpus=" on at least one of my systems. I've not yet managed to understand why that would be. --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -100,14 +100,20 @@ void default_dead_idle(void) */ spec_ctrl_enter_idle(get_cpu_info()); wbinvd(); - for ( ; ; ) - halt(); + halt(); + spec_ctrl_exit_idle(get_cpu_info()); } -static void play_dead(void) +void play_dead(void) { + unsigned int cpu = smp_processor_id(); + local_irq_disable(); + /* Change the NMI handler to a nop (see comment below). */ + _set_gate_lower(&idt_tables[cpu][TRAP_nmi], SYS_DESC_irq_gate, 0, + &trap_nop); + /* * NOTE: After cpu_exit_clear, per-cpu variables may no longer accessible, * as they may be freed at any time if offline CPUs don't get parked. In @@ -118,9 +124,10 @@ static void play_dead(void) * Consider very carefully when adding code to *dead_idle. Most hypervisor * subsystems are unsafe to call. */ - cpu_exit_clear(smp_processor_id()); + cpu_exit_clear(cpu); - (*dead_idle)(); + for ( ; ; ) + dead_idle(); } static void idle_loop(void) --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -33,6 +33,7 @@ #include <xen/serial.h> #include <xen/numa.h> #include <xen/cpu.h> +#include <asm/cpuidle.h> #include <asm/current.h> #include <asm/mc146818rtc.h> #include <asm/desc.h> @@ -209,8 +210,7 @@ static void smp_callin(void) halt: clear_local_APIC(); spin_debug_enable(); - cpu_exit_clear(cpu); - (*dead_idle)(); + play_dead(); } /* Allow the master to continue. */ --- a/xen/include/asm-x86/cpuidle.h +++ b/xen/include/asm-x86/cpuidle.h @@ -20,6 +20,7 @@ int mwait_idle_init(struct notifier_bloc int cpuidle_init_cpu(unsigned int cpu); void default_dead_idle(void); void acpi_dead_idle(void); +void play_dead(void); void trace_exit_reason(u32 *irq_traced); void update_idle_stats(struct acpi_processor_power *, struct acpi_processor_cx *, uint64_t, uint64_t); _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
On 17/05/2019 11:11, Jan Beulich wrote: > In order to be able to wake parked CPUs from default_dead_idle() (for > them to then enter a different dead-idle routine), the function should > not itself loop. Move the loop into play_dead(), and use play_dead() as > well on the AP boot error path. > > Furthermore, not the least considering the comment in play_dead(), > make sure NMI raised (for now this would be a bug elsewhere, but that's > about to change) against a parked or fully offline CPU won't invoke the > actual, full-blown NMI handler. > > Note however that this doesn't make #MC any safer for fully offline > CPUs. > > Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
When the mwait-idle driver isn't used, C-state information becomes available only in the course of Dom0 starting up. Use the provided data to allow parked CPUs to sleep in a more energy efficient way, by waking them briefly (via NMI) once the data has been recorded. This involves re-arranging how/when the governor's ->enable() hook gets invoked. The changes there include addition of so far missing error handling in the respective CPU notifier handlers. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -351,12 +351,22 @@ static void dump_cx(unsigned char key) unsigned int cpu; printk("'%c' pressed -> printing ACPI Cx structures\n", key); - for_each_online_cpu ( cpu ) - if (processor_powers[cpu]) - { - print_acpi_power(cpu, processor_powers[cpu]); - process_pending_softirqs(); - } + for_each_present_cpu ( cpu ) + { + struct acpi_processor_power *power = processor_powers[cpu]; + + if ( !power ) + continue; + + if ( cpu_online(cpu) ) + print_acpi_power(cpu, power); + else if ( park_offline_cpus ) + printk("CPU%u parked in state %u (C%u)\n", cpu, + power->last_state ? power->last_state->idx : 1, + power->last_state ? power->last_state->type : 1); + + process_pending_softirqs(); + } } static int __init cpu_idle_key_init(void) @@ -765,6 +775,7 @@ void acpi_dead_idle(void) goto default_halt; cx = &power->states[power->count - 1]; + power->last_state = cx; if ( cx->entry_method == ACPI_CSTATE_EM_FFH ) { @@ -1217,9 +1228,30 @@ long set_cx_pminfo(uint32_t acpi_id, str set_cx(acpi_power, &xen_cx); } - if ( cpuidle_current_governor->enable && - cpuidle_current_governor->enable(acpi_power) ) - return -EFAULT; + if ( !cpu_online(cpu_id) ) + { + uint32_t apic_id = x86_cpu_to_apicid[cpu_id]; + + /* + * If we've just learned of more available C states, wake the CPU if + * it's parked, so it can go back to sleep in perhaps a deeper state. + */ + if ( park_offline_cpus && apic_id != BAD_APICID ) + { + unsigned long flags; + + local_irq_save(flags); + apic_wait_icr_idle(); + apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, apic_id); + local_irq_restore(flags); + } + } + else if ( cpuidle_current_governor->enable ) + { + ret = cpuidle_current_governor->enable(acpi_power); + if ( ret < 0 ) + return ret; + } /* FIXME: C-state dependency is not supported by far */ @@ -1379,19 +1411,22 @@ static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; + int rc = 0; - /* Only hook on CPU_ONLINE because a dead cpu may utilize the info to - * to enter deep C-state */ + /* + * Only hook on CPU_UP_PREPARE because a dead cpu may utilize the info + * to enter deep C-state. + */ switch ( action ) { - case CPU_ONLINE: - (void)cpuidle_init_cpu(cpu); - break; - default: + case CPU_UP_PREPARE: + rc = cpuidle_init_cpu(cpu); + if ( !rc && cpuidle_current_governor->enable ) + rc = cpuidle_current_governor->enable(processor_powers[cpu]); break; } - return NOTIFY_DONE; + return !rc ? NOTIFY_DONE : notifier_from_errno(rc); } static struct notifier_block cpu_nfb = { @@ -1406,6 +1441,7 @@ static int __init cpuidle_presmp_init(vo return 0; mwait_idle_init(&cpu_nfb); + cpu_nfb.notifier_call(&cpu_nfb, CPU_UP_PREPARE, cpu); cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; --- a/xen/arch/x86/acpi/cpuidle_menu.c +++ b/xen/arch/x86/acpi/cpuidle_menu.c @@ -277,9 +277,6 @@ static void menu_reflect(struct acpi_pro static int menu_enable_device(struct acpi_processor_power *power) { - if (!cpu_online(power->cpu)) - return -1; - memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device)); return 0; --- a/xen/arch/x86/cpu/mwait-idle.c +++ b/xen/arch/x86/cpu/mwait-idle.c @@ -1166,12 +1166,17 @@ static int mwait_idle_cpu_init(struct no struct acpi_processor_power *dev = processor_powers[cpu]; switch (action) { + int rc; + default: return NOTIFY_DONE; case CPU_UP_PREPARE: - cpuidle_init_cpu(cpu); - return NOTIFY_DONE; + rc = cpuidle_init_cpu(cpu); + dev = processor_powers[cpu]; + if (!rc && cpuidle_current_governor->enable) + rc = cpuidle_current_governor->enable(dev); + return !rc ? NOTIFY_DONE : notifier_from_errno(rc); case CPU_ONLINE: if (!dev) @@ -1260,8 +1265,6 @@ int __init mwait_idle_init(struct notifi } if (!err) { nfb->notifier_call = mwait_idle_cpu_init; - mwait_idle_cpu_init(nfb, CPU_UP_PREPARE, NULL); - pm_idle_save = pm_idle; pm_idle = mwait_idle; dead_idle = acpi_dead_idle; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
Don't log the same global information once per CPU. Don't log the same information (here: the currently active state) twice. Don't prefix decimal numbers with zeros (giving the impression they're octal). Use format specifiers matching the type of the corresponding expressions. Don't split printk()-s without intervening new-lines. Signed-off-by: Jan Beulich <jbeulich@suse.com> Acked-by: Andrew Cooper <andrew.cooper3@citrix.com> --- a/xen/arch/x86/acpi/cpu_idle.c +++ b/xen/arch/x86/acpi/cpu_idle.c @@ -304,9 +304,6 @@ static void print_acpi_power(uint32_t cp printk("==cpu%d==\n", cpu); last_state_idx = power->last_state ? power->last_state->idx : -1; - printk("active state:\t\tC%d\n", last_state_idx); - printk("max_cstate:\t\tC%d\n", max_cstate); - printk("states:\n"); spin_lock_irq(&power->stat_lock); current_tick = cpuidle_get_tick(); @@ -331,16 +328,14 @@ static void print_acpi_power(uint32_t cp idle_usage += usage[i]; idle_res += tick_to_ns(res_tick[i]); - printk((last_state_idx == i) ? " *" : " "); - printk("C%d:\t", i); - printk("type[C%d] ", power->states[i].type); - printk("latency[%03d] ", power->states[i].latency); - printk("usage[%08"PRIu64"] ", usage[i]); - printk("method[%5s] ", acpi_cstate_method_name[power->states[i].entry_method]); - printk("duration[%"PRIu64"]\n", tick_to_ns(res_tick[i])); + printk(" %cC%u:\ttype[C%d] latency[%3u] usage[%8"PRIu64"] method[%5s] duration[%"PRIu64"]\n", + (last_state_idx == i) ? '*' : ' ', i, + power->states[i].type, power->states[i].latency, usage[i], + acpi_cstate_method_name[power->states[i].entry_method], + tick_to_ns(res_tick[i])); } - printk((last_state_idx == 0) ? " *" : " "); - printk("C0:\tusage[%08"PRIu64"] duration[%"PRIu64"]\n", + printk(" %cC0:\tusage[%8"PRIu64"] duration[%"PRIu64"]\n", + (last_state_idx == 0) ? '*' : ' ', usage[0] + idle_usage, current_stime - idle_res); print_hw_residencies(cpu); @@ -351,6 +346,7 @@ static void dump_cx(unsigned char key) unsigned int cpu; printk("'%c' pressed -> printing ACPI Cx structures\n", key); + printk("max cstate: C%u\n", max_cstate); for_each_present_cpu ( cpu ) { struct acpi_processor_power *power = processor_powers[cpu]; _______________________________________________ Xen-devel mailing list Xen-devel@lists.xenproject.org https://lists.xenproject.org/mailman/listinfo/xen-devel
© 2016 - 2024 Red Hat, Inc.