workqueue: fix invalid cpu in kick_pool

[PATCH] workqueue: fix invalid cpu in kick_pool

Posted by Yong He 2 years, 2 months ago

From: Yong He <alexyonghe@tencent.com>

Now unbound workqueue supports non-strict affinity scope after
commit 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for
unbound workqueues"), which allow the worker task to run out of the pod
to gain better performance, then use kick_pool() to migarate the worker
task back to the pod.

With incorrect unbound workqueue configurations, this may introduce kernel
panic, because cpumask_any_distribute() will not always return a valid cpu,
such as one set the 'isolcpus' and 'workqueue.unbound_cpus' into the same
cpuset, and this will make the @pool->attrs->__pod_cpumask an empty set,
then trigger panic like this:

 BUG: unable to handle page fault for address: ffffffff8305e9c0
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 2c31067 P4D 2c31067 PUD 2c32063 PMD 10a18d063 PTE 800ffffffcfa1062
 Oops: 0000 [#1] PREEMPT SMP PTI
 CPU: 39 PID: 1 Comm: systemd Not tainted 6.6.1-tlinux4-0011.1 #2
 Hardware name: Cloud Hypervisor cloud-hypervisor, BIOS 0
 RIP: 0010:available_idle_cpu+0x21/0x60
 RSP: 0018:ffffc90000013828 EFLAGS: 00010082
 RAX: 0000000000000000 RBX: 0000000000000028 RCX: 0000000000000008
 RDX: ffffffff8305e040 RSI: 0000000000000028 RDI: 0000000000000028
 RBP: ffffc90000013828 R08: 0000000000000027 R09: 00000000000000b0
 R10: 0000000000000000 R11: ffffffff82c64348 R12: 0000000000000028
 R13: ffff888100928000 R14: 0000000000000028 R15: 0000000000000000
 FS:  00007f0d6a5d39c0(0000) GS:ffff888c8ddc0000(0000) knlGS:0000000000000000
 CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
 CR2: ffffffff8305e9c0 CR3: 0000000100074002 CR4: 0000000000770ee0
 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
 PKRU: 55555554
 Call Trace:
  <TASK>
  select_idle_sibling+0x79/0xaf0
  select_task_rq_fair+0x1cb/0x7b0
  try_to_wake_up+0x29c/0x5c0
  wake_up_process+0x19/0x20
  kick_pool+0x5e/0xb0
  __queue_work+0x119/0x430
  queue_work_on+0x29/0x30
  driver_deferred_probe_trigger.part.15+0x8b/0x90
  driver_bound+0x8b/0xe0
  really_probe+0x2e6/0x3b0
  __driver_probe_device+0x85/0x170
  driver_probe_device+0x24/0x90
  __driver_attach+0xd5/0x170
  bus_for_each_dev+0x7a/0xd0
  driver_attach+0x22/0x30
  bus_add_driver+0x17c/0x230
  driver_register+0x5e/0x110
  ? 0xffffffffa021b000
  register_virtio_driver+0x24/0x40
  register_virtio_driver+0x24/0x40
  virtio_rng_driver_init+0x19/0x1000 [virtio_rng]
  do_one_initcall+0x54/0x220
  do_init_module+0x68/0x250
  load_module+0x1f21/0x2080
  init_module_from_file+0x99/0xd0
  idempotent_init_module+0x195/0x250
  __x64_sys_finit_module+0x68/0xc0
  do_syscall_64+0x40/0x90
  entry_SYSCALL_64_after_hwframe+0x63/0xcd

Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
Signed-off-by: Yong He <alexyonghe@tencent.com>
---
 kernel/workqueue.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..0d20feded4e2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1106,6 +1106,7 @@ static bool kick_pool(struct worker_pool *pool)
 {
 	struct worker *worker = first_idle_worker(pool);
 	struct task_struct *p;
+	int cpu;
 
 	lockdep_assert_held(&pool->lock);
 
@@ -1133,10 +1134,13 @@ static bool kick_pool(struct worker_pool *pool)
 	 */
 	if (!pool->attrs->affn_strict &&
 	    !cpumask_test_cpu(p->wake_cpu, pool->attrs->__pod_cpumask)) {
-		struct work_struct *work = list_first_entry(&pool->worklist,
-						struct work_struct, entry);
-		p->wake_cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
-		get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+		cpu = cpumask_any_distribute(pool->attrs->__pod_cpumask);
+		if (cpu < nr_cpu_ids) {
+			struct work_struct *work = list_first_entry(&pool->worklist,
+							struct work_struct, entry);
+			p->wake_cpu = cpu;
+			get_work_pwq(work)->stats[PWQ_STAT_REPATRIATED]++;
+		}
 	}
 #endif
 	wake_up_process(p);
-- 
2.31.1

Re: [PATCH] workqueue: fix invalid cpu in kick_pool

Posted by kernel test robot 2 years, 2 months ago

Hi Yong,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tj-wq/for-next]
[also build test WARNING on linus/master v6.7-rc2 next-20231120]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Yong-He/workqueue-fix-invalid-cpu-in-kick_pool/20231120-201849
base:   https://git.kernel.org/pub/scm/linux/kernel/git/tj/wq.git for-next
patch link:    https://lore.kernel.org/r/20231120121623.119780-1-alexyonghe%40tencent.com
patch subject: [PATCH] workqueue: fix invalid cpu in kick_pool
config: arc-randconfig-001-20231121 (https://download.01.org/0day-ci/archive/20231121/202311210443.gdbUH0vN-lkp@intel.com/config)
compiler: arceb-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20231121/202311210443.gdbUH0vN-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202311210443.gdbUH0vN-lkp@intel.com/

All warnings (new ones prefixed by >>):

   kernel/workqueue.c: In function 'kick_pool':
>> kernel/workqueue.c:1109:13: warning: unused variable 'cpu' [-Wunused-variable]
    1109 |         int cpu;
         |             ^~~


vim +/cpu +1109 kernel/workqueue.c

  1097	
  1098	/**
  1099	 * kick_pool - wake up an idle worker if necessary
  1100	 * @pool: pool to kick
  1101	 *
  1102	 * @pool may have pending work items. Wake up worker if necessary. Returns
  1103	 * whether a worker was woken up.
  1104	 */
  1105	static bool kick_pool(struct worker_pool *pool)
  1106	{
  1107		struct worker *worker = first_idle_worker(pool);
  1108		struct task_struct *p;
> 1109		int cpu;
  1110	
  1111		lockdep_assert_held(&pool->lock);
  1112	
  1113		if (!need_more_worker(pool) || !worker)
  1114			return false;
  1115	
  1116		p = worker->task;
  1117	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

Re: [PATCH] workqueue: fix invalid cpu in kick_pool

Posted by Tejun Heo 2 years, 2 months ago

Hello,

On Mon, Nov 20, 2023 at 08:16:23PM +0800, Yong He wrote:
> With incorrect unbound workqueue configurations, this may introduce kernel
> panic, because cpumask_any_distribute() will not always return a valid cpu,
> such as one set the 'isolcpus' and 'workqueue.unbound_cpus' into the same
> cpuset, and this will make the @pool->attrs->__pod_cpumask an empty set,
> then trigger panic like this:

This shouldn't have happened. Can you share the configuration and the full
dmesg? Let's fix the problem at the source.

Thanks.

-- 
tejun

Re: [PATCH] workqueue: fix invalid cpu in kick_pool

Posted by zhuangel570 2 years, 2 months ago

Thanks, I have uploaded my configuration and console logs to the following
links, please check.

https://raw.githubusercontent.com/zhuangel/misc/main/debug/workqueue/console.log
https://raw.githubusercontent.com/zhuangel/misc/main/debug/workqueue/config-6.7.rc1
https://raw.githubusercontent.com/zhuangel/misc/main/debug/workqueue/config-4.18.0-348.el8.x86_64

The issue was first discovered in my BM machine and for ease of debugging,
I ran a virtual machine of the same case and reproduced it. My test virtual
machine was installed from centos 8.5.2111 DVD (origin kernel is 4.18.0-348)
and then the kernel was updated from the 6.7.rc1 source code. The virtual
machine ran on 4 CPU, 8G memory and some virtio devices.

My investigation show, when "workqueue.unbound_cpus" and "isolcpus" are
configured as same cpuset, this will make the "wq_unbound_cpumask" as an
empty set, when some idle work task try to set "wake_cpu" from
"cpumask_any_distribute", an invalid CPU will be set, then may trigger
panic.

To be honestly, I am not really known why there is a "not-present page"
exception, after I remove "workqueue.unbound_cpus" from command line or
apply this patch to the running kernel, the system could boot successfully.

On Tue, Nov 21, 2023 at 3:07 AM Tejun Heo <tj@kernel.org> wrote:
>
> Hello,
>
> On Mon, Nov 20, 2023 at 08:16:23PM +0800, Yong He wrote:
> > With incorrect unbound workqueue configurations, this may introduce kernel
> > panic, because cpumask_any_distribute() will not always return a valid cpu,
> > such as one set the 'isolcpus' and 'workqueue.unbound_cpus' into the same
> > cpuset, and this will make the @pool->attrs->__pod_cpumask an empty set,
> > then trigger panic like this:
>
> This shouldn't have happened. Can you share the configuration and the full
> dmesg? Let's fix the problem at the source.
>
> Thanks.
>
> --
> tejun

-- 
——————————
   zhuangel570
——————————

[PATCH] workqueue: Make sure that wq_unbound_cpumask is never empty

Posted by Tejun Heo 2 years, 2 months ago

During boot, depending on how the housekeeping and workqueue.unbound_cpus
masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
("workqueue: Implement non-strict affinity scope for unbound workqueues"),
this may end up feeding -1 as a CPU number into scheduler leading to oopses.

  BUG: unable to handle page fault for address: ffffffff8305e9c0
  #PF: supervisor read access in kernel mode
  #PF: error_code(0x0000) - not-present page
  ...
  Call Trace:
   <TASK>
   select_idle_sibling+0x79/0xaf0
   select_task_rq_fair+0x1cb/0x7b0
   try_to_wake_up+0x29c/0x5c0
   wake_up_process+0x19/0x20
   kick_pool+0x5e/0xb0
   __queue_work+0x119/0x430
   queue_work_on+0x29/0x30
  ...

An empty wq_unbound_cpumask is a clear misconfiguration and already
disallowed once system is booted up. Let's warn on and ignore
unbound_cpumask restrictions which lead to no unbound cpus. While at it,
also remove now unncessary empty check on wq_unbound_cpumask in
wq_select_unbound_cpu().

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Yong He <alexyonghe@tencent.com>
Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
Cc: stable@vger.kernel.org # v6.6+
---
Hello,

Yong He, zhuangel570, can you please verify that this patch makes the oops
go away? Waiman, this touches code that you've recently worked on. AFAICS,
they shouldn't interact or cause conflicts. cc'ing just in case.

Thanks.

 kernel/workqueue.c |   22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6e578f576a6f..0295291d54bc 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu)
 		pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
 	}
 
-	if (cpumask_empty(wq_unbound_cpumask))
-		return cpu;
-
 	new_cpu = __this_cpu_read(wq_rr_cpu_last);
 	new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
 	if (unlikely(new_cpu >= nr_cpu_ids)) {
@@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { }
 
 #endif	/* CONFIG_WQ_WATCHDOG */
 
+static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
+{
+	if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
+		pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
+			cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
+		return;
+	}
+
+	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
+}
+
 /**
  * workqueue_init_early - early init for workqueue subsystem
  *
@@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void)
 	BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
 
 	BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
-	cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
-	cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
-
+	cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
+	restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
+	restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
 	if (!cpumask_empty(&wq_cmdline_cpumask))
-		cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
+		restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
 
 	pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);

Re: [PATCH] workqueue: Make sure that wq_unbound_cpumask is never empty

Posted by Tejun Heo 2 years, 2 months ago

On Tue, Nov 21, 2023 at 11:39:36AM -1000, Tejun Heo wrote:
> During boot, depending on how the housekeeping and workqueue.unbound_cpus
> masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
> ("workqueue: Implement non-strict affinity scope for unbound workqueues"),
> this may end up feeding -1 as a CPU number into scheduler leading to oopses.
> 
>   BUG: unable to handle page fault for address: ffffffff8305e9c0
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   ...
>   Call Trace:
>    <TASK>
>    select_idle_sibling+0x79/0xaf0
>    select_task_rq_fair+0x1cb/0x7b0
>    try_to_wake_up+0x29c/0x5c0
>    wake_up_process+0x19/0x20
>    kick_pool+0x5e/0xb0
>    __queue_work+0x119/0x430
>    queue_work_on+0x29/0x30
>   ...
> 
> An empty wq_unbound_cpumask is a clear misconfiguration and already
> disallowed once system is booted up. Let's warn on and ignore
> unbound_cpumask restrictions which lead to no unbound cpus. While at it,
> also remove now unncessary empty check on wq_unbound_cpumask in
> wq_select_unbound_cpu().
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yong He <alexyonghe@tencent.com>
> Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
> Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
> Cc: stable@vger.kernel.org # v6.6+

Applied to wq/for-6.7-fixes.

Thanks.

-- 
tejun

Re: [PATCH] workqueue: Make sure that wq_unbound_cpumask is never empty

Posted by zhuangel570 2 years, 2 months ago

On Wed, Nov 22, 2023 at 5:39 AM Tejun Heo <tj@kernel.org> wrote:
>
> During boot, depending on how the housekeeping and workqueue.unbound_cpus
> masks are set, wq_unbound_cpumask can end up empty. Since 8639ecebc9b1
> ("workqueue: Implement non-strict affinity scope for unbound workqueues"),
> this may end up feeding -1 as a CPU number into scheduler leading to oopses.
>
>   BUG: unable to handle page fault for address: ffffffff8305e9c0
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   ...
>   Call Trace:
>    <TASK>
>    select_idle_sibling+0x79/0xaf0
>    select_task_rq_fair+0x1cb/0x7b0
>    try_to_wake_up+0x29c/0x5c0
>    wake_up_process+0x19/0x20
>    kick_pool+0x5e/0xb0
>    __queue_work+0x119/0x430
>    queue_work_on+0x29/0x30
>   ...
>
> An empty wq_unbound_cpumask is a clear misconfiguration and already
> disallowed once system is booted up. Let's warn on and ignore
> unbound_cpumask restrictions which lead to no unbound cpus. While at it,
> also remove now unncessary empty check on wq_unbound_cpumask in
> wq_select_unbound_cpu().
>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Yong He <alexyonghe@tencent.com>
> Link: http://lkml.kernel.org/r/20231120121623.119780-1-alexyonghe@tencent.com
> Fixes: 8639ecebc9b1 ("workqueue: Implement non-strict affinity scope for unbound workqueues")
> Cc: stable@vger.kernel.org # v6.6+
> ---
> Hello,
>
> Yong He, zhuangel570, can you please verify that this patch makes the oops
> go away? Waiman, this touches code that you've recently worked on. AFAICS,
> they shouldn't interact or cause conflicts. cc'ing just in case.

Sure.
I port this patch to my 6.7 branch, and the kernel could boot successfully on BM
and VM, with the same configurations, also I can see the new added warning, so
this patch solves the oops.

So, one last check, do you think we still need to check return value from
cpumask_any_distribute() to make sure kick_pool() set a correct wake_cpu?

Tested-by: Yong He <alexyonghe@tencent.com>

>
> Thanks.
>
>  kernel/workqueue.c |   22 +++++++++++++++-------
>  1 file changed, 15 insertions(+), 7 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 6e578f576a6f..0295291d54bc 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1684,9 +1684,6 @@ static int wq_select_unbound_cpu(int cpu)
>                 pr_warn_once("workqueue: round-robin CPU selection forced, expect performance impact\n");
>         }
>
> -       if (cpumask_empty(wq_unbound_cpumask))
> -               return cpu;
> -
>         new_cpu = __this_cpu_read(wq_rr_cpu_last);
>         new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
>         if (unlikely(new_cpu >= nr_cpu_ids)) {
> @@ -6515,6 +6512,17 @@ static inline void wq_watchdog_init(void) { }
>
>  #endif /* CONFIG_WQ_WATCHDOG */
>
> +static void __init restrict_unbound_cpumask(const char *name, const struct cpumask *mask)
> +{
> +       if (!cpumask_intersects(wq_unbound_cpumask, mask)) {
> +               pr_warn("workqueue: Restricting unbound_cpumask (%*pb) with %s (%*pb) leaves no CPU, ignoring\n",
> +                       cpumask_pr_args(wq_unbound_cpumask), name, cpumask_pr_args(mask));
> +               return;
> +       }
> +
> +       cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, mask);
> +}
> +
>  /**
>   * workqueue_init_early - early init for workqueue subsystem
>   *
> @@ -6534,11 +6542,11 @@ void __init workqueue_init_early(void)
>         BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
>
>         BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
> -       cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ));
> -       cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN));
> -
> +       cpumask_copy(wq_unbound_cpumask, cpu_possible_mask);
> +       restrict_unbound_cpumask("HK_TYPE_WQ", housekeeping_cpumask(HK_TYPE_WQ));
> +       restrict_unbound_cpumask("HK_TYPE_DOMAIN", housekeeping_cpumask(HK_TYPE_DOMAIN));
>         if (!cpumask_empty(&wq_cmdline_cpumask))
> -               cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, &wq_cmdline_cpumask);
> +               restrict_unbound_cpumask("workqueue.unbound_cpus", &wq_cmdline_cpumask);
>
>         pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
>