[PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements

Thomas Hellström posted 1 patch 1 month, 3 weeks ago
There is a newer version of this series
include/linux/ww_mutex.h       | 14 ++++++++++++++
kernel/locking/test-ww_mutex.c |  6 ++++--
2 files changed, 18 insertions(+), 2 deletions(-)
[PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by Thomas Hellström 1 month, 3 weeks ago
When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the
number of acquired lockdep_maps of mutexes of the same class, and also
keeps a pointer to the first acquired lockdep_map of a class. That pointer
is then used for various comparison-, printing- and checking purposes,
but there is no mechanism to actively ensure that lockdep_map stays in
memory. Instead, a warning is printed if the lockdep_map is freed and
there are still held locks of the same lock class, even if the lockdep_map
itself has been released.

In the context of WW/WD transactions that means that if a user unlocks
and frees a ww_mutex from within an ongoing ww transaction, and that
mutex happens to be the first ww_mutex grabbed in the transaction,
such a warning is printed and there might be a risk of a UAF.

Note that this is only problem when lockdep is enabled and affects only
dereferences of struct lockdep_map.

Adjust to this by adding a fake lockdep_map to the acquired context and
make sure it is the first acquired lockdep map of the associated
ww_mutex class. Then hold it for the duration of the WW/WD transaction.

This has the side effect that trying to lock a ww mutex *without* a
ww_acquire_context but where a such context has been acquire, we'd see
a lockdep splat. The test-ww_mutex.c selftest attempts to do that, so
modify that particular test to not acquire a ww_acquire_context if it
is not going to be used.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Will Deacon <will@kernel.org>
Cc: Waiman Long <longman@redhat.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Maarten Lankhorst <maarten@lankhorst.se>
Cc: Christian König <christian.koenig@amd.com>
Cc: dri-devel@lists.freedesktop.org
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
---
 include/linux/ww_mutex.h       | 14 ++++++++++++++
 kernel/locking/test-ww_mutex.c |  6 ++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index bb763085479a..a401a2f31a77 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -65,6 +65,16 @@ struct ww_acquire_ctx {
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
+	/**
+	 * @first_lock_dep_map: fake lockdep_map for first locked ww_mutex.
+	 *
+	 * lockdep requires the lockdep_map for the first locked ww_mutex
+	 * in a ww transaction to remain in memory until all ww_mutexes of
+	 * the transaction have been unlocked. Ensure this by keeping a
+	 * fake locked ww_mutex lockdep map between ww_acquire_init() and
+	 * ww_acquire_fini().
+	 */
+	struct lockdep_map first_lock_dep_map;
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	unsigned int deadlock_inject_interval;
@@ -146,7 +156,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
 	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
 	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
 			 &ww_class->acquire_key, 0);
+	lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name,
+			 &ww_class->mutex_key, 0);
 	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
+	mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_);
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	ctx->deadlock_inject_interval = 1;
@@ -185,6 +198,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+	mutex_release(&ctx->first_lock_dep_map, _THIS_IP_);
 	mutex_release(&ctx->dep_map, _THIS_IP_);
 #endif
 #ifdef DEBUG_WW_MUTEXES
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 10a5736a21c2..4c2b8b567de5 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags)
 	int ret;
 
 	ww_mutex_init(&mtx.mutex, &ww_class);
-	ww_acquire_init(&ctx, &ww_class);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_init(&ctx, &ww_class);
 
 	INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
 	init_completion(&mtx.ready);
@@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags)
 		ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
 	}
 	ww_mutex_unlock(&mtx.mutex);
-	ww_acquire_fini(&ctx);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_fini(&ctx);
 
 	if (ret) {
 		pr_err("%s(flags=%x): mutual exclusion failure\n",
-- 
2.46.0

Re: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by Maarten Lankhorst 1 month, 2 weeks ago
Hey,

I'm pretty sure I've seen this use-after-free in the wild, just never found the root cause since it's so unlikely to trigger on demand.

Acked-by: Maarten Lankhorst <maarten.lankhorst@linux.intel.com>

Den 2024-10-02 kl. 14:56, skrev Thomas Hellström:
> When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the
> number of acquired lockdep_maps of mutexes of the same class, and also
> keeps a pointer to the first acquired lockdep_map of a class. That pointer
> is then used for various comparison-, printing- and checking purposes,
> but there is no mechanism to actively ensure that lockdep_map stays in
> memory. Instead, a warning is printed if the lockdep_map is freed and
> there are still held locks of the same lock class, even if the lockdep_map
> itself has been released.
> 
> In the context of WW/WD transactions that means that if a user unlocks
> and frees a ww_mutex from within an ongoing ww transaction, and that
> mutex happens to be the first ww_mutex grabbed in the transaction,
> such a warning is printed and there might be a risk of a UAF.
> 
> Note that this is only problem when lockdep is enabled and affects only
> dereferences of struct lockdep_map.
> 
> Adjust to this by adding a fake lockdep_map to the acquired context and
> make sure it is the first acquired lockdep map of the associated
> ww_mutex class. Then hold it for the duration of the WW/WD transaction.
> 
> This has the side effect that trying to lock a ww mutex *without* a
> ww_acquire_context but where a such context has been acquire, we'd see
> a lockdep splat. The test-ww_mutex.c selftest attempts to do that, so
> modify that particular test to not acquire a ww_acquire_context if it
> is not going to be used.
> 
> Cc: Peter Zijlstra <peterz@infradead.org>
> Cc: Ingo Molnar <mingo@redhat.com>
> Cc: Will Deacon <will@kernel.org>
> Cc: Waiman Long <longman@redhat.com>
> Cc: Boqun Feng <boqun.feng@gmail.com>
> Cc: Maarten Lankhorst <maarten@lankhorst.se>
> Cc: Christian König <christian.koenig@amd.com>
> Cc: dri-devel@lists.freedesktop.org
> Cc: linux-kernel@vger.kernel.org
> Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
> ---
>  include/linux/ww_mutex.h       | 14 ++++++++++++++
>  kernel/locking/test-ww_mutex.c |  6 ++++--
>  2 files changed, 18 insertions(+), 2 deletions(-)
> 
> diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
> index bb763085479a..a401a2f31a77 100644
> --- a/include/linux/ww_mutex.h
> +++ b/include/linux/ww_mutex.h
> @@ -65,6 +65,16 @@ struct ww_acquire_ctx {
>  #endif
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
>  	struct lockdep_map dep_map;
> +	/**
> +	 * @first_lock_dep_map: fake lockdep_map for first locked ww_mutex.
> +	 *
> +	 * lockdep requires the lockdep_map for the first locked ww_mutex
> +	 * in a ww transaction to remain in memory until all ww_mutexes of
> +	 * the transaction have been unlocked. Ensure this by keeping a
> +	 * fake locked ww_mutex lockdep map between ww_acquire_init() and
> +	 * ww_acquire_fini().
> +	 */
> +	struct lockdep_map first_lock_dep_map;
>  #endif
>  #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
>  	unsigned int deadlock_inject_interval;
> @@ -146,7 +156,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
>  	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
>  	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
>  			 &ww_class->acquire_key, 0);
> +	lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name,
> +			 &ww_class->mutex_key, 0);
>  	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
> +	mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_);
>  #endif
>  #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
>  	ctx->deadlock_inject_interval = 1;
> @@ -185,6 +198,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
>  static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
>  {
>  #ifdef CONFIG_DEBUG_LOCK_ALLOC
> +	mutex_release(&ctx->first_lock_dep_map, _THIS_IP_);
>  	mutex_release(&ctx->dep_map, _THIS_IP_);
>  #endif
>  #ifdef DEBUG_WW_MUTEXES
> diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
> index 10a5736a21c2..4c2b8b567de5 100644
> --- a/kernel/locking/test-ww_mutex.c
> +++ b/kernel/locking/test-ww_mutex.c
> @@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags)
>  	int ret;
>  
>  	ww_mutex_init(&mtx.mutex, &ww_class);
> -	ww_acquire_init(&ctx, &ww_class);
> +	if (flags & TEST_MTX_CTX)
> +		ww_acquire_init(&ctx, &ww_class);
>  
>  	INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
>  	init_completion(&mtx.ready);
> @@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags)
>  		ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
>  	}
>  	ww_mutex_unlock(&mtx.mutex);
> -	ww_acquire_fini(&ctx);
> +	if (flags & TEST_MTX_CTX)
> +		ww_acquire_fini(&ctx);
>  
>  	if (ret) {
>  		pr_err("%s(flags=%x): mutual exclusion failure\n",

Re: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by kernel test robot 1 month, 2 weeks ago

Hello,

kernel test robot noticed "WARNING:at_kernel/locking/lockdep.c:#__lock_acquire" on:

commit: d417c66b8b12b5706c9df4ddf5367af540f195c6 ("[PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements")
url: https://github.com/intel-lab-lkp/linux/commits/Thomas-Hellstr-m/locking-ww_mutex-Adjust-to-lockdep-nest_lock-requirements/20241002-205818
base: https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git d00b83d416e73bc3fa4d21b14bec920e88b70ce6
patch link: https://lore.kernel.org/all/20241002125611.361001-1-thomas.hellstrom@linux.intel.com/
patch subject: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements

in testcase: kernel-selftests
version: kernel-selftests-x86_64-977d51cf-1_20240508
with following parameters:

	group: locking



compiler: gcc-12
test machine: 4 threads Intel(R) Xeon(R) CPU E3-1225 v5 @ 3.30GHz (Skylake) with 16G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202410091542.f6c4a438-oliver.sang@intel.com


[   63.327071][  T246] ------------[ cut here ]------------
[   63.332388][  T246] DEBUG_LOCKS_WARN_ON(hlock->references < references)
[ 63.332410][ T246] WARNING: CPU: 2 PID: 246 at kernel/locking/lockdep.c:5058 __lock_acquire (kernel/locking/lockdep.c:5058 (discriminator 9)) 
[   63.348622][  T246] Modules linked in: test_ww_mutex(+) openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 psample btrfs blake2b_generic xor zstd_compress raid6_pq libcrc32c intel_rapl_msr intel_rapl_common x86_pkg_temp_thermal intel_powerclamp sd_mod coretemp sg kvm_intel ipmi_devintf ipmi_msghandler i915 kvm binfmt_misc drm_buddy intel_gtt drm_display_helper crct10dif_pclmul crc32_pclmul crc32c_intel mei_wdt ghash_clmulni_intel ttm wmi_bmof sha512_ssse3 ahci rapl drm_kms_helper libahci video intel_cstate intel_uncore serio_raw libata mei_me i2c_i801 mei i2c_smbus intel_pch_thermal ie31200_edac wmi acpi_pad tpm_infineon loop fuse drm dm_mod ip_tables sch_fq_codel
[   63.409553][  T246] CPU: 2 PID: 246 Comm: kworker/u16:5 Tainted: G S                 6.10.0-04481-gd417c66b8b12 #1
[   63.419907][  T246] Hardware name: HP HP Z238 Microtower Workstation/8183, BIOS N51 Ver. 01.63 10/05/2017
[   63.429471][  T246] Workqueue: test-ww_mutex stress_inorder_work [test_ww_mutex]
[ 63.436889][ T246] RIP: 0010:__lock_acquire (kernel/locking/lockdep.c:5058 (discriminator 9)) 
[ 63.442284][ T246] Code: d2 0f 85 15 0c 00 00 44 8b 0d 7d df c7 04 45 85 c9 0f 85 d0 fe ff ff 48 c7 c6 c0 c1 2a 84 48 c7 c7 00 91 2a 84 e8 8d 39 e5 ff <0f> 0b e9 b6 fe ff ff 41 be 02 00 00 00 e9 11 f7 ff ff 31 db e9 bb
All code
========
   0:	d2 0f                	rorb   %cl,(%rdi)
   2:	85 15 0c 00 00 44    	test   %edx,0x4400000c(%rip)        # 0x44000014
   8:	8b 0d 7d df c7 04    	mov    0x4c7df7d(%rip),%ecx        # 0x4c7df8b
   e:	45 85 c9             	test   %r9d,%r9d
  11:	0f 85 d0 fe ff ff    	jne    0xfffffffffffffee7
  17:	48 c7 c6 c0 c1 2a 84 	mov    $0xffffffff842ac1c0,%rsi
  1e:	48 c7 c7 00 91 2a 84 	mov    $0xffffffff842a9100,%rdi
  25:	e8 8d 39 e5 ff       	callq  0xffffffffffe539b7
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	e9 b6 fe ff ff       	jmpq   0xfffffffffffffee7
  31:	41 be 02 00 00 00    	mov    $0x2,%r14d
  37:	e9 11 f7 ff ff       	jmpq   0xfffffffffffff74d
  3c:	31 db                	xor    %ebx,%ebx
  3e:	e9                   	.byte 0xe9
  3f:	bb                   	.byte 0xbb

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	e9 b6 fe ff ff       	jmpq   0xfffffffffffffebd
   7:	41 be 02 00 00 00    	mov    $0x2,%r14d
   d:	e9 11 f7 ff ff       	jmpq   0xfffffffffffff723
  12:	31 db                	xor    %ebx,%ebx
  14:	e9                   	.byte 0xe9
  15:	bb                   	.byte 0xbb
[   63.461748][  T246] RSP: 0018:ffffc9000141f7a8 EFLAGS: 00010086
[   63.467683][  T246] RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000027
[   63.475525][  T246] RDX: 0000000000000027 RSI: 0000000000000004 RDI: ffff8883b5330c48
[   63.483354][  T246] RBP: ffff888430688000 R08: 0000000000000001 R09: ffffed1076a66189
[   63.491172][  T246] R10: ffff8883b5330c4b R11: 0000000000000001 R12: ffff888430688f58
[   63.499015][  T246] R13: 00000000000000a0 R14: ffff88841d5b1f28 R15: 0000000000000000
[   63.506860][  T246] FS:  0000000000000000(0000) GS:ffff8883b5300000(0000) knlGS:0000000000000000
[   63.515659][  T246] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   63.522116][  T246] CR2: 00005566d67bd842 CR3: 000000043b67e002 CR4: 00000000003706f0
[   63.529942][  T246] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   63.537768][  T246] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[   63.545591][  T246] Call Trace:
[   63.548744][  T246]  <TASK>
[ 63.551537][ T246] ? __warn (kernel/panic.c:693) 
[ 63.555470][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058 (discriminator 9)) 
[ 63.560262][ T246] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
[ 63.564637][ T246] ? handle_bug (arch/x86/kernel/traps.c:239) 
[ 63.568838][ T246] ? exc_invalid_op (arch/x86/kernel/traps.c:260 (discriminator 1)) 
[ 63.573389][ T246] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:621) 
[ 63.578283][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058 (discriminator 9)) 
[ 63.583090][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058 (discriminator 9)) 
[ 63.587884][ T246] lock_acquire (kernel/locking/lockdep.c:466 kernel/locking/lockdep.c:5758 kernel/locking/lockdep.c:5721) 
[ 63.592243][ T246] ? stress_inorder_work (kernel/locking/test-ww_mutex.c:456) test_ww_mutex
[ 63.598787][ T246] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5724) 
[ 63.603676][ T246] ? __pfx_do_raw_spin_lock (kernel/locking/spinlock_debug.c:114) 
[ 63.608899][ T246] ? __pfx___might_resched (kernel/sched/core.c:8392) 
[ 63.614036][ T246] ? __ww_mutex_lock+0x94c/0x2b50 
[ 63.619954][ T246] __ww_mutex_lock+0x1f9/0x2b50 
[ 63.625696][ T246] ? stress_inorder_work (kernel/locking/test-ww_mutex.c:456) test_ww_mutex
[ 63.632218][ T246] ? stress_inorder_work (kernel/locking/test-ww_mutex.c:456) test_ww_mutex
[ 63.638750][ T246] ? __pfx___ww_mutex_lock+0x10/0x10 
[ 63.644940][ T246] ? __mutex_unlock_slowpath (arch/x86/include/asm/atomic64_64.h:101 include/linux/atomic/atomic-arch-fallback.h:4329 include/linux/atomic/atomic-long.h:1506 include/linux/atomic/atomic-instrumented.h:4481 kernel/locking/mutex.c:929) 
[ 63.650454][ T246] ? lock_is_held_type (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
[ 63.655348][ T246] ? __pfx___might_resched (kernel/sched/core.c:8392) 
[ 63.660502][ T246] ? ww_mutex_lock (kernel/locking/mutex.c:878) 
[ 63.665047][ T246] ww_mutex_lock (kernel/locking/mutex.c:878) 
[ 63.669432][ T246] stress_inorder_work (kernel/locking/test-ww_mutex.c:456) test_ww_mutex
[ 63.675813][ T246] ? __pfx_stress_inorder_work (kernel/locking/test-ww_mutex.c:434) test_ww_mutex
[ 63.682709][ T246] ? lock_is_held_type (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
[ 63.687600][ T246] process_one_work (kernel/workqueue.c:3236) 
[ 63.692398][ T246] ? __pfx_lock_acquire (kernel/locking/lockdep.c:5724) 
[ 63.697307][ T246] ? __pfx_process_one_work (kernel/workqueue.c:3133) 
[ 63.702545][ T246] ? assign_work (kernel/workqueue.c:1202) 
[ 63.707003][ T246] ? lock_is_held_type (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
[ 63.711899][ T246] worker_thread (kernel/workqueue.c:3306 kernel/workqueue.c:3390) 
[ 63.716353][ T246] ? __pfx_worker_thread (kernel/workqueue.c:3339) 
[ 63.721334][ T246] kthread (kernel/kthread.c:389) 
[ 63.725270][ T246] ? __pfx_kthread (kernel/kthread.c:342) 
[ 63.729732][ T246] ret_from_fork (arch/x86/kernel/process.c:153) 
[ 63.734019][ T246] ? __pfx_kthread (kernel/kthread.c:342) 
[ 63.738485][ T246] ret_from_fork_asm (arch/x86/entry/entry_64.S:257) 
[   63.743119][  T246]  </TASK>
[   63.746012][  T246] irq event stamp: 10527
[ 63.750118][ T246] hardirqs last enabled at (10527): finish_task_switch+0x1b6/0x950 
[ 63.760389][ T246] hardirqs last disabled at (10526): __schedule (kernel/sched/core.c:6416 (discriminator 1)) 
[ 63.769458][ T246] softirqs last enabled at (10478): handle_softirqs (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:401 kernel/softirq.c:582) 
[ 63.778864][ T246] softirqs last disabled at (10473): __irq_exit_rcu (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637) 
[   63.788185][  T246] ---[ end trace 0000000000000000 ]---



The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20241009/202410091542.f6c4a438-oliver.sang@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by Thomas Hellström 1 month, 2 weeks ago
On Wed, 2024-10-09 at 15:42 +0800, kernel test robot wrote:
> 
> 
> Hello,
> 
> kernel test robot noticed
> "WARNING:at_kernel/locking/lockdep.c:#__lock_acquire" on:
> 
> commit: d417c66b8b12b5706c9df4ddf5367af540f195c6 ("[PATCH RESEND]
> locking/ww_mutex: Adjust to lockdep nest_lock requirements")
> url:

This is weird. This is an overflow warning indicating that the number
of locks held overflows the 12-bit unsigned hlock->references.

I don't see it on local testing where hlock->references exactly maxes
out with 2048 which is the expected value from 2047 locks in the test
plus one extra dummy lock from the patch.

OTOH there might be a reason why the number of locks originally was set
to 2047, and why we don't see more instances of this error. Could it be
gcc-12 packing problem perhaps.

Anyway if this is indeed the problem, it should suffice to lower the
number of locks in the selftest to 2046...

I'll send out an additional version of the patch to do just that.

/Thomas


> https://github.com/intel-lab-lkp/linux/commits/Thomas-Hellstr-m/locking-ww_mutex-Adjust-to-lockdep-nest_lock-requirements/20241002-205818
> base:
> https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git d00b83d416e7
> 3bc3fa4d21b14bec920e88b70ce6
> patch link:
> https://lore.kernel.org/all/20241002125611.361001-1-thomas.hellstrom@linux.intel.com/
> patch subject: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep
> nest_lock requirements
> 
> in testcase: kernel-selftests
> version: kernel-selftests-x86_64-977d51cf-1_20240508
> with following parameters:
> 
> 	group: locking
> 
> 
> 
> compiler: gcc-12
> test machine: 4 threads Intel(R) Xeon(R) CPU E3-1225 v5 @ 3.30GHz
> (Skylake) with 16G memory
> 
> (please refer to attached dmesg/kmsg for entire log/backtrace)
> 
> 
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new
> version of
> the same patch/commit), kindly add following tags
> > Reported-by: kernel test robot <oliver.sang@intel.com>
> > Closes:
> > https://lore.kernel.org/oe-lkp/202410091542.f6c4a438-oliver.sang@intel.com
> 
> 
> [   63.327071][  T246] ------------[ cut here ]------------
> [   63.332388][  T246] DEBUG_LOCKS_WARN_ON(hlock->references <
> references)
> [ 63.332410][ T246] WARNING: CPU: 2 PID: 246 at
> kernel/locking/lockdep.c:5058 __lock_acquire
> (kernel/locking/lockdep.c:5058 (discriminator 9)) 
> [   63.348622][  T246] Modules linked in: test_ww_mutex(+)
> openvswitch nf_conncount nf_nat nf_conntrack nf_defrag_ipv6
> nf_defrag_ipv4 psample btrfs blake2b_generic xor zstd_compress
> raid6_pq libcrc32c intel_rapl_msr intel_rapl_common
> x86_pkg_temp_thermal intel_powerclamp sd_mod coretemp sg kvm_intel
> ipmi_devintf ipmi_msghandler i915 kvm binfmt_misc drm_buddy intel_gtt
> drm_display_helper crct10dif_pclmul crc32_pclmul crc32c_intel mei_wdt
> ghash_clmulni_intel ttm wmi_bmof sha512_ssse3 ahci rapl
> drm_kms_helper libahci video intel_cstate intel_uncore serio_raw
> libata mei_me i2c_i801 mei i2c_smbus intel_pch_thermal ie31200_edac
> wmi acpi_pad tpm_infineon loop fuse drm dm_mod ip_tables sch_fq_codel
> [   63.409553][  T246] CPU: 2 PID: 246 Comm: kworker/u16:5 Tainted: G
> S                 6.10.0-04481-gd417c66b8b12 #1
> [   63.419907][  T246] Hardware name: HP HP Z238 Microtower
> Workstation/8183, BIOS N51 Ver. 01.63 10/05/2017
> [   63.429471][  T246] Workqueue: test-ww_mutex stress_inorder_work
> [test_ww_mutex]
> [ 63.436889][ T246] RIP: 0010:__lock_acquire
> (kernel/locking/lockdep.c:5058 (discriminator 9)) 
> [ 63.442284][ T246] Code: d2 0f 85 15 0c 00 00 44 8b 0d 7d df c7 04
> 45 85 c9 0f 85 d0 fe ff ff 48 c7 c6 c0 c1 2a 84 48 c7 c7 00 91 2a 84
> e8 8d 39 e5 ff <0f> 0b e9 b6 fe ff ff 41 be 02 00 00 00 e9 11 f7 ff
> ff 31 db e9 bb
> All code
> ========
>    0:	d2 0f                	rorb   %cl,(%rdi)
>    2:	85 15 0c 00 00 44    	test   %edx,0x4400000c(%rip)       
> # 0x44000014
>    8:	8b 0d 7d df c7 04    	mov    0x4c7df7d(%rip),%ecx        #
> 0x4c7df8b
>    e:	45 85 c9             	test   %r9d,%r9d
>   11:	0f 85 d0 fe ff ff    	jne    0xfffffffffffffee7
>   17:	48 c7 c6 c0 c1 2a 84 	mov    $0xffffffff842ac1c0,%rsi
>   1e:	48 c7 c7 00 91 2a 84 	mov    $0xffffffff842a9100,%rdi
>   25:	e8 8d 39 e5 ff       	callq  0xffffffffffe539b7
>   2a:*	0f 0b                	ud2    		<-- trapping
> instruction
>   2c:	e9 b6 fe ff ff       	jmpq   0xfffffffffffffee7
>   31:	41 be 02 00 00 00    	mov    $0x2,%r14d
>   37:	e9 11 f7 ff ff       	jmpq   0xfffffffffffff74d
>   3c:	31 db                	xor    %ebx,%ebx
>   3e:	e9                   	.byte 0xe9
>   3f:	bb                   	.byte 0xbb
> 
> Code starting with the faulting instruction
> ===========================================
>    0:	0f 0b                	ud2    
>    2:	e9 b6 fe ff ff       	jmpq   0xfffffffffffffebd
>    7:	41 be 02 00 00 00    	mov    $0x2,%r14d
>    d:	e9 11 f7 ff ff       	jmpq   0xfffffffffffff723
>   12:	31 db                	xor    %ebx,%ebx
>   14:	e9                   	.byte 0xe9
>   15:	bb                   	.byte 0xbb
> [   63.461748][  T246] RSP: 0018:ffffc9000141f7a8 EFLAGS: 00010086
> [   63.467683][  T246] RAX: 0000000000000000 RBX: 0000000000000001
> RCX: 0000000000000027
> [   63.475525][  T246] RDX: 0000000000000027 RSI: 0000000000000004
> RDI: ffff8883b5330c48
> [   63.483354][  T246] RBP: ffff888430688000 R08: 0000000000000001
> R09: ffffed1076a66189
> [   63.491172][  T246] R10: ffff8883b5330c4b R11: 0000000000000001
> R12: ffff888430688f58
> [   63.499015][  T246] R13: 00000000000000a0 R14: ffff88841d5b1f28
> R15: 0000000000000000
> [   63.506860][  T246] FS:  0000000000000000(0000)
> GS:ffff8883b5300000(0000) knlGS:0000000000000000
> [   63.515659][  T246] CS:  0010 DS: 0000 ES: 0000 CR0:
> 0000000080050033
> [   63.522116][  T246] CR2: 00005566d67bd842 CR3: 000000043b67e002
> CR4: 00000000003706f0
> [   63.529942][  T246] DR0: 0000000000000000 DR1: 0000000000000000
> DR2: 0000000000000000
> [   63.537768][  T246] DR3: 0000000000000000 DR6: 00000000fffe0ff0
> DR7: 0000000000000400
> [   63.545591][  T246] Call Trace:
> [   63.548744][  T246]  <TASK>
> [ 63.551537][ T246] ? __warn (kernel/panic.c:693) 
> [ 63.555470][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058
> (discriminator 9)) 
> [ 63.560262][ T246] ? report_bug (lib/bug.c:180 lib/bug.c:219) 
> [ 63.564637][ T246] ? handle_bug (arch/x86/kernel/traps.c:239) 
> [ 63.568838][ T246] ? exc_invalid_op (arch/x86/kernel/traps.c:260
> (discriminator 1)) 
> [ 63.573389][ T246] ? asm_exc_invalid_op
> (arch/x86/include/asm/idtentry.h:621) 
> [ 63.578283][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058
> (discriminator 9)) 
> [ 63.583090][ T246] ? __lock_acquire (kernel/locking/lockdep.c:5058
> (discriminator 9)) 
> [ 63.587884][ T246] lock_acquire (kernel/locking/lockdep.c:466
> kernel/locking/lockdep.c:5758 kernel/locking/lockdep.c:5721) 
> [ 63.592243][ T246] ? stress_inorder_work (kernel/locking/test-
> ww_mutex.c:456) test_ww_mutex
> [ 63.598787][ T246] ? __pfx_lock_acquire
> (kernel/locking/lockdep.c:5724) 
> [ 63.603676][ T246] ? __pfx_do_raw_spin_lock
> (kernel/locking/spinlock_debug.c:114) 
> [ 63.608899][ T246] ? __pfx___might_resched
> (kernel/sched/core.c:8392) 
> [ 63.614036][ T246] ? __ww_mutex_lock+0x94c/0x2b50 
> [ 63.619954][ T246] __ww_mutex_lock+0x1f9/0x2b50 
> [ 63.625696][ T246] ? stress_inorder_work (kernel/locking/test-
> ww_mutex.c:456) test_ww_mutex
> [ 63.632218][ T246] ? stress_inorder_work (kernel/locking/test-
> ww_mutex.c:456) test_ww_mutex
> [ 63.638750][ T246] ? __pfx___ww_mutex_lock+0x10/0x10 
> [ 63.644940][ T246] ? __mutex_unlock_slowpath
> (arch/x86/include/asm/atomic64_64.h:101 include/linux/atomic/atomic-
> arch-fallback.h:4329 include/linux/atomic/atomic-long.h:1506
> include/linux/atomic/atomic-instrumented.h:4481
> kernel/locking/mutex.c:929) 
> [ 63.650454][ T246] ? lock_is_held_type
> (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
> [ 63.655348][ T246] ? __pfx___might_resched
> (kernel/sched/core.c:8392) 
> [ 63.660502][ T246] ? ww_mutex_lock (kernel/locking/mutex.c:878) 
> [ 63.665047][ T246] ww_mutex_lock (kernel/locking/mutex.c:878) 
> [ 63.669432][ T246] stress_inorder_work (kernel/locking/test-
> ww_mutex.c:456) test_ww_mutex
> [ 63.675813][ T246] ? __pfx_stress_inorder_work (kernel/locking/test-
> ww_mutex.c:434) test_ww_mutex
> [ 63.682709][ T246] ? lock_is_held_type
> (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
> [ 63.687600][ T246] process_one_work (kernel/workqueue.c:3236) 
> [ 63.692398][ T246] ? __pfx_lock_acquire
> (kernel/locking/lockdep.c:5724) 
> [ 63.697307][ T246] ? __pfx_process_one_work
> (kernel/workqueue.c:3133) 
> [ 63.702545][ T246] ? assign_work (kernel/workqueue.c:1202) 
> [ 63.707003][ T246] ? lock_is_held_type
> (kernel/locking/lockdep.c:5497 kernel/locking/lockdep.c:5827) 
> [ 63.711899][ T246] worker_thread (kernel/workqueue.c:3306
> kernel/workqueue.c:3390) 
> [ 63.716353][ T246] ? __pfx_worker_thread (kernel/workqueue.c:3339) 
> [ 63.721334][ T246] kthread (kernel/kthread.c:389) 
> [ 63.725270][ T246] ? __pfx_kthread (kernel/kthread.c:342) 
> [ 63.729732][ T246] ret_from_fork (arch/x86/kernel/process.c:153) 
> [ 63.734019][ T246] ? __pfx_kthread (kernel/kthread.c:342) 
> [ 63.738485][ T246] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
> [   63.743119][  T246]  </TASK>
> [   63.746012][  T246] irq event stamp: 10527
> [ 63.750118][ T246] hardirqs last enabled at (10527):
> finish_task_switch+0x1b6/0x950 
> [ 63.760389][ T246] hardirqs last disabled at (10526): __schedule
> (kernel/sched/core.c:6416 (discriminator 1)) 
> [ 63.769458][ T246] softirqs last enabled at (10478): handle_softirqs
> (arch/x86/include/asm/preempt.h:26 kernel/softirq.c:401
> kernel/softirq.c:582) 
> [ 63.778864][ T246] softirqs last disabled at (10473): __irq_exit_rcu
> (kernel/softirq.c:589 kernel/softirq.c:428 kernel/softirq.c:637) 
> [   63.788185][  T246] ---[ end trace 0000000000000000 ]---
> 
> 
> 
> The kernel config and materials to reproduce are available at:
> https://download.01.org/0day-ci/archive/20241009/202410091542.f6c4a438-oliver.sang@intel.com
> 
> 
> 
Re: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by Peter Zijlstra 1 month, 3 weeks ago
On Wed, Oct 02, 2024 at 02:56:11PM +0200, Thomas Hellström wrote:
> When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the
> number of acquired lockdep_maps of mutexes of the same class, and also
> keeps a pointer to the first acquired lockdep_map of a class. That pointer
> is then used for various comparison-, printing- and checking purposes,
> but there is no mechanism to actively ensure that lockdep_map stays in
> memory. Instead, a warning is printed if the lockdep_map is freed and
> there are still held locks of the same lock class, even if the lockdep_map
> itself has been released.
> 
> In the context of WW/WD transactions that means that if a user unlocks
> and frees a ww_mutex from within an ongoing ww transaction, and that
> mutex happens to be the first ww_mutex grabbed in the transaction,
> such a warning is printed and there might be a risk of a UAF.

I'm assuming you actually hit this?

Anyway, work around seems sane enough, thanks!
Re: [PATCH RESEND] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by Thomas Hellström 1 month, 3 weeks ago
On Fri, 2024-10-04 at 12:16 +0200, Peter Zijlstra wrote:
> On Wed, Oct 02, 2024 at 02:56:11PM +0200, Thomas Hellström wrote:
> > When using mutex_acquire_nest() with a nest_lock, lockdep refcounts
> > the
> > number of acquired lockdep_maps of mutexes of the same class, and
> > also
> > keeps a pointer to the first acquired lockdep_map of a class. That
> > pointer
> > is then used for various comparison-, printing- and checking
> > purposes,
> > but there is no mechanism to actively ensure that lockdep_map stays
> > in
> > memory. Instead, a warning is printed if the lockdep_map is freed
> > and
> > there are still held locks of the same lock class, even if the
> > lockdep_map
> > itself has been released.
> > 
> > In the context of WW/WD transactions that means that if a user
> > unlocks
> > and frees a ww_mutex from within an ongoing ww transaction, and
> > that
> > mutex happens to be the first ww_mutex grabbed in the transaction,
> > such a warning is printed and there might be a risk of a UAF.
> 
> I'm assuming you actually hit this?

Yes, but there was a change merged to drm_exec, a main ww_mutex user
that makes it less likely to happen. (Unlocking in reverse unless the
user explicitly requests an unlock which will be a more common use-case
moving forward).

> 
> Anyway, work around seems sane enough, thanks!

Thanks,
Thomas
[tip: locking/core] locking/ww_mutex: Adjust to lockdep nest_lock requirements
Posted by tip-bot2 for Thomas Hellström 1 month, 2 weeks ago
The following commit has been merged into the locking/core branch of tip:

Commit-ID:     59f812dd6d95bed605c7265bf193ec8155cf6bcc
Gitweb:        https://git.kernel.org/tip/59f812dd6d95bed605c7265bf193ec8155cf6bcc
Author:        Thomas Hellström <thomas.hellstrom@linux.intel.com>
AuthorDate:    Wed, 02 Oct 2024 14:56:11 +02:00
Committer:     Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 07 Oct 2024 09:28:35 +02:00

locking/ww_mutex: Adjust to lockdep nest_lock requirements

When using mutex_acquire_nest() with a nest_lock, lockdep refcounts the
number of acquired lockdep_maps of mutexes of the same class, and also
keeps a pointer to the first acquired lockdep_map of a class. That pointer
is then used for various comparison-, printing- and checking purposes,
but there is no mechanism to actively ensure that lockdep_map stays in
memory. Instead, a warning is printed if the lockdep_map is freed and
there are still held locks of the same lock class, even if the lockdep_map
itself has been released.

In the context of WW/WD transactions that means that if a user unlocks
and frees a ww_mutex from within an ongoing ww transaction, and that
mutex happens to be the first ww_mutex grabbed in the transaction,
such a warning is printed and there might be a risk of a UAF.

Note that this is only problem when lockdep is enabled and affects only
dereferences of struct lockdep_map.

Adjust to this by adding a fake lockdep_map to the acquired context and
make sure it is the first acquired lockdep map of the associated
ww_mutex class. Then hold it for the duration of the WW/WD transaction.

This has the side effect that trying to lock a ww mutex *without* a
ww_acquire_context but where a such context has been acquire, we'd see
a lockdep splat. The test-ww_mutex.c selftest attempts to do that, so
modify that particular test to not acquire a ww_acquire_context if it
is not going to be used.

Signed-off-by: Thomas Hellström <thomas.hellstrom@linux.intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20241002125611.361001-1-thomas.hellstrom@linux.intel.com
---
 include/linux/ww_mutex.h       | 14 ++++++++++++++
 kernel/locking/test-ww_mutex.c |  6 ++++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h
index bb76308..a401a2f 100644
--- a/include/linux/ww_mutex.h
+++ b/include/linux/ww_mutex.h
@@ -65,6 +65,16 @@ struct ww_acquire_ctx {
 #endif
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 	struct lockdep_map dep_map;
+	/**
+	 * @first_lock_dep_map: fake lockdep_map for first locked ww_mutex.
+	 *
+	 * lockdep requires the lockdep_map for the first locked ww_mutex
+	 * in a ww transaction to remain in memory until all ww_mutexes of
+	 * the transaction have been unlocked. Ensure this by keeping a
+	 * fake locked ww_mutex lockdep map between ww_acquire_init() and
+	 * ww_acquire_fini().
+	 */
+	struct lockdep_map first_lock_dep_map;
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	unsigned int deadlock_inject_interval;
@@ -146,7 +156,10 @@ static inline void ww_acquire_init(struct ww_acquire_ctx *ctx,
 	debug_check_no_locks_freed((void *)ctx, sizeof(*ctx));
 	lockdep_init_map(&ctx->dep_map, ww_class->acquire_name,
 			 &ww_class->acquire_key, 0);
+	lockdep_init_map(&ctx->first_lock_dep_map, ww_class->mutex_name,
+			 &ww_class->mutex_key, 0);
 	mutex_acquire(&ctx->dep_map, 0, 0, _RET_IP_);
+	mutex_acquire_nest(&ctx->first_lock_dep_map, 0, 0, &ctx->dep_map, _RET_IP_);
 #endif
 #ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH
 	ctx->deadlock_inject_interval = 1;
@@ -185,6 +198,7 @@ static inline void ww_acquire_done(struct ww_acquire_ctx *ctx)
 static inline void ww_acquire_fini(struct ww_acquire_ctx *ctx)
 {
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
+	mutex_release(&ctx->first_lock_dep_map, _THIS_IP_);
 	mutex_release(&ctx->dep_map, _THIS_IP_);
 #endif
 #ifdef DEBUG_WW_MUTEXES
diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c
index 10a5736..4c2b8b5 100644
--- a/kernel/locking/test-ww_mutex.c
+++ b/kernel/locking/test-ww_mutex.c
@@ -62,7 +62,8 @@ static int __test_mutex(unsigned int flags)
 	int ret;
 
 	ww_mutex_init(&mtx.mutex, &ww_class);
-	ww_acquire_init(&ctx, &ww_class);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_init(&ctx, &ww_class);
 
 	INIT_WORK_ONSTACK(&mtx.work, test_mutex_work);
 	init_completion(&mtx.ready);
@@ -90,7 +91,8 @@ static int __test_mutex(unsigned int flags)
 		ret = wait_for_completion_timeout(&mtx.done, TIMEOUT);
 	}
 	ww_mutex_unlock(&mtx.mutex);
-	ww_acquire_fini(&ctx);
+	if (flags & TEST_MTX_CTX)
+		ww_acquire_fini(&ctx);
 
 	if (ret) {
 		pr_err("%s(flags=%x): mutual exclusion failure\n",