[PATCH v2 2/3] futex: Create set_robust_list2

Posted by André Almeida 3 weeks, 2 days ago
Create a new set_robust_list2() syscall. The current set_robust_list()
syscall can't be expanded to cover the following use cases, so a new one
is needed. This new syscall allows users to set multiple robust lists
per process and to have either 32bit or 64bit pointers in the list.

* Interface

This is the proposed interface:

	long set_robust_list2(void *head, int index, unsigned int flags)

`head` is the head of the userspace struct robust_list_head, just as in
the old set_robust_list(). It needs to be a void pointer since it can
point to either a normal robust_list_head or a compat_robust_list_head.

`flags` can be used for defining the list type:

	enum robust_list2_type {
		ROBUST_LIST_32BIT,
		ROBUST_LIST_64BIT,
	};

`index` is the index in the kernel's internal linked list of robust
lists (the naming starts to get confusing, I reckon). If `index == -1`,
the user wants to set a new robust_list, and the kernel will append it
at the end of the list, assign a new index and return that index to the
user. If `index >= 0`, the user wants to re-set the `*head` of an
already existing list (similarly to what happens when you call
set_robust_list() twice with different `*head` values).

If `index` is out of range, or points to a non-existent robust_list, or
if the internal list is full, an error is returned.

Users cannot remove lists.
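
For illustration, here is a minimal userspace sketch exercising both
modes (register a new list, then re-point it). This is only an example,
not part of the series: there is no libc wrapper, so the syscall is
invoked directly, and the syscall number (463) and flag values mirror
the asm-generic uapi changes in this patch, so they may differ on other
architectures.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/futex.h>

	/* Values mirror the uapi additions in this patch (asm-generic). */
	#ifndef __NR_set_robust_list2
	#define __NR_set_robust_list2	463
	#endif
	#ifndef ROBUST_LIST_64BIT
	#define ROBUST_LIST_32BIT	0
	#define ROBUST_LIST_64BIT	1
	#endif

	static struct robust_list_head head_a, head_b;

	int main(void)
	{
		long idx;

		/* An empty robust list points back to itself */
		head_a.list.next = &head_a.list;
		head_b.list.next = &head_b.list;

		/* index == -1: register a new (64bit) list, the kernel returns its index */
		idx = syscall(__NR_set_robust_list2, &head_a, -1, ROBUST_LIST_64BIT);
		if (idx < 0) {
			perror("set_robust_list2");
			return 1;
		}

		/* index >= 0: re-set *head of the list registered above */
		if (syscall(__NR_set_robust_list2, &head_b, (int)idx, ROBUST_LIST_64BIT) < 0) {
			perror("set_robust_list2 (re-set)");
			return 1;
		}

		printf("robust list registered at index %ld\n", idx);
		return 0;
	}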

Signed-off-by: André Almeida <andrealmeid@igalia.com>
---
 include/linux/futex.h             |  1 +
 include/linux/sched.h             |  1 +
 include/uapi/asm-generic/unistd.h |  5 +-
 include/uapi/linux/futex.h        | 24 +++++++++
 init/init_task.c                  |  3 ++
 kernel/futex/core.c               | 85 ++++++++++++++++++++++++++++---
 kernel/futex/futex.h              |  3 ++
 kernel/futex/syscalls.c           | 36 +++++++++++++
 8 files changed, 149 insertions(+), 9 deletions(-)

diff --git a/include/linux/futex.h b/include/linux/futex.h
index 8217b5ebdd9c..997fe0013bc0 100644
--- a/include/linux/futex.h
+++ b/include/linux/futex.h
@@ -76,6 +76,7 @@ static inline void futex_init_task(struct task_struct *tsk)
 #ifdef CONFIG_COMPAT
 	tsk->compat_robust_list = NULL;
 #endif
+	INIT_LIST_HEAD(&tsk->robust_list2);
 	INIT_LIST_HEAD(&tsk->pi_state_list);
 	tsk->pi_state_cache = NULL;
 	tsk->futex_state = FUTEX_STATE_OK;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8f20b703557d..4a2455f1b07c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1284,6 +1284,7 @@ struct task_struct {
 #ifdef CONFIG_COMPAT
 	struct robust_list_head32 __user *compat_robust_list;
 #endif
+	struct list_head		robust_list2;
 	struct list_head		pi_state_list;
 	struct futex_pi_state		*pi_state_cache;
 	struct mutex			futex_exit_mutex;
diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h
index 5bf6148cac2b..c1f5c9635c07 100644
--- a/include/uapi/asm-generic/unistd.h
+++ b/include/uapi/asm-generic/unistd.h
@@ -841,8 +841,11 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules)
 #define __NR_mseal 462
 __SYSCALL(__NR_mseal, sys_mseal)
 
+#define __NR_set_robust_list2 463
+__SYSCALL(__NR_set_robust_list2, sys_set_robust_list2)
+
 #undef __NR_syscalls
-#define __NR_syscalls 463
+#define __NR_syscalls 464
 
 /*
  * 32 bit systems traditionally used different
diff --git a/include/uapi/linux/futex.h b/include/uapi/linux/futex.h
index d2ee625ea189..13903a278b71 100644
--- a/include/uapi/linux/futex.h
+++ b/include/uapi/linux/futex.h
@@ -146,6 +146,30 @@ struct robust_list_head {
 	struct robust_list __user *list_op_pending;
 };
 
+#define ROBUST_LISTS_PER_TASK 10
+
+enum robust_list2_type {
+	ROBUST_LIST_32BIT,
+	ROBUST_LIST_64BIT,
+};
+
+#define ROBUST_LIST_TYPE_MASK (ROBUST_LIST_32BIT | ROBUST_LIST_64BIT)
+
+/*
+ * This is an entry of a linked list of robust lists.
+ *
+ * @head: can point to a 64bit list or a 32bit list
+ * @list_type: determines the size of the futex pointers in the list
+ * @index: the index of this entry in the list
+ * @list: linked list element
+ */
+struct robust_list2_entry {
+	void __user *head;
+	enum robust_list2_type list_type;
+	unsigned int index;
+	struct list_head list;
+};
+
 /*
  * Are there any waiters for this robust futex:
  */
diff --git a/init/init_task.c b/init/init_task.c
index 136a8231355a..1b08e745c47d 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -219,6 +219,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = {
 #ifdef CONFIG_SECCOMP_FILTER
 	.seccomp	= { .filter_count = ATOMIC_INIT(0) },
 #endif
+#ifdef CONFIG_FUTEX
+	.robust_list2 = LIST_HEAD_INIT(init_task.robust_list2),
+#endif
 };
 EXPORT_SYMBOL(init_task);
 
diff --git a/kernel/futex/core.c b/kernel/futex/core.c
index bcd0e2a7ba65..f74476d0bcc1 100644
--- a/kernel/futex/core.c
+++ b/kernel/futex/core.c
@@ -797,9 +797,9 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-static void exit_robust_list64(struct task_struct *curr)
+static void exit_robust_list64(struct task_struct *curr,
+			       struct robust_list_head __user *head)
 {
-	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
@@ -859,7 +859,8 @@ static void exit_robust_list64(struct task_struct *curr)
 	}
 }
 #else
-static void exit_robust_list64(struct task_struct *curr)
+static void exit_robust_list64(struct task_struct *curr,
+			      struct robust_list_head __user *head)
 {
 	pr_warn("32bit kernel should not allow ROBUST_LIST_64BIT");
 	return;
@@ -897,9 +898,9 @@ fetch_robust_entry32(u32 *uentry, struct robust_list __user **entry,
  *
  * We silently return on any sign of list-walking problem.
  */
-static void exit_robust_list32(struct task_struct *curr)
+static void exit_robust_list32(struct task_struct *curr,
+			       struct robust_list_head32 __user *head)
 {
-	struct robust_list_head32 __user *head = curr->compat_robust_list;
 	struct robust_list __user *entry, *next_entry, *pending;
 	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned int next_pi;
@@ -965,6 +966,54 @@ static void exit_robust_list32(struct task_struct *curr)
 	}
 }
 
+long do_set_robust_list2(struct robust_list_head __user *head,
+			 int index, unsigned int type)
+{
+	struct list_head *list2 = &current->robust_list2;
+	struct robust_list2_entry *prev, *new = NULL;
+
+	if (index == -1) {
+		if (list_empty(list2)) {
+			index = 0;
+		} else {
+			prev = list_last_entry(list2, struct robust_list2_entry, list);
+			index = prev->index + 1;
+		}
+
+		if (index >= ROBUST_LISTS_PER_TASK)
+			return -EINVAL;
+
+		new = kmalloc(sizeof(struct robust_list2_entry), GFP_KERNEL);
+		if (!new)
+			return -ENOMEM;
+
+		list_add_tail(&new->list, list2);
+		new->index = index;
+
+	} else if (index >= 0) {
+		struct robust_list2_entry *curr;
+
+		if (list_empty(list2))
+			return -ENOENT;
+
+		list_for_each_entry(curr, list2, list) {
+			if (index == curr->index) {
+				new = curr;
+				break;
+			}
+		}
+
+		if (!new)
+			return -ENOENT;
+	}
+
+	BUG_ON(!new);
+	new->head = head;
+	new->list_type = type;
+
+	return index;
+}
+
 #ifdef CONFIG_FUTEX_PI
 
 /*
@@ -1046,24 +1095,44 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
 
 static void futex_cleanup(struct task_struct *tsk)
 {
+	struct robust_list2_entry *curr, *n;
+	struct list_head *list2 = &tsk->robust_list2;
+
 #ifdef CONFIG_64BIT
 	if (unlikely(tsk->robust_list)) {
-		exit_robust_list64(tsk);
+		exit_robust_list64(tsk, tsk->robust_list);
 		tsk->robust_list = NULL;
 	}
 #else
 	if (unlikely(tsk->robust_list)) {
-		exit_robust_list32(tsk);
+		exit_robust_list32(tsk, (struct robust_list_head32 *) tsk->robust_list);
 		tsk->robust_list = NULL;
 	}
 #endif
 
 #ifdef CONFIG_COMPAT
 	if (unlikely(tsk->compat_robust_list)) {
-		exit_robust_list32(tsk);
+		exit_robust_list32(tsk, tsk->compat_robust_list);
 		tsk->compat_robust_list = NULL;
 	}
 #endif
+	/*
+	 * Walk through the linked list, parsing robust lists and freeing the
+	 * allocated lists
+	 */
+	if (unlikely(!list_empty(list2))) {
+		list_for_each_entry_safe(curr, n, list2, list) {
+			if (curr->head != NULL) {
+				if (curr->list_type == ROBUST_LIST_64BIT)
+					exit_robust_list64(tsk, curr->head);
+				else if (curr->list_type == ROBUST_LIST_32BIT)
+					exit_robust_list32(tsk, curr->head);
+				curr->head = NULL;
+			}
+			list_del_init(&curr->list);
+			kfree(curr);
+		}
+	}
 
 	if (unlikely(!list_empty(&tsk->pi_state_list)))
 		exit_pi_state_list(tsk);
diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h
index 8b195d06f4e8..7247d5c583d5 100644
--- a/kernel/futex/futex.h
+++ b/kernel/futex/futex.h
@@ -349,6 +349,9 @@ extern int __futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 extern int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
 		      ktime_t *abs_time, u32 bitset);
 
+extern long do_set_robust_list2(struct robust_list_head __user *head,
+			 int index, unsigned int type);
+
 /**
  * struct futex_vector - Auxiliary struct for futex_waitv()
  * @w: Userspace provided data
diff --git a/kernel/futex/syscalls.c b/kernel/futex/syscalls.c
index dba193dfd216..ff61570bb9c8 100644
--- a/kernel/futex/syscalls.c
+++ b/kernel/futex/syscalls.c
@@ -39,6 +39,42 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 	return 0;
 }
 
+#define ROBUST_LIST_FLAGS ROBUST_LIST_TYPE_MASK
+
+/*
+ * sys_set_robust_list2()
+ *
+ * When index == -1, create a new list for user. When index >= 0, try to find
+ * the corresponding list and re-set the head there.
+ *
+ * Return values:
+ *  >= 0: success, index of the robust list
+ *  -EINVAL: invalid flags, invalid index
+ *  -ENOENT: requested index nowhere to be found
+ *  -ENOMEM: error allocating new list
+ *  -EINVAL: too many allocated lists
+ */
+SYSCALL_DEFINE3(set_robust_list2, struct robust_list_head __user *, head,
+		int, index, unsigned int, flags)
+{
+	unsigned int type;
+
+	type = flags & ROBUST_LIST_TYPE_MASK;
+
+	if (index < -1 || index >= ROBUST_LISTS_PER_TASK)
+		return -EINVAL;
+
+	if ((flags & ~ROBUST_LIST_FLAGS) != 0)
+		return -EINVAL;
+
+#ifndef CONFIG_64BIT
+	if (type == ROBUST_LIST_64BIT)
+		return -EINVAL;
+#endif
+
+	return do_set_robust_list2(head, index, type);
+}
+
 /**
  * sys_get_robust_list() - Get the robust-futex list head of a task
  * @pid:	pid of the process [zero for current task]
-- 
2.47.0

Re: [PATCH v2 2/3] futex: Create set_robust_list2
Posted by Peter Zijlstra 2 weeks, 6 days ago
On Fri, Nov 01, 2024 at 01:21:46PM -0300, André Almeida wrote:
> @@ -1046,24 +1095,44 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
>  
>  static void futex_cleanup(struct task_struct *tsk)
>  {
> +	struct robust_list2_entry *curr, *n;
> +	struct list_head *list2 = &tsk->robust_list2;
> +
>  #ifdef CONFIG_64BIT
>  	if (unlikely(tsk->robust_list)) {
> -		exit_robust_list64(tsk);
> +		exit_robust_list64(tsk, tsk->robust_list);
>  		tsk->robust_list = NULL;
>  	}
>  #else
>  	if (unlikely(tsk->robust_list)) {
> -		exit_robust_list32(tsk);
> +		exit_robust_list32(tsk, (struct robust_list_head32 *) tsk->robust_list);
>  		tsk->robust_list = NULL;
>  	}
>  #endif
>  
>  #ifdef CONFIG_COMPAT
>  	if (unlikely(tsk->compat_robust_list)) {
> -		exit_robust_list32(tsk);
> +		exit_robust_list32(tsk, tsk->compat_robust_list);
>  		tsk->compat_robust_list = NULL;
>  	}
>  #endif
> +	/*
> +	 * Walk through the linked list, parsing robust lists and freeing the
> +	 * allocated lists
> +	 */
> +	if (unlikely(!list_empty(list2))) {
> +		list_for_each_entry_safe(curr, n, list2, list) {
> +			if (curr->head != NULL) {
> +				if (curr->list_type == ROBUST_LIST_64BIT)
> +					exit_robust_list64(tsk, curr->head);
> +				else if (curr->list_type == ROBUST_LIST_32BIT)
> +					exit_robust_list32(tsk, curr->head);
> +				curr->head = NULL;
> +			}
> +			list_del_init(&curr->list);
> +			kfree(curr);
> +		}
> +	}
>  
>  	if (unlikely(!list_empty(&tsk->pi_state_list)))
>  		exit_pi_state_list(tsk);

I'm still digesting this, but the above seems particularly silly.

Should not the legacy lists also be on the list of lists? I mean, it
makes no sense to have two completely separate means of tracking lists.
Re: [PATCH v2 2/3] futex: Create set_robust_list2
Posted by André Almeida 2 weeks, 5 days ago
Hi Peter,

Em 04/11/2024 08:22, Peter Zijlstra escreveu:
> On Fri, Nov 01, 2024 at 01:21:46PM -0300, André Almeida wrote:
>> @@ -1046,24 +1095,44 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
>>   
>>   static void futex_cleanup(struct task_struct *tsk)
>>   {
>> +	struct robust_list2_entry *curr, *n;
>> +	struct list_head *list2 = &tsk->robust_list2;
>> +
>>   #ifdef CONFIG_64BIT
>>   	if (unlikely(tsk->robust_list)) {
>> -		exit_robust_list64(tsk);
>> +		exit_robust_list64(tsk, tsk->robust_list);
>>   		tsk->robust_list = NULL;
>>   	}
>>   #else
>>   	if (unlikely(tsk->robust_list)) {
>> -		exit_robust_list32(tsk);
>> +		exit_robust_list32(tsk, (struct robust_list_head32 *) tsk->robust_list);
>>   		tsk->robust_list = NULL;
>>   	}
>>   #endif
>>   
>>   #ifdef CONFIG_COMPAT
>>   	if (unlikely(tsk->compat_robust_list)) {
>> -		exit_robust_list32(tsk);
>> +		exit_robust_list32(tsk, tsk->compat_robust_list);
>>   		tsk->compat_robust_list = NULL;
>>   	}
>>   #endif
>> +	/*
>> +	 * Walk through the linked list, parsing robust lists and freeing the
>> +	 * allocated lists
>> +	 */
>> +	if (unlikely(!list_empty(list2))) {
>> +		list_for_each_entry_safe(curr, n, list2, list) {
>> +			if (curr->head != NULL) {
>> +				if (curr->list_type == ROBUST_LIST_64BIT)
>> +					exit_robust_list64(tsk, curr->head);
>> +				else if (curr->list_type == ROBUST_LIST_32BIT)
>> +					exit_robust_list32(tsk, curr->head);
>> +				curr->head = NULL;
>> +			}
>> +			list_del_init(&curr->list);
>> +			kfree(curr);
>> +		}
>> +	}
>>   
>>   	if (unlikely(!list_empty(&tsk->pi_state_list)))
>>   		exit_pi_state_list(tsk);
> 
> I'm still digesting this, but the above seems particularly silly.
> 
> Should not the legacy lists also be on the list of lists? I mean, it
> makes no sense to have two completely separate means of tracking lists.
> 

You are asking whether, whenever someone calls set_robust_list() or 
compat_set_robust_list(), the head should be inserted into 
&current->robust_list2 instead of being stored in tsk->robust_list and 
tsk->compat_robust_list?

I was thinking of doing that, but my current implementation has a 
kmalloc() call for every insertion, and I wasn't sure if I could add 
that new latency to the old set_robust_list() syscall. Assuming it is 
usually called just once during thread initialization, perhaps it 
shouldn't cause much harm, I guess.
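
Roughly, something along these lines (an untested sketch just to make
the idea concrete; LEGACY_ROBUST_LIST_IDX is hypothetical and glosses
over how such a slot would actually be reserved against other
set_robust_list2() users):

	/* Hypothetical: reserve a fixed robust_list2 slot for the legacy syscall */
	#define LEGACY_ROBUST_LIST_IDX	0

	SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
			size_t, len)
	{
		unsigned int type = IS_ENABLED(CONFIG_64BIT) ?
				    ROBUST_LIST_64BIT : ROBUST_LIST_32BIT;
		long ret;

		if (unlikely(len != sizeof(*head)))
			return -EINVAL;

		/*
		 * Re-point the legacy entry if it already exists; the first
		 * call falls back to creating it, so the kmalloc() happens
		 * at most once per thread.
		 */
		ret = do_set_robust_list2(head, LEGACY_ROBUST_LIST_IDX, type);
		if (ret == -ENOENT)
			ret = do_set_robust_list2(head, -1, type);

		return ret < 0 ? ret : 0;
	}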

Re: [PATCH v2 2/3] futex: Create set_robust_list2
Posted by Peter Zijlstra 2 weeks, 5 days ago
On Mon, Nov 04, 2024 at 06:55:45PM -0300, André Almeida wrote:
> Hi Peter,
> 
> Em 04/11/2024 08:22, Peter Zijlstra escreveu:
> > On Fri, Nov 01, 2024 at 01:21:46PM -0300, André Almeida wrote:
> > > @@ -1046,24 +1095,44 @@ static inline void exit_pi_state_list(struct task_struct *curr) { }
> > >   static void futex_cleanup(struct task_struct *tsk)
> > >   {
> > > +	struct robust_list2_entry *curr, *n;
> > > +	struct list_head *list2 = &tsk->robust_list2;
> > > +
> > >   #ifdef CONFIG_64BIT
> > >   	if (unlikely(tsk->robust_list)) {
> > > -		exit_robust_list64(tsk);
> > > +		exit_robust_list64(tsk, tsk->robust_list);
> > >   		tsk->robust_list = NULL;
> > >   	}
> > >   #else
> > >   	if (unlikely(tsk->robust_list)) {
> > > -		exit_robust_list32(tsk);
> > > +		exit_robust_list32(tsk, (struct robust_list_head32 *) tsk->robust_list);
> > >   		tsk->robust_list = NULL;
> > >   	}
> > >   #endif
> > >   #ifdef CONFIG_COMPAT
> > >   	if (unlikely(tsk->compat_robust_list)) {
> > > -		exit_robust_list32(tsk);
> > > +		exit_robust_list32(tsk, tsk->compat_robust_list);
> > >   		tsk->compat_robust_list = NULL;
> > >   	}
> > >   #endif
> > > +	/*
> > > +	 * Walk through the linked list, parsing robust lists and freeing the
> > > +	 * allocated lists
> > > +	 */
> > > +	if (unlikely(!list_empty(list2))) {
> > > +		list_for_each_entry_safe(curr, n, list2, list) {
> > > +			if (curr->head != NULL) {
> > > +				if (curr->list_type == ROBUST_LIST_64BIT)
> > > +					exit_robust_list64(tsk, curr->head);
> > > +				else if (curr->list_type == ROBUST_LIST_32BIT)
> > > +					exit_robust_list32(tsk, curr->head);
> > > +				curr->head = NULL;
> > > +			}
> > > +			list_del_init(&curr->list);
> > > +			kfree(curr);
> > > +		}
> > > +	}
> > >   	if (unlikely(!list_empty(&tsk->pi_state_list)))
> > >   		exit_pi_state_list(tsk);
> > 
> > I'm still digesting this, but the above seems particularly silly.
> > 
> > Should not the legacy lists also be on the list of lists? I mean, it
> > makes no sense to have two completely separate means of tracking lists.
> > 
> 
> You are asking whether, whenever someone calls set_robust_list() or
> compat_set_robust_list(), the head should be inserted into
> &current->robust_list2 instead of being stored in tsk->robust_list and
> tsk->compat_robust_list?

Yes, that.
Re: [PATCH v2 2/3] futex: Create set_robust_list2
Posted by kernel test robot 3 weeks, 1 day ago
Hi André,

kernel test robot noticed the following build warnings:

[auto build test WARNING on tip/locking/core]
[also build test WARNING on tip/sched/core linus/master v6.12-rc5 next-20241101]
[cannot apply to tip/x86/asm]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Andr-Almeida/futex-Use-explicit-sizes-for-compat_exit_robust_list/20241102-002419
base:   tip/locking/core
patch link:    https://lore.kernel.org/r/20241101162147.284993-3-andrealmeid%40igalia.com
patch subject: [PATCH v2 2/3] futex: Create set_robust_list2
config: i386-randconfig-062-20241102 (https://download.01.org/0day-ci/archive/20241103/202411030038.W5DgvCYP-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241103/202411030038.W5DgvCYP-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411030038.W5DgvCYP-lkp@intel.com/

sparse warnings: (new ones prefixed by >>)
   kernel/futex/core.c:915:59: sparse: sparse: cast removes address space '__user' of expression
   kernel/futex/core.c:915:59: sparse: sparse: incorrect type in argument 3 (different address spaces) @@     expected unsigned int [noderef] [usertype] __user *head @@     got unsigned int [usertype] * @@
   kernel/futex/core.c:915:59: sparse:     expected unsigned int [noderef] [usertype] __user *head
   kernel/futex/core.c:915:59: sparse:     got unsigned int [usertype] *
   kernel/futex/core.c:1108:42: sparse: sparse: cast removes address space '__user' of expression
>> kernel/futex/core.c:1108:42: sparse: sparse: incorrect type in argument 2 (different address spaces) @@     expected struct robust_list_head32 [noderef] __user *head @@     got struct robust_list_head32 * @@
   kernel/futex/core.c:1108:42: sparse:     expected struct robust_list_head32 [noderef] __user *head
   kernel/futex/core.c:1108:42: sparse:     got struct robust_list_head32 *
   kernel/futex/core.c: note: in included file (through include/linux/smp.h, include/linux/alloc_tag.h, include/linux/percpu.h, ...):
   include/linux/list.h:83:21: sparse: sparse: self-comparison always evaluates to true

vim +1108 kernel/futex/core.c

  1095	
  1096	static void futex_cleanup(struct task_struct *tsk)
  1097	{
  1098		struct robust_list2_entry *curr, *n;
  1099		struct list_head *list2 = &tsk->robust_list2;
  1100	
  1101	#ifdef CONFIG_64BIT
  1102		if (unlikely(tsk->robust_list)) {
  1103			exit_robust_list64(tsk, tsk->robust_list);
  1104			tsk->robust_list = NULL;
  1105		}
  1106	#else
  1107		if (unlikely(tsk->robust_list)) {
> 1108			exit_robust_list32(tsk, (struct robust_list_head32 *) tsk->robust_list);
  1109			tsk->robust_list = NULL;
  1110		}
  1111	#endif
  1112	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
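
A side note on the new warning at kernel/futex/core.c:1108: sparse is
flagging the cast because it drops the __user address space. A minimal
sketch of the kind of change that would silence it (not something posted
in this thread) is to keep the annotation in the cast:

	if (unlikely(tsk->robust_list)) {
		exit_robust_list32(tsk,
			(struct robust_list_head32 __user *) tsk->robust_list);
		tsk->robust_list = NULL;
	}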