[PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process

Usama Arif posted 7 patches 7 months ago
[PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by Usama Arif 7 months ago
This is set via the new PR_SET_THP_POLICY prctl. It has 2 effects:
- It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
  (def_flags). This means that every new VMA will be considered for
  hugepage.
- It iterates through every VMA in the process and calls hugepage_madvise
  on it, with MADV_HUGEPAGE policy.
The policy is inherited during fork+exec.

This effectively allows setting MADV_HUGEPAGE on the entire process.
In an environment where different types of workloads are run on the
same machine, this will allow workloads that benefit from always having
hugepages to do so, without regressing those that don't.

Signed-off-by: Usama Arif <usamaarif642@gmail.com>
---
 include/linux/huge_mm.h                       |  1 +
 include/linux/mm.h                            |  2 +-
 include/linux/mm_types.h                      |  4 ++-
 include/uapi/linux/prctl.h                    |  4 +++
 kernel/sys.c                                  | 29 +++++++++++++++++++
 mm/huge_memory.c                              | 13 +++++++++
 tools/include/uapi/linux/prctl.h              |  4 +++
 .../trace/beauty/include/uapi/linux/prctl.h   |  4 +++
 8 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 23580a43787c..b24a2e0ae642 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -431,6 +431,7 @@ change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
 			__split_huge_pud(__vma, __pud, __address);	\
 	}  while (0)
 
+void process_default_madv_hugepage(struct mm_struct *mm, int advice);
 int hugepage_set_vmflags(unsigned long *vm_flags, int advice);
 int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
 		     int advice);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 43748c8f3454..436f4588bce8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -466,7 +466,7 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB)
 
 /* This mask defines which mm->def_flags a process can inherit its parent */
-#define VM_INIT_DEF_MASK	VM_NOHUGEPAGE
+#define VM_INIT_DEF_MASK	(VM_HUGEPAGE | VM_NOHUGEPAGE)
 
 /* This mask represents all the VMA flag bits used by mlock */
 #define VM_LOCKED_MASK	(VM_LOCKED | VM_LOCKONFAULT)
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index e76bade9ebb1..f1836b7c5704 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1703,6 +1703,7 @@ enum {
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when mm is available for khugepaged */
+#define MMF_VM_HUGEPAGE_MASK	(1 << MMF_VM_HUGEPAGE)
 
 /*
  * This one-shot flag is dropped due to necessity of changing exe once again
@@ -1742,7 +1743,8 @@ enum {
 
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
-				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
+				 MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK |\
+				 MMF_VM_HUGEPAGE_MASK)
 
 static inline unsigned long mmf_init_flags(unsigned long flags)
 {
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 15c18ef4eb11..15aaa4db5ff8 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_DEFAULT_MADV_HUGEPAGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index c434968e9f5d..74397ace62f3 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2474,6 +2474,7 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 		unsigned long, arg4, unsigned long, arg5)
 {
 	struct task_struct *me = current;
+	struct mm_struct *mm = me->mm;
 	unsigned char comm[sizeof(me->comm)];
 	long error;
 
@@ -2658,6 +2659,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			clear_bit(MMF_DISABLE_THP, &me->mm->flags);
 		mmap_write_unlock(me->mm);
 		break;
+	case PR_GET_THP_POLICY:
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+		if (mm->def_flags & VM_HUGEPAGE)
+			error = PR_DEFAULT_MADV_HUGEPAGE;
+		mmap_write_unlock(mm);
+		break;
+	case PR_SET_THP_POLICY:
+		if (arg3 || arg4 || arg5)
+			return -EINVAL;
+		if (mmap_write_lock_killable(mm))
+			return -EINTR;
+		switch (arg2) {
+		case PR_DEFAULT_MADV_HUGEPAGE:
+			if (!hugepage_global_enabled())
+				error = -EPERM;
+			error = hugepage_set_vmflags(&mm->def_flags, MADV_HUGEPAGE);
+			if (!error)
+				process_default_madv_hugepage(mm, MADV_HUGEPAGE);
+			break;
+		default:
+			error = -EINVAL;
+			break;
+		}
+		mmap_write_unlock(mm);
+		break;
 	case PR_MPX_ENABLE_MANAGEMENT:
 	case PR_MPX_DISABLE_MANAGEMENT:
 		/* No longer implemented: */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2780a12b25f0..72806fe772b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -98,6 +98,19 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
 	return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
 }
 
+void process_default_madv_hugepage(struct mm_struct *mm, int advice)
+{
+	struct vm_area_struct *vma;
+	unsigned long vm_flags;
+
+	mmap_assert_write_locked(mm);
+	VMA_ITERATOR(vmi, mm, 0);
+	for_each_vma(vmi, vma) {
+		vm_flags = vma->vm_flags;
+		hugepage_madvise(vma, &vm_flags, advice);
+	}
+}
+
 unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 					 unsigned long vm_flags,
 					 unsigned long tva_flags,
diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h
index 35791791a879..f5945ebfe3f2 100644
--- a/tools/include/uapi/linux/prctl.h
+++ b/tools/include/uapi/linux/prctl.h
@@ -328,4 +328,8 @@ struct prctl_mm_map {
 # define PR_PPC_DEXCR_CTRL_CLEAR_ONEXEC	0x10 /* Clear the aspect on exec */
 # define PR_PPC_DEXCR_CTRL_MASK		0x1f
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_THP_POLICY_DEFAULT_HUGE	0
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/tools/perf/trace/beauty/include/uapi/linux/prctl.h b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
index 15c18ef4eb11..325c72f40a93 100644
--- a/tools/perf/trace/beauty/include/uapi/linux/prctl.h
+++ b/tools/perf/trace/beauty/include/uapi/linux/prctl.h
@@ -364,4 +364,8 @@ struct prctl_mm_map {
 # define PR_TIMER_CREATE_RESTORE_IDS_ON		1
 # define PR_TIMER_CREATE_RESTORE_IDS_GET	2
 
+#define PR_SET_THP_POLICY		78
+#define PR_GET_THP_POLICY		79
+#define PR_THP_POLICY_DEFAULT_HUGE	0
+
 #endif /* _LINUX_PRCTL_H */
-- 
2.47.1
Re: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by kernel test robot 7 months ago
Hi Usama,

kernel test robot noticed the following build errors:

[auto build test ERROR on akpm-mm/mm-everything]
[also build test ERROR on perf-tools-next/perf-tools-next tip/perf/core perf-tools/perf-tools linus/master v6.15-rc7]
[cannot apply to acme/perf/core next-20250516]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Usama-Arif/mm-khugepaged-extract-vm-flag-setting-outside-of-hugepage_madvise/20250520-063452
base:   https://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm.git mm-everything
patch link:    https://lore.kernel.org/r/20250519223307.3601786-3-usamaarif642%40gmail.com
patch subject: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
config: s390-randconfig-001-20250520 (https://download.01.org/0day-ci/archive/20250520/202505201614.N4SXnAln-lkp@intel.com/config)
compiler: clang version 21.0.0git (https://github.com/llvm/llvm-project f819f46284f2a79790038e1f6649172789734ae8)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250520/202505201614.N4SXnAln-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202505201614.N4SXnAln-lkp@intel.com/

All errors (new ones prefixed by >>):

>> kernel/sys.c:2678:9: error: call to undeclared function 'hugepage_global_enabled'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    2678 |                         if (!hugepage_global_enabled())
         |                              ^
>> kernel/sys.c:2680:12: error: call to undeclared function 'hugepage_set_vmflags'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    2680 |                         error = hugepage_set_vmflags(&mm->def_flags, MADV_HUGEPAGE);
         |                                 ^
>> kernel/sys.c:2682:5: error: call to undeclared function 'process_default_madv_hugepage'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
    2682 |                                 process_default_madv_hugepage(mm, MADV_HUGEPAGE);
         |                                 ^
   3 errors generated.


vim +/hugepage_global_enabled +2678 kernel/sys.c

  2472	
  2473	SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
  2474			unsigned long, arg4, unsigned long, arg5)
  2475	{
  2476		struct task_struct *me = current;
  2477		struct mm_struct *mm = me->mm;
  2478		unsigned char comm[sizeof(me->comm)];
  2479		long error;
  2480	
  2481		error = security_task_prctl(option, arg2, arg3, arg4, arg5);
  2482		if (error != -ENOSYS)
  2483			return error;
  2484	
  2485		error = 0;
  2486		switch (option) {
  2487		case PR_SET_PDEATHSIG:
  2488			if (!valid_signal(arg2)) {
  2489				error = -EINVAL;
  2490				break;
  2491			}
  2492			me->pdeath_signal = arg2;
  2493			break;
  2494		case PR_GET_PDEATHSIG:
  2495			error = put_user(me->pdeath_signal, (int __user *)arg2);
  2496			break;
  2497		case PR_GET_DUMPABLE:
  2498			error = get_dumpable(me->mm);
  2499			break;
  2500		case PR_SET_DUMPABLE:
  2501			if (arg2 != SUID_DUMP_DISABLE && arg2 != SUID_DUMP_USER) {
  2502				error = -EINVAL;
  2503				break;
  2504			}
  2505			set_dumpable(me->mm, arg2);
  2506			break;
  2507	
  2508		case PR_SET_UNALIGN:
  2509			error = SET_UNALIGN_CTL(me, arg2);
  2510			break;
  2511		case PR_GET_UNALIGN:
  2512			error = GET_UNALIGN_CTL(me, arg2);
  2513			break;
  2514		case PR_SET_FPEMU:
  2515			error = SET_FPEMU_CTL(me, arg2);
  2516			break;
  2517		case PR_GET_FPEMU:
  2518			error = GET_FPEMU_CTL(me, arg2);
  2519			break;
  2520		case PR_SET_FPEXC:
  2521			error = SET_FPEXC_CTL(me, arg2);
  2522			break;
  2523		case PR_GET_FPEXC:
  2524			error = GET_FPEXC_CTL(me, arg2);
  2525			break;
  2526		case PR_GET_TIMING:
  2527			error = PR_TIMING_STATISTICAL;
  2528			break;
  2529		case PR_SET_TIMING:
  2530			if (arg2 != PR_TIMING_STATISTICAL)
  2531				error = -EINVAL;
  2532			break;
  2533		case PR_SET_NAME:
  2534			comm[sizeof(me->comm) - 1] = 0;
  2535			if (strncpy_from_user(comm, (char __user *)arg2,
  2536					      sizeof(me->comm) - 1) < 0)
  2537				return -EFAULT;
  2538			set_task_comm(me, comm);
  2539			proc_comm_connector(me);
  2540			break;
  2541		case PR_GET_NAME:
  2542			get_task_comm(comm, me);
  2543			if (copy_to_user((char __user *)arg2, comm, sizeof(comm)))
  2544				return -EFAULT;
  2545			break;
  2546		case PR_GET_ENDIAN:
  2547			error = GET_ENDIAN(me, arg2);
  2548			break;
  2549		case PR_SET_ENDIAN:
  2550			error = SET_ENDIAN(me, arg2);
  2551			break;
  2552		case PR_GET_SECCOMP:
  2553			error = prctl_get_seccomp();
  2554			break;
  2555		case PR_SET_SECCOMP:
  2556			error = prctl_set_seccomp(arg2, (char __user *)arg3);
  2557			break;
  2558		case PR_GET_TSC:
  2559			error = GET_TSC_CTL(arg2);
  2560			break;
  2561		case PR_SET_TSC:
  2562			error = SET_TSC_CTL(arg2);
  2563			break;
  2564		case PR_TASK_PERF_EVENTS_DISABLE:
  2565			error = perf_event_task_disable();
  2566			break;
  2567		case PR_TASK_PERF_EVENTS_ENABLE:
  2568			error = perf_event_task_enable();
  2569			break;
  2570		case PR_GET_TIMERSLACK:
  2571			if (current->timer_slack_ns > ULONG_MAX)
  2572				error = ULONG_MAX;
  2573			else
  2574				error = current->timer_slack_ns;
  2575			break;
  2576		case PR_SET_TIMERSLACK:
  2577			if (rt_or_dl_task_policy(current))
  2578				break;
  2579			if (arg2 <= 0)
  2580				current->timer_slack_ns =
  2581						current->default_timer_slack_ns;
  2582			else
  2583				current->timer_slack_ns = arg2;
  2584			break;
  2585		case PR_MCE_KILL:
  2586			if (arg4 | arg5)
  2587				return -EINVAL;
  2588			switch (arg2) {
  2589			case PR_MCE_KILL_CLEAR:
  2590				if (arg3 != 0)
  2591					return -EINVAL;
  2592				current->flags &= ~PF_MCE_PROCESS;
  2593				break;
  2594			case PR_MCE_KILL_SET:
  2595				current->flags |= PF_MCE_PROCESS;
  2596				if (arg3 == PR_MCE_KILL_EARLY)
  2597					current->flags |= PF_MCE_EARLY;
  2598				else if (arg3 == PR_MCE_KILL_LATE)
  2599					current->flags &= ~PF_MCE_EARLY;
  2600				else if (arg3 == PR_MCE_KILL_DEFAULT)
  2601					current->flags &=
  2602							~(PF_MCE_EARLY|PF_MCE_PROCESS);
  2603				else
  2604					return -EINVAL;
  2605				break;
  2606			default:
  2607				return -EINVAL;
  2608			}
  2609			break;
  2610		case PR_MCE_KILL_GET:
  2611			if (arg2 | arg3 | arg4 | arg5)
  2612				return -EINVAL;
  2613			if (current->flags & PF_MCE_PROCESS)
  2614				error = (current->flags & PF_MCE_EARLY) ?
  2615					PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
  2616			else
  2617				error = PR_MCE_KILL_DEFAULT;
  2618			break;
  2619		case PR_SET_MM:
  2620			error = prctl_set_mm(arg2, arg3, arg4, arg5);
  2621			break;
  2622		case PR_GET_TID_ADDRESS:
  2623			error = prctl_get_tid_address(me, (int __user * __user *)arg2);
  2624			break;
  2625		case PR_SET_CHILD_SUBREAPER:
  2626			me->signal->is_child_subreaper = !!arg2;
  2627			if (!arg2)
  2628				break;
  2629	
  2630			walk_process_tree(me, propagate_has_child_subreaper, NULL);
  2631			break;
  2632		case PR_GET_CHILD_SUBREAPER:
  2633			error = put_user(me->signal->is_child_subreaper,
  2634					 (int __user *)arg2);
  2635			break;
  2636		case PR_SET_NO_NEW_PRIVS:
  2637			if (arg2 != 1 || arg3 || arg4 || arg5)
  2638				return -EINVAL;
  2639	
  2640			task_set_no_new_privs(current);
  2641			break;
  2642		case PR_GET_NO_NEW_PRIVS:
  2643			if (arg2 || arg3 || arg4 || arg5)
  2644				return -EINVAL;
  2645			return task_no_new_privs(current) ? 1 : 0;
  2646		case PR_GET_THP_DISABLE:
  2647			if (arg2 || arg3 || arg4 || arg5)
  2648				return -EINVAL;
  2649			error = !!test_bit(MMF_DISABLE_THP, &me->mm->flags);
  2650			break;
  2651		case PR_SET_THP_DISABLE:
  2652			if (arg3 || arg4 || arg5)
  2653				return -EINVAL;
  2654			if (mmap_write_lock_killable(me->mm))
  2655				return -EINTR;
  2656			if (arg2)
  2657				set_bit(MMF_DISABLE_THP, &me->mm->flags);
  2658			else
  2659				clear_bit(MMF_DISABLE_THP, &me->mm->flags);
  2660			mmap_write_unlock(me->mm);
  2661			break;
  2662		case PR_GET_THP_POLICY:
  2663			if (arg2 || arg3 || arg4 || arg5)
  2664				return -EINVAL;
  2665			if (mmap_write_lock_killable(mm))
  2666				return -EINTR;
  2667			if (mm->def_flags & VM_HUGEPAGE)
  2668				error = PR_DEFAULT_MADV_HUGEPAGE;
  2669			mmap_write_unlock(mm);
  2670			break;
  2671		case PR_SET_THP_POLICY:
  2672			if (arg3 || arg4 || arg5)
  2673				return -EINVAL;
  2674			if (mmap_write_lock_killable(mm))
  2675				return -EINTR;
  2676			switch (arg2) {
  2677			case PR_DEFAULT_MADV_HUGEPAGE:
> 2678				if (!hugepage_global_enabled())
  2679					error = -EPERM;
> 2680				error = hugepage_set_vmflags(&mm->def_flags, MADV_HUGEPAGE);
  2681				if (!error)
> 2682					process_default_madv_hugepage(mm, MADV_HUGEPAGE);
  2683				break;
  2684			default:
  2685				error = -EINVAL;
  2686				break;
  2687			}
  2688			mmap_write_unlock(mm);
  2689			break;
  2690		case PR_MPX_ENABLE_MANAGEMENT:
  2691		case PR_MPX_DISABLE_MANAGEMENT:
  2692			/* No longer implemented: */
  2693			return -EINVAL;
  2694		case PR_SET_FP_MODE:
  2695			error = SET_FP_MODE(me, arg2);
  2696			break;
  2697		case PR_GET_FP_MODE:
  2698			error = GET_FP_MODE(me);
  2699			break;
  2700		case PR_SVE_SET_VL:
  2701			error = SVE_SET_VL(arg2);
  2702			break;
  2703		case PR_SVE_GET_VL:
  2704			error = SVE_GET_VL();
  2705			break;
  2706		case PR_SME_SET_VL:
  2707			error = SME_SET_VL(arg2);
  2708			break;
  2709		case PR_SME_GET_VL:
  2710			error = SME_GET_VL();
  2711			break;
  2712		case PR_GET_SPECULATION_CTRL:
  2713			if (arg3 || arg4 || arg5)
  2714				return -EINVAL;
  2715			error = arch_prctl_spec_ctrl_get(me, arg2);
  2716			break;
  2717		case PR_SET_SPECULATION_CTRL:
  2718			if (arg4 || arg5)
  2719				return -EINVAL;
  2720			error = arch_prctl_spec_ctrl_set(me, arg2, arg3);
  2721			break;
  2722		case PR_PAC_RESET_KEYS:
  2723			if (arg3 || arg4 || arg5)
  2724				return -EINVAL;
  2725			error = PAC_RESET_KEYS(me, arg2);
  2726			break;
  2727		case PR_PAC_SET_ENABLED_KEYS:
  2728			if (arg4 || arg5)
  2729				return -EINVAL;
  2730			error = PAC_SET_ENABLED_KEYS(me, arg2, arg3);
  2731			break;
  2732		case PR_PAC_GET_ENABLED_KEYS:
  2733			if (arg2 || arg3 || arg4 || arg5)
  2734				return -EINVAL;
  2735			error = PAC_GET_ENABLED_KEYS(me);
  2736			break;
  2737		case PR_SET_TAGGED_ADDR_CTRL:
  2738			if (arg3 || arg4 || arg5)
  2739				return -EINVAL;
  2740			error = SET_TAGGED_ADDR_CTRL(arg2);
  2741			break;
  2742		case PR_GET_TAGGED_ADDR_CTRL:
  2743			if (arg2 || arg3 || arg4 || arg5)
  2744				return -EINVAL;
  2745			error = GET_TAGGED_ADDR_CTRL();
  2746			break;
  2747		case PR_SET_IO_FLUSHER:
  2748			if (!capable(CAP_SYS_RESOURCE))
  2749				return -EPERM;
  2750	
  2751			if (arg3 || arg4 || arg5)
  2752				return -EINVAL;
  2753	
  2754			if (arg2 == 1)
  2755				current->flags |= PR_IO_FLUSHER;
  2756			else if (!arg2)
  2757				current->flags &= ~PR_IO_FLUSHER;
  2758			else
  2759				return -EINVAL;
  2760			break;
  2761		case PR_GET_IO_FLUSHER:
  2762			if (!capable(CAP_SYS_RESOURCE))
  2763				return -EPERM;
  2764	
  2765			if (arg2 || arg3 || arg4 || arg5)
  2766				return -EINVAL;
  2767	
  2768			error = (current->flags & PR_IO_FLUSHER) == PR_IO_FLUSHER;
  2769			break;
  2770		case PR_SET_SYSCALL_USER_DISPATCH:
  2771			error = set_syscall_user_dispatch(arg2, arg3, arg4,
  2772							  (char __user *) arg5);
  2773			break;
  2774	#ifdef CONFIG_SCHED_CORE
  2775		case PR_SCHED_CORE:
  2776			error = sched_core_share_pid(arg2, arg3, arg4, arg5);
  2777			break;
  2778	#endif
  2779		case PR_SET_MDWE:
  2780			error = prctl_set_mdwe(arg2, arg3, arg4, arg5);
  2781			break;
  2782		case PR_GET_MDWE:
  2783			error = prctl_get_mdwe(arg2, arg3, arg4, arg5);
  2784			break;
  2785		case PR_PPC_GET_DEXCR:
  2786			if (arg3 || arg4 || arg5)
  2787				return -EINVAL;
  2788			error = PPC_GET_DEXCR_ASPECT(me, arg2);
  2789			break;
  2790		case PR_PPC_SET_DEXCR:
  2791			if (arg4 || arg5)
  2792				return -EINVAL;
  2793			error = PPC_SET_DEXCR_ASPECT(me, arg2, arg3);
  2794			break;
  2795		case PR_SET_VMA:
  2796			error = prctl_set_vma(arg2, arg3, arg4, arg5);
  2797			break;
  2798		case PR_GET_AUXV:
  2799			if (arg4 || arg5)
  2800				return -EINVAL;
  2801			error = prctl_get_auxv((void __user *)arg2, arg3);
  2802			break;
  2803	#ifdef CONFIG_KSM
  2804		case PR_SET_MEMORY_MERGE:
  2805			if (arg3 || arg4 || arg5)
  2806				return -EINVAL;
  2807			if (mmap_write_lock_killable(me->mm))
  2808				return -EINTR;
  2809	
  2810			if (arg2)
  2811				error = ksm_enable_merge_any(me->mm);
  2812			else
  2813				error = ksm_disable_merge_any(me->mm);
  2814			mmap_write_unlock(me->mm);
  2815			break;
  2816		case PR_GET_MEMORY_MERGE:
  2817			if (arg2 || arg3 || arg4 || arg5)
  2818				return -EINVAL;
  2819	
  2820			error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
  2821			break;
  2822	#endif
  2823		case PR_RISCV_V_SET_CONTROL:
  2824			error = RISCV_V_SET_CONTROL(arg2);
  2825			break;
  2826		case PR_RISCV_V_GET_CONTROL:
  2827			error = RISCV_V_GET_CONTROL();
  2828			break;
  2829		case PR_RISCV_SET_ICACHE_FLUSH_CTX:
  2830			error = RISCV_SET_ICACHE_FLUSH_CTX(arg2, arg3);
  2831			break;
  2832		case PR_GET_SHADOW_STACK_STATUS:
  2833			if (arg3 || arg4 || arg5)
  2834				return -EINVAL;
  2835			error = arch_get_shadow_stack_status(me, (unsigned long __user *) arg2);
  2836			break;
  2837		case PR_SET_SHADOW_STACK_STATUS:
  2838			if (arg3 || arg4 || arg5)
  2839				return -EINVAL;
  2840			error = arch_set_shadow_stack_status(me, arg2);
  2841			break;
  2842		case PR_LOCK_SHADOW_STACK_STATUS:
  2843			if (arg3 || arg4 || arg5)
  2844				return -EINVAL;
  2845			error = arch_lock_shadow_stack_status(me, arg2);
  2846			break;
  2847		case PR_TIMER_CREATE_RESTORE_IDS:
  2848			if (arg3 || arg4 || arg5)
  2849				return -EINVAL;
  2850			error = posixtimer_create_prctl(arg2);
  2851			break;
  2852		default:
  2853			trace_task_prctl_unknown(option, arg2, arg3, arg4, arg5);
  2854			error = -EINVAL;
  2855			break;
  2856		}
  2857		return error;
  2858	}
  2859	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by Jann Horn 7 months ago
On Tue, May 20, 2025 at 12:33 AM Usama Arif <usamaarif642@gmail.com> wrote:
> This is set via the new PR_SET_THP_POLICY prctl. It has 2 affects:
> - It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
>   (def_flags). This means that every new VMA will be considered for
>   hugepage.
> - Iterate through every VMA in the process and call hugepage_madvise
>   on it, with MADV_HUGEPAGE policy.
> The policy is inherited during fork+exec.

As I replied to Lorenzo's series
(https://lore.kernel.org/all/CAG48ez3-7EnBVEjpdoW7z5K0hX41nLQN5Wb65Vg-1p8DdXRnjg@mail.gmail.com/),
it would be nice if you could avoid introducing new flags that have
the combination of all the following properties:

1. persists across exec
2. not cleared on secureexec execution
3. settable without ns_capable(CAP_SYS_ADMIN)
4. settable without NO_NEW_PRIVS

Flags that have all of these properties need to be reviewed extra
carefully to see if there is any way they could impact the security of
setuid binaries, for example by changing mmap() behavior in a way that
makes addresses significantly more predictable.
Re: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by Lorenzo Stoakes 7 months ago
On Tue, May 20, 2025 at 01:01:38AM +0200, Jann Horn wrote:
> On Tue, May 20, 2025 at 12:33 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > This is set via the new PR_SET_THP_POLICY prctl. It has 2 affects:
> > - It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
> >   (def_flags). This means that every new VMA will be considered for
> >   hugepage.
> > - Iterate through every VMA in the process and call hugepage_madvise
> >   on it, with MADV_HUGEPAGE policy.
> > The policy is inherited during fork+exec.
>
> As I replied to Lorenzo's series
> (https://lore.kernel.org/all/CAG48ez3-7EnBVEjpdoW7z5K0hX41nLQN5Wb65Vg-1p8DdXRnjg@mail.gmail.com/),
> it would be nice if you could avoid introducing new flags that have
> the combination of all the following properties:
>
> 1. persists across exec
> 2. not cleared on secureexec execution
> 3. settable without ns_capable(CAP_SYS_ADMIN)
> 4. settable without NO_NEW_PRIVS
>
> Flags that have all of these properties need to be reviewed extra
> carefully to see if there is any way they could impact the security of
> setuid binaries, for example by changing mmap() behavior in a way that
> makes addresses significantly more predictable.

Indeed, this series was meant to be as RFC as mine while we still figured this
out :) grr. Well, with the NACK it is - in effect - now an RFC.

Yes having something persistent like this is not great, the idea of
introducing this in my series was to provide an alternative generic version
of this approach that can be better controlled and isn't just a 'tacked on'
change specific to one company's needs but rather a more general idea of
'madvise() by default'.

I do wonder in this case, whether we need be so cautious however given the
_relatively_ safe nature of these flags?

I do absolutely agree we need to very carefully review whether:

1. It really even makes sense to do this
2. Any such restrictions need be made

I am weaker on the security side so very glad for your input here (thanks!)

I suspect probably we want ns_capable(CAP_SYS_ADMIN) _as a rule_ for this
kind of mm->def_flags change.

I also wanted to dig a little deeper into whether this was sensible as a
general approach.

I, however, do _very much_ prefer it to an mm->flags change (that'd
necessitate a prerequisite 'make mm->flags 64-bit on 32-bit kernels'
series anyway).
Re: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by David Hildenbrand 7 months ago
On 20.05.25 07:23, Lorenzo Stoakes wrote:
> On Tue, May 20, 2025 at 01:01:38AM +0200, Jann Horn wrote:
>> On Tue, May 20, 2025 at 12:33 AM Usama Arif <usamaarif642@gmail.com> wrote:
>>> This is set via the new PR_SET_THP_POLICY prctl. It has 2 affects:
>>> - It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
>>>    (def_flags). This means that every new VMA will be considered for
>>>    hugepage.
>>> - Iterate through every VMA in the process and call hugepage_madvise
>>>    on it, with MADV_HUGEPAGE policy.
>>> The policy is inherited during fork+exec.
>>
>> As I replied to Lorenzo's series
>> (https://lore.kernel.org/all/CAG48ez3-7EnBVEjpdoW7z5K0hX41nLQN5Wb65Vg-1p8DdXRnjg@mail.gmail.com/),
>> it would be nice if you could avoid introducing new flags that have
>> the combination of all the following properties:
>>
>> 1. persists across exec
>> 2. not cleared on secureexec execution
>> 3. settable without ns_capable(CAP_SYS_ADMIN)
>> 4. settable without NO_NEW_PRIVS
>>
>> Flags that have all of these properties need to be reviewed extra
>> carefully to see if there is any way they could impact the security of
>> setuid binaries, for example by changing mmap() behavior in a way that
>> makes addresses significantly more predictable.
> 
> Indeed, this series was meant to be as RFC as mine while we still figured this
> out :) grr. Well, with the NACK it is - in effect - now an RFC.
> 
> Yes having something persistent like this is not great, the idea of
> introducing this in my series was to provide an alternative generic version
> of this approach that can be better controlled and isn't just a 'tacked on'
> change specific to one company's needs but rather a more general idea of
> 'madvise() by default'.
> 
> I do wonder in this case, whether we need be so cautious however given the
> _relatively_ safe nature of these flags?

Yes. Changing VM_HUGEPAGE / VM_NOHUGEPAGE defaults should have little 
impact, but we better be careful.

setuid execution is certainly an interesting point. Maybe the general 
rule should be, that it is not inherited over secureexec unless 
CAP_SYS_ADMIN?

-- 
Cheers,

David / dhildenb

Re: [PATCH v3 2/7] prctl: introduce PR_DEFAULT_MADV_HUGEPAGE for the process
Posted by Lorenzo Stoakes 7 months ago
On Tue, May 20, 2025 at 11:09:05AM +0200, David Hildenbrand wrote:
> On 20.05.25 07:23, Lorenzo Stoakes wrote:
> > On Tue, May 20, 2025 at 01:01:38AM +0200, Jann Horn wrote:
> > > On Tue, May 20, 2025 at 12:33 AM Usama Arif <usamaarif642@gmail.com> wrote:
> > > > This is set via the new PR_SET_THP_POLICY prctl. It has 2 affects:
> > > > - It sets VM_HUGEPAGE and clears VM_NOHUGEPAGE on the default VMA flags
> > > >    (def_flags). This means that every new VMA will be considered for
> > > >    hugepage.
> > > > - Iterate through every VMA in the process and call hugepage_madvise
> > > >    on it, with MADV_HUGEPAGE policy.
> > > > The policy is inherited during fork+exec.
> > >
> > > As I replied to Lorenzo's series
> > > (https://lore.kernel.org/all/CAG48ez3-7EnBVEjpdoW7z5K0hX41nLQN5Wb65Vg-1p8DdXRnjg@mail.gmail.com/),
> > > it would be nice if you could avoid introducing new flags that have
> > > the combination of all the following properties:
> > >
> > > 1. persists across exec
> > > 2. not cleared on secureexec execution
> > > 3. settable without ns_capable(CAP_SYS_ADMIN)
> > > 4. settable without NO_NEW_PRIVS
> > >
> > > Flags that have all of these properties need to be reviewed extra
> > > carefully to see if there is any way they could impact the security of
> > > setuid binaries, for example by changing mmap() behavior in a way that
> > > makes addresses significantly more predictable.
> >
> > Indeed, this series was meant to be as RFC as mine while we still figured this
> > out :) grr. Well, with the NACK it is - in effect - now an RFC.
> >
> > Yes having something persistent like this is not great, the idea of
> > introducing this in my series was to provide an alternative generic version
> > of this approach that can be better controlled and isn't just a 'tacked on'
> > change specific to one company's needs but rather a more general idea of
> > 'madvise() by default'.
> >
> > I do wonder in this case, whether we need be so cautious however given the
> > _relatively_ safe nature of these flags?
>
> Yes. Changing VM_HUGEPAGE / VM_NOHUGEPAGE defaults should have little
> impact, but we better be careful.
>
> setuid execution is certainly an interesting point. Maybe the general rule
> should be, that it is not inherited over secureexec unless CAP_SYS_ADMIN?

I think probably we should just restrict this operation to system admins
anyway. This will be the most cautious option, and simplifies things as we
then don't have to especially check for things at certain points?

>
> --
> Cheers,
>
> David / dhildenb
>