Introduces a mechanism to inherit hardware capabilities (AT_HWCAP,
AT_HWCAP2, etc.) from a parent process when they have been modified via
prctl.
To support checkpoint/restore (C/R) operations (snapshots, live migration)
in heterogeneous clusters, we must ensure that processes use only CPU
features available on all potential target nodes. To solve this, we need
to advertise a common feature set across the cluster.
This patch adds a new mm flag MMF_USER_HWCAP, which is set when the
auxiliary vector is modified via prctl(PR_SET_MM, PR_SET_MM_AUXV). When
execve() is called, if the current process has MMF_USER_HWCAP set, the
HWCAP values are extracted from the current auxiliary vector and stored
in the linux_binprm structure. These values are then used to populate
the auxiliary vector of the new process, effectively inheriting the
hardware capabilities.
The inherited HWCAPs are masked with the hardware capabilities supported
by the current kernel to ensure that we don't report more features than
actually supported. This is important to avoid unexpected behavior,
especially for processes with additional privileges.
Signed-off-by: Andrei Vagin <avagin@google.com>
---
fs/binfmt_elf.c | 8 +++---
fs/binfmt_elf_fdpic.c | 8 +++---
fs/exec.c | 58 ++++++++++++++++++++++++++++++++++++++++
include/linux/binfmts.h | 11 ++++++++
include/linux/mm_types.h | 2 ++
kernel/fork.c | 3 +++
kernel/sys.c | 5 +++-
7 files changed, 86 insertions(+), 9 deletions(-)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3eb734c192e9..aec129e33f0b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -246,7 +246,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
*/
ARCH_DLINFO;
#endif
- NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+ NEW_AUX_ENT(AT_HWCAP, bprm->hwcap);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
@@ -264,13 +264,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
#ifdef ELF_HWCAP2
- NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+ NEW_AUX_ENT(AT_HWCAP2, bprm->hwcap2);
#endif
#ifdef ELF_HWCAP3
- NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+ NEW_AUX_ENT(AT_HWCAP3, bprm->hwcap3);
#endif
#ifdef ELF_HWCAP4
- NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+ NEW_AUX_ENT(AT_HWCAP4, bprm->hwcap4);
#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a3d4e6973b29..55b482f03c82 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -629,15 +629,15 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm,
*/
ARCH_DLINFO;
#endif
- NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+ NEW_AUX_ENT(AT_HWCAP, bprm->hwcap);
#ifdef ELF_HWCAP2
- NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+ NEW_AUX_ENT(AT_HWCAP2, bprm->hwcap2);
#endif
#ifdef ELF_HWCAP3
- NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+ NEW_AUX_ENT(AT_HWCAP3, bprm->hwcap3);
#endif
#ifdef ELF_HWCAP4
- NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+ NEW_AUX_ENT(AT_HWCAP4, bprm->hwcap4);
#endif
NEW_AUX_ENT(AT_PAGESZ, PAGE_SIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
diff --git a/fs/exec.c b/fs/exec.c
index 9d5ebc9d15b0..94382285eeda 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1462,6 +1462,17 @@ static struct linux_binprm *alloc_bprm(int fd, struct filename *filename, int fl
*/
bprm->is_check = !!(flags & AT_EXECVE_CHECK);
+ bprm->hwcap = ELF_HWCAP;
+#ifdef ELF_HWCAP2
+ bprm->hwcap2 = ELF_HWCAP2;
+#endif
+#ifdef ELF_HWCAP3
+ bprm->hwcap3 = ELF_HWCAP3;
+#endif
+#ifdef ELF_HWCAP4
+ bprm->hwcap4 = ELF_HWCAP4;
+#endif
+
retval = bprm_mm_init(bprm);
if (!retval)
return bprm;
@@ -1780,6 +1791,50 @@ static int bprm_execve(struct linux_binprm *bprm)
return retval;
}
+static void inherit_hwcap(struct linux_binprm *bprm)
+{
+ int i, n;
+
+#ifdef ELF_HWCAP4
+ n = 4;
+#elif defined(ELF_HWCAP3)
+ n = 3;
+#elif defined(ELF_HWCAP2)
+ n = 2;
+#else
+ n = 1;
+#endif
+
+ for (i = 0; n && i < AT_VECTOR_SIZE; i += 2) {
+ long val = current->mm->saved_auxv[i + 1];
+
+ switch (current->mm->saved_auxv[i]) {
+ case AT_HWCAP:
+ bprm->hwcap = val & ELF_HWCAP;
+ break;
+#ifdef ELF_HWCAP2
+ case AT_HWCAP2:
+ bprm->hwcap2 = val & ELF_HWCAP2;
+ break;
+#endif
+#ifdef ELF_HWCAP3
+ case AT_HWCAP3:
+ bprm->hwcap3 = val & ELF_HWCAP3;
+ break;
+#endif
+#ifdef ELF_HWCAP4
+ case AT_HWCAP4:
+ bprm->hwcap4 = val & ELF_HWCAP4;
+ break;
+#endif
+ default:
+ continue;
+ }
+ n--;
+ }
+ mm_flags_set(MMF_USER_HWCAP, bprm->mm);
+}
+
static int do_execveat_common(int fd, struct filename *filename,
struct user_arg_ptr argv,
struct user_arg_ptr envp,
@@ -1856,6 +1911,9 @@ static int do_execveat_common(int fd, struct filename *filename,
current->comm, bprm->filename);
}
+ if (mm_flags_test(MMF_USER_HWCAP, current->mm))
+ inherit_hwcap(bprm);
+
retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h
index 65abd5ab8836..94a3dcf9b1d2 100644
--- a/include/linux/binfmts.h
+++ b/include/linux/binfmts.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_BINFMTS_H
#define _LINUX_BINFMTS_H
+#include <linux/elf.h>
#include <linux/sched.h>
#include <linux/unistd.h>
#include <asm/exec.h>
@@ -67,6 +68,16 @@ struct linux_binprm {
unsigned long exec;
struct rlimit rlim_stack; /* Saved RLIMIT_STACK used during exec. */
+ unsigned long hwcap;
+#ifdef ELF_HWCAP2
+ unsigned long hwcap2;
+#endif
+#ifdef ELF_HWCAP3
+ unsigned long hwcap3;
+#endif
+#ifdef ELF_HWCAP4
+ unsigned long hwcap4;
+#endif
char buf[BINPRM_BUF_SIZE];
} __randomize_layout;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 42af2292951d..93e7aa929fda 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1862,6 +1862,8 @@ enum {
#define MMF_TOPDOWN 31 /* mm searches top down by default */
#define MMF_TOPDOWN_MASK BIT(MMF_TOPDOWN)
+#define MMF_USER_HWCAP 32 /* user-defined HWCAPs */
+
#define MMF_INIT_LEGACY_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\
MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK)
diff --git a/kernel/fork.c b/kernel/fork.c
index b1f3915d5f8e..0091315643de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1103,6 +1103,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
__mm_flags_overwrite_word(mm, mmf_init_legacy_flags(flags));
mm->def_flags = current->mm->def_flags & VM_INIT_DEF_MASK;
+
+ if (mm_flags_test(MMF_USER_HWCAP, current->mm))
+ mm_flags_set(MMF_USER_HWCAP, mm);
} else {
__mm_flags_overwrite_word(mm, default_dump_filter);
mm->def_flags = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index 8d199cf457ae..83283001abfb 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2157,8 +2157,10 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data
* not introduce additional locks here making the kernel
* more complex.
*/
- if (prctl_map.auxv_size)
+ if (prctl_map.auxv_size) {
memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+ mm_flags_set(MMF_USER_HWCAP, current->mm);
+ }
mmap_read_unlock(mm);
return 0;
@@ -2191,6 +2193,7 @@ static int prctl_set_auxv(struct mm_struct *mm, unsigned long addr,
task_lock(current);
memcpy(mm->saved_auxv, user_auxv, len);
task_unlock(current);
+ mm_flags_set(MMF_USER_HWCAP, current->mm);
return 0;
}
--
2.52.0.351.gbe84eed79e-goog
On Thu, Jan 08, 2026 at 05:07:47AM +0000, Andrei Vagin <avagin@google.com> wrote:
> @@ -1780,6 +1791,50 @@ static int bprm_execve(struct linux_binprm *bprm)
> return retval;
> }
>
> +static void inherit_hwcap(struct linux_binprm *bprm)
> +{
> + int i, n;
> +
> +#ifdef ELF_HWCAP4
> + n = 4;
> +#elif defined(ELF_HWCAP3)
> + n = 3;
> +#elif defined(ELF_HWCAP2)
> + n = 2;
> +#else
> + n = 1;
> +#endif
Is it guaranteed that HWCAP n+1 exists only when n does?
(To make this work.)
> +
> + for (i = 0; n && i < AT_VECTOR_SIZE; i += 2) {
> + long val = current->mm->saved_auxv[i + 1];
> +
> + switch (current->mm->saved_auxv[i]) {
> + case AT_HWCAP:
> + bprm->hwcap = val & ELF_HWCAP;
> + break;
> +#ifdef ELF_HWCAP2
> + case AT_HWCAP2:
> + bprm->hwcap2 = val & ELF_HWCAP2;
> + break;
> +#endif
> +#ifdef ELF_HWCAP3
> + case AT_HWCAP3:
> + bprm->hwcap3 = val & ELF_HWCAP3;
> + break;
> +#endif
> +#ifdef ELF_HWCAP4
> + case AT_HWCAP4:
> + bprm->hwcap4 = val & ELF_HWCAP4;
> + break;
> +#endif
> + default:
> + continue;
> + }
> + n--;
> + }
> + mm_flags_set(MMF_USER_HWCAP, bprm->mm);
Will this work when mm->saved_auxv hasn't been set by the prctl (is it
all zeroes then?)?
Thanks,
Michal
On Mon, Jan 12, 2026 at 4:46 AM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Thu, Jan 08, 2026 at 05:07:47AM +0000, Andrei Vagin <avagin@google.com> wrote:
> > @@ -1780,6 +1791,50 @@ static int bprm_execve(struct linux_binprm *bprm)
> > return retval;
> > }
> >
> > +static void inherit_hwcap(struct linux_binprm *bprm)
> > +{
> > + int i, n;
> > +
> > +#ifdef ELF_HWCAP4
> > + n = 4;
> > +#elif defined(ELF_HWCAP3)
> > + n = 3;
> > +#elif defined(ELF_HWCAP2)
> > + n = 2;
> > +#else
> > + n = 1;
> > +#endif
>
> Is it guaranteed that HWCAP n+1 exists only when n does?
> (To make this work.)
>
It is true for all existing architectures. I can't imagine why we would
want to define ELF_HWCAP{n+1} without having ELF_HWCAP{n}. If you think we
need to handle this case, I can address it in the next version.
It is just a small optimization to stop iterating after handling all
entries. The code will work correctly even when HWCAP n+1 exists but n
doesn't.
>
> > +
> > + for (i = 0; n && i < AT_VECTOR_SIZE; i += 2) {
> > + long val = current->mm->saved_auxv[i + 1];
> > +
> > + switch (current->mm->saved_auxv[i]) {
> > + case AT_HWCAP:
> > + bprm->hwcap = val & ELF_HWCAP;
> > + break;
> > +#ifdef ELF_HWCAP2
> > + case AT_HWCAP2:
> > + bprm->hwcap2 = val & ELF_HWCAP2;
> > + break;
> > +#endif
> > +#ifdef ELF_HWCAP3
> > + case AT_HWCAP3:
> > + bprm->hwcap3 = val & ELF_HWCAP3;
> > + break;
> > +#endif
> > +#ifdef ELF_HWCAP4
> > + case AT_HWCAP4:
> > + bprm->hwcap4 = val & ELF_HWCAP4;
> > + break;
> > +#endif
> > + default:
> > + continue;
> > + }
> > + n--;
> > + }
> > + mm_flags_set(MMF_USER_HWCAP, bprm->mm);
>
> Will this work when mm->saved_auxv isn't set by the prctl (it is
> zeroes?)?
The inherit_hwcap function is only called if MMF_USER_HWCAP is set (auxv was
modified via prctl). However, even if mm->saved_auxv hasn't been
modified, it still contains valid values.
Thanks,
Andrei
ps: Please ignore the html version I mistakenly sent.
On Mon, Jan 12, 2026 at 02:18:18PM -0800, Andrei Vagin <avagin@google.com> wrote:
> It is true for all existing arch-es. I can't imagine why we would want to
> define ELF_HWCAP{n+1} without having ELF_HWCAP{n}. If you think we need
> to handle this case, I can address it in the next version.
>
> It is just a small optimization to stop iterating after handling all
> entries. The code will work correctly even when HWCAP n+1 exists but n
> doesn't.
Indeed (I accidentally ignored the AT_VECTOR_SIZE condition), it turns
out to be no big deal then.
I like that it's not needlessly searched (and copied altogether).
> The inherit_hwcap function is only called if MMF_USER_HWCAP is set (auxv was
> modified via prctl). However, even if mm->saved_auxv hasn't been
> modified, it still contains valid values.
Hm, bprm_mm_init/mm_alloc/mm_init would transfer the flag from
current, I'm still unclear whether it is necessary here. (It should do
no harm though.)
saved_auxv validity seems OK then.
One more thing came to my mind -- synchronization between prctl'ing
and exec'ing threads (I see de_thread() is relatively late after
bprm_mm_init()).
Thanks,
Michal
On Wed, Jan 14, 2026 at 1:25 PM Michal Koutný <mkoutny@suse.com> wrote:
>
> On Mon, Jan 12, 2026 at 02:18:18PM -0800, Andrei Vagin <avagin@google.com> wrote:
> > It is true for all existing arch-es. I can't imagine why we would want to
> > define ELF_HWCAP{n+1} without having ELF_HWCAP{n}. If you think we need
> > to handle this case, I can address it in the next version.
> >
> > It is just a small optimization to stop iterating after handling all
> > entries. The code will work correctly even when HWCAP n+1 exists but n
> > doesn't.
>
> Indeed (I accidentally ignored the AT_VECTOR_SIZE condition), it turns
> out no big deal then.
> I like that it's not needlessly searched (and copied altogether).
>
> > The inherit_hwcap function is only called if MMF_USER_HWCAP is set (auxv was
> > modified via prctl). However, even if mm->saved_auxv hasn't been
> > modified, it still contains valid values.
>
> Hm, bprm_mm_init/mm_alloc/mm_init would tranfser the flag from
> current, I'm still unclear whether it is necessary here. (It should make
> no harm though.)
It is just another optimization. Without this flag, we would need to
parse mm->saved_auxv even when it hasn't been changed.
>
> saved_auxv validity seems OK then.
>
> One more thing came up to my mind -- synchronization between prctl'ing
> and exec'ing threads (I see de_thread() is relatively late after
> bprm__mm_init()).
Currently, it is the user's responsibility to synchronize these calls.
The comment in prctl_set_mm_map states:
Note this update of @saved_auxv is lockless thus
if someone reads this member in procfs while we're
updating -- it may get partly updated results. It's
known and acceptable trade off: we leave it as is to
not introduce additional locks here making the kernel
more complex.
Without synchronization between threads calling prctl() and execve(), a
new process could be executed with inconsistent HWCAPs. However, this
would not trigger any issues within the kernel. If we decide to
synchronize access to saved_auxv, we can use mm->arg_lock for that
purpose.
Thanks,
Andrei
© 2016 - 2026 Red Hat, Inc.