From: "Mike Rapoport (Microsoft)" <rppt@kernel.org>
Currently execmem_cache_free() ignores potential allocation failures that
may happen in execmem_cache_add(). Besides, it uses text poking to fill the
memory with trapping instructions before returning it to the cache, although
it would be more efficient to make that memory writable, update it using
memcpy() and then restore ROX protection.
Rework execmem_cache_free() so that in case of an error it will defer
freeing of the memory to a delayed work.
With this, the happy fast path will now change permissions to RW, fill the
memory with trapping instructions using memcpy(), restore ROX permissions,
add the memory back to the free cache and clear the relevant entry in
busy_areas.
If any step in the fast path fails, the entry in busy_areas will be marked
as pending_free. These entries will be handled by a delayed work and freed
asynchronously.
To make the fast path faster, use __GFP_NORETRY for memory allocations and
let the asynchronous handler try harder with GFP_KERNEL.
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
mm/execmem.c | 120 +++++++++++++++++++++++++++++++++++++++++----------
1 file changed, 97 insertions(+), 23 deletions(-)
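
A quick way to get a feel for the PENDING_FREE_MASK pointer tagging used in
the patch below: areas tracked in busy_areas are aligned well enough that bit
PAGE_SHIFT - 1 of their start address is never set, so that bit can carry the
"pending free" flag. The following stand-alone userspace model mirrors the
helpers from the patch; the PAGE_SHIFT value, aligned_alloc() and the printf
output are illustrative only, the diff below is the authoritative version.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SHIFT		12	/* assume 4K pages for the demo */
#define PENDING_FREE_MASK	(1UL << (PAGE_SHIFT - 1))

static int is_pending_free(void *ptr)
{
	return ((uintptr_t)ptr & PENDING_FREE_MASK) != 0;
}

static void *pending_free_set(void *ptr)
{
	return (void *)((uintptr_t)ptr | PENDING_FREE_MASK);
}

static void *pending_free_clear(void *ptr)
{
	return (void *)((uintptr_t)ptr & ~PENDING_FREE_MASK);
}

int main(void)
{
	/* model a page-aligned area the way the cache hands them out */
	void *area = aligned_alloc(1UL << PAGE_SHIFT, 1UL << PAGE_SHIFT);

	if (!area)
		return 1;

	/* page alignment guarantees the tag bit starts out clear */
	assert(!is_pending_free(area));

	/* what the error path stores back into busy_areas */
	void *tagged = pending_free_set(area);
	assert(is_pending_free(tagged));

	/* the delayed worker recovers the original pointer */
	assert(pending_free_clear(tagged) == area);

	printf("area=%p tagged=%p\n", area, tagged);
	free(area);
	return 0;
}
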
diff --git a/mm/execmem.c b/mm/execmem.c
index 6b040fbc5f4f..1cc781244593 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -93,8 +93,15 @@ struct execmem_cache {
struct mutex mutex;
struct maple_tree busy_areas;
struct maple_tree free_areas;
+ unsigned int pending_free_cnt; /* protected by mutex */
};
+/* delay to schedule asynchronous free if fast path free fails */
+#define FREE_DELAY (msecs_to_jiffies(10))
+
+/* mark entries in busy_areas that should be freed asynchronously */
+#define PENDING_FREE_MASK (1 << (PAGE_SHIFT - 1))
+
static struct execmem_cache execmem_cache = {
.mutex = __MUTEX_INITIALIZER(execmem_cache.mutex),
.busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN,
@@ -155,20 +162,17 @@ static void execmem_cache_clean(struct work_struct *work)
static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean);
-static int execmem_cache_add(void *ptr, size_t size)
+static int execmem_cache_add_locked(void *ptr, size_t size, gfp_t gfp_mask)
{
struct maple_tree *free_areas = &execmem_cache.free_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, free_areas, addr - 1, addr + 1);
unsigned long lower, upper;
void *area = NULL;
- int err;
lower = addr;
upper = addr + size - 1;
- mutex_lock(mutex);
area = mas_walk(&mas);
if (area && mas.last == addr - 1)
lower = mas.index;
@@ -178,12 +182,14 @@ static int execmem_cache_add(void *ptr, size_t size)
upper = mas.last;
mas_set_range(&mas, lower, upper);
- err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL);
- mutex_unlock(mutex);
- if (err)
- return err;
+ return mas_store_gfp(&mas, (void *)lower, gfp_mask);
+}
- return 0;
+static int execmem_cache_add(void *ptr, size_t size, gfp_t gfp_mask)
+{
+ guard(mutex)(&execmem_cache.mutex);
+
+ return execmem_cache_add_locked(ptr, size, gfp_mask);
}
static bool within_range(struct execmem_range *range, struct ma_state *mas,
@@ -278,7 +284,7 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
if (err)
goto err_free_mem;
- err = execmem_cache_add(p, alloc_size);
+ err = execmem_cache_add(p, alloc_size, GFP_KERNEL);
if (err)
goto err_reset_direct_map;
@@ -307,33 +313,101 @@ static void *execmem_cache_alloc(struct execmem_range *range, size_t size)
return __execmem_cache_alloc(range, size);
}
+static inline bool is_pending_free(void *ptr)
+{
+ return ((unsigned long)ptr & PENDING_FREE_MASK);
+}
+
+static inline void *pending_free_set(void *ptr)
+{
+ return (void *)((unsigned long)ptr | PENDING_FREE_MASK);
+}
+
+static inline void *pending_free_clear(void *ptr)
+{
+ return (void *)((unsigned long)ptr & ~PENDING_FREE_MASK);
+}
+
+static int execmem_force_rw(void *ptr, size_t size);
+
+static int __execmem_cache_free(struct ma_state *mas, void *ptr, gfp_t gfp_mask)
+{
+ size_t size = mas_range_len(mas);
+ int err;
+
+ err = execmem_force_rw(ptr, size);
+ if (err)
+ return err;
+
+ execmem_fill_trapping_insns(ptr, size, /* writable = */ true);
+ execmem_restore_rox(ptr, size);
+
+ err = execmem_cache_add_locked(ptr, size, gfp_mask);
+ if (err)
+ return err;
+
+ mas_store_gfp(mas, NULL, gfp_mask);
+ return 0;
+}
+
+static void execmem_cache_free_slow(struct work_struct *work);
+static DECLARE_DELAYED_WORK(execmem_cache_free_work, execmem_cache_free_slow);
+
+static void execmem_cache_free_slow(struct work_struct *work)
+{
+ struct maple_tree *busy_areas = &execmem_cache.busy_areas;
+ MA_STATE(mas, busy_areas, 0, ULONG_MAX);
+ void *area;
+
+ guard(mutex)(&execmem_cache.mutex);
+
+ if (!execmem_cache.pending_free_cnt)
+ return;
+
+ mas_for_each(&mas, area, ULONG_MAX) {
+ if (!is_pending_free(area))
+ continue;
+
+ pending_free_clear(area);
+ if (__execmem_cache_free(&mas, area, GFP_KERNEL))
+ continue;
+
+ execmem_cache.pending_free_cnt--;
+ }
+
+ if (execmem_cache.pending_free_cnt)
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ else
+ schedule_work(&execmem_cache_clean_work);
+}
+
static bool execmem_cache_free(void *ptr)
{
struct maple_tree *busy_areas = &execmem_cache.busy_areas;
- struct mutex *mutex = &execmem_cache.mutex;
unsigned long addr = (unsigned long)ptr;
MA_STATE(mas, busy_areas, addr, addr);
- size_t size;
void *area;
+ int err;
+
+ guard(mutex)(&execmem_cache.mutex);
- mutex_lock(mutex);
area = mas_walk(&mas);
- if (!area) {
- mutex_unlock(mutex);
+ if (!area)
return false;
- }
- size = mas_range_len(&mas);
-
- mas_store_gfp(&mas, NULL, GFP_KERNEL);
- mutex_unlock(mutex);
-
- execmem_fill_trapping_insns(ptr, size, /* writable = */ false);
- execmem_cache_add(ptr, size);
+ err = __execmem_cache_free(&mas, ptr, GFP_KERNEL | __GFP_NORETRY);
+ if (err)
+ goto err_slowpath;
schedule_work(&execmem_cache_clean_work);
return true;
+
+err_slowpath:
+ mas_store_gfp(&mas, pending_free_set(ptr), GFP_KERNEL);
+ execmem_cache.pending_free_cnt++;
+ schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
+ return true;
}
static int execmem_force_rw(void *ptr, size_t size)
--
2.47.2
On Fri, Jul 4, 2025 at 3:54 PM Mike Rapoport <rppt@kernel.org> wrote:
> +
> +static void execmem_cache_free_slow(struct work_struct *work)
> +{
> +	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
> +	MA_STATE(mas, busy_areas, 0, ULONG_MAX);
> +	void *area;
> +
> +	guard(mutex)(&execmem_cache.mutex);
> +
> +	if (!execmem_cache.pending_free_cnt)
> +		return;
> +
> +	mas_for_each(&mas, area, ULONG_MAX) {
> +		if (!is_pending_free(area))
> +			continue;
> +
> +		pending_free_clear(area);

Probably:
	area = pending_free_clear(area);
?

> +		if (__execmem_cache_free(&mas, area, GFP_KERNEL))
> +			continue;
> +
> +		execmem_cache.pending_free_cnt--;
> +	}
> +
> +	if (execmem_cache.pending_free_cnt)
> +		schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
> +	else
> +		schedule_work(&execmem_cache_clean_work);
> +}


Regards;
Yann.

On Mon, Jul 07, 2025 at 05:32:11PM +0200, Yann Ylavic wrote:
> On Fri, Jul 4, 2025 at 3:54 PM Mike Rapoport <rppt@kernel.org> wrote:
> > +	mas_for_each(&mas, area, ULONG_MAX) {
> > +		if (!is_pending_free(area))
> > +			continue;
> > +
> > +		pending_free_clear(area);
>
> Probably:
> 	area = pending_free_clear(area);
> ?

Right, thanks!

-- 
Sincerely yours,
Mike.

On Mon, Jul 7, 2025 at 5:32 PM Yann Ylavic <ylavic.dev@gmail.com> wrote:
>
> On Fri, Jul 4, 2025 at 3:54 PM Mike Rapoport <rppt@kernel.org> wrote:
> > +	mas_for_each(&mas, area, ULONG_MAX) {
> > +		if (!is_pending_free(area))
> > +			continue;
> > +
> > +		pending_free_clear(area);
>
> Probably:
> 	area = pending_free_clear(area);
> ?

Likewise in execmem_cache_free_slow() btw.

On Fri, Jul 04, 2025 at 04:49:38PM +0300, Mike Rapoport wrote:
>  static bool execmem_cache_free(void *ptr)
>  {
>  	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
>  	unsigned long addr = (unsigned long)ptr;
>  	MA_STATE(mas, busy_areas, addr, addr);
>  	void *area;
> +	int err;
> +
> +	guard(mutex)(&execmem_cache.mutex);
>
>  	area = mas_walk(&mas);
> +	if (!area)
>  		return false;
>
> +	err = __execmem_cache_free(&mas, ptr, GFP_KERNEL | __GFP_NORETRY);
> +	if (err)
> +		goto err_slowpath;
>
>  	schedule_work(&execmem_cache_clean_work);
>
>  	return true;
> +
> +err_slowpath:
> +	mas_store_gfp(&mas, pending_free_set(ptr), GFP_KERNEL);
> +	execmem_cache.pending_free_cnt++;
> +	schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
> +	return true;
>  }

This is a bit of an anti-pattern, using guard() and error goto. Since
there is only the one site, it's best to write it like so:

static bool execmem_cache_free(void *ptr)
{
	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
	unsigned long addr = (unsigned long)ptr;
	MA_STATE(mas, busy_areas, addr, addr);
	void *area;
	int err;

	guard(mutex)(&execmem_cache.mutex);

	area = mas_walk(&mas);
	if (!area)
		return false;

	err = __execmem_cache_free(&mas, ptr, GFP_KERNEL | __GFP_NORETRY);
	if (err) {
		mas_store_gfp(&mas, pending_free_set(ptr), GFP_KERNEL);
		execmem_cache.pending_free_cnt++;
		schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
		return true;
	}

	schedule_work(&execmem_cache_clean_work);
	return true;
}

And now I have to ask what happens if mas_store_gfp() returns an error?

On Mon, Jul 07, 2025 at 01:11:02PM +0200, Peter Zijlstra wrote:
> On Fri, Jul 04, 2025 at 04:49:38PM +0300, Mike Rapoport wrote:
> > +err_slowpath:
> > +	mas_store_gfp(&mas, pending_free_set(ptr), GFP_KERNEL);
> > +	execmem_cache.pending_free_cnt++;
> > +	schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
> > +	return true;
> >  }
>
> This is a bit of an anti-pattern, using guard() and error goto. Since

Good to know :)

> there is only the one site, it's best to write it like so:
>
> [ ... ]
>
> And now I have to ask what happens if mas_store_gfp() returns an error?

AFAIU it won't. mas points to exact slot we've got the area from, nothing else
can modify the tree because of the mutex, so that mas_store_gfp()
essentially updates the value at an existing entry.

I'll add a comment about it.

Added @Liam to make sure I'm not saying nonsense :)

-- 
Sincerely yours,
Mike.

* Mike Rapoport <rppt@kernel.org> [250707 07:32]:
> On Mon, Jul 07, 2025 at 01:11:02PM +0200, Peter Zijlstra wrote:
> >
> > And now I have to ask what happens if mas_store_gfp() returns an error?
>
> AFAIU it won't. mas points to exact slot we've got the area from, nothing else
> can modify the tree because of the mutex, so that mas_store_gfp()
> essentially updates the value at an existing entry.
>
> I'll add a comment about it.
>
> Added @Liam to make sure I'm not saying nonsense :)

Yes, if there is already a node with a value with the same range, there
will be no allocations that will happen, so it'll just change the
pointer for you.  This is a slot store operation.

But, if it's possible to have no entries (an empty tree, or a single
value at 0), you will most likely allocate a node to store it, which is
256B.

I don't think this is a concern in this particular case though as you
are searching for an entry and storing, so it needs to exist.  So
really, the only scenario here is if you store 1 - ULONG_MAX (without
having expanded a root node) or 0 - ULONG_MAX, and that seems invalid.

Thanks,
Liam

On Mon, Jul 07, 2025 at 11:06:25AM -0400, Liam R. Howlett wrote:
> * Mike Rapoport <rppt@kernel.org> [250707 07:32]:
> > AFAIU it won't. mas points to exact slot we've got the area from, nothing else
> > can modify the tree because of the mutex, so that mas_store_gfp()
> > essentially updates the value at an existing entry.
>
> Yes, if there is already a node with a value with the same range, there
> will be no allocations that will happen, so it'll just change the
> pointer for you.  This is a slot store operation.
>
> But, if it's possible to have no entries (an empty tree, or a single
> value at 0), you will most likely allocate a node to store it, which is
> 256B.
>
> I don't think this is a concern in this particular case though as you
> are searching for an entry and storing, so it needs to exist.  So
> really, the only scenario here is if you store 1 - ULONG_MAX (without
> having expanded a root node) or 0 - ULONG_MAX, and that seems invalid.

Thanks for clarification, Liam!

The tree cannot be empty at that point and if it has a single value, it
won't be at 0, I'm quite sure no architecture has execmem areas at 0.

> Thanks,
> Liam

-- 
Sincerely yours,
Mike.

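One possible shape for the comment promised above, on the error branch of
execmem_cache_free() as restructured earlier in this thread (the wording and
placement are a guess, not taken from a posted follow-up):

	if (err) {
		/*
		 * mas still points at the slot mas_walk() found above and
		 * the mutex is held, so this is a slot store over an
		 * existing entry and mas_store_gfp() does not allocate.
		 */
		mas_store_gfp(&mas, pending_free_set(ptr), GFP_KERNEL);
		execmem_cache.pending_free_cnt++;
		schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
		return true;
	}
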
On Mon, Jul 07, 2025 at 06:12:26PM +0300, Mike Rapoport wrote:
> On Mon, Jul 07, 2025 at 11:06:25AM -0400, Liam R. Howlett wrote:
> > I don't think this is a concern in this particular case though as you
> > are searching for an entry and storing, so it needs to exist.  So
> > really, the only scenario here is if you store 1 - ULONG_MAX (without
> > having expanded a root node) or 0 - ULONG_MAX, and that seems invalid.
>
> Thanks for clarification, Liam!
>
> The tree cannot be empty at that point and if it has a single value, it
> won't be at 0, I'm quite sure no architecture has execmem areas at 0.

Would it make sense to have something like GFP_NO_ALLOC to pass to
functions like this where we know it won't actually allocate -- and
which when it does reach the allocator generates a WARN and returns NULL?

On Tue, Jul 08, 2025 at 09:26:49AM +0200, Peter Zijlstra wrote:
> On Mon, Jul 07, 2025 at 06:12:26PM +0300, Mike Rapoport wrote:
> > The tree cannot be empty at that point and if it has a single value, it
> > won't be at 0, I'm quite sure no architecture has execmem areas at 0.
>
> Would it make sense to have something like GFP_NO_ALLOC to pass to
> functions like this where we know it won't actually allocate -- and
> which when it does reach the allocator generates a WARN and returns NULL?

We can add a WARN at the caller as well, that won't require a new gfp flag.

The question is how to recover if such a thing happens; I don't really see
what execmem can do here if mas_store_gfp() returns an error :/

-- 
Sincerely yours,
Mike.

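If the WARN route is taken, the minimal form in that same error branch could
look something like this (illustrative sketch only, not from a posted patch):

	if (err) {
		/* slot store over the entry mas_walk() found; must not fail */
		WARN_ON_ONCE(mas_store_gfp(&mas, pending_free_set(ptr),
					   GFP_KERNEL));
		execmem_cache.pending_free_cnt++;
		schedule_delayed_work(&execmem_cache_free_work, FREE_DELAY);
		return true;
	}
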