[PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count

Oscar Salvador posted 5 patches 6 months, 4 weeks ago
There is a newer version of this series
[PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 4 weeks ago
page_owner needs to increment a stack_record refcount when a new allocation
occurs, and decrement it on a free operation.
In order to do that, we need to have a way to get a stack_record from a
handle.
Implement __stack_depot_get_stack_record() which just does that, and make
it public so page_owner can use it.

Also implement {inc,dec}_stack_record_count() which increments
or decrements on respective allocation and free operations, via
__reset_page_owner() (free operation) and __set_page_owner() (alloc
operation).

Traversing all stackdepot buckets comes with its own complexity,
plus we would have to implement a way to mark only those stack_records
that were originated from page_owner, as those are the ones we are
interested in.
For that reason, page_owner maintains its own list of stack_records,
because traversing that list is faster than traversing all buckets
while keeping at the same time a low complexity.
inc_stack_record_count() is responsible for adding new stack_records
into the list stack_list.

Modifications on the list are protected via a spinlock with irqs
disabled, since this code can also be reached from IRQ context.

Signed-off-by: Oscar Salvador <osalvador@suse.de>
---
 include/linux/stackdepot.h |  9 +++++
 lib/stackdepot.c           |  8 +++++
 mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
index 90274860fd8e..f3c2162bf615 100644
--- a/include/linux/stackdepot.h
+++ b/include/linux/stackdepot.h
@@ -175,6 +175,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
 				      unsigned int nr_entries, gfp_t gfp_flags);
 
+/**
+ * __stack_depot_get_stack_record - Get a pointer to a stack_record struct
+ * This function is only for internal purposes.
+ * @handle: Stack depot handle
+ *
+ * Return: Returns a pointer to a stack_record struct
+ */
+struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle);
+
 /**
  * stack_depot_fetch - Fetch a stack trace from stack depot
  *
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 6f9095374847..fdb09450a538 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -685,6 +685,14 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 }
 EXPORT_SYMBOL_GPL(stack_depot_save);
 
+struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle)
+{
+	if (!handle)
+		return NULL;
+
+	return depot_fetch_stack(handle);
+}
+
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 			       unsigned long **entries)
 {
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 5634e5d890f8..7d1b3f75cef3 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -36,6 +36,14 @@ struct page_owner {
 	pid_t free_tgid;
 };
 
+struct stack {
+	struct stack_record *stack_record;
+	struct stack *next;
+};
+
+static struct stack *stack_list;
+static DEFINE_SPINLOCK(stack_list_lock);
+
 static bool page_owner_enabled __initdata;
 DEFINE_STATIC_KEY_FALSE(page_owner_inited);
 
@@ -61,6 +69,57 @@ static __init bool need_page_owner(void)
 	return page_owner_enabled;
 }
 
+static void add_stack_record_to_list(struct stack_record *stack_record)
+{
+	unsigned long flags;
+	struct stack *stack;
+
+	stack = kmalloc(sizeof(*stack), GFP_KERNEL);
+	if (stack) {
+		stack->stack_record = stack_record;
+		stack->next = NULL;
+
+		spin_lock_irqsave(&stack_list_lock, flags);
+		if (!stack_list) {
+			stack_list = stack;
+		} else {
+			stack->next = stack_list;
+			stack_list = stack;
+		}
+		spin_unlock_irqrestore(&stack_list_lock, flags);
+	}
+}
+
+static void inc_stack_record_count(depot_stack_handle_t handle)
+{
+	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+	if (stack_record) {
+		/*
+		 * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
+		 * with REFCOUNT_SATURATED to catch spurious increments of their
+		 * refcount.
+		 * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
+		 * set a refcount of 1 ourselves.
+		 */
+		if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
+			refcount_set(&stack_record->count, 1);
+
+			/* Add the new stack_record to our list */
+			add_stack_record_to_list(stack_record);
+		}
+		refcount_inc(&stack_record->count);
+	}
+}
+
+static void dec_stack_record_count(depot_stack_handle_t handle)
+{
+	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
+
+	if (stack_record)
+		refcount_dec(&stack_record->count);
+}
+
 static __always_inline depot_stack_handle_t create_dummy_stack(void)
 {
 	unsigned long entries[4];
@@ -140,6 +199,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
 	int i;
 	struct page_ext *page_ext;
 	depot_stack_handle_t handle;
+	depot_stack_handle_t alloc_handle;
 	struct page_owner *page_owner;
 	u64 free_ts_nsec = local_clock();
 
@@ -147,6 +207,9 @@ void __reset_page_owner(struct page *page, unsigned short order)
 	if (unlikely(!page_ext))
 		return;
 
+	page_owner = get_page_owner(page_ext);
+	alloc_handle = page_owner->handle;
+
 	handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
 	for (i = 0; i < (1 << order); i++) {
 		__clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -158,6 +221,15 @@ void __reset_page_owner(struct page *page, unsigned short order)
 		page_ext = page_ext_next(page_ext);
 	}
 	page_ext_put(page_ext);
+	if (alloc_handle != early_handle)
+		/*
+		 * early_handle is being set as a handle for all those
+		 * early allocated pages. See init_pages_in_zone().
+		 * Since their refcount is not being incremented because
+		 * the machinery is not ready yet, we cannot decrement
+		 * their refcount either.
+		 */
+		dec_stack_record_count(alloc_handle);
 }
 
 static inline void __set_page_owner_handle(struct page_ext *page_ext,
@@ -199,6 +271,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
 		return;
 	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
 	page_ext_put(page_ext);
+	inc_stack_record_count(handle);
 }
 
 void __set_page_owner_migrate_reason(struct page *page, int reason)
-- 
2.43.0
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Vlastimil Babka 6 months, 3 weeks ago
On 2/12/24 23:30, Oscar Salvador wrote:
> page_owner needs to increment a stack_record refcount when a new allocation
> occurs, and decrement it on a free operation.
> In order to do that, we need to have a way to get a stack_record from a
> handle.
> Implement __stack_depot_get_stack_record() which just does that, and make
> it public so page_owner can use it.
> 
> Also implement {inc,dec}_stack_record_count() which increments
> or decrements on respective allocation and free operations, via
> __reset_page_owner() (free operation) and __set_page_owner() (alloc
> operation).
> 
> Traversing all stackdepot buckets comes with its own complexity,
> plus we would have to implement a way to mark only those stack_records
> that were originated from page_owner, as those are the ones we are
> interested in.
> For that reason, page_owner maintains its own list of stack_records,
> because traversing that list is faster than traversing all buckets
> while keeping at the same time a low complexity.
> inc_stack_record_count() is responsible of adding new stack_records
> into the list stack_list.
> 
> Modifications on the list are protected via a spinlock with irqs
> disabled, since this code can also be reached from IRQ context.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  include/linux/stackdepot.h |  9 +++++
>  lib/stackdepot.c           |  8 +++++
>  mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 90 insertions(+)
>  static inline void __set_page_owner_handle(struct page_ext *page_ext,
> @@ -199,6 +271,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
>  		return;
>  	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
>  	page_ext_put(page_ext);
> +	inc_stack_record_count(handle);

What if this is dummy handle, which means we have recursed in page owner,
and we'll be trying to kmalloc() its struct stack and link it to the
stack_list because it was returned for the first time? Also failure_handle.
Could you pre-create static (not kmalloc) struct stack for these handles
with refcount of 0 and insert them to stack_list, all during
init_page_owner()? Bonus: no longer treating stack_list == NULL in a special
way in add_stack_record_to_list() (although you don't need to handle it
extra even now, AFAICS).

>  }
>  
>  void __set_page_owner_migrate_reason(struct page *page, int reason)
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 3 weeks ago
On Tue, Feb 13, 2024 at 02:42:25PM +0100, Vlastimil Babka wrote:
> On 2/12/24 23:30, Oscar Salvador wrote:
> >  	__set_page_owner_handle(page_ext, handle, order, gfp_mask);
> >  	page_ext_put(page_ext);
> > +	inc_stack_record_count(handle);
> 
> What if this is dummy handle, which means we have recursed in page owner,
> and we'll by trying to kmalloc() its struct stack and link it to the
> stack_list because it was returned for the first time? Also failure_handle.
> Could you pre-create static (not kmalloc) struct stack for these handles
> with refcount of 0 and insert them to stack_list, all during
> init_page_owner()? Bonus: no longer treating stack_list == NULL in a special
> way in add_stack_record_to_list() (although you don't need to handle it
> extra even now, AFAICS).

Good catch. I did not think about this scenario, but this could
definitely happen.
Yeah, maybe creating an array of 2 structs for {dummy,failure}_handle
and link them into stack_list.

I thought about giving them a refcount of 1, because we only print
stacks which refcount > 1 anyways, but setting it to 0 has comes with
the advantage of catching spurious increments, should someone call
refcount_inc on those (which should not really happen).

I will try to implement it.

Thanks

-- 
Oscar Salvador
SUSE Labs
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 3 weeks ago
On Tue, Feb 13, 2024 at 04:29:28PM +0100, Oscar Salvador wrote:
> I thought about giving them a refcount of 1, because we only print
> stacks which refcount > 1 anyways, but setting it to 0 has comes with
> the advantage of catching spurious increments, should someone call
> refcount_inc on those (which should not really happen).

On a second thought, I think we want a refcount of 1 for these stacks
at the beginning.
So should we e.g: re-enter page_owner, we would increment dummy_stack's
refcount. If refcount is 0, we will get a warning.



-- 
Oscar Salvador
SUSE Labs
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Vlastimil Babka 6 months, 3 weeks ago
On 2/12/24 23:30, Oscar Salvador wrote:
> page_owner needs to increment a stack_record refcount when a new allocation
> occurs, and decrement it on a free operation.
> In order to do that, we need to have a way to get a stack_record from a
> handle.
> Implement __stack_depot_get_stack_record() which just does that, and make
> it public so page_owner can use it.
> 
> Also implement {inc,dec}_stack_record_count() which increments
> or decrements on respective allocation and free operations, via
> __reset_page_owner() (free operation) and __set_page_owner() (alloc
> operation).
> 
> Traversing all stackdepot buckets comes with its own complexity,
> plus we would have to implement a way to mark only those stack_records
> that were originated from page_owner, as those are the ones we are
> interested in.
> For that reason, page_owner maintains its own list of stack_records,
> because traversing that list is faster than traversing all buckets
> while keeping at the same time a low complexity.
> inc_stack_record_count() is responsible of adding new stack_records
> into the list stack_list.
> 
> Modifications on the list are protected via a spinlock with irqs
> disabled, since this code can also be reached from IRQ context.
> 
> Signed-off-by: Oscar Salvador <osalvador@suse.de>
> ---
>  include/linux/stackdepot.h |  9 +++++
>  lib/stackdepot.c           |  8 +++++
>  mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 90 insertions(+)

...


> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -36,6 +36,14 @@ struct page_owner {
>  	pid_t free_tgid;
>  };
>  
> +struct stack {
> +	struct stack_record *stack_record;
> +	struct stack *next;
> +};
> +
> +static struct stack *stack_list;
> +static DEFINE_SPINLOCK(stack_list_lock);
> +
>  static bool page_owner_enabled __initdata;
>  DEFINE_STATIC_KEY_FALSE(page_owner_inited);
>  
> @@ -61,6 +69,57 @@ static __init bool need_page_owner(void)
>  	return page_owner_enabled;
>  }
>  
> +static void add_stack_record_to_list(struct stack_record *stack_record)
> +{
> +	unsigned long flags;
> +	struct stack *stack;
> +
> +	stack = kmalloc(sizeof(*stack), GFP_KERNEL);

I doubt you can use GFP_KERNEL unconditionally? Think you need to pass down
the gfp flags from __set_page_owner() here?
And what about the alloc failure case, this will just leave the stack record
unlinked forever? Can we somehow know which ones we failed to link, and try
next time? Probably easier by not recording the stack for the page at all in
that case, so the next attempt with the same stack looks like the very first
again and attemps the add to list.
Still not happy that these extra tracking objects are needed, but I guess
the per-users stack depot instances I suggested would be a major change.

> +	if (stack) {
> +		stack->stack_record = stack_record;
> +		stack->next = NULL;
> +
> +		spin_lock_irqsave(&stack_list_lock, flags);
> +		if (!stack_list) {
> +			stack_list = stack;
> +		} else {
> +			stack->next = stack_list;
> +			stack_list = stack;
> +		}
> +		spin_unlock_irqrestore(&stack_list_lock, flags);
> +	}
> +}
> +
> +static void inc_stack_record_count(depot_stack_handle_t handle)
> +{
> +	struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
> +
> +	if (stack_record) {
> +		/*
> +		 * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
> +		 * with REFCOUNT_SATURATED to catch spurious increments of their
> +		 * refcount.
> +		 * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
> +		 * set a refcount of 1 ourselves.
> +		 */
> +		if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
> +			refcount_set(&stack_record->count, 1);

Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
from REFCOUNT_SATURATED to 1?

> +			/* Add the new stack_record to our list */
> +			add_stack_record_to_list(stack_record);
> +		}
> +		refcount_inc(&stack_record->count);
> +	}
> +}
> +
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 3 weeks ago
On Tue, Feb 13, 2024 at 10:16:04AM +0100, Vlastimil Babka wrote:
> On 2/12/24 23:30, Oscar Salvador wrote:
> > +static void add_stack_record_to_list(struct stack_record *stack_record)
> > +{
> > +	unsigned long flags;
> > +	struct stack *stack;
> > +
> > +	stack = kmalloc(sizeof(*stack), GFP_KERNEL);
> 
> I doubt you can use GFP_KERNEL unconditionally? Think you need to pass down
> the gfp flags from __set_page_owner() here?

Yes, we should re-use the same gfp flags.

> And what about the alloc failure case, this will just leave the stack record
> unlinked forever? Can we somehow know which ones we failed to link, and try
> next time? Probably easier by not recording the stack for the page at all in
> that case, so the next attempt with the same stack looks like the very first
> again and attemps the add to list.

Well, it is not that easy.
We could, before even calling into stackdepot to save the trace, try to
allocate a "struct stack", and skip everything if that fails, so
subsequent calls will try again as if this was the first time.

The thing I do not like about that is that it gets in the way of
traditional page_owner functionality (to put it a name), meaning that
if we cannot allocate a "struct stack", we also do not log the page
and the stack as we usually do, which means we lose that information.

One could argue that if we are so screwed that we cannot allocate that
struct, we may also fail deep in stackdepot code when allocating
the stack_record struct, but who knows.
I tried to keep both features as independent as possible.

> Still not happy that these extra tracking objects are needed, but I guess
> the per-users stack depot instances I suggested would be a major change.

Well, it is definitely something I could have a look in a short-term
future, to see if it makes sense, but for now I would go this the
current state as it has a well balanced complexity vs optimization.

> > +	if (stack_record) {
> > +		/*
> > +		 * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
> > +		 * with REFCOUNT_SATURATED to catch spurious increments of their
> > +		 * refcount.
> > +		 * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
> > +		 * set a refcount of 1 ourselves.
> > +		 */
> > +		if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
> > +			refcount_set(&stack_record->count, 1);
> 
> Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
> from REFCOUNT_SATURATED to 1?

Yeah, missed that. Probably check it under the lock as Marco suggested.

Thanks Vlastimil for the feedback!


-- 
Oscar Salvador
SUSE Labs
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Marco Elver 6 months, 3 weeks ago
On Tue, 13 Feb 2024 at 10:16, Vlastimil Babka <vbabka@suse.cz> wrote:
>
> On 2/12/24 23:30, Oscar Salvador wrote:
> > page_owner needs to increment a stack_record refcount when a new allocation
> > occurs, and decrement it on a free operation.
> > In order to do that, we need to have a way to get a stack_record from a
> > handle.
> > Implement __stack_depot_get_stack_record() which just does that, and make
> > it public so page_owner can use it.
> >
> > Also implement {inc,dec}_stack_record_count() which increments
> > or decrements on respective allocation and free operations, via
> > __reset_page_owner() (free operation) and __set_page_owner() (alloc
> > operation).
> >
> > Traversing all stackdepot buckets comes with its own complexity,
> > plus we would have to implement a way to mark only those stack_records
> > that were originated from page_owner, as those are the ones we are
> > interested in.
> > For that reason, page_owner maintains its own list of stack_records,
> > because traversing that list is faster than traversing all buckets
> > while keeping at the same time a low complexity.
> > inc_stack_record_count() is responsible of adding new stack_records
> > into the list stack_list.
> >
> > Modifications on the list are protected via a spinlock with irqs
> > disabled, since this code can also be reached from IRQ context.
> >
> > Signed-off-by: Oscar Salvador <osalvador@suse.de>
> > ---
> >  include/linux/stackdepot.h |  9 +++++
> >  lib/stackdepot.c           |  8 +++++
> >  mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
> >  3 files changed, 90 insertions(+)
>
> ...
>
>
> > --- a/mm/page_owner.c
> > +++ b/mm/page_owner.c
> > @@ -36,6 +36,14 @@ struct page_owner {
> >       pid_t free_tgid;
> >  };
> >
> > +struct stack {
> > +     struct stack_record *stack_record;
> > +     struct stack *next;
> > +};
> > +
> > +static struct stack *stack_list;
> > +static DEFINE_SPINLOCK(stack_list_lock);
> > +
> >  static bool page_owner_enabled __initdata;
> >  DEFINE_STATIC_KEY_FALSE(page_owner_inited);
> >
> > @@ -61,6 +69,57 @@ static __init bool need_page_owner(void)
> >       return page_owner_enabled;
> >  }
> >
> > +static void add_stack_record_to_list(struct stack_record *stack_record)
> > +{
> > +     unsigned long flags;
> > +     struct stack *stack;
> > +
> > +     stack = kmalloc(sizeof(*stack), GFP_KERNEL);
>
> I doubt you can use GFP_KERNEL unconditionally? Think you need to pass down
> the gfp flags from __set_page_owner() here?
> And what about the alloc failure case, this will just leave the stack record
> unlinked forever? Can we somehow know which ones we failed to link, and try
> next time? Probably easier by not recording the stack for the page at all in
> that case, so the next attempt with the same stack looks like the very first
> again and attemps the add to list.
> Still not happy that these extra tracking objects are needed, but I guess
> the per-users stack depot instances I suggested would be a major change.
>
> > +     if (stack) {
> > +             stack->stack_record = stack_record;
> > +             stack->next = NULL;
> > +
> > +             spin_lock_irqsave(&stack_list_lock, flags);
> > +             if (!stack_list) {
> > +                     stack_list = stack;
> > +             } else {
> > +                     stack->next = stack_list;
> > +                     stack_list = stack;
> > +             }
> > +             spin_unlock_irqrestore(&stack_list_lock, flags);
> > +     }
> > +}
> > +
> > +static void inc_stack_record_count(depot_stack_handle_t handle)
> > +{
> > +     struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
> > +
> > +     if (stack_record) {
> > +             /*
> > +              * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
> > +              * with REFCOUNT_SATURATED to catch spurious increments of their
> > +              * refcount.
> > +              * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
> > +              * set a refcount of 1 ourselves.
> > +              */
> > +             if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
> > +                     refcount_set(&stack_record->count, 1);
>
> Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
> from REFCOUNT_SATURATED to 1?

If 2 threads race here, both will want to add it to the list as well
and take the lock. So this could just be solved with double-checked
locking:

if (count == REFCOUNT_SATURATED) {
  spin_lock(...);
  if (count == REFCOUNT_SATURATED) {
    refcount_set(.., 1);
    .. add to list ...
  }
  spin_unlock(..);
}
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Vlastimil Babka 6 months, 3 weeks ago
On 2/13/24 10:21, Marco Elver wrote:
> On Tue, 13 Feb 2024 at 10:16, Vlastimil Babka <vbabka@suse.cz> wrote:
>>
>> On 2/12/24 23:30, Oscar Salvador wrote:
>> > page_owner needs to increment a stack_record refcount when a new allocation
>> > occurs, and decrement it on a free operation.
>> > In order to do that, we need to have a way to get a stack_record from a
>> > handle.
>> > Implement __stack_depot_get_stack_record() which just does that, and make
>> > it public so page_owner can use it.
>> >
>> > Also implement {inc,dec}_stack_record_count() which increments
>> > or decrements on respective allocation and free operations, via
>> > __reset_page_owner() (free operation) and __set_page_owner() (alloc
>> > operation).
>> >
>> > Traversing all stackdepot buckets comes with its own complexity,
>> > plus we would have to implement a way to mark only those stack_records
>> > that were originated from page_owner, as those are the ones we are
>> > interested in.
>> > For that reason, page_owner maintains its own list of stack_records,
>> > because traversing that list is faster than traversing all buckets
>> > while keeping at the same time a low complexity.
>> > inc_stack_record_count() is responsible of adding new stack_records
>> > into the list stack_list.
>> >
>> > Modifications on the list are protected via a spinlock with irqs
>> > disabled, since this code can also be reached from IRQ context.
>> >
>> > Signed-off-by: Oscar Salvador <osalvador@suse.de>
>> > ---
>> >  include/linux/stackdepot.h |  9 +++++
>> >  lib/stackdepot.c           |  8 +++++
>> >  mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
>> >  3 files changed, 90 insertions(+)
>>
>> ...
>>
>>
>> > --- a/mm/page_owner.c
>> > +++ b/mm/page_owner.c
>> > @@ -36,6 +36,14 @@ struct page_owner {
>> >       pid_t free_tgid;
>> >  };
>> >
>> > +struct stack {
>> > +     struct stack_record *stack_record;
>> > +     struct stack *next;
>> > +};
>> > +
>> > +static struct stack *stack_list;
>> > +static DEFINE_SPINLOCK(stack_list_lock);
>> > +
>> >  static bool page_owner_enabled __initdata;
>> >  DEFINE_STATIC_KEY_FALSE(page_owner_inited);
>> >
>> > @@ -61,6 +69,57 @@ static __init bool need_page_owner(void)
>> >       return page_owner_enabled;
>> >  }
>> >
>> > +static void add_stack_record_to_list(struct stack_record *stack_record)
>> > +{
>> > +     unsigned long flags;
>> > +     struct stack *stack;
>> > +
>> > +     stack = kmalloc(sizeof(*stack), GFP_KERNEL);
>>
>> I doubt you can use GFP_KERNEL unconditionally? Think you need to pass down
>> the gfp flags from __set_page_owner() here?
>> And what about the alloc failure case, this will just leave the stack record
>> unlinked forever? Can we somehow know which ones we failed to link, and try
>> next time? Probably easier by not recording the stack for the page at all in
>> that case, so the next attempt with the same stack looks like the very first
>> again and attemps the add to list.
>> Still not happy that these extra tracking objects are needed, but I guess
>> the per-users stack depot instances I suggested would be a major change.
>>
>> > +     if (stack) {
>> > +             stack->stack_record = stack_record;
>> > +             stack->next = NULL;
>> > +
>> > +             spin_lock_irqsave(&stack_list_lock, flags);
>> > +             if (!stack_list) {
>> > +                     stack_list = stack;
>> > +             } else {
>> > +                     stack->next = stack_list;
>> > +                     stack_list = stack;
>> > +             }
>> > +             spin_unlock_irqrestore(&stack_list_lock, flags);
>> > +     }
>> > +}
>> > +
>> > +static void inc_stack_record_count(depot_stack_handle_t handle)
>> > +{
>> > +     struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
>> > +
>> > +     if (stack_record) {
>> > +             /*
>> > +              * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
>> > +              * with REFCOUNT_SATURATED to catch spurious increments of their
>> > +              * refcount.
>> > +              * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
>> > +              * set a refcount of 1 ourselves.
>> > +              */
>> > +             if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
>> > +                     refcount_set(&stack_record->count, 1);
>>
>> Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
>> from REFCOUNT_SATURATED to 1?
> 
> If 2 threads race here, both will want to add it to the list as well
> and take the lock. So this could just be solved with double-checked
> locking:
> 
> if (count == REFCOUNT_SATURATED) {
>   spin_lock(...);

Yeah probably stack_list_lock could be taken here already. But then the
kmalloc() of struct stack must happen also here, before taking the lock.

>   if (count == REFCOUNT_SATURATED) {
>     refcount_set(.., 1);
>     .. add to list ...
>   }
>   spin_unlock(..);
> }
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 3 weeks ago
On Tue, Feb 13, 2024 at 12:34:55PM +0100, Vlastimil Babka wrote:
> On 2/13/24 10:21, Marco Elver wrote:
> > On Tue, 13 Feb 2024 at 10:16, Vlastimil Babka <vbabka@suse.cz> wrote:
> >> Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
> >> from REFCOUNT_SATURATED to 1?
> > 
> > If 2 threads race here, both will want to add it to the list as well
> > and take the lock. So this could just be solved with double-checked
> > locking:
> > 
> > if (count == REFCOUNT_SATURATED) {
> >   spin_lock(...);
> 
> Yeah probably stack_list_lock could be taken here already. But then the
> kmalloc() of struct stack must happen also here, before taking the lock.

I am thinking what would be a less expensive and safer option here.
Of course, taking the spinlock is easier, but having the allocation
inside is tricky, and having it outside could mean that we might free
the struct once we checked __within__ the lock that the refcount
is no longer REFCOUNT_SATURATED. No big deal, but a bit sub-optimal.

On the other hand, IIUC, cmpxchg has some memory ordering, like
store_and_release/load_acquire do, so would it be safe to use it
instead of taking the lock?
 

-- 
Oscar Salvador
SUSE Labs
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Marco Elver 6 months, 3 weeks ago
On Tue, 13 Feb 2024 at 13:39, Oscar Salvador <osalvador@suse.de> wrote:
>
> On Tue, Feb 13, 2024 at 12:34:55PM +0100, Vlastimil Babka wrote:
> > On 2/13/24 10:21, Marco Elver wrote:
> > > On Tue, 13 Feb 2024 at 10:16, Vlastimil Babka <vbabka@suse.cz> wrote:
> > >> Isn't this racy? Shouldn't we use some atomic cmpxchg operation to change
> > >> from REFCOUNT_SATURATED to 1?
> > >
> > > If 2 threads race here, both will want to add it to the list as well
> > > and take the lock. So this could just be solved with double-checked
> > > locking:
> > >
> > > if (count == REFCOUNT_SATURATED) {
> > >   spin_lock(...);
> >
> > Yeah probably stack_list_lock could be taken here already. But then the
> > kmalloc() of struct stack must happen also here, before taking the lock.
>
> I am thinking what would be a less expensive and safer option here.
> Of course, taking the spinlock is easier, but having the allocation
> inside is tricky, and having it outside could mean that we might free
> the struct once we checked __within__ the lock that the refcount
> is no longer REFCOUNT_SATURATED. No big deal, but a bit sub-optimal.
>
> On the other hand, IIUC, cmpxchg has some memory ordering, like
> store_and_release/load_acquire do, so would it be safe to use it
> instead of taking the lock?

Memory ordering here is secondary because the count is not used to
release and later acquire any memory (the list is used for that, you
change list head reads/writes to smp_load_acquire/smp_store_release in
the later patch). The problem is mutual exclusion. You can do mutual
exclusion with something like this as well:

>       if (refcount_read(&stack->count) == REFCOUNT_SATURATED) {
>               int old = REFCOUNT_SATURATED;
>               if (atomic_try_cmpxchg_relaxed(&stack->count.refs, &old, 1))
>                       add_stack_record_to_list(...);
>       }
>       refcount_inc(&stack->count);

Probably simpler.
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Marco Elver 6 months, 3 weeks ago
On Mon, 12 Feb 2024 at 23:29, Oscar Salvador <osalvador@suse.de> wrote:
>
> page_owner needs to increment a stack_record refcount when a new allocation
> occurs, and decrement it on a free operation.
> In order to do that, we need to have a way to get a stack_record from a
> handle.
> Implement __stack_depot_get_stack_record() which just does that, and make
> it public so page_owner can use it.
>
> Also implement {inc,dec}_stack_record_count() which increments
> or decrements on respective allocation and free operations, via
> __reset_page_owner() (free operation) and __set_page_owner() (alloc
> operation).
>
> Traversing all stackdepot buckets comes with its own complexity,
> plus we would have to implement a way to mark only those stack_records
> that were originated from page_owner, as those are the ones we are
> interested in.
> For that reason, page_owner maintains its own list of stack_records,
> because traversing that list is faster than traversing all buckets
> while keeping at the same time a low complexity.
> inc_stack_record_count() is responsible of adding new stack_records
> into the list stack_list.
>
> Modifications on the list are protected via a spinlock with irqs
> disabled, since this code can also be reached from IRQ context.
>
> Signed-off-by: Oscar Salvador <osalvador@suse.de>

For the code:

Reviewed-by: Marco Elver <elver@google.com>

But see minor comments below.

> ---
>  include/linux/stackdepot.h |  9 +++++
>  lib/stackdepot.c           |  8 +++++
>  mm/page_owner.c            | 73 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 90 insertions(+)
>
> diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h
> index 90274860fd8e..f3c2162bf615 100644
> --- a/include/linux/stackdepot.h
> +++ b/include/linux/stackdepot.h
> @@ -175,6 +175,15 @@ depot_stack_handle_t stack_depot_save_flags(unsigned long *entries,
>  depot_stack_handle_t stack_depot_save(unsigned long *entries,
>                                       unsigned int nr_entries, gfp_t gfp_flags);
>
> +/**
> + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct
> + * This function is only for internal purposes.

I think the body of the kernel doc needs to go after argument declarations.

> + * @handle: Stack depot handle
> + *
> + * Return: Returns a pointer to a stack_record struct
> + */
> +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle);
> +
>  /**
>   * stack_depot_fetch - Fetch a stack trace from stack depot
>   *
> diff --git a/lib/stackdepot.c b/lib/stackdepot.c
> index 6f9095374847..fdb09450a538 100644
> --- a/lib/stackdepot.c
> +++ b/lib/stackdepot.c
> @@ -685,6 +685,14 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
>  }
>  EXPORT_SYMBOL_GPL(stack_depot_save);
>
> +struct stack_record *__stack_depot_get_stack_record(depot_stack_handle_t handle)
> +{
> +       if (!handle)
> +               return NULL;
> +
> +       return depot_fetch_stack(handle);
> +}
> +
>  unsigned int stack_depot_fetch(depot_stack_handle_t handle,
>                                unsigned long **entries)
>  {
> diff --git a/mm/page_owner.c b/mm/page_owner.c
> index 5634e5d890f8..7d1b3f75cef3 100644
> --- a/mm/page_owner.c
> +++ b/mm/page_owner.c
> @@ -36,6 +36,14 @@ struct page_owner {
>         pid_t free_tgid;
>  };
>
> +struct stack {
> +       struct stack_record *stack_record;
> +       struct stack *next;
> +};
> +
> +static struct stack *stack_list;
> +static DEFINE_SPINLOCK(stack_list_lock);
> +
>  static bool page_owner_enabled __initdata;
>  DEFINE_STATIC_KEY_FALSE(page_owner_inited);
>
> @@ -61,6 +69,57 @@ static __init bool need_page_owner(void)
>         return page_owner_enabled;
>  }
>
> +static void add_stack_record_to_list(struct stack_record *stack_record)
> +{
> +       unsigned long flags;
> +       struct stack *stack;
> +
> +       stack = kmalloc(sizeof(*stack), GFP_KERNEL);
> +       if (stack) {

It's usually more elegant to write

if (!stack)
  return;

If the rest of the function is conditional.

> +               stack->stack_record = stack_record;
> +               stack->next = NULL;
> +
> +               spin_lock_irqsave(&stack_list_lock, flags);
> +               if (!stack_list) {
> +                       stack_list = stack;
> +               } else {
> +                       stack->next = stack_list;
> +                       stack_list = stack;
> +               }
> +               spin_unlock_irqrestore(&stack_list_lock, flags);
> +       }
> +}
> +
> +static void inc_stack_record_count(depot_stack_handle_t handle)
> +{
> +       struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
> +
> +       if (stack_record) {
> +               /*
> +                * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
> +                * with REFCOUNT_SATURATED to catch spurious increments of their
> +                * refcount.
> +                * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us

I think I mentioned this in the other email, there is no
STACK_DEPOT_FLAG_PUT, only stack_depot_put().

> +                * set a refcount of 1 ourselves.
> +                */
> +               if (refcount_read(&stack_record->count) == REFCOUNT_SATURATED) {
> +                       refcount_set(&stack_record->count, 1);
> +
> +                       /* Add the new stack_record to our list */
> +                       add_stack_record_to_list(stack_record);
> +               }
> +               refcount_inc(&stack_record->count);
> +       }
> +}
> +
> +static void dec_stack_record_count(depot_stack_handle_t handle)
> +{
> +       struct stack_record *stack_record = __stack_depot_get_stack_record(handle);
> +
> +       if (stack_record)
> +               refcount_dec(&stack_record->count);
> +}
> +
>  static __always_inline depot_stack_handle_t create_dummy_stack(void)
>  {
>         unsigned long entries[4];
> @@ -140,6 +199,7 @@ void __reset_page_owner(struct page *page, unsigned short order)
>         int i;
>         struct page_ext *page_ext;
>         depot_stack_handle_t handle;
> +       depot_stack_handle_t alloc_handle;
>         struct page_owner *page_owner;
>         u64 free_ts_nsec = local_clock();
>
> @@ -147,6 +207,9 @@ void __reset_page_owner(struct page *page, unsigned short order)
>         if (unlikely(!page_ext))
>                 return;
>
> +       page_owner = get_page_owner(page_ext);
> +       alloc_handle = page_owner->handle;
> +
>         handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
>         for (i = 0; i < (1 << order); i++) {
>                 __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
> @@ -158,6 +221,15 @@ void __reset_page_owner(struct page *page, unsigned short order)
>                 page_ext = page_ext_next(page_ext);
>         }
>         page_ext_put(page_ext);
> +       if (alloc_handle != early_handle)
> +               /*
> +                * early_handle is being set as a handle for all those
> +                * early allocated pages. See init_pages_in_zone().
> +                * Since their refcount is not being incremented because
> +                * the machinery is not ready yet, we cannot decrement
> +                * their refcount either.
> +                */
> +               dec_stack_record_count(alloc_handle);
>  }
>
>  static inline void __set_page_owner_handle(struct page_ext *page_ext,
> @@ -199,6 +271,7 @@ noinline void __set_page_owner(struct page *page, unsigned short order,
>                 return;
>         __set_page_owner_handle(page_ext, handle, order, gfp_mask);
>         page_ext_put(page_ext);
> +       inc_stack_record_count(handle);
>  }
>
>  void __set_page_owner_migrate_reason(struct page *page, int reason)
> --
> 2.43.0
>
Re: [PATCH v8 2/5] mm,page_owner: Implement the tracking of the stacks count
Posted by Oscar Salvador 6 months, 3 weeks ago
On Tue, Feb 13, 2024 at 09:30:25AM +0100, Marco Elver wrote:
> On Mon, 12 Feb 2024 at 23:29, Oscar Salvador <osalvador@suse.de> wrote:
> > Signed-off-by: Oscar Salvador <osalvador@suse.de>
> 
> For the code:
> 
> Reviewed-by: Marco Elver <elver@google.com>

Thanks!

> But see minor comments below.

> > +/**
> > + * __stack_depot_get_stack_record - Get a pointer to a stack_record struct
> > + * This function is only for internal purposes.
> 
> I think the body of the kernel doc needs to go after argument declarations.

I see. I will amend that.

> > +static void add_stack_record_to_list(struct stack_record *stack_record)
> > +{
> > +       unsigned long flags;
> > +       struct stack *stack;
> > +
> > +       stack = kmalloc(sizeof(*stack), GFP_KERNEL);
> > +       if (stack) {
> 
> It's usually more elegant to write
> 
> if (!stack)
>   return;
> 
> If the rest of the function is conditional.

Yeah, probably better to save some indentation.

> > +       if (stack_record) {
> > +               /*
> > +                * New stack_record's that do not use STACK_DEPOT_FLAG_GET start
> > +                * with REFCOUNT_SATURATED to catch spurious increments of their
> > +                * refcount.
> > +                * Since we do not use STACK_DEPOT_FLAG_{GET,PUT} API, let us
> 
> I think I mentioned this in the other email, there is no
> STACK_DEPOT_FLAG_PUT, only stack_depot_put().

Yes, you did. This was an oversight.
I will fix that.


Thanks for the feedback Marco!

-- 
Oscar Salvador
SUSE Labs