The commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added
__alloc_pages_bulk() along with the page_list argument. The next
commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the
bulk page allocator") added the array-based argument. As it turns out, the
page_list argument has no users in the current tree (if it ever had any).
Dropping it allows for a slight simplification and eliminates some
unnecessary checks, now that page_array is required.
Also, note that the removal of the page_list argument was proposed before
in the thread below, where Matthew Wilcox mentions that:
"""
Iterating a linked list is _expensive_. It is about 10x quicker to
iterate an array than a linked list.
"""
(https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net)
Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
---
include/linux/gfp.h | 8 ++------
mm/mempolicy.c | 14 +++++++-------
mm/page_alloc.c | 39 ++++++++++++---------------------------
3 files changed, 21 insertions(+), 40 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index b0fe9f62d15b6..eebed36443b35 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -212,7 +212,6 @@ struct folio *__folio_alloc_noprof(gfp_t gfp, unsigned int order, int preferred_
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array);
#define __alloc_pages_bulk(...) alloc_hooks(alloc_pages_bulk_noprof(__VA_ARGS__))
@@ -223,11 +222,8 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
alloc_hooks(alloc_pages_bulk_array_mempolicy_noprof(__VA_ARGS__))
/* Bulk allocate order-0 pages */
-#define alloc_pages_bulk_list(_gfp, _nr_pages, _list) \
- __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _list, NULL)
-
#define alloc_pages_bulk_array(_gfp, _nr_pages, _page_array) \
- __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, NULL, _page_array)
+ __alloc_pages_bulk(_gfp, numa_mem_id(), NULL, _nr_pages, _page_array)
static inline unsigned long
alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
@@ -236,7 +232,7 @@ alloc_pages_bulk_array_node_noprof(gfp_t gfp, int nid, unsigned long nr_pages,
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
- return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, NULL, page_array);
+ return alloc_pages_bulk_noprof(gfp, nid, NULL, nr_pages, page_array);
}
#define alloc_pages_bulk_array_node(...) \
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 04f35659717ae..42a7b07ccc15a 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2375,13 +2375,13 @@ static unsigned long alloc_pages_bulk_array_interleave(gfp_t gfp,
if (delta) {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node + 1, NULL,
+ nr_pages_per_node + 1,
page_array);
delta--;
} else {
nr_allocated = alloc_pages_bulk_noprof(gfp,
interleave_nodes(pol), NULL,
- nr_pages_per_node, NULL, page_array);
+ nr_pages_per_node, page_array);
}
page_array += nr_allocated;
@@ -2430,7 +2430,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (weight && node_isset(node, nodes)) {
node_pages = min(rem_pages, weight);
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
/* if that's all the pages, no need to interleave */
@@ -2493,7 +2493,7 @@ static unsigned long alloc_pages_bulk_array_weighted_interleave(gfp_t gfp,
if (!node_pages)
break;
nr_allocated = __alloc_pages_bulk(gfp, node, NULL, node_pages,
- NULL, page_array);
+ page_array);
page_array += nr_allocated;
total_allocated += nr_allocated;
if (total_allocated == nr_pages)
@@ -2517,11 +2517,11 @@ static unsigned long alloc_pages_bulk_array_preferred_many(gfp_t gfp, int nid,
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
nr_allocated = alloc_pages_bulk_noprof(preferred_gfp, nid, &pol->nodes,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
if (nr_allocated < nr_pages)
nr_allocated += alloc_pages_bulk_noprof(gfp, numa_node_id(), NULL,
- nr_pages - nr_allocated, NULL,
+ nr_pages - nr_allocated,
page_array + nr_allocated);
return nr_allocated;
}
@@ -2557,7 +2557,7 @@ unsigned long alloc_pages_bulk_array_mempolicy_noprof(gfp_t gfp,
nid = numa_node_id();
nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid);
return alloc_pages_bulk_noprof(gfp, nid, nodemask,
- nr_pages, NULL, page_array);
+ nr_pages, page_array);
}
int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index cae7b93864c23..4ce77a0e8d9a7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4531,28 +4531,23 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
}
/*
- * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
+ * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
* @gfp: GFP flags for the allocation
* @preferred_nid: The preferred NUMA node ID to allocate from
* @nodemask: Set of nodes to allocate from, may be NULL
- * @nr_pages: The number of pages desired on the list or array
- * @page_list: Optional list to store the allocated pages
- * @page_array: Optional array to store the pages
+ * @nr_pages: The number of pages desired in the array
+ * @page_array: Array to store the pages
*
* This is a batched version of the page allocator that attempts to
- * allocate nr_pages quickly. Pages are added to page_list if page_list
- * is not NULL, otherwise it is assumed that the page_array is valid.
+ * allocate nr_pages quickly. Pages are added to the page_array.
*
- * For lists, nr_pages is the number of pages that should be allocated.
- *
- * For arrays, only NULL elements are populated with pages and nr_pages
+ * Note that only NULL elements are populated with pages and nr_pages
* is the maximum number of pages that will be stored in the array.
*
- * Returns the number of pages on the list or array.
+ * Returns the number of pages in the array.
*/
unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nodemask_t *nodemask, int nr_pages,
- struct list_head *page_list,
struct page **page_array)
{
struct page *page;
@@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
* Skip populated array elements to determine if any pages need
* to be allocated before disabling IRQs.
*/
- while (page_array && nr_populated < nr_pages && page_array[nr_populated])
+ while (nr_populated < nr_pages && page_array[nr_populated])
nr_populated++;
/* No pages requested? */
@@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
goto out;
/* Already populated array? */
- if (unlikely(page_array && nr_pages - nr_populated == 0))
+ if (unlikely(nr_pages - nr_populated == 0))
goto out;
/* Bulk allocator does not support memcg accounting. */
@@ -4660,7 +4655,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
while (nr_populated < nr_pages) {
/* Skip existing pages */
- if (page_array && page_array[nr_populated]) {
+ if (page_array[nr_populated]) {
nr_populated++;
continue;
}
@@ -4678,11 +4673,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
nr_account++;
prep_new_page(page, 0, gfp, 0);
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
+ page_array[nr_populated++] = page;
}
pcp_spin_unlock(pcp);
@@ -4699,14 +4690,8 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
failed:
page = __alloc_pages_noprof(gfp, 0, preferred_nid, nodemask);
- if (page) {
- if (page_list)
- list_add(&page->lru, page_list);
- else
- page_array[nr_populated] = page;
- nr_populated++;
- }
-
+ if (page)
+ page_array[nr_populated++] = page;
goto out;
}
EXPORT_SYMBOL_GPL(alloc_pages_bulk_noprof);
--
2.47.1
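
To make the quoted claim concrete, here is a minimal userspace C sketch
(illustrative only, not part of the patch; the function names are made up)
contrasting the two traversal patterns. The list walk must load a next
pointer before it can touch the following element, while the array walk is
a sequential scan the hardware prefetcher handles well:

#include <stddef.h>

struct node {
        struct node *next;
        long val;
};

/* Each step depends on a pointer load: poor locality, hard to prefetch. */
static long sum_list(struct node *head)
{
        long sum = 0;

        for (struct node *n = head; n; n = n->next)
                sum += n->val;
        return sum;
}

/* Sequential access: contiguous memory, cache- and prefetch-friendly. */
static long sum_array(const long *vals, size_t nr)
{
        long sum = 0;

        for (size_t i = 0; i < nr; i++)
                sum += vals[i];
        return sum;
}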
On 23.12.24 23:00, Luiz Capitulino wrote:
> The commit 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") added
> __alloc_pages_bulk() along with the page_list argument. The next
> commit 0f87d9d30f21 ("mm/page_alloc: add an array-based interface to the
> bulk page allocator") added the array-based argument. As it turns out, the
> page_list argument has no users in the current tree (if it ever had any).
> Dropping it allows for a slight simplification and eliminates some
> unnecessary checks, now that page_array is required.
>
> Also, note that the removal of the page_list argument was proposed before
> in the thread below, where Matthew Wilcox mentions that:
>
> """
> Iterating a linked list is _expensive_. It is about 10x quicker to
> iterate an array than a linked list.
> """
> (https://lore.kernel.org/linux-mm/20231025093254.xvomlctwhcuerzky@techsingularity.net)
>
> Signed-off-by: Luiz Capitulino <luizcap@redhat.com>
> ---
Acked-by: David Hildenbrand <david@redhat.com>
Thanks!
--
Cheers,
David / dhildenb
On 2024/12/24 6:00, Luiz Capitulino wrote:
> /*
> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
> * @gfp: GFP flags for the allocation
> * @preferred_nid: The preferred NUMA node ID to allocate from
> * @nodemask: Set of nodes to allocate from, may be NULL
> - * @nr_pages: The number of pages desired on the list or array
> - * @page_list: Optional list to store the allocated pages
> - * @page_array: Optional array to store the pages
> + * @nr_pages: The number of pages desired in the array
> + * @page_array: Array to store the pages
> *
> * This is a batched version of the page allocator that attempts to
> - * allocate nr_pages quickly. Pages are added to page_list if page_list
> - * is not NULL, otherwise it is assumed that the page_array is valid.
> + * allocate nr_pages quickly. Pages are added to the page_array.
> *
> - * For lists, nr_pages is the number of pages that should be allocated.
> - *
> - * For arrays, only NULL elements are populated with pages and nr_pages
> + * Note that only NULL elements are populated with pages and nr_pages
It is not really related to this patch, but while we are at this, the above
seems like an odd behavior. By roughly looking at all the callers of that
API, it seems like only the below callers rely on that?
fs/erofs/zutil.c: z_erofs_gbuf_growsize()
fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
It seems it is quite straight forward to change the above callers to not
rely on the above behavior, and we might be able to avoid more checking
by removing the above behavior?
> * is the maximum number of pages that will be stored in the array.
> *
> - * Returns the number of pages on the list or array.
> + * Returns the number of pages in the array.
> */
> unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> nodemask_t *nodemask, int nr_pages,
> - struct list_head *page_list,
> struct page **page_array)
> {
> struct page *page;
> @@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> * Skip populated array elements to determine if any pages need
> * to be allocated before disabling IRQs.
> */
> - while (page_array && nr_populated < nr_pages && page_array[nr_populated])
> + while (nr_populated < nr_pages && page_array[nr_populated])
> nr_populated++;
The above might be avoided as mentioned above.
>
> /* No pages requested? */
> @@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> goto out;
>
> /* Already populated array? */
> - if (unlikely(page_array && nr_pages - nr_populated == 0))
> + if (unlikely(nr_pages - nr_populated == 0))
> goto out;
>
> /* Bulk allocator does not support memcg accounting. */
> @@ -4660,7 +4655,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
> while (nr_populated < nr_pages) {
>
> /* Skip existing pages */
> - if (page_array && page_array[nr_populated]) {
> + if (page_array[nr_populated]) {
Similar here.
> nr_populated++;
> continue;
> }
On Wed, Dec 25, 2024 at 08:36:04PM +0800, Yunsheng Lin wrote:
> On 2024/12/24 6:00, Luiz Capitulino wrote:
>
> > /*
> > - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
> > + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
> > * @gfp: GFP flags for the allocation
> > * @preferred_nid: The preferred NUMA node ID to allocate from
> > * @nodemask: Set of nodes to allocate from, may be NULL
> > - * @nr_pages: The number of pages desired on the list or array
> > - * @page_list: Optional list to store the allocated pages
> > - * @page_array: Optional array to store the pages
> > + * @nr_pages: The number of pages desired in the array
> > + * @page_array: Array to store the pages
> > *
> > * This is a batched version of the page allocator that attempts to
> > - * allocate nr_pages quickly. Pages are added to page_list if page_list
> > - * is not NULL, otherwise it is assumed that the page_array is valid.
> > + * allocate nr_pages quickly. Pages are added to the page_array.
> > *
> > - * For lists, nr_pages is the number of pages that should be allocated.
> > - *
> > - * For arrays, only NULL elements are populated with pages and nr_pages
> > + * Note that only NULL elements are populated with pages and nr_pages
>
> It is not really related to this patch, but while we are at this, the above
> seems like an odd behavior. By roughly looking at all the callers of that
> API, it seems like only the below callers rely on that?
> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>
> It seems it is quite straight forward to change the above callers to not
> rely on the above behavior, and we might be able to avoid more checking
> by removing the above behavior?
>

It was implemented that way for an early user, net/sunrpc/svc_xprt.c.
The behaviour removes a burden from the caller to track the number of
populated elements and then pass the exact number of pages that must be
allocated. If the API does not handle that detail, each caller needs
similar state tracking implementations. As the overhead is going to be
the same whether the API implements it once or each caller implements
their own, it is simpler if there is just one implementation.

--
Mel Gorman
SUSE Labs
On 2025/1/3 4:00, Mel Gorman wrote:
> On Wed, Dec 25, 2024 at 08:36:04PM +0800, Yunsheng Lin wrote:
>> On 2024/12/24 6:00, Luiz Capitulino wrote:
>>
>>> /*
>>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
>>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
>>> * @gfp: GFP flags for the allocation
>>> * @preferred_nid: The preferred NUMA node ID to allocate from
>>> * @nodemask: Set of nodes to allocate from, may be NULL
>>> - * @nr_pages: The number of pages desired on the list or array
>>> - * @page_list: Optional list to store the allocated pages
>>> - * @page_array: Optional array to store the pages
>>> + * @nr_pages: The number of pages desired in the array
>>> + * @page_array: Array to store the pages
>>> *
>>> * This is a batched version of the page allocator that attempts to
>>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
>>> - * is not NULL, otherwise it is assumed that the page_array is valid.
>>> + * allocate nr_pages quickly. Pages are added to the page_array.
>>> *
>>> - * For lists, nr_pages is the number of pages that should be allocated.
>>> - *
>>> - * For arrays, only NULL elements are populated with pages and nr_pages
>>> + * Note that only NULL elements are populated with pages and nr_pages
>>
>> It is not really related to this patch, but while we are at this, the above
>> seems like an odd behavior. By roughly looking at all the callers of that
>> API, it seems like only the below callers rely on that?
>> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
>> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>>
>> It seems it is quite straight forward to change the above callers to not
>> rely on the above behavior, and we might be able to avoid more checking
>> by removing the above behavior?
>>
>
> It was implemented that way for an early user, net/sunrpc/svc_xprt.c.
> The behaviour removes a burden from the caller to track the number of
> populated elements and then pass the exact number of pages that must be
> allocated. If the API does not handle that detail, each caller needs
> similar state tracking implementations. As the overhead is going to be
> the same whether the API implements it once or each caller implements
> their own, it is simpler if there is just one implementation.
It seems it is quite straight forward to change the above use case to
not rely on that by something like below?
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 43c57124de52..52800bfddc86 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -670,19 +670,21 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
pages = RPCSVC_MAXPAGES;
}
- for (filled = 0; filled < pages; filled = ret) {
- ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
- rqstp->rq_pages);
- if (ret > filled)
+ for (filled = 0; filled < pages;) {
+ ret = alloc_pages_bulk_array(GFP_KERNEL, pages - filled,
+ rqstp->rq_pages + filled);
+ if (ret) {
+ filled += ret;
/* Made progress, don't sleep yet */
continue;
+ }
set_current_state(TASK_IDLE);
if (svc_thread_should_stop(rqstp)) {
set_current_state(TASK_RUNNING);
return false;
}
- trace_svc_alloc_arg_err(pages, ret);
+ trace_svc_alloc_arg_err(pages, filled);
memalloc_retry_wait(GFP_KERNEL);
}
rqstp->rq_page_end = &rqstp->rq_pages[pages];
>
On Fri, Jan 03, 2025 at 07:29:30PM +0800, Yunsheng Lin wrote:
> On 2025/1/3 4:00, Mel Gorman wrote:
> > On Wed, Dec 25, 2024 at 08:36:04PM +0800, Yunsheng Lin wrote:
> >> On 2024/12/24 6:00, Luiz Capitulino wrote:
> >>
> >>> /*
> >>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
> >>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
> >>> * @gfp: GFP flags for the allocation
> >>> * @preferred_nid: The preferred NUMA node ID to allocate from
> >>> * @nodemask: Set of nodes to allocate from, may be NULL
> >>> - * @nr_pages: The number of pages desired on the list or array
> >>> - * @page_list: Optional list to store the allocated pages
> >>> - * @page_array: Optional array to store the pages
> >>> + * @nr_pages: The number of pages desired in the array
> >>> + * @page_array: Array to store the pages
> >>> *
> >>> * This is a batched version of the page allocator that attempts to
> >>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
> >>> - * is not NULL, otherwise it is assumed that the page_array is valid.
> >>> + * allocate nr_pages quickly. Pages are added to the page_array.
> >>> *
> >>> - * For lists, nr_pages is the number of pages that should be allocated.
> >>> - *
> >>> - * For arrays, only NULL elements are populated with pages and nr_pages
> >>> + * Note that only NULL elements are populated with pages and nr_pages
> >>
> >> It is not really related to this patch, but while we are at this, the above
> >> seems like an odd behavior. By roughly looking at all the callers of that
> >> API, it seems like only the below callers rely on that?
> >> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
> >> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
> >>
> >> It seems it is quite straight forward to change the above callers to not
> >> rely on the above behavior, and we might be able to avoid more checking
> >> by removing the above behavior?
> >>
> >
> > It was implemented that way for an early user, net/sunrpc/svc_xprt.c.
> > The behaviour removes a burden from the caller to track the number of
> > populated elements and then pass the exact number of pages that must be
> > allocated. If the API does not handle that detail, each caller needs
> > similar state tracking implementations. As the overhead is going to be
> > the same whether the API implements it once or each caller implements
> > their own, it is simpler if there is just one implementation.
>
> It seems it is quite straight forward to change the above use case to
> not rely on that by something like below?
>
> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
> index 43c57124de52..52800bfddc86 100644
> --- a/net/sunrpc/svc_xprt.c
> +++ b/net/sunrpc/svc_xprt.c
> @@ -670,19 +670,21 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
> pages = RPCSVC_MAXPAGES;
> }
>
> - for (filled = 0; filled < pages; filled = ret) {
> - ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
> - rqstp->rq_pages);
> - if (ret > filled)
> + for (filled = 0; filled < pages;) {
> + ret = alloc_pages_bulk_array(GFP_KERNEL, pages - filled,
> + rqstp->rq_pages + filled);
> + if (ret) {
> + filled += ret;
> /* Made progress, don't sleep yet */
> continue;
> + }
>
> set_current_state(TASK_IDLE);
> if (svc_thread_should_stop(rqstp)) {
> set_current_state(TASK_RUNNING);
> return false;
> }
> - trace_svc_alloc_arg_err(pages, ret);
> + trace_svc_alloc_arg_err(pages, filled);
> memalloc_retry_wait(GFP_KERNEL);
> }
> rqstp->rq_page_end = &rqstp->rq_pages[pages];
>
The API implementation would also need to change to make this work as the
return value is a number of pages that are on the array, not the number of
new pages allocated. Even if fixed, it still moves cost and complexity to
the caller and the API is harder to use and easier to make mistakes. That
shift in responsibility and the maintenance burden would need to be
justified. While it is possible to use wrappers to allow callers to decide
whether to manage the tracking or let the API handle it, the requirement
then is to show that there is a performance gain for a common use case.
This is outside the scope of a series that removes the page_list
argument. Even if a series was proposed to shift responsibility to the
caller, I would not expect it to be submitted with a page_list removal.
--
Mel Gorman
SUSE Labs
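
A short sketch of the return-value semantics Mel points out (illustrative
only; the demo function is hypothetical, not code from the tree):

/* Sketch only: the return value counts pages on the array, not new pages. */
static void bulk_return_semantics_demo(void)
{
        struct page *pages[8] = { NULL };
        unsigned long nr;

        /* Returns the number of populated slots, e.g. 8 on full success. */
        nr = alloc_pages_bulk_array(GFP_KERNEL, 8, pages);

        /*
         * Calling again with the same, already-full array allocates no
         * new pages, yet still returns 8: the count is "pages on the
         * array", not "pages newly allocated".
         */
        nr = alloc_pages_bulk_array(GFP_KERNEL, 8, pages);
}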
On 2025-01-03 09:27, Mel Gorman wrote:
> On Fri, Jan 03, 2025 at 07:29:30PM +0800, Yunsheng Lin wrote:
>> On 2025/1/3 4:00, Mel Gorman wrote:
>>> On Wed, Dec 25, 2024 at 08:36:04PM +0800, Yunsheng Lin wrote:
>>>> On 2024/12/24 6:00, Luiz Capitulino wrote:
>>>>
>>>>> /*
>>>>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
>>>>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
>>>>> * @gfp: GFP flags for the allocation
>>>>> * @preferred_nid: The preferred NUMA node ID to allocate from
>>>>> * @nodemask: Set of nodes to allocate from, may be NULL
>>>>> - * @nr_pages: The number of pages desired on the list or array
>>>>> - * @page_list: Optional list to store the allocated pages
>>>>> - * @page_array: Optional array to store the pages
>>>>> + * @nr_pages: The number of pages desired in the array
>>>>> + * @page_array: Array to store the pages
>>>>> *
>>>>> * This is a batched version of the page allocator that attempts to
>>>>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
>>>>> - * is not NULL, otherwise it is assumed that the page_array is valid.
>>>>> + * allocate nr_pages quickly. Pages are added to the page_array.
>>>>> *
>>>>> - * For lists, nr_pages is the number of pages that should be allocated.
>>>>> - *
>>>>> - * For arrays, only NULL elements are populated with pages and nr_pages
>>>>> + * Note that only NULL elements are populated with pages and nr_pages
>>>>
>>>> It is not really related to this patch, but while we are at this, the above
>>>> seems like an odd behavior. By roughly looking at all the callers of that
>>>> API, it seems like only the below callers rely on that?
>>>> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
>>>> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>>>>
>>>> It seems it is quite straight forward to change the above callers to not
>>>> rely on the above behavior, and we might be able to avoid more checking
>>>> by removing the above behavior?
>>>>
>>>
>>> It was implemented that way for an early user, net/sunrpc/svc_xprt.c.
>>> The behaviour removes a burden from the caller to track the number of
>>> populated elements and then pass the exact number of pages that must be
>>> allocated. If the API does not handle that detail, each caller needs
>>> similar state tracking implementations. As the overhead is going to be
>>> the same whether the API implements it once or each caller implements
>>> their own, it is simpler if there is just one implementation.
>>
>> It seems it is quite straight forward to change the above use case to
>> not rely on that by something like below?
>>
>> diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
>> index 43c57124de52..52800bfddc86 100644
>> --- a/net/sunrpc/svc_xprt.c
>> +++ b/net/sunrpc/svc_xprt.c
>> @@ -670,19 +670,21 @@ static bool svc_alloc_arg(struct svc_rqst *rqstp)
>> pages = RPCSVC_MAXPAGES;
>> }
>>
>> - for (filled = 0; filled < pages; filled = ret) {
>> - ret = alloc_pages_bulk_array(GFP_KERNEL, pages,
>> - rqstp->rq_pages);
>> - if (ret > filled)
>> + for (filled = 0; filled < pages;) {
>> + ret = alloc_pages_bulk_array(GFP_KERNEL, pages - filled,
>> + rqstp->rq_pages + filled);
>> + if (ret) {
>> + filled += ret;
>> /* Made progress, don't sleep yet */
>> continue;
>> + }
>>
>> set_current_state(TASK_IDLE);
>> if (svc_thread_should_stop(rqstp)) {
>> set_current_state(TASK_RUNNING);
>> return false;
>> }
>> - trace_svc_alloc_arg_err(pages, ret);
>> + trace_svc_alloc_arg_err(pages, filled);
>> memalloc_retry_wait(GFP_KERNEL);
>> }
>> rqstp->rq_page_end = &rqstp->rq_pages[pages];
>>
>
> The API implementation would also need to change to make this work as the
> return value is a number of pages that are on the array, not the number of
> new pages allocated. Even if fixed, it still moves cost and complexity to
> the caller and the API is harder to use and easier to make mistakes. That
> shift in responsibility and the maintenance burden would need to be
> justified. While it is possible to use wrappers to allow callers to decide
> whether to manage the tracking or let the API handle it, the requirement
> then is to show that there is a performance gain for a common use case.
I agree with all your points, but I think there's value in simplifying
alloc_pages_bulk_noprof() if the wrappers turn out not to be complex or
difficult to use.
If all users of this case always fill the array sequentially, then
maybe we could just have a wrapper that scans the array first (this
is the first loop in alloc_pages_bulk_noprof()) or maybe callers
could keep track of 'filled' (via a pointer or macro usage). But
this is just brainstorming, we'd need to see the resulting code to
know if it works and makes sense.
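
A rough sketch of that brainstorm, assuming callers fill the array
sequentially as described above (the wrapper name is hypothetical and this
is not code from the tree):

/*
 * Hypothetical helper: move the NULL-slot scan into a wrapper so the
 * core bulk allocator could assume it always starts from an empty
 * region of the array. Only valid if callers fill sequentially.
 */
static inline unsigned long
alloc_pages_bulk_array_refill(gfp_t gfp, unsigned long nr_pages,
                              struct page **page_array)
{
        unsigned long filled = 0;

        /* Skip the already-populated prefix (today's first loop). */
        while (filled < nr_pages && page_array[filled])
                filled++;

        if (filled == nr_pages)
                return nr_pages;

        return filled + alloc_pages_bulk_array(gfp, nr_pages - filled,
                                               page_array + filled);
}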
> This is outside the scope of a series that removes the page_list
> argument. Even if a series was proposed to shift responsibility to the
> caller, I would not expect it to be submitted with a page_list removal.
Yes, absolutely.
On 2024-12-25 07:36, Yunsheng Lin wrote:
> On 2024/12/24 6:00, Luiz Capitulino wrote:
>
>> /*
>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
>> * @gfp: GFP flags for the allocation
>> * @preferred_nid: The preferred NUMA node ID to allocate from
>> * @nodemask: Set of nodes to allocate from, may be NULL
>> - * @nr_pages: The number of pages desired on the list or array
>> - * @page_list: Optional list to store the allocated pages
>> - * @page_array: Optional array to store the pages
>> + * @nr_pages: The number of pages desired in the array
>> + * @page_array: Array to store the pages
>> *
>> * This is a batched version of the page allocator that attempts to
>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
>> - * is not NULL, otherwise it is assumed that the page_array is valid.
>> + * allocate nr_pages quickly. Pages are added to the page_array.
>> *
>> - * For lists, nr_pages is the number of pages that should be allocated.
>> - *
>> - * For arrays, only NULL elements are populated with pages and nr_pages
>> + * Note that only NULL elements are populated with pages and nr_pages
>
> It is not really related to this patch, but while we are at this, the above
> seems like an odd behavior. By roughly looking at all the callers of that
> API, it seems like only the below callers rely on that?
> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>
> It seems it is quite straight forward to change the above callers to not
> rely on the above behavior, and we might be able to avoid more checking
> by removing the above behavior?
Hi Yunsheng,
Assuming the use-case is valid, I think we might want to keep common code
in the API vs. duplicating it in callers?
In any case, even if we decide to go for your suggestion, I'd prefer not
to grow the scope of this series since this could delay its inclusion.
Dropping the list-API (and dead code) is actually important so IMHO it
should go in first.
PS: Sorry for the delay in my response.
- Luiz
>
>> * is the maximum number of pages that will be stored in the array.
>> *
>> - * Returns the number of pages on the list or array.
>> + * Returns the number of pages in the array.
>> */
>> unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
>> nodemask_t *nodemask, int nr_pages,
>> - struct list_head *page_list,
>> struct page **page_array)
>> {
>> struct page *page;
>> @@ -4570,7 +4565,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
>> * Skip populated array elements to determine if any pages need
>> * to be allocated before disabling IRQs.
>> */
>> - while (page_array && nr_populated < nr_pages && page_array[nr_populated])
>> + while (nr_populated < nr_pages && page_array[nr_populated])
>> nr_populated++;
>
> The above might be avoided as mentioned above.
>
>>
>> /* No pages requested? */
>> @@ -4578,7 +4573,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
>> goto out;
>>
>> /* Already populated array? */
>> - if (unlikely(page_array && nr_pages - nr_populated == 0))
>> + if (unlikely(nr_pages - nr_populated == 0))
>> goto out;
>>
>> /* Bulk allocator does not support memcg accounting. */
>> @@ -4660,7 +4655,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
>> while (nr_populated < nr_pages) {
>>
>> /* Skip existing pages */
>> - if (page_array && page_array[nr_populated]) {
>> + if (page_array[nr_populated]) {
>
> Similar here.
>
>> nr_populated++;
>> continue;
>> }
>
>
On 2025/1/3 0:38, Luiz Capitulino wrote:
> On 2024-12-25 07:36, Yunsheng Lin wrote:
>> On 2024/12/24 6:00, Luiz Capitulino wrote:
>>
>>> /*
>>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
>>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
>>> * @gfp: GFP flags for the allocation
>>> * @preferred_nid: The preferred NUMA node ID to allocate from
>>> * @nodemask: Set of nodes to allocate from, may be NULL
>>> - * @nr_pages: The number of pages desired on the list or array
>>> - * @page_list: Optional list to store the allocated pages
>>> - * @page_array: Optional array to store the pages
>>> + * @nr_pages: The number of pages desired in the array
>>> + * @page_array: Array to store the pages
>>> *
>>> * This is a batched version of the page allocator that attempts to
>>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
>>> - * is not NULL, otherwise it is assumed that the page_array is valid.
>>> + * allocate nr_pages quickly. Pages are added to the page_array.
>>> *
>>> - * For lists, nr_pages is the number of pages that should be allocated.
>>> - *
>>> - * For arrays, only NULL elements are populated with pages and nr_pages
>>
>> It is not really related to this patch, but while we are at this, the above
>> seems like an odd behavior. By roughly looking at all the callers of that
>> API, it seems like only the below callers rely on that?
>> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
>> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>>
>> It seems it is quite straight forward to change the above callers to not
>> rely on the above behavior, and we might be able to avoid more checking
>> by removing the above behavior?
>
> Hi Yunsheng,
>
> Assuming the use-case is valid, I think we might want to keep common code
> in the API vs. duplicating it in callers?

I was thinking maybe adding a wrapper/helper around the __alloc_pages_bulk()
to avoid the overhead for other use cases and make the semantics more obvious
if it is a valid use-case.

>
> In any case, even if we decide to go for your suggestion, I'd prefer not
> to grow the scope of this series since this could delay its inclusion.
> Dropping the list-API (and dead code) is actually important so IMHO it
> should go in first.

Sure.

>
On 2025-01-03 06:21, Yunsheng Lin wrote:
> On 2025/1/3 0:38, Luiz Capitulino wrote:
>> On 2024-12-25 07:36, Yunsheng Lin wrote:
>>> On 2024/12/24 6:00, Luiz Capitulino wrote:
>>>
>>>> /*
>>>> - * __alloc_pages_bulk - Allocate a number of order-0 pages to a list or array
>>>> + * __alloc_pages_bulk - Allocate a number of order-0 pages to an array
>>>> * @gfp: GFP flags for the allocation
>>>> * @preferred_nid: The preferred NUMA node ID to allocate from
>>>> * @nodemask: Set of nodes to allocate from, may be NULL
>>>> - * @nr_pages: The number of pages desired on the list or array
>>>> - * @page_list: Optional list to store the allocated pages
>>>> - * @page_array: Optional array to store the pages
>>>> + * @nr_pages: The number of pages desired in the array
>>>> + * @page_array: Array to store the pages
>>>> *
>>>> * This is a batched version of the page allocator that attempts to
>>>> - * allocate nr_pages quickly. Pages are added to page_list if page_list
>>>> - * is not NULL, otherwise it is assumed that the page_array is valid.
>>>> + * allocate nr_pages quickly. Pages are added to the page_array.
>>>> *
>>>> - * For lists, nr_pages is the number of pages that should be allocated.
>>>> - *
>>>> - * For arrays, only NULL elements are populated with pages and nr_pages
>>>
>>> It is not really related to this patch, but while we are at this, the above
>>> seems like an odd behavior. By roughly looking at all the callers of that
>>> API, it seems like only the below callers rely on that?
>>> fs/erofs/zutil.c: z_erofs_gbuf_growsize()
>>> fs/xfs/xfs_buf.c: xfs_buf_alloc_pages()
>>>
>>> It seems it is quite straight forward to change the above callers to not
>>> rely on the above behavior, and we might be able to avoid more checking
>>> by removing the above behavior?
>>
>> Hi Yunsheng,
>>
>> Assuming the use-case is valid, I think we might want to keep common code
>> in the API vs. duplicating it in callers?
>
> I was thinking maybe adding a wrapper/helper around the __alloc_pages_bulk()
> to avoid the overhead for other use cases and make the semantics more obvious
> if it is a valid use-case.

Yeah, that might be a good idea.

>
>>
>> In any case, even if we decide to go for your suggestion, I'd prefer not
>> to grow the scope of this series since this could delay its inclusion.
>> Dropping the list-API (and dead code) is actually important so IMHO it
>> should go in first.
>
> Sure.
>
>>
>