We are about to use the page_frag_alloc_*() API to not just
allocate memory for skb->data, but also to do the memory
allocation for skb frags. Currently the page_frag
implementation in the mm subsystem runs the offset as a
countdown rather than a count-up value; there may be several
advantages to that, as mentioned in [1], but it also has some
disadvantages, for example, it may prevent skb frag coalescing
and effective cache prefetching.

There is a trade-off to make in order to have a unified
implementation and API for page_frag, so use an initial zero
offset in this patch; the following patch will try to mitigate
the disadvantages as much as possible.

1. https://lore.kernel.org/all/f4abe71b3439b39d17a6fb2d410180f367cadf5c.camel@gmail.com/
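
For illustration only (not part of the patch), a minimal userspace sketch of the count-up scheme, using hypothetical names:

	/*
	 * Minimal sketch of a count-up frag allocator; hypothetical and
	 * simplified, not the actual mm/page_frag_cache.c code.
	 */
	struct frag_cache_sketch {
		char *va;		/* base address of the backing page */
		unsigned int offset;	/* grows from 0 towards size */
		unsigned int size;	/* usable bytes in the page */
	};

	static void *frag_alloc_sketch(struct frag_cache_sketch *c,
				       unsigned int fragsz)
	{
		unsigned int offset = c->offset;

		if (offset + fragsz > c->size)
			return NULL;	/* the real code refills the cache here */

		c->offset = offset + fragsz;	/* count up rather than down */
		return c->va + offset;
	}

Because consecutive fragments come back at increasing addresses, a new
fragment can be coalesced with the previous one and prefetching walks
forward through the page, which is the behaviour this series is after.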
CC: Alexander Duyck <alexander.duyck@gmail.com>
Signed-off-by: Yunsheng Lin <linyunsheng@huawei.com>
---
 mm/page_frag_cache.c | 31 ++++++++++++++-----------------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/mm/page_frag_cache.c b/mm/page_frag_cache.c
index a0f90ba25200..3e3e88d9af90 100644
--- a/mm/page_frag_cache.c
+++ b/mm/page_frag_cache.c
@@ -67,9 +67,8 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			      unsigned int fragsz, gfp_t gfp_mask,
 			      unsigned int align_mask)
 {
-	unsigned int size = PAGE_SIZE;
+	unsigned int size, offset;
 	struct page *page;
-	int offset;
 
 	if (unlikely(!nc->va)) {
 refill:
@@ -77,10 +76,6 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		if (!page)
 			return NULL;
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* Even if we own the page, we do not use atomic_set().
 		 * This would break get_page_unless_zero() users.
 		 */
@@ -89,11 +84,18 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 		/* reset page count bias and offset to start of new frag */
 		nc->pfmemalloc = page_is_pfmemalloc(page);
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		nc->offset = size;
+		nc->offset = 0;
 	}
 
-	offset = nc->offset - fragsz;
-	if (unlikely(offset < 0)) {
+#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
+	/* if size can vary use size else just use PAGE_SIZE */
+	size = nc->size;
+#else
+	size = PAGE_SIZE;
+#endif
+
+	offset = ALIGN(nc->offset, -align_mask);
+	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);
 
 		if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
@@ -104,17 +106,13 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 			goto refill;
 		}
 
-#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
-		/* if size can vary use size else just use PAGE_SIZE */
-		size = nc->size;
-#endif
 		/* OK, page count is 0, we can safely set it */
 		set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
 
 		/* reset page count bias and offset to start of new frag */
 		nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
-		offset = size - fragsz;
-		if (unlikely(offset < 0)) {
+		offset = 0;
+		if (unlikely(fragsz > size)) {
 			/*
 			 * The caller is trying to allocate a fragment
 			 * with fragsz > PAGE_SIZE but the cache isn't big
@@ -129,8 +127,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	}
 
 	nc->pagecnt_bias--;
-	offset &= align_mask;
-	nc->offset = offset;
+	nc->offset = offset + fragsz;
 
 	return nc->va + offset;
 }
--
2.33.0
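
To make the new alignment step concrete, a worked example (illustrative
values only): a caller wanting align = 64 passes align_mask = -64, so
ALIGN(nc->offset, -align_mask) is simply ALIGN(offset, 64):

	#include <stdio.h>

	/* Userspace mirror of the kernel's ALIGN() for power-of-2 alignments. */
	#define ALIGN_UP(x, a)	(((x) + ((a) - 1)) & ~((a) - 1))

	int main(void)
	{
		unsigned int align = 64;		/* power of 2 */
		unsigned int align_mask = -align;	/* what callers pass today */

		printf("%u\n", ALIGN_UP(100u, -align_mask));	/* 100 rounds up to 128 */
		printf("%u\n", ALIGN_UP(128u, -align_mask));	/* 128 is already aligned */
		return 0;
	}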
On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
> [...]
>
> +	offset = ALIGN(nc->offset, -align_mask);
> +	if (unlikely(offset + fragsz > size)) {
Rather than using "ALIGN" with a negative value it would probably make
more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
how well the compiler sorts out the use of negatives to flip values
that are then converted to masks with the "(a) - 1".
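
For reference, ALIGN() and __ALIGN_KERNEL_MASK() are defined roughly as
follows (per include/linux/align.h and include/uapi/linux/const.h), which
is what the masking concern above boils down to:

	/* From include/uapi/linux/const.h and include/linux/align.h: */
	#define __ALIGN_KERNEL(x, a)		__ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
	#define __ALIGN_KERNEL_MASK(x, mask)	(((x) + (mask)) & ~(mask))
	#define ALIGN(x, a)			__ALIGN_KERNEL((x), (a))

	/*
	 * With align_mask = -align for a power-of-2 align:
	 *   ALIGN(off, -align_mask)               == (off + (-align_mask - 1)) & ~(-align_mask - 1)
	 *   __ALIGN_KERNEL_MASK(off, ~align_mask) == (off + ~align_mask) & align_mask
	 */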
On 2024/4/8 1:52, Alexander H Duyck wrote:
> On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
>> [...]
>>
>> +	offset = ALIGN(nc->offset, -align_mask);
>> +	if (unlikely(offset + fragsz > size)) {
>
> Rather than using "ALIGN" with a negative value it would probably make
> more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
> how well the compiler sorts out the use of negatives to flip values
> that are then converted to masks with the "(a) - 1".
The next patch will remove the '-' in '-align_mask', as the 'ALIGN' operation
is done in the inline helper instead. Given that, I am not sure it matters much
whether __ALIGN_KERNEL_MASK with ~align_mask is used here?
On Mon, Apr 8, 2024 at 6:39 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>
> On 2024/4/8 1:52, Alexander H Duyck wrote:
> > On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
> >> [...]
> >>
> >> +	offset = ALIGN(nc->offset, -align_mask);
> >> +	if (unlikely(offset + fragsz > size)) {
> >
> > Rather than using "ALIGN" with a negative value it would probably make
> > more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
> > how well the compiler sorts out the use of negatives to flip values
> > that are then converted to masks with the "(a) - 1".
>
> The next patch will remove the '-' in '-align_mask', as the 'ALIGN' operation
> is done in the inline helper instead. Given that, I am not sure it matters much
> whether __ALIGN_KERNEL_MASK with ~align_mask is used here?
It is a matter of making the negations more obvious. Basically you
could achieve the same alignment by doing:
(offset + (~align_mask)) & ~(~align_mask)
rather than:
(offset + ((-align_mask) - 1)) & ~((-align_mask) - 1)
I'm not sure the compiler will pick up on the fact that the two are
identical and can save a number of operations. Also my suggested
approach is closer to how it used to work. Technically the one you are
using only works if align_mask is a negative power of 2.
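
The two expansions can be checked with a small self-contained program (a
hypothetical test harness, assuming align_mask = -align for a power-of-2
align, which is what the callers pass):

	#include <assert.h>
	#include <stdio.h>

	int main(void)
	{
		for (unsigned int align = 1; align <= 4096; align <<= 1) {
			unsigned int align_mask = -align;	/* negative power of 2 */

			for (unsigned int offset = 0; offset < 8192; offset++) {
				/* expansion of ALIGN(offset, -align_mask) */
				unsigned int a = (offset + ((-align_mask) - 1)) &
						 ~((-align_mask) - 1);
				/* expansion of __ALIGN_KERNEL_MASK(offset, ~align_mask) */
				unsigned int b = (offset + (~align_mask)) & ~(~align_mask);

				assert(a == b);
			}
		}
		printf("both expansions agree for every tested offset\n");
		return 0;
	}

Whether the compiler folds the double negation in the ALIGN() form down to
the same instructions is exactly the open question above.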
On 2024/4/9 0:11, Alexander Duyck wrote:
> On Mon, Apr 8, 2024 at 6:39 AM Yunsheng Lin <linyunsheng@huawei.com> wrote:
>>
>> On 2024/4/8 1:52, Alexander H Duyck wrote:
>>> On Sun, 2024-04-07 at 21:08 +0800, Yunsheng Lin wrote:
>>>> [...]
>>>>
>>>> +	offset = ALIGN(nc->offset, -align_mask);
>>>> +	if (unlikely(offset + fragsz > size)) {
>>>
>>> Rather than using "ALIGN" with a negative value it would probably make
>>> more sense to use __ALIGN_KERNEL_MASK with ~align_mask. I am not sure
>>> how well the compiler sorts out the use of negatives to flip values
>>> that are then converted to masks with the "(a) - 1".
>>
>> The next patch will remove the '-' in '-align_mask', as the 'ALIGN' operation
>> is done in the inline helper instead. Given that, I am not sure it matters much
>> whether __ALIGN_KERNEL_MASK with ~align_mask is used here?
>
> It is a matter of making the negations more obvious. Basically you
> could achieve the same alignment by doing:
> (offset + (~align_mask)) & ~(~align_mask)
> rather than:
> (offset + ((-align_mask) - 1)) & ~((-align_mask) - 1)
>
> I'm not sure the compiler will pick up on the fact that the two are
> identical and can save a number of operations. Also my suggested
> approach is closer to how it used to work. Technically the one you are
> using only works if align_mask is a negative power of 2.
In patch 3 we have the following, so the above trick is no longer needed
after that patch:
@@ -94,7 +93,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 	size = PAGE_SIZE;
 #endif
 
-	offset = ALIGN(nc->offset, -align_mask);
+	offset = nc->offset;
 	if (unlikely(offset + fragsz > size)) {
 		page = virt_to_page(nc->va);
 
@@ -131,7 +130,7 @@ void *__page_frag_alloc_align(struct page_frag_cache *nc,
 
 	return nc->va + offset;
 }
-EXPORT_SYMBOL(__page_frag_alloc_align);
+EXPORT_SYMBOL(page_frag_alloc);

...

+static inline void *__page_frag_alloc_align(struct page_frag_cache *nc,
+					    unsigned int fragsz, gfp_t gfp_mask,
+					    unsigned int align)
+{
+	nc->offset = ALIGN(nc->offset, align);
+
+	return page_frag_alloc(nc, fragsz, gfp_mask);
+}
 static inline void *page_frag_alloc_align(struct page_frag_cache *nc,
 					   unsigned int fragsz, gfp_t gfp_mask,
 					   unsigned int align)
 {
 	WARN_ON_ONCE(!is_power_of_2(align));
-	return __page_frag_alloc_align(nc, fragsz, gfp_mask, -align);
-}
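
To illustrate the resulting call pattern, a hypothetical caller sketch
assuming the patch-3 split shown above:

	/* Hypothetical caller, assuming the patch-3 API above. */
	static void frag_alloc_example(struct page_frag_cache *nc)
	{
		/* Unaligned fast path: nc->offset simply counts up. */
		void *hdr = page_frag_alloc(nc, 128, GFP_ATOMIC);

		/*
		 * Aligned variant: the inline helper rounds nc->offset up with
		 * ALIGN() before calling page_frag_alloc(), so the out-of-line
		 * code no longer needs an align_mask argument at all.
		 */
		void *desc = page_frag_alloc_align(nc, 256, GFP_ATOMIC,
						   SMP_CACHE_BYTES);

		/* ... use hdr and desc, then release via page_frag_free() ... */
	}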