[PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier

Alexander Lobakin posted 7 patches 1 year, 9 months ago
There is a newer version of this series
[PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Alexander Lobakin 1 year, 9 months ago
We can save a couple more function calls in the Page Pool code if we
check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
Move both these checks into an inline wrapper and call the PP wrapper
over the generic DMA sync function only when both are true.
You can't cache the result of dma_need_sync() in &page_pool, as it may
change anytime if an SWIOTLB buffer is allocated or mapped.

Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
---
 net/core/page_pool.c | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 6cf26a68fa91..87319c6365e0 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
 	return page;
 }
 
-static void page_pool_dma_sync_for_device(const struct page_pool *pool,
-					  const struct page *page,
-					  unsigned int dma_sync_size)
+static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
+					    const struct page *page,
+					    u32 dma_sync_size)
 {
 	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
 
 	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
-					 pool->p.offset, dma_sync_size,
-					 pool->p.dma_dir);
+	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
+				     dma_sync_size, pool->p.dma_dir);
+}
+
+static __always_inline void
+page_pool_dma_sync_for_device(const struct page_pool *pool,
+			      const struct page *page,
+			      u32 dma_sync_size)
+{
+	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
+		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
 }
 
 static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
@@ -429,8 +437,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
 	if (page_pool_set_dma_addr(page, dma))
 		goto unmap_failed;
 
-	if (pool->dma_sync)
-		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
+	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
 
 	return true;
 
@@ -699,9 +706,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
 	if (likely(__page_pool_page_can_be_recycled(page))) {
 		/* Read barrier done in page_ref_count / READ_ONCE */
 
-		if (pool->dma_sync)
-			page_pool_dma_sync_for_device(pool, page,
-						      dma_sync_size);
+		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
 
 		if (allow_direct && page_pool_recycle_in_cache(page, pool))
 			return NULL;
@@ -840,9 +845,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
 		return NULL;
 
 	if (__page_pool_page_can_be_recycled(page)) {
-		if (pool->dma_sync)
-			page_pool_dma_sync_for_device(pool, page, -1);
-
+		page_pool_dma_sync_for_device(pool, page, -1);
 		return page;
 	}
 
-- 
2.44.0
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Alexander Lobakin 1 year, 9 months ago
From: Alexander Lobakin <aleksander.lobakin@intel.com>
Date: Tue, 23 Apr 2024 15:58:31 +0200

> We can save a couple more function calls in the Page Pool code if we
> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
> Move both these checks into an inline wrapper and call the PP wrapper
> over the generic DMA sync function only when both are true.
> You can't cache the result of dma_need_sync() in &page_pool, as it may
> change anytime if an SWIOTLB buffer is allocated or mapped.
> 
> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
> ---
>  net/core/page_pool.c | 31 +++++++++++++++++--------------
>  1 file changed, 17 insertions(+), 14 deletions(-)
> 
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 6cf26a68fa91..87319c6365e0 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>  	return page;
>  }
>  
> -static void page_pool_dma_sync_for_device(const struct page_pool *pool,
> -					  const struct page *page,
> -					  unsigned int dma_sync_size)
> +static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
> +					    const struct page *page,
> +					    u32 dma_sync_size)
>  {
>  	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>  
>  	dma_sync_size = min(dma_sync_size, pool->p.max_len);
> -	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
> -					 pool->p.offset, dma_sync_size,
> -					 pool->p.dma_dir);
> +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
> +				     dma_sync_size, pool->p.dma_dir);

Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
isn't defined there, and my CI didn't catch it in time... I'll ifdef
this in the next spin after some reviews for this one.

> +}
> +
> +static __always_inline void
> +page_pool_dma_sync_for_device(const struct page_pool *pool,
> +			      const struct page *page,
> +			      u32 dma_sync_size)
> +{
> +	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
> +		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
>  }
>  
>  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
> @@ -429,8 +437,7 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
>  	if (page_pool_set_dma_addr(page, dma))
>  		goto unmap_failed;
>  
> -	if (pool->dma_sync)
> -		page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
> +	page_pool_dma_sync_for_device(pool, page, pool->p.max_len);
>  
>  	return true;
>  
> @@ -699,9 +706,7 @@ __page_pool_put_page(struct page_pool *pool, struct page *page,
>  	if (likely(__page_pool_page_can_be_recycled(page))) {
>  		/* Read barrier done in page_ref_count / READ_ONCE */
>  
> -		if (pool->dma_sync)
> -			page_pool_dma_sync_for_device(pool, page,
> -						      dma_sync_size);
> +		page_pool_dma_sync_for_device(pool, page, dma_sync_size);
>  
>  		if (allow_direct && page_pool_recycle_in_cache(page, pool))
>  			return NULL;
> @@ -840,9 +845,7 @@ static struct page *page_pool_drain_frag(struct page_pool *pool,
>  		return NULL;
>  
>  	if (__page_pool_page_can_be_recycled(page)) {
> -		if (pool->dma_sync)
> -			page_pool_dma_sync_for_device(pool, page, -1);
> -
> +		page_pool_dma_sync_for_device(pool, page, -1);
>  		return page;
>  	}

Thanks,
Olek
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Robin Murphy 1 year, 9 months ago
On 24/04/2024 9:52 am, Alexander Lobakin wrote:
> From: Alexander Lobakin <aleksander.lobakin@intel.com>
> Date: Tue, 23 Apr 2024 15:58:31 +0200
> 
>> We can save a couple more function calls in the Page Pool code if we
>> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
>> Move both these checks into an inline wrapper and call the PP wrapper
>> over the generic DMA sync function only when both are true.
>> You can't cache the result of dma_need_sync() in &page_pool, as it may
>> change anytime if an SWIOTLB buffer is allocated or mapped.
>>
>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
>> ---
>>   net/core/page_pool.c | 31 +++++++++++++++++--------------
>>   1 file changed, 17 insertions(+), 14 deletions(-)
>>
>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>> index 6cf26a68fa91..87319c6365e0 100644
>> --- a/net/core/page_pool.c
>> +++ b/net/core/page_pool.c
>> @@ -398,16 +398,24 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
>>   	return page;
>>   }
>>   
>> -static void page_pool_dma_sync_for_device(const struct page_pool *pool,
>> -					  const struct page *page,
>> -					  unsigned int dma_sync_size)
>> +static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
>> +					    const struct page *page,
>> +					    u32 dma_sync_size)
>>   {
>>   	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>>   
>>   	dma_sync_size = min(dma_sync_size, pool->p.max_len);
>> -	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
>> -					 pool->p.offset, dma_sync_size,
>> -					 pool->p.dma_dir);
>> +	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
>> +				     dma_sync_size, pool->p.dma_dir);
> 
> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
> isn't defined there, and my CI didn't catch it in time... I'll ifdef
> this in the next spin after some reviews for this one.

Hmm, the way things have ended up, do we even need this change? (Other
than factoring out the pool->dma_sync check, which is clearly nice)

Since __page_pool_dma_sync_for_device() is never called directly, the
flow we always get is:

page_pool_dma_sync_for_device()
     dma_dev_need_sync()
         __page_pool_dma_sync_for_device()
             ... // a handful of trivial arithmetic
             __dma_sync_single_for_device()

i.e. still effectively the same order of
"if (dma_dev_need_sync()) __dma_sync()" invocations as if we'd just used
the standard dma_sync(), so referencing the unwrapped internals only
spreads it out a bit more for no real benefit. As an experiment I tried
the diff below on top, effectively undoing this problematic part, and it
doesn't seem to make any appreciable difference in a before-and-after
comparison of the object code, at least for my arm64 build.

Thanks,
Robin.

----->8-----
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 27f3b6db800e..b8ab7d4ca229 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -398,24 +398,20 @@ static struct page *__page_pool_get_cached(struct page_pool *pool)
  	return page;
  }
  
-static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
-					    const struct page *page,
-					    u32 dma_sync_size)
-{
-	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
-
-	dma_sync_size = min(dma_sync_size, pool->p.max_len);
-	__dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
-				     dma_sync_size, pool->p.dma_dir);
-}
-
  static __always_inline void
  page_pool_dma_sync_for_device(const struct page_pool *pool,
  			      const struct page *page,
  			      u32 dma_sync_size)
  {
-	if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
-		__page_pool_dma_sync_for_device(pool, page, dma_sync_size);
+	dma_addr_t dma_addr = page_pool_get_dma_addr(page);
+
+	if (!pool->dma_sync)
+		return;
+
+	dma_sync_size = min(dma_sync_size, pool->p.max_len);
+	dma_sync_single_range_for_device(pool->p.dev, dma_addr,
+					 pool->p.offset, dma_sync_size,
+					 pool->p.dma_dir);
  }
  
  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Alexander Lobakin 1 year, 9 months ago
From: Robin Murphy <robin.murphy@arm.com>
Date: Thu, 2 May 2024 16:49:48 +0100

> On 24/04/2024 9:52 am, Alexander Lobakin wrote:
>> From: Alexander Lobakin <aleksander.lobakin@intel.com>
>> Date: Tue, 23 Apr 2024 15:58:31 +0200
>>
>>> We can save a couple more function calls in the Page Pool code if we
>>> check for dma_need_sync() earlier, just when we test pp->p.dma_sync.
>>> Move both these checks into an inline wrapper and call the PP wrapper
>>> over the generic DMA sync function only when both are true.
>>> You can't cache the result of dma_need_sync() in &page_pool, as it may
>>> change anytime if an SWIOTLB buffer is allocated or mapped.
>>>
>>> Signed-off-by: Alexander Lobakin <aleksander.lobakin@intel.com>
>>> ---
>>>   net/core/page_pool.c | 31 +++++++++++++++++--------------
>>>   1 file changed, 17 insertions(+), 14 deletions(-)
>>>
>>> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
>>> index 6cf26a68fa91..87319c6365e0 100644
>>> --- a/net/core/page_pool.c
>>> +++ b/net/core/page_pool.c
>>> @@ -398,16 +398,24 @@ static struct page
>>> *__page_pool_get_cached(struct page_pool *pool)
>>>       return page;
>>>   }
>>>   -static void page_pool_dma_sync_for_device(const struct page_pool
>>> *pool,
>>> -                      const struct page *page,
>>> -                      unsigned int dma_sync_size)
>>> +static void __page_pool_dma_sync_for_device(const struct page_pool
>>> *pool,
>>> +                        const struct page *page,
>>> +                        u32 dma_sync_size)
>>>   {
>>>       dma_addr_t dma_addr = page_pool_get_dma_addr(page);
>>>         dma_sync_size = min(dma_sync_size, pool->p.max_len);
>>> -    dma_sync_single_range_for_device(pool->p.dev, dma_addr,
>>> -                     pool->p.offset, dma_sync_size,
>>> -                     pool->p.dma_dir);
>>> +    __dma_sync_single_for_device(pool->p.dev, dma_addr +
>>> pool->p.offset,
>>> +                     dma_sync_size, pool->p.dma_dir);
>>
>> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
>> isn't defined there, and my CI didn't catch it in time... I'll ifdef
>> this in the next spin after some reviews for this one.
> 
> Hmm, the way things have ended up, do we even need this change? (Other
> than factoring out the pool->dma_sync check, which is clearly nice)
> 
> Since __page_pool_dma_sync_for_device() is never called directly, the
> flow we always get is:
> 
> page_pool_dma_sync_for_device()
>     dma_dev_need_sync()
>         __page_pool_dma_sync_for_device()
>             ... // a handful of trivial arithmetic
>             __dma_sync_single_for_device()
> 
> i.e. still effectively the same order of
> "if (dma_dev_need_sync()) __dma_sync()" invocations as if we'd just used
> the standard dma_sync(), so referencing the unwrapped internals only
> spreads it out a bit more for no real benefit. As an experiment I tried
> the diff below on top, effectively undoing this problematic part, and it
> doesn't seem to make any appreciable difference in a before-and-after
> comparison of the object code, at least for my arm64 build.
> 
> Thanks,
> Robin.
> 
> ----->8-----
> diff --git a/net/core/page_pool.c b/net/core/page_pool.c
> index 27f3b6db800e..b8ab7d4ca229 100644
> --- a/net/core/page_pool.c
> +++ b/net/core/page_pool.c
> @@ -398,24 +398,20 @@ static struct page *__page_pool_get_cached(struct
> page_pool *pool)
>      return page;
>  }
>  
> -static void __page_pool_dma_sync_for_device(const struct page_pool *pool,
> -                        const struct page *page,
> -                        u32 dma_sync_size)
> -{
> -    dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> -
> -    dma_sync_size = min(dma_sync_size, pool->p.max_len);
> -    __dma_sync_single_for_device(pool->p.dev, dma_addr + pool->p.offset,
> -                     dma_sync_size, pool->p.dma_dir);
> -}
> -
>  static __always_inline void

So this would unconditionally inline the whole sync code into the call
sites, while I wanted to give a chance for the compilers to uninline
__page_pool_dma_sync_for_device().

>  page_pool_dma_sync_for_device(const struct page_pool *pool,
>                    const struct page *page,
>                    u32 dma_sync_size)
>  {
> -    if (pool->dma_sync && dma_dev_need_sync(pool->p.dev))
> -        __page_pool_dma_sync_for_device(pool, page, dma_sync_size);
> +    dma_addr_t dma_addr = page_pool_get_dma_addr(page);
> +
> +    if (!pool->dma_sync)
> +        return;
> +
> +    dma_sync_size = min(dma_sync_size, pool->p.max_len);
> +    dma_sync_single_range_for_device(pool->p.dev, dma_addr,
> +                     pool->p.offset, dma_sync_size,
> +                     pool->p.dma_dir);
>  }
>  
>  static bool page_pool_dma_map(struct page_pool *pool, struct page *page)

Thanks,
Olek
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Christoph Hellwig 1 year, 9 months ago
On Wed, Apr 24, 2024 at 10:52:49AM +0200, Alexander Lobakin wrote:
> Breh, this breaks builds on !DMA_NEED_SYNC systems, as this function
> isn't defined there, and my CI didn't catch it in time... I'll ifdef
> this in the next spin after some reviews for this one.

I looked over the dma-mapping parts of the series and it looks fine
to me.  So please resend as the fixed up version as soon as you fine
time for it.
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Christoph Hellwig 1 year, 9 months ago
Hi Alexander,

do you have time to resend the series?  I'd love to merge it for
6.10 if I can get the updated version ASAP.
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Alexander Lobakin 1 year, 9 months ago
From: Christoph Hellwig <hch@lst.de>
Date: Thu, 2 May 2024 07:58:55 +0200

> Hi Alexander,
> 
> do you have time to resend the series?  I'd love to merge it for
> 6.10 if I can get the updated version ASAP.

Hi!

Sorry, I was on a small vacation.
Do I still have a chance to get this into 6.10?

Thanks,
Olek
Re: [PATCH net-next v4 6/7] page_pool: check for DMA sync shortcut earlier
Posted by Christoph Hellwig 1 year, 9 months ago
On Mon, May 06, 2024 at 11:38:17AM +0200, Alexander Lobakin wrote:
> From: Christoph Hellwig <hch@lst.de>
> Date: Thu, 2 May 2024 07:58:55 +0200
> 
> > Hi Alexander,
> > 
> > do you have time to resend the series?  I'd love to merge it for
> > 6.10 if I can get the updated version ASAP.
> 
> Hi!
> 
> Sorry, I was on a small vacation.
> Do I still have a chance to get this into 6.10?

If I can get it merged by tomorrow, yes.