[PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Petr Tesarik 2 years, 9 months ago
From: Petr Tesarik <petr.tesarik.ext@huawei.com>

Do not walk the list of dynamically allocated bounce buffers if the
list is empty. This avoids taking dma_io_tlb_dyn_lock for devices
which do not use any dynamically allocated bounce buffers.

When unmapping the last dynamically allocated bounce buffer, the
flag is set to false as soon as possible to allow skipping the
spinlock even before the list itself is updated.

Signed-off-by: Petr Tesarik <petr.tesarik.ext@huawei.com>
---
 include/linux/device.h  | 4 ++++
 include/linux/swiotlb.h | 6 +++++-
 kernel/dma/swiotlb.c    | 6 ++++++
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/include/linux/device.h b/include/linux/device.h
index d1d2b8557b30..e340e0f06dce 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -516,6 +516,9 @@ struct device_physical_location {
  * @dma_io_tlb_dyn_slots:
  *		Dynamically allocated bounce buffers for this device.
  *		Not for driver use.
+ * @dma_io_tlb_have_dyn:
+ *		Does this device have any dynamically allocated bounce
+ *		buffers? Not for driver use.
  * @archdata:	For arch-specific additions.
  * @of_node:	Associated device tree node.
  * @fwnode:	Associated device node supplied by platform firmware.
@@ -623,6 +626,7 @@ struct device {
 	struct io_tlb_mem *dma_io_tlb_mem;
 	spinlock_t dma_io_tlb_dyn_lock;
 	struct list_head dma_io_tlb_dyn_slots;
+	bool dma_io_tlb_have_dyn;
 #endif
 	/* arch specific additions */
 	struct dev_archdata	archdata;
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index daa2064f2ede..8cbb0bebb0bc 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -152,7 +152,11 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
 
 	return mem &&
 		(is_swiotlb_fixed(mem, paddr) ||
-		 (mem->allow_dyn && is_swiotlb_dyn(dev, paddr)));
+		 /* Pairs with smp_store_release() in swiotlb_dyn_map()
+		  * and swiotlb_dyn_unmap().
+		  */
+		 (smp_load_acquire(&dev->dma_io_tlb_have_dyn) &&
+		  is_swiotlb_dyn(dev, paddr)));
 }
 
 static inline bool is_swiotlb_force_bounce(struct device *dev)
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index 81eab1c72c50..e8be3ee50f18 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -642,6 +642,9 @@ static phys_addr_t swiotlb_dyn_map(struct device *dev, phys_addr_t orig_addr,
 
 	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
 	list_add(&slot->node, &dev->dma_io_tlb_dyn_slots);
+	if (!dev->dma_io_tlb_have_dyn)
+		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
+		smp_store_release(&dev->dma_io_tlb_have_dyn, true);
 	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);
 
 	return page_to_phys(slot->page);
@@ -668,6 +671,9 @@ static void swiotlb_dyn_unmap(struct device *dev, phys_addr_t tlb_addr,
 	unsigned long flags;
 
 	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
+	if (list_is_singular(&dev->dma_io_tlb_dyn_slots))
+		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
+		smp_store_release(&dev->dma_io_tlb_have_dyn, false);
 	slot = lookup_dyn_slot_locked(dev, tlb_addr);
 	list_del(&slot->node);
 	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);
-- 
2.25.1
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Catalin Marinas 2 years, 9 months ago
On Tue, May 09, 2023 at 11:18:19AM +0200, Petr Tesarik wrote:
> diff --git a/include/linux/device.h b/include/linux/device.h
> index d1d2b8557b30..e340e0f06dce 100644
> --- a/include/linux/device.h
> +++ b/include/linux/device.h
> @@ -516,6 +516,9 @@ struct device_physical_location {
>   * @dma_io_tlb_dyn_slots:
>   *		Dynamically allocated bounce buffers for this device.
>   *		Not for driver use.
> + * @dma_io_tlb_have_dyn:
> + *		Does this device have any dynamically allocated bounce
> + *		buffers? Not for driver use.
>   * @archdata:	For arch-specific additions.
>   * @of_node:	Associated device tree node.
>   * @fwnode:	Associated device node supplied by platform firmware.
> @@ -623,6 +626,7 @@ struct device {
>  	struct io_tlb_mem *dma_io_tlb_mem;
>  	spinlock_t dma_io_tlb_dyn_lock;
>  	struct list_head dma_io_tlb_dyn_slots;
> +	bool dma_io_tlb_have_dyn;
>  #endif
>  	/* arch specific additions */
>  	struct dev_archdata	archdata;
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index daa2064f2ede..8cbb0bebb0bc 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -152,7 +152,11 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
>  
>  	return mem &&
>  		(is_swiotlb_fixed(mem, paddr) ||
> -		 (mem->allow_dyn && is_swiotlb_dyn(dev, paddr)));
> +		 /* Pairs with smp_store_release() in swiotlb_dyn_map()
> +		  * and swiotlb_dyn_unmap().
> +		  */
> +		 (smp_load_acquire(&dev->dma_io_tlb_have_dyn) &&
> +		  is_swiotlb_dyn(dev, paddr)));
>  }
>  
>  static inline bool is_swiotlb_force_bounce(struct device *dev)
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index 81eab1c72c50..e8be3ee50f18 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -642,6 +642,9 @@ static phys_addr_t swiotlb_dyn_map(struct device *dev, phys_addr_t orig_addr,
>  
>  	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
>  	list_add(&slot->node, &dev->dma_io_tlb_dyn_slots);
> +	if (!dev->dma_io_tlb_have_dyn)
> +		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
> +		smp_store_release(&dev->dma_io_tlb_have_dyn, true);
>  	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);

I'm not sure this works. What this seems to do is that if the caller of
is_swiotlb_buffer() sees the flag set, it's guaranteed that something
was added to the dma_io_tlb_dyn_slots list. But the reverse is not
necessarily true. IOW, if something was added to the list, there is a
brief window where the dma_io_tlb_have_dyn flag is still false. In the
general case, I doubt any ordering between list_add() and the flag
setting changes anything since neither of them may be visible to another
CPU.

What you need is for a 'paddr' added to the dynamic list to be correctly
identified by another CPU as dynamic swiotlb. That other CPU doesn't
check random addresses but only those returned by the DMA API. Such
values would be normally passed through a memory location (e.g. driver
local structures) and that's what you want to order against.

What I mean is that a 'dev->blah = paddr' needs to be ordered _after_
your flag setting. So you need either the 'blah = paddr' assignment
to have release semantics or the flag setting to be an
smp_store_acquire() (but we don't have such a thing). You'd have to use an
explicit smp_wmb() barrier after the flag setting (it can be outside the
lock). The spin_unlock() is not sufficient since it only has release
semantics. I also don't think the ordering between list_add() and flag
setting matters since the smp_wmb() would ensure that both are visible
when the 'paddr' value made it to the other CPU.

Similarly on the is_swiotlb_buffer() side, you want the flag reading to
be ordered after the 'blah = paddr' is observed. Here the
smp_load_acquire() is sufficient.
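
Roughly, the pattern I mean (a sketch only; 'blah' is the hypothetical
driver-side location holding the returned address):

P0:
	WRITE_ONCE(dev->dma_io_tlb_have_dyn, true);
	smp_wmb();		/* order the flag store before publishing paddr */
	WRITE_ONCE(blah, paddr);

P1:
	paddr = READ_ONCE(blah);
	/* the acquire suffices on this side, as above */
	if (smp_load_acquire(&dev->dma_io_tlb_have_dyn))
		is_swiotlb_dyn(dev, paddr);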

>  	return page_to_phys(slot->page);
> @@ -668,6 +671,9 @@ static void swiotlb_dyn_unmap(struct device *dev, phys_addr_t tlb_addr,
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
> +	if (list_is_singular(&dev->dma_io_tlb_dyn_slots))
> +		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
> +		smp_store_release(&dev->dma_io_tlb_have_dyn, false);
>  	slot = lookup_dyn_slot_locked(dev, tlb_addr);
>  	list_del(&slot->node);
>  	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);

As with the map case, I don't think the ordering between list_del() and
the flag setting matters. If you unmap the last dynamic buffer, the
worst that can happen is that an is_swiotlb_buffer() call attempts a
read of the list but the flag will eventually become visible. There
shouldn't be another caller trying to unmap the same paddr (in well
behaved drivers).

Now, thinking about the list_head access and the flag ordering, since it
doesn't matter, you might as well not bother with the flag at all and
rely on list_add() and list_empty() ordering vs the hypothetical 'blah'
access. Both of these use READ/WRITE_ONCE() for setting
dma_io_tlb_dyn_slots.next. You only need an smp_wmb() after the
list_add() and an smp_rmb() before a list_empty() check in
is_swiotlb_buffer(), no dma_io_tlb_have_dyn variable.
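
I.e., something like this sketch (again with the hypothetical 'blah'):

P0:
	list_add(&slot->node, &dev->dma_io_tlb_dyn_slots);
	smp_wmb();		/* order list_add() before publishing paddr */
	WRITE_ONCE(blah, paddr);

P1:
	paddr = READ_ONCE(blah);
	smp_rmb();		/* order the list_empty() read after paddr */
	if (!list_empty(&dev->dma_io_tlb_dyn_slots))
		is_swiotlb_dyn(dev, paddr);	/* takes dma_io_tlb_dyn_lock */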

That's my reasoning but to be absolutely sure, you can pass that through
some formal modelling.

-- 
Catalin
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Petr Tesařík 2 years, 9 months ago
Hi Catalin,

On Sun, 14 May 2023 19:54:27 +0100
Catalin Marinas <catalin.marinas@arm.com> wrote:

> On Tue, May 09, 2023 at 11:18:19AM +0200, Petr Tesarik wrote:
> > diff --git a/include/linux/device.h b/include/linux/device.h
> > index d1d2b8557b30..e340e0f06dce 100644
> > --- a/include/linux/device.h
> > +++ b/include/linux/device.h
> > @@ -516,6 +516,9 @@ struct device_physical_location {
> >   * @dma_io_tlb_dyn_slots:
> >   *		Dynamically allocated bounce buffers for this device.
> >   *		Not for driver use.
> > + * @dma_io_tlb_have_dyn:
> > + *		Does this device have any dynamically allocated bounce
> > + *		buffers? Not for driver use.
> >   * @archdata:	For arch-specific additions.
> >   * @of_node:	Associated device tree node.
> >   * @fwnode:	Associated device node supplied by platform firmware.
> > @@ -623,6 +626,7 @@ struct device {
> >  	struct io_tlb_mem *dma_io_tlb_mem;
> >  	spinlock_t dma_io_tlb_dyn_lock;
> >  	struct list_head dma_io_tlb_dyn_slots;
> > +	bool dma_io_tlb_have_dyn;
> >  #endif
> >  	/* arch specific additions */
> >  	struct dev_archdata	archdata;
> > diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> > index daa2064f2ede..8cbb0bebb0bc 100644
> > --- a/include/linux/swiotlb.h
> > +++ b/include/linux/swiotlb.h
> > @@ -152,7 +152,11 @@ static inline bool is_swiotlb_buffer(struct device *dev, phys_addr_t paddr)
> >  
> >  	return mem &&
> >  		(is_swiotlb_fixed(mem, paddr) ||
> > -		 (mem->allow_dyn && is_swiotlb_dyn(dev, paddr)));
> > +		 /* Pairs with smp_store_release() in swiotlb_dyn_map()
> > +		  * and swiotlb_dyn_unmap().
> > +		  */
> > +		 (smp_load_acquire(&dev->dma_io_tlb_have_dyn) &&
> > +		  is_swiotlb_dyn(dev, paddr)));
> >  }
> >  
> >  static inline bool is_swiotlb_force_bounce(struct device *dev)
> > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > index 81eab1c72c50..e8be3ee50f18 100644
> > --- a/kernel/dma/swiotlb.c
> > +++ b/kernel/dma/swiotlb.c
> > @@ -642,6 +642,9 @@ static phys_addr_t swiotlb_dyn_map(struct device *dev, phys_addr_t orig_addr,
> >  
> >  	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
> >  	list_add(&slot->node, &dev->dma_io_tlb_dyn_slots);
> > +	if (!dev->dma_io_tlb_have_dyn)
> > +		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
> > +		smp_store_release(&dev->dma_io_tlb_have_dyn, true);
> >  	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);  
> 
> I'm not sure this works. What this seems to do is that if the caller of
> is_swiotlb_buffer() sees the flag set, it's guaranteed that something
> was added to the dma_io_tlb_dyn_slots list. But the reverse is not
> necessarily true. IOW, if something was added to the list, there is a
> brief window where the dma_io_tlb_have_dyn flag is still false. In the
> general case, I doubt any ordering between list_add() and the flag
> setting changes anything since neither of them may be visible to another
> CPU.

Thank you for the review! This patch probably needs a bit more
explanation.

The goal is to avoid taking a spinlock in the most common case, i.e.
when the dynamic list is empty. The only required invariant is:

  When the flag is clear, it is safe to skip the list.

It's not a bug to walk an empty list; it's merely less efficient. Such
a race window would be acceptable. OTOH, if I understand you correctly,
that's not your concern.
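
In other words, the dynamic part of the check is meant to reduce to
something like this sketch:

	/* fast path: no lock taken while the flag reads false */
	if (!smp_load_acquire(&dev->dma_io_tlb_have_dyn))
		return false;
	return is_swiotlb_dyn(dev, paddr);	/* takes dma_io_tlb_dyn_lock */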

> What you need is for a 'paddr' added to the dynamic list to be correctly
> identified by another CPU as dynamic swiotlb. That other CPU doesn't
> check random addresses but only those returned by the DMA API.

Yes, that's correct.

> Such
> values would be normally passed through a memory location (e.g. driver
> local structures) and that's what you want to order against.

This would certainly work, but I'm not sure I need it. My only goal is
that when the flag is set, the new value is observed by all CPUs on the
next call to is_swiotlb_buffer().

> What I mean is that a 'dev->blah = paddr' needs to be ordered _after_
> your flag setting. So you need either the 'blah = paddr' assignment
> to have release semantics or the flag setting to be an
> smp_store_acquire() (but we don't have such a thing). You'd have to use an
> explicit smp_wmb() barrier after the flag setting (it can be outside the
> lock). The spin_unlock() is not sufficient since it only has release
> semantics.

Understood. The spinlock is supposed to order changes to the list, not
to the flag.

> I also don't think the ordering between list_add() and flag
> setting matters since the smp_wmb() would ensure that both are visible
> when the 'paddr' value made it to the other CPU.

If the flag makes it before the list, then the other CPU will walk the
list only after acquiring dma_io_tlb_dyn_lock, and that's what the list
is ordered against.

I don't think there is any concern if the list makes it before the
flag, as long as the new value of the flag is observed on the next call
to is_swiotlb_buffer (on any CPU).

> Similarly on the is_swiotlb_buffer() side, you want the flag reading to
> be ordered after the 'blah = paddr' is observed. Here the
> smp_load_acquire() is sufficient.
> 
> >  	return page_to_phys(slot->page);
> > @@ -668,6 +671,9 @@ static void swiotlb_dyn_unmap(struct device *dev, phys_addr_t tlb_addr,
> >  	unsigned long flags;
> >  
> >  	spin_lock_irqsave(&dev->dma_io_tlb_dyn_lock, flags);
> > +	if (list_is_singular(&dev->dma_io_tlb_dyn_slots))
> > +		/* Pairs with smp_load_acquire() in is_swiotlb_buffer() */
> > +		smp_store_release(&dev->dma_io_tlb_have_dyn, false);
> >  	slot = lookup_dyn_slot_locked(dev, tlb_addr);
> >  	list_del(&slot->node);
> >  	spin_unlock_irqrestore(&dev->dma_io_tlb_dyn_lock, flags);  
> 
> As with the map case, I don't think the ordering between list_del() and
> the flag setting matters. If you unmap the last dynamic buffer, the
> worst that can happen is that an is_swiotlb_buffer() call attempts a
> read of the list but the flag will eventually become visible. There
> shouldn't be another caller trying to unmap the same paddr (in well
> behaved drivers).
> 
> Now, thinking about the list_head access and the flag ordering, since it
> doesn't matter, you might as well not bother with the flag at all and
> rely on list_add() and list_empty() ordering vs the hypothetical 'blah'
> access. Both of these use READ/WRITE_ONCE() for setting
> dma_io_tlb_dyn_slots.next. You only need an smp_wmb() after the
> list_add() and an smp_rmb() before a list_empty() check in
> is_swiotlb_buffer(), no dma_io_tlb_have_dyn variable.

Wait, let me check that I understand you right. Do you suggest that I
convert dma_io_tlb_dyn_slots to a lockless list and get rid of the
spinlock?

I'm sure it can be done for list_add() and list_del(). I'll have
to think about list_move().

> That's my reasoning but to be absolutely sure, you can pass that through
> some formal modelling.

Good idea. Especially if I try to get rid of the lock.

Petr T
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Petr Tesařík 2 years, 9 months ago
On Mon, 15 May 2023 10:48:47 +0200
Petr Tesařík <petr@tesarici.cz> wrote:

> Hi Catalin,
> 
> On Sun, 14 May 2023 19:54:27 +0100
> Catalin Marinas <catalin.marinas@arm.com> wrote:
>[...]
> > Now, thinking about the list_head access and the flag ordering, since it
> > doesn't matter, you might as well not bother with the flag at all and
> > rely on list_add() and list_empty() ordering vs the hypothetical 'blah'
> > access. Both of these use READ/WRITE_ONCE() for setting
> > dma_io_tlb_dyn_slots.next. You only need an smp_wmb() after the
> > list_add() and an smp_rmb() before a list_empty() check in
                      ^^^^^^^^^
Got it, finally. Well, that's exactly something I don't want to do.
For example, on arm64 (seeing your email address), smp_rmb() translates
to a "dsb ld" instruction. I would expect that this is more expensive
than a "ldar", generated by smp_load_acquire().

I mean, for devices that do not need swiotlb, the flag never changes
from zero, so reading it must be as cheap as possible.

Petr T

> > is_swiotlb_buffer(), no dma_io_tlb_have_dyn variable.
> 
> Wait, let me check that I understand you right. Do you suggest that I
> convert dma_io_tlb_dyn_slots to a lockless list and get rid of the
> spinlock?
> 
> I'm sure it can be done for list_add() and list_del(). I'll have
> to think about list_move().

Hm, even the documentation of llist_empty() says that it is "not
guaranteed to be accurate or up to date". If it could be, I'm quite
sure the authors would have gladly implemented it as such.

Petr T
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Catalin Marinas 2 years, 9 months ago
(some of your replies may have been filtered to various of my mailboxes,
depending on which lists you cc'ed; replying here)

On Mon, May 15, 2023 at 12:00:54PM +0200, Petr Tesařík wrote:
> On Mon, 15 May 2023 10:48:47 +0200
> Petr Tesařík <petr@tesarici.cz> wrote:
> > On Sun, 14 May 2023 19:54:27 +0100
> > Catalin Marinas <catalin.marinas@arm.com> wrote:
> > > Now, thinking about the list_head access and the flag ordering, since it
> > > doesn't matter, you might as well not bother with the flag at all and
> > > rely on list_add() and list_empty() ordering vs the hypothetical 'blah'
> > > access. Both of these use READ/WRITE_ONCE() for setting
> > > dma_io_tlb_dyn_slots.next. You only need an smp_wmb() after the
> > > list_add() and an smp_rmb() before a list_empty() check in
>                       ^^^^^^^^^
> Got it, finally. Well, that's exactly something I don't want to do.
> For example, on arm64 (seeing your email address), smp_rmb() translates
> to a "dsb ld" instruction. I would expect that this is more expensive
> than a "ldar", generated by smp_load_acquire().

It translates to a dmb ishld which is on par with ldar (dsb is indeed a
lot more expensive but that's not generated here).

> > > is_swiotlb_buffer(), no dma_io_tlb_have_dyn variable.
> > 
> > Wait, let me check that I understand you right. Do you suggest that I
> > convert dma_io_tlb_dyn_slots to a lockless list and get rid of the
> > spinlock?
> > 
> > I'm sure it can be done for list_add() and list_del(). I'll have
> > to think about list_move().
> 
> Hm, even the documentation of llist_empty() says that it is "not
> guaranteed to be accurate or up to date". If it could be, I'm quite
> sure the authors would have gladly implemented it as such.

It doesn't but neither does your flag. If you want a guarantee, you'd
need locks because a llist_empty() on its own can race with other
llist_add/del_*() that may not yet be visible to a CPU at exactly that
moment. BTW, the llist implementation cannot delete a random element, so
not sure this is suitable for your implementation (it can only delete
the first element or the whole list).
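
(For reference, the lockless llist API is roughly limited to:

	llist_add(new, &head);		/* push a node at the front */
	first = llist_del_first(&head);	/* pop the first node */
	all = llist_del_all(&head);	/* detach the whole list */

i.e. there is no llist_del(node) for removing an arbitrary element.)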

I do think you need to change your invariants and not rely on an
absolute list_empty() or some flag:

P0:
	list_add(paddr);
	WRITE_ONCE(blah, paddr);

P1:
	paddr = READ_ONCE(blah);
	list_empty();

Your invariant (on P1) should be blah == paddr => !list_empty(). If
there is another P2 removing paddr from the list, this wouldn't work
(nor your flag) but the assumption is that a correctly written driver
that still has a reference to paddr doesn't use it after being removed
from the list (i.e. it doesn't do a dma_unmap(paddr) and still continue
to use this paddr for e.g. dma_sync()).

For such an invariant, you'd need ordering between list_add() and the
write of paddr (smp_wmb() would do). On P1, you need an smp_rmb() before
list_empty() (since the implementation only does a READ_ONCE()).

You still need the locks for list modifications and list traversal as I
don't see how you can use the llist implementation with random element
removal.


There is another scenario to take into account on the list_del() side.
Let's assume that there are other elements on the list, so
list_empty() == false:

P0:
	list_del(paddr);
	/* the memory gets freed, added to some slab or page free list */
	WRITE_ONCE(slab_free_list, __va(paddr));

P1:
	paddr = __pa(READ_ONCE(slab_free_list)); /* re-allocating paddr freed on P0 */
	if (!list_empty()) {			/* assuming other elements on the list */
		/* searching the list */
		list_for_each() {
			if (pos->paddr == __pa(vaddr))
				/* match */
		}
	}

On P0, you want the list update to be visible before the memory is freed
(and potentially reallocated on P1). An smp_wmb() on P0 would do. For
P1, we don't care about list_empty() as there can be other elements
already. But we do want any reading of list elements during the search to
be ordered after the slab_free_list read. The smp_rmb() you'd add for
the case above would suffice.

-- 
Catalin
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Petr Tesařík 2 years, 9 months ago
On Mon, 15 May 2023 17:28:38 +0100
Catalin Marinas <catalin.marinas@arm.com> wrote:

> (some of your replies may have been filtered to various of my mailboxes,
> depending on which lists you cc'ed; replying here)
> 
> On Mon, May 15, 2023 at 12:00:54PM +0200, Petr Tesařík wrote:
> > On Mon, 15 May 2023 10:48:47 +0200
> > Petr Tesařík <petr@tesarici.cz> wrote:  
> > > On Sun, 14 May 2023 19:54:27 +0100
> > > Catalin Marinas <catalin.marinas@arm.com> wrote:  
> > > > Now, thinking about the list_head access and the flag ordering, since it
> > > > doesn't matter, you might as well not bother with the flag at all and
> > > > rely on list_add() and list_empty() ordering vs the hypothetical 'blah'
> > > > access. Both of these use READ/WRITE_ONCE() for setting
> > > > dma_io_tlb_dyn_slots.next. You only need an smp_wmb() after the
> > > > list_add() and an smp_rmb() before a list_empty() check in  
> >                       ^^^^^^^^^
> > Got it, finally. Well, that's exactly something I don't want to do.
> > For example, on arm64 (seeing your email address), smp_rmb() translates
> > to a "dsb ld" instruction. I would expect that this is more expensive
> > than a "ldar", generated by smp_load_acquire().  
> 
> It translates to a dmb ishld which is on par with ldar (dsb is indeed a
> lot more expensive but that's not generated here).

You're right, dsb is generated for the non-smp barrier variants. Thanks
for the correction.

> > > > is_swiotlb_buffer(), no dma_io_tlb_have_dyn variable.
> > > 
> > > Wait, let me check that I understand you right. Do you suggest that I
> > > convert dma_io_tlb_dyn_slots to a lockless list and get rid of the
> > > spinlock?
> > > 
> > > I'm sure it can be done for list_add() and list_del(). I'll have
> > > to think about list_move().  
> > 
> > Hm, even the documentation of llist_empty() says that it is "not
> > guaranteed to be accurate or up to date". If it could be, I'm quite
> > sure the authors would have gladly implemented it as such.  
> 
> It doesn't but neither does your flag.

Yes, I have already agreed in another sub-thread.

> If you want a guarantee, you'd
> need locks because a llist_empty() on its own can race with other
> llist_add/del_*() that may not yet be visible to a CPU at exactly that
> moment. BTW, the llist implementation cannot delete a random element, so
> not sure this is suitable for your implementation (it can only delete
> the first element or the whole list).
> 
> I do think you need to change your invariants and not rely on an
> absolute list_empty() or some flag:
> 
> P0:
> 	list_add(paddr);
> 	WRITE_ONCE(blah, paddr);
> 
> P1:
> 	paddr = READ_ONCE(blah);
> 	list_empty();
> 
> Your invariant (on P1) should be blah == paddr => !list_empty(). If
> there is another P2 removing paddr from the list, this wouldn't work
> (nor your flag) but the assumption is that a correctly written driver
> that still has a reference to paddr doesn't use it after being removed
> from the list (i.e. it doesn't do a dma_unmap(paddr) and still continue
> to use this paddr for e.g. dma_sync()).

Right. In other words, given any paddr:

  a. Either it is on the list, and then the list cannot become empty by
     any concurrent code.

  b. Or it is not on the list, but then we may skip the search
     regardless of any races with other CPUs.

> For such an invariant, you'd need ordering between list_add() and the
> write of paddr (smp_wmb() would do). On P1, you need an smp_rmb() before
> list_empty() (since the implementation only does a READ_ONCE()).

I agree.

> You still need the locks for list modifications and list traversal as I
> don't see how you can use the llist implementation with random element
> removal.

That's right. It might even perform better than a truly non-blocking
list (cf. Valois, Harris, Zhang).

> There is another scenario to take into account on the list_del() side.
> Let's assume that there are other elements on the list, so
> list_empty() == false:
> 
> P0:
> 	list_del(paddr);
> 	/* the memory gets freed, added to some slab or page free list */
> 	WRITE_ONCE(slab_free_list, __va(paddr));
> 
> P1:
> 	paddr = __pa(READ_ONCE(slab_free_list)); /* re-allocating paddr freed on P0 */
> 	if (!list_empty()) {			/* assuming other elements on the list */
> 		/* searching the list */
> 		list_for_each() {
> 			if (pos->paddr == __pa(vaddr))
> 				/* match */
> 		}
> 	}
> 
> On P0, you want the list update to be visible before the memory is freed
> (and potentially reallocated on P1). An smp_wmb() on P0 would do. For
> P1, we don't care about list_empty() as there can be other elements
> already. But we do want any reading of list elements during the search to
> be ordered after the slab_free_list read. The smp_rmb() you'd add for
> the case above would suffice.

Yes, but to protect against concurrent insertions/deletions, a spinlock
is held while searching the list. The spin lock provides the necessary
memory barriers implicitly.

Again, thank you very much for your time!

Petr T
Re: [PATCH v2 RESEND 7/7] swiotlb: per-device flag if there are dynamically allocated buffers
Posted by Catalin Marinas 2 years, 9 months ago
On Tue, May 16, 2023 at 09:55:12AM +0200, Petr Tesařík wrote:
> On Mon, 15 May 2023 17:28:38 +0100
> Catalin Marinas <catalin.marinas@arm.com> wrote:
> > There is another scenario to take into account on the list_del() side.
> > Let's assume that there are other elements on the list, so
> > list_empty() == false:
> > 
> > P0:
> > 	list_del(paddr);
> > 	/* the memory gets freed, added to some slab or page free list */
> > 	WRITE_ONCE(slab_free_list, __va(paddr));
> > 
> > P1:
> > 	paddr = __pa(READ_ONCE(slab_free_list)); /* re-allocating paddr freed on P0 */
> > 	if (!list_empty()) {			/* assuming other elements on the list */
> > 		/* searching the list */
> > 		list_for_each() {
> > 			if (pos->paddr == __pa(vaddr))
> > 				/* match */
> > 		}
> > 	}
> > 
> > On P0, you want the list update to be visible before the memory is freed
> > (and potentially reallocated on P1). An smp_wmb() on P0 would do. For
> > P1, we don't care about list_empty() as there can be other elements
> > already. But we do want any reading of list elements during the search to
> > be ordered after the slab_free_list read. The smp_rmb() you'd add for
> > the case above would suffice.
> 
> Yes, but to protect against concurrent insertions/deletions, a spinlock
> is held while searching the list. The spin lock provides the necessary
> memory barriers implicitly.

Well, mostly. The spinlock acquire/release semantics ensure that
accesses within the locked region are not observed outside the
lock/unlock. But they don't guarantee anything about accesses outside
such a region in relation to the accesses within the region. For example:

P0:
	spin_lock_irqsave(&swiotlb_dyn_lock);
	list_del(paddr);
	spin_unlock_irqrestore(&swiotlb_dyn_lock);

	/* the blah write below can be observed before list_del() above */
	WRITE_ONCE(blah, paddr);

	/* that's somewhat trickier but the slab_free_list update can also be
	 * seen before list_del() above on certain architectures */
	spin_lock_irqsave(&slab_lock);
	WRITE_ONCE(slab_free_list, __va(paddr));
	spin_unlock_irqrestore(&slab_lock);

On most architectures, the writing of the pointer to a slab structure
(assuming some spinlocks) would be ordered against the list_del() from
the swiotlb code. Apart from powerpc where the spin_unlock() is not
necessarily ordered against the subsequent spin_lock(). The architecture
selects ARCH_WEAK_RELEASE_ACQUIRE which in turn makes
smp_mb__after_unlock_lock() an smp_mb() (rather than a no-op as on all
the other architectures).
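
The pattern where this matters is roughly (sketch, reusing the lock
names from above):

	spin_unlock(&swiotlb_dyn_lock);
	...
	spin_lock(&slab_lock);
	smp_mb__after_unlock_lock();	/* smp_mb() on powerpc, no-op elsewhere */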

On arm64 we have smp_mb__after_spinlock() which ensures that memory
accesses prior to spin_lock() are not observed after accesses within the
locked region. I don't think this matters for your case but I thought
I'd mention it.

-- 
Catalin