[PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down

Mary Strodl posted 3 patches 1 week, 1 day ago
[PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Mary Strodl 1 week, 1 day ago
When an IRQ worker is running, unplugging the device would cause a
crash. The sealevel hardware this driver was written for was not
hotpluggable, so I never realized it.

This change uses RCU to create a list of workers, which it tears down on
disconnect.

Signed-off-by: Mary Strodl <mstrodl@csh.rit.edu>
---
 drivers/gpio/gpio-mpsse.c | 119 +++++++++++++++++++++++++++++++++++---
 1 file changed, 110 insertions(+), 9 deletions(-)

diff --git a/drivers/gpio/gpio-mpsse.c b/drivers/gpio/gpio-mpsse.c
index 9f42bb30b4ec..af9a9196ac9d 100644
--- a/drivers/gpio/gpio-mpsse.c
+++ b/drivers/gpio/gpio-mpsse.c
@@ -10,6 +10,7 @@
 #include <linux/cleanup.h>
 #include <linux/gpio/driver.h>
 #include <linux/mutex.h>
+#include <linux/spinlock.h>
 #include <linux/usb.h>
 
 struct mpsse_priv {
@@ -17,8 +18,10 @@ struct mpsse_priv {
 	struct usb_device *udev;     /* USB device encompassing all MPSSEs */
 	struct usb_interface *intf;  /* USB interface for this MPSSE */
 	u8 intf_id;                  /* USB interface number for this MPSSE */
-	struct work_struct irq_work; /* polling work thread */
+	struct list_head workers;    /* polling work threads */
 	struct mutex irq_mutex;	     /* lock over irq_data */
+	struct mutex irq_race;	     /* race for polling worker teardown */
+	raw_spinlock_t irq_spin;     /* protects worker list */
 	atomic_t irq_type[16];	     /* pin -> edge detection type */
 	atomic_t irq_enabled;
 	int id;
@@ -34,6 +37,15 @@ struct mpsse_priv {
 	struct mutex io_mutex;	    /* sync I/O with disconnect */
 };
 
+struct mpsse_worker {
+	struct mpsse_priv  *priv;
+	struct work_struct  work;
+	atomic_t       cancelled;
+	struct list_head    list;   /* linked list */
+	struct list_head destroy;   /* teardown linked list */
+	struct rcu_head     rcu;    /* synchronization */
+};
+
 struct bulk_desc {
 	bool tx;	            /* direction of bulk transfer */
 	u8 *data;                   /* input (tx) or output (rx) */
@@ -261,9 +273,8 @@ static int gpio_mpsse_direction_input(struct gpio_chip *chip,
 
 	guard(mutex)(&priv->io_mutex);
 	priv->gpio_dir[bank] &= ~BIT(bank_offset);
-	gpio_mpsse_set_bank(priv, bank);
 
-	return 0;
+	return gpio_mpsse_set_bank(priv, bank);
 }
 
 static int gpio_mpsse_get_direction(struct gpio_chip *chip,
@@ -284,18 +295,55 @@ static int gpio_mpsse_get_direction(struct gpio_chip *chip,
 	return ret;
 }
 
-static void gpio_mpsse_poll(struct work_struct *work)
+static void gpio_mpsse_stop(struct mpsse_worker *worker)
+{
+	cancel_work_sync(&worker->work);
+	kfree(worker);
+}
+
+static void gpio_mpsse_poll(struct work_struct *my_work)
 {
 	unsigned long pin_mask, pin_states, flags;
 	int irq_enabled, offset, err, value, fire_irq,
 		irq, old_value[16], irq_type[16];
-	struct mpsse_priv *priv = container_of(work, struct mpsse_priv,
-					       irq_work);
+	struct mpsse_worker *worker;
+	struct mpsse_worker *my_worker = container_of(my_work, struct mpsse_worker, work);
+	struct mpsse_priv *priv = my_worker->priv;
+	struct list_head destructors = LIST_HEAD_INIT(destructors);
 
 	for (offset = 0; offset < priv->gpio.ngpio; ++offset)
 		old_value[offset] = -1;
 
-	while ((irq_enabled = atomic_read(&priv->irq_enabled))) {
+	/*
+	 * We only want one worker. Workers race to acquire irq_race and tear
+	 * down all other workers. This is a cond guard so that we don't deadlock
+	 * trying to cancel a worker.
+	 */
+	scoped_cond_guard(mutex_try, ;, &priv->irq_race) {
+		scoped_guard(rcu) {
+			list_for_each_entry_rcu(worker, &priv->workers, list) {
+				/* Don't stop ourselves */
+				if (worker == my_worker)
+					continue;
+
+				scoped_guard(raw_spinlock_irqsave, &priv->irq_spin)
+					list_del_rcu(&worker->list);
+
+				/* Give worker a chance to terminate itself */
+				atomic_set(&worker->cancelled, 1);
+				/* Keep track of stuff to cancel */
+				INIT_LIST_HEAD(&worker->destroy);
+				list_add(&worker->destroy, &destructors);
+			}
+		}
+		/* Make sure list consumers are finished before we tear down */
+		synchronize_rcu();
+		list_for_each_entry(worker, &destructors, destroy)
+			gpio_mpsse_stop(worker);
+	}
+
+	while ((irq_enabled = atomic_read(&priv->irq_enabled)) &&
+	       !atomic_read(&my_worker->cancelled)) {
 		usleep_range(MPSSE_POLL_INTERVAL, MPSSE_POLL_INTERVAL + 1000);
 		/* Cleanup will trigger at the end of the loop */
 		guard(mutex)(&priv->irq_mutex);
@@ -370,21 +418,41 @@ static int gpio_mpsse_set_irq_type(struct irq_data *irqd, unsigned int type)
 
 static void gpio_mpsse_irq_disable(struct irq_data *irqd)
 {
+	struct mpsse_worker *worker;
 	struct mpsse_priv *priv = irq_data_get_irq_chip_data(irqd);
 
 	atomic_and(~BIT(irqd->hwirq), &priv->irq_enabled);
 	gpiochip_disable_irq(&priv->gpio, irqd->hwirq);
+
+	/* Can't actually do teardown in IRQ context (blocks...) */
+	scoped_guard(rcu)
+		list_for_each_entry_rcu(worker, &priv->workers, list)
+			atomic_set(&worker->cancelled, 1);
 }
 
 static void gpio_mpsse_irq_enable(struct irq_data *irqd)
 {
+	struct mpsse_worker *worker;
 	struct mpsse_priv *priv = irq_data_get_irq_chip_data(irqd);
 
 	gpiochip_enable_irq(&priv->gpio, irqd->hwirq);
 	/* If no-one else was using the IRQ, enable it */
 	if (!atomic_fetch_or(BIT(irqd->hwirq), &priv->irq_enabled)) {
-		INIT_WORK(&priv->irq_work, gpio_mpsse_poll);
-		schedule_work(&priv->irq_work);
+		/*
+		 * Can't be devm because it uses a non-raw spinlock (illegal in
+		 * this context, where a raw spinlock is held by our caller)
+		 */
+		worker = kzalloc(sizeof(*worker), GFP_KERNEL);
+		if (!worker)
+			return;
+
+		worker->priv = priv;
+		INIT_LIST_HEAD(&worker->list);
+		INIT_WORK(&worker->work, gpio_mpsse_poll);
+		schedule_work(&worker->work);
+
+		scoped_guard(raw_spinlock_irqsave, &priv->irq_spin)
+			list_add_rcu(&worker->list, &priv->workers);
 	}
 }
 
@@ -436,6 +504,12 @@ static int gpio_mpsse_probe(struct usb_interface *interface,
 	if (err)
 		return err;
 
+	err = devm_mutex_init(dev, &priv->irq_race);
+	if (err)
+		return err;
+
+	raw_spin_lock_init(&priv->irq_spin);
+
 	priv->gpio.label = devm_kasprintf(dev, GFP_KERNEL,
 					  "gpio-mpsse.%d.%d",
 					  priv->id, priv->intf_id);
@@ -504,7 +578,34 @@ static int gpio_mpsse_probe(struct usb_interface *interface,
 
 static void gpio_mpsse_disconnect(struct usb_interface *intf)
 {
+	struct mpsse_worker *worker;
 	struct mpsse_priv *priv = usb_get_intfdata(intf);
+	struct list_head destructors = LIST_HEAD_INIT(destructors);
+
+	/*
+	 * Lock prevents double-free of worker from here and the teardown
+	 * step at the beginning of gpio_mpsse_poll
+	 */
+	scoped_guard(mutex, &priv->irq_race) {
+		scoped_guard(rcu) {
+			list_for_each_entry_rcu(worker, &priv->workers, list) {
+				scoped_guard(raw_spinlock_irqsave, &priv->irq_spin)
+					list_del_rcu(&worker->list);
+
+				/* Give worker a chance to terminate itself */
+				atomic_set(&worker->cancelled, 1);
+				/* Keep track of stuff to cancel */
+				INIT_LIST_HEAD(&worker->destroy);
+				list_add(&worker->destroy, &destructors);
+			}
+		}
+		/* Make sure list consumers are finished before we tear down */
+		synchronize_rcu();
+		list_for_each_entry(worker, &destructors, destroy)
+			gpio_mpsse_stop(worker);
+	}
+
+	rcu_barrier();
 
 	priv->intf = NULL;
 	usb_set_intfdata(intf, NULL);
-- 
2.47.0
Re: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Linus Walleij 20 hours ago
On Tue, Sep 23, 2025 at 3:35 PM Mary Strodl <mstrodl@csh.rit.edu> wrote:

> When an IRQ worker is running, unplugging the device would cause a
> crash. The sealevel hardware this driver was written for was not
> hotpluggable, so I never realized it.
>
> This change uses RCU to create a list of workers, which it tears down on
> disconnect.
>
> Signed-off-by: Mary Strodl <mstrodl@csh.rit.edu>

Oh this RCU thing is a bit terse and a lot of code.

Can you look if you can use the new revocable resource
management API? It uses RCU underneath.
https://lwn.net/Articles/1038523/

Yours,
Linus Walleij
Re: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Mary Strodl 12 hours ago
Hey Linus,

On Wed, Oct 01, 2025 at 09:15:14AM +0200, Linus Walleij wrote:
> Oh this RCU thing is a bit terse and a lot of code.
> 
> Can you look if you can use the new revocable resource
> management API? It uses RCU underneath.
> https://lwn.net/Articles/1038523/

Yeah I'm very open to suggestions about how to do this nicer.

I can't read that article, because I don't subscribe to LWN (Maybe I should
though). Looks like it becomes free tomorrow, so I can take a look then. I did
find and skim through this, which I believe is the implementation of the API
you're talking about:
https://lore.kernel.org/chrome-platform/20250923075302.591026-2-tzungbi@kernel.org/

Based on this, it seems like:
* This uses sleepable RCU, which I don't think we can use in the IRQ callbacks
  since they don't allow any blocking (this is the main reason for the complexity)
* We'd still need some sort of list primitive, because we could potentially have
  multiple workers being torn down at a time, so we need to know what all to
  revoke when the device is being torn down. Right now, I'm using the RCU lists
  API to keep track of this. My instinct is to use devm, but that also isn't
  technically safe for use in IRQ handlers.

I obviously haven't played with the revocable resource management API, so
maybe these limitations aren't as big of a deal as I think they are.

With that said, I think now that I've found spinlocks work, I could use those
to gate access to the list everywhere, and use the standard lists api rather
than the RCU lists api. Obviously teardown of the workers would happen outside
the spin lock critical section, guarded by a proper mutex.

Let me know what you think... Thank you!
Re: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Linus Walleij 4 hours ago
On Wed, Oct 1, 2025 at 5:07 PM Mary Strodl <mstrodl@csh.rit.edu> wrote:

> With that said, I think now that I've found spinlocks work, I could use those
> to gate access to the list everywhere, and use the standard lists api rather
> than the RCU lists api. Obviously teardown of the workers would happen outside
> the spin lock critical section, guarded by a proper mutex.

Yeah RCU is for massive parallelism where you have a lot of (performance
sensitive) readers and a few writers to the struct.

If this isn't performance-sensitive and doesn't have a *lot* of simultaneous
readers, try to use simpler locking mechanisms if you can.

But if performance is hampered, RCU is surely the way to go!

Yours,
Linus Walleij
Re: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Dan Carpenter 2 days, 18 hours ago
Hi Mary,

kernel test robot noticed the following build warnings:

[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mary-Strodl/gpio-mpsse-use-rcu-to-ensure-worker-is-torn-down/20250923-214710
base:   https://git.kernel.org/pub/scm/linux/kernel/git/brgl/linux.git gpio/for-next
patch link:    https://lore.kernel.org/r/20250923133304.273529-2-mstrodl%40csh.rit.edu
patch subject: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
config: i386-randconfig-141-20250929 (https://download.01.org/0day-ci/archive/20250929/202509290823.hreUi6Tp-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.4.0-5) 12.4.0

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
| Closes: https://lore.kernel.org/r/202509290823.hreUi6Tp-lkp@intel.com/

New smatch warnings:
drivers/gpio/gpio-mpsse.c:341 gpio_mpsse_poll() error: dereferencing freed memory 'worker' (line 342)
drivers/gpio/gpio-mpsse.c:604 gpio_mpsse_disconnect() error: dereferencing freed memory 'worker' (line 605)

vim +/worker +341 drivers/gpio/gpio-mpsse.c

a14b0c5e3b0741 Mary Strodl 2025-09-23  304  static void gpio_mpsse_poll(struct work_struct *my_work)
c46a74ff05c0ac Mary Strodl 2024-10-09  305  {
c46a74ff05c0ac Mary Strodl 2024-10-09  306  	unsigned long pin_mask, pin_states, flags;
c46a74ff05c0ac Mary Strodl 2024-10-09  307  	int irq_enabled, offset, err, value, fire_irq,
c46a74ff05c0ac Mary Strodl 2024-10-09  308  		irq, old_value[16], irq_type[16];
a14b0c5e3b0741 Mary Strodl 2025-09-23  309  	struct mpsse_worker *worker;
a14b0c5e3b0741 Mary Strodl 2025-09-23  310  	struct mpsse_worker *my_worker = container_of(my_work, struct mpsse_worker, work);
a14b0c5e3b0741 Mary Strodl 2025-09-23  311  	struct mpsse_priv *priv = my_worker->priv;
a14b0c5e3b0741 Mary Strodl 2025-09-23  312  	struct list_head destructors = LIST_HEAD_INIT(destructors);
c46a74ff05c0ac Mary Strodl 2024-10-09  313  
c46a74ff05c0ac Mary Strodl 2024-10-09  314  	for (offset = 0; offset < priv->gpio.ngpio; ++offset)
c46a74ff05c0ac Mary Strodl 2024-10-09  315  		old_value[offset] = -1;
c46a74ff05c0ac Mary Strodl 2024-10-09  316  
a14b0c5e3b0741 Mary Strodl 2025-09-23  317  	/*
a14b0c5e3b0741 Mary Strodl 2025-09-23  318  	 * We only want one worker. Workers race to acquire irq_race and tear
a14b0c5e3b0741 Mary Strodl 2025-09-23  319  	 * down all other workers. This is a cond guard so that we don't deadlock
a14b0c5e3b0741 Mary Strodl 2025-09-23  320  	 * trying to cancel a worker.
a14b0c5e3b0741 Mary Strodl 2025-09-23  321  	 */
a14b0c5e3b0741 Mary Strodl 2025-09-23  322  	scoped_cond_guard(mutex_try, ;, &priv->irq_race) {
a14b0c5e3b0741 Mary Strodl 2025-09-23  323  		scoped_guard(rcu) {
a14b0c5e3b0741 Mary Strodl 2025-09-23  324  			list_for_each_entry_rcu(worker, &priv->workers, list) {
a14b0c5e3b0741 Mary Strodl 2025-09-23  325  				/* Don't stop ourselves */
a14b0c5e3b0741 Mary Strodl 2025-09-23  326  				if (worker == my_worker)
a14b0c5e3b0741 Mary Strodl 2025-09-23  327  					continue;
a14b0c5e3b0741 Mary Strodl 2025-09-23  328  
a14b0c5e3b0741 Mary Strodl 2025-09-23  329  				scoped_guard(raw_spinlock_irqsave, &priv->irq_spin)
a14b0c5e3b0741 Mary Strodl 2025-09-23  330  					list_del_rcu(&worker->list);
a14b0c5e3b0741 Mary Strodl 2025-09-23  331  
a14b0c5e3b0741 Mary Strodl 2025-09-23  332  				/* Give worker a chance to terminate itself */
a14b0c5e3b0741 Mary Strodl 2025-09-23  333  				atomic_set(&worker->cancelled, 1);
a14b0c5e3b0741 Mary Strodl 2025-09-23  334  				/* Keep track of stuff to cancel */
a14b0c5e3b0741 Mary Strodl 2025-09-23  335  				INIT_LIST_HEAD(&worker->destroy);
a14b0c5e3b0741 Mary Strodl 2025-09-23  336  				list_add(&worker->destroy, &destructors);
a14b0c5e3b0741 Mary Strodl 2025-09-23  337  			}
a14b0c5e3b0741 Mary Strodl 2025-09-23  338  		}
a14b0c5e3b0741 Mary Strodl 2025-09-23  339  		/* Make sure list consumers are finished before we tear down */
a14b0c5e3b0741 Mary Strodl 2025-09-23  340  		synchronize_rcu();
a14b0c5e3b0741 Mary Strodl 2025-09-23 @341  		list_for_each_entry(worker, &destructors, destroy)
a14b0c5e3b0741 Mary Strodl 2025-09-23 @342  			gpio_mpsse_stop(worker);

This needs to be list_for_each_entry_save() because gpio_mpsse_stop()
frees the worker.  Or kfree_rcu() inside an rcu lock or something.

a14b0c5e3b0741 Mary Strodl 2025-09-23  343  	}
a14b0c5e3b0741 Mary Strodl 2025-09-23  344  
a14b0c5e3b0741 Mary Strodl 2025-09-23  345  	while ((irq_enabled = atomic_read(&priv->irq_enabled)) &&

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
Re: [PATCH v2 1/3] gpio: mpsse: use rcu to ensure worker is torn down
Posted by Dan Carpenter 2 days, 18 hours ago
On Mon, Sep 29, 2025 at 11:46:24AM +0300, Dan Carpenter wrote:
> a14b0c5e3b0741 Mary Strodl 2025-09-23  339  		/* Make sure list consumers are finished before we tear down */
> a14b0c5e3b0741 Mary Strodl 2025-09-23  340  		synchronize_rcu();
> a14b0c5e3b0741 Mary Strodl 2025-09-23 @341  		list_for_each_entry(worker, &destructors, destroy)
> a14b0c5e3b0741 Mary Strodl 2025-09-23 @342  			gpio_mpsse_stop(worker);
> 
> This needs to be list_for_each_entry_save() because gpio_mpsse_stop()

s/save/safe/.  #CantType

regards,
dan carpenter

> frees the worker.  Or kfree_rcu() inside an rcu lock or something.
> 
> a14b0c5e3b0741 Mary Strodl 2025-09-23  343  	}
> a14b0c5e3b0741 Mary Strodl 2025-09-23  344  
> a14b0c5e3b0741 Mary Strodl 2025-09-23  345  	while ((irq_enabled = atomic_read(&priv->irq_enabled)) &&
> 
> -- 
> 0-DAY CI Kernel Test Service
> https://github.com/intel/lkp-tests/wiki