[PATCH] firmware: stratix10-rsu: avoid blocking reboot_image sysfs when busy

Dinh Nguyen posted 1 patch 3 days, 7 hours ago
drivers/firmware/stratix10-rsu.c | 85 +++++++++++++++++++++++++++-----
1 file changed, 73 insertions(+), 12 deletions(-)
[PATCH] firmware: stratix10-rsu: avoid blocking reboot_image sysfs when busy
Posted by Dinh Nguyen 3 days, 7 hours ago
Writes to the reboot_image sysfs attribute went through rsu_send_msg(),
which unconditionally takes priv->lock with mutex_lock(). If another RSU
operation is in flight (e.g. a DCMF status query from probe or a
concurrent sysfs read path), userspace writers get stuck in the kernel
waiting on the mutex instead of being told the device is busy.

Split rsu_send_msg() into an inner __rsu_send_msg_locked() helper that
performs the SMC transaction with the caller holding priv->lock, plus
two thin wrappers: rsu_send_msg() preserves the original blocking
behaviour for existing callers, and rsu_try_send_msg() uses
mutex_trylock() and returns -EBUSY immediately when the lock is held.

Use rsu_try_send_msg() from reboot_image_store() so the write returns
-EBUSY without blocking when an RSU operation is already running.
Userspace can retry on -EBUSY. No functional change for other sysfs
attributes.

This keeps blocking rsu_send_msg() for existing callers, add
rsu_try_send_msg() with -EBUSY only for reboot_image_store(). That
matches the original goal (avoid a second reboot_image write blocking
behind priv->lock) without changing sysfs behaviour for the other
attributes. The earlier idea of using mutex_trylock() in all of
rsu_send_msg() and returning -EAGAIN would have been harder to justify
for userspace (echo does not retry on that).

Tze Yee tested the patch on an Agilex SoC devkit.

[Test 1] Idle reboot_image write (success path)
Result:
   # insmod stratix10-rsu.ko
   # echo 0x01000000 > .../reboot_image
   # echo "exit=$?"
   exit=0
   # ./rsu_client --log
         VERSION: 0x00000202
           STATE: 0x00000000
   CURRENT IMAGE: 0x0000000001000000
      FAIL IMAGE: 0x0000000000000000
       ERROR LOC: 0x00000000
   ERROR DETAILS: 0x00000000
   RETRY COUNTER: 0x00000000
   Operation completed

[Test 2] reboot_image while priv->lock is held (-EBUSY path)
To get a deterministic busy window without flooding the service layer,
add a local debug helper (module parameter debug_hold_lock_sec +
kthread that holds priv->lock for N seconds after probe).

Result:
   # insmod stratix10-rsu.ko debug_hold_lock_sec=60
   [  121.220904] stratix10-rsu stratix10-rsu.0: TEST: RSU lock held for
60 s - try reboot_image now
   # echo 0x01000000 > .../reboot_image
   -sh: echo: write error: Device or resource busy
   # echo "during hold: exit=$?"
   during hold: exit=1
   [  183.268706] stratix10-rsu stratix10-rsu.0: TEST: RSU lock released
   # echo 0x01000000 > .../reboot_image
   # echo "after release: exit=$?"
   after release: exit=0

Together, these results match the intended behaviour: reboot_image
fails fast with -EBUSY when the RSU mutex is already held, and
succeeds once the lock is available.

Assisted-by: Claude:claude-opus-4-7
Tested-by: Tze Yee Ng <tze.yee.ng@altera.com>
Signed-off-by: Dinh Nguyen <dinguyen@kernel.org>
---
 drivers/firmware/stratix10-rsu.c | 85 +++++++++++++++++++++++++++-----
 1 file changed, 73 insertions(+), 12 deletions(-)

diff --git a/drivers/firmware/stratix10-rsu.c b/drivers/firmware/stratix10-rsu.c
index 6c5f952f48d8f..a420eb0c13e1c 100644
--- a/drivers/firmware/stratix10-rsu.c
+++ b/drivers/firmware/stratix10-rsu.c
@@ -338,27 +338,26 @@ static void rsu_async_get_spt_table_callback(struct device *dev,
 }
 
 /**
- * rsu_send_msg() - send a message to Intel service layer
+ * __rsu_send_msg_locked() - send a message to Intel service layer
  * @priv: pointer to rsu private data
  * @command: RSU status or update command
  * @arg: the request argument, the bitstream address or notify status
  * @callback: function pointer for the callback (status or update)
  *
- * Start an Intel service layer transaction to perform the SMC call that
- * is necessary to get RSU boot log or set the address of bitstream to
- * boot after reboot.
+ * Perform the actual SMC transaction. The caller must hold @priv->lock.
  *
- * Returns 0 on success or -ETIMEDOUT on error.
+ * Returns 0 on success or a negative errno on failure.
  */
-static int rsu_send_msg(struct stratix10_rsu_priv *priv,
-			enum stratix10_svc_command_code command,
-			unsigned long arg,
-			rsu_callback callback)
+static int __rsu_send_msg_locked(struct stratix10_rsu_priv *priv,
+				 enum stratix10_svc_command_code command,
+				 unsigned long arg,
+				 rsu_callback callback)
 {
 	struct stratix10_svc_client_msg msg;
 	int ret;
 
-	mutex_lock(&priv->lock);
+	lockdep_assert_held(&priv->lock);
+
 	reinit_completion(&priv->completion);
 	priv->client.receive_cb = callback;
 
@@ -387,6 +386,59 @@ static int rsu_send_msg(struct stratix10_rsu_priv *priv,
 
 status_done:
 	stratix10_svc_done(priv->chan);
+	return ret;
+}
+
+/**
+ * rsu_send_msg() - send a message to Intel service layer
+ * @priv: pointer to rsu private data
+ * @command: RSU status or update command
+ * @arg: the request argument, the bitstream address or notify status
+ * @callback: function pointer for the callback (status or update)
+ *
+ * Start an Intel service layer transaction to perform the SMC call that
+ * is necessary to get RSU boot log or set the address of bitstream to
+ * boot after reboot. This call will block until the RSU lock can be
+ * acquired.
+ *
+ * Returns 0 on success or a negative errno on failure.
+ */
+static int rsu_send_msg(struct stratix10_rsu_priv *priv,
+			enum stratix10_svc_command_code command,
+			unsigned long arg,
+			rsu_callback callback)
+{
+	int ret;
+
+	mutex_lock(&priv->lock);
+	ret = __rsu_send_msg_locked(priv, command, arg, callback);
+	mutex_unlock(&priv->lock);
+	return ret;
+}
+
+/**
+ * rsu_try_send_msg() - non-blocking variant of rsu_send_msg()
+ * @priv: pointer to rsu private data
+ * @command: RSU status or update command
+ * @arg: the request argument, the bitstream address or notify status
+ * @callback: function pointer for the callback (status or update)
+ *
+ * Same as rsu_send_msg() but returns -EBUSY immediately when another
+ * RSU operation is already in flight, instead of waiting for the lock.
+ *
+ * Returns 0 on success, -EBUSY if the RSU is busy, or another negative
+ * errno on failure.
+ */
+static int rsu_try_send_msg(struct stratix10_rsu_priv *priv,
+			    enum stratix10_svc_command_code command,
+			    unsigned long arg,
+			    rsu_callback callback)
+{
+	int ret;
+
+	if (!mutex_trylock(&priv->lock))
+		return -EBUSY;
+	ret = __rsu_send_msg_locked(priv, command, arg, callback);
 	mutex_unlock(&priv->lock);
 	return ret;
 }
@@ -689,8 +741,17 @@ static ssize_t reboot_image_store(struct device *dev,
 	if (ret)
 		return ret;
 
-	ret = rsu_send_msg(priv, COMMAND_RSU_UPDATE,
-			   address, rsu_command_callback);
+	/*
+	 * Use the non-blocking variant so a write to this sysfs attribute
+	 * does not stall the caller while another RSU operation is in
+	 * flight. Userspace can retry on -EBUSY.
+	 */
+	ret = rsu_try_send_msg(priv, COMMAND_RSU_UPDATE,
+			       address, rsu_command_callback);
+	if (ret == -EBUSY) {
+		dev_dbg(dev, "RSU busy, reboot_image write rejected\n");
+		return ret;
+	}
 	if (ret) {
 		dev_err(dev, "Error, RSU update returned %i\n", ret);
 		return ret;
-- 
2.42.0.411.g813d9a9188