drivers/remoteproc/remoteproc_core.c | 3 ++- drivers/remoteproc/remoteproc_sysfs.c | 1 + include/linux/remoteproc.h | 5 ++++- 3 files changed, 7 insertions(+), 2 deletions(-)
Multiple calls to glink_subdev_stop() for the same remoteproc can happen
if rproc_stop() fails in Process-A, which leaves the rproc in the
RPROC_CRASHED state. A later call to recovery_store() from user space in
Process-B then triggers rproc_trigger_recovery() on the same remoteproc
to recover it, which results in a NULL pointer dereference in
qcom_glink_smem_unregister().
There is another side to this issue: fixing it by adding a NULL check
on glink->edge does not guarantee that the remoteproc will recover on
the second call from Process-B. It already failed in Process-A during
the SMC shutdown call and may fail again at the same call, in which
case the rproc cannot recover.
Add a new rproc state, RPROC_DEFUNCT, i.e. a non-recoverable state of
the remoteproc from which the only way to recover is a system restart.
Process-A Process-B
fatal error interrupt happens
rproc_crash_handler_work()
mutex_lock_interruptible(&rproc->lock);
...
rproc->state = RPROC_CRASHED;
...
mutex_unlock(&rproc->lock);
rproc_trigger_recovery()
mutex_lock_interruptible(&rproc->lock);
adsp_stop()
qcom_q6v5_pas 20c00000.remoteproc: failed to shutdown: -22
remoteproc remoteproc3: can't stop rproc: -22
mutex_unlock(&rproc->lock);
echo enabled > /sys/class/remoteproc/remoteprocX/recovery
recovery_store()
rproc_trigger_recovery()
mutex_lock_interruptible(&rproc->lock);
rproc_stop()
glink_subdev_stop()
qcom_glink_smem_unregister() ==|
|
V
Unable to handle kernel NULL pointer dereference
at virtual address 0000000000000358
Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com>
---
Changes in v2:
- Removed the NULL pointer check and instead added a new state to
signify the non-recoverable state of the remoteproc.
drivers/remoteproc/remoteproc_core.c | 3 ++-
drivers/remoteproc/remoteproc_sysfs.c | 1 +
include/linux/remoteproc.h | 5 ++++-
3 files changed, 7 insertions(+), 2 deletions(-)
diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c
index f276956f2c5c..494c8fcc63ca 100644
--- a/drivers/remoteproc/remoteproc_core.c
+++ b/drivers/remoteproc/remoteproc_core.c
@@ -1727,6 +1727,7 @@ static int rproc_stop(struct rproc *rproc, bool crashed)
/* power off the remote processor */
ret = rproc->ops->stop(rproc);
if (ret) {
+ rproc->state = RPROC_DEFUNCT;
dev_err(dev, "can't stop rproc: %d\n", ret);
return ret;
}
@@ -1839,7 +1840,7 @@ int rproc_trigger_recovery(struct rproc *rproc)
return ret;
/* State could have changed before we got the mutex */
- if (rproc->state != RPROC_CRASHED)
+ if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED)
goto unlock_mutex;
dev_err(dev, "recovering %s\n", rproc->name);
diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c
index 138e752c5e4e..5f722b4576b2 100644
--- a/drivers/remoteproc/remoteproc_sysfs.c
+++ b/drivers/remoteproc/remoteproc_sysfs.c
@@ -171,6 +171,7 @@ static const char * const rproc_state_string[] = {
[RPROC_DELETED] = "deleted",
[RPROC_ATTACHED] = "attached",
[RPROC_DETACHED] = "detached",
+ [RPROC_DEFUNCT] = "defunct",
[RPROC_LAST] = "invalid",
};
diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h
index b4795698d8c2..3e4ba06c6a9a 100644
--- a/include/linux/remoteproc.h
+++ b/include/linux/remoteproc.h
@@ -417,6 +417,8 @@ struct rproc_ops {
* has attached to it
* @RPROC_DETACHED: device has been booted by another entity and waiting
* for the core to attach to it
+ * @RPROC_DEFUNCT: device neither crashed nor responding to any of the
+ * requests and can only recover on system restart.
* @RPROC_LAST: just keep this one at the end
*
* Please note that the values of these states are used as indices
@@ -433,7 +435,8 @@ enum rproc_state {
RPROC_DELETED = 4,
RPROC_ATTACHED = 5,
RPROC_DETACHED = 6,
- RPROC_LAST = 7,
+ RPROC_DEFUNCT = 7,
+ RPROC_LAST = 8,
};
/**
--
2.34.1
Hi Mukesh, kernel test robot noticed the following build warnings: [auto build test WARNING on remoteproc/rproc-next] [also build test WARNING on linus/master v6.12-rc3 next-20241015] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Mukesh-Ojha/remoteproc-Add-a-new-remoteproc-state-RPROC_DEFUNCT/20241015-043318 base: https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git rproc-next patch link: https://lore.kernel.org/r/20241014203118.1580024-1-quic_mojha%40quicinc.com patch subject: [PATCH v2] remoteproc: Add a new remoteproc state RPROC_DEFUNCT config: x86_64-buildonly-randconfig-003-20241016 (https://download.01.org/0day-ci/archive/20241016/202410161111.ZSy2XQxP-lkp@intel.com/config) compiler: clang version 18.1.8 (https://github.com/llvm/llvm-project 3b5b5c1ec4a3095ab096dd780e84d7ab81f3d7ff) reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241016/202410161111.ZSy2XQxP-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202410161111.ZSy2XQxP-lkp@intel.com/ All warnings (new ones prefixed by >>): >> drivers/remoteproc/remoteproc_core.c:1843:18: warning: comparison between pointer and integer ('int (*)(struct rproc *, const struct firmware *)' and 'int') [-Wpointer-integer-compare] 1843 | if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED) | ~~~~~~~~~~~ ^ ~~~~~~~~~~~~~ 1 warning generated. 
vim +1843 drivers/remoteproc/remoteproc_core.c 1820 1821 /** 1822 * rproc_trigger_recovery() - recover a remoteproc 1823 * @rproc: the remote processor 1824 * 1825 * The recovery is done by resetting all the virtio devices, that way all the 1826 * rpmsg drivers will be reseted along with the remote processor making the 1827 * remoteproc functional again. 1828 * 1829 * This function can sleep, so it cannot be called from atomic context. 1830 * 1831 * Return: 0 on success or a negative value upon failure 1832 */ 1833 int rproc_trigger_recovery(struct rproc *rproc) 1834 { 1835 struct device *dev = &rproc->dev; 1836 int ret; 1837 1838 ret = mutex_lock_interruptible(&rproc->lock); 1839 if (ret) 1840 return ret; 1841 1842 /* State could have changed before we got the mutex */ > 1843 if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED) 1844 goto unlock_mutex; 1845 1846 dev_err(dev, "recovering %s\n", rproc->name); 1847 1848 if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY)) 1849 ret = rproc_attach_recovery(rproc); 1850 else 1851 ret = rproc_boot_recovery(rproc); 1852 1853 unlock_mutex: 1854 mutex_unlock(&rproc->lock); 1855 return ret; 1856 } 1857 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
Hi Mukesh, kernel test robot noticed the following build warnings: [auto build test WARNING on remoteproc/rproc-next] [also build test WARNING on linus/master v6.12-rc3 next-20241015] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch#_base_tree_information] url: https://github.com/intel-lab-lkp/linux/commits/Mukesh-Ojha/remoteproc-Add-a-new-remoteproc-state-RPROC_DEFUNCT/20241015-043318 base: https://git.kernel.org/pub/scm/linux/kernel/git/remoteproc/linux.git rproc-next patch link: https://lore.kernel.org/r/20241014203118.1580024-1-quic_mojha%40quicinc.com patch subject: [PATCH v2] remoteproc: Add a new remoteproc state RPROC_DEFUNCT config: x86_64-buildonly-randconfig-002-20241016 (https://download.01.org/0day-ci/archive/20241016/202410161104.5ZIbsMLV-lkp@intel.com/config) compiler: gcc-12 (Debian 12.2.0-14) 12.2.0 reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241016/202410161104.5ZIbsMLV-lkp@intel.com/reproduce) If you fix the issue in a separate patch/commit (i.e. 
not just a new version of the same patch/commit), kindly add following tags | Reported-by: kernel test robot <lkp@intel.com> | Closes: https://lore.kernel.org/oe-kbuild-all/202410161104.5ZIbsMLV-lkp@intel.com/ All warnings (new ones prefixed by >>): drivers/remoteproc/remoteproc_core.c: In function 'rproc_trigger_recovery': >> drivers/remoteproc/remoteproc_core.c:1843:25: warning: comparison between pointer and integer 1843 | if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED) | ^~ vim +1843 drivers/remoteproc/remoteproc_core.c 1820 1821 /** 1822 * rproc_trigger_recovery() - recover a remoteproc 1823 * @rproc: the remote processor 1824 * 1825 * The recovery is done by resetting all the virtio devices, that way all the 1826 * rpmsg drivers will be reseted along with the remote processor making the 1827 * remoteproc functional again. 1828 * 1829 * This function can sleep, so it cannot be called from atomic context. 1830 * 1831 * Return: 0 on success or a negative value upon failure 1832 */ 1833 int rproc_trigger_recovery(struct rproc *rproc) 1834 { 1835 struct device *dev = &rproc->dev; 1836 int ret; 1837 1838 ret = mutex_lock_interruptible(&rproc->lock); 1839 if (ret) 1840 return ret; 1841 1842 /* State could have changed before we got the mutex */ > 1843 if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED) 1844 goto unlock_mutex; 1845 1846 dev_err(dev, "recovering %s\n", rproc->name); 1847 1848 if (rproc_has_feature(rproc, RPROC_FEAT_ATTACH_ON_RECOVERY)) 1849 ret = rproc_attach_recovery(rproc); 1850 else 1851 ret = rproc_boot_recovery(rproc); 1852 1853 unlock_mutex: 1854 mutex_unlock(&rproc->lock); 1855 return ret; 1856 } 1857 -- 0-DAY CI Kernel Test Service https://github.com/intel/lkp-tests/wiki
On Tue, Oct 15, 2024 at 02:01:18AM +0530, Mukesh Ojha wrote: > Multiple call to glink_subdev_stop() for the same remoteproc can happen > if rproc_stop() fails from Process-A that leaves the rproc state to > RPROC_CRASHED state later a call to recovery_store from user space in > Process B triggers rproc_trigger_recovery() of the same remoteproc to > recover it results in NULL pointer dereference issue in > qcom_glink_smem_unregister(). > > There is other side to this issue if we want to fix this via adding a > NULL check on glink->edge which does not guarantees that the remoteproc > will recover in second call from Process B as it has failed in the first > Process A during SMC shutdown call and may again fail at the same call > and rproc can not recover for such case. > > Add a new rproc state RPROC_DEFUNCT i.e., non recoverable state of > remoteproc and the only way to recover from it via system restart. > > Process-A Process-B > > fatal error interrupt happens > > rproc_crash_handler_work() > mutex_lock_interruptible(&rproc->lock); > ... > > rproc->state = RPROC_CRASHED; > ... > mutex_unlock(&rproc->lock); > > rproc_trigger_recovery() > mutex_lock_interruptible(&rproc->lock); > > adsp_stop() > qcom_q6v5_pas 20c00000.remoteproc: failed to shutdown: -22 > remoteproc remoteproc3: can't stop rproc: -22 > mutex_unlock(&rproc->lock); > > echo enabled > /sys/class/remoteproc/remoteprocX/recovery > recovery_store() > rproc_trigger_recovery() > mutex_lock_interruptible(&rproc->lock); > rproc_stop() > glink_subdev_stop() > qcom_glink_smem_unregister() ==| > | > V > Unable to handle kernel NULL pointer dereference > at virtual address 0000000000000358 > > Signed-off-by: Mukesh Ojha <quic_mojha@quicinc.com> > --- > Changes in v2: > - Removed NULL pointer check instead added a new state to signify > non-recoverable state of remoteproc. 
> > drivers/remoteproc/remoteproc_core.c | 3 ++- > drivers/remoteproc/remoteproc_sysfs.c | 1 + > include/linux/remoteproc.h | 5 ++++- > 3 files changed, 7 insertions(+), 2 deletions(-) > > diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c > index f276956f2c5c..494c8fcc63ca 100644 > --- a/drivers/remoteproc/remoteproc_core.c > +++ b/drivers/remoteproc/remoteproc_core.c > @@ -1727,6 +1727,7 @@ static int rproc_stop(struct rproc *rproc, bool crashed) > /* power off the remote processor */ > ret = rproc->ops->stop(rproc); > if (ret) { > + rproc->state = RPROC_DEFUNCT; I have put it here, but I am more inclined towards adding this assignment in qcom remoteproc(pas) driver. -Mukesh > } > @@ -1839,7 +1840,7 @@ int rproc_trigger_recovery(struct rproc *rproc) > return ret; > > /* State could have changed before we got the mutex */ > - if (rproc->state != RPROC_CRASHED) > + if (rproc_start == RPROC_DEFUNCT || rproc->state != RPROC_CRASHED) > goto unlock_mutex; > > dev_err(dev, "recovering %s\n", rproc->name); > diff --git a/drivers/remoteproc/remoteproc_sysfs.c b/drivers/remoteproc/remoteproc_sysfs.c > index 138e752c5e4e..5f722b4576b2 100644 > --- a/drivers/remoteproc/remoteproc_sysfs.c > +++ b/drivers/remoteproc/remoteproc_sysfs.c > @@ -171,6 +171,7 @@ static const char * const rproc_state_string[] = { > [RPROC_DELETED] = "deleted", > [RPROC_ATTACHED] = "attached", > [RPROC_DETACHED] = "detached", > + [RPROC_DEFUNCT] = "defunct", > [RPROC_LAST] = "invalid", > }; > > diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h > index b4795698d8c2..3e4ba06c6a9a 100644 > --- a/include/linux/remoteproc.h > +++ b/include/linux/remoteproc.h > @@ -417,6 +417,8 @@ struct rproc_ops { > * has attached to it > * @RPROC_DETACHED: device has been booted by another entity and waiting > * for the core to attach to it > + * @RPROC_DEFUNCT: device neither crashed nor responding to any of the > + * requests and can only recover on system 
restart. > * @RPROC_LAST: just keep this one at the end > * > * Please note that the values of these states are used as indices > @@ -433,7 +435,8 @@ enum rproc_state { > RPROC_DELETED = 4, > RPROC_ATTACHED = 5, > RPROC_DETACHED = 6, > - RPROC_LAST = 7, > + RPROC_DEFUNCT = 7, > + RPROC_LAST = 8, > }; > > /** > -- > 2.34.1 >
© 2016 - 2024 Red Hat, Inc.