[RFC 1/2] drm: Add sysrq key to kill current job on GPU

Rob Clark posted 2 patches 3 weeks ago
[RFC 1/2] drm: Add sysrq key to kill current job on GPU
Posted by Rob Clark 3 weeks ago
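If your compositor is getting starved for GPU time, it is useful to have
a way to kill the current thing that is hogging the GPU.

Add a magic sysrq key ('G') for this.  Drivers opt in by implementing the
new drm_driver.sysrq_kill() callback, which is called from a workqueue
for every registered device whose driver provides it.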

Signed-off-by: Rob Clark <robin.clark@oss.qualcomm.com>
---
 drivers/gpu/drm/Makefile        |  1 +
 drivers/gpu/drm/drm_dev_sysrq.c | 67 +++++++++++++++++++++++++++++++++
 drivers/gpu/drm/drm_drv.c       |  3 ++
 drivers/gpu/drm/drm_internal.h  | 11 ++++++
 include/drm/drm_device.h        |  8 ++++
 include/drm/drm_drv.h           |  7 ++++
 6 files changed, 97 insertions(+)
 create mode 100644 drivers/gpu/drm/drm_dev_sysrq.c

diff --git a/drivers/gpu/drm/Makefile b/drivers/gpu/drm/Makefile
index 0e1c668b46d2..337af859753f 100644
--- a/drivers/gpu/drm/Makefile
+++ b/drivers/gpu/drm/Makefile
@@ -44,6 +44,7 @@ drm-y := \
 	drm_colorop.o \
 	drm_connector.o \
 	drm_crtc.o \
+	drm_dev_sysrq.o \
 	drm_displayid.o \
 	drm_drv.o \
 	drm_dumb_buffers.o \
diff --git a/drivers/gpu/drm/drm_dev_sysrq.c b/drivers/gpu/drm/drm_dev_sysrq.c
new file mode 100644
index 000000000000..47e029b7cd0b
--- /dev/null
+++ b/drivers/gpu/drm/drm_dev_sysrq.c
@@ -0,0 +1,67 @@
+// SPDX-License-Identifier: GPL-2.0 or MIT
+
+#include <linux/sysrq.h>
+
+#include <drm/drm_device.h>
+#include <drm/drm_drv.h>
+#include <drm/drm_print.h>
+
+#include "drm_internal.h"
+
+#ifdef CONFIG_MAGIC_SYSRQ
+static LIST_HEAD(drm_dev_sysrq_dev_list);	/* devices with ->sysrq_kill() */
+static DEFINE_MUTEX(drm_dev_sysrq_dev_lock);	/* held for all list access */
+
+/* Work item: call ->sysrq_kill() for each device on the sysrq list. */
+static void drm_dev_sysrq_restore_work_fn(struct work_struct *ignored)
+{
+	struct drm_device *drm;
+
+	/* The list lock stays held across the walk and the driver callbacks. */
+	guard(mutex)(&drm_dev_sysrq_dev_lock);
+
+	list_for_each_entry(drm, &drm_dev_sysrq_dev_list, dev_sysrq_list)
+		drm->driver->sysrq_kill(drm);
+}
+
+static DECLARE_WORK(drm_dev_sysrq_restore_work, drm_dev_sysrq_restore_work_fn);
+
+static void drm_dev_sysrq_restore_handler(u8 ignored)
+{
+	schedule_work(&drm_dev_sysrq_restore_work);	/* kills run in the work item */
+}
+
+static const struct sysrq_key_op drm_dev_sysrq_kill_op = {	/* registered as 'G' */
+	.handler = drm_dev_sysrq_restore_handler,
+	.help_msg = "kill-gpu-job(G)",
+	.action_msg = "Kill current job on the GPU",
+};
+
+/* Put @dev on the sysrq kill list; the first listed device registers key 'G'. */
+void drm_dev_sysrq_register(struct drm_device *dev)
+{
+	/* Opt-in: drivers without a sysrq_kill hook are left off the list. */
+	if (!dev->driver->sysrq_kill)
+		return;
+
+	guard(mutex)(&drm_dev_sysrq_dev_lock);
+
+	if (list_empty(&drm_dev_sysrq_dev_list))
+		register_sysrq_key('G', &drm_dev_sysrq_kill_op);
+
+	list_add(&dev->dev_sysrq_list, &drm_dev_sysrq_dev_list);
+}
+
+void drm_dev_sysrq_unregister(struct drm_device *dev)
+{
+	/* Not listed without sysrq_kill (see register); avoid a bogus WARN. */
+	if (!dev->driver->sysrq_kill)
+		return;
+
+	guard(mutex)(&drm_dev_sysrq_dev_lock);
+	if (!drm_WARN_ON(dev, list_empty(&dev->dev_sysrq_list)))
+		list_del(&dev->dev_sysrq_list);
+	if (list_empty(&drm_dev_sysrq_dev_list))	/* last listed device is gone */
+		unregister_sysrq_key('G', &drm_dev_sysrq_kill_op);
+}
+#endif
diff --git a/drivers/gpu/drm/drm_drv.c b/drivers/gpu/drm/drm_drv.c
index 2915118436ce..c1f5a4ee6d58 100644
--- a/drivers/gpu/drm/drm_drv.c
+++ b/drivers/gpu/drm/drm_drv.c
@@ -734,6 +734,7 @@ static int drm_dev_init(struct drm_device *dev,
 	INIT_LIST_HEAD(&dev->filelist_internal);
 	INIT_LIST_HEAD(&dev->clientlist);
 	INIT_LIST_HEAD(&dev->client_sysrq_list);
+	INIT_LIST_HEAD(&dev->dev_sysrq_list);
 	INIT_LIST_HEAD(&dev->vblank_event_list);
 
 	spin_lock_init(&dev->event_lock);
@@ -1102,6 +1103,7 @@ int drm_dev_register(struct drm_device *dev, unsigned long flags)
 	}
 	drm_panic_register(dev);
 	drm_client_sysrq_register(dev);
+	drm_dev_sysrq_register(dev);
 
 	DRM_INFO("Initialized %s %d.%d.%d for %s on minor %d\n",
 		 driver->name, driver->major, driver->minor,
@@ -1146,6 +1148,7 @@ void drm_dev_unregister(struct drm_device *dev)
 {
 	dev->registered = false;
 
+	drm_dev_sysrq_unregister(dev);
 	drm_client_sysrq_unregister(dev);
 	drm_panic_unregister(dev);
 
diff --git a/drivers/gpu/drm/drm_internal.h b/drivers/gpu/drm/drm_internal.h
index f893b1e3a596..164ff588aea4 100644
--- a/drivers/gpu/drm/drm_internal.h
+++ b/drivers/gpu/drm/drm_internal.h
@@ -67,6 +67,17 @@ static inline void drm_client_sysrq_unregister(struct drm_device *dev)
 { }
 #endif
 
+/* drm_dev_sysrq.c */
+#ifdef CONFIG_MAGIC_SYSRQ
+void drm_dev_sysrq_register(struct drm_device *dev);
+void drm_dev_sysrq_unregister(struct drm_device *dev);
+#else /* !CONFIG_MAGIC_SYSRQ: no-op stubs */
+static inline void drm_dev_sysrq_register(struct drm_device *dev)
+{ }
+static inline void drm_dev_sysrq_unregister(struct drm_device *dev)
+{ }
+#endif /* CONFIG_MAGIC_SYSRQ */
+
 /* drm_file.c */
 extern struct mutex drm_global_mutex;
 bool drm_dev_needs_global_mutex(struct drm_device *dev);
diff --git a/include/drm/drm_device.h b/include/drm/drm_device.h
index 5af49c5c3778..c6c0987dba36 100644
--- a/include/drm/drm_device.h
+++ b/include/drm/drm_device.h
@@ -246,6 +246,14 @@ struct drm_device {
 	 */
 	struct list_head client_sysrq_list;
 
+	/**
+	 * @dev_sysrq_list:
+	 *
+	 * Entry in the global list of devices whose driver implements
+	 * &drm_driver.sysrq_kill; walked to kill the current GPU job on sysrq.
+	 */
+	struct list_head dev_sysrq_list;
+
 	/**
 	 * @vblank_disable_immediate:
 	 *
diff --git a/include/drm/drm_drv.h b/include/drm/drm_drv.h
index 42fc085f986d..8e9d5d597451 100644
--- a/include/drm/drm_drv.h
+++ b/include/drm/drm_drv.h
@@ -385,6 +385,13 @@ struct drm_driver {
 	int (*fbdev_probe)(struct drm_fb_helper *fbdev_helper,
 			   struct drm_fb_helper_surface_size *sizes);
 
+	/**
+	 * @sysrq_kill:
+	 *
+	 * Optional hook, run from a workqueue on sysrq 'G', to kill the current GPU job.
+	 */
+	void (*sysrq_kill)(struct drm_device *dev);
+
 	/**
 	 * @show_fdinfo:
 	 *
-- 
2.53.0