[PATCH v1 1/4] rcuscale: Create debugfs file for writer durations

Yuzhuo Jing posted 4 patches 2 months, 1 week ago
[PATCH v1 1/4] rcuscale: Create debugfs file for writer durations
Posted by Yuzhuo Jing 2 months, 1 week ago
Creates an "rcuscale" folder in debugfs and creates a "writer_durations"
file in the folder.  This file is in CSV format.  Each line represents
one duration record, with columns defined as:

  writer_id,duration

Added an option "writer_no_print" to skip printing writer durations on
cleanup.

This allows external tools to read structured data and also drastically
improves cleanup performance on large core count machines.

On a 256C 512T machines running nreaders=1 nwriters=511:

Before:
$ time modprobe -r rcuscale; modprobe -r torture
real    3m17.349s
user    0m0.000s
sys     3m15.288s

After:
$ time cat /sys/kernel/debug/rcuscale/writer_durations > durations.csv
real    0m0.005s
user    0m0.000s
sys     0m0.005s
$ time modprobe -r rcuscale; modprobe -r torture
real    0m0.388s
user    0m0.000s
sys     0m0.335s

Signed-off-by: Yuzhuo Jing <yuzhuo@google.com>
---
 .../admin-guide/kernel-parameters.txt         |   5 +
 kernel/rcu/rcuscale.c                         | 142 +++++++++++++++++-
 2 files changed, 139 insertions(+), 8 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index f1f2c0874da9..7b62a84a19d4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5583,6 +5583,11 @@
 			periods, but in jiffies.  The default of zero
 			says no holdoff.
 
+	rcuscale.writer_no_print= [KNL]
+			Do not print writer durations to kernel ring buffer.
+			Instead, users can read them from the
+			rcuscale/writer_durations file in debugfs.
+
 	rcutorture.fqs_duration= [KNL]
 			Set duration of force_quiescent_state bursts
 			in microseconds.
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index b521d0455992..ad10b42be6fc 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -40,6 +40,8 @@
 #include <linux/vmalloc.h>
 #include <linux/rcupdate_trace.h>
 #include <linux/sched/debug.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include "rcu.h"
 
@@ -97,6 +99,7 @@ torture_param(bool, shutdown, RCUSCALE_SHUTDOWN,
 torture_param(int, verbose, 1, "Enable verbose debugging printk()s");
 torture_param(int, writer_holdoff, 0, "Holdoff (us) between GPs, zero to disable");
 torture_param(int, writer_holdoff_jiffies, 0, "Holdoff (jiffies) between GPs, zero to disable");
+torture_param(bool, writer_no_print, false, "Do not print writer durations to ring buffer");
 torture_param(int, kfree_rcu_test, 0, "Do we run a kfree_rcu() scale test?");
 torture_param(int, kfree_mult, 1, "Multiple of kfree_obj size to allocate.");
 torture_param(int, kfree_by_call_rcu, 0, "Use call_rcu() to emulate kfree_rcu()?");
@@ -138,6 +141,9 @@ static u64 t_rcu_scale_writer_finished;
 static unsigned long b_rcu_gp_test_started;
 static unsigned long b_rcu_gp_test_finished;
 
+static struct dentry *debugfs_dir;
+static struct dentry *debugfs_writer_durations;
+
 #define MAX_MEAS 10000
 #define MIN_MEAS 100
 
@@ -607,6 +613,7 @@ rcu_scale_writer(void *arg)
 		t = ktime_get_mono_fast_ns();
 		*wdp = t - *wdp;
 		i_max = i;
+		writer_n_durations[me] = i_max + 1;
 		if (!started &&
 		    atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
 			started = true;
@@ -620,6 +627,7 @@ rcu_scale_writer(void *arg)
 			    nrealwriters) {
 				schedule_timeout_interruptible(10);
 				rcu_ftrace_dump(DUMP_ALL);
+				WRITE_ONCE(test_complete, true);
 				SCALEOUT_STRING("Test complete");
 				t_rcu_scale_writer_finished = t;
 				if (gp_exp) {
@@ -666,7 +674,6 @@ rcu_scale_writer(void *arg)
 		rcu_scale_free(wmbp);
 		cur_ops->gp_barrier();
 	}
-	writer_n_durations[me] = i_max + 1;
 	torture_kthread_stopping("rcu_scale_writer");
 	return 0;
 }
@@ -941,6 +948,117 @@ kfree_scale_init(void)
 	return firsterr;
 }
 
+/*
+ * A seq_file for writer_durations.  Content is only visible when all writers
+ * finish.  Element i of the sequence is writer_durations + i.
+ */
+static void *writer_durations_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t writer_id = *pos;
+
+	if (!test_complete || writer_id < 0 || writer_id >= nrealwriters)
+		return NULL;
+
+	return writer_durations + writer_id;
+}
+
+static void *writer_durations_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	(*pos)++;
+	return writer_durations_start(m, pos);
+}
+
+static void writer_durations_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * Each element in the seq_file is an array of one writer's durations.
+ * Each element prints writer_n_durations[writer_id] lines, and each line
+ * contains one duration record, in CSV format:
+ * writer_id,duration
+ */
+static int writer_durations_show(struct seq_file *m, void *v)
+{
+	u64 **durations = v;
+	loff_t writer_id = durations - writer_durations;
+
+	for (int i = 0; i < writer_n_durations[writer_id]; ++i)
+		seq_printf(m, "%lld,%lld\n", writer_id, durations[0][i]);
+
+	return 0;
+}
+
+static const struct seq_operations writer_durations_op = {
+	.start	= writer_durations_start,
+	.next	= writer_durations_next,
+	.stop	= writer_durations_stop,
+	.show	= writer_durations_show
+};
+
+static int writer_durations_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &writer_durations_op);
+}
+
+static const struct file_operations writer_durations_fops = {
+	.owner = THIS_MODULE,
+	.open = writer_durations_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+/*
+ * Create an rcuscale directory exposing run states and results.
+ */
+static int register_debugfs(void)
+{
+#define try_create_file(variable, name, mode, parent, data, fops)		\
+({										\
+	variable = debugfs_create_file((name), (mode), (parent), (data), (fops)); \
+	err = PTR_ERR_OR_ZERO(variable);					\
+	err;									\
+})
+
+	int err;
+
+	debugfs_dir = debugfs_create_dir("rcuscale", NULL);
+	err = PTR_ERR_OR_ZERO(debugfs_dir);
+	if (err)
+		goto fail;
+
+	if (try_create_file(debugfs_writer_durations, "writer_durations", 0444,
+			debugfs_dir, NULL, &writer_durations_fops))
+		goto fail;
+
+	return 0;
+fail:
+	pr_err("rcu-scale: Failed to create debugfs file.");
+	/* unregister_debugfs is called by rcu_scale_cleanup, avoid
+	 * calling it twice.
+	 */
+	return err;
+#undef try_create_file
+}
+
+static void unregister_debugfs(void)
+{
+#define try_remove(variable)			\
+do {						\
+	if (!IS_ERR_OR_NULL(variable))		\
+		debugfs_remove(variable);	\
+	variable = NULL;			\
+} while (0)
+
+	try_remove(debugfs_writer_durations);
+
+	/* Remove directory after files. */
+	try_remove(debugfs_dir);
+
+#undef try_remove
+}
+
 static void
 rcu_scale_cleanup(void)
 {
@@ -961,6 +1079,8 @@ rcu_scale_cleanup(void)
 	if (gp_exp && gp_async)
 		SCALEOUT_ERRSTRING("No expedited async GPs, so went with async!");
 
+	unregister_debugfs();
+
 	// If built-in, just report all of the GP kthread's CPU time.
 	if (IS_BUILTIN(CONFIG_RCU_SCALE_TEST) && !kthread_tp && cur_ops->rso_gp_kthread)
 		kthread_tp = cur_ops->rso_gp_kthread();
@@ -1020,13 +1140,15 @@ rcu_scale_cleanup(void)
 			wdpp = writer_durations[i];
 			if (!wdpp)
 				continue;
-			for (j = 0; j < writer_n_durations[i]; j++) {
-				wdp = &wdpp[j];
-				pr_alert("%s%s %4d writer-duration: %5d %llu\n",
-					scale_type, SCALE_FLAG,
-					i, j, *wdp);
-				if (j % 100 == 0)
-					schedule_timeout_uninterruptible(1);
+			if (!writer_no_print) {
+				for (j = 0; j < writer_n_durations[i]; j++) {
+					wdp = &wdpp[j];
+					pr_alert("%s%s %4d writer-duration: %5d %llu\n",
+						scale_type, SCALE_FLAG,
+						i, j, *wdp);
+					if (j % 100 == 0)
+						schedule_timeout_uninterruptible(1);
+				}
 			}
 			kfree(writer_durations[i]);
 			if (writer_freelists) {
@@ -1202,6 +1324,10 @@ rcu_scale_init(void)
 		if (torture_init_error(firsterr))
 			goto unwind;
 	}
+
+	if (register_debugfs())
+		goto unwind;
+
 	torture_init_end();
 	return 0;
 
-- 
2.50.1.552.g942d659e1b-goog
Re: [PATCH v1 1/4] rcuscale: Create debugfs file for writer durations
Posted by kernel test robot 2 months ago
Hi Yuzhuo,

kernel test robot noticed the following build errors:

[auto build test ERROR on rcu/rcu/dev]
[also build test ERROR on linus/master v6.16 next-20250731]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Yuzhuo-Jing/rcuscale-Create-debugfs-file-for-writer-durations/20250730-102613
base:   https://git.kernel.org/pub/scm/linux/kernel/git/rcu/linux.git rcu/dev
patch link:    https://lore.kernel.org/r/20250730022347.71722-2-yuzhuo%40google.com
patch subject: [PATCH v1 1/4] rcuscale: Create debugfs file for writer durations
config: i386-randconfig-002-20250731 (https://download.01.org/0day-ci/archive/20250731/202507311504.ttQGhW04-lkp@intel.com/config)
compiler: gcc-12 (Debian 12.2.0-14+deb12u1) 12.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20250731/202507311504.ttQGhW04-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202507311504.ttQGhW04-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from <command-line>:
   kernel/rcu/rcuscale.c: In function 'rcu_scale_writer':
>> kernel/rcu/rcuscale.c:630:44: error: 'test_complete' undeclared (first use in this function); did you mean 'complete'?
     630 |                                 WRITE_ONCE(test_complete, true);
         |                                            ^~~~~~~~~~~~~
   include/linux/compiler_types.h:548:23: note: in definition of macro '__compiletime_assert'
     548 |                 if (!(condition))                                       \
         |                       ^~~~~~~~~
   include/linux/compiler_types.h:568:9: note: in expansion of macro '_compiletime_assert'
     568 |         _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |         ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:9: note: in expansion of macro 'compiletime_assert'
      36 |         compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
         |         ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:28: note: in expansion of macro '__native_word'
      36 |         compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
         |                            ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:60:9: note: in expansion of macro 'compiletime_assert_rwonce_type'
      60 |         compiletime_assert_rwonce_type(x);                              \
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/rcu/rcuscale.c:630:33: note: in expansion of macro 'WRITE_ONCE'
     630 |                                 WRITE_ONCE(test_complete, true);
         |                                 ^~~~~~~~~~
   kernel/rcu/rcuscale.c:630:44: note: each undeclared identifier is reported only once for each function it appears in
     630 |                                 WRITE_ONCE(test_complete, true);
         |                                            ^~~~~~~~~~~~~
   include/linux/compiler_types.h:548:23: note: in definition of macro '__compiletime_assert'
     548 |                 if (!(condition))                                       \
         |                       ^~~~~~~~~
   include/linux/compiler_types.h:568:9: note: in expansion of macro '_compiletime_assert'
     568 |         _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__)
         |         ^~~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:9: note: in expansion of macro 'compiletime_assert'
      36 |         compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
         |         ^~~~~~~~~~~~~~~~~~
   include/asm-generic/rwonce.h:36:28: note: in expansion of macro '__native_word'
      36 |         compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long),  \
         |                            ^~~~~~~~~~~~~
   include/asm-generic/rwonce.h:60:9: note: in expansion of macro 'compiletime_assert_rwonce_type'
      60 |         compiletime_assert_rwonce_type(x);                              \
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
   kernel/rcu/rcuscale.c:630:33: note: in expansion of macro 'WRITE_ONCE'
     630 |                                 WRITE_ONCE(test_complete, true);
         |                                 ^~~~~~~~~~
   kernel/rcu/rcuscale.c: In function 'writer_durations_start':
   kernel/rcu/rcuscale.c:959:14: error: 'test_complete' undeclared (first use in this function); did you mean 'complete'?
     959 |         if (!test_complete || writer_id < 0 || writer_id >= nrealwriters)
         |              ^~~~~~~~~~~~~
         |              complete


vim +630 kernel/rcu/rcuscale.c

   534	
   535	/*
   536	 * RCU scale writer kthread.  Repeatedly does a grace period.
   537	 */
   538	static int
   539	rcu_scale_writer(void *arg)
   540	{
   541		int i = 0;
   542		int i_max;
   543		unsigned long jdone;
   544		long me = (long)arg;
   545		bool selfreport = false;
   546		bool started = false, done = false, alldone = false;
   547		u64 t;
   548		DEFINE_TORTURE_RANDOM(tr);
   549		u64 *wdp;
   550		u64 *wdpp = writer_durations[me];
   551		struct writer_freelist *wflp = &writer_freelists[me];
   552		struct writer_mblock *wmbp = NULL;
   553	
   554		VERBOSE_SCALEOUT_STRING("rcu_scale_writer task started");
   555		WARN_ON(!wdpp);
   556		set_cpus_allowed_ptr(current, cpumask_of(me % nr_cpu_ids));
   557		current->flags |= PF_NO_SETAFFINITY;
   558		sched_set_fifo_low(current);
   559	
   560		if (holdoff)
   561			schedule_timeout_idle(holdoff * HZ);
   562	
   563		/*
   564		 * Wait until rcu_end_inkernel_boot() is called for normal GP tests
   565		 * so that RCU is not always expedited for normal GP tests.
   566		 * The system_state test is approximate, but works well in practice.
   567		 */
   568		while (!gp_exp && system_state != SYSTEM_RUNNING)
   569			schedule_timeout_uninterruptible(1);
   570	
   571		t = ktime_get_mono_fast_ns();
   572		if (atomic_inc_return(&n_rcu_scale_writer_started) >= nrealwriters) {
   573			t_rcu_scale_writer_started = t;
   574			if (gp_exp) {
   575				b_rcu_gp_test_started =
   576					cur_ops->exp_completed() / 2;
   577			} else {
   578				b_rcu_gp_test_started = cur_ops->get_gp_seq();
   579			}
   580		}
   581	
   582		jdone = jiffies + minruntime * HZ;
   583		do {
   584			bool gp_succeeded = false;
   585	
   586			if (writer_holdoff)
   587				udelay(writer_holdoff);
   588			if (writer_holdoff_jiffies)
   589				schedule_timeout_idle(torture_random(&tr) % writer_holdoff_jiffies + 1);
   590			wdp = &wdpp[i];
   591			*wdp = ktime_get_mono_fast_ns();
   592			if (gp_async && !WARN_ON_ONCE(!cur_ops->async)) {
   593				if (!wmbp)
   594					wmbp = rcu_scale_alloc(me);
   595				if (wmbp && atomic_read(&wflp->ws_inflight) < gp_async_max) {
   596					atomic_inc(&wflp->ws_inflight);
   597					cur_ops->async(&wmbp->wmb_rh, rcu_scale_async_cb);
   598					wmbp = NULL;
   599					gp_succeeded = true;
   600				} else if (!kthread_should_stop()) {
   601					cur_ops->gp_barrier();
   602				} else {
   603					rcu_scale_free(wmbp); /* Because we are stopping. */
   604					wmbp = NULL;
   605				}
   606			} else if (gp_exp) {
   607				cur_ops->exp_sync();
   608				gp_succeeded = true;
   609			} else {
   610				cur_ops->sync();
   611				gp_succeeded = true;
   612			}
   613			t = ktime_get_mono_fast_ns();
   614			*wdp = t - *wdp;
   615			i_max = i;
   616			writer_n_durations[me] = i_max + 1;
   617			if (!started &&
   618			    atomic_read(&n_rcu_scale_writer_started) >= nrealwriters)
   619				started = true;
   620			if (!done && i >= MIN_MEAS && time_after(jiffies, jdone)) {
   621				done = true;
   622				WRITE_ONCE(writer_done[me], true);
   623				sched_set_normal(current, 0);
   624				pr_alert("%s%s rcu_scale_writer %ld has %d measurements\n",
   625					 scale_type, SCALE_FLAG, me, MIN_MEAS);
   626				if (atomic_inc_return(&n_rcu_scale_writer_finished) >=
   627				    nrealwriters) {
   628					schedule_timeout_interruptible(10);
   629					rcu_ftrace_dump(DUMP_ALL);
 > 630					WRITE_ONCE(test_complete, true);
   631					SCALEOUT_STRING("Test complete");
   632					t_rcu_scale_writer_finished = t;
   633					if (gp_exp) {
   634						b_rcu_gp_test_finished =
   635							cur_ops->exp_completed() / 2;
   636					} else {
   637						b_rcu_gp_test_finished =
   638							cur_ops->get_gp_seq();
   639					}
   640					if (shutdown) {
   641						smp_mb(); /* Assign before wake. */
   642						wake_up(&shutdown_wq);
   643					}
   644				}
   645			}
   646			if (done && !alldone &&
   647			    atomic_read(&n_rcu_scale_writer_finished) >= nrealwriters)
   648				alldone = true;
   649			if (done && !alldone && time_after(jiffies, jdone + HZ * 60)) {
   650				static atomic_t dumped;
   651				int i;
   652	
   653				if (!atomic_xchg(&dumped, 1)) {
   654					for (i = 0; i < nrealwriters; i++) {
   655						if (writer_done[i])
   656							continue;
   657						pr_info("%s: Task %ld flags writer %d:\n", __func__, me, i);
   658						sched_show_task(writer_tasks[i]);
   659					}
   660					if (cur_ops->stats)
   661						cur_ops->stats();
   662				}
   663			}
   664			if (!selfreport && time_after(jiffies, jdone + HZ * (70 + me))) {
   665				pr_info("%s: Writer %ld self-report: started %d done %d/%d->%d i %d jdone %lu.\n",
   666					__func__, me, started, done, writer_done[me], atomic_read(&n_rcu_scale_writer_finished), i, jiffies - jdone);
   667				selfreport = true;
   668			}
   669			if (gp_succeeded && started && !alldone && i < MAX_MEAS - 1)
   670				i++;
   671			rcu_scale_wait_shutdown();
   672		} while (!torture_must_stop());
   673		if (gp_async && cur_ops->async) {
   674			rcu_scale_free(wmbp);
   675			cur_ops->gp_barrier();
   676		}
   677		torture_kthread_stopping("rcu_scale_writer");
   678		return 0;
   679	}
   680	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki