[v3] riscv: Unaligned access speed probing fixes and skipping

[PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 11 months, 1 week ago

Allow skipping scalar and vector unaligned access speed tests. This
is useful for testing alternative code paths and to skip the tests in
environments where they run too slowly. All CPUs must have the same
unaligned access speed.

The code movement is because we now need the scalar cpu hotplug
callback to always run, so we need to bring it and its supporting
functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.

Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
---
 arch/riscv/kernel/unaligned_access_speed.c | 187 +++++++++++++--------
 1 file changed, 121 insertions(+), 66 deletions(-)

diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
index d9d4ca1fadc7..18e334549544 100644
--- a/arch/riscv/kernel/unaligned_access_speed.c
+++ b/arch/riscv/kernel/unaligned_access_speed.c
@@ -24,8 +24,12 @@
 DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
 DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
 
-#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
+static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
+static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;
+
 static cpumask_t fast_misaligned_access;
+
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 static int check_unaligned_access(void *param)
 {
 	int cpu = smp_processor_id();
@@ -130,6 +134,50 @@ static void __init check_unaligned_access_nonboot_cpu(void *param)
 		check_unaligned_access(pages[cpu]);
 }
 
+/* Measure unaligned access speed on all CPUs present at boot in parallel. */
+static void __init check_unaligned_access_speed_all_cpus(void)
+{
+	unsigned int cpu;
+	unsigned int cpu_count = num_possible_cpus();
+	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
+
+	if (!bufs) {
+		pr_warn("Allocation failure, not measuring misaligned performance\n");
+		return;
+	}
+
+	/*
+	 * Allocate separate buffers for each CPU so there's no fighting over
+	 * cache lines.
+	 */
+	for_each_cpu(cpu, cpu_online_mask) {
+		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+		if (!bufs[cpu]) {
+			pr_warn("Allocation failure, not measuring misaligned performance\n");
+			goto out;
+		}
+	}
+
+	/* Check everybody except 0, who stays behind to tend jiffies. */
+	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
+
+	/* Check core 0. */
+	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
+
+out:
+	for_each_cpu(cpu, cpu_online_mask) {
+		if (bufs[cpu])
+			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
+	}
+
+	kfree(bufs);
+}
+#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
+static void __init check_unaligned_access_speed_all_cpus(void)
+{
+}
+#endif
+
 DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
 
 static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
@@ -188,21 +236,29 @@ arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
 
 static int riscv_online_cpu(unsigned int cpu)
 {
-	static struct page *buf;
-
 	/* We are already set since the last check */
-	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
+	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
+		goto exit;
+	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
+		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
 		goto exit;
-
-	check_unaligned_access_emulated(NULL);
-	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
-	if (!buf) {
-		pr_warn("Allocation failure, not measuring misaligned performance\n");
-		return -ENOMEM;
 	}
 
-	check_unaligned_access(buf);
-	__free_pages(buf, MISALIGNED_BUFFER_ORDER);
+#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
+	{
+		static struct page *buf;
+
+		check_unaligned_access_emulated(NULL);
+		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
+		if (!buf) {
+			pr_warn("Allocation failure, not measuring misaligned performance\n");
+			return -ENOMEM;
+		}
+
+		check_unaligned_access(buf);
+		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
+	}
+#endif
 
 exit:
 	set_unaligned_access_static_branches();
@@ -217,50 +273,6 @@ static int riscv_offline_cpu(unsigned int cpu)
 	return 0;
 }
 
-/* Measure unaligned access speed on all CPUs present at boot in parallel. */
-static void __init check_unaligned_access_speed_all_cpus(void)
-{
-	unsigned int cpu;
-	unsigned int cpu_count = num_possible_cpus();
-	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
-
-	if (!bufs) {
-		pr_warn("Allocation failure, not measuring misaligned performance\n");
-		return;
-	}
-
-	/*
-	 * Allocate separate buffers for each CPU so there's no fighting over
-	 * cache lines.
-	 */
-	for_each_cpu(cpu, cpu_online_mask) {
-		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
-		if (!bufs[cpu]) {
-			pr_warn("Allocation failure, not measuring misaligned performance\n");
-			goto out;
-		}
-	}
-
-	/* Check everybody except 0, who stays behind to tend jiffies. */
-	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
-
-	/* Check core 0. */
-	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
-
-out:
-	for_each_cpu(cpu, cpu_online_mask) {
-		if (bufs[cpu])
-			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
-	}
-
-	kfree(bufs);
-}
-#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
-static void __init check_unaligned_access_speed_all_cpus(void)
-{
-}
-#endif
-
 #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
 static void check_vector_unaligned_access(struct work_struct *work __always_unused)
 {
@@ -372,8 +384,8 @@ static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __alway
 
 static int riscv_online_cpu_vec(unsigned int cpu)
 {
-	if (!has_vector()) {
-		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
+	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
+		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
 		return 0;
 	}
 
@@ -388,30 +400,73 @@ static int riscv_online_cpu_vec(unsigned int cpu)
 	return 0;
 }
 
+static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };
+
+static int __init set_unaligned_scalar_speed_param(char *str)
+{
+	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
+		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
+	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
+		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
+	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
+		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
+	else
+		return -EINVAL;
+
+	return 1;
+}
+__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);
+
+static int __init set_unaligned_vector_speed_param(char *str)
+{
+	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
+		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
+	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
+		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
+	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
+		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
+	else
+		return -EINVAL;
+
+	return 1;
+}
+__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);
+
 static int __init check_unaligned_access_all_cpus(void)
 {
 	int cpu;
 
-	if (!check_unaligned_access_emulated_all_cpus())
+	if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
+	    !check_unaligned_access_emulated_all_cpus()) {
 		check_unaligned_access_speed_all_cpus();
-
-	if (!has_vector()) {
+	} else {
+		pr_info("scalar unaligned access speed set to '%s' by command line\n",
+			speed_str[unaligned_scalar_speed_param]);
 		for_each_online_cpu(cpu)
-			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
-	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
-		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
+			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
+	}
+
+	if (!has_vector())
+		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
+
+	if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
+	    !check_vector_unaligned_access_emulated_all_cpus() &&
+	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
 		kthread_run(vec_check_unaligned_access_speed_all_cpus,
 			    NULL, "vec_check_unaligned_access_speed_all_cpus");
+	} else {
+		pr_info("vector unaligned access speed set to '%s' by command line\n",
+			speed_str[unaligned_vector_speed_param]);
+		for_each_online_cpu(cpu)
+			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
 	}
 
 	/*
 	 * Setup hotplug callbacks for any new CPUs that come online or go
 	 * offline.
 	 */
-#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
 				  riscv_online_cpu, riscv_offline_cpu);
-#endif
 	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
 				  riscv_online_cpu_vec, NULL);
 
-- 
2.48.1

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Geert Uytterhoeven 10 months ago

Hi Andrew,

On Tue, 4 Mar 2025 at 13:02, Andrew Jones <ajones@ventanamicro.com> wrote:
> Allow skipping scalar and vector unaligned access speed tests. This
> is useful for testing alternative code paths and to skip the tests in
> environments where they run too slowly. All CPUs must have the same
> unaligned access speed.
>
> The code movement is because we now need the scalar cpu hotplug
> callback to always run, so we need to bring it and its supporting
> functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
>
> Signed-off-by: Andrew Jones <ajones@ventanamicro.com>

> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c

>  static int __init check_unaligned_access_all_cpus(void)
>  {
>         int cpu;
>
> -       if (!check_unaligned_access_emulated_all_cpus())
> +       if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> +           !check_unaligned_access_emulated_all_cpus()) {
>                 check_unaligned_access_speed_all_cpus();
> -
> -       if (!has_vector()) {
> +       } else {
> +               pr_info("scalar unaligned access speed set to '%s' by command line\n",
> +                       speed_str[unaligned_scalar_speed_param]);
>                 for_each_online_cpu(cpu)
> -                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> -       } else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> -                  IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> +                       per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> +       }
> +
> +       if (!has_vector())
> +               unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> +
> +       if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> +           !check_vector_unaligned_access_emulated_all_cpus() &&
> +           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
>                 kthread_run(vec_check_unaligned_access_speed_all_cpus,
>                             NULL, "vec_check_unaligned_access_speed_all_cpus");
> +       } else {
> +               pr_info("vector unaligned access speed set to '%s' by command line\n",
> +                       speed_str[unaligned_vector_speed_param]);

On SiPEED MAiXBiT, unaligned_scalar_speed_param is zero, and it prints:

    scalar unaligned access speed set to '(null)' by command line


> +               for_each_online_cpu(cpu)
> +                       per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
>         }
>

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months ago

On Tue, Apr 08, 2025 at 02:25:12PM +0200, Geert Uytterhoeven wrote:
> Hi Andrew,
> 
> On Tue, 4 Mar 2025 at 13:02, Andrew Jones <ajones@ventanamicro.com> wrote:
> > Allow skipping scalar and vector unaligned access speed tests. This
> > is useful for testing alternative code paths and to skip the tests in
> > environments where they run too slowly. All CPUs must have the same
> > unaligned access speed.
> >
> > The code movement is because we now need the scalar cpu hotplug
> > callback to always run, so we need to bring it and its supporting
> > functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
> >
> > Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
> 
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> 
> >  static int __init check_unaligned_access_all_cpus(void)
> >  {
> >         int cpu;
> >
> > -       if (!check_unaligned_access_emulated_all_cpus())
> > +       if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> > +           !check_unaligned_access_emulated_all_cpus()) {
> >                 check_unaligned_access_speed_all_cpus();
> > -
> > -       if (!has_vector()) {
> > +       } else {
> > +               pr_info("scalar unaligned access speed set to '%s' by command line\n",
> > +                       speed_str[unaligned_scalar_speed_param]);
> >                 for_each_online_cpu(cpu)
> > -                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > -       } else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> > -                  IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> > +                       per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> > +       }
> > +
> > +       if (!has_vector())
> > +               unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > +
> > +       if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> > +           !check_vector_unaligned_access_emulated_all_cpus() &&
> > +           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> >                 kthread_run(vec_check_unaligned_access_speed_all_cpus,
> >                             NULL, "vec_check_unaligned_access_speed_all_cpus");
> > +       } else {
> > +               pr_info("vector unaligned access speed set to '%s' by command line\n",
> > +                       speed_str[unaligned_vector_speed_param]);
> 
> On SiPEED MAiXBiT, unaligned_scalar_speed_param is zero, and it prints:
> 
>     scalar unaligned access speed set to '(null)' by command line

Thanks, Geert. I think unaligned_scalar_speed_param is likely 1 in this
case and we should be printing 'emulated', but I neglected to add that
string to speed_str[].

I'll fix this too.

Thanks,
drew

> 
> 
> > +               for_each_online_cpu(cpu)
> > +                       per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
> >         }
> >
> 
> Gr{oetje,eeting}s,
> 
>                         Geert
> 
> -- 
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                 -- Linus Torvalds

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Geert Uytterhoeven 10 months ago

Hi Andrew,

On Tue, 8 Apr 2025 at 15:03, Andrew Jones <ajones@ventanamicro.com> wrote:
> On Tue, Apr 08, 2025 at 02:25:12PM +0200, Geert Uytterhoeven wrote:
> > On Tue, 4 Mar 2025 at 13:02, Andrew Jones <ajones@ventanamicro.com> wrote:
> > > Allow skipping scalar and vector unaligned access speed tests. This
> > > is useful for testing alternative code paths and to skip the tests in
> > > environments where they run too slowly. All CPUs must have the same
> > > unaligned access speed.
> > >
> > > The code movement is because we now need the scalar cpu hotplug
> > > callback to always run, so we need to bring it and its supporting
> > > functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
> > >
> > > Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
> >
> > > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> >
> > >  static int __init check_unaligned_access_all_cpus(void)
> > >  {
> > >         int cpu;
> > >
> > > -       if (!check_unaligned_access_emulated_all_cpus())
> > > +       if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> > > +           !check_unaligned_access_emulated_all_cpus()) {
> > >                 check_unaligned_access_speed_all_cpus();
> > > -
> > > -       if (!has_vector()) {
> > > +       } else {
> > > +               pr_info("scalar unaligned access speed set to '%s' by command line\n",
> > > +                       speed_str[unaligned_scalar_speed_param]);
> > >                 for_each_online_cpu(cpu)
> > > -                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > > -       } else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> > > -                  IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> > > +                       per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> > > +       }
> > > +
> > > +       if (!has_vector())
> > > +               unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > > +
> > > +       if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> > > +           !check_vector_unaligned_access_emulated_all_cpus() &&
> > > +           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> > >                 kthread_run(vec_check_unaligned_access_speed_all_cpus,
> > >                             NULL, "vec_check_unaligned_access_speed_all_cpus");
> > > +       } else {
> > > +               pr_info("vector unaligned access speed set to '%s' by command line\n",
> > > +                       speed_str[unaligned_vector_speed_param]);
> >
> > On SiPEED MAiXBiT, unaligned_scalar_speed_param is zero, and it prints:
> >
> >     scalar unaligned access speed set to '(null)' by command line
>
> Thanks, Geert. I think unaligned_scalar_speed_param is likely 1 in this
> case and we should be printing 'emulated', but I neglected to add that
> string to speed_str[].

No, the value of unaligned_scalar_speed_param is zero.

> I'll fix this too.

Thanks!

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Geert Uytterhoeven 10 months, 1 week ago

Hi Andrew,

On Tue, 4 Mar 2025 at 13:02, Andrew Jones <ajones@ventanamicro.com> wrote:
> Allow skipping scalar and vector unaligned access speed tests. This
> is useful for testing alternative code paths and to skip the tests in
> environments where they run too slowly. All CPUs must have the same
> unaligned access speed.
>
> The code movement is because we now need the scalar cpu hotplug
> callback to always run, so we need to bring it and its supporting
> functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
>
> Signed-off-by: Andrew Jones <ajones@ventanamicro.com>

Thanks for your patch, which is now commit aecb09e091dc1433
("riscv: Add parameter for skipping access speed tests") in
v6.15-rc1.

> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c

>  static int __init check_unaligned_access_all_cpus(void)
>  {
>         int cpu;
>
> -       if (!check_unaligned_access_emulated_all_cpus())
> +       if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> +           !check_unaligned_access_emulated_all_cpus()) {
>                 check_unaligned_access_speed_all_cpus();
> -
> -       if (!has_vector()) {
> +       } else {
> +               pr_info("scalar unaligned access speed set to '%s' by command line\n",
> +                       speed_str[unaligned_scalar_speed_param]);
>                 for_each_online_cpu(cpu)
> -                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> -       } else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> -                  IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> +                       per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> +       }
> +
> +       if (!has_vector())
> +               unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> +
> +       if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> +           !check_vector_unaligned_access_emulated_all_cpus() &&
> +           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
>                 kthread_run(vec_check_unaligned_access_speed_all_cpus,
>                             NULL, "vec_check_unaligned_access_speed_all_cpus");
> +       } else {
> +               pr_info("vector unaligned access speed set to '%s' by command line\n",
> +                       speed_str[unaligned_vector_speed_param]);
> +               for_each_online_cpu(cpu)
> +                       per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
>         }

On RZ/Five:

    vector unaligned access speed set to 'unsupported' by command line

However, this is not set on my command line?

Apparently this can be set using three different methods:
  1. It is the default value in the declaration of vector_misaligned_access,
  2. From the handle_vector_misaligned_load() exception handler,
  3. From the command line.
Hence the current kernel message is rather confusing...

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months ago

Hi Geert,

On Mon, Apr 07, 2025 at 11:49:59AM +0200, Geert Uytterhoeven wrote:
> Hi Andrew,
> 
> On Tue, 4 Mar 2025 at 13:02, Andrew Jones <ajones@ventanamicro.com> wrote:
> > Allow skipping scalar and vector unaligned access speed tests. This
> > is useful for testing alternative code paths and to skip the tests in
> > environments where they run too slowly. All CPUs must have the same
> > unaligned access speed.
> >
> > The code movement is because we now need the scalar cpu hotplug
> > callback to always run, so we need to bring it and its supporting
> > functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
> >
> > Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
> 
> Thanks for your patch, which is now commit aecb09e091dc1433
> ("riscv: Add parameter for skipping access speed tests") in
> v6.15-rc1.
> 
> > --- a/arch/riscv/kernel/unaligned_access_speed.c
> > +++ b/arch/riscv/kernel/unaligned_access_speed.c
> 
> >  static int __init check_unaligned_access_all_cpus(void)
> >  {
> >         int cpu;
> >
> > -       if (!check_unaligned_access_emulated_all_cpus())
> > +       if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> > +           !check_unaligned_access_emulated_all_cpus()) {
> >                 check_unaligned_access_speed_all_cpus();
> > -
> > -       if (!has_vector()) {
> > +       } else {
> > +               pr_info("scalar unaligned access speed set to '%s' by command line\n",
> > +                       speed_str[unaligned_scalar_speed_param]);
> >                 for_each_online_cpu(cpu)
> > -                       per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > -       } else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> > -                  IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> > +                       per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> > +       }
> > +
> > +       if (!has_vector())
> > +               unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> > +
> > +       if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> > +           !check_vector_unaligned_access_emulated_all_cpus() &&
> > +           IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> >                 kthread_run(vec_check_unaligned_access_speed_all_cpus,
> >                             NULL, "vec_check_unaligned_access_speed_all_cpus");
> > +       } else {
> > +               pr_info("vector unaligned access speed set to '%s' by command line\n",
> > +                       speed_str[unaligned_vector_speed_param]);
> > +               for_each_online_cpu(cpu)
> > +                       per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
> >         }
> 
> On RZ/Five:
> 
>     vector unaligned access speed set to 'unsupported' by command line
> 
> However, this is not set on my command line?
> 
> Apparently this can be set using three different methods:
>   1. It is the default value in the declaration of vector_misaligned_access,
>   2. From the handle_vector_misaligned_load() exception handler,
>   3. From the command line.
> Hence the current kernel message is rather confusing...

Thanks for the report.

The three ways above are OK, since (1) sets it to 'unknown' which means
"not yet set" (by command line or otherwise), (2) doesn't actually touch
unaligned_vector_speed_param, just its per-cpu counterpart. And the
message applies to (3). However, there's a (4) which I added without
considering the message and that's the 'if (!has_vector())' part of the
hunk above, which sets 'unsupported', as you're seeing, when vector is
not present.

I'll send a patch that ensures we only get the message for truly command
line set states.

Thanks,
drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Alexandre Ghiti 10 months, 3 weeks ago

Hi Drew,

On 04/03/2025 13:00, Andrew Jones wrote:
> Allow skipping scalar and vector unaligned access speed tests. This
> is useful for testing alternative code paths and to skip the tests in
> environments where they run too slowly. All CPUs must have the same
> unaligned access speed.

I'm not a big fan of the command line parameter, this is not where we 
should push uarch decisions because there could be many other in the 
future, the best solution to me should be in DT/ACPI and since the DT 
folks, according to Palmer, shut down this solution, it remains using an 
extension.

I have been reading a bit about unaligned accesses. Zicclsm was 
described as "Even though mandated, misaligned loads and stores might 
execute extremely slowly. Standard software distributions should assume 
their existence only for correctness, not for performance." in rva20/22 
but *not* in rva23. So what about using this "hole" and consider that a 
platform that *advertises* Zicclsm means its unaligned accesses are 
fast? After internal discussion, It actually does not make sense to 
advertise Zicclsm if the platform accesses are slow right?

arm64 for example considers that armv8 has fast unaligned accesses and 
can then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though 
some uarchs are slow. Distros will very likely use rva23 as baseline so 
they will enable Zicclsm which would allow us to take advantage of this 
too, without this, we lose a lot of perf improvement in the kernel, see 
https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.

Or we could have a new named feature for this, even though it's weird to 
have a named feature which would basically  mean "Zicclsm is fast". We 
don't have, for example, a named feature to say "Zicboz is fast" but 
given the vague wording in the profile spec, maybe we can ask for one in 
that case?

Sorry for the late review and for triggering this debate...

Thanks,

Alex


>
> The code movement is because we now need the scalar cpu hotplug
> callback to always run, so we need to bring it and its supporting
> functions out of CONFIG_RISCV_PROBE_UNALIGNED_ACCESS.
>
> Signed-off-by: Andrew Jones <ajones@ventanamicro.com>
> ---
>   arch/riscv/kernel/unaligned_access_speed.c | 187 +++++++++++++--------
>   1 file changed, 121 insertions(+), 66 deletions(-)
>
> diff --git a/arch/riscv/kernel/unaligned_access_speed.c b/arch/riscv/kernel/unaligned_access_speed.c
> index d9d4ca1fadc7..18e334549544 100644
> --- a/arch/riscv/kernel/unaligned_access_speed.c
> +++ b/arch/riscv/kernel/unaligned_access_speed.c
> @@ -24,8 +24,12 @@
>   DEFINE_PER_CPU(long, misaligned_access_speed) = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
>   DEFINE_PER_CPU(long, vector_misaligned_access) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
>   
> -#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> +static long unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN;
> +static long unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN;
> +
>   static cpumask_t fast_misaligned_access;
> +
> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>   static int check_unaligned_access(void *param)
>   {
>   	int cpu = smp_processor_id();
> @@ -130,6 +134,50 @@ static void __init check_unaligned_access_nonboot_cpu(void *param)
>   		check_unaligned_access(pages[cpu]);
>   }
>   
> +/* Measure unaligned access speed on all CPUs present at boot in parallel. */
> +static void __init check_unaligned_access_speed_all_cpus(void)
> +{
> +	unsigned int cpu;
> +	unsigned int cpu_count = num_possible_cpus();
> +	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
> +
> +	if (!bufs) {
> +		pr_warn("Allocation failure, not measuring misaligned performance\n");
> +		return;
> +	}
> +
> +	/*
> +	 * Allocate separate buffers for each CPU so there's no fighting over
> +	 * cache lines.
> +	 */
> +	for_each_cpu(cpu, cpu_online_mask) {
> +		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> +		if (!bufs[cpu]) {
> +			pr_warn("Allocation failure, not measuring misaligned performance\n");
> +			goto out;
> +		}
> +	}
> +
> +	/* Check everybody except 0, who stays behind to tend jiffies. */
> +	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> +
> +	/* Check core 0. */
> +	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> +
> +out:
> +	for_each_cpu(cpu, cpu_online_mask) {
> +		if (bufs[cpu])
> +			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> +	}
> +
> +	kfree(bufs);
> +}
> +#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
> +static void __init check_unaligned_access_speed_all_cpus(void)
> +{
> +}
> +#endif
> +
>   DEFINE_STATIC_KEY_FALSE(fast_unaligned_access_speed_key);
>   
>   static void modify_unaligned_access_branches(cpumask_t *mask, int weight)
> @@ -188,21 +236,29 @@ arch_initcall_sync(lock_and_set_unaligned_access_static_branch);
>   
>   static int riscv_online_cpu(unsigned int cpu)
>   {
> -	static struct page *buf;
> -
>   	/* We are already set since the last check */
> -	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN)
> +	if (per_cpu(misaligned_access_speed, cpu) != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
> +		goto exit;
> +	} else if (unaligned_scalar_speed_param != RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN) {
> +		per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
>   		goto exit;
> -
> -	check_unaligned_access_emulated(NULL);
> -	buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> -	if (!buf) {
> -		pr_warn("Allocation failure, not measuring misaligned performance\n");
> -		return -ENOMEM;
>   	}
>   
> -	check_unaligned_access(buf);
> -	__free_pages(buf, MISALIGNED_BUFFER_ORDER);
> +#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
> +	{
> +		static struct page *buf;
> +
> +		check_unaligned_access_emulated(NULL);
> +		buf = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> +		if (!buf) {
> +			pr_warn("Allocation failure, not measuring misaligned performance\n");
> +			return -ENOMEM;
> +		}
> +
> +		check_unaligned_access(buf);
> +		__free_pages(buf, MISALIGNED_BUFFER_ORDER);
> +	}
> +#endif
>   
>   exit:
>   	set_unaligned_access_static_branches();
> @@ -217,50 +273,6 @@ static int riscv_offline_cpu(unsigned int cpu)
>   	return 0;
>   }
>   
> -/* Measure unaligned access speed on all CPUs present at boot in parallel. */
> -static void __init check_unaligned_access_speed_all_cpus(void)
> -{
> -	unsigned int cpu;
> -	unsigned int cpu_count = num_possible_cpus();
> -	struct page **bufs = kcalloc(cpu_count, sizeof(*bufs), GFP_KERNEL);
> -
> -	if (!bufs) {
> -		pr_warn("Allocation failure, not measuring misaligned performance\n");
> -		return;
> -	}
> -
> -	/*
> -	 * Allocate separate buffers for each CPU so there's no fighting over
> -	 * cache lines.
> -	 */
> -	for_each_cpu(cpu, cpu_online_mask) {
> -		bufs[cpu] = alloc_pages(GFP_KERNEL, MISALIGNED_BUFFER_ORDER);
> -		if (!bufs[cpu]) {
> -			pr_warn("Allocation failure, not measuring misaligned performance\n");
> -			goto out;
> -		}
> -	}
> -
> -	/* Check everybody except 0, who stays behind to tend jiffies. */
> -	on_each_cpu(check_unaligned_access_nonboot_cpu, bufs, 1);
> -
> -	/* Check core 0. */
> -	smp_call_on_cpu(0, check_unaligned_access, bufs[0], true);
> -
> -out:
> -	for_each_cpu(cpu, cpu_online_mask) {
> -		if (bufs[cpu])
> -			__free_pages(bufs[cpu], MISALIGNED_BUFFER_ORDER);
> -	}
> -
> -	kfree(bufs);
> -}
> -#else /* CONFIG_RISCV_PROBE_UNALIGNED_ACCESS */
> -static void __init check_unaligned_access_speed_all_cpus(void)
> -{
> -}
> -#endif
> -
>   #ifdef CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS
>   static void check_vector_unaligned_access(struct work_struct *work __always_unused)
>   {
> @@ -372,8 +384,8 @@ static int __init vec_check_unaligned_access_speed_all_cpus(void *unused __alway
>   
>   static int riscv_online_cpu_vec(unsigned int cpu)
>   {
> -	if (!has_vector()) {
> -		per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> +	if (unaligned_vector_speed_param != RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN) {
> +		per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
>   		return 0;
>   	}
>   
> @@ -388,30 +400,73 @@ static int riscv_online_cpu_vec(unsigned int cpu)
>   	return 0;
>   }
>   
> +static const char * const speed_str[] __initconst = { NULL, NULL, "slow", "fast", "unsupported" };
> +
> +static int __init set_unaligned_scalar_speed_param(char *str)
> +{
> +	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW]))
> +		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_SLOW;
> +	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_FAST]))
> +		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_FAST;
> +	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED]))
> +		unaligned_scalar_speed_param = RISCV_HWPROBE_MISALIGNED_SCALAR_UNSUPPORTED;
> +	else
> +		return -EINVAL;
> +
> +	return 1;
> +}
> +__setup("unaligned_scalar_speed=", set_unaligned_scalar_speed_param);
> +
> +static int __init set_unaligned_vector_speed_param(char *str)
> +{
> +	if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW]))
> +		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_SLOW;
> +	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_FAST]))
> +		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_FAST;
> +	else if (!strcmp(str, speed_str[RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED]))
> +		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> +	else
> +		return -EINVAL;
> +
> +	return 1;
> +}
> +__setup("unaligned_vector_speed=", set_unaligned_vector_speed_param);
> +
>   static int __init check_unaligned_access_all_cpus(void)
>   {
>   	int cpu;
>   
> -	if (!check_unaligned_access_emulated_all_cpus())
> +	if (unaligned_scalar_speed_param == RISCV_HWPROBE_MISALIGNED_SCALAR_UNKNOWN &&
> +	    !check_unaligned_access_emulated_all_cpus()) {
>   		check_unaligned_access_speed_all_cpus();
> -
> -	if (!has_vector()) {
> +	} else {
> +		pr_info("scalar unaligned access speed set to '%s' by command line\n",
> +			speed_str[unaligned_scalar_speed_param]);
>   		for_each_online_cpu(cpu)
> -			per_cpu(vector_misaligned_access, cpu) = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> -	} else if (!check_vector_unaligned_access_emulated_all_cpus() &&
> -		   IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
> +			per_cpu(misaligned_access_speed, cpu) = unaligned_scalar_speed_param;
> +	}
> +
> +	if (!has_vector())
> +		unaligned_vector_speed_param = RISCV_HWPROBE_MISALIGNED_VECTOR_UNSUPPORTED;
> +
> +	if (unaligned_vector_speed_param == RISCV_HWPROBE_MISALIGNED_VECTOR_UNKNOWN &&
> +	    !check_vector_unaligned_access_emulated_all_cpus() &&
> +	    IS_ENABLED(CONFIG_RISCV_PROBE_VECTOR_UNALIGNED_ACCESS)) {
>   		kthread_run(vec_check_unaligned_access_speed_all_cpus,
>   			    NULL, "vec_check_unaligned_access_speed_all_cpus");
> +	} else {
> +		pr_info("vector unaligned access speed set to '%s' by command line\n",
> +			speed_str[unaligned_vector_speed_param]);
> +		for_each_online_cpu(cpu)
> +			per_cpu(vector_misaligned_access, cpu) = unaligned_vector_speed_param;
>   	}
>   
>   	/*
>   	 * Setup hotplug callbacks for any new CPUs that come online or go
>   	 * offline.
>   	 */
> -#ifdef CONFIG_RISCV_PROBE_UNALIGNED_ACCESS
>   	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
>   				  riscv_online_cpu, riscv_offline_cpu);
> -#endif
>   	cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "riscv:online",
>   				  riscv_online_cpu_vec, NULL);
>

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> Hi Drew,
> 
> On 04/03/2025 13:00, Andrew Jones wrote:
> > Allow skipping scalar and vector unaligned access speed tests. This
> > is useful for testing alternative code paths and to skip the tests in
> > environments where they run too slowly. All CPUs must have the same
> > unaligned access speed.
> 
> I'm not a big fan of the command line parameter, this is not where we should
> push uarch decisions because there could be many other in the future, the
> best solution to me should be in DT/ACPI and since the DT folks, according
> to Palmer, shut down this solution, it remains using an extension.
> 
> I have been reading a bit about unaligned accesses. Zicclsm was described as
> "Even though mandated, misaligned loads and stores might execute extremely
> slowly. Standard software distributions should assume their existence only
> for correctness, not for performance." in rva20/22 but *not* in rva23. So
> what about using this "hole" and consider that a platform that *advertises*
> Zicclsm means its unaligned accesses are fast? After internal discussion, It
> actually does not make sense to advertise Zicclsm if the platform accesses
> are slow right?

This topic pops up every so often, including in yesterday's server
platform TG call. In that call, and, afaict, every other time it has
popped up, the result is to reiterate that ISA extensions never say
anything about performance. So, Zicclsm will never mean fast and we
won't likely be able to add any extension that does.

> 
> arm64 for example considers that armv8 has fast unaligned accesses and can
> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> uarchs are slow. Distros will very likely use rva23 as baseline so they will
> enable Zicclsm which would allow us to take advantage of this too, without
> this, we lose a lot of perf improvement in the kernel, see
> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> 
> Or we could have a new named feature for this, even though it's weird to
> have a named feature which would basically  mean "Zicclsm is fast". We don't
> have, for example, a named feature to say "Zicboz is fast" but given the
> vague wording in the profile spec, maybe we can ask for one in that case?
> 
> Sorry for the late review and for triggering this debate...

No problem, let's try to pick the best option. I'll try listing all the
options and there pros/cons.

1. Leave as is, which is to always probe
   pro: Nothing to do
   con: Not ideal in all environments

2. New DT/ACPI description
   pro: Describing whether or not misaligned accesses are implemented in
        HW (which presumably means fast) is something that should be done
	in HW descriptions
   con: We'll need to live with probing until we can get the descriptions
        defined, which may be never if there's too much opposition

3. Command line
   pro: Easy and serves its purpose, which is to skip probing in the
        environments where probing is not desired
   con: Yet another command line option (which we may want to deprecate
        someday)

4. New ISA extension
   pro: Easy to add to HW descriptions
   con: Not likely to get it through ratification

5. New SBI FWFT feature
   pro: Probably easier to get through ratification than an ISA extension
   con: Instead of probing, kernel would have to ask SBI -- would that
        even be faster? Will all the environments that want to skip
	probing even have a complete SBI?

6. ??

I'm voting for (3), which is why I posted this patchset, but I'm happy
to hear other votes or other proposals and discuss.

Thanks,
drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Alexandre Ghiti 10 months, 3 weeks ago

On 18/03/2025 09:48, Andrew Jones wrote:
> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
>> Hi Drew,
>>
>> On 04/03/2025 13:00, Andrew Jones wrote:
>>> Allow skipping scalar and vector unaligned access speed tests. This
>>> is useful for testing alternative code paths and to skip the tests in
>>> environments where they run too slowly. All CPUs must have the same
>>> unaligned access speed.
>> I'm not a big fan of the command line parameter, this is not where we should
>> push uarch decisions because there could be many other in the future, the
>> best solution to me should be in DT/ACPI and since the DT folks, according
>> to Palmer, shut down this solution, it remains using an extension.
>>
>> I have been reading a bit about unaligned accesses. Zicclsm was described as
>> "Even though mandated, misaligned loads and stores might execute extremely
>> slowly. Standard software distributions should assume their existence only
>> for correctness, not for performance." in rva20/22 but *not* in rva23. So
>> what about using this "hole" and consider that a platform that *advertises*
>> Zicclsm means its unaligned accesses are fast? After internal discussion, It
>> actually does not make sense to advertise Zicclsm if the platform accesses
>> are slow right?
> This topic pops up every so often, including in yesterday's server
> platform TG call. In that call, and, afaict, every other time it has
> popped up, the result is to reiterate that ISA extensions never say
> anything about performance. So, Zicclsm will never mean fast and we
> won't likely be able to add any extension that does.


Ok, I should not say "fast". Usually, when an extension is advertised by 
a platform, we don't question its speed (zicboz, zicbom...etc), we 
simply use it and it's up to the vendor to benchmark its implementation 
and act accordingly (i.e. do not set it in the isa string).


>> arm64 for example considers that armv8 has fast unaligned accesses and can
>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
>> uarchs are slow. Distros will very likely use rva23 as baseline so they will
>> enable Zicclsm which would allow us to take advantage of this too, without
>> this, we lose a lot of perf improvement in the kernel, see
>> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
>>
>> Or we could have a new named feature for this, even though it's weird to
>> have a named feature which would basically  mean "Zicclsm is fast". We don't
>> have, for example, a named feature to say "Zicboz is fast" but given the
>> vague wording in the profile spec, maybe we can ask for one in that case?
>>
>> Sorry for the late review and for triggering this debate...
> No problem, let's try to pick the best option. I'll try listing all the
> options and there pros/cons.
>
> 1. Leave as is, which is to always probe
>     pro: Nothing to do
>     con: Not ideal in all environments
>
> 2. New DT/ACPI description
>     pro: Describing whether or not misaligned accesses are implemented in
>          HW (which presumably means fast) is something that should be done
> 	in HW descriptions
>     con: We'll need to live with probing until we can get the descriptions
>          defined, which may be never if there's too much opposition
>
> 3. Command line
>     pro: Easy and serves its purpose, which is to skip probing in the
>          environments where probing is not desired
>     con: Yet another command line option (which we may want to deprecate
>          someday)
>
> 4. New ISA extension
>     pro: Easy to add to HW descriptions
>     con: Not likely to get it through ratification
>
> 5. New SBI FWFT feature
>     pro: Probably easier to get through ratification than an ISA extension
>     con: Instead of probing, kernel would have to ask SBI -- would that
>          even be faster? Will all the environments that want to skip
> 	probing even have a complete SBI?
>
> 6. ??


So what about:

7. New enum value describing the performance as "FORCED" or "HW" (or 
anything better)
     pro: We only use the existing Zicclsm
     con: It's not clear that the accesses are fast but it basically 
says to SW "don't think too much, I'm telling you that you can use it", 
up to us to describe this correctly for users to understand.

Thanks,

Alex


>
> I'm voting for (3), which is why I posted this patchset, but I'm happy
> to hear other votes or other proposals and discuss.
>
> Thanks,
> drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
> 
> On 18/03/2025 09:48, Andrew Jones wrote:
> > On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> > > Hi Drew,
> > > 
> > > On 04/03/2025 13:00, Andrew Jones wrote:
> > > > Allow skipping scalar and vector unaligned access speed tests. This
> > > > is useful for testing alternative code paths and to skip the tests in
> > > > environments where they run too slowly. All CPUs must have the same
> > > > unaligned access speed.
> > > I'm not a big fan of the command line parameter, this is not where we should
> > > push uarch decisions because there could be many other in the future, the
> > > best solution to me should be in DT/ACPI and since the DT folks, according
> > > to Palmer, shut down this solution, it remains using an extension.
> > > 
> > > I have been reading a bit about unaligned accesses. Zicclsm was described as
> > > "Even though mandated, misaligned loads and stores might execute extremely
> > > slowly. Standard software distributions should assume their existence only
> > > for correctness, not for performance." in rva20/22 but *not* in rva23. So
> > > what about using this "hole" and consider that a platform that *advertises*
> > > Zicclsm means its unaligned accesses are fast? After internal discussion, It
> > > actually does not make sense to advertise Zicclsm if the platform accesses
> > > are slow right?
> > This topic pops up every so often, including in yesterday's server
> > platform TG call. In that call, and, afaict, every other time it has
> > popped up, the result is to reiterate that ISA extensions never say
> > anything about performance. So, Zicclsm will never mean fast and we
> > won't likely be able to add any extension that does.
> 
> 
> Ok, I should not say "fast". Usually, when an extension is advertised by a
> platform, we don't question its speed (zicboz, zicbom...etc), we simply use
> it and it's up to the vendor to benchmark its implementation and act
> accordingly (i.e. do not set it in the isa string).
> 
> 
> > > arm64 for example considers that armv8 has fast unaligned accesses and can
> > > then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> > > uarchs are slow. Distros will very likely use rva23 as baseline so they will
> > > enable Zicclsm which would allow us to take advantage of this too, without
> > > this, we lose a lot of perf improvement in the kernel, see
> > > https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> > > 
> > > Or we could have a new named feature for this, even though it's weird to
> > > have a named feature which would basically  mean "Zicclsm is fast". We don't
> > > have, for example, a named feature to say "Zicboz is fast" but given the
> > > vague wording in the profile spec, maybe we can ask for one in that case?
> > > 
> > > Sorry for the late review and for triggering this debate...
> > No problem, let's try to pick the best option. I'll try listing all the
> > options and there pros/cons.
> > 
> > 1. Leave as is, which is to always probe
> >     pro: Nothing to do
> >     con: Not ideal in all environments
> > 
> > 2. New DT/ACPI description
> >     pro: Describing whether or not misaligned accesses are implemented in
> >          HW (which presumably means fast) is something that should be done
> > 	in HW descriptions
> >     con: We'll need to live with probing until we can get the descriptions
> >          defined, which may be never if there's too much opposition
> > 
> > 3. Command line
> >     pro: Easy and serves its purpose, which is to skip probing in the
> >          environments where probing is not desired
> >     con: Yet another command line option (which we may want to deprecate
> >          someday)
> > 
> > 4. New ISA extension
> >     pro: Easy to add to HW descriptions
> >     con: Not likely to get it through ratification
> > 
> > 5. New SBI FWFT feature
> >     pro: Probably easier to get through ratification than an ISA extension
> >     con: Instead of probing, kernel would have to ask SBI -- would that
> >          even be faster? Will all the environments that want to skip
> > 	probing even have a complete SBI?
> > 
> > 6. ??
> 
> 
> So what about:
> 
> 7. New enum value describing the performance as "FORCED" or "HW" (or
> anything better)
>     pro: We only use the existing Zicclsm
>     con: It's not clear that the accesses are fast but it basically says to
> SW "don't think too much, I'm telling you that you can use it", up to us to
> describe this correctly for users to understand.

But Zicclsm doesn't mean misaligned accesses are in HW, it just means
they're not going to explode. We'd still need the probing to find out
if the accesses are emulated (slow) or hw (fast). We at least want to
know the answer to that question because we advertise it to userspace
through hwprobe.

(BTW, another pro of the command line is that it can be used to test
both slow and fast paths without recompiling.)

Thanks,
drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Alexandre Ghiti 10 months, 3 weeks ago

On 18/03/2025 13:45, Andrew Jones wrote:
> On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
>> On 18/03/2025 09:48, Andrew Jones wrote:
>>> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
>>>> Hi Drew,
>>>>
>>>> On 04/03/2025 13:00, Andrew Jones wrote:
>>>>> Allow skipping scalar and vector unaligned access speed tests. This
>>>>> is useful for testing alternative code paths and to skip the tests in
>>>>> environments where they run too slowly. All CPUs must have the same
>>>>> unaligned access speed.
>>>> I'm not a big fan of the command line parameter, this is not where we should
>>>> push uarch decisions because there could be many other in the future, the
>>>> best solution to me should be in DT/ACPI and since the DT folks, according
>>>> to Palmer, shut down this solution, it remains using an extension.
>>>>
>>>> I have been reading a bit about unaligned accesses. Zicclsm was described as
>>>> "Even though mandated, misaligned loads and stores might execute extremely
>>>> slowly. Standard software distributions should assume their existence only
>>>> for correctness, not for performance." in rva20/22 but *not* in rva23. So
>>>> what about using this "hole" and consider that a platform that *advertises*
>>>> Zicclsm means its unaligned accesses are fast? After internal discussion, It
>>>> actually does not make sense to advertise Zicclsm if the platform accesses
>>>> are slow right?
>>> This topic pops up every so often, including in yesterday's server
>>> platform TG call. In that call, and, afaict, every other time it has
>>> popped up, the result is to reiterate that ISA extensions never say
>>> anything about performance. So, Zicclsm will never mean fast and we
>>> won't likely be able to add any extension that does.
>>
>> Ok, I should not say "fast". Usually, when an extension is advertised by a
>> platform, we don't question its speed (zicboz, zicbom...etc), we simply use
>> it and it's up to the vendor to benchmark its implementation and act
>> accordingly (i.e. do not set it in the isa string).
>>
>>
>>>> arm64 for example considers that armv8 has fast unaligned accesses and can
>>>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
>>>> uarchs are slow. Distros will very likely use rva23 as baseline so they will
>>>> enable Zicclsm which would allow us to take advantage of this too, without
>>>> this, we lose a lot of perf improvement in the kernel, see
>>>> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
>>>>
>>>> Or we could have a new named feature for this, even though it's weird to
>>>> have a named feature which would basically  mean "Zicclsm is fast". We don't
>>>> have, for example, a named feature to say "Zicboz is fast" but given the
>>>> vague wording in the profile spec, maybe we can ask for one in that case?
>>>>
>>>> Sorry for the late review and for triggering this debate...
>>> No problem, let's try to pick the best option. I'll try listing all the
>>> options and there pros/cons.
>>>
>>> 1. Leave as is, which is to always probe
>>>      pro: Nothing to do
>>>      con: Not ideal in all environments
>>>
>>> 2. New DT/ACPI description
>>>      pro: Describing whether or not misaligned accesses are implemented in
>>>           HW (which presumably means fast) is something that should be done
>>> 	in HW descriptions
>>>      con: We'll need to live with probing until we can get the descriptions
>>>           defined, which may be never if there's too much opposition
>>>
>>> 3. Command line
>>>      pro: Easy and serves its purpose, which is to skip probing in the
>>>           environments where probing is not desired
>>>      con: Yet another command line option (which we may want to deprecate
>>>           someday)
>>>
>>> 4. New ISA extension
>>>      pro: Easy to add to HW descriptions
>>>      con: Not likely to get it through ratification
>>>
>>> 5. New SBI FWFT feature
>>>      pro: Probably easier to get through ratification than an ISA extension
>>>      con: Instead of probing, kernel would have to ask SBI -- would that
>>>           even be faster? Will all the environments that want to skip
>>> 	probing even have a complete SBI?
>>>
>>> 6. ??
>>
>> So what about:
>>
>> 7. New enum value describing the performance as "FORCED" or "HW" (or
>> anything better)
>>      pro: We only use the existing Zicclsm
>>      con: It's not clear that the accesses are fast but it basically says to
>> SW "don't think too much, I'm telling you that you can use it", up to us to
>> describe this correctly for users to understand.
> But Zicclsm doesn't mean misaligned accesses are in HW, it just means
> they're not going to explode.


They never explode since if they are not supported by the HW, we rely on 
S-mode emulation already.


> We'd still need the probing to find out
> if the accesses are emulated (slow) or hw (fast). We at least want to
> know the answer to that question because we advertise it to userspace
> through hwprobe.
>
> (BTW, another pro of the command line is that it can be used to test
> both slow and fast paths without recompiling.)
>
> Thanks,
> drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 01:58:10PM +0100, Alexandre Ghiti wrote:
> On 18/03/2025 13:45, Andrew Jones wrote:
> > On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
> > > On 18/03/2025 09:48, Andrew Jones wrote:
> > > > On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> > > > > Hi Drew,
> > > > > 
> > > > > On 04/03/2025 13:00, Andrew Jones wrote:
> > > > > > Allow skipping scalar and vector unaligned access speed tests. This
> > > > > > is useful for testing alternative code paths and to skip the tests in
> > > > > > environments where they run too slowly. All CPUs must have the same
> > > > > > unaligned access speed.
> > > > > I'm not a big fan of the command line parameter, this is not where we should
> > > > > push uarch decisions because there could be many other in the future, the
> > > > > best solution to me should be in DT/ACPI and since the DT folks, according
> > > > > to Palmer, shut down this solution, it remains using an extension.
> > > > > 
> > > > > I have been reading a bit about unaligned accesses. Zicclsm was described as
> > > > > "Even though mandated, misaligned loads and stores might execute extremely
> > > > > slowly. Standard software distributions should assume their existence only
> > > > > for correctness, not for performance." in rva20/22 but *not* in rva23. So
> > > > > what about using this "hole" and consider that a platform that *advertises*
> > > > > Zicclsm means its unaligned accesses are fast? After internal discussion, It
> > > > > actually does not make sense to advertise Zicclsm if the platform accesses
> > > > > are slow right?
> > > > This topic pops up every so often, including in yesterday's server
> > > > platform TG call. In that call, and, afaict, every other time it has
> > > > popped up, the result is to reiterate that ISA extensions never say
> > > > anything about performance. So, Zicclsm will never mean fast and we
> > > > won't likely be able to add any extension that does.
> > > 
> > > Ok, I should not say "fast". Usually, when an extension is advertised by a
> > > platform, we don't question its speed (zicboz, zicbom...etc), we simply use
> > > it and it's up to the vendor to benchmark its implementation and act
> > > accordingly (i.e. do not set it in the isa string).
> > > 
> > > 
> > > > > arm64 for example considers that armv8 has fast unaligned accesses and can
> > > > > then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> > > > > uarchs are slow. Distros will very likely use rva23 as baseline so they will
> > > > > enable Zicclsm which would allow us to take advantage of this too, without
> > > > > this, we lose a lot of perf improvement in the kernel, see
> > > > > https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> > > > > 
> > > > > Or we could have a new named feature for this, even though it's weird to
> > > > > have a named feature which would basically  mean "Zicclsm is fast". We don't
> > > > > have, for example, a named feature to say "Zicboz is fast" but given the
> > > > > vague wording in the profile spec, maybe we can ask for one in that case?
> > > > > 
> > > > > Sorry for the late review and for triggering this debate...
> > > > No problem, let's try to pick the best option. I'll try listing all the
> > > > options and there pros/cons.
> > > > 
> > > > 1. Leave as is, which is to always probe
> > > >      pro: Nothing to do
> > > >      con: Not ideal in all environments
> > > > 
> > > > 2. New DT/ACPI description
> > > >      pro: Describing whether or not misaligned accesses are implemented in
> > > >           HW (which presumably means fast) is something that should be done
> > > > 	in HW descriptions
> > > >      con: We'll need to live with probing until we can get the descriptions
> > > >           defined, which may be never if there's too much opposition
> > > > 
> > > > 3. Command line
> > > >      pro: Easy and serves its purpose, which is to skip probing in the
> > > >           environments where probing is not desired
> > > >      con: Yet another command line option (which we may want to deprecate
> > > >           someday)
> > > > 
> > > > 4. New ISA extension
> > > >      pro: Easy to add to HW descriptions
> > > >      con: Not likely to get it through ratification
> > > > 
> > > > 5. New SBI FWFT feature
> > > >      pro: Probably easier to get through ratification than an ISA extension
> > > >      con: Instead of probing, kernel would have to ask SBI -- would that
> > > >           even be faster? Will all the environments that want to skip
> > > > 	probing even have a complete SBI?
> > > > 
> > > > 6. ??
> > > 
> > > So what about:
> > > 
> > > 7. New enum value describing the performance as "FORCED" or "HW" (or
> > > anything better)
> > >      pro: We only use the existing Zicclsm
> > >      con: It's not clear that the accesses are fast but it basically says to
> > > SW "don't think too much, I'm telling you that you can use it", up to us to
> > > describe this correctly for users to understand.
> > But Zicclsm doesn't mean misaligned accesses are in HW, it just means
> > they're not going to explode.
> 
> 
> They never explode since if they are not supported by the HW, we rely on
> S-mode emulation already.

Exactly. Zicclsm is just a new name for that behavior. Profiles try to
name every behavior, even the ones we take for granted. Unfortunately,
like in the case of Zicclsm, we don't necessarily gain anything from
the new name. In this case, we don't gain a way to avoid probing.

Thanks,
drew

> 
> 
> > We'd still need the probing to find out
> > if the accesses are emulated (slow) or hw (fast). We at least want to
> > know the answer to that question because we advertise it to userspace
> > through hwprobe.
> > 
> > (BTW, another pro of the command line is that it can be used to test
> > both slow and fast paths without recompiling.)
> > 
> > Thanks,
> > drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Alexandre Ghiti 10 months, 3 weeks ago

On 18/03/2025 14:04, Andrew Jones wrote:
> On Tue, Mar 18, 2025 at 01:58:10PM +0100, Alexandre Ghiti wrote:
>> On 18/03/2025 13:45, Andrew Jones wrote:
>>> On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
>>>> On 18/03/2025 09:48, Andrew Jones wrote:
>>>>> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
>>>>>> Hi Drew,
>>>>>>
>>>>>> On 04/03/2025 13:00, Andrew Jones wrote:
>>>>>>> Allow skipping scalar and vector unaligned access speed tests. This
>>>>>>> is useful for testing alternative code paths and to skip the tests in
>>>>>>> environments where they run too slowly. All CPUs must have the same
>>>>>>> unaligned access speed.
>>>>>> I'm not a big fan of the command line parameter, this is not where we should
>>>>>> push uarch decisions because there could be many other in the future, the
>>>>>> best solution to me should be in DT/ACPI and since the DT folks, according
>>>>>> to Palmer, shut down this solution, it remains using an extension.
>>>>>>
>>>>>> I have been reading a bit about unaligned accesses. Zicclsm was described as
>>>>>> "Even though mandated, misaligned loads and stores might execute extremely
>>>>>> slowly. Standard software distributions should assume their existence only
>>>>>> for correctness, not for performance." in rva20/22 but *not* in rva23. So
>>>>>> what about using this "hole" and consider that a platform that *advertises*
>>>>>> Zicclsm means its unaligned accesses are fast? After internal discussion, It
>>>>>> actually does not make sense to advertise Zicclsm if the platform accesses
>>>>>> are slow right?
>>>>> This topic pops up every so often, including in yesterday's server
>>>>> platform TG call. In that call, and, afaict, every other time it has
>>>>> popped up, the result is to reiterate that ISA extensions never say
>>>>> anything about performance. So, Zicclsm will never mean fast and we
>>>>> won't likely be able to add any extension that does.
>>>> Ok, I should not say "fast". Usually, when an extension is advertised by a
>>>> platform, we don't question its speed (zicboz, zicbom...etc), we simply use
>>>> it and it's up to the vendor to benchmark its implementation and act
>>>> accordingly (i.e. do not set it in the isa string).
>>>>
>>>>
>>>>>> arm64 for example considers that armv8 has fast unaligned accesses and can
>>>>>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
>>>>>> uarchs are slow. Distros will very likely use rva23 as baseline so they will
>>>>>> enable Zicclsm which would allow us to take advantage of this too, without
>>>>>> this, we lose a lot of perf improvement in the kernel, see
>>>>>> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
>>>>>>
>>>>>> Or we could have a new named feature for this, even though it's weird to
>>>>>> have a named feature which would basically  mean "Zicclsm is fast". We don't
>>>>>> have, for example, a named feature to say "Zicboz is fast" but given the
>>>>>> vague wording in the profile spec, maybe we can ask for one in that case?
>>>>>>
>>>>>> Sorry for the late review and for triggering this debate...
>>>>> No problem, let's try to pick the best option. I'll try listing all the
>>>>> options and there pros/cons.
>>>>>
>>>>> 1. Leave as is, which is to always probe
>>>>>       pro: Nothing to do
>>>>>       con: Not ideal in all environments
>>>>>
>>>>> 2. New DT/ACPI description
>>>>>       pro: Describing whether or not misaligned accesses are implemented in
>>>>>            HW (which presumably means fast) is something that should be done
>>>>> 	in HW descriptions
>>>>>       con: We'll need to live with probing until we can get the descriptions
>>>>>            defined, which may be never if there's too much opposition
>>>>>
>>>>> 3. Command line
>>>>>       pro: Easy and serves its purpose, which is to skip probing in the
>>>>>            environments where probing is not desired
>>>>>       con: Yet another command line option (which we may want to deprecate
>>>>>            someday)
>>>>>
>>>>> 4. New ISA extension
>>>>>       pro: Easy to add to HW descriptions
>>>>>       con: Not likely to get it through ratification
>>>>>
>>>>> 5. New SBI FWFT feature
>>>>>       pro: Probably easier to get through ratification than an ISA extension
>>>>>       con: Instead of probing, kernel would have to ask SBI -- would that
>>>>>            even be faster? Will all the environments that want to skip
>>>>> 	probing even have a complete SBI?
>>>>>
>>>>> 6. ??
>>>> So what about:
>>>>
>>>> 7. New enum value describing the performance as "FORCED" or "HW" (or
>>>> anything better)
>>>>       pro: We only use the existing Zicclsm
>>>>       con: It's not clear that the accesses are fast but it basically says to
>>>> SW "don't think too much, I'm telling you that you can use it", up to us to
>>>> describe this correctly for users to understand.
>>> But Zicclsm doesn't mean misaligned accesses are in HW, it just means
>>> they're not going to explode.
>>
>> They never explode since if they are not supported by the HW, we rely on
>> S-mode emulation already.
> Exactly. Zicclsm is just a new name for that behavior. Profiles try to
> name every behavior, even the ones we take for granted. Unfortunately,
> like in the case of Zicclsm, we don't necessarily gain anything from
> the new name. In this case, we don't gain a way to avoid probing.


I understand your point but given the misaligned traps exist, I can't 
find another meaning to Zicclsm than "I'm telling you to use it". 
Zicclsm can't be used to describe an OS behaviour (ie the emulation of 
misaligned accesses).

I'm also insisting because we need a compile-time hint which allows us 
to enable HAVE_EFFICIENT_UNALIGNED_ACCESS in the kernel and Zicclsm is 
great since it is required in RVA23. if that's not Zicclsm, that must be 
another named feature/extension.

What do you suggest to make progress here?

Thanks,

Alex


>
> Thanks,
> drew
>
>>
>>> We'd still need the probing to find out
>>> if the accesses are emulated (slow) or hw (fast). We at least want to
>>> know the answer to that question because we advertise it to userspace
>>> through hwprobe.
>>>
>>> (BTW, another pro of the command line is that it can be used to test
>>> both slow and fast paths without recompiling.)
>>>
>>> Thanks,
>>> drew
> _______________________________________________
> linux-riscv mailing list
> linux-riscv@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-riscv

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 03:09:18PM +0100, Alexandre Ghiti wrote:
> On 18/03/2025 14:04, Andrew Jones wrote:
> > On Tue, Mar 18, 2025 at 01:58:10PM +0100, Alexandre Ghiti wrote:
> > > On 18/03/2025 13:45, Andrew Jones wrote:
> > > > On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
> > > > > On 18/03/2025 09:48, Andrew Jones wrote:
> > > > > > On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> > > > > > > Hi Drew,
> > > > > > > 
> > > > > > > On 04/03/2025 13:00, Andrew Jones wrote:
> > > > > > > > Allow skipping scalar and vector unaligned access speed tests. This
> > > > > > > > is useful for testing alternative code paths and to skip the tests in
> > > > > > > > environments where they run too slowly. All CPUs must have the same
> > > > > > > > unaligned access speed.
> > > > > > > I'm not a big fan of the command line parameter, this is not where we should
> > > > > > > push uarch decisions because there could be many other in the future, the
> > > > > > > best solution to me should be in DT/ACPI and since the DT folks, according
> > > > > > > to Palmer, shut down this solution, it remains using an extension.
> > > > > > > 
> > > > > > > I have been reading a bit about unaligned accesses. Zicclsm was described as
> > > > > > > "Even though mandated, misaligned loads and stores might execute extremely
> > > > > > > slowly. Standard software distributions should assume their existence only
> > > > > > > for correctness, not for performance." in rva20/22 but *not* in rva23. So
> > > > > > > what about using this "hole" and consider that a platform that *advertises*
> > > > > > > Zicclsm means its unaligned accesses are fast? After internal discussion, It
> > > > > > > actually does not make sense to advertise Zicclsm if the platform accesses
> > > > > > > are slow right?
> > > > > > This topic pops up every so often, including in yesterday's server
> > > > > > platform TG call. In that call, and, afaict, every other time it has
> > > > > > popped up, the result is to reiterate that ISA extensions never say
> > > > > > anything about performance. So, Zicclsm will never mean fast and we
> > > > > > won't likely be able to add any extension that does.
> > > > > Ok, I should not say "fast". Usually, when an extension is advertised by a
> > > > > platform, we don't question its speed (zicboz, zicbom...etc), we simply use
> > > > > it and it's up to the vendor to benchmark its implementation and act
> > > > > accordingly (i.e. do not set it in the isa string).
> > > > > 
> > > > > 
> > > > > > > arm64 for example considers that armv8 has fast unaligned accesses and can
> > > > > > > then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> > > > > > > uarchs are slow. Distros will very likely use rva23 as baseline so they will
> > > > > > > enable Zicclsm which would allow us to take advantage of this too, without
> > > > > > > this, we lose a lot of perf improvement in the kernel, see
> > > > > > > https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> > > > > > > 
> > > > > > > Or we could have a new named feature for this, even though it's weird to
> > > > > > > have a named feature which would basically  mean "Zicclsm is fast". We don't
> > > > > > > have, for example, a named feature to say "Zicboz is fast" but given the
> > > > > > > vague wording in the profile spec, maybe we can ask for one in that case?
> > > > > > > 
> > > > > > > Sorry for the late review and for triggering this debate...
> > > > > > No problem, let's try to pick the best option. I'll try listing all the
> > > > > > options and there pros/cons.
> > > > > > 
> > > > > > 1. Leave as is, which is to always probe
> > > > > >       pro: Nothing to do
> > > > > >       con: Not ideal in all environments
> > > > > > 
> > > > > > 2. New DT/ACPI description
> > > > > >       pro: Describing whether or not misaligned accesses are implemented in
> > > > > >            HW (which presumably means fast) is something that should be done
> > > > > > 	in HW descriptions
> > > > > >       con: We'll need to live with probing until we can get the descriptions
> > > > > >            defined, which may be never if there's too much opposition
> > > > > > 
> > > > > > 3. Command line
> > > > > >       pro: Easy and serves its purpose, which is to skip probing in the
> > > > > >            environments where probing is not desired
> > > > > >       con: Yet another command line option (which we may want to deprecate
> > > > > >            someday)
> > > > > > 
> > > > > > 4. New ISA extension
> > > > > >       pro: Easy to add to HW descriptions
> > > > > >       con: Not likely to get it through ratification
> > > > > > 
> > > > > > 5. New SBI FWFT feature
> > > > > >       pro: Probably easier to get through ratification than an ISA extension
> > > > > >       con: Instead of probing, kernel would have to ask SBI -- would that
> > > > > >            even be faster? Will all the environments that want to skip
> > > > > > 	probing even have a complete SBI?
> > > > > > 
> > > > > > 6. ??
> > > > > So what about:
> > > > > 
> > > > > 7. New enum value describing the performance as "FORCED" or "HW" (or
> > > > > anything better)
> > > > >       pro: We only use the existing Zicclsm
> > > > >       con: It's not clear that the accesses are fast but it basically says to
> > > > > SW "don't think too much, I'm telling you that you can use it", up to us to
> > > > > describe this correctly for users to understand.
> > > > But Zicclsm doesn't mean misaligned accesses are in HW, it just means
> > > > they're not going to explode.
> > > 
> > > They never explode since if they are not supported by the HW, we rely on
> > > S-mode emulation already.
> > Exactly. Zicclsm is just a new name for that behavior. Profiles try to
> > name every behavior, even the ones we take for granted. Unfortunately,
> > like in the case of Zicclsm, we don't necessarily gain anything from
> > the new name. In this case, we don't gain a way to avoid probing.
> 
> 
> I understand your point but given the misaligned traps exist, I can't find
> another meaning to Zicclsm than "I'm telling you to use it". Zicclsm can't
> be used to describe an OS behaviour (ie the emulation of misaligned
> accesses).
> 
> I'm also insisting because we need a compile-time hint which allows us to
> enable HAVE_EFFICIENT_UNALIGNED_ACCESS in the kernel and Zicclsm is great
> since it is required in RVA23. if that's not Zicclsm, that must be another
> named feature/extension.
> 
> What do you suggest to make progress here?
>

I guess you mean besides listing five options and posting patches for two
of them :-)  We can't force semantics onto Zicclsm and I doubt we'll get
agreement to make another extension with the semantics we want. So (4)
is out. I agree with Clement that (5) isn't good. That leaves (2). I
guess we should start by trying to understand what issues there were/are
with it.

Thanks,
drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Anup Patel 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 8:39 PM Andrew Jones <ajones@ventanamicro.com> wrote:
>
> On Tue, Mar 18, 2025 at 03:09:18PM +0100, Alexandre Ghiti wrote:
> > On 18/03/2025 14:04, Andrew Jones wrote:
> > > On Tue, Mar 18, 2025 at 01:58:10PM +0100, Alexandre Ghiti wrote:
> > > > On 18/03/2025 13:45, Andrew Jones wrote:
> > > > > On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
> > > > > > On 18/03/2025 09:48, Andrew Jones wrote:
> > > > > > > On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> > > > > > > > Hi Drew,
> > > > > > > >
> > > > > > > > On 04/03/2025 13:00, Andrew Jones wrote:
> > > > > > > > > Allow skipping scalar and vector unaligned access speed tests. This
> > > > > > > > > is useful for testing alternative code paths and to skip the tests in
> > > > > > > > > environments where they run too slowly. All CPUs must have the same
> > > > > > > > > unaligned access speed.
> > > > > > > > I'm not a big fan of the command line parameter, this is not where we should
> > > > > > > > push uarch decisions because there could be many other in the future, the
> > > > > > > > best solution to me should be in DT/ACPI and since the DT folks, according
> > > > > > > > to Palmer, shut down this solution, it remains using an extension.
> > > > > > > >
> > > > > > > > I have been reading a bit about unaligned accesses. Zicclsm was described as
> > > > > > > > "Even though mandated, misaligned loads and stores might execute extremely
> > > > > > > > slowly. Standard software distributions should assume their existence only
> > > > > > > > for correctness, not for performance." in rva20/22 but *not* in rva23. So
> > > > > > > > what about using this "hole" and consider that a platform that *advertises*
> > > > > > > > Zicclsm means its unaligned accesses are fast? After internal discussion, It
> > > > > > > > actually does not make sense to advertise Zicclsm if the platform accesses
> > > > > > > > are slow right?
> > > > > > > This topic pops up every so often, including in yesterday's server
> > > > > > > platform TG call. In that call, and, afaict, every other time it has
> > > > > > > popped up, the result is to reiterate that ISA extensions never say
> > > > > > > anything about performance. So, Zicclsm will never mean fast and we
> > > > > > > won't likely be able to add any extension that does.
> > > > > > Ok, I should not say "fast". Usually, when an extension is advertised by a
> > > > > > platform, we don't question its speed (zicboz, zicbom...etc), we simply use
> > > > > > it and it's up to the vendor to benchmark its implementation and act
> > > > > > accordingly (i.e. do not set it in the isa string).
> > > > > >
> > > > > >
> > > > > > > > arm64 for example considers that armv8 has fast unaligned accesses and can
> > > > > > > > then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> > > > > > > > uarchs are slow. Distros will very likely use rva23 as baseline so they will
> > > > > > > > enable Zicclsm which would allow us to take advantage of this too, without
> > > > > > > > this, we lose a lot of perf improvement in the kernel, see
> > > > > > > > https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> > > > > > > >
> > > > > > > > Or we could have a new named feature for this, even though it's weird to
> > > > > > > > have a named feature which would basically  mean "Zicclsm is fast". We don't
> > > > > > > > have, for example, a named feature to say "Zicboz is fast" but given the
> > > > > > > > vague wording in the profile spec, maybe we can ask for one in that case?
> > > > > > > >
> > > > > > > > Sorry for the late review and for triggering this debate...
> > > > > > > No problem, let's try to pick the best option. I'll try listing all the
> > > > > > > options and there pros/cons.
> > > > > > >
> > > > > > > 1. Leave as is, which is to always probe
> > > > > > >       pro: Nothing to do
> > > > > > >       con: Not ideal in all environments
> > > > > > >
> > > > > > > 2. New DT/ACPI description
> > > > > > >       pro: Describing whether or not misaligned accesses are implemented in
> > > > > > >            HW (which presumably means fast) is something that should be done
> > > > > > >     in HW descriptions
> > > > > > >       con: We'll need to live with probing until we can get the descriptions
> > > > > > >            defined, which may be never if there's too much opposition
> > > > > > >
> > > > > > > 3. Command line
> > > > > > >       pro: Easy and serves its purpose, which is to skip probing in the
> > > > > > >            environments where probing is not desired
> > > > > > >       con: Yet another command line option (which we may want to deprecate
> > > > > > >            someday)
> > > > > > >
> > > > > > > 4. New ISA extension
> > > > > > >       pro: Easy to add to HW descriptions
> > > > > > >       con: Not likely to get it through ratification
> > > > > > >
> > > > > > > 5. New SBI FWFT feature
> > > > > > >       pro: Probably easier to get through ratification than an ISA extension
> > > > > > >       con: Instead of probing, kernel would have to ask SBI -- would that
> > > > > > >            even be faster? Will all the environments that want to skip
> > > > > > >     probing even have a complete SBI?
> > > > > > >
> > > > > > > 6. ??
> > > > > > So what about:
> > > > > >
> > > > > > 7. New enum value describing the performance as "FORCED" or "HW" (or
> > > > > > anything better)
> > > > > >       pro: We only use the existing Zicclsm
> > > > > >       con: It's not clear that the accesses are fast but it basically says to
> > > > > > SW "don't think too much, I'm telling you that you can use it", up to us to
> > > > > > describe this correctly for users to understand.
> > > > > But Zicclsm doesn't mean misaligned accesses are in HW, it just means
> > > > > they're not going to explode.
> > > >
> > > > They never explode since if they are not supported by the HW, we rely on
> > > > S-mode emulation already.
> > > Exactly. Zicclsm is just a new name for that behavior. Profiles try to
> > > name every behavior, even the ones we take for granted. Unfortunately,
> > > like in the case of Zicclsm, we don't necessarily gain anything from
> > > the new name. In this case, we don't gain a way to avoid probing.
> >
> >
> > I understand your point but given the misaligned traps exist, I can't find
> > another meaning to Zicclsm than "I'm telling you to use it". Zicclsm can't
> > be used to describe an OS behaviour (ie the emulation of misaligned
> > accesses).
> >
> > I'm also insisting because we need a compile-time hint which allows us to
> > enable HAVE_EFFICIENT_UNALIGNED_ACCESS in the kernel and Zicclsm is great
> > since it is required in RVA23. if that's not Zicclsm, that must be another
> > named feature/extension.
> >
> > What do you suggest to make progress here?
> >
>
> I guess you mean besides listing five options and posting patches for two
> of them :-)  We can't force semantics onto Zicclsm and I doubt we'll get
> agreement to make another extension with the semantics we want. So (4)
> is out. I agree with Clement that (5) isn't good. That leaves (2). I
> guess we should start by trying to understand what issues there were/are
> with it.
>

Please note that if we define a DT parameter then we have to define
ACPI RHCT node as well.

Regards,
Anup

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Clément Léger 10 months, 3 weeks ago


On 18/03/2025 15:09, Alexandre Ghiti wrote:
> On 18/03/2025 14:04, Andrew Jones wrote:
>> On Tue, Mar 18, 2025 at 01:58:10PM +0100, Alexandre Ghiti wrote:
>>> On 18/03/2025 13:45, Andrew Jones wrote:
>>>> On Tue, Mar 18, 2025 at 01:13:18PM +0100, Alexandre Ghiti wrote:
>>>>> On 18/03/2025 09:48, Andrew Jones wrote:
>>>>>> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
>>>>>>> Hi Drew,
>>>>>>>
>>>>>>> On 04/03/2025 13:00, Andrew Jones wrote:
>>>>>>>> Allow skipping scalar and vector unaligned access speed tests. This
>>>>>>>> is useful for testing alternative code paths and to skip the
>>>>>>>> tests in
>>>>>>>> environments where they run too slowly. All CPUs must have the same
>>>>>>>> unaligned access speed.
>>>>>>> I'm not a big fan of the command line parameter, this is not
>>>>>>> where we should
>>>>>>> push uarch decisions because there could be many other in the
>>>>>>> future, the
>>>>>>> best solution to me should be in DT/ACPI and since the DT folks,
>>>>>>> according
>>>>>>> to Palmer, shut down this solution, it remains using an extension.
>>>>>>>
>>>>>>> I have been reading a bit about unaligned accesses. Zicclsm was
>>>>>>> described as
>>>>>>> "Even though mandated, misaligned loads and stores might execute
>>>>>>> extremely
>>>>>>> slowly. Standard software distributions should assume their
>>>>>>> existence only
>>>>>>> for correctness, not for performance." in rva20/22 but *not* in
>>>>>>> rva23. So
>>>>>>> what about using this "hole" and consider that a platform that
>>>>>>> *advertises*
>>>>>>> Zicclsm means its unaligned accesses are fast? After internal
>>>>>>> discussion, It
>>>>>>> actually does not make sense to advertise Zicclsm if the platform
>>>>>>> accesses
>>>>>>> are slow right?
>>>>>> This topic pops up every so often, including in yesterday's server
>>>>>> platform TG call. In that call, and, afaict, every other time it has
>>>>>> popped up, the result is to reiterate that ISA extensions never say
>>>>>> anything about performance. So, Zicclsm will never mean fast and we
>>>>>> won't likely be able to add any extension that does.
>>>>> Ok, I should not say "fast". Usually, when an extension is
>>>>> advertised by a
>>>>> platform, we don't question its speed (zicboz, zicbom...etc), we
>>>>> simply use
>>>>> it and it's up to the vendor to benchmark its implementation and act
>>>>> accordingly (i.e. do not set it in the isa string).
>>>>>
>>>>>
>>>>>>> arm64 for example considers that armv8 has fast unaligned
>>>>>>> accesses and can
>>>>>>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even
>>>>>>> though some
>>>>>>> uarchs are slow. Distros will very likely use rva23 as baseline
>>>>>>> so they will
>>>>>>> enable Zicclsm which would allow us to take advantage of this
>>>>>>> too, without
>>>>>>> this, we lose a lot of perf improvement in the kernel, see
>>>>>>> https://lore.kernel.org/lkml/20231225044207.3821-1-
>>>>>>> jszhang@kernel.org/.
>>>>>>>
>>>>>>> Or we could have a new named feature for this, even though it's
>>>>>>> weird to
>>>>>>> have a named feature which would basically  mean "Zicclsm is
>>>>>>> fast". We don't
>>>>>>> have, for example, a named feature to say "Zicboz is fast" but
>>>>>>> given the
>>>>>>> vague wording in the profile spec, maybe we can ask for one in
>>>>>>> that case?
>>>>>>>
>>>>>>> Sorry for the late review and for triggering this debate...
>>>>>> No problem, let's try to pick the best option. I'll try listing
>>>>>> all the
>>>>>> options and there pros/cons.
>>>>>>
>>>>>> 1. Leave as is, which is to always probe
>>>>>>       pro: Nothing to do
>>>>>>       con: Not ideal in all environments
>>>>>>
>>>>>> 2. New DT/ACPI description
>>>>>>       pro: Describing whether or not misaligned accesses are
>>>>>> implemented in
>>>>>>            HW (which presumably means fast) is something that
>>>>>> should be done
>>>>>>     in HW descriptions
>>>>>>       con: We'll need to live with probing until we can get the
>>>>>> descriptions
>>>>>>            defined, which may be never if there's too much opposition
>>>>>>
>>>>>> 3. Command line
>>>>>>       pro: Easy and serves its purpose, which is to skip probing
>>>>>> in the
>>>>>>            environments where probing is not desired
>>>>>>       con: Yet another command line option (which we may want to
>>>>>> deprecate
>>>>>>            someday)
>>>>>>
>>>>>> 4. New ISA extension
>>>>>>       pro: Easy to add to HW descriptions
>>>>>>       con: Not likely to get it through ratification
>>>>>>
>>>>>> 5. New SBI FWFT feature
>>>>>>       pro: Probably easier to get through ratification than an ISA
>>>>>> extension
>>>>>>       con: Instead of probing, kernel would have to ask SBI --
>>>>>> would that
>>>>>>            even be faster? Will all the environments that want to
>>>>>> skip
>>>>>>     probing even have a complete SBI?
>>>>>>
>>>>>> 6. ??
>>>>> So what about:
>>>>>
>>>>> 7. New enum value describing the performance as "FORCED" or "HW" (or
>>>>> anything better)
>>>>>       pro: We only use the existing Zicclsm
>>>>>       con: It's not clear that the accesses are fast but it
>>>>> basically says to
>>>>> SW "don't think too much, I'm telling you that you can use it", up
>>>>> to us to
>>>>> describe this correctly for users to understand.
>>>> But Zicclsm doesn't mean misaligned accesses are in HW, it just means
>>>> they're not going to explode.
>>>
>>> They never explode since if they are not supported by the HW, we rely on
>>> S-mode emulation already.
>> Exactly. Zicclsm is just a new name for that behavior. Profiles try to
>> name every behavior, even the ones we take for granted. Unfortunately,
>> like in the case of Zicclsm, we don't necessarily gain anything from
>> the new name. In this case, we don't gain a way to avoid probing.
> 
> 
> I understand your point but given the misaligned traps exist, I can't
> find another meaning to Zicclsm than "I'm telling you to use it".
> Zicclsm can't be used to describe an OS behaviour (ie the emulation of
> misaligned accesses).

Hi Alex,

Some SBI implementation might decide not to delegate the misaligned trap
and not emulate it or partially emulate it. IMHO, Zicclsm should
actually be advertised by SBI to actually tell the OS that misaligned
accesses are supported (even though they are slow) since Zicclsm is a
profile extension (at least in its first definition). I think we can not
rely on Zicclsm to determine that accesses are fast. Moreover, it seems
like its definition evolved over time and lacks clarity to be reliable.

> 
> I'm also insisting because we need a compile-time hint which allows us
> to enable HAVE_EFFICIENT_UNALIGNED_ACCESS in the kernel and Zicclsm is
> great since it is required in RVA23. if that's not Zicclsm, that must be
> another named feature/extension.

As said in the other thread, I think we might have to enable
HAVE_EFFICIENT_UNALIGNED_ACCESS as a default (or whatever option selects
that CONFIG). HW without misaligned access support implementation would
suffer from that choice but would still work (although poorly) thanks to
S-mode emulation. If one wants to run the kernel more efficiently on
some smaller chip without any hardware support for it, then it should
disable that config. I think that we can not accommodate both world
without hurting one side or the other, a choice needs to be made.

Thanks,

Clément

> 
> What do you suggest to make progress here?
> 
> Thanks,
> 
> Alex
> 
> 
>>
>> Thanks,
>> drew
>>
>>>
>>>> We'd still need the probing to find out
>>>> if the accesses are emulated (slow) or hw (fast). We at least want to
>>>> know the answer to that question because we advertise it to userspace
>>>> through hwprobe.
>>>>
>>>> (BTW, another pro of the command line is that it can be used to test
>>>> both slow and fast paths without recompiling.)
>>>>
>>>> Thanks,
>>>> drew
>> _______________________________________________
>> linux-riscv mailing list
>> linux-riscv@lists.infradead.org
>> http://lists.infradead.org/mailman/listinfo/linux-riscv

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 09:48:21AM +0100, Andrew Jones wrote:
> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> > Hi Drew,
> > 
> > On 04/03/2025 13:00, Andrew Jones wrote:
> > > Allow skipping scalar and vector unaligned access speed tests. This
> > > is useful for testing alternative code paths and to skip the tests in
> > > environments where they run too slowly. All CPUs must have the same
> > > unaligned access speed.
> > 
> > I'm not a big fan of the command line parameter, this is not where we should
> > push uarch decisions because there could be many other in the future, the
> > best solution to me should be in DT/ACPI and since the DT folks, according
> > to Palmer, shut down this solution, it remains using an extension.
> > 
> > I have been reading a bit about unaligned accesses. Zicclsm was described as
> > "Even though mandated, misaligned loads and stores might execute extremely
> > slowly. Standard software distributions should assume their existence only
> > for correctness, not for performance." in rva20/22 but *not* in rva23. So
> > what about using this "hole" and consider that a platform that *advertises*
> > Zicclsm means its unaligned accesses are fast? After internal discussion, It
> > actually does not make sense to advertise Zicclsm if the platform accesses
> > are slow right?
> 
> This topic pops up every so often, including in yesterday's server
> platform TG call. In that call, and, afaict, every other time it has
> popped up, the result is to reiterate that ISA extensions never say
> anything about performance. So, Zicclsm will never mean fast and we
> won't likely be able to add any extension that does.
> 
> > 
> > arm64 for example considers that armv8 has fast unaligned accesses and can
> > then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> > uarchs are slow. Distros will very likely use rva23 as baseline so they will
> > enable Zicclsm which would allow us to take advantage of this too, without
> > this, we lose a lot of perf improvement in the kernel, see
> > https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> > 
> > Or we could have a new named feature for this, even though it's weird to
> > have a named feature which would basically  mean "Zicclsm is fast". We don't
> > have, for example, a named feature to say "Zicboz is fast" but given the
> > vague wording in the profile spec, maybe we can ask for one in that case?
> > 
> > Sorry for the late review and for triggering this debate...
> 
> No problem, let's try to pick the best option. I'll try listing all the
> options and there pros/cons.
> 
> 1. Leave as is, which is to always probe
>    pro: Nothing to do
>    con: Not ideal in all environments
> 
> 2. New DT/ACPI description
>    pro: Describing whether or not misaligned accesses are implemented in
>         HW (which presumably means fast) is something that should be done
> 	in HW descriptions
>    con: We'll need to live with probing until we can get the descriptions
>         defined, which may be never if there's too much opposition
> 
> 3. Command line
>    pro: Easy and serves its purpose, which is to skip probing in the
>         environments where probing is not desired
>    con: Yet another command line option (which we may want to deprecate
>         someday)
> 
> 4. New ISA extension
>    pro: Easy to add to HW descriptions
>    con: Not likely to get it through ratification
> 
> 5. New SBI FWFT feature
>    pro: Probably easier to get through ratification than an ISA extension
>    con: Instead of probing, kernel would have to ask SBI -- would that
>         even be faster? Will all the environments that want to skip
> 	probing even have a complete SBI?
> 
> 6. ??

I forgot one, which was v1 of this series and already rejected,

 6. Use ID registers
    pro: None of the above cons, including the main con with the command
         line, which is that there could be many other decisions in the
	 future, implying we could need many more command line options.
    con: A slippery slope. We don't want to open the door to
         features-by-idregs. (However, we can at least always close the
	 door again if better mechanisms become available. Command
	 lines would need to be deprecated, but feature-by-idreg code
	 can just be deleted.)

Thanks,
drew

> 
> I'm voting for (3), which is why I posted this patchset, but I'm happy
> to hear other votes or other proposals and discuss.
> 
> Thanks,
> drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Clément Léger 10 months, 3 weeks ago


On 18/03/2025 10:00, Andrew Jones wrote:
> On Tue, Mar 18, 2025 at 09:48:21AM +0100, Andrew Jones wrote:
>> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
>>> Hi Drew,
>>>
>>> On 04/03/2025 13:00, Andrew Jones wrote:
>>>> Allow skipping scalar and vector unaligned access speed tests. This
>>>> is useful for testing alternative code paths and to skip the tests in
>>>> environments where they run too slowly. All CPUs must have the same
>>>> unaligned access speed.
>>>
>>> I'm not a big fan of the command line parameter, this is not where we should
>>> push uarch decisions because there could be many other in the future, the
>>> best solution to me should be in DT/ACPI and since the DT folks, according
>>> to Palmer, shut down this solution, it remains using an extension.
>>>
>>> I have been reading a bit about unaligned accesses. Zicclsm was described as
>>> "Even though mandated, misaligned loads and stores might execute extremely
>>> slowly. Standard software distributions should assume their existence only
>>> for correctness, not for performance." in rva20/22 but *not* in rva23. So
>>> what about using this "hole" and consider that a platform that *advertises*
>>> Zicclsm means its unaligned accesses are fast? After internal discussion, It
>>> actually does not make sense to advertise Zicclsm if the platform accesses
>>> are slow right?
>>
>> This topic pops up every so often, including in yesterday's server
>> platform TG call. In that call, and, afaict, every other time it has
>> popped up, the result is to reiterate that ISA extensions never say
>> anything about performance. So, Zicclsm will never mean fast and we
>> won't likely be able to add any extension that does.
>>
>>>
>>> arm64 for example considers that armv8 has fast unaligned accesses and can
>>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
>>> uarchs are slow. Distros will very likely use rva23 as baseline so they will
>>> enable Zicclsm which would allow us to take advantage of this too, without
>>> this, we lose a lot of perf improvement in the kernel, see
>>> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
>>>
>>> Or we could have a new named feature for this, even though it's weird to
>>> have a named feature which would basically  mean "Zicclsm is fast". We don't
>>> have, for example, a named feature to say "Zicboz is fast" but given the
>>> vague wording in the profile spec, maybe we can ask for one in that case?
>>>
>>> Sorry for the late review and for triggering this debate...
>>
>> No problem, let's try to pick the best option. I'll try listing all the
>> options and there pros/cons.
>>
>> 1. Leave as is, which is to always probe
>>    pro: Nothing to do
>>    con: Not ideal in all environments
>>
>> 2. New DT/ACPI description
>>    pro: Describing whether or not misaligned accesses are implemented in
>>         HW (which presumably means fast) is something that should be done
>> 	in HW descriptions
>>    con: We'll need to live with probing until we can get the descriptions
>>         defined, which may be never if there's too much opposition
>>
>> 3. Command line
>>    pro: Easy and serves its purpose, which is to skip probing in the
>>         environments where probing is not desired
>>    con: Yet another command line option (which we may want to deprecate
>>         someday)
>>
>> 4. New ISA extension
>>    pro: Easy to add to HW descriptions
>>    con: Not likely to get it through ratification
>>
>> 5. New SBI FWFT feature
>>    pro: Probably easier to get through ratification than an ISA extension
>>    con: Instead of probing, kernel would have to ask SBI -- would that
>>         even be faster? Will all the environments that want to skip
>> 	probing even have a complete SBI?

Hi Andrew

FWFT is not really meant to "query" information from the firmware,
fwft_set() wouldn't have anything to actually set. The problem would
also just be pushed away from Linux but would probably still require
specification anyway.

>>
>> 6. ??
> 
> I forgot one, which was v1 of this series and already rejected,
> 
>  6. Use ID registers
>     pro: None of the above cons, including the main con with the command
>          line, which is that there could be many other decisions in the
> 	 future, implying we could need many more command line options.
>     con: A slippery slope. We don't want to open the door to
>          features-by-idregs. (However, we can at least always close the
> 	 door again if better mechanisms become available. Command
> 	 lines would need to be deprecated, but feature-by-idreg code
> 	 can just be deleted.)

My preferred option would have been option 2. BTW, what are the
arguments to push away the description of misaligned access speed out of
device-tree ? that's almost exactly what the device-tree is meant to do,
ie describe hardware.

As a last resort solution, I'm for option 3. There already exists a
command line option to preset the jiffies. This is almost the same use
case that we have, ie have a faster boot time by presetting the
misaligned access probing.

IMHO, skipping misaligned access probing speed is orthogonal to
EFFICIENT_UNALIGNED_ACCESS. one is done at runtime and allows the
userspace to know the speed of misaligned accesses, the other one at
compile time to improve kernel speed. Depending on which system we want
to support, we might need to enable EFFICIENT_UNALIGNED_ACCESS as a
default, allowing for the most Linux "capable" chips to have full
performances.

Thanks,

Clément

> 
> Thanks,
> drew
> 
>>
>> I'm voting for (3), which is why I posted this patchset, but I'm happy
>> to hear other votes or other proposals and discuss.
>>
>> Thanks,
>> drew

Re: [PATCH v3 7/8] riscv: Add parameter for skipping access speed tests

Posted by Andrew Jones 10 months, 3 weeks ago

On Tue, Mar 18, 2025 at 03:09:58PM +0100, Clément Léger wrote:
> 
> 
> On 18/03/2025 10:00, Andrew Jones wrote:
> > On Tue, Mar 18, 2025 at 09:48:21AM +0100, Andrew Jones wrote:
> >> On Mon, Mar 17, 2025 at 03:39:01PM +0100, Alexandre Ghiti wrote:
> >>> Hi Drew,
> >>>
> >>> On 04/03/2025 13:00, Andrew Jones wrote:
> >>>> Allow skipping scalar and vector unaligned access speed tests. This
> >>>> is useful for testing alternative code paths and to skip the tests in
> >>>> environments where they run too slowly. All CPUs must have the same
> >>>> unaligned access speed.
> >>>
> >>> I'm not a big fan of the command line parameter, this is not where we should
> >>> push uarch decisions because there could be many other in the future, the
> >>> best solution to me should be in DT/ACPI and since the DT folks, according
> >>> to Palmer, shut down this solution, it remains using an extension.
> >>>
> >>> I have been reading a bit about unaligned accesses. Zicclsm was described as
> >>> "Even though mandated, misaligned loads and stores might execute extremely
> >>> slowly. Standard software distributions should assume their existence only
> >>> for correctness, not for performance." in rva20/22 but *not* in rva23. So
> >>> what about using this "hole" and consider that a platform that *advertises*
> >>> Zicclsm means its unaligned accesses are fast? After internal discussion, It
> >>> actually does not make sense to advertise Zicclsm if the platform accesses
> >>> are slow right?
> >>
> >> This topic pops up every so often, including in yesterday's server
> >> platform TG call. In that call, and, afaict, every other time it has
> >> popped up, the result is to reiterate that ISA extensions never say
> >> anything about performance. So, Zicclsm will never mean fast and we
> >> won't likely be able to add any extension that does.
> >>
> >>>
> >>> arm64 for example considers that armv8 has fast unaligned accesses and can
> >>> then enable HAVE_EFFICIENT_ALIGNED_ACCESS in the kernel, even though some
> >>> uarchs are slow. Distros will very likely use rva23 as baseline so they will
> >>> enable Zicclsm which would allow us to take advantage of this too, without
> >>> this, we lose a lot of perf improvement in the kernel, see
> >>> https://lore.kernel.org/lkml/20231225044207.3821-1-jszhang@kernel.org/.
> >>>
> >>> Or we could have a new named feature for this, even though it's weird to
> >>> have a named feature which would basically  mean "Zicclsm is fast". We don't
> >>> have, for example, a named feature to say "Zicboz is fast" but given the
> >>> vague wording in the profile spec, maybe we can ask for one in that case?
> >>>
> >>> Sorry for the late review and for triggering this debate...
> >>
> >> No problem, let's try to pick the best option. I'll try listing all the
> >> options and there pros/cons.
> >>
> >> 1. Leave as is, which is to always probe
> >>    pro: Nothing to do
> >>    con: Not ideal in all environments
> >>
> >> 2. New DT/ACPI description
> >>    pro: Describing whether or not misaligned accesses are implemented in
> >>         HW (which presumably means fast) is something that should be done
> >> 	in HW descriptions
> >>    con: We'll need to live with probing until we can get the descriptions
> >>         defined, which may be never if there's too much opposition
> >>
> >> 3. Command line
> >>    pro: Easy and serves its purpose, which is to skip probing in the
> >>         environments where probing is not desired
> >>    con: Yet another command line option (which we may want to deprecate
> >>         someday)
> >>
> >> 4. New ISA extension
> >>    pro: Easy to add to HW descriptions
> >>    con: Not likely to get it through ratification
> >>
> >> 5. New SBI FWFT feature
> >>    pro: Probably easier to get through ratification than an ISA extension
> >>    con: Instead of probing, kernel would have to ask SBI -- would that
> >>         even be faster? Will all the environments that want to skip
> >> 	probing even have a complete SBI?
> 
> Hi Andrew
> 
> FWFT is not really meant to "query" information from the firmware,
> fwft_set() wouldn't have anything to actually set. The problem would
> also just be pushed away from Linux but would probably still require
> specification anyway.

Agreed. Actually, if we had HW descriptions for every feature in FWFT,
and allowed each feature to have implementation-defined reset values,
then we wouldn't need the get function. The OS would only call the
set function if it disagreed with the value it saw in the HW description.
But this is getting off-topic and we can just agree that FWFT isn't the
right approach.

> 
> >>
> >> 6. ??
> > 
> > I forgot one, which was v1 of this series and already rejected,
> > 
> >  6. Use ID registers
> >     pro: None of the above cons, including the main con with the command
> >          line, which is that there could be many other decisions in the
> > 	 future, implying we could need many more command line options.
> >     con: A slippery slope. We don't want to open the door to
> >          features-by-idregs. (However, we can at least always close the
> > 	 door again if better mechanisms become available. Command
> > 	 lines would need to be deprecated, but feature-by-idreg code
> > 	 can just be deleted.)
> 
> My preferred option would have been option 2. BTW, what are the
> arguments to push away the description of misaligned access speed out of
> device-tree ? that's almost exactly what the device-tree is meant to do,
> ie describe hardware.

Actually, I don't know. Maybe Palmer can point to something.

Thanks,
drew

> 
> As a last resort solution, I'm for option 3. There already exists a
> command line option to preset the jiffies. This is almost the same use
> case that we have, ie have a faster boot time by presetting the
> misaligned access probing.
> 
> IMHO, skipping misaligned access probing speed is orthogonal to
> EFFICIENT_UNALIGNED_ACCESS. one is done at runtime and allows the
> userspace to know the speed of misaligned accesses, the other one at
> compile time to improve kernel speed. Depending on which system we want
> to support, we might need to enable EFFICIENT_UNALIGNED_ACCESS as a
> default, allowing for the most Linux "capable" chips to have full
> performances.
> 
> Thanks,
> 
> Clément
> 
> > 
> > Thanks,
> > drew
> > 
> >>
> >> I'm voting for (3), which is why I posted this patchset, but I'm happy
> >> to hear other votes or other proposals and discuss.
> >>
> >> Thanks,
> >> drew
>

[PATCH v3 1/8] riscv: Annotate unaligned access init functions
[PATCH v3 2/8] riscv: Fix riscv_online_cpu_vec
[PATCH v3 3/8] riscv: Fix check_unaligned_access_all_cpus
[PATCH v3 4/8] riscv: Change check_unaligned_access_speed_all_cpus to void
[PATCH v3 5/8] riscv: Fix set up of cpu hotplug callbacks
[PATCH v3 6/8] riscv: Fix set up of vector cpu hotplug callback
[PATCH v3 7/8] riscv: Add parameter for skipping access speed tests
[PATCH v3 8/8] Documentation/kernel-parameters: Add riscv unaligned speed parameters