arch/arm64/include/asm/cpu.h | 2 + arch/arm64/include/asm/sysreg.h | 3 + arch/arm64/kernel/cpuinfo.c | 192 +++++++++++++++++++++++++++++++- 3 files changed, 194 insertions(+), 3 deletions(-)
From: Palmer Dabbelt <palmerdabbelt@meta.com>
We've found that some of our workloads run faster when some of these
fields are set to non-default values on some of the systems we're trying
to run those workloads on. This allows us to set those values via
sysfs, so we can do workload/system-specific tuning.
Signed-off-by: Palmer Dabbelt <palmerdabbelt@meta.com>
---
I've only really smoke tested this, but I figured I'd send it along because I'm
not sure if this is even a sane thing to be doing -- these extended control
registers have some wacky stuff in them, so maybe they're not exposed to
userspace on purpose. IIUC firmware can gate these writes, though, so it
should be possible for vendors to forbid the really scary values.
That said, we do see some performance improvements here on real workloads. So
we're hoping to roll some of this tuning work out more widely, but we also
don't want to adopt some internal interface. Thus it'd make our lives easier
if we could twiddle these bits in a standard way.
Nobody's wed to sysfs, I just went with it because some of the other system
registers are exposed there.
---
arch/arm64/include/asm/cpu.h | 2 +
arch/arm64/include/asm/sysreg.h | 3 +
arch/arm64/kernel/cpuinfo.c | 192 +++++++++++++++++++++++++++++++-
3 files changed, 194 insertions(+), 3 deletions(-)
diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 71493b760b83..275d24da962b 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -39,6 +39,8 @@ struct cpuinfo_32bit {
struct cpuinfo_arm64 {
struct kobject kobj;
+ unsigned int cpu;
+
u64 reg_ctr;
u64 reg_cntfrq;
u64 reg_dczid;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index d5b5f2ae1afa..17dfbf640a2a 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -412,6 +412,9 @@
#define SYS_AIDR_EL1 sys_reg(3, 1, 0, 0, 7)
+#define SYS_CPUECTRL_EL1 sys_reg(3, 0, 15, 1, 4)
+#define SYS_CPUECTRL2_EL1 sys_reg(3, 0, 15, 1, 5)
+
#define SYS_RNDR_EL0 sys_reg(3, 3, 2, 4, 0)
#define SYS_RNDRRS_EL0 sys_reg(3, 3, 2, 4, 1)
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index ba834909a28b..beaabde5b8e3 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -339,7 +339,7 @@ static struct attribute *cpuregs_id_attrs[] = {
NULL
};
-static const struct attribute_group cpuregs_attr_group = {
+static const struct attribute_group id_cpuregs_attr_group = {
.attrs = cpuregs_id_attrs,
.name = "identification"
};
@@ -354,12 +354,191 @@ static const struct attribute_group sme_cpuregs_attr_group = {
.name = "identification"
};
+/*
+ * sysfs has per-file locking, but each of the extended control register fields
+ * are in their own file. So here we just have a single shared lock for all of
+ * them -- we could have per-CPU locking, but seems overkill.
+ */
+static DEFINE_MUTEX(extended_lock);
+struct cpuectrl_op {
+ unsigned long long before;
+ unsigned long long mask;
+ unsigned long long val;
+ unsigned long long after;
+};
+
+#define CPUREGS_ECTRL_ACCESS(_name, _reg) \
+ static void access_##_name(void *op_uncast) \
+ { \
+ struct cpuectrl_op *op = op_uncast; \
+ \
+ mutex_lock(&extended_lock); \
+ \
+ op->before = read_cpuid(_reg); \
+ if (op->mask) { \
+ unsigned long long new = (op->before & ~op->mask) | op->val; \
+ write_sysreg_s(new, SYS_##_reg); \
+ op->after = read_cpuid(_reg); \
+ } \
+ \
+ mutex_unlock(&extended_lock); \
+ }
+
+CPUREGS_ECTRL_ACCESS(cpuectrl_el1, CPUECTRL_EL1)
+CPUREGS_ECTRL_ACCESS(cpuectrl2_el1, CPUECTRL2_EL1)
+
+#define CPUREGS_ECTRL_ATTR__RW(_reg, _field, _bit_hi, _bit_lo) \
+ static ssize_t _reg##_field##_show(struct kobject *kobj, \
+ struct kobj_attribute *attr, char *buf) \
+ { \
+ struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \
+ struct cpuectrl_op op; \
+ \
+ op.mask = 0; \
+ smp_call_function_single(info->cpu, access_##_reg, &op, 1); \
+ op.val = op.before & GENMASK(_bit_hi, _bit_lo); \
+ return sprintf(buf, "0x%llx\n", op.val >> _bit_lo); \
+ } \
+ \
+ static ssize_t _reg##_field##_store(struct kobject *kobj, \
+ struct kobj_attribute *attr, const char *buf, \
+ size_t bytes) \
+ { \
+ struct cpuinfo_arm64 *info = kobj_to_cpuinfo(kobj); \
+ struct cpuectrl_op op; \
+ \
+ if (sscanf(buf, "0x%llx", &op.val) != 1) \
+ return -EINVAL; \
+ op.mask = GENMASK(_bit_hi, _bit_lo); \
+ op.val <<= _bit_lo; \
+ if ((op.val & op.mask) != op.val) \
+ return -EINVAL; \
+ \
+ smp_call_function_single(info->cpu, access_##_reg, &op, 1); \
+ \
+ if ((op.after & op.mask) != op.val) \
+ return -ENOTSUPP; \
+ return strlen(buf); \
+ } \
+ \
+ static struct kobj_attribute cpuregs_attr_##_reg##_##_field = \
+ __ATTR(_field, S_IWUSR | S_IRUGO, _reg##_field##_show, _reg##_field##_store)
+
+
+/*
+ * The names for these match the names in the arm Arm ARM, which is a bit
+ * terse. It seems somewhat reasonable to leave them as-is, though, as users
+ * probably shouldn't just be poking at them unless they know what they're
+ * doing and the fancy-looking names will hopefully hint at that.
+ */
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, cmc_min_ways, 63, 61);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, l2_inst_part, 60, 58);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, l2_data_part, 57, 55);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_vmid_thr, 54, 54);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_asp_en, 53, 53);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_ch_dis, 52, 52);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_tlbpf_dis, 51, 51);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_hpa_l1_dis, 47, 47);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, mm_hpa_dis, 46, 46);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, l2_flush, 43, 43);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pft_mm, 41, 40);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pft_ls, 39, 38);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pft_if, 37, 36);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ca_uclean_evict_en, 35, 35);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ca_evict_dis, 34, 34);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, atomic_ld_force_near, 33, 33);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, atomic_acq_near, 32, 32);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, atomic_st_near, 31, 31);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, atomic_rel_near, 30, 30);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, atomic_ld_near, 29, 29);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, tlb_pred_dis, 28, 28);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, tlb_pred_aggr, 27, 27);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, tlb_cabt_en, 26, 26);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ws_thr_l2, 25, 24);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ws_thr_l3, 23, 22);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ws_thr_l4, 21, 20);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ws_thr_dram, 19, 18);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pf_dis, 15, 15);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pf_ss_l2_dis, 13, 12);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pf_sts_dis, 9, 9);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pf_sti_dis, 8, 8);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, pf_mode, 7, 6);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, rpf_mode, 5, 4);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, rpf_lo_conf, 3, 3);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ras_raz, 1, 1);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl_el1, ext_llc, 0, 0);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, txreq_min, 16, 15);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, cbusy_filter_window, 10, 9);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, cbusy_filter_threshold, 8, 7);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, txreq_limit_dec, 6, 5);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, txreq_limit_inc, 4, 3);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, txreq_limit_dynamic, 2, 2);
+CPUREGS_ECTRL_ATTR__RW(cpuectrl2_el1, txreq_max, 1, 0);
+
+static struct attribute *cpuregs_cpuectrl_attrs[] = {
+ &cpuregs_attr_cpuectrl_el1_cmc_min_ways.attr,
+ &cpuregs_attr_cpuectrl_el1_l2_inst_part.attr,
+ &cpuregs_attr_cpuectrl_el1_l2_data_part.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_vmid_thr.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_asp_en.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_ch_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_tlbpf_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_hpa_l1_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_mm_hpa_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_l2_flush.attr,
+ &cpuregs_attr_cpuectrl_el1_pft_mm.attr,
+ &cpuregs_attr_cpuectrl_el1_pft_ls.attr,
+ &cpuregs_attr_cpuectrl_el1_pft_if.attr,
+ &cpuregs_attr_cpuectrl_el1_ca_uclean_evict_en.attr,
+ &cpuregs_attr_cpuectrl_el1_ca_evict_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_atomic_ld_force_near.attr,
+ &cpuregs_attr_cpuectrl_el1_atomic_acq_near.attr,
+ &cpuregs_attr_cpuectrl_el1_atomic_st_near.attr,
+ &cpuregs_attr_cpuectrl_el1_atomic_rel_near.attr,
+ &cpuregs_attr_cpuectrl_el1_atomic_ld_near.attr,
+ &cpuregs_attr_cpuectrl_el1_tlb_pred_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_tlb_pred_aggr.attr,
+ &cpuregs_attr_cpuectrl_el1_tlb_cabt_en.attr,
+ &cpuregs_attr_cpuectrl_el1_ws_thr_l2.attr,
+ &cpuregs_attr_cpuectrl_el1_ws_thr_l3.attr,
+ &cpuregs_attr_cpuectrl_el1_ws_thr_l4.attr,
+ &cpuregs_attr_cpuectrl_el1_ws_thr_dram.attr,
+ &cpuregs_attr_cpuectrl_el1_pf_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_pf_ss_l2_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_pf_sts_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_pf_sti_dis.attr,
+ &cpuregs_attr_cpuectrl_el1_pf_mode.attr,
+ &cpuregs_attr_cpuectrl_el1_rpf_mode.attr,
+ &cpuregs_attr_cpuectrl_el1_rpf_lo_conf.attr,
+ &cpuregs_attr_cpuectrl_el1_ras_raz.attr,
+ &cpuregs_attr_cpuectrl_el1_ext_llc.attr,
+ &cpuregs_attr_cpuectrl2_el1_txreq_min.attr,
+ &cpuregs_attr_cpuectrl2_el1_cbusy_filter_window.attr,
+ &cpuregs_attr_cpuectrl2_el1_cbusy_filter_threshold.attr,
+ &cpuregs_attr_cpuectrl2_el1_txreq_limit_dec.attr,
+ &cpuregs_attr_cpuectrl2_el1_txreq_limit_inc.attr,
+ &cpuregs_attr_cpuectrl2_el1_txreq_limit_dynamic.attr,
+ &cpuregs_attr_cpuectrl2_el1_txreq_max.attr,
+ NULL
+};
+
+static const struct attribute_group ectrl_cpuregs_attr_group = {
+ .attrs = cpuregs_cpuectrl_attrs,
+ .name = "extended_control"
+};
+
static int cpuid_cpu_online(unsigned int cpu)
{
int rc;
struct device *dev;
struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu);
+ /*
+ * FIXME: There must be some better way to go from a per_cpu pointer to
+ * the CPU ID...
+ */
+ info->cpu = cpu;
+
dev = get_cpu_device(cpu);
if (!dev) {
rc = -ENODEV;
@@ -368,11 +547,17 @@ static int cpuid_cpu_online(unsigned int cpu)
rc = kobject_add(&info->kobj, &dev->kobj, "regs");
if (rc)
goto out;
- rc = sysfs_create_group(&info->kobj, &cpuregs_attr_group);
+ rc = sysfs_create_group(&info->kobj, &id_cpuregs_attr_group);
if (rc)
kobject_del(&info->kobj);
if (system_supports_sme())
rc = sysfs_merge_group(&info->kobj, &sme_cpuregs_attr_group);
+
+ rc = sysfs_create_group(&info->kobj, &ectrl_cpuregs_attr_group);
+ if (rc) {
+ sysfs_remove_group(&info->kobj, &id_cpuregs_attr_group);
+ kobject_del(&info->kobj);
+ }
out:
return rc;
}
@@ -386,7 +571,8 @@ static int cpuid_cpu_offline(unsigned int cpu)
if (!dev)
return -ENODEV;
if (info->kobj.parent) {
- sysfs_remove_group(&info->kobj, &cpuregs_attr_group);
+ sysfs_remove_group(&info->kobj, &id_cpuregs_attr_group);
+ sysfs_remove_group(&info->kobj, &ectrl_cpuregs_attr_group);
kobject_del(&info->kobj);
}
--
2.50.1
On Wed, 06 Aug 2025 20:48:13 +0100, Palmer Dabbelt <palmer@dabbelt.com> wrote: > > From: Palmer Dabbelt <palmerdabbelt@meta.com> > > We've found that some of our workloads run faster when some of these > fields are set to non-default values on some of the systems we're trying > to run those workloads on. This allows us to set those values via > sysfs, so we can do workload/system-specific tuning. > > Signed-off-by: Palmer Dabbelt <palmerdabbelt@meta.com> > --- > I've only really smoke tested this, but I figured I'd send it along because I'm > not sure if this is even a sane thing to be doing -- these extended control > registers have some wacky stuff in them, so maybe they're not exposed to > userspace on purpose. IIUC firmware can gate these writes, though, so it > should be possible for vendors to forbid the really scary values. That's really wrong. For a start, these encodings fall into the IMPDEF range. They won't exist on non-ARM implementations. Next, this will catch fire as a guest, either because the hypervisor will simply refuse to entertain letting it access registers that have no definition, or because the VM has been migrated from one implementation to another, and you have no idea this is doing on the new target. > > That said, we do see some performance improvements here on real workloads. So > we're hoping to roll some of this tuning work out more widely, but we also > don't want to adopt some internal interface. Thus it'd make our lives easier > if we could twiddle these bits in a standard way. Honestly, this is the sort of bring-up stuff that is better kept in your private sandbox, and doesn't really help in general. Thanks, M. -- Without deviation from the norm, progress is not possible.
On Thu, 07 Aug 2025 01:08:26 PDT (-0700), Marc Zyngier wrote: > On Wed, 06 Aug 2025 20:48:13 +0100, > Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> From: Palmer Dabbelt <palmerdabbelt@meta.com> >> >> We've found that some of our workloads run faster when some of these >> fields are set to non-default values on some of the systems we're trying >> to run those workloads on. This allows us to set those values via >> sysfs, so we can do workload/system-specific tuning. >> >> Signed-off-by: Palmer Dabbelt <palmerdabbelt@meta.com> >> --- >> I've only really smoke tested this, but I figured I'd send it along because I'm >> not sure if this is even a sane thing to be doing -- these extended control >> registers have some wacky stuff in them, so maybe they're not exposed to >> userspace on purpose. IIUC firmware can gate these writes, though, so it >> should be possible for vendors to forbid the really scary values. > > That's really wrong. > > For a start, these encodings fall into the IMPDEF range. They won't > exist on non-ARM implementations. OK, and that's because it says "Provides additional IMPLEMENTATION DEFINED configuration and control options for the processor." at the start of the manual page? Sorry, I'm kind of new to trying to read the Arm specs -- I thought just the meaning of the values was changing, but I probably just didn't read enough specs. > Next, this will catch fire as a guest, either because the hypervisor > will simply refuse to entertain letting it access registers that have > no definition, or because the VM has been migrated from one > implementation to another, and you have no idea this is doing on the > new target. Ya, makes sense. >> That said, we do see some performance improvements here on real workloads. So >> we're hoping to roll some of this tuning work out more widely, but we also >> don't want to adopt some internal interface. Thus it'd make our lives easier >> if we could twiddle these bits in a standard way. > > Honestly, this is the sort of bring-up stuff that is better kept in > your private sandbox, and doesn't really help in general. So we're not doing bringup (or at least not doing what I'd call bringup) here, the theory is that we just get better performance on different workloads with different tunings. That's all still a little early, but if the data holds we'd want to be setting these based on what workload is running (ie, not just some static tuning for everything). That said, part of the reason I just sent this as-is is because I was sort of expecting the answer to be "no" here. No big deal if that's the case, we can figure out some other way to solve the problem. Happy to throw some time in to making some more generic flavor of this, though... > Thanks, > > M.
On Thu, 07 Aug 2025 18:26:29 +0100, Palmer Dabbelt <palmer@dabbelt.com> wrote: > > On Thu, 07 Aug 2025 01:08:26 PDT (-0700), Marc Zyngier wrote: > > On Wed, 06 Aug 2025 20:48:13 +0100, > > Palmer Dabbelt <palmer@dabbelt.com> wrote: > >> > >> From: Palmer Dabbelt <palmerdabbelt@meta.com> > >> > >> We've found that some of our workloads run faster when some of these > >> fields are set to non-default values on some of the systems we're trying > >> to run those workloads on. This allows us to set those values via > >> sysfs, so we can do workload/system-specific tuning. > >> > >> Signed-off-by: Palmer Dabbelt <palmerdabbelt@meta.com> > >> --- > >> I've only really smoke tested this, but I figured I'd send it along because I'm > >> not sure if this is even a sane thing to be doing -- these extended control > >> registers have some wacky stuff in them, so maybe they're not exposed to > >> userspace on purpose. IIUC firmware can gate these writes, though, so it > >> should be possible for vendors to forbid the really scary values. > > > > That's really wrong. > > > > For a start, these encodings fall into the IMPDEF range. They won't > > exist on non-ARM implementations. > > OK, and that's because it says "Provides additional IMPLEMENTATION > DEFINED configuration and control options for the processor." at the > start of the manual page? Sorry, I'm kind of new to trying to read > the Arm specs -- I thought just the meaning of the values was > changing, but I probably just didn't read enough specs. The architecture defines a range described in D24.2.162 (in the L.b revision of the ARM ARM) which is reserved for IMPDEF purposes. What these registers do is not defined, and there is no standard across implementations. This really is for chicken bits and other fun stuff. Most of them will simply generate an UNDEF, because they don't pass the decode stage. But for all we know, there is a bit in there that turns NOP into the HCF instruction -- or better. So exposing any of that stuff for any given CPU is dangerous. And exposing any of it on *all* CPUs is a bit like swallowing a powered chainsaw (don't). > > > Next, this will catch fire as a guest, either because the hypervisor > > will simply refuse to entertain letting it access registers that have > > no definition, or because the VM has been migrated from one > > implementation to another, and you have no idea this is doing on the > > new target. > > Ya, makes sense. > > >> That said, we do see some performance improvements here on real workloads. So > >> we're hoping to roll some of this tuning work out more widely, but we also > >> don't want to adopt some internal interface. Thus it'd make our lives easier > >> if we could twiddle these bits in a standard way. > > > > Honestly, this is the sort of bring-up stuff that is better kept in > > your private sandbox, and doesn't really help in general. > > So we're not doing bringup (or at least not doing what I'd call > bringup) here, the theory is that we just get better performance on > different workloads with different tunings. That's all still a little > early, but if the data holds we'd want to be setting these based on > what workload is running (ie, not just some static tuning for > everything). In general, none of that crap is safe to turn on and off at random times. You really want to talk to your implementer to find out. And if it is, the firmware is probably the place to handle that. > That said, part of the reason I just sent this as-is is because I was > sort of expecting the answer to be "no" here. No big deal if that's > the case, we can figure out some other way to solve the problem. > Happy to throw some time in to making some more generic flavor of > this, though... I have no idea how we can achieve that, given that there is no architected definition for any of these registers. M. -- Without deviation from the norm, progress is not possible.
On Thu, 07 Aug 2025 11:06:27 PDT (-0700), Marc Zyngier wrote: > On Thu, 07 Aug 2025 18:26:29 +0100, > Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> On Thu, 07 Aug 2025 01:08:26 PDT (-0700), Marc Zyngier wrote: >> > On Wed, 06 Aug 2025 20:48:13 +0100, >> > Palmer Dabbelt <palmer@dabbelt.com> wrote: >> >> >> >> From: Palmer Dabbelt <palmerdabbelt@meta.com> >> >> >> >> We've found that some of our workloads run faster when some of these >> >> fields are set to non-default values on some of the systems we're trying >> >> to run those workloads on. This allows us to set those values via >> >> sysfs, so we can do workload/system-specific tuning. >> >> >> >> Signed-off-by: Palmer Dabbelt <palmerdabbelt@meta.com> >> >> --- >> >> I've only really smoke tested this, but I figured I'd send it along because I'm >> >> not sure if this is even a sane thing to be doing -- these extended control >> >> registers have some wacky stuff in them, so maybe they're not exposed to >> >> userspace on purpose. IIUC firmware can gate these writes, though, so it >> >> should be possible for vendors to forbid the really scary values. >> > >> > That's really wrong. >> > >> > For a start, these encodings fall into the IMPDEF range. They won't >> > exist on non-ARM implementations. >> >> OK, and that's because it says "Provides additional IMPLEMENTATION >> DEFINED configuration and control options for the processor." at the >> start of the manual page? Sorry, I'm kind of new to trying to read >> the Arm specs -- I thought just the meaning of the values was >> changing, but I probably just didn't read enough specs. > > The architecture defines a range described in D24.2.162 (in the L.b > revision of the ARM ARM) which is reserved for IMPDEF purposes. > > What these registers do is not defined, and there is no standard > across implementations. This really is for chicken bits and other fun > stuff. Most of them will simply generate an UNDEF, because they don't > pass the decode stage. But for all we know, there is a bit in there > that turns NOP into the HCF instruction -- or better. > > So exposing any of that stuff for any given CPU is dangerous. And > exposing any of it on *all* CPUs is a bit like swallowing a powered > chainsaw (don't). OK, makes sense. >> > Next, this will catch fire as a guest, either because the hypervisor >> > will simply refuse to entertain letting it access registers that have >> > no definition, or because the VM has been migrated from one >> > implementation to another, and you have no idea this is doing on the >> > new target. >> >> Ya, makes sense. >> >> >> That said, we do see some performance improvements here on real workloads. So >> >> we're hoping to roll some of this tuning work out more widely, but we also >> >> don't want to adopt some internal interface. Thus it'd make our lives easier >> >> if we could twiddle these bits in a standard way. >> > >> > Honestly, this is the sort of bring-up stuff that is better kept in >> > your private sandbox, and doesn't really help in general. >> >> So we're not doing bringup (or at least not doing what I'd call >> bringup) here, the theory is that we just get better performance on >> different workloads with different tunings. That's all still a little >> early, but if the data holds we'd want to be setting these based on >> what workload is running (ie, not just some static tuning for >> everything). > > In general, none of that crap is safe to turn on and off at random > times. You really want to talk to your implementer to find out. And if > it is, the firmware is probably the place to handle that. Ya, if it's generally not expected to be sane to runtime modify these then it seems sane to just hide them behind a firmare interface. Then it's really up to the firmware to proactively expose the bits that are useful, and it's inherently vendor-specific. >> That said, part of the reason I just sent this as-is is because I was >> sort of expecting the answer to be "no" here. No big deal if that's >> the case, we can figure out some other way to solve the problem. >> Happy to throw some time in to making some more generic flavor of >> this, though... > > I have no idea how we can achieve that, given that there is no > architected definition for any of these registers. I'd basically have some interface for getting/setting the registers that the kernel exposes (gated behind whatever tests we'd need to make sure the registers are accessible), and then some userspace program that deal with the implementation-specific behavior. It'd probably just devolve into some database of known implementations with what the bits do, with some attempt at mapping them to generic behavior -- though even that's kind of clunky, as something like "this tunes some prefetcher to smell different" doesn't really help a ton. If it's a firmware-gated thing, though, then it's probably going to just end up as some vendor-specific firmware widget that we go fumble around in ACPI to mangle... > > M.
On Thu, Aug 07, 2025 at 10:26:29AM -0700, Palmer Dabbelt wrote: > On Thu, 07 Aug 2025 01:08:26 PDT (-0700), Marc Zyngier wrote: > > For a start, these encodings fall into the IMPDEF range. They won't > > exist on non-ARM implementations. > OK, and that's because it says "Provides additional IMPLEMENTATION DEFINED > configuration and control options for the processor." at the start of the > manual page? Sorry, I'm kind of new to trying to read the Arm specs -- I > thought just the meaning of the values was changing, but I probably just > didn't read enough specs. Yes, pretty much and also because this is in a range of registers reserved for use by the specific implementation. See DDI0487 (the ARM) version L.a sections D23.3.2 and D24.2.162 which specify the IMPLEMENTATION DEFINED system register range, and the definition of the term IMPLEMENTATION DEFINED in the glossary of DDI0487. If you see a small upper case term like that in a spec related to the architecture then it'll have a specific architectural defintion. > That said, part of the reason I just sent this as-is is because I was sort > of expecting the answer to be "no" here. No big deal if that's the case, we > can figure out some other way to solve the problem. Happy to throw some > time in to making some more generic flavor of this, though... It's something that does come up, working out a way to make use of the tuning in the IMPDEF registers in a way that generic software can safely and sensibly make use of is rather non-trivial though.
On Thu, 07 Aug 2025 10:57:26 PDT (-0700), broonie@kernel.org wrote: > On Thu, Aug 07, 2025 at 10:26:29AM -0700, Palmer Dabbelt wrote: >> On Thu, 07 Aug 2025 01:08:26 PDT (-0700), Marc Zyngier wrote: > >> > For a start, these encodings fall into the IMPDEF range. They won't >> > exist on non-ARM implementations. > >> OK, and that's because it says "Provides additional IMPLEMENTATION DEFINED >> configuration and control options for the processor." at the start of the >> manual page? Sorry, I'm kind of new to trying to read the Arm specs -- I >> thought just the meaning of the values was changing, but I probably just >> didn't read enough specs. > > Yes, pretty much and also because this is in a range of registers > reserved for use by the specific implementation. See DDI0487 (the ARM) > version L.a sections D23.3.2 and D24.2.162 which specify the > IMPLEMENTATION DEFINED system register range, and the definition of the > term IMPLEMENTATION DEFINED in the glossary of DDI0487. If you see a > small upper case term like that in a spec related to the architecture > then it'll have a specific architectural defintion. Thanks! >> That said, part of the reason I just sent this as-is is because I was sort >> of expecting the answer to be "no" here. No big deal if that's the case, we >> can figure out some other way to solve the problem. Happy to throw some >> time in to making some more generic flavor of this, though... > > It's something that does come up, working out a way to make use of the > tuning in the IMPDEF registers in a way that generic software can safely > and sensibly make use of is rather non-trivial though. Ya, I'd expect it to be a bit of a time sink -- even if something like this patch had gone in we would have had a pile of ugliness in userspace. I think there's no way around some amount of ugliness when it comes to implemnetation-specific, though. That said, if our nubers do pan out here then we'll likely need to do something. So if a more generic solution is interesting to people then I'm happy to throw some time at it, shouldn't be too hard to justify on my end.
© 2016 - 2025 Red Hat, Inc.