[v13] x86,fs/resctrl telemetry monitoring

[PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Tony Luck 3 months, 1 week ago

resctrl assumes that all monitor events can be displayed as unsigned
decimal integers.

Hardware architecture counters may provide some telemetry events with
greater precision where the event is not a simple count, but is a
measurement of some sort (e.g. Joules for energy consumed).

Add a new argument to resctrl_enable_mon_event() for architecture code
to inform the file system that the value for a counter is a fixed-point
value with a specific number of binary places.
Only allow architecture to use floating point format on events that the
file system has marked with mon_evt::is_floating_point.

Display fixed point values with values rounded to an appropriate number
of decimal places for the precision of the number of binary places
provided. Add one extra decimal place for every three additional binary
places, except for low precision binary values where exact representation
is possible:

  1 binary place is 0.0 or 0.5			=> 1 decimal place
  2 binary places is 0.0, 0.25, 0.5, 0.75	=> 2 decimal places
  3 binary places is 0.0, 0.125, etc.		=> 3 decimal places

Signed-off-by: Tony Luck <tony.luck@intel.com>
Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
---
 include/linux/resctrl.h            |  3 +-
 fs/resctrl/internal.h              |  8 +++
 arch/x86/kernel/cpu/resctrl/core.c |  6 +--
 fs/resctrl/ctrlmondata.c           | 84 ++++++++++++++++++++++++++++++
 fs/resctrl/monitor.c               | 10 +++-
 5 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 702205505dc9..a7e5a546152d 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -409,7 +409,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
 u32 resctrl_arch_system_num_rmid_idx(void);
 int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
 
-void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu);
+void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
+			      unsigned int binary_bits);
 
 bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid);
 
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index 40b76eaa33d0..f5189b6771a0 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
  *			Only valid if @evtid is an MBM event.
  * @configurable:	true if the event is configurable
  * @any_cpu:		true if the event can be read from any CPU
+ * @is_floating_point:	event values are displayed in floating point format
+ * @binary_bits:	number of fixed-point binary bits from architecture,
+ *			only valid if @is_floating_point is true
  * @enabled:		true if the event is enabled
  */
 struct mon_evt {
@@ -71,6 +74,8 @@ struct mon_evt {
 	u32			evt_cfg;
 	bool			configurable;
 	bool			any_cpu;
+	bool			is_floating_point;
+	unsigned int		binary_bits;
 	bool			enabled;
 };
 
@@ -79,6 +84,9 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS];
 #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT];	\
 				      mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++)
 
+/* Limit for mon_evt::binary_bits */
+#define MAX_BINARY_BITS	27
+
 /**
  * struct mon_data - Monitoring details for each event file.
  * @list:            Member of the global @mon_data_kn_priv_list list.
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 78ad493dcc01..c435319552be 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void)
 	bool ret = false;
 
 	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
-		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false);
+		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
-		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false);
+		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
-		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false);
+		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0);
 		ret = true;
 	}
 	if (rdt_cpu_has(X86_FEATURE_ABMC))
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index 883be6f0810f..290a959776de 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -17,6 +17,7 @@
 
 #include <linux/cpu.h>
 #include <linux/kernfs.h>
+#include <linux/math.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/tick.h>
@@ -597,6 +598,87 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 		resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx);
 }
 
+/*
+ * Decimal place precision to use for each number of fixed-point
+ * binary bits.
+ */
+static unsigned int decplaces[MAX_BINARY_BITS + 1] = {
+	[1]  =  1,
+	[2]  =  2,
+	[3]  =  3,
+	[4]  =  3,
+	[5]  =  3,
+	[6]  =  3,
+	[7]  =  3,
+	[8]  =  3,
+	[9]  =  3,
+	[10] =  4,
+	[11] =  4,
+	[12] =  4,
+	[13] =  5,
+	[14] =  5,
+	[15] =  5,
+	[16] =  6,
+	[17] =  6,
+	[18] =  6,
+	[19] =  7,
+	[20] =  7,
+	[21] =  7,
+	[22] =  8,
+	[23] =  8,
+	[24] =  8,
+	[25] =  9,
+	[26] =  9,
+	[27] =  9
+};
+
+static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
+{
+	unsigned long long frac;
+	char buf[10];
+
+	if (!binary_bits) {
+		seq_printf(m, "%llu.0\n", val);
+		return;
+	}
+
+	/* Mask off the integer part of the fixed-point value. */
+	frac = val & GENMASK_ULL(binary_bits, 0);
+
+	/*
+	 * Multiply by 10^{desired decimal places}. The integer part of
+	 * the fixed point value is now almost what is needed.
+	 */
+	frac *= int_pow(10ull, decplaces[binary_bits]);
+
+	/*
+	 * Round to nearest by adding a value that would be a "1" in the
+	 * binary_bits + 1 place.  Integer part of fixed point value is
+	 * now the needed value.
+	 */
+	frac += 1ull << (binary_bits - 1);
+
+	/*
+	 * Extract the integer part of the value. This is the decimal
+	 * representation of the original fixed-point fractional value.
+	 */
+	frac >>= binary_bits;
+
+	/*
+	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
+	 * representation will fit into chosen number of decimal places.
+	 */
+	snprintf(buf, sizeof(buf), "%0*llu", decplaces[binary_bits], frac);
+
+	/* Trim trailing zeroes */
+	for (int i = decplaces[binary_bits] - 1; i > 0; i--) {
+		if (buf[i] != '0')
+			break;
+		buf[i] = '\0';
+	}
+	seq_printf(m, "%llu.%s\n", val >> binary_bits, buf);
+}
+
 int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 {
 	struct kernfs_open_file *of = m->private;
@@ -674,6 +756,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 		seq_puts(m, "Unavailable\n");
 	else if (rr.err == -ENOENT)
 		seq_puts(m, "Unassigned\n");
+	else if (evt->is_floating_point)
+		print_event_value(m, evt->binary_bits, rr.val);
 	else
 		seq_printf(m, "%llu\n", rr.val);
 
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 6eab98b47816..7d1b65316bc8 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -975,16 +975,22 @@ struct mon_evt mon_event_all[QOS_NUM_EVENTS] = {
 	},
 };
 
-void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu)
+void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu, unsigned int binary_bits)
 {
-	if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS))
+	if (WARN_ON_ONCE(eventid < QOS_FIRST_EVENT || eventid >= QOS_NUM_EVENTS ||
+			 binary_bits > MAX_BINARY_BITS))
 		return;
 	if (mon_event_all[eventid].enabled) {
 		pr_warn("Duplicate enable for event %d\n", eventid);
 		return;
 	}
+	if (binary_bits && !mon_event_all[eventid].is_floating_point) {
+		pr_warn("Event %d may not be floating point\n", eventid);
+		return;
+	}
 
 	mon_event_all[eventid].any_cpu = any_cpu;
+	mon_event_all[eventid].binary_bits = binary_bits;
 	mon_event_all[eventid].enabled = true;
 }
 
-- 
2.51.0

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by David Laight 2 months, 4 weeks ago

On Wed, 29 Oct 2025 09:20:55 -0700
Tony Luck <tony.luck@intel.com> wrote:

> resctrl assumes that all monitor events can be displayed as unsigned
> decimal integers.
> 
> Hardware architecture counters may provide some telemetry events with
> greater precision where the event is not a simple count, but is a
> measurement of some sort (e.g. Joules for energy consumed).
> 
> Add a new argument to resctrl_enable_mon_event() for architecture code
> to inform the file system that the value for a counter is a fixed-point
> value with a specific number of binary places.
> Only allow architecture to use floating point format on events that the
> file system has marked with mon_evt::is_floating_point.
> 
> Display fixed point values with values rounded to an appropriate number
> of decimal places for the precision of the number of binary places
> provided. Add one extra decimal place for every three additional binary
> places, except for low precision binary values where exact representation
> is possible:
> 
>   1 binary place is 0.0 or 0.5			=> 1 decimal place
>   2 binary places is 0.0, 0.25, 0.5, 0.75	=> 2 decimal places
>   3 binary places is 0.0, 0.125, etc.		=> 3 decimal places
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
> ---
>  include/linux/resctrl.h            |  3 +-
>  fs/resctrl/internal.h              |  8 +++
>  arch/x86/kernel/cpu/resctrl/core.c |  6 +--
>  fs/resctrl/ctrlmondata.c           | 84 ++++++++++++++++++++++++++++++
>  fs/resctrl/monitor.c               | 10 +++-
>  5 files changed, 105 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 702205505dc9..a7e5a546152d 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -409,7 +409,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
>  u32 resctrl_arch_system_num_rmid_idx(void);
>  int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
>  
> -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu);
> +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
> +			      unsigned int binary_bits);
>  
>  bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid);
>  
> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> index 40b76eaa33d0..f5189b6771a0 100644
> --- a/fs/resctrl/internal.h
> +++ b/fs/resctrl/internal.h
> @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
>   *			Only valid if @evtid is an MBM event.
>   * @configurable:	true if the event is configurable
>   * @any_cpu:		true if the event can be read from any CPU
> + * @is_floating_point:	event values are displayed in floating point format
> + * @binary_bits:	number of fixed-point binary bits from architecture,
> + *			only valid if @is_floating_point is true
>   * @enabled:		true if the event is enabled
>   */
>  struct mon_evt {
> @@ -71,6 +74,8 @@ struct mon_evt {
>  	u32			evt_cfg;
>  	bool			configurable;
>  	bool			any_cpu;
> +	bool			is_floating_point;
> +	unsigned int		binary_bits;
>  	bool			enabled;
>  };

Nit: You've added 4 bytes of padding.

	David

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Dave Martin 3 months ago

Hi Tony,

A few drive-by nits from me -- apologies, I hadn't looked at this in a
while.

On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:
> resctrl assumes that all monitor events can be displayed as unsigned
> decimal integers.
> 
> Hardware architecture counters may provide some telemetry events with
> greater precision where the event is not a simple count, but is a
> measurement of some sort (e.g. Joules for energy consumed).
> 
> Add a new argument to resctrl_enable_mon_event() for architecture code
> to inform the file system that the value for a counter is a fixed-point
> value with a specific number of binary places.
> Only allow architecture to use floating point format on events that the
> file system has marked with mon_evt::is_floating_point.
> 
> Display fixed point values with values rounded to an appropriate number
> of decimal places for the precision of the number of binary places
> provided. Add one extra decimal place for every three additional binary

(Is this just informal wording?  If not, it's wrong...)

> places, except for low precision binary values where exact representation
> is possible:
> 
>   1 binary place is 0.0 or 0.5			=> 1 decimal place
>   2 binary places is 0.0, 0.25, 0.5, 0.75	=> 2 decimal places
>   3 binary places is 0.0, 0.125, etc.		=> 3 decimal places

What's the rationale for this special treatment?  I don't see any
previous discussion (apologies if I missed it).

> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
> ---
>  include/linux/resctrl.h            |  3 +-
>  fs/resctrl/internal.h              |  8 +++
>  arch/x86/kernel/cpu/resctrl/core.c |  6 +--
>  fs/resctrl/ctrlmondata.c           | 84 ++++++++++++++++++++++++++++++
>  fs/resctrl/monitor.c               | 10 +++-
>  5 files changed, 105 insertions(+), 6 deletions(-)
> 
> diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> index 702205505dc9..a7e5a546152d 100644
> --- a/include/linux/resctrl.h
> +++ b/include/linux/resctrl.h
> @@ -409,7 +409,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
>  u32 resctrl_arch_system_num_rmid_idx(void);
>  int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
>  
> -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu);
> +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
> +			      unsigned int binary_bits);
>  
>  bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid);
>  
> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> index 40b76eaa33d0..f5189b6771a0 100644
> --- a/fs/resctrl/internal.h
> +++ b/fs/resctrl/internal.h
> @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
>   *			Only valid if @evtid is an MBM event.
>   * @configurable:	true if the event is configurable
>   * @any_cpu:		true if the event can be read from any CPU
> + * @is_floating_point:	event values are displayed in floating point format

Nit: Maybe rebrand this as is_fixed_point, or is_fractional, or similar?

The print syntax is just a decimal fraction, and the hardware
representation is fixed-point.  Nothing floats.

> + * @binary_bits:	number of fixed-point binary bits from architecture,
> + *			only valid if @is_floating_point is true
>   * @enabled:		true if the event is enabled
>   */
>  struct mon_evt {
> @@ -71,6 +74,8 @@ struct mon_evt {
>  	u32			evt_cfg;
>  	bool			configurable;
>  	bool			any_cpu;
> +	bool			is_floating_point;
> +	unsigned int		binary_bits;
>  	bool			enabled;
>  };
>  
> @@ -79,6 +84,9 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS];
>  #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT];	\
>  				      mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++)
>  
> +/* Limit for mon_evt::binary_bits */
> +#define MAX_BINARY_BITS	27
> +

Could this be up to 30?

(The formatting code relies on the product of the maximum fraction
value with 10^decplaces[] not exceeding a u64, so I think 30 bits
fits?  But this only has to be as large as the largest value required
by some supported piece of hardware... I didn't go check on that.)

>  /**
>   * struct mon_data - Monitoring details for each event file.
>   * @list:            Member of the global @mon_data_kn_priv_list list.
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index 78ad493dcc01..c435319552be 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void)
>  	bool ret = false;
>  
>  	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
> -		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false);
> +		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0);
>  		ret = true;
>  	}
>  	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
> -		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false);
> +		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0);
>  		ret = true;
>  	}
>  	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
> -		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false);
> +		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0);
>  		ret = true;
>  	}
>  	if (rdt_cpu_has(X86_FEATURE_ABMC))
> diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> index 883be6f0810f..290a959776de 100644
> --- a/fs/resctrl/ctrlmondata.c
> +++ b/fs/resctrl/ctrlmondata.c
> @@ -17,6 +17,7 @@
>  
>  #include <linux/cpu.h>
>  #include <linux/kernfs.h>
> +#include <linux/math.h>
>  #include <linux/seq_file.h>
>  #include <linux/slab.h>
>  #include <linux/tick.h>
> @@ -597,6 +598,87 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
>  		resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx);
>  }
>  
> +/*
> + * Decimal place precision to use for each number of fixed-point
> + * binary bits.
> + */
> +static unsigned int decplaces[MAX_BINARY_BITS + 1] = {

^ const


Also, maybe explicitly initialise

	[0]  =	1,

here?  (See print_event_value().)
		
> +	[1]  =  1,
> +	[2]  =  2,
> +	[3]  =  3,
> +	[4]  =  3,
> +	[5]  =  3,
> +	[6]  =  3,
> +	[7]  =  3,
> +	[8]  =  3,
> +	[9]  =  3,
> +	[10] =  4,

Why these specific values?

ceil(binary_bits * log10(2)) makes sense if we want to expose all
available hardware precision with as few digits as possible.

floor(binary_bits * log10(2)) makes sense if we want to expose as many
digits as possible without advertising spurious precision.

Disregarding the special-casing for binary_bits <= 3, still neither
option quite seems to match this list.


Rounding up means that the hardware value can be reconstructed, but
only if userspace knows the value of binary_bits.  Should that be
exposed?

> +	[11] =  4,
> +	[12] =  4,
> +	[13] =  5,
> +	[14] =  5,
> +	[15] =  5,
> +	[16] =  6,
> +	[17] =  6,
> +	[18] =  6,
> +	[19] =  7,
> +	[20] =  7,
> +	[21] =  7,
> +	[22] =  8,
> +	[23] =  8,
> +	[24] =  8,
> +	[25] =  9,
> +	[26] =  9,
> +	[27] =  9

Documenting the rule for generating these may be a good idea unless we
are sure that no more entries will ever be added.

> +};
> +
> +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> +{
> +	unsigned long long frac;
> +	char buf[10];

In place of the magic number 10, how about
decplaces[MAX_BINARY_BITS] + 1 ?

(I think the compiler should accept that as an initialiser if the array
is const.)

> +
> +	if (!binary_bits) {
> +		seq_printf(m, "%llu.0\n", val);
> +		return;
> +	}

Can an initialiser for decplaces[0] reduce the special-casing for
binary_bits == 0?

> +
> +	/* Mask off the integer part of the fixed-point value. */
> +	frac = val & GENMASK_ULL(binary_bits, 0);

Should this be GENMASK_ULL(binary_bits - 1, 0)?

Should we be telling userspace the binary_bits value?  It is not
(exactly) deducible from the number of decimal places printed.

It depends on the use cases and what the code is trying to achieve, but
this does not seem to be described in detail, unless I've missed it
somewhere.

> +
> +	/*
> +	 * Multiply by 10^{desired decimal places}. The integer part of
> +	 * the fixed point value is now almost what is needed.
> +	 */
> +	frac *= int_pow(10ull, decplaces[binary_bits]);
> +
> +	/*
> +	 * Round to nearest by adding a value that would be a "1" in the
> +	 * binary_bits + 1 place.  Integer part of fixed point value is
> +	 * now the needed value.
> +	 */
> +	frac += 1ull << (binary_bits - 1);
> +
> +	/*
> +	 * Extract the integer part of the value. This is the decimal
> +	 * representation of the original fixed-point fractional value.
> +	 */
> +	frac >>= binary_bits;
> +
> +	/*
> +	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
> +	 * representation will fit into chosen number of decimal places.
> +	 */
> +	snprintf(buf, sizeof(buf), "%0*llu", decplaces[binary_bits], frac);
> +
> +	/* Trim trailing zeroes */

Why?

Would it be better to present the values with consistent precision?

There's no reason why a telemetry counter should settle for any length
of time at a tidy value, so the precision represented by the trailing
zeros is always significant.

The hardware precision doesn't go up and down depending on the precise
value of the counter...

> +	for (int i = decplaces[binary_bits] - 1; i > 0; i--) {
> +		if (buf[i] != '0')
> +			break;
> +		buf[i] = '\0';
> +	}
> +	seq_printf(m, "%llu.%s\n", val >> binary_bits, buf);
> +}
> +

[...]

Cheers
---Dave

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Luck, Tony 2 months, 4 weeks ago

On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
> > +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> > +{
> > +	unsigned long long frac;
> > +	char buf[10];
> 
> In place of the magic number 10, how about
> decplaces[MAX_BINARY_BITS] + 1 ?
> 
> (I think the compiler should accept that as an initialiser if the array
> is const.)

The compiler (gcc 15.2.1) accepts without any warnings. But generates
different code.

sparse complains:
fs/resctrl/ctrlmondata.c:640:45: warning: Variable length array is used.

I may change the hard coded constant to 21 (guaranteed to be big enough
for a "long long" plus terminating NUL byte.)

-Tony

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Dave Martin 2 months, 4 weeks ago

Hi Tony,

On Mon, Nov 10, 2025 at 08:52:52AM -0800, Luck, Tony wrote:
> On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
> > > +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> > > +{
> > > +	unsigned long long frac;
> > > +	char buf[10];
> > 
> > In place of the magic number 10, how about
> > decplaces[MAX_BINARY_BITS] + 1 ?
> > 
> > (I think the compiler should accept that as an initialiser if the array
> > is const.)
> 
> The compiler (gcc 15.2.1) accepts without any warnings. But generates
> different code.
> 
> sparse complains:
> fs/resctrl/ctrlmondata.c:640:45: warning: Variable length array is used.

Hmmm.  Shame.

(Of course, this is only a warning.  sparse may not know how to
determine that the resulting buffer is limited to a sane size, but
looking at the code makes it pretty obvious.  Perhaps best avoided,
though.)

> I may change the hard coded constant to 21 (guaranteed to be big enough
> for a "long long" plus terminating NUL byte.)

I guess.  We may be able to sidestep this, though (see my other reply
about getting rid of buf[] altogether.)

Cheers
---Dave

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Luck, Tony 3 months ago

Hi Dave,

Thanks for taking time to review. You did unearth one big bug
and I'm super-grateful for that.

On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
> Hi Tony,
> 
> A few drive-by nits from me -- apologies, I hadn't looked at this in a
> while.
> 
> On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:
> > resctrl assumes that all monitor events can be displayed as unsigned
> > decimal integers.
> > 
> > Hardware architecture counters may provide some telemetry events with
> > greater precision where the event is not a simple count, but is a
> > measurement of some sort (e.g. Joules for energy consumed).
> > 
> > Add a new argument to resctrl_enable_mon_event() for architecture code
> > to inform the file system that the value for a counter is a fixed-point
> > value with a specific number of binary places.
> > Only allow architecture to use floating point format on events that the
> > file system has marked with mon_evt::is_floating_point.
> > 
> > Display fixed point values with values rounded to an appropriate number
> > of decimal places for the precision of the number of binary places
> > provided. Add one extra decimal place for every three additional binary
> 
> (Is this just informal wording?  If not, it's wrong...)

Informal. It isn't far off from the table. Once out of the small numbers
the number of decimal places does increment after each group of three.

> 
> > places, except for low precision binary values where exact representation
> > is possible:
> > 
> >   1 binary place is 0.0 or 0.5			=> 1 decimal place
> >   2 binary places is 0.0, 0.25, 0.5, 0.75	=> 2 decimal places
> >   3 binary places is 0.0, 0.125, etc.		=> 3 decimal places
> 
> What's the rationale for this special treatment?  I don't see any
> previous discussion (apologies if I missed it).

The strict log10(2) calculations below throw away some precision from
these cases. I thought that was bad.

> > 
> > Signed-off-by: Tony Luck <tony.luck@intel.com>
> > Reviewed-by: Reinette Chatre <reinette.chatre@intel.com>
> > ---
> >  include/linux/resctrl.h            |  3 +-
> >  fs/resctrl/internal.h              |  8 +++
> >  arch/x86/kernel/cpu/resctrl/core.c |  6 +--
> >  fs/resctrl/ctrlmondata.c           | 84 ++++++++++++++++++++++++++++++
> >  fs/resctrl/monitor.c               | 10 +++-
> >  5 files changed, 105 insertions(+), 6 deletions(-)
> > 
> > diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
> > index 702205505dc9..a7e5a546152d 100644
> > --- a/include/linux/resctrl.h
> > +++ b/include/linux/resctrl.h
> > @@ -409,7 +409,8 @@ u32 resctrl_arch_get_num_closid(struct rdt_resource *r);
> >  u32 resctrl_arch_system_num_rmid_idx(void);
> >  int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid);
> >  
> > -void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu);
> > +void resctrl_enable_mon_event(enum resctrl_event_id eventid, bool any_cpu,
> > +			      unsigned int binary_bits);
> >  
> >  bool resctrl_is_mon_event_enabled(enum resctrl_event_id eventid);
> >  
> > diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> > index 40b76eaa33d0..f5189b6771a0 100644
> > --- a/fs/resctrl/internal.h
> > +++ b/fs/resctrl/internal.h
> > @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> >   *			Only valid if @evtid is an MBM event.
> >   * @configurable:	true if the event is configurable
> >   * @any_cpu:		true if the event can be read from any CPU
> > + * @is_floating_point:	event values are displayed in floating point format
> 
> Nit: Maybe rebrand this as is_fixed_point, or is_fractional, or similar?
> 
> The print syntax is just a decimal fraction, and the hardware
> representation is fixed-point.  Nothing floats.

You are right. I can change from is_floating_point to is_fixed_point.

> > + * @binary_bits:	number of fixed-point binary bits from architecture,
> > + *			only valid if @is_floating_point is true
> >   * @enabled:		true if the event is enabled
> >   */
> >  struct mon_evt {
> > @@ -71,6 +74,8 @@ struct mon_evt {
> >  	u32			evt_cfg;
> >  	bool			configurable;
> >  	bool			any_cpu;
> > +	bool			is_floating_point;
> > +	unsigned int		binary_bits;
> >  	bool			enabled;
> >  };
> >  
> > @@ -79,6 +84,9 @@ extern struct mon_evt mon_event_all[QOS_NUM_EVENTS];
> >  #define for_each_mon_event(mevt) for (mevt = &mon_event_all[QOS_FIRST_EVENT];	\
> >  				      mevt < &mon_event_all[QOS_NUM_EVENTS]; mevt++)
> >  
> > +/* Limit for mon_evt::binary_bits */
> > +#define MAX_BINARY_BITS	27
> > +
> 
> Could this be up to 30?

Yes.

> (The formatting code relies on the the product of the maximum fraction
> value with 10^decplaces[] not exceeding a u64, so I think 30 bits
> fits?  But this only has to be as large as the largest value required
> by some supported piece of hardware... I didn't go check on that.)

I only have one data point. The Intel telemetry events are using 18
binary places.

> >  /**
> >   * struct mon_data - Monitoring details for each event file.
> >   * @list:            Member of the global @mon_data_kn_priv_list list.
> > diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> > index 78ad493dcc01..c435319552be 100644
> > --- a/arch/x86/kernel/cpu/resctrl/core.c
> > +++ b/arch/x86/kernel/cpu/resctrl/core.c
> > @@ -893,15 +893,15 @@ static __init bool get_rdt_mon_resources(void)
> >  	bool ret = false;
> >  
> >  	if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) {
> > -		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false);
> > +		resctrl_enable_mon_event(QOS_L3_OCCUP_EVENT_ID, false, 0);
> >  		ret = true;
> >  	}
> >  	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) {
> > -		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false);
> > +		resctrl_enable_mon_event(QOS_L3_MBM_TOTAL_EVENT_ID, false, 0);
> >  		ret = true;
> >  	}
> >  	if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) {
> > -		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false);
> > +		resctrl_enable_mon_event(QOS_L3_MBM_LOCAL_EVENT_ID, false, 0);
> >  		ret = true;
> >  	}
> >  	if (rdt_cpu_has(X86_FEATURE_ABMC))
> > diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> > index 883be6f0810f..290a959776de 100644
> > --- a/fs/resctrl/ctrlmondata.c
> > +++ b/fs/resctrl/ctrlmondata.c
> > @@ -17,6 +17,7 @@
> >  
> >  #include <linux/cpu.h>
> >  #include <linux/kernfs.h>
> > +#include <linux/math.h>
> >  #include <linux/seq_file.h>
> >  #include <linux/slab.h>
> >  #include <linux/tick.h>
> > @@ -597,6 +598,87 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> >  		resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx);
> >  }
> >  
> > +/*
> > + * Decimal place precision to use for each number of fixed-point
> > + * binary bits.
> > + */
> > +static unsigned int decplaces[MAX_BINARY_BITS + 1] = {
> 
> ^ const

OK

> 
> Also, maybe explicitly initialise
> 
> 	[0]  =	1,

OK (though this might only occur if there is an event that resctrl says
must be fixed point, with a h/w implementation that provides a simple
integer).

> here?  (See print_event_value().)
> 		
> > +	[1]  =  1,
> > +	[2]  =  2,
> > +	[3]  =  3,
> > +	[4]  =  3,
> > +	[5]  =  3,
> > +	[6]  =  3,
> > +	[7]  =  3,
> > +	[8]  =  3,
> > +	[9]  =  3,
> > +	[10] =  4,
> 
> Why these specific values?

For 1, 2, 3 binary bits you get an exact decimal representation
with 1, 2, 3 decimal places. I kept the "3" going from 4 to 9
bits because it should output at least as many places as 3 bits.

After that I started stepping every 3 extra bits.

> ceil(binary_bits * log10(2)) makes sense if we want to expose all
> available hardware precision with as few digits as possible.
> 
> floor(binary_bits * log10(2)) makes sense if we want expose as many
> digits as possible without advertising spurious precision.
> 
> Disregarding the special-casing for binary_bits <= 3, still neither
> option quite seems to match this list.

Side-by-side comparison:

#include <stdio.h>
#include <math.h>

/*
 * Hand-picked number of decimal places to print for each count of
 * fixed-point binary bits.  The array index is the number of binary
 * bits (0..27).  The table is only ever read, so it is const.
 */
static const unsigned int tony[] = {
	/* No fractional bits. */
	[0]  =  0,
	/* 1..3 bits have an exact decimal representation in 1..3 places. */
	[1]  =  1, [2]  =  2, [3]  =  3,
	/* 4..9 bits: never print fewer places than the 3-bit case. */
	[4]  =  3, [5]  =  3, [6]  =  3, [7]  =  3, [8]  =  3, [9]  =  3,
	/* From 10 bits up: one more decimal place per three binary bits. */
	[10] =  4, [11] =  4, [12] =  4,
	[13] =  5, [14] =  5, [15] =  5,
	[16] =  6, [17] =  6, [18] =  6,
	[19] =  7, [20] =  7, [21] =  7,
	[22] =  8, [23] =  8, [24] =  8,
	[25] =  9, [26] =  9, [27] =  9
};

/*
 * Print, for 0..27 binary bits, the ceil() and floor() of
 * bits * log10(2) next to the corresponding entry of tony[].
 */
int main(void)
{
	const double digits_per_bit = log10(2.0);
	int bits = 0;

	printf("bits:\tceil\tfloor\ttony\n");
	while (bits < 28) {
		/* Decimal digits equivalent to this many binary digits. */
		double digits = bits * digits_per_bit;

		printf("%d:\t%d\t%d\t%d\n",
		       bits, (int)ceil(digits), (int)floor(digits), tony[bits]);
		bits++;
	}

	return 0;
}

bits:	ceil	floor	tony
0:	0	0	0
1:	1	0	1
2:	1	0	2
3:	1	0	3
4:	2	1	3
5:	2	1	3
6:	2	1	3
7:	3	2	3
8:	3	2	3
9:	3	2	3
10:	4	3	4
11:	4	3	4
12:	4	3	4
13:	4	3	5
14:	5	4	5
15:	5	4	5
16:	5	4	6
17:	6	5	6
18:	6	5	6
19:	6	5	7
20:	7	6	7
21:	7	6	7
22:	7	6	8
23:	7	6	8
24:	8	7	8
25:	8	7	9
26:	8	7	9
27:	9	8	9

I'm not a fan of the "floor" option. Looks like it loses precision.  Terrible for
1-3 binary bits. Also not what I'd like for the bits==18 case that I currently
care about.

"ceil" is good for bits > 6. Almost matches my numbers (except I jump
to one more decimal place one binary bit earlier).

What do you think of me swapping out the values from 7 upwards for the
ceil values and documenting that 0..6 are hand-picked, but 7 and up are
ceil(binary_bits * log10_2)?

> 
> Rounding up means that the hardware value can be reconstructed, but
> only if userspace knows the value of binary_bits.  Should that be
> exposed?

I'm not sure I see when users would need to reconstruct the h/w value.
General use case for these resctrl events is: read1, sleepN, read2 &
compute rate = (read2 - read1) / N

In the case of the Intel telemetry events there is some jitter around
the timing of the reads (since events may only be updated every 2ms).
So the error bars get big if "N" is small. Which all leads me to believe
that a "good enough" approach to representing the event values will
be close enough for all use cases.
> 
> > +	[11] =  4,
> > +	[12] =  4,
> > +	[13] =  5,
> > +	[14] =  5,
> > +	[15] =  5,
> > +	[16] =  6,
> > +	[17] =  6,
> > +	[18] =  6,
> > +	[19] =  7,
> > +	[20] =  7,
> > +	[21] =  7,
> > +	[22] =  8,
> > +	[23] =  8,
> > +	[24] =  8,
> > +	[25] =  9,
> > +	[26] =  9,
> > +	[27] =  9
> 
> Documenting the rule for generating these may be a good idea unless we
> are sure that no more entries will never be added.

Above proposal - use the ceil function for bits >= 7.

> > +};
> > +
> > +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> > +{
> > +	unsigned long long frac;
> > +	char buf[10];
> 
> In place of the magic number 10, how about
> decplaces[MAX_BINARY_BITS] + 1 ?
> 
> (I think the compiler should accept that as an initialiser if the array
> is const.)

If the compiler doesn't barf, then OK.

> > +
> > +	if (!binary_bits) {
> > +		seq_printf(m, "%llu.0\n", val);
> > +		return;
> > +	}
> 
> Can an initialiser for decplaces[0] reduce the special-casing for
> binary_bits == 0?

I'll check and see.

> > +
> > +	/* Mask off the integer part of the fixed-point value. */
> > +	frac = val & GENMASK_ULL(binary_bits, 0);
> 
> Should this be GENMASK_ULL(binary_bits - 1, 0)?

Oops. I think you are right.

> Should we be telling userspace the binary_bits value?  It is not
> (exactly) deducible from the number of decimal places printed.

I could add another info file for fixed_point events to display this.
But I'm not convinced that it would result in users doing anything
different.

Assume you just did the "read1, sleepN, read2" and got values of
235.617542 and 338.964815, tell me how things would be different
if an info file said that binary_bits was 17 vs. 19?

> It depends on the use cases and what the code is trying to achieve, but
> this does not seem to be described in detail, unless I've missed it
> somewhere.
> 
> > +
> > +	/*
> > +	 * Multiply by 10^{desired decimal places}. The integer part of
> > +	 * the fixed point value is now almost what is needed.
> > +	 */
> > +	frac *= int_pow(10ull, decplaces[binary_bits]);
> > +
> > +	/*
> > +	 * Round to nearest by adding a value that would be a "1" in the
> > +	 * binary_bits + 1 place.  Integer part of fixed point value is
> > +	 * now the needed value.
> > +	 */
> > +	frac += 1ull << (binary_bits - 1);
> > +
> > +	/*
> > +	 * Extract the integer part of the value. This is the decimal
> > +	 * representation of the original fixed-point fractional value.
> > +	 */
> > +	frac >>= binary_bits;
> > +
> > +	/*
> > +	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
> > +	 * representation will fit into chosen number of decimal places.
> > +	 */
> > +	snprintf(buf, sizeof(buf), "%0*llu", decplaces[binary_bits], frac);
> > +
> > +	/* Trim trailing zeroes */
> 
> Why?

It felt good. I'm not wedded to this. Maybe saving a few cycles of
kernel CPU time by dropping this would be good.

> Would it be better to present the values with consistent precision?

Humans might notice the difference. Apps reading the file aren't going
to care.

> There's no reason why a telemetry counter should settle for any length
> of time at a tidy value, so the precision represented by the trailing
> zeros is always significant.

But x1 = atof("1.5") and x2 = atof("1.500000") ... can the subsequent
use of x1 tell that there was less precision than x2?
> 
> The hardware precision doesn't go up and down depending on the precise
> value of the counter...
> 
> > +	for (int i = decplaces[binary_bits] - 1; i > 0; i--) {
> > +		if (buf[i] != '0')
> > +			break;
> > +		buf[i] = '\0';
> > +	}
> > +	seq_printf(m, "%llu.%s\n", val >> binary_bits, buf);
> > +}
> > +
> 
> [...]
> 
> Cheers
> ---Dave

-Tony

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Dave Martin 2 months, 4 weeks ago

Hi Tony,

On Wed, Nov 05, 2025 at 03:31:07PM -0800, Luck, Tony wrote:
> Hi Dave,
> 
> Thanks for taking time to review. You did unearth one big bug
> and I'm super-grateful for that.
> 
> On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
> > Hi Tony,
> > 
> > A few drive-by nits from me -- apologies, I hadn't looked at this in a
> > while.
> > 
> > On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:
> > > resctrl assumes that all monitor events can be displayed as unsigned
> > > decimal integers.
> > > 
> > > Hardware architecture counters may provide some telemetry events with
> > > greater precision where the event is not a simple count, but is a
> > > measurement of some sort (e.g. Joules for energy consumed).
> > > 
> > > Add a new argument to resctrl_enable_mon_event() for architecture code
> > > to inform the file system that the value for a counter is a fixed-point
> > > value with a specific number of binary places.
> > > Only allow architecture to use floating point format on events that the
> > > file system has marked with mon_evt::is_floating_point.
> > > 
> > > Display fixed point values with values rounded to an appropriate number
> > > of decimal places for the precision of the number of binary places
> > > provided. Add one extra decimal place for every three additional binary
> > 
> > (Is this just informal wording?  If not, it's wrong...)
> 
> Informal. It isn't far off from the table. Once out of the small numbers
> the number of decimal places does increment after each group of three.
> 
> > 
> > > places, except for low precision binary values where exact representation
> > > is possible:
> > > 
> > >   1 binary place is 0.0 or 0.5			=> 1 decimal place
> > >   2 binary places is 0.0, 0.25, 0.5, 0.75	=> 2 decimal places
> > >   3 binary places is 0.0, 0.125, etc.		=> 3 decimal places
> > 
> > What's the rationale for this special treatment?  I don't see any
> > previous discussion (apologies if I missed it).
> 
> The strict log10(2) calculations below throw away some precision from
> these cases. I thought that was bad.

It depends what is meant by "precision".

We can't magic up accuracy that isn't present in the counters, just by
including extra digits when formatting.

So long as we format values in such a way that every counter value is
formatted in a unique way, we are as precise as it is possible to be.

If I didn't confuse myself, ceil(binary_bits * log10(2)) is the
smallest number of fractional decimal digits that provide this
guarantee.


(This may seem pedantic -- partly, I was wondering what was so special
about implementations with fewer than 3 binary places that they needed
special treatment -- I think that still hasn't been answered?)

[...]

> > > +/* Limit for mon_evt::binary_bits */
> > > +#define MAX_BINARY_BITS	27
> > > +
> > 
> > Could this be up to 30?
> 
> Yes.
> 
> > (The formatting code relies on the the product of the maximum fraction
> > value with 10^decplaces[] not exceeding a u64, so I think 30 bits
> > fits?  But this only has to be as large as the largest value required
> > by some supported piece of hardware... I didn't go check on that.)
> 
> I only have one data point. The Intel telemetry events are using 18
> binary places.

Ah, right.

[...]

> > > diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
> > > index 883be6f0810f..290a959776de 100644
> > > --- a/fs/resctrl/ctrlmondata.c
> > > +++ b/fs/resctrl/ctrlmondata.c

[...]

> > > @@ -597,6 +598,87 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
> > >  		resctrl_arch_mon_ctx_free(r, evt->evtid, rr->arch_mon_ctx);
> > >  }
> > >  
> > > +/*
> > > + * Decimal place precision to use for each number of fixed-point
> > > + * binary bits.
> > > + */
> > > +static unsigned int decplaces[MAX_BINARY_BITS + 1] = {
> > 
> > ^ const
> 
> OK
> 
> > 
> > Also, maybe explicitly initialise
> > 
> > 	[0]  =	1,
> 
> OK (though this might only occur if there is an event that resctrl says
> must be fixed point, with a h/w implementation that provides a simple
> integer).
> 
> > here?  (See print_event_value().)
> > 		
> > > +	[1]  =  1,
> > > +	[2]  =  2,
> > > +	[3]  =  3,
> > > +	[4]  =  3,
> > > +	[5]  =  3,
> > > +	[6]  =  3,
> > > +	[7]  =  3,
> > > +	[8]  =  3,
> > > +	[9]  =  3,
> > > +	[10] =  4,
> > 
> > Why these specific values?
> 
> For 1, 2, 3 binary bits you get an exact decimal representation
> with 1, 2, 3 decimal places. I kept the "3" going from 4 to 9
> bits because it should output at least as many places as 3 bits.
> 
> After that I started stepping every 3 extra bits.
> 
> > ceil(binary_bits * log10(2)) makes sense if we want to expose all
> > available hardware precision with as few digits as possible.
> > 
> > floor(binary_bits * log10(2)) makes sense if we want expose as many
> > digits as possible without advertising spurious precision.
> > 
> > Disregarding the special-casing for binary_bits <= 3, still neither
> > option quite seems to match this list.
> 
> Side-by-side comparion:
> 
> #include <stdio.h>
> #include <math.h>
> 
> static unsigned int tony[] = {
> 	[0]  =  0, [1]  =  1, [2]  =  2, [3]  =  3, [4]  =  3, [5]  =  3,
> 	[6]  =  3, [7]  =  3, [8]  =  3, [9]  =  3, [10] =  4, [11] =  4,
> 	[12] =  4, [13] =  5, [14] =  5, [15] =  5, [16] =  6, [17] =  6,
> 	[18] =  6, [19] =  7, [20] =  7, [21] =  7, [22] =  8, [23] =  8,
> 	[24] =  8, [25] =  9, [26] =  9, [27] =  9
> };
> 
> int main(void)
> {
> 	int binary_bits;
> 	double log10_2 = log10(2.0);
> 
> 	printf("bits:\tceil\tfloor\ttony\n");
> 	for (binary_bits = 0; binary_bits < 28; binary_bits++)
> 		printf("%d:\t%d\t%d\t%d\n",
> 			binary_bits,
> 			(int)ceil(binary_bits * log10_2),
> 			(int)floor(binary_bits * log10_2),
> 			tony[binary_bits]);
> 
> 	return 0;
> }
> 
> bits:	ceil	floor	tony
> 0:	0	0	0
> 1:	1	0	1
> 2:	1	0	2
> 3:	1	0	3
> 4:	2	1	3
> 5:	2	1	3
> 6:	2	1	3
> 7:	3	2	3
> 8:	3	2	3
> 9:	3	2	3
> 10:	4	3	4
> 11:	4	3	4
> 12:	4	3	4
> 13:	4	3	5
> 14:	5	4	5
> 15:	5	4	5
> 16:	5	4	6
> 17:	6	5	6
> 18:	6	5	6
> 19:	6	5	7
> 20:	7	6	7
> 21:	7	6	7
> 22:	7	6	8
> 23:	7	6	8
> 24:	8	7	8
> 25:	8	7	9
> 26:	8	7	9
> 27:	9	8	9
> 
> I'm not a fan of the "floor" option. Looks like it loses precision.  Terrible for

Loses precision, but does not advertise bogus precision
beyond the precision in the original value.  (This is why it is not
standard to print doubles with more than 15 significant digits, even
though 17 significant digits are needed for bit-exact reproduction.)

I don't know whether this matters relative to the use cases, but it
would be nice to have some rationale.

> 1-3 binary bits. Also not what I'd like for the bits==18 case that I currently
> care about.
> 
> "ceil" is good for bits > 6. Almost matches my numbers (except I jump
> to one more decimal place one binary bit earlier).
> 
> What do you think of me swapping out the values from 7 upwards for the
> ceil values and documenting that 0..6 are hand-picked, but 7 and up are
> ceil(binary_bits * log10_2)?

If there is sound rationale for hand-picking some values then yes.

I haven't yet been convinced that there is ;)

(The 7 times table could doubtless be made to look nicer by hand-
picking some entries.  But it wouldn't be the 7 times table any more.)

> > Rounding up means that the hardware value can be reconstructed, but
> > only if userspace knows the value of binary_bits.  Should that be
> > exposed?
> 
> I'm not sure I see when users would need to reconstruct the h/w value.
> General use case for these resctrl events is: read1, sleepN, read2 &
> compute rate = (read2 - read1) / N

If userspace can reconstruct the original values, it can do this
calculation more accurately.

Since the values yielded by read1 and read2 might not differ by very
much, the relative error introduced by formatting the values in decimal
_might_ be significant.

(If we include enough decimal digits that there is no error, userspace
will see unexpectedly coarse granularity in the delta read2 - read1.
And this is only practical when the number of fractional bits is small.)


Again, I don't know whether this matters for use cases, but minimising
the number of magic numbers and arbitrary tradeoffs feels like it would
hide fewer potential surprises...

> In the case of the Intel telemetry events there is some jitter around
> the timing of the reads (since events may only be updated every 2ms).
> So the error bars get big if "N" is small. Which all leads me to believe
> that a "good enough" approach to representing the event values will
> be close enough for all use cases.

Probably (and in any case, userspace is likely to be a giant hack
rather than rigorous statistical analysis).

Still, telling the userspace the actual precision the hardware supports
feels easy to do.

(It could be added later on as an extension, though.)

> > > +	[11] =  4,
> > > +	[12] =  4,
> > > +	[13] =  5,
> > > +	[14] =  5,
> > > +	[15] =  5,
> > > +	[16] =  6,
> > > +	[17] =  6,
> > > +	[18] =  6,
> > > +	[19] =  7,
> > > +	[20] =  7,
> > > +	[21] =  7,
> > > +	[22] =  8,
> > > +	[23] =  8,
> > > +	[24] =  8,
> > > +	[25] =  9,
> > > +	[26] =  9,
> > > +	[27] =  9
> > 
> > Documenting the rule for generating these may be a good idea unless we
> > are sure that no more entries will never be added.
> 
> Above proposal - use the ceil function for bits >= 7.
> 
> > > +};
> > > +
> > > +static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> > > +{
> > > +	unsigned long long frac;
> > > +	char buf[10];
> > 
> > In place of the magic number 10, how about
> > decplaces[MAX_BINARY_BITS] + 1 ?
> > 
> > (I think the compiler should accept that as an initialiser if the array
> > is const.)
> 
> If the compiler doesn't barf, then OK.
> 
> > > +
> > > +	if (!binary_bits) {
> > > +		seq_printf(m, "%llu.0\n", val);
> > > +		return;
> > > +	}
> > 
> > Can an initialiser for decplaces[0] reduce the special-casing for
> > binary_bits == 0?
> 
> I'll check and see.
> 
> > > +
> > > +	/* Mask off the integer part of the fixed-point value. */
> > > +	frac = val & GENMASK_ULL(binary_bits, 0);
> > 
> > Should this be GENMASK_ULL(binary_bits - 1, 0)?
> 
> Oops. I think you are right.
> 
> > Should we be telling userspace the binary_bits value?  It is not
> > (exactly) deducible from the number of decimal places printed.
> 
> I could add another info file for fixed_point events to display this.
> But I'm not convinced that it would result in users doing anything
> different.
> 
> Assume you just did the "read1, sleepN, read2" and got values of
> 235.617542 and 338.964815, tell me how things would be different
> if an info file said that binary_bits was 17 vs. 19?

It changes the error bars, no?

For 17 bits, ± .00000381 (approx.)
For 19 bits, ± .000000953 (approx.)

(i.e., ± 0.5 times the least-significant bit).

Whether it is important / useful to know this is usecase dependent,
though.

[...]

> > > +	/* Trim trailing zeroes */
> > 
> > Why?
> 
> It felt good. I'm not wedded to this. Maybe saving a few cycles of
> kernel CPU time by dropping this would be good.
> 
> > Would it be better to present the values with consistent precision?
> 
> Humans might notice the difference. Apps reading the file aren't going
> to care.

I noticed ;)  In that, there is explicit code here that seems to have
no function other than to make the output worse (i.e., more
unpredictable and with no obvious gain in usefulness).

If the number of digits is the only clue to the size of the error bars
in the readings, userspace code might well care about this.

> 
> > There's no reason why a telemetry counter should settle for any length
> > of time at a tidy value, so the precision represented by the trailing
> > zeros is always significant.
> 
> But x1 = atof("1.5") and x2 = atof("1.500000") ... can the subsequent
> use of x1 tell that there was less precision that x2?

Exactly.  If knowledge of the error bars is needed, just knowing the
nearest real number to the measured value is insufficient.

But the number of digits is all we seem to be giving userspace to go on
here -- and we're not presenting that in a predictable way, either (?)

Cheers
---Dave

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Luck, Tony 2 months, 3 weeks ago

On Tue, Nov 11, 2025 at 05:16:22PM +0000, Dave Martin wrote:

... snip

> > I'm not a fan of the "floor" option. Looks like it loses precision.  Terrible for
> 
> Loses precision, but does not advertise bogus precision precision
> beyond the precision in the original value.  (This is why it is not
> standard to print doubles with more then 15 significant digits, even
> though 17 significant digits are needed for bit-exact reproduction.)
> 
> I don't know whether this matters relative to the use cases, but it
> would be nice to have some rationale.
> 
> > 1-3 binary bits. Also not what I'd like for the bits==18 case that I currently
> > care about.
> > 
> > "ceil" is good for bits > 6. Almost matches my numbers (except I jump
> > to one more decimal place one binary bit earlier).
> > 
> > What do you think of me swapping out the values from 7 upwards for the
> > ceil values and documenting that 0..6 are hand-picked, but 7 and up are
> > ceil(binary_bits * log10_2)?
> 
> If there is sound rationale for hand-picking some values then yes.
> 
> I haven't yet been convinced that there is ;)

I don't have a rationale and I've been doing the thing I tell others
not to do "getting attached to code that I wrote". I will switch the
whole table to the ceil(binary_bits * log10_2) values.

One exception for binary_bits == 0. Back in the v6 version of these
patches I printed as a plain integer. Reinette commented[1]:

    At this time I understand that it will be clear for which
    events user space expects floating point numbers. If the architecture in
    turn does not support any "binary bits" then I think resctrl
    should still print a floating point number ("x.0") to match user space
    expectation.

-Tony

Link: https://lore.kernel.org/all/8214ae1f-d64c-496c-b41d-13b31250acea@intel.com/ [1]

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Luck, Tony 3 months ago

On Wed, Nov 05, 2025 at 03:31:07PM -0800, Luck, Tony wrote:
> > > +
> > > +	if (!binary_bits) {
> > > +		seq_printf(m, "%llu.0\n", val);
> > > +		return;
> > > +	}

I can't completely escape a test for !binary_bits. Most of the
flow works ok (doing nothing, working towards frac == 0 when
it comes time for the snprintf()).

But the round-up code:

	frac += 1ull << (binary_bits - 1);

goes badly wrong if binary_bits == 0.

I could write it like this:


/*
 * Print the fixed-point value @val, which has @binary_bits fractional
 * bits, to @m as a decimal number of the form "<integer>.<fraction>\n".
 * The fraction is rounded to nearest and zero padded to
 * decplaces[@binary_bits] digits.
 *
 * The value is formatted directly by seq_printf(). An on-stack buffer
 * sized from decplaces[] would be a variable length array, since an
 * array element is not an integer constant expression in C.
 */
static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
{
	unsigned long long frac = 0;

	/*
	 * With no fractional bits "frac" stays zero. The computation must
	 * be skipped in that case: the rounding step shifts by
	 * (binary_bits - 1), which wraps around for an unsigned zero.
	 */
	if (binary_bits) {
		/* Mask off the integer part of the fixed-point value. */
		frac = val & GENMASK_ULL(binary_bits - 1, 0);

		/*
		 * Multiply by 10^{desired decimal places}. The integer part of
		 * the fixed point value is now almost what is needed.
		 */
		frac *= int_pow(10ull, decplaces[binary_bits]);

		/*
		 * Round to nearest by adding one half, i.e. a "1" in the most
		 * significant fractional bit. Integer part of fixed point
		 * value is now the needed value.
		 */
		frac += 1ull << (binary_bits - 1);

		/*
		 * Extract the integer part of the value. This is the decimal
		 * representation of the original fixed-point fractional value.
		 */
		frac >>= binary_bits;
	}

	/*
	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
	 * representation will fit into chosen number of decimal places.
	 * The '*' field width takes an int argument, hence the cast.
	 */
	seq_printf(m, "%llu.%0*llu\n", val >> binary_bits,
		   (int)decplaces[binary_bits], frac);
}

-Tony

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Dave Martin 2 months, 4 weeks ago

Hi,

On Wed, Nov 05, 2025 at 06:27:48PM -0800, Luck, Tony wrote:
> On Wed, Nov 05, 2025 at 03:31:07PM -0800, Luck, Tony wrote:
> > > > +
> > > > +	if (!binary_bits) {
> > > > +		seq_printf(m, "%llu.0\n", val);
> > > > +		return;
> > > > +	}
> 
> I can't completely escape a test for !binary_bits. Most of the
> flow works ok (doing nothing, working towards frac == 0 when
> it comes time for the snprintf()).
> 
> But the round-up code:
> 
> 	frac += 1ull << (binary_bits - 1);
> 
> goes badly wrong if binary_bits == 0.
> 
> I could write it like this:
> 
> 
> static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> {
> 	char buf[decplaces[MAX_BINARY_BITS] + 1];
> 	unsigned long long frac = 0;
> 
> 	if (binary_bits) {
> 		/* Mask off the integer part of the fixed-point value. */
> 		frac = val & GENMASK_ULL(binary_bits - 1, 0);
> 
> 		/*
> 		 * Multiply by 10^{desired decimal places}. The integer part of
> 		 * the fixed point value is now almost what is needed.
> 		 */
> 		frac *= int_pow(10ull, decplaces[binary_bits]);

I guess there was already a discussion on whether it is worth
precomputing this multiplier.

int_pow() is not free, but if implemented in the standard way, it
should be pretty fast on 64-bit arches (which is all we care about).

(I've not checked.)

> 		/*
> 		 * Round to nearest by adding a value that would be a "1" in the
> 		 * binary_bits + 1 place.  Integer part of fixed point value is
> 		 * now the needed value.
> 		 */
> 		frac += 1ull << (binary_bits - 1);
> 
> 		/*
> 		 * Extract the integer part of the value. This is the decimal
> 		 * representation of the original fixed-point fractional value.
> 		 */
> 		frac >>= binary_bits;

Looks reasonable.  It's your call whether this is simpler, I guess.

> 	}
> 
> 	/*
> 	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
> 	 * representation will fit into chosen number of decimal places.
> 	 */
> 	snprintf(buf, sizeof(buf), "%0*llu", decplaces[binary_bits], frac);
> 
> 	seq_printf(m, "%llu.%s\n", val >> binary_bits, buf);

Can we get rid of buf, actually?

I don't see why we can't just do

	seq_printf(m, "%llu.%0*llu",
		   val >> binary_bits, decplaces[binary_bits], frac);

...?

This avoids having to care about the size of buf.

seq_file's crystal ball knows how to make its buffer large enough.

Cheers
---Dave

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Luck, Tony 2 months, 3 weeks ago

Hi Dave,

On Tue, Nov 11, 2025 at 05:31:12PM +0000, Dave Martin wrote:
> Hi,
> 
> On Wed, Nov 05, 2025 at 06:27:48PM -0800, Luck, Tony wrote:
> > On Wed, Nov 05, 2025 at 03:31:07PM -0800, Luck, Tony wrote:
> > > > > +
> > > > > +	if (!binary_bits) {
> > > > > +		seq_printf(m, "%llu.0\n", val);
> > > > > +		return;
> > > > > +	}
> > 
> > I can't completely escape a test for !binary_bits. Most of the
> > flow works ok (doing nothing, working towards frac == 0 when
> > it comes time for the snprintf()).
> > 
> > But the round-up code:
> > 
> > 	frac += 1ull << (binary_bits - 1);
> > 
> > goes badly wrong if binary_bits == 0.
> > 
> > I could write it like this:
> > 
> > 
> > static void print_event_value(struct seq_file *m, unsigned int binary_bits, u64 val)
> > {
> > 	char buf[decplaces[MAX_BINARY_BITS] + 1];
> > 	unsigned long long frac = 0;
> > 
> > 	if (binary_bits) {
> > 		/* Mask off the integer part of the fixed-point value. */
> > 		frac = val & GENMASK_ULL(binary_bits - 1, 0);
> > 
> > 		/*
> > 		 * Multiply by 10^{desired decimal places}. The integer part of
> > 		 * the fixed point value is now almost what is needed.
> > 		 */
> > 		frac *= int_pow(10ull, decplaces[binary_bits]);
> 
> I guess there was already a discussion on whether it is worth
> precomputing this multiplier.
> 
> int_pow() is not free, but if implemented in the standard way, it
> should be pretty fast on 64-bit arches (which is all we care about).

Earlier versions of the patch had the precomputed value. Reinette
pointed me to int_pow(). It is in lib/math/int_pow.c and does seem
to be pretty efficient.

> 
> (I've not checked.)
> 
> > 		/*
> > 		 * Round to nearest by adding a value that would be a "1" in the
> > 		 * binary_bits + 1 place.  Integer part of fixed point value is
> > 		 * now the needed value.
> > 		 */
> > 		frac += 1ull << (binary_bits - 1);
> > 
> > 		/*
> > 		 * Extract the integer part of the value. This is the decimal
> > 		 * representation of the original fixed-point fractional value.
> > 		 */
> > 		frac >>= binary_bits;
> 
> Looks reasonable.  It's your call whether this is simpler, I guess.
> 
> > 	}
> > 
> > 	/*
> > 	 * "frac" is now in the range [0 .. 10^decplaces).  I.e. string
> > 	 * representation will fit into chosen number of decimal places.
> > 	 */
> > 	snprintf(buf, sizeof(buf), "%0*llu", decplaces[binary_bits], frac);
> > 
> > 	seq_printf(m, "%llu.%s\n", val >> binary_bits, buf);
> 
> Can we get rid of buf, actually?
> 
> I don't see why we can't just do
> 
> 	seq_printf(m, "%llu.%0*llu",
> 		   val >> binary_bits, decplaces[binary_bits], frac);

The buf[] was only there for trimming the trailing zeroes. Now that is
gone the result can be sent directly to seq_printf() as you suggest.
> 
> ...?
> 
> This avoids having to care about the size of buf.
> 
> seq_file's crystal ball knows how to make its buffer large enough.
> 
> Cheers
> ---Dave

-Tony

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Reinette Chatre 3 months ago

Hi Dave and Tony,

On 11/5/25 3:31 PM, Luck, Tony wrote:
> On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
>> On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:

...

>>> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
>>> index 40b76eaa33d0..f5189b6771a0 100644
>>> --- a/fs/resctrl/internal.h
>>> +++ b/fs/resctrl/internal.h
>>> @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
>>>   *			Only valid if @evtid is an MBM event.
>>>   * @configurable:	true if the event is configurable
>>>   * @any_cpu:		true if the event can be read from any CPU
>>> + * @is_floating_point:	event values are displayed in floating point format
>>
>> Nit: Maybe rebrand this as is_fixed_point, or is_fractional, or similar?
>>
>> The print syntax is just a decimal fraction, and the hardware
>> representation is fixed-point.  Nothing floats.
> 
> You are right. I can change from is_floating_point to is_fixed_point.
> 

This is a fs property though, not hardware, and highlights that the value is displayed in
floating point format which is the closest resctrl has to establish a "contract" with user
space on what format user space can expect when reading the data as backed with a
matching update to resctrl.rst for the events that have this hardcoded by the fs.
Whether an architecture uses fixed point format or some other mechanism to determine the
value eventually exposed to user space is unique to the architecture. 

Reinette

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Dave Martin 2 months, 4 weeks ago

Hi,

On Wed, Nov 05, 2025 at 04:09:28PM -0800, Reinette Chatre wrote:
> Hi Dave and Tony,
> 
> On 11/5/25 3:31 PM, Luck, Tony wrote:
> > On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
> >> On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:
> 
> ...
> 
> >>> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
> >>> index 40b76eaa33d0..f5189b6771a0 100644
> >>> --- a/fs/resctrl/internal.h
> >>> +++ b/fs/resctrl/internal.h
> >>> @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
> >>>   *			Only valid if @evtid is an MBM event.
> >>>   * @configurable:	true if the event is configurable
> >>>   * @any_cpu:		true if the event can be read from any CPU
> >>> + * @is_floating_point:	event values are displayed in floating point format
> >>
> >> Nit: Maybe rebrand this as is_fixed_point, or is_fractional, or similar?
> >>
> >> The print syntax is just a decimal fraction, and the hardware
> >> representation is fixed-point.  Nothing floats.
> > 
> > You are right. I can change from is_floating_point to is_fixed_point.
> > 
> 
> This is a fs property though, not hardware, and highlights that the value is displayed in
> floating point format which is the closest resctrl has to establish a "contract" with user
> space on what format user space can expect when reading the data as backed with a
> matching update to resctrl.rst for the events that have this hardcoded by the fs.
> Whether an architecture uses fixed point format or some other mechanism to determine the
> value eventually exposed to user space is unique to the architecture. 

Sure, getting the documentation right is the most important thing,
while the internal name for this property is not ABI.

(I don't strongly object to "is_floating_point", even if we expose this
in the filesystem, so long as we document carefully what it means.)

Cheers
---Dave

Re: [PATCH v13 12/32] x86,fs/resctrl: Support binary fixed point event counters

Posted by Reinette Chatre 2 months, 3 weeks ago

Hi Dave,

On 11/11/25 9:22 AM, Dave Martin wrote:
> Hi,
> 
> On Wed, Nov 05, 2025 at 04:09:28PM -0800, Reinette Chatre wrote:
>> Hi Dave and Tony,
>>
>> On 11/5/25 3:31 PM, Luck, Tony wrote:
>>> On Wed, Nov 05, 2025 at 02:42:18PM +0000, Dave Martin wrote:
>>>> On Wed, Oct 29, 2025 at 09:20:55AM -0700, Tony Luck wrote:
>>
>> ...
>>
>>>>> diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
>>>>> index 40b76eaa33d0..f5189b6771a0 100644
>>>>> --- a/fs/resctrl/internal.h
>>>>> +++ b/fs/resctrl/internal.h
>>>>> @@ -62,6 +62,9 @@ static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc)
>>>>>   *			Only valid if @evtid is an MBM event.
>>>>>   * @configurable:	true if the event is configurable
>>>>>   * @any_cpu:		true if the event can be read from any CPU
>>>>> + * @is_floating_point:	event values are displayed in floating point format
>>>>
>>>> Nit: Maybe rebrand this as is_fixed_point, or is_fractional, or similar?
>>>>
>>>> The print syntax is just a decimal fraction, and the hardware
>>>> representation is fixed-point.  Nothing floats.
>>>
>>> You are right. I can change from is_floating_point to is_fixed_point.
>>>
>>
>> This is a fs property though, not hardware, and highlights that the value is displayed in
>> floating point format which is the closest resctrl has to establish a "contract" with user
>> space on what format user space can expect when reading the data as backed with a
>> matching update to resctrl.rst for the events that have this hardcoded by the fs.
>> Whether an architecture uses fixed point format or some other mechanism to determine the
>> value eventually exposed to user space is unique to the architecture. 
> 
> Sure, getting the docmuentation right is the most important thing,
> while the internal name for this property is not ABI.
> 
> (I don't strongly object to "is_floating_point", even if we expose this
> in the filesystem, so long as we document carefully what it means.)

Highlighting the member name and description in fs/resctrl/internal.h: 
	@is_floating_point:	event values are displayed in floating point format

I consider it important that the description highlights that the event will be displayed to
user space as floating point. struct mon_evt that contains this member is internal to resctrl fs
and there is no helper available to arch with which @is_floating_point can be changed since
this is a contract with user space. I find that having the member name match that description
and contract easier to read.

The documentation (resctrl.rst) is updated in patch #32 with below to make this clear:

	"core energy" reports a floating point number for the energy (in Joules) ...
	...
	"activity" also reports a floating point value (in Farads).

I agree that internal names are not ABI and this is evident with the only internal
connection to a value displayed as floating point being an internal fixed point fraction
number. This can change any time. We have to draw the line somewhere to make it clear
how resctrl interacts with user space and I find the event's display property to be
appropriate for this.

Reinette