[PATCH v2 13/16] x86/resctrl: Add code to display core telemetry events

Tony Luck posted 16 patches 10 months, 3 weeks ago
There is a newer version of this series
[PATCH v2 13/16] x86/resctrl: Add code to display core telemetry events
Posted by Tony Luck 10 months, 3 weeks ago
Core telemetry events can be read from any CPU. Rely on the smp_call*()
functions picking the current CPU when given a free choice from
cpu_online_mask.

There may be multiple devices tracking each package, so scan all of them
and add up counters.

Output format depends on the data type: either a 63-bit integer or a
fixed-point decimal.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 include/linux/resctrl.h                 |  3 ++
 fs/resctrl/internal.h                   |  4 +-
 arch/x86/kernel/cpu/resctrl/intel_aet.c | 53 +++++++++++++++++++++++++
 fs/resctrl/ctrlmondata.c                | 23 ++++++++++-
 fs/resctrl/monitor.c                    | 23 +++++++++--
 5 files changed, 100 insertions(+), 6 deletions(-)

diff --git a/include/linux/resctrl.h b/include/linux/resctrl.h
index 999e0802a26e..e900764393f4 100644
--- a/include/linux/resctrl.h
+++ b/include/linux/resctrl.h
@@ -532,8 +532,11 @@ void resctrl_exit(void);
 
 #ifdef CONFIG_INTEL_AET_RESCTRL
 void rdt_get_intel_aet_mount(void);
+bool intel_aet_read_event(int domid, int rmid, int evtid, u64 *val, bool *fptype);
 #else
 static inline void rdt_get_intel_aet_mount(void) { }
+static inline bool intel_aet_read_event(int domid, int rmid, int evtid, u64 *val,
+					bool *fptype) { return false; }
 #endif
 
 #endif /* _RESCTRL_H */
diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h
index f5a698b49e97..4d65a781034e 100644
--- a/fs/resctrl/internal.h
+++ b/fs/resctrl/internal.h
@@ -98,6 +98,7 @@ struct mon_data {
  *	   domains in @r sharing L3 @ci.id
  * @evtid: Which monitor event to read.
  * @first: Initialize MBM counter when true.
+ * @fptype:If true indicates @val is in 46.18 fixed point format
  * @ci:    Cacheinfo for L3. Only set when @d is NULL. Used when summing domains.
  * @err:   Error encountered when reading counter.
  * @val:   Returned value of event counter. If @rgrp is a parent resource group,
@@ -112,6 +113,7 @@ struct rmid_read {
 	struct rdt_mon_domain	*d;
 	unsigned int		evtid;
 	bool			first;
+	bool			fptype;
 	struct cacheinfo	*ci;
 	int			err;
 	u64			val;
@@ -343,7 +345,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg);
 
 void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 		    struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
-		    cpumask_t *cpumask, int evtid, int first);
+		    const cpumask_t *cpumask, int evtid, int first);
 
 int resctrl_mon_resource_init(void);
 
diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
index bab8e4de26b3..41ebb2ee9b41 100644
--- a/arch/x86/kernel/cpu/resctrl/intel_aet.c
+++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
@@ -357,3 +357,56 @@ void rdt_get_intel_aet_mount(void)
 		r->mon_capable = false;
 	}
 }
+
+#define VALID_BIT	BIT_ULL(63)
+#define DATA_BITS	GENMASK_ULL(62, 0)
+
+/*
+ * Walk the array of telemetry groups on a specific package.
+ * Read and sum values for a specific counter (described by
+ * guid and offset).
+ * Return failure (~0x0ull) if any counter isn't valid.
+ */
+static u64 scan_pmt_devs(int package, int guid, int offset)
+{
+	u64 rval, val;
+	int ndev = 0;
+
+	rval = 0;
+
+	for (int i = 0; i < pkg_info[package].count; i++) {
+		if (pkg_info[package].regions[i].guid != guid)
+			continue;
+		ndev++;
+		val = readq(pkg_info[package].regions[i].addr + offset);
+
+		if (!(val & VALID_BIT))
+			return ~0ull;
+		rval += val & DATA_BITS;
+	}
+
+	return ndev ? rval : ~0ull;
+}
+
+/*
+ * Read counter for an event on a domain (summing all aggregators
+ * on the domain).
+ */
+bool intel_aet_read_event(int domid, int rmid, int evtid, u64 *val, bool *fptype)
+{
+	u64 evtcount;
+	int offset;
+
+	if (rmid >= EVT_NUM_RMIDS(evtid))
+		return false;
+
+	offset = rmid * EVT_STRIDE(evtid);
+	offset += EVT_OFFSET(evtid);
+	evtcount = scan_pmt_devs(domid, EVT_GUID(evtid), offset);
+	*fptype = evtid == PMT_EVENT_ENERGY || evtid == PMT_EVENT_ACTIVITY;
+
+	if (evtcount != ~0ull || *val == 0)
+		*val += evtcount;
+
+	return evtcount != ~0ull;
+}
diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c
index d56b78450a99..5612f5f64574 100644
--- a/fs/resctrl/ctrlmondata.c
+++ b/fs/resctrl/ctrlmondata.c
@@ -548,7 +548,7 @@ struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id,
 
 void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 		    struct rdt_mon_domain *d, struct rdtgroup *rdtgrp,
-		    cpumask_t *cpumask, int evtid, int first)
+		    const cpumask_t *cpumask, int evtid, int first)
 {
 	int cpu;
 
@@ -585,6 +585,21 @@ void mon_event_read(struct rmid_read *rr, struct rdt_resource *r,
 	resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx);
 }
 
+#define NUM_FRAC_BITS	18
+#define FRAC_MASK	GENMASK(NUM_FRAC_BITS - 1, 0)
+
+static void show_fp_value(struct seq_file *m, u64 val)
+{
+	u64 frac;
+
+	frac = val & FRAC_MASK;
+	frac = frac * 1000000;
+	frac += 1ul << (NUM_FRAC_BITS - 1);
+	frac >>= NUM_FRAC_BITS;
+
+	seq_printf(m, "%llu.%06llu\n", val >> NUM_FRAC_BITS, frac);
+}
+
 int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 {
 	struct kernfs_open_file *of = m->private;
@@ -594,6 +609,7 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 	u32 resid, evtid, domid;
 	struct rdtgroup *rdtgrp;
 	struct rdt_resource *r;
+	const cpumask_t *mask;
 	struct mon_data *md;
 	int ret = 0;
 
@@ -642,7 +658,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 			goto out;
 		}
 		d = container_of(hdr, struct rdt_mon_domain, hdr);
-		mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false);
+		mask = (resid == RDT_RESOURCE_L3) ? &d->hdr.cpu_mask : cpu_online_mask;
+		mon_event_read(&rr, r, d, rdtgrp, mask, evtid, false);
 	}
 
 checkresult:
@@ -651,6 +668,8 @@ int rdtgroup_mondata_show(struct seq_file *m, void *arg)
 		seq_puts(m, "Error\n");
 	else if (rr.err == -EINVAL)
 		seq_puts(m, "Unavailable\n");
+	else if (rr.fptype)
+		show_fp_value(m, rr.val);
 	else
 		seq_printf(m, "%llu\n", rr.val);
 
diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c
index 3fe21dcf0fde..f4049ae5344c 100644
--- a/fs/resctrl/monitor.c
+++ b/fs/resctrl/monitor.c
@@ -447,6 +447,24 @@ static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr)
 	m->prev_bw = cur_bw;
 }
 
+static int mon_event_count_one_group(int closid, int rmid, struct rmid_read *rr)
+{
+	bool ret;
+
+	switch (rr->r->rid) {
+	case RDT_RESOURCE_L3:
+		rr->fptype = false;
+		return __mon_event_count(closid, rmid, rr);
+	case RDT_RESOURCE_INTEL_AET:
+		ret = intel_aet_read_event(rr->d->hdr.id, rmid, rr->evtid, &rr->val, &rr->fptype);
+		if (!ret)
+			rr->err = -EINVAL;
+		return ret ? 0 : -EINVAL;
+	}
+
+	return -EINVAL;
+}
+
 /*
  * This is scheduled by mon_event_read() to read the CQM/MBM counters
  * on a domain.
@@ -460,7 +478,7 @@ void mon_event_count(void *info)
 
 	rdtgrp = rr->rgrp;
 
-	ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr);
+	ret = mon_event_count_one_group(rdtgrp->closid, rdtgrp->mon.rmid, rr);
 
 	/*
 	 * For Ctrl groups read data from child monitor groups and
@@ -471,8 +489,7 @@ void mon_event_count(void *info)
 
 	if (rdtgrp->type == RDTCTRL_GROUP) {
 		list_for_each_entry(entry, head, mon.crdtgrp_list) {
-			if (__mon_event_count(entry->closid, entry->mon.rmid,
-					      rr) == 0)
+			if (mon_event_count_one_group(entry->closid, entry->mon.rmid, rr) == 0)
 				ret = 0;
 		}
 	}
-- 
2.48.1
Re: [PATCH v2 13/16] x86/resctrl: Add code to display core telemetry events
Posted by Reinette Chatre 10 months, 2 weeks ago
Hi Tony,

(nit: "Add code to" can be dropped from shortlog)

On 3/21/25 4:16 PM, Tony Luck wrote:
> These can be read from any CPU. Rely on the smp_call*() functions
> picking the current CPU when given a free choice from cpu_online_mask.
> 
> There may be multiple devices tracking each package, so scan all of them
> and add up counters.
> 
> Output format depends on the data type. Either a 63 bit integer, or a
> fixed point decimal.
> 

At this point the architecture and fs code is very intertwined. I hope that
some of the items I mentioned in earlier patches will help to support a clear
separation that will make the code that follows from here on easier to split
between arch and fs. 
For example, I think this may end up with the new event enums defined in
include/linux/resctrl_types.h to support new architectural helpers
that take the enum as argument that the fs code can use to request the
event value from the architecture. 

Reinette
Re: [PATCH v2 13/16] x86/resctrl: Add code to display core telemetry events
Posted by Luck, Tony 10 months, 2 weeks ago
On Mon, Mar 31, 2025 at 09:23:48AM -0700, Reinette Chatre wrote:
> Hi Tony,
> 
> (nit: "Add code to" can be dropped from shortlog)
> 
> On 3/21/25 4:16 PM, Tony Luck wrote:
> > These can be read from any CPU. Rely on the smp_call*() functions
> > picking the current CPU when given a free choice from cpu_online_mask.
> > 
> > There may be multiple devices tracking each package, so scan all of them
> > and add up counters.
> > 
> > Output format depends on the data type. Either a 63 bit integer, or a
> > fixed point decimal.
> > 
> 
> At this point the architecture and fs code is very intertwined. I hope that
> some of the items I mentioned in earlier patches will help to support a clear
> separation that will make the code that follows from here on easier to split
> between arch and fs. 
> For example, I think this may end up with the new event enums defined in
> include/linux/resctrl_types.h to support new architectural helpers
> that take the enum as argument that the fs code can use to request the
> event value from the architecture. 

I have a solution for this separation. Each mon_evt structure
gets two new fields.

The first is "bool any_cpu;". If this is set to true, the event can
be read from any CPU (and we can use James' suggestion to pick from
"cpu_online_mask" instead of "d->hdr.cpu_mask" and let the optimizations
in smp_call*() avoid the IPI).

The second is an "enum format" field that specifies how to display the
value returned to FS code from architecture. Existing events all
print as a decimal number. I need to add a binary fixed point format
with 18 binary places, to be printed as a floating point number.

Note that this solution needs to copy these fields from the
mon_evt structure to the mon_data_bits union (this is easier
if James' pending patch to convert to a mon_data structure removes the
restriction that all fields fit into 32-bits).
> 
> Reinette

-Tony