[PATCH v9 17/31] x86/resctrl: Discover hardware telemetry events

Tony Luck posted 31 patches 1 month ago
There is a newer version of this series
[PATCH v9 17/31] x86/resctrl: Discover hardware telemetry events
Posted by Tony Luck 1 month ago
Each CPU collects data for telemetry events that it sends to a nearby
telemetry event aggregator either when the value of IA32_PQR_ASSOC.RMID
changed, or when a two millisecond timer expires.

The telemetry event aggregators maintain per-RMID per-event counts of
the total seen for all the CPUs. There may be more than one telemetry
event aggregator per package.

Each telemetry event aggregator is responsible for a specific group of
events. E.g. on the Intel Clearwater Forest CPU there are two types of
aggregators. One type tracks a pair of energy related events. The other
type tracks a subset of "perf" type events.

The event counts are made available to Linux in a region of MMIO space
for each aggregator. All details about the layout of counters in each
aggregator MMIO region are described in XML files published by Intel and
made available in a GitHub repository: https://github.com/intel/Intel-PMT.

The key to matching a specific telemetry aggregator to the XML file that
describes the MMIO layout is a 32-bit value. The Linux telemetry subsystem
refers to this as a "guid" while the XML files call it a "uniqueid".

Each XML file provides the following information:
1) Which telemetry events are included in the group.
2) The order in which the event counters appear for each RMID.
3) The value type of each event counter (integer or fixed-point).
4) The number of RMIDs supported.
5) Which additional aggregator status registers are included.
6) The total size of the MMIO region for this aggregator.

The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events.
This driver provides intel_pmt_get_regions_by_feature() to list all
available telemetry event aggregators. The list includes the "guid",
the base address in MMIO space for the region where the event counters
are exposed, and the package id where the CPUs that report to this
aggregator are located.

Add a new Kconfig option CONFIG_X86_CPU_RESCTRL_INTEL_AET for the Intel
specific parts of telemetry code. This depends on the INTEL_PMT_TELEMETRY
and INTEL_TPMI drivers being built-in to the kernel for enumeration of
telemetry features.

Call intel_pmt_get_regions_by_feature() for each pmt_feature_id that
indicates per-RMID telemetry.

Save the returned pmt_feature_group pointers with guids that are known
to resctrl for use at run time.  Those pointers are returned to the
INTEL_PMT_TELEMETRY subsystem at resctrl_arch_exit() time.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
Note that checkpatch complains about this:

DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
		if (!IS_ERR_OR_NULL(_T))
			intel_pmt_put_feature_group(_T))
with:
CHECK: Alignment should match open parenthesis

But if the alignment is fixed, it then complains:
WARNING: Statements should start on a tabstop
---
 arch/x86/kernel/cpu/resctrl/internal.h  |   8 ++
 arch/x86/kernel/cpu/resctrl/core.c      |   5 +
 arch/x86/kernel/cpu/resctrl/intel_aet.c | 133 ++++++++++++++++++++++++
 arch/x86/Kconfig                        |  13 +++
 arch/x86/kernel/cpu/resctrl/Makefile    |   1 +
 5 files changed, 160 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/resctrl/intel_aet.c

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 6b3f3203edc4..9ddfbbe5c3cf 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -169,4 +169,12 @@ void __init intel_rdt_mbm_apply_quirk(void);
 
 void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
 
+#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET
+bool intel_aet_get_events(void);
+void __exit intel_aet_exit(void);
+#else
+static inline bool intel_aet_get_events(void) { return false; }
+static inline void __exit intel_aet_exit(void) { }
+#endif
+
 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 566530c6dbc3..57b34e1dc088 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -735,6 +735,9 @@ void resctrl_arch_pre_mount(void)
 
 	if (!atomic_try_cmpxchg(&only_once, &old, 1))
 		return;
+
+	if (!intel_aet_get_events())
+		return;
 }
 
 enum {
@@ -1087,6 +1090,8 @@ late_initcall(resctrl_arch_late_init);
 
 static void __exit resctrl_arch_exit(void)
 {
+	intel_aet_exit();
+
 	cpuhp_remove_state(rdt_online);
 
 	resctrl_exit();
diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
new file mode 100644
index 000000000000..45cadbb87dc8
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
@@ -0,0 +1,133 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Intel Application Energy Telemetry
+ *
+ * Copyright (C) 2025 Intel Corporation
+ *
+ * Author:
+ *    Tony Luck <tony.luck@intel.com>
+ */
+
+#define pr_fmt(fmt)   "resctrl: " fmt
+
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/intel_vsec.h>
+#include <linux/resctrl.h>
+
+#include "internal.h"
+
+/**
+ * struct event_group - All information about a group of telemetry events.
+ * @pfg:		Points to the aggregated telemetry space information
+ *			within the INTEL_PMT_TELEMETRY driver that contains data for all
+ *			telemetry regions.
+ * @guid:		Unique number per XML description file.
+ */
+struct event_group {
+	/* Data fields for additional structures to manage this group. */
+	struct pmt_feature_group	*pfg;
+
+	/* Remaining fields initialized from XML file. */
+	u32				guid;
+};
+
+/*
+ * Link: https://github.com/intel/Intel-PMT
+ * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
+ */
+static struct event_group energy_0x26696143 = {
+	.guid		= 0x26696143,
+};
+
+/*
+ * Link: https://github.com/intel/Intel-PMT
+ * File: xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
+ */
+static struct event_group perf_0x26557651 = {
+	.guid		= 0x26557651,
+};
+
+static struct event_group *known_energy_event_groups[] = {
+	&energy_0x26696143,
+};
+
+static struct event_group *known_perf_event_groups[] = {
+	&perf_0x26557651,
+};
+
+#define for_each_enabled_event_group(_peg, _grp)			\
+	for (_peg = _grp; _peg < &_grp[ARRAY_SIZE(_grp)]; _peg++)	\
+		if ((*_peg)->pfg)
+
+/* Stub for now */
+static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
+{
+	return false;
+}
+
+DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
+		if (!IS_ERR_OR_NULL(_T))
+			intel_pmt_put_feature_group(_T))
+
+/*
+ * Make a request to the INTEL_PMT_TELEMETRY driver for the pmt_feature_group
+ * for a specific feature. If there is one the returned structure has an array
+ * of telemetry_region structures. Each describes one telemetry aggregator.
+ * Try to use every telemetry aggregator with a known guid.
+ */
+static bool get_pmt_feature(enum pmt_feature_id feature, struct event_group **evgs,
+			    unsigned int num_evg)
+{
+	struct pmt_feature_group *p __free(intel_pmt_put_feature_group) = NULL;
+	struct event_group **peg;
+	int ret;
+
+	p = intel_pmt_get_regions_by_feature(feature);
+
+	if (IS_ERR_OR_NULL(p))
+		return false;
+
+	for (peg = evgs; peg < &evgs[num_evg]; peg++) {
+		ret = enable_events(*peg, p);
+		if (ret) {
+			(*peg)->pfg = no_free_ptr(p);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Ask INTEL_PMT_TELEMETRY driver for all the RMID based telemetry groups
+ * that it supports.
+ */
+bool intel_aet_get_events(void)
+{
+	bool ret1, ret2;
+
+	ret1 = get_pmt_feature(FEATURE_PER_RMID_ENERGY_TELEM,
+			       known_energy_event_groups,
+			       ARRAY_SIZE(known_energy_event_groups));
+	ret2 = get_pmt_feature(FEATURE_PER_RMID_PERF_TELEM,
+			       known_perf_event_groups,
+			       ARRAY_SIZE(known_perf_event_groups));
+
+	return ret1 || ret2;
+}
+
+void __exit intel_aet_exit(void)
+{
+	struct event_group **peg;
+
+	for_each_enabled_event_group(peg, known_energy_event_groups) {
+		intel_pmt_put_feature_group((*peg)->pfg);
+		(*peg)->pfg = NULL;
+	}
+	for_each_enabled_event_group(peg, known_perf_event_groups) {
+		intel_pmt_put_feature_group((*peg)->pfg);
+		(*peg)->pfg = NULL;
+	}
+}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58d890fe2100..50051fdf4659 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -525,6 +525,19 @@ config X86_CPU_RESCTRL
 
 	  Say N if unsure.
 
+config X86_CPU_RESCTRL_INTEL_AET
+	bool "Intel Application Energy Telemetry" if INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y
+	depends on X86_CPU_RESCTRL && CPU_SUP_INTEL
+	help
+	  Enable per-RMID telemetry events in resctrl.
+
+	  Intel feature that collects per-RMID execution data
+	  about energy consumption, measure of frequency independent
+	  activity and other performance metrics. Data is aggregated
+	  per package.
+
+	  Say N if unsure.
+
 config X86_FRED
 	bool "Flexible Return and Event Delivery"
 	depends on X86_64
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index d8a04b195da2..273ddfa30836 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_X86_CPU_RESCTRL)		+= core.o rdtgroup.o monitor.o
 obj-$(CONFIG_X86_CPU_RESCTRL)		+= ctrlmondata.o
+obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET)	+= intel_aet.o
 obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)	+= pseudo_lock.o
 
 # To allow define_trace.h's recursive include:
-- 
2.50.1
Re: [PATCH v9 17/31] x86/resctrl: Discover hardware telemetry events
Posted by Ilpo Järvinen 1 month ago
On Fri, 29 Aug 2025, Tony Luck wrote:

> Each CPU collects data for telemetry events that it sends to a nearby
> telemetry event aggregator either when the value of IA32_PQR_ASSOC.RMID
> changed, or when a two millisecond timer expires.
> 
> The telemetry event aggregators maintain per-RMID per-event counts of
> the total seen for all the CPUs. There may be more than one telemetry
> event aggregator per package.
> 
> Each telemetry event aggregator is responsible for a specific group of
> events. E.g. on the Intel Clearwater Forest CPU there are two types of
> aggregators. One type tracks a pair of energy related events. The other
> type tracks a subset of "perf" type events.
> 
> The event counts are made available to Linux in a region of MMIO space
> for each aggregator. All details about the layout of counters in each
> aggregator MMIO region are described in XML files published by Intel and
> made available in a GitHub repository: https://github.com/intel/Intel-PMT.
> 
> The key to matching a specific telemetry aggregator to the XML file that
> describes the MMIO layout is a 32-bit value. The Linux telemetry subsystem
> refers to this as a "guid" while the XML files call it a "uniqueid".
> 
> Each XML file provides the following information:
> 1) Which telemetry events are included in the group.
> 2) The order in which the event counters appear for each RMID.
> 3) The value type of each event counter (integer or fixed-point).
> 4) The number of RMIDs supported.
> 5) Which additional aggregator status registers are included.
> 6) The total size of the MMIO region for this aggregator.
> 
> The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events.
> This driver provides intel_pmt_get_regions_by_feature() to list all
> available telemetry event aggregators. The list includes the "guid",
> the base address in MMIO space for the region where the event counters
> are exposed, and the package id where the CPUs that report to this
> aggregator are located.
> 
> Add a new Kconfig option CONFIG_X86_CPU_RESCTRL_INTEL_AET for the Intel
> specific parts of telemetry code. This depends on the INTEL_PMT_TELEMETRY
> and INTEL_TPMI drivers being built-in to the kernel for enumeration of
> telemetry features.
> 
> Call intel_pmt_get_regions_by_feature() for each pmt_feature_id that
> indicates per-RMID telemetry.
> 
> Save the returned pmt_feature_group pointers with guids that are known
> to resctrl for use at run time.  Those pointers are returned to the

Extra space

> INTEL_PMT_TELEMETRY subsystem at resctrl_arch_exit() time.
> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---
> Note that checkpatch complains about this:
> 
> DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
> 		if (!IS_ERR_OR_NULL(_T))
> 			intel_pmt_put_feature_group(_T))
> with:
> CHECK: Alignment should match open parenthesis
> 
> But if the alignment is fixed, it then complains:
> WARNING: Statements should start on a tabstop
> ---
>  arch/x86/kernel/cpu/resctrl/internal.h  |   8 ++
>  arch/x86/kernel/cpu/resctrl/core.c      |   5 +
>  arch/x86/kernel/cpu/resctrl/intel_aet.c | 133 ++++++++++++++++++++++++
>  arch/x86/Kconfig                        |  13 +++
>  arch/x86/kernel/cpu/resctrl/Makefile    |   1 +
>  5 files changed, 160 insertions(+)
>  create mode 100644 arch/x86/kernel/cpu/resctrl/intel_aet.c
> 
> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
> index 6b3f3203edc4..9ddfbbe5c3cf 100644
> --- a/arch/x86/kernel/cpu/resctrl/internal.h
> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
> @@ -169,4 +169,12 @@ void __init intel_rdt_mbm_apply_quirk(void);
>  
>  void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
>  
> +#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET
> +bool intel_aet_get_events(void);
> +void __exit intel_aet_exit(void);
> +#else
> +static inline bool intel_aet_get_events(void) { return false; }
> +static inline void __exit intel_aet_exit(void) { }
> +#endif
> +
>  #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
> diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
> index 566530c6dbc3..57b34e1dc088 100644
> --- a/arch/x86/kernel/cpu/resctrl/core.c
> +++ b/arch/x86/kernel/cpu/resctrl/core.c
> @@ -735,6 +735,9 @@ void resctrl_arch_pre_mount(void)
>  
>  	if (!atomic_try_cmpxchg(&only_once, &old, 1))
>  		return;
> +
> +	if (!intel_aet_get_events())
> +		return;
>  }
>  
>  enum {
> @@ -1087,6 +1090,8 @@ late_initcall(resctrl_arch_late_init);
>  
>  static void __exit resctrl_arch_exit(void)
>  {
> +	intel_aet_exit();
> +
>  	cpuhp_remove_state(rdt_online);
>  
>  	resctrl_exit();
> diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> new file mode 100644
> index 000000000000..45cadbb87dc8
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> @@ -0,0 +1,133 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Resource Director Technology(RDT)
> + * - Intel Application Energy Telemetry
> + *
> + * Copyright (C) 2025 Intel Corporation
> + *
> + * Author:
> + *    Tony Luck <tony.luck@intel.com>
> + */
> +
> +#define pr_fmt(fmt)   "resctrl: " fmt
> +
> +#include <linux/cleanup.h>
> +#include <linux/cpu.h>
> +#include <linux/intel_vsec.h>
> +#include <linux/resctrl.h>
> +
> +#include "internal.h"
> +
> +/**
> + * struct event_group - All information about a group of telemetry events.
> + * @pfg:		Points to the aggregated telemetry space information
> + *			within the INTEL_PMT_TELEMETRY driver that contains data for all
> + *			telemetry regions.
> + * @guid:		Unique number per XML description file.
> + */
> +struct event_group {
> +	/* Data fields for additional structures to manage this group. */
> +	struct pmt_feature_group	*pfg;
> +
> +	/* Remaining fields initialized from XML file. */
> +	u32				guid;
> +};
> +
> +/*
> + * Link: https://github.com/intel/Intel-PMT
> + * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
> + */
> +static struct event_group energy_0x26696143 = {
> +	.guid		= 0x26696143,
> +};
> +
> +/*
> + * Link: https://github.com/intel/Intel-PMT
> + * File: xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
> + */
> +static struct event_group perf_0x26557651 = {
> +	.guid		= 0x26557651,
> +};
> +
> +static struct event_group *known_energy_event_groups[] = {
> +	&energy_0x26696143,
> +};
> +
> +static struct event_group *known_perf_event_groups[] = {
> +	&perf_0x26557651,
> +};
> +
> +#define for_each_enabled_event_group(_peg, _grp)			\
> +	for (_peg = _grp; _peg < &_grp[ARRAY_SIZE(_grp)]; _peg++)	\

+ linux/array_size.h

Wouldn't it be wiser to protect the args even if the current users seem 
fine?

> +		if ((*_peg)->pfg)
> +
> +/* Stub for now */
> +static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
> +{
> +	return false;
> +}
> +
> +DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
> +		if (!IS_ERR_OR_NULL(_T))
> +			intel_pmt_put_feature_group(_T))
> +
> +/*
> + * Make a request to the INTEL_PMT_TELEMETRY driver for the pmt_feature_group
> + * for a specific feature. If there is one the returned structure has an array

Missing comma?

> + * of telemetry_region structures. Each describes one telemetry aggregator.
> + * Try to use every telemetry aggregator with a known guid.
> + */
> +static bool get_pmt_feature(enum pmt_feature_id feature, struct event_group **evgs,
> +			    unsigned int num_evg)
> +{
> +	struct pmt_feature_group *p __free(intel_pmt_put_feature_group) = NULL;
> +	struct event_group **peg;
> +	int ret;
> +
> +	p = intel_pmt_get_regions_by_feature(feature);
> +
> +	if (IS_ERR_OR_NULL(p))

+ linux/err.h

> +		return false;
> +
> +	for (peg = evgs; peg < &evgs[num_evg]; peg++) {
> +		ret = enable_events(*peg, p);
> +		if (ret) {

This is super misleading. You place a bool returning function's return 
value into "int ret" and then use the value like bool.

> +			(*peg)->pfg = no_free_ptr(p);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Ask INTEL_PMT_TELEMETRY driver for all the RMID based telemetry groups
> + * that it supports.
> + */
> +bool intel_aet_get_events(void)
> +{
> +	bool ret1, ret2;
> +
> +	ret1 = get_pmt_feature(FEATURE_PER_RMID_ENERGY_TELEM,
> +			       known_energy_event_groups,
> +			       ARRAY_SIZE(known_energy_event_groups));
> +	ret2 = get_pmt_feature(FEATURE_PER_RMID_PERF_TELEM,
> +			       known_perf_event_groups,
> +			       ARRAY_SIZE(known_perf_event_groups));
> +
> +	return ret1 || ret2;
> +}
> +
> +void __exit intel_aet_exit(void)
> +{
> +	struct event_group **peg;
> +
> +	for_each_enabled_event_group(peg, known_energy_event_groups) {
> +		intel_pmt_put_feature_group((*peg)->pfg);
> +		(*peg)->pfg = NULL;
> +	}
> +	for_each_enabled_event_group(peg, known_perf_event_groups) {
> +		intel_pmt_put_feature_group((*peg)->pfg);
> +		(*peg)->pfg = NULL;
> +	}
> +}
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 58d890fe2100..50051fdf4659 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -525,6 +525,19 @@ config X86_CPU_RESCTRL
>  
>  	  Say N if unsure.
>  
> +config X86_CPU_RESCTRL_INTEL_AET
> +	bool "Intel Application Energy Telemetry" if INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y
> +	depends on X86_CPU_RESCTRL && CPU_SUP_INTEL
> +	help
> +	  Enable per-RMID telemetry events in resctrl.
> +
> +	  Intel feature that collects per-RMID execution data
> +	  about energy consumption, measure of frequency independent
> +	  activity and other performance metrics. Data is aggregated
> +	  per package.
> +
> +	  Say N if unsure.
> +
>  config X86_FRED
>  	bool "Flexible Return and Event Delivery"
>  	depends on X86_64
> diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
> index d8a04b195da2..273ddfa30836 100644
> --- a/arch/x86/kernel/cpu/resctrl/Makefile
> +++ b/arch/x86/kernel/cpu/resctrl/Makefile
> @@ -1,6 +1,7 @@
>  # SPDX-License-Identifier: GPL-2.0
>  obj-$(CONFIG_X86_CPU_RESCTRL)		+= core.o rdtgroup.o monitor.o
>  obj-$(CONFIG_X86_CPU_RESCTRL)		+= ctrlmondata.o
> +obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET)	+= intel_aet.o
>  obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)	+= pseudo_lock.o
>  
>  # To allow define_trace.h's recursive include:
> 

-- 
 i.
Re: [PATCH v9 17/31] x86/resctrl: Discover hardware telemetry events
Posted by Luck, Tony 4 weeks, 1 day ago
Ilpo,

Thanks for looking at these patches. Applied all your suggestions
for this part. Updated patch below. Also pushed to
git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git rdt-aet-v10-wip

-Tony

From c717ab20b7781eec8b6dcf711eb0e2f8255aef8c Mon Sep 17 00:00:00 2001
From: Tony Luck <tony.luck@intel.com>
Date: Mon, 25 Aug 2025 10:45:34 -0700
Subject: [PATCH 17/31] x86/resctrl: Discover hardware telemetry events

Each CPU collects data for telemetry events that it sends to a nearby
telemetry event aggregator either when the value of IA32_PQR_ASSOC.RMID
changed, or when a two millisecond timer expires.

The telemetry event aggregators maintain per-RMID per-event counts of
the total seen for all the CPUs. There may be more than one telemetry
event aggregator per package.

Each telemetry event aggregator is responsible for a specific group of
events. E.g. on the Intel Clearwater Forest CPU there are two types of
aggregators. One type tracks a pair of energy related events. The other
type tracks a subset of "perf" type events.

The event counts are made available to Linux in a region of MMIO space
for each aggregator. All details about the layout of counters in each
aggregator MMIO region are described in XML files published by Intel and
made available in a GitHub repository: https://github.com/intel/Intel-PMT.

The key to matching a specific telemetry aggregator to the XML file that
describes the MMIO layout is a 32-bit value. The Linux telemetry subsystem
refers to this as a "guid" while the XML files call it a "uniqueid".

Each XML file provides the following information:
1) Which telemetry events are included in the group.
2) The order in which the event counters appear for each RMID.
3) The value type of each event counter (integer or fixed-point).
4) The number of RMIDs supported.
5) Which additional aggregator status registers are included.
6) The total size of the MMIO region for this aggregator.

The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events.
This driver provides intel_pmt_get_regions_by_feature() to list all
available telemetry event aggregators. The list includes the "guid",
the base address in MMIO space for the region where the event counters
are exposed, and the package id where the CPUs that report to this
aggregator are located.

Add a new Kconfig option CONFIG_X86_CPU_RESCTRL_INTEL_AET for the Intel
specific parts of telemetry code. This depends on the INTEL_PMT_TELEMETRY
and INTEL_TPMI drivers being built-in to the kernel for enumeration of
telemetry features.

Call intel_pmt_get_regions_by_feature() for each pmt_feature_id that
indicates per-RMID telemetry.

Save the returned pmt_feature_group pointers with guids that are known
to resctrl for use at run time. Those pointers are returned to the
INTEL_PMT_TELEMETRY subsystem at resctrl_arch_exit() time.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
Note that checkpatch complains about this:

DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
		if (!IS_ERR_OR_NULL(_T))
			intel_pmt_put_feature_group(_T))
with:
CHECK: Alignment should match open parenthesis

But if the alignment is fixed, it then complains:
WARNING: Statements should start on a tabstop
---
 arch/x86/kernel/cpu/resctrl/internal.h  |   8 ++
 arch/x86/kernel/cpu/resctrl/core.c      |   5 +
 arch/x86/kernel/cpu/resctrl/intel_aet.c | 140 ++++++++++++++++++++++++
 arch/x86/Kconfig                        |  13 +++
 arch/x86/kernel/cpu/resctrl/Makefile    |   1 +
 5 files changed, 167 insertions(+)
 create mode 100644 arch/x86/kernel/cpu/resctrl/intel_aet.c

diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 6b3f3203edc4..9ddfbbe5c3cf 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -169,4 +169,12 @@ void __init intel_rdt_mbm_apply_quirk(void);
 
 void rdt_domain_reconfigure_cdp(struct rdt_resource *r);
 
+#ifdef CONFIG_X86_CPU_RESCTRL_INTEL_AET
+bool intel_aet_get_events(void);
+void __exit intel_aet_exit(void);
+#else
+static inline bool intel_aet_get_events(void) { return false; }
+static inline void __exit intel_aet_exit(void) { }
+#endif
+
 #endif /* _ASM_X86_RESCTRL_INTERNAL_H */
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index 566530c6dbc3..57b34e1dc088 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -735,6 +735,9 @@ void resctrl_arch_pre_mount(void)
 
 	if (!atomic_try_cmpxchg(&only_once, &old, 1))
 		return;
+
+	if (!intel_aet_get_events())
+		return;
 }
 
 enum {
@@ -1087,6 +1090,8 @@ late_initcall(resctrl_arch_late_init);
 
 static void __exit resctrl_arch_exit(void)
 {
+	intel_aet_exit();
+
 	cpuhp_remove_state(rdt_online);
 
 	resctrl_exit();
diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
new file mode 100644
index 000000000000..1a9b864e2dc7
--- /dev/null
+++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Resource Director Technology(RDT)
+ * - Intel Application Energy Telemetry
+ *
+ * Copyright (C) 2025 Intel Corporation
+ *
+ * Author:
+ *    Tony Luck <tony.luck@intel.com>
+ */
+
+#define pr_fmt(fmt)   "resctrl: " fmt
+
+#include <linux/array_size.h>
+#include <linux/cleanup.h>
+#include <linux/cpu.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/intel_pmt_features.h>
+#include <linux/intel_vsec.h>
+#include <linux/overflow.h>
+#include <linux/resctrl.h>
+#include <linux/stddef.h>
+#include <linux/types.h>
+
+#include "internal.h"
+
+/**
+ * struct event_group - All information about a group of telemetry events.
+ * @pfg:		Points to the aggregated telemetry space information
+ *			within the INTEL_PMT_TELEMETRY driver that contains data for all
+ *			telemetry regions.
+ * @guid:		Unique number per XML description file.
+ */
+struct event_group {
+	/* Data fields for additional structures to manage this group. */
+	struct pmt_feature_group	*pfg;
+
+	/* Remaining fields initialized from XML file. */
+	u32				guid;
+};
+
+/*
+ * Link: https://github.com/intel/Intel-PMT
+ * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
+ */
+static struct event_group energy_0x26696143 = {
+	.guid		= 0x26696143,
+};
+
+/*
+ * Link: https://github.com/intel/Intel-PMT
+ * File: xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
+ */
+static struct event_group perf_0x26557651 = {
+	.guid		= 0x26557651,
+};
+
+static struct event_group *known_energy_event_groups[] = {
+	&energy_0x26696143,
+};
+
+static struct event_group *known_perf_event_groups[] = {
+	&perf_0x26557651,
+};
+
+#define for_each_enabled_event_group(_peg, _grp)			\
+	for (_peg = (_grp); _peg < &_grp[ARRAY_SIZE(_grp)]; _peg++)	\
+		if ((*_peg)->pfg)
+
+/* Stub for now */
+static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
+{
+	return false;
+}
+
+DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
+		if (!IS_ERR_OR_NULL(_T))
+			intel_pmt_put_feature_group(_T))
+
+/*
+ * Make a request to the INTEL_PMT_TELEMETRY driver for the pmt_feature_group
+ * for a specific feature. If there is one, the returned structure has an array
+ * of telemetry_region structures. Each describes one telemetry aggregator.
+ * Try to use every telemetry aggregator with a known guid.
+ */
+static bool get_pmt_feature(enum pmt_feature_id feature, struct event_group **evgs,
+			    unsigned int num_evg)
+{
+	struct pmt_feature_group *p __free(intel_pmt_put_feature_group) = NULL;
+	struct event_group **peg;
+	bool ret;
+
+	p = intel_pmt_get_regions_by_feature(feature);
+
+	if (IS_ERR_OR_NULL(p))
+		return false;
+
+	for (peg = evgs; peg < &evgs[num_evg]; peg++) {
+		ret = enable_events(*peg, p);
+		if (ret) {
+			(*peg)->pfg = no_free_ptr(p);
+			return true;
+		}
+	}
+
+	return false;
+}
+
+/*
+ * Ask INTEL_PMT_TELEMETRY driver for all the RMID based telemetry groups
+ * that it supports.
+ */
+bool intel_aet_get_events(void)
+{
+	bool ret1, ret2;
+
+	ret1 = get_pmt_feature(FEATURE_PER_RMID_ENERGY_TELEM,
+			       known_energy_event_groups,
+			       ARRAY_SIZE(known_energy_event_groups));
+	ret2 = get_pmt_feature(FEATURE_PER_RMID_PERF_TELEM,
+			       known_perf_event_groups,
+			       ARRAY_SIZE(known_perf_event_groups));
+
+	return ret1 || ret2;
+}
+
+void __exit intel_aet_exit(void)
+{
+	struct event_group **peg;
+
+	for_each_enabled_event_group(peg, known_energy_event_groups) {
+		intel_pmt_put_feature_group((*peg)->pfg);
+		(*peg)->pfg = NULL;
+	}
+	for_each_enabled_event_group(peg, known_perf_event_groups) {
+		intel_pmt_put_feature_group((*peg)->pfg);
+		(*peg)->pfg = NULL;
+	}
+}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58d890fe2100..50051fdf4659 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -525,6 +525,19 @@ config X86_CPU_RESCTRL
 
 	  Say N if unsure.
 
+config X86_CPU_RESCTRL_INTEL_AET
+	bool "Intel Application Energy Telemetry" if INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y
+	depends on X86_CPU_RESCTRL && CPU_SUP_INTEL
+	help
+	  Enable per-RMID telemetry events in resctrl.
+
+	  Intel feature that collects per-RMID execution data
+	  about energy consumption, measure of frequency independent
+	  activity and other performance metrics. Data is aggregated
+	  per package.
+
+	  Say N if unsure.
+
 config X86_FRED
 	bool "Flexible Return and Event Delivery"
 	depends on X86_64
diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile
index d8a04b195da2..273ddfa30836 100644
--- a/arch/x86/kernel/cpu/resctrl/Makefile
+++ b/arch/x86/kernel/cpu/resctrl/Makefile
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 obj-$(CONFIG_X86_CPU_RESCTRL)		+= core.o rdtgroup.o monitor.o
 obj-$(CONFIG_X86_CPU_RESCTRL)		+= ctrlmondata.o
+obj-$(CONFIG_X86_CPU_RESCTRL_INTEL_AET)	+= intel_aet.o
 obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK)	+= pseudo_lock.o
 
 # To allow define_trace.h's recursive include:
-- 
2.51.0
Re: [PATCH v9 17/31] x86/resctrl: Discover hardware telemetry events
Posted by Reinette Chatre 3 weeks, 2 days ago
Hi Tony,

On 9/3/25 11:12 AM, Luck, Tony wrote:
> 
> Ilpo,
> 
> Thanks for looking at these patches. Applied all your suggestions
> for this part. Updated patch below. Also pushed to
> git://git.kernel.org/pub/scm/linux/kernel/git/aegl/linux.git rdt-aet-v10-wip
> 
> -Tony
> 
> From c717ab20b7781eec8b6dcf711eb0e2f8255aef8c Mon Sep 17 00:00:00 2001
> From: Tony Luck <tony.luck@intel.com>
> Date: Mon, 25 Aug 2025 10:45:34 -0700
> Subject: [PATCH 17/31] x86/resctrl: Discover hardware telemetry events
> 
> Each CPU collects data for telemetry events that it sends to a nearby
> telemetry event aggregator either when the value of IA32_PQR_ASSOC.RMID
> changed, or when a two millisecond timer expires.

"changed" -> "changes" (repeated from v8)

> 
> The telemetry event aggregators maintain per-RMID per-event counts of
> the total seen for all the CPUs. There may be more than one telemetry
> event aggregator per package.

... and if there are more than one telemetry event aggregator per package
do they *all* maintain per-RMID per-event counts of the total seen for all the
CPUs? Above is not clear about this.

> 
> Each telemetry event aggregator is responsible for a specific group of
> events. E.g. on the Intel Clearwater Forest CPU there are two types of
> aggregators. One type tracks a pair of energy related events. The other
> type tracks a subset of "perf" type events.
> 
> The event counts are made available to Linux in a region of MMIO space
> for each aggregator. All details about the layout of counters in each
> aggregator MMIO region are described in XML files published by Intel and
> made available in a GitHub repository: https://github.com/intel/Intel-PMT.

I expect the usage of "Link:" will clarify but until then we can keep using
it based on guidance in maintainer-tip.rst. For example, "made available in
a GitHub repository [1]." with the "Link:" tag added to end of commit tags as:

	Link: https://github.com/intel/Intel-PMT # [1]

> 
> The key to matching a specific telemetry aggregator to the XML file that
> describes the MMIO layout is a 32-bit value. The Linux telemetry subsystem
> refers to this as a "guid" while the XML files call it a "uniqueid".
> 
> Each XML file provides the following information:
> 1) Which telemetry events are included in the group.
> 2) The order in which the event counters appear for each RMID.
> 3) The value type of each event counter (integer or fixed-point).
> 4) The number of RMIDs supported.
> 5) Which additional aggregator status registers are included.
> 6) The total size of the MMIO region for this aggregator.

"this aggregator" -> "an aggregator"? As in, all aggregators matching
"guid" will have same MMIO size, the MMIO size is not unique per
aggregator associated with the event group?

> 
> The INTEL_PMT_TELEMETRY driver enumerates support for telemetry events.
> This driver provides intel_pmt_get_regions_by_feature() to list all
> available telemetry event aggregators. The list includes the "guid",
> the base address in MMIO space for the region where the event counters
> are exposed, and the package id where the CPUs that report to this
> aggregator are located.

Please note this switches from earlier "The telemetry event aggregators
maintain per-RMID per-event counts of the total seen for *all* the CPUs."
to here (singular) "package id where the CPUs that report to this aggregator
are located." Please maintain consistent descriptions to make the
architecture easier to understand.

> 
> Add a new Kconfig option CONFIG_X86_CPU_RESCTRL_INTEL_AET for the Intel
> specific parts of telemetry code. This depends on the INTEL_PMT_TELEMETRY
> and INTEL_TPMI drivers being built-in to the kernel for enumeration of
> telemetry features.
> 
> Call intel_pmt_get_regions_by_feature() for each pmt_feature_id that
> indicates per-RMID telemetry.

"for each pmt_feature_id that indicates per-RMID telemetry" -> "for each
per-RMID telemetry feature ID/id"? (goal is for changelog to be less of
a description of the code and more what the code does)

> 
> Save the returned pmt_feature_group pointers with guids that are known

All guids are saved, no? Not just those known to resctrl?

> to resctrl for use at run time. Those pointers are returned to the

"Return those pointers ..." (but see below)

> INTEL_PMT_TELEMETRY subsystem at resctrl_arch_exit() time.

This does still sound as though resctrl obtains a pointer to a data
structure within INTEL_PMT_TELEMETRY. I think it will be helpful to
highlight that resctrl obtains a fresh copy of the whole datastructure
for its private use ... and not document the code so detailed when
describing this.
For example (please correct where wrong),

	Use INTEL_PMT_TELEMETRY's intel_pmt_get_regions_by_feature() with
	each per-RMID telemetry feature id to obtain a private copy of
	struct pmt_feature_group that contains all discovered/enumerated
	telemetry aggregator data for all event groups (known and unknown
	to resctrl) of that feature id. Further processing on this structure
	will enable all supported events in resctrl. Return the structure to
	INTEL_PMT_TELEMETRY at resctrl exit time.

> 
> Signed-off-by: Tony Luck <tony.luck@intel.com>
> ---

...

> diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> new file mode 100644
> index 000000000000..1a9b864e2dc7
> --- /dev/null
> +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> @@ -0,0 +1,140 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Resource Director Technology(RDT)
> + * - Intel Application Energy Telemetry
> + *
> + * Copyright (C) 2025 Intel Corporation
> + *
> + * Author:
> + *    Tony Luck <tony.luck@intel.com>
> + */
> +
> +#define pr_fmt(fmt)   "resctrl: " fmt
> +
> +#include <linux/array_size.h>
> +#include <linux/cleanup.h>
> +#include <linux/cpu.h>
> +#include <linux/err.h>
> +#include <linux/init.h>
> +#include <linux/intel_pmt_features.h>
> +#include <linux/intel_vsec.h>
> +#include <linux/overflow.h>
> +#include <linux/resctrl.h>
> +#include <linux/stddef.h>
> +#include <linux/types.h>
> +
> +#include "internal.h"
> +
> +/**
> + * struct event_group - All information about a group of telemetry events.
> + * @pfg:		Points to the aggregated telemetry space information
> + *			within the INTEL_PMT_TELEMETRY driver that contains data for all
> + *			telemetry regions.

We since learned that this is not the case, no?

Considering the new usage I also think it will be helpful to add a usage snippet like:
"Valid if system supports the event group, NULL otherwise."

While doing so, please keep the columns consistently under 80 chars. 

> + * @guid:		Unique number per XML description file.
> + */
> +struct event_group {
> +	/* Data fields for additional structures to manage this group. */
> +	struct pmt_feature_group	*pfg;
> +
> +	/* Remaining fields initialized from XML file. */
> +	u32				guid;
> +};
> +
> +/*
> + * Link: https://github.com/intel/Intel-PMT
> + * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
> + */
> +static struct event_group energy_0x26696143 = {
> +	.guid		= 0x26696143,
> +};
> +
> +/*
> + * Link: https://github.com/intel/Intel-PMT
> + * File: xml/CWF/OOBMSM/RMID-PERF/cwf_aggregator.xml
> + */
> +static struct event_group perf_0x26557651 = {
> +	.guid		= 0x26557651,
> +};
> +
> +static struct event_group *known_energy_event_groups[] = {
> +	&energy_0x26696143,
> +};
> +
> +static struct event_group *known_perf_event_groups[] = {
> +	&perf_0x26557651,
> +};
> +
> +#define for_each_enabled_event_group(_peg, _grp)			\
> +	for (_peg = (_grp); _peg < &_grp[ARRAY_SIZE(_grp)]; _peg++)	\
> +		if ((*_peg)->pfg)
> +
> +/* Stub for now */
> +static bool enable_events(struct event_group *e, struct pmt_feature_group *p)
> +{
> +	return false;
> +}
> +
> +DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
> +		if (!IS_ERR_OR_NULL(_T))
> +			intel_pmt_put_feature_group(_T))
> +
> +/*
> + * Make a request to the INTEL_PMT_TELEMETRY driver for the pmt_feature_group

"for the pmt_feature_group" -> "for a copy of the pmt_feature_group"?

> + * for a specific feature. If there is one, the returned structure has an array
> + * of telemetry_region structures. Each describes one telemetry aggregator.
> + * Try to use every telemetry aggregator with a known guid.
> + */
> +static bool get_pmt_feature(enum pmt_feature_id feature, struct event_group **evgs,
> +			    unsigned int num_evg)
> +{
> +	struct pmt_feature_group *p __free(intel_pmt_put_feature_group) = NULL;
> +	struct event_group **peg;
> +	bool ret;
> +
> +	p = intel_pmt_get_regions_by_feature(feature);
> +
> +	if (IS_ERR_OR_NULL(p))
> +		return false;
> +
> +	for (peg = evgs; peg < &evgs[num_evg]; peg++) {
> +		ret = enable_events(*peg, p);
> +		if (ret) {
> +			(*peg)->pfg = no_free_ptr(p);
> +			return true;
> +		}
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Ask INTEL_PMT_TELEMETRY driver for all the RMID based telemetry groups
> + * that it supports.
> + */
> +bool intel_aet_get_events(void)
> +{
> +	bool ret1, ret2;
> +
> +	ret1 = get_pmt_feature(FEATURE_PER_RMID_ENERGY_TELEM,
> +			       known_energy_event_groups,
> +			       ARRAY_SIZE(known_energy_event_groups));
> +	ret2 = get_pmt_feature(FEATURE_PER_RMID_PERF_TELEM,
> +			       known_perf_event_groups,
> +			       ARRAY_SIZE(known_perf_event_groups));
> +
> +	return ret1 || ret2;
> +}
> +
> +void __exit intel_aet_exit(void)
> +{
> +	struct event_group **peg;
> +
> +	for_each_enabled_event_group(peg, known_energy_event_groups) {
> +		intel_pmt_put_feature_group((*peg)->pfg);
> +		(*peg)->pfg = NULL;
> +	}
> +	for_each_enabled_event_group(peg, known_perf_event_groups) {
> +		intel_pmt_put_feature_group((*peg)->pfg);
> +		(*peg)->pfg = NULL;
> +	}
> +}
> diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
> index 58d890fe2100..50051fdf4659 100644
> --- a/arch/x86/Kconfig
> +++ b/arch/x86/Kconfig
> @@ -525,6 +525,19 @@ config X86_CPU_RESCTRL
>  
>  	  Say N if unsure.
>  
> +config X86_CPU_RESCTRL_INTEL_AET
> +	bool "Intel Application Energy Telemetry" if INTEL_PMT_TELEMETRY=y && INTEL_TPMI=y
> +	depends on X86_CPU_RESCTRL && CPU_SUP_INTEL

Could you please help me understand why both the "depends on" and "if" syntax is
needed? With the above the config menu displays:
│   Depends on: X86_64 [=y] && X86_CPU_RESCTRL [=y] && CPU_SUP_INTEL [=y]                                                                                                                                                        
│   Visible if: X86_64 [=y] && X86_CPU_RESCTRL [=y] && CPU_SUP_INTEL [=y] && INTEL_PMT_TELEMETRY [=y]=y [=y] && INTEL_TPMI [=y]=y [=y]

According to scripts/kconfig/menu.c the visibility usually matches the
dependencies and that seems to be what is intended? The changelog only
mentions dependencies so it is not clear why there may be a need for
some unique visibility?

Reinette