[PATCH v8 18/32] x86/resctrl: Count valid telemetry aggregators per package

Tony Luck posted 32 patches 1 month, 3 weeks ago
There is a newer version of this series
[PATCH v8 18/32] x86/resctrl: Count valid telemetry aggregators per package
Posted by Tony Luck 1 month, 3 weeks ago
There may be multiple telemetry aggregators per package, each enumerated
by a telemetry region structure in the feature group.

Linux can only use telemetry regions when all of the following
conditions are met:
1) The "guid" of the region is known to Linux
2) The package id for the region is valid (less than
   topology_max_packages())
3) The size of the MMIO region exactly matches the expected
   size for this "guid" (as implied from description in the
   XML file for this "guid").

Scan the array of telemetry region structures and count how many are
usable in each package in preparation to allocate structures to save
the MMIO addresses for each in a convenient format for use when reading
event counters.

Signed-off-by: Tony Luck <tony.luck@intel.com>
---
 arch/x86/kernel/cpu/resctrl/intel_aet.c | 56 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
index 25075f369148..09043d36e08c 100644
--- a/arch/x86/kernel/cpu/resctrl/intel_aet.c
+++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
@@ -15,6 +15,7 @@
 #include <linux/cpu.h>
 #include <linux/intel_vsec.h>
 #include <linux/resctrl.h>
+#include <linux/slab.h>
 
 #include "internal.h"
 
@@ -24,6 +25,7 @@
  *			within the OOBMSM driver that contains data for all
  *			telemetry regions.
  * @guid:		Unique number per XML description file.
+ * @mmio_size:		Number of bytes of MMIO registers for this group.
  */
 struct event_group {
 	/* Data fields for additional structures to manage this group. */
@@ -31,14 +33,19 @@ struct event_group {
 
 	/* Remaining fields initialized from XML file. */
 	u32				guid;
+	size_t				mmio_size;
 };
 
+#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \
+		      (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64))
+
 /*
  * Link: https://github.com/intel/Intel-PMT
  * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
  */
 static struct event_group energy_0x26696143 = {
 	.guid		= 0x26696143,
+	.mmio_size	= XML_MMIO_SIZE(576, 2, 3),
 };
 
 /*
@@ -47,6 +54,7 @@ static struct event_group energy_0x26696143 = {
  */
 static struct event_group perf_0x26557651 = {
 	.guid		= 0x26557651,
+	.mmio_size	= XML_MMIO_SIZE(576, 7, 3),
 };
 
 static struct event_group *known_energy_event_groups[] = {
@@ -57,10 +65,54 @@ static struct event_group *known_perf_event_groups[] = {
 	&perf_0x26557651,
 };
 
-/* Stub for now */
+static bool skip_this_region(struct telemetry_region *tr, struct event_group *e)
+{
+	if (tr->guid != e->guid)
+		return true;
+	if (tr->plat_info.package_id >= topology_max_packages()) {
+		pr_warn_once("Bad package %d in guid 0x%x\n", tr->plat_info.package_id,
+			     tr->guid);
+		return true;
+	}
+	if (tr->size != e->mmio_size) {
+		pr_warn_once("MMIO space %zu wrong size for guid 0x%x\n", tr->size, e->guid);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Discover events from one pmt_feature_group.
+ * 1) Count how many usable telemetry regions per package.
+ * 2...) To be continued.
+ */
 static int discover_events(struct event_group *e, struct pmt_feature_group *p)
 {
-	return -EINVAL;
+	int *pkgcounts __free(kfree) = NULL;
+	struct telemetry_region *tr;
+	int num_pkgs;
+
+	num_pkgs = topology_max_packages();
+
+	/* Get per-package counts of telemetry regions for this event group */
+	for (int i = 0; i < p->count; i++) {
+		tr = &p->regions[i];
+		if (skip_this_region(tr, e))
+			continue;
+
+		if (!pkgcounts) {
+			pkgcounts = kcalloc(num_pkgs, sizeof(*pkgcounts), GFP_KERNEL);
+			if (!pkgcounts)
+				return -ENOMEM;
+		}
+		pkgcounts[tr->plat_info.package_id]++;
+	}
+
+	if (!pkgcounts)
+		return -ENODEV;
+
+	return 0;
 }
 
 DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,
-- 
2.50.1
Re: [PATCH v8 18/32] x86/resctrl: Count valid telemetry aggregators per package
Posted by Reinette Chatre 1 month, 3 weeks ago
Hi Tony,

On 8/11/25 11:16 AM, Tony Luck wrote:
> diff --git a/arch/x86/kernel/cpu/resctrl/intel_aet.c b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> index 25075f369148..09043d36e08c 100644
> --- a/arch/x86/kernel/cpu/resctrl/intel_aet.c
> +++ b/arch/x86/kernel/cpu/resctrl/intel_aet.c
> @@ -15,6 +15,7 @@
>  #include <linux/cpu.h>
>  #include <linux/intel_vsec.h>
>  #include <linux/resctrl.h>
> +#include <linux/slab.h>
>  
>  #include "internal.h"
>  
> @@ -24,6 +25,7 @@
>   *			within the OOBMSM driver that contains data for all
>   *			telemetry regions.
>   * @guid:		Unique number per XML description file.
> + * @mmio_size:		Number of bytes of MMIO registers for this group.
>   */
>  struct event_group {
>  	/* Data fields for additional structures to manage this group. */
> @@ -31,14 +33,19 @@ struct event_group {
>  
>  	/* Remaining fields initialized from XML file. */
>  	u32				guid;
> +	size_t				mmio_size;
>  };
>  
> +#define XML_MMIO_SIZE(num_rmids, num_events, num_extra_status) \
> +		      (((num_rmids) * (num_events) + (num_extra_status)) * sizeof(u64))
> +
>  /*
>   * Link: https://github.com/intel/Intel-PMT
>   * File: xml/CWF/OOBMSM/RMID-ENERGY/cwf_aggregator.xml
>   */
>  static struct event_group energy_0x26696143 = {
>  	.guid		= 0x26696143,
> +	.mmio_size	= XML_MMIO_SIZE(576, 2, 3),
>  };
>  
>  /*
> @@ -47,6 +54,7 @@ static struct event_group energy_0x26696143 = {
>   */
>  static struct event_group perf_0x26557651 = {
>  	.guid		= 0x26557651,
> +	.mmio_size	= XML_MMIO_SIZE(576, 7, 3),
>  };
>  
>  static struct event_group *known_energy_event_groups[] = {
> @@ -57,10 +65,54 @@ static struct event_group *known_perf_event_groups[] = {
>  	&perf_0x26557651,
>  };
>  
> -/* Stub for now */
> +static bool skip_this_region(struct telemetry_region *tr, struct event_group *e)
> +{
> +	if (tr->guid != e->guid)
> +		return true;
> +	if (tr->plat_info.package_id >= topology_max_packages()) {
> +		pr_warn_once("Bad package %d in guid 0x%x\n", tr->plat_info.package_id,
> +			     tr->guid);
> +		return true;
> +	}
> +	if (tr->size != e->mmio_size) {
> +		pr_warn_once("MMIO space %zu wrong size for guid 0x%x\n", tr->size, e->guid);

I think this message can be improved. Compare with, for example,
"MMIO space wrong size (%zu bytes) for guid 0x%x. Expected %zu bytes.\n"
(please feel free to improve open to other ideas)

> +		return true;
> +	}
> +
> +	return false;
> +}
> +
> +/*
> + * Discover events from one pmt_feature_group.
> + * 1) Count how many usable telemetry regions per package.
> + * 2...) To be continued.
> + */
>  static int discover_events(struct event_group *e, struct pmt_feature_group *p)
>  {
> -	return -EINVAL;
> +	int *pkgcounts __free(kfree) = NULL;
> +	struct telemetry_region *tr;
> +	int num_pkgs;
> +
> +	num_pkgs = topology_max_packages();
> +
> +	/* Get per-package counts of telemetry regions for this event group */
> +	for (int i = 0; i < p->count; i++) {
> +		tr = &p->regions[i];
> +		if (skip_this_region(tr, e))
> +			continue;
> +
> +		if (!pkgcounts) {
> +			pkgcounts = kcalloc(num_pkgs, sizeof(*pkgcounts), GFP_KERNEL);
> +			if (!pkgcounts)
> +				return -ENOMEM;
> +		}
> +		pkgcounts[tr->plat_info.package_id]++;
> +	}
> +
> +	if (!pkgcounts)
> +		return -ENODEV;
> +
> +	return 0;

Returning success requires matching support to drop the reference to the
feature group that does not exist at this point. This makes the code increasingly harder to
follow since the code that is promised in previous patch is not available yet
but needed here. This can be solved by handling the reference in previous patch.

>  }
>  
>  DEFINE_FREE(intel_pmt_put_feature_group, struct pmt_feature_group *,

Reinette