[v1] Support HBM and CXL PMON uncore counters

[PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by kan.liang@linux.intel.com 1 year, 8 months ago

From: Kan Liang <kan.liang@linux.intel.com>

The unit control address of some CXL units may be wrongly calculated
under some configuration on a EMR machine.

The current implementation only saves the unit control address of the
units from the first die, and the first unit of the rest of dies. Perf
assumed that the units from the other dies have the same offset as the
first die. So the unit control address of the rest of the units can be
calculated. However, the assumption is wrong, especially for the CXL
units.

Introduce an RB tree for each uncore type to save the unit control
address and ID information for all the units.

Compared with the current implementation, more space is required to save
the information of all units. The extra size should be acceptable.
For example, on EMR, there are 221 units at most. For a 2-socket machine,
the extra space is ~6KB at most.

Tested-by: Yunying Sun <yunying.sun@intel.com>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
 arch/x86/events/intel/uncore_discovery.c | 79 +++++++++++++++++++++++-
 arch/x86/events/intel/uncore_discovery.h | 10 +++
 2 files changed, 87 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
index 9a698a92962a..ce520e69a3c1 100644
--- a/arch/x86/events/intel/uncore_discovery.c
+++ b/arch/x86/events/intel/uncore_discovery.c
@@ -93,6 +93,8 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
 	if (!type->box_ctrl_die)
 		goto free_type;
 
+	type->units = RB_ROOT;
+
 	type->access_type = unit->access_type;
 	num_discovered_types[type->access_type]++;
 	type->type = unit->box_type;
@@ -120,10 +122,59 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit)
 	return add_uncore_discovery_type(unit);
 }
 
+static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
+{
+	struct intel_uncore_discovery_unit *a_node, *b_node;
+
+	a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
+	b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
+
+	if (a_node->pmu_idx < b_node->pmu_idx)
+		return true;
+	if (a_node->pmu_idx > b_node->pmu_idx)
+		return false;
+
+	if (a_node->die < b_node->die)
+		return true;
+	if (a_node->die > b_node->die)
+		return false;
+
+	return 0;
+}
+
+static inline struct intel_uncore_discovery_unit *
+uncore_find_unit(struct rb_root *root, unsigned int id)
+{
+	struct intel_uncore_discovery_unit *unit;
+	struct rb_node *node;
+
+	for (node = rb_first(root); node; node = rb_next(node)) {
+		unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
+		if (unit->id == id)
+			return unit;
+	}
+
+	return NULL;
+}
+
+static void uncore_find_add_unit(struct intel_uncore_discovery_unit *node,
+				 struct rb_root *root, u16 *num_units)
+{
+	struct intel_uncore_discovery_unit *unit = uncore_find_unit(root, node->id);
+
+	if (unit)
+		node->pmu_idx = unit->pmu_idx;
+	else if (num_units)
+		node->pmu_idx = (*num_units)++;
+
+	rb_add(&node->node, root, unit_less);
+}
+
 static void
 uncore_insert_box_info(struct uncore_unit_discovery *unit,
 		       int die, bool parsed)
 {
+	struct intel_uncore_discovery_unit *node;
 	struct intel_uncore_discovery_type *type;
 	unsigned int *ids;
 	u64 *box_offset;
@@ -136,14 +187,26 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
 		return;
 	}
 
+	node = kzalloc(sizeof(*node), GFP_KERNEL);
+	if (!node)
+		return;
+
+	node->die = die;
+	node->id = unit->box_id;
+	node->addr = unit->ctl;
+
 	if (parsed) {
 		type = search_uncore_discovery_type(unit->box_type);
 		if (!type) {
 			pr_info("A spurious uncore type %d is detected, "
 				"Disable the uncore type.\n",
 				unit->box_type);
+			kfree(node);
 			return;
 		}
+
+		uncore_find_add_unit(node, &type->units, &type->num_units);
+
 		/* Store the first box of each die */
 		if (!type->box_ctrl_die[die])
 			type->box_ctrl_die[die] = unit->ctl;
@@ -152,16 +215,18 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
 
 	type = get_uncore_discovery_type(unit);
 	if (!type)
-		return;
+		goto free_node;
 
 	box_offset = kcalloc(type->num_boxes + 1, sizeof(u64), GFP_KERNEL);
 	if (!box_offset)
-		return;
+		goto free_node;
 
 	ids = kcalloc(type->num_boxes + 1, sizeof(unsigned int), GFP_KERNEL);
 	if (!ids)
 		goto free_box_offset;
 
+	uncore_find_add_unit(node, &type->units, &type->num_units);
+
 	/* Store generic information for the first box */
 	if (!type->num_boxes) {
 		type->box_ctrl = unit->ctl;
@@ -201,6 +266,8 @@ uncore_insert_box_info(struct uncore_unit_discovery *unit,
 free_box_offset:
 	kfree(box_offset);
 
+free_node:
+	kfree(node);
 }
 
 static bool
@@ -339,8 +406,16 @@ bool intel_uncore_has_discovery_tables(int *ignore)
 void intel_uncore_clear_discovery_tables(void)
 {
 	struct intel_uncore_discovery_type *type, *next;
+	struct intel_uncore_discovery_unit *pos;
+	struct rb_node *node;
 
 	rbtree_postorder_for_each_entry_safe(type, next, &discovery_tables, node) {
+		while (!RB_EMPTY_ROOT(&type->units)) {
+			node = rb_first(&type->units);
+			pos = rb_entry(node, struct intel_uncore_discovery_unit, node);
+			rb_erase(node, &type->units);
+			kfree(pos);
+		}
 		kfree(type->box_ctrl_die);
 		kfree(type);
 	}
diff --git a/arch/x86/events/intel/uncore_discovery.h b/arch/x86/events/intel/uncore_discovery.h
index 22e769a81103..5190017aba51 100644
--- a/arch/x86/events/intel/uncore_discovery.h
+++ b/arch/x86/events/intel/uncore_discovery.h
@@ -113,17 +113,27 @@ struct uncore_unit_discovery {
 	};
 };
 
+struct intel_uncore_discovery_unit {
+	struct rb_node	node;
+	unsigned int	pmu_idx;	/* The idx of the corresponding PMU */
+	unsigned int	id;		/* Unit ID */
+	unsigned int	die;		/* Die ID */
+	u64		addr;		/* Unit Control Address */
+};
+
 struct intel_uncore_discovery_type {
 	struct rb_node	node;
 	enum uncore_access_type	access_type;
 	u64		box_ctrl;	/* Unit ctrl addr of the first box */
 	u64		*box_ctrl_die;	/* Unit ctrl addr of the first box of each die */
+	struct rb_root	units;		/* Unit ctrl addr for all units */
 	u16		type;		/* Type ID of the uncore block */
 	u8		num_counters;
 	u8		counter_width;
 	u8		ctl_offset;	/* Counter Control 0 offset */
 	u8		ctr_offset;	/* Counter 0 offset */
 	u16		num_boxes;	/* number of boxes for the uncore block */
+	u16		num_units;	/* number of units */
 	unsigned int	*ids;		/* Box IDs */
 	u64		*box_offset;	/* Box offset */
 };
-- 
2.35.1

Re: [PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by Tim Chen 1 year, 8 months ago

On Mon, 2024-06-10 at 13:16 -0700, kan.liang@linux.intel.com wrote:
> From: Kan Liang <kan.liang@linux.intel.com>
> 
> The unit control address of some CXL units may be wrongly calculated
> under some configuration on a EMR machine.
> 
> The current implementation only saves the unit control address of the
> units from the first die, and the first unit of the rest of dies. Perf
> assumed that the units from the other dies have the same offset as the
> first die. So the unit control address of the rest of the units can be
> calculated. However, the assumption is wrong, especially for the CXL
> units.
> 
> Introduce an RB tree for each uncore type to save the unit control
> address and ID information for all the units.
> 
> Compared with the current implementation, more space is required to save
> the information of all units. The extra size should be acceptable.
> For example, on EMR, there are 221 units at most. For a 2-socket machine,
> the extra space is ~6KB at most.
> 
> Tested-by: Yunying Sun <yunying.sun@intel.com>
> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> ---
>  arch/x86/events/intel/uncore_discovery.c | 79 +++++++++++++++++++++++-
>  arch/x86/events/intel/uncore_discovery.h | 10 +++
>  2 files changed, 87 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
> index 9a698a92962a..ce520e69a3c1 100644
> --- a/arch/x86/events/intel/uncore_discovery.c
> +++ b/arch/x86/events/intel/uncore_discovery.c
> @@ -93,6 +93,8 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
>  	if (!type->box_ctrl_die)
>  		goto free_type;
>  
> +	type->units = RB_ROOT;
> +
>  	type->access_type = unit->access_type;
>  	num_discovered_types[type->access_type]++;
>  	type->type = unit->box_type;
> @@ -120,10 +122,59 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit)
>  	return add_uncore_discovery_type(unit);
>  }
>  
> +static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
> +{
> +	struct intel_uncore_discovery_unit *a_node, *b_node;
> +
> +	a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
> +	b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
> +
> +	if (a_node->pmu_idx < b_node->pmu_idx)
> +		return true;
> +	if (a_node->pmu_idx > b_node->pmu_idx)
> +		return false;
> +
> +	if (a_node->die < b_node->die)
> +		return true;
> +	if (a_node->die > b_node->die)
> +		return false;
> +
> +	return 0;

Will it be better if the rb_node is sorted by id instead
of pmu_idx+die? 

Then it will be faster for uncore_find_unit() to run in
O(log(N)) instead of O(N).  Right now it looks like we
are traversing the whole tree to find the entry with the
id.

Tim

> +}
> +
> +static inline struct intel_uncore_discovery_unit *
> +uncore_find_unit(struct rb_root *root, unsigned int id)
> +{
> +	struct intel_uncore_discovery_unit *unit;
> +	struct rb_node *node;
> +
> +	for (node = rb_first(root); node; node = rb_next(node)) {
> +		unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
> +		if (unit->id == id)
> +			return unit;
> +	}
> +
> +	return NULL;
> +}
> +

Re: [PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by Liang, Kan 1 year, 8 months ago


On 2024-06-10 6:40 p.m., Tim Chen wrote:
> On Mon, 2024-06-10 at 13:16 -0700, kan.liang@linux.intel.com wrote:
>> From: Kan Liang <kan.liang@linux.intel.com>
>>
>> The unit control address of some CXL units may be wrongly calculated
>> under some configuration on a EMR machine.
>>
>> The current implementation only saves the unit control address of the
>> units from the first die, and the first unit of the rest of dies. Perf
>> assumed that the units from the other dies have the same offset as the
>> first die. So the unit control address of the rest of the units can be
>> calculated. However, the assumption is wrong, especially for the CXL
>> units.
>>
>> Introduce an RB tree for each uncore type to save the unit control
>> address and ID information for all the units.
>>
>> Compared with the current implementation, more space is required to save
>> the information of all units. The extra size should be acceptable.
>> For example, on EMR, there are 221 units at most. For a 2-socket machine,
>> the extra space is ~6KB at most.
>>
>> Tested-by: Yunying Sun <yunying.sun@intel.com>
>> Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
>> ---
>>  arch/x86/events/intel/uncore_discovery.c | 79 +++++++++++++++++++++++-
>>  arch/x86/events/intel/uncore_discovery.h | 10 +++
>>  2 files changed, 87 insertions(+), 2 deletions(-)
>>
>> diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
>> index 9a698a92962a..ce520e69a3c1 100644
>> --- a/arch/x86/events/intel/uncore_discovery.c
>> +++ b/arch/x86/events/intel/uncore_discovery.c
>> @@ -93,6 +93,8 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
>>  	if (!type->box_ctrl_die)
>>  		goto free_type;
>>  
>> +	type->units = RB_ROOT;
>> +
>>  	type->access_type = unit->access_type;
>>  	num_discovered_types[type->access_type]++;
>>  	type->type = unit->box_type;
>> @@ -120,10 +122,59 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit)
>>  	return add_uncore_discovery_type(unit);
>>  }
>>  
>> +static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
>> +{
>> +	struct intel_uncore_discovery_unit *a_node, *b_node;
>> +
>> +	a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
>> +	b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
>> +
>> +	if (a_node->pmu_idx < b_node->pmu_idx)
>> +		return true;
>> +	if (a_node->pmu_idx > b_node->pmu_idx)
>> +		return false;
>> +
>> +	if (a_node->die < b_node->die)
>> +		return true;
>> +	if (a_node->die > b_node->die)
>> +		return false;
>> +
>> +	return 0;
> 
> Will it be better if the rb_node is sorted by id instead
> of pmu_idx+die?

The id and pmu_idx+die can all be used as a key to search the RB tree in
different places.

The id is the physical ID of a unit. The search via id is invoked when
adding a new unit. Perf needs to make sure that the same PMU idx
(logical id) is assigned to the unit with the same physical ID. Because
the units with the same physical ID in different dies share the same PMU.

The pmu_idx+die key is used when setting the cpumask. Please see
intel_uncore_find_discovery_unit_id() in the patch 2. Perf wants to
understand on which dies the given PMU is available.

Since different keys can be used to search the RB tree, I think one of
them has to traverse the whole tree. At the stage of adding a new unit,
the tree is not complete yet. It minimizes the impact of the O(N)
search. So I choose the pmu_idx+die rather than id.

Also, the driver only does once to build the tree and set the cpumask at
driver load time. I think the O(N) should be acceptable here.

Thanks,
Kan

> 
> Then it will be faster for uncore_find_unit() to run in
> O(log(N)) instead of O(N).  Right now it looks like we
> are traversing the whole tree to find the entry with the
> id.
> 
> Tim
> 
>> +}
>> +
>> +static inline struct intel_uncore_discovery_unit *
>> +uncore_find_unit(struct rb_root *root, unsigned int id)
>> +{
>> +	struct intel_uncore_discovery_unit *unit;
>> +	struct rb_node *node;
>> +
>> +	for (node = rb_first(root); node; node = rb_next(node)) {
>> +		unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
>> +		if (unit->id == id)
>> +			return unit;
>> +	}
>> +
>> +	return NULL;
>> +}
>> +
> 
>

Re: [PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by Tim Chen 1 year, 8 months ago

On Wed, 2024-06-12 at 10:49 -0400, Liang, Kan wrote:
> 
> 
> The id and pmu_idx+die can all be used as a key to search the RB tree in
> different places.
> 
> The id is the physical ID of a unit. The search via id is invoked when
> adding a new unit. Perf needs to make sure that the same PMU idx
> (logical id) is assigned to the unit with the same physical ID. Because
> the units with the same physical ID in different dies share the same PMU.

This info about having same physical ID implies the same PMU
is worth mentioning in a comment and will be quite helpful in
understanding the rb-tree organization.

Thanks.

Tim
> 
> The pmu_idx+die key is used when setting the cpumask. Please see
> intel_uncore_find_discovery_unit_id() in the patch 2. Perf wants to
> understand on which dies the given PMU is available.
> 
> Since different keys can be used to search the RB tree, I think one of
> them has to traverse the whole tree. At the stage of adding a new unit,
> the tree is not complete yet. It minimizes the impact of the O(N)
> search. So I choose the pmu_idx+die rather than id.
> 
> Also, the driver only does once to build the tree and set the cpumask at
> driver load time. I think the O(N) should be acceptable here.
> 
> Thanks,
> Kan

Re: [PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by Liang, Kan 1 year, 8 months ago


On 2024-06-12 1:33 p.m., Tim Chen wrote:
> On Wed, 2024-06-12 at 10:49 -0400, Liang, Kan wrote:
>>
>>
>> The id and pmu_idx+die can all be used as a key to search the RB tree in
>> different places.
>>
>> The id is the physical ID of a unit. The search via id is invoked when
>> adding a new unit. Perf needs to make sure that the same PMU idx
>> (logical id) is assigned to the unit with the same physical ID. Because
>> the units with the same physical ID in different dies share the same PMU.
> 
> This info about having same physical ID implies the same PMU
> is worth mentioning in a comment and will be quite helpful in
> understanding the rb-tree organization.
>

Sure I will update the description to explain the choice. Thanks.

Thanks,
Kan

> Thanks.
> 
> Tim
>>
>> The pmu_idx+die key is used when setting the cpumask. Please see
>> intel_uncore_find_discovery_unit_id() in the patch 2. Perf wants to
>> understand on which dies the given PMU is available.
>>
>> Since different keys can be used to search the RB tree, I think one of
>> them has to traverse the whole tree. At the stage of adding a new unit,
>> the tree is not complete yet. It minimizes the impact of the O(N)
>> search. So I choose the pmu_idx+die rather than id.
>>
>> Also, the driver only does once to build the tree and set the cpumask at
>> driver load time. I think the O(N) should be acceptable here.
>>
>> Thanks,
>> Kan
>

Re: [PATCH 1/8] perf/x86/uncore: Save the unit control address of all units

Posted by Tim Chen 1 year, 8 months ago

On Wed, 2024-06-12 at 10:49 -0400, Liang, Kan wrote:
> 
> On 2024-06-10 6:40 p.m., Tim Chen wrote:
> > On Mon, 2024-06-10 at 13:16 -0700, kan.liang@linux.intel.com wrote:
> > > From: Kan Liang <kan.liang@linux.intel.com>
> > > 
> > > The unit control address of some CXL units may be wrongly calculated
> > > under some configuration on a EMR machine.
> > > 
> > > The current implementation only saves the unit control address of the
> > > units from the first die, and the first unit of the rest of dies. Perf
> > > assumed that the units from the other dies have the same offset as the
> > > first die. So the unit control address of the rest of the units can be
> > > calculated. However, the assumption is wrong, especially for the CXL
> > > units.
> > > 
> > > Introduce an RB tree for each uncore type to save the unit control
> > > address and ID information for all the units.
> > > 
> > > Compared with the current implementation, more space is required to save
> > > the information of all units. The extra size should be acceptable.
> > > For example, on EMR, there are 221 units at most. For a 2-socket machine,
> > > the extra space is ~6KB at most.
> > > 
> > > Tested-by: Yunying Sun <yunying.sun@intel.com>
> > > Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
> > > ---
> > >  arch/x86/events/intel/uncore_discovery.c | 79 +++++++++++++++++++++++-
> > >  arch/x86/events/intel/uncore_discovery.h | 10 +++
> > >  2 files changed, 87 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/arch/x86/events/intel/uncore_discovery.c b/arch/x86/events/intel/uncore_discovery.c
> > > index 9a698a92962a..ce520e69a3c1 100644
> > > --- a/arch/x86/events/intel/uncore_discovery.c
> > > +++ b/arch/x86/events/intel/uncore_discovery.c
> > > @@ -93,6 +93,8 @@ add_uncore_discovery_type(struct uncore_unit_discovery *unit)
> > >  	if (!type->box_ctrl_die)
> > >  		goto free_type;
> > >  
> > > +	type->units = RB_ROOT;
> > > +
> > >  	type->access_type = unit->access_type;
> > >  	num_discovered_types[type->access_type]++;
> > >  	type->type = unit->box_type;
> > > @@ -120,10 +122,59 @@ get_uncore_discovery_type(struct uncore_unit_discovery *unit)
> > >  	return add_uncore_discovery_type(unit);
> > >  }
> > >  
> > > +static inline bool unit_less(struct rb_node *a, const struct rb_node *b)
> > > +{
> > > +	struct intel_uncore_discovery_unit *a_node, *b_node;
> > > +
> > > +	a_node = rb_entry(a, struct intel_uncore_discovery_unit, node);
> > > +	b_node = rb_entry(b, struct intel_uncore_discovery_unit, node);
> > > +
> > > +	if (a_node->pmu_idx < b_node->pmu_idx)
> > > +		return true;
> > > +	if (a_node->pmu_idx > b_node->pmu_idx)
> > > +		return false;
> > > +
> > > +	if (a_node->die < b_node->die)
> > > +		return true;
> > > +	if (a_node->die > b_node->die)
> > > +		return false;
> > > +
> > > +	return 0;
> > 
> > Will it be better if the rb_node is sorted by id instead
> > of pmu_idx+die?
> 
> The id and pmu_idx+die can all be used as a key to search the RB tree in
> different places.
> 
> The id is the physical ID of a unit. The search via id is invoked when
> adding a new unit. Perf needs to make sure that the same PMU idx
> (logical id) is assigned to the unit with the same physical ID. Because
> the units with the same physical ID in different dies share the same PMU.
> 
> The pmu_idx+die key is used when setting the cpumask. Please see
> intel_uncore_find_discovery_unit_id() in the patch 2. Perf wants to
> understand on which dies the given PMU is available.
> 
> Since different keys can be used to search the RB tree, I think one of
> them has to traverse the whole tree. At the stage of adding a new unit,
> the tree is not complete yet. It minimizes the impact of the O(N)
> search. So I choose the pmu_idx+die rather than id.
> 
> Also, the driver only does once to build the tree and set the cpumask at
> driver load time. I think the O(N) should be acceptable here.
> 

Thanks for explaining.  I think you saying you are looking up pmu in the tree with id
once during driver load time but needs to look up the tree by die and pmu_idx
more frequently, hence the choice for index.

May be worth mentioning that in a comment.

Tim

> Thanks,
> Kan
> 
> > 
> > Then it will be faster for uncore_find_unit() to run in
> > O(log(N)) instead of O(N).  Right now it looks like we
> > are traversing the whole tree to find the entry with the
> > id.
> > 
> > Tim
> > 
> > > +}
> > > +
> > > +static inline struct intel_uncore_discovery_unit *
> > > +uncore_find_unit(struct rb_root *root, unsigned int id)
> > > +{
> > > +	struct intel_uncore_discovery_unit *unit;
> > > +	struct rb_node *node;
> > > +
> > > +	for (node = rb_first(root); node; node = rb_next(node)) {
> > > +		unit = rb_entry(node, struct intel_uncore_discovery_unit, node);
> > > +		if (unit->id == id)
> > > +			return unit;
> > > +	}
> > > +
> > > +	return NULL;
> > > +}
> > > +
> > 
> >