Add the sysfs interface to write the event configuration for the
MBM configurable events. The event configuration can be changed by
writing to the sysfs file for that specific event.
Following are the types of events supported:
==== ===========================================================
Bits Description
==== ===========================================================
6 Dirty Victims from the QOS domain to all types of memory
5 Reads to slow memory in the non-local NUMA domain
4 Reads to slow memory in the local NUMA domain
3 Non-temporal writes to non-local NUMA domain
2 Non-temporal writes to local NUMA domain
1 Reads to memory in the non-local NUMA domain
0 Reads to memory in the local NUMA domain
==== ===========================================================
By default, the mbm_total_bytes configuration is set to 0x7F to count
all the event types and the mbm_local_bytes configuration is set to
0x15 to count all the local memory events.
For example:
To change mbm_total_bytes to count all the reads, run the command:
$echo 0x33 > /sys/fs/resctrl/mon_data/mon_L3_00/mbm_total_config
To change mbm_local_bytes to count all the slow memory reads, run the
command:
$echo 0x30 > /sys/fs/resctrl/mon_data/mon_L3_00/mbm_local_config
Signed-off-by: Babu Moger <babu.moger@amd.com>
---
arch/x86/kernel/cpu/resctrl/rdtgroup.c | 112 ++++++++++++++++++++++++++++++++
1 file changed, 112 insertions(+)
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 6f067c1ac7c1..59b484eb1267 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -330,9 +330,121 @@ int rdtgroup_mondata_config_show(struct seq_file *m, void *arg)
return ret;
}
+/*
+ * This is called via IPI to write the event configuration for an
+ * MBM event to the corresponding configuration MSR on a CPU in
+ * the domain. (Not a counter read — this is the write path.)
+ */
+void mon_event_config_write(void *info)
+{
+	struct mon_config_info *mon_info = info;
+	u32 msr_index;
+
+	/* Map the event id to the index of its configuration MSR */
+	switch (mon_info->evtid) {
+	case QOS_L3_MBM_TOTAL_EVENT_ID:
+		msr_index = 0;
+		break;
+	case QOS_L3_MBM_LOCAL_EVENT_ID:
+		msr_index = 1;
+		break;
+	default:
+		/* Not expected to come here */
+		return;
+	}
+
+	wrmsr(MSR_IA32_EVT_CFG_BASE + msr_index, mon_info->mon_config, 0);
+}
+
+/*
+ * Write handler for the per-domain mon_data event configuration files:
+ * parse the requested event configuration, validate it and program it
+ * on the CPUs of the domain the file belongs to.
+ *
+ * Returns nbytes on success, a negative errno otherwise.
+ */
+ssize_t rdtgroup_mondata_config_write(struct kernfs_open_file *of,
+				      char *buf, size_t nbytes, loff_t off)
+{
+	struct mon_config_info mon_info;
+	struct rdt_hw_resource *hw_res;
+	struct rdtgroup *rdtgrp;
+	struct rdt_resource *r;
+	unsigned int mon_config;
+	union mon_data_bits md;
+	struct rdt_domain *d;
+	u32 resid, domid;
+	int ret = 0;
+
+	ret = kstrtouint(buf, 0, &mon_config);
+	if (ret)
+		return ret;
+
+	rdt_last_cmd_clear();
+
+	/* mon_config cannot be more than the supported set of events */
+	if (mon_config > MAX_EVT_CONFIG_BITS) {
+		rdt_last_cmd_puts("Invalid event configuration\n");
+		return -EINVAL;
+	}
+
+	cpus_read_lock();
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+	if (!rdtgrp) {
+		/*
+		 * Must not return directly here: both cpus_read_lock()
+		 * and the kernfs active reference must be released.
+		 */
+		ret = -ENOENT;
+		goto e_unlock;
+	}
+
+	md.priv = of->kn->priv;
+	resid = md.u.rid;
+	domid = md.u.domid;
+
+	hw_res = &rdt_resources_all[resid];
+	r = &hw_res->r_resctrl;
+	d = rdt_find_domain(r, domid, NULL);
+	if (IS_ERR_OR_NULL(d)) {
+		ret = -ENOENT;
+		goto e_unlock;
+	}
+
+	/*
+	 * Read the current configuration first. If it already matches
+	 * the requested value, there is nothing to write.
+	 */
+	mon_info.evtid = md.u.evtid;
+	mondata_config_read(d, &mon_info);
+	if (mon_info.mon_config == mon_config)
+		goto e_unlock;
+
+	mon_info.mon_config = mon_config;
+
+	/*
+	 * Update MSR_IA32_EVT_CFG_BASE MSR on all the CPUs in the
+	 * domain. The scope of the register is per domain, but the
+	 * hardware team recommends writing it on every CPU thread in
+	 * the domain — TODO: confirm once this is clarified in the spec.
+	 */
+	on_each_cpu_mask(&d->cpu_mask, mon_event_config_write, &mon_info, 1);
+
+	/*
+	 * When an Event Configuration is changed, the bandwidth counters
+	 * for all RMIDs and Events will be cleared by the hardware. The
+	 * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for
+	 * every RMID on the next read to any event for every RMID.
+	 * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62)
+	 * cleared while it is tracked by the hardware. Clear the
+	 * mbm_local and mbm_total counts for all the RMIDs.
+	 */
+	memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
+	memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
+
+e_unlock:
+	rdtgroup_kn_unlock(of->kn);
+	cpus_read_unlock();
+
+	return ret ?: nbytes;
+}
+
/* kernfs operations for the per-event monitor configuration files */
static const struct kernfs_ops kf_mondata_config_ops = {
.atomic_write_len = PAGE_SIZE,
.seq_show = rdtgroup_mondata_config_show,
+ .write = rdtgroup_mondata_config_write,
};
static bool is_cpu_list(struct kernfs_open_file *of)
Hi Babu,
On 9/7/2022 11:01 AM, Babu Moger wrote:
...
> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> index 6f067c1ac7c1..59b484eb1267 100644
> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
> @@ -330,9 +330,121 @@ int rdtgroup_mondata_config_show(struct seq_file *m, void *arg)
> return ret;
> }
>
> +/*
> + * This is called via IPI to read the CQM/MBM counters
> + * in a domain.
copy&paste from previous patch?
> + */
> +void mon_event_config_write(void *info)
> +{
> + struct mon_config_info *mon_info = info;
> + u32 msr_index;
> +
> + switch (mon_info->evtid) {
> + case QOS_L3_MBM_TOTAL_EVENT_ID:
> + msr_index = 0;
> + break;
> + case QOS_L3_MBM_LOCAL_EVENT_ID:
> + msr_index = 1;
> + break;
> + default:
> + /* Not expected to come here */
> + return;
> + }
> +
> + wrmsr(MSR_IA32_EVT_CFG_BASE + msr_index, mon_info->mon_config, 0);
> +}
> +
> +ssize_t rdtgroup_mondata_config_write(struct kernfs_open_file *of,
> + char *buf, size_t nbytes, loff_t off)
> +{
> + struct mon_config_info mon_info;
> + struct rdt_hw_resource *hw_res;
> + struct rdtgroup *rdtgrp;
> + struct rdt_resource *r;
> + unsigned int mon_config;
> + cpumask_var_t cpu_mask;
> + union mon_data_bits md;
> + struct rdt_domain *d;
> + u32 resid, domid;
> + int ret = 0, cpu;
> +
> + ret = kstrtouint(buf, 0, &mon_config);
> + if (ret)
> + return ret;
> +
> + rdt_last_cmd_clear();
> +
> + /* mon_config cannot be more than the supported set of events */
> + if (mon_config > MAX_EVT_CONFIG_BITS) {
> + rdt_last_cmd_puts("Invalid event configuration\n");
> + return -EINVAL;
> + }
> +
> + cpus_read_lock();
> + rdtgrp = rdtgroup_kn_lock_live(of->kn);
> + if (!rdtgrp) {
> + return -ENOENT;
> + goto e_unlock;
> + }
> +
> + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) {
> + ret = -ENOMEM;
> + goto e_unlock;
> + }
> +
> + md.priv = of->kn->priv;
> + resid = md.u.rid;
> + domid = md.u.domid;
> +
> + hw_res = &rdt_resources_all[resid];
> + r = &hw_res->r_resctrl;
> + d = rdt_find_domain(r, domid, NULL);
> + if (IS_ERR_OR_NULL(d)) {
> + ret = -ENOENT;
> + goto e_cpumask;
> + }
> +
> + /*
> + * Read the current config value first. If both are same
> + * then we dont need to write again
> + */
> + mon_info.evtid = md.u.evtid;
> + mondata_config_read(d, &mon_info);
> + if (mon_info.mon_config == mon_config)
> + goto e_cpumask;
> +
> + mon_info.mon_config = mon_config;
> +
> + /* Pick all the CPUs in the domain instance */
> + for_each_cpu(cpu, &d->cpu_mask)
> + cpumask_set_cpu(cpu, cpu_mask);
> +
> + /* Update MSR_IA32_EVT_CFG_BASE MSR on all the CPUs in cpu_mask */
> + on_each_cpu_mask(cpu_mask, mon_event_config_write, &mon_info, 1);
If this is required then could you please add a comment why every CPU in
the domain needs to be updated? Until now configuration changes
only needed to be made on one CPU per domain.
Even in the previous patch when the user reads the current configuration
value it is only done on one CPU in the domain ... to me that implies
that the scope is per domain and only one CPU in the domain needs to be
changed.
> +
> + /*
> + * When an Event Configuration is changed, the bandwidth counters
> + * for all RMIDs and Events will be cleared, and the U-bit for every
> + * RMID will be set on the next read to any BwEvent for every RMID.
> + * Clear the mbm_local and mbm_total counts for all the RMIDs.
> + */
This is a snippet that was copied from the hardware spec and since it is
inserted into the kernel driver the context makes it hard to understand.
Could it be translated into what it means in this context?
Perhaps something like:
/*
* When an Event Configuration is changed, the bandwidth counters
* for all RMIDs and Events will be cleared by the hardware. The
* hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for every
* RMID on the next read to any event for every RMID. Subsequent
* reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) cleared
* while it is tracked by the hardware.
* Clear the mbm_local and mbm_total counts for all the RMIDs.
*/
Please fixup where I got it wrong.
> + memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
> + memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
> +
Reinette
On 9/16/22 11:17, Reinette Chatre wrote:
> Hi Babu,
>
> On 9/7/2022 11:01 AM, Babu Moger wrote:
>
> ...
>
>> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> index 6f067c1ac7c1..59b484eb1267 100644
>> --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
>> @@ -330,9 +330,121 @@ int rdtgroup_mondata_config_show(struct seq_file *m, void *arg)
>> return ret;
>> }
>>
>> +/*
>> + * This is called via IPI to read the CQM/MBM counters
>> + * in a domain.
> copy&paste from previous patch?
Will correct it.
>
>> + */
>> +void mon_event_config_write(void *info)
>> +{
>> + struct mon_config_info *mon_info = info;
>> + u32 msr_index;
>> +
>> + switch (mon_info->evtid) {
>> + case QOS_L3_MBM_TOTAL_EVENT_ID:
>> + msr_index = 0;
>> + break;
>> + case QOS_L3_MBM_LOCAL_EVENT_ID:
>> + msr_index = 1;
>> + break;
>> + default:
>> + /* Not expected to come here */
>> + return;
>> + }
>> +
>> + wrmsr(MSR_IA32_EVT_CFG_BASE + msr_index, mon_info->mon_config, 0);
>> +}
>> +
>> +ssize_t rdtgroup_mondata_config_write(struct kernfs_open_file *of,
>> + char *buf, size_t nbytes, loff_t off)
>> +{
>> + struct mon_config_info mon_info;
>> + struct rdt_hw_resource *hw_res;
>> + struct rdtgroup *rdtgrp;
>> + struct rdt_resource *r;
>> + unsigned int mon_config;
>> + cpumask_var_t cpu_mask;
>> + union mon_data_bits md;
>> + struct rdt_domain *d;
>> + u32 resid, domid;
>> + int ret = 0, cpu;
>> +
>> + ret = kstrtouint(buf, 0, &mon_config);
>> + if (ret)
>> + return ret;
>> +
>> + rdt_last_cmd_clear();
>> +
>> + /* mon_config cannot be more than the supported set of events */
>> + if (mon_config > MAX_EVT_CONFIG_BITS) {
>> + rdt_last_cmd_puts("Invalid event configuration\n");
>> + return -EINVAL;
>> + }
>> +
>> + cpus_read_lock();
>> + rdtgrp = rdtgroup_kn_lock_live(of->kn);
>> + if (!rdtgrp) {
>> + return -ENOENT;
>> + goto e_unlock;
>> + }
>> +
>> + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) {
>> + ret = -ENOMEM;
>> + goto e_unlock;
>> + }
>> +
>> + md.priv = of->kn->priv;
>> + resid = md.u.rid;
>> + domid = md.u.domid;
>> +
>> + hw_res = &rdt_resources_all[resid];
>> + r = &hw_res->r_resctrl;
>> + d = rdt_find_domain(r, domid, NULL);
>> + if (IS_ERR_OR_NULL(d)) {
>> + ret = -ENOENT;
>> + goto e_cpumask;
>> + }
>> +
>> + /*
>> + * Read the current config value first. If both are same
>> + * then we dont need to write again
>> + */
>> + mon_info.evtid = md.u.evtid;
>> + mondata_config_read(d, &mon_info);
>> + if (mon_info.mon_config == mon_config)
>> + goto e_cpumask;
>> +
>> + mon_info.mon_config = mon_config;
>> +
>> + /* Pick all the CPUs in the domain instance */
>> + for_each_cpu(cpu, &d->cpu_mask)
>> + cpumask_set_cpu(cpu, cpu_mask);
>> +
>> + /* Update MSR_IA32_EVT_CFG_BASE MSR on all the CPUs in cpu_mask */
>> + on_each_cpu_mask(cpu_mask, mon_event_config_write, &mon_info, 1);
> If this is required then could you please add a comment why every CPU in
> the domain needs to be updated? Until now configuration changes
> only needed to be made on one CPU per domain.
> Even in the previous patch when the user reads the current configuration
> value it is only done on one CPU in the domain ... to me that implies
> that the scope is per domain and only one CPU in the domain needs to be
> changed.
Yes, this register is domain-specific. However, the hardware team
recommends updating it on all the CPU threads in the domain. That is not
clear in the specs right now.
I have asked them to make it clear in the specs next time.
>
>> +
>> + /*
>> + * When an Event Configuration is changed, the bandwidth counters
>> + * for all RMIDs and Events will be cleared, and the U-bit for every
>> + * RMID will be set on the next read to any BwEvent for every RMID.
>> + * Clear the mbm_local and mbm_total counts for all the RMIDs.
>> + */
> This is a snippet that was copied from the hardware spec and since it is
> inserted into the kernel driver the context makes it hard to understand.
> Could it be translated into what it means in this context?
> Perhaps something like:
>
> /*
> * When an Event Configuration is changed, the bandwidth counters
> * for all RMIDs and Events will be cleared by the hardware. The
> * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for every
> * RMID on the next read to any event for every RMID. Subsequent
> * reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) cleared
> * while it is tracked by the hardware.
> * Clear the mbm_local and mbm_total counts for all the RMIDs.
> */
>
> Please fixup where I got it wrong.
Looks good.
Thanks Babu
>
>> + memset(d->mbm_local, 0, sizeof(struct mbm_state) * r->num_rmid);
>> + memset(d->mbm_total, 0, sizeof(struct mbm_state) * r->num_rmid);
>> +
> Reinette
--
Thanks
Babu Moger
© 2016 - 2026 Red Hat, Inc.