Searching for the right pmu by iterating over all pmus is no longer
required since all pmus now *must* be present in the 'pmu_idr' list.
So, remove linear searching code.
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
---
kernel/events/core.c | 37 +++++++++++++------------------------
1 file changed, 13 insertions(+), 24 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0695bb9fbbb6..eba2b8595115 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11630,38 +11630,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
+ ret = -ENOENT;
rcu_read_lock();
pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
- if (pmu) {
- if (event->attr.type != type && type != PERF_TYPE_RAW &&
- !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
-
- ret = perf_try_init_event(pmu, event);
- if (ret == -ENOENT && event->attr.type != type && !extended_type) {
- type = event->attr.type;
- goto again;
- }
+ if (!pmu)
+ goto fail;
- if (ret)
- pmu = ERR_PTR(ret);
+ if (event->attr.type != type && type != PERF_TYPE_RAW &&
+ !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+ goto fail;
- goto unlock;
+ ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type && !extended_type) {
+ type = event->attr.type;
+ goto again;
}
- list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
- ret = perf_try_init_event(pmu, event);
- if (!ret)
- goto unlock;
-
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
- }
fail:
- pmu = ERR_PTR(-ENOENT);
+ if (ret)
+ pmu = ERR_PTR(ret);
+
unlock:
srcu_read_unlock(&pmus_srcu, idx);
--
2.40.0
Hi Ravi,
+ arm64 KVM folks
On Thu, May 04, 2023 at 04:30:02PM +0530, Ravi Bangoria wrote:
> Searching for the right pmu by iterating over all pmus is no longer
> required since all pmus now *must* be present in the 'pmu_idr' list.
> So, remove linear searching code.
>
> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
> ---
> kernel/events/core.c | 37 +++++++++++++------------------------
> 1 file changed, 13 insertions(+), 24 deletions(-)
>
> diff --git a/kernel/events/core.c b/kernel/events/core.c
> index 0695bb9fbbb6..eba2b8595115 100644
> --- a/kernel/events/core.c
> +++ b/kernel/events/core.c
> @@ -11630,38 +11630,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
> }
>
> again:
> + ret = -ENOENT;
> rcu_read_lock();
> pmu = idr_find(&pmu_idr, type);
> rcu_read_unlock();
> - if (pmu) {
> - if (event->attr.type != type && type != PERF_TYPE_RAW &&
> - !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
> - goto fail;
> -
> - ret = perf_try_init_event(pmu, event);
> - if (ret == -ENOENT && event->attr.type != type && !extended_type) {
> - type = event->attr.type;
> - goto again;
> - }
> + if (!pmu)
> + goto fail;
>
> - if (ret)
> - pmu = ERR_PTR(ret);
> + if (event->attr.type != type && type != PERF_TYPE_RAW &&
> + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
> + goto fail;
>
> - goto unlock;
> + ret = perf_try_init_event(pmu, event);
> + if (ret == -ENOENT && event->attr.type != type && !extended_type) {
> + type = event->attr.type;
> + goto again;
> }
>
> - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
> - ret = perf_try_init_event(pmu, event);
> - if (!ret)
> - goto unlock;
> -
> - if (ret != -ENOENT) {
> - pmu = ERR_PTR(ret);
> - goto unlock;
> - }
> - }
> fail:
> - pmu = ERR_PTR(-ENOENT);
> + if (ret)
> + pmu = ERR_PTR(ret);
> +
> unlock:
> srcu_read_unlock(&pmus_srcu, idx);
>
> --
> 2.40.0
>
My apologies if this has already been reported or fixed already, I did a
search of lore.kernel.org and did not find anything. This patch as
commit 9551fbb64d09 ("perf/core: Remove pmu linear searching code") in
-next breaks starting QEMU with KVM enabled on two of my arm64 machines:
$ qemu-system-aarch64 \
-display none \
-nodefaults \
-machine virt,gic-version=max \
-append 'console=ttyAMA0 earlycon' \
-kernel arch/arm64/boot/Image.gz \
-initrd rootfs.cpio \
-cpu host \
-enable-kvm \
-m 512m \
-smp 8 \
-serial mon:stdio
qemu-system-aarch64: PMU: KVM_SET_DEVICE_ATTR: No such device
qemu-system-aarch64: failed to set irq for PMU
In the kernel log, I see
[ 42.944952] kvm: pmu event creation failed -2
I am not sure if this issue is unexpected as a result of this change or
if there is something that needs to change on the arm64 KVM side (it
appears the kernel message comes from arch/arm64/kvm/pmu-emul.c).
If there is any further information I can provide or patches I can test,
I am more than happy to do so.
Cheers,
Nathan
# bad: [cf09e328589a2ed7f6c8d90f2edb697fb4f8a96b] Add linux-next specific files for 20230524
# good: [27e462c8fad4bf04ec4f81f8539ce6fa947ead3a] Merge tag 'xtensa-20230523' of https://github.com/jcmvbkbc/linux-xtensa
git bisect start 'cf09e328589a2ed7f6c8d90f2edb697fb4f8a96b' '27e462c8fad4bf04ec4f81f8539ce6fa947ead3a'
# good: [a20d8ab9e26daaeeaf971139b736981cf164ab0a] Merge branch 'for-linux-next' of git://anongit.freedesktop.org/drm/drm-misc
git bisect good a20d8ab9e26daaeeaf971139b736981cf164ab0a
# good: [2714032dfd641b22695e14efd5f9dff08a5e3245] Merge branch 'next' of git://git.kernel.org/pub/scm/linux/kernel/git/dtor/input.git
git bisect good 2714032dfd641b22695e14efd5f9dff08a5e3245
# bad: [b2bc2854ec87557033538aa9290f70b9141a6653] Merge branch 'for-leds-next' of git://git.kernel.org/pub/scm/linux/kernel/git/lee/leds.git
git bisect bad b2bc2854ec87557033538aa9290f70b9141a6653
# good: [20d4044f23c7724020b6c7d34ccee9bb929d1078] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/spi.git
git bisect good 20d4044f23c7724020b6c7d34ccee9bb929d1078
# bad: [c3cab2fce7b318ee2edf148b1436f3a3864ae773] Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace.git
git bisect bad c3cab2fce7b318ee2edf148b1436f3a3864ae773
# bad: [af75e6871092fc1f9fa039132d5667f1e0a47a0a] Merge branch into tip/master: 'sched/core'
git bisect bad af75e6871092fc1f9fa039132d5667f1e0a47a0a
# good: [c50a7b40a2f50403c3f58f1c27a85e4c5d2e0865] Merge branch into tip/master: 'objtool/core'
git bisect good c50a7b40a2f50403c3f58f1c27a85e4c5d2e0865
# good: [519fabc7aaba3f0847cf37d5f9a5740c370eb777] psi: remove 500ms min window size limitation for triggers
git bisect good 519fabc7aaba3f0847cf37d5f9a5740c370eb777
# bad: [b85c6694924e9f09a40a2e0a3798f3945eaa6fda] Merge branch into tip/master: 'perf/core'
git bisect bad b85c6694924e9f09a40a2e0a3798f3945eaa6fda
# bad: [9551fbb64d094cc105964716224adeb7765df8fd] perf/core: Remove pmu linear searching code
git bisect bad 9551fbb64d094cc105964716224adeb7765df8fd
# good: [2fad201fe38ff9a692acedb1990ece2c52a29f95] perf/ibs: Fix interface via core pmu events
git bisect good 2fad201fe38ff9a692acedb1990ece2c52a29f95
# first bad commit: [9551fbb64d094cc105964716224adeb7765df8fd] perf/core: Remove pmu linear searching code
Hi Nathan,
On 25-May-23 3:11 AM, Nathan Chancellor wrote:
> Hi Ravi,
>
> + arm64 KVM folks
>
> On Thu, May 04, 2023 at 04:30:02PM +0530, Ravi Bangoria wrote:
>> Searching for the right pmu by iterating over all pmus is no longer
>> required since all pmus now *must* be present in the 'pmu_idr' list.
>> So, remove linear searching code.
>>
>> Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
>> ---
>> kernel/events/core.c | 37 +++++++++++++------------------------
>> 1 file changed, 13 insertions(+), 24 deletions(-)
>>
>> diff --git a/kernel/events/core.c b/kernel/events/core.c
>> index 0695bb9fbbb6..eba2b8595115 100644
>> --- a/kernel/events/core.c
>> +++ b/kernel/events/core.c
>> @@ -11630,38 +11630,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
>> }
>>
>> again:
>> + ret = -ENOENT;
>> rcu_read_lock();
>> pmu = idr_find(&pmu_idr, type);
>> rcu_read_unlock();
>> - if (pmu) {
>> - if (event->attr.type != type && type != PERF_TYPE_RAW &&
>> - !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
>> - goto fail;
>> -
>> - ret = perf_try_init_event(pmu, event);
>> - if (ret == -ENOENT && event->attr.type != type && !extended_type) {
>> - type = event->attr.type;
>> - goto again;
>> - }
>> + if (!pmu)
>> + goto fail;
>>
>> - if (ret)
>> - pmu = ERR_PTR(ret);
>> + if (event->attr.type != type && type != PERF_TYPE_RAW &&
>> + !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
>> + goto fail;
>>
>> - goto unlock;
>> + ret = perf_try_init_event(pmu, event);
>> + if (ret == -ENOENT && event->attr.type != type && !extended_type) {
>> + type = event->attr.type;
>> + goto again;
>> }
>>
>> - list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
>> - ret = perf_try_init_event(pmu, event);
>> - if (!ret)
>> - goto unlock;
>> -
>> - if (ret != -ENOENT) {
>> - pmu = ERR_PTR(ret);
>> - goto unlock;
>> - }
>> - }
>> fail:
>> - pmu = ERR_PTR(-ENOENT);
>> + if (ret)
>> + pmu = ERR_PTR(ret);
>> +
>> unlock:
>> srcu_read_unlock(&pmus_srcu, idx);
>>
>> --
>> 2.40.0
>>
>
> My apologies if this has already been reported or fixed already, I did a
> search of lore.kernel.org and did not find anything. This patch as
> commit 9551fbb64d09 ("perf/core: Remove pmu linear searching code") in
> -next breaks starting QEMU with KVM enabled on two of my arm64 machines:
>
> $ qemu-system-aarch64 \
> -display none \
> -nodefaults \
> -machine virt,gic-version=max \
> -append 'console=ttyAMA0 earlycon' \
> -kernel arch/arm64/boot/Image.gz \
> -initrd rootfs.cpio \
> -cpu host \
> -enable-kvm \
> -m 512m \
> -smp 8 \
> -serial mon:stdio
> qemu-system-aarch64: PMU: KVM_SET_DEVICE_ATTR: No such device
> qemu-system-aarch64: failed to set irq for PMU
>
> In the kernel log, I see
>
> [ 42.944952] kvm: pmu event creation failed -2
>
> I am not sure if this issue is unexpected as a result of this change or
> if there is something that needs to change on the arm64 KVM side (it
> appears the kernel message comes from arch/arm64/kvm/pmu-emul.c).
Thanks for reporting it.
Based on these detail, I feel the pmu registration failed in the host,
most probably because pmu driver did not pass pmu name while calling
perf_pmu_register(). Consequently kvm also failed while trying to use
it for guest. Can you please check host kernel logs.
I'm sorry but I neither have Arm board to try myself not I'm familiar
with the Arm architecture. So I'll need your help to diagnose it.
Ravi
On Thu, May 25, 2023 at 10:46:01AM +0530, Ravi Bangoria wrote:
> On 25-May-23 3:11 AM, Nathan Chancellor wrote:
> > My apologies if this has already been reported or fixed already, I did a
> > search of lore.kernel.org and did not find anything. This patch as
> > commit 9551fbb64d09 ("perf/core: Remove pmu linear searching code") in
> > -next breaks starting QEMU with KVM enabled on two of my arm64 machines:
> >
> > $ qemu-system-aarch64 \
> > -display none \
> > -nodefaults \
> > -machine virt,gic-version=max \
> > -append 'console=ttyAMA0 earlycon' \
> > -kernel arch/arm64/boot/Image.gz \
> > -initrd rootfs.cpio \
> > -cpu host \
> > -enable-kvm \
> > -m 512m \
> > -smp 8 \
> > -serial mon:stdio
> > qemu-system-aarch64: PMU: KVM_SET_DEVICE_ATTR: No such device
> > qemu-system-aarch64: failed to set irq for PMU
> >
> > In the kernel log, I see
> >
> > [ 42.944952] kvm: pmu event creation failed -2
> >
> > I am not sure if this issue is unexpected as a result of this change or
> > if there is something that needs to change on the arm64 KVM side (it
> > appears the kernel message comes from arch/arm64/kvm/pmu-emul.c).
>
> Thanks for reporting it.
>
> Based on these detail, I feel the pmu registration failed in the host,
> most probably because pmu driver did not pass pmu name while calling
> perf_pmu_register(). Consequently kvm also failed while trying to use
> it for guest. Can you please check host kernel logs.
The PMUv3 driver does pass a name, but it relies on getting back an
allocated pmu id as @type is -1 in the call to perf_pmu_register().
What actually broke is how KVM probes for a default core PMU to use for
a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
reads the pmu from the returned perf_event. The linear search had the
effect of eventually stumbling on the correct core PMU and succeeding.
Perf folks: is this WAI for heterogenous systems?
Either way, the whole KVM end of this scheme is a bit clunky, and I
believe it to be unneccessary at this point as we maintain a list of
core PMU instances that KVM is able to virtualize. We can just walk
that to find a default PMU to use.
Not seeing any issues on -next with the below diff. If this works for
folks I can actually wrap it up in a patch and send it out.
diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
index 45727d50d18d..cbc0b662b7f8 100644
--- a/arch/arm64/kvm/pmu-emul.c
+++ b/arch/arm64/kvm/pmu-emul.c
@@ -694,47 +694,26 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
static struct arm_pmu *kvm_pmu_probe_armpmu(void)
{
- struct perf_event_attr attr = { };
- struct perf_event *event;
- struct arm_pmu *pmu = NULL;
-
- /*
- * Create a dummy event that only counts user cycles. As we'll never
- * leave this function with the event being live, it will never
- * count anything. But it allows us to probe some of the PMU
- * details. Yes, this is terrible.
- */
- attr.type = PERF_TYPE_RAW;
- attr.size = sizeof(attr);
- attr.pinned = 1;
- attr.disabled = 0;
- attr.exclude_user = 0;
- attr.exclude_kernel = 1;
- attr.exclude_hv = 1;
- attr.exclude_host = 1;
- attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
- attr.sample_period = GENMASK(63, 0);
+ struct arm_pmu *arm_pmu = NULL, *tmp;
+ struct arm_pmu_entry *entry;
+ int cpu;
- event = perf_event_create_kernel_counter(&attr, -1, current,
- kvm_pmu_perf_overflow, &attr);
+ mutex_lock(&arm_pmus_lock);
+ cpu = get_cpu();
- if (IS_ERR(event)) {
- pr_err_once("kvm: pmu event creation failed %ld\n",
- PTR_ERR(event));
- return NULL;
- }
+ list_for_each_entry(entry, &arm_pmus, entry) {
+ tmp = entry->arm_pmu;
- if (event->pmu) {
- pmu = to_arm_pmu(event->pmu);
- if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
- pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
- pmu = NULL;
+ if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) {
+ arm_pmu = tmp;
+ break;
+ }
}
- perf_event_disable(event);
- perf_event_release_kernel(event);
+ put_cpu();
+ mutex_unlock(&arm_pmus_lock);
- return pmu;
+ return arm_pmu;
}
u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
--
Thanks,
Oliver
On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> On Thu, May 25, 2023 at 10:46:01AM +0530, Ravi Bangoria wrote:
> > On 25-May-23 3:11 AM, Nathan Chancellor wrote:
> > > My apologies if this has already been reported or fixed already, I did a
> > > search of lore.kernel.org and did not find anything. This patch as
> > > commit 9551fbb64d09 ("perf/core: Remove pmu linear searching code") in
> > > -next breaks starting QEMU with KVM enabled on two of my arm64 machines:
> > >
> > > $ qemu-system-aarch64 \
> > > -display none \
> > > -nodefaults \
> > > -machine virt,gic-version=max \
> > > -append 'console=ttyAMA0 earlycon' \
> > > -kernel arch/arm64/boot/Image.gz \
> > > -initrd rootfs.cpio \
> > > -cpu host \
> > > -enable-kvm \
> > > -m 512m \
> > > -smp 8 \
> > > -serial mon:stdio
> > > qemu-system-aarch64: PMU: KVM_SET_DEVICE_ATTR: No such device
> > > qemu-system-aarch64: failed to set irq for PMU
> > >
> > > In the kernel log, I see
> > >
> > > [ 42.944952] kvm: pmu event creation failed -2
> > >
> > > I am not sure if this issue is unexpected as a result of this change or
> > > if there is something that needs to change on the arm64 KVM side (it
> > > appears the kernel message comes from arch/arm64/kvm/pmu-emul.c).
> >
> > Thanks for reporting it.
> >
> > Based on these detail, I feel the pmu registration failed in the host,
> > most probably because pmu driver did not pass pmu name while calling
> > perf_pmu_register(). Consequently kvm also failed while trying to use
> > it for guest. Can you please check host kernel logs.
>
> The PMUv3 driver does pass a name, but it relies on getting back an
> allocated pmu id as @type is -1 in the call to perf_pmu_register().
>
> What actually broke is how KVM probes for a default core PMU to use for
> a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> reads the pmu from the returned perf_event. The linear search had the
> effect of eventually stumbling on the correct core PMU and succeeding.
>
> Perf folks: is this WAI for heterogenous systems?
>
> Either way, the whole KVM end of this scheme is a bit clunky, and I
> believe it to be unneccessary at this point as we maintain a list of
> core PMU instances that KVM is able to virtualize. We can just walk
> that to find a default PMU to use.
>
> Not seeing any issues on -next with the below diff. If this works for
> folks I can actually wrap it up in a patch and send it out.
I can start QEMU on both the machines that had issues and my machines
continue to run without any visible issues but I have never done any
profile work within them. If there is any further testing or validation
that I should do, I am more than happy to do so. Until then, consider
it:
Tested-by: Nathan Chancellor <nathan@kernel.org>
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index 45727d50d18d..cbc0b662b7f8 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -694,47 +694,26 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>
> static struct arm_pmu *kvm_pmu_probe_armpmu(void)
> {
> - struct perf_event_attr attr = { };
> - struct perf_event *event;
> - struct arm_pmu *pmu = NULL;
> -
> - /*
> - * Create a dummy event that only counts user cycles. As we'll never
> - * leave this function with the event being live, it will never
> - * count anything. But it allows us to probe some of the PMU
> - * details. Yes, this is terrible.
> - */
> - attr.type = PERF_TYPE_RAW;
> - attr.size = sizeof(attr);
> - attr.pinned = 1;
> - attr.disabled = 0;
> - attr.exclude_user = 0;
> - attr.exclude_kernel = 1;
> - attr.exclude_hv = 1;
> - attr.exclude_host = 1;
> - attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
> - attr.sample_period = GENMASK(63, 0);
> + struct arm_pmu *arm_pmu = NULL, *tmp;
> + struct arm_pmu_entry *entry;
> + int cpu;
>
> - event = perf_event_create_kernel_counter(&attr, -1, current,
> - kvm_pmu_perf_overflow, &attr);
> + mutex_lock(&arm_pmus_lock);
> + cpu = get_cpu();
>
> - if (IS_ERR(event)) {
> - pr_err_once("kvm: pmu event creation failed %ld\n",
> - PTR_ERR(event));
> - return NULL;
> - }
> + list_for_each_entry(entry, &arm_pmus, entry) {
> + tmp = entry->arm_pmu;
>
> - if (event->pmu) {
> - pmu = to_arm_pmu(event->pmu);
> - if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
> - pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
> - pmu = NULL;
> + if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) {
> + arm_pmu = tmp;
> + break;
> + }
> }
>
> - perf_event_disable(event);
> - perf_event_release_kernel(event);
> + put_cpu();
> + mutex_unlock(&arm_pmus_lock);
>
> - return pmu;
> + return arm_pmu;
> }
>
> u64 kvm_pmu_get_pmceid(struct kvm_vcpu *vcpu, bool pmceid1)
>
> --
> Thanks,
> Oliver
On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> The PMUv3 driver does pass a name, but it relies on getting back an
> allocated pmu id as @type is -1 in the call to perf_pmu_register().
>
> What actually broke is how KVM probes for a default core PMU to use for
> a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> reads the pmu from the returned perf_event. The linear search had the
> effect of eventually stumbling on the correct core PMU and succeeding.
>
> Perf folks: is this WAI for heterogenous systems?
TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
sure what ARM64 does here.
IIRC the only way is to hard affine things; that is, force vCPU of
'type' to the pCPU mask of 'type' CPUs.
If you don't do that; or let userspace 'override' that, things go
sideways *real* fast.
Mark gonna have to look at this.
> Either way, the whole KVM end of this scheme is a bit clunky, and I
> believe it to be unneccessary at this point as we maintain a list of
> core PMU instances that KVM is able to virtualize. We can just walk
> that to find a default PMU to use.
>
> Not seeing any issues on -next with the below diff. If this works for
> folks I can actually wrap it up in a patch and send it out.
>
> diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c
> index 45727d50d18d..cbc0b662b7f8 100644
> --- a/arch/arm64/kvm/pmu-emul.c
> +++ b/arch/arm64/kvm/pmu-emul.c
> @@ -694,47 +694,26 @@ void kvm_host_pmu_init(struct arm_pmu *pmu)
>
> static struct arm_pmu *kvm_pmu_probe_armpmu(void)
> {
> - struct perf_event_attr attr = { };
> - struct perf_event *event;
> - struct arm_pmu *pmu = NULL;
> -
> - /*
> - * Create a dummy event that only counts user cycles. As we'll never
> - * leave this function with the event being live, it will never
> - * count anything. But it allows us to probe some of the PMU
> - * details. Yes, this is terrible.
> - */
> - attr.type = PERF_TYPE_RAW;
> - attr.size = sizeof(attr);
> - attr.pinned = 1;
> - attr.disabled = 0;
> - attr.exclude_user = 0;
> - attr.exclude_kernel = 1;
> - attr.exclude_hv = 1;
> - attr.exclude_host = 1;
> - attr.config = ARMV8_PMUV3_PERFCTR_CPU_CYCLES;
> - attr.sample_period = GENMASK(63, 0);
> + struct arm_pmu *arm_pmu = NULL, *tmp;
> + struct arm_pmu_entry *entry;
> + int cpu;
>
> - event = perf_event_create_kernel_counter(&attr, -1, current,
> - kvm_pmu_perf_overflow, &attr);
> + mutex_lock(&arm_pmus_lock);
> + cpu = get_cpu();
>
> - if (IS_ERR(event)) {
> - pr_err_once("kvm: pmu event creation failed %ld\n",
> - PTR_ERR(event));
> - return NULL;
> - }
> + list_for_each_entry(entry, &arm_pmus, entry) {
> + tmp = entry->arm_pmu;
>
> - if (event->pmu) {
> - pmu = to_arm_pmu(event->pmu);
> - if (pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_NI ||
> - pmu->pmuver == ID_AA64DFR0_EL1_PMUVer_IMP_DEF)
> - pmu = NULL;
> + if (cpumask_test_cpu(cpu, &tmp->supported_cpus)) {
> + arm_pmu = tmp;
> + break;
> + }
> }
>
> - perf_event_disable(event);
> - perf_event_release_kernel(event);
> + put_cpu();
> + mutex_unlock(&arm_pmus_lock);
>
> - return pmu;
> + return arm_pmu;
> }
On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote: > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote: > > > The PMUv3 driver does pass a name, but it relies on getting back an > > allocated pmu id as @type is -1 in the call to perf_pmu_register(). > > > > What actually broke is how KVM probes for a default core PMU to use for > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and > > reads the pmu from the returned perf_event. The linear search had the > > effect of eventually stumbling on the correct core PMU and succeeding. > > > > Perf folks: is this WAI for heterogenous systems? > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not > sure what ARM64 does here. > > IIRC the only way is to hard affine things; that is, force vCPU of > 'type' to the pCPU mask of 'type' CPUs. We provide absolutely no illusion of consistency across implementations. Userspace can select the PMU type, and then it is a userspace problem affining vCPUs to the right pCPUs. And if they get that wrong, we just bail and refuse to run the vCPU. > If you don't do that; or let userspace 'override' that, things go > sideways *real* fast. Oh yeah, and I wish PMUs were the only problem with these hetero systems... > Mark gonna have to look at this. Cool. I'll go ahead with the KVM cleanup regardless of the outcome. -- Thanks, Oliver
On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote: > > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote: > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote: > > > > > The PMUv3 driver does pass a name, but it relies on getting back an > > > allocated pmu id as @type is -1 in the call to perf_pmu_register(). > > > > > > What actually broke is how KVM probes for a default core PMU to use for > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and > > > reads the pmu from the returned perf_event. The linear search had the > > > effect of eventually stumbling on the correct core PMU and succeeding. > > > > > > Perf folks: is this WAI for heterogenous systems? > > > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not > > sure what ARM64 does here. > > > > IIRC the only way is to hard affine things; that is, force vCPU of > > 'type' to the pCPU mask of 'type' CPUs. > > We provide absolutely no illusion of consistency across implementations. > Userspace can select the PMU type, and then it is a userspace problem > affining vCPUs to the right pCPUs. > > And if they get that wrong, we just bail and refuse to run the vCPU. > > > If you don't do that; or let userspace 'override' that, things go > > sideways *real* fast. > > Oh yeah, and I wish PMUs were the only problem with these hetero > systems... Just to add some context from what I understand. There are inbuilt type numbers for PMUs: https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34 so the PMU generally called /sys/devices/cpu should have type 4 (ARM give it another name). For heterogeneous ARM there is a single PMU and the same events are programmed regardless of whether it is a big or a little core - the cpumask lists all CPUs. On heterogeneous (aka hybrid) Intel there are two PMUs, the performance cores have a PMU called /sys/devices/cpu_core and it has type 4, the atom cores have a PMU of /sys/devices/cpu_atom and on my Alderlake the type number is 8. The cpu_core and cpu_atom PMUs list the CPUs that are valid for raw style events, where the config values in perf_event_attr contains all of the event programming data. There are also legacy events of PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE where to specify the PMU the type is encoded in the high (and unused) 32-bits of config - so the type would be something like PERF_TYPE_HARDWARE and then config would be "value | (4 << 32)" for the performance core or "value | (8 << 32)" for the atom. If the vCPU and pCPUs mappings vary then there is a chance to change the CPU mask on heterogeneous Intel, but it seems if the event is open and you move from between core types then things are going to break. Thanks, Ian > > Mark gonna have to look at this. > > Cool. I'll go ahead with the KVM cleanup regardless of the outcome. > > -- > Thanks, > Oliver
On Sat, 27 May 2023 00:00:47 +0100,
Ian Rogers <irogers@google.com> wrote:
>
> On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> >
> > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> > >
> > > > The PMUv3 driver does pass a name, but it relies on getting back an
> > > > allocated pmu id as @type is -1 in the call to perf_pmu_register().
> > > >
> > > > What actually broke is how KVM probes for a default core PMU to use for
> > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> > > > reads the pmu from the returned perf_event. The linear search had the
> > > > effect of eventually stumbling on the correct core PMU and succeeding.
> > > >
> > > > Perf folks: is this WAI for heterogenous systems?
> > >
> > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> > > sure what ARM64 does here.
> > >
> > > IIRC the only way is to hard affine things; that is, force vCPU of
> > > 'type' to the pCPU mask of 'type' CPUs.
> >
> > We provide absolutely no illusion of consistency across implementations.
> > Userspace can select the PMU type, and then it is a userspace problem
> > affining vCPUs to the right pCPUs.
> >
> > And if they get that wrong, we just bail and refuse to run the vCPU.
> >
> > > If you don't do that; or let userspace 'override' that, things go
> > > sideways *real* fast.
> >
> > Oh yeah, and I wish PMUs were the only problem with these hetero
> > systems...
>
> Just to add some context from what I understand. There are inbuilt
> type numbers for PMUs:
> https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> give it another name). For heterogeneous ARM there is a single PMU and
> the same events are programmed regardless of whether it is a big or a
> little core - the cpumask lists all CPUs.
I think you misunderstood the way heterogeneous arm64 systems are
described . Each CPU type gets its own PMU type, and its own event
list. Case in point:
$ grep . /sys/devices/*pmu/{type,cpus}
/sys/devices/apple_avalanche_pmu/type:9
/sys/devices/apple_blizzard_pmu/type:8
/sys/devices/apple_avalanche_pmu/cpus:4-9
/sys/devices/apple_blizzard_pmu/cpus:0-3
Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
event number, nothing else.
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Sat, 27 May 2023 00:00:47 +0100,
> Ian Rogers <irogers@google.com> wrote:
> >
> > On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> > >
> > > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> > > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> > > >
> > > > > The PMUv3 driver does pass a name, but it relies on getting back an
> > > > > allocated pmu id as @type is -1 in the call to perf_pmu_register().
> > > > >
> > > > > What actually broke is how KVM probes for a default core PMU to use for
> > > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> > > > > reads the pmu from the returned perf_event. The linear search had the
> > > > > effect of eventually stumbling on the correct core PMU and succeeding.
> > > > >
> > > > > Perf folks: is this WAI for heterogenous systems?
> > > >
> > > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> > > > sure what ARM64 does here.
> > > >
> > > > IIRC the only way is to hard affine things; that is, force vCPU of
> > > > 'type' to the pCPU mask of 'type' CPUs.
> > >
> > > We provide absolutely no illusion of consistency across implementations.
> > > Userspace can select the PMU type, and then it is a userspace problem
> > > affining vCPUs to the right pCPUs.
> > >
> > > And if they get that wrong, we just bail and refuse to run the vCPU.
> > >
> > > > If you don't do that; or let userspace 'override' that, things go
> > > > sideways *real* fast.
> > >
> > > Oh yeah, and I wish PMUs were the only problem with these hetero
> > > systems...
> >
> > Just to add some context from what I understand. There are inbuilt
> > type numbers for PMUs:
> > https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> > so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> > give it another name). For heterogeneous ARM there is a single PMU and
> > the same events are programmed regardless of whether it is a big or a
> > little core - the cpumask lists all CPUs.
>
> I think you misunderstood the way heterogeneous arm64 systems are
> described . Each CPU type gets its own PMU type, and its own event
> list. Case in point:
>
> $ grep . /sys/devices/*pmu/{type,cpus}
> /sys/devices/apple_avalanche_pmu/type:9
> /sys/devices/apple_blizzard_pmu/type:8
> /sys/devices/apple_avalanche_pmu/cpus:4-9
> /sys/devices/apple_blizzard_pmu/cpus:0-3
>
> Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
> event number, nothing else.
Which PMU will a raw event open on? Note, the raw events don't support
the extended type that is present in PERF_TYPE_HARDWARE and
PERF_TYPE_HW_CACHE:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
as the bits are already in use for being just plain config values. I
suspect not being type 4 is a bug on apple ARM here.
Thanks,
Ian
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.
On Sat, 27 May 2023 18:00:13 +0100,
Ian Rogers <irogers@google.com> wrote:
>
> On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Sat, 27 May 2023 00:00:47 +0100,
> > Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> > > >
> > > > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> > > > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> > > > >
> > > > > > The PMUv3 driver does pass a name, but it relies on getting back an
> > > > > > allocated pmu id as @type is -1 in the call to perf_pmu_register().
> > > > > >
> > > > > > What actually broke is how KVM probes for a default core PMU to use for
> > > > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> > > > > > reads the pmu from the returned perf_event. The linear search had the
> > > > > > effect of eventually stumbling on the correct core PMU and succeeding.
> > > > > >
> > > > > > Perf folks: is this WAI for heterogenous systems?
> > > > >
> > > > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> > > > > sure what ARM64 does here.
> > > > >
> > > > > IIRC the only way is to hard affine things; that is, force vCPU of
> > > > > 'type' to the pCPU mask of 'type' CPUs.
> > > >
> > > > We provide absolutely no illusion of consistency across implementations.
> > > > Userspace can select the PMU type, and then it is a userspace problem
> > > > affining vCPUs to the right pCPUs.
> > > >
> > > > And if they get that wrong, we just bail and refuse to run the vCPU.
> > > >
> > > > > If you don't do that; or let userspace 'override' that, things go
> > > > > sideways *real* fast.
> > > >
> > > > Oh yeah, and I wish PMUs were the only problem with these hetero
> > > > systems...
> > >
> > > Just to add some context from what I understand. There are inbuilt
> > > type numbers for PMUs:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> > > so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> > > give it another name). For heterogeneous ARM there is a single PMU and
> > > the same events are programmed regardless of whether it is a big or a
> > > little core - the cpumask lists all CPUs.
> >
> > I think you misunderstood the way heterogeneous arm64 systems are
> > described . Each CPU type gets its own PMU type, and its own event
> > list. Case in point:
> >
> > $ grep . /sys/devices/*pmu/{type,cpus}
> > /sys/devices/apple_avalanche_pmu/type:9
> > /sys/devices/apple_blizzard_pmu/type:8
> > /sys/devices/apple_avalanche_pmu/cpus:4-9
> > /sys/devices/apple_blizzard_pmu/cpus:0-3
> >
> > Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
> > event number, nothing else.
>
> Which PMU will a raw event open on?
On the PMU that matches the current CPU.
> Note, the raw events don't support
> the extended type that is present in PERF_TYPE_HARDWARE and
> PERF_TYPE_HW_CACHE:
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
> as the bits are already in use for being just plain config values.
I'm not sure how relevant this is to the numbering of PMUs on arm64.
> I suspect not being type 4 is a bug on apple ARM here.
If that's a bug on this machine, it's a bug on all machines, at which
point it is the de-facto API:
$ grep . /sys/devices/armv8*/{type,cpus}
/sys/devices/armv8_cortex_a53/type:8
/sys/devices/armv8_cortex_a72/type:9
/sys/devices/armv8_cortex_a53/cpus:0-3
/sys/devices/armv8_cortex_a72/cpus:4-5
See, non-Apple HW. And now for a system with homogeneous CPUs:
$ grep . /sys/devices/armv8*/{type,cpus}
/sys/devices/armv8_pmuv3_0/type:8
/sys/devices/armv8_pmuv3_0/cpus:0-159
Still no type 4. I could go on for hours, I have plenty of HW around
me!
So whatever your source of information is, it doesn't match reality.
Our PMUs are numbered arbitrarily, and have been so for... a very long
time. At least since perf_pmu_register has supported dynamic
registration (see 2e80a82a49c4c).
Thanks,
M.
--
Without deviation from the norm, progress is not possible.
On 5/27/23 20:38, Marc Zyngier wrote:
> On Sat, 27 May 2023 18:00:13 +0100,
> Ian Rogers <irogers@google.com> wrote:
>>
>> On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
>>>
>>> On Sat, 27 May 2023 00:00:47 +0100,
>>> Ian Rogers <irogers@google.com> wrote:
>>>>
>>>> On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
>>>>>
>>>>> On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
>>>>>> On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
>>>>>>
>>>>>>> The PMUv3 driver does pass a name, but it relies on getting back an
>>>>>>> allocated pmu id as @type is -1 in the call to perf_pmu_register().
>>>>>>>
>>>>>>> What actually broke is how KVM probes for a default core PMU to use for
>>>>>>> a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
>>>>>>> reads the pmu from the returned perf_event. The linear search had the
>>>>>>> effect of eventually stumbling on the correct core PMU and succeeding.
>>>>>>>
>>>>>>> Perf folks: is this WAI for heterogenous systems?
>>>>>>
>>>>>> TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
>>>>>> sure what ARM64 does here.
>>>>>>
>>>>>> IIRC the only way is to hard affine things; that is, force vCPU of
>>>>>> 'type' to the pCPU mask of 'type' CPUs.
>>>>>
>>>>> We provide absolutely no illusion of consistency across implementations.
>>>>> Userspace can select the PMU type, and then it is a userspace problem
>>>>> affining vCPUs to the right pCPUs.
>>>>>
>>>>> And if they get that wrong, we just bail and refuse to run the vCPU.
>>>>>
>>>>>> If you don't do that; or let userspace 'override' that, things go
>>>>>> sideways *real* fast.
>>>>>
>>>>> Oh yeah, and I wish PMUs were the only problem with these hetero
>>>>> systems...
>>>>
>>>> Just to add some context from what I understand. There are inbuilt
>>>> type numbers for PMUs:
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
>>>> so the PMU generally called /sys/devices/cpu should have type 4 (ARM
>>>> give it another name). For heterogeneous ARM there is a single PMU and
>>>> the same events are programmed regardless of whether it is a big or a
>>>> little core - the cpumask lists all CPUs.
>>>
>>> I think you misunderstood the way heterogeneous arm64 systems are
>>> described . Each CPU type gets its own PMU type, and its own event
>>> list. Case in point:
>>>
>>> $ grep . /sys/devices/*pmu/{type,cpus}
>>> /sys/devices/apple_avalanche_pmu/type:9
>>> /sys/devices/apple_blizzard_pmu/type:8
>>> /sys/devices/apple_avalanche_pmu/cpus:4-9
>>> /sys/devices/apple_blizzard_pmu/cpus:0-3
>>>
>>> Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
>>> event number, nothing else.
>>
>> Which PMU will a raw event open on?
>
> On the PMU that matches the current CPU.
>
>> Note, the raw events don't support
>> the extended type that is present in PERF_TYPE_HARDWARE and
>> PERF_TYPE_HW_CACHE:
>> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
>> as the bits are already in use for being just plain config values.
>
> I'm not sure how relevant this is to the numbering of PMUs on arm64.
>
>> I suspect not being type 4 is a bug on apple ARM here.
>
> If that's a bug on this machine, it's a bug on all machines, at which
> point it is the de-facto API:
>
> $ grep . /sys/devices/armv8*/{type,cpus}
> /sys/devices/armv8_cortex_a53/type:8
> /sys/devices/armv8_cortex_a72/type:9
> /sys/devices/armv8_cortex_a53/cpus:0-3
> /sys/devices/armv8_cortex_a72/cpus:4-5
>
> See, non-Apple HW. And now for a system with homogeneous CPUs:
>
> $ grep . /sys/devices/armv8*/{type,cpus}
> /sys/devices/armv8_pmuv3_0/type:8
> /sys/devices/armv8_pmuv3_0/cpus:0-159
>
> Still no type 4. I could go on for hours, I have plenty of HW around
> me!
>
> So whatever your source of information is, it doesn't match reality.
> Our PMUs are numbered arbitrarily, and have been so for... a very long
> time. At least since perf_pmu_register has supported dynamic
> registration (see 2e80a82a49c4c).
>
> Thanks,
>
> M.
>
I agree with Marc,
on s390 we have 5 different PMUs and all have arbitrary numbers
and have totally different features:
# ll /sys/devices/{cpum,pai}*/type
-r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_cf_diag/type
-r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_cf/type
-r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_sf/type
-r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/pai_crypto/type
-r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/pai_ext/type
# grep . /sys/devices/{cpum,pai}*/type
/sys/devices/cpum_cf_diag/type:9
/sys/devices/cpum_cf/type:8
/sys/devices/cpum_sf/type:4
/sys/devices/pai_crypto/type:10
/sys/devices/pai_ext/type:11
#
Thanks Thomas
--
Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
--
Vorsitzender des Aufsichtsrats: Gregor Pillen
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
On Tue, May 30, 2023 at 12:45 AM Thomas Richter <tmricht@linux.ibm.com> wrote:
>
> On 5/27/23 20:38, Marc Zyngier wrote:
> > On Sat, 27 May 2023 18:00:13 +0100,
> > Ian Rogers <irogers@google.com> wrote:
> >>
> >> On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
> >>>
> >>> On Sat, 27 May 2023 00:00:47 +0100,
> >>> Ian Rogers <irogers@google.com> wrote:
> >>>>
> >>>> On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> >>>>>
> >>>>> On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> >>>>>> On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> >>>>>>
> >>>>>>> The PMUv3 driver does pass a name, but it relies on getting back an
> >>>>>>> allocated pmu id as @type is -1 in the call to perf_pmu_register().
> >>>>>>>
> >>>>>>> What actually broke is how KVM probes for a default core PMU to use for
> >>>>>>> a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> >>>>>>> reads the pmu from the returned perf_event. The linear search had the
> >>>>>>> effect of eventually stumbling on the correct core PMU and succeeding.
> >>>>>>>
> >>>>>>> Perf folks: is this WAI for heterogenous systems?
> >>>>>>
> >>>>>> TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> >>>>>> sure what ARM64 does here.
> >>>>>>
> >>>>>> IIRC the only way is to hard affine things; that is, force vCPU of
> >>>>>> 'type' to the pCPU mask of 'type' CPUs.
> >>>>>
> >>>>> We provide absolutely no illusion of consistency across implementations.
> >>>>> Userspace can select the PMU type, and then it is a userspace problem
> >>>>> affining vCPUs to the right pCPUs.
> >>>>>
> >>>>> And if they get that wrong, we just bail and refuse to run the vCPU.
> >>>>>
> >>>>>> If you don't do that; or let userspace 'override' that, things go
> >>>>>> sideways *real* fast.
> >>>>>
> >>>>> Oh yeah, and I wish PMUs were the only problem with these hetero
> >>>>> systems...
> >>>>
> >>>> Just to add some context from what I understand. There are inbuilt
> >>>> type numbers for PMUs:
> >>>> https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> >>>> so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> >>>> give it another name). For heterogeneous ARM there is a single PMU and
> >>>> the same events are programmed regardless of whether it is a big or a
> >>>> little core - the cpumask lists all CPUs.
> >>>
> >>> I think you misunderstood the way heterogeneous arm64 systems are
> >>> described . Each CPU type gets its own PMU type, and its own event
> >>> list. Case in point:
> >>>
> >>> $ grep . /sys/devices/*pmu/{type,cpus}
> >>> /sys/devices/apple_avalanche_pmu/type:9
> >>> /sys/devices/apple_blizzard_pmu/type:8
> >>> /sys/devices/apple_avalanche_pmu/cpus:4-9
> >>> /sys/devices/apple_blizzard_pmu/cpus:0-3
> >>>
> >>> Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
> >>> event number, nothing else.
> >>
> >> Which PMU will a raw event open on?
> >
> > On the PMU that matches the current CPU.
> >
> >> Note, the raw events don't support
> >> the extended type that is present in PERF_TYPE_HARDWARE and
> >> PERF_TYPE_HW_CACHE:
> >> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
> >> as the bits are already in use for being just plain config values.
> >
> > I'm not sure how relevant this is to the numbering of PMUs on arm64.
> >
> >> I suspect not being type 4 is a bug on apple ARM here.
> >
> > If that's a bug on this machine, it's a bug on all machines, at which
> > point it is the de-facto API:
> >
> > $ grep . /sys/devices/armv8*/{type,cpus}
> > /sys/devices/armv8_cortex_a53/type:8
> > /sys/devices/armv8_cortex_a72/type:9
> > /sys/devices/armv8_cortex_a53/cpus:0-3
> > /sys/devices/armv8_cortex_a72/cpus:4-5
> >
> > See, non-Apple HW. And now for a system with homogeneous CPUs:
> >
> > $ grep . /sys/devices/armv8*/{type,cpus}
> > /sys/devices/armv8_pmuv3_0/type:8
> > /sys/devices/armv8_pmuv3_0/cpus:0-159
> >
> > Still no type 4. I could go on for hours, I have plenty of HW around
> > me!
> >
> > So whatever your source of information is, it doesn't match reality.
> > Our PMUs are numbered arbitrarily, and have been so for... a very long
> > time. At least since perf_pmu_register has supported dynamic
> > registration (see 2e80a82a49c4c).
> >
> > Thanks,
> >
> > M.
> >
>
>
> I agree with Marc,
> on s390 we have 5 different PMUs and all have arbitrary numbers
> and have totally different features:
>
> # ll /sys/devices/{cpum,pai}*/type
> -r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_cf_diag/type
> -r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_cf/type
> -r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/cpum_sf/type
> -r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/pai_crypto/type
> -r--r--r-- 1 root root 4096 May 30 09:33 /sys/devices/pai_ext/type
> # grep . /sys/devices/{cpum,pai}*/type
> /sys/devices/cpum_cf_diag/type:9
> /sys/devices/cpum_cf/type:8
> /sys/devices/cpum_sf/type:4
> /sys/devices/pai_crypto/type:10
> /sys/devices/pai_ext/type:11
> #
Thanks Thomas, could you:
$ ls /sys/devices/*/cpu*
The assumption is that if a file called "cpus" exists then this a
"core" PMU, while "cpumask" would indicate an uncore PMU. The
exception is /sys/devices/cpu where there is no cpus but all online
CPUs are assumed to be in "cpus" cpu map. On Intel one of the core
PMUs has type 4 which aligns with the perf_event.h "ABI" constant
PERF_TYPE_RAW, so opening a type 4 event will open it on that PMU. The
question is I have is what should be the behavior be on non-Intel for
type 4, for the big.little/hybrid case it seems opening it on core
PMUs would make most sense and is what is done for PERF_TYPE_HARDWARE
and PERF_TYPE_HW_CACHE.
Thanks,
Ian
> Thanks Thomas
> --
> Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
> --
> Vorsitzender des Aufsichtsrats: Gregor Pillen
> Geschäftsführung: David Faller
> Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
>
On 5/30/23 16:00, Ian Rogers wrote:
> ls /sys/devices/*/cpu*
Hi Ian,
here is the output yo requested:
# ls /sys/devices/*/cpu*
cpu0 cpu3 cpu6 hotplug modalias possible smt
cpu1 cpu4 cpu7 isolated offline present uevent
cpu2 cpu5 dispatching kernel_max online rescan vulnerabilities
#
In fact it is the same as
# ls /sys/devices/system/cpu/
cpu0 cpu3 cpu6 hotplug modalias possible smt
cpu1 cpu4 cpu7 isolated offline present uevent
cpu2 cpu5 dispatching kernel_max online rescan vulnerabilities
#
This directory tree has nothing to do with events for perf, it is
merely used to support CPU hotplug on s390.
The PMUs on s390 are
# ls -ld /sys/devices/{cpum_,pai_}*
drwxr-xr-x 4 root root 0 May 25 15:45 /sys/devices/cpum_cf
drwxr-xr-x 4 root root 0 May 25 15:45 /sys/devices/cpum_cf_diag
drwxr-xr-x 4 root root 0 May 25 15:45 /sys/devices/cpum_sf
drwxr-xr-x 4 root root 0 May 25 15:45 /sys/devices/pai_crypto
drwxr-xr-x 4 root root 0 May 25 15:45 /sys/devices/pai_ext
#
I hope his helps.
--
Thomas Richter, Dept 3303, IBM s390 Linux Development, Boeblingen, Germany
--
Vorsitzender des Aufsichtsrats: Gregor Pillen
Geschäftsführung: David Faller
Sitz der Gesellschaft: Böblingen / Registergericht: Amtsgericht Stuttgart, HRB 243294
On Sat, May 27, 2023 at 11:38 AM Marc Zyngier <maz@kernel.org> wrote:
>
> On Sat, 27 May 2023 18:00:13 +0100,
> Ian Rogers <irogers@google.com> wrote:
> >
> > On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
> > >
> > > On Sat, 27 May 2023 00:00:47 +0100,
> > > Ian Rogers <irogers@google.com> wrote:
> > > >
> > > > On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> > > > >
> > > > > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> > > > > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> > > > > >
> > > > > > > The PMUv3 driver does pass a name, but it relies on getting back an
> > > > > > > allocated pmu id as @type is -1 in the call to perf_pmu_register().
> > > > > > >
> > > > > > > What actually broke is how KVM probes for a default core PMU to use for
> > > > > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> > > > > > > reads the pmu from the returned perf_event. The linear search had the
> > > > > > > effect of eventually stumbling on the correct core PMU and succeeding.
> > > > > > >
> > > > > > > Perf folks: is this WAI for heterogenous systems?
> > > > > >
> > > > > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> > > > > > sure what ARM64 does here.
> > > > > >
> > > > > > IIRC the only way is to hard affine things; that is, force vCPU of
> > > > > > 'type' to the pCPU mask of 'type' CPUs.
> > > > >
> > > > > We provide absolutely no illusion of consistency across implementations.
> > > > > Userspace can select the PMU type, and then it is a userspace problem
> > > > > affining vCPUs to the right pCPUs.
> > > > >
> > > > > And if they get that wrong, we just bail and refuse to run the vCPU.
> > > > >
> > > > > > If you don't do that; or let userspace 'override' that, things go
> > > > > > sideways *real* fast.
> > > > >
> > > > > Oh yeah, and I wish PMUs were the only problem with these hetero
> > > > > systems...
> > > >
> > > > Just to add some context from what I understand. There are inbuilt
> > > > type numbers for PMUs:
> > > > https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> > > > so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> > > > give it another name). For heterogeneous ARM there is a single PMU and
> > > > the same events are programmed regardless of whether it is a big or a
> > > > little core - the cpumask lists all CPUs.
> > >
> > > I think you misunderstood the way heterogeneous arm64 systems are
> > > described . Each CPU type gets its own PMU type, and its own event
> > > list. Case in point:
> > >
> > > $ grep . /sys/devices/*pmu/{type,cpus}
> > > /sys/devices/apple_avalanche_pmu/type:9
> > > /sys/devices/apple_blizzard_pmu/type:8
> > > /sys/devices/apple_avalanche_pmu/cpus:4-9
> > > /sys/devices/apple_blizzard_pmu/cpus:0-3
> > >
> > > Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
> > > event number, nothing else.
> >
> > Which PMU will a raw event open on?
>
> On the PMU that matches the current CPU.
Thanks. This should really be captured in the man page. Presumably
that's the behavior with the -1 any CPU, and if a CPU is specified
then the selected PMU will match that CPU?
> > Note, the raw events don't support
> > the extended type that is present in PERF_TYPE_HARDWARE and
> > PERF_TYPE_HW_CACHE:
> > https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
> > as the bits are already in use for being just plain config values.
>
> I'm not sure how relevant this is to the numbering of PMUs on arm64.
It matters because on the tool side you can group events, for example
if you are measuring IPC then it makes sense that the instructions and
cycles events are both multiplexed as part of a group. The
instructions and cycles events have a type of PERF_TYPE_HARDWARE as
this is hard coded into the event parser in perf. For perf we may
specify this as:
perf stat -e '{cycles,instructions}' ...
When we open the events on a heterogeneous system the expectation is
we get a group of cycles and instructions for each PMU, the extended
type in the attribute for each event will be set to the type of the
PMU the group is targeting. When we parse the events we have cycles
events per PMU then instructions events per PMU, we then resort and
regroup the events as you can't have a group spanning PMUs except in a
few special cases like software events.
Now let's say we want to do a raw event from the json files, on Intel
let's say we choose the INST_RETIRED.ANY event and we want to measure
it with cycles:
perf stat -e '{cycles,inst_retired.any}' ...
The INST_RETIRED.ANY event will be matched on Intel with the PMUs
"cpu_core" and "cpu_atom" on heterogeneous systems. The cycles event
will be opened on two PMUs and the extended type set to each one. So
we have differing perf_event_attr types, but we know it is valid to
group the events because the raw event encoding's PMU type can be
matched to the extended type of the hardware event.
To better support the wildcard opening, sorting and grouping
operations of events I want to have an ability in the perf tool to map
a type number to a list of PMUs. Mapping a hardware/hw_cache type
number to PMUs should give the set of core PMUs for a heterogeneous
system. On heterogeneous Intel mapping a raw event (type 4) will give
"cpu_core" but it sounds like on ARM, if no sysfs PMU has type number
4, then we should map 4 to the set of core PMUs.
Mapping a type number to a set of perf tool's representation of PMUs
isn't current behavior, but hopefully you can see that ARM's behavior
is differing from Intel's, for which I'd been basing the
implementation so that we can support heterogeneous metrics, etc.
I'm also confused what should be the behavior of something like:
perf stat -e r0 ...
The code is going to open the event on every core PMU with the config set to 0:
https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/tools/perf/util/parse-events.c?h=perf-tools-next#n1485
But perhaps the intent on ARM is that a single raw type event is
opened? It seems the every core PMU behavior is more useful.
> > I suspect not being type 4 is a bug on apple ARM here.
>
> If that's a bug on this machine, it's a bug on all machines, at which
> point it is the de-facto API:
>
> $ grep . /sys/devices/armv8*/{type,cpus}
> /sys/devices/armv8_cortex_a53/type:8
> /sys/devices/armv8_cortex_a72/type:9
> /sys/devices/armv8_cortex_a53/cpus:0-3
> /sys/devices/armv8_cortex_a72/cpus:4-5
>
> See, non-Apple HW. And now for a system with homogeneous CPUs:
>
> $ grep . /sys/devices/armv8*/{type,cpus}
> /sys/devices/armv8_pmuv3_0/type:8
> /sys/devices/armv8_pmuv3_0/cpus:0-159
>
> Still no type 4. I could go on for hours, I have plenty of HW around
> me!
This is good to know, thanks for doing the research. I agree that it
is a de-facto API as this has happened. I'm hoping in the future we
can compress some /sys/devices directories and use them as test input.
It obviously won't be possible to open the events, etc. but we can
test things like the event/metric parsing matches an expectation.
Thanks,
Ian
> So whatever your source of information is, it doesn't match reality.
> Our PMUs are numbered arbitrarily, and have been so for... a very long
> time. At least since perf_pmu_register has supported dynamic
> registration (see 2e80a82a49c4c).
>
> Thanks,
>
> M.
>
> --
> Without deviation from the norm, progress is not possible.
On Sat, May 27, 2023 at 10:00 AM Ian Rogers <irogers@google.com> wrote:
>
> On Sat, May 27, 2023 at 6:32 AM Marc Zyngier <maz@kernel.org> wrote:
> >
> > On Sat, 27 May 2023 00:00:47 +0100,
> > Ian Rogers <irogers@google.com> wrote:
> > >
> > > On Thu, May 25, 2023 at 8:56 AM Oliver Upton <oliver.upton@linux.dev> wrote:
> > > >
> > > > On Thu, May 25, 2023 at 04:20:31PM +0200, Peter Zijlstra wrote:
> > > > > On Thu, May 25, 2023 at 07:11:41AM +0000, Oliver Upton wrote:
> > > > >
> > > > > > The PMUv3 driver does pass a name, but it relies on getting back an
> > > > > > allocated pmu id as @type is -1 in the call to perf_pmu_register().
> > > > > >
> > > > > > What actually broke is how KVM probes for a default core PMU to use for
> > > > > > a guest. kvm_pmu_probe_armpmu() creates a counter w/ PERF_TYPE_RAW and
> > > > > > reads the pmu from the returned perf_event. The linear search had the
> > > > > > effect of eventually stumbling on the correct core PMU and succeeding.
> > > > > >
> > > > > > Perf folks: is this WAI for heterogenous systems?
> > > > >
> > > > > TBH, I'm not sure. hetero and virt don't mix very well AFAIK and I'm not
> > > > > sure what ARM64 does here.
> > > > >
> > > > > IIRC the only way is to hard affine things; that is, force vCPU of
> > > > > 'type' to the pCPU mask of 'type' CPUs.
> > > >
> > > > We provide absolutely no illusion of consistency across implementations.
> > > > Userspace can select the PMU type, and then it is a userspace problem
> > > > affining vCPUs to the right pCPUs.
> > > >
> > > > And if they get that wrong, we just bail and refuse to run the vCPU.
> > > >
> > > > > If you don't do that; or let userspace 'override' that, things go
> > > > > sideways *real* fast.
> > > >
> > > > Oh yeah, and I wish PMUs were the only problem with these hetero
> > > > systems...
> > >
> > > Just to add some context from what I understand. There are inbuilt
> > > type numbers for PMUs:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/acme/linux.git/tree/include/uapi/linux/perf_event.h?h=perf-tools-next#n34
> > > so the PMU generally called /sys/devices/cpu should have type 4 (ARM
> > > give it another name). For heterogeneous ARM there is a single PMU and
> > > the same events are programmed regardless of whether it is a big or a
> > > little core - the cpumask lists all CPUs.
> >
> > I think you misunderstood the way heterogeneous arm64 systems are
> > described . Each CPU type gets its own PMU type, and its own event
> > list. Case in point:
> >
> > $ grep . /sys/devices/*pmu/{type,cpus}
> > /sys/devices/apple_avalanche_pmu/type:9
> > /sys/devices/apple_blizzard_pmu/type:8
> > /sys/devices/apple_avalanche_pmu/cpus:4-9
> > /sys/devices/apple_blizzard_pmu/cpus:0-3
> >
> > Type 4 (aka PERF_EVENT_RAW) is AFAICT just a way to encode the raw
> > event number, nothing else.
>
> Which PMU will a raw event open on? Note, the raw events don't support
> the extended type that is present in PERF_TYPE_HARDWARE and
> PERF_TYPE_HW_CACHE:
> https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/include/uapi/linux/perf_event.h#n41
> as the bits are already in use for being just plain config values. I
> suspect not being type 4 is a bug on apple ARM here.
>
> Thanks,
> Ian
Btw, thanks for correcting me! :-D These assumptions are under
documented/tested and nobody wants to make interfaces/tools that are
broken. One source of information is:
https://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git/tree/Documentation/admin-guide/perf
but it doesn't capture this.
Thanks,
Ian
> > Thanks,
> >
> > M.
> >
> > --
> > Without deviation from the norm, progress is not possible.
The following commit has been merged into the perf/core branch of tip:
Commit-ID: 9551fbb64d094cc105964716224adeb7765df8fd
Gitweb: https://git.kernel.org/tip/9551fbb64d094cc105964716224adeb7765df8fd
Author: Ravi Bangoria <ravi.bangoria@amd.com>
AuthorDate: Thu, 04 May 2023 16:30:02 +05:30
Committer: Peter Zijlstra <peterz@infradead.org>
CommitterDate: Mon, 08 May 2023 10:58:31 +02:00
perf/core: Remove pmu linear searching code
Searching for the right pmu by iterating over all pmus is no longer
required since all pmus now *must* be present in the 'pmu_idr' list.
So, remove linear searching code.
Signed-off-by: Ravi Bangoria <ravi.bangoria@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230504110003.2548-4-ravi.bangoria@amd.com
---
kernel/events/core.c | 37 +++++++++++++------------------------
1 file changed, 13 insertions(+), 24 deletions(-)
diff --git a/kernel/events/core.c b/kernel/events/core.c
index c01bbe9..231b187 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -11630,38 +11630,27 @@ static struct pmu *perf_init_event(struct perf_event *event)
}
again:
+ ret = -ENOENT;
rcu_read_lock();
pmu = idr_find(&pmu_idr, type);
rcu_read_unlock();
- if (pmu) {
- if (event->attr.type != type && type != PERF_TYPE_RAW &&
- !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
- goto fail;
-
- ret = perf_try_init_event(pmu, event);
- if (ret == -ENOENT && event->attr.type != type && !extended_type) {
- type = event->attr.type;
- goto again;
- }
+ if (!pmu)
+ goto fail;
- if (ret)
- pmu = ERR_PTR(ret);
+ if (event->attr.type != type && type != PERF_TYPE_RAW &&
+ !(pmu->capabilities & PERF_PMU_CAP_EXTENDED_HW_TYPE))
+ goto fail;
- goto unlock;
+ ret = perf_try_init_event(pmu, event);
+ if (ret == -ENOENT && event->attr.type != type && !extended_type) {
+ type = event->attr.type;
+ goto again;
}
- list_for_each_entry_rcu(pmu, &pmus, entry, lockdep_is_held(&pmus_srcu)) {
- ret = perf_try_init_event(pmu, event);
- if (!ret)
- goto unlock;
-
- if (ret != -ENOENT) {
- pmu = ERR_PTR(ret);
- goto unlock;
- }
- }
fail:
- pmu = ERR_PTR(-ENOENT);
+ if (ret)
+ pmu = ERR_PTR(ret);
+
unlock:
srcu_read_unlock(&pmus_srcu, idx);
© 2016 - 2025 Red Hat, Inc.