[PATCH v3 3/3] xenpm: Add get-intel-temp subcommand

Teddy Astie posted 3 patches 3 days, 2 hours ago
[PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Teddy Astie 3 days, 2 hours ago
get-intel-temp allows querying the per-core CPU temperature and
per-package one on Intel processors (as usual Dom0 drivers cannot
work due to misalignment between Dom0 vCPU and pCPUs).

Signed-off-by: Teddy Astie <teddy.astie@vates.tech>
---
 tools/misc/xenpm.c | 91 +++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 90 insertions(+), 1 deletion(-)

diff --git a/tools/misc/xenpm.c b/tools/misc/xenpm.c
index 682d092479..1558f6c250 100644
--- a/tools/misc/xenpm.c
+++ b/tools/misc/xenpm.c
@@ -30,6 +30,7 @@
 #include <inttypes.h>
 #include <sys/time.h>
 
+#include <xen/asm/msr-index.h>
 #include <xen-tools/common-macros.h>
 
 #define MAX_PKG_RESIDENCIES 12
@@ -37,6 +38,7 @@
 
 static xc_interface *xc_handle;
 static unsigned int max_cpu_nr;
+static xc_physinfo_t physinfo;
 
 /* help message */
 void show_help(void)
@@ -93,6 +95,7 @@ void show_help(void)
             "                                           units default to \"us\" if unspecified.\n"
             "                                           truncates un-representable values.\n"
             "                                           0 lets the hardware decide.\n"
+            " get-intel-temp        [cpuid]       get Intel CPU temperature of <cpuid> or all\n"
             " start [seconds]                     start collect Cx/Px statistics,\n"
             "                                     output after CTRL-C or SIGINT or several seconds.\n"
             " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
@@ -1354,6 +1357,92 @@ void enable_turbo_mode(int argc, char *argv[])
                 errno, strerror(errno));
 }
 
+static int fetch_dts_temp(xc_interface *xch, uint32_t cpu, bool package, int *temp)
+{
+    xc_resource_entry_t entries[2] = {
+        (xc_resource_entry_t){
+            .idx = package ? MSR_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS
+        },
+        (xc_resource_entry_t){ .idx = MSR_TEMPERATURE_TARGET },
+    };
+    struct xc_resource_op ops = {
+        .cpu = cpu,
+        .entries = entries,
+        .nr_entries = 2,
+    };
+    int tjmax;
+
+    int ret = xc_resource_op(xch, 1, &ops);
+
+    if ( ret <= 0 )
+        /* This CPU isn't online or can't query this MSR */
+        return ret ?: -EOPNOTSUPP;
+
+    if ( ret == 2 )
+        tjmax = (entries[1].val >> 16) & 0xff;
+    else
+    {
+        /*
+         * The CPU doesn't support MSR_IA32_TEMPERATURE_TARGET, we assume it's 100 which
+         * is correct aside a few selected Atom CPUs. Check coretemp source code for more
+         * information.
+         */
+        fprintf(stderr, "[CPU%d] MSR_IA32_TEMPERATURE_TARGET is not supported, assume "
+                "tjmax=100°C, readings may be incorrect\n", cpu);
+        tjmax = 100;
+    }
+    
+    *temp = tjmax - ((entries[0].val >> 16) & 0xff);
+    return 0;
+}
+
+
+void get_intel_temp(int argc, char *argv[])
+{
+    int temp, cpu = -1;
+    unsigned int socket;
+    bool has_data = false;
+
+    if ( argc > 0 )
+        parse_cpuid(argv[0], &cpu);
+
+    if ( cpu != -1 )
+    {
+        if ( !fetch_dts_temp(xc_handle, cpu, false, &temp) )
+            printf("CPU%d: %d°C\n", cpu, temp);
+        else
+            printf("No data\n");
+        return;
+    }
+
+    /* Per socket measurement */
+    for ( socket = 0, cpu = 0; cpu < max_cpu_nr;
+          socket++, cpu += physinfo.cores_per_socket * physinfo.threads_per_core )
+    {
+        if ( !fetch_dts_temp(xc_handle, cpu, true, &temp) )
+        {
+            has_data = true;
+            printf("Package%d: %d°C\n", socket, temp);
+        }
+    }
+
+    if ( has_data )
+        /* Avoid inserting a trailing line if we have nothing */
+        printf("\n");
+
+    for ( cpu = 0; cpu < max_cpu_nr; cpu += physinfo.threads_per_core )
+    {
+        if ( fetch_dts_temp(xc_handle, cpu, false, &temp) )
+            continue;
+
+        has_data = true;
+        printf("CPU%d: %d°C\n", cpu, temp);
+    }
+
+    if ( !has_data )
+        printf("No data\n");
+}
+
 void disable_turbo_mode(int argc, char *argv[])
 {
     int cpuid = -1;
@@ -1618,12 +1707,12 @@ struct {
     { "set-max-cstate", set_max_cstate_func},
     { "enable-turbo-mode", enable_turbo_mode },
     { "disable-turbo-mode", disable_turbo_mode },
+    { "get-intel-temp", get_intel_temp },
 };
 
 int main(int argc, char *argv[])
 {
     int i, ret = 0;
-    xc_physinfo_t physinfo;
     int nr_matches = 0;
     int matches_main_options[ARRAY_SIZE(main_options)];
 
-- 
2.51.2



--
Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Re: [PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Jan Beulich 2 days, 10 hours ago
On 09.12.2025 18:19, Teddy Astie wrote:
> --- a/tools/misc/xenpm.c
> +++ b/tools/misc/xenpm.c
> @@ -30,6 +30,7 @@
>  #include <inttypes.h>
>  #include <sys/time.h>
>  
> +#include <xen/asm/msr-index.h>

For this to not break non-x86 builds, don't you need to constrain the building
of the tool to CONFIG_X86? (I have no clue why it is being built for Arm as
well right now, as I don't see how it could provide any value there.)

> @@ -37,6 +38,7 @@
>  
>  static xc_interface *xc_handle;
>  static unsigned int max_cpu_nr;
> +static xc_physinfo_t physinfo;
>  
>  /* help message */
>  void show_help(void)
> @@ -93,6 +95,7 @@ void show_help(void)
>              "                                           units default to \"us\" if unspecified.\n"
>              "                                           truncates un-representable values.\n"
>              "                                           0 lets the hardware decide.\n"
> +            " get-intel-temp        [cpuid]       get Intel CPU temperature of <cpuid> or all\n"
>              " start [seconds]                     start collect Cx/Px statistics,\n"
>              "                                     output after CTRL-C or SIGINT or several seconds.\n"
>              " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
> @@ -1354,6 +1357,92 @@ void enable_turbo_mode(int argc, char *argv[])
>                  errno, strerror(errno));
>  }
>  
> +static int fetch_dts_temp(xc_interface *xch, uint32_t cpu, bool package, int *temp)
> +{
> +    xc_resource_entry_t entries[2] = {

Is the 2 actually useful to have here?

> +        (xc_resource_entry_t){

Why these type specifiers? They shouldn't be needed in initializers (while they
would be needed in assignments)?

> +            .idx = package ? MSR_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS
> +        },
> +        (xc_resource_entry_t){ .idx = MSR_TEMPERATURE_TARGET },
> +    };
> +    struct xc_resource_op ops = {
> +        .cpu = cpu,
> +        .entries = entries,
> +        .nr_entries = 2,

ARRAY_SIZE() please.

> +    };
> +    int tjmax;
> +
> +    int ret = xc_resource_op(xch, 1, &ops);
> +
> +    if ( ret <= 0 )
> +        /* This CPU isn't online or can't query this MSR */
> +        return ret ?: -EOPNOTSUPP;

xc_resource_op() doesn't return errno values, so by using -EOPNOTSUPP here you
put the caller into a difficult position when actually looking at the return
value: Does -1 mean -1 or -EPERM?

> +    if ( ret == 2 )
> +        tjmax = (entries[1].val >> 16) & 0xff;
> +    else
> +    {
> +        /*
> +         * The CPU doesn't support MSR_IA32_TEMPERATURE_TARGET, we assume it's 100 which
> +         * is correct aside a few selected Atom CPUs. Check coretemp source code for more
> +         * information.
> +         */

What is "coretemp source code" in xen.git context? (I understand you mean the
Linux driver, but that also needs saying then.)

Further please respect line length limits, also ...

> +        fprintf(stderr, "[CPU%d] MSR_IA32_TEMPERATURE_TARGET is not supported, assume "

... e.g. here.

Additionally there are still IA32 infixes here.

Finally, if this message triggers once on a system, it'll likely trigger once
per get_intel_temp()'s loop iteration. Feels like a lot of (potential) noise.

> +                "tjmax=100°C, readings may be incorrect\n", cpu);
> +        tjmax = 100;
> +    }
> +    
> +    *temp = tjmax - ((entries[0].val >> 16) & 0xff);
> +    return 0;
> +}
> +
> +
> +void get_intel_temp(int argc, char *argv[])

static?

> +{
> +    int temp, cpu = -1;
> +    unsigned int socket;
> +    bool has_data = false;
> +
> +    if ( argc > 0 )
> +        parse_cpuid(argv[0], &cpu);
> +
> +    if ( cpu != -1 )
> +    {
> +        if ( !fetch_dts_temp(xc_handle, cpu, false, &temp) )
> +            printf("CPU%d: %d°C\n", cpu, temp);
> +        else
> +            printf("No data\n");
> +        return;
> +    }
> +
> +    /* Per socket measurement */
> +    for ( socket = 0, cpu = 0; cpu < max_cpu_nr;
> +          socket++, cpu += physinfo.cores_per_socket * physinfo.threads_per_core )
> +    {
> +        if ( !fetch_dts_temp(xc_handle, cpu, true, &temp) )
> +        {
> +            has_data = true;
> +            printf("Package%d: %d°C\n", socket, temp);

%u please for socket.

Jan

Re: [PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Teddy Astie 2 days, 8 hours ago
Le 10/12/2025 à 09:50, Jan Beulich a écrit :
> xc_resource_op() doesn't return errno values, so by using -EOPNOTSUPP here you
> put the caller into a difficult position when actually looking at the return
> value: Does -1 mean -1 or -EPERM?

That's a bit unfortunate as xc_resource_op() can return either -1 or 
some -errno; so -1 could be either -EPERM or a internal failure of 
xc_resource_op and we can't really know.

One thing is certain is that if xc_resource_op() returns ret >= 0, there 
is no failure aside that we may have no data (e.g no MSR yields a value).


--
Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Re: [PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Jan Beulich 2 days, 8 hours ago
On 10.12.2025 11:37, Teddy Astie wrote:
> Le 10/12/2025 à 09:50, Jan Beulich a écrit :
>> xc_resource_op() doesn't return errno values, so by using -EOPNOTSUPP here you
>> put the caller into a difficult position when actually looking at the return
>> value: Does -1 mean -1 or -EPERM?
> 
> That's a bit unfortunate as xc_resource_op() can return either -1 or 
> some -errno; so -1 could be either -EPERM or a internal failure of 
> xc_resource_op and we can't really know.

Can it? Assuming do_platform_op() and do_multicall_op() behave correctly,
I can't see any problematic return.

Jan

Re: [PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Teddy Astie 2 days, 9 hours ago
Le 10/12/2025 à 09:50, Jan Beulich a écrit :
> On 09.12.2025 18:19, Teddy Astie wrote:
>> --- a/tools/misc/xenpm.c
>> +++ b/tools/misc/xenpm.c
>> @@ -30,6 +30,7 @@
>>   #include <inttypes.h>
>>   #include <sys/time.h>
>>   
>> +#include <xen/asm/msr-index.h>
> 
> For this to not break non-x86 builds, don't you need to constrain the building
> of the tool to CONFIG_X86? (I have no clue why it is being built for Arm as
> well right now, as I don't see how it could provide any value there.)
> 

I don't know what are the plans on that area for ARM, the only thing 
that seems supported right now is "get-cpu-topology".

>> @@ -37,6 +38,7 @@
>>   
>>   static xc_interface *xc_handle;
>>   static unsigned int max_cpu_nr;
>> +static xc_physinfo_t physinfo;
>>   
>>   /* help message */
>>   void show_help(void)
>> @@ -93,6 +95,7 @@ void show_help(void)
>>               "                                           units default to \"us\" if unspecified.\n"
>>               "                                           truncates un-representable values.\n"
>>               "                                           0 lets the hardware decide.\n"
>> +            " get-intel-temp        [cpuid]       get Intel CPU temperature of <cpuid> or all\n"
>>               " start [seconds]                     start collect Cx/Px statistics,\n"
>>               "                                     output after CTRL-C or SIGINT or several seconds.\n"
>>               " enable-turbo-mode     [cpuid]       enable Turbo Mode for processors that support it.\n"
>> @@ -1354,6 +1357,92 @@ void enable_turbo_mode(int argc, char *argv[])
>>                   errno, strerror(errno));
>>   }
>>   
>> +static int fetch_dts_temp(xc_interface *xch, uint32_t cpu, bool package, int *temp)
>> +{
>> +    xc_resource_entry_t entries[2] = {
> 
> Is the 2 actually useful to have here?
> 
>> +        (xc_resource_entry_t){
> 
> Why these type specifiers? They shouldn't be needed in initializers (while they
> would be needed in assignments)?
> 

I wasn't aware that omitting this (xc_resource_entry_t) was accepted in 
this case.

>> +            .idx = package ? MSR_PACKAGE_THERM_STATUS : MSR_IA32_THERM_STATUS
>> +        },
>> +        (xc_resource_entry_t){ .idx = MSR_TEMPERATURE_TARGET },
>> +    };
>> +    struct xc_resource_op ops = {
>> +        .cpu = cpu,
>> +        .entries = entries,
>> +        .nr_entries = 2,
> 
> ARRAY_SIZE() please.
> 
>> +    };
>> +    int tjmax;
>> +
>> +    int ret = xc_resource_op(xch, 1, &ops);
>> +
>> +    if ( ret <= 0 )
>> +        /* This CPU isn't online or can't query this MSR */
>> +        return ret ?: -EOPNOTSUPP;
> 
> xc_resource_op() doesn't return errno values, so by using -EOPNOTSUPP here you
> put the caller into a difficult position when actually looking at the return
> value: Does -1 mean -1 or -EPERM?
> 
>> +    if ( ret == 2 )
>> +        tjmax = (entries[1].val >> 16) & 0xff;
>> +    else
>> +    {
>> +        /*
>> +         * The CPU doesn't support MSR_IA32_TEMPERATURE_TARGET, we assume it's 100 which
>> +         * is correct aside a few selected Atom CPUs. Check coretemp source code for more
>> +         * information.
>> +         */
> 
> What is "coretemp source code" in xen.git context? (I understand you mean the
> Linux driver, but that also needs saying then.)
> 

Is "Linux kernel's coretemp.c" better ?

> Further please respect line length limits, also ...
> 
>> +        fprintf(stderr, "[CPU%d] MSR_IA32_TEMPERATURE_TARGET is not supported, assume "
> 
> ... e.g. here.
> 
> Additionally there are still IA32 infixes here.
> 
> Finally, if this message triggers once on a system, it'll likely trigger once
> per get_intel_temp()'s loop iteration. Feels like a lot of (potential) noise.
> 

In principle, different CPUs can have different results here. But we can 
still try to only display the message once (by using a static bool ?) as 
affected hardware will probably be quite uniform in that regard.

>> +                "tjmax=100°C, readings may be incorrect\n", cpu);
>> +        tjmax = 100;
>> +    }
>> +
>> +    *temp = tjmax - ((entries[0].val >> 16) & 0xff);
>> +    return 0;
>> +}
>> +
>> +
>> +void get_intel_temp(int argc, char *argv[])
> 
> static?
> 
>> +{
>> +    int temp, cpu = -1;
>> +    unsigned int socket;
>> +    bool has_data = false;
>> +
>> +    if ( argc > 0 )
>> +        parse_cpuid(argv[0], &cpu);
>> +
>> +    if ( cpu != -1 )
>> +    {
>> +        if ( !fetch_dts_temp(xc_handle, cpu, false, &temp) )
>> +            printf("CPU%d: %d°C\n", cpu, temp);
>> +        else
>> +            printf("No data\n");
>> +        return;
>> +    }
>> +
>> +    /* Per socket measurement */
>> +    for ( socket = 0, cpu = 0; cpu < max_cpu_nr;
>> +          socket++, cpu += physinfo.cores_per_socket * physinfo.threads_per_core )
>> +    {
>> +        if ( !fetch_dts_temp(xc_handle, cpu, true, &temp) )
>> +        {
>> +            has_data = true;
>> +            printf("Package%d: %d°C\n", socket, temp);
> 
> %u please for socket.
> 

Ok for this (and the other adjustments).

> Jan



--
Teddy Astie | Vates XCP-ng Developer

XCP-ng & Xen Orchestra - Vates solutions

web: https://vates.tech
Re: [PATCH v3 3/3] xenpm: Add get-intel-temp subcommand
Posted by Jan Beulich 2 days, 8 hours ago
On 10.12.2025 11:26, Teddy Astie wrote:
> Le 10/12/2025 à 09:50, Jan Beulich a écrit :
>> On 09.12.2025 18:19, Teddy Astie wrote:
>>> --- a/tools/misc/xenpm.c
>>> +++ b/tools/misc/xenpm.c
>>> @@ -30,6 +30,7 @@
>>>   #include <inttypes.h>
>>>   #include <sys/time.h>
>>>   
>>> +#include <xen/asm/msr-index.h>
>>
>> For this to not break non-x86 builds, don't you need to constrain the building
>> of the tool to CONFIG_X86? (I have no clue why it is being built for Arm as
>> well right now, as I don't see how it could provide any value there.)
> 
> I don't know what are the plans on that area for ARM, the only thing 
> that seems supported right now is "get-cpu-topology".

Anthony and/or the Arm maintainers will then need to decide whether the tool
wants to continue to be kept building for non-x86.

>>> +    if ( ret == 2 )
>>> +        tjmax = (entries[1].val >> 16) & 0xff;
>>> +    else
>>> +    {
>>> +        /*
>>> +         * The CPU doesn't support MSR_IA32_TEMPERATURE_TARGET, we assume it's 100 which
>>> +         * is correct aside a few selected Atom CPUs. Check coretemp source code for more
>>> +         * information.
>>> +         */
>>
>> What is "coretemp source code" in xen.git context? (I understand you mean the
>> Linux driver, but that also needs saying then.)
> 
> Is "Linux kernel's coretemp.c" better ?

Yes.

Jan