[RFC PATCH] Add support for RAPL MSRs in KVM/Qemu

Posted by Anthony Harivel 11 months, 3 weeks ago
Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
interface (Running Average Power Limit) for advertising the accumulated
energy consumption of various power domains (e.g. CPU packages, DRAM,
etc.).

The consumption is reported via MSRs (model-specific registers) like
MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
64-bit registers that report the accumulated energy consumption in
microjoules. They are updated by microcode every ~1 ms.
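
For reference, a guest typically scales the MSR_PKG_ENERGY_STATUS
counter by the energy unit advertised in MSR_RAPL_POWER_UNIT (bits
12:8, 1/2^ESU joules, ~15.3 uJ on most parts). A minimal sketch of
that conversion, not part of this patch:

    #include <stdint.h>

    /* Convert a raw MSR_PKG_ENERGY_STATUS reading into joules.
     * The counter field is 32 bits wide and wraps around. */
    static double pkg_energy_joules(uint64_t energy_status,
                                    uint64_t power_unit)
    {
        unsigned int esu = (power_unit >> 8) & 0x1f; /* Energy Status Unit */
        uint32_t counter = energy_status & 0xffffffff;

        return counter / (double)(1ULL << esu);
    }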

For now, KVM always returns 0 when the guest requests the value of
these MSRs. Use the KVM MSR filtering mechanism to allow QEMU to
handle these MSRs dynamically in userspace.

To limit the number of system calls triggered by each MSR access,
create a new thread in QEMU that updates the "virtual" MSR values
asynchronously.

Each vCPU has its own vMSR to reflect the independence of vCPUs. The
thread updates each vMSR value from the energy consumed by the whole
physical CPU package the vCPU thread runs on, weighted by the thread's
utime and stime values.

All other non-vCPU threads are also taken into account. Their energy
consumption is evenly distributed among all vCPU threads running on
the same physical CPU package.
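
As a rough worked example (numbers purely illustrative): with 100
ticks/second, a 1 s sampling period and a 12-CPU package (1200 ticks
max), a vCPU thread that accumulated 120 of those ticks while the
package counter advanced by 10 J would be credited 10 J * 120/1200 =
1 J, plus its even share of the energy attributed to the non-vCPU
threads.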

This feature is activated with -accel kvm,rapl=true.

Current limitations:
- Works only on Intel host CPUs because AMD CPUs use different MSR
  addresses.

- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
  the moment.

- Since each vCPU has an independent vMSR value, the vCPU topology must
  be changed to match that reality. There must be a single vCPU per
  virtual socket (e.g.: -smp 4,sockets=4). Accessing pkg-0 energy will
  give vCPU 0 energy, pkg-1 will give vCPU 1 energy, etc.
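
For illustration only (machine, disk and other options are
placeholders), such a guest could be started with something like:

    qemu-system-x86_64 -accel kvm,rapl=true -smp 4,sockets=4 [...]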

Signed-off-by: Anthony Harivel <aharivel@redhat.com>
---

Notes:
    Earlier this year, I proposed a patch to Linux KVM [1] in order to
    bring energy awareness to VMs.
    
    Thanks to the feedback, I've worked on another solution that requires
    only a QEMU patch making use of the MSR filtering mechanism.
    
    This patch is proposed as an RFC at the moment in order to validate
    the approach and see whether the current limitations can be addressed
    in a second phase.
    
    Regards,
    Anthony
    
    [1]: https://lore.kernel.org/kvm/20230118142123.461247-1-aharivel@redhat.com/

 accel/kvm/kvm-all.c           |  13 ++
 include/sysemu/kvm_int.h      |  11 ++
 target/i386/cpu.h             |   8 +
 target/i386/kvm/kvm.c         | 273 ++++++++++++++++++++++++++++++++++
 target/i386/kvm/meson.build   |   1 +
 target/i386/kvm/vmsr_energy.c | 132 ++++++++++++++++
 target/i386/kvm/vmsr_energy.h |  80 ++++++++++
 7 files changed, 518 insertions(+)
 create mode 100644 target/i386/kvm/vmsr_energy.c
 create mode 100644 target/i386/kvm/vmsr_energy.h

diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
index cf3a88d90e92..13bb2a523c5d 100644
--- a/accel/kvm/kvm-all.c
+++ b/accel/kvm/kvm-all.c
@@ -3699,6 +3699,12 @@ static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
     s->kvm_dirty_ring_size = value;
 }
 
+static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
+{
+    KVMState *s = KVM_STATE(obj);
+    s->msr_energy.enable = value;
+}
+
 static void kvm_accel_instance_init(Object *obj)
 {
     KVMState *s = KVM_STATE(obj);
@@ -3715,6 +3721,7 @@ static void kvm_accel_instance_init(Object *obj)
     s->xen_version = 0;
     s->xen_gnttab_max_frames = 64;
     s->xen_evtchn_max_pirq = 256;
+    s->msr_energy.enable = false;
 }
 
 /**
@@ -3755,6 +3762,12 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
     object_class_property_set_description(oc, "dirty-ring-size",
         "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
 
+    object_class_property_add_bool(oc, "rapl",
+                                   NULL,
+                                   kvm_set_kvm_rapl);
+    object_class_property_set_description(oc, "rapl",
+        "Allow energy related MSRs for RAPL interface in Guest");
+
     kvm_arch_accel_class_init(oc);
 }
 
diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
index a641c974ea54..cf3a01f498d7 100644
--- a/include/sysemu/kvm_int.h
+++ b/include/sysemu/kvm_int.h
@@ -47,6 +47,16 @@ typedef struct KVMMemoryListener {
 
 #define KVM_MSI_HASHTAB_SIZE    256
 
+struct KVMMsrEnergy {
+    bool enable;
+    QemuThread msr_thr;
+    int cpus;
+    uint64_t *msr_value;
+    uint64_t msr_unit;
+    uint64_t msr_limit;
+    uint64_t msr_info;
+};
+
 enum KVMDirtyRingReaperState {
     KVM_DIRTY_RING_REAPER_NONE = 0,
     /* The reaper is sleeping */
@@ -116,6 +126,7 @@ struct KVMState
     uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
     uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
     struct KVMDirtyRingReaper reaper;
+    struct KVMMsrEnergy msr_energy;
     NotifyVmexitOption notify_vmexit;
     uint32_t notify_window;
     uint32_t xen_version;
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 8504aaac6807..14f9c2901680 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -396,6 +396,10 @@ typedef enum X86Seg {
 #define MSR_IA32_TSX_CTRL		0x122
 #define MSR_IA32_TSCDEADLINE            0x6e0
 #define MSR_IA32_PKRS                   0x6e1
+#define MSR_RAPL_POWER_UNIT             0x00000606
+#define MSR_PKG_POWER_LIMIT             0x00000610
+#define MSR_PKG_ENERGY_STATUS           0x00000611
+#define MSR_PKG_POWER_INFO              0x00000614
 #define MSR_ARCH_LBR_CTL                0x000014ce
 #define MSR_ARCH_LBR_DEPTH              0x000014cf
 #define MSR_ARCH_LBR_FROM_0             0x00001500
@@ -1757,6 +1761,10 @@ typedef struct CPUArchState {
 
     uintptr_t retaddr;
 
+    /* RAPL MSR */
+    uint64_t msr_rapl_power_unit;
+    uint64_t msr_pkg_energy_status;
+
     /* Fields up to this point are cleared by a CPU reset */
     struct {} end_reset_fields;
 
diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
index de531842f6b1..c79d6b811109 100644
--- a/target/i386/kvm/kvm.c
+++ b/target/i386/kvm/kvm.c
@@ -16,11 +16,16 @@
 #include "qapi/qapi-events-run-state.h"
 #include "qapi/error.h"
 #include "qapi/visitor.h"
+#include <math.h>
+#include <stdint.h>
 #include <sys/ioctl.h>
 #include <sys/utsname.h>
 #include <sys/syscall.h>
+#include <sys/resource.h>
+#include <sys/time.h>
 
 #include <linux/kvm.h>
+#include <unistd.h>
 #include "standard-headers/asm-x86/kvm_para.h"
 #include "hw/xen/interface/arch-x86/cpuid.h"
 
@@ -35,6 +40,7 @@
 #include "xen-emu.h"
 #include "hyperv.h"
 #include "hyperv-proto.h"
+#include "vmsr_energy.h"
 
 #include "exec/gdbstub.h"
 #include "qemu/host-utils.h"
@@ -2518,6 +2524,49 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
     return true;
 }
 
+static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, uint32_t msr,
+                                        uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_unit;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, uint32_t msr,
+                                        uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_limit;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, uint32_t msr,
+                                        uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+
+    *val = cs->kvm_state->msr_energy.msr_info;
+
+    return true;
+}
+
+static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, uint32_t msr,
+    uint64_t *val)
+{
+
+    CPUState *cs = CPU(cpu);
+    *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index];
+
+    return true;
+}
+
 static Notifier smram_machine_done;
 static KVMMemoryListener smram_listener;
 static AddressSpace smram_address_space;
@@ -2552,6 +2601,190 @@ static void register_smram_listener(Notifier *n, void *unused)
                                  &smram_address_space, 1, "kvm-smram");
 }
 
+static void *kvm_msr_energy_thread(void *data)
+{
+    KVMState *s = data;
+    struct KVMMsrEnergy *vmsr = &s->msr_energy;
+    unsigned int maxpkgs, maxcpus, maxticks;
+    package_energy_stat *pkg_stat;
+    int num_threads;
+    thread_stat *thd_stat;
+    CPUState *cpu;
+    pid_t pid;
+
+    rcu_register_thread();
+
+    /* Get QEMU PID*/
+    pid = getpid();
+
+    /* Assuming those values are the same accross physical system/packages */
+    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
+    maxpkgs = numa_max_node(); /* Number of Packages on the system */
+    /* Those MSR values should not change as well */
+    vmsr->msr_unit = read_msr(MSR_RAPL_POWER_UNIT, 0);
+    vmsr->msr_limit = read_msr(MSR_PKG_POWER_LIMIT, 0);
+    vmsr->msr_info = read_msr(MSR_PKG_POWER_INFO, 0);
+
+    /* Allocate memory for each package energy status */
+    pkg_stat = (package_energy_stat *) calloc(maxpkgs + 1,
+                                              sizeof(package_energy_stat));
+
+    /*
+     * Max numbers of ticks per package
+     * time in second * number of ticks/second * Number of cores / package
+     * ex: for 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
+     */
+    maxticks = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
+                    * sysconf(_SC_CLK_TCK) * maxcpus;
+
+    while (true) {
+
+        /* Get all qemu threads id */
+        pid_t *thread_ids = get_thread_ids(pid, &num_threads);
+
+        if (thread_ids == NULL) {
+            return NULL;
+        }
+
+        /* Allocate memory for each thread stats */
+        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));
+
+        /* Populate all the thread stats */
+        for (int i = 0; i < num_threads; i++) {
+            thd_stat[i].thread_id = thread_ids[i];
+            thd_stat[i].utime = calloc(2, sizeof(unsigned long long));
+            thd_stat[i].stime = calloc(2, sizeof(unsigned long long));
+            read_thread_stat(&thd_stat[i], pid, 0);
+            thd_stat[i].numa_node_id = numa_node_of_cpu(thd_stat[i].cpu_id);
+        }
+
+        /* Retrieve all packages power plane energy counter */
+        for (int i = 0; i <= maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].numa_node_id == i) {
+                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
+                    break;
+                }
+            }
+        }
+
+        /* Sleep a short period while the other threads are working */
+        usleep(MSR_ENERGY_THREAD_SLEEP_US);
+
+        /*
+         * Retrieve all packages power plane energy counter
+         * Calculate the delta of all packages
+         */
+        for (int i = 0; i <= maxpkgs; i++) {
+            for (int j = 0; j < num_threads; j++) {
+                /*
+                 * Use the first thread we found that ran on the CPU
+                 * of the package to read the packages energy counter
+                 */
+                if (thd_stat[j].numa_node_id == i) {
+                    pkg_stat[i].e_end =
+                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);
+                    pkg_stat[i].e_delta =
+                        pkg_stat[i].e_end - pkg_stat[i].e_start;
+                    break;
+                }
+            }
+        }
+
+        /* Delta of ticks spend by each thread between the sample */
+        for (int i = 0; i < num_threads; i++) {
+            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
+                /*
+                 * We don't count the dead thread
+                 * i.e threads that existed before the sleep
+                 * and not anymore
+                 */
+                thd_stat[i].delta_ticks = 0;
+            } else {
+                delta_ticks(thd_stat, i);
+            }
+        }
+
+        /*
+         * Identify the vCPU threads
+         * Calculate the Number of vCPU per package
+         */
+        CPU_FOREACH(cpu) {
+            for (int i = 0; i < num_threads; i++) {
+                if (cpu->thread_id == thd_stat[i].thread_id) {
+                    thd_stat[i].is_vcpu = true;
+                    thd_stat[i].vcpu_id = cpu->cpu_index;
+                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
+                    break;
+                }
+            }
+        }
+
+        /* Calculate the total energy of all non-vCPU thread */
+        for (int i = 0; i < num_threads; i++) {
+            double temp;
+            if ((thd_stat[i].is_vcpu != true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
+                pkg_stat[thd_stat[i].numa_node_id].e_ratio
+                    += (uint64_t)lround(temp);
+            }
+        }
+
+        /* Calculate the ratio per non-vCPU thread of each package */
+        for (int i = 0; i <= maxpkgs; i++) {
+            if (pkg_stat[i].nb_vcpu > 0) {
+                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
+            }
+        }
+
+        /* Calculate the energy for each vCPU thread */
+        for (int i = 0; i < num_threads; i++) {
+            double temp;
+
+            if ((thd_stat[i].is_vcpu == true) &&
+                (thd_stat[i].delta_ticks > 0)) {
+                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
+                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
+                vmsr->msr_value[thd_stat[i].vcpu_id] \
+                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
+            }
+        }
+
+        /* free all memory */
+        for (int i = 0; i < num_threads; i++) {
+            free(thd_stat[i].utime);
+            free(thd_stat[i].stime);
+        }
+        free(thd_stat);
+        free(thread_ids);
+    }
+
+    rcu_unregister_thread();
+    return NULL;
+}
+
+static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
+{
+    struct KVMMsrEnergy *r = &s->msr_energy;
+
+    /* Retrieve the number of vCPU */
+    r->cpus = ms->smp.cpus;
+
+    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
+    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
+
+    qemu_thread_create(&r->msr_thr, "kvm-msr",
+                       kvm_msr_energy_thread,
+                       s, QEMU_THREAD_JOINABLE);
+
+    return 0;
+}
+
 int kvm_arch_init(MachineState *ms, KVMState *s)
 {
     uint64_t identity_base = 0xfffbc000;
@@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
                          strerror(-ret));
             exit(1);
         }
+
+        if (s->msr_energy.enable == true) {
+
+            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
+                               kvm_rdmsr_rapl_power_unit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_RAPL_POWER_UNIT \
+                                handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
+                               kvm_rdmsr_pkg_power_limit, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_LIMIT \
+                                handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+
+            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
+                               kvm_rdmsr_pkg_power_info, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_POWER_INFO \
+                                handler: %s",
+                             strerror(-ret));
+                exit(1);
+            }
+            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
+                               kvm_rdmsr_pkg_energy_status, NULL);
+            if (!r) {
+                error_report("Could not install MSR_PKG_ENERGY_STATUS \
+                                handler: %s",
+                             strerror(-ret));
+                exit(1);
+            } else {
+                kvm_msr_energy_thread_init(s, ms);
+            }
+        }
     }
 
     return 0;
diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
index 322272091bce..9cdc93c6c439 100644
--- a/target/i386/kvm/meson.build
+++ b/target/i386/kvm/meson.build
@@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
 i386_softmmu_kvm_ss.add(files(
   'kvm.c',
   'kvm-cpu.c',
+  'vmsr_energy.c',
 ))
 
 i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
new file mode 100644
index 000000000000..8bd86b32becf
--- /dev/null
+++ b/target/i386/kvm/vmsr_energy.c
@@ -0,0 +1,132 @@
+/*
+ * QEMU KVM support -- x86 virtual energy-related MSR.
+ *
+ * Copyright 2023 Red Hat, Inc. 2023
+ *
+ *  Author:
+ *      Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "vmsr_energy.h"
+
+#define MAX_PATH_LEN 50
+#define MAX_LINE_LEN 500
+
+uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
+{
+    int fd;
+    uint64_t data;
+
+    char path[MAX_PATH_LEN];
+    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
+
+    fd = open(path , O_RDONLY);
+    if (fd < 0) {
+        return 0;
+    }
+    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
+        data = 0;
+    }
+
+    close(fd);
+    return data;
+}
+
+/* Retrieve the number of physical CPU on the package */
+unsigned int get_maxcpus(unsigned int package_num)
+{
+    int k, ncpus;
+    unsigned int maxcpus;
+    struct bitmask *cpus;
+
+    cpus = numa_allocate_cpumask();
+    ncpus = cpus->size;
+
+    if (numa_node_to_cpus(package_num, cpus) < 0) {
+        printf("node %u failed to convert\n", package_num);
+    }
+
+    maxcpus = 0;
+    for (k = 0; k < ncpus; k++) {
+        if (numa_bitmask_isbitset(cpus, k)) {
+            maxcpus++;
+        }
+    }
+
+    return maxcpus;
+}
+
+int read_thread_stat(struct thread_stat *thread, int pid, int index)
+{
+    char path[MAX_PATH_LEN];
+    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
+             thread->thread_id);
+
+    FILE *file = fopen(path, "r");
+    if (file == NULL) {
+        return -1;
+    }
+
+    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
+        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
+        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
+           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
+        return -1;
+
+    fclose(file);
+    return 0;
+}
+
+/* Read QEMU stat task folder to retrieve all QEMU threads ID */
+pid_t *get_thread_ids(pid_t pid, int *num_threads)
+{
+    char path[100];
+    sprintf(path, "/proc/%d/task", pid);
+
+    DIR *dir = opendir(path);
+    if (dir == NULL) {
+        perror("opendir");
+        return NULL;
+    }
+
+    pid_t *thread_ids = NULL;
+    int thread_count = 0;
+
+    struct dirent *ent;
+    while ((ent = readdir(dir)) != NULL) {
+        if (ent->d_name[0] == '.') {
+            continue;
+        }
+        pid_t tid = atoi(ent->d_name);
+        if (pid != tid) {
+            thread_ids = realloc(thread_ids,
+                                 (thread_count + 1) * sizeof(pid_t));
+            thread_ids[thread_count] = tid;
+            thread_count++;
+        }
+    }
+
+    closedir(dir);
+
+    *num_threads = thread_count;
+    return thread_ids;
+}
+
+void delta_ticks(thread_stat *thd_stat, int i)
+{
+    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
+                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
+}
+
+double get_ratio(package_energy_stat *pkg_stat,
+                        thread_stat *thd_stat,
+                        int maxticks, int i) {
+
+    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
+            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
+}
+
diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
new file mode 100644
index 000000000000..5f79d2cbe00d
--- /dev/null
+++ b/target/i386/kvm/vmsr_energy.h
@@ -0,0 +1,80 @@
+/*
+ * QEMU KVM support -- x86 virtual energy-related MSR.
+ *
+ * Copyright 2023 Red Hat, Inc. 2023
+ *
+ *  Author:
+ *      Anthony Harivel <aharivel@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#ifndef VMSR_ENERGY_H
+#define VMSR_ENERGY_H
+
+#include "qemu/osdep.h"
+
+#include <numa.h>
+
+/*
+ * Define the interval time in micro seconds between 2 samples of
+ * energy related MSRs
+ */
+#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
+
+/*
+ * Thread statistic
+ * @ thread_id: TID (thread ID)
+ * @ is_vcpu: true is thread is vCPU thread
+ * @ cpu_id: CPU number last executed on
+ * @ vcpu_id: vCPU ID
+ * @ numa_node_id:node number of the CPU
+ * @ utime: amount of clock ticks the thread
+ *          has been scheduled in User mode
+ * @ stime: amount of clock ticks the thread
+ *          has been scheduled in System mode
+ * @ delta_ticks: delta of utime+stime between
+ *          the two samples (before/after sleep)
+ */
+struct thread_stat {
+    unsigned int thread_id;
+    bool is_vcpu;
+    unsigned int cpu_id;
+    unsigned int vcpu_id;
+    unsigned int numa_node_id;
+    unsigned long long *utime;
+    unsigned long long *stime;
+    unsigned long long delta_ticks;
+};
+
+/*
+ * Package statistic
+ * @ e_start: package energy counter before the sleep
+ * @ e_end: package energy counter after the sleep
+ * @ e_delta: delta of package energy counter
+ * @ e_ratio: store the energy ratio of non-vCPU thread
+ * @ nb_vcpu: number of vCPU running on this package
+ */
+struct packge_energy_stat {
+    uint64_t e_start;
+    uint64_t e_end;
+    uint64_t e_delta;
+    uint64_t e_ratio;
+    unsigned int nb_vcpu;
+};
+
+typedef struct thread_stat thread_stat;
+typedef struct packge_energy_stat package_energy_stat;
+
+uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
+void delta_ticks(thread_stat *thd_stat, int i);
+unsigned int get_maxcpus(unsigned int package_num);
+int read_thread_stat(struct thread_stat *thread, int pid, int index);
+pid_t *get_thread_ids(pid_t pid, int *num_threads);
+double get_ratio(package_energy_stat *pkg_stat,
+                        thread_stat *thd_stat,
+                        int maxticks, int i);
+
+#endif /* VMSR_ENERGY_H */
-- 
2.40.1
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Philippe Mathieu-Daudé 11 months, 3 weeks ago
Hi Anthony,

On 17/5/23 15:07, Anthony Harivel wrote:
> Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> interface (Running Average Power Limit) for advertising the accumulated
> energy consumption of various power domains (e.g. CPU packages, DRAM,
> etc.).
> 
> The consumption is reported via MSRs (model specific registers) like
> MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> 64 bits registers that represent the accumulated energy consumption in
> micro Joules. They are updated by microcode every ~1ms.
> 
> For now, KVM always returns 0 when the guest requests the value of
> these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> these MSRs dynamically in userspace.
> 
> To limit the amount of system calls for every MSR call, create a new
> thread in QEMU that updates the "virtual" MSR values asynchronously.
> 
> Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> thread updates the vMSR values with the ratio of energy consumed of
> the whole physical CPU package the vCPU thread runs on and the
> thread's utime and stime values.
> 
> All other non-vCPU threads are also taken into account. Their energy
> consumption is evenly distributed among all vCPUs threads running on
> the same physical CPU package.
> 
> This feature is activated with -accel kvm,rapl=true.
> 
> Actual limitation:
> - Works only on Intel host CPU because AMD CPUs are using different MSR
>    adresses.
> 
> - Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
>    the moment.
> 
> - Since each vCPU has an independent vMSR value, the vCPU topology must
>    be changed to match that reality. There must be a single vCPU per
>    virtual socket (e.g.: -smp 4,sockets=4). Accessing pkg-0 energy will
>    give vCPU 0 energy, pkg-1 will give vCPU 1 energy, etc.
> 
> Signed-off-by: Anthony Harivel <aharivel@redhat.com>
> ---


> diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> new file mode 100644
> index 000000000000..5f79d2cbe00d
> --- /dev/null
> +++ b/target/i386/kvm/vmsr_energy.h
> @@ -0,0 +1,80 @@
> +/*
> + * QEMU KVM support -- x86 virtual energy-related MSR.
> + *
> + * Copyright 2023 Red Hat, Inc. 2023
> + *
> + *  Author:
> + *      Anthony Harivel <aharivel@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef VMSR_ENERGY_H
> +#define VMSR_ENERGY_H
> +
> +#include "qemu/osdep.h"
> +
> +#include <numa.h>
> +
> +/*
> + * Define the interval time in micro seconds between 2 samples of
> + * energy related MSRs
> + */
> +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> +
> +/*
> + * Thread statistic
> + * @ thread_id: TID (thread ID)
> + * @ is_vcpu: true is thread is vCPU thread
> + * @ cpu_id: CPU number last executed on
> + * @ vcpu_id: vCPU ID
> + * @ numa_node_id:node number of the CPU
> + * @ utime: amount of clock ticks the thread
> + *          has been scheduled in User mode
> + * @ stime: amount of clock ticks the thread
> + *          has been scheduled in System mode
> + * @ delta_ticks: delta of utime+stime between
> + *          the two samples (before/after sleep)
> + */
> +struct thread_stat {
> +    unsigned int thread_id;
> +    bool is_vcpu;
> +    unsigned int cpu_id;
> +    unsigned int vcpu_id;
> +    unsigned int numa_node_id;
> +    unsigned long long *utime;
> +    unsigned long long *stime;
> +    unsigned long long delta_ticks;
> +};
> +
> +/*
> + * Package statistic
> + * @ e_start: package energy counter before the sleep
> + * @ e_end: package energy counter after the sleep
> + * @ e_delta: delta of package energy counter
> + * @ e_ratio: store the energy ratio of non-vCPU thread
> + * @ nb_vcpu: number of vCPU running on this package
> + */
> +struct packge_energy_stat {

"package"

> +    uint64_t e_start;
> +    uint64_t e_end;
> +    uint64_t e_delta;
> +    uint64_t e_ratio;
> +    unsigned int nb_vcpu;
> +};
> +
> +typedef struct thread_stat thread_stat;
> +typedef struct packge_energy_stat package_energy_stat;
> +
> +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> +void delta_ticks(thread_stat *thd_stat, int i);
> +unsigned int get_maxcpus(unsigned int package_num);
> +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> +double get_ratio(package_energy_stat *pkg_stat,
> +                        thread_stat *thd_stat,
> +                        int maxticks, int i);

Would prefixing these declarations with 'vmsr_' provide
a clearer API? Otherwise, maybe this isn't the best header
to declare them in.

> +
> +#endif /* VMSR_ENERGY_H */
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Anthony Harivel 11 months, 3 weeks ago
Philippe Mathieu-Daudé, May 19, 2023 at 13:32:

Hi Philippe,

> > +/*
> > + * Package statistic
> > + * @ e_start: package energy counter before the sleep
> > + * @ e_end: package energy counter after the sleep
> > + * @ e_delta: delta of package energy counter
> > + * @ e_ratio: store the energy ratio of non-vCPU thread
> > + * @ nb_vcpu: number of vCPU running on this package
> > + */
> > +struct packge_energy_stat {
>
> "package"

My bad.
This will be corrected.

>
> > +    uint64_t e_start;
> > +    uint64_t e_end;
> > +    uint64_t e_delta;
> > +    uint64_t e_ratio;
> > +    unsigned int nb_vcpu;
> > +};
> > +
> > +typedef struct thread_stat thread_stat;
> > +typedef struct packge_energy_stat package_energy_stat;
> > +
> > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> > +void delta_ticks(thread_stat *thd_stat, int i);
> > +unsigned int get_maxcpus(unsigned int package_num);
> > +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> > +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> > +double get_ratio(package_energy_stat *pkg_stat,
> > +                        thread_stat *thd_stat,
> > +                        int maxticks, int i);
>
> Would prefixing these declarations with 'vmsr_' provide
> a clearer API? Otherwise, maybe this isn't the best header
> to declare them.

I agree with you that these lack the prefix you mention; it would make
the API clearer. I will correct that.

Thanks !

> > +
> > +#endif /* VMSR_ENERGY_H */
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Marcelo Tosatti 11 months, 3 weeks ago
On Wed, May 17, 2023 at 03:07:30PM +0200, Anthony Harivel wrote:
> Starting with the "Sandy Bridge" generation, Intel CPUs provide a RAPL
> interface (Running Average Power Limit) for advertising the accumulated
> energy consumption of various power domains (e.g. CPU packages, DRAM,
> etc.).
> 
> The consumption is reported via MSRs (model specific registers) like
> MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are
> 64 bits registers that represent the accumulated energy consumption in
> micro Joules. They are updated by microcode every ~1ms.
> 
> For now, KVM always returns 0 when the guest requests the value of
> these MSRs. Use the KVM MSR filtering mechanism to allow QEMU handle
> these MSRs dynamically in userspace.
> 
> To limit the amount of system calls for every MSR call, create a new
> thread in QEMU that updates the "virtual" MSR values asynchronously.
> 
> Each vCPU has its own vMSR to reflect the independence of vCPUs. The
> thread updates the vMSR values with the ratio of energy consumed of
> the whole physical CPU package the vCPU thread runs on and the
> thread's utime and stime values.
> 
> All other non-vCPU threads are also taken into account. Their energy
> consumption is evenly distributed among all vCPUs threads running on
> the same physical CPU package.
> 
> This feature is activated with -accel kvm,rapl=true.
> 
> Actual limitation:
> - Works only on Intel host CPU because AMD CPUs are using different MSR
>   adresses.
> 
> - Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at
>   the moment.
> 
> - Since each vCPU has an independent vMSR value, the vCPU topology must
>   be changed to match that reality. There must be a single vCPU per
>   virtual socket (e.g.: -smp 4,sockets=4). Accessing pkg-0 energy will
>   give vCPU 0 energy, pkg-1 will give vCPU 1 energy, etc.
> 
> Signed-off-by: Anthony Harivel <aharivel@redhat.com>
> ---
> 
> Notes:
>     Earlier this year, I've proposed a patch in linux KVM [1] in order to
>     bring energy awareness in VM.
>     
>     Thanks to the feedback, I've worked on another solution that requires
>     only a QEMU patch that make us of MSR filtering mecanism.
>     
>     This patch is proposed as an RFC at the moment in order to validate the
>     paradigm and see if the actual limitation could be adressed in a second
>     phase.
>     
>     Regards,
>     Anthony
>     
>     [1]: https://lore.kernel.org/kvm/20230118142123.461247-1-aharivel@redhat.com/

Hi Anthony,
> 
>  accel/kvm/kvm-all.c           |  13 ++
>  include/sysemu/kvm_int.h      |  11 ++
>  target/i386/cpu.h             |   8 +
>  target/i386/kvm/kvm.c         | 273 ++++++++++++++++++++++++++++++++++
>  target/i386/kvm/meson.build   |   1 +
>  target/i386/kvm/vmsr_energy.c | 132 ++++++++++++++++
>  target/i386/kvm/vmsr_energy.h |  80 ++++++++++
>  7 files changed, 518 insertions(+)
>  create mode 100644 target/i386/kvm/vmsr_energy.c
>  create mode 100644 target/i386/kvm/vmsr_energy.h
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index cf3a88d90e92..13bb2a523c5d 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -3699,6 +3699,12 @@ static void kvm_set_dirty_ring_size(Object *obj, Visitor *v,
>      s->kvm_dirty_ring_size = value;
>  }
>  
> +static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
> +{
> +    KVMState *s = KVM_STATE(obj);
> +    s->msr_energy.enable = value;
> +}
> +
>  static void kvm_accel_instance_init(Object *obj)
>  {
>      KVMState *s = KVM_STATE(obj);
> @@ -3715,6 +3721,7 @@ static void kvm_accel_instance_init(Object *obj)
>      s->xen_version = 0;
>      s->xen_gnttab_max_frames = 64;
>      s->xen_evtchn_max_pirq = 256;
> +    s->msr_energy.enable = false;
>  }
>  
>  /**
> @@ -3755,6 +3762,12 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
>      object_class_property_set_description(oc, "dirty-ring-size",
>          "Size of KVM dirty page ring buffer (default: 0, i.e. use bitmap)");
>  
> +    object_class_property_add_bool(oc, "rapl",
> +                                   NULL,
> +                                   kvm_set_kvm_rapl);
> +    object_class_property_set_description(oc, "rapl",
> +        "Allow energy related MSRs for RAPL interface in Guest");
> +
>      kvm_arch_accel_class_init(oc);
>  }
>  
> diff --git a/include/sysemu/kvm_int.h b/include/sysemu/kvm_int.h
> index a641c974ea54..cf3a01f498d7 100644
> --- a/include/sysemu/kvm_int.h
> +++ b/include/sysemu/kvm_int.h
> @@ -47,6 +47,16 @@ typedef struct KVMMemoryListener {
>  
>  #define KVM_MSI_HASHTAB_SIZE    256
>  
> +struct KVMMsrEnergy {
> +    bool enable;
> +    QemuThread msr_thr;
> +    int cpus;
> +    uint64_t *msr_value;
> +    uint64_t msr_unit;
> +    uint64_t msr_limit;
> +    uint64_t msr_info;
> +};
> +
>  enum KVMDirtyRingReaperState {
>      KVM_DIRTY_RING_REAPER_NONE = 0,
>      /* The reaper is sleeping */
> @@ -116,6 +126,7 @@ struct KVMState
>      uint64_t kvm_dirty_ring_bytes;  /* Size of the per-vcpu dirty ring */
>      uint32_t kvm_dirty_ring_size;   /* Number of dirty GFNs per ring */
>      struct KVMDirtyRingReaper reaper;
> +    struct KVMMsrEnergy msr_energy;
>      NotifyVmexitOption notify_vmexit;
>      uint32_t notify_window;
>      uint32_t xen_version;
> diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> index 8504aaac6807..14f9c2901680 100644
> --- a/target/i386/cpu.h
> +++ b/target/i386/cpu.h
> @@ -396,6 +396,10 @@ typedef enum X86Seg {
>  #define MSR_IA32_TSX_CTRL		0x122
>  #define MSR_IA32_TSCDEADLINE            0x6e0
>  #define MSR_IA32_PKRS                   0x6e1
> +#define MSR_RAPL_POWER_UNIT             0x00000606
> +#define MSR_PKG_POWER_LIMIT             0x00000610
> +#define MSR_PKG_ENERGY_STATUS           0x00000611
> +#define MSR_PKG_POWER_INFO              0x00000614

Why only PKG and not all domains?

>  #define MSR_ARCH_LBR_CTL                0x000014ce
>  #define MSR_ARCH_LBR_DEPTH              0x000014cf
>  #define MSR_ARCH_LBR_FROM_0             0x00001500
> @@ -1757,6 +1761,10 @@ typedef struct CPUArchState {
>  
>      uintptr_t retaddr;
>  
> +    /* RAPL MSR */
> +    uint64_t msr_rapl_power_unit;
> +    uint64_t msr_pkg_energy_status;
> +
>      /* Fields up to this point are cleared by a CPU reset */
>      struct {} end_reset_fields;
>  
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index de531842f6b1..c79d6b811109 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -16,11 +16,16 @@
>  #include "qapi/qapi-events-run-state.h"
>  #include "qapi/error.h"
>  #include "qapi/visitor.h"
> +#include <math.h>
> +#include <stdint.h>
>  #include <sys/ioctl.h>
>  #include <sys/utsname.h>
>  #include <sys/syscall.h>
> +#include <sys/resource.h>
> +#include <sys/time.h>
>  
>  #include <linux/kvm.h>
> +#include <unistd.h>
>  #include "standard-headers/asm-x86/kvm_para.h"
>  #include "hw/xen/interface/arch-x86/cpuid.h"
>  
> @@ -35,6 +40,7 @@
>  #include "xen-emu.h"
>  #include "hyperv.h"
>  #include "hyperv-proto.h"
> +#include "vmsr_energy.h"
>  
>  #include "exec/gdbstub.h"
>  #include "qemu/host-utils.h"
> @@ -2518,6 +2524,49 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
>      return true;
>  }
>  
> +static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu, uint32_t msr,
> +                                        uint64_t *val)
> +{
> +
> +    CPUState *cs = CPU(cpu);
> +
> +    *val = cs->kvm_state->msr_energy.msr_unit;
> +
> +    return true;
> +}
> +
> +static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu, uint32_t msr,
> +                                        uint64_t *val)
> +{
> +
> +    CPUState *cs = CPU(cpu);
> +
> +    *val = cs->kvm_state->msr_energy.msr_limit;
> +
> +    return true;
> +}
> +
> +static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu, uint32_t msr,
> +                                        uint64_t *val)
> +{
> +
> +    CPUState *cs = CPU(cpu);
> +
> +    *val = cs->kvm_state->msr_energy.msr_info;
> +
> +    return true;
> +}
> +
> +static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu, uint32_t msr,
> +    uint64_t *val)
> +{
> +
> +    CPUState *cs = CPU(cpu);
> +    *val = cs->kvm_state->msr_energy.msr_value[cs->cpu_index];
> +
> +    return true;
> +}
> +
>  static Notifier smram_machine_done;
>  static KVMMemoryListener smram_listener;
>  static AddressSpace smram_address_space;
> @@ -2552,6 +2601,190 @@ static void register_smram_listener(Notifier *n, void *unused)
>                                   &smram_address_space, 1, "kvm-smram");
>  }
>  
> +static void *kvm_msr_energy_thread(void *data)
> +{
> +    KVMState *s = data;
> +    struct KVMMsrEnergy *vmsr = &s->msr_energy;
> +    unsigned int maxpkgs, maxcpus, maxticks;
> +    package_energy_stat *pkg_stat;
> +    int num_threads;
> +    thread_stat *thd_stat;
> +    CPUState *cpu;
> +    pid_t pid;
> +
> +    rcu_register_thread();
> +
> +    /* Get QEMU PID*/
> +    pid = getpid();
> +
> +    /* Assuming those values are the same accross physical system/packages */
> +    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
> +    maxpkgs = numa_max_node(); /* Number of Packages on the system */
> +    /* Those MSR values should not change as well */
> +    vmsr->msr_unit = read_msr(MSR_RAPL_POWER_UNIT, 0);
> +    vmsr->msr_limit = read_msr(MSR_PKG_POWER_LIMIT, 0);

Power limit - MSR interfaces to specify power limit, time window, lock bit, clamp bit, etc.

This one can change, right? And why expose the power limit to the guest?

> +    vmsr->msr_info = read_msr(MSR_PKG_POWER_INFO, 0);
> +
> +    /* Allocate memory for each package energy status */
> +    pkg_stat = (package_energy_stat *) calloc(maxpkgs + 1,
> +                                              sizeof(package_energy_stat));
> +
> +    /*
> +     * Max numbers of ticks per package
> +     * time in second * number of ticks/second * Number of cores / package
> +     * ex: for 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
> +     */
> +    maxticks = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
> +                    * sysconf(_SC_CLK_TCK) * maxcpus;
> +
> +    while (true) {
> +
> +        /* Get all qemu threads id */
> +        pid_t *thread_ids = get_thread_ids(pid, &num_threads);
> +
> +        if (thread_ids == NULL) {
> +            return NULL;
> +        }
> +
> +        /* Allocate memory for each thread stats */
> +        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));
> +
> +        /* Populate all the thread stats */
> +        for (int i = 0; i < num_threads; i++) {
> +            thd_stat[i].thread_id = thread_ids[i];
> +            thd_stat[i].utime = calloc(2, sizeof(unsigned long long));
> +            thd_stat[i].stime = calloc(2, sizeof(unsigned long long));
> +            read_thread_stat(&thd_stat[i], pid, 0);
> +            thd_stat[i].numa_node_id = numa_node_of_cpu(thd_stat[i].cpu_id);
> +        }
> +
> +        /* Retrieve all packages power plane energy counter */
> +        for (int i = 0; i <= maxpkgs; i++) {
> +            for (int j = 0; j < num_threads; j++) {
> +                /*
> +                 * Use the first thread we found that ran on the CPU
> +                 * of the package to read the packages energy counter
> +                 */
> +                if (thd_stat[j].numa_node_id == i) {
> +                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
> +                    break;
> +                }
> +            }
> +        }
> +
> +        /* Sleep a short period while the other threads are working */
> +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> +
> +        /*
> +         * Retrieve all packages power plane energy counter
> +         * Calculate the delta of all packages
> +         */
> +        for (int i = 0; i <= maxpkgs; i++) {
> +            for (int j = 0; j < num_threads; j++) {
> +                /*
> +                 * Use the first thread we found that ran on the CPU
> +                 * of the package to read the packages energy counter
> +                 */
> +                if (thd_stat[j].numa_node_id == i) {
> +                    pkg_stat[i].e_end =
> +                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);
> +                    pkg_stat[i].e_delta =
> +                        pkg_stat[i].e_end - pkg_stat[i].e_start;
> +                    break;
> +                }
> +            }
> +        }

I don't get this: can you give a high-level overview of how the emulated
value is calculated? (I fail to see the relation between whether a QEMU
thread ran and whether or not to read MSR_PKG_ENERGY_STATUS.)

It seems this might be simplified, and reading every 1ms might be
excessive overhead.

Independent of this question (for whatever implementation): how
accurate is the power information exposed to the guest versus the
actual power consumed? (It might be nice to have some notion of that.)

In general I think it's useful to expose this information to guests.

> +
> +        /* Delta of ticks spend by each thread between the sample */
> +        for (int i = 0; i < num_threads; i++) {
> +            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
> +                /*
> +                 * We don't count the dead thread
> +                 * i.e threads that existed before the sleep
> +                 * and not anymore
> +                 */
> +                thd_stat[i].delta_ticks = 0;
> +            } else {
> +                delta_ticks(thd_stat, i);
> +            }
> +        }
> +
> +        /*
> +         * Identify the vCPU threads
> +         * Calculate the Number of vCPU per package
> +         */
> +        CPU_FOREACH(cpu) {
> +            for (int i = 0; i < num_threads; i++) {
> +                if (cpu->thread_id == thd_stat[i].thread_id) {
> +                    thd_stat[i].is_vcpu = true;
> +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> +                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
> +                    break;
> +                }
> +            }
> +        }
> +
> +        /* Calculate the total energy of all non-vCPU thread */
> +        for (int i = 0; i < num_threads; i++) {
> +            double temp;
> +            if ((thd_stat[i].is_vcpu != true) &&
> +                (thd_stat[i].delta_ticks > 0)) {
> +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> +                pkg_stat[thd_stat[i].numa_node_id].e_ratio
> +                    += (uint64_t)lround(temp);
> +            }
> +        }
> +
> +        /* Calculate the ratio per non-vCPU thread of each package */
> +        for (int i = 0; i <= maxpkgs; i++) {
> +            if (pkg_stat[i].nb_vcpu > 0) {
> +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> +            }
> +        }
> +
> +        /* Calculate the energy for each vCPU thread */
> +        for (int i = 0; i < num_threads; i++) {
> +            double temp;
> +
> +            if ((thd_stat[i].is_vcpu == true) &&
> +                (thd_stat[i].delta_ticks > 0)) {
> +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> +                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
> +                vmsr->msr_value[thd_stat[i].vcpu_id] \
> +                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
> +            }
> +        }
> +
> +        /* free all memory */
> +        for (int i = 0; i < num_threads; i++) {
> +            free(thd_stat[i].utime);
> +            free(thd_stat[i].stime);
> +        }
> +        free(thd_stat);
> +        free(thread_ids);
> +    }
> +
> +    rcu_unregister_thread();
> +    return NULL;
> +}
> +
> +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> +{
> +    struct KVMMsrEnergy *r = &s->msr_energy;
> +
> +    /* Retrieve the number of vCPU */
> +    r->cpus = ms->smp.cpus;
> +
> +    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
> +    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
> +
> +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> +                       kvm_msr_energy_thread,
> +                       s, QEMU_THREAD_JOINABLE);
> +
> +    return 0;
> +}
> +
>  int kvm_arch_init(MachineState *ms, KVMState *s)
>  {
>      uint64_t identity_base = 0xfffbc000;
> @@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
>                           strerror(-ret));
>              exit(1);
>          }
> +
> +        if (s->msr_energy.enable == true) {
> +
> +            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
> +                               kvm_rdmsr_rapl_power_unit, NULL);
> +            if (!r) {
> +                error_report("Could not install MSR_RAPL_POWER_UNIT \
> +                                handler: %s",
> +                             strerror(-ret));
> +                exit(1);
> +            }
> +
> +            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
> +                               kvm_rdmsr_pkg_power_limit, NULL);
> +            if (!r) {
> +                error_report("Could not install MSR_PKG_POWER_LIMIT \
> +                                handler: %s",
> +                             strerror(-ret));
> +                exit(1);
> +            }
> +
> +            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
> +                               kvm_rdmsr_pkg_power_info, NULL);
> +            if (!r) {
> +                error_report("Could not install MSR_PKG_POWER_INFO \
> +                                handler: %s",
> +                             strerror(-ret));
> +                exit(1);
> +            }
> +            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
> +                               kvm_rdmsr_pkg_energy_status, NULL);
> +            if (!r) {
> +                error_report("Could not install MSR_PKG_ENERGY_STATUS \
> +                                handler: %s",
> +                             strerror(-ret));
> +                exit(1);
> +            } else {
> +                kvm_msr_energy_thread_init(s, ms);
> +            }
> +        }
>      }
>  
>      return 0;
> diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> index 322272091bce..9cdc93c6c439 100644
> --- a/target/i386/kvm/meson.build
> +++ b/target/i386/kvm/meson.build
> @@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
>  i386_softmmu_kvm_ss.add(files(
>    'kvm.c',
>    'kvm-cpu.c',
> +  'vmsr_energy.c',
>  ))
>  
>  i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
> diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> new file mode 100644
> index 000000000000..8bd86b32becf
> --- /dev/null
> +++ b/target/i386/kvm/vmsr_energy.c
> @@ -0,0 +1,132 @@
> +/*
> + * QEMU KVM support -- x86 virtual energy-related MSR.
> + *
> + * Copyright 2023 Red Hat, Inc. 2023
> + *
> + *  Author:
> + *      Anthony Harivel <aharivel@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#include "vmsr_energy.h"
> +
> +#define MAX_PATH_LEN 50
> +#define MAX_LINE_LEN 500
> +
> +uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
> +{
> +    int fd;
> +    uint64_t data;
> +
> +    char path[MAX_PATH_LEN];
> +    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
> +
> +    fd = open(path , O_RDONLY);
> +    if (fd < 0) {
> +        return 0;
> +    }
> +    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
> +        data = 0;
> +    }
> +
> +    close(fd);
> +    return data;
> +}
> +
> +/* Retrieve the number of physical CPU on the package */
> +unsigned int get_maxcpus(unsigned int package_num)
> +{
> +    int k, ncpus;
> +    unsigned int maxcpus;
> +    struct bitmask *cpus;
> +
> +    cpus = numa_allocate_cpumask();
> +    ncpus = cpus->size;
> +
> +    if (numa_node_to_cpus(package_num, cpus) < 0) {
> +        printf("node %u failed to convert\n", package_num);
> +    }
> +
> +    maxcpus = 0;
> +    for (k = 0; k < ncpus; k++) {
> +        if (numa_bitmask_isbitset(cpus, k)) {
> +            maxcpus++;
> +        }
> +    }
> +
> +    return maxcpus;
> +}
> +
> +int read_thread_stat(struct thread_stat *thread, int pid, int index)
> +{
> +    char path[MAX_PATH_LEN];
> +    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
> +             thread->thread_id);
> +
> +    FILE *file = fopen(path, "r");
> +    if (file == NULL) {
> +        return -1;
> +    }
> +
> +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> +           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
> +        return -1;
> +
> +    fclose(file);
> +    return 0;
> +}
> +
> +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> +pid_t *get_thread_ids(pid_t pid, int *num_threads)
> +{
> +    char path[100];
> +    sprintf(path, "/proc/%d/task", pid);
> +
> +    DIR *dir = opendir(path);
> +    if (dir == NULL) {
> +        perror("opendir");
> +        return NULL;
> +    }
> +
> +    pid_t *thread_ids = NULL;
> +    int thread_count = 0;
> +
> +    struct dirent *ent;
> +    while ((ent = readdir(dir)) != NULL) {
> +        if (ent->d_name[0] == '.') {
> +            continue;
> +        }
> +        pid_t tid = atoi(ent->d_name);
> +        if (pid != tid) {
> +            thread_ids = realloc(thread_ids,
> +                                 (thread_count + 1) * sizeof(pid_t));
> +            thread_ids[thread_count] = tid;
> +            thread_count++;
> +        }
> +    }
> +
> +    closedir(dir);
> +
> +    *num_threads = thread_count;
> +    return thread_ids;
> +}
> +
> +void delta_ticks(thread_stat *thd_stat, int i)
> +{
> +    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
> +                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
> +}
> +
> +double get_ratio(package_energy_stat *pkg_stat,
> +                        thread_stat *thd_stat,
> +                        int maxticks, int i) {
> +
> +    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
> +            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
> +}
> +
> diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> new file mode 100644
> index 000000000000..5f79d2cbe00d
> --- /dev/null
> +++ b/target/i386/kvm/vmsr_energy.h
> @@ -0,0 +1,80 @@
> +/*
> + * QEMU KVM support -- x86 virtual energy-related MSR.
> + *
> + * Copyright 2023 Red Hat, Inc. 2023
> + *
> + *  Author:
> + *      Anthony Harivel <aharivel@redhat.com>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + *
> + */
> +
> +#ifndef VMSR_ENERGY_H
> +#define VMSR_ENERGY_H
> +
> +#include "qemu/osdep.h"
> +
> +#include <numa.h>
> +
> +/*
> + * Define the interval time in micro seconds between 2 samples of
> + * energy related MSRs
> + */
> +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> +
> +/*
> + * Thread statistic
> + * @ thread_id: TID (thread ID)
> + * @ is_vcpu: true is thread is vCPU thread
> + * @ cpu_id: CPU number last executed on
> + * @ vcpu_id: vCPU ID
> + * @ numa_node_id:node number of the CPU
> + * @ utime: amount of clock ticks the thread
> + *          has been scheduled in User mode
> + * @ stime: amount of clock ticks the thread
> + *          has been scheduled in System mode
> + * @ delta_ticks: delta of utime+stime between
> + *          the two samples (before/after sleep)
> + */
> +struct thread_stat {
> +    unsigned int thread_id;
> +    bool is_vcpu;
> +    unsigned int cpu_id;
> +    unsigned int vcpu_id;
> +    unsigned int numa_node_id;
> +    unsigned long long *utime;
> +    unsigned long long *stime;
> +    unsigned long long delta_ticks;
> +};
> +
> +/*
> + * Package statistic
> + * @ e_start: package energy counter before the sleep
> + * @ e_end: package energy counter after the sleep
> + * @ e_delta: delta of package energy counter
> + * @ e_ratio: store the energy ratio of non-vCPU thread
> + * @ nb_vcpu: number of vCPU running on this package
> + */
> +struct packge_energy_stat {
> +    uint64_t e_start;
> +    uint64_t e_end;
> +    uint64_t e_delta;
> +    uint64_t e_ratio;
> +    unsigned int nb_vcpu;
> +};
> +
> +typedef struct thread_stat thread_stat;
> +typedef struct packge_energy_stat package_energy_stat;
> +
> +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> +void delta_ticks(thread_stat *thd_stat, int i);
> +unsigned int get_maxcpus(unsigned int package_num);
> +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> +double get_ratio(package_energy_stat *pkg_stat,
> +                        thread_stat *thd_stat,
> +                        int maxticks, int i);
> +
> +#endif /* VMSR_ENERGY_H */
> -- 
> 2.40.1
> 
>
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Anthony Harivel 11 months, 3 weeks ago
Marcelo Tosatti, May 17, 2023 at 17:43:

Hi Marcelo,

> On Wed, May 17, 2023 at 03:07:30PM +0200, Anthony Harivel wrote:
> > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > index 8504aaac6807..14f9c2901680 100644
> > --- a/target/i386/cpu.h
> > +++ b/target/i386/cpu.h
> > @@ -396,6 +396,10 @@ typedef enum X86Seg {
> >  #define MSR_IA32_TSX_CTRL		0x122
> >  #define MSR_IA32_TSCDEADLINE            0x6e0
> >  #define MSR_IA32_PKRS                   0x6e1
> > +#define MSR_RAPL_POWER_UNIT             0x00000606
> > +#define MSR_PKG_POWER_LIMIT             0x00000610
> > +#define MSR_PKG_ENERGY_STATUS           0x00000611
> > +#define MSR_PKG_POWER_INFO              0x00000614
>
> Why only PKG and not all domains?
>

Package domains are the only ones you can find across different CPU
segments (client and server platforms).
Processor core domains are only available on client platforms, while
DRAM domains are only available on server platforms.

I figured that package domains would be a good start to validate the
implementation, and the rest could technically be added later on.


> > +    /* Assuming those values are the same accross physical system/packages */
> > +    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
> > +    maxpkgs = numa_max_node(); /* Number of Packages on the system */
> > +    /* Those MSR values should not change as well */
> > +    vmsr->msr_unit = read_msr(MSR_RAPL_POWER_UNIT, 0);
> > +    vmsr->msr_limit = read_msr(MSR_PKG_POWER_LIMIT, 0);
>
> Power limit - MSR interfaces to specify power limit, time window; lock bit, clamp bit etc
>
> This one can change, right? And why expose the power limit to the guest?
>

Right.
Because it belongs to the non-optional RAPL interface MSRs, I added it
with the thought that it was mandatory for the RAPL driver to probe
inside the guest.
Either it is not and can be removed, or we can set the "lock bit" to
inform the guest that the power limit settings are static and cannot be
modified. I will correct that.


> > +    vmsr->msr_info = read_msr(MSR_PKG_POWER_INFO, 0);
> > +
> > +    /* Allocate memory for each package energy status */
> > +    pkg_stat = (package_energy_stat *) calloc(maxpkgs + 1,
> > +                                              sizeof(package_energy_stat));
> > +
> > +    /*
> > +     * Max numbers of ticks per package
> > +     * time in second * number of ticks/second * Number of cores / package
> > +     * ex: for 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
> > +     */
> > +    maxticks = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
> > +                    * sysconf(_SC_CLK_TCK) * maxcpus;
> > +
> > +    while (true) {
> > +
> > +        /* Get all qemu threads id */
> > +        pid_t *thread_ids = get_thread_ids(pid, &num_threads);
> > +
> > +        if (thread_ids == NULL) {
> > +            return NULL;
> > +        }
> > +
> > +        /* Allocate memory for each thread stats */
> > +        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));
> > +
> > +        /* Populate all the thread stats */
> > +        for (int i = 0; i < num_threads; i++) {
> > +            thd_stat[i].thread_id = thread_ids[i];
> > +            thd_stat[i].utime = calloc(2, sizeof(unsigned long long));
> > +            thd_stat[i].stime = calloc(2, sizeof(unsigned long long));
> > +            read_thread_stat(&thd_stat[i], pid, 0);
> > +            thd_stat[i].numa_node_id = numa_node_of_cpu(thd_stat[i].cpu_id);
> > +        }
> > +
> > +        /* Retrieve all packages power plane energy counter */
> > +        for (int i = 0; i <= maxpkgs; i++) {
> > +            for (int j = 0; j < num_threads; j++) {
> > +                /*
> > +                 * Use the first thread we found that ran on the CPU
> > +                 * of the package to read the packages energy counter
> > +                 */
> > +                if (thd_stat[j].numa_node_id == i) {
> > +                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
> > +                    break;
> > +                }
> > +            }
> > +        }
> > +
> > +        /* Sleep a short period while the other threads are working */
> > +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> > +
> > +        /*
> > +         * Retrieve all packages power plane energy counter
> > +         * Calculate the delta of all packages
> > +         */
> > +        for (int i = 0; i <= maxpkgs; i++) {
> > +            for (int j = 0; j < num_threads; j++) {
> > +                /*
> > +                 * Use the first thread we found that ran on the CPU
> > +                 * of the package to read the packages energy counter
> > +                 */
> > +                if (thd_stat[j].numa_node_id == i) {
> > +                    pkg_stat[i].e_end =
> > +                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);
> > +                    pkg_stat[i].e_delta =
> > +                        pkg_stat[i].e_end - pkg_stat[i].e_start;
> > +                    break;
> > +                }
> > +            }
> > +        }
>
> Don't get this: can you do a high level overview of how the emulated
> value is calculated? (fail to see what is the relation between whether
> a QEMU thread ran and whether to read or not MSR_PKG_ENERGY_STATUS).
>
> It seems this might be simplified, and reading every 1ms might be
> excessive overhead.
>
> Independent of this question (for whatever implementation): how
> accurate is the power information exposed to the guest vs actual
> power consumed (might be nice to have some notion of it).
>
> In general i think its useful to expose the information to guests.
>

No problem, let me try to explain:
a QEMU process is composed of vCPU thread(s) and non-vCPU thread(s) (IO,
emulated devices, ...). Each of those threads can run on different cores
that may or may not belong to the same package.
MSR_PKG_ENERGY_STATUS is a counter that increments for the whole
package domain: reading this MSR from any core that belongs to the
package returns the same value.
So when I need to read the MSR, I only need one read per package,
regardless of how many threads are running on that package's cores.
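
You can check this on the host with msr-tools, for example (assuming
the msr module is loaded and that cores 0 and 1 sit in the same
package; 0x611 is MSR_PKG_ENERGY_STATUS):

    # rdmsr -p 0 0x611
    # rdmsr -p 1 0x611

Both reads return the same package counter value.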

Now let's talk about the implementation of the emulated value.
I've created a thread that does the following in an infinite loop:
- Retrieve all the QEMU threads + statistics about them
- Read the energy counter of each package involved
- Sleep for 1 sec (1 sec is arbitrary)
- Calculate the delta of ticks for each thread, so that we know how much
  time each thread has been scheduled during the sleep period
- Read the energy counter of each package again and calculate the delta
  of the counter, so that we know how much it has increased during the
  sleep period
- Calculate the ratio for each vCPU thread and deduce the energy spent
  by each vCPU
- Calculate the amount of energy spent by all non-vCPU threads and
  evenly spread it across the vCPUs
- Update each virtual MSR for each vCPU

Obviously, this works better and more consistently with vCPU pinning
and proper isolation of the cores in the package; a small worked
example follows below.
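
To make the attribution concrete (the numbers are purely illustrative):
with 4 cores per package and 100 ticks/s, a 1 s interval gives
maxticks = 400. If the package counter increased by 40000 units during
that interval and a vCPU thread was scheduled for 100 ticks, it is
credited 40000 * 100/400 = 10000 units. If the non-vCPU threads' share
works out to 2000 units in total and 2 vCPUs run on that package, each
vCPU additionally gets 2000 / 2 = 1000 units.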

So those virtual MSRs are updated roughly every second (this can be
changed via MSR_ENERGY_THREAD_SLEEP_US), compared to the real MSRs,
which are updated by the microcode every ~1 ms.

Concerning the "real" power consumption, we have to not forget that the
RAPL interface energy data is not the result of a physical measurement. 
It is a set of architectural events from each core, processor
graphic, IO, etc. and combines them with energy weights to predict the
package's active power consumption.

IMO it is not really important because the idea behind this patch is to
give estimated values to the guests so that software running inside
VM can make use of power tools which are all reading those MSRs (or the
RAPL driver sys interface) to retrieve power consumption.


> > +
> > +        /* Delta of ticks spend by each thread between the sample */
> > +        for (int i = 0; i < num_threads; i++) {
> > +            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
> > +                /*
> > +                 * We don't count the dead thread
> > +                 * i.e threads that existed before the sleep
> > +                 * and not anymore
> > +                 */
> > +                thd_stat[i].delta_ticks = 0;
> > +            } else {
> > +                delta_ticks(thd_stat, i);
> > +            }
> > +        }
> > +
> > +        /*
> > +         * Identify the vCPU threads
> > +         * Calculate the Number of vCPU per package
> > +         */
> > +        CPU_FOREACH(cpu) {
> > +            for (int i = 0; i < num_threads; i++) {
> > +                if (cpu->thread_id == thd_stat[i].thread_id) {
> > +                    thd_stat[i].is_vcpu = true;
> > +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> > +                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
> > +                    break;
> > +                }
> > +            }
> > +        }
> > +
> > +        /* Calculate the total energy of all non-vCPU thread */
> > +        for (int i = 0; i < num_threads; i++) {
> > +            double temp;
> > +            if ((thd_stat[i].is_vcpu != true) &&
> > +                (thd_stat[i].delta_ticks > 0)) {
> > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > +                pkg_stat[thd_stat[i].numa_node_id].e_ratio
> > +                    += (uint64_t)lround(temp);
> > +            }
> > +        }
> > +
> > +        /* Calculate the ratio per non-vCPU thread of each package */
> > +        for (int i = 0; i <= maxpkgs; i++) {
> > +            if (pkg_stat[i].nb_vcpu > 0) {
> > +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> > +            }
> > +        }
> > +
> > +        /* Calculate the energy for each vCPU thread */
> > +        for (int i = 0; i < num_threads; i++) {
> > +            double temp;
> > +
> > +            if ((thd_stat[i].is_vcpu == true) &&
> > +                (thd_stat[i].delta_ticks > 0)) {
> > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > +                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
> > +                vmsr->msr_value[thd_stat[i].vcpu_id] \
> > +                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
> > +            }
> > +        }
> > +
> > +        /* free all memory */
> > +        for (int i = 0; i < num_threads; i++) {
> > +            free(thd_stat[i].utime);
> > +            free(thd_stat[i].stime);
> > +        }
> > +        free(thd_stat);
> > +        free(thread_ids);
> > +    }
> > +
> > +    rcu_unregister_thread();
> > +    return NULL;
> > +}
> > +
> > +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> > +{
> > +    struct KVMMsrEnergy *r = &s->msr_energy;
> > +
> > +    /* Retrieve the number of vCPU */
> > +    r->cpus = ms->smp.cpus;
> > +
> > +    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
> > +    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
> > +
> > +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> > +                       kvm_msr_energy_thread,
> > +                       s, QEMU_THREAD_JOINABLE);
> > +
> > +    return 0;
> > +}
> > +
> >  int kvm_arch_init(MachineState *ms, KVMState *s)
> >  {
> >      uint64_t identity_base = 0xfffbc000;
> > @@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
> >                           strerror(-ret));
> >              exit(1);
> >          }
> > +
> > +        if (s->msr_energy.enable == true) {
> > +
> > +            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
> > +                               kvm_rdmsr_rapl_power_unit, NULL);
> > +            if (!r) {
> > +                error_report("Could not install MSR_RAPL_POWER_UNIT \
> > +                                handler: %s",
> > +                             strerror(-ret));
> > +                exit(1);
> > +            }
> > +
> > +            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
> > +                               kvm_rdmsr_pkg_power_limit, NULL);
> > +            if (!r) {
> > +                error_report("Could not install MSR_PKG_POWER_LIMIT \
> > +                                handler: %s",
> > +                             strerror(-ret));
> > +                exit(1);
> > +            }
> > +
> > +            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
> > +                               kvm_rdmsr_pkg_power_info, NULL);
> > +            if (!r) {
> > +                error_report("Could not install MSR_PKG_POWER_INFO \
> > +                                handler: %s",
> > +                             strerror(-ret));
> > +                exit(1);
> > +            }
> > +            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
> > +                               kvm_rdmsr_pkg_energy_status, NULL);
> > +            if (!r) {
> > +                error_report("Could not install MSR_PKG_ENERGY_STATUS \
> > +                                handler: %s",
> > +                             strerror(-ret));
> > +                exit(1);
> > +            } else {
> > +                kvm_msr_energy_thread_init(s, ms);
> > +            }
> > +        }
> >      }
> >  
> >      return 0;
> > diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> > index 322272091bce..9cdc93c6c439 100644
> > --- a/target/i386/kvm/meson.build
> > +++ b/target/i386/kvm/meson.build
> > @@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
> >  i386_softmmu_kvm_ss.add(files(
> >    'kvm.c',
> >    'kvm-cpu.c',
> > +  'vmsr_energy.c',
> >  ))
> >  
> >  i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
> > diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> > new file mode 100644
> > index 000000000000..8bd86b32becf
> > --- /dev/null
> > +++ b/target/i386/kvm/vmsr_energy.c
> > @@ -0,0 +1,132 @@
> > +/*
> > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > + *
> > + * Copyright 2023 Red Hat, Inc. 2023
> > + *
> > + *  Author:
> > + *      Anthony Harivel <aharivel@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#include "vmsr_energy.h"
> > +
> > +#define MAX_PATH_LEN 50
> > +#define MAX_LINE_LEN 500
> > +
> > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
> > +{
> > +    int fd;
> > +    uint64_t data;
> > +
> > +    char path[MAX_PATH_LEN];
> > +    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
> > +
> > +    fd = open(path , O_RDONLY);
> > +    if (fd < 0) {
> > +        return 0;
> > +    }
> > +    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
> > +        data = 0;
> > +    }
> > +
> > +    close(fd);
> > +    return data;
> > +}
> > +
> > +/* Retrieve the number of physical CPU on the package */
> > +unsigned int get_maxcpus(unsigned int package_num)
> > +{
> > +    int k, ncpus;
> > +    unsigned int maxcpus;
> > +    struct bitmask *cpus;
> > +
> > +    cpus = numa_allocate_cpumask();
> > +    ncpus = cpus->size;
> > +
> > +    if (numa_node_to_cpus(package_num, cpus) < 0) {
> > +        printf("node %u failed to convert\n", package_num);
> > +    }
> > +
> > +    maxcpus = 0;
> > +    for (k = 0; k < ncpus; k++) {
> > +        if (numa_bitmask_isbitset(cpus, k)) {
> > +            maxcpus++;
> > +        }
> > +    }
> > +
> > +    return maxcpus;
> > +}
> > +
> > +int read_thread_stat(struct thread_stat *thread, int pid, int index)
> > +{
> > +    char path[MAX_PATH_LEN];
> > +    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
> > +             thread->thread_id);
> > +
> > +    FILE *file = fopen(path, "r");
> > +    if (file == NULL) {
> > +        return -1;
> > +    }
> > +
> > +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> > +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> > +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> > +           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
> > +        return -1;
> > +
> > +    fclose(file);
> > +    return 0;
> > +}
> > +
> > +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> > +pid_t *get_thread_ids(pid_t pid, int *num_threads)
> > +{
> > +    char path[100];
> > +    sprintf(path, "/proc/%d/task", pid);
> > +
> > +    DIR *dir = opendir(path);
> > +    if (dir == NULL) {
> > +        perror("opendir");
> > +        return NULL;
> > +    }
> > +
> > +    pid_t *thread_ids = NULL;
> > +    int thread_count = 0;
> > +
> > +    struct dirent *ent;
> > +    while ((ent = readdir(dir)) != NULL) {
> > +        if (ent->d_name[0] == '.') {
> > +            continue;
> > +        }
> > +        pid_t tid = atoi(ent->d_name);
> > +        if (pid != tid) {
> > +            thread_ids = realloc(thread_ids,
> > +                                 (thread_count + 1) * sizeof(pid_t));
> > +            thread_ids[thread_count] = tid;
> > +            thread_count++;
> > +        }
> > +    }
> > +
> > +    closedir(dir);
> > +
> > +    *num_threads = thread_count;
> > +    return thread_ids;
> > +}
> > +
> > +void delta_ticks(thread_stat *thd_stat, int i)
> > +{
> > +    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
> > +                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
> > +}
> > +
> > +double get_ratio(package_energy_stat *pkg_stat,
> > +                        thread_stat *thd_stat,
> > +                        int maxticks, int i) {
> > +
> > +    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
> > +            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
> > +}
> > +
> > diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> > new file mode 100644
> > index 000000000000..5f79d2cbe00d
> > --- /dev/null
> > +++ b/target/i386/kvm/vmsr_energy.h
> > @@ -0,0 +1,80 @@
> > +/*
> > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > + *
> > + * Copyright 2023 Red Hat, Inc. 2023
> > + *
> > + *  Author:
> > + *      Anthony Harivel <aharivel@redhat.com>
> > + *
> > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > + * See the COPYING file in the top-level directory.
> > + *
> > + */
> > +
> > +#ifndef VMSR_ENERGY_H
> > +#define VMSR_ENERGY_H
> > +
> > +#include "qemu/osdep.h"
> > +
> > +#include <numa.h>
> > +
> > +/*
> > + * Define the interval time in micro seconds between 2 samples of
> > + * energy related MSRs
> > + */
> > +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> > +
> > +/*
> > + * Thread statistic
> > + * @ thread_id: TID (thread ID)
> > + * @ is_vcpu: true is thread is vCPU thread
> > + * @ cpu_id: CPU number last executed on
> > + * @ vcpu_id: vCPU ID
> > + * @ numa_node_id:node number of the CPU
> > + * @ utime: amount of clock ticks the thread
> > + *          has been scheduled in User mode
> > + * @ stime: amount of clock ticks the thread
> > + *          has been scheduled in System mode
> > + * @ delta_ticks: delta of utime+stime between
> > + *          the two samples (before/after sleep)
> > + */
> > +struct thread_stat {
> > +    unsigned int thread_id;
> > +    bool is_vcpu;
> > +    unsigned int cpu_id;
> > +    unsigned int vcpu_id;
> > +    unsigned int numa_node_id;
> > +    unsigned long long *utime;
> > +    unsigned long long *stime;
> > +    unsigned long long delta_ticks;
> > +};
> > +
> > +/*
> > + * Package statistic
> > + * @ e_start: package energy counter before the sleep
> > + * @ e_end: package energy counter after the sleep
> > + * @ e_delta: delta of package energy counter
> > + * @ e_ratio: store the energy ratio of non-vCPU thread
> > + * @ nb_vcpu: number of vCPU running on this package
> > + */
> > +struct packge_energy_stat {
> > +    uint64_t e_start;
> > +    uint64_t e_end;
> > +    uint64_t e_delta;
> > +    uint64_t e_ratio;
> > +    unsigned int nb_vcpu;
> > +};
> > +
> > +typedef struct thread_stat thread_stat;
> > +typedef struct packge_energy_stat package_energy_stat;
> > +
> > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> > +void delta_ticks(thread_stat *thd_stat, int i);
> > +unsigned int get_maxcpus(unsigned int package_num);
> > +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> > +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> > +double get_ratio(package_energy_stat *pkg_stat,
> > +                        thread_stat *thd_stat,
> > +                        int maxticks, int i);
> > +
> > +#endif /* VMSR_ENERGY_H */
> > -- 
> > 2.40.1
> > 
> > 
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Marcelo Tosatti 11 months, 2 weeks ago
Hi Anthony,

On Thu, May 18, 2023 at 04:26:51PM +0200, Anthony Harivel wrote:
> Marcelo Tosatti, May 17, 2023 at 17:43:
> 
> Hi Marcelo,
> 
> > On Wed, May 17, 2023 at 03:07:30PM +0200, Anthony Harivel wrote:
> > > diff --git a/target/i386/cpu.h b/target/i386/cpu.h
> > > index 8504aaac6807..14f9c2901680 100644
> > > --- a/target/i386/cpu.h
> > > +++ b/target/i386/cpu.h
> > > @@ -396,6 +396,10 @@ typedef enum X86Seg {
> > >  #define MSR_IA32_TSX_CTRL		0x122
> > >  #define MSR_IA32_TSCDEADLINE            0x6e0
> > >  #define MSR_IA32_PKRS                   0x6e1
> > > +#define MSR_RAPL_POWER_UNIT             0x00000606
> > > +#define MSR_PKG_POWER_LIMIT             0x00000610
> > > +#define MSR_PKG_ENERGY_STATUS           0x00000611
> > > +#define MSR_PKG_POWER_INFO              0x00000614
> >
> > Why only PKG and not all domains?
> >
> 
> Package domains are the only ones you can find accross different CPU
> segments (client and server platforms).
> Processor cores domains are only available on client platform while
> DRAM domains only on server platform.
> 
> I figured out that Package domains would be a good start to validate the
> implementation and the rest could technically be added later on. 

Understood.

> > > +    /* Assuming those values are the same accross physical system/packages */
> > > +    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
> > > +    maxpkgs = numa_max_node(); /* Number of Packages on the system */

numa_max_node() returns the highest node number available on the current system. 
(See the node numbers in /sys/devices/system/node/ ). Also see numa_num_configured_nodes().

One can find package topology information from
/sys/devices/system/cpu/cpuX/topology/

> > > +    /* Those MSR values should not change as well */
> > > +    vmsr->msr_unit = read_msr(MSR_RAPL_POWER_UNIT, 0);
> > > +    vmsr->msr_limit = read_msr(MSR_PKG_POWER_LIMIT, 0);
> >
> > Power limit - MSR interfaces to specify power limit, time window; lock bit, clamp bit etc
> >
> > This one can change, right? And why expose the power limit to the guest?
> >
> 
> Right.
> Because it belongs to the non-optional RAPL interfaces MSRs, I added it
> with the thought that it was mandatory for the RAPL driver to mount
> insite the guest. 
> Either it is not and can be removed, or we can set the "lock bit" to
> inform the guest that power limit settings are static and un-modifiable.
> I will correct that. 

OK.

> > > +    vmsr->msr_info = read_msr(MSR_PKG_POWER_INFO, 0);
> > > +
> > > +    /* Allocate memory for each package energy status */
> > > +    pkg_stat = (package_energy_stat *) calloc(maxpkgs + 1,
> > > +                                              sizeof(package_energy_stat));
> > > +
> > > +    /*
> > > +     * Max numbers of ticks per package
> > > +     * time in second * number of ticks/second * Number of cores / package
> > > +     * ex: for 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
> > > +     */
> > > +    maxticks = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
> > > +                    * sysconf(_SC_CLK_TCK) * maxcpus;
> > > +
> > > +    while (true) {
> > > +
> > > +        /* Get all qemu threads id */
> > > +        pid_t *thread_ids = get_thread_ids(pid, &num_threads);
> > > +
> > > +        if (thread_ids == NULL) {
> > > +            return NULL;
> > > +        }
> > > +
> > > +        /* Allocate memory for each thread stats */
> > > +        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));

Can you keep this pre-allocated? And all other data as well.

> > > +        /* Populate all the thread stats */
> > > +        for (int i = 0; i < num_threads; i++) {
> > > +            thd_stat[i].thread_id = thread_ids[i];
> > > +            thd_stat[i].utime = calloc(2, sizeof(unsigned long long));
> > > +            thd_stat[i].stime = calloc(2, sizeof(unsigned long long));
> > > +            read_thread_stat(&thd_stat[i], pid, 0);
> > > +            thd_stat[i].numa_node_id = numa_node_of_cpu(thd_stat[i].cpu_id);
> > > +        }
> > > +
> > > +        /* Retrieve all packages power plane energy counter */
> > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > +            for (int j = 0; j < num_threads; j++) {
> > > +                /*
> > > +                 * Use the first thread we found that ran on the CPU
> > > +                 * of the package to read the packages energy counter
> > > +                 */
> > > +                if (thd_stat[j].numa_node_id == i) {
> > > +                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
> > > +                    break;
> > > +                }
> > > +            }
> > > +        }

A NUMA node does not necessarily map to a single package.

> > > +        /* Sleep a short period while the other threads are working */
> > > +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> > > +
> > > +        /*
> > > +         * Retrieve all packages power plane energy counter
> > > +         * Calculate the delta of all packages
> > > +         */
> > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > +            for (int j = 0; j < num_threads; j++) {
> > > +                /*
> > > +                 * Use the first thread we found that ran on the CPU
> > > +                 * of the package to read the packages energy counter
> > > +                 */
> > > +                if (thd_stat[j].numa_node_id == i) {
> > > +                    pkg_stat[i].e_end =
> > > +                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);

This is excessive (reading the MSRs of each package in the system).

Consider 100 Linux guests, all of them with this enabled, on a system
with 4 packages: how many times will you be reading the MSR of each
package?

Moreover, you don't want to read an MSR on an isolated CPU.

> > > +                    pkg_stat[i].e_delta =
> > > +                        pkg_stat[i].e_end - pkg_stat[i].e_start;
> > > +                    break;
> > > +                }
> > > +            }
> > > +        }
> >
> > Don't get this: can you do a high level overview of how the emulated
> > value is calculated? (fail to see what is the relation between whether
> > a QEMU thread ran and whether to read or not MSR_PKG_ENERGY_STATUS).
> >
> > It seems this might be simplified, and reading every 1ms might be
> > excessive overhead.
> >
> > Independent of this question (for whatever implementation): how
> > accurate is the power information exposed to the guest vs actual
> > power consumed (might be nice to have some notion of it).
> >
> > In general i think its useful to expose the information to guests.
> >
> 
> No problem, let me try to explain: 
> a QEMU process is composed of vCPU thread(s) and non-vCPU thread(s) (IO,
> emulated device,...). Each of those threads can run on different cores
> that can belongs to the same Package or not.
> The MSR_PKG_ENERGY_STATUS is a counter that increment for the whole
> Package domain. If you read this MSR from any core that belongs to the 
> package, you will retrieve the same number.
> So when I need to read the MSR, I only need to read once for all the
> threads that are running on cores of each Package.

T=0	read p1v0 (== MSR_PKG_ENERGY_STATUS)
T=1	vcpu-1 executing in core1
	vcpu-2 executing in core2
	vcpu-3 executing in core2
T=2	read p1v1

Won't you be exposing (p1v1-p1v0)/3 to each of the 3 vCPUs' virtual
MSR_PKG_ENERGY_STATUS in this case?

> Now let's talk about the implementation of the emulated value. 
> I've created a thread that does in an infinite loop the following:
> - Retrieve all the QEMU threads + statistics about them 
> - Read the energy counter of each Package involved 
> - Sleep for 1sec (1sec is arbitrary)
> - Calculate the delta of ticks for each threads so that we know how much
>   time each threads has been scheduled during the Sleep period

Intel docs mention the counter can overflow (faster with higher 
energy consumption). Are you handling the overflow case?

> - Read again the energy counter of each Package involved and calculate
>   the delta of the counter so that we know how much the counter has
>   increased during the Sleep period
> - Calculate the ratio for each vCPU thread and deduct the energy spent
>   for each vCPU
> - Calculate the amount of energy spent for all non-vCPU and evenly
>   spread it to the vCPU
> - Update each virtual MSR for each vCPU
> 
> Obviously, this is working better and more consistently with vCPU pinning 
> and proper isolation of the cores in the package. 

Do you really need to measure the consumption of the other (non-vCPU)
QEMU threads? I'd say measuring only the vCPUs is sufficient.

> So those virtual MSRs are updated roughly each second (could be changed
> by updating MSR_ENERGY_THREAD_SLEEP_US). Compared to the real MSRs which 
> are updated by the microcode every 1ms.

How useful are the measurements for the guest, then? (I mean, doesn't a
1-second interval render the measurements less useful?)

> Concerning the "real" power consumption, we have to not forget that the
> RAPL interface energy data is not the result of a physical measurement. 
> It is a set of architectural events from each core, processor
> graphic, IO, etc. and combines them with energy weights to predict the
> package's active power consumption.
> 
> IMO it is not really important because the idea behind this patch is to
> give estimated values to the guests so that software running inside
> VM can make use of power tools which are all reading those MSRs (or the
> RAPL driver sys interface) to retrieve power consumption.

Can you describe some use cases... Because what seems to be useful
(actually, what seems to be possible) is for users to measure the
energy consumption of a package.

So if you think, "OK, I can read the energy consumption information
from within a guest, but I know it is the energy consumption divided
by the amount of time the QEMU threads executed", how useful is that
measure?

Why not expose the actual delta from the MSRs, rather than dividing
by the amount of time the QEMU threads execute?

> > > +
> > > +        /* Delta of ticks spend by each thread between the sample */
> > > +        for (int i = 0; i < num_threads; i++) {
> > > +            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
> > > +                /*
> > > +                 * We don't count the dead thread
> > > +                 * i.e threads that existed before the sleep
> > > +                 * and not anymore
> > > +                 */
> > > +                thd_stat[i].delta_ticks = 0;
> > > +            } else {
> > > +                delta_ticks(thd_stat, i);
> > > +            }
> > > +        }
> > > +
> > > +        /*
> > > +         * Identify the vCPU threads
> > > +         * Calculate the Number of vCPU per package
> > > +         */
> > > +        CPU_FOREACH(cpu) {
> > > +            for (int i = 0; i < num_threads; i++) {
> > > +                if (cpu->thread_id == thd_stat[i].thread_id) {
> > > +                    thd_stat[i].is_vcpu = true;
> > > +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> > > +                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
> > > +                    break;
> > > +                }
> > > +            }
> > > +        }
> > > +
> > > +        /* Calculate the total energy of all non-vCPU thread */
> > > +        for (int i = 0; i < num_threads; i++) {
> > > +            double temp;
> > > +            if ((thd_stat[i].is_vcpu != true) &&
> > > +                (thd_stat[i].delta_ticks > 0)) {
> > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > +                pkg_stat[thd_stat[i].numa_node_id].e_ratio
> > > +                    += (uint64_t)lround(temp);
> > > +            }
> > > +        }
> > > +
> > > +        /* Calculate the ratio per non-vCPU thread of each package */
> > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > +            if (pkg_stat[i].nb_vcpu > 0) {
> > > +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> > > +            }
> > > +        }
> > > +
> > > +        /* Calculate the energy for each vCPU thread */
> > > +        for (int i = 0; i < num_threads; i++) {
> > > +            double temp;
> > > +
> > > +            if ((thd_stat[i].is_vcpu == true) &&
> > > +                (thd_stat[i].delta_ticks > 0)) {
> > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > +                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
> > > +                vmsr->msr_value[thd_stat[i].vcpu_id] \
> > > +                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
> > > +            }
> > > +        }
> > > +
> > > +        /* free all memory */
> > > +        for (int i = 0; i < num_threads; i++) {
> > > +            free(thd_stat[i].utime);
> > > +            free(thd_stat[i].stime);
> > > +        }
> > > +        free(thd_stat);
> > > +        free(thread_ids);
> > > +    }
> > > +
> > > +    rcu_unregister_thread();
> > > +    return NULL;
> > > +}
> > > +
> > > +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> > > +{
> > > +    struct KVMMsrEnergy *r = &s->msr_energy;
> > > +
> > > +    /* Retrieve the number of vCPU */
> > > +    r->cpus = ms->smp.cpus;
> > > +
> > > +    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
> > > +    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
> > > +
> > > +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> > > +                       kvm_msr_energy_thread,
> > > +                       s, QEMU_THREAD_JOINABLE);
> > > +
> > > +    return 0;
> > > +}
> > > +
> > >  int kvm_arch_init(MachineState *ms, KVMState *s)
> > >  {
> > >      uint64_t identity_base = 0xfffbc000;
> > > @@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
> > >                           strerror(-ret));
> > >              exit(1);
> > >          }
> > > +
> > > +        if (s->msr_energy.enable == true) {
> > > +
> > > +            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
> > > +                               kvm_rdmsr_rapl_power_unit, NULL);
> > > +            if (!r) {
> > > +                error_report("Could not install MSR_RAPL_POWER_UNIT \
> > > +                                handler: %s",
> > > +                             strerror(-ret));
> > > +                exit(1);
> > > +            }
> > > +
> > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
> > > +                               kvm_rdmsr_pkg_power_limit, NULL);
> > > +            if (!r) {
> > > +                error_report("Could not install MSR_PKG_POWER_LIMIT \
> > > +                                handler: %s",
> > > +                             strerror(-ret));
> > > +                exit(1);
> > > +            }
> > > +
> > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
> > > +                               kvm_rdmsr_pkg_power_info, NULL);
> > > +            if (!r) {
> > > +                error_report("Could not install MSR_PKG_POWER_INFO \
> > > +                                handler: %s",
> > > +                             strerror(-ret));
> > > +                exit(1);
> > > +            }
> > > +            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
> > > +                               kvm_rdmsr_pkg_energy_status, NULL);
> > > +            if (!r) {
> > > +                error_report("Could not install MSR_PKG_ENERGY_STATUS \
> > > +                                handler: %s",
> > > +                             strerror(-ret));
> > > +                exit(1);
> > > +            } else {
> > > +                kvm_msr_energy_thread_init(s, ms);
> > > +            }
> > > +        }
> > >      }
> > >  
> > >      return 0;
> > > diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> > > index 322272091bce..9cdc93c6c439 100644
> > > --- a/target/i386/kvm/meson.build
> > > +++ b/target/i386/kvm/meson.build
> > > @@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
> > >  i386_softmmu_kvm_ss.add(files(
> > >    'kvm.c',
> > >    'kvm-cpu.c',
> > > +  'vmsr_energy.c',
> > >  ))
> > >  
> > >  i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
> > > diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> > > new file mode 100644
> > > index 000000000000..8bd86b32becf
> > > --- /dev/null
> > > +++ b/target/i386/kvm/vmsr_energy.c
> > > @@ -0,0 +1,132 @@
> > > +/*
> > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > + *
> > > + * Copyright 2023 Red Hat, Inc. 2023
> > > + *
> > > + *  Author:
> > > + *      Anthony Harivel <aharivel@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > + * See the COPYING file in the top-level directory.
> > > + *
> > > + */
> > > +
> > > +#include "vmsr_energy.h"
> > > +
> > > +#define MAX_PATH_LEN 50
> > > +#define MAX_LINE_LEN 500
> > > +
> > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
> > > +{
> > > +    int fd;
> > > +    uint64_t data;
> > > +
> > > +    char path[MAX_PATH_LEN];
> > > +    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
> > > +
> > > +    fd = open(path , O_RDONLY);
> > > +    if (fd < 0) {
> > > +        return 0;
> > > +    }
> > > +    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
> > > +        data = 0;
> > > +    }
> > > +
> > > +    close(fd);
> > > +    return data;
> > > +}
> > > +
> > > +/* Retrieve the number of physical CPU on the package */
> > > +unsigned int get_maxcpus(unsigned int package_num)
> > > +{
> > > +    int k, ncpus;
> > > +    unsigned int maxcpus;
> > > +    struct bitmask *cpus;
> > > +
> > > +    cpus = numa_allocate_cpumask();
> > > +    ncpus = cpus->size;
> > > +
> > > +    if (numa_node_to_cpus(package_num, cpus) < 0) {
> > > +        printf("node %u failed to convert\n", package_num);
> > > +    }
> > > +
> > > +    maxcpus = 0;
> > > +    for (k = 0; k < ncpus; k++) {
> > > +        if (numa_bitmask_isbitset(cpus, k)) {
> > > +            maxcpus++;
> > > +        }
> > > +    }
> > > +
> > > +    return maxcpus;
> > > +}
> > > +
> > > +int read_thread_stat(struct thread_stat *thread, int pid, int index)
> > > +{
> > > +    char path[MAX_PATH_LEN];
> > > +    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
> > > +             thread->thread_id);
> > > +
> > > +    FILE *file = fopen(path, "r");
> > > +    if (file == NULL) {
> > > +        return -1;
> > > +    }
> > > +
> > > +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> > > +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> > > +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> > > +           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
> > > +        return -1;
> > > +
> > > +    fclose(file);
> > > +    return 0;
> > > +}
> > > +
> > > +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> > > +pid_t *get_thread_ids(pid_t pid, int *num_threads)
> > > +{
> > > +    char path[100];
> > > +    sprintf(path, "/proc/%d/task", pid);
> > > +
> > > +    DIR *dir = opendir(path);
> > > +    if (dir == NULL) {
> > > +        perror("opendir");
> > > +        return NULL;
> > > +    }
> > > +
> > > +    pid_t *thread_ids = NULL;
> > > +    int thread_count = 0;
> > > +
> > > +    struct dirent *ent;
> > > +    while ((ent = readdir(dir)) != NULL) {
> > > +        if (ent->d_name[0] == '.') {
> > > +            continue;
> > > +        }
> > > +        pid_t tid = atoi(ent->d_name);
> > > +        if (pid != tid) {
> > > +            thread_ids = realloc(thread_ids,
> > > +                                 (thread_count + 1) * sizeof(pid_t));
> > > +            thread_ids[thread_count] = tid;
> > > +            thread_count++;
> > > +        }
> > > +    }
> > > +
> > > +    closedir(dir);
> > > +
> > > +    *num_threads = thread_count;
> > > +    return thread_ids;
> > > +}
> > > +
> > > +void delta_ticks(thread_stat *thd_stat, int i)
> > > +{
> > > +    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
> > > +                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
> > > +}
> > > +
> > > +double get_ratio(package_energy_stat *pkg_stat,
> > > +                        thread_stat *thd_stat,
> > > +                        int maxticks, int i) {
> > > +
> > > +    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
> > > +            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
> > > +}
> > > +
> > > diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> > > new file mode 100644
> > > index 000000000000..5f79d2cbe00d
> > > --- /dev/null
> > > +++ b/target/i386/kvm/vmsr_energy.h
> > > @@ -0,0 +1,80 @@
> > > +/*
> > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > + *
> > > + * Copyright 2023 Red Hat, Inc. 2023
> > > + *
> > > + *  Author:
> > > + *      Anthony Harivel <aharivel@redhat.com>
> > > + *
> > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > + * See the COPYING file in the top-level directory.
> > > + *
> > > + */
> > > +
> > > +#ifndef VMSR_ENERGY_H
> > > +#define VMSR_ENERGY_H
> > > +
> > > +#include "qemu/osdep.h"
> > > +
> > > +#include <numa.h>
> > > +
> > > +/*
> > > + * Define the interval time in micro seconds between 2 samples of
> > > + * energy related MSRs
> > > + */
> > > +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> > > +
> > > +/*
> > > + * Thread statistic
> > > + * @ thread_id: TID (thread ID)
> > > + * @ is_vcpu: true is thread is vCPU thread
> > > + * @ cpu_id: CPU number last executed on
> > > + * @ vcpu_id: vCPU ID
> > > + * @ numa_node_id:node number of the CPU
> > > + * @ utime: amount of clock ticks the thread
> > > + *          has been scheduled in User mode
> > > + * @ stime: amount of clock ticks the thread
> > > + *          has been scheduled in System mode
> > > + * @ delta_ticks: delta of utime+stime between
> > > + *          the two samples (before/after sleep)
> > > + */
> > > +struct thread_stat {
> > > +    unsigned int thread_id;
> > > +    bool is_vcpu;
> > > +    unsigned int cpu_id;
> > > +    unsigned int vcpu_id;
> > > +    unsigned int numa_node_id;
> > > +    unsigned long long *utime;
> > > +    unsigned long long *stime;
> > > +    unsigned long long delta_ticks;
> > > +};
> > > +
> > > +/*
> > > + * Package statistic
> > > + * @ e_start: package energy counter before the sleep
> > > + * @ e_end: package energy counter after the sleep
> > > + * @ e_delta: delta of package energy counter
> > > + * @ e_ratio: store the energy ratio of non-vCPU thread
> > > + * @ nb_vcpu: number of vCPU running on this package
> > > + */
> > > +struct packge_energy_stat {
> > > +    uint64_t e_start;
> > > +    uint64_t e_end;
> > > +    uint64_t e_delta;
> > > +    uint64_t e_ratio;
> > > +    unsigned int nb_vcpu;
> > > +};
> > > +
> > > +typedef struct thread_stat thread_stat;
> > > +typedef struct packge_energy_stat package_energy_stat;
> > > +
> > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> > > +void delta_ticks(thread_stat *thd_stat, int i);
> > > +unsigned int get_maxcpus(unsigned int package_num);
> > > +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> > > +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> > > +double get_ratio(package_energy_stat *pkg_stat,
> > > +                        thread_stat *thd_stat,
> > > +                        int maxticks, int i);
> > > +
> > > +#endif /* VMSR_ENERGY_H */
> > > -- 
> > > 2.40.1
> > > 
> > > 
> 
>
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Anthony Harivel 11 months, 2 weeks ago
Marcelo Tosatti, May 19, 2023 at 20:28:

Hi Marcelo,

> > > > +    /* Assuming those values are the same accross physical system/packages */
> > > > +    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
> > > > +    maxpkgs = numa_max_node(); /* Number of Packages on the system */
>
> numa_max_node() returns the highest node number available on the current system. 
> (See the node numbers in /sys/devices/system/node/ ). Also see numa_num_configured_nodes().
>
> One can find package topology information from
> /sys/devices/system/cpu/cpuX/topology/
>

Good point. 
I will find a better solution to identify the topology using your hint. 
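
As a first idea, a minimal sketch of what I have in mind, using the
sysfs attribute you pointed to (hypothetical helper, not part of the
current patch):

    #include <stdio.h>

    /* Map a host CPU to its physical package via sysfs. */
    static int host_cpu_to_package(unsigned int cpu)
    {
        char path[128];
        int pkg = -1;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id",
                 cpu);
        f = fopen(path, "r");
        if (!f) {
            return -1;
        }
        if (fscanf(f, "%d", &pkg) != 1) {
            pkg = -1;
        }
        fclose(f);
        return pkg;
    }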

> > > > +        /* Allocate memory for each thread stats */
> > > > +        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));
>
> Can you keep this pre-allocated ? And all other data as well.

Ok no problem.

> > > > +        /* Retrieve all packages power plane energy counter */
> > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > +            for (int j = 0; j < num_threads; j++) {
> > > > +                /*
> > > > +                 * Use the first thread we found that ran on the CPU
> > > > +                 * of the package to read the packages energy counter
> > > > +                 */
> > > > +                if (thd_stat[j].numa_node_id == i) {
> > > > +                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
> > > > +                    break;
> > > > +                }
> > > > +            }
> > > > +        }
>
> NUMA node does not map necessarily to one package.

True. I will update this part together with the topology info
discussed above.

>
> > > > +        /* Sleep a short period while the other threads are working */
> > > > +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> > > > +
> > > > +        /*
> > > > +         * Retrieve all packages power plane energy counter
> > > > +         * Calculate the delta of all packages
> > > > +         */
> > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > +            for (int j = 0; j < num_threads; j++) {
> > > > +                /*
> > > > +                 * Use the first thread we found that ran on the CPU
> > > > +                 * of the package to read the packages energy counter
> > > > +                 */
> > > > +                if (thd_stat[j].numa_node_id == i) {
> > > > +                    pkg_stat[i].e_end =
> > > > +                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);
>
> This is excessive (to read the MSRs of each package in the system).
>
> Consider 100 Linux guests all of them with this enabled, on a system with
> 4 packages. How many times you'll be reading MSR of each package?

The problem here is that you can have vCPUs running on different
packages, while the energy counters of the different packages increase
independently.
Either we somehow "force" users to run only on the same package, or I'm
afraid we are obliged to read the energy counter of every package that
is involved in the VM.

Imagine this:

|----pkg-0----|----pkg-1----|
|0|1|2|3|4|5|6|0|1|2|3|4|5|6|
|       |       |       |
| vm-0  |  vm-1 |  vm-2 |

Only vm-1, which has cores from both pkg-0 and pkg-1, would have to
read both package energy counters; vm-0 would only read pkg-0 and vm-2
only pkg-1. A rough sketch of that selection follows below.
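
Something along these lines (placeholder names; host_cpu_to_package()
is the hypothetical helper sketched above):

    #include <stdbool.h>

    #define MAX_PKGS 8   /* placeholder upper bound */

    /* Mark the packages the QEMU threads last ran on, so that only
     * those energy counters get read afterwards. */
    static void mark_involved_packages(const unsigned int *last_cpu,
                                       int nthreads,
                                       bool involved[MAX_PKGS])
    {
        for (int i = 0; i < nthreads; i++) {
            int pkg = host_cpu_to_package(last_cpu[i]);

            if (pkg >= 0 && pkg < MAX_PKGS) {
                involved[pkg] = true;
            }
        }
    }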


>
> Moreover, don't want to readmsr on an isolated CPU.
>

Could you explain why?

 > 
> > No problem, let me try to explain: 
> > a QEMU process is composed of vCPU thread(s) and non-vCPU thread(s) (IO,
> > emulated device,...). Each of those threads can run on different cores
> > that can belongs to the same Package or not.
> > The MSR_PKG_ENERGY_STATUS is a counter that increment for the whole
> > Package domain. If you read this MSR from any core that belongs to the 
> > package, you will retrieve the same number.
> > So when I need to read the MSR, I only need to read once for all the
> > threads that are running on cores of each Package.
>
> T=0	read p1v0 (== MSR_PKG_ENERGY_STATUS)
> T=1	vcpu-1 executing in core1
> 	vcpu-2 executing in core2
> 	vcpu-3 executing in core2
> T=2	read p1v1
>
> Won't you be exposing (p1v1-p1v0)/3 by 3 in this case, to the 
> virtual MSR_PKG_ENERGY_STATUS?
>

No, because if we take for example a package with 4 cores and 100 ticks
per second, the maximum number of ticks for all the cores in this
package would be 400 ticks per second.
So if 2 or more vCPUs are sharing the same core, they are capped at
100 ticks per second anyway, and the ratio will still be correct; a
small numeric example follows below.
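
With illustrative numbers: maxticks = 400 for that package over a 1 s
interval. If vcpu-2 and vcpu-3 both run on core2, together they can
accumulate at most 100 ticks, so their combined share of the package
delta is capped at 100/400, exactly as if a single vCPU had been
running on that core.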


> > Now let's talk about the implementation of the emulated value. 
> > I've created a thread that does in an infinite loop the following:
> > - Retrieve all the QEMU threads + statistics about them 
> > - Read the energy counter of each Package involved 
> > - Sleep for 1sec (1sec is arbitrary)
> > - Calculate the delta of ticks for each threads so that we know how much
> >   time each threads has been scheduled during the Sleep period
>
> Intel docs mention the counter can overflow (faster with higher 
> energy consumption). Are you handling the overflow case?
>

Should we? On bare metal, anyone reading the MSRs has to handle the
overflow case; I guess in a guest this should be the same.
But I'll make sure *not* to output negative energy values, for sure!
A sketch of how the wraparound could be handled follows below.
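
Something like this, assuming the 32-bit "total energy consumed" field
documented for MSR_PKG_ENERGY_STATUS (illustrative helper, not in the
patch):

    #include <stdint.h>

    /*
     * The accumulated energy lives in the low 32 bits of the MSR, so a
     * delta computed in 32-bit unsigned arithmetic stays correct across
     * a single counter wraparound.
     */
    static uint64_t vmsr_energy_delta(uint64_t before, uint64_t after)
    {
        return (uint32_t)((uint32_t)after - (uint32_t)before);
    }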

> > - Read again the energy counter of each Package involved and calculate
> >   the delta of the counter so that we know how much the counter has
> >   increased during the Sleep period
> > - Calculate the ratio for each vCPU thread and deduct the energy spent
> >   for each vCPU
> > - Calculate the amount of energy spent for all non-vCPU and evenly
> >   spread it to the vCPU
> > - Update each virtual MSR for each vCPU
> > 
> > Obviously, this is working better and more consistently with vCPU pinning 
> > and proper isolation of the cores in the package. 
>
> Do you really need to measure the consumption for QEMU threads? I'd say
> only measuring for the vcpus is sufficient.
>

What if a process in the guest is actually doing a lot of IO?
The QEMU workers handling that IO would be consuming energy too, no?
But if this is insignificant, I can remove it.

> > So those virtual MSRs are updated roughly each second (could be changed
> > by updating MSR_ENERGY_THREAD_SLEEP_US). Compared to the real MSRs which 
> > are updated by the microcode every 1ms.
>
> How useful are the measurements for the guest, then? (I mean, doesnt a
> 1 second interval render the measurements less useful).
>

We can always reduce the sleep time.
I guess we could also make it configurable with a parameter like:
-accel kvm,rapl=true,update-period=500

where update-period is the interval in milliseconds between updates.

> > Concerning the "real" power consumption, we have to not forget that the
> > RAPL interface energy data is not the result of a physical measurement. 
> > It is a set of architectural events from each core, processor
> > graphic, IO, etc. and combines them with energy weights to predict the
> > package's active power consumption.
> > 
> > IMO it is not really important because the idea behind this patch is to
> > give estimated values to the guests so that software running inside
> > VM can make use of power tools which are all reading those MSRs (or the
> > RAPL driver sys interface) to retrieve power consumption.
>
> Can you describe some use-cases... Because what seems to be useful
> (actually, what seem to be possible) is for users to measure the 
> energy consumption of a package. 
>

Yes, and here they will do the same, except that for the moment I don't
have any solution to overcome the fact that each vCPU has its own
package.
When software reads MSR_PKG_ENERGY_STATUS of pkg-0, it will read the
energy of vCPU 0, because we have a single vCPU per virtual socket.
I would say it might give better granularity of the power consumption:
if a process in the guest runs only on one isolated vCPU, reading the
package MSR of that vCPU theoretically gives the consumption of this
process directly, whereas on bare metal we need to take the other cores
of the package into account. See the example below.
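
For instance, with -accel kvm,rapl=true and -smp 2,sockets=2, and
assuming the guest's intel_rapl driver binds on top of these emulated
MSRs, reading the powercap sysfs inside the guest would report the
energy attributed to each vCPU:

    guest$ cat /sys/class/powercap/intel-rapl:0/energy_uj   # vCPU 0
    guest$ cat /sys/class/powercap/intel-rapl:1/energy_uj   # vCPU 1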

> So if you think, OK i can read the energy consumption information
> from within a guest, but i know its the energy consumption divided
> by the amount of time the qemu threads executed, how useful that
> measure is ?

It gives what the guest has actually consumed, and inside the guest
there are processes running.
There are many tools [1][2][3] that use those MSRs as metrics to
calculate the power consumption of applications/processes in order to
optimize their code.

What I'm trying to achieve here is enabling these possibilities inside
a VM.

>
> Why not expose the actual delta from the MSRs, rather than diving
> by amount of time the qemu threads execute.
>

Imagine 2 identical VMs on the same package:
VM-1 is hogging cores and VM-2 is doing nothing.
If we only reported the delta of the MSRs, both VMs would appear to
consume the same amount, which is wrong.
This is why I'm computing a ratio of the energy based on the QEMU
threads: a vCPU is just a thread, and /proc/<pid>/task/<tid>/stat gives
us the amount of time spent by each thread on the cores. A numeric
example follows below.
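
With made-up numbers: say the package counter increases by 60000 units
during the interval and maxticks = 400. If VM-1's vCPU was scheduled
for 300 ticks and VM-2's vCPU for 4 ticks, VM-1 is credited
60000 * 300/400 = 45000 units and VM-2 only 60000 * 4/400 = 600 units,
instead of both reporting the full 60000.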

Thanks a lot for your input,

Anthony



[1]: https://github.com/sustainable-computing-io/kepler
[2]: https://powerapi.org/
[3]: https://github.com/hubblo-org/scaphandre



> > > > +
> > > > +        /* Delta of ticks spend by each thread between the sample */
> > > > +        for (int i = 0; i < num_threads; i++) {
> > > > +            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
> > > > +                /*
> > > > +                 * We don't count the dead thread
> > > > +                 * i.e threads that existed before the sleep
> > > > +                 * and not anymore
> > > > +                 */
> > > > +                thd_stat[i].delta_ticks = 0;
> > > > +            } else {
> > > > +                delta_ticks(thd_stat, i);
> > > > +            }
> > > > +        }
> > > > +
> > > > +        /*
> > > > +         * Identify the vCPU threads
> > > > +         * Calculate the Number of vCPU per package
> > > > +         */
> > > > +        CPU_FOREACH(cpu) {
> > > > +            for (int i = 0; i < num_threads; i++) {
> > > > +                if (cpu->thread_id == thd_stat[i].thread_id) {
> > > > +                    thd_stat[i].is_vcpu = true;
> > > > +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> > > > +                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
> > > > +                    break;
> > > > +                }
> > > > +            }
> > > > +        }
> > > > +
> > > > +        /* Calculate the total energy of all non-vCPU thread */
> > > > +        for (int i = 0; i < num_threads; i++) {
> > > > +            double temp;
> > > > +            if ((thd_stat[i].is_vcpu != true) &&
> > > > +                (thd_stat[i].delta_ticks > 0)) {
> > > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > > +                pkg_stat[thd_stat[i].numa_node_id].e_ratio
> > > > +                    += (uint64_t)lround(temp);
> > > > +            }
> > > > +        }
> > > > +
> > > > +        /* Calculate the ratio per non-vCPU thread of each package */
> > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > +            if (pkg_stat[i].nb_vcpu > 0) {
> > > > +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> > > > +            }
> > > > +        }
> > > > +
> > > > +        /* Calculate the energy for each vCPU thread */
> > > > +        for (int i = 0; i < num_threads; i++) {
> > > > +            double temp;
> > > > +
> > > > +            if ((thd_stat[i].is_vcpu == true) &&
> > > > +                (thd_stat[i].delta_ticks > 0)) {
> > > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > > +                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
> > > > +                vmsr->msr_value[thd_stat[i].vcpu_id] \
> > > > +                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
> > > > +            }
> > > > +        }
> > > > +
> > > > +        /* free all memory */
> > > > +        for (int i = 0; i < num_threads; i++) {
> > > > +            free(thd_stat[i].utime);
> > > > +            free(thd_stat[i].stime);
> > > > +        }
> > > > +        free(thd_stat);
> > > > +        free(thread_ids);
> > > > +    }
> > > > +
> > > > +    rcu_unregister_thread();
> > > > +    return NULL;
> > > > +}
> > > > +
> > > > +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> > > > +{
> > > > +    struct KVMMsrEnergy *r = &s->msr_energy;
> > > > +
> > > > +    /* Retrieve the number of vCPU */
> > > > +    r->cpus = ms->smp.cpus;
> > > > +
> > > > +    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
> > > > +    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
> > > > +
> > > > +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> > > > +                       kvm_msr_energy_thread,
> > > > +                       s, QEMU_THREAD_JOINABLE);
> > > > +
> > > > +    return 0;
> > > > +}
> > > > +
> > > >  int kvm_arch_init(MachineState *ms, KVMState *s)
> > > >  {
> > > >      uint64_t identity_base = 0xfffbc000;
> > > > @@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
> > > >                           strerror(-ret));
> > > >              exit(1);
> > > >          }
> > > > +
> > > > +        if (s->msr_energy.enable == true) {
> > > > +
> > > > +            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
> > > > +                               kvm_rdmsr_rapl_power_unit, NULL);
> > > > +            if (!r) {
> > > > +                error_report("Could not install MSR_RAPL_POWER_UNIT \
> > > > +                                handler: %s",
> > > > +                             strerror(-ret));
> > > > +                exit(1);
> > > > +            }
> > > > +
> > > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
> > > > +                               kvm_rdmsr_pkg_power_limit, NULL);
> > > > +            if (!r) {
> > > > +                error_report("Could not install MSR_PKG_POWER_LIMIT \
> > > > +                                handler: %s",
> > > > +                             strerror(-ret));
> > > > +                exit(1);
> > > > +            }
> > > > +
> > > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
> > > > +                               kvm_rdmsr_pkg_power_info, NULL);
> > > > +            if (!r) {
> > > > +                error_report("Could not install MSR_PKG_POWER_INFO \
> > > > +                                handler: %s",
> > > > +                             strerror(-ret));
> > > > +                exit(1);
> > > > +            }
> > > > +            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
> > > > +                               kvm_rdmsr_pkg_energy_status, NULL);
> > > > +            if (!r) {
> > > > +                error_report("Could not install MSR_PKG_ENERGY_STATUS \
> > > > +                                handler: %s",
> > > > +                             strerror(-ret));
> > > > +                exit(1);
> > > > +            } else {
> > > > +                kvm_msr_energy_thread_init(s, ms);
> > > > +            }
> > > > +        }
> > > >      }
> > > >  
> > > >      return 0;
> > > > diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> > > > index 322272091bce..9cdc93c6c439 100644
> > > > --- a/target/i386/kvm/meson.build
> > > > +++ b/target/i386/kvm/meson.build
> > > > @@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
> > > >  i386_softmmu_kvm_ss.add(files(
> > > >    'kvm.c',
> > > >    'kvm-cpu.c',
> > > > +  'vmsr_energy.c',
> > > >  ))
> > > >  
> > > >  i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
> > > > diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> > > > new file mode 100644
> > > > index 000000000000..8bd86b32becf
> > > > --- /dev/null
> > > > +++ b/target/i386/kvm/vmsr_energy.c
> > > > @@ -0,0 +1,132 @@
> > > > +/*
> > > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > > + *
> > > > + * Copyright 2023 Red Hat, Inc.
> > > > + *
> > > > + *  Author:
> > > > + *      Anthony Harivel <aharivel@redhat.com>
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > > + * See the COPYING file in the top-level directory.
> > > > + *
> > > > + */
> > > > +
> > > > +#include "vmsr_energy.h"
> > > > +
> > > > +#define MAX_PATH_LEN 50
> > > > +#define MAX_LINE_LEN 500
> > > > +
> > > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
> > > > +{
> > > > +    int fd;
> > > > +    uint64_t data;
> > > > +
> > > > +    char path[MAX_PATH_LEN];
> > > > +    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
> > > > +
> > > > +    fd = open(path , O_RDONLY);
> > > > +    if (fd < 0) {
> > > > +        return 0;
> > > > +    }
> > > > +    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
> > > > +        data = 0;
> > > > +    }
> > > > +
> > > > +    close(fd);
> > > > +    return data;
> > > > +}
> > > > +
> > > > +/* Retrieve the number of physical CPU on the package */
> > > > +unsigned int get_maxcpus(unsigned int package_num)
> > > > +{
> > > > +    int k, ncpus;
> > > > +    unsigned int maxcpus;
> > > > +    struct bitmask *cpus;
> > > > +
> > > > +    cpus = numa_allocate_cpumask();
> > > > +    ncpus = cpus->size;
> > > > +
> > > > +    if (numa_node_to_cpus(package_num, cpus) < 0) {
> > > > +        printf("node %u failed to convert\n", package_num);
> > > > +    }
> > > > +
> > > > +    maxcpus = 0;
> > > > +    for (k = 0; k < ncpus; k++) {
> > > > +        if (numa_bitmask_isbitset(cpus, k)) {
> > > > +            maxcpus++;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    return maxcpus;
> > > > +}
> > > > +
> > > > +int read_thread_stat(struct thread_stat *thread, int pid, int index)
> > > > +{
> > > > +    char path[MAX_PATH_LEN];
> > > > +    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
> > > > +             thread->thread_id);
> > > > +
> > > > +    FILE *file = fopen(path, "r");
> > > > +    if (file == NULL) {
> > > > +        return -1;
> > > > +    }
> > > > +
> > > > +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> > > > +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> > > > +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> > > > +           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
> > > > +        return -1;
> > > > +
> > > > +    fclose(file);
> > > > +    return 0;
> > > > +}
> > > > +
> > > > +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> > > > +pid_t *get_thread_ids(pid_t pid, int *num_threads)
> > > > +{
> > > > +    char path[100];
> > > > +    sprintf(path, "/proc/%d/task", pid);
> > > > +
> > > > +    DIR *dir = opendir(path);
> > > > +    if (dir == NULL) {
> > > > +        perror("opendir");
> > > > +        return NULL;
> > > > +    }
> > > > +
> > > > +    pid_t *thread_ids = NULL;
> > > > +    int thread_count = 0;
> > > > +
> > > > +    struct dirent *ent;
> > > > +    while ((ent = readdir(dir)) != NULL) {
> > > > +        if (ent->d_name[0] == '.') {
> > > > +            continue;
> > > > +        }
> > > > +        pid_t tid = atoi(ent->d_name);
> > > > +        if (pid != tid) {
> > > > +            thread_ids = realloc(thread_ids,
> > > > +                                 (thread_count + 1) * sizeof(pid_t));
> > > > +            thread_ids[thread_count] = tid;
> > > > +            thread_count++;
> > > > +        }
> > > > +    }
> > > > +
> > > > +    closedir(dir);
> > > > +
> > > > +    *num_threads = thread_count;
> > > > +    return thread_ids;
> > > > +}
> > > > +
> > > > +void delta_ticks(thread_stat *thd_stat, int i)
> > > > +{
> > > > +    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
> > > > +                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
> > > > +}
> > > > +
> > > > +double get_ratio(package_energy_stat *pkg_stat,
> > > > +                        thread_stat *thd_stat,
> > > > +                        int maxticks, int i) {
> > > > +
> > > > +    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
> > > > +            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
> > > > +}
> > > > +
> > > > diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> > > > new file mode 100644
> > > > index 000000000000..5f79d2cbe00d
> > > > --- /dev/null
> > > > +++ b/target/i386/kvm/vmsr_energy.h
> > > > @@ -0,0 +1,80 @@
> > > > +/*
> > > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > > + *
> > > > + * Copyright 2023 Red Hat, Inc.
> > > > + *
> > > > + *  Author:
> > > > + *      Anthony Harivel <aharivel@redhat.com>
> > > > + *
> > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > > + * See the COPYING file in the top-level directory.
> > > > + *
> > > > + */
> > > > +
> > > > +#ifndef VMSR_ENERGY_H
> > > > +#define VMSR_ENERGY_H
> > > > +
> > > > +#include "qemu/osdep.h"
> > > > +
> > > > +#include <numa.h>
> > > > +
> > > > +/*
> > > > + * Define the interval time in micro seconds between 2 samples of
> > > > + * energy related MSRs
> > > > + */
> > > > +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> > > > +
> > > > +/*
> > > > + * Thread statistic
> > > > + * @ thread_id: TID (thread ID)
> > > > + * @ is_vcpu: true if the thread is a vCPU thread
> > > > + * @ cpu_id: CPU number last executed on
> > > > + * @ vcpu_id: vCPU ID
> > > > + * @ numa_node_id: node number of the CPU
> > > > + * @ utime: amount of clock ticks the thread
> > > > + *          has been scheduled in User mode
> > > > + * @ stime: amount of clock ticks the thread
> > > > + *          has been scheduled in System mode
> > > > + * @ delta_ticks: delta of utime+stime between
> > > > + *          the two samples (before/after sleep)
> > > > + */
> > > > +struct thread_stat {
> > > > +    unsigned int thread_id;
> > > > +    bool is_vcpu;
> > > > +    unsigned int cpu_id;
> > > > +    unsigned int vcpu_id;
> > > > +    unsigned int numa_node_id;
> > > > +    unsigned long long *utime;
> > > > +    unsigned long long *stime;
> > > > +    unsigned long long delta_ticks;
> > > > +};
> > > > +
> > > > +/*
> > > > + * Package statistic
> > > > + * @ e_start: package energy counter before the sleep
> > > > + * @ e_end: package energy counter after the sleep
> > > > + * @ e_delta: delta of package energy counter
> > > > + * @ e_ratio: store the energy ratio of non-vCPU thread
> > > > + * @ nb_vcpu: number of vCPU running on this package
> > > > + */
> > > > +struct package_energy_stat {
> > > > +    uint64_t e_start;
> > > > +    uint64_t e_end;
> > > > +    uint64_t e_delta;
> > > > +    uint64_t e_ratio;
> > > > +    unsigned int nb_vcpu;
> > > > +};
> > > > +
> > > > +typedef struct thread_stat thread_stat;
> > > > +typedef struct package_energy_stat package_energy_stat;
> > > > +
> > > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> > > > +void delta_ticks(thread_stat *thd_stat, int i);
> > > > +unsigned int get_maxcpus(unsigned int package_num);
> > > > +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> > > > +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> > > > +double get_ratio(package_energy_stat *pkg_stat,
> > > > +                        thread_stat *thd_stat,
> > > > +                        int maxticks, int i);
> > > > +
> > > > +#endif /* VMSR_ENERGY_H */
> > > > -- 
> > > > 2.40.1
> > > > 
> > > > 
> > 
> > 
Re: [RFC PATCH] Add support for RAPL MSRs in KVM/Qemu
Posted by Marcelo Tosatti 11 months, 2 weeks ago
On Wed, May 24, 2023 at 04:53:49PM +0200, Anthony Harivel wrote:
> 
> Marcelo Tosatti, May 19, 2023 at 20:28:
> 
> Hi Marcelo,
> 
> > > > > +    /* Assuming those values are the same across physical system/packages */
> > > > > +    maxcpus = get_maxcpus(0); /* Number of CPUS per packages */
> > > > > +    maxpkgs = numa_max_node(); /* Number of Packages on the system */
> >
> > numa_max_node() returns the highest node number available on the current system. 
> > (See the node numbers in /sys/devices/system/node/ ). Also see numa_num_configured_nodes().
> >
> > One can find package topology information from
> > /sys/devices/system/cpu/cpuX/topology/
> >
> 
> Good point. 
> I will find a better solution to identify the topology using your hint. 
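
Building on the sysfs hint above, here is a minimal sketch (not part of the
patch; cpu_to_package() is a hypothetical helper name) of mapping a CPU to its
physical package via sysfs instead of relying on NUMA nodes:

    #include <stdio.h>

    /* Return the physical package id of @cpu, or -1 on error. */
    static int cpu_to_package(int cpu)
    {
        char path[80];
        int pkg = -1;
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%d/topology/physical_package_id",
                 cpu);
        f = fopen(path, "r");
        if (!f) {
            return -1;
        }
        if (fscanf(f, "%d", &pkg) != 1) {
            pkg = -1;
        }
        fclose(f);
        return pkg;
    }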
> 
> > > > > +        /* Allocate memory for each thread stats */
> > > > > +        thd_stat = (thread_stat *) calloc(num_threads, sizeof(thread_stat));
> >
> > Can you keep this pre-allocated ? And all other data as well.
> 
> Ok no problem.
> 
> > > > > +        /* Retrieve all packages power plane energy counter */
> > > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > > +            for (int j = 0; j < num_threads; j++) {
> > > > > +                /*
> > > > > +                 * Use the first thread we found that ran on the CPU
> > > > > +                 * of the package to read the packages energy counter
> > > > > +                 */
> > > > > +                if (thd_stat[j].numa_node_id == i) {
> > > > > +                    pkg_stat[i].e_start = read_msr(MSR_PKG_ENERGY_STATUS, i);
> > > > > +                    break;
> > > > > +                }
> > > > > +            }
> > > > > +        }
> >
> > A NUMA node does not necessarily map to one package.
> 
> True. I will update this part together with the topology info
> discussed above.
> 
> >
> > > > > +        /* Sleep a short period while the other threads are working */
> > > > > +        usleep(MSR_ENERGY_THREAD_SLEEP_US);
> > > > > +
> > > > > +        /*
> > > > > +         * Retrieve all packages power plane energy counter
> > > > > +         * Calculate the delta of all packages
> > > > > +         */
> > > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > > +            for (int j = 0; j < num_threads; j++) {
> > > > > +                /*
> > > > > +                 * Use the first thread we found that ran on the CPU
> > > > > +                 * of the package to read the packages energy counter
> > > > > +                 */
> > > > > +                if (thd_stat[j].numa_node_id == i) {
> > > > > +                    pkg_stat[i].e_end =
> > > > > +                        read_msr(MSR_PKG_ENERGY_STATUS, thd_stat[j].cpu_id);
> >
> > This is excessive (to read the MSRs of each package in the system).
> >
> > Consider 100 Linux guests, all of them with this enabled, on a system with
> > 4 packages. How many times will you be reading the MSR of each package?
> 
> The problem here is that you can have vCPUs that are running on different
> packages. However, the energy counters of the different packages increase
> independently.
> Either we somehow "force" users to run only on the same package, or I'm
> afraid we are obliged to read all the packages' energy counters (when they
> are involved in the VM).
> 
> Imagine this:
> 
> |----pkg-0----|----pkg-1----|
> |0|1|2|3|4|5|6|0|1|2|3|4|5|6|
> |       |       |       |
> | vm-0  |  vm-1 |  vm-2 |
> 
> Only vm-1, which has cores from both pkg-0 and pkg-1, would have to read
> both packages' energy. vm-0 would only read pkg-0 and vm-2 only pkg-1.
> 
> 
> >
> > Moreover, we don't want to read MSRs on an isolated CPU.
> >
> 
> Could you explain why?

Never mind, it's a separate topic.

> > > No problem, let me try to explain: 
> > > a QEMU process is composed of vCPU thread(s) and non-vCPU thread(s) (IO,
> > > emulated devices, ...). Each of those threads can run on different cores
> > > that may or may not belong to the same package.
> > > MSR_PKG_ENERGY_STATUS is a counter that increments for the whole
> > > package domain. If you read this MSR from any core that belongs to the
> > > package, you will retrieve the same number.
> > > So when I need to read the MSR, I only need to read it once per package,
> > > for all the threads that are running on that package's cores.
> >
> > T=0	read p1v0 (== MSR_PKG_ENERGY_STATUS)
> > T=1	vcpu-1 executing in core1
> > 	vcpu-2 executing in core2
> > 	vcpu-3 executing in core2
> > T=2	read p1v1
> >
> > Won't you be exposing (p1v1-p1v0)/3 to each of the 3 vCPUs in this case,
> > via the virtual MSR_PKG_ENERGY_STATUS?
> >
> 
> No, because if we take for example a package with 4 cores and 100 ticks per
> second, the maximum number of ticks for all the cores in this package would
> be 400 ticks per second.
> So if 2 or more vCPUs are sharing the same core, they will be capped at
> 100 ticks per second anyway, and so the ratio will still be correct.
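
To make the ratio concrete (numbers invented for illustration, following
get_ratio() in the patch): with 4 cores per package and 100 ticks per second,
maxticks = 400 for a 1-second sample. If the package counter advanced by
e_delta = 40,000,000 microjoules and a vCPU thread accumulated
delta_ticks = 100, it is credited e_delta * delta_ticks / maxticks =
40,000,000 * 100 / 400 = 10,000,000 microjoules. Two vCPUs time-sharing one
core can accumulate at most 100 ticks between them, so the attributed totals
can never exceed the package delta.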
> 
> 
> > > Now let's talk about the implementation of the emulated value. 
> > > I've created a thread that does in an infinite loop the following:
> > > - Retrieve all the QEMU threads + statistics about them 
> > > - Read the energy counter of each Package involved 
> > > - Sleep for 1sec (1sec is arbitrary)
> > > - Calculate the delta of ticks for each thread so that we know how much
> > >   time each thread has been scheduled during the Sleep period
> >
> > Intel docs mention the counter can overflow (faster with higher 
> > energy consumption). Are you handling the overflow case?
> >
> 
> Should we? On bare metal, anyone reading the MSRs should handle the
> overflow case; I guess on a guest this should be the same.
> But I'll make sure *not* to output negative energy values!

I mean to handle overflow in the emulation code, yes.
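
For reference, a minimal sketch of how the emulation code could absorb a wrap
of MSR_PKG_ENERGY_STATUS between two samples (the energy counter occupies
bits 31:0, so the delta can be taken modulo 2^32); this is an assumption about
one possible approach, not code from the patch:

    #include <stdint.h>

    /*
     * Delta of the package energy counter between two samples.
     * The counter is 32 bits wide, so unsigned 32-bit subtraction
     * gives the right result across a single wraparound.
     */
    static uint64_t vmsr_energy_delta(uint64_t e_start, uint64_t e_end)
    {
        return (uint32_t)((uint32_t)e_end - (uint32_t)e_start);
    }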

> 
> > > - Read again the energy counter of each Package involved and calculate
> > >   the delta of the counter so that we know how much the counter has
> > >   increased during the Sleep period
> > > - Calculate the ratio for each vCPU thread and deduce the energy spent
> > >   by each vCPU
> > > - Calculate the amount of energy spent by all non-vCPU threads and evenly
> > >   spread it across the vCPUs
> > > - Update each virtual MSR for each vCPU
> > > 
> > > Obviously, this works better and more consistently with vCPU pinning
> > > and proper isolation of the cores in the package.
> >
> > Do you really need to measure the consumption for QEMU threads? I'd say
> > only measuring for the vcpus is sufficient.
> >
> 
> What if a process in the guest is actually doing a lot of IO?
> The QEMU workers handling that IO would be consuming energy, no?
> But if this is insignificant I can remove it.

On second thought, I am not sure. Perhaps it makes sense to keep
the non-vCPU threads.

> 
> > > So those virtual MSRs are updated roughly once a second (this could be
> > > changed by updating MSR_ENERGY_THREAD_SLEEP_US), compared to the real
> > > MSRs, which are updated by the microcode every ~1ms.
> >
> > How useful are the measurements for the guest, then? (I mean, doesn't a
> > 1 second interval render the measurements less useful?)
> >
> 
> We can always reduce the sleeping time. 
> I guess we can also set it with a parameter like:
> -accel kvm,rapl=true,update-period=500
> 
> with update-period, the interval in milliseconds between updates. 

And that would increase the CPU time for emulation significantly
(for higher granularity the consumption goes way up),
which perhaps limits the use-cases.

I think what would be useful is an explanation, in a document, of the
behaviour of the MSR emulation in QEMU (how the MSR value is emulated, the
low update rate, etc.), including some example measurements.

Also consumption of energy by threads does not map 1<->1 to time spent
executing (different instructions have different consumption).

> > > Concerning the "real" power consumption, we must not forget that the
> > > RAPL interface energy data is not the result of a physical measurement.
> > > It takes a set of architectural events from each core, the processor
> > > graphics, IO, etc. and combines them with energy weights to predict the
> > > package's active power consumption.
> > > 
> > > IMO it is not really important, because the idea behind this patch is to
> > > give estimated values to the guests so that software running inside the
> > > VM can make use of power tools, which all read those MSRs (or the
> > > RAPL driver sysfs interface) to retrieve power consumption.
> >
> > Can you describe some use-cases... Because what seems to be useful
> > (actually, what seems to be possible) is for users to measure the
> > energy consumption of a package.
> >
> 
> Yes, and here they will do the same. Except that for the moment I don't
> have any solution to overcome the fact that each vCPU will have its own
> package.
> When software reads MSR_PKG_ENERGY_STATUS of pkg-0, it will also
> read the energy of vCPU 0, because we have a single vCPU per virtual
> socket.
> I would say it might give better granularity of the power consumption:
> if a process in the guest is running only on one isolated vCPU, reading the
> package MSR of this vCPU would theoretically give the consumption of this
> process directly, whereas on bare metal we need to take the other cores
> of the package into account.
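
As a guest-side usage sketch (an assumption about how a guest tool could read
the virtual MSR, not part of the patch; it assumes the msr module is loaded in
the guest and the one-vCPU-per-socket topology described above), reading
offset 0x611 (MSR_PKG_ENERGY_STATUS) of /dev/cpu/0/msr returns the energy
attributed to vCPU 0:

    #include <fcntl.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        uint64_t raw = 0;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
            return 1;
        }
        if (pread(fd, &raw, sizeof(raw), 0x611) != sizeof(raw)) {
            close(fd);
            return 1;
        }
        close(fd);
        /* Bits 31:0 hold the accumulated energy, in RAPL energy units
         * (see MSR_RAPL_POWER_UNIT). */
        printf("pkg-0 energy counter: %u\n", (unsigned int)(uint32_t)raw);
        return 0;
    }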
> 
> > So if you think, OK, I can read the energy consumption information
> > from within a guest, but I know it's the energy consumption divided
> > by the amount of time the QEMU threads executed, how useful is that
> > measure?
> 
> It gives what the guest has actually consumed

This is not necessarily true due to per-instruction consumption.

So a vCPU executing the same sequence of instructions, in a package,
sharing a core with a thread executing power-hungry instructions, will
have a different exposed consumption than if that power-hungry thread were
not executing.

I think AVX512 instructions are a good example of power-hungry
instructions.

> and inside the guest there
> are processes that are running.
> There are many tools [1][2][3] that use those MSRs as metrics to calculate
> the power consumption of applications/processes in order to optimize
> their code. 
> 
> What I'm trying to achieve here is enabling these possibilities inside a VM.

OK.

> 
> >
> > Why not expose the actual delta from the MSRs, rather than diving
> > by amount of time the qemu threads execute.
> >
> 
> Imagine 2 identical VMs on the same package: 
> VM-1 is hogging cores and VM-2 is doing nothing. 
> If we only report the delta of the MSRs, both VMs would appear to consume
> the same, which is wrong.

Yeah. So having the emulation described in a document allows users to
know what to expect.

> This is why I'm doing a ratio of the energy based on the QEMU threads:
> a vCPU is just a thread, and /proc/<pid>/stat can give us
> the amount of time spent by each thread on the cores.
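
With invented numbers to illustrate: suppose the package counter advances by
40 J during the 1-second sample and maxticks = 400. VM-1's busy vCPU thread
shows delta_ticks of about 100 in /proc/<pid>/task/<tid>/stat, so it is
credited roughly 40 * 100 / 400 = 10 J, while VM-2's idle vCPU shows
delta_ticks near 0 and is credited almost nothing; exposing the raw package
delta would instead have charged both VMs the full 40 J.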
> 
> Thanks a lot for your input,
> 
> Anthony

Thanks!

This is a nice investigation into RAPL:
https://helda.helsinki.fi/bitstream/handle/10138/321707/RAPL_in_Action_Experiences_in_Using_RAPL_for_Power_Measurements.pdf?sequence=1

> 
> 
> 
> [1]: https://github.com/sustainable-computing-io/kepler
> [2]: https://powerapi.org/
> [3]: https://github.com/hubblo-org/scaphandre
> 
> 
> 
> > > > > +
> > > > > +        /* Delta of ticks spent by each thread between the samples */
> > > > > +        for (int i = 0; i < num_threads; i++) {
> > > > > +            if (read_thread_stat(&thd_stat[i], pid, 1) != 0) {
> > > > > +                /*
> > > > > +                 * We don't count the dead thread
> > > > > +                 * i.e threads that existed before the sleep
> > > > > +                 * and not anymore
> > > > > +                 */
> > > > > +                thd_stat[i].delta_ticks = 0;
> > > > > +            } else {
> > > > > +                delta_ticks(thd_stat, i);
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        /*
> > > > > +         * Identify the vCPU threads
> > > > > +         * Calculate the Number of vCPU per package
> > > > > +         */
> > > > > +        CPU_FOREACH(cpu) {
> > > > > +            for (int i = 0; i < num_threads; i++) {
> > > > > +                if (cpu->thread_id == thd_stat[i].thread_id) {
> > > > > +                    thd_stat[i].is_vcpu = true;
> > > > > +                    thd_stat[i].vcpu_id = cpu->cpu_index;
> > > > > +                    pkg_stat[thd_stat[i].numa_node_id].nb_vcpu++;
> > > > > +                    break;
> > > > > +                }
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        /* Calculate the total energy of all non-vCPU thread */
> > > > > +        for (int i = 0; i < num_threads; i++) {
> > > > > +            double temp;
> > > > > +            if ((thd_stat[i].is_vcpu != true) &&
> > > > > +                (thd_stat[i].delta_ticks > 0)) {
> > > > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > > > +                pkg_stat[thd_stat[i].numa_node_id].e_ratio
> > > > > +                    += (uint64_t)lround(temp);
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        /* Calculate the ratio per non-vCPU thread of each package */
> > > > > +        for (int i = 0; i <= maxpkgs; i++) {
> > > > > +            if (pkg_stat[i].nb_vcpu > 0) {
> > > > > +                pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        /* Calculate the energy for each vCPU thread */
> > > > > +        for (int i = 0; i < num_threads; i++) {
> > > > > +            double temp;
> > > > > +
> > > > > +            if ((thd_stat[i].is_vcpu == true) &&
> > > > > +                (thd_stat[i].delta_ticks > 0)) {
> > > > > +                temp = get_ratio(pkg_stat, thd_stat, maxticks, i);
> > > > > +                vmsr->msr_value[thd_stat[i].vcpu_id] += (uint64_t)lround(temp);
> > > > > +                vmsr->msr_value[thd_stat[i].vcpu_id] \
> > > > > +                    += pkg_stat[thd_stat[i].numa_node_id].e_ratio;
> > > > > +            }
> > > > > +        }
> > > > > +
> > > > > +        /* free all memory */
> > > > > +        for (int i = 0; i < num_threads; i++) {
> > > > > +            free(thd_stat[i].utime);
> > > > > +            free(thd_stat[i].stime);
> > > > > +        }
> > > > > +        free(thd_stat);
> > > > > +        free(thread_ids);
> > > > > +    }
> > > > > +
> > > > > +    rcu_unregister_thread();
> > > > > +    return NULL;
> > > > > +}
> > > > > +
> > > > > +static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
> > > > > +{
> > > > > +    struct KVMMsrEnergy *r = &s->msr_energy;
> > > > > +
> > > > > +    /* Retrieve the number of vCPU */
> > > > > +    r->cpus = ms->smp.cpus;
> > > > > +
> > > > > +    /* Allocate register memory (MSR_PKG_STATUS) for each vCPU */
> > > > > +    r->msr_value = calloc(r->cpus, sizeof(r->msr_value));
> > > > > +
> > > > > +    qemu_thread_create(&r->msr_thr, "kvm-msr",
> > > > > +                       kvm_msr_energy_thread,
> > > > > +                       s, QEMU_THREAD_JOINABLE);
> > > > > +
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > >  int kvm_arch_init(MachineState *ms, KVMState *s)
> > > > >  {
> > > > >      uint64_t identity_base = 0xfffbc000;
> > > > > @@ -2765,6 +2998,46 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
> > > > >                           strerror(-ret));
> > > > >              exit(1);
> > > > >          }
> > > > > +
> > > > > +        if (s->msr_energy.enable == true) {
> > > > > +
> > > > > +            r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
> > > > > +                               kvm_rdmsr_rapl_power_unit, NULL);
> > > > > +            if (!r) {
> > > > > +                error_report("Could not install MSR_RAPL_POWER_UNIT \
> > > > > +                                handler: %s",
> > > > > +                             strerror(-ret));
> > > > > +                exit(1);
> > > > > +            }
> > > > > +
> > > > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
> > > > > +                               kvm_rdmsr_pkg_power_limit, NULL);
> > > > > +            if (!r) {
> > > > > +                error_report("Could not install MSR_PKG_POWER_LIMIT \
> > > > > +                                handler: %s",
> > > > > +                             strerror(-ret));
> > > > > +                exit(1);
> > > > > +            }
> > > > > +
> > > > > +            r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
> > > > > +                               kvm_rdmsr_pkg_power_info, NULL);
> > > > > +            if (!r) {
> > > > > +                error_report("Could not install MSR_PKG_POWER_INFO \
> > > > > +                                handler: %s",
> > > > > +                             strerror(-ret));
> > > > > +                exit(1);
> > > > > +            }
> > > > > +            r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
> > > > > +                               kvm_rdmsr_pkg_energy_status, NULL);
> > > > > +            if (!r) {
> > > > > +                error_report("Could not install MSR_PKG_ENERGY_STATUS \
> > > > > +                                handler: %s",
> > > > > +                             strerror(-ret));
> > > > > +                exit(1);
> > > > > +            } else {
> > > > > +                kvm_msr_energy_thread_init(s, ms);
> > > > > +            }
> > > > > +        }
> > > > >      }
> > > > >  
> > > > >      return 0;
> > > > > diff --git a/target/i386/kvm/meson.build b/target/i386/kvm/meson.build
> > > > > index 322272091bce..9cdc93c6c439 100644
> > > > > --- a/target/i386/kvm/meson.build
> > > > > +++ b/target/i386/kvm/meson.build
> > > > > @@ -5,6 +5,7 @@ i386_softmmu_kvm_ss = ss.source_set()
> > > > >  i386_softmmu_kvm_ss.add(files(
> > > > >    'kvm.c',
> > > > >    'kvm-cpu.c',
> > > > > +  'vmsr_energy.c',
> > > > >  ))
> > > > >  
> > > > >  i386_softmmu_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
> > > > > diff --git a/target/i386/kvm/vmsr_energy.c b/target/i386/kvm/vmsr_energy.c
> > > > > new file mode 100644
> > > > > index 000000000000..8bd86b32becf
> > > > > --- /dev/null
> > > > > +++ b/target/i386/kvm/vmsr_energy.c
> > > > > @@ -0,0 +1,132 @@
> > > > > +/*
> > > > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > > > + *
> > > > > + * Copyright 2023 Red Hat, Inc.
> > > > > + *
> > > > > + *  Author:
> > > > > + *      Anthony Harivel <aharivel@redhat.com>
> > > > > + *
> > > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > > > + * See the COPYING file in the top-level directory.
> > > > > + *
> > > > > + */
> > > > > +
> > > > > +#include "vmsr_energy.h"
> > > > > +
> > > > > +#define MAX_PATH_LEN 50
> > > > > +#define MAX_LINE_LEN 500
> > > > > +
> > > > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id)
> > > > > +{
> > > > > +    int fd;
> > > > > +    uint64_t data;
> > > > > +
> > > > > +    char path[MAX_PATH_LEN];
> > > > > +    snprintf(path, MAX_PATH_LEN, "/dev/cpu/%u/msr", cpu_id);
> > > > > +
> > > > > +    fd = open(path , O_RDONLY);
> > > > > +    if (fd < 0) {
> > > > > +        return 0;
> > > > > +    }
> > > > > +    if (pread(fd, &data, sizeof data, reg) != sizeof data) {
> > > > > +        data = 0;
> > > > > +    }
> > > > > +
> > > > > +    close(fd);
> > > > > +    return data;
> > > > > +}
> > > > > +
> > > > > +/* Retrieve the number of physical CPU on the package */
> > > > > +unsigned int get_maxcpus(unsigned int package_num)
> > > > > +{
> > > > > +    int k, ncpus;
> > > > > +    unsigned int maxcpus;
> > > > > +    struct bitmask *cpus;
> > > > > +
> > > > > +    cpus = numa_allocate_cpumask();
> > > > > +    ncpus = cpus->size;
> > > > > +
> > > > > +    if (numa_node_to_cpus(package_num, cpus) < 0) {
> > > > > +        printf("node %u failed to convert\n", package_num);
> > > > > +    }
> > > > > +
> > > > > +    maxcpus = 0;
> > > > > +    for (k = 0; k < ncpus; k++) {
> > > > > +        if (numa_bitmask_isbitset(cpus, k)) {
> > > > > +            maxcpus++;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    return maxcpus;
> > > > > +}
> > > > > +
> > > > > +int read_thread_stat(struct thread_stat *thread, int pid, int index)
> > > > > +{
> > > > > +    char path[MAX_PATH_LEN];
> > > > > +    snprintf(path, MAX_PATH_LEN, "/proc/%u/task/%d/stat", pid, \
> > > > > +             thread->thread_id);
> > > > > +
> > > > > +    FILE *file = fopen(path, "r");
> > > > > +    if (file == NULL) {
> > > > > +        return -1;
> > > > > +    }
> > > > > +
> > > > > +    if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
> > > > > +        " %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
> > > > > +        " %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
> > > > > +           &thread->utime[index], &thread->stime[index], &thread->cpu_id) != 3)
> > > > > +        return -1;
> > > > > +
> > > > > +    fclose(file);
> > > > > +    return 0;
> > > > > +}
> > > > > +
> > > > > +/* Read QEMU stat task folder to retrieve all QEMU threads ID */
> > > > > +pid_t *get_thread_ids(pid_t pid, int *num_threads)
> > > > > +{
> > > > > +    char path[100];
> > > > > +    sprintf(path, "/proc/%d/task", pid);
> > > > > +
> > > > > +    DIR *dir = opendir(path);
> > > > > +    if (dir == NULL) {
> > > > > +        perror("opendir");
> > > > > +        return NULL;
> > > > > +    }
> > > > > +
> > > > > +    pid_t *thread_ids = NULL;
> > > > > +    int thread_count = 0;
> > > > > +
> > > > > +    struct dirent *ent;
> > > > > +    while ((ent = readdir(dir)) != NULL) {
> > > > > +        if (ent->d_name[0] == '.') {
> > > > > +            continue;
> > > > > +        }
> > > > > +        pid_t tid = atoi(ent->d_name);
> > > > > +        if (pid != tid) {
> > > > > +            thread_ids = realloc(thread_ids,
> > > > > +                                 (thread_count + 1) * sizeof(pid_t));
> > > > > +            thread_ids[thread_count] = tid;
> > > > > +            thread_count++;
> > > > > +        }
> > > > > +    }
> > > > > +
> > > > > +    closedir(dir);
> > > > > +
> > > > > +    *num_threads = thread_count;
> > > > > +    return thread_ids;
> > > > > +}
> > > > > +
> > > > > +void delta_ticks(thread_stat *thd_stat, int i)
> > > > > +{
> > > > > +    thd_stat[i].delta_ticks = (thd_stat[i].utime[1] + thd_stat[i].stime[1])
> > > > > +                            - (thd_stat[i].utime[0] + thd_stat[i].stime[0]);
> > > > > +}
> > > > > +
> > > > > +double get_ratio(package_energy_stat *pkg_stat,
> > > > > +                        thread_stat *thd_stat,
> > > > > +                        int maxticks, int i) {
> > > > > +
> > > > > +    return (pkg_stat[thd_stat[i].numa_node_id].e_delta / 100.0)
> > > > > +            * ((100.0 / maxticks) * thd_stat[i].delta_ticks);
> > > > > +}
> > > > > +
> > > > > diff --git a/target/i386/kvm/vmsr_energy.h b/target/i386/kvm/vmsr_energy.h
> > > > > new file mode 100644
> > > > > index 000000000000..5f79d2cbe00d
> > > > > --- /dev/null
> > > > > +++ b/target/i386/kvm/vmsr_energy.h
> > > > > @@ -0,0 +1,80 @@
> > > > > +/*
> > > > > + * QEMU KVM support -- x86 virtual energy-related MSR.
> > > > > + *
> > > > > + * Copyright 2023 Red Hat, Inc.
> > > > > + *
> > > > > + *  Author:
> > > > > + *      Anthony Harivel <aharivel@redhat.com>
> > > > > + *
> > > > > + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> > > > > + * See the COPYING file in the top-level directory.
> > > > > + *
> > > > > + */
> > > > > +
> > > > > +#ifndef VMSR_ENERGY_H
> > > > > +#define VMSR_ENERGY_H
> > > > > +
> > > > > +#include "qemu/osdep.h"
> > > > > +
> > > > > +#include <numa.h>
> > > > > +
> > > > > +/*
> > > > > + * Define the interval time in micro seconds between 2 samples of
> > > > > + * energy related MSRs
> > > > > + */
> > > > > +#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
> > > > > +
> > > > > +/*
> > > > > + * Thread statistic
> > > > > + * @ thread_id: TID (thread ID)
> > > > > + * @ is_vcpu: true if the thread is a vCPU thread
> > > > > + * @ cpu_id: CPU number last executed on
> > > > > + * @ vcpu_id: vCPU ID
> > > > > + * @ numa_node_id: node number of the CPU
> > > > > + * @ utime: amount of clock ticks the thread
> > > > > + *          has been scheduled in User mode
> > > > > + * @ stime: amount of clock ticks the thread
> > > > > + *          has been scheduled in System mode
> > > > > + * @ delta_ticks: delta of utime+stime between
> > > > > + *          the two samples (before/after sleep)
> > > > > + */
> > > > > +struct thread_stat {
> > > > > +    unsigned int thread_id;
> > > > > +    bool is_vcpu;
> > > > > +    unsigned int cpu_id;
> > > > > +    unsigned int vcpu_id;
> > > > > +    unsigned int numa_node_id;
> > > > > +    unsigned long long *utime;
> > > > > +    unsigned long long *stime;
> > > > > +    unsigned long long delta_ticks;
> > > > > +};
> > > > > +
> > > > > +/*
> > > > > + * Package statistic
> > > > > + * @ e_start: package energy counter before the sleep
> > > > > + * @ e_end: package energy counter after the sleep
> > > > > + * @ e_delta: delta of package energy counter
> > > > > + * @ e_ratio: store the energy ratio of non-vCPU thread
> > > > > + * @ nb_vcpu: number of vCPU running on this package
> > > > > + */
> > > > > +struct package_energy_stat {
> > > > > +    uint64_t e_start;
> > > > > +    uint64_t e_end;
> > > > > +    uint64_t e_delta;
> > > > > +    uint64_t e_ratio;
> > > > > +    unsigned int nb_vcpu;
> > > > > +};
> > > > > +
> > > > > +typedef struct thread_stat thread_stat;
> > > > > +typedef struct package_energy_stat package_energy_stat;
> > > > > +
> > > > > +uint64_t read_msr(uint32_t reg, unsigned int cpu_id);
> > > > > +void delta_ticks(thread_stat *thd_stat, int i);
> > > > > +unsigned int get_maxcpus(unsigned int package_num);
> > > > > +int read_thread_stat(struct thread_stat *thread, int pid, int index);
> > > > > +pid_t *get_thread_ids(pid_t pid, int *num_threads);
> > > > > +double get_ratio(package_energy_stat *pkg_stat,
> > > > > +                        thread_stat *thd_stat,
> > > > > +                        int maxticks, int i);
> > > > > +
> > > > > +#endif /* VMSR_ENERGY_H */
> > > > > -- 
> > > > > 2.40.1