[PATCH v5 4/9] contrib/plugins/uftrace: implement tracing

Pierrick Bouvier posted 9 patches 4 months, 1 week ago
Maintainers: "Alex Bennée" <alex.bennee@linaro.org>, Alexandre Iooss <erdnaxe@crans.org>, Mahmoud Mandour <ma.mandourr@gmail.com>, Pierrick Bouvier <pierrick.bouvier@linaro.org>
There is a newer version of this series
[PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Pierrick Bouvier 4 months, 1 week ago
We implement tracing, following uftrace format.
Trace is flushed every 32 MB, so file operations don't impact
performance at runtime.

A different trace is generated per cpu, and we ensure they have a unique
name, based on vcpu_index, while keeping room for privilege level coming
in next commit.

Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
---
 contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 1 deletion(-)

diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
index d51faceb344..402a242433e 100644
--- a/contrib/plugins/uftrace.c
+++ b/contrib/plugins/uftrace.c
@@ -12,6 +12,13 @@
 #include <qemu-plugin.h>
 #include <glib.h>
 #include <stdio.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#define MiB     (INT64_C(1) << 20)
+#define NANOSECONDS_PER_SECOND 1000000000LL
 
 QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
 
@@ -24,6 +31,13 @@ typedef struct {
     uint64_t frame_pointer;
 } CallstackEntry;
 
+typedef struct {
+    GArray *t;
+    GString *path;
+    GString *name;
+    uint32_t id;
+} Trace;
+
 typedef struct Cpu Cpu;
 
 typedef struct {
@@ -34,6 +48,7 @@ typedef struct {
 } CpuOps;
 
 typedef struct Cpu {
+    Trace *trace;
     Callstack *cs;
     GByteArray *buf;
     CpuOps ops;
@@ -44,9 +59,41 @@ typedef struct {
     struct qemu_plugin_register *reg_fp;
 } Aarch64Cpu;
 
+typedef struct {
+    uint64_t timestamp;
+    uint64_t data;
+} UftraceEntry;
+
+typedef enum {
+    UFTRACE_ENTRY,
+    UFTRACE_EXIT,
+    UFTRACE_LOST,
+    UFTRACE_EVENT
+} UftraceRecordType;
+
 static struct qemu_plugin_scoreboard *score;
 static CpuOps arch_ops;
 
+static uint64_t gettime_ns(void)
+{
+#ifdef _WIN32
+    /*
+     * On Windows, timespec_get is available only with UCRT, but not with
+     * MinGW64 environment. Simplify by using only gettimeofday on this
+     * platform. This may result in a precision loss.
+     */
+    struct timeval tv;
+    gettimeofday(&tv, NULL);
+    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
+#else
+    /* We need nanosecond precision for short lived functions. */
+    struct timespec ts;
+    timespec_get(&ts, TIME_UTC);
+    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
+#endif
+    return now_ns;
+}
+
 static Callstack *callstack_new(void)
 {
     Callstack *cs = g_new0(Callstack, 1);
@@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
     return e;
 }
 
+static Trace *trace_new(uint32_t id, GString *name)
+{
+    Trace *t = g_new0(Trace, 1);
+    t->t = g_array_new(false, false, sizeof(UftraceEntry));
+    t->path = g_string_new(NULL);
+    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
+    t->name = g_string_new(name->str);
+    t->id = id;
+    return t;
+}
+
+static void trace_free(Trace *t)
+{
+    g_assert(t->t->len == 0);
+    g_array_free(t->t, true);
+    t->t = NULL;
+    g_string_free(t->path, true);
+    t->path = NULL;
+    g_string_free(t->name, true);
+    t->name = NULL;
+    g_free(t);
+}
+
+static void trace_flush(Trace *t, bool append)
+{
+    int create_dir = g_mkdir_with_parents("./uftrace.data",
+                                          S_IRWXU | S_IRWXG | S_IRWXO);
+    g_assert(create_dir == 0);
+    FILE *dat = fopen(t->path->str, append ? "a" : "w");
+    g_assert(dat);
+    GArray *data = t->t;
+    if (data->len) {
+        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
+    }
+    fclose(dat);
+    g_array_set_size(data, 0);
+}
+
+static void trace_add_entry(Trace *t, uint64_t timestamp, uint64_t pc,
+                            size_t depth, UftraceRecordType type)
+{
+    /* libmcount/record.c:record_event */
+    const uint64_t record_magic = 0x5;
+    uint64_t data = type | record_magic << 3;
+    data += depth << 6;
+    data += pc << 16;
+    UftraceEntry e = {.timestamp = timestamp, .data = data};
+    g_array_append_val(t->t, e);
+    if (t->t->len * sizeof(UftraceEntry) > 32 * MiB) {
+        trace_flush(t, true);
+    }
+}
+
+static void trace_enter_function(Trace *t, uint64_t timestamp,
+                                 uint64_t pc, size_t depth)
+{
+    trace_add_entry(t, timestamp, pc, depth, UFTRACE_ENTRY);
+}
+
+static void trace_exit_function(Trace *t, uint64_t timestamp,
+                                uint64_t pc, size_t depth)
+{
+    trace_add_entry(t, timestamp, pc, depth, UFTRACE_EXIT);
+}
+
+static void trace_enter_stack(Trace *t, Callstack *cs, uint64_t timestamp)
+{
+    for (size_t depth = 1; depth <= callstack_depth(cs); ++depth) {
+        trace_enter_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
+    }
+}
+
+static void trace_exit_stack(Trace *t, Callstack *cs, uint64_t timestamp)
+{
+    for (size_t depth = callstack_depth(cs); depth > 0; --depth) {
+        trace_exit_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
+    }
+}
+
 static uint64_t cpu_read_register64(Cpu *cpu, struct qemu_plugin_register *reg)
 {
     GByteArray *buf = cpu->buf;
@@ -223,7 +349,9 @@ static void track_callstack(unsigned int cpu_index, void *udata)
 {
     uint64_t pc = (uintptr_t) udata;
     Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index);
+    uint64_t timestamp = gettime_ns();
     Callstack *cs = cpu->cs;
+    Trace *t = cpu->trace;
 
     uint64_t fp = cpu->ops.get_frame_pointer(cpu);
     if (!fp && callstack_empty(cs)) {
@@ -232,6 +360,7 @@ static void track_callstack(unsigned int cpu_index, void *udata)
          * long as a proper call does not happen.
          */
         callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
+        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
         return;
     }
 
@@ -244,7 +373,8 @@ static void track_callstack(unsigned int cpu_index, void *udata)
     CallstackEntry caller = callstack_caller(cs);
     if (fp == caller.frame_pointer) {
         /* return */
-        callstack_pop(cs);
+        CallstackEntry e = callstack_pop(cs);
+        trace_exit_function(t, timestamp, e.pc, callstack_depth(cs));
         return;
     }
 
@@ -252,12 +382,16 @@ static void track_callstack(unsigned int cpu_index, void *udata)
     if (caller_fp == top.frame_pointer) {
         /* call */
         callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
+        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
         return;
     }
 
     /* discontinuity, exit current stack and unwind new one */
+    trace_exit_stack(t, cs, timestamp);
     callstack_clear(cs);
+
     cpu_unwind_stack(cpu, fp, pc);
+    trace_enter_stack(t, cs, timestamp);
 }
 
 static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
@@ -296,6 +430,16 @@ static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index)
     cpu->ops.init(cpu);
     cpu->buf = g_byte_array_new();
 
+    g_assert(vcpu_index < UINT32_MAX / 100);
+    /* trace_id is: cpu_number * 100 */
+    uint32_t trace_id = (vcpu_index + 1) * 100;
+
+    g_autoptr(GString) trace_name = g_string_new(NULL);
+    g_string_append_printf(trace_name, "cpu%u", vcpu_index);
+    cpu->trace = trace_new(trace_id, trace_name);
+    /* create/truncate trace file */
+    trace_flush(cpu->trace, false);
+
     cpu->cs = callstack_new();
 }
 
@@ -304,6 +448,7 @@ static void vcpu_end(unsigned int vcpu_index)
     Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index);
     g_byte_array_free(cpu->buf, true);
 
+    trace_free(cpu->trace);
     callstack_free(cpu->cs);
     memset(cpu, 0, sizeof(Cpu));
 }
@@ -311,6 +456,8 @@ static void vcpu_end(unsigned int vcpu_index)
 static void at_exit(qemu_plugin_id_t id, void *data)
 {
     for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) {
+        Cpu *cpu = qemu_plugin_scoreboard_find(score, i);
+        trace_flush(cpu->trace, true);
         vcpu_end(i);
     }
 
-- 
2.47.2
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Manos Pitsidianakis 4 months, 1 week ago
On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
>We implement tracing, following uftrace format.
>Trace is flushed every 32 MB, so file operations don't impact
>performance at runtime.
>
>A different trace is generated per cpu, and we ensure they have a unique
>name, based on vcpu_index, while keeping room for privilege level coming
>in next commit.

Suggestion (not a request): put some kind of documentation about the 
format this patch implements, maybe a commit sha & URL to a header file 
from upstream uftrace.

>
>Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
>---
> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
> 1 file changed, 148 insertions(+), 1 deletion(-)
>
>diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
>index d51faceb344..402a242433e 100644
>--- a/contrib/plugins/uftrace.c
>+++ b/contrib/plugins/uftrace.c
>@@ -12,6 +12,13 @@
> #include <qemu-plugin.h>
> #include <glib.h>
> #include <stdio.h>
>+#include <sys/stat.h>
>+#include <sys/time.h>
>+#include <time.h>
>+#include <unistd.h>
>+
>+#define MiB     (INT64_C(1) << 20)
>+#define NANOSECONDS_PER_SECOND 1000000000LL
> 
> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
> 
>@@ -24,6 +31,13 @@ typedef struct {
>     uint64_t frame_pointer;
> } CallstackEntry;
> 
>+typedef struct {
>+    GArray *t;
>+    GString *path;
>+    GString *name;
>+    uint32_t id;
>+} Trace;
>+
> typedef struct Cpu Cpu;
> 
> typedef struct {
>@@ -34,6 +48,7 @@ typedef struct {
> } CpuOps;
> 
> typedef struct Cpu {
>+    Trace *trace;
>     Callstack *cs;
>     GByteArray *buf;
>     CpuOps ops;
>@@ -44,9 +59,41 @@ typedef struct {
>     struct qemu_plugin_register *reg_fp;
> } Aarch64Cpu;
> 
>+typedef struct {
>+    uint64_t timestamp;
>+    uint64_t data;
>+} UftraceEntry;
>+
>+typedef enum {
>+    UFTRACE_ENTRY,
>+    UFTRACE_EXIT,
>+    UFTRACE_LOST,
>+    UFTRACE_EVENT
>+} UftraceRecordType;
>+
> static struct qemu_plugin_scoreboard *score;
> static CpuOps arch_ops;
> 
>+static uint64_t gettime_ns(void)
>+{
>+#ifdef _WIN32
>+    /*
>+     * On Windows, timespec_get is available only with UCRT, but not with
>+     * MinGW64 environment. Simplify by using only gettimeofday on this
>+     * platform. This may result in a precision loss.
>+     */
>+    struct timeval tv;
>+    gettimeofday(&tv, NULL);
>+    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
>+#else
>+    /* We need nanosecond precision for short lived functions. */
>+    struct timespec ts;
>+    timespec_get(&ts, TIME_UTC);
>+    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
>+#endif
>+    return now_ns;
>+}
>+
> static Callstack *callstack_new(void)
> {
>     Callstack *cs = g_new0(Callstack, 1);
>@@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
>     return e;
> }
> 
>+static Trace *trace_new(uint32_t id, GString *name)
>+{
>+    Trace *t = g_new0(Trace, 1);
>+    t->t = g_array_new(false, false, sizeof(UftraceEntry));
>+    t->path = g_string_new(NULL);
>+    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
>+    t->name = g_string_new(name->str);
>+    t->id = id;
>+    return t;
>+}
>+
>+static void trace_free(Trace *t)
>+{
>+    g_assert(t->t->len == 0);
>+    g_array_free(t->t, true);
>+    t->t = NULL;
>+    g_string_free(t->path, true);
>+    t->path = NULL;
>+    g_string_free(t->name, true);
>+    t->name = NULL;
>+    g_free(t);
>+}
>+
>+static void trace_flush(Trace *t, bool append)
>+{
>+    int create_dir = g_mkdir_with_parents("./uftrace.data",
>+                                          S_IRWXU | S_IRWXG | S_IRWXO);
>+    g_assert(create_dir == 0);
>+    FILE *dat = fopen(t->path->str, append ? "a" : "w");
>+    g_assert(dat);
>+    GArray *data = t->t;
>+    if (data->len) {
>+        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);

fwrite might not write all bytes, how about using the 
g_file_set_contents() wrapper?

>+    }
>+    fclose(dat);
>+    g_array_set_size(data, 0);
>+}
>+
>+static void trace_add_entry(Trace *t, uint64_t timestamp, uint64_t pc,
>+                            size_t depth, UftraceRecordType type)
>+{
>+    /* libmcount/record.c:record_event */
>+    const uint64_t record_magic = 0x5;
>+    uint64_t data = type | record_magic << 3;
>+    data += depth << 6;
>+    data += pc << 16;
>+    UftraceEntry e = {.timestamp = timestamp, .data = data};
>+    g_array_append_val(t->t, e);
>+    if (t->t->len * sizeof(UftraceEntry) > 32 * MiB) {
>+        trace_flush(t, true);
>+    }
>+}
>+
>+static void trace_enter_function(Trace *t, uint64_t timestamp,
>+                                 uint64_t pc, size_t depth)
>+{
>+    trace_add_entry(t, timestamp, pc, depth, UFTRACE_ENTRY);
>+}
>+
>+static void trace_exit_function(Trace *t, uint64_t timestamp,
>+                                uint64_t pc, size_t depth)
>+{
>+    trace_add_entry(t, timestamp, pc, depth, UFTRACE_EXIT);
>+}
>+
>+static void trace_enter_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>+{
>+    for (size_t depth = 1; depth <= callstack_depth(cs); ++depth) {
>+        trace_enter_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>+    }
>+}
>+
>+static void trace_exit_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>+{
>+    for (size_t depth = callstack_depth(cs); depth > 0; --depth) {
>+        trace_exit_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>+    }
>+}
>+
> static uint64_t cpu_read_register64(Cpu *cpu, struct qemu_plugin_register *reg)
> {
>     GByteArray *buf = cpu->buf;
>@@ -223,7 +349,9 @@ static void track_callstack(unsigned int cpu_index, void *udata)
> {
>     uint64_t pc = (uintptr_t) udata;
>     Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index);
>+    uint64_t timestamp = gettime_ns();
>     Callstack *cs = cpu->cs;
>+    Trace *t = cpu->trace;
> 
>     uint64_t fp = cpu->ops.get_frame_pointer(cpu);
>     if (!fp && callstack_empty(cs)) {
>@@ -232,6 +360,7 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>          * long as a proper call does not happen.
>          */
>         callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>+        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>         return;
>     }
> 
>@@ -244,7 +373,8 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>     CallstackEntry caller = callstack_caller(cs);
>     if (fp == caller.frame_pointer) {
>         /* return */
>-        callstack_pop(cs);
>+        CallstackEntry e = callstack_pop(cs);
>+        trace_exit_function(t, timestamp, e.pc, callstack_depth(cs));
>         return;
>     }
> 
>@@ -252,12 +382,16 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>     if (caller_fp == top.frame_pointer) {
>         /* call */
>         callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>+        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>         return;
>     }
> 
>     /* discontinuity, exit current stack and unwind new one */
>+    trace_exit_stack(t, cs, timestamp);
>     callstack_clear(cs);
>+
>     cpu_unwind_stack(cpu, fp, pc);
>+    trace_enter_stack(t, cs, timestamp);
> }
> 
> static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
>@@ -296,6 +430,16 @@ static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index)
>     cpu->ops.init(cpu);
>     cpu->buf = g_byte_array_new();
> 
>+    g_assert(vcpu_index < UINT32_MAX / 100);
>+    /* trace_id is: cpu_number * 100 */
>+    uint32_t trace_id = (vcpu_index + 1) * 100;
>+
>+    g_autoptr(GString) trace_name = g_string_new(NULL);
>+    g_string_append_printf(trace_name, "cpu%u", vcpu_index);
>+    cpu->trace = trace_new(trace_id, trace_name);
>+    /* create/truncate trace file */
>+    trace_flush(cpu->trace, false);
>+
>     cpu->cs = callstack_new();
> }
> 
>@@ -304,6 +448,7 @@ static void vcpu_end(unsigned int vcpu_index)
>     Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index);
>     g_byte_array_free(cpu->buf, true);
> 
>+    trace_free(cpu->trace);
>     callstack_free(cpu->cs);
>     memset(cpu, 0, sizeof(Cpu));
> }
>@@ -311,6 +456,8 @@ static void vcpu_end(unsigned int vcpu_index)
> static void at_exit(qemu_plugin_id_t id, void *data)
> {
>     for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) {
>+        Cpu *cpu = qemu_plugin_scoreboard_find(score, i);
>+        trace_flush(cpu->trace, true);
>         vcpu_end(i);
>     }
> 
>-- 
>2.47.2
>

Looks good overall.
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Pierrick Bouvier 4 months, 1 week ago
On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
> On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
>> We implement tracing, following uftrace format.
>> Trace is flushed every 32 MB, so file operations don't impact
>> performance at runtime.
>>
>> A different trace is generated per cpu, and we ensure they have a unique
>> name, based on vcpu_index, while keeping room for privilege level coming
>> in next commit.
> 
> Suggestion (not a request): put some kind of documentation about the
> format this patch implements, maybe a commit sha & URL to a header file
> from upstream uftrace.
> 

I added a link to uftrace source:
https://github.com/namhyung/uftrace/blob/v0.18/libmcount/record.c#L909

As well, I updated the comment with this URL.
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Pierrick Bouvier 4 months, 1 week ago
On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
> On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
>> We implement tracing, following uftrace format.
>> Trace is flushed every 32 MB, so file operations don't impact
>> performance at runtime.
>>
>> A different trace is generated per cpu, and we ensure they have a unique
>> name, based on vcpu_index, while keeping room for privilege level coming
>> in next commit.
> 
> Suggestion (not a request): put some kind of documentation about the
> format this patch implements, maybe a commit sha & URL to a header file
> from upstream uftrace.
> 
>>
>> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
>> ---
>> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 148 insertions(+), 1 deletion(-)
>>
>> diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
>> index d51faceb344..402a242433e 100644
>> --- a/contrib/plugins/uftrace.c
>> +++ b/contrib/plugins/uftrace.c
>> @@ -12,6 +12,13 @@
>> #include <qemu-plugin.h>
>> #include <glib.h>
>> #include <stdio.h>
>> +#include <sys/stat.h>
>> +#include <sys/time.h>
>> +#include <time.h>
>> +#include <unistd.h>
>> +
>> +#define MiB     (INT64_C(1) << 20)
>> +#define NANOSECONDS_PER_SECOND 1000000000LL
>>
>> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
>>
>> @@ -24,6 +31,13 @@ typedef struct {
>>      uint64_t frame_pointer;
>> } CallstackEntry;
>>
>> +typedef struct {
>> +    GArray *t;
>> +    GString *path;
>> +    GString *name;
>> +    uint32_t id;
>> +} Trace;
>> +
>> typedef struct Cpu Cpu;
>>
>> typedef struct {
>> @@ -34,6 +48,7 @@ typedef struct {
>> } CpuOps;
>>
>> typedef struct Cpu {
>> +    Trace *trace;
>>      Callstack *cs;
>>      GByteArray *buf;
>>      CpuOps ops;
>> @@ -44,9 +59,41 @@ typedef struct {
>>      struct qemu_plugin_register *reg_fp;
>> } Aarch64Cpu;
>>
>> +typedef struct {
>> +    uint64_t timestamp;
>> +    uint64_t data;
>> +} UftraceEntry;
>> +
>> +typedef enum {
>> +    UFTRACE_ENTRY,
>> +    UFTRACE_EXIT,
>> +    UFTRACE_LOST,
>> +    UFTRACE_EVENT
>> +} UftraceRecordType;
>> +
>> static struct qemu_plugin_scoreboard *score;
>> static CpuOps arch_ops;
>>
>> +static uint64_t gettime_ns(void)
>> +{
>> +#ifdef _WIN32
>> +    /*
>> +     * On Windows, timespec_get is available only with UCRT, but not with
>> +     * MinGW64 environment. Simplify by using only gettimeofday on this
>> +     * platform. This may result in a precision loss.
>> +     */
>> +    struct timeval tv;
>> +    gettimeofday(&tv, NULL);
>> +    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
>> +#else
>> +    /* We need nanosecond precision for short lived functions. */
>> +    struct timespec ts;
>> +    timespec_get(&ts, TIME_UTC);
>> +    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
>> +#endif
>> +    return now_ns;
>> +}
>> +
>> static Callstack *callstack_new(void)
>> {
>>      Callstack *cs = g_new0(Callstack, 1);
>> @@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
>>      return e;
>> }
>>
>> +static Trace *trace_new(uint32_t id, GString *name)
>> +{
>> +    Trace *t = g_new0(Trace, 1);
>> +    t->t = g_array_new(false, false, sizeof(UftraceEntry));
>> +    t->path = g_string_new(NULL);
>> +    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
>> +    t->name = g_string_new(name->str);
>> +    t->id = id;
>> +    return t;
>> +}
>> +
>> +static void trace_free(Trace *t)
>> +{
>> +    g_assert(t->t->len == 0);
>> +    g_array_free(t->t, true);
>> +    t->t = NULL;
>> +    g_string_free(t->path, true);
>> +    t->path = NULL;
>> +    g_string_free(t->name, true);
>> +    t->name = NULL;
>> +    g_free(t);
>> +}
>> +
>> +static void trace_flush(Trace *t, bool append)
>> +{
>> +    int create_dir = g_mkdir_with_parents("./uftrace.data",
>> +                                          S_IRWXU | S_IRWXG | S_IRWXO);
>> +    g_assert(create_dir == 0);
>> +    FILE *dat = fopen(t->path->str, append ? "a" : "w");
>> +    g_assert(dat);
>> +    GArray *data = t->t;
>> +    if (data->len) {
>> +        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
> 
> fwrite might not write all bytes, how about using the
> g_file_set_contents() wrapper?
>

If I see correctly, g_file_set_contents does not allow to append data, 
which is what we need to do here (that's the point of flushing every 
32MB). I can add an assert on fwrite return to make sure we wrote 
everything.

>> +    }
>> +    fclose(dat);
>> +    g_array_set_size(data, 0);
>> +}
>> +
>> +static void trace_add_entry(Trace *t, uint64_t timestamp, uint64_t pc,
>> +                            size_t depth, UftraceRecordType type)
>> +{
>> +    /* libmcount/record.c:record_event */
>> +    const uint64_t record_magic = 0x5;
>> +    uint64_t data = type | record_magic << 3;
>> +    data += depth << 6;
>> +    data += pc << 16;
>> +    UftraceEntry e = {.timestamp = timestamp, .data = data};
>> +    g_array_append_val(t->t, e);
>> +    if (t->t->len * sizeof(UftraceEntry) > 32 * MiB) {
>> +        trace_flush(t, true);
>> +    }
>> +}
>> +
>> +static void trace_enter_function(Trace *t, uint64_t timestamp,
>> +                                 uint64_t pc, size_t depth)
>> +{
>> +    trace_add_entry(t, timestamp, pc, depth, UFTRACE_ENTRY);
>> +}
>> +
>> +static void trace_exit_function(Trace *t, uint64_t timestamp,
>> +                                uint64_t pc, size_t depth)
>> +{
>> +    trace_add_entry(t, timestamp, pc, depth, UFTRACE_EXIT);
>> +}
>> +
>> +static void trace_enter_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>> +{
>> +    for (size_t depth = 1; depth <= callstack_depth(cs); ++depth) {
>> +        trace_enter_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>> +    }
>> +}
>> +
>> +static void trace_exit_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>> +{
>> +    for (size_t depth = callstack_depth(cs); depth > 0; --depth) {
>> +        trace_exit_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>> +    }
>> +}
>> +
>> static uint64_t cpu_read_register64(Cpu *cpu, struct qemu_plugin_register *reg)
>> {
>>      GByteArray *buf = cpu->buf;
>> @@ -223,7 +349,9 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>> {
>>      uint64_t pc = (uintptr_t) udata;
>>      Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index);
>> +    uint64_t timestamp = gettime_ns();
>>      Callstack *cs = cpu->cs;
>> +    Trace *t = cpu->trace;
>>
>>      uint64_t fp = cpu->ops.get_frame_pointer(cpu);
>>      if (!fp && callstack_empty(cs)) {
>> @@ -232,6 +360,7 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>           * long as a proper call does not happen.
>>           */
>>          callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>> +        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>>          return;
>>      }
>>
>> @@ -244,7 +373,8 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>      CallstackEntry caller = callstack_caller(cs);
>>      if (fp == caller.frame_pointer) {
>>          /* return */
>> -        callstack_pop(cs);
>> +        CallstackEntry e = callstack_pop(cs);
>> +        trace_exit_function(t, timestamp, e.pc, callstack_depth(cs));
>>          return;
>>      }
>>
>> @@ -252,12 +382,16 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>      if (caller_fp == top.frame_pointer) {
>>          /* call */
>>          callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>> +        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>>          return;
>>      }
>>
>>      /* discontinuity, exit current stack and unwind new one */
>> +    trace_exit_stack(t, cs, timestamp);
>>      callstack_clear(cs);
>> +
>>      cpu_unwind_stack(cpu, fp, pc);
>> +    trace_enter_stack(t, cs, timestamp);
>> }
>>
>> static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
>> @@ -296,6 +430,16 @@ static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index)
>>      cpu->ops.init(cpu);
>>      cpu->buf = g_byte_array_new();
>>
>> +    g_assert(vcpu_index < UINT32_MAX / 100);
>> +    /* trace_id is: cpu_number * 100 */
>> +    uint32_t trace_id = (vcpu_index + 1) * 100;
>> +
>> +    g_autoptr(GString) trace_name = g_string_new(NULL);
>> +    g_string_append_printf(trace_name, "cpu%u", vcpu_index);
>> +    cpu->trace = trace_new(trace_id, trace_name);
>> +    /* create/truncate trace file */
>> +    trace_flush(cpu->trace, false);
>> +
>>      cpu->cs = callstack_new();
>> }
>>
>> @@ -304,6 +448,7 @@ static void vcpu_end(unsigned int vcpu_index)
>>      Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index);
>>      g_byte_array_free(cpu->buf, true);
>>
>> +    trace_free(cpu->trace);
>>      callstack_free(cpu->cs);
>>      memset(cpu, 0, sizeof(Cpu));
>> }
>> @@ -311,6 +456,8 @@ static void vcpu_end(unsigned int vcpu_index)
>> static void at_exit(qemu_plugin_id_t id, void *data)
>> {
>>      for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) {
>> +        Cpu *cpu = qemu_plugin_scoreboard_find(score, i);
>> +        trace_flush(cpu->trace, true);
>>          vcpu_end(i);
>>      }
>>
>> -- 
>> 2.47.2
>>
> 
> Looks good overall.
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Manos Pitsidianakis 4 months, 1 week ago
On Fri, Aug 8, 2025 at 9:03 PM Pierrick Bouvier
<pierrick.bouvier@linaro.org> wrote:
>
> On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
> > On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
> >> We implement tracing, following uftrace format.
> >> Trace is flushed every 32 MB, so file operations don't impact
> >> performance at runtime.
> >>
> >> A different trace is generated per cpu, and we ensure they have a unique
> >> name, based on vcpu_index, while keeping room for privilege level coming
> >> in next commit.
> >
> > Suggestion (not a request): put some kind of documentation about the
> > format this patch implements, maybe a commit sha & URL to a header file
> > from upstream uftrace.
> >
> >>
> >> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> >> ---
> >> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
> >> 1 file changed, 148 insertions(+), 1 deletion(-)
> >>
> >> diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
> >> index d51faceb344..402a242433e 100644
> >> --- a/contrib/plugins/uftrace.c
> >> +++ b/contrib/plugins/uftrace.c
> >> @@ -12,6 +12,13 @@
> >> #include <qemu-plugin.h>
> >> #include <glib.h>
> >> #include <stdio.h>
> >> +#include <sys/stat.h>
> >> +#include <sys/time.h>
> >> +#include <time.h>
> >> +#include <unistd.h>
> >> +
> >> +#define MiB     (INT64_C(1) << 20)
> >> +#define NANOSECONDS_PER_SECOND 1000000000LL
> >>
> >> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
> >>
> >> @@ -24,6 +31,13 @@ typedef struct {
> >>      uint64_t frame_pointer;
> >> } CallstackEntry;
> >>
> >> +typedef struct {
> >> +    GArray *t;
> >> +    GString *path;
> >> +    GString *name;
> >> +    uint32_t id;
> >> +} Trace;
> >> +
> >> typedef struct Cpu Cpu;
> >>
> >> typedef struct {
> >> @@ -34,6 +48,7 @@ typedef struct {
> >> } CpuOps;
> >>
> >> typedef struct Cpu {
> >> +    Trace *trace;
> >>      Callstack *cs;
> >>      GByteArray *buf;
> >>      CpuOps ops;
> >> @@ -44,9 +59,41 @@ typedef struct {
> >>      struct qemu_plugin_register *reg_fp;
> >> } Aarch64Cpu;
> >>
> >> +typedef struct {
> >> +    uint64_t timestamp;
> >> +    uint64_t data;
> >> +} UftraceEntry;
> >> +
> >> +typedef enum {
> >> +    UFTRACE_ENTRY,
> >> +    UFTRACE_EXIT,
> >> +    UFTRACE_LOST,
> >> +    UFTRACE_EVENT
> >> +} UftraceRecordType;
> >> +
> >> static struct qemu_plugin_scoreboard *score;
> >> static CpuOps arch_ops;
> >>
> >> +static uint64_t gettime_ns(void)
> >> +{
> >> +#ifdef _WIN32
> >> +    /*
> >> +     * On Windows, timespec_get is available only with UCRT, but not with
> >> +     * MinGW64 environment. Simplify by using only gettimeofday on this
> >> +     * platform. This may result in a precision loss.
> >> +     */
> >> +    struct timeval tv;
> >> +    gettimeofday(&tv, NULL);
> >> +    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
> >> +#else
> >> +    /* We need nanosecond precision for short lived functions. */
> >> +    struct timespec ts;
> >> +    timespec_get(&ts, TIME_UTC);
> >> +    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
> >> +#endif
> >> +    return now_ns;
> >> +}
> >> +
> >> static Callstack *callstack_new(void)
> >> {
> >>      Callstack *cs = g_new0(Callstack, 1);
> >> @@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
> >>      return e;
> >> }
> >>
> >> +static Trace *trace_new(uint32_t id, GString *name)
> >> +{
> >> +    Trace *t = g_new0(Trace, 1);
> >> +    t->t = g_array_new(false, false, sizeof(UftraceEntry));
> >> +    t->path = g_string_new(NULL);
> >> +    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
> >> +    t->name = g_string_new(name->str);
> >> +    t->id = id;
> >> +    return t;
> >> +}
> >> +
> >> +static void trace_free(Trace *t)
> >> +{
> >> +    g_assert(t->t->len == 0);
> >> +    g_array_free(t->t, true);
> >> +    t->t = NULL;
> >> +    g_string_free(t->path, true);
> >> +    t->path = NULL;
> >> +    g_string_free(t->name, true);
> >> +    t->name = NULL;
> >> +    g_free(t);
> >> +}
> >> +
> >> +static void trace_flush(Trace *t, bool append)
> >> +{
> >> +    int create_dir = g_mkdir_with_parents("./uftrace.data",
> >> +                                          S_IRWXU | S_IRWXG | S_IRWXO);
> >> +    g_assert(create_dir == 0);
> >> +    FILE *dat = fopen(t->path->str, append ? "a" : "w");
> >> +    g_assert(dat);
> >> +    GArray *data = t->t;
> >> +    if (data->len) {
> >> +        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
> >
> > fwrite might not write all bytes, how about using the
> > g_file_set_contents() wrapper?
> >
>
> If I see correctly, g_file_set_contents does not allow to append data,
> which is what we need to do here (that's the point of flushing every
> 32MB).

Ah you're right, my bad. It might be because it renames a temporary
file to the destination filename in order to make the operation
atomic.

> I can add an assert on fwrite return to make sure we wrote
> everything.

Or a while loop with a bytes_written counter.

Or keep it all in memory and write it on exit? Your call
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Pierrick Bouvier 4 months, 1 week ago
On 8/8/25 11:13 AM, Manos Pitsidianakis wrote:
> On Fri, Aug 8, 2025 at 9:03 PM Pierrick Bouvier
> <pierrick.bouvier@linaro.org> wrote:
>>
>> On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
>>> On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
>>>> We implement tracing, following uftrace format.
>>>> Trace is flushed every 32 MB, so file operations don't impact
>>>> performance at runtime.
>>>>
>>>> A different trace is generated per cpu, and we ensure they have a unique
>>>> name, based on vcpu_index, while keeping room for privilege level coming
>>>> in next commit.
>>>
>>> Suggestion (not a request): put some kind of documentation about the
>>> format this patch implements, maybe a commit sha & URL to a header file
>>> from upstream uftrace.
>>>
>>>>
>>>> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
>>>> ---
>>>> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
>>>> 1 file changed, 148 insertions(+), 1 deletion(-)
>>>>
>>>> diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
>>>> index d51faceb344..402a242433e 100644
>>>> --- a/contrib/plugins/uftrace.c
>>>> +++ b/contrib/plugins/uftrace.c
>>>> @@ -12,6 +12,13 @@
>>>> #include <qemu-plugin.h>
>>>> #include <glib.h>
>>>> #include <stdio.h>
>>>> +#include <sys/stat.h>
>>>> +#include <sys/time.h>
>>>> +#include <time.h>
>>>> +#include <unistd.h>
>>>> +
>>>> +#define MiB     (INT64_C(1) << 20)
>>>> +#define NANOSECONDS_PER_SECOND 1000000000LL
>>>>
>>>> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
>>>>
>>>> @@ -24,6 +31,13 @@ typedef struct {
>>>>       uint64_t frame_pointer;
>>>> } CallstackEntry;
>>>>
>>>> +typedef struct {
>>>> +    GArray *t;
>>>> +    GString *path;
>>>> +    GString *name;
>>>> +    uint32_t id;
>>>> +} Trace;
>>>> +
>>>> typedef struct Cpu Cpu;
>>>>
>>>> typedef struct {
>>>> @@ -34,6 +48,7 @@ typedef struct {
>>>> } CpuOps;
>>>>
>>>> typedef struct Cpu {
>>>> +    Trace *trace;
>>>>       Callstack *cs;
>>>>       GByteArray *buf;
>>>>       CpuOps ops;
>>>> @@ -44,9 +59,41 @@ typedef struct {
>>>>       struct qemu_plugin_register *reg_fp;
>>>> } Aarch64Cpu;
>>>>
>>>> +typedef struct {
>>>> +    uint64_t timestamp;
>>>> +    uint64_t data;
>>>> +} UftraceEntry;
>>>> +
>>>> +typedef enum {
>>>> +    UFTRACE_ENTRY,
>>>> +    UFTRACE_EXIT,
>>>> +    UFTRACE_LOST,
>>>> +    UFTRACE_EVENT
>>>> +} UftraceRecordType;
>>>> +
>>>> static struct qemu_plugin_scoreboard *score;
>>>> static CpuOps arch_ops;
>>>>
>>>> +static uint64_t gettime_ns(void)
>>>> +{
>>>> +#ifdef _WIN32
>>>> +    /*
>>>> +     * On Windows, timespec_get is available only with UCRT, but not with
>>>> +     * MinGW64 environment. Simplify by using only gettimeofday on this
>>>> +     * platform. This may result in a precision loss.
>>>> +     */
>>>> +    struct timeval tv;
>>>> +    gettimeofday(&tv, NULL);
>>>> +    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
>>>> +#else
>>>> +    /* We need nanosecond precision for short lived functions. */
>>>> +    struct timespec ts;
>>>> +    timespec_get(&ts, TIME_UTC);
>>>> +    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
>>>> +#endif
>>>> +    return now_ns;
>>>> +}
>>>> +
>>>> static Callstack *callstack_new(void)
>>>> {
>>>>       Callstack *cs = g_new0(Callstack, 1);
>>>> @@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
>>>>       return e;
>>>> }
>>>>
>>>> +static Trace *trace_new(uint32_t id, GString *name)
>>>> +{
>>>> +    Trace *t = g_new0(Trace, 1);
>>>> +    t->t = g_array_new(false, false, sizeof(UftraceEntry));
>>>> +    t->path = g_string_new(NULL);
>>>> +    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
>>>> +    t->name = g_string_new(name->str);
>>>> +    t->id = id;
>>>> +    return t;
>>>> +}
>>>> +
>>>> +static void trace_free(Trace *t)
>>>> +{
>>>> +    g_assert(t->t->len == 0);
>>>> +    g_array_free(t->t, true);
>>>> +    t->t = NULL;
>>>> +    g_string_free(t->path, true);
>>>> +    t->path = NULL;
>>>> +    g_string_free(t->name, true);
>>>> +    t->name = NULL;
>>>> +    g_free(t);
>>>> +}
>>>> +
>>>> +static void trace_flush(Trace *t, bool append)
>>>> +{
>>>> +    int create_dir = g_mkdir_with_parents("./uftrace.data",
>>>> +                                          S_IRWXU | S_IRWXG | S_IRWXO);
>>>> +    g_assert(create_dir == 0);
>>>> +    FILE *dat = fopen(t->path->str, append ? "a" : "w");
>>>> +    g_assert(dat);
>>>> +    GArray *data = t->t;
>>>> +    if (data->len) {
>>>> +        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
>>>
>>> fwrite might not write all bytes, how about using the
>>> g_file_set_contents() wrapper?
>>>
>>
>> If I see correctly, g_file_set_contents does not allow to append data,
>> which is what we need to do here (that's the point of flushing every
>> 32MB).
> 
> Ah you're right, my bad. It might be because it renames a temporary
> file to the destination filename in order to make the operation
> atomic.
>
>> I can add an assert on fwrite return to make sure we wrote
>> everything.
> 
> Or a while loop with a bytes_written counter.
> 

fwrite guarantees all data will be written (to the opposite or raw write 
calls), including if syscall is interrupted, doing a proper write loop 
until all data is written. You can check glibc source if in doubt.
Thus, the only situation where it might fail is if disk space is full. 
In this case, I think that an assert (similar to a panic!) is good 
enough because we can't really do anything to solve the problem.

> Or keep it all in memory and write it on exit? Your call

That was the first version, but it does not scale on long executions, 
where traces can reach several gb.

By the way, adding the assert caught a bug, size and num_elems were 
reversed, so definitely thanks for pointing this out.

Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Manos Pitsidianakis 4 months, 1 week ago
On Fri, Aug 8, 2025 at 9:23 PM Pierrick Bouvier
<pierrick.bouvier@linaro.org> wrote:
>
> On 8/8/25 11:13 AM, Manos Pitsidianakis wrote:
> > On Fri, Aug 8, 2025 at 9:03 PM Pierrick Bouvier
> > <pierrick.bouvier@linaro.org> wrote:
> >>
> >> On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
> >>> On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
> >>>> We implement tracing, following uftrace format.
> >>>> Trace is flushed every 32 MB, so file operations don't impact
> >>>> performance at runtime.
> >>>>
> >>>> A different trace is generated per cpu, and we ensure they have a unique
> >>>> name, based on vcpu_index, while keeping room for privilege level coming
> >>>> in next commit.
> >>>
> >>> Suggestion (not a request): put some kind of documentation about the
> >>> format this patch implements, maybe a commit sha & URL to a header file
> >>> from upstream uftrace.
> >>>
> >>>>
> >>>> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
> >>>> ---
> >>>> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
> >>>> 1 file changed, 148 insertions(+), 1 deletion(-)
> >>>>
> >>>> diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
> >>>> index d51faceb344..402a242433e 100644
> >>>> --- a/contrib/plugins/uftrace.c
> >>>> +++ b/contrib/plugins/uftrace.c
> >>>> @@ -12,6 +12,13 @@
> >>>> #include <qemu-plugin.h>
> >>>> #include <glib.h>
> >>>> #include <stdio.h>
> >>>> +#include <sys/stat.h>
> >>>> +#include <sys/time.h>
> >>>> +#include <time.h>
> >>>> +#include <unistd.h>
> >>>> +
> >>>> +#define MiB     (INT64_C(1) << 20)
> >>>> +#define NANOSECONDS_PER_SECOND 1000000000LL
> >>>>
> >>>> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
> >>>>
> >>>> @@ -24,6 +31,13 @@ typedef struct {
> >>>>       uint64_t frame_pointer;
> >>>> } CallstackEntry;
> >>>>
> >>>> +typedef struct {
> >>>> +    GArray *t;
> >>>> +    GString *path;
> >>>> +    GString *name;
> >>>> +    uint32_t id;
> >>>> +} Trace;
> >>>> +
> >>>> typedef struct Cpu Cpu;
> >>>>
> >>>> typedef struct {
> >>>> @@ -34,6 +48,7 @@ typedef struct {
> >>>> } CpuOps;
> >>>>
> >>>> typedef struct Cpu {
> >>>> +    Trace *trace;
> >>>>       Callstack *cs;
> >>>>       GByteArray *buf;
> >>>>       CpuOps ops;
> >>>> @@ -44,9 +59,41 @@ typedef struct {
> >>>>       struct qemu_plugin_register *reg_fp;
> >>>> } Aarch64Cpu;
> >>>>
> >>>> +typedef struct {
> >>>> +    uint64_t timestamp;
> >>>> +    uint64_t data;
> >>>> +} UftraceEntry;
> >>>> +
> >>>> +typedef enum {
> >>>> +    UFTRACE_ENTRY,
> >>>> +    UFTRACE_EXIT,
> >>>> +    UFTRACE_LOST,
> >>>> +    UFTRACE_EVENT
> >>>> +} UftraceRecordType;
> >>>> +
> >>>> static struct qemu_plugin_scoreboard *score;
> >>>> static CpuOps arch_ops;
> >>>>
> >>>> +static uint64_t gettime_ns(void)
> >>>> +{
> >>>> +#ifdef _WIN32
> >>>> +    /*
> >>>> +     * On Windows, timespec_get is available only with UCRT, but not with
> >>>> +     * MinGW64 environment. Simplify by using only gettimeofday on this
> >>>> +     * platform. This may result in a precision loss.
> >>>> +     */
> >>>> +    struct timeval tv;
> >>>> +    gettimeofday(&tv, NULL);
> >>>> +    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
> >>>> +#else
> >>>> +    /* We need nanosecond precision for short lived functions. */
> >>>> +    struct timespec ts;
> >>>> +    timespec_get(&ts, TIME_UTC);
> >>>> +    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
> >>>> +#endif
> >>>> +    return now_ns;
> >>>> +}
> >>>> +
> >>>> static Callstack *callstack_new(void)
> >>>> {
> >>>>       Callstack *cs = g_new0(Callstack, 1);
> >>>> @@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
> >>>>       return e;
> >>>> }
> >>>>
> >>>> +static Trace *trace_new(uint32_t id, GString *name)
> >>>> +{
> >>>> +    Trace *t = g_new0(Trace, 1);
> >>>> +    t->t = g_array_new(false, false, sizeof(UftraceEntry));
> >>>> +    t->path = g_string_new(NULL);
> >>>> +    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
> >>>> +    t->name = g_string_new(name->str);
> >>>> +    t->id = id;
> >>>> +    return t;
> >>>> +}
> >>>> +
> >>>> +static void trace_free(Trace *t)
> >>>> +{
> >>>> +    g_assert(t->t->len == 0);
> >>>> +    g_array_free(t->t, true);
> >>>> +    t->t = NULL;
> >>>> +    g_string_free(t->path, true);
> >>>> +    t->path = NULL;
> >>>> +    g_string_free(t->name, true);
> >>>> +    t->name = NULL;
> >>>> +    g_free(t);
> >>>> +}
> >>>> +
> >>>> +static void trace_flush(Trace *t, bool append)
> >>>> +{
> >>>> +    int create_dir = g_mkdir_with_parents("./uftrace.data",
> >>>> +                                          S_IRWXU | S_IRWXG | S_IRWXO);
> >>>> +    g_assert(create_dir == 0);
> >>>> +    FILE *dat = fopen(t->path->str, append ? "a" : "w");
> >>>> +    g_assert(dat);
> >>>> +    GArray *data = t->t;
> >>>> +    if (data->len) {
> >>>> +        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
> >>>
> >>> fwrite might not write all bytes, how about using the
> >>> g_file_set_contents() wrapper?
> >>>
> >>
> >> If I see correctly, g_file_set_contents does not allow to append data,
> >> which is what we need to do here (that's the point of flushing every
> >> 32MB).
> >
> > Ah you're right, my bad. It might be because it renames a temporary
> > file to the destination filename in order to make the operation
> > atomic.
> >
> >> I can add an assert on fwrite return to make sure we wrote
> >> everything.
> >
> > Or a while loop with a bytes_written counter.
> >
>
> fwrite guarantees all data will be written (to the opposite or raw write
> calls), including if syscall is interrupted, doing a proper write loop
> until all data is written.

Right, as the manpage says:

> RETURN VALUE
>      On success, fread() and fwrite() return the number of items read or written.  This number equals the number of bytes transferred only when size is 1.  If an error occurs, or the
>      end of the file is reached, the return value is a short item count (or zero).

I thought it meant that "on error" included EAGAIN/EINTR, that's why I
suggested a loop. But the manpage doesn't list error values or even
mention whether it sets errno at all so I was mistaken.

> You can check glibc source if in doubt.
> Thus, the only situation where it might fail is if disk space is full.
> In this case, I think that an assert (similar to a panic!) is good
> enough because we can't really do anything to solve the problem.
>
> > Or keep it all in memory and write it on exit? Your call
>
> That was the first version, but it does not scale on long executions,
> where traces can reach several gb.
>
> By the way, adding the assert caught a bug, size and num_elems were
> reversed, so definitely thanks for pointing this out.
Re: [PATCH v5 4/9] contrib/plugins/uftrace: implement tracing
Posted by Pierrick Bouvier 4 months, 1 week ago
On 8/8/25 2:11 AM, Manos Pitsidianakis wrote:
> On Fri, 08 Aug 2025 05:06, Pierrick Bouvier <pierrick.bouvier@linaro.org> wrote:
>> We implement tracing, following uftrace format.
>> Trace is flushed every 32 MB, so file operations don't impact
>> performance at runtime.
>>
>> A different trace is generated per cpu, and we ensure they have a unique
>> name, based on vcpu_index, while keeping room for privilege level coming
>> in next commit.
> 
> Suggestion (not a request): put some kind of documentation about the
> format this patch implements, maybe a commit sha & URL to a header file
> from upstream uftrace.
> 

This is the comment /* libmcount/record.c:record_event */ in 
trace_add_entry() function.
I can add this to commit message also, so maybe it's easier to find.

There is no official documentation for that on uftrace side, but source 
code is quite readable, and uftrace provides a convenient "dump --debug" 
command.

>>
>> Signed-off-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
>> ---
>> contrib/plugins/uftrace.c | 149 +++++++++++++++++++++++++++++++++++++-
>> 1 file changed, 148 insertions(+), 1 deletion(-)
>>
>> diff --git a/contrib/plugins/uftrace.c b/contrib/plugins/uftrace.c
>> index d51faceb344..402a242433e 100644
>> --- a/contrib/plugins/uftrace.c
>> +++ b/contrib/plugins/uftrace.c
>> @@ -12,6 +12,13 @@
>> #include <qemu-plugin.h>
>> #include <glib.h>
>> #include <stdio.h>
>> +#include <sys/stat.h>
>> +#include <sys/time.h>
>> +#include <time.h>
>> +#include <unistd.h>
>> +
>> +#define MiB     (INT64_C(1) << 20)
>> +#define NANOSECONDS_PER_SECOND 1000000000LL
>>
>> QEMU_PLUGIN_EXPORT int qemu_plugin_version = QEMU_PLUGIN_VERSION;
>>
>> @@ -24,6 +31,13 @@ typedef struct {
>>      uint64_t frame_pointer;
>> } CallstackEntry;
>>
>> +typedef struct {
>> +    GArray *t;
>> +    GString *path;
>> +    GString *name;
>> +    uint32_t id;
>> +} Trace;
>> +
>> typedef struct Cpu Cpu;
>>
>> typedef struct {
>> @@ -34,6 +48,7 @@ typedef struct {
>> } CpuOps;
>>
>> typedef struct Cpu {
>> +    Trace *trace;
>>      Callstack *cs;
>>      GByteArray *buf;
>>      CpuOps ops;
>> @@ -44,9 +59,41 @@ typedef struct {
>>      struct qemu_plugin_register *reg_fp;
>> } Aarch64Cpu;
>>
>> +typedef struct {
>> +    uint64_t timestamp;
>> +    uint64_t data;
>> +} UftraceEntry;
>> +
>> +typedef enum {
>> +    UFTRACE_ENTRY,
>> +    UFTRACE_EXIT,
>> +    UFTRACE_LOST,
>> +    UFTRACE_EVENT
>> +} UftraceRecordType;
>> +
>> static struct qemu_plugin_scoreboard *score;
>> static CpuOps arch_ops;
>>
>> +static uint64_t gettime_ns(void)
>> +{
>> +#ifdef _WIN32
>> +    /*
>> +     * On Windows, timespec_get is available only with UCRT, but not with
>> +     * MinGW64 environment. Simplify by using only gettimeofday on this
>> +     * platform. This may result in a precision loss.
>> +     */
>> +    struct timeval tv;
>> +    gettimeofday(&tv, NULL);
>> +    uint64_t now_ns = tv.tv_sec * NANOSECONDS_PER_SECOND + tv.tv_usec * 1000;
>> +#else
>> +    /* We need nanosecond precision for short lived functions. */
>> +    struct timespec ts;
>> +    timespec_get(&ts, TIME_UTC);
>> +    uint64_t now_ns = ts.tv_sec * NANOSECONDS_PER_SECOND + ts.tv_nsec;
>> +#endif
>> +    return now_ns;
>> +}
>> +
>> static Callstack *callstack_new(void)
>> {
>>      Callstack *cs = g_new0(Callstack, 1);
>> @@ -112,6 +159,85 @@ static CallstackEntry callstack_pop(Callstack *cs)
>>      return e;
>> }
>>
>> +static Trace *trace_new(uint32_t id, GString *name)
>> +{
>> +    Trace *t = g_new0(Trace, 1);
>> +    t->t = g_array_new(false, false, sizeof(UftraceEntry));
>> +    t->path = g_string_new(NULL);
>> +    g_string_append_printf(t->path, "./uftrace.data/%"PRIu32".dat", id);
>> +    t->name = g_string_new(name->str);
>> +    t->id = id;
>> +    return t;
>> +}
>> +
>> +static void trace_free(Trace *t)
>> +{
>> +    g_assert(t->t->len == 0);
>> +    g_array_free(t->t, true);
>> +    t->t = NULL;
>> +    g_string_free(t->path, true);
>> +    t->path = NULL;
>> +    g_string_free(t->name, true);
>> +    t->name = NULL;
>> +    g_free(t);
>> +}
>> +
>> +static void trace_flush(Trace *t, bool append)
>> +{
>> +    int create_dir = g_mkdir_with_parents("./uftrace.data",
>> +                                          S_IRWXU | S_IRWXG | S_IRWXO);
>> +    g_assert(create_dir == 0);
>> +    FILE *dat = fopen(t->path->str, append ? "a" : "w");
>> +    g_assert(dat);
>> +    GArray *data = t->t;
>> +    if (data->len) {
>> +        fwrite(data->data, data->len, sizeof(UftraceEntry), dat);
> 
> fwrite might not write all bytes, how about using the
> g_file_set_contents() wrapper?
> 
>> +    }
>> +    fclose(dat);
>> +    g_array_set_size(data, 0);
>> +}
>> +
>> +static void trace_add_entry(Trace *t, uint64_t timestamp, uint64_t pc,
>> +                            size_t depth, UftraceRecordType type)
>> +{
>> +    /* libmcount/record.c:record_event */
>> +    const uint64_t record_magic = 0x5;
>> +    uint64_t data = type | record_magic << 3;
>> +    data += depth << 6;
>> +    data += pc << 16;
>> +    UftraceEntry e = {.timestamp = timestamp, .data = data};
>> +    g_array_append_val(t->t, e);
>> +    if (t->t->len * sizeof(UftraceEntry) > 32 * MiB) {
>> +        trace_flush(t, true);
>> +    }
>> +}
>> +
>> +static void trace_enter_function(Trace *t, uint64_t timestamp,
>> +                                 uint64_t pc, size_t depth)
>> +{
>> +    trace_add_entry(t, timestamp, pc, depth, UFTRACE_ENTRY);
>> +}
>> +
>> +static void trace_exit_function(Trace *t, uint64_t timestamp,
>> +                                uint64_t pc, size_t depth)
>> +{
>> +    trace_add_entry(t, timestamp, pc, depth, UFTRACE_EXIT);
>> +}
>> +
>> +static void trace_enter_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>> +{
>> +    for (size_t depth = 1; depth <= callstack_depth(cs); ++depth) {
>> +        trace_enter_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>> +    }
>> +}
>> +
>> +static void trace_exit_stack(Trace *t, Callstack *cs, uint64_t timestamp)
>> +{
>> +    for (size_t depth = callstack_depth(cs); depth > 0; --depth) {
>> +        trace_exit_function(t, timestamp, callstack_at(cs, depth)->pc, depth);
>> +    }
>> +}
>> +
>> static uint64_t cpu_read_register64(Cpu *cpu, struct qemu_plugin_register *reg)
>> {
>>      GByteArray *buf = cpu->buf;
>> @@ -223,7 +349,9 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>> {
>>      uint64_t pc = (uintptr_t) udata;
>>      Cpu *cpu = qemu_plugin_scoreboard_find(score, cpu_index);
>> +    uint64_t timestamp = gettime_ns();
>>      Callstack *cs = cpu->cs;
>> +    Trace *t = cpu->trace;
>>
>>      uint64_t fp = cpu->ops.get_frame_pointer(cpu);
>>      if (!fp && callstack_empty(cs)) {
>> @@ -232,6 +360,7 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>           * long as a proper call does not happen.
>>           */
>>          callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>> +        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>>          return;
>>      }
>>
>> @@ -244,7 +373,8 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>      CallstackEntry caller = callstack_caller(cs);
>>      if (fp == caller.frame_pointer) {
>>          /* return */
>> -        callstack_pop(cs);
>> +        CallstackEntry e = callstack_pop(cs);
>> +        trace_exit_function(t, timestamp, e.pc, callstack_depth(cs));
>>          return;
>>      }
>>
>> @@ -252,12 +382,16 @@ static void track_callstack(unsigned int cpu_index, void *udata)
>>      if (caller_fp == top.frame_pointer) {
>>          /* call */
>>          callstack_push(cs, (CallstackEntry){.frame_pointer = fp, .pc = pc});
>> +        trace_enter_function(t, timestamp, pc, callstack_depth(cs));
>>          return;
>>      }
>>
>>      /* discontinuity, exit current stack and unwind new one */
>> +    trace_exit_stack(t, cs, timestamp);
>>      callstack_clear(cs);
>> +
>>      cpu_unwind_stack(cpu, fp, pc);
>> +    trace_enter_stack(t, cs, timestamp);
>> }
>>
>> static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
>> @@ -296,6 +430,16 @@ static void vcpu_init(qemu_plugin_id_t id, unsigned int vcpu_index)
>>      cpu->ops.init(cpu);
>>      cpu->buf = g_byte_array_new();
>>
>> +    g_assert(vcpu_index < UINT32_MAX / 100);
>> +    /* trace_id is: cpu_number * 100 */
>> +    uint32_t trace_id = (vcpu_index + 1) * 100;
>> +
>> +    g_autoptr(GString) trace_name = g_string_new(NULL);
>> +    g_string_append_printf(trace_name, "cpu%u", vcpu_index);
>> +    cpu->trace = trace_new(trace_id, trace_name);
>> +    /* create/truncate trace file */
>> +    trace_flush(cpu->trace, false);
>> +
>>      cpu->cs = callstack_new();
>> }
>>
>> @@ -304,6 +448,7 @@ static void vcpu_end(unsigned int vcpu_index)
>>      Cpu *cpu = qemu_plugin_scoreboard_find(score, vcpu_index);
>>      g_byte_array_free(cpu->buf, true);
>>
>> +    trace_free(cpu->trace);
>>      callstack_free(cpu->cs);
>>      memset(cpu, 0, sizeof(Cpu));
>> }
>> @@ -311,6 +456,8 @@ static void vcpu_end(unsigned int vcpu_index)
>> static void at_exit(qemu_plugin_id_t id, void *data)
>> {
>>      for (size_t i = 0; i < qemu_plugin_num_vcpus(); ++i) {
>> +        Cpu *cpu = qemu_plugin_scoreboard_find(score, i);
>> +        trace_flush(cpu->trace, true);
>>          vcpu_end(i);
>>      }
>>
>> -- 
>> 2.47.2
>>
> 
> Looks good overall.