[PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()

Xingran Wang posted 1 patch 2 months, 3 weeks ago
accel/tcg/translate-all.c    |  27 ++++++++
include/qemu/plugin-event.h  |   2 +
include/qemu/plugin.h        |  24 +++++++
include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
plugins/api.c                |  78 +++++++++++++++++++++
plugins/core.c               |  42 +++++++++++
plugins/qemu-plugins.symbols |  10 +++
tests/tcg/plugins/bb.c       |  25 +++++++
8 files changed, 339 insertions(+)
[PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Xingran Wang 2 months, 3 weeks ago
Currently, the instruction count obtained by plugins using the translation
block execution callback is larger than the actual value. Adding callbacks
in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
correct the instruction count when exiting a translation block
mid-execution, properly subtracting the excess unexecuted instructions.

Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
---
 accel/tcg/translate-all.c    |  27 ++++++++
 include/qemu/plugin-event.h  |   2 +
 include/qemu/plugin.h        |  24 +++++++
 include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
 plugins/api.c                |  78 +++++++++++++++++++++
 plugins/core.c               |  42 +++++++++++
 plugins/qemu-plugins.symbols |  10 +++
 tests/tcg/plugins/bb.c       |  25 +++++++
 8 files changed, 339 insertions(+)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index fdf6d8ac19..642f684372 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -65,6 +65,7 @@
 #include "internal-target.h"
 #include "tcg/perf.h"
 #include "tcg/insn-start-words.h"
+#include "qemu/plugin.h"
 
 TBContext tb_ctx;
 
@@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
         cpu->neg.icount_decr.u16.low += insns_left;
     }
 
+#ifdef CONFIG_PLUGIN
+    /*
+     * Notify the plugin with the relevant information
+     * when restoring the execution state of a TB.
+     */
+    struct qemu_plugin_tb_restore ptb_restore;
+    ptb_restore.cpu_index = cpu->cpu_index;
+    ptb_restore.insns_left = insns_left;
+    ptb_restore.tb_n = tb->icount;
+    ptb_restore.tb_pc = tb->pc;
+    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
+#endif
+
     cpu->cc->tcg_ops->restore_state_to_opc(cpu, tb, data);
 }
 
@@ -641,6 +655,19 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
         }
     }
 
+#ifdef CONFIG_PLUGIN
+    /*
+     * Notify the plugin with the relevant information
+     * when cpu_io_recompile is triggered.
+     */
+    struct qemu_plugin_tb_recompile_io ptb_recompile_io;
+    ptb_recompile_io.cpu_index = cpu->cpu_index;
+    ptb_recompile_io.next_tb_n = n;
+    ptb_recompile_io.tb_pc = tb->pc;
+    ptb_recompile_io.cpu_pc = cpu->cc->get_pc(cpu);
+    qemu_plugin_tb_recompile_io_cb(cpu, &ptb_recompile_io);
+#endif
+
     cpu_loop_exit_noexc(cpu);
 }
 
diff --git a/include/qemu/plugin-event.h b/include/qemu/plugin-event.h
index 7056d8427b..875e2b6071 100644
--- a/include/qemu/plugin-event.h
+++ b/include/qemu/plugin-event.h
@@ -14,6 +14,8 @@ enum qemu_plugin_event {
     QEMU_PLUGIN_EV_VCPU_INIT,
     QEMU_PLUGIN_EV_VCPU_EXIT,
     QEMU_PLUGIN_EV_VCPU_TB_TRANS,
+    QEMU_PLUGIN_EV_VCPU_TB_RESTORE,
+    QEMU_PLUGIN_EV_VCPU_TB_RECOMPILE_IO,
     QEMU_PLUGIN_EV_VCPU_IDLE,
     QEMU_PLUGIN_EV_VCPU_RESUME,
     QEMU_PLUGIN_EV_VCPU_SYSCALL,
diff --git a/include/qemu/plugin.h b/include/qemu/plugin.h
index af5f9db469..9250932e44 100644
--- a/include/qemu/plugin.h
+++ b/include/qemu/plugin.h
@@ -60,6 +60,8 @@ union qemu_plugin_cb_sig {
     qemu_plugin_vcpu_simple_cb_t     vcpu_simple;
     qemu_plugin_vcpu_udata_cb_t      vcpu_udata;
     qemu_plugin_vcpu_tb_trans_cb_t   vcpu_tb_trans;
+    qemu_plugin_vcpu_tb_restore_cb_t   vcpu_tb_restore;
+    qemu_plugin_vcpu_tb_recompile_io_cb_t   vcpu_tb_recompile_io;
     qemu_plugin_vcpu_mem_cb_t        vcpu_mem;
     qemu_plugin_vcpu_syscall_cb_t    vcpu_syscall;
     qemu_plugin_vcpu_syscall_ret_cb_t vcpu_syscall_ret;
@@ -139,6 +141,22 @@ struct qemu_plugin_tb {
     GArray *cbs;
 };
 
+/* TranslationBlock Restore info */
+struct qemu_plugin_tb_restore {
+    unsigned int cpu_index;
+    int insns_left;
+    size_t tb_n;
+    uint64_t tb_pc;
+};
+
+/* TranslationBlock Recompile IO info */
+struct qemu_plugin_tb_recompile_io {
+    unsigned int cpu_index;
+    uint32_t next_tb_n;
+    uint64_t tb_pc;
+    uint64_t cpu_pc;
+};
+
 /**
  * struct CPUPluginState - per-CPU state for plugins
  * @event_mask: plugin event bitmap. Modified only via async work.
@@ -158,6 +176,12 @@ CPUPluginState *qemu_plugin_create_vcpu_state(void);
 void qemu_plugin_vcpu_init_hook(CPUState *cpu);
 void qemu_plugin_vcpu_exit_hook(CPUState *cpu);
 void qemu_plugin_tb_trans_cb(CPUState *cpu, struct qemu_plugin_tb *tb);
+void
+qemu_plugin_tb_restore_cb(CPUState *cpu,
+                          struct qemu_plugin_tb_restore *tb);
+void
+qemu_plugin_tb_recompile_io_cb(CPUState *cpu,
+                               struct qemu_plugin_tb_recompile_io *tb);
 void qemu_plugin_vcpu_idle_cb(CPUState *cpu);
 void qemu_plugin_vcpu_resume_cb(CPUState *cpu);
 void
diff --git a/include/qemu/qemu-plugin.h b/include/qemu/qemu-plugin.h
index c71c705b69..e879175b23 100644
--- a/include/qemu/qemu-plugin.h
+++ b/include/qemu/qemu-plugin.h
@@ -228,6 +228,10 @@ struct qemu_plugin_tb;
 struct qemu_plugin_insn;
 /** struct qemu_plugin_scoreboard - Opaque handle for a scoreboard */
 struct qemu_plugin_scoreboard;
+/** struct qemu_plugin_tb_restore - Opaque handle for TB restore */
+struct qemu_plugin_tb_restore;
+/** struct qemu_plugin_tb_recompile_io - Opaque handle for recompile_io */
+struct qemu_plugin_tb_recompile_io;
 
 /**
  * typedef qemu_plugin_u64 - uint64_t member of an entry in a scoreboard
@@ -293,6 +297,22 @@ enum qemu_plugin_cond {
 typedef void (*qemu_plugin_vcpu_tb_trans_cb_t)(qemu_plugin_id_t id,
                                                struct qemu_plugin_tb *tb);
 
+/**
+ * typedef qemu_plugin_vcpu_tb_restore_cb_t - cpu restore state from TB callback
+ * @id: unique plugin id
+ * @tb_restore: opaque handle used for querying TB restore info.
+ */
+typedef void (*qemu_plugin_vcpu_tb_restore_cb_t)(qemu_plugin_id_t id,
+    struct qemu_plugin_tb_restore *tb_restore);
+
+/**
+ * typedef qemu_plugin_vcpu_tb_restore_cb_t - cpu io recompile callback
+ * @id: unique plugin id
+ * @tb_restore: opaque handle used for querying cpu io recompile info.
+ */
+typedef void (*qemu_plugin_vcpu_tb_recompile_io_cb_t)(qemu_plugin_id_t id,
+    struct qemu_plugin_tb_recompile_io *tb_recompile_io);
+
 /**
  * qemu_plugin_register_vcpu_tb_trans_cb() - register a translate cb
  * @id: plugin ID
@@ -309,6 +329,33 @@ QEMU_PLUGIN_API
 void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
                                            qemu_plugin_vcpu_tb_trans_cb_t cb);
 
+/**
+ * qemu_plugin_register_vcpu_tb_restore_cb() - register a TB restore cb
+ * @id: plugin ID
+ * @cb: callback function
+ *
+ * The @cb function is called every time a TB restore occurs. The @cb
+ * function is passed an opaque qemu_plugin_type which it can query
+ * for additional information.
+ */
+QEMU_PLUGIN_API
+void qemu_plugin_register_vcpu_tb_restore_cb(qemu_plugin_id_t id,
+    qemu_plugin_vcpu_tb_restore_cb_t cb);
+
+/**
+ * qemu_plugin_register_vcpu_tb_recompile_io_cb()
+ * register a cpu io recompile cb
+ * @id: plugin ID
+ * @cb: callback function
+ *
+ * The @cb function is called every time a cpu io recompile occurs. The @cb
+ * function is passed an opaque qemu_plugin_type which it can query
+ * for additional information.
+ */
+QEMU_PLUGIN_API
+void qemu_plugin_register_vcpu_tb_recompile_io_cb(qemu_plugin_id_t id,
+    qemu_plugin_vcpu_tb_recompile_io_cb_t cb);
+
 /**
  * qemu_plugin_register_vcpu_tb_exec_cb() - register execution callback
  * @tb: the opaque qemu_plugin_tb handle for the translation
@@ -469,6 +516,90 @@ QEMU_PLUGIN_API
 struct qemu_plugin_insn *
 qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx);
 
+/**
+ * qemu_plugin_tb_restore_cpu_index()
+ * query helper for cpu index where TB restore occurs
+ * @tb_restore: Opaque handle to the TB restore structure passed to the callback
+ *
+ * Returns: cpu index where the TB restore occurs.
+ */
+QEMU_PLUGIN_API
+unsigned int qemu_plugin_tb_restore_cpu_index(
+    const struct qemu_plugin_tb_restore *tb_restore);
+
+/**
+ * qemu_plugin_tb_restore_insns_left()
+ * query helper for number of unexecuted instructions in TB
+ * @tb_restore: Opaque handle to the TB restore structure passed to the callback
+ *
+ * Returns: number of unexecuted instructions in this block.
+ */
+QEMU_PLUGIN_API
+int qemu_plugin_tb_restore_insns_left(
+    const struct qemu_plugin_tb_restore *tb_restore);
+
+/**
+ * qemu_plugin_tb_restore_tb_n() - query helper for number of insns in TB
+ * @tb_restore: Opaque handle to the TB restore structure passed to the callback
+ *
+ * Returns: number of instructions in this block.
+ */
+QEMU_PLUGIN_API
+size_t qemu_plugin_tb_restore_tb_n(
+    const struct qemu_plugin_tb_restore *tb_restore);
+
+/**
+ * qemu_plugin_tb_restore_tb_pc() - query helper for vaddr of TB start
+ * @tb_restore: Opaque handle to the TB restore structure passed to the callback
+ *
+ * Returns: virtual address of block start.
+ */
+QEMU_PLUGIN_API
+uint64_t qemu_plugin_tb_restore_tb_pc(
+    const struct qemu_plugin_tb_restore *tb_restore);
+
+/**
+ * qemu_plugin_tb_recompile_io_cpu_index()
+ * query helper for cpu index where recompile I/O occurs
+ * @tb_recompile_io: Opaque handle to the TB recompile I/O structure
+ *
+ * Returns: cpu index where I/O recompile occurs.
+ */
+QEMU_PLUGIN_API
+unsigned int qemu_plugin_tb_recompile_io_cpu_index(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io);
+
+/**
+ * qemu_plugin_tb_recompile_io_next_tb_n()
+ * query helper for number of insns in next TB
+ * @tb_recompile_io: Opaque handle to the TB recompile I/O structure
+ *
+ * Returns: number of instructions in next block.
+ */
+QEMU_PLUGIN_API
+uint32_t qemu_plugin_tb_recompile_io_next_tb_n(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io);
+
+/**
+ * qemu_plugin_tb_recompile_io_tb_pc() - query helper for vaddr of TB start
+ * @tb_recompile_io: Opaque handle to the TB recompile I/O structure
+ *
+ * Returns: virtual address of block start.
+ */
+QEMU_PLUGIN_API
+uint64_t qemu_plugin_tb_recompile_io_tb_pc(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io);
+
+/**
+ * qemu_plugin_tb_recompile_io_cpu_pc() - query helper for cpu program counter
+ * @tb_recompile_io: Opaque handle to the TB recompile I/O structure
+ *
+ * Returns: program counter of cpu where recompile I/O occurs.
+ */
+QEMU_PLUGIN_API
+uint64_t qemu_plugin_tb_recompile_io_cpu_pc(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io);
+
 /**
  * qemu_plugin_insn_data() - copy instruction data
  * @insn: opaque instruction handle from qemu_plugin_tb_get_insn()
diff --git a/plugins/api.c b/plugins/api.c
index 2ff13d09de..50aee3b38c 100644
--- a/plugins/api.c
+++ b/plugins/api.c
@@ -206,6 +206,18 @@ void qemu_plugin_register_vcpu_tb_trans_cb(qemu_plugin_id_t id,
     plugin_register_cb(id, QEMU_PLUGIN_EV_VCPU_TB_TRANS, cb);
 }
 
+void qemu_plugin_register_vcpu_tb_restore_cb(qemu_plugin_id_t id,
+    qemu_plugin_vcpu_tb_restore_cb_t cb)
+{
+    plugin_register_cb(id, QEMU_PLUGIN_EV_VCPU_TB_RESTORE, cb);
+}
+
+void qemu_plugin_register_vcpu_tb_recompile_io_cb(qemu_plugin_id_t id,
+    qemu_plugin_vcpu_tb_recompile_io_cb_t cb)
+{
+    plugin_register_cb(id, QEMU_PLUGIN_EV_VCPU_TB_RECOMPILE_IO, cb);
+}
+
 void qemu_plugin_register_vcpu_syscall_cb(qemu_plugin_id_t id,
                                           qemu_plugin_vcpu_syscall_cb_t cb)
 {
@@ -257,6 +269,72 @@ qemu_plugin_tb_get_insn(const struct qemu_plugin_tb *tb, size_t idx)
     return insn;
 }
 
+/*
+ * CPU restore state from TB information:
+ *
+ * A plugin can query various details about the TB being restored,
+ * including the CPU index, the number of remaining instructions to execute,
+ * the total number of instructions, and the virtual address of
+ * the start of the block.
+ */
+
+unsigned int qemu_plugin_tb_restore_cpu_index(
+    const struct qemu_plugin_tb_restore *tb_restore)
+{
+    return tb_restore->cpu_index;
+}
+
+int qemu_plugin_tb_restore_insns_left(
+    const struct qemu_plugin_tb_restore *tb_restore)
+{
+    return tb_restore->insns_left;
+}
+
+size_t qemu_plugin_tb_restore_tb_n(
+    const struct qemu_plugin_tb_restore *tb_restore)
+{
+    return tb_restore->tb_n;
+}
+
+uint64_t qemu_plugin_tb_restore_tb_pc(
+    const struct qemu_plugin_tb_restore *tb_restore)
+{
+    return tb_restore->tb_pc;
+}
+
+/*
+ * CPU Recompile I/O information:
+ *
+ * A plugin can query various details related to the I/O recompile process,
+ * including the CPU index, the number of instructions in next TB,
+ * the virtual address of the start of current block, and the program counter
+ * of the CPU at the time of the recompile.
+ */
+
+unsigned int qemu_plugin_tb_recompile_io_cpu_index(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    return tb_recompile_io->cpu_index;
+}
+
+uint32_t qemu_plugin_tb_recompile_io_next_tb_n(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    return tb_recompile_io->next_tb_n;
+}
+
+uint64_t qemu_plugin_tb_recompile_io_tb_pc(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    return tb_recompile_io->tb_pc;
+}
+
+uint64_t qemu_plugin_tb_recompile_io_cpu_pc(
+    const struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    return tb_recompile_io->cpu_pc;
+}
+
 /*
  * Instruction information
  *
diff --git a/plugins/core.c b/plugins/core.c
index 2897453cac..a4f429b5f2 100644
--- a/plugins/core.c
+++ b/plugins/core.c
@@ -485,6 +485,48 @@ void qemu_plugin_tb_trans_cb(CPUState *cpu, struct qemu_plugin_tb *tb)
     }
 }
 
+/*
+ * Disable CFI checks.
+ * The callback function has been loaded from an external library so we do not
+ * have type information
+ */
+QEMU_DISABLE_CFI
+void qemu_plugin_tb_restore_cb(CPUState *cpu,
+                               struct qemu_plugin_tb_restore *tb_restore)
+{
+    struct qemu_plugin_cb *cb, *next;
+    enum qemu_plugin_event ev = QEMU_PLUGIN_EV_VCPU_TB_RESTORE;
+
+    /* no plugin_state->event_mask check here; caller should have checked */
+
+    QLIST_FOREACH_SAFE_RCU(cb, &plugin.cb_lists[ev], entry, next) {
+        qemu_plugin_vcpu_tb_restore_cb_t func = cb->f.vcpu_tb_restore;
+
+        func(cb->ctx->id, tb_restore);
+    }
+}
+
+/*
+ * Disable CFI checks.
+ * The callback function has been loaded from an external library so we do not
+ * have type information
+ */
+QEMU_DISABLE_CFI
+void qemu_plugin_tb_recompile_io_cb(CPUState *cpu,
+    struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    struct qemu_plugin_cb *cb, *next;
+    enum qemu_plugin_event ev = QEMU_PLUGIN_EV_VCPU_TB_RECOMPILE_IO;
+
+    /* no plugin_state->event_mask check here; caller should have checked */
+
+    QLIST_FOREACH_SAFE_RCU(cb, &plugin.cb_lists[ev], entry, next) {
+        qemu_plugin_vcpu_tb_recompile_io_cb_t func = cb->f.vcpu_tb_recompile_io;
+
+        func(cb->ctx->id, tb_recompile_io);
+    }
+}
+
 /*
  * Disable CFI checks.
  * The callback function has been loaded from an external library so we do not
diff --git a/plugins/qemu-plugins.symbols b/plugins/qemu-plugins.symbols
index ca773d8d9f..7dcd9f76d5 100644
--- a/plugins/qemu-plugins.symbols
+++ b/plugins/qemu-plugins.symbols
@@ -38,6 +38,8 @@
   qemu_plugin_register_vcpu_tb_exec_cond_cb;
   qemu_plugin_register_vcpu_tb_exec_inline_per_vcpu;
   qemu_plugin_register_vcpu_tb_trans_cb;
+  qemu_plugin_register_vcpu_tb_restore_cb;
+  qemu_plugin_register_vcpu_tb_recompile_io_cb;
   qemu_plugin_request_time_control;
   qemu_plugin_reset;
   qemu_plugin_scoreboard_free;
@@ -47,6 +49,14 @@
   qemu_plugin_tb_get_insn;
   qemu_plugin_tb_n_insns;
   qemu_plugin_tb_vaddr;
+  qemu_plugin_tb_restore_cpu_index;
+  qemu_plugin_tb_restore_insns_left;
+  qemu_plugin_tb_restore_tb_n;
+  qemu_plugin_tb_restore_tb_pc;
+  qemu_plugin_tb_recompile_io_cpu_index;
+  qemu_plugin_tb_recompile_io_next_tb_n;
+  qemu_plugin_tb_recompile_io_tb_pc;
+  qemu_plugin_tb_recompile_io_cpu_pc;
   qemu_plugin_u64_add;
   qemu_plugin_u64_get;
   qemu_plugin_u64_set;
diff --git a/tests/tcg/plugins/bb.c b/tests/tcg/plugins/bb.c
index 36776dee1e..18a16e885f 100644
--- a/tests/tcg/plugins/bb.c
+++ b/tests/tcg/plugins/bb.c
@@ -93,6 +93,29 @@ static void vcpu_tb_trans(qemu_plugin_id_t id, struct qemu_plugin_tb *tb)
     }
 }
 
+static void vcpu_tb_restore(qemu_plugin_id_t id,
+                            struct qemu_plugin_tb_restore *tb_restore)
+{
+    unsigned int cpu_index;
+    cpu_index = qemu_plugin_tb_restore_cpu_index(tb_restore);
+    CPUCount *count = qemu_plugin_scoreboard_find(counts, cpu_index);
+
+    size_t insns_left = qemu_plugin_tb_restore_insns_left(tb_restore);
+    count->insn_count -= insns_left;
+}
+
+static void vcpu_tb_recompile_io(qemu_plugin_id_t id,
+    struct qemu_plugin_tb_recompile_io *tb_recompile_io)
+{
+    unsigned int cpu_index;
+    cpu_index = qemu_plugin_tb_recompile_io_cpu_index(tb_recompile_io);
+    CPUCount *count = qemu_plugin_scoreboard_find(counts, cpu_index);
+
+    uint32_t next_tb_n = qemu_plugin_tb_recompile_io_next_tb_n(tb_recompile_io);
+    count->insn_count += next_tb_n;
+}
+
+
 QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
                                            const qemu_info_t *info,
                                            int argc, char **argv)
@@ -128,6 +151,8 @@ QEMU_PLUGIN_EXPORT int qemu_plugin_install(qemu_plugin_id_t id,
     }
 
     qemu_plugin_register_vcpu_tb_trans_cb(id, vcpu_tb_trans);
+    qemu_plugin_register_vcpu_tb_restore_cb(id, vcpu_tb_restore);
+    qemu_plugin_register_vcpu_tb_recompile_io_cb(id, vcpu_tb_recompile_io);
     qemu_plugin_register_atexit_cb(id, plugin_exit, NULL);
     return 0;
 }
-- 
2.34.1
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Alex Bennée 2 months, 3 weeks ago
Xingran Wang <wangxingran123456@outlook.com> writes:

> Currently, the instruction count obtained by plugins using the translation
> block execution callback is larger than the actual value. Adding callbacks
> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
> correct the instruction count when exiting a translation block
> mid-execution, properly subtracting the excess unexecuted
> instructions.

This smells like exposing two much of the TCG internals to the plugin
mechanism. You can already detect when we don't reach the end of a block
of instructions by instrumentation as I did in:

  Message-Id: <20240718145958.1315270-1-alex.bennee@linaro.org>
  Date: Thu, 18 Jul 2024 15:59:58 +0100
  Subject: [RFC PATCH v3] contrib/plugins: control flow plugin
  From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>

So what exactly are we trying to achieve here? A more efficient
detection of short blocks?

>
> Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
> ---
>  accel/tcg/translate-all.c    |  27 ++++++++
>  include/qemu/plugin-event.h  |   2 +
>  include/qemu/plugin.h        |  24 +++++++
>  include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
>  plugins/api.c                |  78 +++++++++++++++++++++
>  plugins/core.c               |  42 +++++++++++
>  plugins/qemu-plugins.symbols |  10 +++
>  tests/tcg/plugins/bb.c       |  25 +++++++
>  8 files changed, 339 insertions(+)
>
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index fdf6d8ac19..642f684372 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -65,6 +65,7 @@
>  #include "internal-target.h"
>  #include "tcg/perf.h"
>  #include "tcg/insn-start-words.h"
> +#include "qemu/plugin.h"
>  
>  TBContext tb_ctx;
>  
> @@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>          cpu->neg.icount_decr.u16.low += insns_left;
>      }
>  
> +#ifdef CONFIG_PLUGIN
> +    /*
> +     * Notify the plugin with the relevant information
> +     * when restoring the execution state of a TB.
> +     */
> +    struct qemu_plugin_tb_restore ptb_restore;
> +    ptb_restore.cpu_index = cpu->cpu_index;
> +    ptb_restore.insns_left = insns_left;
> +    ptb_restore.tb_n = tb->icount;
> +    ptb_restore.tb_pc = tb->pc;
> +    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
> +#endif
> +

See also the unwind patches which is a more generic approach to ensuring
"special" registers are synced at midpoint when using the register API:

  Message-Id: <20240606032926.83599-1-richard.henderson@linaro.org>
  Date: Wed,  5 Jun 2024 20:29:17 -0700
  Subject: [PATCH v2 0/9] plugins: Use unwind info for special gdb registers
  From: Richard Henderson <richard.henderson@linaro.org>

<snip>

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Pierrick Bouvier 2 months, 3 weeks ago
Hi Xingran,

On 9/2/24 03:42, Alex Bennée wrote:
> Xingran Wang <wangxingran123456@outlook.com> writes:
> 
>> Currently, the instruction count obtained by plugins using the translation
>> block execution callback is larger than the actual value. Adding callbacks
>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
>> correct the instruction count when exiting a translation block
>> mid-execution, properly subtracting the excess unexecuted
>> instructions.
> 
> This smells like exposing two much of the TCG internals to the plugin
> mechanism. You can already detect when we don't reach the end of a block
> of instructions by instrumentation as I did in:
> 

I agree that this is definitely a QEMU implementation "detail", and 
should not be a concern for end users.

The documentation already warns that all instructions may not execute, 
and that in this case, it's better to instrument them directly, instead 
of TB: 
https://www.qemu.org/docs/master/devel/tcg-plugins.html#translation-blocks.

Finally, even if we integrated an API like what you propose in this 
patch, I think it would be very easy for plugins writers to make a 
mistake using it, as you need to keep track of everything yourself.

If we want to stay on the topic of this patch, one direction that would 
be better in my opinion is a "after_tb_exec" API, where the TB passed in 
parameter is guaranteed to have all its instructions that were executed 
(not interrupted).

>    Message-Id: <20240718145958.1315270-1-alex.bennee@linaro.org>
>    Date: Thu, 18 Jul 2024 15:59:58 +0100
>    Subject: [RFC PATCH v3] contrib/plugins: control flow plugin
>    From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
> 
> So what exactly are we trying to achieve here? A more efficient
> detection of short blocks?
> 
>>
>> Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
>> ---
>>   accel/tcg/translate-all.c    |  27 ++++++++
>>   include/qemu/plugin-event.h  |   2 +
>>   include/qemu/plugin.h        |  24 +++++++
>>   include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
>>   plugins/api.c                |  78 +++++++++++++++++++++
>>   plugins/core.c               |  42 +++++++++++
>>   plugins/qemu-plugins.symbols |  10 +++
>>   tests/tcg/plugins/bb.c       |  25 +++++++
>>   8 files changed, 339 insertions(+)
>>
>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>> index fdf6d8ac19..642f684372 100644
>> --- a/accel/tcg/translate-all.c
>> +++ b/accel/tcg/translate-all.c
>> @@ -65,6 +65,7 @@
>>   #include "internal-target.h"
>>   #include "tcg/perf.h"
>>   #include "tcg/insn-start-words.h"
>> +#include "qemu/plugin.h"
>>   
>>   TBContext tb_ctx;
>>   
>> @@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState *cpu, TranslationBlock *tb,
>>           cpu->neg.icount_decr.u16.low += insns_left;
>>       }
>>   
>> +#ifdef CONFIG_PLUGIN
>> +    /*
>> +     * Notify the plugin with the relevant information
>> +     * when restoring the execution state of a TB.
>> +     */
>> +    struct qemu_plugin_tb_restore ptb_restore;
>> +    ptb_restore.cpu_index = cpu->cpu_index;
>> +    ptb_restore.insns_left = insns_left;
>> +    ptb_restore.tb_n = tb->icount;
>> +    ptb_restore.tb_pc = tb->pc;
>> +    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
>> +#endif
>> +
> 
> See also the unwind patches which is a more generic approach to ensuring
> "special" registers are synced at midpoint when using the register API:
> 
>    Message-Id: <20240606032926.83599-1-richard.henderson@linaro.org>
>    Date: Wed,  5 Jun 2024 20:29:17 -0700
>    Subject: [PATCH v2 0/9] plugins: Use unwind info for special gdb registers
>    From: Richard Henderson <richard.henderson@linaro.org>
> 
> <snip>
> 

Thanks,
Pierrick
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Alex Bennée 2 months, 3 weeks ago
Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:

> Hi Xingran,
>
> On 9/2/24 03:42, Alex Bennée wrote:
>> Xingran Wang <wangxingran123456@outlook.com> writes:
>> 
>>> Currently, the instruction count obtained by plugins using the translation
>>> block execution callback is larger than the actual value. Adding callbacks
>>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
>>> correct the instruction count when exiting a translation block
>>> mid-execution, properly subtracting the excess unexecuted
>>> instructions.
>> This smells like exposing two much of the TCG internals to the
>> plugin
>> mechanism. You can already detect when we don't reach the end of a block
>> of instructions by instrumentation as I did in:
>> 
>
> I agree that this is definitely a QEMU implementation "detail", and
> should not be a concern for end users.
>
> The documentation already warns that all instructions may not execute,
> and that in this case, it's better to instrument them directly,
> instead of TB:
> https://www.qemu.org/docs/master/devel/tcg-plugins.html#translation-blocks.
>
> Finally, even if we integrated an API like what you propose in this
> patch, I think it would be very easy for plugins writers to make a
> mistake using it, as you need to keep track of everything yourself.
>
> If we want to stay on the topic of this patch, one direction that
> would be better in my opinion is a "after_tb_exec" API, where the TB
> passed in parameter is guaranteed to have all its instructions that
> were executed (not interrupted).

Or indeed resolves with the current PC at the "end" of the TB where it
gets to. QEMU could keep track of that easily enough as the recompile
and bus fault paths are slow paths anyway. It would be tricky to support
inline for that though.

As TB is exposed internally I think we'd just need to set a flag and
call out. Maybe an API like:

  /**
   * typedef qemu_plugin_vcpu_tb_end_cb_t - vcpu callback at end of block
   * @vcpu_index: the current vcpu context
   * @pc: the next PC
   * @insns: instructions executed in block
   * @userdata: a pointer to some user data supplied when the callback
   * was registered.
   */
  typedef void (*qemu_plugin_vcpu_tb_end_cb_t)(unsigned int vcpu_index,
                                               uint64_t next_pc,
                                               size_t n_insns,
                                               void *userdata);

  /**
   * qemu_plugin_register_vcpu_tb_exec_end_cb() - register execution callback at end of TB
   * @tb: the opaque qemu_plugin_tb handle for the translation
   * @cb: callback function
   * @flags: does the plugin read or write the CPU's registers?
   * @userdata: any plugin data to pass to the @cb?
   *
   * The @cb function is called every time a translated unit executes.
   */
  QEMU_PLUGIN_API
  void qemu_plugin_register_vcpu_tb_exec_end_cb(struct qemu_plugin_tb *tb,
                                                 qemu_plugin_vcpu_tb_end_cb_t cb,
                                                 enum qemu_plugin_cb_flags flags,
                                                 void *userdata);

I think the tricky bit would be getting TCG to emit the callback code
for the last instruction before the
tcg_gen_exit_tb/tcg_gen_lookup_and_goto_ptr bits but after whatever else
it has done to execute the instruction.

I don't think we could easily support inline ops at tb end though.

Richard,

What do you think?


>>    Message-Id: <20240718145958.1315270-1-alex.bennee@linaro.org>
>>    Date: Thu, 18 Jul 2024 15:59:58 +0100
>>    Subject: [RFC PATCH v3] contrib/plugins: control flow plugin
>>    From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
>> So what exactly are we trying to achieve here? A more efficient
>> detection of short blocks?
>> 
>>>
>>> Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
>>> ---
>>>   accel/tcg/translate-all.c    |  27 ++++++++
>>>   include/qemu/plugin-event.h  |   2 +
>>>   include/qemu/plugin.h        |  24 +++++++
>>>   include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
>>>   plugins/api.c                |  78 +++++++++++++++++++++
>>>   plugins/core.c               |  42 +++++++++++
>>>   plugins/qemu-plugins.symbols |  10 +++
>>>   tests/tcg/plugins/bb.c       |  25 +++++++
>>>   8 files changed, 339 insertions(+)
>>>
>>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>>> index fdf6d8ac19..642f684372 100644
>>> --- a/accel/tcg/translate-all.c
>>> +++ b/accel/tcg/translate-all.c
>>> @@ -65,6 +65,7 @@
>>>   #include "internal-target.h"
>>>   #include "tcg/perf.h"
>>>   #include "tcg/insn-start-words.h"
>>> +#include "qemu/plugin.h"
>>>     TBContext tb_ctx;
>>>   @@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState
>>> *cpu, TranslationBlock *tb,
>>>           cpu->neg.icount_decr.u16.low += insns_left;
>>>       }
>>>   +#ifdef CONFIG_PLUGIN
>>> +    /*
>>> +     * Notify the plugin with the relevant information
>>> +     * when restoring the execution state of a TB.
>>> +     */
>>> +    struct qemu_plugin_tb_restore ptb_restore;
>>> +    ptb_restore.cpu_index = cpu->cpu_index;
>>> +    ptb_restore.insns_left = insns_left;
>>> +    ptb_restore.tb_n = tb->icount;
>>> +    ptb_restore.tb_pc = tb->pc;
>>> +    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
>>> +#endif
>>> +
>> See also the unwind patches which is a more generic approach to
>> ensuring
>> "special" registers are synced at midpoint when using the register API:
>>    Message-Id: <20240606032926.83599-1-richard.henderson@linaro.org>
>>    Date: Wed,  5 Jun 2024 20:29:17 -0700
>>    Subject: [PATCH v2 0/9] plugins: Use unwind info for special gdb registers
>>    From: Richard Henderson <richard.henderson@linaro.org>
>> <snip>
>> 
>
> Thanks,
> Pierrick

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Richard Henderson 2 months, 3 weeks ago
On 9/2/24 10:52, Alex Bennée wrote:
> Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:
> 
>> Hi Xingran,
>>
>> On 9/2/24 03:42, Alex Bennée wrote:
>>> Xingran Wang <wangxingran123456@outlook.com> writes:
>>>
>>>> Currently, the instruction count obtained by plugins using the translation
>>>> block execution callback is larger than the actual value. Adding callbacks
>>>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
>>>> correct the instruction count when exiting a translation block
>>>> mid-execution, properly subtracting the excess unexecuted
>>>> instructions.
>>> This smells like exposing two much of the TCG internals to the
>>> plugin
>>> mechanism. You can already detect when we don't reach the end of a block
>>> of instructions by instrumentation as I did in:
>>>
>>
>> I agree that this is definitely a QEMU implementation "detail", and
>> should not be a concern for end users.
>>
>> The documentation already warns that all instructions may not execute,
>> and that in this case, it's better to instrument them directly,
>> instead of TB:
>> https://www.qemu.org/docs/master/devel/tcg-plugins.html#translation-blocks.
>>
>> Finally, even if we integrated an API like what you propose in this
>> patch, I think it would be very easy for plugins writers to make a
>> mistake using it, as you need to keep track of everything yourself.
>>
>> If we want to stay on the topic of this patch, one direction that
>> would be better in my opinion is a "after_tb_exec" API, where the TB
>> passed in parameter is guaranteed to have all its instructions that
>> were executed (not interrupted).
> 
> Or indeed resolves with the current PC at the "end" of the TB where it
> gets to. QEMU could keep track of that easily enough as the recompile
> and bus fault paths are slow paths anyway. It would be tricky to support
> inline for that though.
> 
> As TB is exposed internally I think we'd just need to set a flag and
> call out. Maybe an API like:
> 
>    /**
>     * typedef qemu_plugin_vcpu_tb_end_cb_t - vcpu callback at end of block
>     * @vcpu_index: the current vcpu context
>     * @pc: the next PC
>     * @insns: instructions executed in block
>     * @userdata: a pointer to some user data supplied when the callback
>     * was registered.
>     */
>    typedef void (*qemu_plugin_vcpu_tb_end_cb_t)(unsigned int vcpu_index,
>                                                 uint64_t next_pc,
>                                                 size_t n_insns,
>                                                 void *userdata);
> 
>    /**
>     * qemu_plugin_register_vcpu_tb_exec_end_cb() - register execution callback at end of TB
>     * @tb: the opaque qemu_plugin_tb handle for the translation
>     * @cb: callback function
>     * @flags: does the plugin read or write the CPU's registers?
>     * @userdata: any plugin data to pass to the @cb?
>     *
>     * The @cb function is called every time a translated unit executes.
>     */
>    QEMU_PLUGIN_API
>    void qemu_plugin_register_vcpu_tb_exec_end_cb(struct qemu_plugin_tb *tb,
>                                                   qemu_plugin_vcpu_tb_end_cb_t cb,
>                                                   enum qemu_plugin_cb_flags flags,
>                                                   void *userdata);
> 
> I think the tricky bit would be getting TCG to emit the callback code
> for the last instruction before the
> tcg_gen_exit_tb/tcg_gen_lookup_and_goto_ptr bits but after whatever else
> it has done to execute the instruction.
> 
> I don't think we could easily support inline ops at tb end though.
> 
> Richard,
> 
> What do you think?
I think this will miss all exceptions raised in the middle of the block.
I don't think it will be reliable at all.


r~

Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Alex Bennée 2 months, 3 weeks ago
Richard Henderson <richard.henderson@linaro.org> writes:

> On 9/2/24 10:52, Alex Bennée wrote:
>> Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:
>> 
>>> Hi Xingran,
>>>
>>> On 9/2/24 03:42, Alex Bennée wrote:
>>>> Xingran Wang <wangxingran123456@outlook.com> writes:
>>>>
>>>>> Currently, the instruction count obtained by plugins using the translation
>>>>> block execution callback is larger than the actual value. Adding callbacks
>>>>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
>>>>> correct the instruction count when exiting a translation block
>>>>> mid-execution, properly subtracting the excess unexecuted
>>>>> instructions.
>>>> This smells like exposing two much of the TCG internals to the
>>>> plugin
>>>> mechanism. You can already detect when we don't reach the end of a block
>>>> of instructions by instrumentation as I did in:
>>>>
>>>
>>> I agree that this is definitely a QEMU implementation "detail", and
>>> should not be a concern for end users.
>>>
<snip>
>>    /**
>>     * qemu_plugin_register_vcpu_tb_exec_end_cb() - register execution callback at end of TB
>>     * @tb: the opaque qemu_plugin_tb handle for the translation
>>     * @cb: callback function
>>     * @flags: does the plugin read or write the CPU's registers?
>>     * @userdata: any plugin data to pass to the @cb?
>>     *
>>     * The @cb function is called every time a translated unit executes.
>>     */
>>    QEMU_PLUGIN_API
>>    void qemu_plugin_register_vcpu_tb_exec_end_cb(struct qemu_plugin_tb *tb,
>>                                                   qemu_plugin_vcpu_tb_end_cb_t cb,
>>                                                   enum qemu_plugin_cb_flags flags,
>>                                                   void *userdata);
>> I think the tricky bit would be getting TCG to emit the callback
>> code
>> for the last instruction before the
>> tcg_gen_exit_tb/tcg_gen_lookup_and_goto_ptr bits but after whatever else
>> it has done to execute the instruction.
>> I don't think we could easily support inline ops at tb end though.
>> Richard,
>> What do you think?
> I think this will miss all exceptions raised in the middle of the block.
> I don't think it will be reliable at all.

Ahh yes - I guess we can't fixup as we go through cpu_loop_exit() and
the restore code is only called from helpers. Oh well I think we can
make do with what we currently have.

>
>
> r~

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Pierrick Bouvier 2 months, 3 weeks ago
On 9/2/24 10:52, Alex Bennée wrote:
> Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:
> 
>> Hi Xingran,
>>
>> On 9/2/24 03:42, Alex Bennée wrote:
>>> Xingran Wang <wangxingran123456@outlook.com> writes:
>>>
>>>> Currently, the instruction count obtained by plugins using the translation
>>>> block execution callback is larger than the actual value. Adding callbacks
>>>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows plugins to
>>>> correct the instruction count when exiting a translation block
>>>> mid-execution, properly subtracting the excess unexecuted
>>>> instructions.
>>> This smells like exposing two much of the TCG internals to the
>>> plugin
>>> mechanism. You can already detect when we don't reach the end of a block
>>> of instructions by instrumentation as I did in:
>>>
>>
>> I agree that this is definitely a QEMU implementation "detail", and
>> should not be a concern for end users.
>>
>> The documentation already warns that all instructions may not execute,
>> and that in this case, it's better to instrument them directly,
>> instead of TB:
>> https://www.qemu.org/docs/master/devel/tcg-plugins.html#translation-blocks.
>>
>> Finally, even if we integrated an API like what you propose in this
>> patch, I think it would be very easy for plugins writers to make a
>> mistake using it, as you need to keep track of everything yourself.
>>
>> If we want to stay on the topic of this patch, one direction that
>> would be better in my opinion is a "after_tb_exec" API, where the TB
>> passed in parameter is guaranteed to have all its instructions that
>> were executed (not interrupted).
> 
> Or indeed resolves with the current PC at the "end" of the TB where it
> gets to. QEMU could keep track of that easily enough as the recompile
> and bus fault paths are slow paths anyway. It would be tricky to support
> inline for that though.
> 
> As TB is exposed internally I think we'd just need to set a flag and
> call out. Maybe an API like:
> 
>    /**
>     * typedef qemu_plugin_vcpu_tb_end_cb_t - vcpu callback at end of block
>     * @vcpu_index: the current vcpu context
>     * @pc: the next PC
>     * @insns: instructions executed in block
>     * @userdata: a pointer to some user data supplied when the callback
>     * was registered.
>     */
>    typedef void (*qemu_plugin_vcpu_tb_end_cb_t)(unsigned int vcpu_index,
>                                                 uint64_t next_pc,
>                                                 size_t n_insns,
>                                                 void *userdata);
> 
>    /**
>     * qemu_plugin_register_vcpu_tb_exec_end_cb() - register execution callback at end of TB
>     * @tb: the opaque qemu_plugin_tb handle for the translation
>     * @cb: callback function
>     * @flags: does the plugin read or write the CPU's registers?
>     * @userdata: any plugin data to pass to the @cb?
>     *
>     * The @cb function is called every time a translated unit executes.
>     */
>    QEMU_PLUGIN_API
>    void qemu_plugin_register_vcpu_tb_exec_end_cb(struct qemu_plugin_tb *tb,
>                                                   qemu_plugin_vcpu_tb_end_cb_t cb,
>                                                   enum qemu_plugin_cb_flags flags,
>                                                   void *userdata);
> 

Something like this, yes.
I still think it makes the whole API too complex, and would confuse 
users. If plugins writers need "instruction accurate" instrumentation, 
there are already functions for that.
And if the only use case is to identify control flow changes, then we 
could create a dedicated API for this.

I wonder what is the original use case of Xingran. Any chance you could 
share with us why this is needed, and why existing functions are not enough?

> I think the tricky bit would be getting TCG to emit the callback code
> for the last instruction before the
> tcg_gen_exit_tb/tcg_gen_lookup_and_goto_ptr bits but after whatever else
> it has done to execute the instruction.
> 
> I don't think we could easily support inline ops at tb end though.
> 
> Richard,
> 
> What do you think?
> 
> 
>>>     Message-Id: <20240718145958.1315270-1-alex.bennee@linaro.org>
>>>     Date: Thu, 18 Jul 2024 15:59:58 +0100
>>>     Subject: [RFC PATCH v3] contrib/plugins: control flow plugin
>>>     From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
>>> So what exactly are we trying to achieve here? A more efficient
>>> detection of short blocks?
>>>
>>>>
>>>> Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
>>>> ---
>>>>    accel/tcg/translate-all.c    |  27 ++++++++
>>>>    include/qemu/plugin-event.h  |   2 +
>>>>    include/qemu/plugin.h        |  24 +++++++
>>>>    include/qemu/qemu-plugin.h   | 131 +++++++++++++++++++++++++++++++++++
>>>>    plugins/api.c                |  78 +++++++++++++++++++++
>>>>    plugins/core.c               |  42 +++++++++++
>>>>    plugins/qemu-plugins.symbols |  10 +++
>>>>    tests/tcg/plugins/bb.c       |  25 +++++++
>>>>    8 files changed, 339 insertions(+)
>>>>
>>>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>>>> index fdf6d8ac19..642f684372 100644
>>>> --- a/accel/tcg/translate-all.c
>>>> +++ b/accel/tcg/translate-all.c
>>>> @@ -65,6 +65,7 @@
>>>>    #include "internal-target.h"
>>>>    #include "tcg/perf.h"
>>>>    #include "tcg/insn-start-words.h"
>>>> +#include "qemu/plugin.h"
>>>>      TBContext tb_ctx;
>>>>    @@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState
>>>> *cpu, TranslationBlock *tb,
>>>>            cpu->neg.icount_decr.u16.low += insns_left;
>>>>        }
>>>>    +#ifdef CONFIG_PLUGIN
>>>> +    /*
>>>> +     * Notify the plugin with the relevant information
>>>> +     * when restoring the execution state of a TB.
>>>> +     */
>>>> +    struct qemu_plugin_tb_restore ptb_restore;
>>>> +    ptb_restore.cpu_index = cpu->cpu_index;
>>>> +    ptb_restore.insns_left = insns_left;
>>>> +    ptb_restore.tb_n = tb->icount;
>>>> +    ptb_restore.tb_pc = tb->pc;
>>>> +    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
>>>> +#endif
>>>> +
>>> See also the unwind patches which is a more generic approach to
>>> ensuring
>>> "special" registers are synced at midpoint when using the register API:
>>>     Message-Id: <20240606032926.83599-1-richard.henderson@linaro.org>
>>>     Date: Wed,  5 Jun 2024 20:29:17 -0700
>>>     Subject: [PATCH v2 0/9] plugins: Use unwind info for special gdb registers
>>>     From: Richard Henderson <richard.henderson@linaro.org>
>>> <snip>
>>>
>>
>> Thanks,
>> Pierrick
> 
Re: [PATCH] plugins: add two events for cpu_restore_state_from_tb() and cpu_io_recompile()
Posted by Xingran Wang 2 months, 3 weeks ago
Before seeing Alex's suggestion, I didn't realize that we could use the 
existing injection mechanism to enable plugins to count instructions 
accurately.

Now, I also agree that the new API introduced by this patch for the 
plugin subsystem could make the plugin API overly complex.


Many thanks to Alex and Pierrick for the timely review and helpful comments.



On 9/3/24 2:56 AM, Pierrick Bouvier wrote:
> On 9/2/24 10:52, Alex Bennée wrote:
>> Pierrick Bouvier <pierrick.bouvier@linaro.org> writes:
>>
>>> Hi Xingran,
>>>
>>> On 9/2/24 03:42, Alex Bennée wrote:
>>>> Xingran Wang <wangxingran123456@outlook.com> writes:
>>>>
>>>>> Currently, the instruction count obtained by plugins using the 
>>>>> translation
>>>>> block execution callback is larger than the actual value. Adding 
>>>>> callbacks
>>>>> in cpu_restore_state_from_tb() and cpu_io_recompile() allows 
>>>>> plugins to
>>>>> correct the instruction count when exiting a translation block
>>>>> mid-execution, properly subtracting the excess unexecuted
>>>>> instructions.
>>>> This smells like exposing two much of the TCG internals to the
>>>> plugin
>>>> mechanism. You can already detect when we don't reach the end of a 
>>>> block
>>>> of instructions by instrumentation as I did in:
>>>>
>>>
>>> I agree that this is definitely a QEMU implementation "detail", and
>>> should not be a concern for end users.
>>>
>>> The documentation already warns that all instructions may not execute,
>>> and that in this case, it's better to instrument them directly,
>>> instead of TB:
>>> https://www.qemu.org/docs/master/devel/tcg-plugins.html#translation-blocks. 
>>>
>>>
>>> Finally, even if we integrated an API like what you propose in this
>>> patch, I think it would be very easy for plugins writers to make a
>>> mistake using it, as you need to keep track of everything yourself.
>>>
>>> If we want to stay on the topic of this patch, one direction that
>>> would be better in my opinion is a "after_tb_exec" API, where the TB
>>> passed in parameter is guaranteed to have all its instructions that
>>> were executed (not interrupted).
>>
>> Or indeed resolves with the current PC at the "end" of the TB where it
>> gets to. QEMU could keep track of that easily enough as the recompile
>> and bus fault paths are slow paths anyway. It would be tricky to support
>> inline for that though.
>>
>> As TB is exposed internally I think we'd just need to set a flag and
>> call out. Maybe an API like:
>>
>>    /**
>>     * typedef qemu_plugin_vcpu_tb_end_cb_t - vcpu callback at end of 
>> block
>>     * @vcpu_index: the current vcpu context
>>     * @pc: the next PC
>>     * @insns: instructions executed in block
>>     * @userdata: a pointer to some user data supplied when the callback
>>     * was registered.
>>     */
>>    typedef void (*qemu_plugin_vcpu_tb_end_cb_t)(unsigned int vcpu_index,
>>                                                 uint64_t next_pc,
>>                                                 size_t n_insns,
>>                                                 void *userdata);
>>
>>    /**
>>     * qemu_plugin_register_vcpu_tb_exec_end_cb() - register execution 
>> callback at end of TB
>>     * @tb: the opaque qemu_plugin_tb handle for the translation
>>     * @cb: callback function
>>     * @flags: does the plugin read or write the CPU's registers?
>>     * @userdata: any plugin data to pass to the @cb?
>>     *
>>     * The @cb function is called every time a translated unit executes.
>>     */
>>    QEMU_PLUGIN_API
>>    void qemu_plugin_register_vcpu_tb_exec_end_cb(struct 
>> qemu_plugin_tb *tb,
>> qemu_plugin_vcpu_tb_end_cb_t cb,
>>                                                   enum 
>> qemu_plugin_cb_flags flags,
>>                                                   void *userdata);
>>
>
> Something like this, yes.
> I still think it makes the whole API too complex, and would confuse 
> users. If plugins writers need "instruction accurate" instrumentation, 
> there are already functions for that.
> And if the only use case is to identify control flow changes, then we 
> could create a dedicated API for this.
>

The API that Alex proposed looks great and could replace the two event 
callbacks introduced in this patch.
I also agree with Pierrick that the existing instrumentation mechanism 
already allows users to achieve instruction-accurate instrumentation.
It would be even better if this API didn't rely on inserting TCG 
instructions before every instruction in the TB, as this would offer 
better performance compared to the original implementation for 
identifying control flow changes.

> I wonder what is the original use case of Xingran. Any chance you 
> could share with us why this is needed, and why existing functions are 
> not enough?

Summary:

In retrospect, these two event callbacks don't serve a general purpose 
and can be replaced by the existing injection mechanism, which indeed 
makes the API confusing and complex.

Detail:


We (myself and the XiangShan team) attempted to implement the "SimPoint" 
method using QEMU's plugin system and to accelerate RISCV CPU core 
performance simulation with "SimPoint Checkpoint" and QEMU.

The first step of "SimPoint" is to profile the frequency of executed 
code to create code signatures representing the program's behavior at 
different execution points.
This requires an accurate instruction count, which can be instrumented 
by the plugin.

I used a simple bare-metal program to verify that the instruction count 
instrumented by the plugin matched the actual value.
However, I discovered that TCG would prematurely exit the currently 
executing TB when encountering an MMIO instruction or an exception 
without notifying the count plugin, resulting in the instruction count 
higher than the actual value.
As a QEMU novice, a straightforward solution for me was to have TCG 
notify the plugin whenever a TB is exited prematurely during 
execution.This is the role of the event callback in 
cpu_restore_state_from_tb().
The event callback in cpu_io_recompile(), on the other hand, was added 
as a workaround because the recompiled TB only allows memory 
instrumentation, making the instruction count for this short TB 
undetectable by the plugin.

>
>> I think the tricky bit would be getting TCG to emit the callback code
>> for the last instruction before the
>> tcg_gen_exit_tb/tcg_gen_lookup_and_goto_ptr bits but after whatever else
>> it has done to execute the instruction.
>>
>> I don't think we could easily support inline ops at tb end though.
>>
>> Richard,
>>
>> What do you think?
>>
>>
>>>>     Message-Id: <20240718145958.1315270-1-alex.bennee@linaro.org>
>>>>     Date: Thu, 18 Jul 2024 15:59:58 +0100
>>>>     Subject: [RFC PATCH v3] contrib/plugins: control flow plugin
>>>>     From: =?UTF-8?q?Alex=20Benn=C3=A9e?= <alex.bennee@linaro.org>
>>>> So what exactly are we trying to achieve here? A more efficient
>>>> detection of short blocks?
>>>>
>>>>>
>>>>> Signed-off-by: Xingran Wang <wangxingran123456@outlook.com>
>>>>> ---
>>>>>    accel/tcg/translate-all.c    |  27 ++++++++
>>>>>    include/qemu/plugin-event.h  |   2 +
>>>>>    include/qemu/plugin.h        |  24 +++++++
>>>>>    include/qemu/qemu-plugin.h   | 131 
>>>>> +++++++++++++++++++++++++++++++++++
>>>>>    plugins/api.c                |  78 +++++++++++++++++++++
>>>>>    plugins/core.c               |  42 +++++++++++
>>>>>    plugins/qemu-plugins.symbols |  10 +++
>>>>>    tests/tcg/plugins/bb.c       |  25 +++++++
>>>>>    8 files changed, 339 insertions(+)
>>>>>
>>>>> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
>>>>> index fdf6d8ac19..642f684372 100644
>>>>> --- a/accel/tcg/translate-all.c
>>>>> +++ b/accel/tcg/translate-all.c
>>>>> @@ -65,6 +65,7 @@
>>>>>    #include "internal-target.h"
>>>>>    #include "tcg/perf.h"
>>>>>    #include "tcg/insn-start-words.h"
>>>>> +#include "qemu/plugin.h"
>>>>>      TBContext tb_ctx;
>>>>>    @@ -218,6 +219,19 @@ void cpu_restore_state_from_tb(CPUState
>>>>> *cpu, TranslationBlock *tb,
>>>>>            cpu->neg.icount_decr.u16.low += insns_left;
>>>>>        }
>>>>>    +#ifdef CONFIG_PLUGIN
>>>>> +    /*
>>>>> +     * Notify the plugin with the relevant information
>>>>> +     * when restoring the execution state of a TB.
>>>>> +     */
>>>>> +    struct qemu_plugin_tb_restore ptb_restore;
>>>>> +    ptb_restore.cpu_index = cpu->cpu_index;
>>>>> +    ptb_restore.insns_left = insns_left;
>>>>> +    ptb_restore.tb_n = tb->icount;
>>>>> +    ptb_restore.tb_pc = tb->pc;
>>>>> +    qemu_plugin_tb_restore_cb(cpu, &ptb_restore);
>>>>> +#endif
>>>>> +
>>>> See also the unwind patches which is a more generic approach to
>>>> ensuring
>>>> "special" registers are synced at midpoint when using the register 
>>>> API:
>>>>     Message-Id: <20240606032926.83599-1-richard.henderson@linaro.org>
>>>>     Date: Wed,  5 Jun 2024 20:29:17 -0700
>>>>     Subject: [PATCH v2 0/9] plugins: Use unwind info for special 
>>>> gdb registers
>>>>     From: Richard Henderson <richard.henderson@linaro.org>
>>>> <snip>
>>>>
>>>
>>> Thanks,
>>> Pierrick
>>