This will allow us to measure the performance impact of FP
emulation optimizations.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
tests/fp-bench.c | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
tests/.gitignore | 1 +
tests/Makefile.include | 3 +-
3 files changed, 293 insertions(+), 1 deletion(-)
create mode 100644 tests/fp-bench.c
diff --git a/tests/fp-bench.c b/tests/fp-bench.c
new file mode 100644
index 0000000..a782093
--- /dev/null
+++ b/tests/fp-bench.c
@@ -0,0 +1,290 @@
+/*
+ * fp-bench.c - A collection of simple floating point microbenchmarks.
+ *
+ * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
+ *
+ * License: GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+#include "qemu/osdep.h"
+#include "qemu/atomic.h"
+
+#include <math.h>
+
+#include <sys/time.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <time.h>
+
+/* amortize the computation of random inputs */
+#define OPS_PER_ITER (1000ULL)
+
+#define SEED_A 0xdeadfacedeadface
+#define SEED_B 0xbadc0feebadc0fee
+#define SEED_C 0xbeefdeadbeefdead
+
+enum op {
+ OP_ADD,
+ OP_SUB,
+ OP_MUL,
+ OP_DIV,
+ OP_FMA,
+ OP_SQRT,
+};
+
+static const char * const op_names[] = {
+ [OP_ADD] = "add",
+ [OP_SUB] = "sub",
+ [OP_MUL] = "mul",
+ [OP_DIV] = "div",
+ [OP_FMA] = "fma",
+ [OP_SQRT] = "sqrt",
+};
+
+static uint64_t n_ops = 10000000;
+static enum op op;
+static const char *precision = "float";
+
+static const char commands_string[] =
+ " -n = number of floating point operations\n"
+ " -o = floating point operation (add, sub, mul, div, fma, sqrt). Default: add\n"
+ " -p = precision (float|single, double). Default: float";
+
+static void usage_complete(int argc, char *argv[])
+{
+ fprintf(stderr, "Usage: %s [options]\n", argv[0]);
+ fprintf(stderr, "options:\n%s\n", commands_string);
+ exit(-1);
+}
+
+static void set_op(const char *name)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(op_names); i++) {
+ if (strcmp(name, op_names[i]) == 0) {
+ op = i;
+ return;
+ }
+ }
+ fprintf(stderr, "Unsupported op '%s'\n", name);
+ exit(EXIT_FAILURE);
+}
+
+static inline int64_t get_clock_realtime(void)
+{
+ struct timeval tv;
+
+ gettimeofday(&tv, NULL);
+ return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
+}
+
+/*
+ * From: https://en.wikipedia.org/wiki/Xorshift
+ * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
+ * guaranteed to be >= INT_MAX).
+ */
+static uint64_t xorshift64star(uint64_t x)
+{
+ x ^= x >> 12; /* a */
+ x ^= x << 25; /* b */
+ x ^= x >> 27; /* c */
+ return x * UINT64_C(2685821657736338717);
+}
+
+static inline bool u32_is_normal(uint32_t x)
+{
+ return ((x + 0x00800000) & 0x7fffffff) >= 0x01000000;
+}
+
+static inline bool u64_is_normal(uint64_t x)
+{
+ return ((x + (1ULL << 52)) & -1ULL >> 1) >= 1ULL << 53;
+}
+
+static inline float get_random_float(uint64_t *x)
+{
+ uint64_t r = *x;
+ uint32_t r32;
+
+ do {
+ r = xorshift64star(r);
+ } while (!u32_is_normal(r));
+ *x = r;
+ r32 = r;
+ return *(float *)&r32;
+}
+
+static inline double get_random_double(uint64_t *x)
+{
+ uint64_t r = *x;
+
+ do {
+ r = xorshift64star(r);
+ } while (!u64_is_normal(r));
+ *x = r;
+ return *(double *)&r;
+}
+
+/*
+ * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
+ * volatile.
+ */
+#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION) \
+ static void NAME(volatile PRECISION *res) \
+ { \
+ uint64_t ra = SEED_A; \
+ uint64_t i, j; \
+ \
+ for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
+ volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
+ \
+ for (j = 0; j < OPS_PER_ITER; j++) { \
+ *res = FUNC(a); \
+ } \
+ } \
+ }
+
+GEN_BENCH_1OPF(bench_float_sqrt, sqrtf, float)
+GEN_BENCH_1OPF(bench_double_sqrt, sqrt, double)
+#undef GEN_BENCH_1OPF
+
+#define GEN_BENCH_2OP(NAME, OP, PRECISION) \
+ static void NAME(volatile PRECISION *res) \
+ { \
+ uint64_t ra = SEED_A; \
+ uint64_t rb = SEED_B; \
+ uint64_t i, j; \
+ \
+ for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
+ volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
+ volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
+ \
+ for (j = 0; j < OPS_PER_ITER; j++) { \
+ *res = a OP b; \
+ } \
+ } \
+ }
+
+GEN_BENCH_2OP(bench_float_add, +, float)
+GEN_BENCH_2OP(bench_float_sub, -, float)
+GEN_BENCH_2OP(bench_float_mul, *, float)
+GEN_BENCH_2OP(bench_float_div, /, float)
+
+GEN_BENCH_2OP(bench_double_add, +, double)
+GEN_BENCH_2OP(bench_double_sub, -, double)
+GEN_BENCH_2OP(bench_double_mul, *, double)
+GEN_BENCH_2OP(bench_double_div, /, double)
+
+#define GEN_BENCH_3OPF(NAME, FUNC, PRECISION) \
+ static void NAME(volatile PRECISION *res) \
+ { \
+ uint64_t ra = SEED_A; \
+ uint64_t rb = SEED_B; \
+ uint64_t rc = SEED_C; \
+ uint64_t i, j; \
+ \
+ for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
+ volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
+ volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
+ volatile PRECISION c = glue(get_random_, PRECISION)(&rc); \
+ \
+ for (j = 0; j < OPS_PER_ITER; j++) { \
+ *res = FUNC(a, b, c); \
+ } \
+ } \
+ }
+
+GEN_BENCH_3OPF(bench_float_fma, fmaf, float)
+GEN_BENCH_3OPF(bench_double_fma, fma, double)
+#undef GEN_BENCH_3OPF
+
+static void parse_args(int argc, char *argv[])
+{
+ int c;
+
+ for (;;) {
+ c = getopt(argc, argv, "n:ho:p:");
+ if (c < 0) {
+ break;
+ }
+ switch (c) {
+ case 'h':
+ usage_complete(argc, argv);
+ exit(0);
+ case 'n':
+ n_ops = atoll(optarg);
+ if (n_ops < OPS_PER_ITER) {
+ n_ops = OPS_PER_ITER;
+ }
+ n_ops -= n_ops % OPS_PER_ITER;
+ break;
+ case 'o':
+ set_op(optarg);
+ break;
+ case 'p':
+ precision = optarg;
+ if (strcmp(precision, "float") &&
+ strcmp(precision, "single") &&
+ strcmp(precision, "double")) {
+ fprintf(stderr, "Unsupported precision '%s'\n", precision);
+ exit(EXIT_FAILURE);
+ }
+ break;
+ }
+ }
+}
+
+#define CALL_BENCH(OP, PRECISION, RESP) \
+ do { \
+ switch (OP) { \
+ case OP_ADD: \
+ glue(glue(bench_, PRECISION), _add)(RESP); \
+ break; \
+ case OP_SUB: \
+ glue(glue(bench_, PRECISION), _sub)(RESP); \
+ break; \
+ case OP_MUL: \
+ glue(glue(bench_, PRECISION), _mul)(RESP); \
+ break; \
+ case OP_DIV: \
+ glue(glue(bench_, PRECISION), _div)(RESP); \
+ break; \
+ case OP_FMA: \
+ glue(glue(bench_, PRECISION), _fma)(RESP); \
+ break; \
+ case OP_SQRT: \
+ glue(glue(bench_, PRECISION), _sqrt)(RESP); \
+ break; \
+ default: \
+ g_assert_not_reached(); \
+ } \
+ } while (0)
+
+int main(int argc, char *argv[])
+{
+ int64_t t0, t1;
+ double resd;
+
+ parse_args(argc, argv);
+ if (!strcmp(precision, "float") || !strcmp(precision, "single")) {
+ float res;
+ t0 = get_clock_realtime();
+ CALL_BENCH(op, float, &res);
+ t1 = get_clock_realtime();
+ resd = res;
+ } else if (!strcmp(precision, "double")) {
+ t0 = get_clock_realtime();
+ CALL_BENCH(op, double, &resd);
+ t1 = get_clock_realtime();
+ } else {
+ g_assert_not_reached();
+ }
+ printf("%.2f MFlops\n", (double)n_ops / (t1 - t0) * 1e3);
+ if (resd) {
+ return 0;
+ }
+ return 0;
+}
diff --git a/tests/.gitignore b/tests/.gitignore
index 18e58b2..df69175 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -12,6 +12,7 @@ check-qobject
check-qstring
check-qom-interface
check-qom-proplist
+fp-bench
qht-bench
rcutorture
test-aio
diff --git a/tests/Makefile.include b/tests/Makefile.include
index ef9b88c..f6121ee 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o tests/check-qdict.o \
tests/rcutorture.o tests/test-rcu-list.o \
tests/test-qdist.o tests/test-shift128.o \
tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
- tests/atomic_add-bench.o
+ tests/atomic_add-bench.o tests/fp-bench.o
$(test-obj-y): QEMU_INCLUDES += -Itests
QEMU_CFLAGS += -I$(SRC_PATH)/tests
@@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(tes
tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
+tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
--
2.7.4
Emilio G. Cota <cota@braap.org> writes:
> This will allow us to measure the performance impact of FP
> emulation optimizations.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
> ---
> tests/fp-bench.c | 290 +++++++++++++++++++++++++++++++++++++++++++++++++
> tests/.gitignore | 1 +
> tests/Makefile.include | 3 +-
> 3 files changed, 293 insertions(+), 1 deletion(-)
> create mode 100644 tests/fp-bench.c
>
> diff --git a/tests/fp-bench.c b/tests/fp-bench.c
> new file mode 100644
> index 0000000..a782093
> --- /dev/null
> +++ b/tests/fp-bench.c
> @@ -0,0 +1,290 @@
> +/*
> + * fp-bench.c - A collection of simple floating point microbenchmarks.
> + *
> + * Copyright (C) 2018, Emilio G. Cota <cota@braap.org>
> + *
> + * License: GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +#include "qemu/osdep.h"
> +#include "qemu/atomic.h"
> +
> +#include <math.h>
> +
> +#include <sys/time.h>
> +#include <stdint.h>
> +#include <stdlib.h>
> +#include <unistd.h>
> +#include <stdio.h>
> +#include <time.h>
> +
> +/* amortize the computation of random inputs */
> +#define OPS_PER_ITER (1000ULL)
> +
> +#define SEED_A 0xdeadfacedeadface
> +#define SEED_B 0xbadc0feebadc0fee
> +#define SEED_C 0xbeefdeadbeefdead
> +
> +enum op {
> + OP_ADD,
> + OP_SUB,
> + OP_MUL,
> + OP_DIV,
> + OP_FMA,
> + OP_SQRT,
> +};
> +
> +static const char * const op_names[] = {
> + [OP_ADD] = "add",
> + [OP_SUB] = "sub",
> + [OP_MUL] = "mul",
> + [OP_DIV] = "div",
> + [OP_FMA] = "fma",
> + [OP_SQRT] = "sqrt",
> +};
> +
> +static uint64_t n_ops = 10000000;
> +static enum op op;
> +static const char *precision = "float";
> +
> +static const char commands_string[] =
> + " -n = number of floating point operations\n"
> + " -o = floating point operation (add, sub, mul, div, fma, sqrt). Default: add\n"
> + " -p = precision (float|single, double). Default: float";
> +
> +static void usage_complete(int argc, char *argv[])
> +{
> + fprintf(stderr, "Usage: %s [options]\n", argv[0]);
> + fprintf(stderr, "options:\n%s\n", commands_string);
> + exit(-1);
> +}
> +
> +static void set_op(const char *name)
> +{
> + int i;
> +
> + for (i = 0; i < ARRAY_SIZE(op_names); i++) {
> + if (strcmp(name, op_names[i]) == 0) {
> + op = i;
> + return;
> + }
> + }
> + fprintf(stderr, "Unsupported op '%s'\n", name);
> + exit(EXIT_FAILURE);
> +}
> +
> +static inline int64_t get_clock_realtime(void)
> +{
> + struct timeval tv;
> +
> + gettimeofday(&tv, NULL);
> + return tv.tv_sec * 1000000000LL + (tv.tv_usec * 1000);
> +}
> +
> +/*
> + * From: https://en.wikipedia.org/wiki/Xorshift
> + * This is faster than rand_r(), and gives us a wider range (RAND_MAX is only
> + * guaranteed to be >= INT_MAX).
> + */
> +static uint64_t xorshift64star(uint64_t x)
> +{
> + x ^= x >> 12; /* a */
> + x ^= x << 25; /* b */
> + x ^= x >> 27; /* c */
> + return x * UINT64_C(2685821657736338717);
> +}
> +
> +static inline bool u32_is_normal(uint32_t x)
> +{
> + return ((x + 0x00800000) & 0x7fffffff) >= 0x01000000;
> +}
> +
> +static inline bool u64_is_normal(uint64_t x)
> +{
> + return ((x + (1ULL << 52)) & -1ULL >> 1) >= 1ULL << 53;
> +}
> +
> +static inline float get_random_float(uint64_t *x)
> +{
> + uint64_t r = *x;
> + uint32_t r32;
> +
> + do {
> + r = xorshift64star(r);
> + } while (!u32_is_normal(r));
> + *x = r;
> + r32 = r;
> + return *(float *)&r32;
> +}
> +
> +static inline double get_random_double(uint64_t *x)
> +{
> + uint64_t r = *x;
> +
> + do {
> + r = xorshift64star(r);
> + } while (!u64_is_normal(r));
> + *x = r;
> + return *(double *)&r;
> +}
> +
> +/*
> + * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
> + * volatile.
> + */
> +#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = FUNC(a); \
> + } \
> + } \
> + }
> +
Have you had a chance to check whether this will vectorise? I have a
similar benchmark which I compile with multiple options to test normal,
NEON/AdvSIMD and SVE-enabled loops.
> +GEN_BENCH_1OPF(bench_float_sqrt, sqrtf, float)
> +GEN_BENCH_1OPF(bench_double_sqrt, sqrt, double)
> +#undef GEN_BENCH_1OPF
> +
> +#define GEN_BENCH_2OP(NAME, OP, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t rb = SEED_B; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = a OP b; \
> + } \
> + } \
> + }
> +
> +GEN_BENCH_2OP(bench_float_add, +, float)
> +GEN_BENCH_2OP(bench_float_sub, -, float)
> +GEN_BENCH_2OP(bench_float_mul, *, float)
> +GEN_BENCH_2OP(bench_float_div, /, float)
> +
> +GEN_BENCH_2OP(bench_double_add, +, double)
> +GEN_BENCH_2OP(bench_double_sub, -, double)
> +GEN_BENCH_2OP(bench_double_mul, *, double)
> +GEN_BENCH_2OP(bench_double_div, /, double)
> +
> +#define GEN_BENCH_3OPF(NAME, FUNC, PRECISION) \
> + static void NAME(volatile PRECISION *res) \
> + { \
> + uint64_t ra = SEED_A; \
> + uint64_t rb = SEED_B; \
> + uint64_t rc = SEED_C; \
> + uint64_t i, j; \
> + \
> + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> + volatile PRECISION b = glue(get_random_, PRECISION)(&rb); \
> + volatile PRECISION c = glue(get_random_, PRECISION)(&rc); \
> + \
> + for (j = 0; j < OPS_PER_ITER; j++) { \
> + *res = FUNC(a, b, c); \
> + } \
> + } \
> + }
> +
> +GEN_BENCH_3OPF(bench_float_fma, fmaf, float)
> +GEN_BENCH_3OPF(bench_double_fma, fma, double)
> +#undef GEN_BENCH_3OPF
> +
> +static void parse_args(int argc, char *argv[])
> +{
> + int c;
> +
> + for (;;) {
> + c = getopt(argc, argv, "n:ho:p:");
> + if (c < 0) {
> + break;
> + }
> + switch (c) {
> + case 'h':
> + usage_complete(argc, argv);
> + exit(0);
> + case 'n':
> + n_ops = atoll(optarg);
> + if (n_ops < OPS_PER_ITER) {
> + n_ops = OPS_PER_ITER;
> + }
> + n_ops -= n_ops % OPS_PER_ITER;
> + break;
> + case 'o':
> + set_op(optarg);
> + break;
> + case 'p':
> + precision = optarg;
> + if (strcmp(precision, "float") &&
> + strcmp(precision, "single") &&
> + strcmp(precision, "double")) {
> + fprintf(stderr, "Unsupported precision '%s'\n", precision);
> + exit(EXIT_FAILURE);
Supporting half-precision, if the compiler supports it, would also be useful here.
> + }
> + break;
> + }
> + }
> +}
> +
> +#define CALL_BENCH(OP, PRECISION, RESP) \
> + do { \
> + switch (OP) { \
> + case OP_ADD: \
> + glue(glue(bench_, PRECISION), _add)(RESP); \
> + break; \
> + case OP_SUB: \
> + glue(glue(bench_, PRECISION), _sub)(RESP); \
> + break; \
> + case OP_MUL: \
> + glue(glue(bench_, PRECISION), _mul)(RESP); \
> + break; \
> + case OP_DIV: \
> + glue(glue(bench_, PRECISION), _div)(RESP); \
> + break; \
> + case OP_FMA: \
> + glue(glue(bench_, PRECISION), _fma)(RESP); \
> + break; \
> + case OP_SQRT: \
> + glue(glue(bench_, PRECISION), _sqrt)(RESP); \
> + break; \
> + default: \
> + g_assert_not_reached(); \
> + } \
> + } while (0)
> +
> +int main(int argc, char *argv[])
> +{
> + int64_t t0, t1;
> + double resd;
> +
> + parse_args(argc, argv);
> + if (!strcmp(precision, "float") || !strcmp(precision, "single")) {
> + float res;
> + t0 = get_clock_realtime();
> + CALL_BENCH(op, float, &res);
> + t1 = get_clock_realtime();
> + resd = res;
> + } else if (!strcmp(precision, "double")) {
> + t0 = get_clock_realtime();
> + CALL_BENCH(op, double, &resd);
> + t1 = get_clock_realtime();
> + } else {
> + g_assert_not_reached();
> + }
> + printf("%.2f MFlops\n", (double)n_ops / (t1 - t0) * 1e3);
> + if (resd) {
> + return 0;
> + }
> + return 0;
> +}
> diff --git a/tests/.gitignore b/tests/.gitignore
> index 18e58b2..df69175 100644
> --- a/tests/.gitignore
> +++ b/tests/.gitignore
> @@ -12,6 +12,7 @@ check-qobject
> check-qstring
> check-qom-interface
> check-qom-proplist
> +fp-bench
> qht-bench
> rcutorture
> test-aio
> diff --git a/tests/Makefile.include b/tests/Makefile.include
> index ef9b88c..f6121ee 100644
> --- a/tests/Makefile.include
> +++ b/tests/Makefile.include
> @@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o tests/check-qdict.o \
> tests/rcutorture.o tests/test-rcu-list.o \
> tests/test-qdist.o tests/test-shift128.o \
> tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> - tests/atomic_add-bench.o
> + tests/atomic_add-bench.o tests/fp-bench.o
Not sure why, but "make check" didn't build this; I had to explicitly
run "make tests/fp-bench". That said, this and atomic_add-bench are
explicitly guest-facing tests, so maybe we should move them once
tests/tcg is working again. I'll have another run at that this week.
>
> $(test-obj-y): QEMU_INCLUDES += -Itests
> QEMU_CFLAGS += -I$(SRC_PATH)/tests
> @@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(tes
> tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
> tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
> tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
> +tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
>
> tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
> hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
Anyway for this version:
Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
--
Alex Bennée
On Tue, Mar 27, 2018 at 09:45:14 +0100, Alex Bennée wrote:
> Emilio G. Cota <cota@braap.org> writes:
(snip)
> > +/*
> > + * Disable optimizations (e.g. "a OP b" outside of the inner loop) with
> > + * volatile.
> > + */
> > +#define GEN_BENCH_1OPF(NAME, FUNC, PRECISION) \
> > + static void NAME(volatile PRECISION *res) \
> > + { \
> > + uint64_t ra = SEED_A; \
> > + uint64_t i, j; \
> > + \
> > + for (i = 0; i < n_ops; i += OPS_PER_ITER) { \
> > + volatile PRECISION a = glue(get_random_, PRECISION)(&ra); \
> > + \
> > + for (j = 0; j < OPS_PER_ITER; j++) { \
> > + *res = FUNC(a); \
> > + } \
> > + } \
> > + }
> > +
>
> Have you had a chance to check whether this will vectorise? I have a
> similar benchmark which I compile with multiple options to test normal,
> NEON/AdvSIMD and SVE-enabled loops.
It does not. I'm pretty sure the volatile there prevents the compiler
from doing anything smart. In this case I don't want the compiler
to vectorise though, but I can see how that would be a nice
benchmark to have in addition to the above.
> > + case 'p':
> > + precision = optarg;
> > + if (strcmp(precision, "float") &&
> > + strcmp(precision, "single") &&
> > + strcmp(precision, "double")) {
> > + fprintf(stderr, "Unsupported precision '%s'\n", precision);
> > + exit(EXIT_FAILURE);
>
> Supporting half-precision, if the compiler supports it, would also be useful here.
I wasn't speeding those up, so I didn't bother to test them. But yes, I can see
how that could be useful for arm/aarch64; we can add it later.
> > diff --git a/tests/Makefile.include b/tests/Makefile.include
> > index ef9b88c..f6121ee 100644
> > --- a/tests/Makefile.include
> > +++ b/tests/Makefile.include
> > @@ -587,7 +587,7 @@ test-obj-y = tests/check-qnum.o tests/check-qstring.o tests/check-qdict.o \
> > tests/rcutorture.o tests/test-rcu-list.o \
> > tests/test-qdist.o tests/test-shift128.o \
> > tests/test-qht.o tests/qht-bench.o tests/test-qht-par.o \
> > - tests/atomic_add-bench.o
> > + tests/atomic_add-bench.o tests/fp-bench.o
>
> Not sure why, but "make check" didn't build this; I had to explicitly
> run "make tests/fp-bench". That said, this and atomic_add-bench are
> explicitly guest-facing tests, so maybe we should move them once
> tests/tcg is working again. I'll have another run at that this week.
That was intentional; these are benchmarks rather than tests, so I
wouldn't expect "make check" to build them or run them at all. So that was
expected.
> > $(test-obj-y): QEMU_INCLUDES += -Itests
> > QEMU_CFLAGS += -I$(SRC_PATH)/tests
> > @@ -639,6 +639,7 @@ tests/test-qht-par$(EXESUF): tests/test-qht-par.o tests/qht-bench$(EXESUF) $(tes
> > tests/qht-bench$(EXESUF): tests/qht-bench.o $(test-util-obj-y)
> > tests/test-bufferiszero$(EXESUF): tests/test-bufferiszero.o $(test-util-obj-y)
> > tests/atomic_add-bench$(EXESUF): tests/atomic_add-bench.o $(test-util-obj-y)
> > +tests/fp-bench$(EXESUF): tests/fp-bench.o $(test-util-obj-y)
> >
> > tests/test-qdev-global-props$(EXESUF): tests/test-qdev-global-props.o \
> > hw/core/qdev.o hw/core/qdev-properties.o hw/core/hotplug.o\
>
> Anyway for this version:
>
> Reviewed-by: Alex Bennée <alex.bennee@linaro.org>
Thanks! I'll keep this for v3 (I sent v2 yesterday), since not
much changed.
If I had more time to work on this I'd like to have a -t soft/host flag
like in fp-test. Right now there is no such flag so we default to "host";
IOW, we end up testing the performance of the whole sausage, i.e. guest
compiler + QEMU. This is useful because it represents real-life
scenarios. However, if we tested the functions in fpu/ directly,
we'd get benchmarking that (1) would be more sensitive to the functions
we want to benchmark, and (2) would not depend on the particular
implementation of the QEMU target (e.g. i386 does not emit fma
at all!).
Thanks,
Emilio
© 2016 - 2025 Red Hat, Inc.