1 | v3: https://patchew.org/QEMU/20240206204809.9859-1-amonakov@ispras.ru/ | 1 | Hi, |
---|---|---|---|
2 | v6: https://patchew.org/QEMU/20240424225705.929812-1-richard.henderson@linaro.org/ | ||
3 | 2 | ||
4 | Changes for v7: | 3 | This new version removed the translate_fn() from patch 1 because it |
5 | - Generalize test_buffer_is_zero_next_accel and initialization (phil) | 4 | wasn't removing the sign-extension for pentry as we thought it would. |
5 | A more detailed explanation is given in the commit msg of patch 1. | ||
6 | 6 | ||
7 | We're now retrieving the 'lowaddr' value from load_elf_ram_sym() and | ||
8 | using it when we're running a 32-bit CPU. This worked with 32 bit | ||
9 | 'virt' machine booting with the -kernel option. | ||
7 | 10 | ||
8 | r~ | 11 | If this approach doesn't work for the Xvisor use case, IMO we should |
12 | just filter kernel_load_addr bits directly as we were doing a handful of | ||
13 | versions ago. | ||
9 | 14 | ||
15 | Patches are based on current riscv-to-apply.next. | ||
10 | 16 | ||
11 | Alexander Monakov (5): | 17 | Changes from v9: |
12 | util/bufferiszero: Remove SSE4.1 variant | 18 | - patch 1: |
13 | util/bufferiszero: Remove AVX512 variant | 19 | - removed the translate_fn() callback |
14 | util/bufferiszero: Reorganize for early test for acceleration | 20 | - return 'kernel_low' when running a 32-bit CPU |
15 | util/bufferiszero: Remove useless prefetches | 21 | - v9 link: https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg04509.html |
16 | util/bufferiszero: Optimize SSE2 and AVX2 variants | ||
17 | 22 | ||
18 | Richard Henderson (5): | 23 | Daniel Henrique Barboza (3): |
19 | util/bufferiszero: Improve scalar variant | 24 | hw/riscv: handle 32 bit CPUs kernel_addr in riscv_load_kernel() |
20 | util/bufferiszero: Introduce biz_accel_fn typedef | 25 | hw/riscv/boot.c: consolidate all kernel init in riscv_load_kernel() |
21 | util/bufferiszero: Simplify test_buffer_is_zero_next_accel | 26 | hw/riscv/boot.c: make riscv_load_initrd() static |
22 | util/bufferiszero: Add simd acceleration for aarch64 | ||
23 | tests/bench: Add bufferiszero-bench | ||
24 | 27 | ||
25 | include/qemu/cutils.h | 32 ++- | 28 | hw/riscv/boot.c | 96 +++++++++++++++++++++++--------------- |
26 | tests/bench/bufferiszero-bench.c | 47 ++++ | 29 | hw/riscv/microchip_pfsoc.c | 12 +---- |
27 | util/bufferiszero.c | 465 ++++++++++++++++--------------- | 30 | hw/riscv/opentitan.c | 4 +- |
28 | tests/bench/meson.build | 1 + | 31 | hw/riscv/sifive_e.c | 4 +- |
29 | 4 files changed, 324 insertions(+), 221 deletions(-) | 32 | hw/riscv/sifive_u.c | 12 +---- |
30 | create mode 100644 tests/bench/bufferiszero-bench.c | 33 | hw/riscv/spike.c | 14 ++---- |
34 | hw/riscv/virt.c | 12 +---- | ||
35 | include/hw/riscv/boot.h | 3 +- | ||
36 | 8 files changed, 76 insertions(+), 81 deletions(-) | ||
31 | 37 | ||
32 | -- | 38 | -- |
33 | 2.34.1 | 39 | 2.39.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Alexander Monakov <amonakov@ispras.ru> | ||
2 | 1 | ||
3 | The SSE4.1 variant is virtually identical to the SSE2 variant, except | ||
4 | for using 'PTEST+JNZ' in place of 'PCMPEQB+PMOVMSKB+CMP+JNE' for testing | ||
5 | if an SSE register is all zeroes. The PTEST instruction decodes to two | ||
6 | uops, so it can be handled only by the complex decoder, and since | ||
7 | CMP+JNE are macro-fused, both sequences decode to three uops. The uops | ||
8 | comprising the PTEST instruction dispatch to p0 and p5 on Intel CPUs, so | ||
9 | PCMPEQB+PMOVMSKB is comparatively more flexible from dispatch | ||
10 | standpoint. | ||
11 | |||
12 | Hence, the use of PTEST brings no benefit from throughput standpoint. | ||
13 | Its latency is not important, since it feeds only a conditional jump, | ||
14 | which terminates the dependency chain. | ||
15 | |||
16 | I never observed PTEST variants to be faster on real hardware. | ||
17 | |||
18 | Signed-off-by: Alexander Monakov <amonakov@ispras.ru> | ||
19 | Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> | ||
20 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
21 | Message-Id: <20240206204809.9859-2-amonakov@ispras.ru> | ||
22 | --- | ||
23 | util/bufferiszero.c | 29 ----------------------------- | ||
24 | 1 file changed, 29 deletions(-) | ||
25 | |||
26 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
27 | index XXXXXXX..XXXXXXX 100644 | ||
28 | --- a/util/bufferiszero.c | ||
29 | +++ b/util/bufferiszero.c | ||
30 | @@ -XXX,XX +XXX,XX @@ buffer_zero_sse2(const void *buf, size_t len) | ||
31 | } | ||
32 | |||
33 | #ifdef CONFIG_AVX2_OPT | ||
34 | -static bool __attribute__((target("sse4"))) | ||
35 | -buffer_zero_sse4(const void *buf, size_t len) | ||
36 | -{ | ||
37 | - __m128i t = _mm_loadu_si128(buf); | ||
38 | - __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); | ||
39 | - __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); | ||
40 | - | ||
41 | - /* Loop over 16-byte aligned blocks of 64. */ | ||
42 | - while (likely(p <= e)) { | ||
43 | - __builtin_prefetch(p); | ||
44 | - if (unlikely(!_mm_testz_si128(t, t))) { | ||
45 | - return false; | ||
46 | - } | ||
47 | - t = p[-4] | p[-3] | p[-2] | p[-1]; | ||
48 | - p += 4; | ||
49 | - } | ||
50 | - | ||
51 | - /* Finish the aligned tail. */ | ||
52 | - t |= e[-3]; | ||
53 | - t |= e[-2]; | ||
54 | - t |= e[-1]; | ||
55 | - | ||
56 | - /* Finish the unaligned tail. */ | ||
57 | - t |= _mm_loadu_si128(buf + len - 16); | ||
58 | - | ||
59 | - return _mm_testz_si128(t, t); | ||
60 | -} | ||
61 | - | ||
62 | static bool __attribute__((target("avx2"))) | ||
63 | buffer_zero_avx2(const void *buf, size_t len) | ||
64 | { | ||
65 | @@ -XXX,XX +XXX,XX @@ select_accel_cpuinfo(unsigned info) | ||
66 | #endif | ||
67 | #ifdef CONFIG_AVX2_OPT | ||
68 | { CPUINFO_AVX2, 128, buffer_zero_avx2 }, | ||
69 | - { CPUINFO_SSE4, 64, buffer_zero_sse4 }, | ||
70 | #endif | ||
71 | { CPUINFO_SSE2, 64, buffer_zero_sse2 }, | ||
72 | { CPUINFO_ALWAYS, 0, buffer_zero_int }, | ||
73 | -- | ||
74 | 2.34.1 | diff view generated by jsdifflib |
1 | From: Alexander Monakov <amonakov@ispras.ru> | 1 | load_elf_ram_sym() will sign-extend 32 bit addresses. If a 32 bit QEMU |
---|---|---|---|
2 | guest happens to be running in a hypervisor that are using 64 bits to | ||
3 | encode its address, kernel_entry can be padded with '1's and create | ||
4 | problems [1]. | ||
2 | 5 | ||
3 | Thanks to early checks in the inline buffer_is_zero wrapper, the SIMD | 6 | Using a translate_fn() callback in load_elf_ram_sym() to filter the |
4 | routines are invoked much more rarely in normal use when most buffers | 7 | padding from the address doesn't work. A more detailed explanation can |
5 | are non-zero. This makes use of AVX512 unprofitable, as it incurs extra | 8 | be found in [2]. The short version is that glue(load_elf, SZ), from |
6 | frequency and voltage transition periods during which the CPU operates | 9 | include/hw/elf_ops.h, will calculate 'pentry' (mapped into the |
7 | at reduced performance, as described in | 10 | 'kernel_load_base' var in riscv_load_Kernel()) before using |
8 | https://travisdowns.github.io/blog/2020/01/17/avxfreq1.html | 11 | translate_fn(), and will not recalculate it after executing it. This |
12 | means that the callback does not prevent the padding from | ||
13 | kernel_load_base to appear. | ||
9 | 14 | ||
10 | Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> | 15 | Let's instead use a kernel_low var to capture the 'lowaddr' value from |
11 | Signed-off-by: Alexander Monakov <amonakov@ispras.ru> | 16 | load_elf_ram_sim(), and return it when we're dealing with 32 bit CPUs. |
12 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | 17 | |
13 | Message-Id: <20240206204809.9859-4-amonakov@ispras.ru> | 18 | [1] https://lists.gnu.org/archive/html/qemu-devel/2023-01/msg02281.html |
14 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 19 | [2] https://lists.gnu.org/archive/html/qemu-devel/2023-02/msg00099.html |
20 | |||
21 | Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> | ||
15 | --- | 22 | --- |
16 | util/bufferiszero.c | 38 +++----------------------------------- | 23 | hw/riscv/boot.c | 15 +++++++++++---- |
17 | 1 file changed, 3 insertions(+), 35 deletions(-) | 24 | hw/riscv/microchip_pfsoc.c | 3 ++- |
25 | hw/riscv/opentitan.c | 3 ++- | ||
26 | hw/riscv/sifive_e.c | 3 ++- | ||
27 | hw/riscv/sifive_u.c | 3 ++- | ||
28 | hw/riscv/spike.c | 3 ++- | ||
29 | hw/riscv/virt.c | 3 ++- | ||
30 | include/hw/riscv/boot.h | 1 + | ||
31 | 8 files changed, 24 insertions(+), 10 deletions(-) | ||
18 | 32 | ||
19 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | 33 | diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c |
20 | index XXXXXXX..XXXXXXX 100644 | 34 | index XXXXXXX..XXXXXXX 100644 |
21 | --- a/util/bufferiszero.c | 35 | --- a/hw/riscv/boot.c |
22 | +++ b/util/bufferiszero.c | 36 | +++ b/hw/riscv/boot.c |
23 | @@ -XXX,XX +XXX,XX @@ buffer_zero_int(const void *buf, size_t len) | 37 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_firmware(const char *firmware_filename, |
38 | } | ||
39 | |||
40 | target_ulong riscv_load_kernel(MachineState *machine, | ||
41 | + RISCVHartArrayState *harts, | ||
42 | target_ulong kernel_start_addr, | ||
43 | symbol_fn_t sym_cb) | ||
44 | { | ||
45 | const char *kernel_filename = machine->kernel_filename; | ||
46 | - uint64_t kernel_load_base, kernel_entry; | ||
47 | + uint64_t kernel_load_base, kernel_entry, kernel_low; | ||
48 | |||
49 | g_assert(kernel_filename != NULL); | ||
50 | |||
51 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_kernel(MachineState *machine, | ||
52 | * the (expected) load address load address. This allows kernels to have | ||
53 | * separate SBI and ELF entry points (used by FreeBSD, for example). | ||
54 | */ | ||
55 | - if (load_elf_ram_sym(kernel_filename, NULL, NULL, NULL, | ||
56 | - NULL, &kernel_load_base, NULL, NULL, 0, | ||
57 | + if (load_elf_ram_sym(kernel_filename, NULL, NULL, NULL, NULL, | ||
58 | + &kernel_load_base, &kernel_low, NULL, 0, | ||
59 | EM_RISCV, 1, 0, NULL, true, sym_cb) > 0) { | ||
60 | - return kernel_load_base; | ||
61 | + kernel_entry = kernel_load_base; | ||
62 | + | ||
63 | + if (riscv_is_32bit(harts)) { | ||
64 | + kernel_entry = kernel_low; | ||
65 | + } | ||
66 | + | ||
67 | + return kernel_entry; | ||
68 | } | ||
69 | |||
70 | if (load_uimage_as(kernel_filename, &kernel_entry, NULL, NULL, | ||
71 | diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c | ||
72 | index XXXXXXX..XXXXXXX 100644 | ||
73 | --- a/hw/riscv/microchip_pfsoc.c | ||
74 | +++ b/hw/riscv/microchip_pfsoc.c | ||
75 | @@ -XXX,XX +XXX,XX @@ static void microchip_icicle_kit_machine_init(MachineState *machine) | ||
76 | kernel_start_addr = riscv_calc_kernel_start_addr(&s->soc.u_cpus, | ||
77 | firmware_end_addr); | ||
78 | |||
79 | - kernel_entry = riscv_load_kernel(machine, kernel_start_addr, NULL); | ||
80 | + kernel_entry = riscv_load_kernel(machine, &s->soc.u_cpus, | ||
81 | + kernel_start_addr, NULL); | ||
82 | |||
83 | if (machine->initrd_filename) { | ||
84 | riscv_load_initrd(machine, kernel_entry); | ||
85 | diff --git a/hw/riscv/opentitan.c b/hw/riscv/opentitan.c | ||
86 | index XXXXXXX..XXXXXXX 100644 | ||
87 | --- a/hw/riscv/opentitan.c | ||
88 | +++ b/hw/riscv/opentitan.c | ||
89 | @@ -XXX,XX +XXX,XX @@ static void opentitan_board_init(MachineState *machine) | ||
90 | } | ||
91 | |||
92 | if (machine->kernel_filename) { | ||
93 | - riscv_load_kernel(machine, memmap[IBEX_DEV_RAM].base, NULL); | ||
94 | + riscv_load_kernel(machine, &s->soc.cpus, | ||
95 | + memmap[IBEX_DEV_RAM].base, NULL); | ||
24 | } | 96 | } |
25 | } | 97 | } |
26 | 98 | ||
27 | -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) || defined(__SSE2__) | 99 | diff --git a/hw/riscv/sifive_e.c b/hw/riscv/sifive_e.c |
28 | +#if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) | 100 | index XXXXXXX..XXXXXXX 100644 |
29 | #include <immintrin.h> | 101 | --- a/hw/riscv/sifive_e.c |
30 | 102 | +++ b/hw/riscv/sifive_e.c | |
31 | /* Note that each of these vectorized functions require len >= 64. */ | 103 | @@ -XXX,XX +XXX,XX @@ static void sifive_e_machine_init(MachineState *machine) |
32 | @@ -XXX,XX +XXX,XX @@ buffer_zero_avx2(const void *buf, size_t len) | 104 | memmap[SIFIVE_E_DEV_MROM].base, &address_space_memory); |
105 | |||
106 | if (machine->kernel_filename) { | ||
107 | - riscv_load_kernel(machine, memmap[SIFIVE_E_DEV_DTIM].base, NULL); | ||
108 | + riscv_load_kernel(machine, &s->soc.cpus, | ||
109 | + memmap[SIFIVE_E_DEV_DTIM].base, NULL); | ||
110 | } | ||
33 | } | 111 | } |
34 | #endif /* CONFIG_AVX2_OPT */ | 112 | |
35 | 113 | diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c | |
36 | -#ifdef CONFIG_AVX512F_OPT | 114 | index XXXXXXX..XXXXXXX 100644 |
37 | -static bool __attribute__((target("avx512f"))) | 115 | --- a/hw/riscv/sifive_u.c |
38 | -buffer_zero_avx512(const void *buf, size_t len) | 116 | +++ b/hw/riscv/sifive_u.c |
39 | -{ | 117 | @@ -XXX,XX +XXX,XX @@ static void sifive_u_machine_init(MachineState *machine) |
40 | - /* Begin with an unaligned head of 64 bytes. */ | 118 | kernel_start_addr = riscv_calc_kernel_start_addr(&s->soc.u_cpus, |
41 | - __m512i t = _mm512_loadu_si512(buf); | 119 | firmware_end_addr); |
42 | - __m512i *p = (__m512i *)(((uintptr_t)buf + 5 * 64) & -64); | 120 | |
43 | - __m512i *e = (__m512i *)(((uintptr_t)buf + len) & -64); | 121 | - kernel_entry = riscv_load_kernel(machine, kernel_start_addr, NULL); |
44 | - | 122 | + kernel_entry = riscv_load_kernel(machine, &s->soc.u_cpus, |
45 | - /* Loop over 64-byte aligned blocks of 256. */ | 123 | + kernel_start_addr, NULL); |
46 | - while (p <= e) { | 124 | |
47 | - __builtin_prefetch(p); | 125 | if (machine->initrd_filename) { |
48 | - if (unlikely(_mm512_test_epi64_mask(t, t))) { | 126 | riscv_load_initrd(machine, kernel_entry); |
49 | - return false; | 127 | diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c |
50 | - } | 128 | index XXXXXXX..XXXXXXX 100644 |
51 | - t = p[-4] | p[-3] | p[-2] | p[-1]; | 129 | --- a/hw/riscv/spike.c |
52 | - p += 4; | 130 | +++ b/hw/riscv/spike.c |
53 | - } | 131 | @@ -XXX,XX +XXX,XX @@ static void spike_board_init(MachineState *machine) |
54 | - | 132 | kernel_start_addr = riscv_calc_kernel_start_addr(&s->soc[0], |
55 | - t |= _mm512_loadu_si512(buf + len - 4 * 64); | 133 | firmware_end_addr); |
56 | - t |= _mm512_loadu_si512(buf + len - 3 * 64); | 134 | |
57 | - t |= _mm512_loadu_si512(buf + len - 2 * 64); | 135 | - kernel_entry = riscv_load_kernel(machine, kernel_start_addr, |
58 | - t |= _mm512_loadu_si512(buf + len - 1 * 64); | 136 | + kernel_entry = riscv_load_kernel(machine, &s->soc[0], |
59 | - | 137 | + kernel_start_addr, |
60 | - return !_mm512_test_epi64_mask(t, t); | 138 | htif_symbol_callback); |
61 | - | 139 | |
62 | -} | 140 | if (machine->initrd_filename) { |
63 | -#endif /* CONFIG_AVX512F_OPT */ | 141 | diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c |
64 | - | 142 | index XXXXXXX..XXXXXXX 100644 |
65 | /* | 143 | --- a/hw/riscv/virt.c |
66 | * Make sure that these variables are appropriately initialized when | 144 | +++ b/hw/riscv/virt.c |
67 | * SSE2 is enabled on the compiler command-line, but the compiler is | 145 | @@ -XXX,XX +XXX,XX @@ static void virt_machine_done(Notifier *notifier, void *data) |
68 | * too old to support CONFIG_AVX2_OPT. | 146 | kernel_start_addr = riscv_calc_kernel_start_addr(&s->soc[0], |
69 | */ | 147 | firmware_end_addr); |
70 | -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) | 148 | |
71 | +#if defined(CONFIG_AVX2_OPT) | 149 | - kernel_entry = riscv_load_kernel(machine, kernel_start_addr, NULL); |
72 | # define INIT_USED 0 | 150 | + kernel_entry = riscv_load_kernel(machine, &s->soc[0], |
73 | # define INIT_LENGTH 0 | 151 | + kernel_start_addr, NULL); |
74 | # define INIT_ACCEL buffer_zero_int | 152 | |
75 | @@ -XXX,XX +XXX,XX @@ select_accel_cpuinfo(unsigned info) | 153 | if (machine->initrd_filename) { |
76 | unsigned len; | 154 | riscv_load_initrd(machine, kernel_entry); |
77 | bool (*fn)(const void *, size_t); | 155 | diff --git a/include/hw/riscv/boot.h b/include/hw/riscv/boot.h |
78 | } all[] = { | 156 | index XXXXXXX..XXXXXXX 100644 |
79 | -#ifdef CONFIG_AVX512F_OPT | 157 | --- a/include/hw/riscv/boot.h |
80 | - { CPUINFO_AVX512F, 256, buffer_zero_avx512 }, | 158 | +++ b/include/hw/riscv/boot.h |
81 | -#endif | 159 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_firmware(const char *firmware_filename, |
82 | #ifdef CONFIG_AVX2_OPT | 160 | hwaddr firmware_load_addr, |
83 | { CPUINFO_AVX2, 128, buffer_zero_avx2 }, | 161 | symbol_fn_t sym_cb); |
84 | #endif | 162 | target_ulong riscv_load_kernel(MachineState *machine, |
85 | @@ -XXX,XX +XXX,XX @@ select_accel_cpuinfo(unsigned info) | 163 | + RISCVHartArrayState *harts, |
86 | return 0; | 164 | target_ulong firmware_end_addr, |
87 | } | 165 | symbol_fn_t sym_cb); |
88 | 166 | void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry); | |
89 | -#if defined(CONFIG_AVX512F_OPT) || defined(CONFIG_AVX2_OPT) | ||
90 | +#if defined(CONFIG_AVX2_OPT) | ||
91 | static void __attribute__((constructor)) init_accel(void) | ||
92 | { | ||
93 | used_accel = select_accel_cpuinfo(cpuinfo_init()); | ||
94 | -- | 167 | -- |
95 | 2.34.1 | 168 | 2.39.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Alexander Monakov <amonakov@ispras.ru> | ||
2 | 1 | ||
3 | Test for length >= 256 inline, where is is often a constant. | ||
4 | Before calling into the accelerated routine, sample three bytes | ||
5 | from the buffer, which handles most non-zero buffers. | ||
6 | |||
7 | Signed-off-by: Alexander Monakov <amonakov@ispras.ru> | ||
8 | Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> | ||
9 | Message-Id: <20240206204809.9859-3-amonakov@ispras.ru> | ||
10 | [rth: Use __builtin_constant_p; move the indirect call out of line.] | ||
11 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
12 | --- | ||
13 | include/qemu/cutils.h | 32 ++++++++++++++++- | ||
14 | util/bufferiszero.c | 84 +++++++++++++++++-------------------------- | ||
15 | 2 files changed, 63 insertions(+), 53 deletions(-) | ||
16 | |||
17 | diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h | ||
18 | index XXXXXXX..XXXXXXX 100644 | ||
19 | --- a/include/qemu/cutils.h | ||
20 | +++ b/include/qemu/cutils.h | ||
21 | @@ -XXX,XX +XXX,XX @@ char *freq_to_str(uint64_t freq_hz); | ||
22 | /* used to print char* safely */ | ||
23 | #define STR_OR_NULL(str) ((str) ? (str) : "null") | ||
24 | |||
25 | -bool buffer_is_zero(const void *buf, size_t len); | ||
26 | +/* | ||
27 | + * Check if a buffer is all zeroes. | ||
28 | + */ | ||
29 | + | ||
30 | +bool buffer_is_zero_ool(const void *vbuf, size_t len); | ||
31 | +bool buffer_is_zero_ge256(const void *vbuf, size_t len); | ||
32 | bool test_buffer_is_zero_next_accel(void); | ||
33 | |||
34 | +static inline bool buffer_is_zero_sample3(const char *buf, size_t len) | ||
35 | +{ | ||
36 | + /* | ||
37 | + * For any reasonably sized buffer, these three samples come from | ||
38 | + * three different cachelines. In qemu-img usage, we find that | ||
39 | + * each byte eliminates more than half of all buffer testing. | ||
40 | + * It is therefore critical to performance that the byte tests | ||
41 | + * short-circuit, so that we do not pull in additional cache lines. | ||
42 | + * Do not "optimize" this to !(a | b | c). | ||
43 | + */ | ||
44 | + return !buf[0] && !buf[len - 1] && !buf[len / 2]; | ||
45 | +} | ||
46 | + | ||
47 | +#ifdef __OPTIMIZE__ | ||
48 | +static inline bool buffer_is_zero(const void *buf, size_t len) | ||
49 | +{ | ||
50 | + return (__builtin_constant_p(len) && len >= 256 | ||
51 | + ? buffer_is_zero_sample3(buf, len) && | ||
52 | + buffer_is_zero_ge256(buf, len) | ||
53 | + : buffer_is_zero_ool(buf, len)); | ||
54 | +} | ||
55 | +#else | ||
56 | +#define buffer_is_zero buffer_is_zero_ool | ||
57 | +#endif | ||
58 | + | ||
59 | /* | ||
60 | * Implementation of ULEB128 (http://en.wikipedia.org/wiki/LEB128) | ||
61 | * Input is limited to 14-bit numbers | ||
62 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
63 | index XXXXXXX..XXXXXXX 100644 | ||
64 | --- a/util/bufferiszero.c | ||
65 | +++ b/util/bufferiszero.c | ||
66 | @@ -XXX,XX +XXX,XX @@ | ||
67 | #include "qemu/bswap.h" | ||
68 | #include "host/cpuinfo.h" | ||
69 | |||
70 | -static bool | ||
71 | -buffer_zero_int(const void *buf, size_t len) | ||
72 | +static bool (*buffer_is_zero_accel)(const void *, size_t); | ||
73 | + | ||
74 | +static bool buffer_is_zero_integer(const void *buf, size_t len) | ||
75 | { | ||
76 | if (unlikely(len < 8)) { | ||
77 | /* For a very small buffer, simply accumulate all the bytes. */ | ||
78 | @@ -XXX,XX +XXX,XX @@ buffer_zero_avx2(const void *buf, size_t len) | ||
79 | } | ||
80 | #endif /* CONFIG_AVX2_OPT */ | ||
81 | |||
82 | -/* | ||
83 | - * Make sure that these variables are appropriately initialized when | ||
84 | - * SSE2 is enabled on the compiler command-line, but the compiler is | ||
85 | - * too old to support CONFIG_AVX2_OPT. | ||
86 | - */ | ||
87 | -#if defined(CONFIG_AVX2_OPT) | ||
88 | -# define INIT_USED 0 | ||
89 | -# define INIT_LENGTH 0 | ||
90 | -# define INIT_ACCEL buffer_zero_int | ||
91 | -#else | ||
92 | -# ifndef __SSE2__ | ||
93 | -# error "ISA selection confusion" | ||
94 | -# endif | ||
95 | -# define INIT_USED CPUINFO_SSE2 | ||
96 | -# define INIT_LENGTH 64 | ||
97 | -# define INIT_ACCEL buffer_zero_sse2 | ||
98 | -#endif | ||
99 | - | ||
100 | -static unsigned used_accel = INIT_USED; | ||
101 | -static unsigned length_to_accel = INIT_LENGTH; | ||
102 | -static bool (*buffer_accel)(const void *, size_t) = INIT_ACCEL; | ||
103 | - | ||
104 | static unsigned __attribute__((noinline)) | ||
105 | select_accel_cpuinfo(unsigned info) | ||
106 | { | ||
107 | /* Array is sorted in order of algorithm preference. */ | ||
108 | static const struct { | ||
109 | unsigned bit; | ||
110 | - unsigned len; | ||
111 | bool (*fn)(const void *, size_t); | ||
112 | } all[] = { | ||
113 | #ifdef CONFIG_AVX2_OPT | ||
114 | - { CPUINFO_AVX2, 128, buffer_zero_avx2 }, | ||
115 | + { CPUINFO_AVX2, buffer_zero_avx2 }, | ||
116 | #endif | ||
117 | - { CPUINFO_SSE2, 64, buffer_zero_sse2 }, | ||
118 | - { CPUINFO_ALWAYS, 0, buffer_zero_int }, | ||
119 | + { CPUINFO_SSE2, buffer_zero_sse2 }, | ||
120 | + { CPUINFO_ALWAYS, buffer_is_zero_integer }, | ||
121 | }; | ||
122 | |||
123 | for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { | ||
124 | if (info & all[i].bit) { | ||
125 | - length_to_accel = all[i].len; | ||
126 | - buffer_accel = all[i].fn; | ||
127 | + buffer_is_zero_accel = all[i].fn; | ||
128 | return all[i].bit; | ||
129 | } | ||
130 | } | ||
131 | return 0; | ||
132 | } | ||
133 | |||
134 | -#if defined(CONFIG_AVX2_OPT) | ||
135 | +static unsigned used_accel; | ||
136 | + | ||
137 | static void __attribute__((constructor)) init_accel(void) | ||
138 | { | ||
139 | used_accel = select_accel_cpuinfo(cpuinfo_init()); | ||
140 | } | ||
141 | -#endif /* CONFIG_AVX2_OPT */ | ||
142 | + | ||
143 | +#define INIT_ACCEL NULL | ||
144 | |||
145 | bool test_buffer_is_zero_next_accel(void) | ||
146 | { | ||
147 | @@ -XXX,XX +XXX,XX @@ bool test_buffer_is_zero_next_accel(void) | ||
148 | used_accel |= used; | ||
149 | return used; | ||
150 | } | ||
151 | - | ||
152 | -static bool select_accel_fn(const void *buf, size_t len) | ||
153 | -{ | ||
154 | - if (likely(len >= length_to_accel)) { | ||
155 | - return buffer_accel(buf, len); | ||
156 | - } | ||
157 | - return buffer_zero_int(buf, len); | ||
158 | -} | ||
159 | - | ||
160 | #else | ||
161 | -#define select_accel_fn buffer_zero_int | ||
162 | bool test_buffer_is_zero_next_accel(void) | ||
163 | { | ||
164 | return false; | ||
165 | } | ||
166 | + | ||
167 | +#define INIT_ACCEL buffer_is_zero_integer | ||
168 | #endif | ||
169 | |||
170 | -/* | ||
171 | - * Checks if a buffer is all zeroes | ||
172 | - */ | ||
173 | -bool buffer_is_zero(const void *buf, size_t len) | ||
174 | +static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL; | ||
175 | + | ||
176 | +bool buffer_is_zero_ool(const void *buf, size_t len) | ||
177 | { | ||
178 | if (unlikely(len == 0)) { | ||
179 | return true; | ||
180 | } | ||
181 | + if (!buffer_is_zero_sample3(buf, len)) { | ||
182 | + return false; | ||
183 | + } | ||
184 | + /* All bytes are covered for any len <= 3. */ | ||
185 | + if (unlikely(len <= 3)) { | ||
186 | + return true; | ||
187 | + } | ||
188 | |||
189 | - /* Fetch the beginning of the buffer while we select the accelerator. */ | ||
190 | - __builtin_prefetch(buf); | ||
191 | - | ||
192 | - /* Use an optimized zero check if possible. Note that this also | ||
193 | - includes a check for an unrolled loop over 64-bit integers. */ | ||
194 | - return select_accel_fn(buf, len); | ||
195 | + if (likely(len >= 256)) { | ||
196 | + return buffer_is_zero_accel(buf, len); | ||
197 | + } | ||
198 | + return buffer_is_zero_integer(buf, len); | ||
199 | +} | ||
200 | + | ||
201 | +bool buffer_is_zero_ge256(const void *buf, size_t len) | ||
202 | +{ | ||
203 | + return buffer_is_zero_accel(buf, len); | ||
204 | } | ||
205 | -- | ||
206 | 2.34.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Alexander Monakov <amonakov@ispras.ru> | ||
2 | 1 | ||
3 | Use of prefetching in bufferiszero.c is quite questionable: | ||
4 | |||
5 | - prefetches are issued just a few CPU cycles before the corresponding | ||
6 | line would be hit by demand loads; | ||
7 | |||
8 | - they are done for simple access patterns, i.e. where hardware | ||
9 | prefetchers can perform better; | ||
10 | |||
11 | - they compete for load ports in loops that should be limited by load | ||
12 | port throughput rather than ALU throughput. | ||
13 | |||
14 | Signed-off-by: Alexander Monakov <amonakov@ispras.ru> | ||
15 | Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> | ||
16 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
17 | Message-Id: <20240206204809.9859-5-amonakov@ispras.ru> | ||
18 | --- | ||
19 | util/bufferiszero.c | 3 --- | ||
20 | 1 file changed, 3 deletions(-) | ||
21 | |||
22 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
23 | index XXXXXXX..XXXXXXX 100644 | ||
24 | --- a/util/bufferiszero.c | ||
25 | +++ b/util/bufferiszero.c | ||
26 | @@ -XXX,XX +XXX,XX @@ static bool buffer_is_zero_integer(const void *buf, size_t len) | ||
27 | const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); | ||
28 | |||
29 | for (; p + 8 <= e; p += 8) { | ||
30 | - __builtin_prefetch(p + 8); | ||
31 | if (t) { | ||
32 | return false; | ||
33 | } | ||
34 | @@ -XXX,XX +XXX,XX @@ buffer_zero_sse2(const void *buf, size_t len) | ||
35 | |||
36 | /* Loop over 16-byte aligned blocks of 64. */ | ||
37 | while (likely(p <= e)) { | ||
38 | - __builtin_prefetch(p); | ||
39 | t = _mm_cmpeq_epi8(t, zero); | ||
40 | if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { | ||
41 | return false; | ||
42 | @@ -XXX,XX +XXX,XX @@ buffer_zero_avx2(const void *buf, size_t len) | ||
43 | |||
44 | /* Loop over 32-byte aligned blocks of 128. */ | ||
45 | while (p <= e) { | ||
46 | - __builtin_prefetch(p); | ||
47 | if (unlikely(!_mm256_testz_si256(t, t))) { | ||
48 | return false; | ||
49 | } | ||
50 | -- | ||
51 | 2.34.1 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | From: Alexander Monakov <amonakov@ispras.ru> | ||
2 | 1 | ||
3 | Increase unroll factor in SIMD loops from 4x to 8x in order to move | ||
4 | their bottlenecks from ALU port contention to load issue rate (two loads | ||
5 | per cycle on popular x86 implementations). | ||
6 | |||
7 | Avoid using out-of-bounds pointers in loop boundary conditions. | ||
8 | |||
9 | Follow SSE2 implementation strategy in the AVX2 variant. Avoid use of | ||
10 | PTEST, which is not profitable there (like in the removed SSE4 variant). | ||
11 | |||
12 | Signed-off-by: Alexander Monakov <amonakov@ispras.ru> | ||
13 | Signed-off-by: Mikhail Romanov <mmromanov@ispras.ru> | ||
14 | Reviewed-by: Richard Henderson <richard.henderson@linaro.org> | ||
15 | Message-Id: <20240206204809.9859-6-amonakov@ispras.ru> | ||
16 | --- | ||
17 | util/bufferiszero.c | 111 +++++++++++++++++++++++++++++--------------- | ||
18 | 1 file changed, 73 insertions(+), 38 deletions(-) | ||
19 | |||
20 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
21 | index XXXXXXX..XXXXXXX 100644 | ||
22 | --- a/util/bufferiszero.c | ||
23 | +++ b/util/bufferiszero.c | ||
24 | @@ -XXX,XX +XXX,XX @@ static bool buffer_is_zero_integer(const void *buf, size_t len) | ||
25 | #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) | ||
26 | #include <immintrin.h> | ||
27 | |||
28 | -/* Note that each of these vectorized functions require len >= 64. */ | ||
29 | +/* Helper for preventing the compiler from reassociating | ||
30 | + chains of binary vector operations. */ | ||
31 | +#define SSE_REASSOC_BARRIER(vec0, vec1) asm("" : "+x"(vec0), "+x"(vec1)) | ||
32 | + | ||
33 | +/* Note that these vectorized functions may assume len >= 256. */ | ||
34 | |||
35 | static bool __attribute__((target("sse2"))) | ||
36 | buffer_zero_sse2(const void *buf, size_t len) | ||
37 | { | ||
38 | - __m128i t = _mm_loadu_si128(buf); | ||
39 | - __m128i *p = (__m128i *)(((uintptr_t)buf + 5 * 16) & -16); | ||
40 | - __m128i *e = (__m128i *)(((uintptr_t)buf + len) & -16); | ||
41 | - __m128i zero = _mm_setzero_si128(); | ||
42 | + /* Unaligned loads at head/tail. */ | ||
43 | + __m128i v = *(__m128i_u *)(buf); | ||
44 | + __m128i w = *(__m128i_u *)(buf + len - 16); | ||
45 | + /* Align head/tail to 16-byte boundaries. */ | ||
46 | + const __m128i *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16); | ||
47 | + const __m128i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16); | ||
48 | + __m128i zero = { 0 }; | ||
49 | |||
50 | - /* Loop over 16-byte aligned blocks of 64. */ | ||
51 | - while (likely(p <= e)) { | ||
52 | - t = _mm_cmpeq_epi8(t, zero); | ||
53 | - if (unlikely(_mm_movemask_epi8(t) != 0xFFFF)) { | ||
54 | + /* Collect a partial block at tail end. */ | ||
55 | + v |= e[-1]; w |= e[-2]; | ||
56 | + SSE_REASSOC_BARRIER(v, w); | ||
57 | + v |= e[-3]; w |= e[-4]; | ||
58 | + SSE_REASSOC_BARRIER(v, w); | ||
59 | + v |= e[-5]; w |= e[-6]; | ||
60 | + SSE_REASSOC_BARRIER(v, w); | ||
61 | + v |= e[-7]; v |= w; | ||
62 | + | ||
63 | + /* | ||
64 | + * Loop over complete 128-byte blocks. | ||
65 | + * With the head and tail removed, e - p >= 14, so the loop | ||
66 | + * must iterate at least once. | ||
67 | + */ | ||
68 | + do { | ||
69 | + v = _mm_cmpeq_epi8(v, zero); | ||
70 | + if (unlikely(_mm_movemask_epi8(v) != 0xFFFF)) { | ||
71 | return false; | ||
72 | } | ||
73 | - t = p[-4] | p[-3] | p[-2] | p[-1]; | ||
74 | - p += 4; | ||
75 | - } | ||
76 | + v = p[0]; w = p[1]; | ||
77 | + SSE_REASSOC_BARRIER(v, w); | ||
78 | + v |= p[2]; w |= p[3]; | ||
79 | + SSE_REASSOC_BARRIER(v, w); | ||
80 | + v |= p[4]; w |= p[5]; | ||
81 | + SSE_REASSOC_BARRIER(v, w); | ||
82 | + v |= p[6]; w |= p[7]; | ||
83 | + SSE_REASSOC_BARRIER(v, w); | ||
84 | + v |= w; | ||
85 | + p += 8; | ||
86 | + } while (p < e - 7); | ||
87 | |||
88 | - /* Finish the aligned tail. */ | ||
89 | - t |= e[-3]; | ||
90 | - t |= e[-2]; | ||
91 | - t |= e[-1]; | ||
92 | - | ||
93 | - /* Finish the unaligned tail. */ | ||
94 | - t |= _mm_loadu_si128(buf + len - 16); | ||
95 | - | ||
96 | - return _mm_movemask_epi8(_mm_cmpeq_epi8(t, zero)) == 0xFFFF; | ||
97 | + return _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero)) == 0xFFFF; | ||
98 | } | ||
99 | |||
100 | #ifdef CONFIG_AVX2_OPT | ||
101 | static bool __attribute__((target("avx2"))) | ||
102 | buffer_zero_avx2(const void *buf, size_t len) | ||
103 | { | ||
104 | - /* Begin with an unaligned head of 32 bytes. */ | ||
105 | - __m256i t = _mm256_loadu_si256(buf); | ||
106 | - __m256i *p = (__m256i *)(((uintptr_t)buf + 5 * 32) & -32); | ||
107 | - __m256i *e = (__m256i *)(((uintptr_t)buf + len) & -32); | ||
108 | + /* Unaligned loads at head/tail. */ | ||
109 | + __m256i v = *(__m256i_u *)(buf); | ||
110 | + __m256i w = *(__m256i_u *)(buf + len - 32); | ||
111 | + /* Align head/tail to 32-byte boundaries. */ | ||
112 | + const __m256i *p = QEMU_ALIGN_PTR_DOWN(buf + 32, 32); | ||
113 | + const __m256i *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 32); | ||
114 | + __m256i zero = { 0 }; | ||
115 | |||
116 | - /* Loop over 32-byte aligned blocks of 128. */ | ||
117 | - while (p <= e) { | ||
118 | - if (unlikely(!_mm256_testz_si256(t, t))) { | ||
119 | + /* Collect a partial block at tail end. */ | ||
120 | + v |= e[-1]; w |= e[-2]; | ||
121 | + SSE_REASSOC_BARRIER(v, w); | ||
122 | + v |= e[-3]; w |= e[-4]; | ||
123 | + SSE_REASSOC_BARRIER(v, w); | ||
124 | + v |= e[-5]; w |= e[-6]; | ||
125 | + SSE_REASSOC_BARRIER(v, w); | ||
126 | + v |= e[-7]; v |= w; | ||
127 | + | ||
128 | + /* Loop over complete 256-byte blocks. */ | ||
129 | + for (; p < e - 7; p += 8) { | ||
130 | + /* PTEST is not profitable here. */ | ||
131 | + v = _mm256_cmpeq_epi8(v, zero); | ||
132 | + if (unlikely(_mm256_movemask_epi8(v) != 0xFFFFFFFF)) { | ||
133 | return false; | ||
134 | } | ||
135 | - t = p[-4] | p[-3] | p[-2] | p[-1]; | ||
136 | - p += 4; | ||
137 | - } ; | ||
138 | + v = p[0]; w = p[1]; | ||
139 | + SSE_REASSOC_BARRIER(v, w); | ||
140 | + v |= p[2]; w |= p[3]; | ||
141 | + SSE_REASSOC_BARRIER(v, w); | ||
142 | + v |= p[4]; w |= p[5]; | ||
143 | + SSE_REASSOC_BARRIER(v, w); | ||
144 | + v |= p[6]; w |= p[7]; | ||
145 | + SSE_REASSOC_BARRIER(v, w); | ||
146 | + v |= w; | ||
147 | + } | ||
148 | |||
149 | - /* Finish the last block of 128 unaligned. */ | ||
150 | - t |= _mm256_loadu_si256(buf + len - 4 * 32); | ||
151 | - t |= _mm256_loadu_si256(buf + len - 3 * 32); | ||
152 | - t |= _mm256_loadu_si256(buf + len - 2 * 32); | ||
153 | - t |= _mm256_loadu_si256(buf + len - 1 * 32); | ||
154 | - | ||
155 | - return _mm256_testz_si256(t, t); | ||
156 | + return _mm256_movemask_epi8(_mm256_cmpeq_epi8(v, zero)) == 0xFFFFFFFF; | ||
157 | } | ||
158 | #endif /* CONFIG_AVX2_OPT */ | ||
159 | |||
160 | -- | ||
161 | 2.34.1 | diff view generated by jsdifflib |
1 | Split less-than and greater-than 256 cases. | 1 | The microchip_icicle_kit, sifive_u, spike and virt boards are now doing |
---|---|---|---|
2 | Use unaligned accesses for head and tail. | 2 | the same steps when '-kernel' is used: |
3 | Avoid using out-of-bounds pointers in loop boundary conditions. | 3 | |
4 | 4 | - execute load_kernel() | |
5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | 5 | - load init_rd() |
6 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | 6 | - write kernel_cmdline |
7 | |||
8 | Let's fold everything inside riscv_load_kernel() to avoid code | ||
9 | repetition. To not change the behavior of boards that aren't calling | ||
10 | riscv_load_init(), add an 'load_initrd' flag to riscv_load_kernel() and | ||
11 | allow these boards to opt out from initrd loading. | ||
12 | |||
13 | Cc: Palmer Dabbelt <palmer@dabbelt.com> | ||
14 | Reviewed-by: Bin Meng <bmeng@tinylab.org> | ||
15 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | ||
16 | Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> | ||
7 | --- | 17 | --- |
8 | util/bufferiszero.c | 85 +++++++++++++++++++++++++++------------------ | 18 | hw/riscv/boot.c | 21 ++++++++++++++++++--- |
9 | 1 file changed, 51 insertions(+), 34 deletions(-) | 19 | hw/riscv/microchip_pfsoc.c | 11 +---------- |
10 | 20 | hw/riscv/opentitan.c | 3 ++- | |
11 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | 21 | hw/riscv/sifive_e.c | 3 ++- |
12 | index XXXXXXX..XXXXXXX 100644 | 22 | hw/riscv/sifive_u.c | 11 +---------- |
13 | --- a/util/bufferiszero.c | 23 | hw/riscv/spike.c | 11 +---------- |
14 | +++ b/util/bufferiszero.c | 24 | hw/riscv/virt.c | 11 +---------- |
15 | @@ -XXX,XX +XXX,XX @@ | 25 | include/hw/riscv/boot.h | 1 + |
16 | 26 | 8 files changed, 27 insertions(+), 45 deletions(-) | |
17 | static bool (*buffer_is_zero_accel)(const void *, size_t); | 27 | |
18 | 28 | diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c | |
19 | -static bool buffer_is_zero_integer(const void *buf, size_t len) | 29 | index XXXXXXX..XXXXXXX 100644 |
20 | +static bool buffer_is_zero_int_lt256(const void *buf, size_t len) | 30 | --- a/hw/riscv/boot.c |
31 | +++ b/hw/riscv/boot.c | ||
32 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_firmware(const char *firmware_filename, | ||
33 | target_ulong riscv_load_kernel(MachineState *machine, | ||
34 | RISCVHartArrayState *harts, | ||
35 | target_ulong kernel_start_addr, | ||
36 | + bool load_initrd, | ||
37 | symbol_fn_t sym_cb) | ||
21 | { | 38 | { |
22 | - if (unlikely(len < 8)) { | 39 | const char *kernel_filename = machine->kernel_filename; |
23 | - /* For a very small buffer, simply accumulate all the bytes. */ | 40 | uint64_t kernel_load_base, kernel_entry, kernel_low; |
24 | - const unsigned char *p = buf; | 41 | + void *fdt = machine->fdt; |
25 | - const unsigned char *e = buf + len; | 42 | |
26 | - unsigned char t = 0; | 43 | g_assert(kernel_filename != NULL); |
27 | + uint64_t t; | 44 | |
28 | + const uint64_t *p, *e; | 45 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_kernel(MachineState *machine, |
29 | 46 | kernel_entry = kernel_low; | |
30 | - do { | 47 | } |
31 | - t |= *p++; | 48 | |
32 | - } while (p < e); | 49 | - return kernel_entry; |
33 | - | 50 | + goto out; |
34 | - return t == 0; | 51 | } |
35 | - } else { | 52 | |
36 | - /* Otherwise, use the unaligned memory access functions to | 53 | if (load_uimage_as(kernel_filename, &kernel_entry, NULL, NULL, |
37 | - handle the beginning and end of the buffer, with a couple | 54 | NULL, NULL, NULL) > 0) { |
38 | - of loops handling the middle aligned section. */ | 55 | - return kernel_entry; |
39 | - uint64_t t = ldq_he_p(buf); | 56 | + goto out; |
40 | - const uint64_t *p = (uint64_t *)(((uintptr_t)buf + 8) & -8); | 57 | } |
41 | - const uint64_t *e = (uint64_t *)(((uintptr_t)buf + len) & -8); | 58 | |
42 | - | 59 | if (load_image_targphys_as(kernel_filename, kernel_start_addr, |
43 | - for (; p + 8 <= e; p += 8) { | 60 | current_machine->ram_size, NULL) > 0) { |
44 | - if (t) { | 61 | - return kernel_start_addr; |
45 | - return false; | 62 | + kernel_entry = kernel_start_addr; |
46 | - } | 63 | + goto out; |
47 | - t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; | 64 | } |
48 | - } | 65 | |
49 | - while (p < e) { | 66 | error_report("could not load kernel '%s'", kernel_filename); |
50 | - t |= *p++; | 67 | exit(1); |
51 | - } | ||
52 | - t |= ldq_he_p(buf + len - 8); | ||
53 | - | ||
54 | - return t == 0; | ||
55 | + /* | ||
56 | + * Use unaligned memory access functions to handle | ||
57 | + * the beginning and end of the buffer. | ||
58 | + */ | ||
59 | + if (unlikely(len <= 8)) { | ||
60 | + return (ldl_he_p(buf) | ldl_he_p(buf + len - 4)) == 0; | ||
61 | } | ||
62 | + | 68 | + |
63 | + t = ldq_he_p(buf) | ldq_he_p(buf + len - 8); | 69 | +out: |
64 | + p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8); | 70 | + if (load_initrd && machine->initrd_filename) { |
65 | + e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8); | 71 | + riscv_load_initrd(machine, kernel_entry); |
72 | + } | ||
66 | + | 73 | + |
67 | + /* Read 0 to 31 aligned words from the middle. */ | 74 | + if (fdt && machine->kernel_cmdline && *machine->kernel_cmdline) { |
68 | + while (p < e) { | 75 | + qemu_fdt_setprop_string(fdt, "/chosen", "bootargs", |
69 | + t |= *p++; | 76 | + machine->kernel_cmdline); |
70 | + } | 77 | + } |
71 | + return t == 0; | ||
72 | +} | ||
73 | + | 78 | + |
74 | +static bool buffer_is_zero_int_ge256(const void *buf, size_t len) | 79 | + return kernel_entry; |
75 | +{ | ||
76 | + /* | ||
77 | + * Use unaligned memory access functions to handle | ||
78 | + * the beginning and end of the buffer. | ||
79 | + */ | ||
80 | + uint64_t t = ldq_he_p(buf) | ldq_he_p(buf + len - 8); | ||
81 | + const uint64_t *p = QEMU_ALIGN_PTR_DOWN(buf + 8, 8); | ||
82 | + const uint64_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 8); | ||
83 | + | ||
84 | + /* Collect a partial block at the tail end. */ | ||
85 | + t |= e[-7] | e[-6] | e[-5] | e[-4] | e[-3] | e[-2] | e[-1]; | ||
86 | + | ||
87 | + /* | ||
88 | + * Loop over 64 byte blocks. | ||
89 | + * With the head and tail removed, e - p >= 30, | ||
90 | + * so the loop must iterate at least 3 times. | ||
91 | + */ | ||
92 | + do { | ||
93 | + if (t) { | ||
94 | + return false; | ||
95 | + } | ||
96 | + t = p[0] | p[1] | p[2] | p[3] | p[4] | p[5] | p[6] | p[7]; | ||
97 | + p += 8; | ||
98 | + } while (p < e - 7); | ||
99 | + | ||
100 | + return t == 0; | ||
101 | } | 80 | } |
102 | 81 | ||
103 | #if defined(CONFIG_AVX2_OPT) || defined(__SSE2__) | 82 | void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry) |
104 | @@ -XXX,XX +XXX,XX @@ select_accel_cpuinfo(unsigned info) | 83 | diff --git a/hw/riscv/microchip_pfsoc.c b/hw/riscv/microchip_pfsoc.c |
105 | { CPUINFO_AVX2, buffer_zero_avx2 }, | 84 | index XXXXXXX..XXXXXXX 100644 |
106 | #endif | 85 | --- a/hw/riscv/microchip_pfsoc.c |
107 | { CPUINFO_SSE2, buffer_zero_sse2 }, | 86 | +++ b/hw/riscv/microchip_pfsoc.c |
108 | - { CPUINFO_ALWAYS, buffer_is_zero_integer }, | 87 | @@ -XXX,XX +XXX,XX @@ static void microchip_icicle_kit_machine_init(MachineState *machine) |
109 | + { CPUINFO_ALWAYS, buffer_is_zero_int_ge256 }, | 88 | firmware_end_addr); |
110 | }; | 89 | |
111 | 90 | kernel_entry = riscv_load_kernel(machine, &s->soc.u_cpus, | |
112 | for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { | 91 | - kernel_start_addr, NULL); |
113 | @@ -XXX,XX +XXX,XX @@ bool test_buffer_is_zero_next_accel(void) | 92 | - |
114 | return false; | 93 | - if (machine->initrd_filename) { |
94 | - riscv_load_initrd(machine, kernel_entry); | ||
95 | - } | ||
96 | - | ||
97 | - if (machine->kernel_cmdline && *machine->kernel_cmdline) { | ||
98 | - qemu_fdt_setprop_string(machine->fdt, "/chosen", | ||
99 | - "bootargs", machine->kernel_cmdline); | ||
100 | - } | ||
101 | + kernel_start_addr, true, NULL); | ||
102 | |||
103 | /* Compute the fdt load address in dram */ | ||
104 | fdt_load_addr = riscv_compute_fdt_addr(memmap[MICROCHIP_PFSOC_DRAM_LO].base, | ||
105 | diff --git a/hw/riscv/opentitan.c b/hw/riscv/opentitan.c | ||
106 | index XXXXXXX..XXXXXXX 100644 | ||
107 | --- a/hw/riscv/opentitan.c | ||
108 | +++ b/hw/riscv/opentitan.c | ||
109 | @@ -XXX,XX +XXX,XX @@ static void opentitan_board_init(MachineState *machine) | ||
110 | |||
111 | if (machine->kernel_filename) { | ||
112 | riscv_load_kernel(machine, &s->soc.cpus, | ||
113 | - memmap[IBEX_DEV_RAM].base, NULL); | ||
114 | + memmap[IBEX_DEV_RAM].base, | ||
115 | + false, NULL); | ||
116 | } | ||
115 | } | 117 | } |
116 | 118 | ||
117 | -#define INIT_ACCEL buffer_is_zero_integer | 119 | diff --git a/hw/riscv/sifive_e.c b/hw/riscv/sifive_e.c |
118 | +#define INIT_ACCEL buffer_is_zero_int_ge256 | 120 | index XXXXXXX..XXXXXXX 100644 |
119 | #endif | 121 | --- a/hw/riscv/sifive_e.c |
120 | 122 | +++ b/hw/riscv/sifive_e.c | |
121 | static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL; | 123 | @@ -XXX,XX +XXX,XX @@ static void sifive_e_machine_init(MachineState *machine) |
122 | @@ -XXX,XX +XXX,XX @@ bool buffer_is_zero_ool(const void *buf, size_t len) | 124 | |
123 | if (likely(len >= 256)) { | 125 | if (machine->kernel_filename) { |
124 | return buffer_is_zero_accel(buf, len); | 126 | riscv_load_kernel(machine, &s->soc.cpus, |
125 | } | 127 | - memmap[SIFIVE_E_DEV_DTIM].base, NULL); |
126 | - return buffer_is_zero_integer(buf, len); | 128 | + memmap[SIFIVE_E_DEV_DTIM].base, |
127 | + return buffer_is_zero_int_lt256(buf, len); | 129 | + false, NULL); |
130 | } | ||
128 | } | 131 | } |
129 | 132 | ||
130 | bool buffer_is_zero_ge256(const void *buf, size_t len) | 133 | diff --git a/hw/riscv/sifive_u.c b/hw/riscv/sifive_u.c |
134 | index XXXXXXX..XXXXXXX 100644 | ||
135 | --- a/hw/riscv/sifive_u.c | ||
136 | +++ b/hw/riscv/sifive_u.c | ||
137 | @@ -XXX,XX +XXX,XX @@ static void sifive_u_machine_init(MachineState *machine) | ||
138 | firmware_end_addr); | ||
139 | |||
140 | kernel_entry = riscv_load_kernel(machine, &s->soc.u_cpus, | ||
141 | - kernel_start_addr, NULL); | ||
142 | - | ||
143 | - if (machine->initrd_filename) { | ||
144 | - riscv_load_initrd(machine, kernel_entry); | ||
145 | - } | ||
146 | - | ||
147 | - if (machine->kernel_cmdline && *machine->kernel_cmdline) { | ||
148 | - qemu_fdt_setprop_string(machine->fdt, "/chosen", "bootargs", | ||
149 | - machine->kernel_cmdline); | ||
150 | - } | ||
151 | + kernel_start_addr, true, NULL); | ||
152 | } else { | ||
153 | /* | ||
154 | * If dynamic firmware is used, it doesn't know where is the next mode | ||
155 | diff --git a/hw/riscv/spike.c b/hw/riscv/spike.c | ||
156 | index XXXXXXX..XXXXXXX 100644 | ||
157 | --- a/hw/riscv/spike.c | ||
158 | +++ b/hw/riscv/spike.c | ||
159 | @@ -XXX,XX +XXX,XX @@ static void spike_board_init(MachineState *machine) | ||
160 | |||
161 | kernel_entry = riscv_load_kernel(machine, &s->soc[0], | ||
162 | kernel_start_addr, | ||
163 | - htif_symbol_callback); | ||
164 | - | ||
165 | - if (machine->initrd_filename) { | ||
166 | - riscv_load_initrd(machine, kernel_entry); | ||
167 | - } | ||
168 | - | ||
169 | - if (machine->kernel_cmdline && *machine->kernel_cmdline) { | ||
170 | - qemu_fdt_setprop_string(machine->fdt, "/chosen", "bootargs", | ||
171 | - machine->kernel_cmdline); | ||
172 | - } | ||
173 | + true, htif_symbol_callback); | ||
174 | } else { | ||
175 | /* | ||
176 | * If dynamic firmware is used, it doesn't know where is the next mode | ||
177 | diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c | ||
178 | index XXXXXXX..XXXXXXX 100644 | ||
179 | --- a/hw/riscv/virt.c | ||
180 | +++ b/hw/riscv/virt.c | ||
181 | @@ -XXX,XX +XXX,XX @@ static void virt_machine_done(Notifier *notifier, void *data) | ||
182 | firmware_end_addr); | ||
183 | |||
184 | kernel_entry = riscv_load_kernel(machine, &s->soc[0], | ||
185 | - kernel_start_addr, NULL); | ||
186 | - | ||
187 | - if (machine->initrd_filename) { | ||
188 | - riscv_load_initrd(machine, kernel_entry); | ||
189 | - } | ||
190 | - | ||
191 | - if (machine->kernel_cmdline && *machine->kernel_cmdline) { | ||
192 | - qemu_fdt_setprop_string(machine->fdt, "/chosen", "bootargs", | ||
193 | - machine->kernel_cmdline); | ||
194 | - } | ||
195 | + kernel_start_addr, true, NULL); | ||
196 | } else { | ||
197 | /* | ||
198 | * If dynamic firmware is used, it doesn't know where is the next mode | ||
199 | diff --git a/include/hw/riscv/boot.h b/include/hw/riscv/boot.h | ||
200 | index XXXXXXX..XXXXXXX 100644 | ||
201 | --- a/include/hw/riscv/boot.h | ||
202 | +++ b/include/hw/riscv/boot.h | ||
203 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_firmware(const char *firmware_filename, | ||
204 | target_ulong riscv_load_kernel(MachineState *machine, | ||
205 | RISCVHartArrayState *harts, | ||
206 | target_ulong firmware_end_addr, | ||
207 | + bool load_initrd, | ||
208 | symbol_fn_t sym_cb); | ||
209 | void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry); | ||
210 | uint64_t riscv_compute_fdt_addr(hwaddr dram_start, uint64_t dram_size, | ||
131 | -- | 211 | -- |
132 | 2.34.1 | 212 | 2.39.1 |
133 | |||
134 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
2 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
3 | --- | ||
4 | util/bufferiszero.c | 7 ++++--- | ||
5 | 1 file changed, 4 insertions(+), 3 deletions(-) | ||
6 | 1 | ||
7 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
8 | index XXXXXXX..XXXXXXX 100644 | ||
9 | --- a/util/bufferiszero.c | ||
10 | +++ b/util/bufferiszero.c | ||
11 | @@ -XXX,XX +XXX,XX @@ | ||
12 | #include "qemu/bswap.h" | ||
13 | #include "host/cpuinfo.h" | ||
14 | |||
15 | -static bool (*buffer_is_zero_accel)(const void *, size_t); | ||
16 | +typedef bool (*biz_accel_fn)(const void *, size_t); | ||
17 | +static biz_accel_fn buffer_is_zero_accel; | ||
18 | |||
19 | static bool buffer_is_zero_int_lt256(const void *buf, size_t len) | ||
20 | { | ||
21 | @@ -XXX,XX +XXX,XX @@ select_accel_cpuinfo(unsigned info) | ||
22 | /* Array is sorted in order of algorithm preference. */ | ||
23 | static const struct { | ||
24 | unsigned bit; | ||
25 | - bool (*fn)(const void *, size_t); | ||
26 | + biz_accel_fn fn; | ||
27 | } all[] = { | ||
28 | #ifdef CONFIG_AVX2_OPT | ||
29 | { CPUINFO_AVX2, buffer_zero_avx2 }, | ||
30 | @@ -XXX,XX +XXX,XX @@ bool test_buffer_is_zero_next_accel(void) | ||
31 | #define INIT_ACCEL buffer_is_zero_int_ge256 | ||
32 | #endif | ||
33 | |||
34 | -static bool (*buffer_is_zero_accel)(const void *, size_t) = INIT_ACCEL; | ||
35 | +static biz_accel_fn buffer_is_zero_accel = INIT_ACCEL; | ||
36 | |||
37 | bool buffer_is_zero_ool(const void *buf, size_t len) | ||
38 | { | ||
39 | -- | ||
40 | 2.34.1 | ||
41 | |||
42 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Because the three alternatives are monotonic, we don't need | ||
2 | to keep a couple of bitmasks, just identify the strongest | ||
3 | alternative at startup. | ||
4 | 1 | ||
5 | Generalize test_buffer_is_zero_next_accel and init_accel | ||
6 | by always defining an accel_table array. | ||
7 | |||
8 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
9 | --- | ||
10 | util/bufferiszero.c | 81 ++++++++++++++++++++------------------------- | ||
11 | 1 file changed, 35 insertions(+), 46 deletions(-) | ||
12 | |||
13 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
14 | index XXXXXXX..XXXXXXX 100644 | ||
15 | --- a/util/bufferiszero.c | ||
16 | +++ b/util/bufferiszero.c | ||
17 | @@ -XXX,XX +XXX,XX @@ | ||
18 | #include "host/cpuinfo.h" | ||
19 | |||
20 | typedef bool (*biz_accel_fn)(const void *, size_t); | ||
21 | -static biz_accel_fn buffer_is_zero_accel; | ||
22 | |||
23 | static bool buffer_is_zero_int_lt256(const void *buf, size_t len) | ||
24 | { | ||
25 | @@ -XXX,XX +XXX,XX @@ buffer_zero_avx2(const void *buf, size_t len) | ||
26 | } | ||
27 | #endif /* CONFIG_AVX2_OPT */ | ||
28 | |||
29 | -static unsigned __attribute__((noinline)) | ||
30 | -select_accel_cpuinfo(unsigned info) | ||
31 | -{ | ||
32 | - /* Array is sorted in order of algorithm preference. */ | ||
33 | - static const struct { | ||
34 | - unsigned bit; | ||
35 | - biz_accel_fn fn; | ||
36 | - } all[] = { | ||
37 | +static biz_accel_fn const accel_table[] = { | ||
38 | + buffer_is_zero_int_ge256, | ||
39 | + buffer_zero_sse2, | ||
40 | #ifdef CONFIG_AVX2_OPT | ||
41 | - { CPUINFO_AVX2, buffer_zero_avx2 }, | ||
42 | + buffer_zero_avx2, | ||
43 | #endif | ||
44 | - { CPUINFO_SSE2, buffer_zero_sse2 }, | ||
45 | - { CPUINFO_ALWAYS, buffer_is_zero_int_ge256 }, | ||
46 | - }; | ||
47 | +}; | ||
48 | |||
49 | - for (unsigned i = 0; i < ARRAY_SIZE(all); ++i) { | ||
50 | - if (info & all[i].bit) { | ||
51 | - buffer_is_zero_accel = all[i].fn; | ||
52 | - return all[i].bit; | ||
53 | - } | ||
54 | +static unsigned best_accel(void) | ||
55 | +{ | ||
56 | + unsigned info = cpuinfo_init(); | ||
57 | + | ||
58 | +#ifdef CONFIG_AVX2_OPT | ||
59 | + if (info & CPUINFO_AVX2) { | ||
60 | + return 2; | ||
61 | } | ||
62 | - return 0; | ||
63 | +#endif | ||
64 | + return info & CPUINFO_SSE2 ? 1 : 0; | ||
65 | } | ||
66 | |||
67 | -static unsigned used_accel; | ||
68 | - | ||
69 | -static void __attribute__((constructor)) init_accel(void) | ||
70 | -{ | ||
71 | - used_accel = select_accel_cpuinfo(cpuinfo_init()); | ||
72 | -} | ||
73 | - | ||
74 | -#define INIT_ACCEL NULL | ||
75 | - | ||
76 | -bool test_buffer_is_zero_next_accel(void) | ||
77 | -{ | ||
78 | - /* | ||
79 | - * Accumulate the accelerators that we've already tested, and | ||
80 | - * remove them from the set to test this round. We'll get back | ||
81 | - * a zero from select_accel_cpuinfo when there are no more. | ||
82 | - */ | ||
83 | - unsigned used = select_accel_cpuinfo(cpuinfo & ~used_accel); | ||
84 | - used_accel |= used; | ||
85 | - return used; | ||
86 | -} | ||
87 | #else | ||
88 | -bool test_buffer_is_zero_next_accel(void) | ||
89 | -{ | ||
90 | - return false; | ||
91 | -} | ||
92 | - | ||
93 | -#define INIT_ACCEL buffer_is_zero_int_ge256 | ||
94 | +#define best_accel() 0 | ||
95 | +static biz_accel_fn const accel_table[1] = { | ||
96 | + buffer_is_zero_int_ge256 | ||
97 | +}; | ||
98 | #endif | ||
99 | |||
100 | -static biz_accel_fn buffer_is_zero_accel = INIT_ACCEL; | ||
101 | +static biz_accel_fn buffer_is_zero_accel; | ||
102 | +static unsigned accel_index; | ||
103 | |||
104 | bool buffer_is_zero_ool(const void *buf, size_t len) | ||
105 | { | ||
106 | @@ -XXX,XX +XXX,XX @@ bool buffer_is_zero_ge256(const void *buf, size_t len) | ||
107 | { | ||
108 | return buffer_is_zero_accel(buf, len); | ||
109 | } | ||
110 | + | ||
111 | +bool test_buffer_is_zero_next_accel(void) | ||
112 | +{ | ||
113 | + if (accel_index != 0) { | ||
114 | + buffer_is_zero_accel = accel_table[--accel_index]; | ||
115 | + return true; | ||
116 | + } | ||
117 | + return false; | ||
118 | +} | ||
119 | + | ||
120 | +static void __attribute__((constructor)) init_accel(void) | ||
121 | +{ | ||
122 | + accel_index = best_accel(); | ||
123 | + buffer_is_zero_accel = accel_table[accel_index]; | ||
124 | +} | ||
125 | -- | ||
126 | 2.34.1 | diff view generated by jsdifflib |
1 | Because non-embedded aarch64 is expected to have AdvSIMD enabled, merely | 1 | The only remaining caller is riscv_load_kernel_and_initrd() which |
---|---|---|---|
2 | double-check with the compiler flags for __ARM_NEON and don't bother with | 2 | belongs to the same file. |
3 | a runtime check. Otherwise, model the loop after the x86 SSE2 function. | ||
4 | 3 | ||
5 | Use UMAXV for the vector reduction. This is 3 cycles on cortex-a76 and | 4 | Signed-off-by: Daniel Henrique Barboza <dbarboza@ventanamicro.com> |
6 | 2 cycles on neoverse-n1. | 5 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> |
6 | Reviewed-by: Bin Meng <bmeng@tinylab.org> | ||
7 | Reviewed-by: Alistair Francis <alistair.francis@wdc.com> | ||
8 | --- | ||
9 | hw/riscv/boot.c | 80 ++++++++++++++++++++--------------------- | ||
10 | include/hw/riscv/boot.h | 1 - | ||
11 | 2 files changed, 40 insertions(+), 41 deletions(-) | ||
7 | 12 | ||
8 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | 13 | diff --git a/hw/riscv/boot.c b/hw/riscv/boot.c |
9 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
10 | --- | ||
11 | util/bufferiszero.c | 67 +++++++++++++++++++++++++++++++++++++++++++++ | ||
12 | 1 file changed, 67 insertions(+) | ||
13 | |||
14 | diff --git a/util/bufferiszero.c b/util/bufferiszero.c | ||
15 | index XXXXXXX..XXXXXXX 100644 | 14 | index XXXXXXX..XXXXXXX 100644 |
16 | --- a/util/bufferiszero.c | 15 | --- a/hw/riscv/boot.c |
17 | +++ b/util/bufferiszero.c | 16 | +++ b/hw/riscv/boot.c |
18 | @@ -XXX,XX +XXX,XX @@ static unsigned best_accel(void) | 17 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_firmware(const char *firmware_filename, |
19 | return info & CPUINFO_SSE2 ? 1 : 0; | 18 | exit(1); |
20 | } | 19 | } |
21 | 20 | ||
22 | +#elif defined(__aarch64__) && defined(__ARM_NEON) | 21 | +static void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry) |
23 | +#include <arm_neon.h> | 22 | +{ |
23 | + const char *filename = machine->initrd_filename; | ||
24 | + uint64_t mem_size = machine->ram_size; | ||
25 | + void *fdt = machine->fdt; | ||
26 | + hwaddr start, end; | ||
27 | + ssize_t size; | ||
24 | + | 28 | + |
25 | +/* | 29 | + g_assert(filename != NULL); |
26 | + * Helper for preventing the compiler from reassociating | ||
27 | + * chains of binary vector operations. | ||
28 | + */ | ||
29 | +#define REASSOC_BARRIER(vec0, vec1) asm("" : "+w"(vec0), "+w"(vec1)) | ||
30 | + | ||
31 | +static bool buffer_is_zero_simd(const void *buf, size_t len) | ||
32 | +{ | ||
33 | + uint32x4_t t0, t1, t2, t3; | ||
34 | + | ||
35 | + /* Align head/tail to 16-byte boundaries. */ | ||
36 | + const uint32x4_t *p = QEMU_ALIGN_PTR_DOWN(buf + 16, 16); | ||
37 | + const uint32x4_t *e = QEMU_ALIGN_PTR_DOWN(buf + len - 1, 16); | ||
38 | + | ||
39 | + /* Unaligned loads at head/tail. */ | ||
40 | + t0 = vld1q_u32(buf) | vld1q_u32(buf + len - 16); | ||
41 | + | ||
42 | + /* Collect a partial block at tail end. */ | ||
43 | + t1 = e[-7] | e[-6]; | ||
44 | + t2 = e[-5] | e[-4]; | ||
45 | + t3 = e[-3] | e[-2]; | ||
46 | + t0 |= e[-1]; | ||
47 | + REASSOC_BARRIER(t0, t1); | ||
48 | + REASSOC_BARRIER(t2, t3); | ||
49 | + t0 |= t1; | ||
50 | + t2 |= t3; | ||
51 | + REASSOC_BARRIER(t0, t2); | ||
52 | + t0 |= t2; | ||
53 | + | 30 | + |
54 | + /* | 31 | + /* |
55 | + * Loop over complete 128-byte blocks. | 32 | + * We want to put the initrd far enough into RAM that when the |
56 | + * With the head and tail removed, e - p >= 14, so the loop | 33 | + * kernel is uncompressed it will not clobber the initrd. However |
57 | + * must iterate at least once. | 34 | + * on boards without much RAM we must ensure that we still leave |
35 | + * enough room for a decent sized initrd, and on boards with large | ||
36 | + * amounts of RAM we must avoid the initrd being so far up in RAM | ||
37 | + * that it is outside lowmem and inaccessible to the kernel. | ||
38 | + * So for boards with less than 256MB of RAM we put the initrd | ||
39 | + * halfway into RAM, and for boards with 256MB of RAM or more we put | ||
40 | + * the initrd at 128MB. | ||
58 | + */ | 41 | + */ |
59 | + do { | 42 | + start = kernel_entry + MIN(mem_size / 2, 128 * MiB); |
60 | + /* | 43 | + |
61 | + * Reduce via UMAXV. Whatever the actual result, | 44 | + size = load_ramdisk(filename, start, mem_size - start); |
62 | + * it will only be zero if all input bytes are zero. | 45 | + if (size == -1) { |
63 | + */ | 46 | + size = load_image_targphys(filename, start, mem_size - start); |
64 | + if (unlikely(vmaxvq_u32(t0) != 0)) { | 47 | + if (size == -1) { |
65 | + return false; | 48 | + error_report("could not load ramdisk '%s'", filename); |
49 | + exit(1); | ||
66 | + } | 50 | + } |
51 | + } | ||
67 | + | 52 | + |
68 | + t0 = p[0] | p[1]; | 53 | + /* Some RISC-V machines (e.g. opentitan) don't have a fdt. */ |
69 | + t1 = p[2] | p[3]; | 54 | + if (fdt) { |
70 | + t2 = p[4] | p[5]; | 55 | + end = start + size; |
71 | + t3 = p[6] | p[7]; | 56 | + qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start", start); |
72 | + REASSOC_BARRIER(t0, t1); | 57 | + qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end", end); |
73 | + REASSOC_BARRIER(t2, t3); | 58 | + } |
74 | + t0 |= t1; | ||
75 | + t2 |= t3; | ||
76 | + REASSOC_BARRIER(t0, t2); | ||
77 | + t0 |= t2; | ||
78 | + p += 8; | ||
79 | + } while (p < e - 7); | ||
80 | + | ||
81 | + return vmaxvq_u32(t0) == 0; | ||
82 | +} | 59 | +} |
83 | + | 60 | + |
84 | +#define best_accel() 1 | 61 | target_ulong riscv_load_kernel(MachineState *machine, |
85 | +static biz_accel_fn const accel_table[] = { | 62 | RISCVHartArrayState *harts, |
86 | + buffer_is_zero_int_ge256, | 63 | target_ulong kernel_start_addr, |
87 | + buffer_is_zero_simd, | 64 | @@ -XXX,XX +XXX,XX @@ out: |
88 | +}; | 65 | return kernel_entry; |
89 | #else | 66 | } |
90 | #define best_accel() 0 | 67 | |
91 | static biz_accel_fn const accel_table[1] = { | 68 | -void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry) |
69 | -{ | ||
70 | - const char *filename = machine->initrd_filename; | ||
71 | - uint64_t mem_size = machine->ram_size; | ||
72 | - void *fdt = machine->fdt; | ||
73 | - hwaddr start, end; | ||
74 | - ssize_t size; | ||
75 | - | ||
76 | - g_assert(filename != NULL); | ||
77 | - | ||
78 | - /* | ||
79 | - * We want to put the initrd far enough into RAM that when the | ||
80 | - * kernel is uncompressed it will not clobber the initrd. However | ||
81 | - * on boards without much RAM we must ensure that we still leave | ||
82 | - * enough room for a decent sized initrd, and on boards with large | ||
83 | - * amounts of RAM we must avoid the initrd being so far up in RAM | ||
84 | - * that it is outside lowmem and inaccessible to the kernel. | ||
85 | - * So for boards with less than 256MB of RAM we put the initrd | ||
86 | - * halfway into RAM, and for boards with 256MB of RAM or more we put | ||
87 | - * the initrd at 128MB. | ||
88 | - */ | ||
89 | - start = kernel_entry + MIN(mem_size / 2, 128 * MiB); | ||
90 | - | ||
91 | - size = load_ramdisk(filename, start, mem_size - start); | ||
92 | - if (size == -1) { | ||
93 | - size = load_image_targphys(filename, start, mem_size - start); | ||
94 | - if (size == -1) { | ||
95 | - error_report("could not load ramdisk '%s'", filename); | ||
96 | - exit(1); | ||
97 | - } | ||
98 | - } | ||
99 | - | ||
100 | - /* Some RISC-V machines (e.g. opentitan) don't have a fdt. */ | ||
101 | - if (fdt) { | ||
102 | - end = start + size; | ||
103 | - qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start", start); | ||
104 | - qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end", end); | ||
105 | - } | ||
106 | -} | ||
107 | - | ||
108 | /* | ||
109 | * This function makes an assumption that the DRAM interval | ||
110 | * 'dram_base' + 'dram_size' is contiguous. | ||
111 | diff --git a/include/hw/riscv/boot.h b/include/hw/riscv/boot.h | ||
112 | index XXXXXXX..XXXXXXX 100644 | ||
113 | --- a/include/hw/riscv/boot.h | ||
114 | +++ b/include/hw/riscv/boot.h | ||
115 | @@ -XXX,XX +XXX,XX @@ target_ulong riscv_load_kernel(MachineState *machine, | ||
116 | target_ulong firmware_end_addr, | ||
117 | bool load_initrd, | ||
118 | symbol_fn_t sym_cb); | ||
119 | -void riscv_load_initrd(MachineState *machine, uint64_t kernel_entry); | ||
120 | uint64_t riscv_compute_fdt_addr(hwaddr dram_start, uint64_t dram_size, | ||
121 | MachineState *ms); | ||
122 | void riscv_load_fdt(hwaddr fdt_addr, void *fdt); | ||
92 | -- | 123 | -- |
93 | 2.34.1 | 124 | 2.39.1 |
94 | 125 | ||
95 | 126 | diff view generated by jsdifflib |
Deleted patch | |||
---|---|---|---|
1 | Benchmark each acceleration function vs an aligned buffer of zeros. | ||
2 | 1 | ||
3 | Reviewed-by: Philippe Mathieu-Daudé <philmd@linaro.org> | ||
4 | Signed-off-by: Richard Henderson <richard.henderson@linaro.org> | ||
5 | --- | ||
6 | tests/bench/bufferiszero-bench.c | 47 ++++++++++++++++++++++++++++++++ | ||
7 | tests/bench/meson.build | 1 + | ||
8 | 2 files changed, 48 insertions(+) | ||
9 | create mode 100644 tests/bench/bufferiszero-bench.c | ||
10 | |||
11 | diff --git a/tests/bench/bufferiszero-bench.c b/tests/bench/bufferiszero-bench.c | ||
12 | new file mode 100644 | ||
13 | index XXXXXXX..XXXXXXX | ||
14 | --- /dev/null | ||
15 | +++ b/tests/bench/bufferiszero-bench.c | ||
16 | @@ -XXX,XX +XXX,XX @@ | ||
17 | +/* | ||
18 | + * QEMU buffer_is_zero speed benchmark | ||
19 | + * | ||
20 | + * This work is licensed under the terms of the GNU GPL, version 2 or | ||
21 | + * (at your option) any later version. See the COPYING file in the | ||
22 | + * top-level directory. | ||
23 | + */ | ||
24 | +#include "qemu/osdep.h" | ||
25 | +#include "qemu/cutils.h" | ||
26 | +#include "qemu/units.h" | ||
27 | + | ||
28 | +static void test(const void *opaque) | ||
29 | +{ | ||
30 | + size_t max = 64 * KiB; | ||
31 | + void *buf = g_malloc0(max); | ||
32 | + int accel_index = 0; | ||
33 | + | ||
34 | + do { | ||
35 | + if (accel_index != 0) { | ||
36 | + g_test_message("%s", ""); /* gnu_printf Werror for simple "" */ | ||
37 | + } | ||
38 | + for (size_t len = 1 * KiB; len <= max; len *= 4) { | ||
39 | + double total = 0.0; | ||
40 | + | ||
41 | + g_test_timer_start(); | ||
42 | + do { | ||
43 | + buffer_is_zero_ge256(buf, len); | ||
44 | + total += len; | ||
45 | + } while (g_test_timer_elapsed() < 0.5); | ||
46 | + | ||
47 | + total /= MiB; | ||
48 | + g_test_message("buffer_is_zero #%d: %2zuKB %8.0f MB/sec", | ||
49 | + accel_index, len / (size_t)KiB, | ||
50 | + total / g_test_timer_last()); | ||
51 | + } | ||
52 | + accel_index++; | ||
53 | + } while (test_buffer_is_zero_next_accel()); | ||
54 | + | ||
55 | + g_free(buf); | ||
56 | +} | ||
57 | + | ||
58 | +int main(int argc, char **argv) | ||
59 | +{ | ||
60 | + g_test_init(&argc, &argv, NULL); | ||
61 | + g_test_add_data_func("/cutils/bufferiszero/speed", NULL, test); | ||
62 | + return g_test_run(); | ||
63 | +} | ||
64 | diff --git a/tests/bench/meson.build b/tests/bench/meson.build | ||
65 | index XXXXXXX..XXXXXXX 100644 | ||
66 | --- a/tests/bench/meson.build | ||
67 | +++ b/tests/bench/meson.build | ||
68 | @@ -XXX,XX +XXX,XX @@ benchs = {} | ||
69 | |||
70 | if have_block | ||
71 | benchs += { | ||
72 | + 'bufferiszero-bench': [], | ||
73 | 'benchmark-crypto-hash': [crypto], | ||
74 | 'benchmark-crypto-hmac': [crypto], | ||
75 | 'benchmark-crypto-cipher': [crypto], | ||
76 | -- | ||
77 | 2.34.1 | ||
78 | |||
79 | diff view generated by jsdifflib |