This will enable us to decouple code translation from the value
of parallel_cpus at any given time. It will also help us minimize
TB flushes when generating code via EXCP_ATOMIC.
Note that the declaration of parallel_cpus is brought to exec-all.h
to be able to define there the "curr_cflags" inline.
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
include/exec/exec-all.h | 20 +++++++++++++++++++-
include/exec/tb-hash-xx.h | 9 ++++++---
include/exec/tb-hash.h | 4 ++--
include/exec/tb-lookup.h | 6 +++---
tcg/tcg.h | 1 -
accel/tcg/cpu-exec.c | 45 +++++++++++++++++++++++----------------------
accel/tcg/translate-all.c | 13 +++++++++----
exec.c | 2 +-
tcg/tcg-runtime.c | 2 +-
tests/qht-bench.c | 2 +-
10 files changed, 65 insertions(+), 39 deletions(-)
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 8cbd90b..f6a928d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -353,6 +353,9 @@ struct TranslationBlock {
#define CF_USE_ICOUNT 0x20000
#define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
#define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */
+#define CF_PARALLEL 0x100000 /* Generate code for a parallel context */
+/* cflags' mask for hashing/comparison */
+#define CF_HASH_MASK (CF_COUNT_MASK | CF_PARALLEL)
/* Per-vCPU dynamic tracing state used to generate this TB */
uint32_t trace_vcpu_dstate;
@@ -396,11 +399,26 @@ struct TranslationBlock {
uintptr_t jmp_list_first;
};
+extern bool parallel_cpus;
+
+/* Hide the atomic_read to make code a little easier on the eyes */
+static inline uint32_t tb_cflags(const TranslationBlock *tb)
+{
+ return atomic_read(&tb->cflags);
+}
+
+/* current cflags for hashing/comparison */
+static inline uint32_t curr_cflags(void)
+{
+ return parallel_cpus ? CF_PARALLEL : 0;
+}
+
void tb_free(TranslationBlock *tb);
void tb_flush(CPUState *cpu);
void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
- target_ulong cs_base, uint32_t flags);
+ target_ulong cs_base, uint32_t flags,
+ uint32_t cf_mask);
#if defined(USE_DIRECT_JUMP)
diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
index 6cd3022..747a9a6 100644
--- a/include/exec/tb-hash-xx.h
+++ b/include/exec/tb-hash-xx.h
@@ -48,8 +48,8 @@
* xxhash32, customized for input variables that are not guaranteed to be
* contiguous in memory.
*/
-static inline
-uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
+static inline uint32_t
+tb_hash_func7(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f, uint32_t g)
{
uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
@@ -78,7 +78,7 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
v4 *= PRIME32_1;
h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
- h32 += 24;
+ h32 += 28;
h32 += e * PRIME32_3;
h32 = rol32(h32, 17) * PRIME32_4;
@@ -86,6 +86,9 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
h32 += f * PRIME32_3;
h32 = rol32(h32, 17) * PRIME32_4;
+ h32 += g * PRIME32_3;
+ h32 = rol32(h32, 17) * PRIME32_4;
+
h32 ^= h32 >> 15;
h32 *= PRIME32_2;
h32 ^= h32 >> 13;
diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
index 17b5ee0..0526c4f 100644
--- a/include/exec/tb-hash.h
+++ b/include/exec/tb-hash.h
@@ -59,9 +59,9 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
static inline
uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags,
- uint32_t trace_vcpu_dstate)
+ uint32_t cf_mask, uint32_t trace_vcpu_dstate)
{
- return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate);
+ return tb_hash_func7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
}
#endif
diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
index 436b6d5..2961385 100644
--- a/include/exec/tb-lookup.h
+++ b/include/exec/tb-lookup.h
@@ -21,7 +21,7 @@
/* Might cause an exception, so have a longjmp destination ready */
static inline TranslationBlock *
tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
- uint32_t *flags)
+ uint32_t *flags, uint32_t cf_mask)
{
CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
@@ -35,10 +35,10 @@ tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
tb->cs_base == *cs_base &&
tb->flags == *flags &&
tb->trace_vcpu_dstate == *cpu->trace_dstate &&
- !(atomic_read(&tb->cflags) & CF_INVALID))) {
+ (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
return tb;
}
- tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags);
+ tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
if (tb == NULL) {
return NULL;
}
diff --git a/tcg/tcg.h b/tcg/tcg.h
index da78721..96872f8 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -730,7 +730,6 @@ struct TCGContext {
};
extern TCGContext tcg_ctx;
-extern bool parallel_cpus;
static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
{
diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
index fae8c40..b71e015 100644
--- a/accel/tcg/cpu-exec.c
+++ b/accel/tcg/cpu-exec.c
@@ -207,7 +207,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
tb_lock();
tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
max_cycles | CF_NOCACHE
- | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
+ | (ignore_icount ? CF_IGNORE_ICOUNT : 0)
+ | curr_cflags());
tb->orig_tb = orig_tb;
tb_unlock();
@@ -225,31 +226,27 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
static void cpu_exec_step(CPUState *cpu)
{
CPUClass *cc = CPU_GET_CLASS(cpu);
- CPUArchState *env = (CPUArchState *)cpu->env_ptr;
TranslationBlock *tb;
target_ulong cs_base, pc;
uint32_t flags;
+ uint32_t cflags = 1 | CF_IGNORE_ICOUNT;
- cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
if (sigsetjmp(cpu->jmp_env, 0) == 0) {
- mmap_lock();
- tb_lock();
- tb = tb_gen_code(cpu, pc, cs_base, flags,
- 1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
- tb->orig_tb = NULL;
- tb_unlock();
- mmap_unlock();
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags,
+ cflags & CF_HASH_MASK);
+ if (tb == NULL) {
+ mmap_lock();
+ tb_lock();
+ tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
+ tb_unlock();
+ mmap_unlock();
+ }
cc->cpu_exec_enter(cpu);
/* execute the generated code */
- trace_exec_tb_nocache(tb, pc);
+ trace_exec_tb(tb, pc);
cpu_tb_exec(cpu, tb);
cc->cpu_exec_exit(cpu);
-
- tb_lock();
- tb_phys_invalidate(tb, -1);
- tb_free(tb);
- tb_unlock();
} else {
/* We may have exited due to another problem here, so we need
* to reset any tb_locks we may have taken but didn't release.
@@ -281,6 +278,7 @@ struct tb_desc {
CPUArchState *env;
tb_page_addr_t phys_page1;
uint32_t flags;
+ uint32_t cf_mask;
uint32_t trace_vcpu_dstate;
};
@@ -294,7 +292,7 @@ static bool tb_cmp(const void *p, const void *d)
tb->cs_base == desc->cs_base &&
tb->flags == desc->flags &&
tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
- !(atomic_read(&tb->cflags) & CF_INVALID)) {
+ (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) {
/* check next page if needed */
if (tb->page_addr[1] == -1) {
return true;
@@ -313,7 +311,8 @@ static bool tb_cmp(const void *p, const void *d)
}
TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
- target_ulong cs_base, uint32_t flags)
+ target_ulong cs_base, uint32_t flags,
+ uint32_t cf_mask)
{
tb_page_addr_t phys_pc;
struct tb_desc desc;
@@ -322,11 +321,12 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
desc.env = (CPUArchState *)cpu->env_ptr;
desc.cs_base = cs_base;
desc.flags = flags;
+ desc.cf_mask = cf_mask;
desc.trace_vcpu_dstate = *cpu->trace_dstate;
desc.pc = pc;
phys_pc = get_page_addr_code(desc.env, pc);
desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
- h = tb_hash_func(phys_pc, pc, flags, *cpu->trace_dstate);
+ h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
}
@@ -338,8 +338,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
target_ulong cs_base, pc;
uint32_t flags;
bool acquired_tb_lock = false;
+ uint32_t cf_mask = curr_cflags();
- tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
if (tb == NULL) {
/* mmap_lock is needed by tb_gen_code, and mmap_lock must be
* taken outside tb_lock. As system emulation is currently
@@ -352,10 +353,10 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
/* There's a chance that our desired tb has been translated while
* taking the locks so we check again inside the lock.
*/
- tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+ tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
if (likely(tb == NULL)) {
/* if no translated code available, then translate it now */
- tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
+ tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
}
mmap_unlock();
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index c8abef0..c1ce38f 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -1077,7 +1077,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
/* remove the TB from the hash list */
phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
+ tb->trace_vcpu_dstate);
qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
/* remove the TB from the page list */
@@ -1222,7 +1223,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
}
/* add in the hash table */
- h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
+ h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
+ tb->trace_vcpu_dstate);
qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
#ifdef DEBUG_TB_CHECK
@@ -1503,7 +1505,8 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
/* we generate a block containing just the instruction
modifying the memory. It will ensure that it cannot modify
itself */
- tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
+ tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
+ 1 | curr_cflags());
cpu_loop_exit_noexc(cpu);
}
#endif
@@ -1621,7 +1624,8 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
/* we generate a block containing just the instruction
modifying the memory. It will ensure that it cannot modify
itself */
- tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
+ tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
+ 1 | curr_cflags());
/* tb_lock will be reset after cpu_loop_exit_noexc longjmps
* back into the cpu_exec loop. */
return true;
@@ -1765,6 +1769,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
}
cflags = n | CF_LAST_IO;
+ cflags |= curr_cflags();
pc = tb->pc;
cs_base = tb->cs_base;
flags = tb->flags;
diff --git a/exec.c b/exec.c
index 01ac21e..94b0f3e 100644
--- a/exec.c
+++ b/exec.c
@@ -2433,7 +2433,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
cpu_loop_exit(cpu);
} else {
cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
- tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
+ tb_gen_code(cpu, pc, cs_base, cpu_flags, 1 | curr_cflags());
cpu_loop_exit_noexc(cpu);
}
}
diff --git a/tcg/tcg-runtime.c b/tcg/tcg-runtime.c
index 7100339..4f873a9 100644
--- a/tcg/tcg-runtime.c
+++ b/tcg/tcg-runtime.c
@@ -151,7 +151,7 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
target_ulong cs_base, pc;
uint32_t flags;
- tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
+ tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags());
if (tb == NULL) {
return tcg_ctx.code_gen_epilogue;
}
diff --git a/tests/qht-bench.c b/tests/qht-bench.c
index 11c1cec..4cabdfd 100644
--- a/tests/qht-bench.c
+++ b/tests/qht-bench.c
@@ -103,7 +103,7 @@ static bool is_equal(const void *obj, const void *userp)
static inline uint32_t h(unsigned long v)
{
- return tb_hash_func6(v, 0, 0, 0);
+ return tb_hash_func7(v, 0, 0, 0, 0);
}
/*
--
2.7.4
On 07/20/2017 07:59 PM, Emilio G. Cota wrote: > This will enable us to decouple code translation from the value > of parallel_cpus at any given time. It will also help us minimize > TB flushes when generating code via EXCP_ATOMIC. > > Note that the declaration of parallel_cpus is brought to exec-all.h > to be able to define there the "curr_cflags" inline. > > Signed-off-by: Emilio G. Cota<cota@braap.org> > --- > include/exec/exec-all.h | 20 +++++++++++++++++++- > include/exec/tb-hash-xx.h | 9 ++++++--- > include/exec/tb-hash.h | 4 ++-- > include/exec/tb-lookup.h | 6 +++--- > tcg/tcg.h | 1 - > accel/tcg/cpu-exec.c | 45 +++++++++++++++++++++++---------------------- > accel/tcg/translate-all.c | 13 +++++++++---- > exec.c | 2 +- > tcg/tcg-runtime.c | 2 +- > tests/qht-bench.c | 2 +- > 10 files changed, 65 insertions(+), 39 deletions(-) Reviewed-by: Richard Henderson <rth@twiddle.net> r~
Hi Emilio,
On Fri, Jul 21, 2017 at 1:59 AM, Emilio G. Cota <cota@braap.org> wrote:
> This will enable us to decouple code translation from the value
> of parallel_cpus at any given time. It will also help us minimize
> TB flushes when generating code via EXCP_ATOMIC.
>
> Note that the declaration of parallel_cpus is brought to exec-all.h
> to be able to define there the "curr_cflags" inline.
>
> Signed-off-by: Emilio G. Cota <cota@braap.org>
I was testing a winxp image today and I bisected an infinite loop to
this commit. The loop happens both with and without mttcg, so I think
it has got to do with something else.
It seems to be executing this instruction:
Trace 0x7fffc1d036c0 [0: 00000000000c9a6b]
----------------
IN:
0x00000000000c9a6b: rep movsb %cs:(%si),%es:(%di)
and never stops.
You can find an execution log here: http://pranith.org/files/log_n.txt.gz
Let me know if you need more information.
Thanks,
> ---
> include/exec/exec-all.h | 20 +++++++++++++++++++-
> include/exec/tb-hash-xx.h | 9 ++++++---
> include/exec/tb-hash.h | 4 ++--
> include/exec/tb-lookup.h | 6 +++---
> tcg/tcg.h | 1 -
> accel/tcg/cpu-exec.c | 45 +++++++++++++++++++++++----------------------
> accel/tcg/translate-all.c | 13 +++++++++----
> exec.c | 2 +-
> tcg/tcg-runtime.c | 2 +-
> tests/qht-bench.c | 2 +-
> 10 files changed, 65 insertions(+), 39 deletions(-)
>
> diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
> index 8cbd90b..f6a928d 100644
> --- a/include/exec/exec-all.h
> +++ b/include/exec/exec-all.h
> @@ -353,6 +353,9 @@ struct TranslationBlock {
> #define CF_USE_ICOUNT 0x20000
> #define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
> #define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */
> +#define CF_PARALLEL 0x100000 /* Generate code for a parallel context */
> +/* cflags' mask for hashing/comparison */
> +#define CF_HASH_MASK (CF_COUNT_MASK | CF_PARALLEL)
>
> /* Per-vCPU dynamic tracing state used to generate this TB */
> uint32_t trace_vcpu_dstate;
> @@ -396,11 +399,26 @@ struct TranslationBlock {
> uintptr_t jmp_list_first;
> };
>
> +extern bool parallel_cpus;
> +
> +/* Hide the atomic_read to make code a little easier on the eyes */
> +static inline uint32_t tb_cflags(const TranslationBlock *tb)
> +{
> + return atomic_read(&tb->cflags);
> +}
> +
> +/* current cflags for hashing/comparison */
> +static inline uint32_t curr_cflags(void)
> +{
> + return parallel_cpus ? CF_PARALLEL : 0;
> +}
> +
> void tb_free(TranslationBlock *tb);
> void tb_flush(CPUState *cpu);
> void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
> TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> - target_ulong cs_base, uint32_t flags);
> + target_ulong cs_base, uint32_t flags,
> + uint32_t cf_mask);
>
> #if defined(USE_DIRECT_JUMP)
>
> diff --git a/include/exec/tb-hash-xx.h b/include/exec/tb-hash-xx.h
> index 6cd3022..747a9a6 100644
> --- a/include/exec/tb-hash-xx.h
> +++ b/include/exec/tb-hash-xx.h
> @@ -48,8 +48,8 @@
> * xxhash32, customized for input variables that are not guaranteed to be
> * contiguous in memory.
> */
> -static inline
> -uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
> +static inline uint32_t
> +tb_hash_func7(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f, uint32_t g)
> {
> uint32_t v1 = TB_HASH_XX_SEED + PRIME32_1 + PRIME32_2;
> uint32_t v2 = TB_HASH_XX_SEED + PRIME32_2;
> @@ -78,7 +78,7 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
> v4 *= PRIME32_1;
>
> h32 = rol32(v1, 1) + rol32(v2, 7) + rol32(v3, 12) + rol32(v4, 18);
> - h32 += 24;
> + h32 += 28;
>
> h32 += e * PRIME32_3;
> h32 = rol32(h32, 17) * PRIME32_4;
> @@ -86,6 +86,9 @@ uint32_t tb_hash_func6(uint64_t a0, uint64_t b0, uint32_t e, uint32_t f)
> h32 += f * PRIME32_3;
> h32 = rol32(h32, 17) * PRIME32_4;
>
> + h32 += g * PRIME32_3;
> + h32 = rol32(h32, 17) * PRIME32_4;
> +
> h32 ^= h32 >> 15;
> h32 *= PRIME32_2;
> h32 ^= h32 >> 13;
> diff --git a/include/exec/tb-hash.h b/include/exec/tb-hash.h
> index 17b5ee0..0526c4f 100644
> --- a/include/exec/tb-hash.h
> +++ b/include/exec/tb-hash.h
> @@ -59,9 +59,9 @@ static inline unsigned int tb_jmp_cache_hash_func(target_ulong pc)
>
> static inline
> uint32_t tb_hash_func(tb_page_addr_t phys_pc, target_ulong pc, uint32_t flags,
> - uint32_t trace_vcpu_dstate)
> + uint32_t cf_mask, uint32_t trace_vcpu_dstate)
> {
> - return tb_hash_func6(phys_pc, pc, flags, trace_vcpu_dstate);
> + return tb_hash_func7(phys_pc, pc, flags, cf_mask, trace_vcpu_dstate);
> }
>
> #endif
> diff --git a/include/exec/tb-lookup.h b/include/exec/tb-lookup.h
> index 436b6d5..2961385 100644
> --- a/include/exec/tb-lookup.h
> +++ b/include/exec/tb-lookup.h
> @@ -21,7 +21,7 @@
> /* Might cause an exception, so have a longjmp destination ready */
> static inline TranslationBlock *
> tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
> - uint32_t *flags)
> + uint32_t *flags, uint32_t cf_mask)
> {
> CPUArchState *env = (CPUArchState *)cpu->env_ptr;
> TranslationBlock *tb;
> @@ -35,10 +35,10 @@ tb_lookup__cpu_state(CPUState *cpu, target_ulong *pc, target_ulong *cs_base,
> tb->cs_base == *cs_base &&
> tb->flags == *flags &&
> tb->trace_vcpu_dstate == *cpu->trace_dstate &&
> - !(atomic_read(&tb->cflags) & CF_INVALID))) {
> + (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == cf_mask)) {
> return tb;
> }
> - tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags);
> + tb = tb_htable_lookup(cpu, *pc, *cs_base, *flags, cf_mask);
> if (tb == NULL) {
> return NULL;
> }
> diff --git a/tcg/tcg.h b/tcg/tcg.h
> index da78721..96872f8 100644
> --- a/tcg/tcg.h
> +++ b/tcg/tcg.h
> @@ -730,7 +730,6 @@ struct TCGContext {
> };
>
> extern TCGContext tcg_ctx;
> -extern bool parallel_cpus;
>
> static inline void tcg_set_insn_param(int op_idx, int arg, TCGArg v)
> {
> diff --git a/accel/tcg/cpu-exec.c b/accel/tcg/cpu-exec.c
> index fae8c40..b71e015 100644
> --- a/accel/tcg/cpu-exec.c
> +++ b/accel/tcg/cpu-exec.c
> @@ -207,7 +207,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> tb_lock();
> tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
> max_cycles | CF_NOCACHE
> - | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
> + | (ignore_icount ? CF_IGNORE_ICOUNT : 0)
> + | curr_cflags());
> tb->orig_tb = orig_tb;
> tb_unlock();
>
> @@ -225,31 +226,27 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
> static void cpu_exec_step(CPUState *cpu)
> {
> CPUClass *cc = CPU_GET_CLASS(cpu);
> - CPUArchState *env = (CPUArchState *)cpu->env_ptr;
> TranslationBlock *tb;
> target_ulong cs_base, pc;
> uint32_t flags;
> + uint32_t cflags = 1 | CF_IGNORE_ICOUNT;
>
> - cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> if (sigsetjmp(cpu->jmp_env, 0) == 0) {
> - mmap_lock();
> - tb_lock();
> - tb = tb_gen_code(cpu, pc, cs_base, flags,
> - 1 | CF_NOCACHE | CF_IGNORE_ICOUNT);
> - tb->orig_tb = NULL;
> - tb_unlock();
> - mmap_unlock();
> + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags,
> + cflags & CF_HASH_MASK);
> + if (tb == NULL) {
> + mmap_lock();
> + tb_lock();
> + tb = tb_gen_code(cpu, pc, cs_base, flags, cflags);
> + tb_unlock();
> + mmap_unlock();
> + }
>
> cc->cpu_exec_enter(cpu);
> /* execute the generated code */
> - trace_exec_tb_nocache(tb, pc);
> + trace_exec_tb(tb, pc);
> cpu_tb_exec(cpu, tb);
> cc->cpu_exec_exit(cpu);
> -
> - tb_lock();
> - tb_phys_invalidate(tb, -1);
> - tb_free(tb);
> - tb_unlock();
> } else {
> /* We may have exited due to another problem here, so we need
> * to reset any tb_locks we may have taken but didn't release.
> @@ -281,6 +278,7 @@ struct tb_desc {
> CPUArchState *env;
> tb_page_addr_t phys_page1;
> uint32_t flags;
> + uint32_t cf_mask;
> uint32_t trace_vcpu_dstate;
> };
>
> @@ -294,7 +292,7 @@ static bool tb_cmp(const void *p, const void *d)
> tb->cs_base == desc->cs_base &&
> tb->flags == desc->flags &&
> tb->trace_vcpu_dstate == desc->trace_vcpu_dstate &&
> - !(atomic_read(&tb->cflags) & CF_INVALID)) {
> + (tb_cflags(tb) & (CF_HASH_MASK | CF_INVALID)) == desc->cf_mask) {
> /* check next page if needed */
> if (tb->page_addr[1] == -1) {
> return true;
> @@ -313,7 +311,8 @@ static bool tb_cmp(const void *p, const void *d)
> }
>
> TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> - target_ulong cs_base, uint32_t flags)
> + target_ulong cs_base, uint32_t flags,
> + uint32_t cf_mask)
> {
> tb_page_addr_t phys_pc;
> struct tb_desc desc;
> @@ -322,11 +321,12 @@ TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
> desc.env = (CPUArchState *)cpu->env_ptr;
> desc.cs_base = cs_base;
> desc.flags = flags;
> + desc.cf_mask = cf_mask;
> desc.trace_vcpu_dstate = *cpu->trace_dstate;
> desc.pc = pc;
> phys_pc = get_page_addr_code(desc.env, pc);
> desc.phys_page1 = phys_pc & TARGET_PAGE_MASK;
> - h = tb_hash_func(phys_pc, pc, flags, *cpu->trace_dstate);
> + h = tb_hash_func(phys_pc, pc, flags, cf_mask, *cpu->trace_dstate);
> return qht_lookup(&tcg_ctx.tb_ctx.htable, tb_cmp, &desc, h);
> }
>
> @@ -338,8 +338,9 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> target_ulong cs_base, pc;
> uint32_t flags;
> bool acquired_tb_lock = false;
> + uint32_t cf_mask = curr_cflags();
>
> - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
> + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, cf_mask);
> if (tb == NULL) {
> /* mmap_lock is needed by tb_gen_code, and mmap_lock must be
> * taken outside tb_lock. As system emulation is currently
> @@ -352,10 +353,10 @@ static inline TranslationBlock *tb_find(CPUState *cpu,
> /* There's a chance that our desired tb has been translated while
> * taking the locks so we check again inside the lock.
> */
> - tb = tb_htable_lookup(cpu, pc, cs_base, flags);
> + tb = tb_htable_lookup(cpu, pc, cs_base, flags, cf_mask);
> if (likely(tb == NULL)) {
> /* if no translated code available, then translate it now */
> - tb = tb_gen_code(cpu, pc, cs_base, flags, 0);
> + tb = tb_gen_code(cpu, pc, cs_base, flags, cf_mask);
> }
>
> mmap_unlock();
> diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
> index c8abef0..c1ce38f 100644
> --- a/accel/tcg/translate-all.c
> +++ b/accel/tcg/translate-all.c
> @@ -1077,7 +1077,8 @@ void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr)
>
> /* remove the TB from the hash list */
> phys_pc = tb->page_addr[0] + (tb->pc & ~TARGET_PAGE_MASK);
> - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
> + h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
> + tb->trace_vcpu_dstate);
> qht_remove(&tcg_ctx.tb_ctx.htable, tb, h);
>
> /* remove the TB from the page list */
> @@ -1222,7 +1223,8 @@ static void tb_link_page(TranslationBlock *tb, tb_page_addr_t phys_pc,
> }
>
> /* add in the hash table */
> - h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->trace_vcpu_dstate);
> + h = tb_hash_func(phys_pc, tb->pc, tb->flags, tb->cflags & CF_HASH_MASK,
> + tb->trace_vcpu_dstate);
> qht_insert(&tcg_ctx.tb_ctx.htable, tb, h);
>
> #ifdef DEBUG_TB_CHECK
> @@ -1503,7 +1505,8 @@ void tb_invalidate_phys_page_range(tb_page_addr_t start, tb_page_addr_t end,
> /* we generate a block containing just the instruction
> modifying the memory. It will ensure that it cannot modify
> itself */
> - tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
> + tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
> + 1 | curr_cflags());
> cpu_loop_exit_noexc(cpu);
> }
> #endif
> @@ -1621,7 +1624,8 @@ static bool tb_invalidate_phys_page(tb_page_addr_t addr, uintptr_t pc)
> /* we generate a block containing just the instruction
> modifying the memory. It will ensure that it cannot modify
> itself */
> - tb_gen_code(cpu, current_pc, current_cs_base, current_flags, 1);
> + tb_gen_code(cpu, current_pc, current_cs_base, current_flags,
> + 1 | curr_cflags());
> /* tb_lock will be reset after cpu_loop_exit_noexc longjmps
> * back into the cpu_exec loop. */
> return true;
> @@ -1765,6 +1769,7 @@ void cpu_io_recompile(CPUState *cpu, uintptr_t retaddr)
> }
>
> cflags = n | CF_LAST_IO;
> + cflags |= curr_cflags();
> pc = tb->pc;
> cs_base = tb->cs_base;
> flags = tb->flags;
> diff --git a/exec.c b/exec.c
> index 01ac21e..94b0f3e 100644
> --- a/exec.c
> +++ b/exec.c
> @@ -2433,7 +2433,7 @@ static void check_watchpoint(int offset, int len, MemTxAttrs attrs, int flags)
> cpu_loop_exit(cpu);
> } else {
> cpu_get_tb_cpu_state(env, &pc, &cs_base, &cpu_flags);
> - tb_gen_code(cpu, pc, cs_base, cpu_flags, 1);
> + tb_gen_code(cpu, pc, cs_base, cpu_flags, 1 | curr_cflags());
> cpu_loop_exit_noexc(cpu);
> }
> }
> diff --git a/tcg/tcg-runtime.c b/tcg/tcg-runtime.c
> index 7100339..4f873a9 100644
> --- a/tcg/tcg-runtime.c
> +++ b/tcg/tcg-runtime.c
> @@ -151,7 +151,7 @@ void *HELPER(lookup_tb_ptr)(CPUArchState *env)
> target_ulong cs_base, pc;
> uint32_t flags;
>
> - tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags);
> + tb = tb_lookup__cpu_state(cpu, &pc, &cs_base, &flags, curr_cflags());
> if (tb == NULL) {
> return tcg_ctx.code_gen_epilogue;
> }
> diff --git a/tests/qht-bench.c b/tests/qht-bench.c
> index 11c1cec..4cabdfd 100644
> --- a/tests/qht-bench.c
> +++ b/tests/qht-bench.c
> @@ -103,7 +103,7 @@ static bool is_equal(const void *obj, const void *userp)
>
> static inline uint32_t h(unsigned long v)
> {
> - return tb_hash_func6(v, 0, 0, 0);
> + return tb_hash_func7(v, 0, 0, 0, 0);
> }
>
> /*
> --
> 2.7.4
>
>
On Sun, Aug 27, 2017 at 18:15:50 -0400, Pranith Kumar wrote:
> Hi Emilio,
>
> On Fri, Jul 21, 2017 at 1:59 AM, Emilio G. Cota <cota@braap.org> wrote:
> > This will enable us to decouple code translation from the value
> > of parallel_cpus at any given time. It will also help us minimize
> > TB flushes when generating code via EXCP_ATOMIC.
> >
> > Note that the declaration of parallel_cpus is brought to exec-all.h
> > to be able to define there the "curr_cflags" inline.
> >
> > Signed-off-by: Emilio G. Cota <cota@braap.org>
>
> I was testing a winxp image today and I bisected a infinite loop to
> this commit. The loop happens both with and without mttcg, so I think
> it has got to do with something else.
Can you test the below? It lets me boot ubuntu, otherwise it reliably
chokes on a 'rep movsb' *very* early (doesn't even get to grub).
This discussion on v2 might be relevant (I added CF_COUNT_MASK as a
result of it, but it seems I have to revisit that):
https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg06456.html
Anyway let me know if this fixes it for you. Thanks for testing!
Emilio
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 025fae0..8b2f233 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -326,7 +326,7 @@ struct TranslationBlock {
#define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */
#define CF_PARALLEL 0x100000 /* Generate code for a parallel context */
/* cflags' mask for hashing/comparison */
-#define CF_HASH_MASK (CF_COUNT_MASK | CF_PARALLEL)
+#define CF_HASH_MASK (CF_PARALLEL)
/* Per-vCPU dynamic tracing state used to generate this TB */
uint32_t trace_vcpu_dstate;
On Tue, Aug 29, 2017 at 5:16 PM, Emilio G. Cota <cota@braap.org> wrote: > On Sun, Aug 27, 2017 at 18:15:50 -0400, Pranith Kumar wrote: >> Hi Emilio, >> >> On Fri, Jul 21, 2017 at 1:59 AM, Emilio G. Cota <cota@braap.org> wrote: >> > This will enable us to decouple code translation from the value >> > of parallel_cpus at any given time. It will also help us minimize >> > TB flushes when generating code via EXCP_ATOMIC. >> > >> > Note that the declaration of parallel_cpus is brought to exec-all.h >> > to be able to define there the "curr_cflags" inline. >> > >> > Signed-off-by: Emilio G. Cota <cota@braap.org> >> >> I was testing a winxp image today and I bisected a infinite loop to >> this commit. The loop happens both with and without mttcg, so I think >> it has got to do with something else. > > Can you test the below? It lets me boot ubuntu, otherwise it reliably > chokes on a 'rep movsb' *very* early (doesn't even get to grub). > > This discusson on v2 might be relevant (I added CF_COUNT_MASK as a > result of it, but it seems I have to revisit that): > https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg06456.html > > Anyway let me know if this fixes it for you. Thanks for testing! > Yes, this fixes the issue for me. Thanks, -- Pranith
Hi Richard, Are you planning to get this patchset merged in this window? If so, I can give it a respin on top of the current master. Anyway, before doing so we should fix the issue around CF_COUNT_MASK that Pranith reported: On Wed, Aug 30, 2017 at 10:43:28 -0400, Pranith Kumar wrote: > On Tue, Aug 29, 2017 at 5:16 PM, Emilio G. Cota <cota@braap.org> wrote: > > On Sun, Aug 27, 2017 at 18:15:50 -0400, Pranith Kumar wrote: > >> Hi Emilio, > >> > >> On Fri, Jul 21, 2017 at 1:59 AM, Emilio G. Cota <cota@braap.org> wrote: > >> > This will enable us to decouple code translation from the value > >> > of parallel_cpus at any given time. It will also help us minimize > >> > TB flushes when generating code via EXCP_ATOMIC. > >> > > >> > Note that the declaration of parallel_cpus is brought to exec-all.h > >> > to be able to define there the "curr_cflags" inline. > >> > > >> > Signed-off-by: Emilio G. Cota <cota@braap.org> > >> > >> I was testing a winxp image today and I bisected a infinite loop to > >> this commit. The loop happens both with and without mttcg, so I think > >> it has got to do with something else. > > > > Can you test the below? It lets me boot ubuntu, otherwise it reliably > > chokes on a 'rep movsb' *very* early (doesn't even get to grub). > > > > This discusson on v2 might be relevant (I added CF_COUNT_MASK as a > > result of it, but it seems I have to revisit that): > > https://lists.gnu.org/archive/html/qemu-devel/2017-07/msg06456.html > > > > Anyway let me know if this fixes it for you. Thanks for testing! > > > > Yes, this fixes the issue for me. My quick-fix is just not to check those bits, but as you pointed out during v3's review the whole thing is a little bit fragile if we don't check them. Should we just go with the CF_NOCHAIN flag that you proposed? If so, do you want me to give that approach a go, or you prefer to do it yourself? Thanks, Emilio
On 09/22/2017 01:40 PM, Emilio G. Cota wrote: > Hi Richard, > > Are you planning to get this patchset merged in this window? If so, I can > give it a respin on top of the current master. Yes, I do. I've been intending to look at ... > Anyway, before doing so we should fix the issue around CF_COUNT_MASK that > Pranith reported: ... this one and figure out why my intuition is wrong. I'm at Linaro Connect this week, so it may have to wait til next. r~
On Mon, Sep 25, 2017 at 10:01:15 -0700, Richard Henderson wrote:
> On 09/22/2017 01:40 PM, Emilio G. Cota wrote:
> > Hi Richard,
> >
> > Are you planning to get this patchset merged in this window? If so, I can
> > give it a respin on top of the current master.
>
> Yes, I do. I've been intending to look at ...
>
> > Anyway, before doing so we should fix the issue around CF_COUNT_MASK that
> > Pranith reported:
>
> ... this one and figure out why my intuition is wrong.
> I'm at Linaro Connect this week, so it may have to wait til next.
I just tested this with icount. Turns out two fixups to this patchset are
necessary to not break icount. The first one is (again) to just
include CF_PARALLEL in the hash mask. The second is a fix for the comparison
function of tb_tc, without which removals don't work.
I'm including the fixups below.
Thanks,
Emilio
commit 7a899a8df2d769dd80ba1a7f559cb4ddbb0c568b
Author: Emilio G. Cota <cota@braap.org>
Date: Thu Oct 5 18:40:30 2017 -0400
FIXUP for "tcg: define CF_PARALLEL ..."
Signed-off-by: Emilio G. Cota <cota@braap.org>
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 025fae0..8b2f233 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -326,7 +326,7 @@ struct TranslationBlock {
#define CF_INVALID 0x80000 /* TB is stale. Setters must acquire tb_lock */
#define CF_PARALLEL 0x100000 /* Generate code for a parallel context */
/* cflags' mask for hashing/comparison */
-#define CF_HASH_MASK (CF_COUNT_MASK | CF_PARALLEL)
+#define CF_HASH_MASK (CF_PARALLEL)
/* Per-vCPU dynamic tracing state used to generate this TB */
uint32_t trace_vcpu_dstate;
commit c102c2409a5a134fd7f9ba61f69a5ae29df99c89
Author: Emilio G. Cota <cota@braap.org>
Date: Thu Oct 5 18:51:24 2017 -0400
FIXUP for "translate-all: use a binary search tree to ..."
Signed-off-by: Emilio G. Cota <cota@braap.org>
diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index 9f1faff..fe607ca 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -793,19 +793,19 @@ static gint tb_tc_cmp(gconstpointer ap, gconstpointer bp)
const struct tb_tc *b = bp;
/*
- * When both sizes are set, we know this isn't a lookup and therefore
- * the two buffers are non-overlapping.
+ * When both sizes are set, we know this isn't a lookup.
* This is the most likely case: every TB must be inserted; lookups
* are a lot less frequent.
*/
if (likely(a->size && b->size)) {
- /* a->ptr == b->ptr would mean the buffers overlap */
- g_assert(a->ptr != b->ptr);
-
if (a->ptr > b->ptr) {
return 1;
+ } else if (a->ptr < b->ptr) {
+ return -1;
}
- return -1;
+ /* a->ptr == b->ptr should happen only on deletions */
+ g_assert(a->size == b->size);
+ return 0;
}
/*
* All lookups have either .size field set to 0.
On Thu, Oct 05, 2017 at 19:24:16 -0400, Emilio G. Cota wrote: > I'm including the fixups below. Just pushed v5 of this series. I rebased the series on top of the current master, and added the already mentioned fixes plus a new one (see below). Grab the patches from: https://github.com/cota/qemu/tree/multi-tcg-v5 Changes since v4: - Rebase to current master. (fixed a few conflicts, mostly due to some targets having been converted to the generic translation loop.) - Add the removal of CF_COUNT_MASK as a separate "FIXUP" commit to signal that this still requires attention. - Fix tb TC comparison function to support deletions (otherwise -icount breaks) - [new fix] TCG regions: Rely on max_cpus instead of smp_cpus. Otherwise we'd break CPU hotplug on targets that support it. (max_cpus and smp_cpus are set from the command line with -smp foo,maxcpus=bar; max_cpus is always >= smp_cpus.) Thanks, Emilio
On 10/09/2017 12:24 PM, Emilio G. Cota wrote: > On Thu, Oct 05, 2017 at 19:24:16 -0400, Emilio G. Cota wrote: >> I'm including the fixups below. > > Just pushed v5 of this series. I rebased the series on top of the current master, > and added the already mentioned fixes plus a new one (see below). > > Grab the patches from: > https://github.com/cota/qemu/tree/multi-tcg-v5 Thanks. I was just about to do this rebase myself. > Changes since v4: > - Rebase to current master. (fixed a few conflicts, mostly due to some targets > having been converted to the generic translation loop.) > - Add the removal of CF_COUNT_MASK as a separate "FIXUP" commit to signal > that this still requires attention. > - Fix tb TC comparison function to support deletions (otherwise -icount breaks) > - [new fix] TCG regions: Rely on max_cpus instead of smp_cpus. Otherwise we'd > break CPU hotplug on targets that support it. (max_cpus and smp_cpus are > set from the command line with -smp foo,maxcpus=bar; max_cpus is always > >= smp_cpus.) Excellent. I'll work on reviewing these changes this week. In the meantime, I've cherry-picked about half of the patch set. r~
© 2016 - 2026 Red Hat, Inc.