[v3] Relax code buffer size limitation on aarch64 hosts

[Qemu-devel] [PATCH v3 2/3] tcg/aarch64: Use ADRP+ADD to compute target address

Posted by Pranith Kumar 8 years, 7 months ago

We use ADRP+ADD to compute the target address for goto_tb. This patch
introduces the NOP instruction which is used to align the above
instruction pair so that we can use one atomic instruction to patch
the destination offsets.

CC: Richard Henderson <rth@twiddle.net>
CC: Alex Bennée <alex.bennee@linaro.org>
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 accel/tcg/translate-all.c    |  2 +-
 tcg/aarch64/tcg-target.inc.c | 34 +++++++++++++++++++++++++++++-----
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/accel/tcg/translate-all.c b/accel/tcg/translate-all.c
index f6ad46b613..65a92dbf67 100644
--- a/accel/tcg/translate-all.c
+++ b/accel/tcg/translate-all.c
@@ -522,7 +522,7 @@ static inline PageDesc *page_find(tb_page_addr_t index)
 #elif defined(__powerpc__)
 # define MAX_CODE_GEN_BUFFER_SIZE  (32u * 1024 * 1024)
 #elif defined(__aarch64__)
-# define MAX_CODE_GEN_BUFFER_SIZE  (128ul * 1024 * 1024)
+# define MAX_CODE_GEN_BUFFER_SIZE  (2ul * 1024 * 1024 * 1024)
 #elif defined(__s390x__)
   /* We have a +- 4GB range on the branches; leave some slop.  */
 # define MAX_CODE_GEN_BUFFER_SIZE  (3ul * 1024 * 1024 * 1024)
diff --git a/tcg/aarch64/tcg-target.inc.c b/tcg/aarch64/tcg-target.inc.c
index 8fce11ace7..f059d9d781 100644
--- a/tcg/aarch64/tcg-target.inc.c
+++ b/tcg/aarch64/tcg-target.inc.c
@@ -372,6 +372,7 @@ typedef enum {
     I3510_EON       = 0x4a200000,
     I3510_ANDS      = 0x6a000000,
 
+    NOP             = 0xd503201f,
     /* System instructions.  */
     DMB_ISH         = 0xd50338bf,
     DMB_LD          = 0x00000100,
@@ -866,10 +867,26 @@ static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *target)
 void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
 {
     tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
-    tcg_insn_unit *target = (tcg_insn_unit *)addr;
+    tcg_insn_unit i1, i2;
+    uint64_t pair;
 
-    reloc_pc26_atomic(code_ptr, target);
-    flush_icache_range(jmp_addr, jmp_addr + 4);
+    ptrdiff_t offset = addr - jmp_addr;
+
+    if (offset == sextract64(offset, 0, 26)) {
+        i1 = NOP;
+        i2 = I3206_B | ((offset >> 2) & 0x3ffffff);
+    } else {
+        offset = (addr >> 12) - (jmp_addr >> 12);
+
+        /* patch ADRP */
+        i2 = deposit32(*code_ptr++, 29, 2, offset & 0x3);
+        i2 = deposit32(i2, 5, 19, offset >> 2);
+        /* patch ADDI */
+        i1 = deposit32(*code_ptr, 10, 12, addr & 0xfff);
+    }
+    pair = (uint64_t)i1 << 32 | i2;
+    atomic_set((uint64_t *)jmp_addr, pair);
+    flush_icache_range(jmp_addr, jmp_addr + 8);
 }
 
 static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
@@ -1388,10 +1405,17 @@ static void tcg_out_op(TCGContext *s, TCGOpcode opc,
 #endif
         /* consistency for USE_DIRECT_JUMP */
         tcg_debug_assert(s->tb_jmp_insn_offset != NULL);
+        /* Ensure that ADRP+ADD are 8-byte aligned so that an atomic
+           write can be used to patch the target address. */
+        if ((uintptr_t)s->code_ptr & 7) {
+            tcg_out32(s, NOP);
+        }
         s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
         /* actual branch destination will be patched by
-           aarch64_tb_set_jmp_target later, beware retranslation. */
-        tcg_out_goto_noaddr(s);
+           aarch64_tb_set_jmp_target later. */
+        tcg_out_insn(s, 3406, ADRP, TCG_REG_TMP, 0);
+        tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_TMP, TCG_REG_TMP, 0);
+        tcg_out_insn(s, 3207, BR, TCG_REG_TMP);
         s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
         break;
 
-- 
2.13.0

Re: [Qemu-devel] [PATCH v3 2/3] tcg/aarch64: Use ADRP+ADD to compute target address

Posted by Richard Henderson 8 years, 7 months ago

On 06/29/2017 05:40 PM, Pranith Kumar wrote:
>   void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
>   {
>       tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
> -    tcg_insn_unit *target = (tcg_insn_unit *)addr;
> +    tcg_insn_unit i1, i2;
> +    uint64_t pair;
>   
> +    ptrdiff_t offset = addr - jmp_addr;
> +
> +    if (offset == sextract64(offset, 0, 26)) {
> +        i1 = NOP;
> +        i2 = I3206_B | ((offset >> 2) & 0x3ffffff);

Branch first, since that's the offset you calculated.
Also, the nop need not be executed.

> +    } else {
> +        offset = (addr >> 12) - (jmp_addr >> 12);
> +
> +        /* patch ADRP */
> +        i2 = deposit32(*code_ptr++, 29, 2, offset & 0x3);
> +        i2 = deposit32(i2, 5, 19, offset >> 2);
> +        /* patch ADDI */
> +        i1 = deposit32(*code_ptr, 10, 12, addr & 0xfff);

You can't just patch these insns, because they aren't necessarily ADRP+ADD. 
Indeed, they will very likely be B and NOP.  The first address we patch in is 
tb_jmp_reset_offset, which is the following opcode, which is definitely in 
range of the branch above.

r~

Re: [Qemu-devel] [PATCH v3 2/3] tcg/aarch64: Use ADRP+ADD to compute target address

Posted by Pranith Kumar 8 years, 7 months ago

On Fri, Jun 30, 2017 at 12:47 AM, Richard Henderson <rth@twiddle.net> wrote:
> On 06/29/2017 05:40 PM, Pranith Kumar wrote:
>>
>>   void aarch64_tb_set_jmp_target(uintptr_t jmp_addr, uintptr_t addr)
>>   {
>>       tcg_insn_unit *code_ptr = (tcg_insn_unit *)jmp_addr;
>> -    tcg_insn_unit *target = (tcg_insn_unit *)addr;
>> +    tcg_insn_unit i1, i2;
>> +    uint64_t pair;
>>
>> +    ptrdiff_t offset = addr - jmp_addr;
>> +
>> +    if (offset == sextract64(offset, 0, 26)) {
>> +        i1 = NOP;
>> +        i2 = I3206_B | ((offset >> 2) & 0x3ffffff);
>
>
> Branch first, since that's the offset you calculated.
> Also, the nop need not be executed.

This is exactly how I form the instruction pair below (B+NOP, not
NOP+B). But I get your point. It is confusing to use i1 for the second
instruction. I'll change it.

>
>> +    } else {
>> +        offset = (addr >> 12) - (jmp_addr >> 12);
>> +
>> +        /* patch ADRP */
>> +        i2 = deposit32(*code_ptr++, 29, 2, offset & 0x3);
>> +        i2 = deposit32(i2, 5, 19, offset >> 2);
>> +        /* patch ADDI */
>> +        i1 = deposit32(*code_ptr, 10, 12, addr & 0xfff);
>
>
> You can't just patch these insns, because they aren't necessarily ADRP+ADD.
> Indeed, they will very likely be B and NOP.  The first address we patch in
> is tb_jmp_reset_offset, which is the following opcode, which is definitely
> in range of the branch above.

Whoops, I totally missed that we patch these out the very first time. I
will explicitly generate the ADRP+ADD pair from here.

Thanks,
-- 
Pranith