[PATCH v2 31/54] accel/tcg: Always use IntervalTree for code lookups

Posted by Richard Henderson 1 week, 2 days ago
Because translation is special, we don't need the speed
of the direct-mapped softmmu tlb.  We cache the lookup in
DisasContextBase within the translator loop anyway.

Drop the addr_code comparator from CPUTLBEntry.
Go directly to the IntervalTree for MMU_INST_FETCH.
Derive exec flags from read flags.
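
To make the dispatch concrete, here is a minimal standalone C sketch of
the idea, not QEMU code: every name below (PageEntry, page_map_find,
fast_tlb, the FLAG_* bits) is invented for illustration.  Data accesses
probe a small direct-mapped table and fall back to a flat page map that
stands in for the IntervalTree; instruction fetches skip the table
entirely and filter their flags out of the read-side entry, the same
way TLB_EXEC_FLAGS_MASK is used in the patch below.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_BITS        12
#define PAGE_MASK        (~(((uint64_t)1 << PAGE_BITS) - 1))
#define FAST_TLB_BITS    3
#define FAST_TLB_SIZE    (1 << FAST_TLB_BITS)

#define FLAG_MMIO        0x1          /* stand-in for TLB_MMIO */
#define FLAG_NOTDIRTY    0x2          /* read-side flag, irrelevant to exec */
#define EXEC_FLAGS_MASK  FLAG_MMIO    /* stand-in for TLB_EXEC_FLAGS_MASK */

typedef struct {
    uint64_t addr_read;    /* page | read flags; no separate addr_code */
    uint64_t addr_write;
    bool exec;             /* PAGE_EXEC recorded in the full entry */
} PageEntry;

/* Direct-mapped fast table, probed only for data accesses. */
static PageEntry fast_tlb[FAST_TLB_SIZE];

/* Flat array standing in for the per-mmuidx IntervalTree. */
static PageEntry page_map[16];
static size_t page_map_len;

static PageEntry *page_map_find(uint64_t addr)
{
    uint64_t page = addr & PAGE_MASK;

    for (size_t i = 0; i < page_map_len; i++) {
        if ((page_map[i].addr_read & PAGE_MASK) == page) {
            return &page_map[i];
        }
    }
    return NULL;
}

/* Instruction fetch: skip the fast table, consult the page map directly. */
static bool lookup_code(uint64_t addr, int *flags)
{
    PageEntry *e = page_map_find(addr);

    if (!e || !e->exec) {
        return false;      /* the real code would call tlb_fill_align here */
    }
    /* Derive exec flags by filtering the read flags. */
    *flags = e->addr_read & EXEC_FLAGS_MASK;
    return true;
}

/* Data access: fast direct-mapped probe, page map as the fallback. */
static bool lookup_data(uint64_t addr, bool is_write, int *flags)
{
    size_t idx = (addr >> PAGE_BITS) & (FAST_TLB_SIZE - 1);
    PageEntry *e = &fast_tlb[idx];
    uint64_t cmp = is_write ? e->addr_write : e->addr_read;

    if ((cmp & PAGE_MASK) != (addr & PAGE_MASK)) {
        PageEntry *n = page_map_find(addr);
        if (!n) {
            return false;
        }
        *e = *n;           /* install the cached entry in the fast table */
        cmp = is_write ? e->addr_write : e->addr_read;
    }
    *flags = cmp & ~PAGE_MASK;
    return true;
}

int main(void)
{
    int flags = 0;
    bool ok;

    page_map[0] = (PageEntry){ .addr_read = 0x4000 | FLAG_NOTDIRTY,
                               .addr_write = 0x4000, .exec = true };
    page_map_len = 1;

    ok = lookup_code(0x4123, &flags);
    printf("code hit %d, flags %#x\n", ok, flags);  /* 0: NOTDIRTY filtered */
    ok = lookup_data(0x4123, false, &flags);
    printf("data hit %d, flags %#x\n", ok, flags);
    return 0;
}

The point of the split is that the fast table only ever needs read and
write comparators, which is what lets addr_code and the MMU_INST_FETCH
slot in slow_flags go away.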

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/exec/cpu-all.h    |  3 ++
 include/exec/tlb-common.h |  5 ++-
 accel/tcg/cputlb.c        | 76 ++++++++++++++++++++++++---------------
 3 files changed, 52 insertions(+), 32 deletions(-)

diff --git a/include/exec/cpu-all.h b/include/exec/cpu-all.h
index 45e6676938..ad160c328a 100644
--- a/include/exec/cpu-all.h
+++ b/include/exec/cpu-all.h
@@ -339,6 +339,9 @@ static inline int cpu_mmu_index(CPUState *cs, bool ifetch)
     (TLB_INVALID_MASK | TLB_NOTDIRTY | TLB_MMIO \
     | TLB_FORCE_SLOW | TLB_DISCARD_WRITE)
 
+/* Filter read flags to exec flags. */
+#define TLB_EXEC_FLAGS_MASK  (TLB_MMIO)
+
 /*
  * Flags stored in CPUTLBEntryFull.slow_flags[x].
  * TLB_FORCE_SLOW must be set in CPUTLBEntry.addr_idx[x].
diff --git a/include/exec/tlb-common.h b/include/exec/tlb-common.h
index 300f9fae67..feaa471299 100644
--- a/include/exec/tlb-common.h
+++ b/include/exec/tlb-common.h
@@ -26,7 +26,6 @@ typedef union CPUTLBEntry {
     struct {
         uint64_t addr_read;
         uint64_t addr_write;
-        uint64_t addr_code;
         /*
          * Addend to virtual address to get host address.  IO accesses
          * use the corresponding iotlb value.
@@ -35,7 +34,7 @@ typedef union CPUTLBEntry {
     };
     /*
      * Padding to get a power of two size, as well as index
-     * access to addr_{read,write,code}.
+     * access to addr_{read,write}.
      */
     uint64_t addr_idx[(1 << CPU_TLB_ENTRY_BITS) / sizeof(uint64_t)];
 } CPUTLBEntry;
@@ -92,7 +91,7 @@ struct CPUTLBEntryFull {
      * Additional tlb flags for use by the slow path. If non-zero,
      * the corresponding CPUTLBEntry comparator must have TLB_FORCE_SLOW.
      */
-    uint8_t slow_flags[MMU_ACCESS_COUNT];
+    uint8_t slow_flags[2];
 
     /*
      * Allow target-specific additions to this structure.
diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 981098a6f2..be2ea1bc70 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -114,8 +114,9 @@ static inline uint64_t tlb_read_idx(const CPUTLBEntry *entry,
                       MMU_DATA_LOAD * sizeof(uint64_t));
     QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_write) !=
                       MMU_DATA_STORE * sizeof(uint64_t));
-    QEMU_BUILD_BUG_ON(offsetof(CPUTLBEntry, addr_code) !=
-                      MMU_INST_FETCH * sizeof(uint64_t));
+
+    tcg_debug_assert(access_type == MMU_DATA_LOAD ||
+                     access_type == MMU_DATA_STORE);
 
 #if TARGET_LONG_BITS == 32
     /* Use qatomic_read, in case of addr_write; only care about low bits. */
@@ -480,8 +481,7 @@ static bool tlb_hit_page_mask_anyprot(CPUTLBEntry *tlb_entry,
     mask &= TARGET_PAGE_MASK | TLB_INVALID_MASK;
 
     return (page == (tlb_entry->addr_read & mask) ||
-            page == (tlb_addr_write(tlb_entry) & mask) ||
-            page == (tlb_entry->addr_code & mask));
+            page == (tlb_addr_write(tlb_entry) & mask));
 }
 
 /* Called with tlb_c.lock held */
@@ -1184,9 +1184,6 @@ void tlb_set_page_full(CPUState *cpu, int mmu_idx,
     /* Now calculate the new entry */
     node->copy.addend = addend - addr_page;
 
-    tlb_set_compare(full, &node->copy, addr_page, read_flags,
-                    MMU_INST_FETCH, prot & PAGE_EXEC);
-
     if (wp_flags & BP_MEM_READ) {
         read_flags |= TLB_WATCHPOINT;
     }
@@ -1308,22 +1305,30 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
     /* Primary lookup in the fast tlb. */
     entry = tlbfast_entry(fast, addr);
     full = &desc->fulltlb[tlbfast_index(fast, addr)];
-    cmp = tlb_read_idx(entry, access_type);
-    if (tlb_hit(cmp, addr)) {
-        goto found;
+    if (access_type != MMU_INST_FETCH) {
+        cmp = tlb_read_idx(entry, access_type);
+        if (tlb_hit(cmp, addr)) {
+            goto found_data;
+        }
     }
 
     /* Secondary lookup in the IntervalTree. */
     node = tlbtree_lookup_addr(desc, addr);
     if (node) {
-        cmp = tlb_read_idx(&node->copy, access_type);
-        if (tlb_hit(cmp, addr)) {
-            /* Install the cached entry. */
-            qemu_spin_lock(&cpu->neg.tlb.c.lock);
-            copy_tlb_helper_locked(entry, &node->copy);
-            qemu_spin_unlock(&cpu->neg.tlb.c.lock);
-            *full = node->full;
-            goto found;
+        if (access_type == MMU_INST_FETCH) {
+            if (node->full.prot & PAGE_EXEC) {
+                goto found_code;
+            }
+        } else {
+            cmp = tlb_read_idx(&node->copy, access_type);
+            if (tlb_hit(cmp, addr)) {
+                /* Install the cached entry. */
+                qemu_spin_lock(&cpu->neg.tlb.c.lock);
+                copy_tlb_helper_locked(entry, &node->copy);
+                qemu_spin_unlock(&cpu->neg.tlb.c.lock);
+                *full = node->full;
+                goto found_data;
+            }
         }
     }
 
@@ -1333,9 +1338,14 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
         tcg_debug_assert(probe);
         return false;
     }
-
     o->did_tlb_fill = true;
 
+    if (access_type == MMU_INST_FETCH) {
+        node = tlbtree_lookup_addr(desc, addr);
+        tcg_debug_assert(node);
+        goto found_code;
+    }
+
     entry = tlbfast_entry(fast, addr);
     full = &desc->fulltlb[tlbfast_index(fast, addr)];
     cmp = tlb_read_idx(entry, access_type);
@@ -1345,14 +1355,29 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
      * called tlb_fill_align, so we know that this entry *is* valid.
      */
     flags &= ~TLB_INVALID_MASK;
+    goto found_data;
+
+ found_data:
+    flags &= cmp;
+    flags |= full->slow_flags[access_type];
+    o->flags = flags;
+    o->full = *full;
+    o->haddr = (void *)((uintptr_t)addr + entry->addend);
     goto done;
 
- found:
-    /* Alignment has not been checked by tlb_fill_align. */
-    {
+ found_code:
+    o->flags = node->copy.addr_read & TLB_EXEC_FLAGS_MASK;
+    o->full = node->full;
+    o->haddr = (void *)((uintptr_t)addr + node->copy.addend);
+    goto done;
+
+ done:
+    if (!o->did_tlb_fill) {
         int a_bits = memop_alignment_bits(memop);
 
         /*
+         * Alignment has not been checked by tlb_fill_align.
+         *
          * The TLB_CHECK_ALIGNED check differs from the normal alignment
          * check, in that this is based on the atomicity of the operation.
          * The intended use case is the ARM memory type field of each PTE,
@@ -1366,13 +1391,6 @@ static bool tlb_lookup(CPUState *cpu, TLBLookupOutput *o,
             cpu_unaligned_access(cpu, addr, access_type, i->mmu_idx, i->ra);
         }
     }
-
- done:
-    flags &= cmp;
-    flags |= full->slow_flags[access_type];
-    o->flags = flags;
-    o->full = *full;
-    o->haddr = (void *)((uintptr_t)addr + entry->addend);
     return true;
 }
 
-- 
2.43.0
Re: [PATCH v2 31/54] accel/tcg: Always use IntervalTree for code lookups
Posted by Pierrick Bouvier 1 week, 1 day ago
On 11/14/24 08:01, Richard Henderson wrote:
> Because translation is special, we don't need the speed
> of the direct-mapped softmmu tlb.  We cache the lookup in
> DisasContextBase within the translator loop anyway.
> 
> Drop the addr_code comparator from CPUTLBEntry.
> Go directly to the IntervalTree for MMU_INST_FETCH.
> Derive exec flags from read flags.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>