include/exec/cpu-defs.h | 6 +++--- tcg/aarch64/tcg-target.h | 1 + tcg/arm/tcg-target.h | 1 + tcg/i386/tcg-target.h | 2 ++ tcg/ia64/tcg-target.h | 1 + tcg/mips/tcg-target.h | 2 ++ tcg/ppc/tcg-target.h | 1 + tcg/s390/tcg-target.h | 1 + tcg/sparc/tcg-target.h | 1 + tcg/tci/tcg-target.h | 1 + 10 files changed, 14 insertions(+), 3 deletions(-)
This patch increases the number of entries cached in the TLB. I went
over a few architectures to see if increasing it is problematic. Only
armv6 seems to have a limitation that only 8 bits can be used for
indexing these entries. For other architectures, the number of TLB
entries is increased to a 4K-sized cache. The patch also doubles the
number of victim TLB entries.
Some statistics collected from a build benchmark for various cache
sizes is listed below:
| TLB bits\vTLB entires |             8 |            16  |            32 |
|                     8 | 952.94(+0.0%) | 929.99(+2.4%)  | 919.02(+3.6%) |
|                    10 | 898.92(+5.6%) | 886.13(+7.0%)  | 887.03(+6.9%) |
|                    12 | 878.56(+7.8%) | 873.53(+8.3%)* | 875.34(+8.1%) |
The best combination for this workload came out to be 12 bits for the
TLB and a 16 entry vTLB cache.
Signed-off-by: Pranith Kumar <bobby.prani@gmail.com>
---
 include/exec/cpu-defs.h  | 6 +++---
 tcg/aarch64/tcg-target.h | 1 +
 tcg/arm/tcg-target.h     | 1 +
 tcg/i386/tcg-target.h    | 2 ++
 tcg/ia64/tcg-target.h    | 1 +
 tcg/mips/tcg-target.h    | 2 ++
 tcg/ppc/tcg-target.h     | 1 +
 tcg/s390/tcg-target.h    | 1 +
 tcg/sparc/tcg-target.h   | 1 +
 tcg/tci/tcg-target.h     | 1 +
 10 files changed, 14 insertions(+), 3 deletions(-)
diff --git a/include/exec/cpu-defs.h b/include/exec/cpu-defs.h
index bc8e7f848d..1957e3f32c 100644
--- a/include/exec/cpu-defs.h
+++ b/include/exec/cpu-defs.h
@@ -57,8 +57,8 @@ typedef uint64_t target_ulong;
 #endif
 
 #if !defined(CONFIG_USER_ONLY) && defined(CONFIG_TCG)
-/* use a fully associative victim tlb of 8 entries */
-#define CPU_VTLB_SIZE 8
+/* use a fully associative victim tlb of 16 entries */
+#define CPU_VTLB_SIZE 16
 
 #if HOST_LONG_BITS == 32 && TARGET_LONG_BITS == 32
 #define CPU_TLB_ENTRY_BITS 4
@@ -89,7 +89,7 @@ typedef uint64_t target_ulong;
  * of tlb_table inside env (which is non-trivial but not huge).
  */
 #define CPU_TLB_BITS                                             \
-    MIN(8,                                                       \
+    MIN(MIN(12, TCG_TARGET_TLB_MAX_INDEX_BITS),                  \
         TCG_TARGET_TLB_DISPLACEMENT_BITS - CPU_TLB_ENTRY_BITS -  \
         (NB_MMU_MODES <= 1 ? 0 :                                 \
          NB_MMU_MODES <= 2 ? 1 :                                 \
diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index b41a248bee..9f4558cd83 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -15,6 +15,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE  4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 24
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 32
 #undef TCG_TARGET_STACK_GROWSUP
 
 typedef enum {
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index a38be15a39..ebe27991f3 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -60,6 +60,7 @@ extern int arm_arch;
 #undef TCG_TARGET_STACK_GROWSUP
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 8
 
 typedef enum {
     TCG_REG_R0 = 0,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 73a15f7e80..5279af6eb1 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -162,6 +162,8 @@ extern bool have_popcnt;
 # define TCG_AREG0 TCG_REG_EBP
 #endif
 
+#define TCG_TARGET_TLB_MAX_INDEX_BITS (32 - CPU_TLB_ENTRY_BITS)
+
 static inline void flush_icache_range(uintptr_t start, uintptr_t stop)
 {
 }
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index 8f475fe742..35878e20c7 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -28,6 +28,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 16
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 21
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 32
 
 typedef struct {
     uint64_t lo __attribute__((aligned(16)));
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index e9558d15bc..1b60e53169 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -39,6 +39,8 @@
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
 #define TCG_TARGET_NB_REGS 32
 
+#define TCG_TARGET_TLB_MAX_INDEX_BITS (16 - CPU_TLB_ENTRY_BITS)
+
 typedef enum {
     TCG_REG_ZERO = 0,
     TCG_REG_AT,
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index 5a092b038a..82e10c9471 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -34,6 +34,7 @@
 #define TCG_TARGET_NB_REGS 32
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 16
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 32
 
 typedef enum {
     TCG_REG_R0,  TCG_REG_R1,  TCG_REG_R2,  TCG_REG_R3,
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index dc0e59193c..57f0e22532 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -27,6 +27,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 2
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 19
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 32
 
 typedef enum TCGReg {
     TCG_REG_R0 = 0,
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index 4515c9ab48..378d218923 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -29,6 +29,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE 4
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 12
 #define TCG_TARGET_NB_REGS 32
 
 typedef enum {
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 06963288dc..bb36e8197d 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -43,6 +43,7 @@
 #define TCG_TARGET_INTERPRETER 1
 #define TCG_TARGET_INSN_UNIT_SIZE 1
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 32
+#define TCG_TARGET_TLB_MAX_INDEX_BITS 32
 
 #if UINTPTR_MAX == UINT32_MAX
 # define TCG_TARGET_REG_BITS 32
-- 
2.13.0
                
            On 08/29/2017 10:23 AM, Pranith Kumar wrote: > This patch increases the number of entries cached in the TLB. I went > over a few architectures to see if increasing it is problematic. Only > armv6 seems to have a limitation that only 8 bits can be used for > indexing these entries. For other architectures, the number of TLB > entries is increased to a 4K-sized cache. The patch also doubles the > number of victim TLB entries. > > Some statistics collected from a build benchmark for various cache > sizes is listed below: > > | TLB bits\vTLB entires | 8 | 16 | 32 | > | 8 | 952.94(+0.0%) | 929.99(+2.4%) | 919.02(+3.6%) | > | 10 | 898.92(+5.6%) | 886.13(+7.0%) | 887.03(+6.9%) | > | 12 | 878.56(+7.8%) | 873.53(+8.3%)* | 875.34(+8.1%) | > > The best combination for this workload came out to be 12 bits for the > TLB and a 16 entry vTLB cache. > > Signed-off-by: Pranith Kumar <bobby.prani@gmail.com> > --- > include/exec/cpu-defs.h | 6 +++--- > tcg/aarch64/tcg-target.h | 1 + > tcg/arm/tcg-target.h | 1 + > tcg/i386/tcg-target.h | 2 ++ > tcg/ia64/tcg-target.h | 1 + > tcg/mips/tcg-target.h | 2 ++ > tcg/ppc/tcg-target.h | 1 + > tcg/s390/tcg-target.h | 1 + > tcg/sparc/tcg-target.h | 1 + > tcg/tci/tcg-target.h | 1 + > 10 files changed, 14 insertions(+), 3 deletions(-) Reviewed-by: Richard Henderson <richard.henderson@linaro.org> r~
On 08/29/2017 10:23 AM, Pranith Kumar wrote: > This patch increases the number of entries cached in the TLB. I went > over a few architectures to see if increasing it is problematic. Only > armv6 seems to have a limitation that only 8 bits can be used for > indexing these entries. For other architectures, the number of TLB > entries is increased to a 4K-sized cache. The patch also doubles the > number of victim TLB entries. > > Some statistics collected from a build benchmark for various cache > sizes is listed below: > > | TLB bits\vTLB entires | 8 | 16 | 32 | > | 8 | 952.94(+0.0%) | 929.99(+2.4%) | 919.02(+3.6%) | > | 10 | 898.92(+5.6%) | 886.13(+7.0%) | 887.03(+6.9%) | > | 12 | 878.56(+7.8%) | 873.53(+8.3%)* | 875.34(+8.1%) | > > The best combination for this workload came out to be 12 bits for the > TLB and a 16 entry vTLB cache. This significantly degrades performance of alpha-softmmu. It spends about 25% of all cpu time in memset. r~
On Tue, Sep 5, 2017 at 5:50 PM, Richard Henderson <rth@twiddle.net> wrote: > On 08/29/2017 10:23 AM, Pranith Kumar wrote: >> This patch increases the number of entries cached in the TLB. I went >> over a few architectures to see if increasing it is problematic. Only >> armv6 seems to have a limitation that only 8 bits can be used for >> indexing these entries. For other architectures, the number of TLB >> entries is increased to a 4K-sized cache. The patch also doubles the >> number of victim TLB entries. >> >> Some statistics collected from a build benchmark for various cache >> sizes is listed below: >> >> | TLB bits\vTLB entires | 8 | 16 | 32 | >> | 8 | 952.94(+0.0%) | 929.99(+2.4%) | 919.02(+3.6%) | >> | 10 | 898.92(+5.6%) | 886.13(+7.0%) | 887.03(+6.9%) | >> | 12 | 878.56(+7.8%) | 873.53(+8.3%)* | 875.34(+8.1%) | >> >> The best combination for this workload came out to be 12 bits for the >> TLB and a 16 entry vTLB cache. > > This significantly degrades performance of alpha-softmmu. > It spends about 25% of all cpu time in memset. What workload does it degrade for? I will try to reproduce and see which memset is causing this. -- Pranith
On 09/05/2017 05:33 PM, Pranith Kumar wrote: >> This significantly degrades performance of alpha-softmmu. >> It spends about 25% of all cpu time in memset. > > What workload does it degrade for? I will try to reproduce and see > which memset is causing this. emerge --update --ask @world which is a lot of python, iirc. It is the tlb flush memset that is causing this. The tlb has grown from a few kB to 300kB with your patch. r~
Richard Henderson <rth@twiddle.net> writes: > On 09/05/2017 05:33 PM, Pranith Kumar wrote: >>> This significantly degrades performance of alpha-softmmu. >>> It spends about 25% of all cpu time in memset. >> >> What workload does it degrade for? I will try to reproduce and see >> which memset is causing this. > > emerge --update --ask @world > > which is a lot of python, iirc. > > It is the tlb flush memset that is causing this. > The tlb has grown from a few kB to 300kB with your patch. Hmm I wonder if this is QEMU doing global flushes which could actually be partial TLB flushes? emerge is quite a memory hog though so its conceivable we are generating a lot of churn as it runs. -- Alex Bennée
© 2016 - 2025 Red Hat, Inc.