Cota" To: qemu-devel@nongnu.org Date: Fri, 12 Oct 2018 15:04:34 -0400 Message-Id: <20181012190434.19477-4-cota@braap.org> X-Mailer: git-send-email 2.17.1 In-Reply-To: <20181012190434.19477-1-cota@braap.org> References: <20181012190434.19477-1-cota@braap.org> X-detected-operating-system: by eggs.gnu.org: GNU/Linux 2.2.x-3.x [generic] X-Received-From: 66.111.4.27 Subject: [Qemu-devel] [PATCH v4 3/3] tcg/i386: enable dynamic TLB sizing X-BeenThere: qemu-devel@nongnu.org X-Mailman-Version: 2.1.21 Precedence: list List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Cc: =?UTF-8?q?Alex=20Benn=C3=A9e?= , Richard Henderson Errors-To: qemu-devel-bounces+importer=patchew.org@nongnu.org Sender: "Qemu-devel" X-ZohoMail: RSF_0 Z_629925259 SPT_0 Content-Transfer-Encoding: quoted-printable MIME-Version: 1.0 Content-Type: text/plain; charset="utf-8" As the following experiments show, this a net perf gain, particularly for memory-heavy workloads. Experiments are run on an Intel i7-6700K CPU @ 4.00GHz. 1. System boot + shudown, debian aarch64: - Before (tb-lock-v3): Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 run= s): 7469.363393 task-clock (msec) # 0.998 CPUs utilized = ( +- 0.07% ) 31,507,707,190 cycles # 4.218 GHz = ( +- 0.07% ) 57,101,577,452 instructions # 1.81 insns per cycl= e ( +- 0.08% ) 10,265,531,804 branches # 1374.352 M/sec = ( +- 0.07% ) 173,020,681 branch-misses # 1.69% of all branche= s ( +- 0.10% ) 7.483359063 seconds time elapsed = ( +- 0.08% ) - After: Performance counter stats for 'taskset -c 0 ../img/aarch64/die.sh' (10 run= s): 7185.036730 task-clock (msec) # 0.999 CPUs utilized = ( +- 0.11% ) 30,303,501,143 cycles # 4.218 GHz = ( +- 0.11% ) 54,198,386,487 instructions # 1.79 insns per cycl= e ( +- 0.08% ) 9,726,518,945 branches # 1353.719 M/sec = ( +- 0.08% ) 167,082,307 branch-misses # 1.72% of all branche= s ( +- 0.08% ) 7.195597842 seconds time elapsed = ( +- 0.11% ) That is, a 3.8% improvement. 2. System boot + shutdown, ubuntu 18.04 x86_64: - Before (tb-lock-v3): Performance counter stats for 'taskset -c 0 ../img/x86_64/ubuntu-die.sh -no= graphic' (2 runs): 49971.036482 task-clock (msec) # 0.999 CPUs utilized = ( +- 1.62% ) 210,766,077,140 cycles # 4.218 GHz = ( +- 1.63% ) 428,829,830,790 instructions # 2.03 insns per cycl= e ( +- 0.75% ) 77,313,384,038 branches # 1547.164 M/sec = ( +- 0.54% ) 835,610,706 branch-misses # 1.08% of all branche= s ( +- 2.97% ) 50.003855102 seconds time elapsed = ( +- 1.61% ) - After: Performance counter stats for 'taskset -c 0 ../img/x86_64/ubuntu-die.sh -n= ographic' (2 runs): 50118.124477 task-clock (msec) # 0.999 CPUs utilized = ( +- 4.30% ) 132,396 context-switches # 0.003 M/sec = ( +- 1.20% ) 0 cpu-migrations # 0.000 K/sec = ( +-100.00% ) 167,754 page-faults # 0.003 M/sec = ( +- 0.06% ) 211,414,701,601 cycles # 4.218 GHz = ( +- 4.30% ) stalled-cycles-frontend stalled-cycles-backend 431,618,818,597 instructions # 2.04 insns per cycl= e ( +- 6.40% ) 80,197,256,524 branches # 1600.165 M/sec = ( +- 8.59% ) 794,830,352 branch-misses # 0.99% of all branche= s ( +- 2.05% ) 50.177077175 seconds time elapsed = ( +- 4.23% ) No improvement (within noise range). 3. 
3. x86_64 SPEC06int:

  SPEC06int (test set)
  [ Y axis: speedup over master; bars: tlb-lock-v3, +indirection, +resizing ]
  [ plot: see the png link below ]

              png: https://imgur.com/a/b1wn3wc

That is, a 1.53x average speedup over master, with a max speedup of 7.13x.

Note that "indirection" (i.e. the "cputlb: introduce indirection for TLB size"
patch in this series) incurs no overhead, on average.

To conclude, here is a different look at the SPEC06int results, using
linux-user as the baseline and comparing master and this series ("tlb-dyn"):

  Softmmu slowdown vs. linux-user for SPEC06int (test set)
  [ Y axis: slowdown over linux-user; bars: master, tlb-dyn ]
  [ plot: see the png link below ]

              png: https://imgur.com/a/eXkjMCE

After this series, we bring down the average softmmu overhead from 2.77x
to 1.80x, with a maximum slowdown of 2.48x (omnetpp).
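
For reference, the sketch below gives a rough C-level view of the lookup that
the emitted fast path computes after this patch. The tlb_mask[] and tlb_table[]
fields are the ones introduced by the indirection/resizing patches earlier in
this series; the helper itself (tlb_entry_for) is illustrative only and assumes
that tlb_mask holds (n_entries - 1) << CPU_TLB_ENTRY_BITS, i.e. a byte-offset
mask into the table:

    /* Illustrative sketch only -- mirrors the shr/and/add sequence that
     * tcg_out_tlb_load() emits after this patch. */
    static inline CPUTLBEntry *tlb_entry_for(CPUArchState *env, int mem_index,
                                             target_ulong addr)
    {
        /* shr r0, TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS */
        uintptr_t ofs = addr >> (TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);

        /* and r0, tlb_mask[mem_index](env): the mask scales with the
         * current TLB size, so resizing needs no retranslation. */
        ofs &= env->tlb_mask[mem_index];

        /* add r0, tlb_table[mem_index](env): the table base is loaded from
         * env because the table can be reallocated on resize. */
        return (CPUTLBEntry *)((uintptr_t)env->tlb_table[mem_index] + ofs);
    }

Since r0 now points at the start of the CPUTLBEntry rather than at the
comparator selected by 'which', the tag compare moves from offset 0 to offset
'which', and the addend load uses offsetof(CPUTLBEntry, addend) directly.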
Signed-off-by: Emilio G. Cota <cota@braap.org>
---
 tcg/i386/tcg-target.h     |  2 +-
 tcg/i386/tcg-target.inc.c | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 9e4bfa90d1..8b6475d786 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -27,7 +27,7 @@
 
 #define TCG_TARGET_INSN_UNIT_SIZE  1
 #define TCG_TARGET_TLB_DISPLACEMENT_BITS 31
-#define TCG_TARGET_IMPLEMENTS_DYN_TLB 0
+#define TCG_TARGET_IMPLEMENTS_DYN_TLB 1
 
 #ifdef __x86_64__
 # define TCG_TARGET_REG_BITS  64
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 436195894b..5cbb07deab 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -330,6 +330,7 @@ static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
 #define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
 #define OPC_ANDN        (0xf2 | P_EXT38)
 #define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
+#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
 #define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
 #define OPC_BSF         (0xbc | P_EXT)
 #define OPC_BSR         (0xbd | P_EXT)
@@ -1625,7 +1626,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         }
         if (TCG_TYPE_PTR == TCG_TYPE_I64) {
             hrexw = P_REXW;
-            if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
+            if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
                 tlbtype = TCG_TYPE_I64;
                 tlbrexw = P_REXW;
             }
@@ -1633,6 +1634,15 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
     }
 
     tcg_out_mov(s, tlbtype, r0, addrlo);
+    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
+                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
+
+    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
+                         offsetof(CPUArchState, tlb_mask[mem_index]));
+
+    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
+                         offsetof(CPUArchState, tlb_table[mem_index]));
+
     /* If the required alignment is at least as large as the access, simply
        copy the address and mask.  For lesser alignments, check that we don't
        cross pages for the complete access. */
@@ -1642,20 +1652,10 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
         tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
     }
     tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
-
-    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
-                   TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
-
     tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
-    tgen_arithi(s, ARITH_AND + tlbrexw, r0,
-                (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
-
-    tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
-                             offsetof(CPUArchState, tlb_table[mem_index][0])
-                             + which);
 
     /* cmp 0(r0), r1 */
-    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
+    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
 
     /* Prepare for both the fast path add of the tlb addend, and the slow
        path function argument setup.  There are two cases worth note:
@@ -1672,7 +1672,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
 
     if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
         /* cmp 4(r0), addrhi */
-        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
+        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
 
         /* jne slow_path */
         tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
@@ -1684,7 +1684,7 @@ static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
 
     /* add addend(r0), r1 */
     tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
-                         offsetof(CPUTLBEntry, addend) - which);
+                         offsetof(CPUTLBEntry, addend));
 }
 
 /*
-- 
2.17.1