[PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit

Idan Horowitz posted 1 patch 2 years, 2 months ago
Test checkpatch passed
Patches applied successfully (tree, apply log)
git fetch https://github.com/patchew-project/qemu tags/patchew/20220114004033.295199-1-idan.horowitz@gmail.com
Maintainers: Peter Maydell <peter.maydell@linaro.org>
target/arm/helper.c | 68 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 52 insertions(+), 16 deletions(-)
[PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Idan Horowitz 2 years, 2 months ago
The SCTLR_EL1.UCI bit only affects a subset of cache maintenance
instructions as specified by the specification. Any other cache
maintenance instructions must still be trapped from EL0.

Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
---
 target/arm/helper.c | 68 ++++++++++++++++++++++++++++++++++-----------
 1 file changed, 52 insertions(+), 16 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index cfca0f5ba6..ac75a268aa 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4217,7 +4217,7 @@ static const ARMCPRegInfo ssbs_reginfo = {
     .readfn = aa64_ssbs_read, .writefn = aa64_ssbs_write
 };
 
-static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
+static CPAccessResult aa64_cacheop_poc_uci_access(CPUARMState *env,
                                               const ARMCPRegInfo *ri,
                                               bool isread)
 {
@@ -4239,7 +4239,7 @@ static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
     return CP_ACCESS_OK;
 }
 
-static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
+static CPAccessResult aa64_cacheop_pou_uci_access(CPUARMState *env,
                                               const ARMCPRegInfo *ri,
                                               bool isread)
 {
@@ -4261,6 +4261,42 @@ static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
     return CP_ACCESS_OK;
 }
 
+static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
+                                              const ARMCPRegInfo *ri,
+                                              bool isread)
+{
+    /* Cache invalidate/clean to Point of Coherency or Persistence...  */
+    switch (arm_current_el(env)) {
+    case 0:
+        return CP_ACCESS_TRAP;
+    case 1:
+        /* ... EL1 must trap to EL2 if HCR_EL2.TPCP is set.  */
+        if (arm_hcr_el2_eff(env) & HCR_TPCP) {
+            return CP_ACCESS_TRAP_EL2;
+        }
+        break;
+    }
+    return CP_ACCESS_OK;
+}
+
+static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
+                                              const ARMCPRegInfo *ri,
+                                              bool isread)
+{
+    /* Cache invalidate/clean to Point of Unification... */
+    switch (arm_current_el(env)) {
+    case 0:
+        return CP_ACCESS_TRAP;
+    case 1:
+        /* ... EL1 must trap to EL2 if HCR_EL2.TPU is set.  */
+        if (arm_hcr_el2_eff(env) & HCR_TPU) {
+            return CP_ACCESS_TRAP_EL2;
+        }
+        break;
+    }
+    return CP_ACCESS_OK;
+}
+
 /* See: D4.7.2 TLB maintenance requirements and the TLB maintenance instructions
  * Page D4-1736 (DDI0487A.b)
  */
@@ -4846,7 +4882,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
     { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NOP,
-      .accessfn = aa64_cacheop_pou_access },
+      .accessfn = aa64_cacheop_pou_uci_access },
     { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
       .access = PL1_W, .accessfn = aa64_cacheop_poc_access,
@@ -4857,18 +4893,18 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
     { .name = "DC_CVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NOP,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CSW", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 10, .opc2 = 2,
       .access = PL1_W, .accessfn = access_tsw, .type = ARM_CP_NOP },
     { .name = "DC_CVAU", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 11, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NOP,
-      .accessfn = aa64_cacheop_pou_access },
+      .accessfn = aa64_cacheop_pou_uci_access },
     { .name = "DC_CIVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NOP,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CISW", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 14, .opc2 = 2,
       .access = PL1_W, .accessfn = access_tsw, .type = ARM_CP_NOP },
@@ -7102,7 +7138,7 @@ static const ARMCPRegInfo dcpop_reg[] = {
     { .name = "DC_CVAP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NO_RAW | ARM_CP_SUPPRESS_TB_END,
-      .accessfn = aa64_cacheop_poc_access, .writefn = dccvap_writefn },
+      .accessfn = aa64_cacheop_poc_uci_access, .writefn = dccvap_writefn },
     REGINFO_SENTINEL
 };
 
@@ -7110,7 +7146,7 @@ static const ARMCPRegInfo dcpodp_reg[] = {
     { .name = "DC_CVADP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 1,
       .access = PL0_W, .type = ARM_CP_NO_RAW | ARM_CP_SUPPRESS_TB_END,
-      .accessfn = aa64_cacheop_poc_access, .writefn = dccvap_writefn },
+      .accessfn = aa64_cacheop_poc_uci_access, .writefn = dccvap_writefn },
     REGINFO_SENTINEL
 };
 #endif /*CONFIG_USER_ONLY*/
@@ -7227,35 +7263,35 @@ static const ARMCPRegInfo mte_el0_cacheop_reginfo[] = {
     { .name = "DC_CGVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 3,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CGDVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 5,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CGVAP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 3,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CGDVAP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 5,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CGVADP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 3,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CGDVADP", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 5,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CIGVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 3,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_CIGDVAC", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 5,
       .type = ARM_CP_NOP, .access = PL0_W,
-      .accessfn = aa64_cacheop_poc_access },
+      .accessfn = aa64_cacheop_poc_uci_access },
     { .name = "DC_GVA", .state = ARM_CP_STATE_AA64,
       .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 3,
       .access = PL0_W, .type = ARM_CP_DC_GVA,
-- 
2.34.1


Re: [PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Peter Maydell 2 years, 2 months ago
On Fri, 14 Jan 2022 at 00:40, Idan Horowitz <idan.horowitz@gmail.com> wrote:
>
> The SCTLR_EL1.UCI bit only affects a subset of cache maintenance
> instructions as specified by the specification. Any other cache
> maintenance instructions must still be trapped from EL0.

Hi; thanks for this patch. Do you have a test case which
demonstrates this behaviour? From reading the patch I
can't see any instructions where the patch changes the
behaviour of the emulation.

As far as I can see, the commit effectively changes the
accessfn for the following instructions to one which does
not check the UCI bit:
AArch64
 IC IALLUIS
 IC IALLU
 DC IVAC
 DC IGVAC
 DC IGDVAC
AArch32
 ICIALLUIS
 ICIALLU
 ICIMVAU
 DCCMVAU
 DCIMVAC
 DCCMVAC
 DCCIMVAC

and it is true that the architecture says that UCI
doesn't affect these instructions; they always UNDEF at EL0.

But for all of these instructions the reginfo struct
sets ".access = PL1_W". The .access field is always
checked before the .accessfn, so for any of these instructions
executed from EL0 I think we will always fail the .access
check and UNDEF the insn without calling the .accessfn.
So it doesn't matter that the .accessfn has "if EL0 then
check SCTLR_EL1.UCI", because when running the accessfn
for these insns we can never be in EL0.

Am I missing something?

thanks
-- PMM

Re: [PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Idan Horowitz 2 years, 2 months ago
On Thu, 20 Jan 2022 at 13:42, Peter Maydell <peter.maydell@linaro.org> wrote:
>
>
> But for all of these instructions the reginfo struct
> sets ".access = PL1_W". The .access field is always
> checked before the .accessfn, so for any of these instructions
> executed from EL0 I think we will always fail the .access
> check and UNDEF the insn without calling the .accessfn.
> So it doesn't matter that the .accessfn has "if EL0 then
> check SCTLR_EL1.UCI", because when running the accessfn
> for these insns we can never be in EL0.
>
> Am I missing something?
>

Hey, you are not missing anything, this patch indeed does not change
any external behaviour.
I should have specified, but the point of this patch is optimization:
during benchmarking of the various AArch64 instructions I found that
the cache flush instructions were quite slow, simply due to their
heavy access functions, so this is an attempt at simplifying them.

>
> thanks
> -- PMM

Regards,
Idan Horowitz

Re: [PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Peter Maydell 2 years, 2 months ago
On Thu, 20 Jan 2022 at 12:00, Idan Horowitz <idan.horowitz@gmail.com> wrote:
>
> On Thu, 20 Jan 2022 at 13:42, Peter Maydell <peter.maydell@linaro.org> wrote:
> >
> >
> > But for all of these instructions the reginfo struct
> > sets ".access = PL1_W". The .access field is always
> > checked before the .accessfn, so for any of these instructions
> > executed from EL0 I think we will always fail the .access
> > check and UNDEF the insn without calling the .accessfn.
> > So it doesn't matter that the .accessfn has "if EL0 then
> > check SCTLR_EL1.UCI", because when running the accessfn
> > for these insns we can never be in EL0.
> >
> > Am I missing something?
> >
>
> Hey, you are not missing anything, this patch indeed does not change
> any external behaviour.
> I should have specified, but the point of this patch is optimization:
> during benchmarking of the various AArch64 instructions I found that
> the cache flush instructions were quite slow, simply due to their
> heavy access functions, so this is an attempt at simplifying them.

But the code you are effectively removing is never executed
for the instructions where you're changing the access function.
If you're proposing this as a performance improvement, can
you provide before-and-after benchmarks demonstrating that
improvement ?

thanks
-- PMM

Re: [PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Idan Horowitz 2 years, 2 months ago
On Thu, 20 Jan 2022 at 14:32, Peter Maydell <peter.maydell@linaro.org> wrote:
>
>
> But the code you are effectively removing is never executed
> for the instructions where you're changing the access function.
> If you're proposing this as a performance improvement, can
> you provide before-and-after benchmarks demonstrating that
> improvement ?
>

I wanted to say that in my micro-benchmark of DC IVAC I saw a 1%
decrease in runtime, but I failed to replicate it again now, so I must
have accidentally ran it together with one of my other patches last
time.
Sorry for wasting your time with the review.

>
> thanks
> -- PMM

Regards,
Idan Horowitz

Re: [PATCH] target/arm: Allow only specific instructions based on the SCTLR_EL1.UCI bit
Posted by Peter Maydell 2 years, 2 months ago
On Thu, 20 Jan 2022 at 13:25, Idan Horowitz <idan.horowitz@gmail.com> wrote:
>
> On Thu, 20 Jan 2022 at 14:32, Peter Maydell <peter.maydell@linaro.org> wrote:
> >
> >
> > But the code you are effectively removing is never executed
> > for the instructions where you're changing the access function.
> > If you're proposing this as a performance improvement, can
> > you provide before-and-after benchmarks demonstrating that
> > improvement ?
> >
>
> I wanted to say that in my micro-benchmark of DC IVAC I saw a 1%
> decrease in runtime, but I failed to replicate it again now, so I must
> have accidentally ran it together with one of my other patches last
> time.
> Sorry for wasting your time with the review.

No worries.

Incidentally, it's not surprising that if you microbenchmark
the cache instructions the trap-checking appears as a large
component of it -- for QEMU cache ops are NOPs so trap checking
is the *only* thing that the instruction has to do. It's probably
worth looking at benchmarks of real workloads to try to identify
whether any particular instruction is a significant component
before spending much time on trying to improve its performance.

-- PMM