target/arm/helper.c | 68 ++++++++++++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 16 deletions(-)
The SCTLR_EL1.UCI bit only affects a subset of cache maintenance
instructions as specified by the specification. Any other cache
maintenance instructions must still be trapped from EL0.
Signed-off-by: Idan Horowitz <idan.horowitz@gmail.com>
---
target/arm/helper.c | 68 ++++++++++++++++++++++++++++++++++-----------
1 file changed, 52 insertions(+), 16 deletions(-)
diff --git a/target/arm/helper.c b/target/arm/helper.c
index cfca0f5ba6..ac75a268aa 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4217,7 +4217,7 @@ static const ARMCPRegInfo ssbs_reginfo = {
.readfn = aa64_ssbs_read, .writefn = aa64_ssbs_write
};
-static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
+static CPAccessResult aa64_cacheop_poc_uci_access(CPUARMState *env,
const ARMCPRegInfo *ri,
bool isread)
{
@@ -4239,7 +4239,7 @@ static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
return CP_ACCESS_OK;
}
-static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
+static CPAccessResult aa64_cacheop_pou_uci_access(CPUARMState *env,
const ARMCPRegInfo *ri,
bool isread)
{
@@ -4261,6 +4261,42 @@ static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
return CP_ACCESS_OK;
}
+static CPAccessResult aa64_cacheop_poc_access(CPUARMState *env,
+ const ARMCPRegInfo *ri,
+ bool isread)
+{
+ /* Cache invalidate/clean to Point of Coherency or Persistence... */
+ switch (arm_current_el(env)) {
+ case 0:
+ return CP_ACCESS_TRAP;
+ case 1:
+ /* ... EL1 must trap to EL2 if HCR_EL2.TPCP is set. */
+ if (arm_hcr_el2_eff(env) & HCR_TPCP) {
+ return CP_ACCESS_TRAP_EL2;
+ }
+ break;
+ }
+ return CP_ACCESS_OK;
+}
+
+static CPAccessResult aa64_cacheop_pou_access(CPUARMState *env,
+ const ARMCPRegInfo *ri,
+ bool isread)
+{
+ /* Cache invalidate/clean to Point of Unification... */
+ switch (arm_current_el(env)) {
+ case 0:
+ return CP_ACCESS_TRAP;
+ case 1:
+ /* ... EL1 must trap to EL2 if HCR_EL2.TPU is set. */
+ if (arm_hcr_el2_eff(env) & HCR_TPU) {
+ return CP_ACCESS_TRAP_EL2;
+ }
+ break;
+ }
+ return CP_ACCESS_OK;
+}
+
/* See: D4.7.2 TLB maintenance requirements and the TLB maintenance instructions
* Page D4-1736 (DDI0487A.b)
*/
@@ -4846,7 +4882,7 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
{ .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NOP,
- .accessfn = aa64_cacheop_pou_access },
+ .accessfn = aa64_cacheop_pou_uci_access },
{ .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 0, .crn = 7, .crm = 6, .opc2 = 1,
.access = PL1_W, .accessfn = aa64_cacheop_poc_access,
@@ -4857,18 +4893,18 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
{ .name = "DC_CVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NOP,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CSW", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 0, .crn = 7, .crm = 10, .opc2 = 2,
.access = PL1_W, .accessfn = access_tsw, .type = ARM_CP_NOP },
{ .name = "DC_CVAU", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 11, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NOP,
- .accessfn = aa64_cacheop_pou_access },
+ .accessfn = aa64_cacheop_pou_uci_access },
{ .name = "DC_CIVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NOP,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CISW", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 0, .crn = 7, .crm = 14, .opc2 = 2,
.access = PL1_W, .accessfn = access_tsw, .type = ARM_CP_NOP },
@@ -7102,7 +7138,7 @@ static const ARMCPRegInfo dcpop_reg[] = {
{ .name = "DC_CVAP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NO_RAW | ARM_CP_SUPPRESS_TB_END,
- .accessfn = aa64_cacheop_poc_access, .writefn = dccvap_writefn },
+ .accessfn = aa64_cacheop_poc_uci_access, .writefn = dccvap_writefn },
REGINFO_SENTINEL
};
@@ -7110,7 +7146,7 @@ static const ARMCPRegInfo dcpodp_reg[] = {
{ .name = "DC_CVADP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 1,
.access = PL0_W, .type = ARM_CP_NO_RAW | ARM_CP_SUPPRESS_TB_END,
- .accessfn = aa64_cacheop_poc_access, .writefn = dccvap_writefn },
+ .accessfn = aa64_cacheop_poc_uci_access, .writefn = dccvap_writefn },
REGINFO_SENTINEL
};
#endif /*CONFIG_USER_ONLY*/
@@ -7227,35 +7263,35 @@ static const ARMCPRegInfo mte_el0_cacheop_reginfo[] = {
{ .name = "DC_CGVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 3,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CGDVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 10, .opc2 = 5,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CGVAP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 3,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CGDVAP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 12, .opc2 = 5,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CGVADP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 3,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CGDVADP", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 13, .opc2 = 5,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CIGVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 3,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_CIGDVAC", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 14, .opc2 = 5,
.type = ARM_CP_NOP, .access = PL0_W,
- .accessfn = aa64_cacheop_poc_access },
+ .accessfn = aa64_cacheop_poc_uci_access },
{ .name = "DC_GVA", .state = ARM_CP_STATE_AA64,
.opc0 = 1, .opc1 = 3, .crn = 7, .crm = 4, .opc2 = 3,
.access = PL0_W, .type = ARM_CP_DC_GVA,
--
2.34.1
On Fri, 14 Jan 2022 at 00:40, Idan Horowitz <idan.horowitz@gmail.com> wrote: > > The SCTLR_EL1.UCI bit only affects a subset of cache maintenance > instructions as specified by the specification. Any other cache > maintenance instructions must still be trapped from EL0. Hi; thanks for this patch. Do you have a test case which demonstrates this behaviour? From reading the patch I can't see any instructions where the patch changes the behaviour of the emulation. As far as I can see, the commit effectively changes the accessfn for the following instructions to one which does not check the UCI bit: AArch64 IC IALLUIS IC IALLU DC IVAC DC IGVAC DC IGDVAC AArch32 ICIALLUIS ICIALLU ICIMVAU DCCMVAU DCIMVAC DCCMVAC DCCIMVAC and it is true that the architecture says that UCI doesn't affect these instructions; they always UNDEF at EL0. But for all of these instructions the reginfo struct sets ".access = PL1_W". The .access field is always checked before the .accessfn, so for any of these instructions executed from EL0 I think we will always fail the .access check and UNDEF the insn without calling the .accessfn. So it doesn't matter that the .accessfn has "if EL0 then check SCTLR_EL1.UCI", because when running the accessfn for these insns we can never be in EL0. Am I missing something? thanks -- PMM
On Thu, 20 Jan 2022 at 13:42, Peter Maydell <peter.maydell@linaro.org> wrote: > > > But for all of these instructions the reginfo struct > sets ".access = PL1_W". The .access field is always > checked before the .accessfn, so for any of these instructions > executed from EL0 I think we will always fail the .access > check and UNDEF the insn without calling the .accessfn. > So it doesn't matter that the .accessfn has "if EL0 then > check SCTLR_EL1.UCI", because when running the accessfn > for these insns we can never be in EL0. > > Am I missing something? > Hey, you are not missing anything, this patch indeed does not change any external behaviour. I should have specified, but the point of this patch is optimization: during benchmarking of the various AArch64 instructions I found that the cache flush instructions were quite slow, simply due to their heavy access functions, so this is an attempt at simplifying them. > > thanks > -- PMM Regards, Idan Horowitz
On Thu, 20 Jan 2022 at 12:00, Idan Horowitz <idan.horowitz@gmail.com> wrote: > > On Thu, 20 Jan 2022 at 13:42, Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > But for all of these instructions the reginfo struct > > sets ".access = PL1_W". The .access field is always > > checked before the .accessfn, so for any of these instructions > > executed from EL0 I think we will always fail the .access > > check and UNDEF the insn without calling the .accessfn. > > So it doesn't matter that the .accessfn has "if EL0 then > > check SCTLR_EL1.UCI", because when running the accessfn > > for these insns we can never be in EL0. > > > > Am I missing something? > > > > Hey, you are not missing anything, this patch indeed does not change > any external behaviour. > I should have specified, but the point of this patch is optimization: > during benchmarking of the various AArch64 instructions I found that > the cache flush instructions were quite slow, simply due to their > heavy access functions, so this is an attempt at simplifying them. But the code you are effectively removing is never executed for the instructions where you're changing the access function. If you're proposing this as a performance improvement, can you provide before-and-after benchmarks demonstrating that improvement ? thanks -- PMM
On Thu, 20 Jan 2022 at 14:32, Peter Maydell <peter.maydell@linaro.org> wrote: > > > But the code you are effectively removing is never executed > for the instructions where you're changing the access function. > If you're proposing this as a performance improvement, can > you provide before-and-after benchmarks demonstrating that > improvement ? > I wanted to say that in my micro-benchmark of DC IVAC I saw a 1% decrease in runtime, but I failed to replicate it again now, so I must have accidentally ran it together with one of my other patches last time. Sorry for wasting your time with the review. > > thanks > -- PMM Regards, Idan Horowitz
On Thu, 20 Jan 2022 at 13:25, Idan Horowitz <idan.horowitz@gmail.com> wrote: > > On Thu, 20 Jan 2022 at 14:32, Peter Maydell <peter.maydell@linaro.org> wrote: > > > > > > But the code you are effectively removing is never executed > > for the instructions where you're changing the access function. > > If you're proposing this as a performance improvement, can > > you provide before-and-after benchmarks demonstrating that > > improvement ? > > > > I wanted to say that in my micro-benchmark of DC IVAC I saw a 1% > decrease in runtime, but I failed to replicate it again now, so I must > have accidentally ran it together with one of my other patches last > time. > Sorry for wasting your time with the review. No worries. Incidentally, it's not surprising that if you microbenchmark the cache instructions the trap-checking appears as a large component of it -- for QEMU cache ops are NOPs so trap checking is the *only* thing that the instruction has to do. It's probably worth looking at benchmarks of real workloads to try to identify whether any particular instruction is a significant component before spending much time on trying to improve its performance. -- PMM
© 2016 - 2024 Red Hat, Inc.