[PATCH v6 5/6] target/arm/emulate: add atomic, compare-and-swap, and PAC load

Lucas Amaral posted 6 patches 1 day, 19 hours ago
Maintainers: Peter Maydell <peter.maydell@linaro.org>, Alexander Graf <agraf@csgraf.de>, Pedro Barbuda <pbarbuda@microsoft.com>, Mohamed Mediouni <mohamed@unpredictable.fr>
[PATCH v6 5/6] target/arm/emulate: add atomic, compare-and-swap, and PAC load
Posted by Lucas Amaral 1 day, 19 hours ago
Add emulation for remaining ISV=0 load/store instruction classes.

Atomic memory operations (DDI 0487 C3.3.2):
  - LDADD, LDCLR, LDEOR, LDSET: arithmetic/logic atomics
  - LDSMAX, LDSMIN, LDUMAX, LDUMIN: signed/unsigned min/max
  - SWP: atomic swap
  Non-atomic read-modify-write, sufficient for MMIO where concurrent
  access is not a concern.  Acquire/release semantics are ignored.

Compare-and-swap (DDI 0487 C3.3.1):
  - CAS/CASA/CASAL/CASL: single-register compare-and-swap
  - CASP/CASPA/CASPAL/CASPL: register-pair compare-and-swap
  CASP validates the even-register-pair constraint; an odd Rs/Rt or
  register 31 is rejected as UNHANDLED.

Load with PAC (DDI 0487 C6.2.121):
  - LDRAA/LDRAB: pointer-authenticated load, offset/pre-indexed
  Pointer authentication is not emulated (equivalent to auth always
  succeeding), which is correct for MMIO since PAC is a software
  security mechanism, not a memory access semantic.

Decodetree differences from TCG:
  - %ldra_imm extracts the raw S:imm9 field; the handler scales by
    << 3.  TCG applies !function=times_8 in the formatter.
  - @ldra uses wildcards for fixed opcode bits that TCG locks down
    (bits 31:30, bit 20, bit 11); the fixed bits are matched by the
    instruction pattern instead.
  - @cas is an explicit format template; TCG uses inline field
    extraction.

CASP uses two explicit decode patterns for the 32/64-bit size
variants.  LDRA's offset immediate is stored raw in the decode;
the handler scales by << 3.

Signed-off-by: Lucas Amaral <lucaaamaral@gmail.com>
---
 target/arm/emulate/a64-ldst.decode |  45 ++++++
 target/arm/emulate/arm_emulate.c   | 233 +++++++++++++++++++++++++++++
 2 files changed, 278 insertions(+)

diff --git a/target/arm/emulate/a64-ldst.decode b/target/arm/emulate/a64-ldst.decode
index fadf6fd2..9292bfdf 100644
--- a/target/arm/emulate/a64-ldst.decode
+++ b/target/arm/emulate/a64-ldst.decode
@@ -16,6 +16,16 @@
 # Load/store pair (GPR and SIMD/FP)
 &ldstpair       rt2 rt rn imm sz sign w p
 
+# Atomic memory operations
+&atomic         rs rn rt a r sz
+
+# Compare-and-swap
+&cas            rs rn rt sz a r
+
+# Load with PAC (LDRAA/LDRAB, FEAT_PAuth)
+%ldra_imm       22:s1 12:9
+&ldra           rt rn imm m w
+
 # Load/store register offset
 &ldst           rm rn rt sign ext sz opt s
 
@@ -36,6 +46,15 @@
 # Load/store pair: imm7 is signed, scaled by element size in handler
 @ldstpair       .. ... . ... . imm:s7 rt2:5 rn:5 rt:5          &ldstpair
 
+# Atomics
+@atomic         sz:2 ... . .. a:1 r:1 . rs:5 . ... .. rn:5 rt:5   &atomic
+
+# Compare-and-swap: sz extracted by pattern (CAS) or set constant (CASP)
+@cas            .. ...... . a:1 . rs:5 r:1 ..... rn:5 rt:5        &cas
+
+# Load with PAC
+@ldra           .. ... . .. m:1 . . ......... w:1 . rn:5 rt:5     &ldra imm=%ldra_imm
+
 # Load/store register offset
 @ldst           .. ... . .. .. . rm:5 opt:3 s:1 .. rn:5 rt:5   &ldst
 
@@ -241,6 +260,32 @@ STR_v           00 111 1 00 10 1 ..... ... . 10 ..... .....    @ldst sign=0 ext=
 LDR_v           sz:2 111 1 00 01 1 ..... ... . 10 ..... .....  @ldst sign=0 ext=0
 LDR_v           00 111 1 00 11 1 ..... ... . 10 ..... .....    @ldst sign=0 ext=0 sz=4
 
+### Compare-and-swap
+
+# CAS / CASA / CASAL / CASL
+CAS             sz:2 001000 1 . 1 ..... . 11111 ..... .....     @cas
+
+# CASP / CASPA / CASPAL / CASPL (pair: Rt,Rt+1 and Rs,Rs+1)
+CASP            00 001000 0 . 1 ..... . 11111 ..... .....       @cas sz=2
+CASP            01 001000 0 . 1 ..... . 11111 ..... .....       @cas sz=3
+
+### Atomic memory operations
+
+LDADD           .. 111 0 00 . . 1 ..... 0000 00 ..... .....    @atomic
+LDCLR           .. 111 0 00 . . 1 ..... 0001 00 ..... .....    @atomic
+LDEOR           .. 111 0 00 . . 1 ..... 0010 00 ..... .....    @atomic
+LDSET           .. 111 0 00 . . 1 ..... 0011 00 ..... .....    @atomic
+LDSMAX          .. 111 0 00 . . 1 ..... 0100 00 ..... .....    @atomic
+LDSMIN          .. 111 0 00 . . 1 ..... 0101 00 ..... .....    @atomic
+LDUMAX          .. 111 0 00 . . 1 ..... 0110 00 ..... .....    @atomic
+LDUMIN          .. 111 0 00 . . 1 ..... 0111 00 ..... .....    @atomic
+SWP             .. 111 0 00 . . 1 ..... 1000 00 ..... .....    @atomic
+
+### Load with PAC (FEAT_PAuth)
+
+# LDRAA (M=0) / LDRAB (M=1), offset (W=0) / pre-indexed (W=1)
+LDRA            11 111 0 00 . . 1 ......... . 1 ..... .....  @ldra
+
 ### System instructions — DC cache maintenance
 
 # SYS with CRn=C7 covers all data cache operations (DC CIVAC, CVAC, etc.).
diff --git a/target/arm/emulate/arm_emulate.c b/target/arm/emulate/arm_emulate.c
index 7f876355..6601c9dc 100644
--- a/target/arm/emulate/arm_emulate.c
+++ b/target/arm/emulate/arm_emulate.c
@@ -547,6 +547,239 @@ static bool trans_LDXP(DisasContext *ctx, arg_stxr *a)
     return true;
 }
 
+/*
+ * Atomic memory operations (DDI 0487 C3.3.2)
+ *
+ * Non-atomic read-modify-write; sufficient for MMIO.
+ * Acquire/release semantics ignored (sequentially consistent by design).
+ */
+
+typedef uint64_t (*atomic_op_fn)(uint64_t old, uint64_t operand, int bits);
+
/* LDADD: integer addition; width-independent, truncation happens at store. */
static uint64_t atomic_add(uint64_t old, uint64_t op, int bits)
{
    uint64_t sum = old + op;

    return sum;
}
+
/* LDCLR: bit clear — AND with the complement of the operand. */
static uint64_t atomic_clr(uint64_t old, uint64_t op, int bits)
{
    uint64_t cleared = old;

    cleared &= ~op;
    return cleared;
}
+
/* LDEOR: exclusive OR of operand into the old value. */
static uint64_t atomic_eor(uint64_t old, uint64_t op, int bits)
{
    uint64_t toggled = old ^ op;

    return toggled;
}
+
/* LDSET: bit set — inclusive OR of operand into the old value. */
static uint64_t atomic_set(uint64_t old, uint64_t op, int bits)
{
    uint64_t merged = old;

    merged |= op;
    return merged;
}
+
/*
 * LDSMAX: signed maximum.  Both values are sign-extended from the
 * access width before comparing; ties keep the old value.
 */
static uint64_t atomic_smax(uint64_t old, uint64_t op, int bits)
{
    int64_t lhs = sextract64(old, 0, bits);
    int64_t rhs = sextract64(op, 0, bits);

    return lhs < rhs ? op : old;
}
+
/*
 * LDSMIN: signed minimum.  Both values are sign-extended from the
 * access width before comparing; ties keep the old value.
 */
static uint64_t atomic_smin(uint64_t old, uint64_t op, int bits)
{
    int64_t lhs = sextract64(old, 0, bits);
    int64_t rhs = sextract64(op, 0, bits);

    return lhs > rhs ? op : old;
}
+
/*
 * LDUMAX: unsigned maximum over the low 'bits' of each value; ties
 * keep the old value.  The unmasked winner is returned — upper bits
 * are discarded later by the sized store.
 */
static uint64_t atomic_umax(uint64_t old, uint64_t op, int bits)
{
    uint64_t mask = bits == 64 ? UINT64_MAX : (1ULL << bits) - 1;
    uint64_t lhs = old & mask;
    uint64_t rhs = op & mask;

    return lhs < rhs ? op : old;
}
+
/*
 * LDUMIN: unsigned minimum over the low 'bits' of each value; ties
 * keep the old value.  The unmasked winner is returned — upper bits
 * are discarded later by the sized store.
 */
static uint64_t atomic_umin(uint64_t old, uint64_t op, int bits)
{
    uint64_t mask = bits == 64 ? UINT64_MAX : (1ULL << bits) - 1;
    uint64_t lhs = old & mask;
    uint64_t rhs = op & mask;

    return lhs > rhs ? op : old;
}
+
+static bool do_atomic(DisasContext *ctx, arg_atomic *a, atomic_op_fn fn)
+{
+    int esize = 1 << a->sz;
+    int bits = 8 * esize;
+    uint64_t va = base_read(ctx, a->rn);
+    uint8_t buf[8];
+
+    if (mem_read(ctx, va, buf, esize) != 0) {
+        return true;
+    }
+
+    uint64_t old = mem_ld(ctx, buf, esize);
+    uint64_t operand = gpr_read(ctx, a->rs);
+    uint64_t result = fn(old, operand, bits);
+
+    mem_st(ctx, buf, esize, result);
+    if (mem_write(ctx, va, buf, esize) != 0) {
+        return true;
+    }
+
+    /* Rt receives the old value (before modification) */
+    gpr_write(ctx, a->rt, old);
+    return true;
+}
+
/*
 * trans_* entry points for the LD<op> atomic instructions.  Each one
 * forwards to do_atomic() with the read-modify-write callback that
 * implements the instruction's arithmetic/logical operation.
 */
static bool trans_LDADD(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_add);
}

static bool trans_LDCLR(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_clr);
}

static bool trans_LDEOR(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_eor);
}

static bool trans_LDSET(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_set);
}

static bool trans_LDSMAX(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_smax);
}

static bool trans_LDSMIN(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_smin);
}

static bool trans_LDUMAX(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_umax);
}

static bool trans_LDUMIN(DisasContext *ctx, arg_atomic *a)
{
    return do_atomic(ctx, a, atomic_umin);
}
+
+static bool trans_SWP(DisasContext *ctx, arg_atomic *a)
+{
+    int esize = 1 << a->sz;
+    uint64_t va = base_read(ctx, a->rn);
+    uint8_t buf[8];
+
+    if (mem_read(ctx, va, buf, esize) != 0) {
+        return true;
+    }
+
+    uint64_t old = mem_ld(ctx, buf, esize);
+    mem_st(ctx, buf, esize, gpr_read(ctx, a->rs));
+    if (mem_write(ctx, va, buf, esize) != 0) {
+        return true;
+    }
+
+    gpr_write(ctx, a->rt, old);
+    return true;
+}
+
+/* Compare-and-swap: CAS, CASP (DDI 0487 C3.3.1) */
+
+static bool trans_CAS(DisasContext *ctx, arg_cas *a)
+{
+    int esize = 1 << a->sz;
+    uint64_t va = base_read(ctx, a->rn);
+    uint8_t buf[8];
+
+    if (mem_read(ctx, va, buf, esize) != 0) {
+        return true;
+    }
+
+    uint64_t current = mem_ld(ctx, buf, esize);
+    uint64_t mask = (esize == 8) ? UINT64_MAX : (1ULL << (8 * esize)) - 1;
+    uint64_t compare = gpr_read(ctx, a->rs) & mask;
+
+    if ((current & mask) == compare) {
+        uint64_t newval = gpr_read(ctx, a->rt) & mask;
+        mem_st(ctx, buf, esize, newval);
+        if (mem_write(ctx, va, buf, esize) != 0) {
+            return true;
+        }
+    }
+
+    /* Rs receives the old memory value (whether or not swap occurred) */
+    gpr_write(ctx, a->rs, current);
+    return true;
+}
+
+/* CASP: compare-and-swap pair (Rs,Rs+1 compared; Rt,Rt+1 stored) */
+static bool trans_CASP(DisasContext *ctx, arg_cas *a)
+{
+    /* CASP requires even register pairs; odd or r31 is UNPREDICTABLE */
+    if ((a->rs & 1) || a->rs >= 31 || (a->rt & 1) || a->rt >= 31) {
+        return false;
+    }
+
+    int esize = 1 << a->sz;                   /* per-register size */
+    uint64_t va = base_read(ctx, a->rn);
+    uint8_t buf[16];
+
+    if (mem_read(ctx, va, buf, 2 * esize) != 0) {
+        return true;
+    }
+    uint64_t cur1 = mem_ld(ctx, buf, esize);
+    uint64_t cur2 = mem_ld(ctx, buf + esize, esize);
+
+    uint64_t mask = (esize == 8) ? UINT64_MAX : (1ULL << (8 * esize)) - 1;
+    uint64_t cmp1 = gpr_read(ctx, a->rs) & mask;
+    uint64_t cmp2 = gpr_read(ctx, a->rs + 1) & mask;
+
+    if ((cur1 & mask) == cmp1 && (cur2 & mask) == cmp2) {
+        uint64_t new1 = gpr_read(ctx, a->rt) & mask;
+        uint64_t new2 = gpr_read(ctx, a->rt + 1) & mask;
+        mem_st(ctx, buf, esize, new1);
+        mem_st(ctx, buf + esize, esize, new2);
+        if (mem_write(ctx, va, buf, 2 * esize) != 0) {
+            return true;
+        }
+    }
+
+    gpr_write(ctx, a->rs, cur1);
+    gpr_write(ctx, a->rs + 1, cur2);
+    return true;
+}
+
+/*
+ * Load with PAC: LDRAA / LDRAB (FEAT_PAuth)
+ * (DDI 0487 C6.2.121)
+ *
+ * Pointer authentication is not emulated -- the base register is used
+ * directly (equivalent to auth always succeeding).
+ */
+
+static bool trans_LDRA(DisasContext *ctx, arg_ldra *a)
+{
+    int64_t offset = (int64_t)a->imm << 3;  /* S:imm9, scaled by 8 */
+    uint64_t base = base_read(ctx, a->rn);
+    uint64_t va = base + offset;  /* auth not emulated */
+    uint8_t buf[8];
+
+    if (mem_read(ctx, va, buf, 8) != 0) {
+        return true;
+    }
+
+    gpr_write(ctx, a->rt, mem_ld(ctx, buf, 8));
+
+    if (a->w) {
+        base_write(ctx, a->rn, va);
+    }
+    return true;
+}
+
 /* PRFM, DC cache maintenance -- treated as NOP */
 static bool trans_NOP(DisasContext *ctx, arg_NOP *a)
 {
-- 
2.52.0


Re: [PATCH v6 5/6] target/arm/emulate: add atomic, compare-and-swap, and PAC load
Posted by Mohamed Mediouni 1 day, 18 hours ago

> On 10. Apr 2026, at 00:06, Lucas Amaral <lucaaamaral@gmail.com> wrote:
> 
> Add emulation for remaining ISV=0 load/store instruction classes.
> 
With the caveat of “why does this even happen”:

Maybe there should be a restriction to put a warning message
somewhere if an op happens on a non-MMIO range…

Reviewed-by: Mohamed Mediouni <mohamed@unpredictable.fr>


> Atomic memory operations (DDI 0487 C3.3.2):
>  - LDADD, LDCLR, LDEOR, LDSET: arithmetic/logic atomics
>  - LDSMAX, LDSMIN, LDUMAX, LDUMIN: signed/unsigned min/max
>  - SWP: atomic swap
>  Non-atomic read-modify-write, sufficient for MMIO where concurrent
>  access is not a concern.  Acquire/release semantics are ignored.
> 
> Compare-and-swap (DDI 0487 C3.3.1):
>  - CAS/CASA/CASAL/CASL: single-register compare-and-swap
>  - CASP/CASPA/CASPAL/CASPL: register-pair compare-and-swap
>  CASP validates even register pairs; odd or r31 returns UNHANDLED.
> 
> Load with PAC (DDI 0487 C6.2.121):
>  - LDRAA/LDRAB: pointer-authenticated load, offset/pre-indexed
>  Pointer authentication is not emulated (equivalent to auth always
>  succeeding), which is correct for MMIO since PAC is a software
>  security mechanism, not a memory access semantic.
> 
> Decodetree differences from TCG:
>  - %ldra_imm extracts the raw S:imm9 field; the handler scales by
>    << 3.  TCG applies !function=times_8 in the formatter.
>  - @ldra uses wildcards for fixed opcode bits that TCG locks down
>    (bits 31:30, bit 20, bit 11); the fixed bits are matched by the
>    instruction pattern instead.
>  - @cas is an explicit format template; TCG uses inline field
>    extraction.
> 
> CASP uses two explicit decode patterns for the 32/64-bit size
> variants.  LDRA's offset immediate is stored raw in the decode;
> the handler scales by << 3.
> 
> Signed-off-by: Lucas Amaral <lucaaamaral@gmail.com>
> ---
> target/arm/emulate/a64-ldst.decode |  45 ++++++
> target/arm/emulate/arm_emulate.c   | 233 +++++++++++++++++++++++++++++
> 2 files changed, 278 insertions(+)
> 
> diff --git a/target/arm/emulate/a64-ldst.decode b/target/arm/emulate/a64-ldst.decode
> index fadf6fd2..9292bfdf 100644
> --- a/target/arm/emulate/a64-ldst.decode
> +++ b/target/arm/emulate/a64-ldst.decode
> @@ -16,6 +16,16 @@
> # Load/store pair (GPR and SIMD/FP)
> &ldstpair       rt2 rt rn imm sz sign w p
> 
> +# Atomic memory operations
> +&atomic         rs rn rt a r sz
> +
> +# Compare-and-swap
> +&cas            rs rn rt sz a r
> +
> +# Load with PAC (LDRAA/LDRAB, FEAT_PAuth)
> +%ldra_imm       22:s1 12:9
> +&ldra           rt rn imm m w
> +
> # Load/store register offset
> &ldst           rm rn rt sign ext sz opt s
> 
> @@ -36,6 +46,15 @@
> # Load/store pair: imm7 is signed, scaled by element size in handler
> @ldstpair       .. ... . ... . imm:s7 rt2:5 rn:5 rt:5          &ldstpair
> 
> +# Atomics
> +@atomic         sz:2 ... . .. a:1 r:1 . rs:5 . ... .. rn:5 rt:5   &atomic
> +
> +# Compare-and-swap: sz extracted by pattern (CAS) or set constant (CASP)
> +@cas            .. ...... . a:1 . rs:5 r:1 ..... rn:5 rt:5        &cas
> +
> +# Load with PAC
> +@ldra           .. ... . .. m:1 . . ......... w:1 . rn:5 rt:5     &ldra imm=%ldra_imm
> +
> # Load/store register offset
> @ldst           .. ... . .. .. . rm:5 opt:3 s:1 .. rn:5 rt:5   &ldst
> 
> @@ -241,6 +260,32 @@ STR_v           00 111 1 00 10 1 ..... ... . 10 ..... .....    @ldst sign=0 ext=
> LDR_v           sz:2 111 1 00 01 1 ..... ... . 10 ..... .....  @ldst sign=0 ext=0
> LDR_v           00 111 1 00 11 1 ..... ... . 10 ..... .....    @ldst sign=0 ext=0 sz=4
> 
> +### Compare-and-swap
> +
> +# CAS / CASA / CASAL / CASL
> +CAS             sz:2 001000 1 . 1 ..... . 11111 ..... .....     @cas
> +
> +# CASP / CASPA / CASPAL / CASPL (pair: Rt,Rt+1 and Rs,Rs+1)
> +CASP            00 001000 0 . 1 ..... . 11111 ..... .....       @cas sz=2
> +CASP            01 001000 0 . 1 ..... . 11111 ..... .....       @cas sz=3
> +
> +### Atomic memory operations
> +
> +LDADD           .. 111 0 00 . . 1 ..... 0000 00 ..... .....    @atomic
> +LDCLR           .. 111 0 00 . . 1 ..... 0001 00 ..... .....    @atomic
> +LDEOR           .. 111 0 00 . . 1 ..... 0010 00 ..... .....    @atomic
> +LDSET           .. 111 0 00 . . 1 ..... 0011 00 ..... .....    @atomic
> +LDSMAX          .. 111 0 00 . . 1 ..... 0100 00 ..... .....    @atomic
> +LDSMIN          .. 111 0 00 . . 1 ..... 0101 00 ..... .....    @atomic
> +LDUMAX          .. 111 0 00 . . 1 ..... 0110 00 ..... .....    @atomic
> +LDUMIN          .. 111 0 00 . . 1 ..... 0111 00 ..... .....    @atomic
> +SWP             .. 111 0 00 . . 1 ..... 1000 00 ..... .....    @atomic
> +
> +### Load with PAC (FEAT_PAuth)
> +
> +# LDRAA (M=0) / LDRAB (M=1), offset (W=0) / pre-indexed (W=1)
> +LDRA            11 111 0 00 . . 1 ......... . 1 ..... .....  @ldra
> +
> ### System instructions — DC cache maintenance
> 
> # SYS with CRn=C7 covers all data cache operations (DC CIVAC, CVAC, etc.).
> diff --git a/target/arm/emulate/arm_emulate.c b/target/arm/emulate/arm_emulate.c
> index 7f876355..6601c9dc 100644
> --- a/target/arm/emulate/arm_emulate.c
> +++ b/target/arm/emulate/arm_emulate.c
> @@ -547,6 +547,239 @@ static bool trans_LDXP(DisasContext *ctx, arg_stxr *a)
>     return true;
> }
> 
> +/*
> + * Atomic memory operations (DDI 0487 C3.3.2)
> + *
> + * Non-atomic read-modify-write; sufficient for MMIO.
> + * Acquire/release semantics ignored (sequentially consistent by design).
> + */
> +
> +typedef uint64_t (*atomic_op_fn)(uint64_t old, uint64_t operand, int bits);
> +
> +static uint64_t atomic_add(uint64_t old, uint64_t op, int bits)
> +{
> +    return old + op;
> +}
> +
> +static uint64_t atomic_clr(uint64_t old, uint64_t op, int bits)
> +{
> +    return old & ~op;
> +}
> +
> +static uint64_t atomic_eor(uint64_t old, uint64_t op, int bits)
> +{
> +    return old ^ op;
> +}
> +
> +static uint64_t atomic_set(uint64_t old, uint64_t op, int bits)
> +{
> +    return old | op;
> +}
> +
> +static uint64_t atomic_smax(uint64_t old, uint64_t op, int bits)
> +{
> +    int64_t a = sextract64(old, 0, bits);
> +    int64_t b = sextract64(op, 0, bits);
> +    return (a >= b) ? old : op;
> +}
> +
> +static uint64_t atomic_smin(uint64_t old, uint64_t op, int bits)
> +{
> +    int64_t a = sextract64(old, 0, bits);
> +    int64_t b = sextract64(op, 0, bits);
> +    return (a <= b) ? old : op;
> +}
> +
> +static uint64_t atomic_umax(uint64_t old, uint64_t op, int bits)
> +{
> +    uint64_t mask = (bits == 64) ? UINT64_MAX : (1ULL << bits) - 1;
> +    return ((old & mask) >= (op & mask)) ? old : op;
> +}
> +
> +static uint64_t atomic_umin(uint64_t old, uint64_t op, int bits)
> +{
> +    uint64_t mask = (bits == 64) ? UINT64_MAX : (1ULL << bits) - 1;
> +    return ((old & mask) <= (op & mask)) ? old : op;
> +}
> +
> +static bool do_atomic(DisasContext *ctx, arg_atomic *a, atomic_op_fn fn)
> +{
> +    int esize = 1 << a->sz;
> +    int bits = 8 * esize;
> +    uint64_t va = base_read(ctx, a->rn);
> +    uint8_t buf[8];
> +
> +    if (mem_read(ctx, va, buf, esize) != 0) {
> +        return true;
> +    }
> +
> +    uint64_t old = mem_ld(ctx, buf, esize);
> +    uint64_t operand = gpr_read(ctx, a->rs);
> +    uint64_t result = fn(old, operand, bits);
> +
> +    mem_st(ctx, buf, esize, result);
> +    if (mem_write(ctx, va, buf, esize) != 0) {
> +        return true;
> +    }
> +
> +    /* Rt receives the old value (before modification) */
> +    gpr_write(ctx, a->rt, old);
> +    return true;
> +}
> +
> +static bool trans_LDADD(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_add);
> +}
> +
> +static bool trans_LDCLR(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_clr);
> +}
> +
> +static bool trans_LDEOR(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_eor);
> +}
> +
> +static bool trans_LDSET(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_set);
> +}
> +
> +static bool trans_LDSMAX(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_smax);
> +}
> +
> +static bool trans_LDSMIN(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_smin);
> +}
> +
> +static bool trans_LDUMAX(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_umax);
> +}
> +
> +static bool trans_LDUMIN(DisasContext *ctx, arg_atomic *a)
> +{
> +    return do_atomic(ctx, a, atomic_umin);
> +}
> +
> +static bool trans_SWP(DisasContext *ctx, arg_atomic *a)
> +{
> +    int esize = 1 << a->sz;
> +    uint64_t va = base_read(ctx, a->rn);
> +    uint8_t buf[8];
> +
> +    if (mem_read(ctx, va, buf, esize) != 0) {
> +        return true;
> +    }
> +
> +    uint64_t old = mem_ld(ctx, buf, esize);
> +    mem_st(ctx, buf, esize, gpr_read(ctx, a->rs));
> +    if (mem_write(ctx, va, buf, esize) != 0) {
> +        return true;
> +    }
> +
> +    gpr_write(ctx, a->rt, old);
> +    return true;
> +}
> +
> +/* Compare-and-swap: CAS, CASP (DDI 0487 C3.3.1) */
> +
> +static bool trans_CAS(DisasContext *ctx, arg_cas *a)
> +{
> +    int esize = 1 << a->sz;
> +    uint64_t va = base_read(ctx, a->rn);
> +    uint8_t buf[8];
> +
> +    if (mem_read(ctx, va, buf, esize) != 0) {
> +        return true;
> +    }
> +
> +    uint64_t current = mem_ld(ctx, buf, esize);
> +    uint64_t mask = (esize == 8) ? UINT64_MAX : (1ULL << (8 * esize)) - 1;
> +    uint64_t compare = gpr_read(ctx, a->rs) & mask;
> +
> +    if ((current & mask) == compare) {
> +        uint64_t newval = gpr_read(ctx, a->rt) & mask;
> +        mem_st(ctx, buf, esize, newval);
> +        if (mem_write(ctx, va, buf, esize) != 0) {
> +            return true;
> +        }
> +    }
> +
> +    /* Rs receives the old memory value (whether or not swap occurred) */
> +    gpr_write(ctx, a->rs, current);
> +    return true;
> +}
> +
> +/* CASP: compare-and-swap pair (Rs,Rs+1 compared; Rt,Rt+1 stored) */
> +static bool trans_CASP(DisasContext *ctx, arg_cas *a)
> +{
> +    /* CASP requires even register pairs; odd or r31 is UNPREDICTABLE */
> +    if ((a->rs & 1) || a->rs >= 31 || (a->rt & 1) || a->rt >= 31) {
> +        return false;
> +    }
> +
> +    int esize = 1 << a->sz;                   /* per-register size */
> +    uint64_t va = base_read(ctx, a->rn);
> +    uint8_t buf[16];
> +
> +    if (mem_read(ctx, va, buf, 2 * esize) != 0) {
> +        return true;
> +    }
> +    uint64_t cur1 = mem_ld(ctx, buf, esize);
> +    uint64_t cur2 = mem_ld(ctx, buf + esize, esize);
> +
> +    uint64_t mask = (esize == 8) ? UINT64_MAX : (1ULL << (8 * esize)) - 1;
> +    uint64_t cmp1 = gpr_read(ctx, a->rs) & mask;
> +    uint64_t cmp2 = gpr_read(ctx, a->rs + 1) & mask;
> +
> +    if ((cur1 & mask) == cmp1 && (cur2 & mask) == cmp2) {
> +        uint64_t new1 = gpr_read(ctx, a->rt) & mask;
> +        uint64_t new2 = gpr_read(ctx, a->rt + 1) & mask;
> +        mem_st(ctx, buf, esize, new1);
> +        mem_st(ctx, buf + esize, esize, new2);
> +        if (mem_write(ctx, va, buf, 2 * esize) != 0) {
> +            return true;
> +        }
> +    }
> +
> +    gpr_write(ctx, a->rs, cur1);
> +    gpr_write(ctx, a->rs + 1, cur2);
> +    return true;
> +}
> +
> +/*
> + * Load with PAC: LDRAA / LDRAB (FEAT_PAuth)
> + * (DDI 0487 C6.2.121)
> + *
> + * Pointer authentication is not emulated -- the base register is used
> + * directly (equivalent to auth always succeeding).
> + */
> +
> +static bool trans_LDRA(DisasContext *ctx, arg_ldra *a)
> +{
> +    int64_t offset = (int64_t)a->imm << 3;  /* S:imm9, scaled by 8 */
> +    uint64_t base = base_read(ctx, a->rn);
> +    uint64_t va = base + offset;  /* auth not emulated */
> +    uint8_t buf[8];
> +
> +    if (mem_read(ctx, va, buf, 8) != 0) {
> +        return true;
> +    }
> +
> +    gpr_write(ctx, a->rt, mem_ld(ctx, buf, 8));
> +
> +    if (a->w) {
> +        base_write(ctx, a->rn, va);
> +    }
> +    return true;
> +}
> +
> /* PRFM, DC cache maintenance -- treated as NOP */
> static bool trans_NOP(DisasContext *ctx, arg_NOP *a)
> {
> -- 
> 2.52.0
> 
>