[RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions

Ayan Kumar Halder posted 1 patch 2 years, 5 months ago
Test gitlab-ci passed
Patches applied successfully (tree, apply log)
git fetch https://gitlab.com/xen-project/patchew/xen tags/patchew/20211119165202.42442-1-ayankuma@xilinx.com
xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
xen/arch/arm/io.c     | 14 ++++++--
2 files changed, 88 insertions(+), 3 deletions(-)
[RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Ayan Kumar Halder 2 years, 5 months ago
At present, post indexing instructions are not emulated by Xen.
When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
result, data abort is triggered.

Added the logic to decode ldr/str post indexing instructions.
With this, Xen can decode instructions like these:-
ldr w2, [x1], #4
Thus, domU can read ioreg with post indexing instructions.

Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
---
Note to reviewer:-
This patch is based on an issue discussed in 
https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
"Xen/ARM - Query about a data abort seen while reading GICD registers"


 xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
 xen/arch/arm/io.c     | 14 ++++++--
 2 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
index 792c2e92a7..7b60bedbc5 100644
--- a/xen/arch/arm/decode.c
+++ b/xen/arch/arm/decode.c
@@ -84,6 +84,80 @@ bad_thumb2:
     return 1;
 }
 
+static inline int32_t extract32(uint32_t value, int start, int length)
+{
+    int32_t ret;
+
+    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
+        return -EINVAL;
+
+    ret = (value >> start) & (~0U >> (32 - length));
+
+    return ret;
+}
+
+static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
+{
+    uint32_t instr;
+    int size;
+    int v;
+    int opc;
+    int rt;
+    int imm9;
+
+    /* For details on decoding, refer to Armv8 Architecture reference manual
+     * Section - "Load/store register (immediate post-indexed)", Pg 318
+    */
+    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
+        return -EFAULT;
+
+    /* First, let's check for the fixed values */
+
+    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
+     * op4 = 1 - Load/store register (immediate post-indexed)
+     */
+    if ( extract32(instr, 10, 2) != 1 )
+        goto bad_64bit_loadstore;
+
+    /* For the following, refer to "Load/store register (immediate post-indexed)"
+     * to get the fixed values at various bit positions.
+     */
+    if ( extract32(instr, 21, 1) != 0 )
+        goto bad_64bit_loadstore;
+
+    if ( extract32(instr, 24, 2) != 0 )
+        goto bad_64bit_loadstore;
+
+    if ( extract32(instr, 27, 3) != 7 )
+        goto bad_64bit_loadstore;
+
+    size = extract32(instr, 30, 2);
+    v = extract32(instr, 26, 1);
+    opc = extract32(instr, 22, 1);
+
+    /* At the moment, we support STR(immediate) - 32 bit variant and
+     * LDR(immediate) - 32 bit variant only.
+     */
+    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
+        goto bad_64bit_loadstore;
+
+    rt = extract32(instr, 0, 5);
+    imm9 = extract32(instr, 12, 9);
+
+    if ( imm9 < 0 )
+        update_dabt(dabt, rt, size, true);
+    else
+        update_dabt(dabt, rt, size, false);
+
+    dabt->valid = 1;
+
+
+    return 0;
+bad_64bit_loadstore:
+    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
+    return 1;
+}
+
 static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
 {
     uint16_t instr;
@@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
     if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
         return decode_thumb(regs->pc, dabt);
 
+    if ( is_64bit_domain(current->domain) )
+        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
+
     /* TODO: Handle ARM instruction */
     gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
 
diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 729287e37c..49e80358c0 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
         .gpa = gpa,
         .dabt = dabt
     };
+    int rc;
 
     ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
 
     handler = find_mmio_handler(v->domain, info.gpa);
     if ( !handler )
     {
-        int rc;
-
         rc = try_fwd_ioserv(regs, v, &info);
         if ( rc == IO_HANDLED )
             return handle_ioserv(regs, v);
@@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
 
     /* All the instructions used on emulated MMIO region should be valid */
     if ( !dabt.valid )
-        return IO_ABORT;
+    {
+        /*
+         * Post indexing ldr/str instructions are not emulated by Xen. So, the
+         * ISS is invalid. In such a scenario, we try to manually decode the
+         * instruction from the program counter.
+         */
+        rc = decode_instruction(regs, &info.dabt);
+        if ( rc )
+            return IO_ABORT;
+    }
 
     /*
      * Erratum 766422: Thumb store translation fault to Hypervisor may
-- 
2.17.1


Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Julien Grall 2 years, 5 months ago
Hi Ayan,

On 19/11/2021 16:52, Ayan Kumar Halder wrote:
> At present, post indexing instructions are not emulated by Xen.

Can you explain in the commit message why this is a problem?

> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> result, data abort is triggered.
> 
> Added the logic to decode ldr/str post indexing instructions.
> With this, Xen can decode instructions like these:-
> ldr w2, [x1], #4
> Thus, domU can read ioreg with post indexing instructions.
Hmmm.... Don't you also need to update the register x1 after the 
instruction was emulated?

> 
> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
> ---
> Note to reviewer:-
> This patch is based on an issue discussed in
> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> 
> 
>   xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>   xen/arch/arm/io.c     | 14 ++++++--
>   2 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> index 792c2e92a7..7b60bedbc5 100644
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>       return 1;
>   }
>   
> +static inline int32_t extract32(uint32_t value, int start, int length)

Any reason to have start and length signed?

> +{
> +    int32_t ret;
> +
> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +        return -EINVAL;
> +
> +    ret = (value >> start) & (~0U >> (32 - length));

This would be easier to read if you use GENMASK().

> +
> +    return ret;
> +}
> +
> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
> +{
> +    uint32_t instr;
> +    int size;
> +    int v;
> +    int opc;
> +    int rt;
> +    int imm9;

Should all those variables need to be signed?

> +
> +    /* For details on decoding, refer to Armv8 Architecture reference manual
> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
The page number varies between revision of the Armv8 spec. So can you 
specify which version you used?

> +    */

The coding style for comment in Xen is:

/*
  * Foo
  * Bar
  */

> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
> +        return -EFAULT;
> +
> +    /* First, let's check for the fixed values */
> +
> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299

Same question here about the version.

> +     * op4 = 1 - Load/store register (immediate post-indexed)
> +     */

Coding style.

> +    if ( extract32(instr, 10, 2) != 1 )
> +        goto bad_64bit_loadstore;
> +
> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
> +     * to get the fixed values at various bit positions.
> +     */
> +    if ( extract32(instr, 21, 1) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 24, 2) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 27, 3) != 7 )
> +        goto bad_64bit_loadstore;
> +
> +    size = extract32(instr, 30, 2);
> +    v = extract32(instr, 26, 1);
> +    opc = extract32(instr, 22, 1);
> +
> +    /* At the moment, we support STR(immediate) - 32 bit variant and
> +     * LDR(immediate) - 32 bit variant only.
> +     */

Coding style.

> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>

The coding style for this should be:

if ( !(( size == 2 ) && ( ... ) ... ) )

  +        goto bad_64bit_loadstore;
> +
> +    rt = extract32(instr, 0, 5);
> +    imm9 = extract32(instr, 12, 9);
> +
> +    if ( imm9 < 0 )
> +        update_dabt(dabt, rt, size, true);
> +    else
> +        update_dabt(dabt, rt, size, false);

This could be simplified with:

update_dabt(dabt, rt, size, imm9 < 0);

> +
> +    dabt->valid = 1;
> +
> +
> +    return 0;
> +bad_64bit_loadstore:
> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
> +    return 1;
> +}
> +
>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>   {
>       uint16_t instr;
> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
>       if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>           return decode_thumb(regs->pc, dabt);
>   
> +    if ( is_64bit_domain(current->domain) )

You can still run 32-bit code in 64-bit domain. So I think you want to 
check the SPSR mode.

> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
> +
>       /* TODO: Handle ARM instruction */
>       gprintk(XENLOG_ERR, "unhandled ARM instruction\n");

I think this comment should now be updated to "unhandled 32-bit ...".

>   
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c..49e80358c0 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>           .gpa = gpa,
>           .dabt = dabt
>       };
> +    int rc;
>   
>       ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>   
>       handler = find_mmio_handler(v->domain, info.gpa);
>       if ( !handler )
>       {
> -        int rc;
> -
>           rc = try_fwd_ioserv(regs, v, &info);
>           if ( rc == IO_HANDLED )
>               return handle_ioserv(regs, v);
> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>   
>       /* All the instructions used on emulated MMIO region should be valid */
>       if ( !dabt.valid )
> -        return IO_ABORT;
> +    {
> +        /*
> +         * Post indexing ldr/str instructions are not emulated by Xen. So, the
> +         * ISS is invalid. In such a scenario, we try to manually decode the
> +         * instruction from the program counter.

I am afraid this is wrong. The problem here is the processor didn't 
provide a valid syndrom for post-indexing ldr/str instructions. So in 
order to support them, Xen must decode.

> +         */
> +        rc = decode_instruction(regs, &info.dabt);

I actually expect this to also be useful when forwarding the I/O to 
device-model. So I would move the decoding earlier in the code and the 
check of dabt.valid earlier.

> +        if ( rc )
> +            return IO_ABORT;
> +    }
>   
>       /*
>        * Erratum 766422: Thumb store translation fault to Hypervisor may
> 

Cheers,

-- 
Julien Grall

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Ayan Kumar Halder 2 years, 5 months ago
Hi Julien/Stefano/Wei/Jan,

Many thanks for your review.

As some of the comments were inter-related, I have consolidated my 
response to all the comments below.

On 19/11/2021 17:26, Julien Grall wrote:
> Hi Ayan,
> 
> On 19/11/2021 16:52, Ayan Kumar Halder wrote:
>> At present, post indexing instructions are not emulated by Xen.
> 
> Can you explain in the commit message why this is a problem?

Yes, I will update the message as below :-
Armv8 hardware does not provide the correct syndrome for decoding post 
indexing ldr/str instructions. Thus any post indexing ldr/str 
instruction at EL1 results in a data abort with ISV=0. As a result, Xen 
needs to decode the instruction.

Thus, DomU will be able to read/write to ioreg space with post indexing 
instructions for 32 bit.

> 
>> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
>> result, data abort is triggered.
>>
>> Added the logic to decode ldr/str post indexing instructions.
>> With this, Xen can decode instructions like these:-
>> ldr w2, [x1], #4
>> Thus, domU can read ioreg with post indexing instructions.
> Hmmm.... Don't you also need to update the register x1 after the 
> instruction was emulated?

Yes, this is a mistake. X1 needs to be incremented by 4 after W2 is written.
> 
>>
>> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
>> ---
>> Note to reviewer:-
>> This patch is based on an issue discussed in
>> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html 
>>
>> "Xen/ARM - Query about a data abort seen while reading GICD registers"
>>
>>
>>   xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>>   xen/arch/arm/io.c     | 14 ++++++--
>>   2 files changed, 88 insertions(+), 3 deletions(-)
>>
>> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
>> index 792c2e92a7..7b60bedbc5 100644
>> --- a/xen/arch/arm/decode.c
>> +++ b/xen/arch/arm/decode.c
>> @@ -84,6 +84,80 @@ bad_thumb2:
>>       return 1;
>>   }
>> +static inline int32_t extract32(uint32_t value, int start, int length)
> 
> Any reason to have start and length signed?

Again a mistake. There is no reason to use signed types here or in the 
other places.
As Jan Beulich has pointed out, I should be using unsigned int as per 
the CODING_STYLE.
> 
>> +{
>> +    int32_t ret;
>> +
>> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
>> +        return -EINVAL;
>> +
>> +    ret = (value >> start) & (~0U >> (32 - length));
> 
> This would be easier to read if you use GENMASK().

I see that GENMASK returns a register mask. In my scenario, I wish to 
compare the offsets 10, 21, 24 and 27 to some fixed value.

So, instead of using GENMASK four times, I can try the following
instr & MASK_for_10_21_24_27 == VALID_for_10_21_24_27 (Wei Chen's 
suggestion)


Also for size, v and opc, I can defined another bitmask to compare with 
VALID_for_32bit_LDR | VALID_for_32bit_STR.

Wei Chen, You had suggested using vreg_regx_extract(). However, I see 
that it is used to extract the complete u64 register value. In this 
case, I wish to compare certain offsets within a 32 bit register to some 
expected values. So, vreg_regx_extract() might not be appropriate and we 
can use the method mentioned before.

> 
>> +
>> +    return ret;
>> +}
>> +
>> +static int decode_64bit_loadstore_postindexing(register_t pc, struct 
>> hsr_dabt *dabt)
>> +{
>> +    uint32_t instr;
>> +    int size;
>> +    int v;
>> +    int opc;
>> +    int rt;
>> +    int imm9;
> 
> Should all those variables need to be signed?

A mistake. I will change them to unsigned int.
> 
>> +
>> +    /* For details on decoding, refer to Armv8 Architecture reference 
>> manual
>> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
> The page number varies between revision of the Armv8 spec. So can you 
> specify which version you used?

Ack. I will mention the version.
> 
>> +    */
> 
> The coding style for comment in Xen is:
> 
> /*
>   * Foo
>   * Bar
>   */
Ack
> 
>> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof 
>> (instr)) )
>> +        return -EFAULT;
>> +
>> +    /* First, let's check for the fixed values */
>> +
>> +    /*  As per the "Encoding table for the Loads and Stores group", 
>> Pg 299
> 
> Same question here about the version.
Ack, I will mention the version.

> 
>> +     * op4 = 1 - Load/store register (immediate post-indexed)
>> +     */
> 
> Coding style.
Ack

> 
>> +    if ( extract32(instr, 10, 2) != 1 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    /* For the following, refer to "Load/store register (immediate 
>> post-indexed)"
>> +     * to get the fixed values at various bit positions.
>> +     */
>> +    if ( extract32(instr, 21, 1) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 24, 2) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 27, 3) != 7 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    size = extract32(instr, 30, 2);
>> +    v = extract32(instr, 26, 1);
>> +    opc = extract32(instr, 22, 1);

Stefano:- Thanks for catching my mistake. opc is 2 bits (bits 22, 23). I 
will fix this.

>> +
>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>> +     * LDR(immediate) - 32 bit variant only.
>> +     */
> 
> Coding style.
Ack

> 
>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>>
> 
> The coding style for this should be:
> 
> if ( !(( size == 2 ) && ( ... ) ... ) )
Ack

> 
>   +        goto bad_64bit_loadstore;
>> +
>> +    rt = extract32(instr, 0, 5);
>> +    imm9 = extract32(instr, 12, 9);
>> +
>> +    if ( imm9 < 0 )
>> +        update_dabt(dabt, rt, size, true);
>> +    else
>> +        update_dabt(dabt, rt, size, false);
> 
> This could be simplified with:
> 
> update_dabt(dabt, rt, size, imm9 < 0);
Ack

> 
>> +
>> +    dabt->valid = 1;
>> +
>> +
>> +    return 0;
>> +bad_64bit_loadstore:
>> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
>> +    return 1;
>> +}
>> +
>>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>>   {
>>       uint16_t instr;
>> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs 
>> *regs, struct hsr_dabt *dabt)
>>       if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>>           return decode_thumb(regs->pc, dabt);
>> +    if ( is_64bit_domain(current->domain) )
> 
> You can still run 32-bit code in 64-bit domain. So I think you want to 
> check the SPSR mode.

Do you mean the following check :-
if ( (is_64bit_domain(current->domain) && (!(regs->spsr & PSR_MODE_BIT)) )

> 
>> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
>> +
>>       /* TODO: Handle ARM instruction */
>>       gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
> 
> I think this comment should now be updated to "unhandled 32-bit ...".

Ack
> 
>> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
>> index 729287e37c..49e80358c0 100644
>> --- a/xen/arch/arm/io.c
>> +++ b/xen/arch/arm/io.c
>> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct 
>> cpu_user_regs *regs,
>>           .gpa = gpa,
>>           .dabt = dabt
>>       };
>> +    int rc;
>>       ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>>       handler = find_mmio_handler(v->domain, info.gpa);
>>       if ( !handler )
>>       {
>> -        int rc;
>> -
>>           rc = try_fwd_ioserv(regs, v, &info);
>>           if ( rc == IO_HANDLED )
>>               return handle_ioserv(regs, v);
>> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct 
>> cpu_user_regs *regs,
>>       /* All the instructions used on emulated MMIO region should be 
>> valid */
>>       if ( !dabt.valid )
>> -        return IO_ABORT;
>> +    {
>> +        /*
>> +         * Post indexing ldr/str instructions are not emulated by 
>> Xen. So, the
>> +         * ISS is invalid. In such a scenario, we try to manually 
>> decode the
>> +         * instruction from the program counter.
> 
> I am afraid this is wrong. The problem here is the processor didn't 
> provide a valid syndrom for post-indexing ldr/str instructions. So in 
> order to support them, Xen must decode.

Ack
> 
>> +         */
>> +        rc = decode_instruction(regs, &info.dabt);
> 
> I actually expect this to also be useful when forwarding the I/O to 
> device-model. So I would move the decoding earlier in the code and the 
> check of dabt.valid earlier.

You mean I will move decode_instruction() before find_mmio_handler() ?

Stefano > It doesn't look like we are setting dabt->write anywhere.

Ayan > Yes, this is a miss. Depending on the opc, this should be 
upadeted accordingly in decode_64bit_loadstore_postindexing().

Stefano > Also, is info.gpa in try_handle_mmio already updated in the 
pre-index
case? If not, do we need to apply the offset manually here?

Ayan > Sorry, I did not understand you. This patch is only related to 
the post indexing ldr/str issue. Why do we need to check for pre-indexing ?

Stefano > In the post-index case, we need to update the base address in 
the Rn
register?

Ayan > Yes this is a mistake, As Julien pointed out before, the register 
x1 ie Rn needs to the incremented.

Jan > In addition to Julien's comment regarding the function parameters 
- why
is the return type int32_t and not uint32_t? Plus as per ./CODING_STYLE
it really shouldn't be a fixed width type anyway, but e.g. unsigned int.

Ayan > Yes, this is a mistake. I will update int32_t to unsigned int in 
all the places.
However for extract32(), I don't think we need this function. Rather 
Wei's suggestion (ie instr & MASK_for_10_21_24_27 == 
VALID_for_10_21_24_27 ) makes the code simpler and shorter.

- Ayan

> 
>> +        if ( rc )
>> +            return IO_ABORT;
>> +    }
>>       /*
>>        * Erratum 766422: Thumb store translation fault to Hypervisor may
>>
> 
> Cheers,
> 

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Bertrand Marquis 2 years, 5 months ago
Hi Ayan

> On 22 Nov 2021, at 14:19, Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:
> 
> Hi Julien/Stefano/Wei/Jan,
> 
> Many thanks for your review.
> 
> As some of the comments were inter-related, I have consolidated my response to all the comments below.
> 
> On 19/11/2021 17:26, Julien Grall wrote:
>> Hi Ayan,
>> On 19/11/2021 16:52, Ayan Kumar Halder wrote:
>>> At present, post indexing instructions are not emulated by Xen.
>> Can you explain in the commit message why this is a problem?
> 
> Yes, I will update the message as below :-
> Armv8 hardware does not provide the correct syndrome for decoding post indexing ldr/str instructions. Thus any post indexing ldr/str instruction at EL1 results in a data abort with ISV=0. As a result, Xen needs to decode the instruction.
> 
> Thus, DomU will be able to read/write to ioreg space with post indexing instructions for 32 bit.
> 
>>> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
>>> result, data abort is triggered.
>>> 
>>> Added the logic to decode ldr/str post indexing instructions.
>>> With this, Xen can decode instructions like these:-
>>> ldr w2, [x1], #4
>>> Thus, domU can read ioreg with post indexing instructions.
>> Hmmm.... Don't you also need to update the register x1 after the instruction was emulated?
> 
> Yes, this is a mistake. X1 needs to be incremented by 4 after W2 is written.
>>> 
>>> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
>>> ---
>>> Note to reviewer:-
>>> This patch is based on an issue discussed in
>>> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html 
>>> "Xen/ARM - Query about a data abort seen while reading GICD registers"
>>> 
>>> 
>>>   xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>>>   xen/arch/arm/io.c     | 14 ++++++--
>>>   2 files changed, 88 insertions(+), 3 deletions(-)
>>> 
>>> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
>>> index 792c2e92a7..7b60bedbc5 100644
>>> --- a/xen/arch/arm/decode.c
>>> +++ b/xen/arch/arm/decode.c
>>> @@ -84,6 +84,80 @@ bad_thumb2:
>>>       return 1;
>>>   }
>>> +static inline int32_t extract32(uint32_t value, int start, int length)
>> Any reason to have start and length signed?
> 
> Again a mistake. There is no reason to use signed types here or in the other places.
> As Jan Beulich has pointed out, I should be using unsigned int as per the CODING_STYLE.
>>> +{
>>> +    int32_t ret;
>>> +
>>> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
>>> +        return -EINVAL;
>>> +
>>> +    ret = (value >> start) & (~0U >> (32 - length));
>> This would be easier to read if you use GENMASK().
> 
> I see that GENMASK returns a register mask. In my scenario, I wish to compare the offsets 10, 21, 24 and 27 to some fixed value.
> 
> So, instead of using GENMASK four times, I can try the following
> instr & MASK_for_10_21_24_27 == VALID_for_10_21_24_27 (Wei Chen's suggestion)
> 
> 
> Also for size, v and opc, I can defined another bitmask to compare with VALID_for_32bit_LDR | VALID_for_32bit_STR.
> 
> Wei Chen, You had suggested using vreg_regx_extract(). However, I see that it is used to extract the complete u64 register value. In this case, I wish to compare certain offsets within a 32 bit register to some expected values. So, vreg_regx_extract() might not be appropriate and we can use the method mentioned before.
> 
>>> +
>>> +    return ret;
>>> +}
>>> +
>>> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
>>> +{
>>> +    uint32_t instr;
>>> +    int size;
>>> +    int v;
>>> +    int opc;
>>> +    int rt;
>>> +    int imm9;
>> Should all those variables need to be signed?
> 
> A mistake. I will change them to unsigned int.
>>> +
>>> +    /* For details on decoding, refer to Armv8 Architecture reference manual
>>> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
>> The page number varies between revision of the Armv8 spec. So can you specify which version you used?
> 
> Ack. I will mention the version.
>>> +    */
>> The coding style for comment in Xen is:
>> /*
>>  * Foo
>>  * Bar
>>  */
> Ack
>>> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
>>> +        return -EFAULT;
>>> +
>>> +    /* First, let's check for the fixed values */
>>> +
>>> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
>> Same question here about the version.
> Ack, I will mention the version.
> 
>>> +     * op4 = 1 - Load/store register (immediate post-indexed)
>>> +     */
>> Coding style.
> Ack
> 
>>> +    if ( extract32(instr, 10, 2) != 1 )
>>> +        goto bad_64bit_loadstore;
>>> +
>>> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
>>> +     * to get the fixed values at various bit positions.
>>> +     */
>>> +    if ( extract32(instr, 21, 1) != 0 )
>>> +        goto bad_64bit_loadstore;
>>> +
>>> +    if ( extract32(instr, 24, 2) != 0 )
>>> +        goto bad_64bit_loadstore;
>>> +
>>> +    if ( extract32(instr, 27, 3) != 7 )
>>> +        goto bad_64bit_loadstore;
>>> +
>>> +    size = extract32(instr, 30, 2);
>>> +    v = extract32(instr, 26, 1);
>>> +    opc = extract32(instr, 22, 1);
> 
> Stefano:- Thanks for catching my mistake. opc is 2 bits (bits 22, 23). I will fix this.
> 
>>> +
>>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>>> +     * LDR(immediate) - 32 bit variant only.
>>> +     */
>> Coding style.
> Ack
> 
>>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>>> 
>> The coding style for this should be:
>> if ( !(( size == 2 ) && ( ... ) ... ) )
> Ack
> 
>>  +        goto bad_64bit_loadstore;
>>> +
>>> +    rt = extract32(instr, 0, 5);
>>> +    imm9 = extract32(instr, 12, 9);
>>> +
>>> +    if ( imm9 < 0 )
>>> +        update_dabt(dabt, rt, size, true);
>>> +    else
>>> +        update_dabt(dabt, rt, size, false);
>> This could be simplified with:
>> update_dabt(dabt, rt, size, imm9 < 0);
> Ack
> 
>>> +
>>> +    dabt->valid = 1;
>>> +
>>> +
>>> +    return 0;
>>> +bad_64bit_loadstore:
>>> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
>>> +    return 1;
>>> +}
>>> +
>>>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>>>   {
>>>       uint16_t instr;
>>> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
>>>       if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>>>           return decode_thumb(regs->pc, dabt);
>>> +    if ( is_64bit_domain(current->domain) )
>> You can still run 32-bit code in 64-bit domain. So I think you want to check the SPSR mode.
> 
> Do you mean the following check :-
> if ( (is_64bit_domain(current->domain) && (!(regs->spsr & PSR_MODE_BIT)) )
> 
>>> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
>>> +
>>>       /* TODO: Handle ARM instruction */
>>>       gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
>> I think this comment should now be updated to "unhandled 32-bit ...".
> 
> Ack
>>> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
>>> index 729287e37c..49e80358c0 100644
>>> --- a/xen/arch/arm/io.c
>>> +++ b/xen/arch/arm/io.c
>>> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>>>           .gpa = gpa,
>>>           .dabt = dabt
>>>       };
>>> +    int rc;
>>>       ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>>>       handler = find_mmio_handler(v->domain, info.gpa);
>>>       if ( !handler )
>>>       {
>>> -        int rc;
>>> -
>>>           rc = try_fwd_ioserv(regs, v, &info);
>>>           if ( rc == IO_HANDLED )
>>>               return handle_ioserv(regs, v);
>>> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>>>       /* All the instructions used on emulated MMIO region should be valid */
>>>       if ( !dabt.valid )
>>> -        return IO_ABORT;
>>> +    {
>>> +        /*
>>> +         * Post indexing ldr/str instructions are not emulated by Xen. So, the
>>> +         * ISS is invalid. In such a scenario, we try to manually decode the
>>> +         * instruction from the program counter.
>> I am afraid this is wrong. The problem here is the processor didn't provide a valid syndrom for post-indexing ldr/str instructions. So in order to support them, Xen must decode.
> 
> Ack
>>> +         */
>>> +        rc = decode_instruction(regs, &info.dabt);
>> I actually expect this to also be useful when forwarding the I/O to device-model. So I would move the decoding earlier in the code and the check of dabt.valid earlier.
> 
> You mean I will move decode_instruction() before find_mmio_handler() ?
> 
> Stefano > It doesn't look like we are setting dabt->write anywhere.
> 
> Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted accordingly in decode_64bit_loadstore_postindexing().
> 
> Stefano > Also, is info.gpa in try_handle_mmio already updated in the pre-index
> case? If not, do we need to apply the offset manually here?
> 
> Ayan > Sorry, I did not understand you. This patch is only related to the post indexing ldr/str issue. Why do we need to check for pre-indexing ?
> 
> Stefano > In the post-index case, we need to update the base address in the Rn
> register?
> 
> Ayan > Yes this is a mistake, As Julien pointed out before, the register x1 ie Rn needs to the incremented.
> 
> Jan > In addition to Julien's comment regarding the function parameters - why
> is the return type int32_t and not uint32_t? Plus as per ./CODING_STYLE
> it really shouldn't be a fixed width type anyway, but e.g. unsigned int.
> 
> Ayan > Yes, this is a mistake. I will update int32_t to unsigned int in all the places.
> However for extract32(), I don't think we need this function. Rather Wei's suggestion (ie instr & MASK_for_10_21_24_27 == VALID_for_10_21_24_27 ) makes the code simpler and shorter.

In fact you could also use a union and define in a structure what the bits are.

Union instr {
	uint32_t value
	struct {
            ….
            Unsigned int size:2;
        }
}

This could simplify some of your code.

Cheers
Bertrand


> 
> - Ayan
> 
>>> +        if ( rc )
>>> +            return IO_ABORT;
>>> +    }
>>>       /*
>>>        * Erratum 766422: Thumb store translation fault to Hypervisor may
>>> 
>> Cheers,

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Stefano Stabellini 2 years, 5 months ago
On Mon, 22 Nov 2021, Ayan Kumar Halder wrote:
> Stefano > It doesn't look like we are setting dabt->write anywhere.
> 
> Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted
> accordingly in decode_64bit_loadstore_postindexing().
> 
> Stefano > Also, is info.gpa in try_handle_mmio already updated in the
> pre-index
> case? If not, do we need to apply the offset manually here?
> 
> Ayan > Sorry, I did not understand you. This patch is only related to the post
> indexing ldr/str issue. Why do we need to check for pre-indexing ?

I thought you were trying to handle both post-indexing and pre-indexing.
It is OK if you intend to only handle post-indexing but considering that
most of the code is shared between the two, we might as well also make
pre-indexing work (unless it turns out it is more difficult).

In the pre-indexing case, I would imagine we need to update the base
address before taking any other actions.

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Julien Grall 2 years, 5 months ago
Hi,

On Mon, 22 Nov 2021 at 19:59, Stefano Stabellini <sstabellini@kernel.org> wrote:
>
> On Mon, 22 Nov 2021, Ayan Kumar Halder wrote:
> > Stefano > It doesn't look like we are setting dabt->write anywhere.
> >
> > Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted
> > accordingly in decode_64bit_loadstore_postindexing().
> >
> > Stefano > Also, is info.gpa in try_handle_mmio already updated in the
> > pre-index
> > case? If not, do we need to apply the offset manually here?
> >
> > Ayan > Sorry, I did not understand you. This patch is only related to the post
> > indexing ldr/str issue. Why do we need to check for pre-indexing ?
>
> I thought you were trying to handle both post-indexing and pre-indexing.
> It is OK if you intend to only handle post-indexing but considering that
> most of the code is shared between the two, we might as well also make
> pre-indexing work (unless it turns out it is more difficult).

Wouldn't this effectively be dead code?

>
> In the pre-indexing case, I would imagine we need to update the base
> address before taking any other actions.

From my understanding, this would have already been performed by the
HW when the syndrome is valid. This may also be the case for
the non-valid case, but I haven't checked the Arm Arm.

Cheers,

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Stefano Stabellini 2 years, 5 months ago
On Mon, 22 Nov 2021, Julien Grall wrote:
> On Mon, 22 Nov 2021 at 19:59, Stefano Stabellini <sstabellini@kernel.org> wrote:
> >
> > On Mon, 22 Nov 2021, Ayan Kumar Halder wrote:
> > > Stefano > It doesn't look like we are setting dabt->write anywhere.
> > >
> > > Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted
> > > accordingly in decode_64bit_loadstore_postindexing().
> > >
> > > Stefano > Also, is info.gpa in try_handle_mmio already updated in the
> > > pre-index
> > > case? If not, do we need to apply the offset manually here?
> > >
> > > Ayan > Sorry, I did not understand you. This patch is only related to the post
> > > indexing ldr/str issue. Why do we need to check for pre-indexing ?
> >
> > I thought you were trying to handle both post-indexing and pre-indexing.
> > It is OK if you intend to only handle post-indexing but considering that
> > most of the code is shared between the two, we might as well also make
> > pre-indexing work (unless it turns out it is more difficult).
> 
> Wouldn't this effectively be dead code?
> 
> >
> > In the pre-indexing case, I would imagine we need to update the base
> > address before taking any other actions.
> 
> >From my understanding, this would have already been performed by the
> HW when the syndrome is valid. This may also be the case for
> the non-valid case, but I haven't checked the Arm Arm.

It is not clear to me either, that's why I wrote "I would imagine"... I
was guessing that it is not done by the HW in the non-valid case but I
don't know.

Of course, if it is already done by the HW, that's all the better: no
need for us to do anything.

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Bertrand Marquis 2 years, 5 months ago
Hi Stefano,

> On 23 Nov 2021, at 04:13, Stefano Stabellini <sstabellini@kernel.org> wrote:
> 
> On Mon, 22 Nov 2021, Julien Grall wrote:
>> On Mon, 22 Nov 2021 at 19:59, Stefano Stabellini <sstabellini@kernel.org> wrote:
>>> 
>>> On Mon, 22 Nov 2021, Ayan Kumar Halder wrote:
>>>> Stefano > It doesn't look like we are setting dabt->write anywhere.
>>>> 
>>>> Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted
>>>> accordingly in decode_64bit_loadstore_postindexing().
>>>> 
>>>> Stefano > Also, is info.gpa in try_handle_mmio already updated in the
>>>> pre-index
>>>> case? If not, do we need to apply the offset manually here?
>>>> 
>>>> Ayan > Sorry, I did not understand you. This patch is only related to the post
>>>> indexing ldr/str issue. Why do we need to check for pre-indexing ?
>>> 
>>> I thought you were trying to handle both post-indexing and pre-indexing.
>>> It is OK if you intend to only handle post-indexing but considering that
>>> most of the code is shared between the two, we might as well also make
>>> pre-indexing work (unless it turns out it is more difficult).
>> 
>> Wouldn't this effectively be dead code?

I agree this would be dead code. Pre-indexing is handled by the hardware, only post are not.

>> 
>>> 
>>> In the pre-indexing case, I would imagine we need to update the base
>>> address before taking any other actions.
>> 
>>> From my understanding, this would have already been performed by the
>> HW when the syndrome is valid. This may also be the case for
>> the non-valid case, but I haven't checked the Arm Arm.
> 
> It is not clear to me either, that's why I wrote "I would imagine"... I
> was guessing that it is not done by the HW in the non-valid case but I
> don't know.

This should be handled by the hardware here, so only post actions should
be handled here.

> 
> Of course, if it is already done by the HW, that's all the better: no
> need for us to do anything.

I am wondering though if other types of accesses could not be handled here
without major modification of the code like other sizes then 32bit.

There are also post instructions with shifting but to be honest I do not think this is really needed.

Regards
Bertrand




Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Bertrand Marquis 2 years, 5 months ago
Hi,

> On 23 Nov 2021, at 08:53, Bertrand Marquis <bertrand.marquis@arm.com> wrote:
> 
> Hi Stefano,
> 
>> On 23 Nov 2021, at 04:13, Stefano Stabellini <sstabellini@kernel.org> wrote:
>> 
>> On Mon, 22 Nov 2021, Julien Grall wrote:
>>> On Mon, 22 Nov 2021 at 19:59, Stefano Stabellini <sstabellini@kernel.org> wrote:
>>>> 
>>>> On Mon, 22 Nov 2021, Ayan Kumar Halder wrote:
>>>>> Stefano > It doesn't look like we are setting dabt->write anywhere.
>>>>> 
>>>>> Ayan > Yes, this is a miss. Depending on the opc, this should be upadeted
>>>>> accordingly in decode_64bit_loadstore_postindexing().
>>>>> 
>>>>> Stefano > Also, is info.gpa in try_handle_mmio already updated in the
>>>>> pre-index
>>>>> case? If not, do we need to apply the offset manually here?
>>>>> 
>>>>> Ayan > Sorry, I did not understand you. This patch is only related to the post
>>>>> indexing ldr/str issue. Why do we need to check for pre-indexing ?
>>>> 
>>>> I thought you were trying to handle both post-indexing and pre-indexing.
>>>> It is OK if you intend to only handle post-indexing but considering that
>>>> most of the code is shared between the two, we might as well also make
>>>> pre-indexing work (unless it turns out it is more difficult).
>>> 
>>> Wouldn't this effectively be dead code?
> 
> I agree this would be dead code. Pre-indexing is handled by the hardware, only post are not.
> 
>>> 
>>>> 
>>>> In the pre-indexing case, I would imagine we need to update the base
>>>> address before taking any other actions.
>>> 
>>>> From my understanding, this would have already been performed by the
>>> HW when the syndrome is valid. This may also be the case for
>>> the non-valid case, but I haven't checked the Arm Arm.
>> 
>> It is not clear to me either, that's why I wrote "I would imagine"... I
>> was guessing that it is not done by the HW in the non-valid case but I
>> don't know.
> 
> This should be handled by the hardware here, so only post actions should
> be handled here.
> 
>> 
>> Of course, if it is already done by the HW, that's all the better: no
>> need for us to do anything.
> 
> I am wondering though if other types of accesses could not be handled here
> without major modification of the code like other sizes then 32bit.

I did some checks and I think the following cases could be handled:
    ldr x2, [x1], #4
    nop
    ldr w2, [x1], #-4
    nop
    ldrh w2, [x1], #8
    nop
    ldrb w2, [x1], #16
    nop
    str x2, [x1], #4
    nop
    str w2, [x1], #-4
    nop
    strh w2, [x1], #8
    nop
    strb w2, [x1], #16
    nop

Anything that I could have missed ?

> 
> There are also post instructions with shifting but to be honest I do not think this is really needed.

Please ignore this, there is no post shifting.

Once this is done I can test and add a test to XTF on arm (on our side, upstreaming of this is in progress) to make sure this is maintained.

Regards
Bertrand


Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Julien Grall 2 years, 5 months ago

On 22/11/2021 14:19, Ayan Kumar Halder wrote:
> Hi Julien/Stefano/Wei/Jan,

Hi,

> As some of the comments were inter-related, I have consolidated my 
> response to all the comments below.
> 
> On 19/11/2021 17:26, Julien Grall wrote:
>> Hi Ayan,
>>
>> On 19/11/2021 16:52, Ayan Kumar Halder wrote:
>>> At present, post indexing instructions are not emulated by Xen.
>>
>> Can you explain in the commit message why this is a problem?
> 
> Yes, I will update the message as below :-
> Armv8 hardware does not provide the correct syndrome for decoding post 
> indexing ldr/str instructions. 

This statement implies that the issue happens on both AArch32 and 
AArch64 state. I am OK if we only handle the latter for now. But I would 
clarify it in the commit message.

> Thus any post indexing ldr/str 
> instruction at EL1 results in a data abort with ISV=0. 

Instruction from EL0 may also trap in Xen. So I would prefer if you just 
say "instruction executed by a domain results ...".

> As a result, Xen 
> needs to decode the instruction.

Can you give some information on the domain used. Something like:

"this was discovered with XXX which let the compiler deciding which 
instruction to use."

> 
> Thus, DomU will be able to read/write to ioreg space with post indexing 

I would say "domain" rather "domU" because all the domains are affected.

> instructions for 32 bit.

How about the following commit message:

"
xen/arm64: io: Decode 32-bit ldr/str post-indexing instructions

At the moment, Xen is only handling data abort with valid syndrome (i.e.
ISV=0). Unfortunately, this doesn't cover all the instructions a domain
could use to access MMIO regions.

For instance, Xilinx baremetal OS will use:

         volatile u32 *LocalAddr = (volatile u32 *)Addr;
         *LocalAddr = Value;

This leave the compiler to decide which store instructions to use. This
may be a post-index store instruction where the HW will not provide a
valid syndrome.

In order to handle post-indexing store/load instructions,
Xen will need to fetch and decode the instruction.

This patch only cover post-index store/load instructions from AArch64 
mode. For now, this is left unimplemented for trap from AArch32 mode.
"

>>> +{
>>> +    int32_t ret;
>>> +
>>> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
>>> +        return -EINVAL;
>>> +
>>> +    ret = (value >> start) & (~0U >> (32 - length));
>>
>> This would be easier to read if you use GENMASK().
> 
> I see that GENMASK returns a register mask. In my scenario, I wish to 
> compare the offsets 10, 21, 24 and 27 to some fixed value.
> 
> So, instead of using GENMASK four times, I can try the following
> instr & MASK_for_10_21_24_27 == VALID_for_10_21_24_27 (Wei Chen's 
> suggestion)

That would be OK with me. Alternatively, you could use the union 
approach suggested by Bertrand.

> 
> 
> Also for size, v and opc, I can defined another bitmask to compare with 
> VALID_for_32bit_LDR | VALID_for_32bit_STR.
> 
> Wei Chen, You had suggested using vreg_regx_extract(). However, I see 
> that it is used to extract the complete u64 register value. In this 
> case, I wish to compare certain offsets within a 32 bit register to some 
> expected values. So, vreg_regx_extract() might not be appropriate and we 
> can use the method mentioned before.

vreg_reg*_extract() are meant to work on a register. So I think they are 
not appropriate here as you don't deal with registers.

[...]

>>> +
>>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>>> +     * LDR(immediate) - 32 bit variant only.
>>> +     */
>>
>> Coding style.
> Ack
> 
>>
>>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>>>
>>
>> The coding style for this should be:
>>
>> if ( !(( size == 2 ) && ( ... ) ... ) )
> Ack
> 
>>
>>   +        goto bad_64bit_loadstore;
>>> +
>>> +    rt = extract32(instr, 0, 5);
>>> +    imm9 = extract32(instr, 12, 9);
>>> +
>>> +    if ( imm9 < 0 )
>>> +        update_dabt(dabt, rt, size, true);
>>> +    else
>>> +        update_dabt(dabt, rt, size, false);
>>
>> This could be simplified with:
>>
>> update_dabt(dabt, rt, size, imm9 < 0);
> Ack
> 
>>
>>> +
>>> +    dabt->valid = 1;
>>> +
>>> +
>>> +    return 0;
>>> +bad_64bit_loadstore:
>>> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
>>> +    return 1;
>>> +}
>>> +
>>>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>>>   {
>>>       uint16_t instr;
>>> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs 
>>> *regs, struct hsr_dabt *dabt)
>>>       if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>>>           return decode_thumb(regs->pc, dabt);
>>> +    if ( is_64bit_domain(current->domain) )
>>
>> You can still run 32-bit code in 64-bit domain. So I think you want to 
>> check the SPSR mode.
> 
> Do you mean the following check :-
> if ( (is_64bit_domain(current->domain) && (!(regs->spsr & PSR_MODE_BIT)) )

This should work, but I think you can simplify to use:

!psr_mode_is_32bit()

>>> +         */
>>> +        rc = decode_instruction(regs, &info.dabt);
>>
>> I actually expect this to also be useful when forwarding the I/O to 
>> device-model. So I would move the decoding earlier in the code and the 
>> check of dabt.valid earlier.
> 
> You mean I will move decode_instruction() before find_mmio_handler() ?

Yes.

Cheers,

--
Julien Grall

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Stefano Stabellini 2 years, 5 months ago
On Fri, 19 Nov 2021, Ayan Kumar Halder wrote:
> At present, post indexing instructions are not emulated by Xen.
> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> result, data abort is triggered.
> 
> Added the logic to decode ldr/str post indexing instructions.
> With this, Xen can decode instructions like these:-
> ldr w2, [x1], #4
> Thus, domU can read ioreg with post indexing instructions.
> 
> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
> ---
> Note to reviewer:-
> This patch is based on an issue discussed in 
> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> 
> 
>  xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/io.c     | 14 ++++++--
>  2 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> index 792c2e92a7..7b60bedbc5 100644
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>      return 1;
>  }
>  
> +static inline int32_t extract32(uint32_t value, int start, int length)
> +{
> +    int32_t ret;
> +
> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +        return -EINVAL;
> +
> +    ret = (value >> start) & (~0U >> (32 - length));
> +
> +    return ret;
> +}
> +
> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
> +{
> +    uint32_t instr;
> +    int size;
> +    int v;
> +    int opc;
> +    int rt;
> +    int imm9;
> +
> +    /* For details on decoding, refer to Armv8 Architecture reference manual
> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
> +    */
> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
> +        return -EFAULT;
> +
> +    /* First, let's check for the fixed values */
> +
> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
> +     * op4 = 1 - Load/store register (immediate post-indexed)
> +     */
> +    if ( extract32(instr, 10, 2) != 1 )
> +        goto bad_64bit_loadstore;
> +
> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
> +     * to get the fixed values at various bit positions.
> +     */
> +    if ( extract32(instr, 21, 1) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 24, 2) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 27, 3) != 7 )
> +        goto bad_64bit_loadstore;
> +
> +    size = extract32(instr, 30, 2);
> +    v = extract32(instr, 26, 1);
> +    opc = extract32(instr, 22, 1);
> +
> +    /* At the moment, we support STR(immediate) - 32 bit variant and
> +     * LDR(immediate) - 32 bit variant only.
> +     */
> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
> +        goto bad_64bit_loadstore;

The opc field is actually 2 bits, not 1. I think we should get both
bits for opc even if some of the configurations are not interesting.


> +    rt = extract32(instr, 0, 5);
> +    imm9 = extract32(instr, 12, 9);
> +
> +    if ( imm9 < 0 )
> +        update_dabt(dabt, rt, size, true);
> +    else
> +        update_dabt(dabt, rt, size, false);

It doesn't look like we are setting dabt->write anywhere.

Also, is info.gpa in try_handle_mmio already updated in the pre-index
case? If not, do we need to apply the offset manually here?

In the post-index case, we need to update the base address in the Rn
register?


> +    dabt->valid = 1;
> +
> +
> +    return 0;
> +bad_64bit_loadstore:
> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
> +    return 1;
> +}
> +
>  static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>  {
>      uint16_t instr;
> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
>      if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>          return decode_thumb(regs->pc, dabt);
>  
> +    if ( is_64bit_domain(current->domain) )
> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
> +
>      /* TODO: Handle ARM instruction */
>      gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
>  
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c..49e80358c0 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>          .gpa = gpa,
>          .dabt = dabt
>      };
> +    int rc;
>  
>      ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>  
>      handler = find_mmio_handler(v->domain, info.gpa);
>      if ( !handler )
>      {
> -        int rc;
> -
>          rc = try_fwd_ioserv(regs, v, &info);
>          if ( rc == IO_HANDLED )
>              return handle_ioserv(regs, v);
> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>  
>      /* All the instructions used on emulated MMIO region should be valid */
>      if ( !dabt.valid )
> -        return IO_ABORT;
> +    {
> +        /*
> +         * Post indexing ldr/str instructions are not emulated by Xen. So, the
> +         * ISS is invalid. In such a scenario, we try to manually decode the
> +         * instruction from the program counter.
> +         */
> +        rc = decode_instruction(regs, &info.dabt);
> +        if ( rc )
> +            return IO_ABORT;
> +    }
>  
>      /*
>       * Erratum 766422: Thumb store translation fault to Hypervisor may
> -- 
> 2.17.1
> 

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Jan Beulich 2 years, 5 months ago
On 20.11.2021 02:41, Stefano Stabellini wrote:
> On Fri, 19 Nov 2021, Ayan Kumar Halder wrote:
>> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
>> +{
>> +    uint32_t instr;
>> +    int size;
>> +    int v;
>> +    int opc;
>> +    int rt;
>> +    int imm9;
>> +
>> +    /* For details on decoding, refer to Armv8 Architecture reference manual
>> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
>> +    */
>> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
>> +        return -EFAULT;
>> +
>> +    /* First, let's check for the fixed values */
>> +
>> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
>> +     * op4 = 1 - Load/store register (immediate post-indexed)
>> +     */
>> +    if ( extract32(instr, 10, 2) != 1 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
>> +     * to get the fixed values at various bit positions.
>> +     */
>> +    if ( extract32(instr, 21, 1) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 24, 2) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 27, 3) != 7 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    size = extract32(instr, 30, 2);
>> +    v = extract32(instr, 26, 1);
>> +    opc = extract32(instr, 22, 1);
>> +
>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>> +     * LDR(immediate) - 32 bit variant only.
>> +     */
>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>> +        goto bad_64bit_loadstore;
> 
> The opc field is actually 2 bits, not 1. I think we should get both
> bits for opc even if some of the configurations are not interesting.

Even more so that checking the value extracted from a 1-bit field
against both 0 and 1 is pointless.

Jan


RE: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Wei Chen 2 years, 5 months ago
Hi Ayan,

> -----Original Message-----
> From: Xen-devel <xen-devel-bounces@lists.xenproject.org> On Behalf Of Ayan
> Kumar Halder
> Sent: 2021年11月20日 0:52
> To: xen-devel@lists.xenproject.org
> Cc: sstabellini@kernel.org; stefano.stabellini@xilinx.com; julien@xen.org;
> Volodymyr_Babchuk@epam.com; Bertrand Marquis <Bertrand.Marquis@arm.com>;
> Rahul Singh <Rahul.Singh@arm.com>; ayankuma@xilinx.com
> Subject: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-
> indexing instructions
> 
> At present, post indexing instructions are not emulated by Xen.
> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> result, data abort is triggered.
> 
> Added the logic to decode ldr/str post indexing instructions.
> With this, Xen can decode instructions like these:-
> ldr w2, [x1], #4
> Thus, domU can read ioreg with post indexing instructions.
> 
> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
> ---
> Note to reviewer:-
> This patch is based on an issue discussed in
> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> 
> 
>  xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/io.c     | 14 ++++++--
>  2 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> index 792c2e92a7..7b60bedbc5 100644
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>      return 1;
>  }
> 
> +static inline int32_t extract32(uint32_t value, int start, int length)
> +{
> +    int32_t ret;
> +
> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +        return -EINVAL;
> +
> +    ret = (value >> start) & (~0U >> (32 - length));
> +
> +    return ret;
> +}
> +

This function's behavior is very similar to the helps vreg_regx_extra
in vreg.h. If we will introduce more reg bit operation functions like
extract32. Can we consider to reuse them?

> +static int decode_64bit_loadstore_postindexing(register_t pc, struct
> hsr_dabt *dabt)
> +{
> +    uint32_t instr;
> +    int size;
> +    int v;
> +    int opc;
> +    int rt;
> +    int imm9;
> +
> +    /* For details on decoding, refer to Armv8 Architecture reference
> manual
> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
> +    */

I have seen two styles of multiple line comments in this patch.
I checked the original file and it has no special comment.
So I think you may need to follow the multiple line comments you
have done for arm/io.c in this patch. And the same for some comments below.

> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
> +        return -EFAULT;
> +
> +    /* First, let's check for the fixed values */
> +
> +    /*  As per the "Encoding table for the Loads and Stores group", Pg
> 299
> +     * op4 = 1 - Load/store register (immediate post-indexed)
> +     */
> +    if ( extract32(instr, 10, 2) != 1 )
> +        goto bad_64bit_loadstore;
> +
> +    /* For the following, refer to "Load/store register (immediate post-
> indexed)"
> +     * to get the fixed values at various bit positions.
> +     */
> +    if ( extract32(instr, 21, 1) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 24, 2) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 27, 3) != 7 )
> +        goto bad_64bit_loadstore;
> +

If the check is fixed, why we don't define a VALID_MASK to check it,
something like:
if ( instr & MASK_for_21_24_27 == VALID_FOR_21_24_27 )


> +    size = extract32(instr, 30, 2);
> +    v = extract32(instr, 26, 1);
> +    opc = extract32(instr, 22, 1);
> +
> +    /* At the moment, we support STR(immediate) - 32 bit variant and
> +     * LDR(immediate) - 32 bit variant only.
> +     */
> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
> +        goto bad_64bit_loadstore;
> +
> +    rt = extract32(instr, 0, 5);
> +    imm9 = extract32(instr, 12, 9);
> +
> +    if ( imm9 < 0 )
> +        update_dabt(dabt, rt, size, true);
> +    else
> +        update_dabt(dabt, rt, size, false);
> +
> +    dabt->valid = 1;
> +
> +
> +    return 0;
> +bad_64bit_loadstore:
> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
> +    return 1;
> +}
> +
>  static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>  {
>      uint16_t instr;
> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs
> *regs, struct hsr_dabt *dabt)
>      if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>          return decode_thumb(regs->pc, dabt);
> 
> +    if ( is_64bit_domain(current->domain) )
> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
> +
>      /* TODO: Handle ARM instruction */
>      gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
> 
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c..49e80358c0 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs
> *regs,
>          .gpa = gpa,
>          .dabt = dabt
>      };
> +    int rc;
> 
>      ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
> 
>      handler = find_mmio_handler(v->domain, info.gpa);
>      if ( !handler )
>      {
> -        int rc;
> -
>          rc = try_fwd_ioserv(regs, v, &info);
>          if ( rc == IO_HANDLED )
>              return handle_ioserv(regs, v);
> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs
> *regs,
> 
>      /* All the instructions used on emulated MMIO region should be valid
> */
>      if ( !dabt.valid )
> -        return IO_ABORT;
> +    {
> +        /*
> +         * Post indexing ldr/str instructions are not emulated by Xen. So,
> the
> +         * ISS is invalid. In such a scenario, we try to manually decode
> the
> +         * instruction from the program counter.
> +         */
> +        rc = decode_instruction(regs, &info.dabt);
> +        if ( rc )
> +            return IO_ABORT;
> +    }
> 
>      /*
>       * Erratum 766422: Thumb store translation fault to Hypervisor may
> --
> 2.17.1
> 

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Jan Beulich 2 years, 5 months ago
On 19.11.2021 17:52, Ayan Kumar Halder wrote:
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>      return 1;
>  }
>  
> +static inline int32_t extract32(uint32_t value, int start, int length)
> +{
> +    int32_t ret;
> +
> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +        return -EINVAL;
> +
> +    ret = (value >> start) & (~0U >> (32 - length));
> +
> +    return ret;
> +}

In addition to Julien's comment regarding the function parameters - why
is the return type int32_t and not uint32_t? Plus as per ./CODING_STYLE
it really shouldn't be a fixed width type anyway, but e.g. unsigned int.

Jan


Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Andre Przywara 2 years, 5 months ago
On Fri, 19 Nov 2021 16:52:02 +0000
Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:

Hi,

> At present, post indexing instructions are not emulated by Xen.
> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> result, data abort is triggered.
> 
> Added the logic to decode ldr/str post indexing instructions.
> With this, Xen can decode instructions like these:-
> ldr w2, [x1], #4
> Thus, domU can read ioreg with post indexing instructions.

Where do those instructions come from? A (C) compiler? (Some mail in
another thread from Stefano suggests so)
If yes, I would argue that is broken:
IIUC C compilers assume normal memory attributes for every pointer they
handle, so they are free to use unaligned accesses, load/store exclusives,
split accesses (two halfword reads) and what not when generating code.
The GIC needs to be mapped as device memory, which explicitly forbids
unaligned accesses and exclusives (as in: always traps), so you cannot let
compiler-generated code access the GIC (or most other MMIO devices, for
that matter).
I know, this somewhat works(TM) in practise, because a uint32_t assignment
is very likely to end up in an ldr/str, but please let me know which car
this code ends up in, so that can I avoid this brand ;-)

You can tell the compiler to avoid unaligned accesses with -mstrict-align
(and should definitely do so when you are running C code with the MMU
off), but that still leaves exclusives and split accesses at the
compiler's discretion. A variation on the topic of split access is merged
writes, where the compiler uses NEON or SVE instructions, for instance, to
cover multiple words at once, possibly via some memset()/memcpy() routine.

On top there is this architectural restriction of the ARMv7/v8
virtualisation extension to not decode many "advanced" load/store
instructions in ESR_EL2.
Linux deliberately coded readl/writel using inline assembly, to only use
instructions that provide syndrome information, plus guarantee
device-memory compatible semantics.
Check out https://lwn.net/Articles/698014/ for a comprehensive discussion
of this whole MMIO topic.

So I think you should do the same in your guest/bare metal code: define
{read,write}{b,h,l,q} as inline assembly functions, using ldr?/str? only.
See xen/include/asm-arm/arm64/io.h for an example that uses static inline
functions in a header file, to generate most optimal code. Then always do
MMIO only via those accessors. That prevents any future compiler
surprises, plus makes you perfectly virtualisable.

Cheers,
Andre.

> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
> ---
> Note to reviewer:-
> This patch is based on an issue discussed in 
> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> 
> 
>  xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>  xen/arch/arm/io.c     | 14 ++++++--
>  2 files changed, 88 insertions(+), 3 deletions(-)
> 
> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> index 792c2e92a7..7b60bedbc5 100644
> --- a/xen/arch/arm/decode.c
> +++ b/xen/arch/arm/decode.c
> @@ -84,6 +84,80 @@ bad_thumb2:
>      return 1;
>  }
>  
> +static inline int32_t extract32(uint32_t value, int start, int length)
> +{
> +    int32_t ret;
> +
> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> +        return -EINVAL;
> +
> +    ret = (value >> start) & (~0U >> (32 - length));
> +
> +    return ret;
> +}
> +
> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
> +{
> +    uint32_t instr;
> +    int size;
> +    int v;
> +    int opc;
> +    int rt;
> +    int imm9;
> +
> +    /* For details on decoding, refer to Armv8 Architecture reference manual
> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
> +    */
> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
> +        return -EFAULT;
> +
> +    /* First, let's check for the fixed values */
> +
> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
> +     * op4 = 1 - Load/store register (immediate post-indexed)
> +     */
> +    if ( extract32(instr, 10, 2) != 1 )
> +        goto bad_64bit_loadstore;
> +
> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
> +     * to get the fixed values at various bit positions.
> +     */
> +    if ( extract32(instr, 21, 1) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 24, 2) != 0 )
> +        goto bad_64bit_loadstore;
> +
> +    if ( extract32(instr, 27, 3) != 7 )
> +        goto bad_64bit_loadstore;
> +
> +    size = extract32(instr, 30, 2);
> +    v = extract32(instr, 26, 1);
> +    opc = extract32(instr, 22, 1);
> +
> +    /* At the moment, we support STR(immediate) - 32 bit variant and
> +     * LDR(immediate) - 32 bit variant only.
> +     */
> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
> +        goto bad_64bit_loadstore;
> +
> +    rt = extract32(instr, 0, 5);
> +    imm9 = extract32(instr, 12, 9);
> +
> +    if ( imm9 < 0 )
> +        update_dabt(dabt, rt, size, true);
> +    else
> +        update_dabt(dabt, rt, size, false);
> +
> +    dabt->valid = 1;
> +
> +
> +    return 0;
> +bad_64bit_loadstore:
> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
> +    return 1;
> +}
> +
>  static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>  {
>      uint16_t instr;
> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
>      if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>          return decode_thumb(regs->pc, dabt);
>  
> +    if ( is_64bit_domain(current->domain) )
> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
> +
>      /* TODO: Handle ARM instruction */
>      gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
>  
> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> index 729287e37c..49e80358c0 100644
> --- a/xen/arch/arm/io.c
> +++ b/xen/arch/arm/io.c
> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>          .gpa = gpa,
>          .dabt = dabt
>      };
> +    int rc;
>  
>      ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>  
>      handler = find_mmio_handler(v->domain, info.gpa);
>      if ( !handler )
>      {
> -        int rc;
> -
>          rc = try_fwd_ioserv(regs, v, &info);
>          if ( rc == IO_HANDLED )
>              return handle_ioserv(regs, v);
> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>  
>      /* All the instructions used on emulated MMIO region should be valid */
>      if ( !dabt.valid )
> -        return IO_ABORT;
> +    {
> +        /*
> +         * Post indexing ldr/str instructions are not emulated by Xen. So, the
> +         * ISS is invalid. In such a scenario, we try to manually decode the
> +         * instruction from the program counter.
> +         */
> +        rc = decode_instruction(regs, &info.dabt);
> +        if ( rc )
> +            return IO_ABORT;
> +    }
>  
>      /*
>       * Erratum 766422: Thumb store translation fault to Hypervisor may


Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Ayan Kumar Halder 2 years, 5 months ago
Hi Andre,

Many thanks for your inputs.
Apologies if I sound dumb, but I need a few clarifications.

On 26/11/2021 13:14, Andre Przywara wrote:
> On Fri, 19 Nov 2021 16:52:02 +0000
> Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:
> 
> Hi,
> 
>> At present, post indexing instructions are not emulated by Xen.
>> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
>> result, data abort is triggered.
>>
>> Added the logic to decode ldr/str post indexing instructions.
>> With this, Xen can decode instructions like these:-
>> ldr w2, [x1], #4
>> Thus, domU can read ioreg with post indexing instructions.
> 
> Where do those instructions come from? A (C) compiler? (Some mail in
> another thread from Stefano suggests so)
> If yes, I would argue that is broken:
> IIUC C compilers assume normal memory attributes for every pointer they
> handle, so they are free to use unaligned accesses, load/store exclusives,
> split accesses (two halfword reads) and what not when generating code.
> The GIC needs to be mapped as device memory, which explicitly forbids
> unaligned accesses and exclusives (as in: always traps), so you cannot let
> compiler-generated code access the GIC (or most other MMIO devices, for
> that matter).
> I know, this somewhat works(TM) in practise, because a uint32_t assignment
> is very likely to end up in an ldr/str, but please let me know which car
> this code ends up in, so that can I avoid this brand ;-)
> 
> You can tell the compiler to avoid unaligned accesses with -mstrict-align
> (and should definitely do so when you are running C code with the MMU
> off), but that still leaves exclusives and split accesses at the
> compiler's discretion. A variation on the topic of split access is merged
> writes, where the compiler uses NEON or SVE instructions, for instance, to
> cover multiple words at once, possibly via some memset()/memcpy() routine.

I understand that we should be using inline assembly instructions to 
access any MMIO region. This is to prevent the compiler doing any tricks.

But is there a restriction that post indexing instructions can never be 
used to access MMIO region ?

> 
> On top there is this architectural restriction of the ARMv7/v8
> virtualisation extension to not decode many "advanced" load/store
> instructions in ESR_EL2.
Where do I find this restriction ?

Are you telling me that load/store with post indexing is an "advanced" 
instruction and ArmV8 does not allow decoding of these instructions in 
ESR_EL2 ? Isn't that a very strong limitation ?

Also what is your opinion on Xen decoding these instructions ?

- Ayan

> Linux deliberately coded readl/writel using inline assembly, to only use
> instructions that provide syndrome information, plus guarantee
> device-memory compatible semantics.
> Check out https://lwn.net/Articles/698014/ for a comprehensive discussion
> of this whole MMIO topic.
> 
> So I think you should do the same in your guest/bare metal code: define
> {read,write}{b,h,l,q} as inline assembly functions, using ldr?/str? only.
> See xen/include/asm-arm/arm64/io.h for an example that uses static inline
> functions in a header file, to generate most optimal code. Then always do
> MMIO only via those accessors. That prevents any future compiler
> surprises, plus makes you perfectly virtualisable.
> 
> Cheers,
> Andre.
> 
>> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
>> ---
>> Note to reviewer:-
>> This patch is based on an issue discussed in
>> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
>> "Xen/ARM - Query about a data abort seen while reading GICD registers"
>>
>>
>>   xen/arch/arm/decode.c | 77 +++++++++++++++++++++++++++++++++++++++++++
>>   xen/arch/arm/io.c     | 14 ++++++--
>>   2 files changed, 88 insertions(+), 3 deletions(-)
>>
>> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
>> index 792c2e92a7..7b60bedbc5 100644
>> --- a/xen/arch/arm/decode.c
>> +++ b/xen/arch/arm/decode.c
>> @@ -84,6 +84,80 @@ bad_thumb2:
>>       return 1;
>>   }
>>   
>> +static inline int32_t extract32(uint32_t value, int start, int length)
>> +{
>> +    int32_t ret;
>> +
>> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
>> +        return -EINVAL;
>> +
>> +    ret = (value >> start) & (~0U >> (32 - length));
>> +
>> +    return ret;
>> +}
>> +
>> +static int decode_64bit_loadstore_postindexing(register_t pc, struct hsr_dabt *dabt)
>> +{
>> +    uint32_t instr;
>> +    int size;
>> +    int v;
>> +    int opc;
>> +    int rt;
>> +    int imm9;
>> +
>> +    /* For details on decoding, refer to Armv8 Architecture reference manual
>> +     * Section - "Load/store register (immediate post-indexed)", Pg 318
>> +    */
>> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof (instr)) )
>> +        return -EFAULT;
>> +
>> +    /* First, let's check for the fixed values */
>> +
>> +    /*  As per the "Encoding table for the Loads and Stores group", Pg 299
>> +     * op4 = 1 - Load/store register (immediate post-indexed)
>> +     */
>> +    if ( extract32(instr, 10, 2) != 1 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    /* For the following, refer to "Load/store register (immediate post-indexed)"
>> +     * to get the fixed values at various bit positions.
>> +     */
>> +    if ( extract32(instr, 21, 1) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 24, 2) != 0 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    if ( extract32(instr, 27, 3) != 7 )
>> +        goto bad_64bit_loadstore;
>> +
>> +    size = extract32(instr, 30, 2);
>> +    v = extract32(instr, 26, 1);
>> +    opc = extract32(instr, 22, 1);
>> +
>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>> +     * LDR(immediate) - 32 bit variant only.
>> +     */
>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>> +        goto bad_64bit_loadstore;
>> +
>> +    rt = extract32(instr, 0, 5);
>> +    imm9 = extract32(instr, 12, 9);
>> +
>> +    if ( imm9 < 0 )
>> +        update_dabt(dabt, rt, size, true);
>> +    else
>> +        update_dabt(dabt, rt, size, false);
>> +
>> +    dabt->valid = 1;
>> +
>> +
>> +    return 0;
>> +bad_64bit_loadstore:
>> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
>> +    return 1;
>> +}
>> +
>>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>>   {
>>       uint16_t instr;
>> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs *regs, struct hsr_dabt *dabt)
>>       if ( is_32bit_domain(current->domain) && regs->cpsr & PSR_THUMB )
>>           return decode_thumb(regs->pc, dabt);
>>   
>> +    if ( is_64bit_domain(current->domain) )
>> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
>> +
>>       /* TODO: Handle ARM instruction */
>>       gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
>>   
>> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
>> index 729287e37c..49e80358c0 100644
>> --- a/xen/arch/arm/io.c
>> +++ b/xen/arch/arm/io.c
>> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>>           .gpa = gpa,
>>           .dabt = dabt
>>       };
>> +    int rc;
>>   
>>       ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>>   
>>       handler = find_mmio_handler(v->domain, info.gpa);
>>       if ( !handler )
>>       {
>> -        int rc;
>> -
>>           rc = try_fwd_ioserv(regs, v, &info);
>>           if ( rc == IO_HANDLED )
>>               return handle_ioserv(regs, v);
>> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,
>>   
>>       /* All the instructions used on emulated MMIO region should be valid */
>>       if ( !dabt.valid )
>> -        return IO_ABORT;
>> +    {
>> +        /*
>> +         * Post indexing ldr/str instructions are not emulated by Xen. So, the
>> +         * ISS is invalid. In such a scenario, we try to manually decode the
>> +         * instruction from the program counter.
>> +         */
>> +        rc = decode_instruction(regs, &info.dabt);
>> +        if ( rc )
>> +            return IO_ABORT;
>> +    }
>>   
>>       /*
>>        * Erratum 766422: Thumb store translation fault to Hypervisor may
> 

Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Andre Przywara 2 years, 5 months ago
On Fri, 26 Nov 2021 15:28:06 +0000
Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:

Hi Ayan,

> Many thanks for your inputs.
> Apologies if I sound dumb, but I need a few clarifications.
> 
> On 26/11/2021 13:14, Andre Przywara wrote:
> > On Fri, 19 Nov 2021 16:52:02 +0000
> > Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:
> > 
> > Hi,
> >   
> >> At present, post indexing instructions are not emulated by Xen.
> >> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
> >> result, data abort is triggered.
> >>
> >> Added the logic to decode ldr/str post indexing instructions.
> >> With this, Xen can decode instructions like these:-
> >> ldr w2, [x1], #4
> >> Thus, domU can read ioreg with post indexing instructions.  
> > 
> > Where do those instructions come from? A (C) compiler? (Some mail in
> > another thread from Stefano suggests so)
> > If yes, I would argue that is broken:
> > IIUC C compilers assume normal memory attributes for every pointer they
> > handle, so they are free to use unaligned accesses, load/store exclusives,
> > split accesses (two halfword reads) and what not when generating code.
> > The GIC needs to be mapped as device memory, which explicitly forbids
> > unaligned accesses and exclusives (as in: always traps), so you cannot let
> > compiler-generated code access the GIC (or most other MMIO devices, for
> > that matter).
> > I know, this somewhat works(TM) in practise, because a uint32_t assignment
> > is very likely to end up in an ldr/str, but please let me know which car
> > this code ends up in, so that can I avoid this brand ;-)
> > 
> > You can tell the compiler to avoid unaligned accesses with -mstrict-align
> > (and should definitely do so when you are running C code with the MMU
> > off), but that still leaves exclusives and split accesses at the
> > compiler's discretion. A variation on the topic of split access is merged
> > writes, where the compiler uses NEON or SVE instructions, for instance, to
> > cover multiple words at once, possibly via some memset()/memcpy() routine.  
> 
> I understand that we should be using inline assembly instructions to 
> access any MMIO region. This is to prevent the compiler doing any tricks.
> 
> But is there a restriction that post indexing instructions can never be 
> used to access MMIO region ?

No, this is a pure virtualisation restriction, see below. On real
hardware/bare-metal, ldr/str with post or pre-indexing works and is fine
to use for MMIO.
But we need to have the right access width, matching the MMIO device's
expectation. So ldp/stp would probably be problematic, for instance.

> > On top there is this architectural restriction of the ARMv7/v8
> > virtualisation extension to not decode many "advanced" load/store
> > instructions in ESR_EL2.  
> Where do I find this restriction ?

That's described in the ESR_ELx syndrome description in the ARMv8 ARM (DDI
0487G.b), section "ISS encoding for an exception from a Data Abort" (page
D13-3219 in my Issue G.b copy):
"For other faults reported in ESR_EL2, ISV is 0 except for the following stage 2 aborts: ...."

> Are you telling me that load/store with post indexing is an "advanced" 
> instruction and ArmV8 does not allow decoding of these instructions in 
> ESR_EL2 ?

Yes, it is in the group of instructions for which the hardware does not
provide syndrome information in ESR_EL2: " .... but excluding Load
Exclusive or Store Exclusive and excluding those with writeback)."

> Isn't that a very strong limitation ?

I don't know about that, it's what it is and what it was for years. Linux
deliberately chose ldr/str only for readl/writel to be able to trap and
handle MMIO aborts in hypervisors.

If you do MMIO accesses the right way, using (inline) assembly only, then
you don't have the problem, and also avoid many others, see my previous
mail.

If you think of it from an architectural and implementation point of view
(and keep the RISC idea in mind): it should happen rarely, but would
require many gates for something that you can do in software as well.

> Also what is your opinion on Xen decoding these instructions ?

I would be careful, we deliberately avoid this in KVM. This bubbles up
from time to time, though, so we now allow delegating this case to
userland, so the VMM can do the decoding there.
In Xen you have less issues with walking the guest's page tables,
though (a major problem in KVM), but it still adds complexity to a
hypervisor which aims to be lean by design.
Another argument would be that just post/pre does not cover everything, and
the cases start to pile up quickly: what about the immediate versions,
ldxr, stp, NEON/SVE load/stores, etc. Since many of those are not safe for
MMIO anyway, you add a lot of code for little use (and which gets little
testing!).

Cheers,
Andre

> > Linux deliberately coded readl/writel using inline assembly, to only
> > use instructions that provide syndrome information, plus guarantee
> > device-memory compatible semantics.
> > Check out https://lwn.net/Articles/698014/ for a comprehensive
> > discussion of this whole MMIO topic.
> > 
> > So I think you should do the same in your guest/bare metal code: define
> > {read,write}{b,h,l,q} as inline assembly functions, using ldr?/str?
> > only. See xen/include/asm-arm/arm64/io.h for an example that uses
> > static inline functions in a header file, to generate most optimal
> > code. Then always do MMIO only via those accessors. That prevents any
> > future compiler surprises, plus makes you perfectly virtualisable.
> > 
> > Cheers,
> > Andre.
> >   
> >> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
> >> ---
> >> Note to reviewer:-
> >> This patch is based on an issue discussed in
> >> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
> >> "Xen/ARM - Query about a data abort seen while reading GICD registers"
> >>
> >>
> >>   xen/arch/arm/decode.c | 77
> >> +++++++++++++++++++++++++++++++++++++++++++ xen/arch/arm/io.c     |
> >> 14 ++++++-- 2 files changed, 88 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
> >> index 792c2e92a7..7b60bedbc5 100644
> >> --- a/xen/arch/arm/decode.c
> >> +++ b/xen/arch/arm/decode.c
> >> @@ -84,6 +84,80 @@ bad_thumb2:
> >>       return 1;
> >>   }
> >>   
> >> +static inline int32_t extract32(uint32_t value, int start, int
> >> length) +{
> >> +    int32_t ret;
> >> +
> >> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
> >> +        return -EINVAL;
> >> +
> >> +    ret = (value >> start) & (~0U >> (32 - length));
> >> +
> >> +    return ret;
> >> +}
> >> +
> >> +static int decode_64bit_loadstore_postindexing(register_t pc, struct
> >> hsr_dabt *dabt) +{
> >> +    uint32_t instr;
> >> +    int size;
> >> +    int v;
> >> +    int opc;
> >> +    int rt;
> >> +    int imm9;
> >> +
> >> +    /* For details on decoding, refer to Armv8 Architecture
> >> reference manual
> >> +     * Section - "Load/store register (immediate post-indexed)", Pg
> >> 318
> >> +    */
> >> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof
> >> (instr)) )
> >> +        return -EFAULT;
> >> +
> >> +    /* First, let's check for the fixed values */
> >> +
> >> +    /*  As per the "Encoding table for the Loads and Stores group",
> >> Pg 299
> >> +     * op4 = 1 - Load/store register (immediate post-indexed)
> >> +     */
> >> +    if ( extract32(instr, 10, 2) != 1 )
> >> +        goto bad_64bit_loadstore;
> >> +
> >> +    /* For the following, refer to "Load/store register (immediate
> >> post-indexed)"
> >> +     * to get the fixed values at various bit positions.
> >> +     */
> >> +    if ( extract32(instr, 21, 1) != 0 )
> >> +        goto bad_64bit_loadstore;
> >> +
> >> +    if ( extract32(instr, 24, 2) != 0 )
> >> +        goto bad_64bit_loadstore;
> >> +
> >> +    if ( extract32(instr, 27, 3) != 7 )
> >> +        goto bad_64bit_loadstore;
> >> +
> >> +    size = extract32(instr, 30, 2);
> >> +    v = extract32(instr, 26, 1);
> >> +    opc = extract32(instr, 22, 1);
> >> +
> >> +    /* At the moment, we support STR(immediate) - 32 bit variant and
> >> +     * LDR(immediate) - 32 bit variant only.
> >> +     */
> >> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
> >> +        goto bad_64bit_loadstore;
> >> +
> >> +    rt = extract32(instr, 0, 5);
> >> +    imm9 = extract32(instr, 12, 9);
> >> +
> >> +    if ( imm9 < 0 )
> >> +        update_dabt(dabt, rt, size, true);
> >> +    else
> >> +        update_dabt(dabt, rt, size, false);
> >> +
> >> +    dabt->valid = 1;
> >> +
> >> +
> >> +    return 0;
> >> +bad_64bit_loadstore:
> >> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
> >> +    return 1;
> >> +}
> >> +
> >>   static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
> >>   {
> >>       uint16_t instr;
> >> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs
> >> *regs, struct hsr_dabt *dabt) if ( is_32bit_domain(current->domain)
> >> && regs->cpsr & PSR_THUMB ) return decode_thumb(regs->pc, dabt);
> >>   
> >> +    if ( is_64bit_domain(current->domain) )
> >> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
> >> +
> >>       /* TODO: Handle ARM instruction */
> >>       gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
> >>   
> >> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
> >> index 729287e37c..49e80358c0 100644
> >> --- a/xen/arch/arm/io.c
> >> +++ b/xen/arch/arm/io.c
> >> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct
> >> cpu_user_regs *regs, .gpa = gpa,
> >>           .dabt = dabt
> >>       };
> >> +    int rc;
> >>   
> >>       ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
> >>   
> >>       handler = find_mmio_handler(v->domain, info.gpa);
> >>       if ( !handler )
> >>       {
> >> -        int rc;
> >> -
> >>           rc = try_fwd_ioserv(regs, v, &info);
> >>           if ( rc == IO_HANDLED )
> >>               return handle_ioserv(regs, v);
> >> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct
> >> cpu_user_regs *regs, 
> >>       /* All the instructions used on emulated MMIO region should be
> >> valid */ if ( !dabt.valid )
> >> -        return IO_ABORT;
> >> +    {
> >> +        /*
> >> +         * Post indexing ldr/str instructions are not emulated by
> >> Xen. So, the
> >> +         * ISS is invalid. In such a scenario, we try to manually
> >> decode the
> >> +         * instruction from the program counter.
> >> +         */
> >> +        rc = decode_instruction(regs, &info.dabt);
> >> +        if ( rc )
> >> +            return IO_ABORT;
> >> +    }
> >>   
> >>       /*
> >>        * Erratum 766422: Thumb store translation fault to Hypervisor
> >> may  
> >   


Re: [RFC PATCH] Added the logic to decode 32 bit ldr/str post-indexing instructions
Posted by Ayan Kumar Halder 2 years, 4 months ago
Hi All,

Thanks for giving your inputs.

On 26/11/2021 17:39, Andre Przywara wrote:
> On Fri, 26 Nov 2021 15:28:06 +0000
> Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:
> 
> Hi Ayan,
> 
>> Many thanks for your inputs.
>> Apologies if I sound dumb, but I need a few clarifications.
>>
>> On 26/11/2021 13:14, Andre Przywara wrote:
>>> On Fri, 19 Nov 2021 16:52:02 +0000
>>> Ayan Kumar Halder <ayan.kumar.halder@xilinx.com> wrote:
>>>
>>> Hi,
>>>    
>>>> At present, post indexing instructions are not emulated by Xen.
>>>> When Xen gets the exception, EL2_ESR.ISV bit not set. Thus as a
>>>> result, data abort is triggered.
>>>>
>>>> Added the logic to decode ldr/str post indexing instructions.
>>>> With this, Xen can decode instructions like these:-
>>>> ldr w2, [x1], #4
>>>> Thus, domU can read ioreg with post indexing instructions.
>>>
>>> Where do those instructions come from? A (C) compiler? (Some mail in
>>> another thread from Stefano suggests so)
>>> If yes, I would argue that is broken:
>>> IIUC C compilers assume normal memory attributes for every pointer they
>>> handle, so they are free to use unaligned accesses, load/store exclusives,
>>> split accesses (two halfword reads) and what not when generating code.
>>> The GIC needs to be mapped as device memory, which explicitly forbids
>>> unaligned accesses and exclusives (as in: always traps), so you cannot let
>>> compiler-generated code access the GIC (or most other MMIO devices, for
>>> that matter).
>>> I know, this somewhat works(TM) in practise, because a uint32_t assignment
>>> is very likely to end up in an ldr/str, but please let me know which car
>>> this code ends up in, so that can I avoid this brand ;-)
>>>
>>> You can tell the compiler to avoid unaligned accesses with -mstrict-align
>>> (and should definitely do so when you are running C code with the MMU
>>> off), but that still leaves exclusives and split accesses at the
>>> compiler's discretion. A variation on the topic of split access is merged
>>> writes, where the compiler uses NEON or SVE instructions, for instance, to
>>> cover multiple words at once, possibly via some memset()/memcpy() routine.
>>
>> I understand that we should be using inline assembly instructions to
>> access any MMIO region. This is to prevent the compiler doing any tricks.
>>
>> But is there a restriction that post indexing instructions can never be
>> used to access MMIO region ?
> 
> No, this is a pure virtualisation restriction, see below. On real
> hardware/bare-metal, ldr/str with post or pre-indexing works and is fine
> to use for MMIO.
> But we need to have the right access width, matching the MMIO device's
> expectation. So ldp/stp would probably be problematic, for instance.
> 
>>> On top there is this architectural restriction of the ARMv7/v8
>>> virtualisation extension to not decode many "advanced" load/store
>>> instructions in ESR_EL2.
>> Where do I find this restriction ?
> 
> That's described in the ESR_ELx syndrome description in the ARMv8 ARM (DDI
> 0487G.b), section "ISS encoding for an exception from a Data Abort" (page
> D13-3219 in my Issue G.b copy):
> "For other faults reported in ESR_EL2, ISV is 0 except for the following stage 2 aborts: ...."
> 
>> Are you telling me that load/store with post indexing is an "advanced"
>> instruction and ArmV8 does not allow decoding of these instructions in
>> ESR_EL2 ?
> 
> Yes, it is in the group of instructions for which the hardware does not
> provide syndrome information in ESR_EL2: " .... but excluding Load
> Exclusive or Store Exclusive and excluding those with writeback)."
> 
>> Isn't that a very strong limitation ?
> 
> I don't know about that, it's what it is and what it was for years. Linux
> deliberately chose ldr/str only for readl/writel to be able to trap and
> handle MMIO aborts in hypervisors.
> 
> If you do MMIO accesses the right way, using (inline) assembly only, then
> you don't have the problem, and also avoid many others, see my previous
> mail.
> 
> If you think of it from an architectural and implementation point of view
> (and keep the RISC idea in mind): it should happen rarely, but would
> require many gates for something that you can do in software as well.
> 
>> Also what is your opinion on Xen decoding these instructions ?
> 
> I would be careful, we deliberately avoid this in KVM. This bubbles up
> from time to time, though, so we now allow delegating this case to
> userland, so the VMM can do the decoding there.
> In Xen you have less issues with walking the guest's page tables,
> though (a major problem in KVM), but it still adds complexity to a
> hypervisor which aims to be lean by design.
> Another argument would be that just post/pre does not cover everything, and
> the cases start to pile up quickly: what about the immediate versions,
> ldxr, stp, NEON/SVE load/stores, etc. Since many of those are not safe for
> MMIO anyway, you add a lot of code for little use (and which gets little
> testing!).

Many thanks for your explanation. It makes some sense. Unfortunately, I 
don't have much experience with hypervisors. So I will rely on you and 
other experts opinions on this. :)

I have sent out a v2 patch "[XEN v2] xen/arm64: io: Decode 32-bit 
ldr/str post-indexing instructions". Please have a look.

Stefano/Julien/Bertrand/Jan :- Please have a look at my v2 patch and let 
me know if it makes sense to add the decoding logic to Xen or if it 
makes the codebase very complex.

- Ayan

> 
> Cheers,
> Andre
> 
>>> Linux deliberately coded readl/writel using inline assembly, to only
>>> use instructions that provide syndrome information, plus guarantee
>>> device-memory compatible semantics.
>>> Check out https://lwn.net/Articles/698014/ for a comprehensive
>>> discussion of this whole MMIO topic.
>>>
>>> So I think you should do the same in your guest/bare metal code: define
>>> {read,write}{b,h,l,q} as inline assembly functions, using ldr?/str?
>>> only. See xen/include/asm-arm/arm64/io.h for an example that uses
>>> static inline functions in a header file, to generate most optimal
>>> code. Then always do MMIO only via those accessors. That prevents any
>>> future compiler surprises, plus makes you perfectly virtualisable.
>>>
>>> Cheers,
>>> Andre.
>>>    
>>>> Signed-off-by: Ayan Kumar Halder <ayankuma@xilinx.com>
>>>> ---
>>>> Note to reviewer:-
>>>> This patch is based on an issue discussed in
>>>> https://lists.xenproject.org/archives/html/xen-devel/2021-11/msg00969.html
>>>> "Xen/ARM - Query about a data abort seen while reading GICD registers"
>>>>
>>>>
>>>>    xen/arch/arm/decode.c | 77
>>>> +++++++++++++++++++++++++++++++++++++++++++ xen/arch/arm/io.c     |
>>>> 14 ++++++-- 2 files changed, 88 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/xen/arch/arm/decode.c b/xen/arch/arm/decode.c
>>>> index 792c2e92a7..7b60bedbc5 100644
>>>> --- a/xen/arch/arm/decode.c
>>>> +++ b/xen/arch/arm/decode.c
>>>> @@ -84,6 +84,80 @@ bad_thumb2:
>>>>        return 1;
>>>>    }
>>>>    
>>>> +static inline int32_t extract32(uint32_t value, int start, int
>>>> length) +{
>>>> +    int32_t ret;
>>>> +
>>>> +    if ( !(start >= 0 && length > 0 && length <= 32 - start) )
>>>> +        return -EINVAL;
>>>> +
>>>> +    ret = (value >> start) & (~0U >> (32 - length));
>>>> +
>>>> +    return ret;
>>>> +}
>>>> +
>>>> +static int decode_64bit_loadstore_postindexing(register_t pc, struct
>>>> hsr_dabt *dabt) +{
>>>> +    uint32_t instr;
>>>> +    int size;
>>>> +    int v;
>>>> +    int opc;
>>>> +    int rt;
>>>> +    int imm9;
>>>> +
>>>> +    /* For details on decoding, refer to Armv8 Architecture
>>>> reference manual
>>>> +     * Section - "Load/store register (immediate post-indexed)", Pg
>>>> 318
>>>> +    */
>>>> +    if ( raw_copy_from_guest(&instr, (void * __user)pc, sizeof
>>>> (instr)) )
>>>> +        return -EFAULT;
>>>> +
>>>> +    /* First, let's check for the fixed values */
>>>> +
>>>> +    /*  As per the "Encoding table for the Loads and Stores group",
>>>> Pg 299
>>>> +     * op4 = 1 - Load/store register (immediate post-indexed)
>>>> +     */
>>>> +    if ( extract32(instr, 10, 2) != 1 )
>>>> +        goto bad_64bit_loadstore;
>>>> +
>>>> +    /* For the following, refer to "Load/store register (immediate
>>>> post-indexed)"
>>>> +     * to get the fixed values at various bit positions.
>>>> +     */
>>>> +    if ( extract32(instr, 21, 1) != 0 )
>>>> +        goto bad_64bit_loadstore;
>>>> +
>>>> +    if ( extract32(instr, 24, 2) != 0 )
>>>> +        goto bad_64bit_loadstore;
>>>> +
>>>> +    if ( extract32(instr, 27, 3) != 7 )
>>>> +        goto bad_64bit_loadstore;
>>>> +
>>>> +    size = extract32(instr, 30, 2);
>>>> +    v = extract32(instr, 26, 1);
>>>> +    opc = extract32(instr, 22, 1);
>>>> +
>>>> +    /* At the moment, we support STR(immediate) - 32 bit variant and
>>>> +     * LDR(immediate) - 32 bit variant only.
>>>> +     */
>>>> +    if (!((size==2) && (v==0) && ((opc==0) || (opc==1))))
>>>> +        goto bad_64bit_loadstore;
>>>> +
>>>> +    rt = extract32(instr, 0, 5);
>>>> +    imm9 = extract32(instr, 12, 9);
>>>> +
>>>> +    if ( imm9 < 0 )
>>>> +        update_dabt(dabt, rt, size, true);
>>>> +    else
>>>> +        update_dabt(dabt, rt, size, false);
>>>> +
>>>> +    dabt->valid = 1;
>>>> +
>>>> +
>>>> +    return 0;
>>>> +bad_64bit_loadstore:
>>>> +    gprintk(XENLOG_ERR, "unhandled 64bit instruction 0x%x\n", instr);
>>>> +    return 1;
>>>> +}
>>>> +
>>>>    static int decode_thumb(register_t pc, struct hsr_dabt *dabt)
>>>>    {
>>>>        uint16_t instr;
>>>> @@ -155,6 +229,9 @@ int decode_instruction(const struct cpu_user_regs
>>>> *regs, struct hsr_dabt *dabt) if ( is_32bit_domain(current->domain)
>>>> && regs->cpsr & PSR_THUMB ) return decode_thumb(regs->pc, dabt);
>>>>    
>>>> +    if ( is_64bit_domain(current->domain) )
>>>> +        return decode_64bit_loadstore_postindexing(regs->pc, dabt);
>>>> +
>>>>        /* TODO: Handle ARM instruction */
>>>>        gprintk(XENLOG_ERR, "unhandled ARM instruction\n");
>>>>    
>>>> diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
>>>> index 729287e37c..49e80358c0 100644
>>>> --- a/xen/arch/arm/io.c
>>>> +++ b/xen/arch/arm/io.c
>>>> @@ -106,14 +106,13 @@ enum io_state try_handle_mmio(struct
>>>> cpu_user_regs *regs, .gpa = gpa,
>>>>            .dabt = dabt
>>>>        };
>>>> +    int rc;
>>>>    
>>>>        ASSERT(hsr.ec == HSR_EC_DATA_ABORT_LOWER_EL);
>>>>    
>>>>        handler = find_mmio_handler(v->domain, info.gpa);
>>>>        if ( !handler )
>>>>        {
>>>> -        int rc;
>>>> -
>>>>            rc = try_fwd_ioserv(regs, v, &info);
>>>>            if ( rc == IO_HANDLED )
>>>>                return handle_ioserv(regs, v);
>>>> @@ -123,7 +122,16 @@ enum io_state try_handle_mmio(struct
>>>> cpu_user_regs *regs,
>>>>        /* All the instructions used on emulated MMIO region should be
>>>> valid */ if ( !dabt.valid )
>>>> -        return IO_ABORT;
>>>> +    {
>>>> +        /*
>>>> +         * Post indexing ldr/str instructions are not emulated by
>>>> Xen. So, the
>>>> +         * ISS is invalid. In such a scenario, we try to manually
>>>> decode the
>>>> +         * instruction from the program counter.
>>>> +         */
>>>> +        rc = decode_instruction(regs, &info.dabt);
>>>> +        if ( rc )
>>>> +            return IO_ABORT;
>>>> +    }
>>>>    
>>>>        /*
>>>>         * Erratum 766422: Thumb store translation fault to Hypervisor
>>>> may
>>>    
>