[Qemu-devel] [PATCH v1 06/33] s390x/tcg: Implement VECTOR GENERATE BYTE MASK

David Hildenbrand posted 33 patches 6 years, 8 months ago
There is a newer version of this series
[Qemu-devel] [PATCH v1 06/33] s390x/tcg: Implement VECTOR GENERATE BYTE MASK
Posted by David Hildenbrand 6 years, 8 months ago
As we are working on byte elements, we can use i32 for element access.
Add write_vec_element_i32().

Signed-off-by: David Hildenbrand <david@redhat.com>
---
 target/s390x/insn-data.def      |  2 ++
 target/s390x/translate_vx.inc.c | 39 +++++++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)

diff --git a/target/s390x/insn-data.def b/target/s390x/insn-data.def
index 2b06cc9130..1bdfcf8130 100644
--- a/target/s390x/insn-data.def
+++ b/target/s390x/insn-data.def
@@ -977,6 +977,8 @@
 /* VECTOR GATHER ELEMENT */
     E(0xe713, VGEF,    VRV,   V,   la2, 0, 0, 0, vge, 0, MO_32, IF_VEC)
     E(0xe712, VGEG,    VRV,   V,   la2, 0, 0, 0, vge, 0, MO_64, IF_VEC)
+/* VECTOR GENERATE BYTE MASK */
+    F(0xe744, VGBM,    VRI_a, V,   0, 0, 0, 0, vgbm, 0, IF_VEC)
 
 #ifndef CONFIG_USER_ONLY
 /* COMPARE AND SWAP AND PURGE */
diff --git a/target/s390x/translate_vx.inc.c b/target/s390x/translate_vx.inc.c
index 56f403e40d..7775401dd3 100644
--- a/target/s390x/translate_vx.inc.c
+++ b/target/s390x/translate_vx.inc.c
@@ -105,6 +105,26 @@ static void write_vec_element_i64(TCGv_i64 src, int reg, uint8_t enr,
     }
 }
 
+static void write_vec_element_i32(TCGv_i32 src, int reg, uint8_t enr,
+                                  TCGMemOp memop)
+{
+    const int offs = vec_reg_offset(reg, enr, memop & MO_SIZE);
+
+    switch (memop) {
+    case MO_8:
+        tcg_gen_st8_i32(src, cpu_env, offs);
+        break;
+    case MO_16:
+        tcg_gen_st16_i32(src, cpu_env, offs);
+        break;
+    case MO_32:
+        tcg_gen_st_i32(src, cpu_env, offs);
+        break;
+    default:
+        g_assert_not_reached();
+    }
+}
+
 static void load_vec_element(DisasContext *s, uint8_t reg, uint8_t enr,
                              TCGv_i64 addr, uint8_t es)
 {
@@ -136,3 +156,22 @@ static DisasJumpType op_vge(DisasContext *s, DisasOps *o)
     tcg_temp_free_i64(tmp);
     return DISAS_NEXT;
 }
+
+static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
+{
+    const uint16_t i2 = get_field(s->fields, i2);
+    TCGv_i32 ones = tcg_const_i32(-1u);
+    TCGv_i32 zeroes = tcg_const_i32(0);
+    int i;
+
+    for (i = 0; i < 16; i++) {
+        if (extract32(i2, 15 - i, 1)) {
+            write_vec_element_i32(ones, get_field(s->fields, v1), i, MO_8);
+        } else {
+            write_vec_element_i32(zeroes, get_field(s->fields, v1), i, MO_8);
+        }
+    }
+    tcg_temp_free_i32(ones);
+    tcg_temp_free_i32(zeroes);
+    return DISAS_NEXT;
+}
-- 
2.17.2


Re: [Qemu-devel] [PATCH v1 06/33] s390x/tcg: Implement VECTOR GENERATE BYTE MASK
Posted by Richard Henderson 6 years, 8 months ago
On 2/26/19 3:38 AM, David Hildenbrand wrote:
> +static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
> +{
> +    const uint16_t i2 = get_field(s->fields, i2);
> +    TCGv_i32 ones = tcg_const_i32(-1u);
> +    TCGv_i32 zeroes = tcg_const_i32(0);
> +    int i;
> +
> +    for (i = 0; i < 16; i++) {
> +        if (extract32(i2, 15 - i, 1)) {
> +            write_vec_element_i32(ones, get_field(s->fields, v1), i, MO_8);
> +        } else {
> +            write_vec_element_i32(zeroes, get_field(s->fields, v1), i, MO_8);
> +        }
> +    }
> +    tcg_temp_free_i32(ones);
> +    tcg_temp_free_i32(zeroes);
> +    return DISAS_NEXT;
> +}

While this works, it's not in the spirit of

> Programming Note: VECTOR GENERATE BYTE
> MASK is the preferred method for setting a vector
> register to all zeroes or ones.

Better, I think, with

uint64_t generate_byte_mask(uint8_t mask)
{
    uint64_t r = 0;
    int i;
    for (i = 0; i < 8; i++) {
        if ((mask >> i) & 1) {
            r |= 0xffull << (i * 8);
        }
    }
    return r;
}

    if (i2 == (i2 & 0xff) * 0x0101) {
        /* masks for both halves of the vector are the same.
           trust tcg to produce a good constant loading.  */
        tcg_gen_gvec_dup64i(vec_full_reg_offset(s, v1), 16, 16,
                            generate_byte_mask(i2 & 0xff));
    } else {
        TCGv_i64 t = tcg_temp_new_i64();
        tcg_gen_movi_i64(t, generate_byte_mask(i2 >> 8));
        write_vec_element_i64(t, v1, 0, MO_64);
        tcg_gen_movi_i64(t, generate_byte_mask(i2 & 0xff));
        write_vec_element_i64(t, v1, 1, MO_64);
        tcg_temp_free_i64();
    }

Somewhere behind tcg_gen_gvec_dup64i, I check to see if the constant can be
decomposed further, which will eventually bottom out at

	vpxor	%xmm0,%xmm0,%xmm0		// all zeros
	vpcmpeq	%xmm0,%xmm0,%xmm0		// all ones

and even more interesting combinations for tcg/aarch64.



r~


Re: [Qemu-devel] [PATCH v1 06/33] s390x/tcg: Implement VECTOR GENERATE BYTE MASK
Posted by David Hildenbrand 6 years, 8 months ago
On 26.02.19 20:12, Richard Henderson wrote:
> On 2/26/19 3:38 AM, David Hildenbrand wrote:
>> +static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
>> +{
>> +    const uint16_t i2 = get_field(s->fields, i2);
>> +    TCGv_i32 ones = tcg_const_i32(-1u);
>> +    TCGv_i32 zeroes = tcg_const_i32(0);
>> +    int i;
>> +
>> +    for (i = 0; i < 16; i++) {
>> +        if (extract32(i2, 15 - i, 1)) {
>> +            write_vec_element_i32(ones, get_field(s->fields, v1), i, MO_8);
>> +        } else {
>> +            write_vec_element_i32(zeroes, get_field(s->fields, v1), i, MO_8);
>> +        }
>> +    }
>> +    tcg_temp_free_i32(ones);
>> +    tcg_temp_free_i32(zeroes);
>> +    return DISAS_NEXT;
>> +}
> 
> While this works, it's not in the spirit of
> 
>> Programming Note: VECTOR GENERATE BYTE
>> MASK is the preferred method for setting a vector
>> register to all zeroes or ones.

Good point, I skipped that note so far.

> 
> Better, I think, with

Many instructions to implement, so little time to fine tune stuff so
far. However I have tests for VGBM, so I can easily get it working. Will
play with it!

> 
> uint64_t generate_byte_mask(uint8_t mask)
> {
>     uint64_t r = 0;
>     int i;
>     for (i = 0; i < 8; i++) {
>         if ((mask >> i) & 1) {
>             r |= 0xffull << (i * 8);
>         }
>     }
>     return r;
> }
> 
>     if (i2 == (i2 & 0xff) * 0x0101) {
>         /* masks for both halves of the vector are the same.
>            trust tcg to produce a good constant loading.  */
>         tcg_gen_gvec_dup64i(vec_full_reg_offset(s, v1), 16, 16,
>                             generate_byte_mask(i2 & 0xff));
>     } else {
>         TCGv_i64 t = tcg_temp_new_i64();
>         tcg_gen_movi_i64(t, generate_byte_mask(i2 >> 8));
>         write_vec_element_i64(t, v1, 0, MO_64);
>         tcg_gen_movi_i64(t, generate_byte_mask(i2 & 0xff));
>         write_vec_element_i64(t, v1, 1, MO_64);
>         tcg_temp_free_i64();
>     }
> 
> Somewhere behind tcg_gen_gvec_dup64i, I check to see if the constant can be
> decomposed further, which will eventually bottom out at
> 
> 	vpxor	%xmm0,%xmm0,%xmm0		// all zeros
> 	vpcmpeq	%xmm0,%xmm0,%xmm0		// all ones
> 
> and even more interesting combinations for tcg/aarch64.
> 
> 

At this point I want to highlight how helpful your reviews are. Amazing! :)

> 
> r~
> 


-- 

Thanks,

David / dhildenb

Re: [Qemu-devel] [PATCH v1 06/33] s390x/tcg: Implement VECTOR GENERATE BYTE MASK
Posted by David Hildenbrand 6 years, 8 months ago
On 26.02.19 20:23, David Hildenbrand wrote:
> On 26.02.19 20:12, Richard Henderson wrote:
>> On 2/26/19 3:38 AM, David Hildenbrand wrote:
>>> +static DisasJumpType op_vgbm(DisasContext *s, DisasOps *o)
>>> +{
>>> +    const uint16_t i2 = get_field(s->fields, i2);
>>> +    TCGv_i32 ones = tcg_const_i32(-1u);
>>> +    TCGv_i32 zeroes = tcg_const_i32(0);
>>> +    int i;
>>> +
>>> +    for (i = 0; i < 16; i++) {
>>> +        if (extract32(i2, 15 - i, 1)) {
>>> +            write_vec_element_i32(ones, get_field(s->fields, v1), i, MO_8);
>>> +        } else {
>>> +            write_vec_element_i32(zeroes, get_field(s->fields, v1), i, MO_8);
>>> +        }
>>> +    }
>>> +    tcg_temp_free_i32(ones);
>>> +    tcg_temp_free_i32(zeroes);
>>> +    return DISAS_NEXT;
>>> +}
>>
>> While this works, it's not in the spirit of
>>
>>> Programming Note: VECTOR GENERATE BYTE
>>> MASK is the preferred method for setting a vector
>>> register to all zeroes or ones.
> 
> Good point, I skipped that note so far.
> 
>>
>> Better, I think, with
> 
> Many instructions to implement, so little time to fine tune stuff so
> far. However I have tests for VGBM, so I can easily get it working. Will
> play with it!
> 
>>
>> uint64_t generate_byte_mask(uint8_t mask)
>> {
>>     uint64_t r = 0;
>>     int i;
>>     for (i = 0; i < 8; i++) {
>>         if ((mask >> i) & 1) {
>>             r |= 0xffull << (i * 8);
>>         }
>>     }
>>     return r;
>> }
>>
>>     if (i2 == (i2 & 0xff) * 0x0101) {
>>         /* masks for both halves of the vector are the same.
>>            trust tcg to produce a good constant loading.  */
>>         tcg_gen_gvec_dup64i(vec_full_reg_offset(s, v1), 16, 16,
>>                             generate_byte_mask(i2 & 0xff));
>>     } else {
>>         TCGv_i64 t = tcg_temp_new_i64();
>>         tcg_gen_movi_i64(t, generate_byte_mask(i2 >> 8));
>>         write_vec_element_i64(t, v1, 0, MO_64);
>>         tcg_gen_movi_i64(t, generate_byte_mask(i2 & 0xff));
>>         write_vec_element_i64(t, v1, 1, MO_64);
>>         tcg_temp_free_i64();
>>     }
>>
>> Somewhere behind tcg_gen_gvec_dup64i, I check to see if the constant can be
>> decomposed further, which will eventually bottom out at
>>
>> 	vpxor	%xmm0,%xmm0,%xmm0		// all zeros
>> 	vpcmpeq	%xmm0,%xmm0,%xmm0		// all ones
>>

Just tested with minor adaptions, works like a charm!

-- 

Thanks,

David / dhildenb