From: eopXD <eop.chen@sifive.com>
Signed-off-by: eop Chen <eop.chen@sifive.com>
Reviewed-by: Frank Chang <frank.chang@sifive.com>
---
target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++
target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++
2 files changed, 41 insertions(+)
diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
index cc80bf00ff..66cfc8c603 100644
--- a/target/riscv/insn_trans/trans_rvv.c.inc
+++ b/target/riscv/insn_trans/trans_rvv.c.inc
@@ -711,6 +711,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
}
@@ -748,6 +749,7 @@ static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
}
@@ -774,6 +776,7 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew)
/* EMUL = 1, NFIELDS = 1 */
data = FIELD_DP32(data, VDATA, LMUL, 0);
data = FIELD_DP32(data, VDATA, NF, 1);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
}
@@ -791,6 +794,7 @@ static bool st_us_mask_op(DisasContext *s, arg_vsm_v *a, uint8_t eew)
/* EMUL = 1, NFIELDS = 1 */
data = FIELD_DP32(data, VDATA, LMUL, 0);
data = FIELD_DP32(data, VDATA, NF, 1);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
}
@@ -862,6 +866,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
}
@@ -891,6 +896,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
fn = fns[eew];
if (fn == NULL) {
return false;
@@ -991,6 +997,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
}
@@ -1043,6 +1050,7 @@ static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
}
@@ -1108,6 +1116,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
+ data = FIELD_DP32(data, VDATA, VTA, s->vta);
return ldff_trans(a->rd, a->rs1, data, fn, s);
}
diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
index 39c79c59c2..1c7015e917 100644
--- a/target/riscv/vector_helper.c
+++ b/target/riscv/vector_helper.c
@@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
uint32_t i, k;
uint32_t nf = vext_nf(desc);
uint32_t max_elems = vext_max_elems(desc, log2_esz);
+ uint32_t esz = 1 << log2_esz;
+ uint32_t total_elems = vext_get_total_elems(desc, esz);
+ uint32_t vta = vext_vta(desc);
for (i = env->vstart; i < env->vl; i++, env->vstart++) {
if (!vm && !vext_elem_mask(v0, i)) {
@@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
}
}
env->vstart = 0;
+ /* set tail elements to 1s */
+ for (k = 0; k < nf; ++k) {
+ vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
+ env->vl * esz, total_elems * esz);
+ }
}
#define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \
@@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
uint32_t i, k;
uint32_t nf = vext_nf(desc);
uint32_t max_elems = vext_max_elems(desc, log2_esz);
+ uint32_t esz = 1 << log2_esz;
+ uint32_t total_elems = vext_get_total_elems(desc, esz);
+ uint32_t vta = vext_vta(desc);
/* load bytes from guest memory */
for (i = env->vstart; i < evl; i++, env->vstart++) {
@@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
}
}
env->vstart = 0;
+ /* set tail elements to 1s */
+ for (k = 0; k < nf; ++k) {
+ vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
+ env->vl * esz, total_elems * esz);
+ }
}
/*
@@ -458,6 +474,9 @@ vext_ldst_index(void *vd, void *v0, target_ulong base,
uint32_t nf = vext_nf(desc);
uint32_t vm = vext_vm(desc);
uint32_t max_elems = vext_max_elems(desc, log2_esz);
+ uint32_t esz = 1 << log2_esz;
+ uint32_t total_elems = vext_get_total_elems(desc, esz);
+ uint32_t vta = vext_vta(desc);
/* load bytes from guest memory */
for (i = env->vstart; i < env->vl; i++, env->vstart++) {
@@ -473,6 +492,11 @@ vext_ldst_index(void *vd, void *v0, target_ulong base,
}
}
env->vstart = 0;
+ /* set tail elements to 1s */
+ for (k = 0; k < nf; ++k) {
+ vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
+ env->vl * esz, total_elems * esz);
+ }
}
#define GEN_VEXT_LD_INDEX(NAME, ETYPE, INDEX_FN, LOAD_FN) \
@@ -540,6 +564,9 @@ vext_ldff(void *vd, void *v0, target_ulong base,
uint32_t nf = vext_nf(desc);
uint32_t vm = vext_vm(desc);
uint32_t max_elems = vext_max_elems(desc, log2_esz);
+ uint32_t esz = 1 << log2_esz;
+ uint32_t total_elems = vext_get_total_elems(desc, esz);
+ uint32_t vta = vext_vta(desc);
target_ulong addr, offset, remain;
/* probe every access*/
@@ -595,6 +622,11 @@ ProbeSuccess:
}
}
env->vstart = 0;
+ /* set tail elements to 1s */
+ for (k = 0; k < nf; ++k) {
+ vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
+ env->vl * esz, total_elems * esz);
+ }
}
#define GEN_VEXT_LDFF(NAME, ETYPE, LOAD_FN) \
--
2.34.1
在 2022/3/7 下午3:10, ~eopxd 写道:
> From: eopXD <eop.chen@sifive.com>
>
> Signed-off-by: eop Chen <eop.chen@sifive.com>
> Reviewed-by: Frank Chang <frank.chang@sifive.com>
> ---
> target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++
> target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++
> 2 files changed, 41 insertions(+)
>
> diff --git a/target/riscv/insn_trans/trans_rvv.c.inc b/target/riscv/insn_trans/trans_rvv.c.inc
> index cc80bf00ff..66cfc8c603 100644
> --- a/target/riscv/insn_trans/trans_rvv.c.inc
> +++ b/target/riscv/insn_trans/trans_rvv.c.inc
> @@ -711,6 +711,7 @@ static bool ld_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
> }
>
> @@ -748,6 +749,7 @@ static bool st_us_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
> }
>
> @@ -774,6 +776,7 @@ static bool ld_us_mask_op(DisasContext *s, arg_vlm_v *a, uint8_t eew)
> /* EMUL = 1, NFIELDS = 1 */
> data = FIELD_DP32(data, VDATA, LMUL, 0);
> data = FIELD_DP32(data, VDATA, NF, 1);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
> }
>
> @@ -791,6 +794,7 @@ static bool st_us_mask_op(DisasContext *s, arg_vsm_v *a, uint8_t eew)
> /* EMUL = 1, NFIELDS = 1 */
> data = FIELD_DP32(data, VDATA, LMUL, 0);
> data = FIELD_DP32(data, VDATA, NF, 1);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_us_trans(a->rd, a->rs1, data, fn, s, true);
> }
>
> @@ -862,6 +866,7 @@ static bool ld_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_stride_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
> }
>
> @@ -891,6 +896,7 @@ static bool st_stride_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> fn = fns[eew];
> if (fn == NULL) {
> return false;
> @@ -991,6 +997,7 @@ static bool ld_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, false);
> }
>
> @@ -1043,6 +1050,7 @@ static bool st_index_op(DisasContext *s, arg_rnfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldst_index_trans(a->rd, a->rs1, a->rs2, data, fn, s, true);
> }
>
> @@ -1108,6 +1116,7 @@ static bool ldff_op(DisasContext *s, arg_r2nfvm *a, uint8_t eew)
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> + data = FIELD_DP32(data, VDATA, VTA, s->vta);
> return ldff_trans(a->rd, a->rs1, data, fn, s);
> }
>
> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
> index 39c79c59c2..1c7015e917 100644
> --- a/target/riscv/vector_helper.c
> +++ b/target/riscv/vector_helper.c
> @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
> uint32_t i, k;
> uint32_t nf = vext_nf(desc);
> uint32_t max_elems = vext_max_elems(desc, log2_esz);
> + uint32_t esz = 1 << log2_esz;
> + uint32_t total_elems = vext_get_total_elems(desc, esz);
> + uint32_t vta = vext_vta(desc);
>
> for (i = env->vstart; i < env->vl; i++, env->vstart++) {
> if (!vm && !vext_elem_mask(v0, i)) {
> @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
> }
> }
> env->vstart = 0;
> + /* set tail elements to 1s */
> + for (k = 0; k < nf; ++k) {
> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
> + env->vl * esz, total_elems * esz);
> + }
> }
>
> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \
> @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
> uint32_t i, k;
> uint32_t nf = vext_nf(desc);
> uint32_t max_elems = vext_max_elems(desc, log2_esz);
> + uint32_t esz = 1 << log2_esz;
> + uint32_t total_elems = vext_get_total_elems(desc, esz);
> + uint32_t vta = vext_vta(desc);
>
> /* load bytes from guest memory */
> for (i = env->vstart; i < evl; i++, env->vstart++) {
> @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
> }
> }
> env->vstart = 0;
> + /* set tail elements to 1s */
> + for (k = 0; k < nf; ++k) {
> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
> + env->vl * esz, total_elems * esz);
> + }
> }
>
This seems incorrect here, and likewise in the following load/store helpers.
In the above instructions, the following elements are loaded:
0 * max_elems ... 0 *max_elems + vl - 1
1 * max_elems ... 1 *max_elems + vl - 1
.......
(nf-1)* max_elems ... (nf-1)*max_elems + vl - 1
So elements[vl .. max_elems - 1] are tail elements; however,
elements[vl .. 1 * total_elems - 1] may not all be:
elements from max_elems to total_elems - 1 are active elements if
total_elems > max_elems (LMUL < 1).
Or must LMUL be greater than or equal to 1 here? I didn't find any
description of this in the spec.
I also have another question about the tail elements for these
load/store instructions:
when nf = 3, LMUL = 1 and vl = vlmax, reg, reg+1 and reg+2 will be loaded;
are the elements in reg+3
(if they belong to the same register group) then tail elements?
Regards,
Weiwei Li
> Weiwei Li <liweiwei@iscas.ac.cn> 於 2022年3月28日 下午7:56 寫道:
>
>
> 在 2022/3/7 下午3:10, ~eopxd 写道:
>> From: eopXD <eop.chen@sifive.com>
>>
>> Signed-off-by: eop Chen <eop.chen@sifive.com>
>> Reviewed-by: Frank Chang <frank.chang@sifive.com>
>> ---
>> target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++
>> target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++
>> 2 files changed, 41 insertions(+)
>>
>> diff --git a/target/riscv/vector_helper.c b/target/riscv/vector_helper.c
>> index 39c79c59c2..1c7015e917 100644
>> --- a/target/riscv/vector_helper.c
>> +++ b/target/riscv/vector_helper.c
>> @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
>> uint32_t i, k;
>> uint32_t nf = vext_nf(desc);
>> uint32_t max_elems = vext_max_elems(desc, log2_esz);
>> + uint32_t esz = 1 << log2_esz;
>> + uint32_t total_elems = vext_get_total_elems(desc, esz);
>> + uint32_t vta = vext_vta(desc);
>> for (i = env->vstart; i < env->vl; i++, env->vstart++) {
>> if (!vm && !vext_elem_mask(v0, i)) {
>> @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0, target_ulong base,
>> }
>> }
>> env->vstart = 0;
>> + /* set tail elements to 1s */
>> + for (k = 0; k < nf; ++k) {
>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
>> + env->vl * esz, total_elems * esz);
>> + }
>> }
>> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN) \
>> @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>> uint32_t i, k;
>> uint32_t nf = vext_nf(desc);
>> uint32_t max_elems = vext_max_elems(desc, log2_esz);
>> + uint32_t esz = 1 << log2_esz;
>> + uint32_t total_elems = vext_get_total_elems(desc, esz);
>> + uint32_t vta = vext_vta(desc);
>> /* load bytes from guest memory */
>> for (i = env->vstart; i < evl; i++, env->vstart++) {
>> @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base, CPURISCVState *env, uint32_t desc,
>> }
>> }
>> env->vstart = 0;
>> + /* set tail elements to 1s */
>> + for (k = 0; k < nf; ++k) {
>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k * total_elems,
>> + env->vl * esz, total_elems * esz);
>> + }
>> }
>>
>
> It seems incorrect here. similar to following load/store helper.
>
> In above instructions, following elements are loaded:
>
> 0 * max_elems ... 0 *max_elems + vl - 1
>
> 1 * max_elems ... 1 *max_elems + vl - 1
>
> .......
>
> (nf-1)* max_elems ... (nf-1)*max_elems + vl - 1
>
> So, the elements[vl .. max_elems - 1] are tail elements, however elements[vl ... 1* total_elems - 1] may not:
>
> elements from max_elems to total_elems - 1 are active elements, If total_elems > max_elems(LMUL< 1)
>
> Or LMUL should be equal or greater than 1 here? I didn't find any description about this from the spec.
>
> I also have another question about the tail elements for these load/store instructions:
>
> when nf = 3, LMUL = 1, vl=vlmax, reg, reg+1, reg+2 will be loaded, then whether elements in reg+3
>
> (if they belong to the same register group) are tail elements?
>
> Regards,
>
> Weiwei Li
>
The LMUL field passed to the vector helper functions from `trans_rvv.c.inc` carries EMUL
(effective LMUL) rather than LMUL. Take trans_rvv.c.inc::ld_us_op for example:
```
/*
* Vector load/store instructions have the EEW encoded
* directly in the instructions. The maximum vector size is
* calculated with EMUL rather than LMUL.
*/
uint8_t emul = vext_get_emul(s, eew);
data = FIELD_DP32(data, VDATA, VM, a->vm);
data = FIELD_DP32(data, VDATA, LMUL, emul);
data = FIELD_DP32(data, VDATA, NF, a->nf);
return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
```
And vext_get_emul always returns an EMUL that covers at least one whole vector register:
```
static uint8_t vext_get_emul(DisasContext *s, uint8_t eew)
{
int8_t emul = eew - s->sew + s->lmul;
return emul < 0 ? 0 : emul;
}
```
In this case I guess the naming is a little misleading: `vext_max_elems` is
equivalent to `vext_get_total_elems` for all load/store instructions, because the clamp
above guarantees that the LMUL seen by the helpers is always greater than or equal to 1. In conclusion, the behavior is correct here.
I don’t understand your second question though. If nf = 3, there will be 3 registers
involved with the instruction (namely reg, reg+1, reg+2). Why do we care about
(reg+3)?
Thanks for pointing out this question and all your efforts for reviewing. I really
appreciate it.
Regards,
eop Chen
在 2022/3/30 下午3:42, 陳約廷 写道:
>
>> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於
>> 2022年3月28日 下午7:56 寫道:
>>
>>
>> 在 2022/3/7 下午3:10, ~eopxd 写道:
>>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>>
>>>
>>> Signed-off-by: eop Chen <eop.chen@sifive.com
>>> <mailto:eop.chen@sifive.com>>
>>> Reviewed-by: Frank Chang <frank.chang@sifive.com
>>> <mailto:frank.chang@sifive.com>>
>>> ---
>>> target/riscv/insn_trans/trans_rvv.c.inc | 9 +++++++
>>> target/riscv/vector_helper.c | 32 +++++++++++++++++++++++++
>>> 2 files changed, 41 insertions(+)
>>>
>>> diff --git a/target/riscv/vector_helper.c
>>> b/target/riscv/vector_helper.c
>>> index 39c79c59c2..1c7015e917 100644
>>> --- a/target/riscv/vector_helper.c
>>> +++ b/target/riscv/vector_helper.c
>>> @@ -289,6 +289,9 @@ vext_ldst_stride(void *vd, void *v0,
>>> target_ulong base,
>>> uint32_t i, k;
>>> uint32_t nf = vext_nf(desc);
>>> uint32_t max_elems = vext_max_elems(desc, log2_esz);
>>> + uint32_t esz = 1 << log2_esz;
>>> + uint32_t total_elems = vext_get_total_elems(desc, esz);
>>> + uint32_t vta = vext_vta(desc);
>>> for (i = env->vstart; i < env->vl; i++, env->vstart++) {
>>> if (!vm && !vext_elem_mask(v0, i)) {
>>> @@ -303,6 +306,11 @@ vext_ldst_stride(void *vd, void *v0,
>>> target_ulong base,
>>> }
>>> }
>>> env->vstart = 0;
>>> + /* set tail elements to 1s */
>>> + for (k = 0; k < nf; ++k) {
>>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k *
>>> total_elems,
>>> + env->vl * esz, total_elems
>>> * esz);
>>> + }
>>> }
>>> #define GEN_VEXT_LD_STRIDE(NAME, ETYPE, LOAD_FN)
>>> \
>>> @@ -348,6 +356,9 @@ vext_ldst_us(void *vd, target_ulong base,
>>> CPURISCVState *env, uint32_t desc,
>>> uint32_t i, k;
>>> uint32_t nf = vext_nf(desc);
>>> uint32_t max_elems = vext_max_elems(desc, log2_esz);
>>> + uint32_t esz = 1 << log2_esz;
>>> + uint32_t total_elems = vext_get_total_elems(desc, esz);
>>> + uint32_t vta = vext_vta(desc);
>>> /* load bytes from guest memory */
>>> for (i = env->vstart; i < evl; i++, env->vstart++) {
>>> @@ -359,6 +370,11 @@ vext_ldst_us(void *vd, target_ulong base,
>>> CPURISCVState *env, uint32_t desc,
>>> }
>>> }
>>> env->vstart = 0;
>>> + /* set tail elements to 1s */
>>> + for (k = 0; k < nf; ++k) {
>>> + vext_set_elems_1s_fns[log2_esz](vd, vta, env->vl + k *
>>> total_elems,
>>> + env->vl * esz, total_elems
>>> * esz);
>>> + }
>>> }
>>>
>>
>> It seems incorrect here. similar to following load/store helper.
>>
>> In above instructions, following elements are loaded:
>>
>> 0 * max_elems ... 0 *max_elems + vl - 1
>>
>> 1 * max_elems ... 1 *max_elems + vl - 1
>>
>> .......
>>
>> (nf-1)* max_elems ... (nf-1)*max_elems + vl - 1
>>
>> So, the elements[vl .. max_elems - 1] are tail elements, however
>> elements[vl ... 1* total_elems - 1] may not:
>>
>> elements from max_elems to total_elems - 1 are active elements, If
>> total_elems > max_elems(LMUL< 1)
>>
>> Or LMUL should be equal or greater than 1 here? I didn't find any
>> description about this from the spec.
>>
>> I also have another question about the tail elements for these
>> load/store instructions:
>>
>> when nf = 3, LMUL = 1, vl=vlmax, reg, reg+1, reg+2 will be loaded,
>> then whether elements in reg+3
>>
>> (if they belong to the same register group) are tail elements?
>>
>> Regards,
>>
>> Weiwei Li
>>
>
> The LMUL sent into vector helper function from `trans_rvv.c.inc` takes
> EMUL
> (effective LMUL) instead of LMUL. Take trans_rvv.c.inc::ld_us_op for
> example,
>
> ```
> /*
> * Vector load/store instructions have the EEW encoded
> * directly in the instructions. The maximum vector size is
> * calculated with EMUL rather than LMUL.
> */
> uint8_t emul = vext_get_emul(s, eew);
> data = FIELD_DP32(data, VDATA, VM, a->vm);
> data = FIELD_DP32(data, VDATA, LMUL, emul);
> data = FIELD_DP32(data, VDATA, NF, a->nf);
> return ldst_us_trans(a->rd, a->rs1, data, fn, s, false);
> ```
>
> And vext_get_emul always return something at least the length of a
> vector register:
>
> ```
> static uint8_t vext_get_emul(DisasContext *s, uint8_t eew)
> {
> int8_t emul = eew - s->sew + s->lmul;
> return emul < 0 ? 0 : emul;
> }
> ```
>
> In this case I guess the naming is a little bit misleading,
> `vext_max_elems` would be
> equivalent to `vext_get_total_elems` for all load / store
> instructions, which guarantees
> That LMUL is always equal or greater to 1. In conclusion, the behavior
> is correct here.
OK. Thanks for your patient explanation.
Another question: max_elems is equal to total_elems when lmul >= 0,
so max_elems can be reused here instead of calculating total_elems again.
>
> I don’t understand your second question though. If nf = 3, there will
> be 3 registers
> involved with the instruction (namely reg, reg+1, reg+2). Why do we
> care about
> (reg+3)?
>
I was just considering the register group here: reg, reg+1, reg+2 and reg+3 may
belong to the same register group.
Regards,
Weiwei Li
> Thanks for pointing out this question and all your efforts for
> reviewing. I really
> appreciate it.
>
> Regards,
>
> eop Chen
> Weiwei Li <liweiwei@iscas.ac.cn> 於 2022年3月30日 下午4:27 寫道: > 在 2022/3/30 下午3:42, 陳約廷 写道: >> >>> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 2022年3月28日 下午7:56 寫道: >>> >>> >>> 在 2022/3/7 下午3:10, ~eopxd 写道: >>>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>> > Another question: max_elems is equal to total_elems when lmul >= 0. > > So max_elems can be reused here instead of caculating total_elems again. > >> >> I don’t understand your second question though. If nf = 3, there will be 3 registers >> involved with the instruction (namely reg, reg+1, reg+2). Why do we care about >> (reg+3)? >> > I just consider register group here. Reg, reg+1, reg+2 and reg+3 may belong to the same register group. > > Regards, > > Weiwei Li > According to v-spec (under section 7.8): Each field will be held in successively numbered vector register groups. When EMUL>1 each field will occupy a vector register group held in multiple successively numbered vector registers, and the vector register group for each field must follow the usual vector register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, each field’s vector register group must start at an even vector register, but does not have to start at a multiple of 8 vector register number). I think the spec makes clear that NFIELDS represents the number of register groups involved in this instruction. Therefore in a register group of 4 (LMUL = m4), NFIELDS should be no more than 2. The `vlmax` here would be (VLEN * 4 / EEW). In this sense, if the `vl` provided for the vector instruction is within the range 2 * vlmax / 4 <= vl <= 3 * vlmax / 4, the elements in the 4th register (namely reg+3) will all be counted as tail elements. I hope this answers your question. Regards, eop Chen
在 2022/3/30 下午6:02, eop Chen 写道: > > >> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 >> 2022年3月30日 下午4:27 寫道: >> 在 2022/3/30 下午3:42, 陳約廷 写道: >>> >>>> Weiwei Li <liweiwei@iscas.ac.cn <mailto:liweiwei@iscas.ac.cn>> 於 >>>> 2022年3月28日 下午7:56 寫道: >>>> >>>> >>>> 在 2022/3/7 下午3:10, ~eopxd 写道: >>>>> From: eopXD <eop.chen@sifive.com <mailto:eop.chen@sifive.com>> >> >> Another question: max_elems is equal to total_elems when lmul >= 0. >> >> So max_elems can be reused here instead of caculating total_elems again. >> >>> >>> I don’t understand your second question though. If nf = 3, there >>> will be 3 registers >>> involved with the instruction (namely reg, reg+1, reg+2). Why do we >>> care about >>> (reg+3)? >>> >> I just consider register group here. Reg, reg+1, reg+2 and reg+3 may >> belong to the same register group. >> >> Regards, >> >> Weiwei Li >> > > According to v-spec (under section 7.8): > > Each field will be held in successively numbered vector register > groups. When EMUL>1 > each field will occupy a vector register group held in multiple > successively numbered > vector registers, and the vector register group for each field > must follow the usual vector > register alignment constraints (e.g., when EMUL=2 and NFIELDS=4, > each field’s vector > register group must start at an even vector register, but does not > have to start at a multiple > of 8 vector register number). > > > I think the spec has explained itself that NFIELDS represents the > number of register groups involved > in this instruction. Therefore in a register group of 4 (LMUL = m2), > NFIELD should be no more than 2. > The `vlmax` here would be (VLEN * 4 / EEW). In this sense, if the `vl` > provided for the vector instruction > is within the range 2 * vlmax / 4 <= vl <= 3 * vlmax / 4, the elements > in the 4th register (namely reg+3) > will all be counted as tail elements. > > I hope this answers your question. OK, Thanks a lot. 
This does answer my question, although what I really wanted to know was the case for EMUL=1 and NFIELDS=3. Since NFIELDS represents the number of register groups, and the total of EMUL * NFIELDS is not treated as one register group, the elements in reg+3 should not be counted as tail elements in my case. Regards, Weiwei Li > > Regards, > > eop Chen > >
© 2016 - 2026 Red Hat, Inc.