target/s390x/cpu_models.c | 2 -- target/s390x/gen-features.c | 2 ++ target/s390x/tcg/crypto_helper.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-)
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19-rc6.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/cpu_models.c | 2 --
target/s390x/gen-features.c | 2 ++
target/s390x/tcg/crypto_helper.c | 32 ++++++++++++++++++++++++++++++++
3 files changed, 34 insertions(+), 2 deletions(-)
diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c
index 1a562d2801..90aac3d795 100644
--- a/target/s390x/cpu_models.c
+++ b/target/s390x/cpu_models.c
@@ -421,8 +421,6 @@ static void check_consistency(const S390CPUModel *model)
{ S390_FEAT_DFP_FAST, S390_FEAT_DFP },
{ S390_FEAT_TRANSACTIONAL_EXE, S390_FEAT_STFLE_49 },
{ S390_FEAT_EDAT_2, S390_FEAT_EDAT},
- { S390_FEAT_MSA_EXT_5, S390_FEAT_KIMD_SHA_512 },
- { S390_FEAT_MSA_EXT_5, S390_FEAT_KLMD_SHA_512 },
{ S390_FEAT_MSA_EXT_4, S390_FEAT_MSA_EXT_3 },
{ S390_FEAT_SIE_CMMA, S390_FEAT_CMM },
{ S390_FEAT_SIE_CMMA, S390_FEAT_SIE_GSLS },
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..3d333e2789 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..afd29f9cf0 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -12,12 +12,38 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +78,12 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 114:
+ if (r1 & 1 || !r1 || r2 & 1 || !r2)
+ tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra);
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
On Wed, Jul 20, 2022 at 02:08:59PM +0200, Jason A. Donenfeld wrote: > + case 114: > + if (r1 & 1 || !r1 || r2 & 1 || !r2) > + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); This is already handled in op_msa. I'm going to remove it for v4.
Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: > In order for hosts running inside of TCG to initialize the kernel's > random number generator, we should support the PRNO_TRNG instruction, > backed in the usual way with the qemu_guest_getrandom helper. This is > confirmed working on Linux 5.19-rc6. > > Cc: Thomas Huth <thuth@redhat.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Richard Henderson <richard.henderson@linaro.org> > Cc: Cornelia Huck <cohuck@redhat.com> > Cc: Harald Freudenberger <freude@linux.ibm.com> > Cc: Holger Dengler <dengler@linux.ibm.com> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> [...] > + case 114: > + if (r1 & 1 || !r1 || r2 & 1 || !r2) > + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); > + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); > + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); > + break; I think I agree with Harald that some aspects are missing. Linux does not seem to check, but we should also modify the query function to indicate the availability of 114. As the msa helper deals with many instructions ... 
target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) ... and in theory other instructions might also have 114 we should at least check that this is ppno/prno. Or we split out a prno helper from the msa helper.
On 02.08.22 15:26, Christian Borntraeger wrote: > > > Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: >> In order for hosts running inside of TCG to initialize the kernel's >> random number generator, we should support the PRNO_TRNG instruction, >> backed in the usual way with the qemu_guest_getrandom helper. This is >> confirmed working on Linux 5.19-rc6. >> >> Cc: Thomas Huth <thuth@redhat.com> >> Cc: David Hildenbrand <david@redhat.com> >> Cc: Richard Henderson <richard.henderson@linaro.org> >> Cc: Cornelia Huck <cohuck@redhat.com> >> Cc: Harald Freudenberger <freude@linux.ibm.com> >> Cc: Holger Dengler <dengler@linux.ibm.com> >> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > [...] >> + case 114: >> + if (r1 & 1 || !r1 || r2 & 1 || !r2) >> + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >> + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); >> + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); >> + break; > > I think I agree with Harald that some aspects are missing. > Linux does not seem to check, but we should also modify the query function to > indicate the availability of 114. > > As the msa helper deals with many instructions > ... 
> target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) > target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) > target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) > target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) > target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) > target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) > target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) > target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) > target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) > target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) > target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) > target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) > ... > and in theory other instructions might also have 114 we should at least check that this is ppno/prno. > Or we split out a prno helper from the msa helper. > Doesn't s390_get_feat_block(type, subfunc); if (!test_be_bit(fc, subfunc)) { tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); } check that? As long as we don't implement 114 for any other instruction. that should properly fence off the other instructions. -- Thanks, David / dhildenb
Am 02.08.22 um 15:54 schrieb David Hildenbrand: > On 02.08.22 15:26, Christian Borntraeger wrote: >> >> >> Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: >>> In order for hosts running inside of TCG to initialize the kernel's >>> random number generator, we should support the PRNO_TRNG instruction, >>> backed in the usual way with the qemu_guest_getrandom helper. This is >>> confirmed working on Linux 5.19-rc6. >>> >>> Cc: Thomas Huth <thuth@redhat.com> >>> Cc: David Hildenbrand <david@redhat.com> >>> Cc: Richard Henderson <richard.henderson@linaro.org> >>> Cc: Cornelia Huck <cohuck@redhat.com> >>> Cc: Harald Freudenberger <freude@linux.ibm.com> >>> Cc: Holger Dengler <dengler@linux.ibm.com> >>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> >> [...] >>> + case 114: >>> + if (r1 & 1 || !r1 || r2 & 1 || !r2) >>> + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>> + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); >>> + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); >>> + break; >> >> I think I agree with Harald that some aspects are missing. >> Linux does not seem to check, but we should also modify the query function to >> indicate the availability of 114. >> >> As the msa helper deals with many instructions >> ... 
>> target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) >> target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) >> target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) >> target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) >> target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) >> target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) >> target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) >> target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) >> target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) >> target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) >> target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) >> target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) >> ... >> and in theory other instructions might also have 114 we should at least check that this is ppno/prno. >> Or we split out a prno helper from the msa helper. >> > > Doesn't > > s390_get_feat_block(type, subfunc); > if (!test_be_bit(fc, subfunc)) { > tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); > } > > check that? As long as we don't implement 114 for any other instruction. > that should properly fence off the other instructions. Right that would help. We should still take care of the query function.
On 02.08.22 16:01, Christian Borntraeger wrote: > > > Am 02.08.22 um 15:54 schrieb David Hildenbrand: >> On 02.08.22 15:26, Christian Borntraeger wrote: >>> >>> >>> Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: >>>> In order for hosts running inside of TCG to initialize the kernel's >>>> random number generator, we should support the PRNO_TRNG instruction, >>>> backed in the usual way with the qemu_guest_getrandom helper. This is >>>> confirmed working on Linux 5.19-rc6. >>>> >>>> Cc: Thomas Huth <thuth@redhat.com> >>>> Cc: David Hildenbrand <david@redhat.com> >>>> Cc: Richard Henderson <richard.henderson@linaro.org> >>>> Cc: Cornelia Huck <cohuck@redhat.com> >>>> Cc: Harald Freudenberger <freude@linux.ibm.com> >>>> Cc: Holger Dengler <dengler@linux.ibm.com> >>>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> >>> [...] >>>> + case 114: >>>> + if (r1 & 1 || !r1 || r2 & 1 || !r2) >>>> + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>>> + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); >>>> + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); >>>> + break; >>> >>> I think I agree with Harald that some aspects are missing. >>> Linux does not seem to check, but we should also modify the query function to >>> indicate the availability of 114. >>> >>> As the msa helper deals with many instructions >>> ... 
>>> target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) >>> target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) >>> target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) >>> target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) >>> target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) >>> target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) >>> target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) >>> target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) >>> target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) >>> target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) >>> target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) >>> target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) >>> ... >>> and in theory other instructions might also have 114 we should at least check that this is ppno/prno. >>> Or we split out a prno helper from the msa helper. >>> >> >> Doesn't >> >> s390_get_feat_block(type, subfunc); >> if (!test_be_bit(fc, subfunc)) { >> tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >> } >> >> check that? As long as we don't implement 114 for any other instruction. >> that should properly fence off the other instructions. > > Right that would help. We should still take care of the query function. > s390_get_feat_block() should already take care of that as well, no? -- Thanks, David / dhildenb
Am 02.08.22 um 16:53 schrieb David Hildenbrand: > On 02.08.22 16:01, Christian Borntraeger wrote: >> >> >> Am 02.08.22 um 15:54 schrieb David Hildenbrand: >>> On 02.08.22 15:26, Christian Borntraeger wrote: >>>> >>>> >>>> Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: >>>>> In order for hosts running inside of TCG to initialize the kernel's >>>>> random number generator, we should support the PRNO_TRNG instruction, >>>>> backed in the usual way with the qemu_guest_getrandom helper. This is >>>>> confirmed working on Linux 5.19-rc6. >>>>> >>>>> Cc: Thomas Huth <thuth@redhat.com> >>>>> Cc: David Hildenbrand <david@redhat.com> >>>>> Cc: Richard Henderson <richard.henderson@linaro.org> >>>>> Cc: Cornelia Huck <cohuck@redhat.com> >>>>> Cc: Harald Freudenberger <freude@linux.ibm.com> >>>>> Cc: Holger Dengler <dengler@linux.ibm.com> >>>>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> >>>> [...] >>>>> + case 114: >>>>> + if (r1 & 1 || !r1 || r2 & 1 || !r2) >>>>> + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>>>> + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); >>>>> + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); >>>>> + break; >>>> >>>> I think I agree with Harald that some aspects are missing. >>>> Linux does not seem to check, but we should also modify the query function to >>>> indicate the availability of 114. >>>> >>>> As the msa helper deals with many instructions >>>> ... 
>>>> target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) >>>> target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) >>>> target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) >>>> target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) >>>> target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) >>>> target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) >>>> target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) >>>> target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) >>>> target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) >>>> target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) >>>> target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) >>>> target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) >>>> ... >>>> and in theory other instructions might also have 114 we should at least check that this is ppno/prno. >>>> Or we split out a prno helper from the msa helper. >>>> >>> >>> Doesn't >>> >>> s390_get_feat_block(type, subfunc); >>> if (!test_be_bit(fc, subfunc)) { >>> tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>> } >>> >>> check that? As long as we don't implement 114 for any other instruction. >>> that should properly fence off the other instructions. >> >> Right that would help. We should still take care of the query function. >> > s390_get_feat_block() should already take care of that as well, no? Ah right, yes it fills subfunc. So yes, that should do the trick. Sorry for the noise.
On 02.08.22 17:15, Christian Borntraeger wrote: > > > Am 02.08.22 um 16:53 schrieb David Hildenbrand: >> On 02.08.22 16:01, Christian Borntraeger wrote: >>> >>> >>> Am 02.08.22 um 15:54 schrieb David Hildenbrand: >>>> On 02.08.22 15:26, Christian Borntraeger wrote: >>>>> >>>>> >>>>> Am 20.07.22 um 14:08 schrieb Jason A. Donenfeld: >>>>>> In order for hosts running inside of TCG to initialize the kernel's >>>>>> random number generator, we should support the PRNO_TRNG instruction, >>>>>> backed in the usual way with the qemu_guest_getrandom helper. This is >>>>>> confirmed working on Linux 5.19-rc6. >>>>>> >>>>>> Cc: Thomas Huth <thuth@redhat.com> >>>>>> Cc: David Hildenbrand <david@redhat.com> >>>>>> Cc: Richard Henderson <richard.henderson@linaro.org> >>>>>> Cc: Cornelia Huck <cohuck@redhat.com> >>>>>> Cc: Harald Freudenberger <freude@linux.ibm.com> >>>>>> Cc: Holger Dengler <dengler@linux.ibm.com> >>>>>> Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> >>>>> [...] >>>>>> + case 114: >>>>>> + if (r1 & 1 || !r1 || r2 & 1 || !r2) >>>>>> + tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>>>>> + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); >>>>>> + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); >>>>>> + break; >>>>> >>>>> I think I agree with Harald that some aspects are missing. >>>>> Linux does not seem to check, but we should also modify the query function to >>>>> indicate the availability of 114. >>>>> >>>>> As the msa helper deals with many instructions >>>>> ... 
>>>>> target/s390x/tcg/insn-data.def: D(0xb91e, KMAC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMAC) >>>>> target/s390x/tcg/insn-data.def: D(0xb928, PCKMO, RRE, MSA3, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCKMO) >>>>> target/s390x/tcg/insn-data.def: D(0xb92a, KMF, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMF) >>>>> target/s390x/tcg/insn-data.def: D(0xb92b, KMO, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMO) >>>>> target/s390x/tcg/insn-data.def: D(0xb92c, PCC, RRE, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PCC) >>>>> target/s390x/tcg/insn-data.def: D(0xb92d, KMCTR, RRF_b, MSA4, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMCTR) >>>>> target/s390x/tcg/insn-data.def: D(0xb92e, KM, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KM) >>>>> target/s390x/tcg/insn-data.def: D(0xb92f, KMC, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMC) >>>>> target/s390x/tcg/insn-data.def: D(0xb929, KMA, RRF_b, MSA8, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KMA) >>>>> target/s390x/tcg/insn-data.def: D(0xb93c, PPNO, RRE, MSA5, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_PPNO) >>>>> target/s390x/tcg/insn-data.def: D(0xb93e, KIMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KIMD) >>>>> target/s390x/tcg/insn-data.def: D(0xb93f, KLMD, RRE, MSA, 0, 0, 0, 0, msa, 0, S390_FEAT_TYPE_KLMD) >>>>> ... >>>>> and in theory other instructions might also have 114 we should at least check that this is ppno/prno. >>>>> Or we split out a prno helper from the msa helper. >>>>> >>>> >>>> Doesn't >>>> >>>> s390_get_feat_block(type, subfunc); >>>> if (!test_be_bit(fc, subfunc)) { >>>> tcg_s390_program_interrupt(env, PGM_SPECIFICATION, ra); >>>> } >>>> >>>> check that? As long as we don't implement 114 for any other instruction. >>>> that should properly fence off the other instructions. >>> >>> Right that would help. We should still take care of the query function. >>> >> s390_get_feat_block() should already take care of that as well, no? > > Ah right, yes it fills subfunc. So yes, that should do the trick. 
Sorry for the noise. > I had to look at that 2 times as well ... -- Thanks, David / dhildenb
Hi David, Christian, While this thread has your attention, I thought I'd reiterate my offer in: https://lore.kernel.org/qemu-devel/YuEoUwzDzBqFFpxe@zx2c4.com/ Do either of you want to "take ownership" of this patch to bring it past the finish line, and I can provide whatever additional crypto code you need for that? Jason
On 02.08.22 17:28, Jason A. Donenfeld wrote: > Hi David, Christian, > > While this thread has your attention, I thought I'd reiterate my offer in: > https://lore.kernel.org/qemu-devel/YuEoUwzDzBqFFpxe@zx2c4.com/ > > Do either of you want to "take ownership" of this patch to bring it > past the finish line, and I can provide whatever additional crypto > code you need for that? For me the patch is good enough. But sure, having a SHA512 implementation would be nice ... Long story short, I'll wire up whatever crypto stuff you can come up with ;) -- Thanks, David / dhildenb
On Tue, Aug 02, 2022 at 05:32:26PM +0200, David Hildenbrand wrote: > On 02.08.22 17:28, Jason A. Donenfeld wrote: > > Hi David, Christian, > > > > While this thread has your attention, I thought I'd reiterate my offer in: > > https://lore.kernel.org/qemu-devel/YuEoUwzDzBqFFpxe@zx2c4.com/ > > > > Do either of you want to "take ownership" of this patch to bring it > > past the finish line, and I can provide whatever additional crypto > > code you need for that? > > For me the patch is good enough. But sure, having a SHA512 > implementation would be nice ... > > Long story short, I'll wire up whatever crypto stuff you can come up with ;) Long story short, I started to take you up on that offer, but because I am an insane person, before I knew it, the whole thing was done... Patch series incoming. Jason
In addition to the prior TRNG patch from v3, this v4 adds SHA-512 support. I know, I know, I know -- I fussed around asking if somebody would help me implement this because it was "oh so hard", and offered to do the crypto part if someone would do the rest. But then once I had the crypto part, I wanted some way to test it and then... and then the implementation worked and passed the test vectors. So now these two patches together implement MSA EXT 5, and appear to be working with Linux's drivers for it. Cc: Thomas Huth <thuth@redhat.com> Cc: David Hildenbrand <david@redhat.com> Cc: Christian Borntraeger <borntraeger@linux.ibm.com> Cc: Richard Henderson <richard.henderson@linaro.org> Cc: Cornelia Huck <cohuck@redhat.com> Cc: Harald Freudenberger <freude@linux.ibm.com> Cc: Holger Dengler <dengler@linux.ibm.com> Jason A. Donenfeld (2): target/s390x: support PRNO_TRNG instruction target/s390x: support SHA-512 extensions target/s390x/gen-features.c | 4 + target/s390x/tcg/crypto_helper.c | 146 +++++++++++++++++++++++++++++++ 2 files changed, 150 insertions(+) -- 2.35.1
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 ++
target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..3d333e2789 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..8ad4ef1ace 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -12,12 +12,38 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +78,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 114: /* CPACF_PRNO_TRNG */
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
In order to fully support MSA_EXT_5, we have to also support the SHA-512
special instructions. So implement those.
The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 +
target/s390x/tcg/crypto_helper.c | 116 +++++++++++++++++++++++++++++++
2 files changed, 118 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 3d333e2789..b6d804fa6d 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
S390_FEAT_MSA_EXT_5,
S390_FEAT_PRNO_TRNG,
+ S390_FEAT_KIMD_SHA_512,
+ S390_FEAT_KLMD_SHA_512,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 8ad4ef1ace..475627aa83 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -19,6 +19,112 @@
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static void kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+ uint64_t z[8], b[8], a[8], w[16], t;
+ int i, j;
+
+ for (i = 0; i < 8; ++i)
+ z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+
+ while (*len_reg >= 128) {
+ for (i = 0; i < 16; ++i) {
+ if (message_reg)
+ w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, *message_reg + 8 * i), ra);
+ else
+ w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+ }
+
+ for (i = 0; i < 80; ++i) {
+ for (j = 0; j < 8; ++j)
+ b[j] = a[j];
+ t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+ b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+ b[3] += t;
+ for (j = 0; j < 8; ++j)
+ a[(j + 1) % 8] = b[j];
+ if (i % 16 == 15) {
+ for (j = 0; j < 16; ++j)
+ w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) +
+ sigma1(w[(j + 14) % 16]);
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ a[i] += z[i];
+ z[i] = a[i];
+ }
+
+ if (message_reg)
+ *message_reg += 128;
+ else
+ stack_buffer += 128;
+ *len_reg -= 128;
+ }
+
+ for (i = 0; i < 8; ++i)
+ cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+}
+
+static void klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg)
+{
+ uint8_t x[256];
+ uint64_t i;
+ int j;
+
+ kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+ for (i = 0; i < *len_reg; ++i)
+ x[i] = cpu_ldub_data_ra(env, wrap_address(env, *message_reg + i), ra);
+ *message_reg += *len_reg;
+ *len_reg = 0;
+ memset(x + i, 0, sizeof(x) - i);
+ x[i] = 128;
+ i = i < 112 ? 128 : 256;
+ for (j = 0; j < 16; ++j)
+ x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+ kimd_sha512(env, ra, parameter_block, NULL, &i, x);
+}
+
static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
uint64_t *buf_reg, uint64_t *len_reg)
{
@@ -78,6 +184,16 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 3: /* CPACF_*_SHA_512 */
+ switch (type) {
+ case S390_FEAT_TYPE_KIMD:
+ kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+ break;
+ case S390_FEAT_TYPE_KLMD:
+ klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+ break;
+ }
+ break;
case 114: /* CPACF_PRNO_TRNG */
fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
--
2.35.1
On 02.08.22 21:00, Jason A. Donenfeld wrote: > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > special instructions. So implement those. > > The implementation began as something TweetNacl-like, and then was > adjusted to be useful here. It's not very beautiful, but it is quite > short and compact, which is what we're going for. > Do we have to worry about copyright/authorship of the original code or did you write that from scratch? [...] I cannot really comment on the actual math, so I'll point out some code style thingies. > +static void kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > +{ > + uint64_t z[8], b[8], a[8], w[16], t; > + int i, j; > + > + for (i = 0; i < 8; ++i) > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); Please always use curly brackets in QEMU for code blocks, they are mandatory. > + > + while (*len_reg >= 128) { > + for (i = 0; i < 16; ++i) { i++, also for all cases below. 
> + if (message_reg) > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, *message_reg + 8 * i), ra); > + else > + w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]); > + } > + > + for (i = 0; i < 80; ++i) { > + for (j = 0; j < 8; ++j) > + b[j] = a[j]; > + t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16]; > + b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]); > + b[3] += t; > + for (j = 0; j < 8; ++j) > + a[(j + 1) % 8] = b[j]; > + if (i % 16 == 15) { > + for (j = 0; j < 16; ++j) > + w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + > + sigma1(w[(j + 14) % 16]); > + } > + } > + > + for (i = 0; i < 8; ++i) { > + a[i] += z[i]; > + z[i] = a[i]; > + } > + > + if (message_reg) > + *message_reg += 128; > + else > + stack_buffer += 128; > + *len_reg -= 128; > + } > + > + for (i = 0; i < 8; ++i) > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); > +} > + > +static void klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg) > +{ > + uint8_t x[256]; > + uint64_t i; > + int j; > + > + kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > + for (i = 0; i < *len_reg; ++i) > + x[i] = cpu_ldub_data_ra(env, wrap_address(env, *message_reg + i), ra); > + *message_reg += *len_reg; > + *len_reg = 0; > + memset(x + i, 0, sizeof(x) - i); > + x[i] = 128; > + i = i < 112 ? 128 : 256; > + for (j = 0; j < 16; ++j) > + x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra); > + kimd_sha512(env, ra, parameter_block, NULL, &i, x); > +} Are we properly handling the length register (r2 + 1) in the 24-bit/31-bit addressing mode? Similarly, are we properly handling updates to the message register (r2) depending on the addressing mode? 
It's worth noting that we might want to implement (also for PRNO-TRNG): "The operation is ended when all source bytes in the second operand have been processed (called normal completion), or when a CPU-determined number of blocks that is less than the length of the second operand have been processed (called partial completion). The CPU-determined number of blocks depends on the model, and may be a different number each time the instruction is executed. The CPU-determined number of blocks is usually nonzero. In certain unusual situations, this number may be zero, and condition code 3 may be set with no progress." Otherwise, a large length can make us loop quite a while in QEMU, without the chance to deliver any other interrupts. -- Thanks, David / dhildenb
Hi David, On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote: > On 02.08.22 21:00, Jason A. Donenfeld wrote: > > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > > special instructions. So implement those. > > > > The implementation began as something TweetNacl-like, and then was > > adjusted to be useful here. It's not very beautiful, but it is quite > > short and compact, which is what we're going for. > > > > Do we have to worry about copyright/authorship of the original code or > did you write that from scratch? I actually don't really remember how much of that is leftover from tweetnacl and how much I've rewritten - I've had some variant of this code or another kicking around in various projects and repos for a long time. But the tweetnacl stuff is public domain to begin with, so all good. > Are we properly handling the length register (r2 + 1) in the > 24-bit/31-bit addressing mode? > Similarly, are we properly handling updates to the message register (r2) > depending on the addressing mode? Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess I'll look into that. > It's worth noting that we might want to implement (also for PRNO-TRNG): > > "The operation is ended when all > source bytes in the second operand have been pro- > cessed (called normal completion), or when a CPU- > determined number of blocks that is less than the > length of the second operand have been processed > (called partial completion). The CPU-determined > number of blocks depends on the model, and may be > a different number each time the instruction is exe- > cuted. The CPU-determined number of blocks is usu- > ally nonzero. In certain unusual situations, this > number may be zero, and condition code 3 may be > set with no progress." > > Otherwise, a large length can make us loop quite a while in QEMU, > without the chance to deliver any other interrupts. Hmm, okay. 
Looking at the Linux code, I see: s.even = (unsigned long)src; s.odd = (unsigned long)src_len; asm volatile( " lgr 0,%[fc]\n" " lgr 1,%[pba]\n" "0: .insn rre,%[opc] << 16,0,%[src]\n" " brc 1,0b\n" /* handle partial completion */ : [src] "+&d" (s.pair) : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), [opc] "i" (CPACF_KIMD) : "cc", "memory", "0", "1"); So I guess that means it'll just loop until it's done? Or do I need to return "1" from HELPER(msa)? Jason
On 2022-08-03 14:14, Jason A. Donenfeld wrote: > Hi David, > > On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote: >> On 02.08.22 21:00, Jason A. Donenfeld wrote: >> > In order to fully support MSA_EXT_5, we have to also support the SHA-512 >> > special instructions. So implement those. >> > >> > The implementation began as something TweetNacl-like, and then was >> > adjusted to be useful here. It's not very beautiful, but it is quite >> > short and compact, which is what we're going for. >> > >> >> Do we have to worry about copyright/authorship of the original code or >> did you write that from scratch? > > I actually don't really remember how much of that is leftover from > tweetnacl and how much I've rewritten - I've had some variant of this > code or another kicking around in various projects and repos for a long > time. But the tweetnacl stuff is public domain to begin with, so all > good. > >> Are we properly handling the length register (r2 + 1) in the >> 24-bit/31-bit addressing mode? >> Similarly, are we properly handling updates to the message register >> (r2) >> depending on the addressing mode? > > Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess > I'll look into that. > >> It's worth noting that we might want to implement (also for >> PRNO-TRNG): >> >> "The operation is ended when all >> source bytes in the second operand have been pro- >> cessed (called normal completion), or when a CPU- >> determined number of blocks that is less than the >> length of the second operand have been processed >> (called partial completion). The CPU-determined >> number of blocks depends on the model, and may be >> a different number each time the instruction is exe- >> cuted. The CPU-determined number of blocks is usu- >> ally nonzero. In certain unusual situations, this >> number may be zero, and condition code 3 may be >> set with no progress." 
>> >> Otherwise, a large length can make us loop quite a while in QEMU, >> without the chance to deliver any other interrupts. > > Hmm, okay. Looking at the Linux code, I see: > > s.even = (unsigned long)src; > s.odd = (unsigned long)src_len; > asm volatile( > " lgr 0,%[fc]\n" > " lgr 1,%[pba]\n" > "0: .insn rre,%[opc] << 16,0,%[src]\n" > " brc 1,0b\n" /* handle partial completion */ > : [src] "+&d" (s.pair) > : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), > [opc] "i" (CPACF_KIMD) > : "cc", "memory", "0", "1"); > > So I guess that means it'll just loop until it's done? Or do I need to > return "1" from HELPER(msa)? > > Jason Hm, you don't really want to implement some kind of particial complete. Qemu is an emulation and you would have to implement some kind of fragmenting this based on machine generation. For my feeling this is way too overengineered. Btw. as there came the request to handle the 24-bit/31-bit addressing correctly. Is Qemu 32 bit supported ?
On 04.08.22 08:51, Harald Freudenberger wrote: > On 2022-08-03 14:14, Jason A. Donenfeld wrote: >> Hi David, >> >> On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote: >>> On 02.08.22 21:00, Jason A. Donenfeld wrote: >>>> In order to fully support MSA_EXT_5, we have to also support the SHA-512 >>>> special instructions. So implement those. >>>> >>>> The implementation began as something TweetNacl-like, and then was >>>> adjusted to be useful here. It's not very beautiful, but it is quite >>>> short and compact, which is what we're going for. >>>> >>> >>> Do we have to worry about copyright/authorship of the original code or >>> did you write that from scratch? >> >> I actually don't really remember how much of that is leftover from >> tweetnacl and how much I've rewritten - I've had some variant of this >> code or another kicking around in various projects and repos for a long >> time. But the tweetnacl stuff is public domain to begin with, so all >> good. >> >>> Are we properly handling the length register (r2 + 1) in the >>> 24-bit/31-bit addressing mode? >>> Similarly, are we properly handling updates to the message register >>> (r2) >>> depending on the addressing mode? >> >> Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess >> I'll look into that. >> >>> It's worth noting that we might want to implement (also for >>> PRNO-TRNG): >>> >>> "The operation is ended when all >>> source bytes in the second operand have been pro- >>> cessed (called normal completion), or when a CPU- >>> determined number of blocks that is less than the >>> length of the second operand have been processed >>> (called partial completion). The CPU-determined >>> number of blocks depends on the model, and may be >>> a different number each time the instruction is exe- >>> cuted. The CPU-determined number of blocks is usu- >>> ally nonzero. In certain unusual situations, this >>> number may be zero, and condition code 3 may be >>> set with no progress." 
>>> >>> Otherwise, a large length can make us loop quite a while in QEMU, >>> without the chance to deliver any other interrupts. >> >> Hmm, okay. Looking at the Linux code, I see: >> >> s.even = (unsigned long)src; >> s.odd = (unsigned long)src_len; >> asm volatile( >> " lgr 0,%[fc]\n" >> " lgr 1,%[pba]\n" >> "0: .insn rre,%[opc] << 16,0,%[src]\n" >> " brc 1,0b\n" /* handle partial completion */ >> : [src] "+&d" (s.pair) >> : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), >> [opc] "i" (CPACF_KIMD) >> : "cc", "memory", "0", "1"); >> >> So I guess that means it'll just loop until it's done? Or do I need to >> return "1" from HELPER(msa)? >> >> Jason > > Hm, you don't really want to implement some kind of particial complete. > Qemu is an emulation and you would have to implement some kind of > fragmenting this based on machine generation. Do we? "The CPU-determined number of bytes depends on the model, and may be a different number each time the instruction is executed. The CPU-determined number of bytes is usually nonzero. In certain unusual situa- tions, this number may be zero, and condition code 3 may be set with no progress. However, the CPU pro- tects against endless recurrence of this no-progress case. " I read that as "do what you want, even on a given model it might be random." -- Thanks, David / dhildenb
Hi, On Thu, Aug 04, 2022 at 10:10:52AM +0200, David Hildenbrand wrote: > > Hm, you don't really want to implement some kind of particial complete. > > Qemu is an emulation and you would have to implement some kind of > > fragmenting this based on machine generation. > > Do we? > > "The > CPU-determined number of bytes depends on the > model, and may be a different number each time the > instruction is executed. The CPU-determined number > of bytes is usually nonzero. In certain unusual situa- > tions, this number may be zero, and condition code 3 > may be set with no progress. However, the CPU pro- > tects against endless recurrence of this no-progress > case. > " > > I read that as "do what you want, even on a given model it might be random." Just FYI, I implemented this, and it works in v6. Please take a look at: https://lore.kernel.org/qemu-devel/20220803171536.1314717-2-Jason@zx2c4.com/ So we can keep that. Or I can send a v7 that removes it. It wasn't very hard to implement, and it's not very hard to remove, so either way, just tell me what you want to do. Jason
Am 04.08.22 um 08:51 schrieb Harald Freudenberger: > On 2022-08-03 14:14, Jason A. Donenfeld wrote: >> Hi David, >> >> On Wed, Aug 03, 2022 at 01:55:21PM +0200, David Hildenbrand wrote: >>> On 02.08.22 21:00, Jason A. Donenfeld wrote: >>> > In order to fully support MSA_EXT_5, we have to also support the SHA-512 >>> > special instructions. So implement those. >>> > >>> > The implementation began as something TweetNacl-like, and then was >>> > adjusted to be useful here. It's not very beautiful, but it is quite >>> > short and compact, which is what we're going for. >>> > >>> >>> Do we have to worry about copyright/authorship of the original code or >>> did you write that from scratch? >> >> I actually don't really remember how much of that is leftover from >> tweetnacl and how much I've rewritten - I've had some variant of this >> code or another kicking around in various projects and repos for a long >> time. But the tweetnacl stuff is public domain to begin with, so all >> good. >> >>> Are we properly handling the length register (r2 + 1) in the >>> 24-bit/31-bit addressing mode? >>> Similarly, are we properly handling updates to the message register (r2) >>> depending on the addressing mode? >> >> Ugh, probably not... I didn't do any of the deposit_64 stuff. I guess >> I'll look into that. >> >>> It's worth noting that we might want to implement (also for PRNO-TRNG): >>> >>> "The operation is ended when all >>> source bytes in the second operand have been pro- >>> cessed (called normal completion), or when a CPU- >>> determined number of blocks that is less than the >>> length of the second operand have been processed >>> (called partial completion). The CPU-determined >>> number of blocks depends on the model, and may be >>> a different number each time the instruction is exe- >>> cuted. The CPU-determined number of blocks is usu- >>> ally nonzero. In certain unusual situations, this >>> number may be zero, and condition code 3 may be >>> set with no progress." 
>>> >>> Otherwise, a large length can make us loop quite a while in QEMU, >>> without the chance to deliver any other interrupts. >> >> Hmm, okay. Looking at the Linux code, I see: >> >> s.even = (unsigned long)src; >> s.odd = (unsigned long)src_len; >> asm volatile( >> " lgr 0,%[fc]\n" >> " lgr 1,%[pba]\n" >> "0: .insn rre,%[opc] << 16,0,%[src]\n" >> " brc 1,0b\n" /* handle partial completion */ >> : [src] "+&d" (s.pair) >> : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), >> [opc] "i" (CPACF_KIMD) >> : "cc", "memory", "0", "1"); >> >> So I guess that means it'll just loop until it's done? Or do I need to >> return "1" from HELPER(msa)? >> >> Jason > > Hm, you don't really want to implement some kind of particial complete. > Qemu is an emulation and you would have to implement some kind of > fragmenting this based on machine generation. For my feeling this is > way too overengineered. Btw. as there came the request to handle > the 24-bit/31-bit addressing correctly. Is Qemu 32 bit supported ? We do not support the esa390 mode, but the 24/31 bit _addressing_ modes are totally valid to be used in zarch mode (with sam31 for example). The kernel does that for example for some diagnoses under z/VM. Nobody in problem state should probably do that, but its possible.
Hi, On Thu, Aug 04, 2022 at 08:56:19AM +0200, Christian Borntraeger wrote: > We do not support the esa390 mode, but the 24/31 bit _addressing_ modes are > totally valid to be used in zarch mode (with sam31 for example). The kernel > does that for example for some diagnoses under z/VM. > Nobody in problem state should probably do that, but its possible. v6 of this series handles 24/31: https://lore.kernel.org/qemu-devel/20220803171536.1314717-1-Jason@zx2c4.com/ [unchanged for a while now] https://lore.kernel.org/qemu-devel/20220803171536.1314717-2-Jason@zx2c4.com/ [the new sha512 thing] Jason
On Wed, Aug 03, 2022 at 02:14:58PM +0200, Jason A. Donenfeld wrote: > s.even = (unsigned long)src; > s.odd = (unsigned long)src_len; > asm volatile( > " lgr 0,%[fc]\n" > " lgr 1,%[pba]\n" > "0: .insn rre,%[opc] << 16,0,%[src]\n" > " brc 1,0b\n" /* handle partial completion */ > : [src] "+&d" (s.pair) > : [fc] "d" (func), [pba] "d" ((unsigned long)(param)), > [opc] "i" (CPACF_KIMD) > : "cc", "memory", "0", "1"); > > So I guess that means it'll just loop until it's done? Or do I need to > return "1" from HELPER(msa)? Looks like returning 3 did the trick. v5 incoming... Jason
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 ++
target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..3d333e2789 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..8ad4ef1ace 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -12,12 +12,38 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +78,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 114: /* CPACF_PRNO_TRNG */
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
In order to fully support MSA_EXT_5, we have to also support the SHA-512
special instructions. So implement those.
The implementation began as something TweetNaCl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 +
target/s390x/tcg/crypto_helper.c | 154 +++++++++++++++++++++++++++++++
2 files changed, 156 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 3d333e2789..b6d804fa6d 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
S390_FEAT_MSA_EXT_5,
S390_FEAT_PRNO_TRNG,
+ S390_FEAT_KIMD_SHA_512,
+ S390_FEAT_KLMD_SHA_512,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 8ad4ef1ace..b5e46342d6 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -1,10 +1,12 @@
/*
* s390x crypto helpers
*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (c) 2017 Red Hat Inc
*
* Authors:
* David Hildenbrand <david@redhat.com>
+ * Jason A. Donenfeld <Jason@zx2c4.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
@@ -19,6 +21,150 @@
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+ enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
+ uint64_t z[8], b[8], a[8], w[16], t;
+ uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
+ int i, j, reg_len = 64, blocks = 0, cc = 0;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+ }
+
+ while (len >= 128) {
+ for (i = 0; i < 16; ++i) {
+ if (message) {
+ w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
+ } else {
+ w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+ }
+ }
+
+ for (i = 0; i < 80; ++i) {
+ for (j = 0; j < 8; ++j) {
+ b[j] = a[j];
+ }
+ t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+ b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+ b[3] += t;
+ for (j = 0; j < 8; ++j) {
+ a[(j + 1) % 8] = b[j];
+ }
+ if (i % 16 == 15) {
+ for (j = 0; j < 16; ++j) {
+ w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
+ }
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ a[i] += z[i];
+ z[i] = a[i];
+ }
+
+ if (message)
+ message += 128;
+ else
+ stack_buffer += 128;
+ len -= 128;
+ processed += 128;
+
+ if (++blocks > MAX_BLOCKS_PER_RUN) {
+ cc = 3;
+ break;
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+ }
+
+ if (message_reg) {
+ *message_reg = deposit64(*message_reg, 0, reg_len, message);
+ }
+ *len_reg -= processed;
+ return cc;
+}
+
+static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg)
+{
+ uint8_t x[256];
+ uint64_t i, message, len;
+ int j, reg_len = 64, cc;
+
+ cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+ if (cc)
+ return cc;
+
+ message = *message_reg;
+ len = *len_reg;
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < len; ++i) {
+ x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
+ }
+ memset(x + i, 0, sizeof(x) - i);
+ x[i] = 128;
+ i = i < 112 ? 128 : 256;
+ for (j = 0; j < 16; ++j) {
+ x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+ }
+ if (kimd_sha512(env, ra, parameter_block, NULL, &i, x))
+ g_assert_not_reached(); /* It must handle at least 2 blocks. */
+ *message_reg = deposit64(*message_reg, 0, reg_len, message + len);
+ *len_reg -= len;
+ return 0;
+}
+
static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
uint64_t *buf_reg, uint64_t *len_reg)
{
@@ -78,6 +224,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 3: /* CPACF_*_SHA_512 */
+ switch (type) {
+ case S390_FEAT_TYPE_KIMD:
+ return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+ case S390_FEAT_TYPE_KLMD:
+ return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+ }
+ break;
case 114: /* CPACF_PRNO_TRNG */
fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
--
2.35.1
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 ++
target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..3d333e2789 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..8ad4ef1ace 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -12,12 +12,38 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +78,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 114: /* CPACF_PRNO_TRNG */
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
In order to fully support MSA_EXT_5, we have to also support the SHA-512
special instructions. So implement those.
The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 +
target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++
2 files changed, 159 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 3d333e2789..b6d804fa6d 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
S390_FEAT_MSA_EXT_5,
S390_FEAT_PRNO_TRNG,
+ S390_FEAT_KIMD_SHA_512,
+ S390_FEAT_KLMD_SHA_512,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 8ad4ef1ace..bb4823107c 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -1,10 +1,12 @@
/*
* s390x crypto helpers
*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (c) 2017 Red Hat Inc
*
* Authors:
* David Hildenbrand <david@redhat.com>
+ * Jason A. Donenfeld <Jason@zx2c4.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
@@ -19,6 +21,153 @@
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+ enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
+ uint64_t z[8], b[8], a[8], w[16], t;
+ uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
+ int i, j, reg_len = 64, blocks = 0, cc = 0;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+ }
+
+ while (len >= 128) {
+ if (++blocks > MAX_BLOCKS_PER_RUN) {
+ cc = 3;
+ break;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (message) {
+ w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
+ } else {
+ w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+ }
+ }
+
+ for (i = 0; i < 80; ++i) {
+ for (j = 0; j < 8; ++j) {
+ b[j] = a[j];
+ }
+ t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+ b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+ b[3] += t;
+ for (j = 0; j < 8; ++j) {
+ a[(j + 1) % 8] = b[j];
+ }
+ if (i % 16 == 15) {
+ for (j = 0; j < 16; ++j) {
+ w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
+ }
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ a[i] += z[i];
+ z[i] = a[i];
+ }
+
+ if (message) {
+ message += 128;
+ } else {
+ stack_buffer += 128;
+ }
+ len -= 128;
+ processed += 128;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+ }
+
+ if (message_reg) {
+ *message_reg = deposit64(*message_reg, 0, reg_len, message);
+ }
+ *len_reg -= processed;
+ return cc;
+}
+
+static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg)
+{
+ uint8_t x[256];
+ uint64_t i, message, len;
+ int j, reg_len = 64, cc;
+
+ cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+ if (cc) {
+ return cc;
+ }
+
+ message = *message_reg;
+ len = *len_reg;
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < len; ++i) {
+ x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
+ }
+ memset(x + i, 0, sizeof(x) - i);
+ x[i] = 128;
+ i = i < 112 ? 128 : 256;
+ for (j = 0; j < 16; ++j) {
+ x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+ }
+ if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) {
+ g_assert_not_reached(); /* It must handle at least 2 blocks. */
+ }
+ *message_reg = deposit64(*message_reg, 0, reg_len, message + len);
+ *len_reg -= len;
+ return 0;
+}
+
static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
uint64_t *buf_reg, uint64_t *len_reg)
{
@@ -78,6 +227,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 3: /* CPACF_*_SHA_512 */
+ switch (type) {
+ case S390_FEAT_TYPE_KIMD:
+ return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+ case S390_FEAT_TYPE_KLMD:
+ return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+ }
+ break;
case 114: /* CPACF_PRNO_TRNG */
fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
--
2.35.1
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 ++
target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 32 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..3d333e2789 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..8ad4ef1ace 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -12,12 +12,38 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +78,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 114: /* CPACF_PRNO_TRNG */
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
In order to fully support MSA_EXT_5, we have to also support the SHA-512
special instructions. So implement those.
The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 2 +
target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++
2 files changed, 159 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 3d333e2789..b6d804fa6d 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
S390_FEAT_MSA_EXT_5,
S390_FEAT_PRNO_TRNG,
+ S390_FEAT_KIMD_SHA_512,
+ S390_FEAT_KLMD_SHA_512,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 8ad4ef1ace..bb4823107c 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -1,10 +1,12 @@
/*
* s390x crypto helpers
*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (c) 2017 Red Hat Inc
*
* Authors:
* David Hildenbrand <david@redhat.com>
+ * Jason A. Donenfeld <Jason@zx2c4.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
@@ -19,6 +21,153 @@
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+ enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
+ uint64_t z[8], b[8], a[8], w[16], t;
+ uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
+ int i, j, reg_len = 64, blocks = 0, cc = 0;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+ }
+
+ while (len >= 128) {
+ if (++blocks > MAX_BLOCKS_PER_RUN) {
+ cc = 3;
+ break;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (message) {
+ w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
+ } else {
+ w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+ }
+ }
+
+ for (i = 0; i < 80; ++i) {
+ for (j = 0; j < 8; ++j) {
+ b[j] = a[j];
+ }
+ t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+ b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+ b[3] += t;
+ for (j = 0; j < 8; ++j) {
+ a[(j + 1) % 8] = b[j];
+ }
+ if (i % 16 == 15) {
+ for (j = 0; j < 16; ++j) {
+ w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
+ }
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ a[i] += z[i];
+ z[i] = a[i];
+ }
+
+ if (message) {
+ message += 128;
+ } else {
+ stack_buffer += 128;
+ }
+ len -= 128;
+ processed += 128;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+ }
+
+ if (message_reg) {
+ *message_reg = deposit64(*message_reg, 0, reg_len, message);
+ }
+ *len_reg -= processed;
+ return cc;
+}
+
+static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg)
+{
+ uint8_t x[256];
+ uint64_t i, message, len;
+ int j, reg_len = 64, cc;
+
+ cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+ if (cc) {
+ return cc;
+ }
+
+ message = *message_reg;
+ len = *len_reg;
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < len; ++i) {
+ x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
+ }
+ memset(x + i, 0, sizeof(x) - i);
+ x[i] = 128;
+ i = i < 112 ? 128 : 256;
+ for (j = 0; j < 16; ++j) {
+ x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+ }
+ if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) {
+ g_assert_not_reached(); /* It must handle at least 2 blocks. */
+ }
+ *message_reg = deposit64(*message_reg, 0, reg_len, message + len);
+ *len_reg -= len;
+ return 0;
+}
+
static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
uint64_t *buf_reg, uint64_t *len_reg)
{
@@ -78,6 +227,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 3: /* CPACF_*_SHA_512 */
+ switch (type) {
+ case S390_FEAT_TYPE_KIMD:
+ return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+ case S390_FEAT_TYPE_KLMD:
+ return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+ }
+ break;
case 114: /* CPACF_PRNO_TRNG */
fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
--
2.35.1
On 03.08.22 19:15, Jason A. Donenfeld wrote: > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > special instructions. So implement those. > > The implementation began as something TweetNacl-like, and then was > adjusted to be useful here. It's not very beautiful, but it is quite > short and compact, which is what we're going for. > NIT: we could think about reversing the order of patches. IIRC, patch #1 itself would trigger a warning when starting QEMU. Having this patch first make sense logically. > Cc: Thomas Huth <thuth@redhat.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Christian Borntraeger <borntraeger@linux.ibm.com> > Cc: Richard Henderson <richard.henderson@linaro.org> > Cc: Cornelia Huck <cohuck@redhat.com> > Cc: Harald Freudenberger <freude@linux.ibm.com> > Cc: Holger Dengler <dengler@linux.ibm.com> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > target/s390x/gen-features.c | 2 + > target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++ > 2 files changed, 159 insertions(+) > > diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c > index 3d333e2789..b6d804fa6d 100644 > --- a/target/s390x/gen-features.c > +++ b/target/s390x/gen-features.c > @@ -751,6 +751,8 @@ static uint16_t qemu_MAX[] = { > S390_FEAT_VECTOR_ENH2, > S390_FEAT_MSA_EXT_5, > S390_FEAT_PRNO_TRNG, > + S390_FEAT_KIMD_SHA_512, > + S390_FEAT_KLMD_SHA_512, > }; > > /****** END FEATURE DEFS ******/ > diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c > index 8ad4ef1ace..bb4823107c 100644 > --- a/target/s390x/tcg/crypto_helper.c > +++ b/target/s390x/tcg/crypto_helper.c > @@ -1,10 +1,12 @@ > /* > * s390x crypto helpers > * > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > * Copyright (c) 2017 Red Hat Inc > * > * Authors: > * David Hildenbrand <david@redhat.com> > + * Jason A. 
Donenfeld <Jason@zx2c4.com> > * > * This work is licensed under the terms of the GNU GPL, version 2 or later. > * See the COPYING file in the top-level directory. > @@ -19,6 +21,153 @@ > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > > +static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); } > +static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); } > +static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); } > +static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); } > +static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); } > +static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); } > +static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); } > + > +static const uint64_t K[80] = { > + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, > + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, > + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, > + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, > + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, > + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, > + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, > + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, > + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, > + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, > + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, > + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, > + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, > + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, > + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, > + 0xd69906245565a910ULL, 
0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, > + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, > + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, > + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, > + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, > + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, > + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, > + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, > + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, > + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, > + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, > + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL > +}; > + > +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > +{ > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ I'd just use a #define outside of the function for that. > + uint64_t z[8], b[8], a[8], w[16], t; > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; > + int i, j, reg_len = 64, blocks = 0, cc = 0; > + > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } I'd call that message_reg_len. (same in other function) > + > + for (i = 0; i < 8; ++i) { > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); I assume if we get any exception here, we simply didn't make any progress. 
> + } > + > + while (len >= 128) { > + if (++blocks > MAX_BLOCKS_PER_RUN) { > + cc = 3; > + break; > + } > + > + for (i = 0; i < 16; ++i) { > + if (message) { > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); dito > + } else { > + w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]); > + } > + } > + > + for (i = 0; i < 80; ++i) { > + for (j = 0; j < 8; ++j) { > + b[j] = a[j]; > + } > + t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16]; > + b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]); > + b[3] += t; > + for (j = 0; j < 8; ++j) { > + a[(j + 1) % 8] = b[j]; > + } > + if (i % 16 == 15) { > + for (j = 0; j < 16; ++j) { > + w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]); > + } > + } > + } > + > + for (i = 0; i < 8; ++i) { > + a[i] += z[i]; > + z[i] = a[i]; > + } > + > + if (message) { > + message += 128; > + } else { > + stack_buffer += 128; > + } > + len -= 128; > + processed += 128; > + } > + > + for (i = 0; i < 8; ++i) { > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); I wonder what happens if we get an exception somewhere in the middle here ... fortunately we can only involve 2 pages. > + } > + > + if (message_reg) { > + *message_reg = deposit64(*message_reg, 0, reg_len, message); > + } > + *len_reg -= processed; > + return cc; > +} > + > +static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg) > +{ > + uint8_t x[256]; > + uint64_t i, message, len; > + int j, reg_len = 64, cc; > + > + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > + if (cc) { > + return cc; > + } Doesn't kimd_sha512() update the length register? And if we return with cc=3, we'd be in trouble, no? One idea could be to simply only process one block at a time. Read all inputs first for that block and handle it completely without any register modifications. Perform all memory writes in a single call. 
Further, I wonder if we should factor out the core of kimd_sha512() to only work on temp buffers without any loading/storing of memory, and let only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much cleaner who modifies what. If you run out of ideas, I can give it a shot next week to see if I can clean handling up a bit. -- Thanks, David / dhildenb
Hi David, On Fri, Aug 05, 2022 at 01:28:18PM +0200, David Hildenbrand wrote: > On 03.08.22 19:15, Jason A. Donenfeld wrote: > > In order to fully support MSA_EXT_5, we have to also support the SHA-512 > > special instructions. So implement those. > > > > The implementation began as something TweetNacl-like, and then was > > adjusted to be useful here. It's not very beautiful, but it is quite > > short and compact, which is what we're going for. > > > > NIT: we could think about reversing the order of patches. IIRC, patch #1 > itself would trigger a warning when starting QEMU. Having this patch > first make sense logically. Good idea. Will do. > > +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > > +{ > > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ > > I'd just use a #define outside of the function for that. Why? What does leaking this into file-level scope do? > > > + uint64_t z[8], b[8], a[8], w[16], t; > > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; > > + int i, j, reg_len = 64, blocks = 0, cc = 0; > > + > > + if (!(env->psw.mask & PSW_MASK_64)) { > > + len = (uint32_t)len; > > + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > > + } > > > I'd call that message_reg_len. (same in other function) Will do. > > > > + > > + for (i = 0; i < 8; ++i) { > > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); > > I assume if we get any exception here, we simply didn't make any progress. > > > + } > > + > > + while (len >= 128) { > > + if (++blocks > MAX_BLOCKS_PER_RUN) { > > + cc = 3; > > + break; > > + } > > + > > + for (i = 0; i < 16; ++i) { > > + if (message) { > > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); > > dito Right, there's no progress, because it's only ever incremented at the end. 
And, more importantly, we only ever update the parameter_block after having done things successfully. > > + for (i = 0; i < 8; ++i) { > > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); > > I wonder what happens if we get an exception somewhere in the middle > here ... fortunately we can only involve 2 pages. If this fails, then message_reg and len_reg won't be updated, so it will have to start over. If it fails part way through, though, then things are inconsistent. I don't think we want to hassle with trying to restore the previous state or something insane though. That seems a bit much. > > + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > > + if (cc) { > > + return cc; > > + } > > Doesn't kimd_sha512() update the length register? And if we return with > cc=3, we'd be in trouble, no? cc=3 means partial completion. In that case, klmd also returns with a partial completion. That's good and expected! It means that the next time it's called, it'll keep going where it left off. I've actually tried this with the Linux implementation, and it works as expected. > One idea could be to simply only process one block at a time. Read all > inputs first for that block and handle it completely without any > register modifications. Perform all memory writes in a single call. That *is* what already happens. Actually, the memory writes only ever happen at the very end of kimd_sha512. > Further, I wonder if we should factor out the core of kimd_sha512() to > only work on temp buffers without any loading/storing of memory, and let > only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much > cleaner who modifies what. That's not necessary and will complicate things ultimately. See the above; this is already working as expected. Jason
On 05.08.22 15:01, Jason A. Donenfeld wrote: > Hi David, > > On Fri, Aug 05, 2022 at 01:28:18PM +0200, David Hildenbrand wrote: >> On 03.08.22 19:15, Jason A. Donenfeld wrote: >>> In order to fully support MSA_EXT_5, we have to also support the SHA-512 >>> special instructions. So implement those. >>> >>> The implementation began as something TweetNacl-like, and then was >>> adjusted to be useful here. It's not very beautiful, but it is quite >>> short and compact, which is what we're going for. >>> >> >> NIT: we could think about reversing the order of patches. IIRC, patch #1 >> itself would trigger a warning when starting QEMU. Having this patch >> first make sense logically. > > Good idea. Will do. > >>> +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, >>> + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) >>> +{ >>> + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ >> >> I'd just use a #define outside of the function for that. > > Why? What does leaking this into file-level scope do? > I'd say common coding practice in QEMU, but I might be wrong ;) >> >>> + uint64_t z[8], b[8], a[8], w[16], t; >>> + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; >>> + int i, j, reg_len = 64, blocks = 0, cc = 0; >>> + >>> + if (!(env->psw.mask & PSW_MASK_64)) { >>> + len = (uint32_t)len; >>> + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; >>> + } >> [...] > >>> + for (i = 0; i < 8; ++i) { >>> + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); >> >> I wonder what happens if we get an exception somewhere in the middle >> here ... fortunately we can only involve 2 pages. > > If this fails, then message_reg and len_reg won't be updated, so it will > have to start over. If it fails part way through, though, then things > are inconsistent. 
I don't think we want to hassle > the previous state or something insane though. That seems a bit much. Okay, but there could be scenarios where we mess up? > >>> + cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); >>> + if (cc) { >>> + return cc; >>> + } >> >> Doesn't kimd_sha512() update the length register? And if we return with >> cc=3, we'd be in trouble, no? > > cc=3 means partial completion. In that case, klmd also returns with a > partial completion. That's good and expected! It means that the next > time it's called, it'll keep going where it left off. > > I've actually tried this with the Linux implementation, and it works as > expected. > >> One idea could be to simply only process one block at a time. Read all >> inputs first for that block and handle it completely without any >> register modifications. Perform all memory writes in a single call. > > That *is* what already happens. Actually, the memory writes only ever > happen at the very end of kimd_sha512. > >> Further, I wonder if we should factor out the core of kimd_sha512() to >> only work on temp buffers without any loading/storing of memory, and let >> only kimd_sha512/klmd_sha512 perform all loading/storing. Then it's much >> cleaner who modifies what. > > That's not necessary and will complicate things ultimately. See the > above; this is already working as expected. I'll have a closer look and see if I might improve it in the upcoming weeks. I'll be on vacation for ~1.5 weeks. And as history has shown, I need some days afterwards to dig through my overflowing mailbox :) -- Thanks, David / dhildenb
In order to fully support MSA_EXT_5, we have to support the SHA-512
special instructions. So implement those.
The implementation began as something TweetNacl-like, and then was
adjusted to be useful here. It's not very beautiful, but it is quite
short and compact, which is what we're going for.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 3 +
target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++
2 files changed, 160 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index ad140184b9..85ab69d04e 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -749,6 +749,9 @@ static uint16_t qemu_V7_0[] = {
*/
static uint16_t qemu_MAX[] = {
S390_FEAT_VECTOR_ENH2,
+ S390_FEAT_MSA_EXT_5,
+ S390_FEAT_KIMD_SHA_512,
+ S390_FEAT_KLMD_SHA_512,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 138d9e7ad9..4d45de8faa 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -1,10 +1,12 @@
/*
* s390x crypto helpers
*
+ * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
* Copyright (c) 2017 Red Hat Inc
*
* Authors:
* David Hildenbrand <david@redhat.com>
+ * Jason A. Donenfeld <Jason@zx2c4.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
@@ -18,6 +20,153 @@
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
+static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); }
+static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); }
+static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); }
+static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); }
+static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); }
+static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); }
+static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); }
+
+static const uint64_t K[80] = {
+ 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL,
+ 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL,
+ 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL,
+ 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL,
+ 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL,
+ 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL,
+ 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL,
+ 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL,
+ 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL,
+ 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL,
+ 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL,
+ 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL,
+ 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL,
+ 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL,
+ 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL,
+ 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL,
+ 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL,
+ 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL,
+ 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL,
+ 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL,
+ 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL,
+ 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL,
+ 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL,
+ 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL,
+ 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL,
+ 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL,
+ 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL
+};
+
+static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer)
+{
+ enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */
+ uint64_t z[8], b[8], a[8], w[16], t;
+ uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0;
+ int i, j, message_reg_len = 64, blocks = 0, cc = 0;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra);
+ }
+
+ while (len >= 128) {
+ if (++blocks > MAX_BLOCKS_PER_RUN) {
+ cc = 3;
+ break;
+ }
+
+ for (i = 0; i < 16; ++i) {
+ if (message) {
+ w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra);
+ } else {
+ w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]);
+ }
+ }
+
+ for (i = 0; i < 80; ++i) {
+ for (j = 0; j < 8; ++j) {
+ b[j] = a[j];
+ }
+ t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16];
+ b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]);
+ b[3] += t;
+ for (j = 0; j < 8; ++j) {
+ a[(j + 1) % 8] = b[j];
+ }
+ if (i % 16 == 15) {
+ for (j = 0; j < 16; ++j) {
+ w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]);
+ }
+ }
+ }
+
+ for (i = 0; i < 8; ++i) {
+ a[i] += z[i];
+ z[i] = a[i];
+ }
+
+ if (message) {
+ message += 128;
+ } else {
+ stack_buffer += 128;
+ }
+ len -= 128;
+ processed += 128;
+ }
+
+ for (i = 0; i < 8; ++i) {
+ cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra);
+ }
+
+ if (message_reg) {
+ *message_reg = deposit64(*message_reg, 0, message_reg_len, message);
+ }
+ *len_reg -= processed;
+ return cc;
+}
+
+static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block,
+ uint64_t *message_reg, uint64_t *len_reg)
+{
+ uint8_t x[256];
+ uint64_t i, message, len;
+ int j, message_reg_len = 64, cc;
+
+ cc = kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL);
+ if (cc) {
+ return cc;
+ }
+
+ message = *message_reg;
+ len = *len_reg;
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ for (i = 0; i < len; ++i) {
+ x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra);
+ }
+ memset(x + i, 0, sizeof(x) - i);
+ x[i] = 128;
+ i = i < 112 ? 128 : 256;
+ for (j = 0; j < 16; ++j) {
+ x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra);
+ }
+ if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) {
+ g_assert_not_reached(); /* It must handle at least 2 blocks. */
+ }
+ *message_reg = deposit64(*message_reg, 0, message_reg_len, message + len);
+ *len_reg -= len;
+ return 0;
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -52,6 +201,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
cpu_stb_data_ra(env, param_addr, subfunc[i], ra);
}
break;
+ case 3: /* CPACF_*_SHA_512 */
+ switch (type) {
+ case S390_FEAT_TYPE_KIMD:
+ return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL);
+ case S390_FEAT_TYPE_KLMD:
+ return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
+ }
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
Finally, I'm also having some spare minutes to have a look on this ... First, thank you for your work here, it's very appreciated! Some more comments inline below (mostly cosmetics since I'm not very much into this crypto stuff)... On 09/08/2022 17.03, Jason A. Donenfeld wrote: > In order to fully support MSA_EXT_5, we have to support the SHA-512 > special instructions. So implement those. > > The implementation began as something TweetNacl-like, and then was > adjusted to be useful here. It's not very beautiful, but it is quite > short and compact, which is what we're going for. > > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > target/s390x/gen-features.c | 3 + > target/s390x/tcg/crypto_helper.c | 157 +++++++++++++++++++++++++++++++ > 2 files changed, 160 insertions(+) If you've got some spare time, it would be great to have a test for the new functions in the tests/tcg/s390x/ folder, too (but otherwise we can still add them later). > diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c > index ad140184b9..85ab69d04e 100644 > --- a/target/s390x/gen-features.c > +++ b/target/s390x/gen-features.c > @@ -749,6 +749,9 @@ static uint16_t qemu_V7_0[] = { > */ > static uint16_t qemu_MAX[] = { > S390_FEAT_VECTOR_ENH2, > + S390_FEAT_MSA_EXT_5, > + S390_FEAT_KIMD_SHA_512, > + S390_FEAT_KLMD_SHA_512, > }; I think we likely have to fence the bits off for older machine type versions, like it has been done in commit 4f9b6c7ddb2 for example. However, the patch for the new 7.2 machine type is not merged yet (but I've queued it on https://gitlab.com/thuth/qemu/-/commits/s390x-next/ ), so you either have to pick that manually into your branch, or we fix it up later (which would be ok for me, too). 
> /****** END FEATURE DEFS ******/ > diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c > index 138d9e7ad9..4d45de8faa 100644 > --- a/target/s390x/tcg/crypto_helper.c > +++ b/target/s390x/tcg/crypto_helper.c > @@ -1,10 +1,12 @@ > /* > * s390x crypto helpers > * > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. Please drop the "All rights reserved" ... it does not have any legal meaning anymore, and also sounds weird in the Open Source context. See: https://en.wikipedia.org/wiki/All_rights_reserved#Obsolescence > * Copyright (c) 2017 Red Hat Inc > * > * Authors: > * David Hildenbrand <david@redhat.com> > + * Jason A. Donenfeld <Jason@zx2c4.com> > * > * This work is licensed under the terms of the GNU GPL, version 2 or later. > * See the COPYING file in the top-level directory. > @@ -18,6 +20,153 @@ > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > > +static uint64_t R(uint64_t x, int c) { return (x >> c) | (x << (64 - c)); } > +static uint64_t Ch(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (~x & z); } > +static uint64_t Maj(uint64_t x, uint64_t y, uint64_t z) { return (x & y) ^ (x & z) ^ (y & z); } > +static uint64_t Sigma0(uint64_t x) { return R(x, 28) ^ R(x, 34) ^ R(x, 39); } > +static uint64_t Sigma1(uint64_t x) { return R(x, 14) ^ R(x, 18) ^ R(x, 41); } > +static uint64_t sigma0(uint64_t x) { return R(x, 1) ^ R(x, 8) ^ (x >> 7); } > +static uint64_t sigma1(uint64_t x) { return R(x, 19) ^ R(x, 61) ^ (x >> 6); } > + > +static const uint64_t K[80] = { > + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, 0xb5c0fbcfec4d3b2fULL, > + 0xe9b5dba58189dbbcULL, 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, > + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, 0xd807aa98a3030242ULL, > + 0x12835b0145706fbeULL, 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, > + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, 0x9bdc06a725c71235ULL, > + 0xc19bf174cf692694ULL, 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, 
> + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, 0x2de92c6f592b0275ULL, > + 0x4a7484aa6ea6e483ULL, 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, > + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, 0xb00327c898fb213fULL, > + 0xbf597fc7beef0ee4ULL, 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, > + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, 0x27b70a8546d22ffcULL, > + 0x2e1b21385c26c926ULL, 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, > + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, 0x81c2c92e47edaee6ULL, > + 0x92722c851482353bULL, 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, > + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, 0xd192e819d6ef5218ULL, > + 0xd69906245565a910ULL, 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, > + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, 0x2748774cdf8eeb99ULL, > + 0x34b0bcb5e19b48a8ULL, 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, > + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, 0x748f82ee5defb2fcULL, > + 0x78a5636f43172f60ULL, 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, > + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, 0xbef9a3f7b2c67915ULL, > + 0xc67178f2e372532bULL, 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, > + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, 0x06f067aa72176fbaULL, > + 0x0a637dc5a2c898a6ULL, 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, > + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, 0x3c9ebe0a15c9bebcULL, > + 0x431d67c49c100d4cULL, 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, > + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL > +}; > + > +static int kimd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg, uint8_t *stack_buffer) > +{ > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ > + uint64_t z[8], b[8], a[8], w[16], t; > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; The line is very long, could you please declare message and len on separate lines? 
> + int i, j, message_reg_len = 64, blocks = 0, cc = 0; > + > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } > + > + for (i = 0; i < 8; ++i) { > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); Quite a long line again, maybe split it like this: abi_ptr addr = wrap_address(env, parameter_block + 8 * i); z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra); > + } > + > + while (len >= 128) { > + if (++blocks > MAX_BLOCKS_PER_RUN) { > + cc = 3; > + break; > + } > + > + for (i = 0; i < 16; ++i) { > + if (message) { > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); Long line again, please split. > + } else { > + w[i] = be64_to_cpu(((uint64_t *)stack_buffer)[i]); > + } > + } > + > + for (i = 0; i < 80; ++i) { > + for (j = 0; j < 8; ++j) { > + b[j] = a[j]; > + } > + t = a[7] + Sigma1(a[4]) + Ch(a[4], a[5], a[6]) + K[i] + w[i % 16]; > + b[7] = t + Sigma0(a[0]) + Maj(a[0], a[1], a[2]); > + b[3] += t; > + for (j = 0; j < 8; ++j) { > + a[(j + 1) % 8] = b[j]; > + } > + if (i % 16 == 15) { > + for (j = 0; j < 16; ++j) { > + w[j] += w[(j + 9) % 16] + sigma0(w[(j + 1) % 16]) + sigma1(w[(j + 14) % 16]); > + } > + } > + } > + > + for (i = 0; i < 8; ++i) { > + a[i] += z[i]; > + z[i] = a[i]; > + } > + > + if (message) { > + message += 128; > + } else { > + stack_buffer += 128; > + } > + len -= 128; > + processed += 128; > + } > + > + for (i = 0; i < 8; ++i) { > + cpu_stq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), z[i], ra); > + } > + > + if (message_reg) { > + *message_reg = deposit64(*message_reg, 0, message_reg_len, message); > + } > + *len_reg -= processed; > + return cc; > +} > + > +static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_block, > + uint64_t *message_reg, uint64_t *len_reg) > +{ > + uint8_t x[256]; > + uint64_t i, message, len; > + int j, message_reg_len = 64, cc; > + > + cc = 
kimd_sha512(env, ra, parameter_block, message_reg, len_reg, NULL); > + if (cc) { > + return cc; > + } > + > + message = *message_reg; > + len = *len_reg; > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } > + > + for (i = 0; i < len; ++i) { > + x[i] = cpu_ldub_data_ra(env, wrap_address(env, message + i), ra); > + } > + memset(x + i, 0, sizeof(x) - i); > + x[i] = 128; > + i = i < 112 ? 128 : 256; > + for (j = 0; j < 16; ++j) { > + x[i - 16 + j] = cpu_ldub_data_ra(env, wrap_address(env, parameter_block + 64 + j), ra); > + } > + if (kimd_sha512(env, ra, parameter_block, NULL, &i, x)) { > + g_assert_not_reached(); /* It must handle at least 2 blocks. */ > + } > + *message_reg = deposit64(*message_reg, 0, message_reg_len, message + len); > + *len_reg -= len; > + return 0; > +} > + > uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > uint32_t type) > { > @@ -52,6 +201,14 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > cpu_stb_data_ra(env, param_addr, subfunc[i], ra); So for KIMD and KLMD, I think we now have to set the bit that corresponds to SHA-512 in the query status information, too? Otherwise the guest might not use the function if it thinks that it is not available? > } > break; > + case 3: /* CPACF_*_SHA_512 */ > + switch (type) { > + case S390_FEAT_TYPE_KIMD: > + return kimd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1], NULL); > + case S390_FEAT_TYPE_KLMD: > + return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]); > + } > + break; > default: > /* we don't implement any other subfunction yet */ > g_assert_not_reached(); Thomas
On Fri, Aug 26, 2022 at 12:21:36PM +0200, Thomas Huth wrote: > > + * Copyright (C) 2022 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved. > > Please drop the "All rights reserved" ... it does not have any legal meaning No. > > +{ > > + enum { MAX_BLOCKS_PER_RUN = 64 }; /* This is arbitrary, just to keep interactivity. */ > > + uint64_t z[8], b[8], a[8], w[16], t; > > + uint64_t message = message_reg ? *message_reg : 0, len = *len_reg, processed = 0; > > The line is very long, could you please declare message and len on separate > lines? Will do. > > > + int i, j, message_reg_len = 64, blocks = 0, cc = 0; > > + > > + if (!(env->psw.mask & PSW_MASK_64)) { > > + len = (uint32_t)len; > > + message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > > + } > > + > > + for (i = 0; i < 8; ++i) { > > + z[i] = a[i] = cpu_ldq_be_data_ra(env, wrap_address(env, parameter_block + 8 * i), ra); > > Quite a long line again, maybe split it like this: > > abi_ptr addr = wrap_address(env, parameter_block + 8 * i); > z[i] = a[i] = cpu_ldq_be_data_ra(env, addr, ra); Sure. > > > + } > > + > > + while (len >= 128) { > > + if (++blocks > MAX_BLOCKS_PER_RUN) { > > + cc = 3; > > + break; > > + } > > + > > + for (i = 0; i < 16; ++i) { > > + if (message) { > > + w[i] = cpu_ldq_be_data_ra(env, wrap_address(env, message + 8 * i), ra); > > Long line again, please split. Okay. > > cpu_stb_data_ra(env, param_addr, subfunc[i], ra); > > So for KIMD and KLMD, I think we now have to set the bit that corresponds to > SHA-512 in the query status information, too? Otherwise the guest might not > use the function if it thinks that it is not available? That's already taken care of generically I think. This works fine from Linux's autodetection. Jason
In order for hosts running inside of TCG to initialize the kernel's
random number generator, we should support the PRNO_TRNG instruction,
backed in the usual way with the qemu_guest_getrandom helper. This is
confirmed working on Linux 5.19.
Cc: Thomas Huth <thuth@redhat.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Richard Henderson <richard.henderson@linaro.org>
Cc: Cornelia Huck <cohuck@redhat.com>
Cc: Harald Freudenberger <freude@linux.ibm.com>
Cc: Holger Dengler <dengler@linux.ibm.com>
Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
---
target/s390x/gen-features.c | 1 +
target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++
2 files changed, 31 insertions(+)
diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c
index 85ab69d04e..423ae44315 100644
--- a/target/s390x/gen-features.c
+++ b/target/s390x/gen-features.c
@@ -752,6 +752,7 @@ static uint16_t qemu_MAX[] = {
S390_FEAT_MSA_EXT_5,
S390_FEAT_KIMD_SHA_512,
S390_FEAT_KLMD_SHA_512,
+ S390_FEAT_PRNO_TRNG,
};
/****** END FEATURE DEFS ******/
diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c
index 4d45de8faa..e155ae1f54 100644
--- a/target/s390x/tcg/crypto_helper.c
+++ b/target/s390x/tcg/crypto_helper.c
@@ -14,6 +14,7 @@
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
+#include "qemu/guest-random.h"
#include "s390x-internal.h"
#include "tcg_s390x.h"
#include "exec/helper-proto.h"
@@ -167,6 +168,31 @@ static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_bloc
return 0;
}
+static void fill_buf_random(CPUS390XState *env, uintptr_t ra,
+ uint64_t *buf_reg, uint64_t *len_reg)
+{
+ uint8_t tmp[256];
+ uint64_t len = *len_reg;
+ int message_reg_len = 64;
+
+ if (!(env->psw.mask & PSW_MASK_64)) {
+ len = (uint32_t)len;
+ message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24;
+ }
+
+ while (len) {
+ size_t block = MIN(len, sizeof(tmp));
+
+ qemu_guest_getrandom_nofail(tmp, block);
+ for (size_t i = 0; i < block; ++i) {
+ cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra);
+ *buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + 1);
+ --*len_reg;
+ }
+ len -= block;
+ }
+}
+
uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
uint32_t type)
{
@@ -209,6 +235,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3,
return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]);
}
break;
+ case 114: /* CPACF_PRNO_TRNG */
+ fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]);
+ fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]);
+ break;
default:
/* we don't implement any other subfunction yet */
g_assert_not_reached();
--
2.35.1
On 09/08/2022 17.03, Jason A. Donenfeld wrote: > In order for hosts running inside of TCG to initialize the kernel's > random number generator, we should support the PRNO_TRNG instruction, > backed in the usual way with the qemu_guest_getrandom helper. This is > confirmed working on Linux 5.19. > > Cc: Thomas Huth <thuth@redhat.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Christian Borntraeger <borntraeger@linux.ibm.com> > Cc: Richard Henderson <richard.henderson@linaro.org> > Cc: Cornelia Huck <cohuck@redhat.com> > Cc: Harald Freudenberger <freude@linux.ibm.com> > Cc: Holger Dengler <dengler@linux.ibm.com> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > target/s390x/gen-features.c | 1 + > target/s390x/tcg/crypto_helper.c | 30 ++++++++++++++++++++++++++++++ > 2 files changed, 31 insertions(+) Also here: If you've got some spare time, a test in tests/tcg/s390x/ would be very welcome! > diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c > index 85ab69d04e..423ae44315 100644 > --- a/target/s390x/gen-features.c > +++ b/target/s390x/gen-features.c > @@ -752,6 +752,7 @@ static uint16_t qemu_MAX[] = { > S390_FEAT_MSA_EXT_5, > S390_FEAT_KIMD_SHA_512, > S390_FEAT_KLMD_SHA_512, > + S390_FEAT_PRNO_TRNG, > }; (this will need some fencing for old machine types, too, just like in patch 1/2) > /****** END FEATURE DEFS ******/ > diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c > index 4d45de8faa..e155ae1f54 100644 > --- a/target/s390x/tcg/crypto_helper.c > +++ b/target/s390x/tcg/crypto_helper.c > @@ -14,6 +14,7 @@ > > #include "qemu/osdep.h" > #include "qemu/main-loop.h" > +#include "qemu/guest-random.h" > #include "s390x-internal.h" > #include "tcg_s390x.h" > #include "exec/helper-proto.h" > @@ -167,6 +168,31 @@ static int klmd_sha512(CPUS390XState *env, uintptr_t ra, uint64_t parameter_bloc > return 0; > } > > +static void fill_buf_random(CPUS390XState *env, uintptr_t ra, > + uint64_t *buf_reg, uint64_t 
*len_reg) > +{ > + uint8_t tmp[256]; > + uint64_t len = *len_reg; > + int message_reg_len = 64; > + > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + message_reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } > + > + while (len) { > + size_t block = MIN(len, sizeof(tmp)); > + > + qemu_guest_getrandom_nofail(tmp, block); > + for (size_t i = 0; i < block; ++i) { > + cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); > + *buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + 1); > + --*len_reg; I know it's annoying, but technically, you must not touch the upper bits of the len_reg if running in 31- or 24-bit addressing mode. The Principles of Operations say: "In either the 24- or 31-bit addressing mode, bits 32-63 of the odd-numbered register are decremented by the number of bytes processed for the respective operand, and bits 0-31 of the register remain unchanged." > + } > + len -= block; > + } > +} > + > uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > uint32_t type) > { Don't you also need to modify the "query" part to signal the availability of the function? Doesn't Linux in the guest check the availability first before using it? > @@ -209,6 +235,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]); > } > break; > + case 114: /* CPACF_PRNO_TRNG */ > + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); > + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); > + break; > default: > /* we don't implement any other subfunction yet */ > g_assert_not_reached(); Maybe one more thing to check (according the "Special Conditions" section in the Principles of Operation): "A specification exception is recognized and no other action is taken if any of the following conditions exist: ... 2. The R1 or R2 fields designate an odd-numbered register or general register 0. 
This exception is recognized regardless of the function code. " Thomas
On Fri, Aug 26, 2022 at 01:28:11PM +0200, Thomas Huth wrote: > > + qemu_guest_getrandom_nofail(tmp, block); > > + for (size_t i = 0; i < block; ++i) { > > + cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); > > + *buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + 1); > > + --*len_reg; > > I know it's annoying, but technically, you must not touch the upper bits of > the len_reg if running in 31- or 24-bit addressing mode. The Principles of > Operations say: > > "In either the 24- or 31-bit addressing mode, bits 32-63 of the odd-numbered > register are decremented by the number > of bytes processed for the respective operand, and > bits 0-31 of the register remain unchanged." > This is what I was trying to do with the use of deposit64, following David's guidance. Did I mess something up? > > + } > > + len -= block; > > + } > > +} > > + > > uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > > uint32_t type) > > { > > Don't you also need to modify the "query" part to signal the availability of > the function? Doesn't Linux in the guest check the availability first before > using it? I think this is already handled at the upper layers. Linux detects it fine. > > > @@ -209,6 +235,10 @@ uint32_t HELPER(msa)(CPUS390XState *env, uint32_t r1, uint32_t r2, uint32_t r3, > > return klmd_sha512(env, ra, env->regs[1], &env->regs[r2], &env->regs[r2 + 1]); > > } > > break; > > + case 114: /* CPACF_PRNO_TRNG */ > > + fill_buf_random(env, ra, &env->regs[r1], &env->regs[r1 + 1]); > > + fill_buf_random(env, ra, &env->regs[r2], &env->regs[r2 + 1]); > > + break; > > default: > > /* we don't implement any other subfunction yet */ > > g_assert_not_reached(); > > Maybe one more thing to check (according the "Special Conditions" section in > the Principles of Operation): > > "A specification exception is recognized and no other > action is taken if any of the following conditions exist: > > ... > > 2. 
The R1 or R2 fields designate an odd-numbered > register or general register 0. This exception is > recognized regardless of the function code. > " This is taken care of already by the function that calls into this function. Jason
On 29/08/2022 18.29, Jason A. Donenfeld wrote: > On Fri, Aug 26, 2022 at 01:28:11PM +0200, Thomas Huth wrote: >>> + qemu_guest_getrandom_nofail(tmp, block); >>> + for (size_t i = 0; i < block; ++i) { >>> + cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); >>> + *buf_reg = deposit64(*buf_reg, 0, message_reg_len, *buf_reg + 1); >>> + --*len_reg; >> >> I know it's annoying, but technically, you must not touch the upper bits of >> the len_reg if running in 31- or 24-bit addressing mode. The Principles of >> Operations say: >> >> "In either the 24- or 31-bit addressing mode, bits 32-63 of the odd-numbered >> register are decremented by the number >> of bytes processed for the respective operand, and >> bits 0-31 of the register remain unchanged." >> > > This is what I was trying to do with the use of deposit64, following > David's guidance. Did I mess something up? Sorry for not following up earlier - I've been away from keyboard for a couple of weeks... Anyway, that was likely a wrong comment from my side anyway - I thought that "--*len_reg" might alter the upper bits, too, when there is no masking here. But since "len" has been constrained earlier in the function already, I think this cannot happen, so please never mind. I just saw that you also sent a v8 now, so I'll follow up on that version. Thomas
On 20.07.22 14:08, Jason A. Donenfeld wrote: > In order for hosts running inside of TCG to initialize the kernel's > random number generator, we should support the PRNO_TRNG instruction, > backed in the usual way with the qemu_guest_getrandom helper. This is > confirmed working on Linux 5.19-rc6. > > Cc: Thomas Huth <thuth@redhat.com> > Cc: David Hildenbrand <david@redhat.com> > Cc: Richard Henderson <richard.henderson@linaro.org> > Cc: Cornelia Huck <cohuck@redhat.com> > Cc: Harald Freudenberger <freude@linux.ibm.com> > Cc: Holger Dengler <dengler@linux.ibm.com> > Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com> > --- > target/s390x/cpu_models.c | 2 -- > target/s390x/gen-features.c | 2 ++ > target/s390x/tcg/crypto_helper.c | 32 ++++++++++++++++++++++++++++++++ > 3 files changed, 34 insertions(+), 2 deletions(-) > > diff --git a/target/s390x/cpu_models.c b/target/s390x/cpu_models.c > index 1a562d2801..90aac3d795 100644 > --- a/target/s390x/cpu_models.c > +++ b/target/s390x/cpu_models.c > @@ -421,8 +421,6 @@ static void check_consistency(const S390CPUModel *model) > { S390_FEAT_DFP_FAST, S390_FEAT_DFP }, > { S390_FEAT_TRANSACTIONAL_EXE, S390_FEAT_STFLE_49 }, > { S390_FEAT_EDAT_2, S390_FEAT_EDAT}, > - { S390_FEAT_MSA_EXT_5, S390_FEAT_KIMD_SHA_512 }, > - { S390_FEAT_MSA_EXT_5, S390_FEAT_KLMD_SHA_512 }, > { S390_FEAT_MSA_EXT_4, S390_FEAT_MSA_EXT_3 }, > { S390_FEAT_SIE_CMMA, S390_FEAT_CMM }, > { S390_FEAT_SIE_CMMA, S390_FEAT_SIE_GSLS }, > diff --git a/target/s390x/gen-features.c b/target/s390x/gen-features.c > index ad140184b9..3d333e2789 100644 > --- a/target/s390x/gen-features.c > +++ b/target/s390x/gen-features.c > @@ -749,6 +749,8 @@ static uint16_t qemu_V7_0[] = { > */ > static uint16_t qemu_MAX[] = { > S390_FEAT_VECTOR_ENH2, > + S390_FEAT_MSA_EXT_5, > + S390_FEAT_PRNO_TRNG, > }; > > /****** END FEATURE DEFS ******/ > diff --git a/target/s390x/tcg/crypto_helper.c b/target/s390x/tcg/crypto_helper.c > index 138d9e7ad9..afd29f9cf0 100644 > --- 
a/target/s390x/tcg/crypto_helper.c > +++ b/target/s390x/tcg/crypto_helper.c > @@ -12,12 +12,38 @@ > > #include "qemu/osdep.h" > #include "qemu/main-loop.h" > +#include "qemu/guest-random.h" > #include "s390x-internal.h" > #include "tcg_s390x.h" > #include "exec/helper-proto.h" > #include "exec/exec-all.h" > #include "exec/cpu_ldst.h" > > +static void fill_buf_random(CPUS390XState *env, uintptr_t ra, > + uint64_t *buf_reg, uint64_t *len_reg) > +{ > + uint8_t tmp[256]; > + uint64_t len = *len_reg; > + int reg_len = 64; > + > + if (!(env->psw.mask & PSW_MASK_64)) { > + len = (uint32_t)len; > + reg_len = (env->psw.mask & PSW_MASK_32) ? 32 : 24; > + } > + > + while (len) { > + size_t block = MIN(len, sizeof(tmp)); > + > + qemu_guest_getrandom_nofail(tmp, block); > + for (size_t i = 0; i < block; ++i) { > + cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); > + *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1); > + --*len_reg; > + } > + len -= block; > + } Yeah, that's better, although kind-off hard to read. We could process one guest page at a time, similar to how we handle target/s390x/tcg/mem_helper.c:access_memset and friends nowadays. But I won't force you to do that ;) This here is good enough for now, with room for improvement regarding efficiency. I did not review the doc in detail once again, maybe I get to that later this week. -- Thanks, David / dhildenb
Hey David, On Wed, Jul 20, 2022 at 08:41:48PM +0200, David Hildenbrand wrote: > I did not review the doc in detail once again, maybe I get to that later > this week. Did you ever get around to merging this patch? Is it in some tree somewhere? Jason
On 27/07/2022 03.35, Jason A. Donenfeld wrote: > Hey David, > > On Wed, Jul 20, 2022 at 08:41:48PM +0200, David Hildenbrand wrote: >> I did not review the doc in detail once again, maybe I get to that later >> this week. > > Did you ever get around to merging this patch? Is it in some tree > somewhere? QEMU is in the freeze phase now, so new features won't be merged before the next release, see: https://wiki.qemu.org/Planning/7.1 Maybe we could use the time to implement the missing SHA512 stuff to avoid having an inconsistency between the Principles of Operation and the emulated machine in QEMU? Thomas
Hey Thomas, On Wed, Jul 27, 2022 at 08:32:22AM +0200, Thomas Huth wrote: > On 27/07/2022 03.35, Jason A. Donenfeld wrote: > > Hey David, > > > > On Wed, Jul 20, 2022 at 08:41:48PM +0200, David Hildenbrand wrote: > >> I did not review the doc in detail once again, maybe I get to that later > >> this week. > > > > Did you ever get around to merging this patch? Is it in some tree > > somewhere? > > QEMU is in the freeze phase now, so new feature won't be merged before the > next release, see: Yea, I understand, that's fine. > Maybe we could use the time to implement the missing SHA512 stuff to avoid > having an inconsistency between the Principles of Operation and the emulated > machine in QEMU? Ooooooooooooooofffff. You're not /wrong/ of course. This actually makes a lot of sense. But I was hoping to somehow skip out on this part, because I don't know much about s390 and wiring up the handlers seems finicky. But I can learn! Actually, though, any interest in working together on this? I can work on the crypto-side of things, fashioning a minimal sha512 implementation that's small enough it can fit in crypto_helper.c with support for the incremental block state stuff s390 needs, and then you can work on wiring in all the instructions and telling me what semantics you need from the crypto. Interested? (Offer of working together goes out to David too of course.) If so, maybe poke me on IRC? I'm zx2c4 on the various networks. Jason
Hi David, On Wed, Jul 20, 2022 at 08:41:48PM +0200, David Hildenbrand wrote: > > + while (len) { > > + size_t block = MIN(len, sizeof(tmp)); > > + > > + qemu_guest_getrandom_nofail(tmp, block); > > + for (size_t i = 0; i < block; ++i) { > > + cpu_stb_data_ra(env, wrap_address(env, *buf_reg), tmp[i], ra); > > + *buf_reg = deposit64(*buf_reg, 0, reg_len, *buf_reg + 1); > > + --*len_reg; > > + } > > + len -= block; > > + } > > Yeah, that's better, although kind-off hard to read. > > We could process one guest page at a time, similar to how we handle > target/s390x/tcg/mem_helper.c:access_memset and friends nowadays. > > But I won't force you to do that ;) > > This here is good enough for now, with room for improvement regarding > efficiency. > > I did not review the doc in detail once again, maybe I get to that later > this week. Alright, so we'll leave it be for now then and stick with this v3. The do_access_memset trick is clever, but sheesh, seems a bit overkill for here. On the real hardware, this instruction takes ~190us for every 32 byte chunk, so there's basically no way that we can possibly be worse than that. :) Jason
© 2016 - 2024 Red Hat, Inc.