Using 64-bit arithmetic increases the performance for xts-aes-128
when built with gcrypt:
Encrypt: 235 MB/s -> 320 MB/s
Decrypt: 245 MB/s -> 325 MB/s
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
1 file changed, 33 insertions(+), 19 deletions(-)
diff --git a/crypto/xts.c b/crypto/xts.c
index ded4365191..f109c8a3ee 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -31,6 +31,12 @@ typedef struct {
uint64_t b;
} xts_uint128;
+#define xts_uint128_xor(D, S1, S2) \
+ do { \
+ (D)->a = (S1)->a ^ (S2)->a; \
+ (D)->b = (S1)->b ^ (S2)->b; \
+ } while (0)
+
static void xts_mult_x(uint8_t *I)
{
int x;
@@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
*/
static void xts_tweak_encdec(const void *ctx,
xts_cipher_func *func,
- const uint8_t *src,
- uint8_t *dst,
- uint8_t *iv)
+ const xts_uint128 *src,
+ xts_uint128 *dst,
+ xts_uint128 *iv)
{
- unsigned long x;
-
/* tweak encrypt block i */
- for (x = 0; x < XTS_BLOCK_SIZE; x++) {
- dst[x] = src[x] ^ iv[x];
- }
+ xts_uint128_xor(dst, src, iv);
- func(ctx, XTS_BLOCK_SIZE, dst, dst);
+ func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
- for (x = 0; x < XTS_BLOCK_SIZE; x++) {
- dst[x] = dst[x] ^ iv[x];
- }
+ xts_uint128_xor(dst, dst, iv);
/* LFSR the tweak */
- xts_mult_x(iv);
+ xts_mult_x((uint8_t *)iv);
}
@@ -110,7 +110,11 @@ void xts_decrypt(const void *datactx,
encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
for (i = 0; i < lim; i++) {
- xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
+ xts_uint128 S, D;
+
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
src += XTS_BLOCK_SIZE;
dst += XTS_BLOCK_SIZE;
@@ -118,11 +122,13 @@ void xts_decrypt(const void *datactx,
/* if length is not a multiple of XTS_BLOCK_SIZE then */
if (mo > 0) {
+ xts_uint128 S, D;
memcpy(&CC, &T, XTS_BLOCK_SIZE);
xts_mult_x((uint8_t *)&CC);
/* PP = tweak decrypt block m-1 */
- xts_tweak_encdec(datactx, decfunc, src, (uint8_t *)&PP, (uint8_t *)&CC);
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, decfunc, &S, &PP, &CC);
/* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
for (i = 0; i < mo; i++) {
@@ -134,7 +140,8 @@ void xts_decrypt(const void *datactx,
}
/* Pm-1 = Tweak uncrypt CC */
- xts_tweak_encdec(datactx, decfunc, (uint8_t *)&CC, dst, (uint8_t *)&T);
+ xts_tweak_encdec(datactx, decfunc, &CC, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
}
/* Decrypt the iv back */
@@ -171,7 +178,11 @@ void xts_encrypt(const void *datactx,
encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
for (i = 0; i < lim; i++) {
- xts_tweak_encdec(datactx, encfunc, src, dst, (uint8_t *)&T);
+ xts_uint128 S, D;
+
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, encfunc, &S, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
dst += XTS_BLOCK_SIZE;
src += XTS_BLOCK_SIZE;
@@ -179,8 +190,10 @@ void xts_encrypt(const void *datactx,
/* if length is not a multiple of XTS_BLOCK_SIZE then */
if (mo > 0) {
+ xts_uint128 S, D;
/* CC = tweak encrypt block m-1 */
- xts_tweak_encdec(datactx, encfunc, src, (uint8_t *)&CC, (uint8_t *)&T);
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, encfunc, &S, &CC, &T);
/* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
for (i = 0; i < mo; i++) {
@@ -193,7 +206,8 @@ void xts_encrypt(const void *datactx,
}
/* Cm-1 = Tweak encrypt PP */
- xts_tweak_encdec(datactx, encfunc, (uint8_t *)&PP, dst, (uint8_t *)&T);
+ xts_tweak_encdec(datactx, encfunc, &PP, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
}
/* Decrypt the iv back */
--
2.17.1
On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> Using 64-bit arithmetic increases the performance for xts-aes-128
> when built with gcrypt:
>
> Encrypt: 235 MB/s -> 320 MB/s
> Decrypt: 245 MB/s -> 325 MB/s
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> ---
> crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
> 1 file changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/crypto/xts.c b/crypto/xts.c
> index ded4365191..f109c8a3ee 100644
> --- a/crypto/xts.c
> +++ b/crypto/xts.c
> @@ -31,6 +31,12 @@ typedef struct {
> uint64_t b;
> } xts_uint128;
>
> +#define xts_uint128_xor(D, S1, S2) \
> + do { \
> + (D)->a = (S1)->a ^ (S2)->a; \
> + (D)->b = (S1)->b ^ (S2)->b; \
> + } while (0)
> +
> static void xts_mult_x(uint8_t *I)
> {
> int x;
> @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
> */
> static void xts_tweak_encdec(const void *ctx,
> xts_cipher_func *func,
> - const uint8_t *src,
> - uint8_t *dst,
> - uint8_t *iv)
> + const xts_uint128 *src,
> + xts_uint128 *dst,
> + xts_uint128 *iv)
> {
> - unsigned long x;
> -
> /* tweak encrypt block i */
> - for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> - dst[x] = src[x] ^ iv[x];
> - }
> + xts_uint128_xor(dst, src, iv);
>
> - func(ctx, XTS_BLOCK_SIZE, dst, dst);
> + func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
In line with what I said earlier, perhaps it's clearer if you leave
everything as uint8_t * and simply make xts_uint128_xor() treat the
array as xts_uint128 internally.
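E.g. something along these lines (untested sketch; note the casts
assume all three buffers are 8-byte aligned, strict aliasing aside,
which is exactly what the discussion further down is about):

static inline void xts_uint128_xor(uint8_t *dst,
                                   const uint8_t *src1,
                                   const uint8_t *src2)
{
    /* Sketch only: keep the callers' uint8_t * signature and
     * take the 64-bit view inside the helper. */
    xts_uint128 *d = (xts_uint128 *)dst;
    const xts_uint128 *s1 = (const xts_uint128 *)src1;
    const xts_uint128 *s2 = (const xts_uint128 *)src2;

    d->a = s1->a ^ s2->a;
    d->b = s1->b ^ s2->b;
}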
> for (i = 0; i < lim; i++) {
> - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> + xts_uint128 S, D;
> +
> + memcpy(&S, src, XTS_BLOCK_SIZE);
> + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> + memcpy(dst, &D, XTS_BLOCK_SIZE);
Why do you need S and D?
Berto
On Tue, Oct 09, 2018 at 05:02:39PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> > Using 64-bit arithmetic increases the performance for xts-aes-128
> > when built with gcrypt:
> >
> > Encrypt: 235 MB/s -> 320 MB/s
> > Decrypt: 245 MB/s -> 325 MB/s
> >
> > Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> > ---
> > crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
> > 1 file changed, 33 insertions(+), 19 deletions(-)
> >
> > diff --git a/crypto/xts.c b/crypto/xts.c
> > index ded4365191..f109c8a3ee 100644
> > --- a/crypto/xts.c
> > +++ b/crypto/xts.c
> > @@ -31,6 +31,12 @@ typedef struct {
> > uint64_t b;
> > } xts_uint128;
> >
> > +#define xts_uint128_xor(D, S1, S2) \
> > + do { \
> > + (D)->a = (S1)->a ^ (S2)->a; \
> > + (D)->b = (S1)->b ^ (S2)->b; \
> > + } while (0)
> > +
> > static void xts_mult_x(uint8_t *I)
> > {
> > int x;
> > @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
> > */
> > static void xts_tweak_encdec(const void *ctx,
> > xts_cipher_func *func,
> > - const uint8_t *src,
> > - uint8_t *dst,
> > - uint8_t *iv)
> > + const xts_uint128 *src,
> > + xts_uint128 *dst,
> > + xts_uint128 *iv)
> > {
> > - unsigned long x;
> > -
> > /* tweak encrypt block i */
> > - for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> > - dst[x] = src[x] ^ iv[x];
> > - }
> > + xts_uint128_xor(dst, src, iv);
> >
> > - func(ctx, XTS_BLOCK_SIZE, dst, dst);
> > + func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
>
> In line with what I said earlier, perhaps it's clearer if you leave
> everything as uint8_t * and simply make xts_uint128_xor() treat the
> array as xts_uint128 internally.
>
> > for (i = 0; i < lim; i++) {
> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> > + xts_uint128 S, D;
> > +
> > + memcpy(&S, src, XTS_BLOCK_SIZE);
> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>
> Why do you need S and D?
I think src & dst pointers can't be guaranteed to be aligned sufficiently
for int64 operations, if we just cast from uint8_t *.
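For reference, the usual alignment-safe pattern is to bounce the data
through memcpy(), which compilers lower to plain loads/stores where the
target allows them. A minimal sketch (the helper names are made up):

static inline uint64_t xts_load_u64(const uint8_t *p)
{
    uint64_t v;
    /* memcpy() is well-defined for any alignment of p */
    memcpy(&v, p, sizeof(v));
    return v;
}

static inline void xts_store_u64(uint8_t *p, uint64_t v)
{
    memcpy(p, &v, sizeof(v));
}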
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>> > for (i = 0; i < lim; i++) {
>> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> > + xts_uint128 S, D;
>> > +
>> > + memcpy(&S, src, XTS_BLOCK_SIZE);
>> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>>
>> Why do you need S and D?
>
> I think src & dst pointers can't be guaranteed to be aligned
> sufficiently for int64 operations, if we just cast from uint8_t *.
I see. I did a quick test without the memcpy() calls and it doesn't seem
to have a visible effect on performance, but if it turns out that it
does then maybe this is worth investigating further. I suspect all
buffers received by this code are allocated with qemu_try_blockalign()
anyway, so it should be safe.
Berto
On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
> >> > for (i = 0; i < lim; i++) {
> >> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> >> > + xts_uint128 S, D;
> >> > +
> >> > + memcpy(&S, src, XTS_BLOCK_SIZE);
> >> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> >> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
> >>
> >> Why do you need S and D?
> >
> > I think src & dst pointers can't be guaranteed to be aligned
> > sufficiently for int64 operations, if we just cast from uint8_t *.
>
> I see. I did a quick test without the memcpy() calls and it doesn't seem
> to have a visible effect on performance, but if it turns out that it
> does then maybe this is worth investigating further. I suspect all
> buffers received by this code are allocated with qemu_try_blockalign()
> anyway, so it should be safe.
The extra memcpy() calls certainly had a perf impact when I added
them, so if we can determine that we can safely do without, that
would be desirable.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Tue 09 Oct 2018 05:31:32 PM CEST, Daniel P. Berrangé wrote:
> On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
>> >> > for (i = 0; i < lim; i++) {
>> >> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> >> > + xts_uint128 S, D;
>> >> > +
>> >> > + memcpy(&S, src, XTS_BLOCK_SIZE);
>> >> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> >> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>> >>
>> >> Why do you need S and D?
>> >
>> > I think src & dst pointers can't be guaranteed to be aligned
>> > sufficiently for int64 operations, if we just cast from uint8_t *.
>>
>> I see. I did a quick test without the memcpy() calls and it doesn't
>> seem to have a visible effect on performance, but if it turns out
>> that it does then maybe this is worth investigating further. I
>> suspect all buffers received by this code are allocated with
>> qemu_try_blockalign() anyway, so it should be safe.
>
> The extra memcpy() calls certainly had a perf impact when I added
> them, so if we can determine that we can safely do without, that would
> be desirable.
So I was having a look at this. From the block layer everything comes
properly aligned. Then there's VirtioCrypto, which seems to allow XTS
mode but I couldn't quite tell from virtio_crypto_sym_op_helper() if all
buffers are guaranteed to be aligned.
What you could do is add a runtime check (with QEMU_PTR_IS_ALIGNED()) and
decide which implementation to use.
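Something like this (sketch only; the two branch bodies are placeholders
for a cast-based fast variant and the current memcpy-based one):

if (QEMU_PTR_IS_ALIGNED(src, sizeof(uint64_t)) &&
    QEMU_PTR_IS_ALIGNED(dst, sizeof(uint64_t))) {
    /* fast path: operate on the caller's buffers directly */
} else {
    /* slow path: bounce through aligned stack copies as in the patch */
}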
A couple of additional thoughts:
- x86_64 (and others) allow unaligned memory accesses, and that might be
faster than copying the buffer using memcpy(). I haven't measured it,
however.
- qcrypto_block_{encrypt,decrypt}_helper() (used for encrypted block
I/O) use the same buffer for input and output, so maybe it's worth
exploring whether that fact allows for additional optimization if you
still need to use memcpy() (see the sketch below).
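For illustration, a minimal sketch of that in-place idea (this assumes
xts_tweak_encdec() is safe with src == dst, which it appears to be,
since the XORs and the cipher call all operate element-wise in place):

xts_uint128 B;

memcpy(&B, src, XTS_BLOCK_SIZE);   /* one aligned bounce buffer... */
xts_tweak_encdec(datactx, encfunc, &B, &B, &T);
memcpy(dst, &B, XTS_BLOCK_SIZE);   /* ...instead of separate S and D */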
Berto