Using 64-bit arithmetic increases the performance for xts-aes-128
when built with gcrypt:
Encrypt: 235 MB/s -> 320 MB/s
Decrypt: 245 MB/s -> 325 MB/s
Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
1 file changed, 33 insertions(+), 19 deletions(-)
diff --git a/crypto/xts.c b/crypto/xts.c
index ded4365191..f109c8a3ee 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -31,6 +31,12 @@ typedef struct {
uint64_t b;
} xts_uint128;
+#define xts_uint128_xor(D, S1, S2) \
+ do { \
+ (D)->a = (S1)->a ^ (S2)->a; \
+ (D)->b = (S1)->b ^ (S2)->b; \
+ } while (0)
+
static void xts_mult_x(uint8_t *I)
{
int x;
@@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
*/
static void xts_tweak_encdec(const void *ctx,
xts_cipher_func *func,
- const uint8_t *src,
- uint8_t *dst,
- uint8_t *iv)
+ const xts_uint128 *src,
+ xts_uint128 *dst,
+ xts_uint128 *iv)
{
- unsigned long x;
-
/* tweak encrypt block i */
- for (x = 0; x < XTS_BLOCK_SIZE; x++) {
- dst[x] = src[x] ^ iv[x];
- }
+ xts_uint128_xor(dst, src, iv);
- func(ctx, XTS_BLOCK_SIZE, dst, dst);
+ func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
- for (x = 0; x < XTS_BLOCK_SIZE; x++) {
- dst[x] = dst[x] ^ iv[x];
- }
+ xts_uint128_xor(dst, dst, iv);
/* LFSR the tweak */
- xts_mult_x(iv);
+ xts_mult_x((uint8_t *)iv);
}
@@ -110,7 +110,11 @@ void xts_decrypt(const void *datactx,
encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
for (i = 0; i < lim; i++) {
- xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
+ xts_uint128 S, D;
+
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
src += XTS_BLOCK_SIZE;
dst += XTS_BLOCK_SIZE;
@@ -118,11 +122,13 @@ void xts_decrypt(const void *datactx,
/* if length is not a multiple of XTS_BLOCK_SIZE then */
if (mo > 0) {
+ xts_uint128 S, D;
memcpy(&CC, &T, XTS_BLOCK_SIZE);
xts_mult_x((uint8_t *)&CC);
/* PP = tweak decrypt block m-1 */
- xts_tweak_encdec(datactx, decfunc, src, (uint8_t *)&PP, (uint8_t *)&CC);
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, decfunc, &S, &PP, &CC);
/* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
for (i = 0; i < mo; i++) {
@@ -134,7 +140,8 @@ void xts_decrypt(const void *datactx,
}
/* Pm-1 = Tweak uncrypt CC */
- xts_tweak_encdec(datactx, decfunc, (uint8_t *)&CC, dst, (uint8_t *)&T);
+ xts_tweak_encdec(datactx, decfunc, &CC, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
}
/* Decrypt the iv back */
@@ -171,7 +178,11 @@ void xts_encrypt(const void *datactx,
encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
for (i = 0; i < lim; i++) {
- xts_tweak_encdec(datactx, encfunc, src, dst, (uint8_t *)&T);
+ xts_uint128 S, D;
+
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, encfunc, &S, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
dst += XTS_BLOCK_SIZE;
src += XTS_BLOCK_SIZE;
@@ -179,8 +190,10 @@ void xts_encrypt(const void *datactx,
/* if length is not a multiple of XTS_BLOCK_SIZE then */
if (mo > 0) {
+ xts_uint128 S, D;
/* CC = tweak encrypt block m-1 */
- xts_tweak_encdec(datactx, encfunc, src, (uint8_t *)&CC, (uint8_t *)&T);
+ memcpy(&S, src, XTS_BLOCK_SIZE);
+ xts_tweak_encdec(datactx, encfunc, &S, &CC, &T);
/* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
for (i = 0; i < mo; i++) {
@@ -193,7 +206,8 @@ void xts_encrypt(const void *datactx,
}
/* Cm-1 = Tweak encrypt PP */
- xts_tweak_encdec(datactx, encfunc, (uint8_t *)&PP, dst, (uint8_t *)&T);
+ xts_tweak_encdec(datactx, encfunc, &PP, &D, &T);
+ memcpy(dst, &D, XTS_BLOCK_SIZE);
}
/* Decrypt the iv back */
--
2.17.1
On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> Using 64-bit arithmetic increases the performance for xts-aes-128
> when built with gcrypt:
>
> Encrypt: 235 MB/s -> 320 MB/s
> Decrypt: 245 MB/s -> 325 MB/s
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> ---
> crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
> 1 file changed, 33 insertions(+), 19 deletions(-)
>
> diff --git a/crypto/xts.c b/crypto/xts.c
> index ded4365191..f109c8a3ee 100644
> --- a/crypto/xts.c
> +++ b/crypto/xts.c
> @@ -31,6 +31,12 @@ typedef struct {
> uint64_t b;
> } xts_uint128;
>
> +#define xts_uint128_xor(D, S1, S2) \
> + do { \
> + (D)->a = (S1)->a ^ (S2)->a; \
> + (D)->b = (S1)->b ^ (S2)->b; \
> + } while (0)
> +
> static void xts_mult_x(uint8_t *I)
> {
> int x;
> @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
> */
> static void xts_tweak_encdec(const void *ctx,
> xts_cipher_func *func,
> - const uint8_t *src,
> - uint8_t *dst,
> - uint8_t *iv)
> + const xts_uint128 *src,
> + xts_uint128 *dst,
> + xts_uint128 *iv)
> {
> - unsigned long x;
> -
> /* tweak encrypt block i */
> - for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> - dst[x] = src[x] ^ iv[x];
> - }
> + xts_uint128_xor(dst, src, iv);
>
> - func(ctx, XTS_BLOCK_SIZE, dst, dst);
> + func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
In line with what I said earlier, perhaps it's clearer if you leave
everything as uint8_t * and simply make xts_uint128_xor() treat the
array as xts_uint128 internally.
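E.g. something along these lines (untested sketch; note the casts
assume all three buffers are 8-byte aligned, strict aliasing aside,
which is exactly what the discussion further down is about):

static inline void xts_uint128_xor(uint8_t *dst,
                                   const uint8_t *src1,
                                   const uint8_t *src2)
{
    /* Sketch only: keep the callers' uint8_t * signature and
     * take the 64-bit view inside the helper. */
    xts_uint128 *d = (xts_uint128 *)dst;
    const xts_uint128 *s1 = (const xts_uint128 *)src1;
    const xts_uint128 *s2 = (const xts_uint128 *)src2;

    d->a = s1->a ^ s2->a;
    d->b = s1->b ^ s2->b;
}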
> for (i = 0; i < lim; i++) {
> - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> + xts_uint128 S, D;
> +
> + memcpy(&S, src, XTS_BLOCK_SIZE);
> + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> + memcpy(dst, &D, XTS_BLOCK_SIZE);
Why do you need S and D?
Berto
On Tue, Oct 09, 2018 at 05:02:39PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:39 PM CEST, Daniel P. Berrangé wrote:
> > Using 64-bit arithmetic increases the performance for xts-aes-128
> > when built with gcrypt:
> >
> > Encrypt: 235 MB/s -> 320 MB/s
> > Decrypt: 245 MB/s -> 325 MB/s
> >
> > Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
> > ---
> > crypto/xts.c | 52 +++++++++++++++++++++++++++++++++-------------------
> > 1 file changed, 33 insertions(+), 19 deletions(-)
> >
> > diff --git a/crypto/xts.c b/crypto/xts.c
> > index ded4365191..f109c8a3ee 100644
> > --- a/crypto/xts.c
> > +++ b/crypto/xts.c
> > @@ -31,6 +31,12 @@ typedef struct {
> > uint64_t b;
> > } xts_uint128;
> >
> > +#define xts_uint128_xor(D, S1, S2) \
> > + do { \
> > + (D)->a = (S1)->a ^ (S2)->a; \
> > + (D)->b = (S1)->b ^ (S2)->b; \
> > + } while (0)
> > +
> > static void xts_mult_x(uint8_t *I)
> > {
> > int x;
> > @@ -59,25 +65,19 @@ static void xts_mult_x(uint8_t *I)
> > */
> > static void xts_tweak_encdec(const void *ctx,
> > xts_cipher_func *func,
> > - const uint8_t *src,
> > - uint8_t *dst,
> > - uint8_t *iv)
> > + const xts_uint128 *src,
> > + xts_uint128 *dst,
> > + xts_uint128 *iv)
> > {
> > - unsigned long x;
> > -
> > /* tweak encrypt block i */
> > - for (x = 0; x < XTS_BLOCK_SIZE; x++) {
> > - dst[x] = src[x] ^ iv[x];
> > - }
> > + xts_uint128_xor(dst, src, iv);
> >
> > - func(ctx, XTS_BLOCK_SIZE, dst, dst);
> > + func(ctx, XTS_BLOCK_SIZE, (uint8_t *)dst, (uint8_t *)dst);
>
> In line with what I said earlier, perhaps it's clearer if you leave
> everything as uint8_t * and simply make xts_uint128_xor() treat the
> array as xts_uint128 internally.
>
> > for (i = 0; i < lim; i++) {
> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> > + xts_uint128 S, D;
> > +
> > + memcpy(&S, src, XTS_BLOCK_SIZE);
> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>
> Why do you need S and D?
I think src & dst pointers can't be guaranteed to be aligned sufficiently
for int64 operations, if we just cast from uint8_t *.
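For reference, the usual alignment-safe pattern is to bounce the data
through memcpy(), which compilers lower to plain loads/stores where the
target allows them. A minimal sketch (the helper names are made up):

static inline uint64_t xts_load_u64(const uint8_t *p)
{
    uint64_t v;
    /* memcpy() is well-defined for any alignment of p */
    memcpy(&v, p, sizeof(v));
    return v;
}

static inline void xts_store_u64(uint8_t *p, uint64_t v)
{
    memcpy(p, &v, sizeof(v));
}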
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>> > for (i = 0; i < lim; i++) {
>> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> > + xts_uint128 S, D;
>> > +
>> > + memcpy(&S, src, XTS_BLOCK_SIZE);
>> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>>
>> Why do you need S and D?
>
> I think src & dst pointers can't be guaranteed to be aligned
> sufficiently for int64 operations, if we just cast from uint8_t *.
I see. I did a quick test without the memcpy() calls and it doesn't seem
to have a visible effect on performance, but if it turns out that it
does then maybe this is worth investigating further. I suspect all
buffers received by this code are allocated with qemu_try_blockalign()
anyway, so it should be safe.
Berto
On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
> >> > for (i = 0; i < lim; i++) {
> >> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
> >> > + xts_uint128 S, D;
> >> > +
> >> > + memcpy(&S, src, XTS_BLOCK_SIZE);
> >> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
> >> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
> >>
> >> Why do you need S and D?
> >
> > I think src & dst pointers can't be guaranteed to be aligned
> > sufficiently for int64 operations, if we just cast from uint8_t *.
>
> I see. I did a quick test without the memcpy() calls and it doesn't seem
> to have a visible effect on performance, but if it turns out that it
> does then maybe this is worth investigating further. I suspect all
> buffers received by this code are allocated with qemu_try_blockalign()
> anyway, so it should be safe.
The extra memcpy() calls certainly had a perf impact when I added
them, so if we can determine that we can safely do without, that
would be desirable.
Regards,
Daniel
--
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
On Tue 09 Oct 2018 05:31:32 PM CEST, Daniel P. Berrangé wrote:
> On Tue, Oct 09, 2018 at 05:30:25PM +0200, Alberto Garcia wrote:
>> >> > for (i = 0; i < lim; i++) {
>> >> > - xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
>> >> > + xts_uint128 S, D;
>> >> > +
>> >> > + memcpy(&S, src, XTS_BLOCK_SIZE);
>> >> > + xts_tweak_encdec(datactx, decfunc, &S, &D, &T);
>> >> > + memcpy(dst, &D, XTS_BLOCK_SIZE);
>> >>
>> >> Why do you need S and D?
>> >
>> > I think src & dst pointers can't be guaranteed to be aligned
>> > sufficiently for int64 operations, if we just cast from uint8_t *.
>>
>> I see. I did a quick test without the memcpy() calls and it doesn't
>> seem to have a visible effect on performance, but if it turns out
>> that it does then maybe this is worth investigating further. I
>> suspect all buffers received by this code are allocated with
>> qemu_try_blockalign() anyway, so it should be safe.
>
> The extra memcpy() calls certainly had a perf impact when I added
> them, so if we can determine that we can safely do without, that would
> be desirable.
So I was having a look at this. From the block layer everything comes
properly aligned. Then there's VirtioCrypto, which seems to allow XTS
mode but I couldn't quite tell from virtio_crypto_sym_op_helper() if all
buffers are guaranteed to be aligned.
What you could do is add a runtime check (with QEMU_PTR_IS_ALIGNED()) and
decide which implementation to use.
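Something like this (sketch only; the two branch bodies are placeholders
for a cast-based fast variant and the current memcpy-based one):

if (QEMU_PTR_IS_ALIGNED(src, sizeof(uint64_t)) &&
    QEMU_PTR_IS_ALIGNED(dst, sizeof(uint64_t))) {
    /* fast path: operate on the caller's buffers directly */
} else {
    /* slow path: bounce through aligned stack copies as in the patch */
}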
A couple of additional thoughts:
- x86_64 (and others) allow unaligned memory accesses, and that might be
faster than copying the buffer using memcpy(). I haven't measured it,
however.
- qcrypto_block_{encrypt,decrypt}_helper() (used for encrypted block
I/O) use the same buffer for input and output, so maybe it's worth
exploring whether that fact allows for additional optimization if you
still need to use memcpy() (see the sketch below).
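For illustration, a minimal sketch of that in-place idea (this assumes
xts_tweak_encdec() is safe with src == dst, which it appears to be,
since the XORs and the cipher call all operate element-wise in place):

xts_uint128 B;

memcpy(&B, src, XTS_BLOCK_SIZE);   /* one aligned bounce buffer... */
xts_tweak_encdec(datactx, encfunc, &B, &B, &T);
memcpy(dst, &B, XTS_BLOCK_SIZE);   /* ...instead of separate S and D */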
Berto