[v1] crypto: improve performance of XTS cipher mode

[Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type

Posted by Daniel P. Berrangé 7 years, 4 months ago

The new type is designed to allow use of 64-bit arithmetic instead
of operating on one byte at a time. The following patches will use
this to improve performance.

Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>
---
 crypto/xts.c | 45 +++++++++++++++++++++++++--------------------
 1 file changed, 25 insertions(+), 20 deletions(-)

diff --git a/crypto/xts.c b/crypto/xts.c
index 3c1a92f01d..ded4365191 100644
--- a/crypto/xts.c
+++ b/crypto/xts.c
@@ -26,6 +26,11 @@
 #include "qemu/osdep.h"
 #include "crypto/xts.h"
 
+typedef struct {
+    uint64_t a;
+    uint64_t b;
+} xts_uint128;
+
 static void xts_mult_x(uint8_t *I)
 {
     int x;
@@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
                  uint8_t *dst,
                  const uint8_t *src)
 {
-    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    xts_uint128 PP, CC, T;
     unsigned long i, m, mo, lim;
 
     /* get number of blocks */
@@ -102,10 +107,10 @@ void xts_decrypt(const void *datactx,
     }
 
     /* encrypt the iv */
-    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+    encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, decfunc, src, dst, T);
+        xts_tweak_encdec(datactx, decfunc, src, dst, (uint8_t *)&T);
 
         src += XTS_BLOCK_SIZE;
         dst += XTS_BLOCK_SIZE;
@@ -113,27 +118,27 @@ void xts_decrypt(const void *datactx,
 
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
-        memcpy(CC, T, XTS_BLOCK_SIZE);
-        xts_mult_x(CC);
+        memcpy(&CC, &T, XTS_BLOCK_SIZE);
+        xts_mult_x((uint8_t *)&CC);
 
         /* PP = tweak decrypt block m-1 */
-        xts_tweak_encdec(datactx, decfunc, src, PP, CC);
+        xts_tweak_encdec(datactx, decfunc, src, (uint8_t *)&PP, (uint8_t *)&CC);
 
         /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
         for (i = 0; i < mo; i++) {
-            CC[i] = src[XTS_BLOCK_SIZE + i];
-            dst[XTS_BLOCK_SIZE + i] = PP[i];
+            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
         }
         for (; i < XTS_BLOCK_SIZE; i++) {
-            CC[i] = PP[i];
+            ((uint8_t *)&CC)[i] = ((uint8_t *)&PP)[i];
         }
 
         /* Pm-1 = Tweak uncrypt CC */
-        xts_tweak_encdec(datactx, decfunc, CC, dst, T);
+        xts_tweak_encdec(datactx, decfunc, (uint8_t *)&CC, dst, (uint8_t *)&T);
     }
 
     /* Decrypt the iv back */
-    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, (uint8_t *)&T);
 }
 
 
@@ -146,7 +151,7 @@ void xts_encrypt(const void *datactx,
                  uint8_t *dst,
                  const uint8_t *src)
 {
-    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
+    xts_uint128 PP, CC, T;
     unsigned long i, m, mo, lim;
 
     /* get number of blocks */
@@ -163,10 +168,10 @@ void xts_encrypt(const void *datactx,
     }
 
     /* encrypt the iv */
-    encfunc(tweakctx, XTS_BLOCK_SIZE, T, iv);
+    encfunc(tweakctx, XTS_BLOCK_SIZE, (uint8_t *)&T, iv);
 
     for (i = 0; i < lim; i++) {
-        xts_tweak_encdec(datactx, encfunc, src, dst, T);
+        xts_tweak_encdec(datactx, encfunc, src, dst, (uint8_t *)&T);
 
         dst += XTS_BLOCK_SIZE;
         src += XTS_BLOCK_SIZE;
@@ -175,22 +180,22 @@ void xts_encrypt(const void *datactx,
     /* if length is not a multiple of XTS_BLOCK_SIZE then */
     if (mo > 0) {
         /* CC = tweak encrypt block m-1 */
-        xts_tweak_encdec(datactx, encfunc, src, CC, T);
+        xts_tweak_encdec(datactx, encfunc, src, (uint8_t *)&CC, (uint8_t *)&T);
 
         /* Cm = first length % XTS_BLOCK_SIZE bytes of CC */
         for (i = 0; i < mo; i++) {
-            PP[i] = src[XTS_BLOCK_SIZE + i];
-            dst[XTS_BLOCK_SIZE + i] = CC[i];
+            ((uint8_t *)&PP)[i] = src[XTS_BLOCK_SIZE + i];
+            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&CC)[i];
         }
 
         for (; i < XTS_BLOCK_SIZE; i++) {
-            PP[i] = CC[i];
+            ((uint8_t *)&PP)[i] = ((uint8_t *)&CC)[i];
         }
 
         /* Cm-1 = Tweak encrypt PP */
-        xts_tweak_encdec(datactx, encfunc, PP, dst, T);
+        xts_tweak_encdec(datactx, encfunc, (uint8_t *)&PP, dst, (uint8_t *)&T);
     }
 
     /* Decrypt the iv back */
-    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, T);
+    decfunc(tweakctx, XTS_BLOCK_SIZE, iv, (uint8_t *)&T);
 }
-- 
2.17.1

Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type

Posted by Alberto Garcia 7 years, 4 months ago

On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:
> The new type is designed to allow use of 64-bit arithmetic instead
> of operating 1-byte at a time. The following patches will use this to
> improve performance.
>
> Signed-off-by: Daniel P. Berrangé <berrange@redhat.com>

I suppose that the fixes for the endianness problem may end up requiring
you to change this, but the patch itself is fine as it is now.

Reviewed-by: Alberto Garcia <berto@igalia.com>

Berto

Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type

Posted by Alberto Garcia 7 years, 4 months ago

On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:

> @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
>                   uint8_t *dst,
>                   const uint8_t *src)
>  {
> -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
> +    xts_uint128 PP, CC, T;
>      unsigned long i, m, mo, lim;

   [...]

>          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
>          for (i = 0; i < mo; i++) {
> -            CC[i] = src[XTS_BLOCK_SIZE + i];
> -            dst[XTS_BLOCK_SIZE + i] = PP[i];
> +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
> +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
>          }

On second thoughts, these casts are a bit cumbersome. I wonder if it
isn't better to keep the array as a uint8_t[] and only treat it as
xts_uint128 in the places where you actually do 64-bit operations
(xts_uint128_xor, xts_mult_x).

Berto

Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type

Posted by Daniel P. Berrangé 7 years, 4 months ago

On Tue, Oct 09, 2018 at 04:50:16PM +0200, Alberto Garcia wrote:
> On Tue 09 Oct 2018 02:55:38 PM CEST, Daniel P. Berrangé wrote:
> 
> > @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
> >                   uint8_t *dst,
> >                   const uint8_t *src)
> >  {
> > -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
> > +    xts_uint128 PP, CC, T;
> >      unsigned long i, m, mo, lim;
> 
>    [...]
> 
> >          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
> >          for (i = 0; i < mo; i++) {
> > -            CC[i] = src[XTS_BLOCK_SIZE + i];
> > -            dst[XTS_BLOCK_SIZE + i] = PP[i];
> > +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
> > +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
> >          }
> 
> On second thoughts, these casts are a bit cumbersome. I wonder if it
> isn't better to keep the array a uint8_t[] and only treat it as
> xts_uint128 in the places where you actually do 64-bit operations
> (xts_uint128_xor, xts_mult_x).

I had done that originally, but it just shifts the ugly casts from one
place in the code to another. I preferred the idea of storing
it all as a 128-bit data type since that matches the operational
block size.

A further alternative is for xts_uint128 to be a union providing
both representations, with an extra level of access for the respective
fields. I had also tried that at one time, but ultimately I decided I
didn't mind the casts.

Regards,
Daniel
-- 
|: https://berrange.com      -o-    https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org         -o-            https://fstop138.berrange.com :|
|: https://entangle-photo.org    -o-    https://www.instagram.com/dberrange :|

Re: [Qemu-devel] [PATCH 3/6] crypto: introduce a xts_uint128 data type

Posted by Alberto Garcia 7 years, 4 months ago

On Tue 09 Oct 2018 04:58:39 PM CEST, Daniel P. Berrangé wrote:
>> > @@ -85,7 +90,7 @@ void xts_decrypt(const void *datactx,
>> >                   uint8_t *dst,
>> >                   const uint8_t *src)
>> >  {
>> > -    uint8_t PP[XTS_BLOCK_SIZE], CC[XTS_BLOCK_SIZE], T[XTS_BLOCK_SIZE];
>> > +    xts_uint128 PP, CC, T;
>> >      unsigned long i, m, mo, lim;
>> 
>>    [...]
>> 
>> >          /* Pm = first length % XTS_BLOCK_SIZE bytes of PP */
>> >          for (i = 0; i < mo; i++) {
>> > -            CC[i] = src[XTS_BLOCK_SIZE + i];
>> > -            dst[XTS_BLOCK_SIZE + i] = PP[i];
>> > +            ((uint8_t *)&CC)[i] = src[XTS_BLOCK_SIZE + i];
>> > +            dst[XTS_BLOCK_SIZE + i] = ((uint8_t *)&PP)[i];
>> >          }
>> 
>> On second thoughts, these casts are a bit cumbersome. I wonder if it
>> isn't better to keep the array a uint8_t[] and only treat it as
>> xts_uint128 in the places where you actually do 64-bit operations
>> (xts_uint128_xor, xts_mult_x).
>
> I had done that originally, but it just shifts ugly casts from one
> place to another place in the code.

Does it really? There are a dozen casts to uint8_t * in different
places. If you use uint8_t[] you would only need something like this:

/*
 * Sketch: keep the byte-pointer interface for callers and reinterpret
 * the buffer as xts_uint128 only inside the helper, so the callers need
 * no casts.
 * NOTE(review): the cast below assumes I8 is suitably aligned for
 * xts_uint128 (which holds uint64_t members) and that accessing the
 * bytes through this type is acceptable under the build's aliasing
 * rules -- confirm before adopting.
 */
static void xts_mult_x(uint8_t *I8)
{
    /* View the caller's byte buffer as a single xts_uint128 value. */
    xts_uint128 *I = (xts_uint128 *) I8;
    /* ... the rest of the function remains the same ... */
}

And something similar in xts_uint128_xor(), which could be an inline
function instead of a macro.

Berto