[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

ling xu posted 2 patches 3 years, 6 months ago
Maintainers: Juan Quintela <quintela@redhat.com>, "Dr. David Alan Gilbert" <dgilbert@redhat.com>
[PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by ling xu 3 years, 6 months ago
This commit update runtime check of AVX512, and implements avx512 of
xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
Compared with C version of xbzrle_encode_buffer function, avx512 version
can achieve almost 60%-70% performance improvement on unit test provided
by Qemu. In addition, we provide one more unit test called
"test_encode_decode_random", in which dirty data are randomly located in
4K page, and this case can achieve almost 140% performance gain.

Signed-off-by: ling xu <ling1.xu@intel.com>
Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
Co-authored-by: Jun Jin <jun.i.jin@intel.com>
---
 meson.build        |  16 ++++
 meson_options.txt  |   2 +
 migration/ram.c    |  41 ++++++++++
 migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
 migration/xbzrle.h |   4 +
 5 files changed, 244 insertions(+)

diff --git a/meson.build b/meson.build
index 294e9a8f32..4222b77e9f 100644
--- a/meson.build
+++ b/meson.build
@@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
     int main(int argc, char *argv[]) { return bar(argv[0]); }
   '''), error_message: 'AVX512F not available').allowed())
 
+config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
+  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
+  .require(cc.links('''
+    #pragma GCC push_options
+    #pragma GCC target("avx512bw")
+    #include <cpuid.h>
+    #include <immintrin.h>
+    static int bar(void *a) {
+
+      __m512i x = *(__m512i *)a;
+      __m512i res= _mm512_abs_epi8(x);
+      return res[1];
+    }
+    int main(int argc, char *argv[]) { return bar(argv[0]); }
+  '''), error_message: 'AVX512BW not available').allowed())
+
 have_pvrdma = get_option('pvrdma') \
   .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
   .require(cc.compiles(gnu_source_prefix + '''
diff --git a/meson_options.txt b/meson_options.txt
index e58e158396..07194bf680 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
        description: 'AVX2 optimizations')
 option('avx512f', type: 'feature', value: 'disabled',
        description: 'AVX512F optimizations')
+option('avx512bw', type: 'feature', value: 'auto',
+       description: 'AVX512BW optimizations')
 option('keyring', type: 'feature', value: 'auto',
        description: 'Linux keyring support')
 
diff --git a/migration/ram.c b/migration/ram.c
index dc1de9ddbc..d9c1ac2f7a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -83,6 +83,35 @@
 /* 0x80 is reserved in migration.h start with 0x100 next */
 #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
 
+#if defined(CONFIG_AVX512BW_OPT)
+static bool IS_CPU_SUPPORT_AVX512BW;
+#include "qemu/cpuid.h"
+static void __attribute__((constructor)) init_cpu_flag(void)
+{
+    unsigned max = __get_cpuid_max(0, NULL);
+    int a, b, c, d;
+    IS_CPU_SUPPORT_AVX512BW = false;
+    if (max >= 1) {
+        __cpuid(1, a, b, c, d);
+         /* We must check that AVX is not just available, but usable.  */
+        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
+            int bv;
+            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
+            __cpuid_count(7, 0, a, b, c, d);
+           /* 0xe6:
+            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
+            *                    and ZMM16-ZMM31 state are enabled by OS)
+            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
+            */
+            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
+                IS_CPU_SUPPORT_AVX512BW = true;
+            }
+        }
+    }
+    return ;
+}
+#endif
+
 XBZRLECacheStats xbzrle_counters;
 
 /* struct contains XBZRLE cache and a static page
@@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
     memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
 
     /* XBZRLE encoding (if there is no overflow) */
+    #if defined(CONFIG_AVX512BW_OPT)
+    if (likely(IS_CPU_SUPPORT_AVX512BW)) {
+        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
+                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+                                               TARGET_PAGE_SIZE);
+    } else {
+        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
+                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
+                                           TARGET_PAGE_SIZE);
+    }
+    #else
     encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
                                        TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
                                        TARGET_PAGE_SIZE);
+    #endif
 
     /*
      * Update the cache contents, so that it corresponds to the data
diff --git a/migration/xbzrle.c b/migration/xbzrle.c
index 1ba482ded9..4db09fdbdb 100644
--- a/migration/xbzrle.c
+++ b/migration/xbzrle.c
@@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
 
     return d;
 }
+
+#if defined(CONFIG_AVX512BW_OPT)
+#pragma GCC push_options
+#pragma GCC target("avx512bw")
+
+#include <immintrin.h>
+#include <math.h>
+#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
+int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                             uint8_t *dst, int dlen)
+{
+    uint32_t zrun_len = 0, nzrun_len = 0;
+    int d = 0, i = 0, num = 0;
+    uint8_t *nzrun_start = NULL;
+    int count512s = (slen >> 6);
+    int res = slen % 64;
+    bool never_same = true;
+    while (count512s--) {
+        if (d + 2 > dlen) {
+            return -1;
+        }
+        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
+                               0xffffffffffffffff, old_buf + i);
+        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
+                                                 0xffffffffffffffff, new_buf + i);
+        /* in mask bit 1 for same, 0 for diff */
+        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+
+        int bytesToCheck = 64;
+        bool is_same = (comp & 0x1);
+        while (bytesToCheck) {
+            if (is_same) {
+                if (nzrun_len) {
+                    d += uleb128_encode_small(dst + d, nzrun_len);
+                    if (d + nzrun_len > dlen) {
+                        return -1;
+                    }
+                    nzrun_start = new_buf + i - nzrun_len;
+                    memcpy(dst + d, nzrun_start, nzrun_len);
+                    d += nzrun_len;
+                    nzrun_len = 0;
+                }
+                if (comp == 0xffffffffffffffff) {
+                    i += 64;
+                    zrun_len += 64;
+                    break;
+                }
+                never_same = false;
+                num = __builtin_ctzl(~comp);
+                num = (num < bytesToCheck) ? num : bytesToCheck;
+                zrun_len += num;
+                bytesToCheck -= num;
+                comp >>= num;
+                i += num;
+                if (bytesToCheck) {
+                    /* still has different data after same data */
+                    d += uleb128_encode_small(dst + d, zrun_len);
+                    zrun_len = 0;
+                } else {
+                    break;
+                }
+            }
+            if (never_same || zrun_len) {
+                /*
+                 * never_same only acts if
+                 * data begins with diff in first count512s
+                 */
+                d += uleb128_encode_small(dst + d, zrun_len);
+                zrun_len = 0;
+                never_same = false;
+            }
+            /* has diff */
+            if ((bytesToCheck == 64) && (comp == 0x0)) {
+                i += 64;
+                nzrun_len += 64;
+                break;
+            }
+            num = __builtin_ctzl(comp);
+            num = (num < bytesToCheck) ? num : bytesToCheck;
+            nzrun_len += num;
+            bytesToCheck -= num;
+            comp >>= num;
+            i += num;
+            if (bytesToCheck) {
+                /* mask like 111000 */
+                d += uleb128_encode_small(dst + d, nzrun_len);
+                /* overflow */
+                if (d + nzrun_len > dlen) {
+                    return -1;
+                }
+                nzrun_start = new_buf + i - nzrun_len;
+                memcpy(dst + d, nzrun_start, nzrun_len);
+                d += nzrun_len;
+                nzrun_len = 0;
+                is_same = true;
+            }
+        }
+    }
+    if (res) {
+        /* the number of data is less than 64 */
+        unsigned long long mask = pow(2, res);
+        mask -= 1;
+        __m512i r = SET_ZERO512(r);
+        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
+        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
+        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
+
+        int bytesToCheck = res;
+        bool is_same = (comp & 0x1);
+        while (bytesToCheck) {
+            if (is_same) {
+                if (nzrun_len) {
+                    d += uleb128_encode_small(dst + d, nzrun_len);
+                    if (d + nzrun_len > dlen) {
+                        return -1;
+                    }
+                    nzrun_start = new_buf + i - nzrun_len;
+                    memcpy(dst + d, nzrun_start, nzrun_len);
+                    d += nzrun_len;
+                    nzrun_len = 0;
+                }
+                never_same = false;
+                num = __builtin_ctzl(~comp);
+                num = (num < bytesToCheck) ? num : bytesToCheck;
+                zrun_len += num;
+                bytesToCheck -= num;
+                comp >>= num;
+                i += num;
+                if (bytesToCheck) {
+                    /* diff after same */
+                    d += uleb128_encode_small(dst + d, zrun_len);
+                    zrun_len = 0;
+                } else {
+                    break;
+                }
+            }
+
+            if (never_same || zrun_len) {
+                d += uleb128_encode_small(dst + d, zrun_len);
+                zrun_len = 0;
+                never_same = false;
+            }
+            /* has diff */
+            num = __builtin_ctzl(comp);
+            num = (num < bytesToCheck) ? num : bytesToCheck;
+            nzrun_len += num;
+            bytesToCheck -= num;
+            comp >>= num;
+            i += num;
+            if (bytesToCheck) {
+                d += uleb128_encode_small(dst + d, nzrun_len);
+                /* overflow */
+                if (d + nzrun_len > dlen) {
+                    return -1;
+                }
+                nzrun_start = new_buf + i - nzrun_len;
+                memcpy(dst + d, nzrun_start, nzrun_len);
+                d += nzrun_len;
+                nzrun_len = 0;
+                is_same = true;
+            }
+        }
+    }
+
+    if (zrun_len) {
+        return (zrun_len == slen) ? 0 : d;
+    }
+    if (nzrun_len != 0) {
+        d += uleb128_encode_small(dst + d, nzrun_len);
+        /* overflow */
+        if (d + nzrun_len > dlen) {
+            return -1;
+        }
+        nzrun_start = new_buf + i - nzrun_len;
+        memcpy(dst + d, nzrun_start, nzrun_len);
+        d += nzrun_len;
+    }
+    return d;
+}
+#pragma GCC pop_options
+#endif
\ No newline at end of file
diff --git a/migration/xbzrle.h b/migration/xbzrle.h
index a0db507b9c..6247de5f00 100644
--- a/migration/xbzrle.h
+++ b/migration/xbzrle.h
@@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                          uint8_t *dst, int dlen);
 
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
+#if defined(CONFIG_AVX512BW_OPT)
+int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
+                             uint8_t *dst, int dlen);
+#endif
 #endif
-- 
2.25.1
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by Richard Henderson 3 years, 6 months ago
On 8/8/22 00:48, ling xu wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
> 
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>   meson.build        |  16 ++++
>   meson_options.txt  |   2 +
>   migration/ram.c    |  41 ++++++++++
>   migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>   migration/xbzrle.h |   4 +
>   5 files changed, 244 insertions(+)
> 
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>       int main(int argc, char *argv[]) { return bar(argv[0]); }
>     '''), error_message: 'AVX512F not available').allowed())
>   
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>   have_pvrdma = get_option('pvrdma') \
>     .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>     .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>          description: 'AVX2 optimizations')
>   option('avx512f', type: 'feature', value: 'disabled',
>          description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>   option('keyring', type: 'feature', value: 'auto',
>          description: 'Linux keyring support')
>   
> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>   /* 0x80 is reserved in migration.h start with 0x100 next */
>   #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>   
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;
> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>   XBZRLECacheStats xbzrle_counters;
>   
>   /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>       memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>   
>       /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {
> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }
> +    #else
>       encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                          TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                          TARGET_PAGE_SIZE);
> +    #endif
>   
>       /*
>        * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>   
>       return d;
>   }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{
> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;
> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Um, what?  This is a stupid version of "1ull << res".


> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {

Why have you unrolled this from the main loop?  That's the major advantage of using 
predicate registers, being able to fold the head (and/or tail) into the same loop.

> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);

Type error -- ctzl used with long long (which should be uint64_t).
You should be using ctz64().

> +                num = (num < bytesToCheck) ? num : bytesToCheck;

Why this test?  Don't you already know that ~comp != 0?

> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }

More generally, what benefit are you *really* getting out of avx512?  You're doing 
predicated loads and compares, but they're strictly length-based.  Then you're using the 
result of the comparison in serial.  I really can't imagine this being efficient at all.


r~
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by Juan Quintela 3 years, 6 months ago
ling xu <ling1.xu@intel.com> wrote:
> This commit update runtime check of AVX512, and implements avx512 of
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 version
> can achieve almost 60%-70% performance improvement on unit test provided
> by Qemu. In addition, we provide one more unit test called
> "test_encode_decode_random", in which dirty data are randomly located in
> 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  41 ++++++++++
>  migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build
> index 294e9a8f32..4222b77e9f 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }
> +  '''), error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt
> index e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>         description: 'AVX2 optimizations')
>  option('avx512f', type: 'feature', value: 'disabled',
>         description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
>         description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c
> index dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void)
> +{
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page
> @@ -802,9 +831,21 @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am
not sure the percentage of machines that support avx

> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }

the else part is the same than the #else part
> +    #else
>      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                         TARGET_PAGE_SIZE);
> +    #endif

So, why don't just create a new function pointer:

int (*xbzrle_encode_buffer_func)(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                 uint8_t *dst, int dlen) = xbzrle_encode_buffer;


And aad into init_cpu_flag() something in the line of:

	xbzrle_encode_buffer_func = xbrrle_encode_buffer_512;

?


>      /*
>       * Update the cache contents, so that it corresponds to the data
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c
> index 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen)
> +{

This is just personal taste, but I would rename this to:

xbzrle_encode_buffer_avx512?

> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;

res variable here means residual, normally we use "res" with meaning of
"result" in qemu.

> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Not your fault.

21st century.  Someone still use long long in a new API, sniff.

> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +
> +    if (zrun_len) {
> +        return (zrun_len == slen) ? 0 : d;
> +    }
> +    if (nzrun_len != 0) {
> +        d += uleb128_encode_small(dst + d, nzrun_len);
> +        /* overflow */
> +        if (d + nzrun_len > dlen) {
> +            return -1;
> +        }
> +        nzrun_start = new_buf + i - nzrun_len;
> +        memcpy(dst + d, nzrun_start, nzrun_len);
> +        d += nzrun_len;
> +    }
> +    return d;
> +}
> +#pragma GCC pop_options
> +#endif
> \ No newline at end of file
> diff --git a/migration/xbzrle.h b/migration/xbzrle.h
> index a0db507b9c..6247de5f00 100644
> --- a/migration/xbzrle.h
> +++ b/migration/xbzrle.h
> @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
>                           uint8_t *dst, int dlen);
>  
>  int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);
> +#if defined(CONFIG_AVX512BW_OPT)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen);
> +#endif
>  #endif
RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by Xu, Ling1 3 years, 6 months ago
Hi, Juan, 
      Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable; 2) use a function pointer to simplify code in ram.c; 3) change function name "xbzrle_encode_buffer_512" to "xbzrle_encode_buffer_avx512", change variable "res" to "countResidual" for better understanding, and replace "unsigned long long" with "uint64_t". 
       We will submit patch v4 to fix all issues mentioned in comments. 

Best Regard,
Ling

-----Original Message-----
From: Juan Quintela <quintela@redhat.com> 
Sent: Monday, August 8, 2022 9:12 PM
To: Xu, Ling1 <ling1.xu@intel.com>
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

ling xu <ling1.xu@intel.com> wrote:
> This commit update runtime check of AVX512, and implements avx512 of 
> xbzrle_encode_buffer function to accelerate xbzrle encoding speed.
> Compared with C version of xbzrle_encode_buffer function, avx512 
> version can achieve almost 60%-70% performance improvement on unit 
> test provided by Qemu. In addition, we provide one more unit test 
> called "test_encode_decode_random", in which dirty data are randomly 
> located in 4K page, and this case can achieve almost 140% performance gain.
>
> Signed-off-by: ling xu <ling1.xu@intel.com>
> Co-authored-by: Zhou Zhao <zhou.zhao@intel.com>
> Co-authored-by: Jun Jin <jun.i.jin@intel.com>
> ---
>  meson.build        |  16 ++++
>  meson_options.txt  |   2 +
>  migration/ram.c    |  41 ++++++++++
>  migration/xbzrle.c | 181 +++++++++++++++++++++++++++++++++++++++++++++
>  migration/xbzrle.h |   4 +
>  5 files changed, 244 insertions(+)
>
> diff --git a/meson.build b/meson.build index 294e9a8f32..4222b77e9f 
> 100644
> --- a/meson.build
> +++ b/meson.build
> @@ -2262,6 +2262,22 @@ config_host_data.set('CONFIG_AVX512F_OPT', get_option('avx512f') \
>      int main(int argc, char *argv[]) { return bar(argv[0]); }
>    '''), error_message: 'AVX512F not available').allowed())
>  
> +config_host_data.set('CONFIG_AVX512BW_OPT', get_option('avx512bw') \
> +  .require(have_cpuid_h, error_message: 'cpuid.h not available, 
> +cannot enable AVX512BW') \
> +  .require(cc.links('''
> +    #pragma GCC push_options
> +    #pragma GCC target("avx512bw")
> +    #include <cpuid.h>
> +    #include <immintrin.h>
> +    static int bar(void *a) {
> +
> +      __m512i x = *(__m512i *)a;
> +      __m512i res= _mm512_abs_epi8(x);
> +      return res[1];
> +    }
> +    int main(int argc, char *argv[]) { return bar(argv[0]); }  '''), 
> + error_message: 'AVX512BW not available').allowed())
> +
>  have_pvrdma = get_option('pvrdma') \
>    .require(rdma.found(), error_message: 'PVRDMA requires OpenFabrics libraries') \
>    .require(cc.compiles(gnu_source_prefix + '''
> diff --git a/meson_options.txt b/meson_options.txt index 
> e58e158396..07194bf680 100644
> --- a/meson_options.txt
> +++ b/meson_options.txt
> @@ -104,6 +104,8 @@ option('avx2', type: 'feature', value: 'auto',
>         description: 'AVX2 optimizations')  option('avx512f', type: 
> 'feature', value: 'disabled',
>         description: 'AVX512F optimizations')
> +option('avx512bw', type: 'feature', value: 'auto',
> +       description: 'AVX512BW optimizations')
>  option('keyring', type: 'feature', value: 'auto',
>         description: 'Linux keyring support')
>  

[no clue about meson, it looks ok]

> diff --git a/migration/ram.c b/migration/ram.c index 
> dc1de9ddbc..d9c1ac2f7a 100644
> --- a/migration/ram.c
> +++ b/migration/ram.c
> @@ -83,6 +83,35 @@
>  /* 0x80 is reserved in migration.h start with 0x100 next */
>  #define RAM_SAVE_FLAG_COMPRESS_PAGE    0x100
>  
> +#if defined(CONFIG_AVX512BW_OPT)
> +static bool IS_CPU_SUPPORT_AVX512BW;

An all caps global variable?

> +#include "qemu/cpuid.h"
> +static void __attribute__((constructor)) init_cpu_flag(void) {
> +    unsigned max = __get_cpuid_max(0, NULL);
> +    int a, b, c, d;
> +    IS_CPU_SUPPORT_AVX512BW = false;
> +    if (max >= 1) {
> +        __cpuid(1, a, b, c, d);
> +         /* We must check that AVX is not just available, but usable.  */
> +        if ((c & bit_OSXSAVE) && (c & bit_AVX) && max >= 7) {
> +            int bv;
> +            __asm("xgetbv" : "=a"(bv), "=d"(d) : "c"(0));
> +            __cpuid_count(7, 0, a, b, c, d);
> +           /* 0xe6:
> +            *  XCR0[7:5] = 111b (OPMASK state, upper 256-bit of ZMM0-ZMM15
> +            *                    and ZMM16-ZMM31 state are enabled by OS)
> +            *  XCR0[2:1] = 11b (XMM state and YMM state are enabled by OS)
> +            */
> +            if ((bv & 0xe6) == 0xe6 && (b & bit_AVX512BW)) {
> +                IS_CPU_SUPPORT_AVX512BW = true;
> +            }
> +        }
> +    }
> +    return ;
> +}
> +#endif
> +
>  XBZRLECacheStats xbzrle_counters;
>  
>  /* struct contains XBZRLE cache and a static page @@ -802,9 +831,21 
> @@ static int save_xbzrle_page(RAMState *rs, uint8_t **current_data,
>      memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE);
>  
>      /* XBZRLE encoding (if there is no overflow) */
> +    #if defined(CONFIG_AVX512BW_OPT)
> +    if (likely(IS_CPU_SUPPORT_AVX512BW)) {

All distributions are go to have compile time support for AVX, but I am not sure the percentage of machines that support avx

> +        encoded_len = xbzrle_encode_buffer_512(prev_cached_page, XBZRLE.current_buf,
> +                                               TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                               TARGET_PAGE_SIZE);
> +    } else {
> +        encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
> +                                           TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
> +                                           TARGET_PAGE_SIZE);
> +    }

the else part is the same than the #else part
> +    #else
>      encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf,
>                                         TARGET_PAGE_SIZE, XBZRLE.encoded_buf,
>                                         TARGET_PAGE_SIZE);
> +    #endif

So, why don't just create a new function pointer:

int (*xbzrle_encode_buffer_func)(uint8_t *old_buf, uint8_t *new_buf, int slen,
                                 uint8_t *dst, int dlen) = xbzrle_encode_buffer;


And aad into init_cpu_flag() something in the line of:

	xbzrle_encode_buffer_func = xbrrle_encode_buffer_512;

?


>      /*
>       * Update the cache contents, so that it corresponds to the data 
> diff --git a/migration/xbzrle.c b/migration/xbzrle.c index 
> 1ba482ded9..4db09fdbdb 100644
> --- a/migration/xbzrle.c
> +++ b/migration/xbzrle.c
> @@ -174,3 +174,184 @@ int xbzrle_decode_buffer(uint8_t *src, int slen, 
> uint8_t *dst, int dlen)
>  
>      return d;
>  }
> +
> +#if defined(CONFIG_AVX512BW_OPT)
> +#pragma GCC push_options
> +#pragma GCC target("avx512bw")
> +
> +#include <immintrin.h>
> +#include <math.h>
> +#define SET_ZERO512(r) r = _mm512_set1_epi32(0) int 
> +xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen) {

This is just personal taste, but I would rename this to:

xbzrle_encode_buffer_avx512?

> +    uint32_t zrun_len = 0, nzrun_len = 0;
> +    int d = 0, i = 0, num = 0;
> +    uint8_t *nzrun_start = NULL;
> +    int count512s = (slen >> 6);
> +    int res = slen % 64;

res variable here means residual, normally we use "res" with meaning of "result" in qemu.

> +    bool never_same = true;
> +    while (count512s--) {
> +        if (d + 2 > dlen) {
> +            return -1;
> +        }
> +        __m512i old_data = _mm512_mask_loadu_epi8(old_data,
> +                               0xffffffffffffffff, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(new_data,
> +                                                 0xffffffffffffffff, new_buf + i);
> +        /* in mask bit 1 for same, 0 for diff */
> +        __mmask64  comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = 64;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                if (comp == 0xffffffffffffffff) {
> +                    i += 64;
> +                    zrun_len += 64;
> +                    break;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* still has different data after same data */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +            if (never_same || zrun_len) {
> +                /*
> +                 * never_same only acts if
> +                 * data begins with diff in first count512s
> +                 */
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            if ((bytesToCheck == 64) && (comp == 0x0)) {
> +                i += 64;
> +                nzrun_len += 64;
> +                break;
> +            }
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                /* mask like 111000 */
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +    if (res) {
> +        /* the number of data is less than 64 */
> +        unsigned long long mask = pow(2, res);

Not your fault.

21st century.  Someone still use long long in a new API, sniff.

> +        mask -= 1;
> +        __m512i r = SET_ZERO512(r);
> +        __m512i old_data = _mm512_mask_loadu_epi8(r, mask, old_buf + i);
> +        __m512i new_data = _mm512_mask_loadu_epi8(r, mask, new_buf + i);
> +        __mmask64 comp = _mm512_cmpeq_epi8_mask(old_data, new_data);
> +
> +        int bytesToCheck = res;
> +        bool is_same = (comp & 0x1);
> +        while (bytesToCheck) {
> +            if (is_same) {
> +                if (nzrun_len) {
> +                    d += uleb128_encode_small(dst + d, nzrun_len);
> +                    if (d + nzrun_len > dlen) {
> +                        return -1;
> +                    }
> +                    nzrun_start = new_buf + i - nzrun_len;
> +                    memcpy(dst + d, nzrun_start, nzrun_len);
> +                    d += nzrun_len;
> +                    nzrun_len = 0;
> +                }
> +                never_same = false;
> +                num = __builtin_ctzl(~comp);
> +                num = (num < bytesToCheck) ? num : bytesToCheck;
> +                zrun_len += num;
> +                bytesToCheck -= num;
> +                comp >>= num;
> +                i += num;
> +                if (bytesToCheck) {
> +                    /* diff after same */
> +                    d += uleb128_encode_small(dst + d, zrun_len);
> +                    zrun_len = 0;
> +                } else {
> +                    break;
> +                }
> +            }
> +
> +            if (never_same || zrun_len) {
> +                d += uleb128_encode_small(dst + d, zrun_len);
> +                zrun_len = 0;
> +                never_same = false;
> +            }
> +            /* has diff */
> +            num = __builtin_ctzl(comp);
> +            num = (num < bytesToCheck) ? num : bytesToCheck;
> +            nzrun_len += num;
> +            bytesToCheck -= num;
> +            comp >>= num;
> +            i += num;
> +            if (bytesToCheck) {
> +                d += uleb128_encode_small(dst + d, nzrun_len);
> +                /* overflow */
> +                if (d + nzrun_len > dlen) {
> +                    return -1;
> +                }
> +                nzrun_start = new_buf + i - nzrun_len;
> +                memcpy(dst + d, nzrun_start, nzrun_len);
> +                d += nzrun_len;
> +                nzrun_len = 0;
> +                is_same = true;
> +            }
> +        }
> +    }
> +
> +    if (zrun_len) {
> +        return (zrun_len == slen) ? 0 : d;
> +    }
> +    if (nzrun_len != 0) {
> +        d += uleb128_encode_small(dst + d, nzrun_len);
> +        /* overflow */
> +        if (d + nzrun_len > dlen) {
> +            return -1;
> +        }
> +        nzrun_start = new_buf + i - nzrun_len;
> +        memcpy(dst + d, nzrun_start, nzrun_len);
> +        d += nzrun_len;
> +    }
> +    return d;
> +}
> +#pragma GCC pop_options
> +#endif
> \ No newline at end of file
> diff --git a/migration/xbzrle.h b/migration/xbzrle.h index 
> a0db507b9c..6247de5f00 100644
> --- a/migration/xbzrle.h
> +++ b/migration/xbzrle.h
> @@ -18,4 +18,8 @@ int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
>                           uint8_t *dst, int dlen);
>  
>  int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int 
> dlen);
> +#if defined(CONFIG_AVX512BW_OPT)
> +int xbzrle_encode_buffer_512(uint8_t *old_buf, uint8_t *new_buf, int slen,
> +                             uint8_t *dst, int dlen); #endif
>  #endif
Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by Richard Henderson 3 years, 6 months ago
On 8/9/22 00:51, Xu, Ling1 wrote:
> Hi, Juan,
>        Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable;

You can remove this variable entirely...

> 2) use a function pointer to simplify code in ram.c;

... because it's redundant with the function pointer.


r~
RE: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function
Posted by Xu, Ling1 3 years, 6 months ago
Hi, Richard,
      Thanks for your nice comments! Your suggestions are very helpful. We have revised code in ram.c according to your comments. As for "unroll residual from main loop" problem in algorithm, we will fix this later. Thanks for your time and patience~

Best Regards,
Ling

-----Original Message-----
From: Richard Henderson <richard.henderson@linaro.org> 
Sent: Wednesday, August 10, 2022 2:25 AM
To: Xu, Ling1 <ling1.xu@intel.com>; quintela@redhat.com
Cc: qemu-devel@nongnu.org; dgilbert@redhat.com; Zhao, Zhou <zhou.zhao@intel.com>; Jin, Jun I <jun.i.jin@intel.com>
Subject: Re: [PATCH v3 1/2] Update AVX512 support for xbzrle_encode_buffer function

On 8/9/22 00:51, Xu, Ling1 wrote:
> Hi, Juan,
>        Thanks for your advice. We have revised our code including: 1) change "IS_CPU_SUPPORT_AVX512BW" to "is_cpu_support_avx512bw" to indicate that variable isn't global variable;

You can remove this variable entirely...

> 2) use a function pointer to simplify code in ram.c;

... because it's redundant with the function pointer.


r~