From nobody Wed Apr 8 07:59:17 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 000EAC67871 for ; Mon, 24 Oct 2022 22:13:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231690AbiJXWN2 (ORCPT ); Mon, 24 Oct 2022 18:13:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38178 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232565AbiJXWMw (ORCPT ); Mon, 24 Oct 2022 18:12:52 -0400 Received: from mail-vk1-xa31.google.com (mail-vk1-xa31.google.com [IPv6:2607:f8b0:4864:20::a31]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 334E230501E for ; Mon, 24 Oct 2022 13:28:42 -0700 (PDT) Received: by mail-vk1-xa31.google.com with SMTP id m18so2112932vka.10 for ; Mon, 24 Oct 2022 13:28:41 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=FtjfA87GphDJcI4Vb9iAM+chCBCFrIrn+2UzKfCSzkM=; b=k4bz2u0erMG2lsu2Psoe5/usHLFPMqhFmHnRrJe0M+hB86PaLpQ7eCFfAzFmdzz1Qv q7ZZnGR1H6LfaNjZ/JrHA77mttFZoADtt6TAJK0F9q6u5z66+75iRGMyo+8GHDPT9p4P mFbGQjjLg/uKEg8zJpOSxyXDw2AJWwwea8IGNnaBLli3KvHu8L1bLPvbltmjMRVxTpXD 3y6RGGHjtq6aFJSiYUHIGPZ7cMF3d6d7RyrWfuHcH1+8p4JuWM4Pz4qzFjJo2GrtB1qm 5oweYcc5sy5AtXjxueMZPMQLluXjjp3PVWMS8Zp9AY9c10z2DxBq3O+uA5RJ6HvXyKOj Kmnw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=FtjfA87GphDJcI4Vb9iAM+chCBCFrIrn+2UzKfCSzkM=; b=dFRjmZ0+Yyq7Zcir/HvAC7lcTlEBfAvyeUdcQKk3BjfdEncYnq3QbG/abegoIi8R0L 
QEU+hOuwiL8BlvCEA6JWC5cRCpBMed6SWitBh9wQG2KoNIsW1y6iwZNZClLC+Q11c4jV 8cJfpFYWCshbcdBcDMJJFTbxAUGEduirz9+tx/9mq83mI8xEY7ACKmWBrZ0xQ/yR77ml zVCP4uK486vOkhK0nErX/GR9p2vzwWCQQ3zNIeXTa/p6hrBBqztFbXmhYuEiWsPvbWWa Wz1wxSnmarU3q+uQ/kScqVbyL56SxjgOlI33OrtnqJYYHk5Xjzf7HidcnXF5DYdDT55O SMpQ== X-Gm-Message-State: ACrzQf2uJqxV6LC9eU7sPerx3iXnqoJdVm9l4k0gdpoqBQHyp5qKVTvb 1oq+Axs1Qyu+Q8UXlEVL7UYtSm+dp7Q= X-Google-Smtp-Source: AMsMyM5LGD+kKoUy+UyXdcaHRMuZb+H9htNgOT9oSVdgZ1XZ1DqfomlJfGo6OT3w8iAWsBX5oQp0ww== X-Received: by 2002:a17:902:ce0e:b0:17d:a730:3835 with SMTP id k14-20020a170902ce0e00b0017da7303835mr36674422plg.131.1666642613053; Mon, 24 Oct 2022 13:16:53 -0700 (PDT) Received: from localhost.localdomain (c-98-35-160-214.hsd1.ca.comcast.net. [98.35.160.214]) by smtp.gmail.com with ESMTPSA id k14-20020aa7972e000000b0056bb4dc8164sm173518pfg.193.2022.10.24.13.16.51 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 24 Oct 2022 13:16:52 -0700 (PDT) From: Nick Terrell To: Nick Terrell Cc: linux-kernel@vger.kernel.org, Nick Terrell , =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= , David Sterba , Sam Hardeman , Kernel Team Subject: [PATCH 1/2] zstd: Move zstd-common module exports to zstd_common_module.c Date: Mon, 24 Oct 2022 13:26:05 -0700 Message-Id: <20221024202606.404049-2-nickrterrell@gmail.com> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221024202606.404049-1-nickrterrell@gmail.com> References: <20221024202606.404049-1-nickrterrell@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Nick Terrell The zstd codebase is imported from the upstream zstd repo, and is over-writ= ten on every update. Upstream keeps the kernel specific code separate from the main library. So the module definition is moved into the zstd_common_module.c fi= le. This matches the pattern followed by the zstd-compress and zstd-decompress = files. 
I've done build and boot testing on x86-64, i386, and aarch64. I've verified that zstd built both as modules and built-in build and boot. Signed-off-by: Nick Terrell --- lib/zstd/Makefile | 1 + lib/zstd/common/entropy_common.c | 4 ---- lib/zstd/common/zstd_common.c | 10 ---------- lib/zstd/zstd_common_module.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 14 deletions(-) create mode 100644 lib/zstd/zstd_common_module.c diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 440bd0007ae2..20f08c644b71 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -35,6 +35,7 @@ zstd_decompress-y :=3D \ decompress/zstd_decompress_block.o \ =20 zstd_common-y :=3D \ + zstd_common_module.o \ common/debug.o \ common/entropy_common.o \ common/error_private.o \ diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_com= mon.c index a311808c0d56..6353249de614 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -15,7 +15,6 @@ /* ************************************* * Dependencies ***************************************/ -#include #include "mem.h" #include "error_private.h" /* ERR_*, ERROR */ #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ @@ -240,7 +239,6 @@ size_t FSE_readNCount( { return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, h= eaderBuffer, hbSize, /* bmi2 */ 0); } -EXPORT_SYMBOL_GPL(FSE_readNCount); =20 /*! HUF_readStats() : Read compact Huffman tree, saved by HUF_writeCTable(). 
@@ -256,7 +254,6 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U= 32* rankStats, U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr,= tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); } -EXPORT_SYMBOL_GPL(HUF_readStats); =20 FORCE_INLINE_TEMPLATE size_t HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, @@ -357,4 +354,3 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSi= ze, U32* rankStats, (void)bmi2; return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSym= bolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } -EXPORT_SYMBOL_GPL(HUF_readStats_wksp); diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c index 0f1f63be25d9..3d7e35b309b5 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -13,7 +13,6 @@ /*-************************************* * Dependencies ***************************************/ -#include #define ZSTD_DEPS_NEED_MALLOC #include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_mems= et */ #include "error_private.h" @@ -36,17 +35,14 @@ const char* ZSTD_versionString(void) { return ZSTD_VERS= ION_STRING; } * tells if a return value is an error code * symbol is required for external callers */ unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } -EXPORT_SYMBOL_GPL(ZSTD_isError); =20 /*! ZSTD_getErrorName() : * provides error code string from function result (useful for debugging)= */ const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code)= ; } -EXPORT_SYMBOL_GPL(ZSTD_getErrorName); =20 /*! ZSTD_getError() : * convert a `size_t` function result into a proper ZSTD_errorCode enum */ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(co= de); } -EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); =20 /*! 
ZSTD_getErrorString() : * provides error code string from enum */ @@ -63,7 +59,6 @@ void* ZSTD_customMalloc(size_t size, ZSTD_customMem custo= mMem) return customMem.customAlloc(customMem.opaque, size); return ZSTD_malloc(size); } -EXPORT_SYMBOL_GPL(ZSTD_customMalloc); =20 void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) { @@ -76,7 +71,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem custo= mMem) } return ZSTD_calloc(1, size); } -EXPORT_SYMBOL_GPL(ZSTD_customCalloc); =20 void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) { @@ -87,7 +81,3 @@ void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ZSTD_free(ptr); } } -EXPORT_SYMBOL_GPL(ZSTD_customFree); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c new file mode 100644 index 000000000000..22686e367e6f --- /dev/null +++ b/lib/zstd/zstd_common_module.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include + +#include "common/huf.h" +#include "common/fse.h" +#include "common/zstd_internal.h" + +// Export symbols shared by compress and decompress into a common module + +#undef ZSTD_isError /* defined within zstd_internal.h */ +EXPORT_SYMBOL_GPL(FSE_readNCount); +EXPORT_SYMBOL_GPL(HUF_readStats); +EXPORT_SYMBOL_GPL(HUF_readStats_wksp); +EXPORT_SYMBOL_GPL(ZSTD_isError); +EXPORT_SYMBOL_GPL(ZSTD_getErrorName); +EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +EXPORT_SYMBOL_GPL(ZSTD_customFree); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Common"); --=20 2.38.1 From nobody Wed Apr 8 07:59:17 2026 Return-Path: Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CD202C38A2D for ; Mon, 24 Oct 2022 22:05:23 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231150AbiJXWFW (ORCPT ); Mon, 24 Oct 2022 18:05:22 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:36170 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231978AbiJXWEz (ORCPT ); Mon, 24 Oct 2022 18:04:55 -0400 Received: from mail-pl1-x62d.google.com (mail-pl1-x62d.google.com [IPv6:2607:f8b0:4864:20::62d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 851879B877 for ; Mon, 24 Oct 2022 13:18:37 -0700 (PDT) Received: by mail-pl1-x62d.google.com with SMTP id p3so8457931pld.10 for ; Mon, 24 Oct 2022 13:18:37 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=aB3pyQlqU5SvdVBqjlavg6jCXdGT84BHAaSWMzVkXOo=; b=YLYcPh3xqTG9AeKDSkjtmLRFUlHAUM8tfpWOlUo0ZXdZIAI6rFABrhvJWJpsqmZnV4 Z26OsfqsppHuX2fctY+4eBWExvZLQba53Wk2E6FCgNl98lOL4nmj7jIcOv60wA5ZxB9K 
K78uqapxkvEYLhhkjeMcU6JjF3SiS4IV4rzLlfkWn6/29aqsmZK9XigdbY1Jd0OLge3K aYYM0Tq9hwXw2muFyd/SQBbTNZjdZ5dKtW9ezOQybcQFlttRpi8ITnE36hYNw+rq7HI9 sQxeK2zfC592FO3lMm2x0gy2WiZcfmzLtCdPSH/HFHSZN96kjbZ9av+8OcDD4kluUAup 7ugA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=aB3pyQlqU5SvdVBqjlavg6jCXdGT84BHAaSWMzVkXOo=; b=LTRStc4dfx+sL4/vgKvm0TLaH5ch+hLd8O2aj4lG3Zsj53ofYOxwDfRJK94yEJ9fMr +YaDb+3ZDCMMdABboxEPRwZpIqKvpf4g0pJSCcqNUWjEAG6Dz5J6w4bPQEBx44J0bQRS MeWjRI88lbO4r6XqMaGW5TZS89XmDfWXGTt7tIMgmuGNc7t4SHxT8wU+19OMeV5AZBhp EmOUet0nda8QEChinuLG9SghV4GkmBRRNOHru1NgoTyKENepxjz5eHfhTsZDkFavIuNx cHK2gjTjKJKMtuUJ5RRpXuxWrlqMjqs8bFRd/oRff/nhkwYvnxCrUFmgMCgXacgWqdFf w0iw== X-Gm-Message-State: ACrzQf1LDbjjXtfcqn4L1Udm7fkGkmJ8/P+MNBgBW868jEWXS+F27YB5 b2O9tFny+wf5XdWsKz0uFSM= X-Google-Smtp-Source: AMsMyM6ac3ZzzIv1sM0wR5ZU2+PHPb0yp84WErkBW2Z+dFRhw+IGwZY+Zpgpvn03lom549Z9j1g03A== X-Received: by 2002:a17:90b:1e11:b0:20d:90b3:45a0 with SMTP id pg17-20020a17090b1e1100b0020d90b345a0mr72718807pjb.29.1666642615999; Mon, 24 Oct 2022 13:16:55 -0700 (PDT) Received: from localhost.localdomain (c-98-35-160-214.hsd1.ca.comcast.net. 
[98.35.160.214]) by smtp.gmail.com with ESMTPSA id k14-20020aa7972e000000b0056bb4dc8164sm173518pfg.193.2022.10.24.13.16.53 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 24 Oct 2022 13:16:55 -0700 (PDT) From: Nick Terrell To: Nick Terrell Cc: linux-kernel@vger.kernel.org, Nick Terrell , =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= , David Sterba , Sam Hardeman , Kernel Team Subject: [PATCH 2/2] zstd: import upstream v1.5.2 Date: Mon, 24 Oct 2022 13:26:06 -0700 Message-Id: <20221024202606.404049-3-nickrterrell@gmail.com> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221024202606.404049-1-nickrterrell@gmail.com> References: <20221024202606.404049-1-nickrterrell@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Nick Terrell Updates the kernel's zstd library to v1.5.2, the latest zstd release. The upstream tag it is updated to is `v1.5.2-kernel`, which contains several cherry-picked commits on top of the v1.5.2 release which are required for the kernel update. I will create this tag once the PR is ready to merge, until then reference the temporary upstream branch `v1.5.2-kernel-cherrypicks`. I plan to submit this patch as part of the v6.2 merge window. I've done basic build testing & testing on x86-64, i386, and aarch64. I'm merging these patches into my `zstd-next` branch, which is pulled into `linux-next` for further testing. I've benchmarked BtrFS with zstd compression on a x86-64 machine, and saw these results. Decompression speed is a small win across the board. The lower compression levels 1-4 see both compression speed and compression ratio wins. The higher compression levels see a small compression speed loss and about neutral ratio. I expect the lower compression levels to be used much more heavily than the high compression levels, so this should be a net win. 
Level CTime DTime Ratio 1 -2.95% -1.1% -0.7% 3 -3.5% -1.2% -0.5% 5 +3.7% -1.0% +0.0% 7 +3.2% -0.9% +0.0% 9 -4.3% -0.8% +0.1% Signed-off-by: Nick Terrell --- include/linux/zstd_lib.h | 479 ++-- lib/zstd/common/bitstream.h | 9 + lib/zstd/common/compiler.h | 67 +- lib/zstd/common/entropy_common.c | 7 +- lib/zstd/common/error_private.h | 81 +- lib/zstd/common/fse.h | 3 +- lib/zstd/common/fse_decompress.c | 2 +- lib/zstd/common/huf.h | 46 +- lib/zstd/common/mem.h | 2 + lib/zstd/common/portability_macros.h | 93 + lib/zstd/common/zstd_internal.h | 175 +- lib/zstd/compress/clevels.h | 132 ++ lib/zstd/compress/fse_compress.c | 83 +- lib/zstd/compress/huf_compress.c | 644 +++++- lib/zstd/compress/zstd_compress.c | 2000 +++++++++++++---- lib/zstd/compress/zstd_compress_internal.h | 375 +++- lib/zstd/compress/zstd_compress_literals.c | 9 +- lib/zstd/compress/zstd_compress_literals.h | 4 +- lib/zstd/compress/zstd_compress_sequences.c | 31 +- lib/zstd/compress/zstd_compress_superblock.c | 295 +-- lib/zstd/compress/zstd_cwksp.h | 225 +- lib/zstd/compress/zstd_double_fast.c | 413 +++- lib/zstd/compress/zstd_fast.c | 441 ++-- lib/zstd/compress/zstd_lazy.c | 1352 ++++++++--- lib/zstd/compress/zstd_lazy.h | 38 + lib/zstd/compress/zstd_ldm.c | 76 +- lib/zstd/compress/zstd_ldm.h | 1 + lib/zstd/compress/zstd_ldm_geartab.h | 5 +- lib/zstd/compress/zstd_opt.c | 402 ++-- lib/zstd/decompress/huf_decompress.c | 912 ++++++-- lib/zstd/decompress/zstd_decompress.c | 80 +- lib/zstd/decompress/zstd_decompress_block.c | 1022 +++++++-- lib/zstd/decompress/zstd_decompress_block.h | 10 +- .../decompress/zstd_decompress_internal.h | 38 +- lib/zstd/decompress_sources.h | 6 + lib/zstd/zstd_compress_module.c | 6 +- 36 files changed, 6955 insertions(+), 2609 deletions(-) create mode 100644 lib/zstd/common/portability_macros.h create mode 100644 lib/zstd/compress/clevels.h diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index b8c7dbf98390..79d55465d5c1 100644 --- a/include/linux/zstd_lib.h 
+++ b/include/linux/zstd_lib.h @@ -17,8 +17,16 @@ =20 =20 /* =3D=3D=3D=3D=3D ZSTDLIB_API : control library symbols visibility = =3D=3D=3D=3D=3D */ -#define ZSTDLIB_VISIBILITY=20 -#define ZSTDLIB_API ZSTDLIB_VISIBILITY +#ifndef ZSTDLIB_VISIBLE +# if (__GNUC__ >=3D 4) && !defined(__MINGW32__) +# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) +# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDLIB_VISIBLE +# define ZSTDLIB_HIDDEN +# endif +#endif +#define ZSTDLIB_API ZSTDLIB_VISIBLE =20 =20 /* ***********************************************************************= ****** @@ -56,8 +64,8 @@ =20 /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 4 -#define ZSTD_VERSION_RELEASE 10 +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 2 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_M= INOR *100 + ZSTD_VERSION_RELEASE) =20 /*! ZSTD_versionNumber() : @@ -94,7 +102,6 @@ ZSTDLIB_API const char* ZSTD_versionString(void); #define ZSTD_BLOCKSIZE_MAX (1<=3D first frame size * @return : the compressed size of the first frame starting at `src`, @@ -165,8 +172,9 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const v= oid* src, size_t srcSize) ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum c= ompressed size in worst case single-pass scenario */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if = a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides = readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum n= egative compression level allowed */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum n= egative compression level allowed, requires v1.4.0+ */ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum c= ompression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default c= ompression level, specified by 
ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ =20 =20 /* ************************************* @@ -219,9 +227,9 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, const void* src, size_t srcSize); =20 =20 -/* ************************************* -* Advanced compression API -***************************************/ +/* ******************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ =20 /* API design : * Parameters are pushed one by one into an existing context, @@ -232,7 +240,7 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, * * It's possible to reset all parameters to "default" using ZSTD_CCtx_re= set(). * - * This API supercedes all other "advanced" API entry points in the expe= rimental section. + * This API supersedes all other "advanced" API entry points in the expe= rimental section. * In the future, we expect to remove from experimental API entry points= which are redundant with this API. */ =20 @@ -251,7 +259,6 @@ typedef enum { ZSTD_fast=3D1, Only the order (from fast to strong) is guarantee= d */ } ZSTD_strategy; =20 - typedef enum { =20 /* compression parameters @@ -317,7 +324,6 @@ typedef enum { * The higher the value of selected strategy,= the more complex it is, * resulting in stronger and slower compressi= on. * Special: value 0 means "use default strate= gy". */ - /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=3D160, /* Enable long distance match= ing. * This parameter is designed to impro= ve compression ratio @@ -374,7 +380,7 @@ typedef enum { ZSTD_c_jobSize=3D401, /* Size of a compression job. This value is= enforced only when nbWorkers >=3D 1. * Each compression job is completed in paral= lel, so this value can indirectly impact the nb of active threads. * 0 means default, which is dynamically dete= rmined based on compression parameters. - * Job size must be a minimum of overlap size= , or 1 MB, whichever is largest. 
+ * Job size must be a minimum of overlap size= , or ZSTDMT_JOBSIZE_MIN (=3D 512 KB), whichever is largest. * The minimum size is automatically and tran= sparently enforced. */ ZSTD_c_overlapLog=3D402, /* Control the overlap size, as a fraction = of window size. * The overlap size is an amount of data relo= aded from previous job at the beginning of a new job. @@ -404,6 +410,8 @@ typedef enum { * ZSTD_c_stableOutBuffer * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder * Because they are not stable, it's necessary to define ZSTD_STATIC_L= INKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still= change. @@ -419,7 +427,10 @@ typedef enum { ZSTD_c_experimentalParam9=3D1006, ZSTD_c_experimentalParam10=3D1007, ZSTD_c_experimentalParam11=3D1008, - ZSTD_c_experimentalParam12=3D1009 + ZSTD_c_experimentalParam12=3D1009, + ZSTD_c_experimentalParam13=3D1010, + ZSTD_c_experimentalParam14=3D1011, + ZSTD_c_experimentalParam15=3D1012 } ZSTD_cParameter; =20 typedef struct { @@ -504,9 +515,9 @@ ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, const void* src, size_t srcSize); =20 =20 -/* ************************************* -* Advanced decompression API -***************************************/ +/* ********************************************* +* Advanced decompression API (Requires v1.4.0+) +************************************************/ =20 /* The advanced API pushes parameters one by one into an existing DCtx con= text. * Parameters are sticky, and remain valid for all following frames @@ -668,7 +679,7 @@ typedef enum { : note : multithreaded compression will block to f= lush as much output as possible. */ } ZSTD_EndDirective; =20 -/*! ZSTD_compressStream2() : +/*! ZSTD_compressStream2() : Requires v1.4.0+ * Behaves about the same as ZSTD_compressStream, with additional control= on end directive. 
* - Compression parameters are pushed into CCtx before starting compress= ion, using ZSTD_CCtx_set*() * - Compression parameters cannot be changed once compression is started= (save a list of exceptions in multi-threading mode) @@ -714,11 +725,11 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< r= ecommended size for output =20 =20 /* ***********************************************************************= ****** - * This following is a legacy streaming API. + * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Advanced parameters and dictionary compression can only be used through= the - * new API. + * Streaming in combination with advanced parameters and dictionary compre= ssion + * can only be used through the new API. *************************************************************************= *****/ =20 /*! @@ -796,7 +807,7 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< re= commended size for output /*! ZSTD_compress_usingDict() : * Compression at an explicit compression level using a Dictionary. * A dictionary can be any arbitrary data segment (also called a prefix), - * or a buffer with specified information (see dictBuilder/zdict.h). + * or a buffer with specified information (see zdict.h). * Note : This function loads the dictionary, resulting in significant st= artup delay. * It's intended for a dictionary used only once. * Note 2 : When `dict =3D=3D NULL || dictSize < 8` no dictionary is used= . */ @@ -879,19 +890,25 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DC= tx* dctx, * Dictionary helper functions *******************************/ =20 -/*! ZSTD_getDictID_fromDict() : +/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ * Provides the dictID stored within dictionary. * if @return =3D=3D 0, the dictionary is not conformant with Zstandard s= pecification. 
* It can still be loaded, but as a content-only dictionary. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dict= Size); =20 -/*! ZSTD_getDictID_fromDDict() : +/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ * Provides the dictID of the dictionary loaded into `ddict`. * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); =20 -/*! ZSTD_getDictID_fromFrame() : +/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ * Provides the dictID required to decompressed the frame stored within `= src`. * If @return =3D=3D 0, the dictID could not be decoded. * This could for one of the following reasons : @@ -905,16 +922,16 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const v= oid* src, size_t srcSize); =20 =20 /* ***********************************************************************= ****** - * Advanced dictionary and prefix API + * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky,= and + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sti= cky, and * only reset with the context is reset with ZSTD_reset_parameters or * ZSTD_reset_session_and_parameters. Prefixes are single-use. *************************************************************************= *****/ =20 =20 -/*! ZSTD_CCtx_loadDictionary() : +/*! 
ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ * Create an internal CDict from `dict` buffer. * Decompression will have to use same dictionary. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -933,7 +950,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const voi= d* src, size_t srcSize); * to precisely select how dictionary content must be interprete= d. */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* d= ict, size_t dictSize); =20 -/*! ZSTD_CCtx_refCDict() : +/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used for all next compressed fr= ames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. @@ -947,7 +964,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* = cctx, const void* dict, s * Note 2 : CDict is just referenced, its lifetime must outlive its usage= within CCtx. */ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* c= dict); =20 -/*! ZSTD_CCtx_refPrefix() : +/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) for next compressed frame. * A prefix is **only used once**. Tables are discarded at end of frame (= ZSTD_e_end). * Decompression will need same prefix to properly regenerate data. @@ -968,7 +985,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, = const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); =20 -/*! ZSTD_DCtx_loadDictionary() : +/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ * Create an internal DDict from dict buffer, * to be used to decompress next frames. * The dictionary remains valid for all future frames, until explicitly i= nvalidated. 
@@ -985,7 +1002,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, */ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* d= ict, size_t dictSize); =20 -/*! ZSTD_DCtx_refDDict() : +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used to decompress next frames. * The dictionary remains active for decompression of future frames using= same DCtx. * @@ -1003,7 +1020,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx= * dctx, const void* dict, s */ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* d= dict); =20 -/*! ZSTD_DCtx_refPrefix() : +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) to decompress next frame. * This is the reverse operation of ZSTD_CCtx_refPrefix(), * and must use the same prefix as the one used during compression. @@ -1024,7 +1041,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dct= x, =20 /* =3D=3D=3D Memory management =3D=3D=3D */ =20 -/*! ZSTD_sizeof_*() : +/*! ZSTD_sizeof_*() : Requires v1.4.0+ * These functions give the _current_ memory usage of selected object. * Note that object memory usage can evolve (increase or decrease) over t= ime. */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); @@ -1049,6 +1066,29 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDic= t* ddict); #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY =20 +/* This can be overridden externally to hide static symbols. */ +#ifndef ZSTDLIB_STATIC_API +#define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE +#endif + +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable= them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_W= ARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
+ */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecat= ion warnings */ +#else +# if (defined(GNUC) && (GNUC > 4 || (GNUC =3D=3D 4 && GNUC_MINOR >=3D 5))= ) || defined(__clang__) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((dep= recated(message))) +# elif (__GNUC__ >=3D 3) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((dep= recated)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for th= is compiler") +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /* ***********************************************************************= *************** * experimental API (static linking only) *************************************************************************= *************** @@ -1111,9 +1151,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict= * ddict); #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX =20 -/* internal */ -#define ZSTD_HASHLOG3_MAX 17 - =20 /* --- Advanced types --- */ =20 @@ -1255,6 +1292,15 @@ typedef enum { ZSTD_lcm_uncompressed =3D 2 /*< Always emit uncompressed literals. */ } ZSTD_literalCompressionMode_e; =20 +typedef enum { + /* Note: This enum controls features which are conditionally beneficial.= Zstd typically will make a final + * decision on whether or not to enable the feature (ZSTD_ps_auto), but = setting the switch to ZSTD_ps_enable + * or ZSTD_ps_disable allow for a force enable/disable the feature. 
+ */ + ZSTD_ps_auto =3D 0, /* Let the library automatically determine w= hether the feature shall be enabled */ + ZSTD_ps_enable =3D 1, /* Force-enable the feature */ + ZSTD_ps_disable =3D 2 /* Do not use the feature */ +} ZSTD_paramSwitch_e; =20 /* ************************************* * Frame size functions @@ -1281,7 +1327,7 @@ typedef enum { * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it= must traverse the input to * read each contained frame header. This is fast as most of t= he data is skipped, * however it does mean that all frame data must be present and= valid. */ -ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, = size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void= * src, size_t srcSize); =20 /*! ZSTD_decompressBound() : * `src` should point to the start of a series of ZSTD encoded and/or ski= ppable frames @@ -1296,13 +1342,13 @@ ZSTDLIB_API unsigned long long ZSTD_findDecompresse= dSize(const void* src, size_t * note 3 : when the decompressed size field isn't available, the upper-= bound for that frame is calculated by: * upper-bound =3D # blocks * min(128 KB, Window_Size) */ -ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_= t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src= , size_t srcSize); =20 /*! ZSTD_frameHeaderSize() : * srcSize must be >=3D ZSTD_FRAMEHEADERSIZE_PREFIX. 
* @return : size of the Frame Header, * or an error code (if srcSize is too small) */ -ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t src= Size); =20 typedef enum { ZSTD_sf_noBlockDelimiters =3D 0, /* Representation of ZSTD_Seque= nce has no block delimiters, sequences only */ @@ -1325,12 +1371,12 @@ typedef enum { * @return : number of sequences generated */ =20 -ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* ou= tSeqs, +ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Seque= nce* outSeqs, size_t outSeqsSize, const void* = src, size_t srcSize); =20 /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent bl= ock delimiters/last literals - * by merging them into into the literals of the next sequence. + * by merging them into the literals of the next sequence. * * As such, the final generated result has no explicit representation of b= lock boundaries, * and the final last literals segment is not represented in the sequences. @@ -1339,7 +1385,7 @@ ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* = zc, ZSTD_Sequence* outSeqs, * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters * @return : number of sequences left after merging */ -ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, siz= e_t seqsSize); +ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequenc= es, size_t seqsSize); =20 /*! ZSTD_compressSequences() : * Compress an array of ZSTD_Sequence, generated from the original source = buffer, into dst. @@ -1369,7 +1415,7 @@ ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Seq= uence* sequences, size_t se * and cannot emit an RLE block that disagrees with the repcode hi= story * @return : final compressed size or a ZSTD error. 
*/ -ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst= , size_t dstSize, +ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, vo= id* dst, size_t dstSize, const ZSTD_Sequence* inSeqs, size_t inSe= qsSize, const void* src, size_t srcSize); =20 @@ -1377,7 +1423,7 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* = const cctx, void* dst, size /*! ZSTD_writeSkippableFrame() : * Generates a zstd skippable frame containing data given by src, and writ= es it to dst buffer. * - * Skippable frames begin with a a 4-byte magic number. There are 16 possi= ble choices of magic number, + * Skippable frames begin with a 4-byte magic number. There are 16 possibl= e choices of magic number, * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+1= 5. * As such, the parameter magicVariant controls the exact skippable frame = magic number variant used, so * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. @@ -1387,9 +1433,29 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx*= const cctx, void* dst, size * * @return : number of bytes written or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCa= pacity, const void* src, size_t srcSiz= e, unsigned magicVariant); =20 +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writ= es it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was suppl= ied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the= caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the f= rame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, = unsigned* magicVariant, + const void* src, size_t srcSiz= e); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier = for a skippable frame. + */ +ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size= ); + + =20 /* ************************************* * Memory management @@ -1418,10 +1484,10 @@ ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* d= st, size_t dstCapacity, * Note 2 : only single-threaded compression is supported. * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if Z= STD_c_nbWorkers is >=3D 1. */ -ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionPara= meters cParams); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_p= arams* params); -ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compress= ionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD= _CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); =20 /*! ZSTD_estimateCStreamSize() : * ZSTD_estimateCStreamSize() will provide a budget large enough for any = compression level up to selected one. @@ -1436,20 +1502,20 @@ ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not= estimated here. 
* In this case, get total size by adding ZSTD_estimate?DictSize */ -ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionP= arameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCt= x_params* params); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, siz= e_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compr= essionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const Z= STD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* s= rc, size_t srcSize); =20 /*! ZSTD_estimate?DictSize() : * ZSTD_estimateCDictSize() will bet that src size is relatively "small",= and content is copied, like ZSTD_createCDict(). * ZSTD_estimateCDictSize_advanced() makes it possible to control compres= sion parameters precisely, like ZSTD_createCDict_advanced(). * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logica= lly smaller. 
*/ -ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compression= Level); -ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_c= ompressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); -ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMe= thod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int comp= ressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize,= ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dic= tLoadMethod_e dictLoadMethod); =20 /*! ZSTD_initStatic*() : * Initialize an object using a pre-allocated fixed-size buffer. @@ -1472,20 +1538,20 @@ ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t di= ctSize, ZSTD_dictLoadMethod_e * Limitation 2 : static cctx currently not compatible with multi-threadi= ng. * Limitation 3 : static dctx is incompatible with legacy support. 
*/ -ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t work= spaceSize); -ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t w= orkspaceSize); /*< same as ZSTD_initStaticCCtx() */ +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size= _t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, s= ize_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ =20 -ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t work= spaceSize); -ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t w= orkspaceSize); /*< same as ZSTD_initStaticDCtx() */ +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size= _t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, s= ize_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ =20 -ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( +ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict( void* workspace, size_t workspaceS= ize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMeth= od, ZSTD_dictContentType_e dictContent= Type, ZSTD_compressionParameters cParams= ); =20 -ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( +ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict( void* workspace, size_t workspaceS= ize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMeth= od, @@ -1504,44 +1570,44 @@ static __attribute__((__unused__)) ZSTD_customMem const ZSTD_defaultCMem =3D { NULL, NULL, NULL }; /*< this = constant defers to stdlib's functions */ =20 -ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMe= m); -ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem custo= mMem); -ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMe= m); -ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem custo= mMem); +ZSTDLIB_STATIC_API ZSTD_CCtx* 
ZSTD_createCCtx_advanced(ZSTD_customMem c= ustomMem); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMe= m customMem); +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem c= ustomMem); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMe= m customMem); =20 -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t= dictSize, +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict,= size_t dictSize, ZSTD_dictLoadMethod_e di= ctLoadMethod, ZSTD_dictContentType_e d= ictContentType, ZSTD_compressionParamete= rs cParams, ZSTD_customMem customMem= ); =20 -/* ! Thread pool : - * These prototypes make it possible to share a thread pool among multiple= compression contexts. - * This can limit resources for applications with multiple threads where e= ach one uses - * a threaded compression mode (via ZSTD_c_nbWorkers parameter). - * ZSTD_createThreadPool creates a new thread pool with a given number of = threads. - * Note that the lifetime of such pool must exist while being used. - * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL ar= gument value - * to use an internal thread pool). - * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. +/*! Thread pool : + * These prototypes make it possible to share a thread pool among multipl= e compression contexts. + * This can limit resources for applications with multiple threads where = each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of= threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL a= rgument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. 
*/ typedef struct POOL_ctx_s ZSTD_threadPool; -ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); -ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept N= ULL pointer */ -ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPoo= l* pool); +ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThread= s); +ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* a= ccept NULL pointer */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_th= readPool* pool); =20 =20 /* * This API is temporary and is expected to change or disappear in the fut= ure! */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, const ZSTD_CCtx_params* cctxParams, ZSTD_customMem customMem); =20 -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -1558,28 +1624,22 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( * As a consequence, `dictBuffer` **must** outlive CDict, * and its content must remain unmodified throughout the lifetime of CDic= t. * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod= =3D=3DZSTD_dlm_byRef */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffe= r, size_t dictSize, int compressionLevel); - -/*! ZSTD_getDictID_fromCDict() : - * Provides the dictID of the dictionary loaded into `cdict`. - * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. - * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. 
*/ -ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* di= ctBuffer, size_t dictSize, int compressionLevel); =20 /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression= level and estimated srcSize. * `estimatedSrcSize` value is optional, select 0 if not known */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLeve= l, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compress= ionLevel, unsigned long long estimatedSrcSize, size_t dictSize); =20 /*! ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object= instead of sub-component `ZSTD_compressionParameters`. * All fields of `ZSTD_frameParameters` are set to default : contentSize= =3D1, checksum=3D0, noDictID=3D0 */ -ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned = long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, un= signed long long estimatedSrcSize, size_t dictSize); =20 /*! ZSTD_checkCParams() : * Ensure param values remain within authorized range. * @return 0 on success, or an error code (can be checked with ZSTD_isErro= r()) */ -ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters par= ams); =20 /*! ZSTD_adjustCParams() : * optimize params for a given `srcSize` and `dictSize`. @@ -1587,23 +1647,25 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressi= onParameters params); * `dictSize` must be `0` when there is no dictionary. * cPar can be invalid : all parameters will be clamped within valid rang= e in the @return struct. 
* This function never fails (wide contract) */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compression= Parameters cPar, unsigned long long srcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_comp= ressionParameters cPar, unsigned long long srcSize, size_t dictSize); =20 /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZST= D_CCtx_setParameter() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation w= arning on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2") +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, ZSTD_parameters params); =20 /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now REDUNDANT. + * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZST= D_CCtx_loadDictionary() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation w= arning in some future version */ -ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacit= y, const void* src, size_t srcSize, const ZSTD_CDict* cdict, @@ -1613,18 +1675,18 @@ ZSTDLIB_API size_t ZSTD_compress_usingCDict_advance= d(ZSTD_CCtx* cctx, /*! ZSTD_CCtx_loadDictionary_byReference() : * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenc= ed, instead of being copied into CCtx. 
* It saves some memory, but also requires that `dict` outlives its usage= within `cctx` */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, c= onst void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* = cctx, const void* dict, size_t dictSize); =20 /*! ZSTD_CCtx_loadDictionary_advanced() : * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?= ) */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, cons= t void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_d= ictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cct= x, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod,= ZSTD_dictContentType_e dictContentType); =20 /*! ZSTD_CCtx_refPrefix_advanced() : * Same as ZSTD_CCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) = ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const voi= d* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, co= nst void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType= ); =20 /* =3D=3D=3D experimental parameters =3D=3D=3D */ /* these parameters can be used with ZSTD_setParameter() @@ -1663,9 +1725,15 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD= _CCtx* cctx, const void* pre * See the comments on that enum for an explanation of the feature. */ #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 =20 -/* Controls how the literals are compressed (default is auto). - * The value must be of type ZSTD_literalCompressionMode_e. - * See ZSTD_literalCompressionMode_t enum definition for details. 
+/* Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed = literals + * may still be emitted if huffman is not beneficial to use.) + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * literals compression based on the compression parameters - specifically, + * negative compression levels do not use literal compression. */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 =20 @@ -1728,7 +1796,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_= CCtx* cctx, const void* pre * * Note that this means that the CDict tables can no longer be copied into= the * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be - * useable. The dictionary can only be attached or reloaded. + * usable. The dictionary can only be attached or reloaded. * * In general, you should expect compression to be faster--sometimes very = much * so--and CDict creation to be slightly slower. Eventually, we will proba= bly @@ -1817,12 +1885,55 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZST= D_CCtx* cctx, const void* pre */ #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 =20 +/* ZSTD_c_useBlockSplitter + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * block splitting based on the compression parameters. + */ +#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 + +/* ZSTD_c_useRowMatchFinder + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. 
+ * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * the row-based matchfinder based on support for SIMD instructions and th= e window log. + * Note that this only pertains to compression strategies: greedy, lazy, a= nd lazy2 + */ +#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 + +/* ZSTD_c_deterministicRefPrefix + * Default is 0 =3D=3D disabled. Set to 1 to enable. + * + * Zstd produces different results for prefix compression when the prefix = is + * directly adjacent to the data about to be compressed vs. when it isn't. + * This is because zstd detects that the two buffers are contiguous and it= can + * use a more efficient match finding algorithm. However, this produces di= fferent + * results than when the two buffers are non-contiguous. This flag forces = zstd + * to always load the prefix in non-contiguous mode, even if it happens to= be + * adjacent to the data, to guarantee determinism. + * + * If you really care about determinism when using a dictionary or prefix, + * like when doing delta compression, you should select this option. It co= mes + * at a speed penalty of about ~2.5% if the dictionary and data happened t= o be + * contiguous, and is free if they weren't contiguous. We don't expect that + * intentionally making the dictionary and data contiguous will be worth t= he + * cost to memcpy() the data. + */ +#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_c= Parameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cPar= ameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZS= TD_cParameter param, int* value); =20 =20 /*! 
ZSTD_CCtx_params : @@ -1842,27 +1953,27 @@ ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZST= D_CCtx* cctx, ZSTD_cParameter * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() * for static allocation of CCtx for single-threaded compression. */ -ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); -ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* acce= pt NULL pointer */ +ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); = /* accept NULL pointer */ =20 /*! ZSTD_CCtxParams_reset() : * Reset params to default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); =20 /*! ZSTD_CCtxParams_init() : * Initializes the compression parameters of cctxParams according to * compression level. All other parameters are reset to their default val= ues. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int = compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParam= s, int compressionLevel); =20 /*! ZSTD_CCtxParams_init_advanced() : * Initializes the compression and frame parameters of cctxParams accordi= ng to * params. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxPar= ams, ZSTD_parameters params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* = cctxParams, ZSTD_parameters params); =20 -/*! ZSTD_CCtxParams_setParameter() : +/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ * Similar to ZSTD_CCtx_setParameter. * Set one compression parameter, selected by enum ZSTD_cParameter. 
* Parameters must be applied to a ZSTD_CCtx using @@ -1870,14 +1981,14 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZS= TD_CCtx_params* cctxParams, Z * @result : a code representing success or failure (which can be tested w= ith * ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, = ZSTD_cParameter param, int value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* p= arams, ZSTD_cParameter param, int value); =20 /*! ZSTD_CCtxParams_getParameter() : * Similar to ZSTD_CCtx_getParameter. * Get the requested value of one compression parameter, selected by enum = ZSTD_cParameter. * @result : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* pa= rams, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_par= ams* params, ZSTD_cParameter param, int* value); =20 /*! ZSTD_CCtx_setParametersUsingCCtxParams() : * Apply a set of ZSTD_CCtx_params to the compression context. @@ -1886,7 +1997,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const= ZSTD_CCtx_params* params, * if nbWorkers>=3D1, new parameters will be picked up at next job, * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jo= bSize, and overlapLog are not updated). */ -ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); =20 /*! ZSTD_compressStream2_simpleArgs() : @@ -1895,7 +2006,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxPa= rams( * This variant might be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. 
*/ -ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs ( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos, @@ -1911,33 +2022,33 @@ ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always= be 0. * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy = Support is enabled. * Note 3 : Skippable Frame Identifiers are considered valid. */ -ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); +ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size); =20 /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, ready to start decompression operation w= ithout startup delay. * Dictionary content is referenced, and therefore stays in dictBuffer. * It is important that dictBuffer outlives DDict, * it must remain read accessible throughout the lifetime of DDict */ -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffe= r, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* di= ctBuffer, size_t dictSize); =20 /*! ZSTD_DCtx_loadDictionary_byReference() : * Same as ZSTD_DCtx_loadDictionary(), * but references `dict` content instead of copying it into `dctx`. * This saves memory if `dict` remains around., * However, it's imperative that `dict` remains accessible (and unmodifie= d) while being used, so it must outlive decompression. */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, c= onst void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* = dctx, const void* dict, size_t dictSize); =20 /*! ZSTD_DCtx_loadDictionary_advanced() : * Same as ZSTD_DCtx_loadDictionary(), * but gives direct control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? 
force raw mode ? full mode only ?= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, cons= t void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_d= ictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dct= x, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod,= ZSTD_dictContentType_e dictContentType); =20 /*! ZSTD_DCtx_refPrefix_advanced() : * Same as ZSTD_DCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) = ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const voi= d* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, co= nst void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType= ); =20 /*! ZSTD_DCtx_setMaxWindowSize() : * Refuses allocating internal buffers for frames requiring a window size= larger than provided limit. @@ -1946,14 +2057,14 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZST= D_DCtx* dctx, const void* pre * By default, a decompression context accepts all window sizes <=3D (1 <= < ZSTD_WINDOWLOG_LIMIT_DEFAULT) * @return : 0, or an error code (which can be tested using ZSTD_isError()= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxW= indowSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size= _t maxWindowSize); =20 /*! ZSTD_DCtx_getParameter() : * Get the requested decompression parameter value, selected by enum ZSTD= _dParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). 
*/ -ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter= param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dPa= rameter param, int* value); =20 /* ZSTD_d_format * experimental parameter, @@ -2028,11 +2139,13 @@ ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx= * dctx, ZSTD_dParameter param =20 =20 /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). * Instruct the decoder context about what kind of data to decode next. * This instruction is mandatory to decode data without a fully-formed he= ader, * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e form= at); +ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); =20 /*! ZSTD_decompressStream_simpleArgs() : * Same as ZSTD_decompressStream(), @@ -2040,7 +2153,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dct= x, ZSTD_format_e format); * This can be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. */ -ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos); @@ -2056,7 +2169,7 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( /*=3D=3D=3D=3D=3D Advanced Streaming compression functions =3D=3D=3D=3D= =3D*/ =20 /*! 
ZSTD_initCStream_srcSize() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLev= el); @@ -2065,15 +2178,15 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs= ( * pledgedSrcSize must be correct. If it is not known at init time, use * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older progr= ams, * "0" also disables frame content size field. It may be enabled in the fu= ture. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); =20 /*! ZSTD_initCStream_usingDict() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLev= el); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); @@ -2082,15 +2195,15 @@ ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * dict =3D=3D NULL or dictSize < 8, in which case no dict is used. * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd diction= ary if * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm= _byCopy. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t -ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); =20 /*! ZSTD_initCStream_advanced() : - * This function is deprecated, and is approximately equivalent to: + * This function is DEPRECATED, and is approximately equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * // Pseudocode: Set each zstd parameter and leave the rest as-is. * for ((param, value) : params) { @@ -2102,23 +2215,24 @@ ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. * pledgedSrcSize must be correct. * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOW= N. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); =20 /*! ZSTD_initCStream_usingCDict() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, cdict); * * note : cdict will just be referenced, and must outlive compression sess= ion - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZS= TD_CDict* cdict); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h fo= r detailed instructions") +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cd= ict); =20 /*! ZSTD_initCStream_usingCDict_advanced() : * This function is DEPRECATED, and is approximately equivalent to: @@ -2133,18 +2247,21 @@ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD= _CStream* zcs, const ZSTD_CDi * same as ZSTD_initCStream_usingCDict(), with control over frame paramete= rs. * pledgedSrcSize must be correct. If srcSize is not known at init time, u= se * value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h fo= r detailed instructions") +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); =20 /*! ZSTD_resetCStream() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * Note: ZSTD_resetCStream() interprets pledgedSrcSize =3D=3D 0 as ZSTD_CO= NTENTSIZE_UNKNOWN, but + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTE= NTSIZE_UNKNOWN must be + * explicitly specified. * * start a new frame, using same parameters from previous frame. * This is typically useful to skip dictionary loading stage, since it wi= ll re-use it in-place. 
@@ -2154,9 +2271,10 @@ ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* z= cs, * For the time being, pledgedSrcSize=3D=3D0 is interpreted as "srcSize u= nknown" for compatibility with older programs, * but it will change to mean "empty" in future version, so use macro ZST= D_CONTENTSIZE_UNKNOWN instead. * @return : 0, or an error code (which can be tested using ZSTD_isError()) - * Note : this prototype will be marked as deprecated and generate compil= ation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long= pledgedSrcSize); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcS= ize); =20 =20 typedef struct { @@ -2174,7 +2292,7 @@ typedef struct { * Note : (ingested - consumed) is amount of input data buffered internall= y, not yet compressed. * Aggregates progression inside active worker threads. */ -ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx= * cctx); +ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZS= TD_CCtx* cctx); =20 /*! ZSTD_toFlushNow() : * Tell how many bytes are ready to be flushed immediately. @@ -2189,7 +2307,7 @@ ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgre= ssion(const ZSTD_CCtx* cctx * therefore flush speed is limited by production speed of oldest job * irrespective of the speed of concurrent (and newer) jobs. 
*/ -ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); =20 =20 /*=3D=3D=3D=3D=3D Advanced Streaming decompression functions =3D=3D=3D= =3D=3D*/ @@ -2203,7 +2321,7 @@ ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * note: no dictionary will be used if dict =3D=3D NULL or dictSize < 8 * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const voi= d* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, co= nst void* dict, size_t dictSize); =20 /*! * This function is deprecated, and is equivalent to: @@ -2214,7 +2332,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DS= tream* zds, const void* dic * note : ddict is referenced, it must outlive decompression session * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZS= TD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, c= onst ZSTD_DDict* ddict); =20 /*! * This function is deprecated, and is equivalent to: @@ -2224,7 +2342,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_D= Stream* zds, const ZSTD_DDi * re-use decompression parameters from previous init; saves dictionary lo= ading * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); =20 =20 /* ******************************************************************* @@ -2243,8 +2361,7 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zd= s); ZSTD_CCtx object can be re-used multiple times within successive compres= sion operations. =20 Start by initializing a context. 
- Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictiona= ry compression, - or ZSTD_compressBegin_advanced(), for finer parameter control. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictiona= ry compression. It's also possible to duplicate a reference context which has already be= en initialized, using ZSTD_copyCCtx() =20 Then, consume your input using ZSTD_compressContinue(). @@ -2267,17 +2384,19 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* = zds); */ =20 /*=3D=3D=3D=3D=3D Buffer-less streaming compression functions =3D=3D=3D= =3D=3D*/ -ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLeve= l); -ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const voi= d* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void= * dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledged= SrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZS= TD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZS= TD_CDict* cdict); /*< note: fails if cdict=3D=3DNULL */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const= cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, u= nsigned long long const pledgedSrcSize); /* compression parameters are al= ready set within cdict. pledgedSrcSize must be correct. 
If srcSize is not k= nown, use macro ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* prepare= dCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is = not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - -ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_= t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dst= Capacity, const void* src, size_t srcSize); - - +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compress= ionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, co= nst void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, c= onst ZSTD_CDict* cdict); /*< note: fails if cdict=3D=3DNULL */ +ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* = preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcS= ize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst= , size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, siz= e_t dstCapacity, const void* src, size_t srcSize); + +/* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_adv= anced() are now DEPRECATED and will generate a compiler warning */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size= _t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*= < pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSI= ZE_UNKNOWN */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const= ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, 
unsigned long= long const pledgedSrcSize); /* compression parameters are already set wi= thin cdict. pledgedSrcSize must be correct. If srcSize is not known, use ma= cro ZSTD_CONTENTSIZE_UNKNOWN */ /* Buffer-less streaming decompression (synchronous mode) =20 @@ -2368,24 +2487,24 @@ typedef struct { * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const voi= d* src, size_t srcSize); /*< doesn't consume input */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, co= nst void* src, size_t srcSize); /*< doesn't consume input */ /*! ZSTD_getFrameHeader_advanced() : * same as ZSTD_getFrameHeader(), * with added capability to select a format (like ZSTD_f_zstd1_magicless)= */ -ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, = const void* src, size_t srcSize, ZSTD_format_e format); -ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSi= ze, unsigned long long frameContentSize); /*< when frame content size is n= ot known, pass in frameContentSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* z= fhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long w= indowSize, unsigned long long frameContentSize); /*< when frame content si= ze is not known, pass in frameContentSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN */ =20 -ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const v= oid* dict, size_t dictSize); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const = ZSTD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t 
ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, = const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx,= const ZSTD_DDict* ddict); =20 -ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, siz= e_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* d= st, size_t dstCapacity, const void* src, size_t srcSize); =20 /* misc */ -ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* prepare= dDCtx); +ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* = preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZS= TDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputT= ype_e; -ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx= ); =20 =20 =20 @@ -2422,10 +2541,10 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType= (ZSTD_DCtx* dctx); */ =20 /*=3D=3D=3D=3D=3D Raw zstd block functions =3D=3D=3D=3D=3D*/ -ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); -ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t= dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t= dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* block= Start, size_t blockSize); /*< insert uncompressed block into `dctx` histor= y. Useful for multi-blocks decompression. 
*/ +ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst,= size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst,= size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void= * blockStart, size_t blockSize); /*< insert uncompressed block into `dctx`= history. Useful for multi-blocks decompression. */ =20 =20 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h index 28248abe8612..feef3a1b1d60 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -313,7 +313,16 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(= size_t bitContainer, U32 c U32 const regMask =3D sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined= */ assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is bett= er + * than accessing memory. When bmi2 instruction is not present, we con= sider + * such cpus old (pre-Haswell, 2013) and their performance is not of t= hat + * importance. 
+ */ +#if defined(__x86_64__) || defined(_M_X86) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1= ); +#else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +#endif } =20 MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, = U32 const nbBits) diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index f5a9c70a228a..c42d39faf9bd 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -11,6 +11,8 @@ #ifndef ZSTD_COMPILER_H #define ZSTD_COMPILER_H =20 +#include "portability_macros.h" + /*-******************************************************* * Compiler specifics *********************************************************/ @@ -34,7 +36,7 @@ =20 /* On MSVC qsort requires that functions passed into it use the __cdecl cal= ling conversion(CC). - This explictly marks such functions as __cdecl so that the code will sti= ll compile + This explicitly marks such functions as __cdecl so that the code will st= ill compile if a CC other than __cdecl has been made the default. */ #define WIN_CDECL @@ -70,25 +72,13 @@ =20 =20 /* target attribute */ -#ifndef __has_attribute - #define __has_attribute(x) 0 /* Compatibility with non-clang compilers.= */ -#endif #define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) =20 -/* Enable runtime BMI2 dispatch based on the CPU. - * Enabled for clang & gcc >=3D4.8 on x86 when BMI2 isn't enabled by defau= lt. +/* Target attribute for BMI2 dynamic dispatch. + * Enable lzcnt, bmi, and bmi2. + * We test for bmi1 & bmi2. lzcnt is included in bmi1. 
*/ -#ifndef DYNAMIC_BMI2 - #if ((defined(__clang__) && __has_attribute(__target__)) \ - || (defined(__GNUC__) \ - && (__GNUC__ >=3D 5 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D= 8)))) \ - && (defined(__x86_64__) || defined(_M_X86)) \ - && !defined(__BMI2__) - # define DYNAMIC_BMI2 1 - #else - # define DYNAMIC_BMI2 0 - #endif -#endif +#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") =20 /* prefetch * can be disabled, by declaring NO_PREFETCH build macro */ @@ -115,8 +105,9 @@ } =20 /* vectorization - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ -#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, + * and some compilers, like Intel ICC and MCST LCC, do not support it at a= ll. */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)= && !defined(__LCC__) # if (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >=3D 5) # define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) # else @@ -134,20 +125,18 @@ #define LIKELY(x) (__builtin_expect((x), 1)) #define UNLIKELY(x) (__builtin_expect((x), 0)) =20 +#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC_= _ > 4 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D 5))) +# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } +#else +# define ZSTD_UNREACHABLE { assert(0); } +#endif + /* disable warnings */ =20 /*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ =20 =20 -/* compat. with non-clang compilers */ -#ifndef __has_builtin -# define __has_builtin(x) 0 -#endif - -/* compat. with non-clang compilers */ -#ifndef __has_feature -# define __has_feature(x) 0 -#endif +/* compile time determination of SIMD support */ =20 /* C-language Attributes are added in C23. 
*/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(_= _has_c_attribute) @@ -168,10 +157,28 @@ */ #define ZSTD_FALLTHROUGH fallthrough =20 -/* detects whether we are being compiled under msan */ +/*-************************************************************** +* Alignment check +*****************************************************************/ + +/* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, + * which remains valid for both user & kernel spaces. + */ + +#ifndef ZSTD_ALIGNOF +/* covers gcc, clang & MSVC */ +/* note : this section must come first, before C11, + * due to a limitation in the kernel source generator */ +# define ZSTD_ALIGNOF(T) __alignof(T) + +#endif /* ZSTD_ALIGNOF */ =20 +/*-************************************************************** +* Sanitizer +*****************************************************************/ =20 -/* detects whether we are being compiled under asan */ =20 =20 #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_com= mon.c index 6353249de614..fef67056f052 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -212,7 +212,7 @@ static size_t FSE_readNCount_body_default( } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static size_t FSE_readNCount_body_bmi2( +BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2( short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPt= r, const void* headerBuffer, size_t hbSize) { @@ -240,6 +240,7 @@ size_t FSE_readNCount( return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, h= eaderBuffer, hbSize, /* bmi2 */ 0); } =20 + /*! HUF_readStats() : Read compact Huffman tree, saved by HUF_writeCTable(). `huffWeight` is destination buffer. 
@@ -293,7 +294,7 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32= * rankStats, ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); weightTotal =3D 0; { U32 n; for (n=3D0; n=3D HUF_TABLELOG_MAX) return ERROR(corrupti= on_detected); + if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_= detected); rankStats[huffWeight[n]]++; weightTotal +=3D (1 << huffWeight[n]) >> 1; } } @@ -331,7 +332,7 @@ static size_t HUF_readStats_body_default(BYTE* huffWeig= ht, size_t hwSize, U32* r } =20 #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t HUF_readStats_body_bmi2(BYTE* huffW= eight, size_t hwSize, U32* rankStats, +static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeig= ht, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_privat= e.h index d14e686adf95..ca5101e542fa 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -18,8 +18,10 @@ /* **************************************** * Dependencies ******************************************/ -#include "zstd_deps.h" /* size_t */ #include /* enum list */ +#include "compiler.h" +#include "debug.h" +#include "zstd_deps.h" /* size_t */ =20 =20 /* **************************************** @@ -62,5 +64,82 @@ ERR_STATIC const char* ERR_getErrorName(size_t code) return ERR_getErrorString(ERR_getErrorCode(code)); } =20 +/* + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilat= ion. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore= lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. 
+ */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; +} + +/* + * Ignore: this is an internal helper. + * + * We want to force this function invocation to be syntactically correct, = but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } + +#define ERR_QUOTE(str) #str + +/* + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed= ), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } + +/* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0); + +/* + * If the provided expression evaluates to an error code, returns that err= or code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) 
\ + do { \ + size_t const err_code =3D (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code= )); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0); + =20 #endif /* ERROR_H_MODULE */ diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h index 0bb174c2c367..4507043b2287 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -333,8 +333,9 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned ch= ar symbolValue); /* FSE_buildCTable_wksp() : * Same as FSE_buildCTable(), but using an externally allocated scratch bu= ffer (`workSpace`). * `wkspSize` must be >=3D `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolV= alue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. */ -#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (max= SymbolValue + 2 + (1ull << (tableLog - 2))) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((m= axSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* a= dditional 8 bytes for potential table overwrite */) #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(= unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter= , unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspS= ize); =20 diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompr= ess.c index 2c8bbe3e4c14..a0d06095be83 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -365,7 +365,7 @@ static size_t FSE_decompress_wksp_body_default(void* ds= t, size_t dstCapacity, co } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static size_t FSE_decompress_wksp_body_bmi2(void*= dst, size_t dstCapacity, 
const void* cSrc, size_t cSrcSize, unsigned maxLo= g, void* workSpace, size_t wkspSize) +BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* ds= t, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, = void* workSpace, size_t wkspSize) { return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxL= og, workSpace, wkspSize, 1); } diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h index 88c5586646aa..5042ff870308 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -86,9 +86,9 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t ds= tCapacity, =20 /* HUF_compress4X_wksp() : * Same as HUF_compress2(), but uses externally allocated `workSpace`. - * `workspace` must have minimum alignment of 4, and be at least as large = as HUF_WORKSPACE_SIZE */ -#define HUF_WORKSPACE_SIZE ((6 << 10) + 256) -#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) + * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) +#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tab= leLog, @@ -113,11 +113,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst,= size_t dstCapacity, =20 =20 /* *** Constants *** */ -#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (du= e to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (du= e to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ #define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none = specified */ #define HUF_SYMBOLVALUE_MAX 255 =20 -#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELO= G. 
Beyond that value, code does not work */ +#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELO= G. Beyond that value, code does not work */ #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) # error "HUF_TABLELOG_MAX is too large !" #endif @@ -133,15 +133,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst,= size_t dstCapacity, =20 /* static allocation of HUF's Compression Table */ /* this is a private definition, just exposed for allocation and strict al= iasing purpose. never EVER access its members directly */ -struct HUF_CElt_s { - U16 val; - BYTE nbBits; -}; /* typedef'd to HUF_CElt */ -typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */ -#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Us= e tables of U32, for proper alignment */ -#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymb= olValue) * sizeof(U32)) +typedef size_t HUF_CElt; /* consider it an incomplete type */ +#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use= tables of size_t, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbo= lValue) * sizeof(size_t)) #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ - HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */ + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ =20 /* static allocation of HUF's DTable */ typedef U32 HUF_DTable; @@ -191,6 +187,7 @@ size_t HUF_buildCTable (HUF_CElt* CTable, const unsigne= d* count, unsigned maxSym size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTab= le, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* = CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t = workspaceSize); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable); +size_t 
HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* = count, unsigned maxSymbolValue); int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsi= gned maxSymbolValue); =20 @@ -203,12 +200,13 @@ typedef enum { * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat= !=3D HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat =3D HUF_repeat_none, and it sets hufTab= le to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to pote= ntially skip huffman coding */ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace= ` must be aligned on 4-bytes boundaries, `wkspSize` must be >=3D HUF_WORKSP= ACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2, unsigned suspectUncompressible); =20 /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buff= er. 
@@ -246,11 +244,10 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hw= Size, * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, cons= t void* src, size_t srcSize, unsigned *hasZeroWeights); =20 -/* HUF_getNbBits() : +/* HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed= <=3D HUF_SYMBOLVALUE_MAX - * Note 1 : is not inlined, as HUF_CElt definition is private - * Note 2 : const void* used, so that it can provide a statically allocat= ed table as argument (which uses type U32) */ -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + * Note 1 : is not inlined, as HUF_CElt definition is private */ +U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); =20 /* * HUF_decompress() does the following: @@ -302,18 +299,20 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_= t maxDstSize, const void* c /* =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D */ =20 size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t = srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, si= ze_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, = size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPAC= E_SIZE_U32 unsigned */ +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, si= ze_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, = size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPAC= E_SIZE_U64 U64 */ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); /* HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but 
considers using hufTable if *repeat= !=3D HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat =3D HUF_repeat_none, and it sets hufTab= le to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to pote= ntially skip huffman coding */ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace`= must be aligned on 4-bytes boundaries, `wkspSize` must be >=3D HUF_WORKSPA= CE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2, unsigned suspectUncompressible); =20 size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, siz= e_t cSrcSize); /* single-symbol decoder */ #ifndef HUF_FORCE_DECOMPRESS_X1 @@ -351,6 +350,9 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* d= ctx, void* dst, size_t ds #ifndef HUF_FORCE_DECOMPRESS_X2 size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, siz= e_t srcSize, void* workSpace, size_t wkspSize, int bmi2); #endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, siz= e_t srcSize, void* workSpace, size_t wkspSize, int bmi2); +#endif =20 #endif /* HUF_STATIC_LINKING_ONLY */ =20 diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h index dcdd586a9fd9..1d9cc03924ca 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -30,6 +30,8 @@ * Basic Types *****************************************************************/ typedef uint8_t BYTE; +typedef uint8_t U8; +typedef int8_t S8; typedef uint16_t U16; typedef int16_t S16; typedef uint32_t U32; diff --git a/lib/zstd/common/portability_macros.h 
b/lib/zstd/common/portabi= lity_macros.h new file mode 100644 index 000000000000..0e3b2c0a527d --- /dev/null +++ b/lib/zstd/common/portability_macros.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_PORTABILITY_MACROS_H +#define ZSTD_PORTABILITY_MACROS_H + +/* + * This header file contains macro defintions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * + * This header ONLY defines macros to detect platforms/feature support. + * + */ + + +/* compat. with non-clang compilers */ +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +/* detects whether we are being compiled under msan */ + +/* detects whether we are being compiled under asan */ + +/* detects whether we are being compiled under dfsan */ + +/* Mark the internal assembly functions as hidden */ +#ifdef __ELF__ +# define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func +#else +# define ZSTD_HIDE_ASM_FUNCTION(func) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=3D4.8 on x86 when BMI2 isn't enabled by defau= lt. 
+ */ +#ifndef DYNAMIC_BMI2 + #if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >=3D 5 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D= 8)))) \ + && (defined(__x86_64__) || defined(_M_X64)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* + * Only enable assembly for GNUC comptabile compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * + * Disable assembly when MSAN is enabled, because MSAN requires + * 100% of code to be instrumented to work. + */ +#define ZSTD_ASM_SUPPORTED 1 + +/* + * Determines whether we should enable assembly for x86-64 + * with BMI2. + * + * Enable if all of the following conditions hold: + * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM + * - Assembly is supported + * - We are compiling for x86-64 and either: + * - DYNAMIC_BMI2 is enabled + * - BMI2 is supported at compile time + */ +#define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + +#endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_interna= l.h index fc6f3a9b40c0..93305d9b41bb 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -20,6 +20,7 @@ * Dependencies ***************************************/ #include "compiler.h" +#include "cpu.h" #include "mem.h" #include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglev= el */ #include "error_private.h" @@ -47,81 +48,7 @@ #undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) - -/* - * Ignore: this is an internal helper. - * - * This is a helper function to help force C99-correctness during compilat= ion. - * Under strict compilation modes, variadic macro arguments can't be empty. - * However, variadic function arguments can be. 
Using a function therefore= lets - * us statically check that at least one (string) argument was passed, - * independent of the compilation flags. - */ -static INLINE_KEYWORD UNUSED_ATTR -void _force_has_format_string(const char *format, ...) { - (void)format; -} - -/* - * Ignore: this is an internal helper. - * - * We want to force this function invocation to be syntactically correct, = but - * we don't want to force runtime evaluation of its arguments. - */ -#define _FORCE_HAS_FORMAT_STRING(...) \ - if (0) { \ - _force_has_format_string(__VA_ARGS__); \ - } - -/* - * Return the specified error if the condition evaluates to true. - * - * In debug modes, prints additional information. - * In order to do that (particularly, printing the conditional that failed= ), - * this can't just wrap RETURN_ERROR(). - */ -#define RETURN_ERROR_IF(cond, err, ...) \ - if (cond) { \ - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } - -/* - * Unconditionally return the specified error. - * - * In debug modes, prints additional information. - */ -#define RETURN_ERROR(err, ...) \ - do { \ - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } while(0); - -/* - * If the provided expression evaluates to an error code, returns that err= or code. - * - * In debug modes, prints additional information. - */ -#define FORWARD_IF_ERROR(err, ...) 
\ - do { \ - size_t const err_code =3D (err); \ - if (ERR_isError(err_code)) { \ - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_cod= e)); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return err_code; \ - } \ - } while(0); +#define BOUNDED(min,val,max) (MAX(min,MIN(val,max))) =20 =20 /*-************************************* @@ -130,7 +57,6 @@ void _force_has_format_string(const char *format, ...) { #define ZSTD_OPT_NUM (1<<12) =20 #define ZSTD_REP_NUM 3 /* number of repcodes */ -#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] =3D { 1, 4, 8 }; =20 #define KB *(1 <<10) @@ -182,7 +108,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_= repeat } symbolEncodingTy /* Each table cannot take more than #symbols * FSELog bits */ #define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) *= LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8) =20 -static UNUSED_ATTR const U32 LL_bits[MaxLL+1] =3D { +static UNUSED_ATTR const U8 LL_bits[MaxLL+1] =3D { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, @@ -199,7 +125,7 @@ static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL+1] = =3D { #define LL_DEFAULTNORMLOG 6 /* for static allocation */ static UNUSED_ATTR const U32 LL_defaultNormLog =3D LL_DEFAULTNORMLOG; =20 -static UNUSED_ATTR const U32 ML_bits[MaxML+1] =3D { +static UNUSED_ATTR const U8 ML_bits[MaxML+1] =3D { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -234,12 +160,31 @@ static UNUSED_ATTR const U32 OF_defaultNormLog =3D OF= _DEFAULTNORMLOG; * Shared functions to include for inlining *********************************************/ static void ZSTD_copy8(void* dst, const void* src) { +#if defined(ZSTD_ARCH_ARM_NEON) + vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src)); +#else ZSTD_memcpy(dst, src, 8); +#endif } - #define COPY8(d,s) { 
ZSTD_copy8(d,s); d+=3D8; s+=3D8; } + +/* Need to use memmove here since the literal buffer can now be located wi= thin + the dst buffer. In circumstances where the op "catches up" to where the + literal buffer is, there can be partial overlaps in this call on the fi= nal + copy if the literal is being shifted by less than 16 bytes. */ static void ZSTD_copy16(void* dst, const void* src) { - ZSTD_memcpy(dst, src, 16); +#if defined(ZSTD_ARCH_ARM_NEON) + vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); +#elif defined(ZSTD_ARCH_X86_SSE2) + _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); +#elif defined(__clang__) + ZSTD_memmove(dst, src, 16); +#else + /* ZSTD_memmove is not inlined properly by gcc */ + BYTE copy16_buf[16]; + ZSTD_memcpy(copy16_buf, src, 16); + ZSTD_memcpy(dst, copy16_buf, 16); +#endif } #define COPY16(d,s) { ZSTD_copy16(d,s); d+=3D16; s+=3D16; } =20 @@ -267,8 +212,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_= t length, ZSTD_overlap_e BYTE* op =3D (BYTE*)dst; BYTE* const oend =3D op + length; =20 - assert(diff >=3D 8 || (ovtype =3D=3D ZSTD_no_overlap && diff <=3D -WIL= DCOPY_VECLEN)); - if (ovtype =3D=3D ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLE= N) { /* Handle short offset copies. */ do { @@ -331,11 +274,18 @@ typedef enum { * Private declarations *********************************************/ typedef struct seqDef_s { - U32 offset; /* Offset code of the sequence */ + U32 offBase; /* offBase =3D=3D Offset + ZSTD_REP_NUM, or repcode 1,2= ,3 */ U16 litLength; - U16 matchLength; + U16 mlBase; /* mlBase =3D=3D matchLength - MINMATCH */ } seqDef; =20 +/* Controls whether seqStore has a single "long" litLength or matchLength.= See seqStore_t. 
*/ +typedef enum { + ZSTD_llt_none =3D 0, /* no longLengthType */ + ZSTD_llt_literalLength =3D 1, /* represents a long literal */ + ZSTD_llt_matchLength =3D 2 /* represents a long match */ +} ZSTD_longLengthType_e; + typedef struct { seqDef* sequencesStart; seqDef* sequences; /* ptr to end of sequences */ @@ -347,12 +297,12 @@ typedef struct { size_t maxNbSeq; size_t maxNbLit; =20 - /* longLengthPos and longLengthID to allow us to represent either a si= ngle litLength or matchLength + /* longLengthPos and longLengthType to allow us to represent either a = single litLength or matchLength * in the seqStore that has a value larger than U16 (if it exists). To= do so, we increment * the existing value of the litLength or matchLength by 0x10000. */ - U32 longLengthID; /* 0 =3D=3D no longLength; 1 =3D=3D Represent th= e long literal; 2 =3D=3D Represent the long match; */ - U32 longLengthPos; /* Index of the sequence to apply long length mo= dification to */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to ap= ply long length modification to */ } seqStore_t; =20 typedef struct { @@ -362,18 +312,18 @@ typedef struct { =20 /* * Returns the ZSTD_sequenceLength for the given sequences. It handles the= decoding of long sequences - * indicated by longLengthPos and longLengthID, and adds MINMATCH back to = matchLength. + * indicated by longLengthPos and longLengthType, and adds MINMATCH back t= o matchLength. 
*/ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* se= qStore, seqDef const* seq) { ZSTD_sequenceLength seqLen; seqLen.litLength =3D seq->litLength; - seqLen.matchLength =3D seq->matchLength + MINMATCH; + seqLen.matchLength =3D seq->mlBase + MINMATCH; if (seqStore->longLengthPos =3D=3D (U32)(seq - seqStore->sequencesStar= t)) { - if (seqStore->longLengthID =3D=3D 1) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_literalLength) { seqLen.litLength +=3D 0xFFFF; } - if (seqStore->longLengthID =3D=3D 2) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_matchLength) { seqLen.matchLength +=3D 0xFFFF; } } @@ -419,6 +369,41 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, = dictBuilder, decodeCorpus } } =20 +/* + * Counts the number of trailing zeros of a `size_t`. + * Most compilers should support CTZ as a builtin. A backup + * implementation is provided if the builtin isn't supported, but + * it may not be terribly efficient. + */ +MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +{ + if (MEM_64bits()) { +# if (__GNUC__ >=3D 4) + return __builtin_ctzll((U64)val); +# else + static const int DeBruijnBytePos[64] =3D { 0, 1, 2, 7, 3,= 13, 8, 19, + 4, 25, 14, 28, 9, 3= 4, 20, 56, + 5, 17, 26, 54, 15, 4= 1, 29, 43, + 10, 31, 38, 35, 21, = 45, 49, 57, + 63, 6, 12, 18, 24, = 27, 33, 55, + 16, 53, 40, 42, 30, = 37, 44, 48, + 62, 11, 23, 32, 52, = 39, 36, 47, + 61, 22, 51, 46, 60, = 50, 59, 58 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218= A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if (__GNUC__ >=3D 3) + return __builtin_ctz((U32)val); +# else + static const int DeBruijnBytePos[32] =3D { 0, 1, 28, 2, 29,= 14, 24, 3, + 30, 22, 20, 15, 25, 1= 7, 4, 8, + 31, 27, 13, 23, 21, 1= 9, 16, 7, + 26, 12, 18, 6, 11, = 5, 10, 9 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)= ) >> 27]; +# endif + } +} + =20 /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from 
previous block. @@ -445,6 +430,14 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcS= ize, size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); =20 +/* + * @returns true iff the CPU supports dynamic BMI2 dispatch. + */ +MEM_STATIC int ZSTD_cpuSupportsBmi2(void) +{ + ZSTD_cpuid_t cpuid =3D ZSTD_cpuid(); + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); +} =20 =20 #endif /* ZSTD_CCOMMON_H_MODULE */ diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h new file mode 100644 index 000000000000..d9a76112ec3a --- /dev/null +++ b/lib/zstd/compress/clevels.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_CLEVELS_H +#define ZSTD_CLEVELS_H + +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ +#include + +/*-=3D=3D=3D=3D=3D Pre-defined compression levels =3D=3D=3D=3D=3D-*/ + +#define ZSTD_MAX_CLEVEL 22 + +__attribute__((__unused__)) + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MA= X_CLEVEL+1] =3D { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels= */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <=3D 256 KB */ + /* W, C, H, S, L, T, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast 
}, /* level 3 */ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <=3D 128 KB */ + /* W, C, H, S, L, T, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt 
}, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <=3D 16 KB */ + /* W, C, H, S, L, T, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + + + +#endif /* ZSTD_CLEVELS_H */ diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compr= ess.c index 436985b620e5..ec5b1ca6d71a 100644 --- a/lib/zstd/compress/fse_compress.c +++ 
b/lib/zstd/compress/fse_compress.c @@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT =3D ((U32*)ptr) + 1 /* header */ + (tableLog ? tableS= ize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT =3D (FSE_symbolCompress= ionTransform*) (FSCT); U32 const step =3D FSE_TABLESTEP(tableSize); + U32 const maxSV1 =3D maxSymbolValue+1; =20 - U32* cumul =3D (U32*)workSpace; - FSE_FUNCTION_TYPE* tableSymbol =3D (FSE_FUNCTION_TYPE*)(cumul + (maxSy= mbolValue + 2)); + U16* cumul =3D (U16*)workSpace; /* size =3D maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol =3D (FSE_FUNCTION_TYPE*)(cumul + = (maxSV1+1)); /* size =3D tableSize */ =20 U32 highThreshold =3D tableSize-1; =20 - if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte al= igned */ + assert(((size_t)workSpace & 1) =3D=3D 0); /* Must be 2 bytes-aligned = */ if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSi= ze) return ERROR(tableLog_tooLarge); /* CTable header */ tableU16[-2] =3D (U16) tableLog; @@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, /* symbol start positions */ { U32 u; cumul[0] =3D 0; - for (u=3D1; u <=3D maxSymbolValue+1; u++) { + for (u=3D1; u <=3D maxSV1; u++) { if (normalizedCounter[u-1]=3D=3D-1) { /* Low proba symbol */ cumul[u] =3D cumul[u-1] + 1; tableSymbol[highThreshold--] =3D (FSE_FUNCTION_TYPE)(u-1); } else { - cumul[u] =3D cumul[u-1] + normalizedCounter[u-1]; + assert(normalizedCounter[u-1] >=3D 0); + cumul[u] =3D cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >=3D cumul[u-1]); /* no overflow */ } } - cumul[maxSymbolValue+1] =3D tableSize+1; + cumul[maxSV1] =3D (U16)(tableSize+1); } =20 /* Spread symbols */ - { U32 position =3D 0; + if (highThreshold =3D=3D tableSize - 1) { + /* Case for no low prob count symbols. 
Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread =3D tableSymbol + tableSize; /* size =3D tableS= ize + 8 (may write beyond tableSize) */ + { U64 const add =3D 0x0101010101010101ull; + size_t pos =3D 0; + U64 sv =3D 0; + U32 s; + for (s=3D0; s=3D0); + pos +=3D (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means = that + * we don't need variable sized inner loop, so we can unroll the l= oop and + * reduce branch misses. + */ + { size_t position =3D 0; + size_t s; + size_t const unroll =3D 2; /* Experimentally determined optima= l unroll */ + assert(tableSize % unroll =3D=3D 0); /* FSE_MIN_TABLELOG is 5 = */ + for (s =3D 0; s < (size_t)tableSize; s +=3D unroll) { + size_t u; + for (u =3D 0; u < unroll; ++u) { + size_t const uPosition =3D (position + (u * step)) & t= ableMask; + tableSymbol[uPosition] =3D spread[s + u]; + } + position =3D (position + (unroll * step)) & tableMask; + } + assert(position =3D=3D 0); /* Must have initialized all posi= tions */ + } + } else { + U32 position =3D 0; U32 symbol; - for (symbol=3D0; symbol<=3DmaxSymbolValue; symbol++) { + for (symbol=3D0; symbol highThreshold) position =3D (position + step) & tableMask; /* Low p= roba area */ } } - assert(position=3D=3D0); /* Must have initialized all positions */ } =20 @@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, case -1: case 1: symbolTT[s].deltaNbBits =3D (tableLog << 16) - (1< 1); + { U32 const maxBitsOut =3D tableLog - BIT_highbit32 ((U3= 2)normalizedCounter[s]-1); + U32 const minStatePlus =3D (U32)normalizedCounter[s] <= < maxBitsOut; symbolTT[s].deltaNbBits =3D (maxBitsOut << 16) - minSt= atePlus; - symbolTT[s].deltaFindState =3D total - normalizedCount= er[s]; - total +=3D normalizedCounter[s]; + symbolTT[s].deltaFindState =3D (int)(total - (unsigned= )normalizedCounter[s]); + total +=3D (unsigned)normalizedCounter[s]; } } } } =20 #if 0 /* debug : symbol costs 
*/ @@ -164,8 +206,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, symbol, normalizedCounter[symbol], FSE_getMaxNbBits(symbolTT, symbol), (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); - } - } + } } #endif =20 return 0; @@ -173,16 +214,18 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, =20 =20 =20 - #ifndef FSE_COMMONDEFS_ONLY =20 - /*-************************************************************** * FSE NCount encoding ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t const maxHeaderSize =3D (((maxSymbolValue+1) * tableLog) >> 3) = + 3; + size_t const maxHeaderSize =3D (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one ad= ditional bit each */) / 8) + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstr= eam flush */; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbol= Value=3D=3D0 ? 
use default */ } =20 diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compr= ess.c index f76a526bfa54..74ef0db47621 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -50,6 +50,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_= t srcSize, unsigned maxS /* ******************************************************* * HUF : Huffman block compression *********************************************************/ +#define HUF_WORKSPACE_MAX_ALIGNMENT 8 + +static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePt= r, size_t align) +{ + size_t const mask =3D align - 1; + size_t const rem =3D (size_t)workspace & mask; + size_t const add =3D (align - rem) & mask; + BYTE* const aligned =3D (BYTE*)workspace + add; + assert((align & (align - 1)) =3D=3D 0); /* pow 2 */ + assert(align <=3D HUF_WORKSPACE_MAX_ALIGNMENT); + if (*workspaceSizePtr >=3D add) { + assert(add < align); + assert(((size_t)aligned & mask) =3D=3D 0); + *workspaceSizePtr -=3D add; + return aligned; + } else { + *workspaceSizePtr =3D 0; + return NULL; + } +} + + /* HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. 
@@ -72,7 +94,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSi= ze, const void* weightT =20 unsigned maxSymbolValue =3D HUF_TABLELOG_MAX; U32 tableLog =3D MAX_FSE_TABLELOG_FOR_HUFF_HEADER; - HUF_CompressWeightsWksp* wksp =3D (HUF_CompressWeightsWksp*)workspace; + HUF_CompressWeightsWksp* wksp =3D (HUF_CompressWeightsWksp*)HUF_alignU= pWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); =20 if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENE= RIC); =20 @@ -103,6 +125,40 @@ static size_t HUF_compressWeights(void* dst, size_t ds= tSize, const void* weightT return (size_t)(op-ostart); } =20 +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <=3D HUF_TABLELOG_ABSOLUTEMAX); + *elt =3D nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits =3D HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) =3D=3D 0); + *elt |=3D value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} =20 typedef struct { HUF_CompressWeightsWksp wksp; @@ -114,9 +170,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSi= ze, const HUF_CElt* CTable, unsigned maxSymbolValu= e, unsigned huffLog, void* workspace, size_t workspaceSize) { + HUF_CElt const* const ct =3D CTable + 1; BYTE* op =3D (BYTE*)dst; U32 n; - HUF_WriteCTableWksp* wksp =3D (HUF_WriteCTableWksp*)workspace; + HUF_WriteCTableWksp* wksp =3D (HUF_WriteCTableWksp*)HUF_alignUpWorkspa= ce(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); =20 /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); @@ -127,9 +184,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSi= ze, for (n=3D1; nbitsToWeight[n] =3D 
(BYTE)(huffLog + 1 - n); for (n=3D0; nhuffWeight[n] =3D wksp->bitsToWeight[CTable[n].nbBits]; + wksp->huffWeight[n] =3D wksp->bitsToWeight[HUF_getNbBits(ct[n])]; =20 /* attempt weights compression by FSE */ + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huf= fWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed = */ op[0] =3D (BYTE)hSize; @@ -163,6 +221,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxS= ymbolValuePtr, const void U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for value= s from 0 to 16 */ U32 tableLog =3D 0; U32 nbSymbols =3D 0; + HUF_CElt* const ct =3D CTable + 1; =20 /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, r= ankVal, &nbSymbols, &tableLog, src, srcSize)); @@ -172,6 +231,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxS= ymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooS= mall); =20 + CTable[0] =3D tableLog; + /* Prepare base value per rank */ { U32 n, nextRankStart =3D 0; for (n=3D1; n<=3DtableLog; n++) { @@ -183,13 +244,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* ma= xSymbolValuePtr, const void /* fill nbBits */ { U32 n; for (n=3D0; nn= =3DtableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] =3D {0}; - { U32 n; for (n=3D0; n>=3D 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=3D0; n huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int = const high) { + int i; + int const size =3D high-low+1; + huffNode +=3D low; + for (i =3D 1; i < size; ++i) { + nodeElt const key =3D huffNode[i]; + int j =3D i - 1; + while (j >=3D 0 && huffNode[j].count < key.count) { + 
huffNode[j + 1] =3D huffNode[j]; + j--; + } + huffNode[j + 1] =3D key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const = high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot =3D arr[high].count; + int i =3D low - 1; + int j =3D low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold =3D 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx =3D HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low =3D idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high =3D idx - 1; + } + } +} + /* * HUF_sort(): * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing or= der. + * This is a typical bucket sorting strategy that uses either quicksort or= insertion sort to sort each bucket. * * @param[out] huffNode Sorted symbols by decreasing count. Only mem= bers `.count` and `.byte` are filled. * Must have (maxSymbolValue + 1) entries. @@ -387,44 +544,52 @@ typedef struct { * @param[in] maxSymbolValue Maximum symbol value. * @param rankPosition This is a scratch workspace. Must have RANK_= POSITION_TABLE_SIZE entries. 
*/ -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymb= olValue, rankPos* rankPosition) -{ - int n; - int const maxSymbolValue1 =3D (int)maxSymbolValue + 1; +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const= maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 =3D maxSymbolValue+1; =20 /* Compute base and set curr to base. - * For symbol s let lowerRank =3D BIT_highbit32(count[n]+1) and rank = =3D lowerRank + 1. - * Then 2^lowerRank <=3D count[n]+1 <=3D 2^rank. + * For symbol s let lowerRank =3D HUF_getIndex(count[n]) and rank =3D = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. * We attribute each symbol to lowerRank's base value, because we want= to know where * each rank begins in the output, so for rank R we want to count rank= s R+1 and above. */ ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TAB= LE_SIZE); for (n =3D 0; n < maxSymbolValue1; ++n) { - U32 lowerRank =3D BIT_highbit32(count[n] + 1); + U32 lowerRank =3D HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); rankPosition[lowerRank].base++; } + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base =3D=3D 0); + /* Set up the rankPosition table */ for (n =3D RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { rankPosition[n-1].base +=3D rankPosition[n].base; rankPosition[n-1].curr =3D rankPosition[n-1].base; } - /* Sort */ + + /* Insert each symbol into their appropriate bucket, setting up rankPo= sition table. */ for (n =3D 0; n < maxSymbolValue1; ++n) { U32 const c =3D count[n]; - U32 const r =3D BIT_highbit32(c+1) + 1; - U32 pos =3D rankPosition[r].curr++; - /* Insert into the correct position in the rank. - * We have at most 256 symbols, so this insertion should be fine. 
- */ - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)= ) { - huffNode[pos] =3D huffNode[pos-1]; - pos--; - } + U32 const r =3D HUF_getIndex(c) + 1; + U32 const pos =3D rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count =3D c; huffNode[pos].byte =3D (BYTE)n; } -} =20 + /* Sort each bucket. */ + for (n =3D RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABL= E_SIZE - 1; ++n) { + U32 const bucketSize =3D rankPosition[n].curr-rankPosition[n].base; + U32 const bucketStartIdx =3D rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1= ); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} =20 /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buff= er. @@ -487,6 +652,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymb= olValue) */ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffN= ode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { + HUF_CElt* const ct =3D CTable + 1; /* fill result into ctable (val, nbBits) */ int n; U16 nbPerRank[HUF_TABLELOG_MAX+1] =3D {0}; @@ -502,20 +668,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable,= nodeElt const* huffNode, i min >>=3D 1; } } for (n=3D0; nhuffNodeTbl; nodeElt* const huffNode =3D huffNode0+1; int nonNullRank; =20 /* safety checks */ - if (((size_t)workSpace & 3) !=3D 0) return ERROR(GENERIC); /* must be= aligned on 4-bytes boundaries */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return ERROR(workSpace_tooSmall); if (maxNbBits =3D=3D 0) maxNbBits =3D HUF_TABLELOG_DEFAULT; @@ -533,99 +699,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const u= nsigned* count, U32 maxSymbo maxNbBits =3D HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fi= t into table */ =20 - 
HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, m= axNbBits); + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue,= maxNbBits); =20 return maxNbBits; } =20 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* = count, unsigned maxSymbolValue) { + HUF_CElt const* ct =3D CTable + 1; size_t nbBits =3D 0; int s; for (s =3D 0; s <=3D (int)maxSymbolValue; ++s) { - nbBits +=3D CTable[s].nbBits * count[s]; + nbBits +=3D HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } =20 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsi= gned maxSymbolValue) { + HUF_CElt const* ct =3D CTable + 1; int bad =3D 0; int s; for (s =3D 0; s <=3D (int)maxSymbolValue; ++s) { - bad |=3D (count[s] !=3D 0) & (CTable[s].nbBits =3D=3D 0); + bad |=3D (count[s] !=3D 0) & (HUF_getNbBits(ct[s]) =3D=3D 0); } return !bad; } =20 size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } =20 +/* HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) =3D nbBits + * - Bits [4, 64 - nbBits) =3D 0 + * - Bits [64 - nbBits, 64) =3D value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/*! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. 
+ */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr =3D (BYTE*)startPtr; + bitC->ptr =3D bitC->startPtr; + bitC->endPtr =3D bitC->startPtr + dstCapacity - sizeof(bitC->bitContai= ner[0]); + if (dstCapacity <=3D sizeof(bitC->bitContainer[0])) return ERROR(dstSi= ze_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guarante= ed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast = is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, = int idx, int kFast) +{ + assert(idx <=3D 1); + assert(HUF_getNbBits(elt) <=3D HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>=3D HUF_getNbBits(elt); + bitC->bitContainer[idx] |=3D kFast ? HUF_getValueFast(elt) : HUF_getVa= lue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] +=3D HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <=3D HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >=3D 1 + { + size_t const nbBits =3D HUF_getNbBits(elt); + size_t const dirtyBits =3D nbBits =3D=3D 0 ? 
0 : BIT_highbit32((U3= 2)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) =3D=3D 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <=3D HUF_BITS_IN_CONTA= INER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] =3D 0; + bitC->bitPos[1] =3D 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>=3D (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |=3D bitC->bitContainer[1]; + bitC->bitPos[0] +=3D bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <=3D HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits =3D bitC->bitPos[0] & 0xFF; + size_t const nbBytes =3D nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer =3D bitC->bitContainer[0] >> (HUF_BITS_IN_CO= NTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. 
*/ + bitC->bitPos[0] &=3D 7; + assert(nbBits > 0); + assert(nbBits <=3D sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <=3D bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr +=3D nbBytes; + assert(!kFast || bitC->ptr <=3D bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr =3D bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value =3D 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits =3D bitC->bitPos[0] & 0xFF; + if (bitC->ptr >=3D bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTabl= e) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTabl= e, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } =20 -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastF= ast) +{ + /* Join to kUnroll */ + int n =3D (int)srcSize; + int rem =3D n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + 
assert(n % kUnroll =3D=3D 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -=3D kUnroll; + } + assert(n % (2 * kUnroll) =3D=3D 0); + + for (; n>0; n-=3D 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ = 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast *= / kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /= * fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1,= /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n =3D=3D 0); + +} =20 -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSH= BITS(stream) +/* + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. 
+ */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} =20 -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSH= BITS(stream) =20 FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog =3D (U32)CTable[0]; + HUF_CElt const* ct =3D CTable + 1; const BYTE* ip =3D (const BYTE*) src; BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstSize; BYTE* op =3D ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; =20 /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr =3D BIT_initCStream(&bitC, op, (size_t)(oend-op= )); + { size_t const initErr =3D HUF_initCStream(&bitC, op, (size_t)(oend-op= )); if (HUF_isError(initErr)) return 0; } =20 - n =3D srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3: - HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - ZSTD_FALLTHROUGH; - case 2: - HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - ZSTD_FALLTHROUGH; - case 1: - HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - ZSTD_FALLTHROUGH; - case 0: ZSTD_FALLTHROUGH; - default: break; - } - - for (; n>0; n-=3D4) { /* note : n&3=3D=3D0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); - } - - return BIT_closeCStream(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tab= leLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, = ct, /* kUnroll */ MEM_32bits() ? 
2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <=3D bitC.endPtr); + + return HUF_closeCStream(&bitC); } =20 #if DYNAMIC_BMI2 =20 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) @@ -667,9 +1068,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t= dstSize, 
=20 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTa= ble, /* bmi2 */ 0); } =20 +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, bmi2); +} =20 static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, @@ -689,8 +1094,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, =20 assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op +=3D cSize; } @@ -698,8 +1102,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, ip +=3D segmentSize; assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op +=3D cSize; } @@ -707,8 +1110,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, ip +=3D segmentSize; assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op +=3D cSize; } @@ -717,7 +1119,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, assert(op <=3D oend); assert(ip <=3D iend); { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); - if (cSize=3D=3D0) return 0; + if (cSize =3D=3D 0 || cSize > 65535) return 0; op +=3D cSize; } =20 @@ -726,7 +1128,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t= dstSize, =20 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTa= ble, /* bmi2 */ 0); +} + +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, bmi2); } =20 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -750,35 +1157,38 @@ static size_t HUF_compressCTable_internal( =20 typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; union { HUF_buildCTable_wksp_tables buildCTable_wksp; HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; } wksps; } HUF_compress_tables_t; =20 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >=3D 2 */ + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsign= ed */ + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsign= ed */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, - void* workSpace_align4, size_t wkspSize, + void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat, int pref= erRepeat, - 
const int bmi2) + const int bmi2, unsigned suspectUncompressible) { - HUF_compress_tables_t* const table =3D (HUF_compress_tables_t*)workSpa= ce_align4; + HUF_compress_tables_t* const table =3D (HUF_compress_tables_t*)HUF_ali= gnUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstSize; BYTE* op =3D ostart; =20 - HUF_STATIC_ASSERT(sizeof(*table) <=3D HUF_WORKSPACE_SIZE); - assert(((size_t)workSpace_align4 & 3) =3D=3D 0); /* must be aligned = on 4-bytes boundaries */ + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <=3D HU= F_WORKSPACE_SIZE); =20 /* checks & inits */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* cur= rent block size limit */ @@ -794,8 +1204,23 @@ HUF_compress_internal (void* dst, size_t dstSize, nbStreams, oldHufTable, bmi2); } =20 + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >=3D 2); + if (suspectUncompressible && srcSize >=3D (SUSPECT_INCOMPRESSIBLE_SAMP= LE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal =3D 0; + { unsigned maxSymbolValueBegin =3D maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxS= ymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal +=3D largestBegin; + } + { unsigned maxSymbolValueEnd =3D maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSym= bolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZ= E, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal +=3D largestEnd; + } + if (largestTotal <=3D ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >>= 7)+4) return 0; /* 
heuristic : probably not compressible enough */ + } + /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue,= (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue,= (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.his= t_wksp)) ); if (largest =3D=3D srcSize) { *ostart =3D ((const BYTE*)src)[0]; r= eturn 1; } /* single symbol, rle */ if (largest <=3D (srcSize >> 7)+4) return 0; /* heuristic : prob= ably not compressible enough */ } @@ -820,9 +1245,12 @@ HUF_compress_internal (void* dst, size_t dstSize, &table->wksps.buildCTable_wksp= , sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog =3D (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity = */ - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_= CElt))); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { + size_t const ctableSize =3D HUF_CTABLE_SIZE_ST(maxSymbolValue); + size_t const unusedSize =3D sizeof(table->CTable) - ctableSize * s= izeof(HUF_CElt); + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); } =20 /* Write table description header */ @@ -859,19 +1287,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSiz= e, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } =20 size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, + int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, 
maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); + repeat, preferRepeat, bmi2, suspectUncomp= ressible); } =20 /* HUF_compress4X_repeat(): @@ -885,21 +1314,22 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSiz= e, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } =20 /* HUF_compress4X_repeat(): * compress input using 4 streams. + * consider skipping quickly * re-use an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, preferRepeat, bmi2, sus= pectUncompressible); } =20 diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_com= press.c index a4e916008b3a..f620cafca633 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -12,7 +12,6 @@ * Dependencies ***************************************/ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ -#include "../common/cpu.h" #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ @@ -39,6 +38,18 @@ * Note that functions with explicit context such as ZSTD_compressCCtx() a= re unaffected. */ =20 +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 =3D> 1 << 17 =3D=3D 128Ki positions. 
+ * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known = here. + * The actual (selected) size of the hash table is then stored in ZSTD_mat= chState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif =20 /*-************************************* * Helper functions @@ -69,6 +80,10 @@ struct ZSTD_CDict_s { ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to sel= ect CDict params */ + ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict w= as created with params that would use + * row-based matchfinder. Unless= the cdict is reloaded, we will use + * the same greedy/lazy matchfin= der at compression time. + */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ =20 ZSTD_CCtx* ZSTD_createCCtx(void) @@ -81,7 +96,7 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem= memManager) assert(cctx !=3D NULL); ZSTD_memset(cctx, 0, sizeof(*cctx)); cctx->customMem =3D memManager; - cctx->bmi2 =3D ZSTD_cpuid_bmi2(ZSTD_cpuid()); + cctx->bmi2 =3D ZSTD_cpuSupportsBmi2(); { size_t const err =3D ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); assert(!ZSTD_isError(err)); (void)err; @@ -192,12 +207,64 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) /* private API call, for dictBuilder only */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->s= eqStore); } =20 +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >=3D ZSTD_greedy && strategy <=3D ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that w= e will use the row based matchfinder + * for this compression. 
+ */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZST= D_paramSwitch_e mode) { + assert(mode !=3D ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode =3D=3D ZSTD_ps_= enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_= e mode, + const ZSTD_compre= ssionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) + int const kHasSIMD128 =3D 1; +#else + int const kHasSIMD128 =3D 0; +#endif + if (mode !=3D ZSTD_ps_auto) return mode; /* if requested enabled, but = no SIMD, we still will use row matchfinder */ + mode =3D ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (kHasSIMD128) { + if (cParams->windowLog > 14) mode =3D ZSTD_ps_enable; + } else { + if (cParams->windowLog > 17) mode =3D ZSTD_ps_enable; + } + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/str= onger compression modes) */ +static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e= mode, + const ZSTD_compres= sionParameters* const cParams) { + if (mode !=3D ZSTD_ps_auto) return mode; + return (cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 1= 7) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTabl= e, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_paramSwitch_e useRowMatchFin= der, + const U32 forDDSDict) { + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a match= state for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are u= sing the row-based matchfinder. 
+ */ + return forDDSDict || ((strategy !=3D ZSTD_fast) && !ZSTD_rowMatchFinde= rUsed(strategy, useRowMatchFinder)); +} + /* Returns 1 if compression parameters are such that we should * enable long distance matching (wlog >=3D 27, strategy >=3D btopt). * Returns 0 otherwise. */ -static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* = const cParams) { - return cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 27; +static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const c= Params) { + if (mode !=3D ZSTD_ps_auto) return mode; + return (cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 2= 7) ? ZSTD_ps_enable : ZSTD_ps_disable; } =20 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( @@ -208,15 +275,15 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParam= s( ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams =3D cParams; =20 - if (ZSTD_CParams_shouldEnableLdm(&cParams)) { - DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into = cctx params"); - cctxParams.ldmParams.enableLdm =3D 1; - /* LDM is enabled by default for optimal parser and window size >= =3D 128MB */ + /* Adjust advanced params according to cParams */ + cctxParams.ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(cctxParams.ld= mParams.enableLdm, &cParams); + if (cctxParams.ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); assert(cctxParams.ldmParams.hashLog >=3D cctxParams.ldmParams.buck= etSizeLog); assert(cctxParams.ldmParams.hashRateLog < 32); } - + cctxParams.useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(cctxPara= ms.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxPa= rams.useRowMatchFinder, &cParams); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -275,6 +342,11 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_pa= rams* cctxParams, 
ZSTD_par * But, set it for tracing anyway. */ cctxParams->compressionLevel =3D compressionLevel; + cctxParams->useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxP= arams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(cctxPar= ams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(cctxParams->= ldmParams.enableLdm, ¶ms->cParams); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=3D%d, us= eBlockSplitter=3D%d ldm=3D%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitte= r, cctxParams->ldmParams.enableLdm); } =20 size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_pa= rameters params) @@ -431,9 +503,9 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; =20 case ZSTD_c_literalCompressionMode: - ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_hu= ffman < ZSTD_lcm_uncompressed); - bounds.lowerBound =3D ZSTD_lcm_auto; - bounds.upperBound =3D ZSTD_lcm_uncompressed; + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable= < ZSTD_ps_disable); + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; return bounds; =20 case ZSTD_c_targetCBlockSize: @@ -462,6 +534,21 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter para= m) bounds.upperBound =3D 1; return bounds; =20 + case ZSTD_c_useBlockSplitter: + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound =3D 0; + bounds.upperBound =3D 1; + return bounds; + default: bounds.error =3D ERROR(parameter_unsupported); return bounds; @@ -523,6 +610,9 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter para= m) case ZSTD_c_stableOutBuffer: case 
ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: default: return 0; } @@ -575,6 +665,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cPa= rameter param, int value) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: break; =20 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -672,7 +765,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* C= CtxParams, } =20 case ZSTD_c_literalCompressionMode : { - const ZSTD_literalCompressionMode_e lcm =3D (ZSTD_literalCompressi= onMode_e)value; + const ZSTD_paramSwitch_e lcm =3D (ZSTD_paramSwitch_e)value; BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); CCtxParams->literalCompressionMode =3D lcm; return CCtxParams->literalCompressionMode; @@ -699,7 +792,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* C= CtxParams, return CCtxParams->enableDedicatedDictSearch; =20 case ZSTD_c_enableLongDistanceMatching : - CCtxParams->ldmParams.enableLdm =3D (value!=3D0); + CCtxParams->ldmParams.enableLdm =3D (ZSTD_paramSwitch_e)value; return CCtxParams->ldmParams.enableLdm; =20 case ZSTD_c_ldmHashLog : @@ -758,6 +851,21 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* = CCtxParams, CCtxParams->validateSequences =3D value; return CCtxParams->validateSequences; =20 + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); + CCtxParams->useBlockSplitter =3D (ZSTD_paramSwitch_e)value; + return CCtxParams->useBlockSplitter; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder =3D (ZSTD_paramSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix 
=3D !!value; + return CCtxParams->deterministicRefPrefix; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -863,6 +971,15 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_validateSequences : *value =3D (int)CCtxParams->validateSequences; break; + case ZSTD_c_useBlockSplitter : + *value =3D (int)CCtxParams->useBlockSplitter; + break; + case ZSTD_c_useRowMatchFinder : + *value =3D (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value =3D (int)CCtxParams->deterministicRefPrefix; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -889,7 +1006,7 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } =20 -ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned l= ong long pledgedSrcSize) +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long ple= dgedSrcSize) { DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrc= Size); RETURN_ERROR_IF(cctx->streamStage !=3D zcss_init, stage_wrong, @@ -969,14 +1086,14 @@ size_t ZSTD_CCtx_loadDictionary_advanced( return 0; } =20 -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( +size_t ZSTD_CCtx_loadDictionary_byReference( ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); } =20 -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* d= ict, size_t dictSize) +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t = dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); @@ -1146,7 +1263,7 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameter= s cPar, break; case ZSTD_cpm_createCDict: /* Assume a small source size when creating a dictionary - * with an unkown source size. + * with an unknown source size. 
*/ if (dictSize && srcSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN) srcSize =3D minSrcSize; @@ -1220,7 +1337,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxPar= ams( srcSizeHint =3D CCtxParams->srcSizeHint; } cParams =3D ZSTD_getCParams_internal(CCtxParams->compressionLevel, src= SizeHint, dictSize, mode); - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog =3D ZSTD_LDM_DE= FAULT_WINDOW_LOG; + if (CCtxParams->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) cParams.win= dowLog =3D ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint =3D=3D 0 means 0 */ @@ -1229,9 +1346,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxPa= rams( =20 static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, + const U32 enableDedicatedDictSearch, const U32 forCCtx) { - size_t const chainSize =3D (cParams->strategy =3D=3D ZSTD_fast) ? 0 : = ((size_t)1 << cParams->chainLog); + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize =3D ZSTD_allocateChainTable(cParams->strategy, = useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize =3D ((size_t)1) << cParams->hashLog; U32 const hashLog3 =3D (forCCtx && cParams->minMatch=3D=3D3) ? MIN(= ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size =3D hashLog3 ? ((size_t)1) << hashLog3 : 0; @@ -1241,43 +1363,53 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParame= ters* const cParams, + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace =3D - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((1<= strategy, useRowMatchFinder) + ? 
ZSTD_cwksp_aligned_alloc_siz= e(hSize*sizeof(U16)) + : 0; size_t const optSpace =3D (forCCtx && (cParams->strategy >=3D ZSTD_bto= pt)) ? optPotentialSpace : 0; + size_t const slackSpace =3D ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 u= int32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >=3D 4 && ZSTD_WINDOWLOG_MIN >=3D = 4 && ZSTD_CHAINLOG_MIN >=3D 4); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); - return tableSpace + optSpace; + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } =20 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, const int isStatic, + const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, const U64 pledgedSrcSize) { - size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << cParams->win= dowLog), pledgedSrcSize)); + size_t const windowSize =3D (size_t) BOUNDED(1ULL, 1ULL << cParams->wi= ndowLog, pledgedSrcSize); size_t const blockSize =3D MIN(ZSTD_BLOCKSIZE_MAX, windowSize); U32 const divider =3D (cParams->minMatch=3D=3D3) ? 
3 : 4; size_t const maxNbSeq =3D blockSize / divider; size_t const tokenSpace =3D ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH = + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqD= ef)) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * siz= eof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(= BYTE)); size_t const entropySpace =3D ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_= SIZE); size_t const blockStateSpace =3D 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD= _compressedBlockState_t)); - size_t const matchStateSize =3D ZSTD_sizeof_matchState(cParams, /* for= CCtx */ 1); + size_t const matchStateSize =3D ZSTD_sizeof_matchState(cParams, useRow= MatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); =20 size_t const ldmSpace =3D ZSTD_ldm_getTableSize(*ldmParams); size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSiz= e); - size_t const ldmSeqSpace =3D ldmParams->enableLdm ? - ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + size_t const ldmSeqSpace =3D ldmParams->enableLdm =3D=3D ZSTD_ps_enabl= e ? + ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; =20 =20 size_t const bufferSpace =3D ZSTD_cwksp_alloc_size(buffInSize) @@ -1303,19 +1435,32 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const = ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams =3D ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNK= NOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatchFin= derMode(params->useRowMatchFinder, + = &cParams); =20 RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is= supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. 
*/ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CON= TENTSIZE_UNKNOWN); } =20 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cPara= ms) { - ZSTD_CCtx_params const params =3D ZSTD_makeCCtxParamsFromCParams(cPara= ms); - return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams =3D ZSTD_makeCCtxParamsFromCParams(cPar= ams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for gr= eedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder =3D ZSTD_ps_disable; + noRowCCtxSize =3D ZSTD_estimateCCtxSize_usingCCtxParams(&initialPa= rams); + initialParams.useRowMatchFinder =3D ZSTD_ps_enable; + rowCCtxSize =3D ZSTD_estimateCCtxSize_usingCCtxParams(&initialPara= ms); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } } =20 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) @@ -1355,17 +1500,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(con= st ZSTD_CCtx_params* params) size_t const outBuffSize =3D (params->outBufferMode =3D=3D ZSTD_bm= _buffered) ? 
ZSTD_compressBound(blockSize) + 1 : 0; + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatc= hFinderMode(params->useRowMatchFinder, ¶ms->cParams); =20 return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, inBuffSize, outBuffSize, + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize= , outBuffSize, ZSTD_CONTENTSIZE_UNKNOWN); } } =20 size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cP= arams) { - ZSTD_CCtx_params const params =3D ZSTD_makeCCtxParamsFromCParams(cPara= ms); - return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams =3D ZSTD_makeCCtxParamsFromCParams(cPar= ams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for gr= eedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder =3D ZSTD_ps_disable; + noRowCCtxSize =3D ZSTD_estimateCStreamSize_usingCCtxParams(&initia= lParams); + initialParams.useRowMatchFinder =3D ZSTD_ps_enable; + rowCCtxSize =3D ZSTD_estimateCStreamSize_usingCCtxParams(&initialP= arams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } } =20 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) @@ -1480,20 +1637,27 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; =20 + static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { - size_t const chainSize =3D (cParams->strategy =3D=3D ZSTD_fast) ? 
0 : = ((size_t)1 << cParams->chainLog); + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize =3D ZSTD_allocateChainTable(cParams->strategy, = useRowMatchFinder, + ms->dedicatedDictSear= ch && (forWho =3D=3D ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize =3D ((size_t)1) << cParams->hashLog; U32 const hashLog3 =3D ((forWho =3D=3D ZSTD_resetTarget_CCtx) && cP= arams->minMatch=3D=3D3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size =3D hashLog3 ? ((size_t)1) << hashLog3 : 0; =20 DEBUGLOG(4, "reset indices : %u", forceResetIndex =3D=3D ZSTDirp_reset= ); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); if (forceResetIndex =3D=3D ZSTDirp_reset) { ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); @@ -1532,11 +1696,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable =3D (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned= (ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } =20 + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + { /* Row match finder needs an additional table of hashes ("tags= ") */ + size_t const tagTableSize =3D hSize*sizeof(U16); + ms->tagTable =3D (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTable= Size); + if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog =3D BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >=3D rowLog); + ms->rowHashLog =3D cParams->hashLog - rowLog; + } + } + ms->cParams =3D *cParams; =20 RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchStat= e"); - return 0; } =20 @@ -1553,61 +1729,87 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOV= ERFLOW_MARGIN); } =20 +/* ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they 
can't be load= ed in + * one go generically. So we ensure that in that case we reset the tables = to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + /*! ZSTD_resetCCtx_internal() : - note : `params` are assumed fully validated at this stage */ + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, + ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, + size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws =3D &zc->workspace; - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=3D%u, wlog=3D%u", - (U32)pledgedSrcSize, params.cParams.windowLog); - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=3D%u, wlog=3D%u, = useRowMatchFinder=3D%d useBlockSplitter=3D%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)param= s->useRowMatchFinder, (int)params->useBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); =20 zc->isFirstBlock =3D 1; =20 - if (params.ldmParams.enableLdm) { + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. 
+ */ + zc->appliedParams =3D *params; + params =3D &zc->appliedParams; + + assert(params->useRowMatchFinder !=3D ZSTD_ps_auto); + assert(params->useBlockSplitter !=3D ZSTD_ps_auto); + assert(params->ldmParams.enableLdm !=3D ZSTD_ps_auto); + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); - assert(params.ldmParams.hashLog >=3D params.ldmParams.bucketSizeLo= g); - assert(params.ldmParams.hashRateLog < 32); + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->c= Params); + assert(params->ldmParams.hashLog >=3D params->ldmParams.bucketSize= Log); + assert(params->ldmParams.hashRateLog < 32); } =20 - { size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << params.c= Params.windowLog), pledgedSrcSize)); + { size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << params->= cParams.windowLog), pledgedSrcSize)); size_t const blockSize =3D MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider =3D (params.cParams.minMatch=3D=3D3) ? 3 : 4; + U32 const divider =3D (params->cParams.minMatch=3D=3D3) ? 3 : 4; size_t const maxNbSeq =3D blockSize / divider; - size_t const buffOutSize =3D (zbuff =3D=3D ZSTDb_buffered && param= s.outBufferMode =3D=3D ZSTD_bm_buffered) + size_t const buffOutSize =3D (zbuff =3D=3D ZSTDb_buffered && param= s->outBufferMode =3D=3D ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; - size_t const buffInSize =3D (zbuff =3D=3D ZSTDb_buffered && params= .inBufferMode =3D=3D ZSTD_bm_buffered) + size_t const buffInSize =3D (zbuff =3D=3D ZSTDb_buffered && params= ->inBufferMode =3D=3D ZSTD_bm_buffered) ? 
windowSize + blockSize : 0; - size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(params.ldmParams= , blockSize); + size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(params->ldmParam= s, blockSize); =20 int const indexTooClose =3D ZSTD_indexTooCloseToMax(zc->blockState= .matchState.window); + int const dictTooBig =3D ZSTD_dictTooBig(loadedDictSize); ZSTD_indexResetPolicy_e needsIndexReset =3D - (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDi= rp_reset; + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_re= set : ZSTDirp_continue; =20 size_t const neededSpace =3D ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ¶ms.cParams, ¶ms.ldmParams, zc->staticSize !=3D 0, + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize !=3D = 0, params->useRowMatchFinder, buffInSize, buffOutSize, pledgedSrcSize); + int resizeWorkspace; + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); =20 if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); =20 - /* Check if workspace is large enough, alloc a new one if needed */ - { + { /* Check if workspace is large enough, alloc a new one if need= ed */ int const workspaceTooSmall =3D ZSTD_cwksp_sizeof(ws) < needed= Space; int const workspaceWasteful =3D ZSTD_cwksp_check_wasteful(ws, = neededSpace); - + resizeWorkspace =3D workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, bl= ockSize); =20 - if (workspaceTooSmall || workspaceWasteful) { + if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); @@ -1629,14 +1831,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->blockState.nextCBlock =3D (ZSTD_compressedBlockState_t= *) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock =3D=3D NULL, mem= ory_allocation, "couldn't allocate nextCBlock"); zc->entropyWorkspace =3D (U32*) 
ZSTD_cwksp_reserve_object(= ws, ENTROPY_WORKSPACE_SIZE); - RETURN_ERROR_IF(zc->blockState.nextCBlock =3D=3D NULL, mem= ory_allocation, "couldn't allocate entropyWorkspace"); + RETURN_ERROR_IF(zc->entropyWorkspace =3D=3D NULL, memory_a= llocation, "couldn't allocate entropyWorkspace"); } } =20 ZSTD_cwksp_clear(ws); =20 /* init params */ - zc->appliedParams =3D params; - zc->blockState.matchState.cParams =3D params.cParams; + zc->blockState.matchState.cParams =3D params->cParams; zc->pledgedSrcSizePlusOne =3D pledgedSrcSize+1; zc->consumedSrcSize =3D 0; zc->producedCSize =3D 0; @@ -1667,11 +1868,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->outBuff =3D (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); =20 /* ldm bucketOffsets table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* TODO: avoid memset? */ size_t const numBuckets =3D - ((size_t)1) << (params.ldmParams.hashLog - - params.ldmParams.bucketSizeLog); + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); zc->ldmState.bucketOffsets =3D ZSTD_cwksp_reserve_buffer(ws, n= umBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } @@ -1687,32 +1888,28 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, FORWARD_IF_ERROR(ZSTD_reset_matchState( &zc->blockState.matchState, ws, - ¶ms.cParams, + ¶ms->cParams, + params->useRowMatchFinder, crp, needsIndexReset, ZSTD_resetTarget_CCtx), ""); =20 /* ldm hash table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* TODO: avoid memset? 
*/ - size_t const ldmHSize =3D ((size_t)1) << params.ldmParams.hash= Log; + size_t const ldmHSize =3D ((size_t)1) << params->ldmParams.has= hLog; zc->ldmState.hashTable =3D (ldmEntry_t*)ZSTD_cwksp_reserve_ali= gned(ws, ldmHSize * sizeof(ldmEntry_t)); ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEn= try_t)); zc->ldmSequences =3D (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, m= axNbLdmSeq * sizeof(rawSeq)); zc->maxNbLdmSequences =3D maxNbLdmSeq; =20 ZSTD_window_init(&zc->ldmState.window); - ZSTD_window_clear(&zc->ldmState.window); zc->ldmState.loadedDictEnd =3D 0; } =20 - /* Due to alignment, when reusing a workspace, we can actually con= sume - * up to 3 extra bytes for alignment. See the comments in zstd_cwk= sp.h - */ - assert(ZSTD_cwksp_used(ws) >=3D neededSpace && - ZSTD_cwksp_used(ws) <=3D neededSpace + 3); - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available= ", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, r= esizeWorkspace)); + zc->initialized =3D 1; =20 return 0; @@ -1768,6 +1965,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=3D%llu", + (unsigned long long)pledgedSrcSize); { ZSTD_compressionParameters adjusted_cdict_cParams =3D cdict->match= State.cParams; unsigned const windowLog =3D params.cParams.windowLog; @@ -1783,7 +1982,9 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, params.cParams =3D ZSTD_adjustCParams_internal(adjusted_cdict_cPar= ams, pledgedSrcSize, cdict->dictContentSiz= e, ZSTD_cpm_attachDict); params.cParams.windowLog =3D windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcS= ize, + params.useRowMatchFinder =3D cdict->useRowMatchFinder; /* cdict= overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrc= Size, + /* loadedDictSize */ 0, ZSTDcrp_makeClean, zbuff)= , ""); 
assert(cctx->appliedParams.cParams.strategy =3D=3D adjusted_cdict_= cParams.strategy); } @@ -1827,15 +2028,17 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CC= tx* cctx, const ZSTD_compressionParameters *cdict_cParams =3D &cdict->matchState= .cParams; =20 assert(!cdict->matchState.dedicatedDictSearch); - - DEBUGLOG(4, "copying dictionary into context"); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=3D%llu", + (unsigned long long)pledgedSrcSize); =20 { unsigned const windowLog =3D params.cParams.windowLog; assert(windowLog !=3D 0); /* Copy only compression parameters related to tables. */ params.cParams =3D *cdict_cParams; params.cParams.windowLog =3D windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcS= ize, + params.useRowMatchFinder =3D cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrc= Size, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff= ), ""); assert(cctx->appliedParams.cParams.strategy =3D=3D cdict_cParams->= strategy); assert(cctx->appliedParams.cParams.hashLog =3D=3D cdict_cParams->h= ashLog); @@ -1843,17 +2046,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CC= tx* cctx, } =20 ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder !=3D ZSTD_ps_auto); =20 /* copy tables */ - { size_t const chainSize =3D (cdict_cParams->strategy =3D=3D ZSTD_fa= st) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + { size_t const chainSize =3D ZSTD_allocateChainTable(cdict_cParams->= strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? 
((size_t)1 <= < cdict_cParams->chainLog) + : 0; size_t const hSize =3D (size_t)1 << cdict_cParams->hashLog; =20 ZSTD_memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, hSize * sizeof(U32)); - ZSTD_memcpy(cctx->blockState.matchState.chainTable, + /* Do not copy cdict's chainTable if cctx has parameters such that= it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, = cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_memcpy(cctx->blockState.matchState.chainTable, cdict->matchState.chainTable, chainSize * sizeof(U32)); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRow= MatchFinder)) { + size_t const tagTableSize =3D hSize*sizeof(U16); + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + } } =20 /* Zero the hashTable3, since the cdict never fills it */ @@ -1917,16 +2133,22 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dst= CCtx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - DEBUGLOG(5, "ZSTD_copyCCtx_internal"); RETURN_ERROR_IF(srcCCtx->stage!=3DZSTDcs_init, stage_wrong, "Can't copy a ctx that's not in init stage."); - + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_cust= omMem)); { ZSTD_CCtx_params params =3D dstCCtx->requestedParams; /* Copy only compression parameters related to tables. 
*/ params.cParams =3D srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder !=3D ZSTD_ps_auto); + assert(srcCCtx->appliedParams.useBlockSplitter !=3D ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm !=3D ZSTD_ps_aut= o); + params.useRowMatchFinder =3D srcCCtx->appliedParams.useRowMatchFin= der; + params.useBlockSplitter =3D srcCCtx->appliedParams.useBlockSplitte= r; + params.ldmParams =3D srcCCtx->appliedParams.ldmParams; params.fParams =3D fParams; - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog =3D=3D srcCCtx->ap= pliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy =3D=3D srcCCtx->app= liedParams.cParams.strategy); @@ -1938,7 +2160,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstC= Ctx, ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); =20 /* copy tables */ - { size_t const chainSize =3D (srcCCtx->appliedParams.cParams.strateg= y =3D=3D ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chai= nLog); + { size_t const chainSize =3D ZSTD_allocateChainTable(srcCCtx->applie= dParams.cParams.strategy, + srcCCtx->appliedP= arams.useRowMatchFinder, + 0 /* forDDSDict *= /) + ? ((size_t)1 << srcCCtx->appliedParams= .cParams.chainLog) + : 0; size_t const hSize =3D (size_t)1 << srcCCtx->appliedParams.cParam= s.hashLog; int const h3log =3D srcCCtx->blockState.matchState.hashLog3; size_t const h3Size =3D h3log ? ((size_t)1 << h3log) : 0; @@ -2005,6 +2231,8 @@ ZSTD_reduceTable_internal (U32* const table, U32 cons= t size, U32 const reducerVa int const nbRows =3D (int)size / ZSTD_ROWSIZE; int cellNb =3D 0; int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. 
*/ + U32 const reducerThreshold =3D reducerValue + ZSTD_WINDOW_START_INDEX; assert((size & (ZSTD_ROWSIZE-1)) =3D=3D 0); /* multiple of ZSTD_ROWSI= ZE */ assert(size < (1U<<31)); /* can be casted to int */ =20 @@ -2012,12 +2240,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 co= nst size, U32 const reducerVa for (rowNb=3D0 ; rowNb < nbRows ; rowNb++) { int column; for (column=3D0; columnhashTable, hSize, reducerValue); } =20 - if (params->cParams.strategy !=3D ZSTD_fast) { + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMa= tchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize =3D (U32)1 << params->cParams.chainLog; if (params->cParams.strategy =3D=3D ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerVal= ue); @@ -2072,14 +2305,14 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) assert(nbSeq <=3D seqStorePtr->maxNbSeq); for (u=3D0; ulongLengthID=3D=3D1) + if (seqStorePtr->longLengthType=3D=3DZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] =3D MaxLL; - if (seqStorePtr->longLengthID=3D=3D2) + if (seqStorePtr->longLengthType=3D=3DZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] =3D MaxML; } =20 @@ -2093,10 +2326,161 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCt= x_params* cctxParams) return (cctxParams->targetCBlockSize !=3D 0); } =20 -/* ZSTD_entropyCompressSequences_internal(): - * actually compresses both literals and sequences */ +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to i= mprove compression ratio. + * At the time this function is called, the parameter must be finalized. + * Returns 1 if true, 0 otherwise. 
*/ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=3D%d)", cctxP= arams->useBlockSplitter); + assert(cctxParams->useBlockSplitter !=3D ZSTD_ps_auto); + return (cctxParams->useBlockSplitter =3D=3D ZSTD_ps_enable); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized sym= bol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZST= D_entropyCompressSeqStore_internal() */ +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `= size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. + * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxS= eq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTab= les_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorks= pace, + void* entropyWorkspace, size_t entropyWkspSi= ze) { + BYTE* const ostart =3D dst; + const BYTE* const oend =3D dstEnd; + BYTE* op =3D ostart; + FSE_CTable* CTable_LitLength =3D nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits =3D nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength =3D nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; + const BYTE* const llCodeTable =3D seqStorePtr->llCode; + const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize =3D 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + assert(op <=3D oend); + assert(nbSeq !=3D 0); /* 
ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max =3D MaxLL; + size_t const mostFrequent =3D HIST_countFast_wksp(countWorkspace, = &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fa= il */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode =3D prevEntropy->litlength_repea= tMode; + stats.LLtype =3D ZSTD_selectEncodingType(&nextEntropy->litlength_r= epeatMode, + countWorkspace, max, mostFrequent,= nbSeq, + LLFSELog, prevEntropy->litlengthCT= able, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_r= epeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LL= type, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size =3D countSize; + return stats; + } + if (stats.LLtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + /* build CTable for Offsets */ + { unsigned max =3D MaxOff; + size_t const mostFrequent =3D HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, en= tropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode =3D prevEntropy->offcode_repeatMod= e; + stats.Offtype =3D ZSTD_selectEncodingType(&nextEntropy->offcode_re= peatMode, + countWorkspace, max, mostFrequent,= nbSeq, + OffFSELog, prevEntropy->offcodeCTa= ble, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_re= peatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.= Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size =3D countSize; + return stats; + } + if (stats.Offtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + /* build CTable for MatchLengths */ + { unsigned max =3D MaxML; + size_t const mostFrequent =3D HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, en= tropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); + nextEntropy->matchlength_repeatMode =3D prevEntropy->matchlength_r= epeatMode; + stats.MLtype =3D ZSTD_selectEncodingType(&nextEntropy->matchlength= _repeatMode, + countWorkspace, max, mostFrequent,= nbSeq, + MLFSELog, prevEntropy->matchlength= CTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength= _repeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, 
(symbolEncodingType_e)stats.= MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size =3D countSize; + return stats; + } + if (stats.MLtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + stats.size =3D (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. + */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t -ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2110,36 +2494,38 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* = seqStorePtr, FSE_CTable* CTable_LitLength =3D nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits =3D nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength =3D nextEntropy->fse.matchlengthCTable; - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences =3D seqStorePtr->sequencesStart; + const size_t nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; const BYTE* const llCodeTable =3D seqStorePtr->llCode; const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstCapacity; BYTE* op =3D ostart; - size_t const nbSeq =3D (size_t)(seqStorePtr->sequences - seqStorePtr->= sequencesStart); - BYTE* seqHead; - BYTE* lastNCount =3D NULL; + size_t lastCountSize; =20 entropyWorkspace =3D count + (MaxSeq + 1); 
entropyWkspSize -=3D (MaxSeq + 1) * sizeof(*count); =20 - DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=3D%zu)", nb= Seq); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=3D%zu)", nbS= eq); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >=3D (1<=3D HUF_WORKSPACE_SIZE); =20 /* Compress literals */ { const BYTE* const literals =3D seqStorePtr->litStart; + size_t const numSequences =3D seqStorePtr->sequences - seqStorePtr= ->sequencesStart; + size_t const numLiterals =3D seqStorePtr->lit - seqStorePtr->litSt= art; + /* Base suspicion of uncompressibility on ratio of literals to seq= uences */ + unsigned const suspectUncompressible =3D (numSequences =3D=3D 0) |= | (numLiterals / numSequences >=3D SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const litSize =3D (size_t)(seqStorePtr->lit - literals); size_t const cSize =3D ZSTD_compressLiterals( &prevEntropy->huf, &nextEntropy->huf, cctxParams->cParams.strategy, - ZSTD_disableLiteralsCompression(cctxPa= rams), + ZSTD_literalsCompressionIsDisabled(cct= xParams), op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, - bmi2); + bmi2, suspectUncompressible); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <=3D dstCapacity); op +=3D cSize; @@ -2165,95 +2551,20 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* = seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntro= py->fse)); return (size_t)(op - ostart); } - - /* seqHead : flags for FSE encoding type */ - seqHead =3D op++; - assert(op <=3D oend); - - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { unsigned max =3D MaxLL; - size_t const mostFrequent =3D HIST_countFast_wksp(count, &max, llC= odeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->fse.litlength_repeatMode =3D prevEntropy->fse.litleng= th_repeatMode; - LLtype =3D 
ZSTD_selectEncodingType(&nextEntropy->fse.litlength_rep= eatMode, - count, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->fse.litleng= thCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_rep= eatMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, - LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->fse.litlengthCTable, - sizeof(prevEntropy->fse.litlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens fail= ed"); - if (LLtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - /* build CTable for Offsets */ - { unsigned max =3D MaxOff; - size_t const mostFrequent =3D HIST_countFast_wksp( - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWksp= Size); /* can't fail */ - /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->fse.offcode_repeatMode =3D prevEntropy->fse.offcode_r= epeatMode; - Offtype =3D ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repe= atMode, - count, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->fse.offcod= eCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repe= atMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtyp= e, - count, max, ofCodeTable, nbSeq, - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->fse.offcodeCTable, - sizeof(prevEntropy->fse.offcodeCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets fail= ed"); - if (Offtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - /* build CTable for MatchLengths */ - { unsigned max =3D MaxML; - size_t const mostFrequent =3D HIST_countFast_wksp( - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWksp= Size); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); - nextEntropy->fse.matchlength_repeatMode =3D prevEntropy->fse.match= length_repeatMode; - MLtype =3D ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_r= epeatMode, - count, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->fse.matchle= ngthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_r= epeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, - ML_defaultNorm, ML_defaultNormLog, MaxML, - 
prevEntropy->fse.matchlengthCTable, - sizeof(prevEntropy->fse.matchlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths= failed"); - if (MLtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - - *seqHead =3D (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + { + ZSTD_symbolEncodingTypeStats_t stats; + BYTE* seqHead =3D op++; + /* build stats for sequences */ + stats =3D ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntro= py->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWks= pSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed= !"); + *seqHead =3D (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stat= s.MLtype<<2)); + lastCountSize =3D stats.lastCountSize; + op +=3D stats.size; + } =20 { size_t const bitstreamSize =3D ZSTD_encodeSequences( op, (size_t)(oend - op), @@ -2273,9 +2584,9 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* se= qStorePtr, * In this exceedingly rare case, we will simply emit an uncompres= sed * block, since it isn't worth optimizing. 
*/ - if (lastNCount && (op - lastNCount) < 4) { - /* NCountSize >=3D 2 && bitstreamSize > 0 =3D=3D> lastCountSiz= e =3D=3D 3 */ - assert(op - lastNCount =3D=3D 3); + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >=3D 2 && bitstreamSize > 0 =3D=3D> lastCount= Size =3D=3D 3 */ + assert(lastCountSize + bitstreamSize =3D=3D 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <=3D 1.3= .4 by " "emitting an uncompressed block."); return 0; @@ -2287,7 +2598,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* se= qStorePtr, } =20 MEM_STATIC size_t -ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2296,7 +2607,7 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, void* entropyWorkspace, size_t entropyWkspSiz= e, int bmi2) { - size_t const cSize =3D ZSTD_entropyCompressSequences_internal( + size_t const cSize =3D ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxPar= ams, dst, dstCapacity, entropyWorkspace, entropyWkspSize, bmi2); @@ -2306,20 +2617,20 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStoreP= tr, */ if ((cSize =3D=3D ERROR(dstSize_tooSmall)) & (srcSize <=3D dstCapacity= )) return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed= "); + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"= ); =20 /* Check compressibility */ { size_t const maxCSize =3D srcSize - ZSTD_minGain(srcSize, cctxPara= ms->cParams.strategy); if (cSize >=3D maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); return cSize; } =20 /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * 
assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= dictMode_e dictMode) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX= +1] =3D { { ZSTD_compressBlock_fast /* default for 0 */, @@ -2367,7 +2678,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD= _strategy strat, ZSTD_dictMo ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast =3D=3D 1); =20 assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); - selectedCompressor =3D blockCompressor[(int)dictMode][(int)strat]; + DEBUGLOG(4, "Selected block compressor: dictMode=3D%d strat=3D%d rowMa= tchfinder=3D%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = =3D { + { ZSTD_compressBlock_greedy_row, + ZSTD_compressBlock_lazy_row, + ZSTD_compressBlock_lazy2_row }, + { ZSTD_compressBlock_greedy_extDict_row, + ZSTD_compressBlock_lazy_extDict_row, + ZSTD_compressBlock_lazy2_extDict_row }, + { ZSTD_compressBlock_greedy_dictMatchState_row, + ZSTD_compressBlock_lazy_dictMatchState_row, + ZSTD_compressBlock_lazy2_dictMatchState_row }, + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + selectedCompressor =3D rowBasedBlockCompressors[(int)dictMode][(in= t)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor =3D blockCompressor[(int)dictMode][(int)strat]; + } assert(selectedCompressor !=3D NULL); return selectedCompressor; } @@ -2383,7 +2715,7 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) { ssPtr->lit =3D ssPtr->litStart; ssPtr->sequences =3D ssPtr->sequencesStart; - 
ssPtr->longLengthID =3D 0; + ssPtr->longLengthType =3D ZSTD_llt_none; } =20 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; @@ -2430,15 +2762,16 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, con= st void* src, size_t srcSize) zc->blockState.nextCBlock->rep[i] =3D zc->blockState.prevC= Block->rep[i]; } if (zc->externSeqStore.pos < zc->externSeqStore.size) { - assert(!zc->appliedParams.ldmParams.enableLdm); + assert(zc->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_di= sable); /* Updates ldmSeqStore.pos */ lastLLSize =3D ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <=3D zc->externSeqStore.size); - } else if (zc->appliedParams.ldmParams.enableLdm) { + } else if (zc->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_en= able) { rawSeqStore_t ldmSeqStore =3D kNullRawSeqStore; =20 ldmSeqStore.seq =3D zc->ldmSequences; @@ -2452,10 +2785,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, con= st void* src, size_t srcSize) ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos =3D=3D ldmSeqStore.size); } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor =3D ZSTD_selectBloc= kCompressor(zc->appliedParams.cParams.strategy, dictMode); + ZSTD_blockCompressor const blockCompressor =3D ZSTD_selectBloc= kCompressor(zc->appliedParams.cParams.strategy, + = zc->appliedParams.useRowMatchFinder, + = dictMode); ms->ldmSeqStore =3D NULL; lastLLSize =3D blockCompressor(ms, &zc->seqStore, zc->blockSta= te.nextCBlock->rep, src, srcSize); } @@ -2483,22 +2819,22 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) assert(zc->seqCollector.maxSequences >=3D seqStoreSeqSize + 1); ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeo= f(repcodes_t)); for (i =3D 0; i < 
seqStoreSeqSize; ++i) { - U32 rawOffset =3D seqStoreSeqs[i].offset - ZSTD_REP_NUM; + U32 rawOffset =3D seqStoreSeqs[i].offBase - ZSTD_REP_NUM; outSeqs[i].litLength =3D seqStoreSeqs[i].litLength; - outSeqs[i].matchLength =3D seqStoreSeqs[i].matchLength + MINMATCH; + outSeqs[i].matchLength =3D seqStoreSeqs[i].mlBase + MINMATCH; outSeqs[i].rep =3D 0; =20 if (i =3D=3D seqStore->longLengthPos) { - if (seqStore->longLengthID =3D=3D 1) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_literalLength) { outSeqs[i].litLength +=3D 0x10000; - } else if (seqStore->longLengthID =3D=3D 2) { + } else if (seqStore->longLengthType =3D=3D ZSTD_llt_matchLengt= h) { outSeqs[i].matchLength +=3D 0x10000; } } =20 - if (seqStoreSeqs[i].offset <=3D ZSTD_REP_NUM) { + if (seqStoreSeqs[i].offBase <=3D ZSTD_REP_NUM) { /* Derive the correct offset corresponding to a repcode */ - outSeqs[i].rep =3D seqStoreSeqs[i].offset; + outSeqs[i].rep =3D seqStoreSeqs[i].offBase; if (outSeqs[i].litLength !=3D 0) { rawOffset =3D updatedRepcodes.rep[outSeqs[i].rep - 1]; } else { @@ -2512,9 +2848,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) outSeqs[i].offset =3D rawOffset; /* seqStoreSeqs[i].offset =3D=3D offCode+1, and ZSTD_updateRep() e= xpects offCode so we provide seqStoreSeqs[i].offset - 1 */ - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, - seqStoreSeqs[i].offset - 1, - seqStoreSeqs[i].litLength =3D=3D = 0); + ZSTD_updateRep(updatedRepcodes.rep, + seqStoreSeqs[i].offBase - 1, + seqStoreSeqs[i].litLength =3D=3D 0); literalsRead +=3D outSeqs[i].litLength; } /* Insert last literals (if any exist) in the block as a sequence with= ml =3D=3D off =3D=3D 0. 
@@ -2602,16 +2938,740 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStor= e) return nbSeqs < 4 && nbLits < 10; } =20 -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockStat= e_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp =3D bs->prevCBlock; + bs->prevCBlock =3D bs->nextCBlock; + bs->nextCBlock =3D tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32= lastBlock) { + U32 const cBlockHeader =3D cSize =3D=3D 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize <= < 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSiz= e << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u= ", cSize, blockSize, lastBlock); +} + +/* ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
+ * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t= srcSize, + const ZSTD_hufCTables_t* prevH= uf, + ZSTD_hufCTables_t* nextH= uf, + ZSTD_hufCTablesMetadata_= t* hufMetadata, + const int literalsCompre= ssionIsDisabled, + void* workspace, size_t = wkspSize) +{ + BYTE* const wkspStart =3D (BYTE*)workspace; + BYTE* const wkspEnd =3D wkspStart + wkspSize; + BYTE* const countWkspStart =3D wkspStart; + unsigned* const countWksp =3D (unsigned*)workspace; + const size_t countWkspSize =3D (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsi= gned); + BYTE* const nodeWksp =3D countWkspStart + countWkspSize; + const size_t nodeWkspSize =3D wkspEnd-nodeWksp; + unsigned maxSymbolValue =3D HUF_SYMBOLVALUE_MAX; + unsigned huffLog =3D HUF_TABLELOG_DEFAULT; + HUF_repeat repeat =3D prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=3D%zu)", sr= cSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType =3D set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif + { size_t const minLitSize =3D (prevHuf->repeatMode =3D=3D HUF_repeat= _valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <=3D minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType =3D set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest =3D HIST_count_wksp (countWksp, &maxSymbolVal= ue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest =3D=3D srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType =3D set_rle; + return 0; + } + if (largest <=3D (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType =3D set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat =3D=3D HUF_repeat_check && !HUF_validateCTable((HUF_CElt co= nst*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat =3D HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog =3D HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits =3D HUF_buildCTable_wksp((HUF_CElt*)nextHuf->= CTable, countWksp, + maxSymbolValue, huffLo= g, + nodeWksp, nodeWkspSize= ); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog =3D (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize =3D HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize =3D HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesB= uffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat !=3D HUF_repeat_none) { + size_t const oldCSize =3D HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSy= mbolValue); + if (oldCSize < srcSize && (oldCSize <=3D hSize + newCSize = || hSize + 12 >=3D srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType =3D set_repeat; + 
return 0; + } + } + if (newCSize + hSize >=3D srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType =3D set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=3D%u)", (U32)hSize); + hufMetadata->hType =3D set_compressed; + nextHuf->repeatMode =3D HUF_repeat_check; + return hSize; + } + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set= _basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + ZSTD_symbolEncodingTypeStats_t stats =3D {set_basic, set_basic, set_ba= sic, 0, 0}; + nextEntropy->litlength_repeatMode =3D FSE_repeat_none; + nextEntropy->offcode_repeatMode =3D FSE_repeat_none; + nextEntropy->matchlength_repeatMode =3D FSE_repeat_none; + return stats; +} + +/* ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. 
+ * @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePt= r, + const ZSTD_fseCTables_t* pre= vEntropy, + ZSTD_fseCTables_t* nex= tEntropy, + const ZSTD_CCtx_params* cctx= Params, + ZSTD_fseCTablesMetadat= a_t* fseMetadata, + void* workspace, size_= t wkspSize) +{ + ZSTD_strategy const strategy =3D cctxParams->cParams.strategy; + size_t const nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; + BYTE* const ostart =3D fseMetadata->fseTablesBuffer; + BYTE* const oend =3D ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op =3D ostart; + unsigned* countWorkspace =3D (unsigned*)workspace; + unsigned* entropyWorkspace =3D countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize =3D wkspSize - (MaxSeq + 1) * sizeof(*coun= tWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=3D%zu)", nbS= eq); + stats =3D nbSeq !=3D 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nb= Seq, + prevEntropy, nextEntropy, op, oe= nd, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspa= ceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType =3D (symbolEncodingType_e) stats.LLtype; + fseMetadata->ofType =3D (symbolEncodingType_e) stats.Offtype; + fseMetadata->mlType =3D (symbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize =3D stats.lastCountSize; + return stats.size; +} + + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE + * + * @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyM= etadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize =3D seqStorePtr->lit - seqStorePtr->litStart; + entropyMetadata->hufMetadata.hufDesSize =3D + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSiz= e, + &prevEntropy->huf, &nextEntrop= y->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisa= bled(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildB= lockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize =3D + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntr= opy->fse, + cctxParams, + &entropyMetadata->fseMetadat= a, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_bui= ldBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) o= f a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t = litSize, + const ZSTD_hufCTables_t* h= uf, + const ZSTD_hufCTablesMetad= ata_t* hufMetadata, + void* workspace, size_t wk= spSize, + int writeEntropy) +{ + unsigned* const countWksp =3D (unsigned*)workspace; + unsigned maxSymbolValue =3D HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize =3D 3 + (litSize >=3D 1 KB) + (litSize= >=3D 16 KB); + U32 singleStream =3D litSize < 256; + + if (hufMetadata->hType =3D=3D set_basic) return litSize; + else if (hufMetadata->hType =3D=3D set_rle) return 1; + else if (hufMetadata->hType =3D=3D set_compressed || hufMetadata->hTyp= e =3D=3D set_repeat) { + size_t const largest =3D HIST_count_wksp (countWksp, 
&maxSymbolVal= ue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate =3D HUF_estimateCompressedSize((const = HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate +=3D hufMetadata->hufDesSiz= e; + if (!singleStream) cLitSizeEstimate +=3D 6; /* multi-stream hu= ffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) o= f a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxC= ode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 = defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp =3D (unsigned*)workspace; + const BYTE* ctp =3D codeTable; + const BYTE* const ctStart =3D ctp; + const BYTE* const ctEnd =3D ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits =3D 0; + unsigned max =3D maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wksp= Size); /* can't fail */ + if (type =3D=3D set_basic) { + /* We selected this encoding type, so it must be valid. 
*/ + assert(max <=3D defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits =3D ZSTD_crossEntropyCost(defaultNor= m, defaultNormLog, countWksp, max); + } else if (type =3D=3D set_rle) { + cSymbolTypeSizeEstimateInBits =3D 0; + } else if (type =3D=3D set_compressed || type =3D=3D set_repeat) { + cSymbolTypeSizeEstimateInBits =3D ZSTD_fseBitCost(fseCTable, count= Wksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits +=3D additionalB= its[*ctp]; + else cSymbolTypeSizeEstimateInBits +=3D *ctp; /* for offset, offse= t code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) = of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t*= fseTables, + const ZSTD_fseCTablesMet= adata_t* fseMetadata, + void* workspace, size_t = wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize =3D 1 /* seqHead */ + 1 /* min seqSi= ze size */ + (nbSeq >=3D 128) + (nbSeq >=3D LONGNBSEQ); + size_t cSeqSizeEstimate =3D 0; + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->o= fType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog= , DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->l= lType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bi= ts, + LL_defaultNorm, LL_defaultNormLog= , MaxLL, + workspace, wkspSize); + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->m= lType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_= bits, + ML_defaultNorm, ML_defaultNormLog= , MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate 
+=3D fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* = entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntr= opy) { + size_t const literalsSize =3D ZSTD_estimateBlockSize_literal(literals,= litSize, + &entropy->huf, &e= ntropyMetadata->hufMetadata, + workspace, wkspSi= ze, writeLitEntropy); + size_t const seqSize =3D ZSTD_estimateBlockSize_sequences(ofCodeTable,= llCodeTable, mlCodeTable, + nbSeq, &entropy->= fse, &entropyMetadata->fseMetadata, + workspace, wkspSi= ze, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. 
+ */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_= t* seqStore, ZSTD_CCtx* zc) { + ZSTD_entropyCTablesMetadata_t* entropyMetadata =3D &zc->blockSplitCtx.= entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* static= ally allocated in resetCCtx */), ""); + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->l= it - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStar= t), + &zc->blockState.nextCBlock->entropy, entropyMetadata, = zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType =3D=3D set_co= mpressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqS= tore) { + size_t literalsBytes =3D 0; + size_t const nbSeqs =3D seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i =3D 0; i < nbSeqs; ++i) { + seqDef seq =3D seqStore->sequencesStart[i]; + literalsBytes +=3D seq.litLength; + if (i =3D=3D seqStore->longLengthPos && seqStore->longLengthType = =3D=3D ZSTD_llt_literalLength) { + literalsBytes +=3D 0x10000; + } + } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStor= e) { + size_t matchBytes =3D 0; + size_t const nbSeqs =3D seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i =3D 0; i < nbSeqs; ++i) { + seqDef seq =3D seqStore->sequencesStart[i]; + matchBytes +=3D seq.mlBase + MINMATCH; + if (i =3D=3D seqStore->longLengthPos && seqStore->longLengthType = =3D=3D ZSTD_llt_matchLength) { + matchBytes +=3D 0x10000; + } + } + 
return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [star= tIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) { + BYTE* const litEnd =3D originalSeqStore->lit; + size_t literalsBytes; + size_t literalsBytesPreceding =3D 0; + + *resultSeqStore =3D *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences =3D originalSeqStore->sequencesStart + s= tartIdx; + literalsBytesPreceding =3D ZSTD_countSeqStoreLiteralsBytes(resultS= eqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType !=3D ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore= ->longLengthPos > endIdx) { + resultSeqStore->longLengthType =3D ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -=3D (U32)startIdx; + } + } + resultSeqStore->sequencesStart =3D originalSeqStore->sequencesStart + = startIdx; + resultSeqStore->sequences =3D originalSeqStore->sequencesStart + endId= x; + literalsBytes =3D ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->litStart +=3D literalsBytesPreceding; + if (endIdx =3D=3D (size_t)(originalSeqStore->sequences - originalSeqSt= ore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk r= eaches the end of the block */ + resultSeqStore->lit =3D litEnd; + } else { + resultSeqStore->lit =3D resultSeqStore->litStart+literalsBytes; + } + resultSeqStore->llCode +=3D startIdx; + resultSeqStore->mlCode +=3D startIdx; + resultSeqStore->ofCode +=3D startIdx; +} + +/* + * Returns the raw offset represented by the combination of offCode, ll0, = and repcode history. + * offCode must represent a repcode in the numeric representation of ZSTD_= storeSeq(). 
+ */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offC= ode, const U32 ll0) +{ + U32 const adjustedOffCode =3D STORED_REPCODE(offCode) - 1 + ll0; /* [= 0 - 3 ] */ + assert(STORED_IS_REPCODE(offCode)); + if (adjustedOffCode =3D=3D ZSTD_REP_NUM) { + /* litlength =3D=3D 0 and offCode =3D=3D 2 implies selection of fi= rst repcode - 1 */ + assert(rep[0] > 0); + return rep[0] - 1; + } + return rep[adjustedOffCode]; +} + +/* + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in = offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following number= ing scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, rep= codes_t* const cRepcodes, + seqStore_t* const seqStore, U32 = const nbSeq) { + U32 idx =3D 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq =3D seqStore->sequencesStart + idx; + U32 const ll0 =3D (seq->litLength =3D=3D 0); + U32 const offCode =3D OFFBASE_TO_STORED(seq->offBase); + assert(seq->offBase > 0); + if (STORED_IS_REPCODE(offCode)) { + U32 const dRawOffset =3D ZSTD_resolveRepcodeToRawOffset(dRepco= des->rep, offCode, ll0); + U32 const cRawOffset =3D ZSTD_resolveRepcodeToRawOffset(cRepco= des->rep, offCode, ll0); + /* Adjust simulated decompression repcode history if we come a= cross a mismatch. Replace + * the repcode with the offset it actually references, determi= ned by the compression + * repcode history. 
+ */ + if (dRawOffset !=3D cRawOffset) { + seq->offBase =3D cRawOffset + ZSTD_REP_NUM; + } + } + /* Compression repcode history is always updated with values direc= tly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset valu= e taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll= 0); + ZSTD_updateRep(cRepcodes->rep, offCode, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer= dst. + * + * Returns the total size of that block (including header) or a ZSTD error= code. + */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStor= e, + repcodes_t* const dRep, repcodes_t* cons= t cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) { - ZSTD_compressedBlockState_t* const tmp =3D zc->blockState.prevCBlock; - zc->blockState.prevCBlock =3D zc->blockState.nextCBlock; - zc->blockState.nextCBlock =3D tmp; + const U32 rleMaxLength =3D 25; + BYTE* op =3D (BYTE*)dst; + const BYTE* ip =3D (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode= history must be reset */ + repcodes_t const dRepOriginal =3D *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore= ->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, = "Block header doesn't fit"); + cSeqsSize =3D ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextC= Block->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderS= ize, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically= allocated in resetCCtx */, + zc->bmi2); + 
FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it quali= fies because + * doing so will cause the decoder (cli only) to throw a "should co= nsume all input error." + * This is only an issue for zstd <=3D v1.4.3 + */ + cSeqsSize =3D 1; + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize =3D=3D 0) { + cSize =3D ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastB= lock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); + *dRep =3D dRepOriginal; /* reset simulated decompression repcode h= istory */ + } else if (cSeqsSize =3D=3D 1) { + cSize =3D ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, las= tBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); + *dRep =3D dRepOriginal; /* reset simulated decompression repcode h= istory */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize =3D ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode =3D=3D F= SE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode =3D FSE_= repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations bei= ng worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. 
+ * Estimates the cost of seqStore prior to split, and estimates the cost o= f splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If n= ot, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of se= quences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 =3D=3D= log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS= . At ZSTD_MAX_NB_BLOCK_SPLITS =3D=3D 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size= _t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +{ + seqStore_t* fullSeqStoreChunk =3D &zc->blockSplitCtx.fullSeqStoreChunk; + seqStore_t* firstHalfSeqStore =3D &zc->blockSplitCtx.firstHalfSeqStore; + seqStore_t* secondHalfSeqStore =3D &zc->blockSplitCtx.secondHalfSeqSto= re; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx =3D (startIdx + endIdx)/2; + + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= =3D ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + return; + } + DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=3D%zu endIdx=3D%zu= ", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, en= dIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, mi= dIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, end= Idx); + estimatedOriginalSize =3D ZSTD_buildEntropyStatisticsAndEstimateSubBlo= ckSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize =3D 
ZSTD_buildEntropyStatisticsAndEstimateSubBl= ockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize =3D ZSTD_buildEntropyStatisticsAndEstimateSubB= lockSize(secondHalfSeqStore, zc); + DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %= zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecon= dHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirst= HalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOrigin= alSize) { + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeq= Store); + splits->splitLocations[splits->idx] =3D (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqSt= ore); + } +} + +/* Base recursive function. Populates a table with intra-block partition i= ndices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partiti= on table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 = nbSeq) { + seqStoreSplits splits =3D {partitions, 0}; + if (nbSeq <=3D 4) { + DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] =3D nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits= .idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compres= sion ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD= error code. 
+ */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t ds= tCapacity, + const void* src, size_t blockSize, = U32 lastBlock, U32 nbSeq) +{ + size_t cSize =3D 0; + const BYTE* ip =3D (const BYTE*)src; + BYTE* op =3D (BYTE*)dst; + size_t i =3D 0; + size_t srcBytesTotal =3D 0; + U32* partitions =3D zc->blockSplitCtx.partitions; /* size =3D=3D ZSTD_= MAX_NB_BLOCK_SPLITS */ + seqStore_t* nextSeqStore =3D &zc->blockSplitCtx.nextSeqStore; + seqStore_t* currSeqStore =3D &zc->blockSplitCtx.currSeqStore; + size_t numSplits =3D ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompre= ssed, then repcode history + * may become invalid. In order to reconcile potentially invalid repco= des, we keep track of two + * separate repcode histories that simulate repcode history on compres= sion and decompression side, + * and use the histories to determine whether we must replace a partic= ular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the = block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an inva= lid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a rep= code gets replaced, will use + * the replacement offset value rather than the original repcode to= update the repcode history. + * dRep also will be the final repcode history sent to the next blo= ck. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ + repcodes_t dRep; + repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_= t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_= t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=3D%u,= dictLimit=3D%u, nextToUpdate=3D%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState= .window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits =3D=3D 0) { + size_t cSizeSingleBlock =3D ZSTD_compressSeqStore_singleBlock(zc, = &zc->seqStore, + &dRep, = &cRep, + op, ds= tCapacity, + ip, bl= ockSize, + lastBl= ock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from = splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <=3D ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeader= Size); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]= ); + for (i =3D 0; i <=3D numSplits; ++i) { + size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition =3D (i =3D=3D numSplits); + U32 lastBlockEntireSrc =3D 0; + + srcBytes =3D ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_= countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal +=3D srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible l= ast literals */ + srcBytes +=3D blockSize - srcBytesTotal; + lastBlockEntireSrc =3D lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitio= ns[i], partitions[i+1]); + } + + cSizeChunk =3D ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc,= 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntr= opyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + 
FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip +=3D srcBytes; + op +=3D cSizeChunk; + dstCapacity -=3D cSizeChunk; + cSize +=3D cSizeChunk; + *currSeqStore =3D *nextSeqStore; + assert(cSizeChunk <=3D ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. If so, we u= se the dRep repcodes + * for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_= t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlo= ck) +{ + const BYTE* ip =3D (const BYTE*)src; + BYTE* op =3D (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.useBlockSplitter =3D=3D ZSTD_ps_enable); + + { const size_t bss =3D ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss =3D=3D ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = =3D=3D FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = =3D FSE_repeat_check; + cSize =3D ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, l= astBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq =3D (U32)(zc->seqStore.sequences - zc->seqStore.sequencesSta= rt); + } + + cSize =3D ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity,= src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; } =20 -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U= 32 frame) +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) { /* This the upper bound for the length of an rle 
block. * This isn't the actual upper bound. Finding the real threshold @@ -2632,12 +3692,12 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx= * zc, =20 if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } =20 /* encode sequences and literals */ - cSize =3D ZSTD_entropyCompressSequences(&zc->seqStore, + cSize =3D ZSTD_entropyCompressSeqStore(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBloc= k->entropy, &zc->appliedParams, dst, dstCapacity, @@ -2645,12 +3705,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx*= zc, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically all= ocated in resetCCtx */, zc->bmi2); =20 - if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); - return 0; - } - - if (frame && /* We don't want to emit our first block as a RLE even if it quali= fies because * doing so will cause the decoder (cli only) to throw a "should c= onsume all input error." @@ -2666,7 +3720,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* = zc, =20 out: if (!ZSTD_isError(cSize) && cSize > 1) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. 
After the first block, the offcode table might not have large @@ -2719,7 +3773,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_bod= y(ZSTD_CCtx* zc, size_t const maxCSize =3D srcSize - ZSTD_minGain(srcSize, = zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize !=3D 0 && cSize < maxCSize + ZSTD_blockHeaderSiz= e) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->b= lockState); return cSize; } } @@ -2759,9 +3813,9 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchSt= ate_t* ms, void const* ip, void const* iend) { - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { - U32 const maxDist =3D (U32)1 << params->cParams.windowLog; - U32 const cycleLog =3D ZSTD_cycleLog(params->cParams.chainLog, par= ams->cParams.strategy); + U32 const cycleLog =3D ZSTD_cycleLog(params->cParams.chainLog, params-= >cParams.strategy); + U32 const maxDist =3D (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, = ms->loadedDictEnd, ip, iend)) { U32 const correction =3D ZSTD_window_correctOverflow(&ms->window, = cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <=3D 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <=3D 30); @@ -2784,7 +3838,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchSt= ate_t* ms, * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) @@ -2814,6 +3868,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cc= tx, ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize= ); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->= loadedDictEnd, &ms->dictMatchState); + 
ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDi= ctEnd, &ms->dictMatchState); =20 /* Ensure hash/chain table insertion resumes no sooner than lowlim= it */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate =3D m= s->window.lowLimit; @@ -2824,6 +3879,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* c= ctx, FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSi= ze failed"); assert(cSize > 0); assert(cSize <=3D blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize =3D ZSTD_compressBlock_splitBlock(cctx, op, dstCapac= ity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock fai= led"); + assert(cSize > 0 || cctx->seqCollector.collectSequences = =3D=3D 1); } else { cSize =3D ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapaci= ty-ZSTD_blockHeaderSize, @@ -2946,7 +4005,7 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cct= x, rawSeq* seq, size_t nbSe { RETURN_ERROR_IF(cctx->stage !=3D ZSTDcs_init, stage_wrong, "wrong cctx stage"); - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps= _enable, parameter_unsupported, "incompatible with ldm"); cctx->externSeqStore.seq =3D seq; @@ -2983,11 +4042,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_= CCtx* cctx, =20 if (!srcSize) return fhSize; /* do not generate an empty block if no = input */ =20 - if (!ZSTD_window_update(&ms->window, src, srcSize)) { + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContigu= ous)) { + ms->forceNonContiguous =3D 0; ms->nextToUpdate =3D ms->window.dictLimit; } - if (cctx->appliedParams.ldmParams.enableLdm) { - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + if (cctx->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceN= onContiguous */ 0); } =20 if (!frame) { @@ 
-3055,63 +4115,86 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_match= State_t* ms, { const BYTE* ip =3D (const BYTE*) src; const BYTE* const iend =3D ip + srcSize; + int const loadLdmDict =3D params->ldmParams.enableLdm =3D=3D ZSTD_ps_e= nable && ls !=3D NULL; =20 - ZSTD_window_update(&ms->window, src, srcSize); + /* Assert that we the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_= MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 const maxDictSize =3D ZSTD_CURRENT_MAX - 1; + /* We must have cleared our windows when our source is this large.= */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) + assert(ZSTD_window_isEmpty(ls->window)); + /* If the dictionary is too large, only load the suffix of the dic= tionary. */ + if (srcSize > maxDictSize) { + ip =3D iend - maxDictSize; + src =3D ip; + srcSize =3D maxDictSize; + } + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=3D%d", (i= nt)params->useRowMatchFinder); + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */= 0); ms->loadedDictEnd =3D params->forceWindow ? 0 : (U32)(iend - ms->windo= w.base); + ms->forceNonContiguous =3D params->deterministicRefPrefix; =20 - if (params->ldmParams.enableLdm && ls !=3D NULL) { - ZSTD_window_update(&ls->window, src, srcSize); + if (loadLdmDict) { + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguou= s */ 0); ls->loadedDictEnd =3D params->forceWindow ? 
0 : (U32)(iend - ls->w= indow.base); } =20 - /* Assert that we the ms params match the params we're being given */ - ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize <=3D HASH_READ_SIZE) return 0; =20 - while (iend - ip > HASH_READ_SIZE) { - size_t const remaining =3D (size_t)(iend - ip); - size_t const chunk =3D MIN(remaining, ZSTD_CHUNKSIZE_MAX); - const BYTE* const ichunk =3D ip + chunk; - - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); =20 - if (params->ldmParams.enableLdm && ls !=3D NULL) - ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src = + srcSize, ¶ms->ldmParams); + if (loadLdmDict) + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); =20 - switch(params->cParams.strategy) - { - case ZSTD_fast: - ZSTD_fillHashTable(ms, ichunk, dtlm); - break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); - break; + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, dtlm); + break; =20 - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: - if (chunk >=3D HASH_READ_SIZE && ms->dedicatedDictSearch) { - assert(chunk =3D=3D remaining); /* must load everything in= one go */ - ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HA= SH_READ_SIZE); - } else if (chunk >=3D HASH_READ_SIZE) { - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + assert(srcSize >=3D HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable !=3D NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_REA= D_SIZE); + } else { + assert(params->useRowMatchFinder !=3D ZSTD_ps_auto); + if (params->useRowMatchFinder =3D=3D ZSTD_ps_enable) { + size_t const tagTableSize =3D ((size_t)1 << params->cParam= s.hashLog) * sizeof(U16); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + 
ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } - break; - - case ZSTD_btlazy2: /* we want the dictionary table fully sorted = */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: - if (chunk >=3D HASH_READ_SIZE) - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); - break; - - default: - assert(0); /* not possible : not a valid strategy id */ } + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + assert(srcSize >=3D HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); + break; =20 - ip =3D ichunk; + default: + assert(0); /* not possible : not a valid strategy id */ } =20 ms->nextToUpdate =3D (U32)(iend - ms->window.base); @@ -3250,7 +4333,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressed= BlockState_t* bs, const BYTE* const dictEnd =3D dictPtr + dictSize; size_t dictID; size_t eSize; - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >=3D (1<=3D 8); assert(MEM_readLE32(dictPtr) =3D=3D ZSTD_MAGIC_DICTIONARY); @@ -3321,6 +4403,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, const ZSTD_CCtx_params* params, U64 pl= edgedSrcSize, ZSTD_buffered_policy_e zbuff) { + size_t const dictContentSize =3D cdict ? 
cdict->dictContentSize : dict= Size; DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=3D%u", params->cParams.= windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -3335,7 +4418,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSi= ze, zbuff); } =20 - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSiz= e, + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID =3D cdict ? ZSTD_compress_insertDictionary( @@ -3350,7 +4434,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <=3D UINT_MAX); cctx->dictID =3D (U32)dictID; - cctx->dictContentSize =3D cdict ? cdict->dictContentSize : dictSiz= e; + cctx->dictContentSize =3D dictContentSize; } return 0; } @@ -3485,15 +4569,14 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, const void* dict,size_t dictSize, ZSTD_parameters params) { - ZSTD_CCtx_params cctxParams; DEBUGLOG(4, "ZSTD_compress_advanced"); FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO= _CLEVEL); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, - &cctxParams); + &cctx->simpleApiParams); } =20 /* Internal */ @@ -3517,14 +4600,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_CCtx_params cctxParams; { ZSTD_parameters const params =3D ZSTD_getParams_internal(compressi= onLevel, srcSize, dict ? 
dictSize : 0, ZSTD_cpm_noAttachDict); assert(params.fParams.contentSizeFlag =3D=3D 1); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLe= vel =3D=3D 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (co= mpressionLevel =3D=3D 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); } DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=3D%u)", (unsigned)srcSiz= e); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, sr= cSize, dict, dictSize, &cctxParams); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, sr= cSize, dict, dictSize, &cctx->simpleApiParams); } =20 size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3561,7 +4643,10 @@ size_t ZSTD_estimateCDictSize_advanced( DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + /* enableDedicatedDictSearch =3D=3D 1 ensures that CDict estimati= on will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode= (ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* fo= rCCtx */ 0) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 
0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void= *)))); } @@ -3592,9 +4677,6 @@ static size_t ZSTD_initCDict_internal( assert(!ZSTD_checkCParams(params.cParams)); cdict->matchState.cParams =3D params.cParams; cdict->matchState.dedicatedDictSearch =3D params.enableDedicatedDictSe= arch; - if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE= _MAX) { - cdict->matchState.dedicatedDictSearch =3D 0; - } if ((dictLoadMethod =3D=3D ZSTD_dlm_byRef) || (!dictBuffer) || (!dictS= ize)) { cdict->dictContent =3D dictBuffer; } else { @@ -3615,6 +4697,7 @@ static size_t ZSTD_initCDict_internal( &cdict->matchState, &cdict->workspace, ¶ms.cParams, + params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, ZSTD_resetTarget_CDict), ""); @@ -3638,14 +4721,17 @@ static size_t ZSTD_initCDict_internal( =20 static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_compressionParameters cParams, = ZSTD_customMem customMem) + ZSTD_compressionParameters cParams, + ZSTD_paramSwitch_e useRowMatchFinder, + U32 enableDedicatedDictSearch, + ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; =20 { size_t const workspaceSize =3D ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedi= catedDictSearch, /* forCCtx */ 0) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 
0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(voi= d*)))); void* const workspace =3D ZSTD_customMalloc(workspaceSize, customM= em); @@ -3664,7 +4750,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal= (size_t dictSize, ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem =3D customMem; cdict->compressionLevel =3D ZSTD_NO_CLEVEL; /* signals advanced AP= I usage */ - + cdict->useRowMatchFinder =3D useRowMatchFinder; return cdict; } } @@ -3686,7 +4772,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dic= tBuffer, size_t dictSize, &cctxParams, customMem); } =20 -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -3716,10 +4802,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_crea= teCDict); } =20 + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDe= dicatedDictSearch); cctxParams.cParams =3D cParams; + cctxParams.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxPa= rams.useRowMatchFinder, &cParams); =20 cdict =3D ZSTD_createCDict_advanced_internal(dictSize, dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDed= icatedDictSearch, customMem); =20 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, @@ -3788,7 +4877,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - size_t const matchStateSize =3D ZSTD_sizeof_matchState(&cParams, /* fo= rCCtx */ 0); + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatchFin= derMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch =3D=3D 1 ensures matchstate is not too sm= all in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize =3D ZSTD_sizeof_matchState(&cParams, useRo= wMatchFinder, /* 
enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize =3D ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(di= ctSize, sizeof(void*)))) @@ -3813,6 +4904,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict( =20 ZSTD_CCtxParams_init(¶ms, 0); params.cParams =3D cParams; + params.useRowMatchFinder =3D useRowMatchFinder; + cdict->useRowMatchFinder =3D useRowMatchFinder; =20 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, @@ -3839,15 +4932,15 @@ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict*= cdict) return cdict->dictID; } =20 - -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be !=3D NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSr= cSize) { ZSTD_CCtx_params cctxParams; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); RETURN_ERROR_IF(cdict=3D=3DNULL, dictionary_wrong, "NULL pointer!"); /* Initialize the cctxParams from the cdict */ { @@ -3879,25 +4972,48 @@ size_t ZSTD_compressBegin_usingCDict_advanced( ZSTDb_not_buffered); } =20 + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. 
+ * cdict must be !=3D NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSr= cSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pl= edgedSrcSize); +} + /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=3D0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ + * cdict must be !=3D NULL */ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cd= ict) { ZSTD_frameParameters const fParams =3D { 0 /*content*/, 0 /*checksum*/= , 0 /*noDictID*/ }; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag =3D=3D %u", !f= Params.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZS= TD_CONTENTSIZE_UNKNOWN); + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZS= TD_CONTENTSIZE_UNKNOWN); } =20 -size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParamet= ers fParams) { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, f= Params, srcSize), ""); /* will check if cdict !=3D NULL */ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, f= Params, srcSize), ""); /* will check if cdict !=3D NULL */ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } =20 +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. 
+ */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParamet= ers fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); +} + /*! ZSTD_compress_usingCDict() : * Compression using a digested Dictionary. * Faster startup than ZSTD_compress_usingDict(), recommended when same d= ictionary is used multiple times. @@ -3909,7 +5025,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams =3D { 1 /*content*/, 0 /*checksum*/= , 0 /*noDictID*/ }; - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); } =20 =20 @@ -4313,8 +5429,13 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CC= tx* cctx, FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local di= ct if present. */ ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* sing= le usage */ assert(prefixDict.dict=3D=3DNULL || cctx->cdict=3D=3DNULL); /* only= one can be set */ - if (cctx->cdict) - params.compressionLevel =3D cctx->cdict->compressionLevel; /* let = cdict take priority in terms of compression level */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the reques= ted params. + * But do not take the cdict's compression level if the "cdict" is= actually a localDict + * generated from ZSTD_initLocalDict(). 
+ */ + params.compressionLevel =3D cctx->cdict->compressionLevel; + } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp =3D=3D ZSTD_e_end) cctx->pledgedSrcSizePlusOne =3D inSize + = 1; /* auto-fix pledgedSrcSize */ { @@ -4327,11 +5448,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CC= tx* cctx, dictSize, mode); } =20 - if (ZSTD_CParams_shouldEnableLdm(¶ms.cParams)) { - /* Enable LDM by default for optimal parser and window size >=3D 1= 28MB */ - DEBUGLOG(4, "LDM enabled by default (window size >=3D 128MB, strat= egy >=3D btopt)"); - params.ldmParams.enableLdm =3D 1; - } + params.useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(params.useBl= ockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(params.ldmParams.= enableLdm, ¶ms.cParams); + params.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(params.use= RowMatchFinder, ¶ms.cParams); =20 { U64 const pledgedSrcSize =3D cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); @@ -4436,39 +5555,39 @@ typedef struct { size_t posInSrc; /* Number of bytes given by sequences provided= so far */ } ZSTD_sequencePosition; =20 -/* Returns a ZSTD error code if sequence is not valid */ -static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength, - size_t posInSrc, U32 windowLog, size_t= dictSize, U32 minMatch) { - size_t offsetBound; - U32 windowSize =3D 1 << windowLog; - /* posInSrc represents the amount of data the the decoder would decode= up to this point. +/* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offCode, U32 matchLength, + size_t posInSrc, U32 windowLog, size_t dictSize) +{ + U32 const windowSize =3D 1 << windowLog; + /* posInSrc represents the amount of data the decoder would decode up = to this point. 
* As long as the amount of data decoded is less than or equal to wind= ow size, offsets may be * larger than the total length of output decoded in order to referenc= e the dict, even larger than * window size. After output surpasses windowSize, we're limited to wi= ndowSize offsets again. */ - offsetBound =3D posInSrc > windowSize ? (size_t)windowSize : posInSrc = + (size_t)dictSize; - RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_dete= cted, "Offset too large!"); - RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlen= gth too small"); + size_t const offsetBound =3D posInSrc > windowSize ? (size_t)windowSiz= e : posInSrc + (size_t)dictSize; + RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detect= ed, "Offset too large!"); + RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlen= gth too small"); return 0; } =20 /* Returns an offset code, given a sequence's raw offset, the ongoing repc= ode array, and whether litLength =3D=3D 0 */ -static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM]= , U32 ll0) { - U32 offCode =3D rawOffset + ZSTD_REP_MOVE; - U32 repCode =3D 0; +static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM]= , U32 ll0) +{ + U32 offCode =3D STORE_OFFSET(rawOffset); =20 if (!ll0 && rawOffset =3D=3D rep[0]) { - repCode =3D 1; + offCode =3D STORE_REPCODE_1; } else if (rawOffset =3D=3D rep[1]) { - repCode =3D 2 - ll0; + offCode =3D STORE_REPCODE(2 - ll0); } else if (rawOffset =3D=3D rep[2]) { - repCode =3D 3 - ll0; + offCode =3D STORE_REPCODE(3 - ll0); } else if (ll0 && rawOffset =3D=3D rep[0] - 1) { - repCode =3D 3; - } - if (repCode) { - /* ZSTD_storeSeq expects a number in the range [0, 2] to represent= a repcode */ - offCode =3D repCode - 1; + offCode =3D STORE_REPCODE_3; } return offCode; } @@ -4476,18 +5595,17 @@ static U32 ZSTD_finalizeOffCode(U32 rawOffset, cons= t U32 rep[ZSTD_REP_NUM], U32 /* Returns 0 on success, and a ZSTD_error 
otherwise. This function scans t= hrough an array of * ZSTD_Sequence, storing the sequences it finds, until it reaches a block= delimiter. */ -static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cc= tx, ZSTD_sequencePosition* seqPos, - const ZSTD_Se= quence* const inSeqs, size_t inSeqsSize, - const void* s= rc, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPo= s, + const ZSTD_Sequence* const inSeqs,= size_t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx =3D seqPos->idx; BYTE const* ip =3D (BYTE const*)(src); const BYTE* const iend =3D ip + blockSize; repcodes_t updatedRepcodes; U32 dictSize; - U32 litLength; - U32 matchLength; - U32 ll0; - U32 offCode; =20 if (cctx->cdict) { dictSize =3D (U32)cctx->cdict->dictContentSize; @@ -4498,23 +5616,22 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitB= lockDelim(ZSTD_CCtx* cctx, ZS } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, siz= eof(repcodes_t)); for (; (inSeqs[idx].matchLength !=3D 0 || inSeqs[idx].offset !=3D 0) &= & idx < inSeqsSize; ++idx) { - litLength =3D inSeqs[idx].litLength; - matchLength =3D inSeqs[idx].matchLength; - ll0 =3D litLength =3D=3D 0; - offCode =3D ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcod= es.rep, ll0); - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, offCode, l= l0); + U32 const litLength =3D inSeqs[idx].litLength; + U32 const ll0 =3D (litLength =3D=3D 0); + U32 const matchLength =3D inSeqs[idx].matchLength; + U32 const offCode =3D ZSTD_finalizeOffCode(inSeqs[idx].offset, upd= atedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); =20 DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode,= matchLength, litLength); if (cctx->appliedParams.validateSequences) { seqPos->posInSrc +=3D litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, s= eqPos->posInSrc, - 
cctx->appliedParams.cParam= s.windowLog, dictSize, - cctx->appliedParams.cParam= s.minMatch), + cctx->appliedParams.cParam= s.windowLog, dictSize), "Sequence validation faile= d"); } RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memor= y_allocation, "Not enough memory allocated. Try adjusting ZSTD_c= _minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length); ip +=3D matchLength + litLength; } ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, siz= eof(repcodes_t)); @@ -4541,9 +5658,11 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBl= ockDelim(ZSTD_CCtx* cctx, ZS * avoid splitting a match, or to avoid splitting a match such that it wou= ld produce a match * smaller than MINMATCH. In this case, we return the number of bytes that= we didn't read from this block. */ -static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZS= TD_sequencePosition* seqPos, - const ZSTD_Sequence= * const inSeqs, size_t inSeqsSize, - const void* src, si= ze_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePos= ition* seqPos, + const ZSTD_Sequence* const inSeqs, size= _t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx =3D seqPos->idx; U32 startPosInSequence =3D seqPos->posInSequence; U32 endPosInSequence =3D seqPos->posInSequence + (U32)blockSize; @@ -4553,10 +5672,6 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDel= im(ZSTD_CCtx* cctx, ZSTD_seq repcodes_t updatedRepcodes; U32 bytesAdjustment =3D 0; U32 finalMatchSplit =3D 0; - U32 litLength; - U32 matchLength; - U32 rawOffset; - U32 offCode; =20 if (cctx->cdict) { dictSize =3D cctx->cdict->dictContentSize; @@ -4570,9 +5685,10 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDel= im(ZSTD_CCtx* cctx, ZSTD_seq ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, siz= 
eof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { const ZSTD_Sequence currSeq =3D inSeqs[idx]; - litLength =3D currSeq.litLength; - matchLength =3D currSeq.matchLength; - rawOffset =3D currSeq.offset; + U32 litLength =3D currSeq.litLength; + U32 matchLength =3D currSeq.matchLength; + U32 const rawOffset =3D currSeq.offset; + U32 offCode; =20 /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >=3D currSeq.litLength + currSeq.matchLength)= { @@ -4625,22 +5741,21 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDe= lim(ZSTD_CCtx* cctx, ZSTD_seq } } /* Check if this offset can be represented with a repcode */ - { U32 ll0 =3D (litLength =3D=3D 0); + { U32 const ll0 =3D (litLength =3D=3D 0); offCode =3D ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.re= p, ll0); - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, offCod= e, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); } =20 if (cctx->appliedParams.validateSequences) { seqPos->posInSrc +=3D litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, s= eqPos->posInSrc, - cctx->appliedParams.cPa= rams.windowLog, dictSize, - cctx->appliedParams.cPa= rams.minMatch), + cctx->appliedParams.cPa= rams.windowLog, dictSize), "Sequence validation fa= iled"); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode,= matchLength, litLength); RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memor= y_allocation, "Not enough memory allocated. 
Try adjusting ZSTD_c= _minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length); ip +=3D matchLength + litLength; } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[= idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); @@ -4665,7 +5780,8 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDeli= m(ZSTD_CCtx* cctx, ZSTD_seq typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosit= ion* seqPos, const ZSTD_Sequence* const inSeqs, = size_t inSeqsSize, const void* src, size_t blockSize); -static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e= mode) { +static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e= mode) +{ ZSTD_sequenceCopier sequenceCopier =3D NULL; assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); if (mode =3D=3D ZSTD_sf_explicitBlockDelimiters) { @@ -4679,12 +5795,15 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopie= r(ZSTD_sequenceFormat_e mode) =20 /* Compress, block-by-block, all of the sequences given. * - * Returns the cumulative size of all compressed blocks (including their h= eaders), otherwise a ZSTD error. + * Returns the cumulative size of all compressed blocks (including their h= eaders), + * otherwise a ZSTD error. 
*/ -static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacit= y, - const ZSTD_Sequence* inSeqs,= size_t inSeqsSize, - const void* src, size_t srcS= ize) { +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ size_t cSize =3D 0; U32 lastBlock; size_t blockSize; @@ -4694,7 +5813,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, =20 BYTE const* ip =3D (BYTE const*)src; BYTE* op =3D (BYTE*)dst; - ZSTD_sequenceCopier sequenceCopier =3D ZSTD_selectSequenceCopier(cctx-= >appliedParams.blockDelimiters); + ZSTD_sequenceCopier const sequenceCopier =3D ZSTD_selectSequenceCopier= (cctx->appliedParams.blockDelimiters); =20 DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize:= %zu", srcSize, inSeqsSize); /* Special case: empty frame */ @@ -4732,7 +5851,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, continue; } =20 - compressedSeqsSize =3D ZSTD_entropyCompressSequences(&cctx->seqSto= re, + compressedSeqsSize =3D ZSTD_entropyCompressSeqStore(&cctx->seqStor= e, &cctx->blockState.prevCBlock->entropy, &cc= tx->blockState.nextCBlock->entropy, &cctx->appliedParams, op + ZSTD_blockHeaderSize /* Leave space f= or block header */, dstCapacity - ZSTD_blockHeaderSize, @@ -4764,7 +5883,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, } else { U32 cBlockHeader; /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(cctx); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockSt= ate); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMod= e =3D=3D FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMod= e =3D FSE_repeat_check; =20 @@ -4794,7 +5913,8 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, =20 size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, 
size_t dst= Capacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSi= ze, - const void* src, size_t srcSize) { + const void* src, size_t srcSize) +{ BYTE* op =3D (BYTE*)dst; size_t cSize =3D 0; size_t compressedBlocksSize =3D 0; @@ -4861,117 +5981,11 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outB= uffer* output) =20 =20 /*-=3D=3D=3D=3D=3D Pre-defined compression levels =3D=3D=3D=3D=3D-*/ +#include "clevels.h" =20 -#define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } - -static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MA= X_CLEVEL+1] =3D { -{ /* "default" - for any srcSize > 256 KB */ - /* W, C, H, S, L, TL, strat */ - { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels= */ - { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ - { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ - { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ - { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ - { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ - { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ - { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ - { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ - { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ - { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ - { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ - { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ - { 27, 27, 25, 9, 3,999, 
ZSTD_btultra2}, /* level 22 */ -}, -{ /* for srcSize <=3D 256 KB */ - /* W, C, H, S, L, T, strat */ - { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ - { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ - { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ - { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ - { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ - { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ - { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ - { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <=3D 128 KB */ - /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ - { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ - { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ - { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 
17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ - { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ - { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ - { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ - { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ - { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <=3D 16 KB */ - /* W, C, H, S, L, T, strat */ - { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ - { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ - { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ - { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ - { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ - { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ - { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ - { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ - { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ - { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ - { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ - { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ - { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ - { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ - { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ - { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 14, 15, 15, 9, 3,512, 
ZSTD_btultra2}, /* level 21.*/ - { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -}; +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } =20 static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int = const compressionLevel, size_t const dictSize) { @@ -4999,7 +6013,7 @@ static int ZSTD_dedicatedDictSearch_isSupported( { return (cParams->strategy >=3D ZSTD_greedy) && (cParams->strategy <=3D ZSTD_lazy2) - && (cParams->hashLog >=3D cParams->chainLog) + && (cParams->hashLog > cParams->chainLog) && (cParams->chainLog <=3D 24); } =20 @@ -5018,6 +6032,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( case ZSTD_lazy: case ZSTD_lazy2: cParams->hashLog -=3D ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog =3D ZSTD_HASHLOG_MIN; + } break; case ZSTD_btlazy2: case ZSTD_btopt: @@ -5066,6 +6083,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_int= ernal(int compressionLevel, else row =3D compressionLevel; =20 { ZSTD_compressionParameters cp =3D ZSTD_defaultCParameters[tableID]= [row]; + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u= strat: %u", tableID, row, (U32)cp.strategy); /* acceleration factor */ if (compressionLevel < 0) { int const clampedCompressionLevel =3D MAX(ZSTD_minCLevel(), co= mpressionLevel); diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress= /zstd_compress_internal.h index 685d2f996cc2..71697a11ae30 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -57,7 +57,7 @@ typedef struct { } ZSTD_localDict; =20 typedef struct { - HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; HUF_repeat repeatMode; } ZSTD_hufCTables_t; =20 @@ -75,8 +75,55 @@ typedef struct { ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; =20 +/* ********************************************* +* Entropy buffer statistics structs and funcs * 
+***********************************************/ +/* ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ typedef struct { - U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the ma= tch */ + symbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/* ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type,= and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() = */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More det= ail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyM= etadata, + void* workspace, size_t wkspSize); + +/* ******************************* +* Compression internals structs * +*********************************/ + +typedef struct { + U32 off; /* Offset sumtype code for the match, using ZSTD_s= toreSeq() format */ U32 len; /* Raw length of match */ } ZSTD_match_t; =20 @@ -126,7 +173,7 @@ typedef struct { U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, = or follow a pre-defined cost structure */ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionar= y statistics */ - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; } optState_t; =20 typedef struct { @@ -135,14 +182,23 @@ typedef struct { } ZSTD_compressedBlockState_t; =20 typedef struct { - BYTE const* nextSrc; /* next block here to continue on current pref= ix */ - BYTE const* base; /* All regular indexes relative to this positi= on */ - BYTE const* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more valid data */ + BYTE const* nextSrc; /* next block here to continue on current p= refix */ + BYTE const* base; /* All regular indexes relative to this pos= ition */ + BYTE const* dictBase; /* extDict indexes relative to this positio= n */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has = run since + * ZSTD_window_init(). Useful for debugging= coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FRE= QUENTLY. 
+ */ } ZSTD_window_t; =20 +#define ZSTD_WINDOW_START_INDEX 2 + typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache= for row-based matchfinder */ + struct ZSTD_matchState_t { ZSTD_window_t window; /* State for window round buffer management */ U32 loadedDictEnd; /* index of end of dictionary, within context'= s referential. @@ -154,9 +210,17 @@ struct ZSTD_matchState_t { */ U32 nextToUpdate; /* index from which to continue table update */ U32 hashLog3; /* dispatch table for matches of len=3D=3D3 : = larger =3D=3D faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder:= Hashlog based on nb of rows in the hashTable.*/ + U16* tagTable; /* For row-based matchFinder:= A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder:= a cache of hashes to improve speed */ + U32* hashTable; U32* hashTable3; U32* chainTable; + + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous = load for the next window update. */ + int dedicatedDictSearch; /* Indicates whether this matchState is usin= g the * dedicated dictionary search structure. */ @@ -196,7 +260,7 @@ typedef struct { } ldmState_t; =20 typedef struct { - U32 enableLdm; /* 1 if enable long distance matching */ + ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps= _auto by default */ U32 hashLog; /* Log size of hashTable */ U32 bucketSizeLog; /* Log bucket size for collision resolution, a= t most 8 */ U32 minMatchLength; /* Minimum match length */ @@ -227,7 +291,7 @@ struct ZSTD_CCtx_params_s { * There is no guarantee that hint is close= to actual source size */ =20 ZSTD_dictAttachPref_e attachDictPref; - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; =20 /* Multithreading: used to pass parameters to mtctx */ int nbWorkers; @@ -249,6 +313,15 @@ struct ZSTD_CCtx_params_s { ZSTD_sequenceFormat_e blockDelimiters; int validateSequences; =20 + /* Block splitting */ + ZSTD_paramSwitch_e useBlockSplitter; + + /* Param for deciding whether to use row-based matchfinder */ + ZSTD_paramSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ @@ -266,12 +339,29 @@ typedef enum { ZSTDb_buffered } ZSTD_buffered_policy_e; =20 +/* + * Struct that contains all elements of block splitter that should be allo= cated + * in a wksp. + */ +#define ZSTD_MAX_NB_BLOCK_SPLITS 196 +typedef struct { + seqStore_t fullSeqStoreChunk; + seqStore_t firstHalfSeqStore; + seqStore_t secondHalfSeqStore; + seqStore_t currSeqStore; + seqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +} ZSTD_blockSplitCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* =3D=3D 1 if cParams(except wlo= g) or compression level are changed in requestedParams. Triggers transmissi= on of new params to ZSTDMT (if available) then reset to 0. */ int bmi2; /* =3D=3D 1 if the CPU supports B= MI2 and 0 otherwise. CPU support is determined dynamically once per context= lifetime. 
*/ ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simp= le API - not sticky. Must only be used in top-level simple API functions fo= r storage. */ U32 dictID; size_t dictContentSize; =20 @@ -296,7 +386,7 @@ struct ZSTD_CCtx_s { ZSTD_blockState_t blockState; U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE= bytes */ =20 - /* Wether we are streaming or not */ + /* Whether we are streaming or not */ ZSTD_buffered_policy_e bufferedPolicy; =20 /* streaming */ @@ -324,6 +414,9 @@ struct ZSTD_CCtx_s { /* Multi-threading */ =20 /* Tracing */ + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; }; =20 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -358,7 +451,7 @@ typedef enum { typedef size_t (*ZSTD_blockCompressor) ( ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= dictMode_e dictMode); +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); =20 =20 MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -392,31 +485,6 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Cod= e[mlBase]; } =20 -typedef struct repcodes_s { - U32 rep[3]; -} repcodes_t; - -MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U= 32 const ll0) -{ - repcodes_t newReps; - if (offset >=3D ZSTD_REP_NUM) { /* full offset */ - newReps.rep[2] =3D rep[1]; - newReps.rep[1] =3D rep[0]; - newReps.rep[0] =3D offset - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode =3D offset + ll0; - if (repCode > 0) { /* note : if repCode=3D=3D0, no change */ - U32 const currentOffset =3D (repCode=3D=3DZSTD_REP_NUM) ? 
(rep= [0] - 1) : rep[repCode]; - newReps.rep[2] =3D (repCode >=3D 2) ? rep[1] : rep[2]; - newReps.rep[1] =3D rep[0]; - newReps.rep[0] =3D currentOffset; - } else { /* repCode =3D=3D 0 */ - ZSTD_memcpy(&newReps, rep, sizeof(newReps)); - } - } - return newReps; -} - /* ZSTD_cParam_withinBounds: * @return 1 if value is within cParam bounds, * 0 otherwise */ @@ -465,17 +533,17 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_s= trategy strat) return (srcSize >> minlog) + 2; } =20 -MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cct= xParams) +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* = cctxParams) { switch (cctxParams->literalCompressionMode) { - case ZSTD_lcm_huffman: + case ZSTD_ps_enable: return 0; - case ZSTD_lcm_uncompressed: + case ZSTD_ps_disable: return 1; default: assert(0 /* impossible: pre-validated */); ZSTD_FALLTHROUGH; - case ZSTD_lcm_auto: + case ZSTD_ps_auto: return (cctxParams->cParams.strategy =3D=3D ZSTD_fast) && (cctxPar= ams->cParams.targetLength > 0); } } @@ -485,7 +553,9 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZS= TD_CCtx_params* cctxParam * Only called when the sequence ends past ilimit_w, so it only needs to = be optimized for single * large copies. 
*/ -static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* co= nst iend, BYTE const* ilimit_w) { +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BY= TE const* ilimit_w) +{ assert(iend > ilimit_w); if (ip <=3D ilimit_w) { ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); @@ -495,14 +565,30 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE cons= t* ip, BYTE const* const ie while (ip < iend) *op++ =3D *ip++; } =20 +#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +#define STORE_REPCODE_1 STORE_REPCODE(1) +#define STORE_REPCODE_2 STORE_REPCODE(2) +#define STORE_REPCODE_3 STORE_REPCODE(3) +#define STORE_REPCODE(r) (assert((r)>=3D1), assert((r)<=3D3), (r)-1) +#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +#define STORED_IS_REPCODE(o) ((o) <=3D ZSTD_REP_MOVE) +#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* return= s ID 1,2,3 */ +#define STORED_TO_OFFBASE(o) ((o)+1) +#define OFFBASE_TO_STORED(o) ((o)-1) + /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. - * `offCode` : distance to match + ZSTD_REP_MOVE (values <=3D ZSTD_REP_MO= VE are repCodes). - * `mlBase` : matchLength - MINMATCH + * Store a sequence (litlen, litPtr, offCode and matchLength) into seqSto= re_t. + * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and S= TORE_OFFSET(). + * @matchLength : must be >=3D MINMATCH * Allowed to overread literals up to litLimit. 
*/ -HINT_INLINE UNUSED_ATTR -void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* = literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, + U32 offBase_minus1, + size_t matchLength) { BYTE const* const litLimit_w =3D litLimit - WILDCOPY_OVERLENGTH; BYTE const* const litEnd =3D literals + litLength; @@ -511,7 +597,7 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litL= ength, const BYTE* litera if (g_start=3D=3DNULL) g_start =3D (const BYTE*)literals; /* note : i= ndex only works for compression within a single segment */ { U32 const pos =3D (U32)((const BYTE*)literals - g_start); DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", - pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) = < seqStorePtr->maxNbSeq); @@ -535,26 +621,66 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t li= tLength, const BYTE* litera =20 /* literal Length */ if (litLength>0xFFFF) { - assert(seqStorePtr->longLengthID =3D=3D 0); /* there can only be a= single long length */ - seqStorePtr->longLengthID =3D 1; + assert(seqStorePtr->longLengthType =3D=3D ZSTD_llt_none); /* there= can only be a single long length */ + seqStorePtr->longLengthType =3D ZSTD_llt_literalLength; seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - seqS= torePtr->sequencesStart); } seqStorePtr->sequences[0].litLength =3D (U16)litLength; =20 /* match offset */ - seqStorePtr->sequences[0].offset =3D offCode + 1; + seqStorePtr->sequences[0].offBase =3D STORED_TO_OFFBASE(offBase_minus1= ); =20 /* match Length */ - if (mlBase>0xFFFF) { - assert(seqStorePtr->longLengthID =3D=3D 0); /* there can only be a= single long length */ - seqStorePtr->longLengthID =3D 2; - 
seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - seqS= torePtr->sequencesStart); + assert(matchLength >=3D MINMATCH); + { size_t const mlBase =3D matchLength - MINMATCH; + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthType =3D=3D ZSTD_llt_none); /* t= here can only be a single long length */ + seqStorePtr->longLengthType =3D ZSTD_llt_matchLength; + seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - = seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].mlBase =3D (U16)mlBase; } - seqStorePtr->sequences[0].matchLength =3D (U16)mlBase; =20 seqStorePtr->sequences++; } =20 +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_st= oreSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const = ll0) +{ + if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ + rep[2] =3D rep[1]; + rep[1] =3D rep[0]; + rep[0] =3D STORED_OFFSET(offBase_minus1); + } else { /* repcode */ + U32 const repCode =3D STORED_REPCODE(offBase_minus1) - 1 + ll0; + if (repCode > 0) { /* note : if repCode=3D=3D0, no change */ + U32 const currentOffset =3D (repCode=3D=3DZSTD_REP_NUM) ? (rep= [0] - 1) : rep[repCode]; + rep[2] =3D (repCode >=3D 2) ? 
rep[1] : rep[2]; + rep[1] =3D rep[0]; + rep[0] =3D currentOffset; + } else { /* repCode =3D=3D 0 */ + /* nothing to do */ + } + } +} + +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 con= st ll0) +{ + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); + ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); + return newReps; +} + =20 /*-************************************* * Match length counter @@ -778,6 +904,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* windo= w) window->dictLimit =3D end; } =20 +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit =3D=3D ZSTD_WINDOW_START_INDEX && + window.lowLimit =3D=3D ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) =3D=3D ZSTD_WINDOW_START_INDEX; +} + /* * ZSTD_window_hasExtDict(): * Returns non-zero if the window has a non-empty extDict. @@ -801,15 +934,71 @@ MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(c= onst ZSTD_matchState_t *ms) ZSTD_noDict; } =20 +/* Defining this macro to non-zero tells zstd to run the overflow correcti= on + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +# ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +# else +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +# endif +#endif + +/* + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. 
+ */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src) +{ + U32 const cycleSize =3D 1u << cycleLog; + U32 const curr =3D (U32)((BYTE const*)src - window.base); + U32 const minIndexToOverflowCorrect =3D cycleSize + + MAX(maxDist, cycleSize) + + ZSTD_WINDOW_START_INDEX; + + /* Adjust the min index to backoff the overflow correction frequency, + * so we don't waste too much CPU in overflow correction. If this + * computation overflows we don't really care, we just need to make + * sure it is at least minIndexToOverflowCorrect. + */ + U32 const adjustment =3D window.nbOverflowCorrections + 1; + U32 const adjustedIndex =3D MAX(minIndexToOverflowCorrect * adjustment, + minIndexToOverflowCorrect); + U32 const indexLargeEnough =3D curr > adjustedIndex; + + /* Only overflow correct early if the dictionary is invalidated alread= y, + * so we don't hurt compression ratio. + */ + U32 const dictionaryInvalidated =3D curr > maxDist + loadedDictEnd; + + return indexLargeEnough && dictionaryInvalidated; +} + /* * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. */ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const wind= ow, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src, void const* srcEnd) { U32 const curr =3D (U32)((BYTE const*)srcEnd - window.base); + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, load= edDictEnd, src)) { + return 1; + } + } return curr > ZSTD_CURRENT_MAX; } =20 @@ -821,7 +1010,6 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD= _window_t const window, * * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero. 
*/ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycl= eLog, U32 maxDist, void const* src) @@ -845,32 +1033,52 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_wind= ow_t* window, U32 cycleLog, * 3. (cctx->lowLimit + 1< 3<<29 + 1<base); - U32 const currentCycle0 =3D curr & cycleMask; - /* Exclude zero so that newCurrent - maxDist >=3D 1. */ - U32 const currentCycle1 =3D currentCycle0 =3D=3D 0 ? (1U << cycleLog) = : currentCycle0; - U32 const newCurrent =3D currentCycle1 + maxDist; + U32 const currentCycle =3D curr & cycleMask; + /* Ensure newCurrent - maxDist >=3D ZSTD_WINDOW_START_INDEX. */ + U32 const currentCycleCorrection =3D currentCycle < ZSTD_WINDOW_START_= INDEX + ? MAX(cycleSize, ZSTD_WINDOW_START_IN= DEX) + : 0; + U32 const newCurrent =3D currentCycle + + currentCycleCorrection + + MAX(maxDist, cycleSize); U32 const correction =3D curr - newCurrent; - assert((maxDist & cycleMask) =3D=3D 0); + /* maxDist must be a power of two so that: + * (newCurrent & cycleMask) =3D=3D (curr & cycleMask) + * This is required to not corrupt the chains / binary tree. 
+ */ + assert((maxDist & (maxDist - 1)) =3D=3D 0); + assert((curr & cycleMask) =3D=3D (newCurrent & cycleMask)); assert(curr > newCurrent); - /* Loose bound, should be around 1<<29 (see above) */ - assert(correction > 1<<28); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } =20 window->base +=3D correction; window->dictBase +=3D correction; - if (window->lowLimit <=3D correction) window->lowLimit =3D 1; - else window->lowLimit -=3D correction; - if (window->dictLimit <=3D correction) window->dictLimit =3D 1; - else window->dictLimit -=3D correction; + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->lowLimit =3D ZSTD_WINDOW_START_INDEX; + } else { + window->lowLimit -=3D correction; + } + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->dictLimit =3D ZSTD_WINDOW_START_INDEX; + } else { + window->dictLimit -=3D correction; + } =20 /* Ensure we can still reference the full window. */ assert(newCurrent >=3D maxDist); - assert(newCurrent - maxDist >=3D 1); + assert(newCurrent - maxDist >=3D ZSTD_WINDOW_START_INDEX); /* Ensure that lowLimit and dictLimit didn't underflow. 
*/ assert(window->lowLimit <=3D newCurrent); assert(window->dictLimit <=3D newCurrent); =20 + ++window->nbOverflowCorrections; + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=3D0x%x", correction, window->lowLimit); return correction; @@ -975,11 +1183,13 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, =20 MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { ZSTD_memset(window, 0, sizeof(*window)); - window->base =3D (BYTE const*)""; - window->dictBase =3D (BYTE const*)""; - window->dictLimit =3D 1; /* start from 1, so that 1st position is v= alid */ - window->lowLimit =3D 1; /* it ensures first and later CCtx usages = compress the same */ - window->nextSrc =3D window->base + 1; /* see issue #1241 */ + window->base =3D (BYTE const*)" "; + window->dictBase =3D (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX);= /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit =3D ZSTD_WINDOW_START_INDEX; /* start from >0, so= that 1st position is valid */ + window->lowLimit =3D ZSTD_WINDOW_START_INDEX; /* it ensures first = and later CCtx usages compress the same */ + window->nextSrc =3D window->base + ZSTD_WINDOW_START_INDEX; /* see i= ssue #1241 */ + window->nbOverflowCorrections =3D 0; } =20 /* @@ -990,7 +1200,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window= ) { * Returns non-zero if the segment is contiguous. 
*/ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize) + void const* src, size_t srcSize, + int forceNonContiguous) { BYTE const* const ip =3D (BYTE const*)src; U32 contiguous =3D 1; @@ -1000,7 +1211,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* wind= ow, assert(window->base !=3D NULL); assert(window->dictBase !=3D NULL); /* Check if blocks follow each other */ - if (src !=3D window->nextSrc) { + if (src !=3D window->nextSrc || forceNonContiguous) { /* not contiguous */ size_t const distanceFromBase =3D (size_t)(window->nextSrc - windo= w->base); DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", win= dow->dictLimit); @@ -1030,15 +1241,15 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* wi= ndow, */ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 c= urr, unsigned windowLog) { - U32 const maxDistance =3D 1U << windowLog; - U32 const lowestValid =3D ms->window.lowLimit; - U32 const withinWindow =3D (curr - lowestValid > maxDistance) ? cur= r - maxDistance : lowestValid; - U32 const isDictionary =3D (ms->loadedDictEnd !=3D 0); + U32 const maxDistance =3D 1U << windowLog; + U32 const lowestValid =3D ms->window.lowLimit; + U32 const withinWindow =3D (curr - lowestValid > maxDistance) ? curr -= maxDistance : lowestValid; + U32 const isDictionary =3D (ms->loadedDictEnd !=3D 0); /* When using a dictionary the entire dictionary is valid if a single = byte of the dictionary * is within the window. We invalidate the dictionary (and set loadedD= ictEnd to 0) when it isn't * valid for the entire block. So this check is sufficient to find the= lowest valid match index. */ - U32 const matchLowest =3D isDictionary ? lowestValid : withinWindow; + U32 const matchLowest =3D isDictionary ? 
lowestValid : withinWindow; return matchLowest; } =20 diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress= /zstd_compress_literals.c index 655bcda4d1f1..52b0a8059aba 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* pr= evHuf, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorksp= aceSize, - const int bmi2) + const int bmi2, + unsigned suspectUncompressible) { size_t const minGain =3D ZSTD_minGain(srcSize, strategy); size_t const lhSize =3D 3 + (srcSize >=3D 1 KB) + (srcSize >=3D 16 KB); @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const= * prevHuf, HUF_compress1X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspac= e, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, s= uspectUncompressible) : HUF_compress4X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspac= e, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, s= uspectUncompressible); if (repeat !=3D HUF_repeat_none) { /* reused the existing table */ DEBUGLOG(5, "Reusing previous huffman table"); @@ -117,7 +118,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* = prevHuf, } } =20 - if ((cLitSize=3D=3D0) | (cLitSize >=3D srcSize - minGain) | ERR_isErro= r(cLitSize)) { + if ((cLitSize=3D=3D0) || (cLitSize >=3D srcSize - minGain) || ERR_isEr= ror(cLitSize)) { ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress= /zstd_compress_literals.h 
index 9904c0cd30a0..9775fb97cb70 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCa= pacity, const void* src, =20 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const= void* src, size_t srcSize); =20 +/* If suspectUncompressible then some sampling checks will be run to poten= tially skip huffman coding */ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, ZSTD_hufCTables_t* nextHuf, ZSTD_strategy strategy, int disableLiteralCo= mpression, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorksp= aceSize, - const int bmi2); + const int bmi2, + unsigned suspectUncompressible); =20 #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compres= s/zstd_compress_sequences.c index dcfcdc9cc5e8..21ddc1b37acf 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -85,6 +85,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, uns= igned const max, size_t { unsigned cost =3D 0; unsigned s; + + assert(total > 0); for (s =3D 0; s <=3D max; ++s) { unsigned norm =3D (unsigned)((256 * count[s]) / total); if (count[s] !=3D 0 && norm =3D=3D 0) @@ -273,10 +275,11 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, assert(nbSeq_1 > 1); assert(entropyWorkspaceSize >=3D sizeof(ZSTD_BuildCTableWksp)); (void)entropyWorkspaceSize; - FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, n= bSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); - { size_t const NCountSize =3D FSE_writeNCount(op, oend - op, wks= p->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, n= bSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >=3D op); + { size_t const NCountSize =3D 
FSE_writeNCount(op, (size_t)(oend = - op), wksp->norm, max, tableLog); /* overflow protected */ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, = max, tableLog, wksp->wksp, sizeof(wksp->wksp)), ""); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, = max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp faile= d"); return NCountSize; } } @@ -310,19 +313,19 @@ ZSTD_encodeSequences_body( FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbS= eq-1]); BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCode= Table[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCo= deTable[nbSeq-1]]); + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTab= le[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); if (longOffsets) { U32 const ofBits =3D ofCodeTable[nbSeq-1]; unsigned const extraBits =3D ofBits - MIN(ofBits, STREAM_ACCUMULAT= OR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits= ); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBit= s); BIT_flushBits(&blockStream); } - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, ofBits - extraBits); } else { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[n= bSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[= nbSeq-1]); } BIT_flushBits(&blockStream); =20 @@ -336,8 +339,8 @@ ZSTD_encodeSequences_body( U32 const mlBits =3D ML_bits[mlCode]; DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u= ", (unsigned)sequences[n].litLength, - (unsigned)sequences[n].matchLength + MINMATCH, - (unsigned)sequences[n].offset); + (unsigned)sequences[n].mlBase + MINMATCH, + (unsigned)sequences[n].offBase); = /* 32b*/ /* 64b*/ = /* 
(7)*/ /* (7)*/ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); = /* 15 */ /* 15 */ @@ -348,18 +351,18 @@ ZSTD_encodeSequences_body( BIT_flushBits(&blockStream); = /* (7)*/ BIT_addBits(&blockStream, sequences[n].litLength, llBits); if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&block= Stream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits= (&blockStream); if (longOffsets) { unsigned const extraBits =3D ofBits - MIN(ofBits, STREAM_A= CCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[n].offset, extraBi= ts); + BIT_addBits(&blockStream, sequences[n].offBase, extraB= its); BIT_flushBits(&blockStream); = /* (7)*/ } - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + BIT_addBits(&blockStream, sequences[n].offBase >> extraBit= s, ofBits - extraBits); = /* 31 */ } else { - BIT_addBits(&blockStream, sequences[n].offset, ofBits); = /* 31 */ + BIT_addBits(&blockStream, sequences[n].offBase, ofBits); = /* 31 */ } BIT_flushBits(&blockStream); = /* (7)*/ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr -= blockStream.ptr)); @@ -396,7 +399,7 @@ ZSTD_encodeSequences_default( =20 #if DYNAMIC_BMI2 =20 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_encodeSequences_bmi2( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compre= ss/zstd_compress_superblock.c index b0610b255653..17d836cc84e8 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -15,289 +15,10 @@ =20 #include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */ #include "hist.h" /* HIST_countFast_wksp */ -#include "zstd_compress_internal.h" +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMeta= 
data_t */ #include "zstd_compress_sequences.h" #include "zstd_compress_literals.h" =20 -/*-************************************* -* Superblock entropy buffer structs -***************************************/ -/* ZSTD_hufCTablesMetadata_t : - * Stores Literals Block Type for a super-block in hType, and - * huffman tree description in hufDesBuffer. - * hufDesSize refers to the size of huffman tree description in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ -typedef struct { - symbolEncodingType_e hType; - BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; - size_t hufDesSize; -} ZSTD_hufCTablesMetadata_t; - -/* ZSTD_fseCTablesMetadata_t : - * Stores symbol compression modes for a super-block in {ll, ol, ml}Type,= and - * fse tables in fseTablesBuffer. - * fseTablesSize refers to the size of fse tables in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() = */ -typedef struct { - symbolEncodingType_e llType; - symbolEncodingType_e ofType; - symbolEncodingType_e mlType; - BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; - size_t fseTablesSize; - size_t lastCountSize; /* This is to account for bug in 1.3.4. More det= ail in ZSTD_compressSubBlock_sequences() */ -} ZSTD_fseCTablesMetadata_t; - -typedef struct { - ZSTD_hufCTablesMetadata_t hufMetadata; - ZSTD_fseCTablesMetadata_t fseMetadata; -} ZSTD_entropyCTablesMetadata_t; - - -/* ZSTD_buildSuperBlockEntropy_literal() : - * Builds entropy for the super-block literals. - * Stores literals block type (raw, rle, compressed, repeat) and - * huffman description table to hufMetadata. 
- * @return : size of huffman description table or error code */ -static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t = srcSize, - const ZSTD_hufCTables_t* prevH= uf, - ZSTD_hufCTables_t* nextH= uf, - ZSTD_hufCTablesMetadata_= t* hufMetadata, - const int disableLiteral= sCompression, - void* workspace, size_t = wkspSize) -{ - BYTE* const wkspStart =3D (BYTE*)workspace; - BYTE* const wkspEnd =3D wkspStart + wkspSize; - BYTE* const countWkspStart =3D wkspStart; - unsigned* const countWksp =3D (unsigned*)workspace; - const size_t countWkspSize =3D (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsi= gned); - BYTE* const nodeWksp =3D countWkspStart + countWkspSize; - const size_t nodeWkspSize =3D wkspEnd-nodeWksp; - unsigned maxSymbolValue =3D 255; - unsigned huffLog =3D HUF_TABLELOG_DEFAULT; - HUF_repeat repeat =3D prevHuf->repeatMode; - - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=3D%zu)", src= Size); - - /* Prepare nextEntropy assuming reusing the existing table */ - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - - if (disableLiteralsCompression) { - DEBUGLOG(5, "set_basic - disabled"); - hufMetadata->hType =3D set_basic; - return 0; - } - - /* small ? don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize =3D (prevHuf->repeatMode =3D=3D HUF_repeat= _valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <=3D minLitSize) { - DEBUGLOG(5, "set_basic - too small"); - hufMetadata->hType =3D set_basic; - return 0; - } - } - - /* Scan input and build symbol stats */ - { size_t const largest =3D HIST_count_wksp (countWksp, &maxSymbolVal= ue, (const BYTE*)src, srcSize, workspace, wkspSize); - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); - if (largest =3D=3D srcSize) { - DEBUGLOG(5, "set_rle"); - hufMetadata->hType =3D set_rle; - return 0; - } - if (largest <=3D (srcSize >> 7)+4) { - DEBUGLOG(5, "set_basic - no gain"); - hufMetadata->hType =3D set_basic; - return 0; - } - } - - /* Validate the previous Huffman table */ - if (repeat =3D=3D HUF_repeat_check && !HUF_validateCTable((HUF_CElt co= nst*)prevHuf->CTable, countWksp, maxSymbolValue)) { - repeat =3D HUF_repeat_none; - } - - /* Build Huffman Tree */ - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); - huffLog =3D HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); - { size_t const maxBits =3D HUF_buildCTable_wksp((HUF_CElt*)nextHuf->= CTable, countWksp, - maxSymbolValue, huffLo= g, - nodeWksp, nodeWkspSize= ); - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); - huffLog =3D (U32)maxBits; - { /* Build and write the CTable */ - size_t const newCSize =3D HUF_estimateCompressedSize( - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); - size_t const hSize =3D HUF_writeCTable_wksp( - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesB= uffer), - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - /* Check against repeating the previous CTable */ - if (repeat !=3D HUF_repeat_none) { - size_t const oldCSize =3D HUF_estimateCompressedSize( - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSy= mbolValue); - if (oldCSize < srcSize && (oldCSize <=3D hSize + newCSize = || hSize + 12 >=3D srcSize)) { - DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType =3D set_repeat; - 
return 0; - } - } - if (newCSize + hSize >=3D srcSize) { - DEBUGLOG(5, "set_basic - no gains"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType =3D set_basic; - return 0; - } - DEBUGLOG(5, "set_compressed (hSize=3D%u)", (U32)hSize); - hufMetadata->hType =3D set_compressed; - nextHuf->repeatMode =3D HUF_repeat_check; - return hSize; - } - } -} - -/* ZSTD_buildSuperBlockEntropy_sequences() : - * Builds entropy for the super-block sequences. - * Stores symbol compression modes and fse table to fseMetadata. - * @return : size of fse tables or error code */ -static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePt= r, - const ZSTD_fseCTables_t* pre= vEntropy, - ZSTD_fseCTables_t* nex= tEntropy, - const ZSTD_CCtx_params* cctx= Params, - ZSTD_fseCTablesMetadat= a_t* fseMetadata, - void* workspace, size_= t wkspSize) -{ - BYTE* const wkspStart =3D (BYTE*)workspace; - BYTE* const wkspEnd =3D wkspStart + wkspSize; - BYTE* const countWkspStart =3D wkspStart; - unsigned* const countWksp =3D (unsigned*)workspace; - const size_t countWkspSize =3D (MaxSeq + 1) * sizeof(unsigned); - BYTE* const cTableWksp =3D countWkspStart + countWkspSize; - const size_t cTableWkspSize =3D wkspEnd-cTableWksp; - ZSTD_strategy const strategy =3D cctxParams->cParams.strategy; - FSE_CTable* CTable_LitLength =3D nextEntropy->litlengthCTable; - FSE_CTable* CTable_OffsetBits =3D nextEntropy->offcodeCTable; - FSE_CTable* CTable_MatchLength =3D nextEntropy->matchlengthCTable; - const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; - const BYTE* const llCodeTable =3D seqStorePtr->llCode; - const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; - size_t const nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; - BYTE* const ostart =3D fseMetadata->fseTablesBuffer; - BYTE* const oend =3D ostart + sizeof(fseMetadata->fseTablesBuffer); - BYTE* op =3D ostart; - - assert(cTableWkspSize >=3D (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE= )); - DEBUGLOG(5, 
"ZSTD_buildSuperBlockEntropy_sequences (nbSeq=3D%zu)", nbS= eq); - ZSTD_memset(workspace, 0, wkspSize); - - fseMetadata->lastCountSize =3D 0; - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { U32 LLtype; - unsigned max =3D MaxLL; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->litlength_repeatMode =3D prevEntropy->litlength_repea= tMode; - LLtype =3D ZSTD_selectEncodingType(&nextEntropy->litlength_repeatM= ode, - countWksp, max, mostFrequent, nbSe= q, - LLFSELog, prevEntropy->litlengthCT= able, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatM= ode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - countWksp, max, llCode= Table, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->litlength= CTable, sizeof(prevEntropy->litlengthCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens fail= ed"); - if (LLtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->llType =3D (symbolEncodingType_e) LLtype; - } } - /* build CTable for Offsets */ - { U32 Offtype; - unsigned max =3D MaxOff; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->offcode_repeatMode =3D prevEntropy->offcode_repeatMod= e; - Offtype =3D ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMo= de, - countWksp, max, mostFrequent, nbSe= q, - OffFSELog, prevEntropy->offcodeCTa= ble, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMo= de !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - countWksp, max, ofCode= Table, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->offcodeCT= able, sizeof(prevEntropy->offcodeCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets fail= ed"); - if (Offtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->ofType =3D (symbolEncodingType_e) Offtype; - } } - /* build CTable for MatchLengths */ - { U32 MLtype; - unsigned max =3D MaxML; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); - nextEntropy->matchlength_repeatMode =3D prevEntropy->matchlength_r= epeatMode; - MLtype =3D ZSTD_selectEncodingType(&nextEntropy->matchlength_repea= tMode, - countWksp, max, mostFrequent, nbSe= q, - MLFSELog, prevEntropy->matchlength= CTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->matchlength_repea= tMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - countWksp, max, mlCode= Table, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - 
prevEntropy->matchleng= thCTable, sizeof(prevEntropy->matchlengthCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths= failed"); - if (MLtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->mlType =3D (symbolEncodingType_e) MLtype; - } } - assert((size_t) (op-ostart) <=3D sizeof(fseMetadata->fseTablesBuffer)); - return op-ostart; -} - - -/* ZSTD_buildSuperBlockEntropy() : - * Builds entropy for the super-block. - * @return : 0 on success or error code */ -static size_t -ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize) -{ - size_t const litSize =3D seqStorePtr->lit - seqStorePtr->litStart; - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); - entropyMetadata->hufMetadata.hufDesSize =3D - ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, - &prevEntropy->huf, &nextEntrop= y->huf, - &entropyMetadata->hufMetadata, - ZSTD_disableLiteralsCompressio= n(cctxParams), - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildS= uperBlockEntropy_literal failed"); - entropyMetadata->fseMetadata.fseTablesSize =3D - ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, - &prevEntropy->fse, &nextEntr= opy->fse, - cctxParams, - &entropyMetadata->fseMetadat= a, - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_bui= ldSuperBlockEntropy_sequences failed"); - return 0; -} - /* ZSTD_compressSubBlock_literal() : * Compresses literals section for a sub-block. 
* When we have to write the Huffman table we will sometimes choose a hea= der @@ -411,8 +132,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const= * seqStore, const seqDef* const seqDef* sp =3D sstart; size_t matchLengthSum =3D 0; size_t litLengthSum =3D 0; - /* Only used by assert(), suppress unused variable warnings in product= ion. */ - (void)litLengthSum; + (void)(litLengthSum); /* suppress unused variable warning on some envi= ronments */ while (send-sp > 0) { ZSTD_sequenceLength const seqLen =3D ZSTD_getSequenceLength(seqSto= re, sp); litLengthSum +=3D seqLen.litLength; @@ -605,7 +325,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const B= YTE* literals, size_t lit static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e ty= pe, const BYTE* codeTable, unsigned maxCode, size_t nbSeq, const FSE_CTable* fseCTable, - const U32* additionalBits, + const U8* additionalBits, short const* defaultNorm, U32 defaultNormLog, U32 = defaultMax, void* workspace, size_t wkspSize) { @@ -646,8 +366,9 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const= BYTE* ofCodeTable, void* workspace, size_t = wkspSize, int writeEntropy) { - size_t sequencesSectionHeaderSize =3D 3; /* Use hard coded size of 3 b= ytes */ + size_t const sequencesSectionHeaderSize =3D 3; /* Use hard coded size = of 3 bytes */ size_t cSeqSizeEstimate =3D 0; + if (nbSeq =3D=3D 0) return sequencesSectionHeaderSize; cSeqSizeEstimate +=3D ZSTD_estimateSubBlockSize_symbolType(fseMetadata= ->ofType, ofCodeTable, MaxOff, nbSeq, fseTables->offcodeCTable, = NULL, OF_defaultNorm, OF_defaultNormLog= , DefaultMaxOff, @@ -754,7 +475,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStor= e_t* seqStorePtr, /* I think there is an optimization opportunity here. * Calling ZSTD_estimateSubBlockSize for every sequence can be was= teful * since it recalculates estimate from scratch. - * For example, it would recount literal distribution and symbol c= odes everytime. 
+ * For example, it would recount literal distribution and symbol c= odes every time. */ cBlockSizeEstimate =3D ZSTD_estimateSubBlockSize(lp, litSize, ofCo= dePtr, llCodePtr, mlCodePtr, seqCount, &nextCBlock->entrop= y, entropyMetadata, @@ -818,7 +539,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStor= e_t* seqStorePtr, repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq =3D sstart; seq < sp; ++seq) { - rep =3D ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getS= equenceLength(seqStorePtr, seq).litLength =3D=3D 0); + ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequence= Length(seqStorePtr, seq).litLength =3D=3D 0); } ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } @@ -833,7 +554,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, unsigned lastBlock) { ZSTD_entropyCTablesMetadata_t entropyMetadata; =20 - FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h index 98e359adf5d4..349fc923c355 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -32,6 +32,10 @@ #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 #endif =20 + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + /*-************************************* * Structures ***************************************/ @@ -114,10 +118,11 @@ typedef enum { * - Tables: these are any of several different datastructures (hash table= s, * chain tables, binary trees) that all respect a common format: they are * uint32_t arrays, all of whose values are between 0 and (nextSrc - bas= e). - * Their sizes depend on the cparams. + * Their sizes depend on the cparams. These tables are 64-byte aligned. 
* * - Aligned: these buffers are used for various purposes that require 4 b= yte - * alignment, but don't require any initialization before they're used. + * alignment, but don't require any initialization before they're used. = These + * buffers are each aligned to 64 bytes. * * - Buffers: these buffers are used for various purposes that don't requi= re * any alignment or initialization before they're used. This means they = can @@ -130,8 +135,7 @@ typedef enum { * * 1. Objects * 2. Buffers - * 3. Aligned - * 4. Tables + * 3. Aligned/Tables * * Attempts to reserve objects of different types out of order will fail. */ @@ -184,6 +188,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t = const align) { * Since tables aren't currently redzoned, you don't need to call through = this * to figure out how much space you need for the matchState tables. Everyt= hing * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_= alloc_size(). */ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { if (size =3D=3D 0) @@ -191,66 +197,139 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size)= { return size; } =20 -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( - ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { +/* + * Returns an adjusted alloc size that is the nearest larger multiple of 6= 4 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNME= NT_BYTES)); +} + +/* + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). 
+ */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional n_1=3D[1= , 64] bytes + * to align the beginning of tables section, as well as another n_2=3D= [0, 63] bytes + * to align the beginning of the aligned section. + * + * n_1 + n_2 =3D=3D 64 bytes if the cwksp is freshly allocated, due to= tables and + * aligneds being sized in multiples of 64 bytes. + */ + size_t const slackSpace =3D ZSTD_CWKSP_ALIGNMENT_BYTES; + return slackSpace; +} + + +/* + * Return the number of additional bytes required to align a pointer to th= e given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t al= ignBytes) { + size_t const alignBytesMask =3D alignBytes - 1; + size_t const bytes =3D (alignBytes - ((size_t)ptr & (alignBytesMask)))= & alignBytesMask; + assert((alignBytes & alignBytesMask) =3D=3D 0); + assert(bytes !=3D ZSTD_CWKSP_ALIGNMENT_BYTES); + return bytes; +} + +/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of= the wksp, + * which counts from the end of the wksp (as opposed to the object/table s= egment). + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const byte= s) +{ + void* const alloc =3D (BYTE*)ws->allocStart - bytes; + void* const bottom =3D ws->tableEnd; + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >=3D bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed =3D 1; + return NULL; + } + /* the area is reserved from the end of wksp. 
+ * If it overlaps with tableValidEnd, it voids guarantees on values' r= ange */ + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd =3D alloc; + } + ws->allocStart =3D alloc; + return alloc; +} + +/* + * Moves the cwksp to the next phase, and does any necessary allocations. + * cwksp initialization must necessarily go through each phase in order. + * Returns a 0 on success, or zstd error + */ +MEM_STATIC size_t +ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e= phase) +{ assert(phase >=3D ws->phase); if (phase > ws->phase) { + /* Going from allocating objects to allocating buffers */ if (ws->phase < ZSTD_cwksp_alloc_buffers && phase >=3D ZSTD_cwksp_alloc_buffers) { ws->tableValidEnd =3D ws->objectEnd; } + + /* Going from allocating buffers to allocating aligneds/tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned && phase >=3D ZSTD_cwksp_alloc_aligned) { - /* If unaligned allocations down from a too-large top have lef= t us - * unaligned, we need to realign our alloc ptr. Technically, t= his - * can consume space that is unaccounted for in the neededSpace - * calculation. However, I believe this can only happen when t= he - * workspace is too large, and specifically when it is too lar= ge - * by a larger margin than the space that will be consumed. */ - /* TODO: cleaner, compiler warning friendly way to do this??? = */ - ws->allocStart =3D (BYTE*)ws->allocStart - ((size_t)ws->allocS= tart & (sizeof(U32)-1)); - if (ws->allocStart < ws->tableValidEnd) { - ws->tableValidEnd =3D ws->allocStart; + { /* Align the start of the "aligned" to 64 bytes. Use [1, 6= 4] bytes. 
*/ + size_t const bytesToAlign =3D + ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align= _ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); + DEBUGLOG(5, "reserving aligned alignment addtl space: %zu"= , bytesToAlign); + ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWK= SP_ALIGNMENT_BYTES - 1)) =3D=3D 0); /* power of 2 */ + RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(= ws, bytesToAlign), + memory_allocation, "aligned phase - alignm= ent initial allocation failed!"); } - } + { /* Align the start of the tables to 64 bytes. Use [0, 63] = bytes */ + void* const alloc =3D ws->objectEnd; + size_t const bytesToAlign =3D ZSTD_cwksp_bytes_to_align_pt= r(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* const objectEnd =3D (BYTE*)alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", = bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_alloc= ation, + "table phase - alignment initial allocatio= n failed!"); + ws->objectEnd =3D objectEnd; + ws->tableEnd =3D objectEnd; /* table area starts being em= pty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd =3D ws->tableEnd; + } } } ws->phase =3D phase; + ZSTD_cwksp_assert_internal_consistency(ws); } + return 0; } =20 /* * Returns whether this object/buffer/etc was allocated in this workspace. */ -MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* pt= r) { +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* pt= r) +{ return (ptr !=3D NULL) && (ws->workspace <=3D ptr) && (ptr <=3D ws->wo= rkspaceEnd); } =20 /* * Internal function. Do not use directly. 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_internal( - ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { +MEM_STATIC void* +ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc= _phase_e phase) +{ void* alloc; - void* bottom =3D ws->tableEnd; - ZSTD_cwksp_internal_advance_phase(ws, phase); - alloc =3D (BYTE *)ws->allocStart - bytes; - - if (bytes =3D=3D 0) + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || byte= s =3D=3D 0) { return NULL; + } =20 =20 - DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", - alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); - ZSTD_cwksp_assert_internal_consistency(ws); - assert(alloc >=3D bottom); - if (alloc < bottom) { - DEBUGLOG(4, "cwksp: alloc failed!"); - ws->allocFailed =3D 1; - return NULL; - } - if (alloc < ws->tableValidEnd) { - ws->tableValidEnd =3D alloc; - } - ws->allocStart =3D alloc; + alloc =3D ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); =20 =20 return alloc; @@ -259,33 +338,44 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal( /* * Reserves and returns unaligned memory. */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_= buffers); } =20 /* - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMEN= T_BYTES (64 bytes). 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { - assert((bytes & (sizeof(U32)-1)) =3D=3D 0); - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(= U32)), ZSTD_cwksp_alloc_aligned); +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) +{ + void* ptr =3D ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, = ZSTD_CWKSP_ALIGNMENT_BYTES), + ZSTD_cwksp_alloc_aligned); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))=3D=3D 0); + return ptr; } =20 /* - * Aligned on sizeof(unsigned). These buffers have the special property th= at + * Aligned on 64 bytes. These buffers have the special property that * their values remain constrained, allowing us to re-use them without * memset()-ing them. */ -MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) +{ const ZSTD_cwksp_alloc_phase_e phase =3D ZSTD_cwksp_alloc_aligned; - void* alloc =3D ws->tableEnd; - void* end =3D (BYTE *)alloc + bytes; - void* top =3D ws->allocStart; + void* alloc; + void* end; + void* top; + + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } + alloc =3D ws->tableEnd; + end =3D (BYTE *)alloc + bytes; + top =3D ws->allocStart; =20 DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); assert((bytes & (sizeof(U32)-1)) =3D=3D 0); - ZSTD_cwksp_internal_advance_phase(ws, phase); ZSTD_cwksp_assert_internal_consistency(ws); assert(end <=3D top); if (end > top) { @@ -296,27 +386,31 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp*= ws, size_t bytes) { ws->tableEnd =3D end; =20 =20 + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) =3D=3D 0); + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))=3D=3D 0); return alloc; } =20 /* * Aligned on sizeof(void*). 
+ * Note : should happen only once, at workspace first initialization */ -MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { - size_t roundedBytes =3D ZSTD_cwksp_align(bytes, sizeof(void*)); +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const roundedBytes =3D ZSTD_cwksp_align(bytes, sizeof(void*)); void* alloc =3D ws->objectEnd; void* end =3D (BYTE*)alloc + roundedBytes; =20 =20 - DEBUGLOG(5, + DEBUGLOG(4, "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes = remaining", alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - round= edBytes); - assert(((size_t)alloc & (sizeof(void*)-1)) =3D=3D 0); - assert((bytes & (sizeof(void*)-1)) =3D=3D 0); + assert((size_t)alloc % ZSTD_ALIGNOF(void*) =3D=3D 0); + assert(bytes % ZSTD_ALIGNOF(void*) =3D=3D 0); ZSTD_cwksp_assert_internal_consistency(ws); /* we must be in the first phase, no advance is possible */ if (ws->phase !=3D ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd)= { - DEBUGLOG(4, "cwksp: object alloc failed!"); + DEBUGLOG(3, "cwksp: object alloc failed!"); ws->allocFailed =3D 1; return NULL; } @@ -328,7 +422,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* = ws, size_t bytes) { return alloc; } =20 -MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) +{ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); =20 =20 @@ -451,6 +546,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cw= ksp* ws) { * Functions Checking Free Space ***************************************/ =20 +/* ZSTD_alignmentSpaceWithinBounds() : + * Returns if the estimated space needed for a wksp is within an acceptabl= e limit of the + * actual amount of space used. 
+ */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* = const ws, + size_t const estim= atedSpace, int resizedWorkspace) { + if (resizedWorkspace) { + /* Resized/newly allocated wksp should have exact bounds */ + return ZSTD_cwksp_used(ws) =3D=3D estimatedSpace; + } else { + /* Due to alignment, when reusing a workspace, we can actually con= sume 63 fewer or more bytes + * than estimatedSpace. See the comments in zstd_cwksp.h for detai= ls. + */ + return (ZSTD_cwksp_used(ws) >=3D estimatedSpace - 63) && (ZSTD_cwk= sp_used(ws) <=3D estimatedSpace + 63); + } +} + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); } diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_= double_fast.c index b0424d23ac57..76933dea2624 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -48,10 +48,216 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, =20 =20 FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_doubleFast_generic( +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams =3D &ms->cParams; + U32* const hashLong =3D ms->hashTable; + const U32 hBitsL =3D cParams->hashLog; + U32* const hashSmall =3D ms->chainTable; + const U32 hBitsS =3D cParams->chainLog; + const BYTE* const base =3D ms->window.base; + const BYTE* const istart =3D (const BYTE*)src; + const BYTE* anchor =3D istart; + const U32 endIndex =3D (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mo= de */ + const U32 prefixLowestIndex =3D ZSTD_getLowestPrefixIndex(ms, endIndex= , cParams->windowLog); + const BYTE* const prefixLowest =3D base + prefixLowestIndex; + const BYTE* const iend =3D istart + srcSize; + 
const BYTE* const ilimit =3D iend - HASH_READ_SIZE; + U32 offset_1=3Drep[0], offset_2=3Drep[1]; + U32 offsetSaved =3D 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr =3D 1 << kSearchStrength; + /* the position at which to increment the step size if no match is fou= nd */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + + const BYTE* ip =3D istart; /* the current position */ + const BYTE* ip1; /* the next position */ + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip +=3D ((ip - prefixLowest) =3D=3D 0); + { + U32 const current =3D (U32)(ip - base); + U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, current, cPa= rams->windowLog); + U32 const maxRep =3D current - windowLow; + if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; + if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step =3D 1; + nextStep =3D ip + kStepIncr; + ip1 =3D ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 =3D ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 =3D hashLong[hl0]; + matchl0 =3D base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 =3D ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 =3D hashSmall[hs0]; + curr =3D (U32)(ip-base); + matchs0 =3D base + idxs0; + + hashLong[hl0] =3D hashSmall[hs0] =3D curr; /* update hash ta= bles */ + + /* check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) =3D=3D MEM_rea= d32(ip+1))) { + mLength 
=3D ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_REPCODE_1, mLength); + goto _match_stored; + } + + hl1 =3D ZSTD_hashPtr(ip1, hBitsL, 8); + + if (idxl0 > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchl0) =3D=3D MEM_read64(ip)) { + mLength =3D ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset =3D (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-= 1] =3D=3D matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } + } + + idxl1 =3D hashLong[hl1]; + matchl1 =3D base + idxl1; + + if (idxs0 > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(matchs0) =3D=3D MEM_read32(ip)) { + goto _search_next_long; + } + } + + if (ip1 >=3D nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep +=3D kStepIncr; + } + ip =3D ip1; + ip1 +=3D step; + + hl0 =3D hl1; + idxl0 =3D idxl1; + matchl0 =3D matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <=3D ilimit); + +_cleanup: + /* save reps for next block */ + rep[0] =3D offset_1 ? offset_1 : offsetSaved; + rep[1] =3D offset_2 ? 
offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* check prefix long +1 match */ + if (idxl1 > prefixLowestIndex) { + if (MEM_read64(matchl1) =3D=3D MEM_read64(ip1)) { + ip =3D ip1; + mLength =3D ZSTD_count(ip+8, matchl1+8, iend) + 8; + offset =3D (U32)(ip-matchl1); + while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] = =3D=3D matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ + goto _match_found; + } + } + + /* if no long +1 match, explore the short match we found */ + mLength =3D ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset =3D (U32)(ip - matchs0); + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] =3D=3D ma= tchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 =3D offset_1; + offset_1 =3D offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when= ip1 is + * greater than or equal to the new ip we will have after we'r= e done + * processing this match. Rather than perform that test direct= ly + * (ip1 >=3D ip + mLength), which costs speed in practice, we = do a simpler + * more predictable test. The minmatch even if we take a short= match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. 
*/ + hashLong[hl1] =3D (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_O= FFSET(offset), mLength); + +_match_stored: + /* match found */ + ip +=3D mLength; + anchor =3D ip; + + if (ip <=3D ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert =3D curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] =3D = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] =3D (U32)(ip-2-bas= e); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = =3D indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] =3D (U32)(ip-1-= base); + } + + /* check immediate repcode */ + while ( (ip <=3D ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)) ))= { + /* store sequence */ + size_t const rLength =3D ZSTD_count(ip+4, ip+4-offset_2, i= end) + 4; + U32 const tmpOff =3D offset_2; offset_2 =3D offset_1; offs= et_1 =3D tmpOff; /* swap offset_2 <=3D> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D (U32)(ip-base= ); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = rLength); + ip +=3D rLength; + anchor =3D ip; + continue; /* faster when present ... (?) */ + } + } + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls /* template */, ZSTD_dictMode_e const dictMode) + U32 const mls /* template */) { ZSTD_compressionParameters const* cParams =3D &ms->cParams; U32* const hashLong =3D ms->hashTable; @@ -72,54 +278,30 @@ size_t ZSTD_compressBlock_doubleFast_generic( U32 offsetSaved =3D 0; =20 const ZSTD_matchState_t* const dms =3D ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams =3D - dictMode =3D=3D ZSTD_dictMatchState ? 
- &dms->cParams : NULL; - const U32* const dictHashLong =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->hashTable : NULL; - const U32* const dictHashSmall =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->chainTable : NULL; - const U32 dictStartIndex =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.dictLimit : 0; - const BYTE* const dictBase =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.base : NULL; - const BYTE* const dictStart =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictBase + dictStartIndex : NULL; - const BYTE* const dictEnd =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.nextSrc : NULL; - const U32 dictIndexDelta =3D dictMode =3D=3D ZSTD_dictMatchState= ? - prefixLowestIndex - (U32)(dictEnd - d= ictBase) : - 0; - const U32 dictHBitsL =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictCParams->hashLog : hBitsL; - const U32 dictHBitsS =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictCParams->chainLog : hBitsS; + const ZSTD_compressionParameters* const dictCParams =3D &dms->cParams; + const U32* const dictHashLong =3D dms->hashTable; + const U32* const dictHashSmall =3D dms->chainTable; + const U32 dictStartIndex =3D dms->window.dictLimit; + const BYTE* const dictBase =3D dms->window.base; + const BYTE* const dictStart =3D dictBase + dictStartIndex; + const BYTE* const dictEnd =3D dms->window.nextSrc; + const U32 dictIndexDelta =3D prefixLowestIndex - (U32)(dictEnd -= dictBase); + const U32 dictHBitsL =3D dictCParams->hashLog; + const U32 dictHBitsS =3D dictCParams->chainLog; const U32 dictAndPrefixLength =3D (U32)((ip - prefixLowest) + (dictEn= d - dictStart)); =20 - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); - - assert(dictMode =3D=3D ZSTD_noDict || dictMode =3D=3D ZSTD_dictMatchSt= ate); + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); =20 /* if a dictionary is attached, it must be within window range */ - if (dictMode =3D=3D ZSTD_dictMatchState) { - assert(ms->window.dictLimit + (1U << 
cParams->windowLog) >=3D endI= ndex); - } + assert(ms->window.dictLimit + (1U << cParams->windowLog) >=3D endIndex= ); =20 /* init */ ip +=3D (dictAndPrefixLength =3D=3D 0); - if (dictMode =3D=3D ZSTD_noDict) { - U32 const curr =3D (U32)(ip - base); - U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, curr, cParam= s->windowLog); - U32 const maxRep =3D curr - windowLow; - if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; - if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; - } - if (dictMode =3D=3D ZSTD_dictMatchState) { - /* dictMatchState repCode checks don't currently handle repCode = =3D=3D 0 - * disabling. */ - assert(offset_1 <=3D dictAndPrefixLength); - assert(offset_2 <=3D dictAndPrefixLength); - } + + /* dictMatchState repCode checks don't currently handle repCode =3D=3D= 0 + * disabling. */ + assert(offset_1 <=3D dictAndPrefixLength); + assert(offset_2 <=3D dictAndPrefixLength); =20 /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because repcode check at= (ip+1) */ @@ -135,29 +317,18 @@ size_t ZSTD_compressBlock_doubleFast_generic( const BYTE* matchLong =3D base + matchIndexL; const BYTE* match =3D base + matchIndexS; const U32 repIndex =3D curr + 1 - offset_1; - const BYTE* repMatch =3D (dictMode =3D=3D ZSTD_dictMatchState - && repIndex < prefixLowestIndex) ? + const BYTE* repMatch =3D (repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; hashLong[h2] =3D hashSmall[h] =3D curr; /* update hash tables */ =20 - /* check dictMatchState repcode */ - if (dictMode =3D=3D ZSTD_dictMatchState - && ((U32)((prefixLowestIndex-1) - repIndex) >=3D 3 /* intentio= nal underflow */) + /* check repcode */ + if (((U32)((prefixLowestIndex-1) - repIndex) >=3D 3 /* intentional= underflow */) && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* repMatchEnd =3D repIndex < prefixLowestIndex ? 
dic= tEnd : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixLowest) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); - goto _match_stored; - } - - /* check noDict repcode */ - if ( dictMode =3D=3D ZSTD_noDict - && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) =3D=3D MEM_read3= 2(ip+1)))) { - mLength =3D ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); goto _match_stored; } =20 @@ -169,7 +340,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1]= =3D=3D matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dictMatchState long match */ U32 const dictMatchIndexL =3D dictHashLong[dictHL]; const BYTE* dictMatchL =3D dictBase + dictMatchIndexL; @@ -187,7 +358,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( if (MEM_read32(match) =3D=3D MEM_read32(ip)) { goto _search_next_long; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dictMatchState short match */ U32 const dictMatchIndexS =3D dictHashSmall[dictHS]; match =3D dictBase + dictMatchIndexS; @@ -220,7 +391,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-= 1] =3D=3D matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dict long +1 match */ U32 const dictMatchIndexL3 =3D dictHashLong[dictHLNext]; const BYTE* dictMatchL3 =3D dictBase + dictMatchIndexL3; @@ -234,7 +405,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( } } } =20 /* if no long +1 match, explore the short match we found */ - if (dictMode =3D=3D 
ZSTD_dictMatchState && matchIndexS < prefixLow= estIndex) { + if (matchIndexS < prefixLowestIndex) { mLength =3D ZSTD_count_2segments(ip+4, match+4, iend, dictEnd,= prefixLowest) + 4; offset =3D (U32)(curr - matchIndexS); while (((ip>anchor) & (match>dictStart)) && (ip[-1] =3D=3D mat= ch[-1])) { ip--; match--; mLength++; } /* catch up */ @@ -248,7 +419,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( offset_2 =3D offset_1; offset_1 =3D offset; =20 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset = + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_O= FFSET(offset), mLength); =20 _match_stored: /* match found */ @@ -266,43 +437,27 @@ size_t ZSTD_compressBlock_doubleFast_generic( } =20 /* check immediate repcode */ - if (dictMode =3D=3D ZSTD_dictMatchState) { - while (ip <=3D ilimit) { - U32 const current2 =3D (U32)(ip-base); - U32 const repIndex2 =3D current2 - offset_2; - const BYTE* repMatch2 =3D dictMode =3D=3D ZSTD_dictMat= chState - && repIndex2 < prefixLowestIndex ? - dictBase + repIndex2 - dictIndexDelta : - base + repIndex2; - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= =3D 3 /* intentional overflow */) - && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { - const BYTE* const repEnd2 =3D repIndex2 < prefixLo= westIndex ? 
dictEnd : iend; - size_t const repLength2 =3D ZSTD_count_2segments(i= p+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; - U32 tmpOffset =3D offset_2; offset_2 =3D offset_1;= offset_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLen= gth2-MINMATCH); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D curre= nt2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; - ip +=3D repLength2; - anchor =3D ip; - continue; - } - break; - } } - - if (dictMode =3D=3D ZSTD_noDict) { - while ( (ip <=3D ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)= ) )) { - /* store sequence */ - size_t const rLength =3D ZSTD_count(ip+4, ip+4-offset_= 2, iend) + 4; - U32 const tmpOff =3D offset_2; offset_2 =3D offset_1; = offset_1 =3D tmpOff; /* swap offset_2 <=3D> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D (U32)(ip-= base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D (U32)(ip-bas= e); - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MI= NMATCH); - ip +=3D rLength; + while (ip <=3D ilimit) { + U32 const current2 =3D (U32)(ip-base); + U32 const repIndex2 =3D current2 - offset_2; + const BYTE* repMatch2 =3D repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >=3D 3= /* intentional overflow */) + && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { + const BYTE* const repEnd2 =3D repIndex2 < prefixLowest= Index ? 
dictEnd : iend; + size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset =3D offset_2; offset_2 =3D offset_1; off= set_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; + ip +=3D repLength2; anchor =3D ip; - continue; /* faster when present ... (?) */ - } } } + continue; + } + break; + } + } } /* while (ip < ilimit) */ =20 /* save reps for next block */ @@ -313,6 +468,24 @@ size_t ZSTD_compressBlock_doubleFast_generic( return (size_t)(iend - anchor); } =20 +#define ZSTD_GEN_DFAST_FN(dictMode, mls) = \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( = \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], \ + void const* src, size_t srcSize) = \ + { = \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqS= tore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + =20 size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -323,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 4, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, s= rc, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 5, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, s= rc, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, 
sr= c, srcSize, 6, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, s= rc, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 7, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, s= rc, srcSize); } } =20 @@ -343,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 4, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore= , rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 5, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore= , rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 6, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore= , rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 7, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore= , rep, src, srcSize); } } =20 @@ -385,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( =20 /* if extDict is invalidated due to maxDistance, switch to "regular" v= ariant */ if (prefixStartIndex =3D=3D dictStartIndex) - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, mls, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSi= ze); =20 /* Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because (ip+1) */ @@ -407,12 +580,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_g= eneric( hashSmall[hSmall] =3D hashLong[hLong] =3D curr; /* update hash t= able */ =20 if ((((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intentiona= l underflow : ensure repIndex 
doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 <=3D curr+1 - dictStartIndex)) /* note: we are sea= rching at curr+1 */ && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* repMatchEnd =3D repIndex < prefixStartIndex ? dict= End : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong= ) =3D=3D MEM_read64(ip))) { const BYTE* const matchEnd =3D matchLongIndex < prefixStar= tIndex ? dictEnd : iend; @@ -423,7 +596,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] = =3D=3D matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); =20 } else if ((matchIndex > dictStartIndex) && (MEM_read32(match)= =3D=3D MEM_read32(ip))) { size_t const h3 =3D ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -448,7 +621,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( } offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); =20 } else { ip +=3D ((ip-anchor) >> kSearchStrength) + 1; @@ -475,12 +648,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_g= eneric( U32 const repIndex2 =3D current2 - offset_2; const BYTE* repMatch2 =3D repIndex2 < prefixStartIndex ? 
d= ictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) /= * intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 <=3D current2 - dictStartIndex)) && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset =3D offset_2; offset_2 =3D offset_= 1; offset_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2= -MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; ip +=3D repLength2; @@ -498,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_ge= neric( return (size_t)(iend - anchor); } =20 +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) =20 size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -508,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, = src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, = src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, = src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, 
srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, = src, srcSize); } } diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c index 96b7d48e2868..a752e6beab52 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } =20 =20 +/* + * If you squint hard enough (and ignore repcodes), the search operation a= t any + * given position is broken into 4 stages: + * + * 1. Hash (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load (map index to value at that position via input read) + * 4. Compare + * + * Each of these steps involves a memory read at an address which is compu= ted + * from the previous step. This means these steps must be sequenced and th= eir + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before mov= ing + * onto the next, this implementation interleaves these operations across = the + * next few positions: + * + * R =3D Repcode Read & Compare + * H =3D Hash + * T =3D Table Lookup + * M =3D Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N | ... M + * N+1 | ... TM + * N+2 | R H T M + * N+3 | H TM + * N+4 | R H T M + * N+5 | H ... + * N+6 | R ... + * + * This is very much analogous to the pipelining of execution in a CPU. An= d just + * like a CPU, we have to dump the pipeline when we find a match (i.e., ta= ke a + * branch). + * + * When this happens, we throw away our current state, and do the followin= g prep + * to re-enter the loop: + * + * Pos | Time --> + * ----+------------------- + * N | H T + * N+1 | H + * + * This is also the work we do at the beginning to enter the loop initiall= y. 
+ */ FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_generic( +ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls) + U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; U32 const hlog =3D cParams->hashLog; /* support stepSize of 0 */ - size_t const stepSize =3D cParams->targetLength + !(cParams->targetLen= gth) + 1; + size_t const stepSize =3D hasStep ? (cParams->targetLength + !(cParams= ->targetLength) + 1) : 2; const BYTE* const base =3D ms->window.base; const BYTE* const istart =3D (const BYTE*)src; - /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ - const BYTE* ip0 =3D istart; - const BYTE* ip1; - const BYTE* anchor =3D istart; const U32 endIndex =3D (U32)((size_t)(istart - base) + srcSize); const U32 prefixStartIndex =3D ZSTD_getLowestPrefixIndex(ms, endInde= x, cParams->windowLog); const BYTE* const prefixStart =3D base + prefixStartIndex; const BYTE* const iend =3D istart + srcSize; const BYTE* const ilimit =3D iend - HASH_READ_SIZE; - U32 offset_1=3Drep[0], offset_2=3Drep[1]; + + const BYTE* anchor =3D istart; + const BYTE* ip0 =3D istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + U32 rep_offset1 =3D rep[0]; + U32 rep_offset2 =3D rep[1]; U32 offsetSaved =3D 0; =20 - /* init */ + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + U32 mval; /* src value at match idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + + /* ip0 and ip1 are always adjacent. The targetLength skipping and + * uncompressibility acceleration is applied to every other position, + * matching the behavior of #1562. step therefore represents the gap + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. 
*/ + size_t step; + const BYTE* nextStep; + const size_t kStepIncr =3D (1 << (kSearchStrength - 1)); + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 +=3D (ip0 =3D=3D prefixStart); - ip1 =3D ip0 + 1; { U32 const curr =3D (U32)(ip0 - base); U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, curr, cParam= s->windowLog); U32 const maxRep =3D curr - windowLow; - if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; - if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; + if (rep_offset2 > maxRep) offsetSaved =3D rep_offset2, rep_offset2= =3D 0; + if (rep_offset1 > maxRep) offsetSaved =3D rep_offset1, rep_offset1= =3D 0; } =20 - /* Main Search Loop */ -#ifdef __INTEL_COMPILER - /* From intel 'The vector pragma indicates that the loop should be - * vectorized if it is legal to do so'. Can be used together with - * #pragma ivdep (but have opted to exclude that because intel - * warns against using it).*/ - #pragma vector always -#endif - while (ip1 < ilimit) { /* < instead of <=3D, because check at ip0+2 = */ - size_t mLength; - BYTE const* ip2 =3D ip0 + 2; - size_t const h0 =3D ZSTD_hashPtr(ip0, hlog, mls); - U32 const val0 =3D MEM_read32(ip0); - size_t const h1 =3D ZSTD_hashPtr(ip1, hlog, mls); - U32 const val1 =3D MEM_read32(ip1); - U32 const current0 =3D (U32)(ip0-base); - U32 const current1 =3D (U32)(ip1-base); - U32 const matchIndex0 =3D hashTable[h0]; - U32 const matchIndex1 =3D hashTable[h1]; - BYTE const* repMatch =3D ip2 - offset_1; - const BYTE* match0 =3D base + matchIndex0; - const BYTE* match1 =3D base + matchIndex1; - U32 offcode; - -#if defined(__aarch64__) - PREFETCH_L1(ip0+256); -#endif - - hashTable[h0] =3D current0; /* update hash table */ - hashTable[h1] =3D current1; /* update hash table */ - - assert(ip0 + 1 =3D=3D ip1); - - if ((offset_1 > 0) & (MEM_read32(repMatch) =3D=3D MEM_read32(ip2))= ) { - mLength =3D (ip2[-1] =3D=3D repMatch[-1]) ? 
1 : 0; - ip0 =3D ip2 - mLength; - match0 =3D repMatch - mLength; + /* start each op */ +_start: /* Requires: ip0 */ + + step =3D stepSize; + nextStep =3D ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor =3D=3D 0, so we skip step calc */ + ip1 =3D ip0 + 1; + ip2 =3D ip0 + step; + ip3 =3D ip2 + 1; + + if (ip3 >=3D ilimit) { + goto _cleanup; + } + + hash0 =3D ZSTD_hashPtr(ip0, hlog, mls); + hash1 =3D ZSTD_hashPtr(ip1, hlog, mls); + + idx =3D hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ + const U32 rval =3D MEM_read32(ip2 - rep_offset1); + + /* write back hash table entry */ + current0 =3D (U32)(ip0 - base); + hashTable[hash0] =3D current0; + + /* check repcode at ip[2] */ + if ((MEM_read32(ip2) =3D=3D rval) & (rep_offset1 > 0)) { + ip0 =3D ip2; + match0 =3D ip0 - rep_offset1; + mLength =3D ip0[-1] =3D=3D match0[-1]; + ip0 -=3D mLength; + match0 -=3D mLength; + offcode =3D STORE_REPCODE_1; mLength +=3D 4; - offcode =3D 0; goto _match; } - if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) =3D=3D = val0) { - /* found a regular match */ - goto _offset; + + /* load match for ip[0] */ + if (idx >=3D prefixStartIndex) { + mval =3D MEM_read32(base + idx); + } else { + mval =3D MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } - if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) =3D=3D = val1) { - /* found a regular match after one literal */ - ip0 =3D ip1; - match0 =3D match1; + + /* check match at ip[0] */ + if (MEM_read32(ip0) =3D=3D mval) { + /* found a match! 
*/ goto _offset; } - { size_t const step =3D ((size_t)(ip0-anchor) >> (kSearchStrengt= h - 1)) + stepSize; - assert(step >=3D 2); - ip0 +=3D step; - ip1 +=3D step; - continue; + + /* lookup ip[1] */ + idx =3D hashTable[hash1]; + + /* hash ip[2] */ + hash0 =3D hash1; + hash1 =3D ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 =3D ip1; + ip1 =3D ip2; + ip2 =3D ip3; + + /* write back hash table entry */ + current0 =3D (U32)(ip0 - base); + hashTable[hash0] =3D current0; + + /* load match for ip[0] */ + if (idx >=3D prefixStartIndex) { + mval =3D MEM_read32(base + idx); + } else { + mval =3D MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } -_offset: /* Requires: ip0, match0 */ - /* Compute the offset code */ - offset_2 =3D offset_1; - offset_1 =3D (U32)(ip0-match0); - offcode =3D offset_1 + ZSTD_REP_MOVE; - mLength =3D 4; - /* Count the backwards match length */ - while (((ip0>anchor) & (match0>prefixStart)) - && (ip0[-1] =3D=3D match0[-1])) { ip0--; match0--; mLength++;= } /* catch up */ =20 -_match: /* Requires: ip0, match0, offcode */ - /* Count the forward length */ - mLength +=3D ZSTD_count(ip0+mLength, match0+mLength, iend); - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcod= e, mLength-MINMATCH); - /* match found */ - ip0 +=3D mLength; - anchor =3D ip0; + /* check match at ip[0] */ + if (MEM_read32(ip0) =3D=3D mval) { + /* found a match! 
*/ + goto _offset; + } =20 - if (ip0 <=3D ilimit) { - /* Fill Table */ - assert(base+current0+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] =3D curren= t0+2; /* here because current+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] =3D (U32)(ip0-2-base= ); - - if (offset_2 > 0) { /* offset_2=3D=3D0 means offset_2 is inval= idated */ - while ( (ip0 <=3D ilimit) && (MEM_read32(ip0) =3D=3D MEM_r= ead32(ip0 - offset_2)) ) { - /* store sequence */ - size_t const rLength =3D ZSTD_count(ip0+4, ip0+4-offse= t_2, iend) + 4; - { U32 const tmpOff =3D offset_2; offset_2 =3D offset_1= ; offset_1 =3D tmpOff; } /* swap offset_2 <=3D> offset_1 */ - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] =3D (U32)(ip0-= base); - ip0 +=3D rLength; - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 = /*offCode*/, rLength-MINMATCH); - anchor =3D ip0; - continue; /* faster when present (confirmed on gcc-8= ) ... (?) */ - } } } - ip1 =3D ip0 + 1; - } + /* lookup ip[1] */ + idx =3D hashTable[hash1]; + + /* hash ip[2] */ + hash0 =3D hash1; + hash1 =3D ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 =3D ip1; + ip1 =3D ip2; + ip2 =3D ip0 + step; + ip3 =3D ip1 + step; + + /* calculate step */ + if (ip2 >=3D nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep +=3D kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could sear= ch. + * However, it seems to be a meaningful performance hit to try to sear= ch + * them. So let's not. */ =20 /* save reps for next block */ - rep[0] =3D offset_1 ? offset_1 : offsetSaved; - rep[1] =3D offset_2 ? offset_2 : offsetSaved; + rep[0] =3D rep_offset1 ? rep_offset1 : offsetSaved; + rep[1] =3D rep_offset2 ? rep_offset2 : offsetSaved; =20 /* Return the last literals size */ return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + + /* Compute the offset code. 
*/ + match0 =3D base + idx; + rep_offset2 =3D rep_offset1; + rep_offset1 =3D (U32)(ip0-match0); + offcode =3D STORE_OFFSET(rep_offset1); + mLength =3D 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] =3D=3D match0= [-1])) { + ip0--; + match0--; + mLength++; + } + +_match: /* Requires: ip0, match0, offcode */ + + /* Count the forward length. */ + mLength +=3D ZSTD_count(ip0 + mLength, match0 + mLength, iend); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode,= mLength); + + ip0 +=3D mLength; + anchor =3D ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] =3D (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <=3D ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] =3D current0+2= ; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] =3D (U32)(ip0-2-base); + + if (rep_offset2 > 0) { /* rep_offset2=3D=3D0 means rep_offset2 is = invalidated */ + while ( (ip0 <=3D ilimit) && (MEM_read32(ip0) =3D=3D MEM_read3= 2(ip0 - rep_offset2)) ) { + /* store sequence */ + size_t const rLength =3D ZSTD_count(ip0+4, ip0+4-rep_offse= t2, iend) + 4; + { U32 const tmpOff =3D rep_offset2; rep_offset2 =3D rep_of= fset1; rep_offset1 =3D tmpOff; } /* swap rep_offset2 <=3D> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] =3D (U32)(ip0-base= ); + ip0 +=3D rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_= REPCODE_1, rLength); + anchor =3D ip0; + continue; /* faster when present (confirmed on gcc-8) ..= . (?) 
*/ + } } } + + goto _start; } =20 +#define ZSTD_GEN_FAST_FN(dictMode, mls, step) = \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( = \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], \ + void const* src, size_t srcSize) = \ + { = \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, = rep, src, srcSize, mls, step); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4, 1) +ZSTD_GEN_FAST_FN(noDict, 5, 1) +ZSTD_GEN_FAST_FN(noDict, 6, 1) +ZSTD_GEN_FAST_FN(noDict, 7, 1) + +ZSTD_GEN_FAST_FN(noDict, 4, 0) +ZSTD_GEN_FAST_FN(noDict, 5, 0) +ZSTD_GEN_FAST_FN(noDict, 6, 0) +ZSTD_GEN_FAST_FN(noDict, 7, 0) =20 size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast( { U32 const mls =3D ms->cParams.minMatch; assert(ms->dictMatchState =3D=3D NULL); - switch(mls) - { - default: /* includes case 3 */ - case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 4); - case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 5); - case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 6); - case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 7); + if (ms->cParams.targetLength > 1) { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, s= rc, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, s= rc, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, s= rc, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, s= rc, srcSize); + } + } else { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, s= rc, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, s= rc, 
srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, s= rc, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, s= rc, srcSize); + } + } } =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_fast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; @@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(endIndex - prefixStartIndex <=3D maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used w= hen assert() is disabled */ =20 + (void)hasStep; /* not currently specialized on whether it's accelerate= d */ + /* ensure there will be no underflow * when translating a dict index into a local index */ assert(prefixStartIndex >=3D (U32)(dictEnd - dictBase)); @@ -272,7 +439,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repMatchEnd =3D repIndex < prefixStartIndex = ? 
dictEnd : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); } else if ( (matchIndex <=3D prefixStartIndex) ) { size_t const dictHash =3D ZSTD_hashPtr(ip, dictHLog, mls); U32 const dictMatchIndex =3D dictHashTable[dictHash]; @@ -292,7 +459,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); } } else if (MEM_read32(match) !=3D MEM_read32(ip)) { /* it's not a match, and we're not going to check the dictiona= ry */ @@ -307,7 +474,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( && (ip[-1] =3D=3D match[-1])) { ip--; match--; mLength++;= } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, off= set + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_OFFSET(offset), mLength); } =20 /* match found */ @@ -332,7 +499,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? 
dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset =3D offset_2; offset_2 =3D offset_1; off= set_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2= -MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] =3D current2; ip +=3D repLength2; anchor =3D ip; @@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( return (size_t)(iend - anchor); } =20 + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + size_t ZSTD_compressBlock_fast_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, re= p, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, re= p, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, re= p, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, re= p, src, srcSize); } } =20 =20 static size_t ZSTD_compressBlock_fast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 
const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; @@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const ilimit =3D iend - 8; U32 offset_1=3Drep[0], offset_2=3Drep[1]; =20 + (void)hasStep; /* not currently specialized on whether it's accelerate= d */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=3D%u)",= offset_1); =20 /* switch to "regular" variant if extDict is invalidated due to maxDis= tance */ if (prefixStartIndex =3D=3D dictStartIndex) - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, mls); + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); =20 /* Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because (ip+1) */ @@ -416,14 +591,14 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch =3D repBase + repIndex; hashTable[h] =3D curr; /* update hash table */ DEBUGLOG(7, "offset_1 =3D %u , curr =3D %u", offset_1, curr); - assert(offset_1 <=3D curr +1); /* check repIndex */ =20 - if ( (((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intention= al underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intentio= nal underflow */ + & (offset_1 <=3D curr+1 - dictStartIndex) ) /* note: we are s= earching at curr+1 */ && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd =3D repIndex < prefixStartIndex = ? 
dictEnd : iend; size_t const rLength =3D ZSTD_count_2segments(ip+1 +4, repMatc= h +4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = rLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, rLength); ip +=3D rLength; anchor =3D ip; } else { @@ -439,7 +614,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( size_t mLength =3D ZSTD_count_2segments(ip+4, match+4, ien= d, matchEnd, prefixStart) + 4; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] =3D= =3D match[-1])) { ip--; match--; mLength++; } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; /* update off= set history */ - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); ip +=3D mLength; anchor =3D ip; } } @@ -453,12 +628,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 =3D (U32)(ip-base); U32 const repIndex2 =3D current2 - offset_2; const BYTE* const repMatch2 =3D repIndex2 < prefixStartInd= ex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) & (= repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) & (= offset_2 <=3D curr - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? 
dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; { U32 const tmpOffset =3D offset_2; offset_2 =3D offse= t_1; offset_1 =3D tmpOffset; } /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 = /*offcode*/, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, ST= ORE_REPCODE_1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] =3D current2; ip +=3D repLength2; anchor =3D ip; @@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( return (size_t)(iend - anchor); } =20 +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0) =20 size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 4); + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src,= srcSize); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 5); + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src,= srcSize); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 6); + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src,= srcSize); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 7); + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src,= srcSize); } } diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index fb54d4e28a2b..0298a01a7504 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, * assumption : curr >=3D btlow =3D=3D (curr - btmask) * doesn't fail */ static void 
-ZSTD_insertDUBT1(ZSTD_matchState_t* ms, +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) @@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, =20 static size_t ZSTD_DUBT_findBetterDictMatch ( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, size_t bestLength, @@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex =3D dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(= curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found bet= ter match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, match= Index %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetP= tr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); - bestLength =3D matchLength, *offsetPtr =3D ZSTD_REP_MOVE += curr - matchIndex; + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetP= tr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength =3D matchLength, *offsetPtr =3D STORE_OFFSET(cu= rr - matchIndex); } if (ip+matchLength =3D=3D iend) { /* reached end of input : = ip[matchLength] is not valid, no way to know if it's larger or smaller than= match */ break; /* drop, to guarantee consistency (miss a little = bit of compression) */ @@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch ( } =20 if (bestLength >=3D MINMATCH) { - U32 const mIndex =3D curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (vo= id)mIndex; + U32 const mIndex =3D curr - (U32)STORED_OFFSET(*offsetPtr); (void)= mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of le= ngth %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + 
(U32)matchLength; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbi= t32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength =3D matchLength, *offsetPtr =3D ZSTD_REP_MO= VE + curr - matchIndex; + bestLength =3D matchLength, *offsetPtr =3D STORE_OFFSE= T(curr - matchIndex); if (ip+matchLength =3D=3D iend) { /* equal : no way to k= now if inf or sup */ if (dictMode =3D=3D ZSTD_dictMatchState) { nbCompares =3D 0; /* in addition to avoiding check= ing any @@ -368,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased = */ ms->nextToUpdate =3D matchEndIdx - 8; /* skip repetitive pattern= s */ if (bestLength >=3D MINMATCH) { - U32 const mIndex =3D curr - ((U32)*offsetPtr - ZSTD_REP_MOVE);= (void)mIndex; + U32 const mIndex =3D curr - (U32)STORED_OFFSET(*offsetPtr); (v= oid)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of leng= th %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -391,91 +391,9 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMod= e); } =20 - -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_noDict); - } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_dictMatchState); - case 5 : 
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_dictMatchState); - } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_extDict); - } -} - - - /* ********************************* -* Hash Chain +* Dedicated dict search ***********************************/ -#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] - -/* Update chains up to ip (excluded) - Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, - const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) -{ - U32* const hashTable =3D ms->hashTable; - const U32 hashLog =3D cParams->hashLog; - U32* const chainTable =3D ms->chainTable; - const U32 chainMask =3D (1 << cParams->chainLog) - 1; - const BYTE* const base =3D ms->window.base; - const U32 target =3D (U32)(ip - base); - U32 idx =3D ms->nextToUpdate; - - while(idx < target) { /* catch up */ - size_t const h =3D ZSTD_hashPtr(base+idx, hashLog, mls); - NEXT_IN_CHAIN(idx, chainMask) =3D hashTable[h]; - hashTable[h] =3D idx; - idx++; - } - - ms->nextToUpdate =3D target; - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { - const ZSTD_compressionParameters* const cParams =3D &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cPar= ams.minMatch); -} =20 void 
ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, c= onst BYTE* const ip) { @@ -485,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_= matchState_t* ms, const B U32* const chainTable =3D ms->chainTable; U32 const chainSize =3D 1 << ms->cParams.chainLog; U32 idx =3D ms->nextToUpdate; - U32 const minChain =3D chainSize < target ? target - chainSize : idx; + U32 const minChain =3D chainSize < target - idx ? target - chainSize := idx; U32 const bucketSize =3D 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; U32 const cacheSize =3D bucketSize - 1; U32 const chainAttempts =3D (1 << ms->cParams.searchLog) - cacheSize; @@ -499,13 +417,12 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZST= D_matchState_t* ms, const B U32 const hashLog =3D ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; U32* const tmpHashTable =3D hashTable; U32* const tmpChainTable =3D hashTable + ((size_t)1 << hashLog); - U32 const tmpChainSize =3D ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << h= ashLog; + U32 const tmpChainSize =3D (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1)= << hashLog; U32 const tmpMinChain =3D tmpChainSize < target ? 
target - tmpChainSiz= e : idx; - U32 hashIdx; =20 assert(ms->cParams.chainLog <=3D 24); - assert(ms->cParams.hashLog >=3D ms->cParams.chainLog); + assert(ms->cParams.hashLog > ms->cParams.chainLog); assert(idx !=3D 0); assert(tmpMinChain <=3D minChain); =20 @@ -536,7 +453,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_= matchState_t* ms, const B if (count =3D=3D cacheSize) { for (count =3D 0; count < chainLimit;) { if (i < minChain) { - if (!i || countBeyondMinChain++ > cacheSize) { + if (!i || ++countBeyondMinChain > cacheSize) { /* only allow pulling `cacheSize` number of en= tries * into the cache or chainTable beyond `minCha= in`, * to replace the entries pulled out of the @@ -592,10 +509,143 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZS= TD_matchState_t* ms, const B ms->nextToUpdate =3D target; } =20 +/* Returns the longest match length found in the dedicated dict search str= ucture. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, = U32 nbAttempts, + const ZSTD_matchState_t* const= dms, + const BYTE* const ip, const BY= TE* const iLimit, + const BYTE* const prefixStart,= const U32 curr, + const U32 dictLimit, const siz= e_t ddsIdx) { + const U32 ddsLowestIndex =3D dms->window.dictLimit; + const BYTE* const ddsBase =3D dms->window.base; + const BYTE* const ddsEnd =3D dms->window.nextSrc; + const U32 ddsSize =3D (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta =3D dictLimit - ddsSize; + const U32 bucketSize =3D (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit =3D nbAttempts < bucketSize - 1 ? 
nbAttempts= : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt =3D 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucketSiz= e - 1]; + U32 const chainIndex =3D chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt =3D 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=3D0; + const BYTE* match; + matchIndex =3D dms->hashTable[ddsIdx + ddsAttempt]; + match =3D ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >=3D ddsLowestIndex); + assert(match+4 <=3D ddsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) { + /* assumption : matchIndex <=3D dictLimit-4 (by table construc= tion) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, ddsE= nd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + ddsIndexDelta= )); + if (ip+currentMl =3D=3D iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucketSiz= e - 1]; + U32 chainIndex =3D chainPackedPointer >> 8; + U32 const chainLength =3D chainPackedPointer & 0xFF; + U32 const chainAttempts =3D nbAttempts - ddsAttempt; + U32 const chainLimit =3D chainAttempts > chainLength ? 
chainLength= : chainAttempts; + U32 chainAttempt; + + for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAttempt+= +) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttemp= t]); + } + + for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAttempt+= +, chainIndex++) { + size_t currentMl=3D0; + const BYTE* match; + matchIndex =3D dms->chainTable[chainIndex]; + match =3D ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >=3D ddsLowestIndex); + assert(match+4 <=3D ddsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) { + /* assumption : matchIndex <=3D dictLimit-4 (by table cons= truction) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, = ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + ddsIndexD= elta)); + if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable =3D ms->hashTable; + const U32 hashLog =3D cParams->hashLog; + U32* const chainTable =3D ms->chainTable; + const U32 chainMask =3D (1 << cParams->chainLog) - 1; + const BYTE* const base =3D ms->window.base; + const U32 target =3D (U32)(ip - base); + U32 idx =3D ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h =3D ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) =3D hashTable[h]; + hashTable[h] =3D idx; + idx++; + } + + ms->nextToUpdate =3D target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams =3D &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cPar= ams.minMatch); +} =20 /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -653,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml =3D currentMl; - *offsetPtr =3D curr - matchIndex + ZSTD_REP_MOVE; + *offsetPtr =3D STORE_OFFSET(curr - matchIndex); if (ip+currentMl =3D=3D iLimit) break; /* best possible, avoid= s read overflow on next attempt */ } =20 @@ -663,90 +713,8 @@ size_t ZSTD_HcFindBestMatch_generic ( =20 assert(nbAttempts <=3D (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven'= t underflowed. 
*/ if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { - const U32 ddsLowestIndex =3D dms->window.dictLimit; - const BYTE* const ddsBase =3D dms->window.base; - const BYTE* const ddsEnd =3D dms->window.nextSrc; - const U32 ddsSize =3D (U32)(ddsEnd - ddsBase); - const U32 ddsIndexDelta =3D dictLimit - ddsSize; - const U32 bucketSize =3D (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); - const U32 bucketLimit =3D nbAttempts < bucketSize - 1 ? nbAtte= mpts : bucketSize - 1; - U32 ddsAttempt; - - for (ddsAttempt =3D 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { - PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); - } - - { - U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucke= tSize - 1]; - U32 const chainIndex =3D chainPackedPointer >> 8; - - PREFETCH_L1(&dms->chainTable[chainIndex]); - } - - for (ddsAttempt =3D 0; ddsAttempt < bucketLimit; ddsAttempt++) { - size_t currentMl=3D0; - const BYTE* match; - matchIndex =3D dms->hashTable[ddsIdx + ddsAttempt]; - match =3D ddsBase + matchIndex; - - if (!matchIndex) { - return ml; - } - - /* guaranteed by table construction */ - (void)ddsLowestIndex; - assert(matchIndex >=3D ddsLowestIndex); - assert(match+4 <=3D ddsEnd); - if (MEM_read32(match) =3D=3D MEM_read32(ip)) { - /* assumption : matchIndex <=3D dictLimit-4 (by table cons= truction) */ - currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, = ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + ddsIndexDelta) + ZSTD_= REP_MOVE; - if (ip+currentMl =3D=3D iLimit) { - /* best possible, avoids read overflow on next attempt= */ - return ml; - } - } - } - - { - U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucke= tSize - 1]; - U32 chainIndex =3D chainPackedPointer >> 8; - U32 const chainLength =3D chainPackedPointer & 0xFF; - U32 const chainAttempts =3D nbAttempts - ddsAttempt; - U32 const chainLimit =3D chainAttempts > chainLength ? 
chainLe= ngth : chainAttempts; - U32 chainAttempt; - - for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAtte= mpt++) { - PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAt= tempt]); - } - - for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAtte= mpt++, chainIndex++) { - size_t currentMl=3D0; - const BYTE* match; - matchIndex =3D dms->chainTable[chainIndex]; - match =3D ddsBase + matchIndex; - - /* guaranteed by table construction */ - assert(matchIndex >=3D ddsLowestIndex); - assert(match+4 <=3D ddsEnd); - if (MEM_read32(match) =3D=3D MEM_read32(ip)) { - /* assumption : matchIndex <=3D dictLimit-4 (by table = construction) */ - currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLim= it, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + ddsIndexDelta) + Z= STD_REP_MOVE; - if (ip+currentMl =3D=3D iLimit) break; /* best possibl= e, avoids read overflow on next attempt */ - } - } - } + ml =3D ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttem= pts, dms, + ip, iLimit, prefixStart,= curr, dictLimit, ddsIdx); } else if (dictMode =3D=3D ZSTD_dictMatchState) { const U32* const dmsChainTable =3D dms->chainTable; const U32 dmsChainSize =3D (1 << dms->cParams.chainLog); @@ -770,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + dmsIndexDelta) + ZSTD_= REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + dmsIndexD= elta)); if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ } =20 @@ -783,75 +752,725 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } =20 +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of 
hashes in the ma= tch state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entr= ies per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) =20 -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U6= 4 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + assert(val !=3D 0); +# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ =3D=3D 3) && (_= _GNUC_MINOR__ >=3D 4)))) + if (sizeof(size_t) =3D=3D 4) { + U32 mostSignificantWord =3D (U32)(val >> 32); + U32 leastSignificantWord =3D (U32)val; + if (leastSignificantWord =3D=3D 0) { + return 32 + (U32)__builtin_ctz(mostSignificantWord); + } else { + return (U32)__builtin_ctz(leastSignificantWord); + } + } else { + return (U32)__builtin_ctzll(val); + } +# else + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%2= 0Count + * and: https://stackoverflow.com/questions/2709430/count-number-of-bi= ts-in-a-64-bit-long-big-integer + */ + val =3D ~val & (val - 1ULL); /* Lowest set bit mask */ + val =3D val - ((val >> 1) & 0x5555555555555555); + val =3D (val & 0x3333333333333333ULL) + ((val >> 2) & 0x33333333333333= 33ULL); + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x10101010= 1010101ULL) >> 56); +# endif +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. 
+ * https://en.wikipedia.org/w/index.php?title=3DCircular_shift&oldid=3D991= 635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &=3D 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &=3D 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &=3D 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates = the "head" + * value to reflect the update. Essentially cycles backwards from [0, {ent= ries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const= rowMask) { + U32 const next =3D (*tagRow - 1) & rowMask; + *tagRow =3D (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power= of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) =3D=3D 0); + return (((size_t)ptr) & (align - 1)) =3D=3D 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 con= st* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >=3D 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be = beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog =3D=3D 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog =3D=3D 4 || rowLog =3D=3D 5 || rowLog =3D=3D 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* pre= fetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* pre= fetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH= _CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, c= onst BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_noDict); + U32 const* const hashTable =3D ms->hashTable; + U16 const* const tagTable =3D ms->tagTable; + U32 const hashLog =3D ms->rowHashLog; + U32 const maxElemsToPrefetch =3D (base + idx) > iLimit ? 
0 : (U32)(iLi= mit - (base + idx) + 1); + U32 const lim =3D idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefet= ch); + + for (; idx < lim; ++idx) { + U32 const hash =3D (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_RO= W_HASH_TAG_BITS, mls); + U32 const row =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] =3D hash; } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms-= >hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms-= >hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms-= >hashCache[6], ms->hashCache[7]); } =20 +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache= with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate = rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* h= ashTable, + U16 const* tagTable, BYT= E const* base, + U32 idx, U32 const hashL= og, + U32 const rowLog, U32 co= nst mls) +{ + U32 const newHash =3D (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_S= IZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row =3D (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash =3D cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] =3D newHash; + return hash; + } +} =20 -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx unti= l updateEndIdx. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t*= ms, + U32 updateStartIdx= , U32 const updateEndIdx, + U32 const mls, U32= const rowLog, + U32 const rowMask,= U32 const useCache) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_dictMatchState); + U32* const hashTable =3D ms->hashTable; + U16* const tagTable =3D ms->tagTable; + U32 const hashLog =3D ms->rowHashLog; + const BYTE* const base =3D ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=3D%u, upda= teEndIdx=3D%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash =3D useCache ? ZSTD_row_nextCachedHash(ms->hashCach= e, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) + : (U32)ZSTD_hashPtr(base + updateStartId= x, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const relRow =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row =3D hashTable + relRow; + BYTE* tagRow =3D (BYTE*)(tagTable + relRow); /* Though tagTable i= s laid out as a table of U16, each tag is only 1 byte. + Explicit cast allow= s us to get exact desired position within each row */ + U32 const pos =3D ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash =3D=3D ZSTD_hashPtr(base + updateStartIdx, hashLog + Z= STD_ROW_HASH_TAG_BITS, mls)); + ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] =3D hash & ZSTD_RO= W_HASH_TAG_MASK; + row[pos] =3D updateStartIdx; } } =20 +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table,= and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms,= const BYTE* ip, + U32 const mls, U32 con= st rowLog, + U32 const rowMask, U32= const useCache) +{ + U32 idx =3D ms->nextToUpdate; + const BYTE* const base =3D ms->window.base; + const U32 target =3D (U32)(ip - base); + const U32 kSkipThreshold =3D 384; + const U32 kMaxMatchStartPositionsToUpdate =3D 96; + const U32 kMaxMatchEndPositionsToUpdate =3D 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound =3D idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowM= ask, useCache); + idx =3D target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } + } + assert(target >=3D idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, us= eCache); + ms->nextToUpdate =3D target; +} =20 -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the h= ashtable during dictionary + * processing. 
+ */ +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask =3D (1u << rowLog) - 1; + const U32 mls =3D MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=3D%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use c= ache */); +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, c= onst U32 head) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_dedicatedDictSearch); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_dedicatedDictSearch); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_dedicatedDictSearch); + const __m128i comparisonMask =3D _mm_set1_epi8((char)tag); + int matches[4] =3D {0}; + int i; + assert(nbChunks =3D=3D 1 || nbChunks =3D=3D 2 || nbChunks =3D=3D 4); + for (i=3D0; i> chunkSize; + do { + size_t chunk =3D MEM_readST(&src[i]); + chunk ^=3D splatChar; + chunk =3D (((chunk | x80) - x01) | chunk) & x80; + matches <<=3D chunkSize; + matches |=3D (chunk * extractMagic) >> shiftAmount; + i -=3D chunkSize; + } while (i >=3D 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb =3D xFF ^ (xFF >> 1); + const size_t extractMagic =3D (msb / 0x1FF) | msb; + do { + size_t chunk =3D MEM_readST(&src[i]); + chunk ^=3D splatChar; + chunk =3D (((chunk | x80) - x01) | chunk) & x80; + matches <<=3D chunkSize; + matches |=3D ((chunk >> 7) * extractMagic) >> shiftAmount; + i -=3D chunkSize; + } while (i >=3D 0); + } + matches =3D ~matches; + if (rowEntries =3D=3D 16) { + return ZSTD_rotateRight_U16((U16)matches, head); + } else if (rowEntries =3D=3D 32) { + return ZSTD_rotateRight_U32((U32)matches, 
head); + } else { + return ZSTD_rotateRight_U64((U64)matches, head); + } + } +#endif +} =20 -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( +/* The high-level approach of the SIMD row based match finder is as follow= s: + * - Figure out where to insert the new entry: + * - Generate a hash from a byte along with an additional 1-byte "sho= rt hash". The additional byte is our "tag" + * - The hashTable is effectively split into groups or "rows" of 16 o= r 32 entries of U32, and the hash determines + * which row to insert into. + * - Determine the correct position within the row to insert the entr= y into. Each row of 16 or 32 can + * be considered as a circular buffer with a "head" index that resi= des in the tagTable. + * - Also insert the "tag" into the equivalent row and position in th= e tagTable. + * - Note: The tagTable has 17 or 33 1-byte entries per row, due = to 16 or 32 tags, and 1 "head" entry. + * The 17 or 33 entry rows are spaced out to occur every = 32 or 64 bytes, respectively, + * for alignment/performance reasons, leaving some bytes = unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byt= e "short hash" and + * generate a bitfield that we can cycle through to check the collisions= in the hash table. + * - Pick the longest match. 
+ */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_RowFindBestMatch( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_extDict); + U32* const hashTable =3D ms->hashTable; + U16* const tagTable =3D ms->tagTable; + U32* const hashCache =3D ms->hashCache; + const U32 hashLog =3D ms->rowHashLog; + const ZSTD_compressionParameters* const cParams =3D &ms->cParams; + const BYTE* const base =3D ms->window.base; + const BYTE* const dictBase =3D ms->window.dictBase; + const U32 dictLimit =3D ms->window.dictLimit; + const BYTE* const prefixStart =3D base + dictLimit; + const BYTE* const dictEnd =3D dictBase + dictLimit; + const U32 curr =3D (U32)(ip-base); + const U32 maxDistance =3D 1U << cParams->windowLog; + const U32 lowestValid =3D ms->window.lowLimit; + const U32 withinMaxDistance =3D (curr - lowestValid > maxDistance) ? c= urr - maxDistance : lowestValid; + const U32 isDictionary =3D (ms->loadedDictEnd !=3D 0); + const U32 lowLimit =3D isDictionary ? 
lowestValid : withinMaxDistance; + const U32 rowEntries =3D (1U << rowLog); + const U32 rowMask =3D rowEntries - 1; + const U32 cappedSearchLog =3D MIN(cParams->searchLog, rowLog); /* nb o= f searches is capped at nb entries per row */ + U32 nbAttempts =3D 1U << cappedSearchLog; + size_t ml=3D4-1; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms =3D ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx =3D 0; + U32 ddsExtraAttempts =3D 0; /* cctx hash tables are limited in searche= s, but allow extra searches into DDS */ + U32 dmsTag =3D 0; + U32* dmsRow =3D NULL; + BYTE* dmsTagRow =3D NULL; + + if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog =3D dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUC= KET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx =3D ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS= _BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts =3D cParams->searchLog > rowLog ? 
1U << (cParams-= >searchLog - rowLog) : 0; + } + + if (dictMode =3D=3D ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable =3D dms->hashTable; + U16* const dmsTagTable =3D dms->tagTable; + U32 const dmsHash =3D (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD= _ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow =3D (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << row= Log; + dmsTag =3D dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow =3D (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow =3D dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache *= /); + { /* Get the hash for ip, compute the appropriate row */ + U32 const hash =3D ZSTD_row_nextCachedHash(hashCache, hashTable, t= agTable, base, curr, hashLog, rowLog, mls); + U32 const relRow =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag =3D hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row =3D hashTable + relRow; + BYTE* tagRow =3D (BYTE*)(tagTable + relRow); + U32 const head =3D *tagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches =3D 0; + size_t currMatch =3D 0; + ZSTD_VecMask matches =3D ZSTD_row_getMatchMask(tagRow, (BYTE)tag, = head, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= =3D (matches - 1)) { + U32 const matchPos =3D (head + ZSTD_VecMask_next(matches)) & r= owMask; + U32 const matchIndex =3D row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; + if ((dictMode !=3D ZSTD_extDict) || matchIndex >=3D dictLimit)= { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] =3D matchIndex; + } + + /* Speed opt: insert current byte into hashtable too. 
This allows = us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos =3D ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] =3D (BYTE)tag; + row[pos] =3D ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex =3D matchBuffer[currMatch]; + size_t currentMl=3D0; + assert(matchIndex < curr); + assert(matchIndex >=3D lowLimit); + + if ((dictMode !=3D ZSTD_extDict) || matchIndex >=3D dictLimit)= { + const BYTE* const match =3D base + matchIndex; + assert(matchIndex >=3D dictLimit); /* ensures this is tr= ue if dictMode !=3D ZSTD_extDict */ + if (match[ml] =3D=3D ip[ml]) /* potentially better */ + currentMl =3D ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match =3D dictBase + matchIndex; + assert(match+4 <=3D dictEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) /* assumpti= on : matchIndex <=3D dictLimit-4 (by table construction) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLim= it, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - matchIndex); + if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ + } + } + } + + assert(nbAttempts <=3D (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven'= t underflowed. 
*/ + if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + ml =3D ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttem= pts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart,= curr, dictLimit, ddsIdx); + } else if (dictMode =3D=3D ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex =3D dms->window.dictLimit; + const BYTE* const dmsBase =3D dms->window.base; + const BYTE* const dmsEnd =3D dms->window.nextSrc; + const U32 dmsSize =3D (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta =3D dictLimit - dmsSize; + + { U32 const head =3D *dmsTagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches =3D 0; + size_t currMatch =3D 0; + ZSTD_VecMask matches =3D ZSTD_row_getMatchMask(dmsTagRow, (BYT= E)dmsTag, head, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matche= s &=3D (matches - 1)) { + U32 const matchPos =3D (head + ZSTD_VecMask_next(matches))= & rowMask; + U32 const matchIndex =3D dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] =3D matchIndex; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex =3D matchBuffer[currMatch]; + size_t currentMl=3D0; + assert(matchIndex >=3D dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match =3D dmsBase + matchIndex; + assert(match+4 <=3D dmsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) + currentMl =3D ZSTD_count_2segments(ip+4, match+4, = iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml =3D currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + dmsIn= dexDelta)); + if (ip+currentMl =3D=3D iLimit) break; + } + } + } } + return ml; } =20 =20 +/* + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. 
+ * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating= a + * bunch of constants from the ZSTD_matchState_t. These computations could= be + * done in an initialization function, and saved somewhere in the match st= ate. + * Then we could pass a pointer to the saved state instead of the match st= ate, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compressi= on + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. 
+ */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##= _##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##= _##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##= dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( = \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offBasePtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_= ##dictMode); \ + } = \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( = \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offsetPtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_#= #dictMode); \ + } = \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(= \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offsetPtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) =3D=3D rowLog); = \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_= ##dictMode, rowLog); \ + } = \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define 
ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog)= */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls)= */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) = */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=3D0, search_binaryTree=3D1, search_rowHash= =3D2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) = \ + case rowLog: = \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, off= setPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ + } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) = \ + case mls: = \ + switch (rowLog) { = \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, ml= s) \ + } = \ + ZSTD_UNREACHABLE; = \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; + +/* + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). 
We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is= found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_matchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) +{ + if (dictMode =3D=3D ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode =3D=3D ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode =3D=3D ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) + } + ZSTD_UNREACHABLE; + return 0; +} + /* ******************************* * Common parser - lazy strategy *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e; =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( @@ -865,41 +1484,13 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip =3D istart; const BYTE* anchor =3D istart; const BYTE* const iend =3D 
istart + srcSize; - const BYTE* const ilimit =3D iend - 8; + const BYTE* const ilimit =3D (searchMethod =3D=3D search_rowHash) ? ie= nd - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base =3D ms->window.base; const U32 prefixLowestIndex =3D ms->window.dictLimit; const BYTE* const prefixLowest =3D base + prefixLowestIndex; + const U32 mls =3D BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); =20 - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offset= Ptr); - - /* - * This table is indexed first by the four ZSTD_dictMode_e values, and= then - * by the two searchMethod_e values. NULLs are placed for configuratio= ns - * that should never occur (extDict modes go to the other implementati= on - * below and there is no DDSS for binary tree search yet). - */ - const searchMax_f searchFuncs[4][2] =3D { - { - ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS - }, - { - NULL, - NULL - }, - { - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS - }, - { - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL - } - }; - - searchMax_f const searchMax =3D searchFuncs[dictMode][searchMethod =3D= =3D search_binaryTree]; U32 offset_1 =3D rep[0], offset_2 =3D rep[1], savedOffset=3D0; =20 const int isDMS =3D dictMode =3D=3D ZSTD_dictMatchState; @@ -915,11 +1506,7 @@ ZSTD_compressBlock_lazy_generic( 0; const U32 dictAndPrefixLength =3D (U32)((ip - prefixLowest) + (dictEnd= - dictLowest)); =20 - assert(searchMax !=3D NULL); - - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=3D%u)", (U32)di= ctMode); - - /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=3D%u) (searchFu= nc=3D%u)", (U32)dictMode, (U32)searchMethod); ip +=3D (dictAndPrefixLength =3D=3D 0); if (dictMode =3D=3D ZSTD_noDict) { U32 const curr =3D (U32)(ip - base); @@ -935,6 +1522,12 @@ ZSTD_compressBlock_lazy_generic( 
assert(offset_2 <=3D dictAndPrefixLength); } =20 + if (searchMethod =3D=3D search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at= 6 */), + ms->nextToUpdate, ilimit); + } + /* Match Loop */ #if defined(__x86_64__) /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when = the @@ -944,8 +1537,9 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=3D0; - size_t offset=3D0; + size_t offcode=3DSTORE_REPCODE_1; const BYTE* start=3Dip+1; + DEBUGLOG(7, "search baseline (depth 0)"); =20 /* check repCode */ if (isDxS) { @@ -969,9 +1563,9 @@ ZSTD_compressBlock_lazy_generic( =20 /* first search (depth 0) */ { size_t offsetFound =3D 999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offsetFound= , mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength =3D ml2, start =3D ip, offset=3DoffsetFound; + matchLength =3D ml2, start =3D ip, offcode=3DoffsetFound; } =20 if (matchLength < 4) { @@ -982,14 +1576,15 @@ ZSTD_compressBlock_lazy_generic( /* let's try to find a better solution */ if (depth>=3D1) while (ip0) & (MEM_read32(ip) =3D=3D MEM_re= ad32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) =3D=3D MEM_r= ead32(ip - offset_1)))) { size_t const mlRep =3D ZSTD_count(ip+4, ip+4-offset_1, ien= d) + 4; int const gain2 =3D (int)(mlRep * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit32((= U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1, st= art =3D ip; } if (isDxS) { const U32 repIndex =3D (U32)(ip - base) - offset_1; @@ -1001,30 +1596,31 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd =3D repIndex < prefixLowestInd= ex ? 
dictEnd : iend; size_t const mlRep =3D ZSTD_count_2segments(ip+4, repM= atch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 =3D (int)(mlRep * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1= , start =3D ip; } } { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)offs= et2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)offset+1) + 4); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offset2= , mls, rowLog, searchMethod, dictMode); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)STOR= ED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D ip; + matchLength =3D ml2, offcode =3D offset2, start =3D ip; continue; /* search a better one */ } } =20 /* let's find an even better one */ if ((depth=3D=3D2) && (ip0) & (MEM_read32(ip) =3D=3D ME= M_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) =3D=3D M= EM_read32(ip - offset_1)))) { size_t const mlRep =3D ZSTD_count(ip+4, ip+4-offset_1,= iend) + 4; int const gain2 =3D (int)(mlRep * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1= , start =3D ip; } if (isDxS) { const U32 repIndex =3D (U32)(ip - base) - offset_1; @@ 
-1036,46 +1632,45 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd =3D repIndex < prefixLowes= tIndex ? dictEnd : iend; size_t const mlRep =3D ZSTD_count_2segments(ip+4, = repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 =3D (int)(mlRep * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D= ip; + matchLength =3D mlRep, offcode =3D STORE_REPCO= DE_1, start =3D ip; } } { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= offset2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 7); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &off= set2, mls, rowLog, searchMethod, dictMode); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D= ip; + matchLength =3D ml2, offcode =3D offset2, start = =3D ip; continue; } } } break; /* nothing found : store previous solution */ } =20 /* NOTE: - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, w= hich - * overflows the pointer, which is undefined behavior. + * Pay attention that `start[-value]` can lead to strange undefine= d behavior + * notably if `value` is unsigned, resulting in a large positive `= -value`. 
*/ /* catch up */ - if (offset) { + if (STORED_IS_OFFSET(offcode)) { if (dictMode =3D=3D ZSTD_noDict) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE= ) > prefixLowest)) - && (start[-1] =3D=3D (start-(offset-ZSTD_REP_MOVE))[-= 1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - STORED_OFFSET(offcode= ) > prefixLowest)) + && (start[-1] =3D=3D (start-STORED_OFFSET(offcode))[-= 1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex =3D (U32)((start-base) - (offset - ZS= TD_REP_MOVE)); + U32 const matchIndex =3D (U32)((size_t)(start-base) - STOR= ED_OFFSET(offcode)); const BYTE* match =3D (matchIndex < prefixLowestIndex) ? d= ictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart =3D (matchIndex < prefixLowestInd= ex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] =3D= =3D match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 =3D offset_1; offset_1 =3D (U32)(offset - ZSTD_REP_MO= VE); + offset_2 =3D offset_1; offset_1 =3D (U32)STORED_OFFSET(offcode= ); } /* store sequence */ _storeSequence: - { size_t const litLength =3D start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, = matchLength-MINMATCH); + { size_t const litLength =3D (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode,= matchLength); anchor =3D ip =3D start + matchLength; } =20 @@ -1091,8 +1686,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(repMatch) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex < prefixLowestI= ndex ? 
dictEnd : iend; matchLength =3D ZSTD_count_2segments(ip+4, repMatch+4,= iend, repEnd2, prefixLowest) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 = =3D (U32)offset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLengt= h-MINMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 = =3D (U32)offcode; /* swap offset_2 <=3D> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, matchLength); ip +=3D matchLength; anchor =3D ip; continue; @@ -1106,8 +1701,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength =3D ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 =3D (= U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MI= NMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 =3D = (U32)offcode; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = matchLength); ip +=3D matchLength; anchor =3D ip; continue; /* faster when present ... (?) 
*/ @@ -1200,6 +1795,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_hashChain, 0, ZSTD_dedicatedDictSearch); } =20 +/* Row-based matchfinder */ +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + 
void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_dedicatedDictSearch); +} =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_extDict_generic( @@ -1212,7 +1871,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* ip =3D istart; const BYTE* anchor =3D istart; const BYTE* const iend =3D istart + srcSize; - const BYTE* const ilimit =3D iend - 8; + const BYTE* const ilimit =3D searchMethod =3D=3D search_rowHash ? iend= - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base =3D ms->window.base; const U32 dictLimit =3D ms->window.dictLimit; const BYTE* const prefixStart =3D base + dictLimit; @@ -1220,18 +1879,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictEnd =3D dictBase + dictLimit; const BYTE* const dictStart =3D dictBase + ms->window.lowLimit; const U32 windowLog =3D ms->cParams.windowLog; - - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offset= Ptr); - searchMax_f searchMax =3D searchMethod=3D=3Dsearch_binaryTree ? 
ZSTD_B= tFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + const U32 mls =3D BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); =20 U32 offset_1 =3D rep[0], offset_2 =3D rep[1]; =20 - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=3D%u)= ", (U32)searchMethod); =20 /* init */ ip +=3D (ip =3D=3D prefixStart); + if (searchMethod =3D=3D search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out= at 6 */), + ms->nextToUpdate, ilimit); + } =20 /* Match Loop */ #if defined(__x86_64__) @@ -1242,7 +1903,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=3D0; - size_t offset=3D0; + size_t offcode=3DSTORE_REPCODE_1; const BYTE* start=3Dip+1; U32 curr =3D (U32)(ip-base); =20 @@ -1251,7 +1912,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex =3D (U32)(curr+1 - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? dictBase = : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex > win= dowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intentional o= verflow */ + & (offset_1 <=3D curr+1 - windowLow) ) /* note: we are sear= ching at curr+1 */ if (MEM_read32(ip+1) =3D=3D MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
dictEn= d : iend; @@ -1261,9 +1923,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( =20 /* first search (depth 0) */ { size_t offsetFound =3D 999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offsetFound= , mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength =3D ml2, start =3D ip, offset=3DoffsetFound; + matchLength =3D ml2, start =3D ip, offcode=3DoffsetFound; } =20 if (matchLength < 4) { @@ -1277,29 +1939,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow =3D ZSTD_getLowestMatchIndex(ms, curr,= windowLog); const U32 repIndex =3D (U32)(curr - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? dictB= ase : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex >= windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intention= al overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <=3D curr - windowLow) ) /* equivalent to `= curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
di= ctEnd : iend; size_t const repLength =3D ZSTD_count_2segments(ip+4, = repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 =3D (int)(repLength * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >=3D 4) && (gain2 > gain1)) - matchLength =3D repLength, offset =3D 0, start =3D= ip; + matchLength =3D repLength, offcode =3D STORE_REPCO= DE_1, start =3D ip; } } =20 /* search match, depth 1 */ { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)offs= et2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)offset+1) + 4); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offset2= , mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)STOR= ED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D ip; + matchLength =3D ml2, offcode =3D offset2, start =3D ip; continue; /* search a better one */ } } =20 @@ -1308,47 +1971,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow =3D ZSTD_getLowestMatchIndex(ms, c= urr, windowLog); const U32 repIndex =3D (U32)(curr - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? 
d= ictBase : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repInd= ex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* inten= tional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <=3D curr - windowLow) ) /* equivalent = to `curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd =3D repIndex < dictLimit = ? dictEnd : iend; size_t const repLength =3D ZSTD_count_2segments(ip= +4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 =3D (int)(repLength * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >=3D 4) && (gain2 > gain1)) - matchLength =3D repLength, offset =3D 0, start= =3D ip; + matchLength =3D repLength, offcode =3D STORE_R= EPCODE_1, start =3D ip; } } =20 /* search match, depth 2 */ { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= offset2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 7); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &off= set2, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D= ip; + matchLength =3D ml2, offcode =3D offset2, start = =3D ip; continue; } } } break; /* nothing found : store previous solution */ } =20 /* catch up */ - if (offset) { - U32 const matchIndex =3D (U32)((start-base) - (offset - ZSTD_R= EP_MOVE)); + if 
(STORED_IS_OFFSET(offcode)) { + U32 const matchIndex =3D (U32)((size_t)(start-base) - STORED_O= FFSET(offcode)); const BYTE* match =3D (matchIndex < dictLimit) ? dictBase + ma= tchIndex : base + matchIndex; const BYTE* const mStart =3D (matchIndex < dictLimit) ? dictSt= art : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] =3D=3D m= atch[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 =3D offset_1; offset_1 =3D (U32)(offset - ZSTD_REP_MO= VE); + offset_2 =3D offset_1; offset_1 =3D (U32)STORED_OFFSET(offcode= ); } =20 /* store sequence */ _storeSequence: - { size_t const litLength =3D start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, = matchLength-MINMATCH); + { size_t const litLength =3D (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode,= matchLength); anchor =3D ip =3D start + matchLength; } =20 @@ -1359,13 +2023,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex =3D repCurrent - offset_2; const BYTE* const repBase =3D repIndex < dictLimit ? dictBase = : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex > win= dowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intentional o= verflow : do not test positions overlapping 2 memory segments */ + & (offset_2 <=3D repCurrent - windowLow) ) /* equivalent to= `curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
dictEn= d : iend; matchLength =3D ZSTD_count_2segments(ip+4, repMatch+4, ien= d, repEnd, prefixStart) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 =3D (= U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MI= NMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 =3D = (U32)offcode; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = matchLength); ip +=3D matchLength; anchor =3D ip; continue; /* faster when present ... (?) */ @@ -1412,3 +2077,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict( { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_binaryTree, 2); } + +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 2); +} diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h index 2fc5a6182134..e5bdf4df8dde 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -23,6 +23,7 @@ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 =20 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); =20 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, c= onst BYTE* const ip); =20 @@ 
-40,6 +41,15 @@ size_t ZSTD_compressBlock_lazy( size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -53,6 +63,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState( size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -63,6 +82,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -73,9 +101,19 @@ size_t ZSTD_compressBlock_lazy_extDict( size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + =20 =20 =20 #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c index 8ef7e88a5add..dd86fc83e7dd 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* st= ate, ldmParams_t const* } } =20 +/* ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering a= ny + * splits. This effectively resets the hash state. This is used when skipp= ing + * over data, either at the beginning of a block, or skipping sections. 
+ */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash =3D state->rolling; + size_t n =3D 0; + +#define GEAR_ITER_ONCE() do { \ + hash =3D (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n +=3D 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + /* ZSTD_ldm_gear_feed(): * * Registers in the splits array all the split points found in the first @@ -132,12 +159,12 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmBucketSize =3D ((size_t)1) << (params.hashLog - ldmBuc= ketSizeLog); size_t const totalSize =3D ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEn= try_t)); - return params.enableLdm ? totalSize : 0; + return params.enableLdm =3D=3D ZSTD_ps_enable ? totalSize : 0; } =20 size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) { - return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; + return params.enableLdm =3D=3D ZSTD_ps_enable ? 
(maxChunkSize / params= .minMatchLength) : 0; } =20 /* ZSTD_ldm_getBucket() : @@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable( while (ip < iend) { size_t hashed; unsigned n; - =20 + numSplits =3D 0; hashed =3D ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &= numSplits); =20 @@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal( =20 /* Initialize the rolling hash state with the first minMatchLength byt= es */ ZSTD_ldm_gear_init(&hashState, params); - { - size_t n =3D 0; - - while (n < minMatchLength) { - numSplits =3D 0; - n +=3D ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength -= n, - splits, &numSplits); - } - ip +=3D minMatchLength; - } + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip +=3D minMatchLength; =20 while (ip < ilimit) { size_t hashed; @@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal( for (n =3D 0; n < numSplits; n++) { size_t forwardMatchLength =3D 0, backwardMatchLength =3D 0, bestMatchLength =3D 0, mLength; + U32 offset; BYTE const* const split =3D candidates[n].split; U32 const checksum =3D candidates[n].checksum; U32 const hash =3D candidates[n].hash; @@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal( } =20 /* Match found */ + offset =3D (U32)(split - base) - bestEntry->offset; mLength =3D forwardMatchLength + backwardMatchLength; { - U32 const offset =3D (U32)(split - base) - bestEntry->offs= et; rawSeq* const seq =3D rawSeqStore->seq + rawSeqStore->size; =20 /* Out of sequence storage */ @@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal( ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); =20 anchor =3D split + forwardMatchLength; + + /* If we find a match that ends after the data that we've hash= ed + * then we have a repeating, overlapping, pattern. E.g. all ze= ros. + * If one repetition of the pattern matches our `stopMask` the= n all + * repetitions will. We don't need to insert them all into out= table, + * only the first one. 
So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single = byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, m= inMatchLength); + /* Continue the outer loop at anchor (ip + hashed =3D=3D a= nchor). */ + ip =3D anchor - hashed; + break; + } } =20 ip +=3D hashed; @@ -500,7 +535,7 @@ size_t ZSTD_ldm_generateSequences( =20 assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)= ) { + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDis= t, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize =3D 1U << params->hashLog; U32 const correction =3D ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); @@ -544,7 +579,9 @@ size_t ZSTD_ldm_generateSequences( return 0; } =20 -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U3= 2 const minMatch) { +void +ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 con= st minMatch) +{ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { rawSeq* seq =3D rawSeqStore->seq + rawSeqStore->pos; if (srcSize <=3D seq->litLength) { @@ -622,12 +659,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* raw= SeqStore, size_t nbBytes) { =20 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; unsigned const minMatch =3D cParams->minMatch; ZSTD_blockCompressor const blockCompressor =3D - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dict= Mode(ms)); + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, Z= STD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const 
istart =3D (BYTE const*)src; BYTE const* const iend =3D istart + srcSize; @@ -673,8 +711,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStor= e, rep[0] =3D sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, - sequence.offset + ZSTD_REP_MOVE, - sequence.matchLength - MINMATCH); + STORE_OFFSET(sequence.offset), + sequence.matchLength); ip +=3D sequence.matchLength; } } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h index 25b25270b72e..fbc6a5e88fd7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -63,6 +63,7 @@ size_t ZSTD_ldm_generateSequences( */ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize); =20 /* diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_= ldm_geartab.h index e5c24d856b0a..647f865be290 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -11,7 +11,10 @@ #ifndef ZSTD_LDM_GEARTAB_H #define ZSTD_LDM_GEARTAB_H =20 -static U64 ZSTD_ldm_gearTab[256] =3D { +#include "../common/compiler.h" /* UNUSED_ATTR */ +#include "../common/mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] =3D { 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index dfc55e3e8119..fd82acfda62f 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -8,25 +8,12 @@ * You may select, at your option, one of the above-listed licenses. */ =20 -/* - * Disable inlining for the optimal parser for the kernel build. 
- * It is unlikely to be used in the kernel, and where it is used - * latency shouldn't matter because it is very slow to begin with. - * We prefer a ~180KB binary size win over faster optimal parsing. - * - * TODO(https://github.com/facebook/zstd/issues/2862): - * Improve the code size of the optimal parser in general, so we - * don't need this hack for the kernel build. - */ -#define ZSTD_NO_INLINE 1 - #include "zstd_compress_internal.h" #include "hist.h" #include "zstd_opt.h" =20 =20 #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that fre= quencies adapt faster to new stats */ -#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to= init next stats */ #define ZSTD_MAX_PRICE (1<<30) =20 #define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD= , symbols' cost is assumed static, directly determined by pre-defined distr= ibutions */ @@ -36,11 +23,11 @@ * Price functions for optimal parser ***************************************/ =20 -#if 0 /* approximation at bit level */ +#if 0 /* approximation at bit level (for tests) */ # define BITCOST_ACCURACY 0 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) -#elif 0 /* fractional bit accuracy */ +# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy (for tests) */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) # define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) @@ -78,7 +65,7 @@ MEM_STATIC double ZSTD_fCost(U32 price) =20 static int ZSTD_compressedLiterals(optState_t const* const optPtr) { - return optPtr->literalCompressionMode !=3D ZSTD_lcm_uncompressed; + return optPtr->literalCompressionMode !=3D ZSTD_ps_disable; } =20 static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) @@ -91,25 +78,46 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int = optLevel) } =20 =20 -/* ZSTD_downscaleStat() : - * reduce all elements in table 
by a factor 2^(ZSTD_FREQ_DIV+malus) - * return the resulting sum of elements */ -static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ + size_t n; + U32 total =3D 0; + for (n=3D0; n 0 && ZSTD_FREQ_DIV+malus < 31); + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=3D%u, shift=3D%u)", (unsigned= )lastEltIndex+1, (unsigned)shift); + assert(shift < 30); for (s=3D0; s> (ZSTD_FREQ_DIV+malus)); + table[s] =3D 1 + (table[s] >> shift); sum +=3D table[s]; } return sum; } =20 +/* ZSTD_scaleStats() : + * reduce all elements in table is sum too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarge= t) +{ + U32 const prevsum =3D sum_u32(table, lastEltIndex+1); + U32 const factor =3D prevsum >> logTarget; + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=3D%u, target=3D%u)", (unsigned)la= stEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <=3D 1) return prevsum; + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)= ); +} + /* ZSTD_rescaleFreqs() : * if first block (detected by optPtr->litLengthSum =3D=3D 0) : init stati= stics * take hints from dictionary if there is one - * or init from zero, using src for literals stats, or flat 1 for match= symbols + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols * otherwise downscale existing stats, to be used as seed for next block. */ static void @@ -138,7 +146,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->litSum =3D 0; for (lit=3D0; lit<=3DMaxLit; lit++) { U32 const scaleLog =3D 11; /* scale to 2K */ - U32 const bitCost =3D HUF_getNbBits(optPtr->symbolCost= s->huf.CTable, lit); + U32 const bitCost =3D HUF_getNbBitsFromCTable(optPtr->= symbolCosts->huf.CTable, lit); assert(bitCost <=3D scaleLog); optPtr->litFreq[lit] =3D bitCost ? 
1 << (scaleLog-bitC= ost) : 1 /*minimum to calculate cost*/; optPtr->litSum +=3D optPtr->litFreq[lit]; @@ -186,14 +194,19 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, if (compressedLiterals) { unsigned lit =3D MaxLit; HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); = /* use raw first block to init statistics */ - optPtr->litSum =3D ZSTD_downscaleStat(optPtr->litFreq, Max= Lit, 1); + optPtr->litSum =3D ZSTD_downscaleStats(optPtr->litFreq, Ma= xLit, 8); } =20 - { unsigned ll; - for (ll=3D0; ll<=3DMaxLL; ll++) - optPtr->litLengthFreq[ll] =3D 1; + { unsigned const baseLLfreqs[MaxLL+1] =3D { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(bas= eLLfreqs)); + optPtr->litLengthSum =3D sum_u32(baseLLfreqs, MaxLL+1); } - optPtr->litLengthSum =3D MaxLL+1; =20 { unsigned ml; for (ml=3D0; ml<=3DMaxML; ml++) @@ -201,21 +214,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, } optPtr->matchLengthSum =3D MaxML+1; =20 - { unsigned of; - for (of=3D0; of<=3DMaxOff; of++) - optPtr->offCodeFreq[of] =3D 1; + { unsigned const baseOFCfreqs[MaxOff+1] =3D { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(base= OFCfreqs)); + optPtr->offCodeSum =3D sum_u32(baseOFCfreqs, MaxOff+1); } - optPtr->offCodeSum =3D MaxOff+1; + =20 } =20 } else { /* new block : re-use previous statistics, scaled down */ =20 if (compressedLiterals) - optPtr->litSum =3D ZSTD_downscaleStat(optPtr->litFreq, MaxLit,= 1); - optPtr->litLengthSum =3D ZSTD_downscaleStat(optPtr->litLengthFreq,= MaxLL, 0); - optPtr->matchLengthSum =3D ZSTD_downscaleStat(optPtr->matchLengthF= req, MaxML, 0); - optPtr->offCodeSum =3D ZSTD_downscaleStat(optPtr->offCodeFreq, Max= Off, 0); + optPtr->litSum =3D ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12= ); + optPtr->litLengthSum =3D 
ZSTD_scaleStats(optPtr->litLengthFreq, Ma= xLL, 11); + optPtr->matchLengthSum =3D ZSTD_scaleStats(optPtr->matchLengthFreq= , MaxML, 11); + optPtr->offCodeSum =3D ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff= , 11); } =20 ZSTD_setBasePrices(optPtr, optLevel); @@ -251,7 +269,16 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const lite= rals, U32 const litLength, * cost of literalLength symbol */ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* cons= t optPtr, int optLevel) { - if (optPtr->priceType =3D=3D zop_predef) return WEIGHT(litLength, optL= evel); + assert(litLength <=3D ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType =3D=3D zop_predef) + return WEIGHT(litLength, optLevel); + /* We can't compute the litLength price for sizes >=3D ZSTD_BLOCKSIZE_= MAX + * because it isn't representable in the zstd format. So instead just + * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the bl= ock + * would be all literals. + */ + if (litLength =3D=3D ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX= - 1, optPtr, optLevel); =20 /* dynamic statistics */ { U32 const llCode =3D ZSTD_LLcode(litLength); @@ -264,15 +291,17 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, c= onst optState_t* const optP /* ZSTD_getMatchPrice() : * Provides the cost of the match part (offset + matchLength) of a sequence * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a= sequence. 
- * optLevel: when <2, favors small offset for decompression speed (improve= d cache efficiency) */ + * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are rea= l_offsets+2 + * @optLevel: when <2, favors small offset for decompression speed (improv= ed cache efficiency) + */ FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, +ZSTD_getMatchPrice(U32 const offcode, U32 const matchLength, const optState_t* const optPtr, int const optLevel) { U32 price; - U32 const offCode =3D ZSTD_highbit32(offset+1); + U32 const offCode =3D ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); U32 const mlBase =3D matchLength - MINMATCH; assert(matchLength >=3D MINMATCH); =20 @@ -315,8 +344,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, optPtr->litLengthSum++; } =20 - /* match offset code (0-2=3D>repCode; 3+=3D>offset+2) */ - { U32 const offCode =3D ZSTD_highbit32(offsetCode+1); + /* offset code : expected to follow storeSeq() numeric representation = */ + { U32 const offCode =3D ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)= ); assert(offCode <=3D MaxOff); optPtr->offCodeFreq[offCode]++; optPtr->offCodeSum++; @@ -350,7 +379,7 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U3= 2 length) =20 /* Update hashTable3 up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, +static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, U32* nextToUpdate3, const BYTE* const ip) { @@ -376,11 +405,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_ma= tchState_t* ms, * Binary Tree search ***************************************/ /* ZSTD_insertBt1() : add one or multiple positions to tree. - * ip : assumed <=3D iend-8 . + * @param ip assumed <=3D iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling= to this position * @return : nb of positions added */ static U32 ZSTD_insertBt1( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, + U32 const target, U32 const mls, const int extDict) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; @@ -403,7 +434,10 @@ static U32 ZSTD_insertBt1( U32* smallerPtr =3D bt + 2*(curr&btMask); U32* largerPtr =3D smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ - U32 const windowLow =3D ms->window.lowLimit; + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the= tree update. + */ + U32 const windowLow =3D ZSTD_getLowestMatchIndex(ms, target, cParams->= windowLog); U32 matchEndIdx =3D curr+8+1; size_t bestLength =3D 8; U32 nbCompares =3D 1U << cParams->searchLog; @@ -416,6 +450,7 @@ static U32 ZSTD_insertBt1( =20 DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); =20 + assert(curr <=3D target); assert(ip <=3D iend-8); /* required for h calculation */ hashTable[h] =3D curr; /* Update Hash Table */ =20 @@ -504,7 +539,7 @@ void ZSTD_updateTree_internal( idx, target, dictMode); =20 while(idx < target) { - U32 const forward =3D ZSTD_insertBt1(ms, base+idx, iend, mls, dict= Mode =3D=3D ZSTD_extDict); + U32 const forward =3D ZSTD_insertBt1(ms, base+idx, iend, target, m= ls, dictMode =3D=3D ZSTD_extDict); assert(idx < (U32)(idx + forward)); idx +=3D forward; } @@ -609,7 +644,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of lengt= h %u", repCode, ll0, repOffset, repLen); bestLength =3D repLen; - matches[mnum].off =3D repCode - ll0; + matches[mnum].off =3D STORE_REPCODE(repCode - ll0 + 1); /= * expect value between 1 and 3 */ matches[mnum].len =3D (U32)repLen; mnum++; if ( (repLen > sufficient_len) @@ -638,7 +673,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( bestLength =3D mlen; assert(curr > 
matchIndex3); assert(mnum=3D=3D0); /* no prior solution */ - matches[0].off =3D (curr - matchIndex3) + ZSTD_REP_MOVE; + matches[0].off =3D STORE_OFFSET(curr - matchIndex3); matches[0].len =3D (U32)mlen; mnum =3D 1; if ( (mlen > sufficient_len) | @@ -647,7 +682,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( return 1; } } } /* no dictMatchState lookup: dicts don't have a populated HC3 tabl= e */ - } + } /* if (mls =3D=3D 3) */ =20 hashTable[h] =3D curr; /* Update Hash Table */ =20 @@ -672,20 +707,19 @@ U32 ZSTD_insertBtAndGetAllMatches ( =20 if (matchLength > bestLength) { DEBUGLOG(8, "found match of length %u at distance %u (offCode= =3D%u)", - (U32)matchLength, curr - matchIndex, curr - matchIndex= + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr= - matchIndex)); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + (U32)matchLength; bestLength =3D matchLength; - matches[mnum].off =3D (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off =3D STORE_OFFSET(curr - matchIndex); matches[mnum].len =3D (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength =3D=3D iLimit) /* equal : no way to know = if inf or sup */) { if (dictMode =3D=3D ZSTD_dictMatchState) nbCompares =3D 0;= /* break should also skip searching dms */ break; /* drop, to preserve bt consistency (miss a little = bit of compression) */ - } - } + } } =20 if (match[matchLength] < ip[matchLength]) { /* match smaller than current */ @@ -721,18 +755,17 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { matchIndex =3D dictMatchIndex + dmsIndexDelta; DEBUGLOG(8, "found dms match of length %u at distance %u (= offCode=3D%u)", - (U32)matchLength, curr - matchIndex, curr - matchI= ndex + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(= curr - matchIndex)); if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + (U32)matchLength; bestLength =3D matchLength; - 
matches[mnum].off =3D (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off =3D STORE_OFFSET(curr - matchIndex); matches[mnum].len =3D (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength =3D=3D iLimit) /* equal : no way to k= now if inf or sup */) { break; /* drop, to guarantee consistency (miss a lit= tle bit of compression) */ - } - } + } } =20 if (dictMatchIndex <=3D dmsBtLow) { break; } /* beyond tree = size, stop the search */ if (match[matchLength] < ip[matchLength]) { @@ -742,39 +775,91 @@ U32 ZSTD_insertBtAndGetAllMatches ( /* match is larger than current */ commonLengthLarger =3D matchLength; dictMatchIndex =3D nextPtr[0]; - } - } - } + } } } /* if (dictMode =3D=3D ZSTD_dictMatchState) */ =20 assert(matchEndIdx > curr+8); ms->nextToUpdate =3D matchEndIdx - 8; /* skip repetitive patterns */ return mnum; } =20 - -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( - ZSTD_match_t* matches, /* store result (match fo= und, increasing size) in this table */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* ip, const BYTE* const iHighLimit, cons= t ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, - U32 const lengthToBeat) +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_matchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) { - const ZSTD_compressionParameters* const cParams =3D &ms->cParams; - U32 const matchLengthSearch =3D cParams->minMatch; - DEBUGLOG(8, "ZSTD_BtGetAllMatches"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped ar= ea */ - ZSTD_updateTree_internal(ms, ip, iHighLimit, 
matchLengthSearch, dictMo= de); - switch(matchLengthSearch) - { - case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); - default : - case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); - case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); - case 7 : - case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); + assert(BOUNDED(3, ms->cParams.minMatch, 6) =3D=3D mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=3D%d, mls=3D%u)", (int)dict= Mode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, i= HighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##d= ictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_matchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ + } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define 
ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e co= nst dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] =3D { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls =3D BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 < 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; } =20 /* *********************** @@ -783,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( =20 /* Struct containing info needed to make decision about ldm inclusion */ typedef struct { - rawSeqStore_t seqStore; /* External match candidates store for= this block */ - U32 startPosInBlock; /* Start position of the current match= candidate */ - U32 endPosInBlock; /* End position of the current match c= andidate */ - U32 offset; /* Offset of the match candidate */ + rawSeqStore_t seqStore; /* External match candidates store for this = block */ + U32 startPosInBlock; /* Start position of the current match candi= date */ + U32 endPosInBlock; /* End position of the current match candida= te */ + U32 offset; /* Offset of the match candidate */ } ZSTD_optLdm_t; =20 /* ZSTD_optLdm_skipRawSeqStoreBytes(): - * Moves forward in rawSeqStore by nbBytes, which will update the fields '= pos' and 'posInSequence'. + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. 
*/ -static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, s= ize_t nbBytes) { +static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, s= ize_t nbBytes) +{ U32 currPos =3D (U32)(rawSeqStore->posInSequence + nbBytes); while (currPos && rawSeqStore->pos < rawSeqStore->size) { rawSeq currSeq =3D rawSeqStore->seq[rawSeqStore->pos]; @@ -813,8 +900,10 @@ static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqSto= re_t* rawSeqStore, size_t * Calculates the beginning and end of the next match in the current block. * Updates 'pos' and 'posInSequence' of the ldmSeqStore. */ -static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, = U32 currPosInBlock, - U32 blockBytesRemaining= ) { +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosI= nBlock, + U32 blockBytesRemaining) +{ rawSeq currSeq; U32 currBlockEndPos; U32 literalsBytesRemaining; @@ -826,8 +915,8 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD= _optLdm_t* optLdm, U32 cu optLdm->endPosInBlock =3D UINT_MAX; return; } - /* Calculate appropriate bytes left in matchLength and litLength after= adjusting - based on ldmSeqStore->posInSequence */ + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ currSeq =3D optLdm->seqStore.seq[optLdm->seqStore.pos]; assert(optLdm->seqStore.posInSequence <=3D currSeq.litLength + currSeq= .matchLength); currBlockEndPos =3D currPosInBlock + blockBytesRemaining; @@ -863,15 +952,16 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZS= TD_optLdm_t* optLdm, U32 cu } =20 /* ZSTD_optLdm_maybeAddMatch(): - * Adds a match if it's long enough, based on it's 'matchStartPosInBlock' - * and 'matchEndPosInBlock', into 'matches'. Maintains the correct orderin= g of 'matches' + * Adds a match if it's long enough, + * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock', + * into 'matches'. 
Maintains the correct ordering of 'matches'. */ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatche= s, - ZSTD_optLdm_t* optLdm, U32 currPosIn= Block) { - U32 posDiff =3D currPosInBlock - optLdm->startPosInBlock; + const ZSTD_optLdm_t* optLdm, U32 cur= rPosInBlock) +{ + U32 const posDiff =3D currPosInBlock - optLdm->startPosInBlock; /* Note: ZSTD_match_t actually contains offCode and matchLength (befor= e subtracting MINMATCH) */ - U32 candidateMatchLength =3D optLdm->endPosInBlock - optLdm->startPosI= nBlock - posDiff; - U32 candidateOffCode =3D optLdm->offset + ZSTD_REP_MOVE; + U32 const candidateMatchLength =3D optLdm->endPosInBlock - optLdm->sta= rtPosInBlock - posDiff; =20 /* Ensure that current block position is not outside of the match */ if (currPosInBlock < optLdm->startPosInBlock @@ -881,6 +971,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* mat= ches, U32* nbMatches, } =20 if (*nbMatches =3D=3D 0 || ((candidateMatchLength > matches[*nbMatches= -1].len) && *nbMatches < ZSTD_OPT_NUM)) { + U32 const candidateOffCode =3D STORE_OFFSET(optLdm->offset); DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate mat= ch (offCode: %u matchLength %u) at block position=3D%u", candidateOffCode, candidateMatchLength, currPosInBlock); matches[*nbMatches].len =3D candidateMatchLength; @@ -892,8 +983,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* ma= tches, U32* nbMatches, /* ZSTD_optLdm_processMatchCandidate(): * Wrapper function to update ldm seq store and call ldm functions as nece= ssary. 
*/ -static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_= match_t* matches, U32* nbMatches, - U32 currPosInBlock, U32 rema= iningBytes) { +static void +ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, + U32 currPosInBlock, U32 remainingBytes) +{ if (optLdm->seqStore.size =3D=3D 0 || optLdm->seqStore.pos >=3D optLdm= ->seqStore.size) { return; } @@ -904,19 +998,19 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_op= tLdm_t* optLdm, ZSTD_match_ * at the end of a match from the ldm seq store, and will ofte= n be some bytes * over beyond matchEndPosInBlock. As such, we need to correct= for these "overshoots" */ - U32 posOvershoot =3D currPosInBlock - optLdm->endPosInBlock; + U32 const posOvershoot =3D currPosInBlock - optLdm->endPosInBl= ock; ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOversho= ot); - }=20 + } ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, rem= ainingBytes); } ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); } =20 + /*-******************************* * Optimal parser *********************************/ =20 - static U32 ZSTD_totalLen(ZSTD_optimal_t sol) { return sol.litlen + sol.mlen; @@ -957,6 +1051,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, const BYTE* const prefixStart =3D base + ms->window.dictLimit; const ZSTD_compressionParameters* const cParams =3D &ms->cParams; =20 + ZSTD_getAllMatchesFn getAllMatches =3D ZSTD_selectBtGetAllMatches(ms, = dictMode); + U32 const sufficient_len =3D MIN(cParams->targetLength, ZSTD_OPT_NUM -= 1); U32 const minMatch =3D (cParams->minMatch =3D=3D 3) ? 
3 : 4; U32 nextToUpdate3 =3D ms->nextToUpdate; @@ -984,7 +1080,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* find first match */ { U32 const litlen =3D (U32)(ip - anchor); U32 const ll0 =3D !litlen; - U32 nbMatches =3D ZSTD_BtGetAllMatches(matches, ms, &nextToUpd= ate3, ip, iend, dictMode, rep, ll0, minMatch); + U32 nbMatches =3D getAllMatches(matches, ms, &nextToUpdate3, i= p, iend, rep, ll0, minMatch); ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, (U32)(ip-istart), (U32)(iend= - ip)); if (!nbMatches) { ip++; continue; } @@ -998,18 +1094,18 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, * in every price. We include the literal length to avoid nega= tive * prices when we subtract the previous literal length. */ - opt[0].price =3D ZSTD_litLengthPrice(litlen, optStatePtr, optL= evel); + opt[0].price =3D (int)ZSTD_litLengthPrice(litlen, optStatePtr,= optLevel); =20 /* large match -> immediate encoding */ { U32 const maxML =3D matches[nbMatches-1].len; - U32 const maxOffset =3D matches[nbMatches-1].off; + U32 const maxOffcode =3D matches[nbMatches-1].off; DEBUGLOG(6, "found %u matches of maxLength=3D%u and maxOff= Code=3D%u at cPos=3D%u =3D> start new series", - nbMatches, maxML, maxOffset, (U32)(ip-prefixSt= art)); + nbMatches, maxML, maxOffcode, (U32)(ip-prefixS= tart)); =20 if (maxML > sufficient_len) { lastSequence.litlen =3D litlen; lastSequence.mlen =3D maxML; - lastSequence.off =3D maxOffset; + lastSequence.off =3D maxOffcode; DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur =3D 0; @@ -1018,24 +1114,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, } } =20 /* set prices for first matches starting position =3D=3D 0 */ - { U32 const literalsPrice =3D opt[0].price + ZSTD_litLengthP= rice(0, optStatePtr, optLevel); + assert(opt[0].price >=3D 0); + { U32 const literalsPrice =3D (U32)opt[0].price + ZSTD_litLe= ngthPrice(0, optStatePtr, optLevel); U32 pos; U32 matchNb; for (pos 
=3D 1; pos < minMatch; pos++) { opt[pos].price =3D ZSTD_MAX_PRICE; /* mlen, litlen a= nd price will be fixed during forward scanning */ } for (matchNb =3D 0; matchNb < nbMatches; matchNb++) { - U32 const offset =3D matches[matchNb].off; + U32 const offcode =3D matches[matchNb].off; U32 const end =3D matches[matchNb].len; for ( ; pos <=3D end ; pos++ ) { - U32 const matchPrice =3D ZSTD_getMatchPrice(offset= , pos, optStatePtr, optLevel); + U32 const matchPrice =3D ZSTD_getMatchPrice(offcod= e, pos, optStatePtr, optLevel); U32 const sequencePrice =3D literalsPrice + matchP= rice; DEBUGLOG(7, "rPos:%u =3D> set initial price : %.2f= ", pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen =3D pos; - opt[pos].off =3D offset; + opt[pos].off =3D offcode; opt[pos].litlen =3D litlen; - opt[pos].price =3D sequencePrice; + opt[pos].price =3D (int)sequencePrice; } } last_pos =3D pos-1; } @@ -1050,9 +1147,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* Fix current position with one literal if cheaper */ { U32 const litlen =3D (opt[cur-1].mlen =3D=3D 0) ? 
opt[cur-= 1].litlen + 1 : 1; int const price =3D opt[cur-1].price - + ZSTD_rawLiteralsCost(ip+cur-1, 1, optSta= tePtr, optLevel) - + ZSTD_litLengthPrice(litlen, optStatePtr,= optLevel) - - ZSTD_litLengthPrice(litlen-1, optStatePt= r, optLevel); + + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, o= ptStatePtr, optLevel) + + (int)ZSTD_litLengthPrice(litlen, optStat= ePtr, optLevel) + - (int)ZSTD_litLengthPrice(litlen-1, optSt= atePtr, optLevel); assert(price < 1000000000); /* overflow check */ if (price <=3D opt[cur].price) { DEBUGLOG(7, "cPos:%zi=3D=3DrPos:%u : better price (%.2= f<=3D%.2f) using literal (ll=3D=3D%u) (hist:%u,%u,%u)", @@ -1078,7 +1175,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, assert(cur >=3D opt[cur].mlen); if (opt[cur].mlen !=3D 0) { U32 const prev =3D cur - opt[cur].mlen; - repcodes_t newReps =3D ZSTD_updateRep(opt[prev].rep, opt[c= ur].off, opt[cur].litlen=3D=3D0); + repcodes_t const newReps =3D ZSTD_newRep(opt[prev].rep, op= t[cur].off, opt[cur].litlen=3D=3D0); ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); } else { ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcode= s_t)); @@ -1095,11 +1192,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, continue; /* skip unpromising positions; about ~+6% speed= , -0.01 ratio */ } =20 + assert(opt[cur].price >=3D 0); { U32 const ll0 =3D (opt[cur].mlen !=3D 0); U32 const litlen =3D (opt[cur].mlen =3D=3D 0) ? 
opt[cur].l= itlen : 0; - U32 const previousPrice =3D opt[cur].price; + U32 const previousPrice =3D (U32)opt[cur].price; U32 const basePrice =3D previousPrice + ZSTD_litLengthPric= e(0, optStatePtr, optLevel); - U32 nbMatches =3D ZSTD_BtGetAllMatches(matches, ms, &nextT= oUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + U32 nbMatches =3D getAllMatches(matches, ms, &nextToUpdate= 3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; =20 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMat= ches, @@ -1137,7 +1235,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, =20 for (mlen =3D lastML; mlen >=3D startML; mlen--) { /*= scan downward */ U32 const pos =3D cur + mlen; - int const price =3D basePrice + ZSTD_getMatchPrice= (offset, mlen, optStatePtr, optLevel); + int const price =3D (int)basePrice + (int)ZSTD_get= MatchPrice(offset, mlen, optStatePtr, optLevel); =20 if ((pos > last_pos) || (price < opt[pos].price)) { DEBUGLOG(7, "rPos:%u (ml=3D%2u) =3D> new bette= r price (%.2f<%.2f)", @@ -1167,7 +1265,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, * update them while traversing the sequences. 
*/ if (lastSequence.mlen !=3D 0) { - repcodes_t reps =3D ZSTD_updateRep(opt[cur].rep, lastSequence.= off, lastSequence.litlen=3D=3D0); + repcodes_t const reps =3D ZSTD_newRep(opt[cur].rep, lastSequen= ce.off, lastSequence.litlen=3D=3D0); ZSTD_memcpy(rep, &reps, sizeof(reps)); } else { ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); @@ -1211,7 +1309,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, =20 assert(anchor + llen <=3D iend); ZSTD_updateStats(optStatePtr, llen, anchor, offCode, m= len); - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, m= len-MINMATCH); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, m= len); anchor +=3D advance; ip =3D anchor; } } @@ -1223,38 +1321,30 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, return (size_t)(iend - anchor); } =20 +static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /* optLevel */, dictMode); +} + +static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /* optLevel */, dictMode); +} =20 size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 =20 -/* used in 2-pass strategy */ -static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) -{ - U32 s, sum=3D0; - assert(ZSTD_FREQ_DIV+bonus >=3D 0); - for (s=3D0; slitSum =3D ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); - 
optPtr->litLengthSum =3D ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL= , 0); - optPtr->matchLengthSum =3D ZSTD_upscaleStat(optPtr->matchLengthFreq, M= axML, 0); - optPtr->offCodeSum =3D ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0= ); -} =20 /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate st= arting values. @@ -1276,7 +1366,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, assert(ms->window.dictLimit =3D=3D ms->window.lowLimit); /* no dicti= onary */ assert(ms->window.dictLimit - ms->nextToUpdate <=3D 1); /* no prefix = (note: intentional overflow, defined as 2-complement) */ =20 - ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /= *optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDic= t); /* generate stats into ms->opt*/ =20 /* invalidate first scan from history */ ZSTD_resetSeqStore(seqStore); @@ -1285,8 +1375,6 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ms->window.lowLimit =3D ms->window.dictLimit; ms->nextToUpdate =3D ms->window.dictLimit; =20 - /* re-inforce weight of collected statistics */ - ZSTD_upscaleStats(&ms->opt); } =20 size_t ZSTD_compressBlock_btultra( @@ -1294,7 +1382,7 @@ size_t ZSTD_compressBlock_btultra( const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=3D%zu)", srcSize); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 size_t ZSTD_compressBlock_btultra2( @@ -1322,35 +1410,35 @@ size_t ZSTD_compressBlock_btultra2( ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } =20 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 size_t ZSTD_compressBlock_btopt_dictMatchState( 
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_d= ictMatchState); } =20 size_t ZSTD_compressBlock_btultra_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_d= ictMatchState); } =20 size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_e= xtDict); } =20 size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_e= xtDict); } =20 /* note : no btultra2 variant for extDict nor dictMatchState, diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf= _decompress.c index 5105e59ac04a..89b269a641c7 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -22,6 +22,13 @@ #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 =20 /* 
************************************************************** * Macros @@ -36,6 +43,26 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif =20 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#define HUF_EXTERN_C +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif =20 /* ************************************************************** * Error Management @@ -65,7 +92,7 @@ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); = \ } = \ = \ - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( = \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( = \ void* dst, size_t dstSize, = \ const void* cSrc, size_t cSrcSize, = \ const HUF_DTable* DTable) = \ @@ -107,13 +134,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable= * table) return dtd; } =20 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte =3D ip[7]; + size_t const bitsConsumed =3D lastByte ? 8 - BIT_highbit32(lastByte) := 0; + size_t const value =3D MEM_readLEST(ip) | 1; + assert(bitsConsumed <=3D 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/* + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. 
+ */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void= * dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* D= Table) +{ + void const* dt =3D DTable + 1; + U32 const dtLog =3D HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit =3D (const BYTE*)src + 6 + 8; + + BYTE* const oend =3D (BYTE*)dst + dstSize; + + /* The following condition is false on x32 platform, + * but HUF_asm is not compatible with this ABI */ + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initi= alizing smaller bit containers. + * If table log is not correct at this point, fallback to the old deco= der. + * On small inputs we don't have enough data to trigger the fast loop,= so use the old decoder. + */ + if (dtLog !=3D HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart =3D (const BYTE*)src; + size_t const length1 =3D MEM_readLE16(istart); + size_t const length2 =3D MEM_readLE16(istart+2); + size_t const length3 =3D MEM_readLE16(istart+4); + size_t const length4 =3D srcSize - (length1 + length2 + length3 + = 6); + args->iend[0] =3D istart + 6; /* jumpTable */ + args->iend[1] =3D args->iend[0] + length1; + args->iend[2] =3D args->iend[1] + length2; + args->iend[3] =3D args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >=3D 16 so that ip[0] >=3D ilimit before the lo= op + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* ove= rflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. 
*/ + args->ip[0] =3D args->iend[1] - sizeof(U64); + args->ip[1] =3D args->iend[2] - sizeof(U64); + args->ip[2] =3D args->iend[3] - sizeof(U64); + args->ip[3] =3D (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] =3D (BYTE*)dst; + args->op[1] =3D args->op[0] + (dstSize+3)/4; + args->op[2] =3D args->op[1] + (dstSize+3)/4; + args->op[3] =3D args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >=3D oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] =3D HUF_initDStream(args->ip[0]); + args->bits[1] =3D HUF_initDStream(args->ip[1]); + args->bits[2] =3D HUF_initDStream(args->ip[2]); + args->bits[3] =3D HUF_initDStream(args->ip[3]); + + /* If ip[] >=3D ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>=3D istart). + */ + args->ilimit =3D ilimit; + + args->oend =3D oend; + args->dt =3D dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressA= smArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. 
*/ + bit->bitContainer =3D MEM_readLE64(args->ip[stream]); + bit->bitsConsumed =3D ZSTD_countTrailingZeros((size_t)args->bits[strea= m]); + bit->start =3D (const char*)args->iend[0]; + bit->limitPtr =3D bit->start + sizeof(size_t); + bit->ptr =3D (const char*)args->ip[stream]; + + return 0; +} +#endif + =20 #ifndef HUF_FORCE_DECOMPRESS_X2 =20 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol = decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol = decoding */ =20 /* * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entri= es at @@ -122,14 +283,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1= ; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 =3D symbol + (nbBits << 8); - } else { D4 =3D (symbol << 8) + nbBits; + } else { + D4 =3D symbol + (nbBits << 8); } D4 *=3D 0x0001000100010001ULL; return D4; } =20 +/* + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols,= U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale =3D targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by sca= le. */ + for (s =3D 0; s < nbSymbols; ++s) { + huffWeight[s] +=3D (BYTE)((huffWeight[s] =3D=3D 0) ? 0 : scale= ); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. 
+ */ + for (s =3D targetTableLog; s > scale; --s) { + rankVal[s] =3D rankVal[s - scale]; + } + for (s =3D scale; s > 0; --s) { + rankVal[s] =3D 0; + } + } + return targetTableLog; +} + typedef struct { U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +353,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, = const void* src, size_t sr iSize =3D HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, size= of(wksp->statsWksp), bmi2); if (HUF_isError(iSize)) return iSize; =20 + /* Table header */ { DTableDesc dtd =3D HUF_getDTableDesc(DTable); + U32 const maxTableLog =3D dtd.maxTableLog + 1; + U32 const targetTableLog =3D MIN(maxTableLog, HUF_DECODER_FAST_TAB= LELOG); + tableLog =3D HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbS= ymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_too= Large); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType =3D 0; dtd.tableLog =3D (BYTE)tableLog; @@ -207,7 +402,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, c= onst void* src, size_t sr =20 /* fill DTable * We fill all entries of each weight in order. - * That way length is a constant for each iteration of the outter loop. + * That way length is a constant for each iteration of the outer loop. * We can switch based on the length to a different inner loop which is * optimized for that particular case. 
*/ @@ -304,11 +499,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitD= Ptr, BYTE* const pEnd, cons BYTE* const pStart =3D p; =20 /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= < pEnd-3)) { - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) = & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); } =20 /* [0-3] symbols remaining */ @@ -388,33 +587,36 @@ HUF_decompress4X1_usingDTable_internal_body( U32 endSignal =3D 1; =20 if (length4 > cSrcSize) return ERROR(corruption_detected); /* ov= erflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* ov= erflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); =20 /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode= */ - for ( ; (endSignal) & (op4 < olimit) ; ) { - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); - endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStrea= 
m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished; + if ((size_t)(oend - op4) >=3D sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DS= tream_unfinished; + } } =20 /* check corruption */ @@ -440,6 +642,79 @@ HUF_decompress4X1_usingDTable_internal_body( } } =20 +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSi= ze, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t ds= tSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if 
ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF= _DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt =3D DTable + 1; + const BYTE* const iend =3D (const BYTE*)cSrc + 6; + BYTE* const oend =3D (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret =3D HUF_DecompressAsmArgs_init(&args, dst, dstSiz= e, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret !=3D 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSiz= e, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >=3D args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >=3D ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >=3D iend); + assert(args.ip[1] >=3D iend); + assert(args.ip[2] >=3D iend); + assert(args.ip[3] >=3D iend); + assert(args.op[3] <=3D oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize =3D (dstSize+3) / 4; + BYTE* segmentEnd =3D (BYTE*)dst; + int i; + for (i =3D 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <=3D (size_t)(oend - segmentEnd)) + segmentEnd +=3D segmentSize; + else + segmentEnd =3D oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segm= entEnd), "corruption"); + /* Decompress and validate that we've produced exactly the exp= ected length. 
*/ + args.op[i] +=3D HUF_decodeStreamX1(args.op[i], &bit, segmentEn= d, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] !=3D segmentEnd) return ERROR(corruption_detect= ed); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ =20 typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -447,8 +722,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *d= st, size_t dstSize, const HUF_DTable *DTable); =20 HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) =20 +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dst= Size, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSiz= e, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, c= Src, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, c= Src, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cS= rc, cSrcSize, DTable); +#endif +} =20 =20 size_t HUF_decompress1X1_usingDTable( @@ -518,106 +813,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx= , void* dst, size_t dstSize, /* *************************/ =20 typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /*= double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; =20 +/* + * Constructs a HUF_DEltX2 in a U32. 
+ */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int lev= el) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) =3D=3D 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) =3D=3D 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) =3D=3D 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) =3D=3D sizeof(U32)); + if (MEM_isLittleEndian()) { + seq =3D level =3D=3D 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq =3D level =3D=3D 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} =20 -/* HUF_fillDTableX2Level2() : - * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 = */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const = U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 = sortedListSize, - U32 nbBitsBaseline, U16 baseSeq, U32* wksp, siz= e_t wkspSize) +/* + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int= level) { HUF_DEltX2 DElt; - U32* rankVal =3D wksp; + U32 const val =3D HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) =3D=3D sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/* + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int lev= el) +{ + U32 DElt =3D HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} =20 - assert(wkspSize >=3D HUF_TABLELOG_MAX + 1); - (void)wkspSize; - /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + = 1)); +/* + * Fills the DTable rank with all the symbols from [begin, end) that are e= ach + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. 
+ * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level =3D=3D 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length =3D 1U << ((tableLog - nbBits) & 0x1F /* quiet static= -analyzer */); + const sortedSymbol_t* ptr; + assert(level >=3D 1 && level <=3D 2); + switch (length) { + case 1: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + HUF_DEltX2 const DElt =3D HUF_buildDEltX2(ptr->symbol, nbBits,= baseSeq, level); + *DTableRank++ =3D DElt; + } + break; + case 2: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + HUF_DEltX2 const DElt =3D HUF_buildDEltX2(ptr->symbol, nbBits,= baseSeq, level); + DTableRank[0] =3D DElt; + DTableRank[1] =3D DElt; + DTableRank +=3D 2; + } + break; + case 4: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank +=3D 4; + } + break; + case 8: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank +=3D 8; + } + break; + default: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + HUF_DEltX2* const DTableRankEnd =3D DTableRank + length; + for (; DTableRank !=3D 
DTableRankEnd; DTableRank +=3D 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} =20 - /* fill skipped values */ +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 = */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, cons= t U32 consumedBits, + const U32* rankVal, const int minWeight, const = int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const*= rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined w= eight + * is too large. + */ if (minWeight>1) { - U32 i, skipSize =3D rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits =3D (BYTE)(consumed); - DElt.length =3D 1; - for (i =3D 0; i < skipSize; i++) - DTable[i] =3D DElt; + U32 const length =3D 1U << ((targetLog - consumedBits) & 0x1F /* q= uiet static-analyzer */); + U64 const DEltX2 =3D HUF_buildDEltX2U64(baseSeq, consumedBits, /* = baseSeq */ 0, /* level */ 1); + int const skipSize =3D rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize =3D=3D 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <=3D 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i =3D 0; i < skipSize; i +=3D 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } } =20 - /* fill DTable */ - { U32 s; for (s=3D0; s=3D 1 
*/ - - rankVal[weight] +=3D length; - } } + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w =3D minWeight; w < maxWeight1; ++w) { + int const begin =3D rankStart[w]; + int const end =3D rankStart[w+1]; + U32 const nbBits =3D nbBitsBaseline - w; + U32 const totalBits =3D nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } =20 - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sor= tedListSize, + const sortedSymbol_t* sortedList, const U32* rankStart, rankVal_t rankValOrigin, = const U32 maxWeight, - const U32 nbBitsBaseline, U32* wksp, size_t wks= pSize) + const U32 nbBitsBaseline) { - U32* rankVal =3D wksp; + U32* const rankVal =3D rankValOrigin[0]; const int scaleLog =3D nbBitsBaseline - targetLog; /* note : targetL= og >=3D srcLog, hence scaleLog <=3D 1 */ const U32 minBits =3D nbBitsBaseline - maxWeight; - U32 s; - - assert(wkspSize >=3D HUF_TABLELOG_MAX + 1); - wksp +=3D HUF_TABLELOG_MAX + 1; - wkspSize -=3D HUF_TABLELOG_MAX + 1; - - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + = 1)); - - /* fill DTable */ - for (s=3D0; s=3D minBits) { /* enough room for a second= symbol */ - U32 sortedRank; + int w; + int const wEnd =3D (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w =3D 1; w < wEnd; ++w) { + int const begin =3D (int)rankStart[w]; + int const end =3D (int)rankStart[w+1]; + U32 const nbBits =3D nbBitsBaseline - w; + + if (targetLog-nbBits >=3D minBits) { + /* Enough room for a second symbol. 
*/ + int start =3D rankVal[w]; + U32 const length =3D 1U << ((targetLog - nbBits) & 0x1F /* qui= et static-analyzer */); int minWeight =3D nbBits + scaleLog; + int s; if (minWeight < 1) minWeight =3D 1; - sortedRank =3D rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRan= k, - nbBitsBaseline, symbol, wksp, wkspSize); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s =3D begin; s !=3D end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start +=3D length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits =3D (BYTE)(nbBits); - DElt.length =3D 1; - { U32 const end =3D start + length; - U32 u; - for (u =3D start; u < end; u++) DTable[u] =3D DElt; - } } - rankVal[weight] +=3D length; + /* Only a single symbol. 
*/ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } =20 typedef struct { rankValCol_t rankVal[HUF_TABLELOG_MAX]; U32 rankStats[HUF_TABLELOG_MAX + 1]; - U32 rankStart0[HUF_TABLELOG_MAX + 2]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; @@ -627,9 +1042,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wks= pSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; DTableDesc dtd =3D HUF_getDTableDesc(DTable); - U32 const maxTableLog =3D dtd.maxTableLog; + U32 maxTableLog =3D dtd.maxTableLog; size_t iSize; void* dtPtr =3D DTable+1; /* force compiler to avoid strict-aliasing= */ HUF_DEltX2* const dt =3D (HUF_DEltX2*)dtPtr; @@ -647,11 +1069,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not neces= sary, even though some analyzer complain ... 
*/ =20 - iSize =3D HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, s= izeof(wksp->calleeWksp), /* bmi2 */ 0); + iSize =3D HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, s= izeof(wksp->calleeWksp), bmi2); if (HUF_isError(iSize)) return iSize; =20 /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTab= le can't fit code depth */ + if (tableLog <=3D HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECOD= ER_FAST_TABLELOG) maxTableLog =3D HUF_DECODER_FAST_TABLELOG; =20 /* find maxWeight */ for (maxW =3D tableLog; wksp->rankStats[maxW]=3D=3D0; maxW--) {} /* n= ecessarily finds a solution before 0 */ @@ -664,7 +1087,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, rankStart[w] =3D curr; } rankStart[0] =3D nextRankStart; /* put all 0w symbols at the end= of sorted list*/ - sizeOfSort =3D nextRankStart; + rankStart[maxW+1] =3D nextRankStart; } =20 /* sort symbols by weight */ @@ -673,7 +1096,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, U32 const w =3D wksp->weightList[s]; U32 const r =3D rankStart[w]++; wksp->sortedSymbol[r].symbol =3D (BYTE)s; - wksp->sortedSymbol[r].weight =3D (BYTE)w; } rankStart[0] =3D 0; /* forget 0w symbols; this is beginning of w= eight(1) */ } @@ -698,10 +1120,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, } } } } =20 HUF_fillDTableX2(dt, maxTableLog, - wksp->sortedSymbol, sizeOfSort, + wksp->sortedSymbol, wksp->rankStart0, wksp->rankVal, maxW, - tableLog+1, - wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32= )); + tableLog+1); =20 dtd.tableLog =3D (BYTE)maxTableLog; dtd.tableType =3D 1; @@ -714,7 +1135,7 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt,= const U32 dtLog) { size_t const val =3D BIT_lookBitsFast(DStream, dtLog); /* note : dtL= og >=3D 1 */ - 
ZSTD_memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -723,15 +1144,17 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2*= dt, const U32 dtLog) { size_t const val =3D BIT_lookBitsFast(DStream, dtLog); /* note : dtL= og >=3D 1 */ - ZSTD_memcpy(op, dt+val, 1); - if (dt[val].length=3D=3D1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length=3D=3D1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. Not= e : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed =3D (sizeof(DStream->bitContainer)*8= ); - } } + } + } return 1; } =20 @@ -753,19 +1176,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, = BYTE* const pEnd, BYTE* const pStart =3D p; =20 /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >=3D sizeof(bitDPtr->bitContainer)) { + if (dtLog <=3D 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinish= ed) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinish= ed) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + 
HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); } =20 /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= <=3D pEnd-2)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >=3D 2) { + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) = & (p <=3D pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); =20 - while (p <=3D pEnd-2) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reache= d the end of DStream */ + while (p <=3D pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : re= ached the end of DStream */ + } =20 if (p < pEnd) p +=3D HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -799,7 +1240,6 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } - FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -841,57 +1281,60 @@ HUF_decompress4X2_usingDTable_internal_body( U32 const dtLog =3D dtd.tableLog; =20 if (length4 > cSrcSize) return ERROR(corruption_detected); /* ov= erflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* ov= erflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); =20 /* 16-32 symbols per loop (4-8 symbols per stream) */ - for ( ; (endSignal) & (op4 < olimit); ) { + if ((size_t)(oend - op4) >=3D sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - 
HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished; - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished; + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DS= tream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DS= tream_unfinished; #else - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - 
HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal =3D (U32)LIKELY((U32) - (BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStream_= unfinished)); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal =3D (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStr= eam_unfinished) + & (BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished) + & (BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished) + & (BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished)); #endif + } } =20 /* check corruption */ @@ -915,8 +1358,99 @@ HUF_decompress4X2_usingDTable_internal_body( } } =20 +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSi= ze, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t 
HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t ds= tSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF= _DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt =3D DTable + 1; + const BYTE* const iend =3D (const BYTE*)cSrc + 6; + BYTE* const oend =3D (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret =3D HUF_DecompressAsmArgs_init(&args, dst, dstSiz= e, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret !=3D 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSiz= e, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >=3D args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >=3D iend); + assert(args.ip[1] >=3D iend); + assert(args.ip[2] >=3D iend); + assert(args.ip[3] >=3D iend); + assert(args.op[3] <=3D oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize =3D (dstSize+3) / 4; + BYTE* segmentEnd =3D (BYTE*)dst; + int i; + for (i =3D 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <=3D (size_t)(oend - segmentEnd)) + segmentEnd +=3D segmentSize; + else + segmentEnd =3D oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segm= entEnd), "corruption"); + args.op[i] +=3D HUF_decodeStreamX2(args.op[i], &bit, segmentEn= d, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] !=3D segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* 
ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dst= Size, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSiz= e, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, c= Src, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, c= Src, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cS= rc, cSrcSize, DTable); +#endif +} + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal) =20 size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, @@ -1025,25 +1559,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size= _t maxDstSize, =20 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, doub= le, quad */] =3D +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, doub= le */] =3D { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q=3D=3D0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q=3D=3D1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q =3D=3D 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q =3D=3D 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q =3D=3D 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q =3D=3D 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q =3D=3D 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q =3D=3D 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q =3D=3D 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q 
=3D=3D 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q =3D=3D10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q =3D=3D11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q =3D=3D12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q =3D=3D13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q =3D=3D14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q =3D=3D15 : 93-99% */ + {{0,0}, {1,1}}, /* Q=3D=3D0 : impossible */ + {{0,0}, {1,1}}, /* Q=3D=3D1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q =3D=3D 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q =3D=3D 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q =3D=3D 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q =3D=3D 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q =3D=3D 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q =3D=3D 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q =3D=3D 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q =3D=3D 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q =3D=3D10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q =3D=3D11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q =3D=3D12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q =3D=3D13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q =3D=3D14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q =3D=3D15 : 93-99% */ }; #endif =20 @@ -1070,7 +1604,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSiz= e) U32 const D256 =3D (U32)(dstSize >> 8); U32 const DTime0 =3D algoTime[Q][0].tableTime + (algoTime[Q][0].de= code256Time * D256); U32 DTime1 =3D algoTime[Q][1].tableTime + (algoTime[Q][1].decode25= 6Time * D256); - DTime1 +=3D DTime1 >> 3; /* advantage to algorithm using less mem= ory, to reduce cache eviction */ + DTime1 +=3D DTime1 >> 5; /* small advantage to algorithm using le= ss memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zs= td_decompress.c index b4d81d84479a..b9b935a9f5c0 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ 
b/lib/zstd/decompress/zstd_decompress.c @@ -53,7 +53,6 @@ * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_mems= et */ -#include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -252,11 +251,11 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->inBuffSize =3D 0; dctx->outBuffSize =3D 0; dctx->streamStage =3D zdss_init; - dctx->legacyContext =3D NULL; - dctx->previousLegacyVersion =3D 0; dctx->noForwardProgress =3D 0; dctx->oversizedDuration =3D 0; - dctx->bmi2 =3D ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#if DYNAMIC_BMI2 + dctx->bmi2 =3D ZSTD_cpuSupportsBmi2(); +#endif dctx->ddictSet =3D NULL; ZSTD_DCtx_resetParameters(dctx); #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION @@ -277,8 +276,7 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t = workspaceSize) return dctx; } =20 -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; =20 { ZSTD_DCtx* const dctx =3D (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dc= tx), customMem); @@ -289,10 +287,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem cu= stomMem) } } =20 +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + ZSTD_DCtx* ZSTD_createDCtx(void) { DEBUGLOG(3, "ZSTD_createDCtx"); - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } =20 static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -370,6 +373,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } =20 +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier = for a skippable frame. + * Note : Frame Identifier is 4 bytes. 
If `size < 4`, @return will always= be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic =3D MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) =3D=3D ZSTD_MAGIC_SKIPPABL= E_START) return 1; + } + return 0; +} + /* ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -497,7 +513,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, co= nst void* src, size_t src return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1= ); } =20 - /* ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if = known, otherwise @@ -532,6 +547,37 @@ static size_t readSkippableFrameSize(void const* src, = size_t srcSize) } } =20 +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writ= es it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was suppl= ied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the= caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the f= rame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, = unsigned* magicVariant, + const void* src, size_t srcSiz= e) +{ + U32 const magicNumber =3D MEM_readLE32(src); + size_t skippableFrameSize =3D readSkippableFrameSize(src, srcSize); + size_t skippableContentSize =3D skippableFrameSize - ZSTD_SKIPPABLEHEA= DERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_u= nsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skipp= ableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, = ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst !=3D NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, ski= ppableContentSize); + if (magicVariant !=3D NULL) + *magicVariant =3D magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + /* ZSTD_findDecompressedSize() : * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed a= nd/or @@ -824,7 +870,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, switch(blockProperties.blockType) { case bt_compressed: - decodedSize =3D ZSTD_decompressBlock_internal(dctx, op, (size_= t)(oend-op), ip, cBlockSize, /* frame */ 1); + decodedSize =3D ZSTD_decompressBlock_internal(dctx, op, (size_= t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); break; case bt_raw : decodedSize =3D ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, c= BlockSize); @@ -976,7 +1022,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, = const void* src, size_t sr { #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=3D1) size_t regenSize; - ZSTD_DCtx* const dctx =3D ZSTD_createDCtx(); + ZSTD_DCtx* const dctx =3D ZSTD_createDCtx_internal(ZSTD_defaultCMem); RETURN_ERROR_IF(dctx=3D=3DNULL, memory_allocation, "NULL pointer!"); regenSize =3D ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize= 
); ZSTD_freeDCtx(dctx); @@ -996,7 +1042,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, = const void* src, size_t sr size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expect= ed; } =20 /* - * Similar to ZSTD_nextSrcSizeToDecompress(), but when when a block input = can be streamed, + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can b= e streamed, * we allow taking a partial block as the input. Currently only raw uncomp= ressed blocks can * be streamed. * @@ -1010,7 +1056,7 @@ static size_t ZSTD_nextSrcSizeToDecompressWithInputSi= ze(ZSTD_DCtx* dctx, size_t return dctx->expected; if (dctx->bType !=3D bt_raw) return dctx->expected; - return MIN(MAX(inputSize, 1), dctx->expected); + return BOUNDED(1, inputSize, dctx->expected); } =20 ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { @@ -1116,7 +1162,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void*= dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapa= city, src, srcSize, /* frame */ 1); + rSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapa= city, src, srcSize, /* frame */ 1, is_streaming); dctx->expected =3D 0; /* Streaming not supported */ break; case bt_raw : @@ -1438,7 +1484,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, ZSTD_DStream* ZSTD_createDStream(void) { DEBUGLOG(3, "ZSTD_createDStream"); - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } =20 ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1448,7 +1494,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace,= size_t workspaceSize) =20 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - return ZSTD_createDCtx_advanced(customMem); + return ZSTD_createDCtx_internal(customMem); } =20 size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1708,7 +1754,8 @@ 
size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned= long long frameContentSize) { size_t const blockSize =3D (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX= ); - unsigned long long const neededRBSize =3D windowSize + blockSize + (WI= LDCOPY_OVERLENGTH * 2); + /* space is needed to store the litbuffer after the output of a given = block without stomping the extDict of a previous run, as well as to cover b= oth windows against wildcopy*/ + unsigned long long const neededRBSize =3D windowSize + blockSize + ZST= D_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize =3D MIN(frameContentSize, neededRB= Size); size_t const minRBSize =3D (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize !=3D neededSize, @@ -1842,7 +1889,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_= outBuffer* output, ZSTD_inB DEBUGLOG(5, "stage zdss_init =3D> transparent reset "); zds->streamStage =3D zdss_loadHeader; zds->lhSize =3D zds->inPos =3D zds->outStart =3D zds->outEnd = =3D 0; - zds->legacyVersion =3D 0; zds->hostageByte =3D 0; zds->expectedOutBuffer =3D *output; ZSTD_FALLTHROUGH; diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompr= ess/zstd_decompress_block.c index 2d101d9a842e..c1913b8e7c89 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSi= ze, } } =20 +/* Allocate buffer for literals, either overlapping current dst, or split = between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, = const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, c= onst unsigned splitImmediately) +{ + if (streaming =3D=3D not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX= + 
WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) + { + /* room for litbuffer to fit without read faulting */ + dctx->litBuffer =3D (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVE= RLENGTH; + dctx->litBufferEnd =3D dctx->litBuffer + litSize; + dctx->litBufferLocation =3D ZSTD_in_dst; + } + else if (litSize > ZSTD_LITBUFFEREXTRASIZE) + { + /* won't fit in litExtraBuffer, so it will be split between end of= dst and extra buffer */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between en= d of dst and extra buffer */ + dctx->litBuffer =3D (BYTE*)dst + expectedWriteSize - litSize += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd =3D dctx->litBuffer + litSize - ZSTD_LITBUF= FEREXTRASIZE; + } + else { + /* initially this will be stored entirely in dst during huffma= n decoding, it will partially shifted to litExtraBuffer after */ + dctx->litBuffer =3D (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd =3D (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation =3D ZSTD_split; + } + else + { + /* fits entirely within litExtraBuffer, so no split is necessary */ + dctx->litBuffer =3D dctx->litExtraBuffer; + dctx->litBufferEnd =3D dctx->litBuffer + litSize; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + } +} =20 /* Hidden declaration for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize); + const void* src, size_t srcSize, + void* dst, size_t dstCapacity, const streaming_o= peration streaming); /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output durin= g decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full= in the excess dst space after where the current + * block will be output. 
Otherwise it will be stored at the end of the cu= rrent dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current o= utput write. + * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize) /* note : src= Size < BLOCKSIZE */ + const void* src, size_t srcSize, /* note : src= Size < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_o= peration streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); @@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhlCode =3D (istart[0] >> 2) & 3; U32 const lhc =3D MEM_readLE32(istart); size_t hufSuccess; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, litCSize =3D (lhc >> 22) + ((size_t)istart[4] << 10); break; } + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_d= etected, ""); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_de= tected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooS= mall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, expectedWriteSize, 0); =20 /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (singleStream) { hufSuccess =3D HUF_decompress1X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCS= ize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } else { hufSuccess =3D HUF_decompress4X_usingDTable_bmi2( 
dctx->litBuffer, litSize, istart+lhSize, litCS= ize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } } else { if (singleStream) { @@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess =3D HUF_decompress1X1_DCtx_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSi= ze, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dc= tx)); #endif } else { hufSuccess =3D HUF_decompress4X_hufOnly_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSi= ze, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dc= tx)); } } + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd -= ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE= - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer +=3D ZSTD_LITBUFFEREXTRASIZE - WILDCOP= Y_OVERLENGTH; + dctx->litBufferEnd -=3D WILDCOPY_OVERLENGTH; + } =20 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detect= ed, ""); =20 @@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, dctx->litSize =3D litSize; dctx->litEntropy =3D 1; if (litEncType=3D=3Dset_compressed) dctx->HUFptr =3D dctx-= >entropy.hufTable; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_O= VERLENGTH); return litCSize + lhSize; } =20 case set_basic: { size_t litSize, lhSize; U32 const lhlCode =3D ((istart[0]) >> 2) & 3; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } =20 + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); + 
RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSm= all, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* ri= sk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_d= etected, ""); - ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litS= ize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize = + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litS= ize); + } dctx->litPtr =3D dctx->litBuffer; dctx->litSize =3D litSize; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCO= PY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr =3D istart+lhSize; dctx->litSize =3D litSize; + dctx->litBufferEnd =3D dctx->litPtr + litSize; + dctx->litBufferLocation =3D ZSTD_not_in_dst; return lhSize+litSize; } =20 case set_rle: { U32 const lhlCode =3D ((istart[0]) >> 2) & 3; size_t litSize, lhSize; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSi= ze >=3D MIN_CBLOCK_SIZE =3D=3D 3; here we need lhSize+1 =3D 4"); break; } + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_d= etected, ""); - ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WIL= DCOPY_OVERLENGTH); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSm= all, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, 
expectedWriteSize, 1); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize -= ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD= _LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } dctx->litPtr =3D dctx->litBuffer; dctx->litSize =3D litSize; return lhSize+1; @@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<nbBits =3D 0; cell->nextState =3D 0; assert(nbAddBits < 255); - cell->nbAdditionalBits =3D (BYTE)nbAddBits; + cell->nbAdditionalBits =3D nbAddBits; cell->baseValue =3D baseValue; } =20 @@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, = U32 baseValue, U32 nbAddB FORCE_INLINE_TEMPLATE void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode =3D dt+1; @@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, tableDecode[u].nbBits =3D (BYTE) (tableLog - BIT_highbit32(nex= tState) ); tableDecode[u].nextState =3D (U16) ( (nextState << tableDecode= [u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); - tableDecode[u].nbAdditionalBits =3D (BYTE)nbAdditionalBits[sym= bol]; + tableDecode[u].nbAdditionalBits =3D nbAdditionalBits[symbol]; tableDecode[u].baseValue =3D baseValue[symbol]; } } @@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, /* Avoids the FORCE_INLINE of the _body() function. 
*/ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSym= bol* dt, } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seq= Symbol* dt, +BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSym= bol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable= _body_bmi2(ZSTD_seqSymbol =20 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) { #if DYNAMIC_BMI2 @@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_s= eqSymbol** DTablePtr, symbolEncodingType_e type, unsigned max, = U32 maxLog, const void* src, size_t srcSize, - const U32* baseValue, const U32* nbAdditi= onalBits, + const U32* baseValue, const U8* nbAdditio= nalBits, const ZSTD_seqSymbol* defaultTable, U32 f= lagRepeatTable, int ddictIsCold, int nbSeq, U32* wksp, si= ze_t wkspSize, int bmi2) @@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTable= Space, const ZSTD_seqSymb RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""= ); { U32 const symbol =3D *(const BYTE*)src; U32 
const baseline =3D baseValue[symbol]; - U32 const nbBits =3D nbAdditionalBits[symbol]; + U8 const nbBits =3D nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr =3D DTableSpace; @@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, LL_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D llhSize; } @@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, OF_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D ofhSize; } @@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, ML_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D mlhSize; } @@ -658,7 +734,6 @@ typedef struct { size_t litLength; size_t matchLength; size_t offset; - const BYTE* match; } seq_t; =20 typedef struct { @@ -672,9 +747,6 @@ typedef struct { ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* prefixStart; - const BYTE* dictEnd; - size_t pos; } seqState_t; =20 /*! ZSTD_overlapCopy8() : @@ -717,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE cons= t** ip, size_t offset) { * - ZSTD_overlap_src_before_dst: The src and dst may overlap and = may be any distance apart. * The src buffer must be before the dst buffer. 
*/ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, pt= rdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* = ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff =3D op - ip; BYTE* const oend =3D op + length; =20 @@ -733,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w,= BYTE const* ip, ptrdiff_ /* Copy 8 bytes and ensure the offset >=3D 8 when there can be ove= rlap. */ assert(length >=3D 8); ZSTD_overlapCopy8(&op, &ip, diff); + length -=3D 8; assert(op - ip >=3D 8); assert(op <=3D oend); } @@ -747,8 +820,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w= , BYTE const* ip, ptrdiff_ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip +=3D oend_w - op; - op =3D oend_w; + op +=3D oend_w - op; + } + /* Handle the leftovers. */ + while (op < oend) *op++ =3D *ip++; +} + +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-ove= rlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance = impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t = length) { + ptrdiff_t const diff =3D op - ip; + BYTE* const oend =3D op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ =3D *ip++; + return; + } + + if (op <=3D oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_ove= rlap); + ip +=3D oend - WILDCOPY_OVERLENGTH - op; + op +=3D oend - WILDCOPY_OVERLENGTH - op; } + /* Handle the leftovers. 
*/ while (op < oend) *op++ =3D *ip++; } @@ -763,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w,= BYTE const* ip, ptrdiff_ */ FORCE_NOINLINE size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimi= t, - const BYTE* const prefixStart, const BYTE* con= st virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) { BYTE* const oLitEnd =3D op + sequence.litLength; size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; @@ -788,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart)= , corruption_detected, ""); - match =3D dictEnd - (prefixStart-match); + match =3D dictEnd - (prefixStart - match); if (match + sequence.matchLength <=3D dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 =3D dictEnd - match; - ZSTD_memmove(oLitEnd, match, length1); - op =3D oLitEnd + length1; - sequence.matchLength -=3D length1; - match =3D prefixStart; - } } + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_sr= c_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffe= r is still split. It is kept separate to avoid performance impact for the = good case. 
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) +{ + BYTE* const oLitEnd =3D op + sequence.litLength; + size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; + const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; + const BYTE* match =3D oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall= , "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), cor= ruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dst= Size_tooSmall, "output should not catch up to and overwrite literal buffer"= ); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op =3D oLitEnd; + *litPtr =3D iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart)= , corruption_detected, ""); + match =3D dictEnd - (prefixStart - match); + if (match + sequence.matchLength <=3D dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 =3D dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_sr= c_before_dst); return sequenceLength; } =20 HINT_INLINE size_t ZSTD_execSequence(BYTE* op, - BYTE* 
const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const = virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) { BYTE* const oLitEnd =3D op + sequence.litLength; size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; @@ -817,6 +962,98 @@ size_t ZSTD_execSequence(BYTE* op, const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; const BYTE* match =3D oLitEnd - sequence.offset; =20 + assert(op !=3D NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_O= VERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, = prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <=3D oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <=3D oend /* No underflow */); + assert(iLitEnd <=3D litLimit /* Literal length is in bounds */); + assert(oLitEnd <=3D oend_w /* Can wildcopy literals */); + assert(oMatchEnd <=3D oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <=3D 16 since it is nearly always true. +1.6% o= n gcc-9. + * We likely don't need the full 32-byte wildcopy. 
+ */ + assert(WILDCOPY_OVERLENGTH >=3D 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZS= TD_no_overlap); + } + op =3D oLitEnd; + *litPtr =3D iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virt= ualStart)), corruption_detected, ""); + match =3D dictEnd + (match - prefixStart); + if (match + sequence.matchLength <=3D dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 =3D dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <=3D oMatchEnd); + assert(oMatchEnd <=3D oend_w); + assert(match >=3D prefixStart); + assert(sequence.matchLength >=3D 1); + + /* Nearly all offsets are >=3D WILDCOPY_VECLEN bytes, which means we c= an use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >=3D WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches = to be + * longer than literals (in general). In silesia, ~10% of matches = are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_= overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >=3D 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. 
= */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD= _overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) +{ + BYTE* const oLitEnd =3D op + sequence.litLength; + size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; + BYTE* const oMatchEnd =3D op + sequenceLength; /* risk : address spa= ce overflow (32-bits) */ + const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; + const BYTE* match =3D oLitEnd - sequence.offset; + assert(op !=3D NULL /* Precondition */); assert(oend_w < oend /* No underflow */); /* Handle edge cases in a slow path: @@ -828,7 +1065,7 @@ size_t ZSTD_execSequence(BYTE* op, iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCO= PY_OVERLENGTH))) - return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, = prefixStart, virtualStart, dictEnd); + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequen= ce, litPtr, litLimit, prefixStart, virtualStart, dictEnd); =20 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <=3D oLitEnd /* No overflow */); @@ -896,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } =20 + static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZST= D_seqSymbol* dt) { @@ -909,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStr= eam_t* bitD, const ZSTD_seqS } =20 FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) -{ - ZSTD_seqSymbol const DInfo =3D DStatePtr->table[DStatePtr->state]; - U32 const nbBits =3D DInfo.nbBits; - size_t const lowBits =3D 
BIT_readBits(bitD, nbBits); - DStatePtr->state =3D DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD= , ZSTD_seqSymbol const DInfo) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD= , U16 nextState, U32 nbBits) { - U32 const nbBits =3D DInfo.nbBits; size_t const lowBits =3D BIT_readBits(bitD, nbBits); - DStatePtr->state =3D DInfo.nextState + lowBits; + DStatePtr->state =3D nextState + lowBits; } =20 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the max= imum @@ -936,116 +1164,105 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DState= Ptr, BIT_DStream_t* bitD, ZSTD : 0) =20 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=3D1 } ZSTD_lo= ngOffset_e; -typedef enum { ZSTD_p_noPrefetch=3D0, ZSTD_p_prefetch=3D1 } ZSTD_prefetch_= e; =20 FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffs= ets, const ZSTD_prefetch_e prefetch) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffs= ets) { seq_t seq; - ZSTD_seqSymbol const llDInfo =3D seqState->stateLL.table[seqState->sta= teLL.state]; - ZSTD_seqSymbol const mlDInfo =3D seqState->stateML.table[seqState->sta= teML.state]; - ZSTD_seqSymbol const ofDInfo =3D seqState->stateOffb.table[seqState->s= tateOffb.state]; - U32 const llBase =3D llDInfo.baseValue; - U32 const mlBase =3D mlDInfo.baseValue; - U32 const ofBase =3D ofDInfo.baseValue; - BYTE const llBits =3D llDInfo.nbAdditionalBits; - BYTE const mlBits =3D mlDInfo.nbAdditionalBits; - BYTE const ofBits =3D ofDInfo.nbAdditionalBits; - BYTE const totalBits =3D llBits+mlBits+ofBits; - - /* sequence */ - { size_t offset; - if (ofBits > 1) { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset =3D=3D 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 =3D=3D 5); - assert(ofBits <=3D MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >=3D STREAM_ACCUMUL= ATOR_MIN_32)) { 
- U32 const extraBits =3D ofBits - MIN(ofBits, 32 - seqState= ->DStream.bitsConsumed); - offset =3D ofBase + (BIT_readBitsFast(&seqState->DStream, = ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset +=3D BIT_readBitsFast(&seqState->DSt= ream, extraBits); - assert(extraBits <=3D LONG_OFFSETS_MAX_EXTRA_BITS_32); /= * to avoid another reload */ - } else { - offset =3D ofBase + BIT_readBitsFast(&seqState->DStream, o= fBits/*>0*/); /* <=3D (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] =3D seqState->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset; - } else { - U32 const ll0 =3D (llBase =3D=3D 0); - if (LIKELY((ofBits =3D=3D 0))) { - if (LIKELY(!ll0)) - offset =3D seqState->prevOffset[0]; - else { - offset =3D seqState->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset; + const ZSTD_seqSymbol* const llDInfo =3D seqState->stateLL.table + seqS= tate->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo =3D seqState->stateML.table + seqS= tate->stateML.state; + const ZSTD_seqSymbol* const ofDInfo =3D seqState->stateOffb.table + se= qState->stateOffb.state; + seq.matchLength =3D mlDInfo->baseValue; + seq.litLength =3D llDInfo->baseValue; + { U32 const ofBase =3D ofDInfo->baseValue; + BYTE const llBits =3D llDInfo->nbAdditionalBits; + BYTE const mlBits =3D mlDInfo->nbAdditionalBits; + BYTE const ofBits =3D ofDInfo->nbAdditionalBits; + BYTE const totalBits =3D llBits+mlBits+ofBits; + + U16 const llNext =3D llDInfo->nextState; + U16 const mlNext =3D mlDInfo->nextState; + U16 const ofNext =3D ofDInfo->nextState; + U32 const llnbBits =3D llDInfo->nbBits; + U32 const mlnbBits =3D mlDInfo->nbBits; + U32 const ofnbBits =3D ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is o= nly + * valuable to mark likelyness 
for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset =3D=3D 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 =3D=3D 5= ); + assert(ofBits <=3D MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >=3D STREAM_ACC= UMULATOR_MIN_32)) { + U32 const extraBits =3D ofBits - MIN(ofBits, 32 - seqS= tate->DStream.bitsConsumed); + offset =3D ofBase + (BIT_readBitsFast(&seqState->DStre= am, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset +=3D BIT_readBitsFast(&seqState-= >DStream, extraBits); + assert(extraBits <=3D LONG_OFFSETS_MAX_EXTRA_BITS_32);= /* to avoid another reload */ + } else { + offset =3D ofBase + BIT_readBitsFast(&seqState->DStrea= m, ofBits/*>0*/); /* <=3D (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream= ); } + seqState->prevOffset[2] =3D seqState->prevOffset[1]; + seqState->prevOffset[1] =3D seqState->prevOffset[0]; + seqState->prevOffset[0] =3D offset; } else { - offset =3D ofBase + ll0 + BIT_readBitsFast(&seqState->DStr= eam, 1); - { size_t temp =3D (offset=3D=3D3) ? 
seqState->prevOffset= [0] - 1 : seqState->prevOffset[offset]; - temp +=3D !temp; /* 0 is not valid; input is corrupt= ed; force offset to 1 */ - if (offset !=3D 1) seqState->prevOffset[2] =3D seqStat= e->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset =3D temp; - } } } - seq.offset =3D offset; - } - - seq.matchLength =3D mlBase; - if (mlBits > 0) - seq.matchLength +=3D BIT_readBitsFast(&seqState->DStream, mlBits/*= >0*/); - - if (MEM_32bits() && (mlBits+llBits >=3D STREAM_ACCUMULATOR_MIN_32-LONG= _OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >=3D STREAM_ACCUMULATOR_MIN_64-= (LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mod= e. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR= _MIN_64); - - seq.litLength =3D llBase; - if (llBits > 0) - seq.litLength +=3D BIT_readBitsFast(&seqState->DStream, llBits/*>0= */); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=3D%u, matchL=3D%u, offset=3D%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - - if (prefetch =3D=3D ZSTD_p_prefetch) { - size_t const pos =3D seqState->pos + seq.litLength; - const BYTE* const matchBase =3D (seq.offset > pos) ? seqState->dic= tEnd : seqState->prefixStart; - seq.match =3D matchBase + pos - seq.offset; /* note : this operat= ion can overflow when seq.offset is really too large, which can only happen= when input is corrupted. - * No consequence thoug= h : no memory access will occur, offset is only used for prefetching */ - seqState->pos =3D pos + seq.matchLength; - } - - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). 
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if !defined(__clang__) - const int kUseUpdateFseState =3D 1; -#else - const int kUseUpdateFseState =3D 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); = /* <=3D 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); = /* <=3D 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /*= <=3D 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);= /* <=3D 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DS= tream, llDInfo); /* <=3D 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DS= tream, mlDInfo); /* <=3D 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /*= <=3D 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->= DStream, ofDInfo); /* <=3D 8 bits */ + U32 const ll0 =3D (llDInfo->baseValue =3D=3D 0); + if (LIKELY((ofBits =3D=3D 0))) { + offset =3D seqState->prevOffset[ll0]; + seqState->prevOffset[1] =3D seqState->prevOffset[!ll0]; + seqState->prevOffset[0] =3D offset; + } else { + offset =3D ofBase + ll0 + BIT_readBitsFast(&seqState->= DStream, 1); + { size_t temp =3D (offset=3D=3D3) ? 
seqState->prevOf= fset[0] - 1 : seqState->prevOffset[offset]; + temp +=3D !temp; /* 0 is not valid; input is cor= rupted; force offset to 1 */ + if (offset !=3D 1) seqState->prevOffset[2] =3D seq= State->prevOffset[1]; + seqState->prevOffset[1] =3D seqState->prevOffset[0= ]; + seqState->prevOffset[0] =3D offset =3D temp; + } } } + seq.offset =3D offset; } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength +=3D BIT_readBitsFast(&seqState->DStream, mlBi= ts/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >=3D STREAM_ACCUMULATOR_MIN_32-= LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >=3D STREAM_ACCUMULATOR_MIN= _64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit= mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMUL= ATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength +=3D BIT_readBitsFast(&seqState->DStream, llBits= /*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=3D%u, matchL=3D%u, offset=3D%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.off= set); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStrea= m, llNext, llnbBits); /* <=3D 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStrea= m, mlNext, mlnbBits); /* <=3D 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= =3D 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStr= eam, ofNext, ofnbBits); /* <=3D 8 bits */ } =20 return seq; @@ -1098,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence( #endif =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + + FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, 
+ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset, @@ -1112,17 +1331,16 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BYTE* const oend =3D ostart + maxDstSize; BYTE* op =3D ostart; const BYTE* litPtr =3D dctx->litPtr; - const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* litBufferEnd =3D dctx->litBufferEnd; const BYTE* const prefixStart =3D (const BYTE*) (dctx->prefixStart); const BYTE* const vBase =3D (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd =3D (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); (void)frame; =20 /* Regen sequences */ if (nbSeq) { seqState_t seqState; - size_t error =3D 0; dctx->fseEntropy =3D 1; { U32 i; for (i=3D0; ientropy.rep[i]; } RETURN_ERROR_IF( @@ -1138,70 +1356,255 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); =20 + /* decompress without overrunning litPtr begins */ + { + seq_t sequence =3D ZSTD_decodeSequence(&seqState, isLongOffset= ); + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% = decompression + * speed swings based on the alignment of the decompression= loop. This + * performance swing is caused by parts of the decompressio= n loop falling + * out of the DSB. The entire decompression loop should fit= in the DSB, + * when it can't we get much worse performance. You can mea= sure if you've + * hit the good case or the bad case with this perf command= for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cyc= les_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -t= q test.zst + * + * If you see most cycles served out of the MITE you've hit= the bad case. 
+ * If you see most cycles served out of the DSB you've hit = the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel = Core i9 + * Use Instruments->Counters to get DSB/MITE = cycles. + * I never got performance swings, but I was = able to + * go from the good case of mostly DSB to hal= f of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB = misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompress= ion loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - prespl= it section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsp= lit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on b= ad cases and influence on performance + * from changes external to this code, rather than to overo= ptimize on the current commit. + * + * If you are seeing performance stability this script can = help test. + * It tests on 4 commits in zstd where I saw performance ch= ange. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99= 351564473f4 + */ #if defined(__x86_64__) - /* Align the decompression loop to 32 + 16 bytes. - * - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompr= ession - * speed swings based on the alignment of the decompression loop. = This - * performance swing is caused by parts of the decompression loop = falling - * out of the DSB. The entire decompression loop should fit in the= DSB, - * when it can't we get much worse performance. 
You can measure if= you've - * hit the good case or the bad case with this perf command for so= me - * compressed file test.zst: - * - * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any= _uops \ - * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.= zst - * - * If you see most cycles served out of the MITE you've hit the ba= d case. - * If you see most cycles served out of the DSB you've hit the goo= d case. - * If it is pretty even then you may be in an okay case. - * - * I've been able to reproduce this issue on the following CPUs: - * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 - * Use Instruments->Counters to get DSB/MITE cycles. - * I never got performance swings, but I was able to - * go from the good case of mostly DSB to half of the - * cycles served from MITE. - * - Coffeelake: Intel i9-9900k - * - * I haven't been able to reproduce the instability or DSB misses = on any - * of the following CPUS: - * - Haswell - * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH - * - Skylake - * - * If you are seeing performance stability this script can help te= st. - * It tests on 4 commits in zstd where I saw performance change. 
- * - * https://gist.github.com/terrelln/9889fc06a423fd5ca6e993515644= 73f4 - */ - __asm__(".p2align 5"); - __asm__("nop"); - __asm__(".p2align 4"); + __asm__(".p2align 6"); +# if __GNUC__ >=3D 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ =3D=3D 8 || __GNUC__ =3D=3D 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split= between dst and litExtraBuffer */ + for (; litPtr + sequence.litLength <=3D dctx->litBufferEnd; ) { + size_t const oneSeqSize =3D ZSTD_execSequenceSplitLitBuffe= r(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &l= itPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ce, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqS= ize); + op +=3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + sequence =3D ZSTD_decodeSequence(&seqState, isLongOffset); + } + + /* If there are more sequences, they will need to read literal= s from litExtraBuffer; copy over the remainder from dst and update litPtr a= nd litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D 
ZSTD_not_in_dst; + { + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend= , sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, se= quence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)one= SeqSize); + op +=3D oneSeqSize; + if (--nbSeq) + BIT_reloadDStream(&(seqState.DStream)); + } + } + } + + if (nbSeq > 0) /* there is remaining lit from extra buffer */ + { + +#if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ !=3D 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and cla= ng */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >=3D 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for (; ; ) { + seq_t const sequence =3D ZSTD_decodeSequence(&seqState, is= LongOffset); + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend, se= quence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ce, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqS= ize); + op +=3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after de= code loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream= 
_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=3D0; ientropy.rep[i] =3D= (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation =3D=3D ZSTD_split) /* split hasn't been r= eached yet, first get dst then copy litExtraBuffer */ + { + size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall= , ""); + if (op !=3D NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + } + { size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, = ""); + if (op !=3D NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + } + + return op-ostart; +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip =3D (const BYTE*)seqStart; + const BYTE* const iend =3D ip + seqSize; + BYTE* const ostart =3D (BYTE*)dst; + BYTE* const oend =3D dctx->litBufferLocation =3D=3D ZSTD_not_in_dst ? 
= ostart + maxDstSize : dctx->litBuffer; + BYTE* op =3D ostart; + const BYTE* litPtr =3D dctx->litPtr; + const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* const prefixStart =3D (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase =3D (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd =3D (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy =3D 1; + { U32 i; for (i =3D 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[= i] =3D dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTp= tr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OF= Tptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTp= tr); + assert(dst !=3D NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >=3D 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# else + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif #endif + for ( ; ; ) { - seq_t const sequence =3D ZSTD_decodeSequence(&seqState, isLong= Offset, ZSTD_p_noPrefetch); + seq_t const sequence =3D ZSTD_decodeSequence(&seqState, isLong= Offset); size_t const oneSeqSize =3D ZSTD_execSequence(op, oend, sequen= ce, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, = prefixStart, vBase); #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; 
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - BIT_reloadDStream(&(seqState.DStream)); op +=3D oneSeqSize; - /* gcc and clang both don't like early returns in this loop. - * Instead break and check for an error at the end of the loop. - */ - if (UNLIKELY(ZSTD_isError(oneSeqSize))) { - error =3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; - } - if (UNLIKELY(!--nbSeq)) break; + BIT_reloadDStream(&(seqState.DStream)); } =20 /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, rem= aining nbSeq : %i", nbSeq); - if (ZSTD_isError(error)) return error; RETURN_ERROR_IF(nbSeq, corruption_detected, ""); RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream= _completed, corruption_detected, ""); /* save reps for next block */ @@ -1229,9 +1632,37 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, = seqSize, nbSeq, isLongOffset, frame); } + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSiz= e, + const void* seqStart, size_t seqS= ize, int nbSeq, + const ZSTD_longOffset_e isLongOff= set, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSi= ze, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +FORCE_INLINE_TEMPLATE size_t +ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEn= d) +{ + prefetchPos +=3D sequence.litLength; + { const BYTE* const matchBase =3D (sequence.offset > prefetchPos) ? = dictEnd : prefixStart; + const BYTE* const match =3D matchBase + prefetchPos - sequence.off= set; /* note : this operation can overflow when seq.offset is really too la= rge, which can only happen when input is corrupted. 
+ = * No consequence though : memory address is only used for prefetching, = not for dereferencing */ + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note := it's safe to invoke PREFETCH() on any memory address, including invalid on= es */ + } + return prefetchPos + sequence.matchLength; +} + +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. + * It's generally employed when block contains a significant portion of lo= ng-distance matches + * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, @@ -1243,10 +1674,10 @@ ZSTD_decompressSequencesLong_body( const BYTE* ip =3D (const BYTE*)seqStart; const BYTE* const iend =3D ip + seqSize; BYTE* const ostart =3D (BYTE*)dst; - BYTE* const oend =3D ostart + maxDstSize; + BYTE* const oend =3D dctx->litBufferLocation =3D=3D ZSTD_in_dst ? dctx= ->litBuffer : ostart + maxDstSize; BYTE* op =3D ostart; const BYTE* litPtr =3D dctx->litPtr; - const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* litBufferEnd =3D dctx->litBufferEnd; const BYTE* const prefixStart =3D (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart =3D (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd =3D (const BYTE*) (dctx->dictEnd); @@ -1254,18 +1685,17 @@ ZSTD_decompressSequencesLong_body( =20 /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 4 +#define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 +#define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance =3D MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; + size_t prefetchPos =3D (size_t)(op-prefixStart); /* track position= relative to prefixStart */ + dctx->fseEntropy =3D 1; { int i; for (i=3D0; ientropy.rep[i]; } - seqState.prefixStart =3D prefixStart; - seqState.pos =3D (size_t)(op-prefixStart); - seqState.dictEnd =3D dictEnd; assert(dst !=3D NULL); assert(iend >=3D 
ip); RETURN_ERROR_IF( @@ -1277,36 +1707,100 @@ ZSTD_decompressSequencesLong_body( =20 /* prepare in advance */ for (seqNb=3D0; (BIT_reloadDStream(&seqState.DStream) <=3D BIT_DSt= ream_completed) && (seqNblitBufferLocation =3D=3D ZSTD_split && litPtr + sequ= ences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBuff= erEnd) + { + /* lit buffer is reaching split point, empty out the first= buffer and transition to litExtraBuffer */ + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].= litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + oneSeqSize =3D ZSTD_execSequence(op, oend, sequences[(seqN= b - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart,= dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[= (seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequ= ence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memo= ry address, including invalid ones */ - sequences[seqNb & STORED_SEQS_MASK] =3D sequence; - op +=3D oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos =3D ZSTD_prefetchMatch(prefetchPos, sequence, = 
prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] =3D sequence; + op +=3D oneSeqSize; + } + else + { + /* lit buffer is either wholly contained in first or secon= d split, or not split at all*/ + oneSeqSize =3D dctx->litBufferLocation =3D=3D ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + seq= uences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVE= RLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, li= tBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCE= D_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart,= dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos =3D ZSTD_prefetchMatch(prefetchPos, sequence, = prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] =3D sequence; + op +=3D oneSeqSize; + } } RETURN_ERROR_IF(seqNblitBufferLocation =3D=3D ZSTD_split && litPtr + sequ= ence->litLength > dctx->litBufferEnd) + { + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + { + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend= , *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) - 
assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[= seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, se= quences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op +=3D oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op +=3D oneSeqSize; + } + } + else + { + size_t const oneSeqSize =3D dctx->litBufferLocation =3D=3D= ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + seq= uence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, p= refixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBuf= ferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op +=3D oneSeqSize; + } } =20 /* save reps for next block */ @@ -1314,10 +1808,21 @@ ZSTD_decompressSequencesLong_body( } =20 /* last literal segment */ - { size_t const lastLLSize =3D litEnd - litPtr; + if (dctx->litBufferLocation =3D=3D ZSTD_split) /* first deplete liter= al buffer in dst, then copy litExtraBuffer */ + { + size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall= , ""); + if (op !=3D NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize =3D litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, = ""); if (op !=3D NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); + ZSTD_memmove(op, litPtr, 
lastLLSize); op +=3D lastLLSize; } } @@ -1341,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, #if DYNAMIC_BMI2 =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1351,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, = seqSize, nbSeq, isLongOffset, frame); } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSi= ze, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -1383,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst= , size_t maxDstSize, { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqSta= rt, seqSize, nbSeq, isLongOffset, frame); } #endif - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart,= seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStar= t, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t = maxDstSize, + const void* seqStart, size_t seqSize, int= nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int 
frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxD= stSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDs= tSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 @@ -1407,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, se= qStart, seqSize, nbSeq, isLongOffset, frame); } #endif @@ -1448,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTab= le) size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame) + const void* src, size_t srcSize, const int frame, = const streaming_operation streaming) { /* blockType =3D=3D blockCompressed */ const BYTE* ip =3D (const BYTE*)src; /* isLongOffset must be true if there are long offsets. 
@@ -1463,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize >=3D ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); =20 /* Decode literals section */ - { size_t const litCSize =3D ZSTD_decodeLiteralsBlock(dctx, src, srcS= ize); + { size_t const litCSize =3D ZSTD_decodeLiteralsBlock(dctx, src, srcS= ize, dst, dstCapacity, streaming); DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); if (ZSTD_isError(litCSize)) return litCSize; ip +=3D litCSize; @@ -1511,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSiz= e, nbSeq, isLongOffset, frame); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCa= pacity, ip, srcSize, nbSeq, isLongOffset, frame); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, sr= cSize, nbSeq, isLongOffset, frame); #endif } } @@ -1534,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, { size_t dSize; ZSTD_checkContinuity(dctx, dst, dstCapacity); - dSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, s= rcSize, /* frame */ 0); + dSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, s= rcSize, /* frame */ 0, not_streaming); dctx->previousDstEnd =3D (char*)dst + dSize; return dSize; } diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompr= ess/zstd_decompress_block.h index e7f5f6689459..3d2d57a5d25a 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -33,6 +33,12 @@ */ =20 =20 + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming =3D 0, + is_streaming =3D 1 +} streaming_operation; + /* ZSTD_decompressBlock_internal() : * decompress block, starting at `src`, * into destination buffer `dst`. 
@@ -41,7 +47,7 @@ */ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame); + const void* src, size_t srcSize, const int frame,= const streaming_operation streaming); =20 /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) @@ -54,7 +60,7 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, */ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2); =20 diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/deco= mpress/zstd_decompress_internal.h index 4b9052f68755..98102edb6a83 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -20,7 +20,7 @@ * Dependencies *********************************************************/ #include "../common/mem.h" /* BYTE, U16, U32 */ -#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */ +#include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff= , LLFSELog, etc. 
*/ =20 =20 =20 @@ -40,7 +40,7 @@ static UNUSED_ATTR const U32 OF_base[MaxOff+1] =3D { 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3= FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1= FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; =20 -static UNUSED_ATTR const U32 OF_bits[MaxOff+1] =3D { +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] =3D { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -106,6 +106,22 @@ typedef struct { size_t ddictPtrCount; } ZSTD_DDictHashSet; =20 +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuff= er */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL= _BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst =3D 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst =3D 1, /* Stored entirely within dst (in memory = after current output write) */ + ZSTD_split =3D 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -136,7 +152,9 @@ struct ZSTD_DCtx_s size_t litSize; size_t rleSize; size_t staticSize; +#if DYNAMIC_BMI2 !=3D 0 int bmi2; /* =3D=3D 1 if the CPU supports BMI2 and= 0 otherwise. CPU support is determined dynamically once per context lifeti= me. 
*/ +#endif =20 /* dictionary */ ZSTD_DDict* ddictLocal; @@ -158,16 +176,16 @@ struct ZSTD_DCtx_s size_t outStart; size_t outEnd; size_t lhSize; - void* legacyContext; - U32 previousLegacyVersion; - U32 legacyVersion; U32 hostageByte; int noForwardProgress; ZSTD_bufferMode_e outBufferMode; ZSTD_outBuffer expectedOutBuffer; =20 /* workspace */ - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /*= literal buffer can be split between storage within dst and within this scr= atch buffer */ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; =20 size_t oversizedDuration; @@ -180,6 +198,14 @@ struct ZSTD_DCtx_s /* Tracing */ }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ =20 +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 !=3D 0 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} =20 /*-******************************************************* * Shared internal functions diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h index 0fbec508f285..a06ca187aab5 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -16,6 +16,12 @@ * decompression. */ =20 +/* + * Disable the ASM Huffman implementation because we need to + * include all the sources. 
+ */ +#define ZSTD_DISABLE_ASM 1 + #include "common/debug.c" #include "common/entropy_common.c" #include "common/error_private.c" diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_modul= e.c index 65548a4bb934..04e1b5c01d9b 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -133,7 +133,11 @@ EXPORT_SYMBOL(zstd_init_cstream); size_t zstd_reset_cstream(zstd_cstream *cstream, unsigned long long pledged_src_size) { - return ZSTD_resetCStream(cstream, pledged_src_size); + if (pledged_src_size =3D=3D 0) + pledged_src_size =3D ZSTD_CONTENTSIZE_UNKNOWN; + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_reset(cstream, ZSTD_reset_session_only) ); + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_setPledgedSrcSize(cstream, pledged_src_siz= e) ); + return 0; } EXPORT_SYMBOL(zstd_reset_cstream); =20 --=20 2.38.1