From nobody Wed Apr 8 07:59:17 2026 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on aws-us-west-2-korg-lkml-1.web.codeaurora.org Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id 000EAC67871 for ; Mon, 24 Oct 2022 22:13:29 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231690AbiJXWN2 (ORCPT ); Mon, 24 Oct 2022 18:13:28 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:38178 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S232565AbiJXWMw (ORCPT ); Mon, 24 Oct 2022 18:12:52 -0400 Received: from mail-vk1-xa31.google.com (mail-vk1-xa31.google.com [IPv6:2607:f8b0:4864:20::a31]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 334E230501E for ; Mon, 24 Oct 2022 13:28:42 -0700 (PDT) Received: by mail-vk1-xa31.google.com with SMTP id m18so2112932vka.10 for ; Mon, 24 Oct 2022 13:28:41 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=FtjfA87GphDJcI4Vb9iAM+chCBCFrIrn+2UzKfCSzkM=; b=k4bz2u0erMG2lsu2Psoe5/usHLFPMqhFmHnRrJe0M+hB86PaLpQ7eCFfAzFmdzz1Qv q7ZZnGR1H6LfaNjZ/JrHA77mttFZoADtt6TAJK0F9q6u5z66+75iRGMyo+8GHDPT9p4P mFbGQjjLg/uKEg8zJpOSxyXDw2AJWwwea8IGNnaBLli3KvHu8L1bLPvbltmjMRVxTpXD 3y6RGGHjtq6aFJSiYUHIGPZ7cMF3d6d7RyrWfuHcH1+8p4JuWM4Pz4qzFjJo2GrtB1qm 5oweYcc5sy5AtXjxueMZPMQLluXjjp3PVWMS8Zp9AY9c10z2DxBq3O+uA5RJ6HvXyKOj Kmnw== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=FtjfA87GphDJcI4Vb9iAM+chCBCFrIrn+2UzKfCSzkM=; b=dFRjmZ0+Yyq7Zcir/HvAC7lcTlEBfAvyeUdcQKk3BjfdEncYnq3QbG/abegoIi8R0L 
QEU+hOuwiL8BlvCEA6JWC5cRCpBMed6SWitBh9wQG2KoNIsW1y6iwZNZClLC+Q11c4jV 8cJfpFYWCshbcdBcDMJJFTbxAUGEduirz9+tx/9mq83mI8xEY7ACKmWBrZ0xQ/yR77ml zVCP4uK486vOkhK0nErX/GR9p2vzwWCQQ3zNIeXTa/p6hrBBqztFbXmhYuEiWsPvbWWa Wz1wxSnmarU3q+uQ/kScqVbyL56SxjgOlI33OrtnqJYYHk5Xjzf7HidcnXF5DYdDT55O SMpQ== X-Gm-Message-State: ACrzQf2uJqxV6LC9eU7sPerx3iXnqoJdVm9l4k0gdpoqBQHyp5qKVTvb 1oq+Axs1Qyu+Q8UXlEVL7UYtSm+dp7Q= X-Google-Smtp-Source: AMsMyM5LGD+kKoUy+UyXdcaHRMuZb+H9htNgOT9oSVdgZ1XZ1DqfomlJfGo6OT3w8iAWsBX5oQp0ww== X-Received: by 2002:a17:902:ce0e:b0:17d:a730:3835 with SMTP id k14-20020a170902ce0e00b0017da7303835mr36674422plg.131.1666642613053; Mon, 24 Oct 2022 13:16:53 -0700 (PDT) Received: from localhost.localdomain (c-98-35-160-214.hsd1.ca.comcast.net. [98.35.160.214]) by smtp.gmail.com with ESMTPSA id k14-20020aa7972e000000b0056bb4dc8164sm173518pfg.193.2022.10.24.13.16.51 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 24 Oct 2022 13:16:52 -0700 (PDT) From: Nick Terrell To: Nick Terrell Cc: linux-kernel@vger.kernel.org, Nick Terrell , =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= , David Sterba , Sam Hardeman , Kernel Team Subject: [PATCH 1/2] zstd: Move zstd-common module exports to zstd_common_module.c Date: Mon, 24 Oct 2022 13:26:05 -0700 Message-Id: <20221024202606.404049-2-nickrterrell@gmail.com> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221024202606.404049-1-nickrterrell@gmail.com> References: <20221024202606.404049-1-nickrterrell@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Nick Terrell The zstd codebase is imported from the upstream zstd repo, and is over-writ= ten on every update. Upstream keeps the kernel specific code separate from the main library. So the module definition is moved into the zstd_common_module.c fi= le. This matches the pattern followed by the zstd-compress and zstd-decompress = files. 
I've done build and boot testing on x86-64, i386, and aarch64. I've verified that zstd built both as modules and built-in build and boot. Signed-off-by: Nick Terrell --- lib/zstd/Makefile | 1 + lib/zstd/common/entropy_common.c | 4 ---- lib/zstd/common/zstd_common.c | 10 ---------- lib/zstd/zstd_common_module.c | 32 ++++++++++++++++++++++++++++++++ 4 files changed, 33 insertions(+), 14 deletions(-) create mode 100644 lib/zstd/zstd_common_module.c diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile index 440bd0007ae2..20f08c644b71 100644 --- a/lib/zstd/Makefile +++ b/lib/zstd/Makefile @@ -35,6 +35,7 @@ zstd_decompress-y :=3D \ decompress/zstd_decompress_block.o \ =20 zstd_common-y :=3D \ + zstd_common_module.o \ common/debug.o \ common/entropy_common.o \ common/error_private.o \ diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_com= mon.c index a311808c0d56..6353249de614 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -15,7 +15,6 @@ /* ************************************* * Dependencies ***************************************/ -#include #include "mem.h" #include "error_private.h" /* ERR_*, ERROR */ #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ @@ -240,7 +239,6 @@ size_t FSE_readNCount( { return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, h= eaderBuffer, hbSize, /* bmi2 */ 0); } -EXPORT_SYMBOL_GPL(FSE_readNCount); =20 /*! HUF_readStats() : Read compact Huffman tree, saved by HUF_writeCTable(). 
@@ -256,7 +254,6 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U= 32* rankStats, U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr,= tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); } -EXPORT_SYMBOL_GPL(HUF_readStats); =20 FORCE_INLINE_TEMPLATE size_t HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, @@ -357,4 +354,3 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSi= ze, U32* rankStats, (void)bmi2; return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSym= bolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); } -EXPORT_SYMBOL_GPL(HUF_readStats_wksp); diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c index 0f1f63be25d9..3d7e35b309b5 100644 --- a/lib/zstd/common/zstd_common.c +++ b/lib/zstd/common/zstd_common.c @@ -13,7 +13,6 @@ /*-************************************* * Dependencies ***************************************/ -#include #define ZSTD_DEPS_NEED_MALLOC #include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_mems= et */ #include "error_private.h" @@ -36,17 +35,14 @@ const char* ZSTD_versionString(void) { return ZSTD_VERS= ION_STRING; } * tells if a return value is an error code * symbol is required for external callers */ unsigned ZSTD_isError(size_t code) { return ERR_isError(code); } -EXPORT_SYMBOL_GPL(ZSTD_isError); =20 /*! ZSTD_getErrorName() : * provides error code string from function result (useful for debugging)= */ const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code)= ; } -EXPORT_SYMBOL_GPL(ZSTD_getErrorName); =20 /*! ZSTD_getError() : * convert a `size_t` function result into a proper ZSTD_errorCode enum */ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(co= de); } -EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); =20 /*! 
ZSTD_getErrorString() : * provides error code string from enum */ @@ -63,7 +59,6 @@ void* ZSTD_customMalloc(size_t size, ZSTD_customMem custo= mMem) return customMem.customAlloc(customMem.opaque, size); return ZSTD_malloc(size); } -EXPORT_SYMBOL_GPL(ZSTD_customMalloc); =20 void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) { @@ -76,7 +71,6 @@ void* ZSTD_customCalloc(size_t size, ZSTD_customMem custo= mMem) } return ZSTD_calloc(1, size); } -EXPORT_SYMBOL_GPL(ZSTD_customCalloc); =20 void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) { @@ -87,7 +81,3 @@ void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ZSTD_free(ptr); } } -EXPORT_SYMBOL_GPL(ZSTD_customFree); - -MODULE_LICENSE("Dual BSD/GPL"); -MODULE_DESCRIPTION("Zstd Common"); diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c new file mode 100644 index 000000000000..22686e367e6f --- /dev/null +++ b/lib/zstd/zstd_common_module.c @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#include + +#include "common/huf.h" +#include "common/fse.h" +#include "common/zstd_internal.h" + +// Export symbols shared by compress and decompress into a common module + +#undef ZSTD_isError /* defined within zstd_internal.h */ +EXPORT_SYMBOL_GPL(FSE_readNCount); +EXPORT_SYMBOL_GPL(HUF_readStats); +EXPORT_SYMBOL_GPL(HUF_readStats_wksp); +EXPORT_SYMBOL_GPL(ZSTD_isError); +EXPORT_SYMBOL_GPL(ZSTD_getErrorName); +EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +EXPORT_SYMBOL_GPL(ZSTD_customFree); + +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_DESCRIPTION("Zstd Common"); --=20 2.38.1 From nobody Wed Apr 8 07:59:17 2026 Return-Path: Received: from vger.kernel.org (vger.kernel.org [23.128.96.18]) by smtp.lore.kernel.org (Postfix) with ESMTP id CD202C38A2D for ; Mon, 24 Oct 2022 22:05:23 +0000 (UTC) Received: (majordomo@vger.kernel.org) by vger.kernel.org via listexpand id S231150AbiJXWFW (ORCPT ); Mon, 24 Oct 2022 18:05:22 -0400 Received: from lindbergh.monkeyblade.net ([23.128.96.19]:36170 "EHLO lindbergh.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S231978AbiJXWEz (ORCPT ); Mon, 24 Oct 2022 18:04:55 -0400 Received: from mail-pl1-x62d.google.com (mail-pl1-x62d.google.com [IPv6:2607:f8b0:4864:20::62d]) by lindbergh.monkeyblade.net (Postfix) with ESMTPS id 851879B877 for ; Mon, 24 Oct 2022 13:18:37 -0700 (PDT) Received: by mail-pl1-x62d.google.com with SMTP id p3so8457931pld.10 for ; Mon, 24 Oct 2022 13:18:37 -0700 (PDT) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=gmail.com; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:from:to:cc:subject:date :message-id:reply-to; bh=aB3pyQlqU5SvdVBqjlavg6jCXdGT84BHAaSWMzVkXOo=; b=YLYcPh3xqTG9AeKDSkjtmLRFUlHAUM8tfpWOlUo0ZXdZIAI6rFABrhvJWJpsqmZnV4 Z26OsfqsppHuX2fctY+4eBWExvZLQba53Wk2E6FCgNl98lOL4nmj7jIcOv60wA5ZxB9K 
K78uqapxkvEYLhhkjeMcU6JjF3SiS4IV4rzLlfkWn6/29aqsmZK9XigdbY1Jd0OLge3K aYYM0Tq9hwXw2muFyd/SQBbTNZjdZ5dKtW9ezOQybcQFlttRpi8ITnE36hYNw+rq7HI9 sQxeK2zfC592FO3lMm2x0gy2WiZcfmzLtCdPSH/HFHSZN96kjbZ9av+8OcDD4kluUAup 7ugA== X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=content-transfer-encoding:mime-version:references:in-reply-to :message-id:date:subject:cc:to:from:x-gm-message-state:from:to:cc :subject:date:message-id:reply-to; bh=aB3pyQlqU5SvdVBqjlavg6jCXdGT84BHAaSWMzVkXOo=; b=LTRStc4dfx+sL4/vgKvm0TLaH5ch+hLd8O2aj4lG3Zsj53ofYOxwDfRJK94yEJ9fMr +YaDb+3ZDCMMdABboxEPRwZpIqKvpf4g0pJSCcqNUWjEAG6Dz5J6w4bPQEBx44J0bQRS MeWjRI88lbO4r6XqMaGW5TZS89XmDfWXGTt7tIMgmuGNc7t4SHxT8wU+19OMeV5AZBhp EmOUet0nda8QEChinuLG9SghV4GkmBRRNOHru1NgoTyKENepxjz5eHfhTsZDkFavIuNx cHK2gjTjKJKMtuUJ5RRpXuxWrlqMjqs8bFRd/oRff/nhkwYvnxCrUFmgMCgXacgWqdFf w0iw== X-Gm-Message-State: ACrzQf1LDbjjXtfcqn4L1Udm7fkGkmJ8/P+MNBgBW868jEWXS+F27YB5 b2O9tFny+wf5XdWsKz0uFSM= X-Google-Smtp-Source: AMsMyM6ac3ZzzIv1sM0wR5ZU2+PHPb0yp84WErkBW2Z+dFRhw+IGwZY+Zpgpvn03lom549Z9j1g03A== X-Received: by 2002:a17:90b:1e11:b0:20d:90b3:45a0 with SMTP id pg17-20020a17090b1e1100b0020d90b345a0mr72718807pjb.29.1666642615999; Mon, 24 Oct 2022 13:16:55 -0700 (PDT) Received: from localhost.localdomain (c-98-35-160-214.hsd1.ca.comcast.net. 
[98.35.160.214]) by smtp.gmail.com with ESMTPSA id k14-20020aa7972e000000b0056bb4dc8164sm173518pfg.193.2022.10.24.13.16.53 (version=TLS1_3 cipher=TLS_AES_256_GCM_SHA384 bits=256/256); Mon, 24 Oct 2022 13:16:55 -0700 (PDT) From: Nick Terrell To: Nick Terrell Cc: linux-kernel@vger.kernel.org, Nick Terrell , =?UTF-8?q?Andr=C3=A9=20Goddard=20Rosa?= , David Sterba , Sam Hardeman , Kernel Team Subject: [PATCH 2/2] zstd: import upstream v1.5.2 Date: Mon, 24 Oct 2022 13:26:06 -0700 Message-Id: <20221024202606.404049-3-nickrterrell@gmail.com> X-Mailer: git-send-email 2.38.1 In-Reply-To: <20221024202606.404049-1-nickrterrell@gmail.com> References: <20221024202606.404049-1-nickrterrell@gmail.com> MIME-Version: 1.0 Content-Transfer-Encoding: quoted-printable Precedence: bulk List-ID: X-Mailing-List: linux-kernel@vger.kernel.org Content-Type: text/plain; charset="utf-8" From: Nick Terrell Updates the kernel's zstd library to v1.5.2, the latest zstd release. The upstream tag it is updated to is `v1.5.2-kernel`, which contains several cherry-picked commits on top of the v1.5.2 release which are required for the kernel update. I will create this tag once the PR is ready to merge, until then reference the temporary upstream branch `v1.5.2-kernel-cherrypicks`. I plan to submit this patch as part of the v6.2 merge window. I've done basic build testing & testing on x86-64, i386, and aarch64. I'm merging these patches into my `zstd-next` branch, which is pulled into `linux-next` for further testing. I've benchmarked BtrFS with zstd compression on a x86-64 machine, and saw these results. Decompression speed is a small win across the board. The lower compression levels 1-4 see both compression speed and compression ratio wins. The higher compression levels see a small compression speed loss and about neutral ratio. I expect the lower compression levels to be used much more heavily than the high compression levels, so this should be a net win. 
Level CTime DTime Ratio 1 -2.95% -1.1% -0.7% 3 -3.5% -1.2% -0.5% 5 +3.7% -1.0% +0.0% 7 +3.2% -0.9% +0.0% 9 -4.3% -0.8% +0.1% Signed-off-by: Nick Terrell --- include/linux/zstd_lib.h | 479 ++-- lib/zstd/common/bitstream.h | 9 + lib/zstd/common/compiler.h | 67 +- lib/zstd/common/entropy_common.c | 7 +- lib/zstd/common/error_private.h | 81 +- lib/zstd/common/fse.h | 3 +- lib/zstd/common/fse_decompress.c | 2 +- lib/zstd/common/huf.h | 46 +- lib/zstd/common/mem.h | 2 + lib/zstd/common/portability_macros.h | 93 + lib/zstd/common/zstd_internal.h | 175 +- lib/zstd/compress/clevels.h | 132 ++ lib/zstd/compress/fse_compress.c | 83 +- lib/zstd/compress/huf_compress.c | 644 +++++- lib/zstd/compress/zstd_compress.c | 2000 +++++++++++++---- lib/zstd/compress/zstd_compress_internal.h | 375 +++- lib/zstd/compress/zstd_compress_literals.c | 9 +- lib/zstd/compress/zstd_compress_literals.h | 4 +- lib/zstd/compress/zstd_compress_sequences.c | 31 +- lib/zstd/compress/zstd_compress_superblock.c | 295 +-- lib/zstd/compress/zstd_cwksp.h | 225 +- lib/zstd/compress/zstd_double_fast.c | 413 +++- lib/zstd/compress/zstd_fast.c | 441 ++-- lib/zstd/compress/zstd_lazy.c | 1352 ++++++++--- lib/zstd/compress/zstd_lazy.h | 38 + lib/zstd/compress/zstd_ldm.c | 76 +- lib/zstd/compress/zstd_ldm.h | 1 + lib/zstd/compress/zstd_ldm_geartab.h | 5 +- lib/zstd/compress/zstd_opt.c | 402 ++-- lib/zstd/decompress/huf_decompress.c | 912 ++++++-- lib/zstd/decompress/zstd_decompress.c | 80 +- lib/zstd/decompress/zstd_decompress_block.c | 1022 +++++++-- lib/zstd/decompress/zstd_decompress_block.h | 10 +- .../decompress/zstd_decompress_internal.h | 38 +- lib/zstd/decompress_sources.h | 6 + lib/zstd/zstd_compress_module.c | 6 +- 36 files changed, 6955 insertions(+), 2609 deletions(-) create mode 100644 lib/zstd/common/portability_macros.h create mode 100644 lib/zstd/compress/clevels.h diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h index b8c7dbf98390..79d55465d5c1 100644 --- a/include/linux/zstd_lib.h 
+++ b/include/linux/zstd_lib.h @@ -17,8 +17,16 @@ =20 =20 /* =3D=3D=3D=3D=3D ZSTDLIB_API : control library symbols visibility = =3D=3D=3D=3D=3D */ -#define ZSTDLIB_VISIBILITY=20 -#define ZSTDLIB_API ZSTDLIB_VISIBILITY +#ifndef ZSTDLIB_VISIBLE +# if (__GNUC__ >=3D 4) && !defined(__MINGW32__) +# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) +# define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) +# else +# define ZSTDLIB_VISIBLE +# define ZSTDLIB_HIDDEN +# endif +#endif +#define ZSTDLIB_API ZSTDLIB_VISIBLE =20 =20 /* ***********************************************************************= ****** @@ -56,8 +64,8 @@ =20 /*------ Version ------*/ #define ZSTD_VERSION_MAJOR 1 -#define ZSTD_VERSION_MINOR 4 -#define ZSTD_VERSION_RELEASE 10 +#define ZSTD_VERSION_MINOR 5 +#define ZSTD_VERSION_RELEASE 2 #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_M= INOR *100 + ZSTD_VERSION_RELEASE) =20 /*! ZSTD_versionNumber() : @@ -94,7 +102,6 @@ ZSTDLIB_API const char* ZSTD_versionString(void); #define ZSTD_BLOCKSIZE_MAX (1<=3D first frame size * @return : the compressed size of the first frame starting at `src`, @@ -165,8 +172,9 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const v= oid* src, size_t srcSize) ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum c= ompressed size in worst case single-pass scenario */ ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if = a `size_t` function result is an error code */ ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides = readable string from an error code */ -ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum n= egative compression level allowed */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum n= egative compression level allowed, requires v1.4.0+ */ ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum c= ompression level available */ +ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default c= ompression level, specified by 
ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ =20 =20 /* ************************************* @@ -219,9 +227,9 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, const void* src, size_t srcSize); =20 =20 -/* ************************************* -* Advanced compression API -***************************************/ +/* ******************************************* +* Advanced compression API (Requires v1.4.0+) +**********************************************/ =20 /* API design : * Parameters are pushed one by one into an existing context, @@ -232,7 +240,7 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, * * It's possible to reset all parameters to "default" using ZSTD_CCtx_re= set(). * - * This API supercedes all other "advanced" API entry points in the expe= rimental section. + * This API supersedes all other "advanced" API entry points in the expe= rimental section. * In the future, we expect to remove from experimental API entry points= which are redundant with this API. */ =20 @@ -251,7 +259,6 @@ typedef enum { ZSTD_fast=3D1, Only the order (from fast to strong) is guarantee= d */ } ZSTD_strategy; =20 - typedef enum { =20 /* compression parameters @@ -317,7 +324,6 @@ typedef enum { * The higher the value of selected strategy,= the more complex it is, * resulting in stronger and slower compressi= on. * Special: value 0 means "use default strate= gy". */ - /* LDM mode parameters */ ZSTD_c_enableLongDistanceMatching=3D160, /* Enable long distance match= ing. * This parameter is designed to impro= ve compression ratio @@ -374,7 +380,7 @@ typedef enum { ZSTD_c_jobSize=3D401, /* Size of a compression job. This value is= enforced only when nbWorkers >=3D 1. * Each compression job is completed in paral= lel, so this value can indirectly impact the nb of active threads. * 0 means default, which is dynamically dete= rmined based on compression parameters. - * Job size must be a minimum of overlap size= , or 1 MB, whichever is largest. 
+ * Job size must be a minimum of overlap size= , or ZSTDMT_JOBSIZE_MIN (=3D 512 KB), whichever is largest. * The minimum size is automatically and tran= sparently enforced. */ ZSTD_c_overlapLog=3D402, /* Control the overlap size, as a fraction = of window size. * The overlap size is an amount of data relo= aded from previous job at the beginning of a new job. @@ -404,6 +410,8 @@ typedef enum { * ZSTD_c_stableOutBuffer * ZSTD_c_blockDelimiters * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder * Because they are not stable, it's necessary to define ZSTD_STATIC_L= INKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still= change. @@ -419,7 +427,10 @@ typedef enum { ZSTD_c_experimentalParam9=3D1006, ZSTD_c_experimentalParam10=3D1007, ZSTD_c_experimentalParam11=3D1008, - ZSTD_c_experimentalParam12=3D1009 + ZSTD_c_experimentalParam12=3D1009, + ZSTD_c_experimentalParam13=3D1010, + ZSTD_c_experimentalParam14=3D1011, + ZSTD_c_experimentalParam15=3D1012 } ZSTD_cParameter; =20 typedef struct { @@ -504,9 +515,9 @@ ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, const void* src, size_t srcSize); =20 =20 -/* ************************************* -* Advanced decompression API -***************************************/ +/* ********************************************* +* Advanced decompression API (Requires v1.4.0+) +************************************************/ =20 /* The advanced API pushes parameters one by one into an existing DCtx con= text. * Parameters are sticky, and remain valid for all following frames @@ -668,7 +679,7 @@ typedef enum { : note : multithreaded compression will block to f= lush as much output as possible. */ } ZSTD_EndDirective; =20 -/*! ZSTD_compressStream2() : +/*! ZSTD_compressStream2() : Requires v1.4.0+ * Behaves about the same as ZSTD_compressStream, with additional control= on end directive. 
* - Compression parameters are pushed into CCtx before starting compress= ion, using ZSTD_CCtx_set*() * - Compression parameters cannot be changed once compression is started= (save a list of exceptions in multi-threading mode) @@ -714,11 +725,11 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< r= ecommended size for output =20 =20 /* ***********************************************************************= ****** - * This following is a legacy streaming API. + * This following is a legacy streaming API, available since v1.0+ . * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). * It is redundant, but remains fully supported. - * Advanced parameters and dictionary compression can only be used through= the - * new API. + * Streaming in combination with advanced parameters and dictionary compre= ssion + * can only be used through the new API. *************************************************************************= *****/ =20 /*! @@ -796,7 +807,7 @@ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< re= commended size for output /*! ZSTD_compress_usingDict() : * Compression at an explicit compression level using a Dictionary. * A dictionary can be any arbitrary data segment (also called a prefix), - * or a buffer with specified information (see dictBuilder/zdict.h). + * or a buffer with specified information (see zdict.h). * Note : This function loads the dictionary, resulting in significant st= artup delay. * It's intended for a dictionary used only once. * Note 2 : When `dict =3D=3D NULL || dictSize < 8` no dictionary is used= . */ @@ -879,19 +890,25 @@ ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DC= tx* dctx, * Dictionary helper functions *******************************/ =20 -/*! ZSTD_getDictID_fromDict() : +/*! ZSTD_getDictID_fromDict() : Requires v1.4.0+ * Provides the dictID stored within dictionary. * if @return =3D=3D 0, the dictionary is not conformant with Zstandard s= pecification. 
* It can still be loaded, but as a content-only dictionary. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dict= Size); =20 -/*! ZSTD_getDictID_fromDDict() : +/*! ZSTD_getDictID_fromCDict() : Requires v1.5.0+ + * Provides the dictID of the dictionary loaded into `cdict`. + * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. */ +ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); + +/*! ZSTD_getDictID_fromDDict() : Requires v1.4.0+ * Provides the dictID of the dictionary loaded into `ddict`. * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. */ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); =20 -/*! ZSTD_getDictID_fromFrame() : +/*! ZSTD_getDictID_fromFrame() : Requires v1.4.0+ * Provides the dictID required to decompressed the frame stored within `= src`. * If @return =3D=3D 0, the dictID could not be decoded. * This could for one of the following reasons : @@ -905,16 +922,16 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const v= oid* src, size_t srcSize); =20 =20 /* ***********************************************************************= ****** - * Advanced dictionary and prefix API + * Advanced dictionary and prefix API (Requires v1.4.0+) * * This API allows dictionaries to be used with ZSTD_compress2(), - * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky,= and + * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sti= cky, and * only reset with the context is reset with ZSTD_reset_parameters or * ZSTD_reset_session_and_parameters. Prefixes are single-use. *************************************************************************= *****/ =20 =20 -/*! ZSTD_CCtx_loadDictionary() : +/*! 
ZSTD_CCtx_loadDictionary() : Requires v1.4.0+ * Create an internal CDict from `dict` buffer. * Decompression will have to use same dictionary. * @result : 0, or an error code (which can be tested with ZSTD_isError()). @@ -933,7 +950,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const voi= d* src, size_t srcSize); * to precisely select how dictionary content must be interprete= d. */ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* d= ict, size_t dictSize); =20 -/*! ZSTD_CCtx_refCDict() : +/*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used for all next compressed fr= ames. * Note that compression parameters are enforced from within CDict, * and supersede any compression parameter previously set within CCtx. @@ -947,7 +964,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* = cctx, const void* dict, s * Note 2 : CDict is just referenced, its lifetime must outlive its usage= within CCtx. */ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* c= dict); =20 -/*! ZSTD_CCtx_refPrefix() : +/*! ZSTD_CCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) for next compressed frame. * A prefix is **only used once**. Tables are discarded at end of frame (= ZSTD_e_end). * Decompression will need same prefix to properly regenerate data. @@ -968,7 +985,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, = const ZSTD_CDict* cdict); ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize); =20 -/*! ZSTD_DCtx_loadDictionary() : +/*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ * Create an internal DDict from dict buffer, * to be used to decompress next frames. * The dictionary remains valid for all future frames, until explicitly i= nvalidated. 
@@ -985,7 +1002,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, */ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* d= ict, size_t dictSize); =20 -/*! ZSTD_DCtx_refDDict() : +/*! ZSTD_DCtx_refDDict() : Requires v1.4.0+ * Reference a prepared dictionary, to be used to decompress next frames. * The dictionary remains active for decompression of future frames using= same DCtx. * @@ -1003,7 +1020,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx= * dctx, const void* dict, s */ ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* d= dict); =20 -/*! ZSTD_DCtx_refPrefix() : +/*! ZSTD_DCtx_refPrefix() : Requires v1.4.0+ * Reference a prefix (single-usage dictionary) to decompress next frame. * This is the reverse operation of ZSTD_CCtx_refPrefix(), * and must use the same prefix as the one used during compression. @@ -1024,7 +1041,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dct= x, =20 /* =3D=3D=3D Memory management =3D=3D=3D */ =20 -/*! ZSTD_sizeof_*() : +/*! ZSTD_sizeof_*() : Requires v1.4.0+ * These functions give the _current_ memory usage of selected object. * Note that object memory usage can evolve (increase or decrease) over t= ime. */ ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx); @@ -1049,6 +1066,29 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDic= t* ddict); #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY =20 +/* This can be overridden externally to hide static symbols. */ +#ifndef ZSTDLIB_STATIC_API +#define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE +#endif + +/* Deprecation warnings : + * Should these warnings be a problem, it is generally possible to disable= them, + * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_W= ARNINGS in Visual. + * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
+ */ +#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecat= ion warnings */ +#else +# if (defined(GNUC) && (GNUC > 4 || (GNUC =3D=3D 4 && GNUC_MINOR >=3D 5))= ) || defined(__clang__) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((dep= recated(message))) +# elif (__GNUC__ >=3D 3) +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((dep= recated)) +# else +# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for th= is compiler") +# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +# endif +#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ + /* ***********************************************************************= *************** * experimental API (static linking only) *************************************************************************= *************** @@ -1111,9 +1151,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict= * ddict); #define ZSTD_SRCSIZEHINT_MIN 0 #define ZSTD_SRCSIZEHINT_MAX INT_MAX =20 -/* internal */ -#define ZSTD_HASHLOG3_MAX 17 - =20 /* --- Advanced types --- */ =20 @@ -1255,6 +1292,15 @@ typedef enum { ZSTD_lcm_uncompressed =3D 2 /*< Always emit uncompressed literals. */ } ZSTD_literalCompressionMode_e; =20 +typedef enum { + /* Note: This enum controls features which are conditionally beneficial.= Zstd typically will make a final + * decision on whether or not to enable the feature (ZSTD_ps_auto), but = setting the switch to ZSTD_ps_enable + * or ZSTD_ps_disable allow for a force enable/disable the feature. 
+ */ + ZSTD_ps_auto =3D 0, /* Let the library automatically determine w= hether the feature shall be enabled */ + ZSTD_ps_enable =3D 1, /* Force-enable the feature */ + ZSTD_ps_disable =3D 2 /* Do not use the feature */ +} ZSTD_paramSwitch_e; =20 /* ************************************* * Frame size functions @@ -1281,7 +1327,7 @@ typedef enum { * note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it= must traverse the input to * read each contained frame header. This is fast as most of t= he data is skipped, * however it does mean that all frame data must be present and= valid. */ -ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, = size_t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void= * src, size_t srcSize); =20 /*! ZSTD_decompressBound() : * `src` should point to the start of a series of ZSTD encoded and/or ski= ppable frames @@ -1296,13 +1342,13 @@ ZSTDLIB_API unsigned long long ZSTD_findDecompresse= dSize(const void* src, size_t * note 3 : when the decompressed size field isn't available, the upper-= bound for that frame is calculated by: * upper-bound =3D # blocks * min(128 KB, Window_Size) */ -ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_= t srcSize); +ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src= , size_t srcSize); =20 /*! ZSTD_frameHeaderSize() : * srcSize must be >=3D ZSTD_FRAMEHEADERSIZE_PREFIX. 
* @return : size of the Frame Header, * or an error code (if srcSize is too small) */ -ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t src= Size); =20 typedef enum { ZSTD_sf_noBlockDelimiters =3D 0, /* Representation of ZSTD_Seque= nce has no block delimiters, sequences only */ @@ -1325,12 +1371,12 @@ typedef enum { * @return : number of sequences generated */ =20 -ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* ou= tSeqs, +ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Seque= nce* outSeqs, size_t outSeqsSize, const void* = src, size_t srcSize); =20 /*! ZSTD_mergeBlockDelimiters() : * Given an array of ZSTD_Sequence, remove all sequences that represent bl= ock delimiters/last literals - * by merging them into into the literals of the next sequence. + * by merging them into the literals of the next sequence. * * As such, the final generated result has no explicit representation of b= lock boundaries, * and the final last literals segment is not represented in the sequences. @@ -1339,7 +1385,7 @@ ZSTDLIB_API size_t ZSTD_generateSequences(ZSTD_CCtx* = zc, ZSTD_Sequence* outSeqs, * setting of ZSTD_c_blockDelimiters as ZSTD_sf_noBlockDelimiters * @return : number of sequences left after merging */ -ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, siz= e_t seqsSize); +ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequenc= es, size_t seqsSize); =20 /*! ZSTD_compressSequences() : * Compress an array of ZSTD_Sequence, generated from the original source = buffer, into dst. @@ -1369,7 +1415,7 @@ ZSTDLIB_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Seq= uence* sequences, size_t se * and cannot emit an RLE block that disagrees with the repcode hi= story * @return : final compressed size or a ZSTD error. 
*/ -ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst= , size_t dstSize, +ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, vo= id* dst, size_t dstSize, const ZSTD_Sequence* inSeqs, size_t inSe= qsSize, const void* src, size_t srcSize); =20 @@ -1377,7 +1423,7 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx* = const cctx, void* dst, size /*! ZSTD_writeSkippableFrame() : * Generates a zstd skippable frame containing data given by src, and writ= es it to dst buffer. * - * Skippable frames begin with a a 4-byte magic number. There are 16 possi= ble choices of magic number, + * Skippable frames begin with a 4-byte magic number. There are 16 possibl= e choices of magic number, * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+1= 5. * As such, the parameter magicVariant controls the exact skippable frame = magic number variant used, so * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. @@ -1387,9 +1433,29 @@ ZSTDLIB_API size_t ZSTD_compressSequences(ZSTD_CCtx*= const cctx, void* dst, size * * @return : number of bytes written or a ZSTD error. */ -ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCa= pacity, const void* src, size_t srcSiz= e, unsigned magicVariant); =20 +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writ= es it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was suppl= ied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the= caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the f= rame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, = unsigned* magicVariant, + const void* src, size_t srcSiz= e); + +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier = for a skippable frame. + */ +ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size= ); + + =20 /* ************************************* * Memory management @@ -1418,10 +1484,10 @@ ZSTDLIB_API size_t ZSTD_writeSkippableFrame(void* d= st, size_t dstCapacity, * Note 2 : only single-threaded compression is supported. * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if Z= STD_c_nbWorkers is >=3D 1. */ -ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionPara= meters cParams); -ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_p= arams* params); -ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compress= ionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD= _CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); =20 /*! ZSTD_estimateCStreamSize() : * ZSTD_estimateCStreamSize() will provide a budget large enough for any = compression level up to selected one. @@ -1436,20 +1502,20 @@ ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void); * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), * an internal ?Dict will be created, which additional size is not= estimated here. 
* In this case, get total size by adding ZSTD_estimate?DictSize */ -ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionP= arameters cParams); -ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCt= x_params* params); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize); -ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, siz= e_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compr= essionParameters cParams); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const Z= STD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* s= rc, size_t srcSize); =20 /*! ZSTD_estimate?DictSize() : * ZSTD_estimateCDictSize() will bet that src size is relatively "small",= and content is copied, like ZSTD_createCDict(). * ZSTD_estimateCDictSize_advanced() makes it possible to control compres= sion parameters precisely, like ZSTD_createCDict_advanced(). * Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logica= lly smaller. 
*/ -ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compression= Level); -ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_c= ompressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); -ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMe= thod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize(size_t dictSize, int comp= ressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize,= ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod); +ZSTDLIB_STATIC_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dic= tLoadMethod_e dictLoadMethod); =20 /*! ZSTD_initStatic*() : * Initialize an object using a pre-allocated fixed-size buffer. @@ -1472,20 +1538,20 @@ ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t di= ctSize, ZSTD_dictLoadMethod_e * Limitation 2 : static cctx currently not compatible with multi-threadi= ng. * Limitation 3 : static dctx is incompatible with legacy support. 
*/ -ZSTDLIB_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t work= spaceSize); -ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t w= orkspaceSize); /*< same as ZSTD_initStaticCCtx() */ +ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size= _t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, s= ize_t workspaceSize); /*< same as ZSTD_initStaticCCtx() */ =20 -ZSTDLIB_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size_t work= spaceSize); -ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t w= orkspaceSize); /*< same as ZSTD_initStaticDCtx() */ +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_initStaticDCtx(void* workspace, size= _t workspaceSize); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, s= ize_t workspaceSize); /*< same as ZSTD_initStaticDCtx() */ =20 -ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict( +ZSTDLIB_STATIC_API const ZSTD_CDict* ZSTD_initStaticCDict( void* workspace, size_t workspaceS= ize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMeth= od, ZSTD_dictContentType_e dictContent= Type, ZSTD_compressionParameters cParams= ); =20 -ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict( +ZSTDLIB_STATIC_API const ZSTD_DDict* ZSTD_initStaticDDict( void* workspace, size_t workspaceS= ize, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMeth= od, @@ -1504,44 +1570,44 @@ static __attribute__((__unused__)) ZSTD_customMem const ZSTD_defaultCMem =3D { NULL, NULL, NULL }; /*< this = constant defers to stdlib's functions */ =20 -ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMe= m); -ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem custo= mMem); -ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMe= m); -ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem custo= mMem); +ZSTDLIB_STATIC_API ZSTD_CCtx* 
ZSTD_createCCtx_advanced(ZSTD_customMem c= ustomMem); +ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMe= m customMem); +ZSTDLIB_STATIC_API ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem c= ustomMem); +ZSTDLIB_STATIC_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMe= m customMem); =20 -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t= dictSize, +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict,= size_t dictSize, ZSTD_dictLoadMethod_e di= ctLoadMethod, ZSTD_dictContentType_e d= ictContentType, ZSTD_compressionParamete= rs cParams, ZSTD_customMem customMem= ); =20 -/* ! Thread pool : - * These prototypes make it possible to share a thread pool among multiple= compression contexts. - * This can limit resources for applications with multiple threads where e= ach one uses - * a threaded compression mode (via ZSTD_c_nbWorkers parameter). - * ZSTD_createThreadPool creates a new thread pool with a given number of = threads. - * Note that the lifetime of such pool must exist while being used. - * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL ar= gument value - * to use an internal thread pool). - * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. +/*! Thread pool : + * These prototypes make it possible to share a thread pool among multipl= e compression contexts. + * This can limit resources for applications with multiple threads where = each one uses + * a threaded compression mode (via ZSTD_c_nbWorkers parameter). + * ZSTD_createThreadPool creates a new thread pool with a given number of= threads. + * Note that the lifetime of such pool must exist while being used. + * ZSTD_CCtx_refThreadPool assigns a thread pool to a context (use NULL a= rgument value + * to use an internal thread pool). + * ZSTD_freeThreadPool frees a thread pool, accepts NULL pointer. 
*/ typedef struct POOL_ctx_s ZSTD_threadPool; -ZSTDLIB_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThreads); -ZSTDLIB_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* accept N= ULL pointer */ -ZSTDLIB_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_threadPoo= l* pool); +ZSTDLIB_STATIC_API ZSTD_threadPool* ZSTD_createThreadPool(size_t numThread= s); +ZSTDLIB_STATIC_API void ZSTD_freeThreadPool (ZSTD_threadPool* pool); /* a= ccept NULL pointer */ +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refThreadPool(ZSTD_CCtx* cctx, ZSTD_th= readPool* pool); =20 =20 /* * This API is temporary and is expected to change or disappear in the fut= ure! */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, const ZSTD_CCtx_params* cctxParams, ZSTD_customMem customMem); =20 -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_advanced( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -1558,28 +1624,22 @@ ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced( * As a consequence, `dictBuffer` **must** outlive CDict, * and its content must remain unmodified throughout the lifetime of CDic= t. * note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod= =3D=3DZSTD_dlm_byRef */ -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffe= r, size_t dictSize, int compressionLevel); - -/*! ZSTD_getDictID_fromCDict() : - * Provides the dictID of the dictionary loaded into `cdict`. - * If @return =3D=3D 0, the dictionary is not conformant to Zstandard spe= cification, or empty. - * Non-conformant dictionaries can still be loaded, but as content-only d= ictionaries. 
*/ -ZSTDLIB_API unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict* cdict); +ZSTDLIB_STATIC_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* di= ctBuffer, size_t dictSize, int compressionLevel); =20 /*! ZSTD_getCParams() : * @return ZSTD_compressionParameters structure for a selected compression= level and estimated srcSize. * `estimatedSrcSize` value is optional, select 0 if not known */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLeve= l, unsigned long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_getCParams(int compress= ionLevel, unsigned long long estimatedSrcSize, size_t dictSize); =20 /*! ZSTD_getParams() : * same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object= instead of sub-component `ZSTD_compressionParameters`. * All fields of `ZSTD_frameParameters` are set to default : contentSize= =3D1, checksum=3D0, noDictID=3D0 */ -ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned = long long estimatedSrcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_parameters ZSTD_getParams(int compressionLevel, un= signed long long estimatedSrcSize, size_t dictSize); =20 /*! ZSTD_checkCParams() : * Ensure param values remain within authorized range. * @return 0 on success, or an error code (can be checked with ZSTD_isErro= r()) */ -ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); +ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters par= ams); =20 /*! ZSTD_adjustCParams() : * optimize params for a given `srcSize` and `dictSize`. @@ -1587,23 +1647,25 @@ ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressi= onParameters params); * `dictSize` must be `0` when there is no dictionary. * cPar can be invalid : all parameters will be clamped within valid rang= e in the @return struct. 
* This function never fails (wide contract) */ -ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compression= Parameters cPar, unsigned long long srcSize, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_comp= ressionParameters cPar, unsigned long long srcSize, size_t dictSize); =20 /*! ZSTD_compress_advanced() : * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZST= D_CCtx_setParameter() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation w= arning on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2") +size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const void* dict,size_t dictSize, ZSTD_parameters params); =20 /*! ZSTD_compress_usingCDict_advanced() : - * Note : this function is now REDUNDANT. + * Note : this function is now DEPRECATED. * It can be replaced by ZSTD_compress2(), in combination with ZST= D_CCtx_loadDictionary() and other parameter setters. - * This prototype will be marked as deprecated and generate compilation w= arning in some future version */ -ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + * This prototype will generate compilation warnings. */ +ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, void* dst, size_t dstCapacit= y, const void* src, size_t srcSize, const ZSTD_CDict* cdict, @@ -1613,18 +1675,18 @@ ZSTDLIB_API size_t ZSTD_compress_usingCDict_advance= d(ZSTD_CCtx* cctx, /*! ZSTD_CCtx_loadDictionary_byReference() : * Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenc= ed, instead of being copied into CCtx. 
* It saves some memory, but also requires that `dict` outlives its usage= within `cctx` */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, c= onst void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* = cctx, const void* dict, size_t dictSize); =20 /*! ZSTD_CCtx_loadDictionary_advanced() : * Same as ZSTD_CCtx_loadDictionary(), but gives finer control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? force raw mode ? full mode only ?= ) */ -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, cons= t void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_d= ictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cct= x, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod,= ZSTD_dictContentType_e dictContentType); =20 /*! ZSTD_CCtx_refPrefix_advanced() : * Same as ZSTD_CCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) = ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const voi= d* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, co= nst void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType= ); =20 /* =3D=3D=3D experimental parameters =3D=3D=3D */ /* these parameters can be used with ZSTD_setParameter() @@ -1663,9 +1725,15 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD= _CCtx* cctx, const void* pre * See the comments on that enum for an explanation of the feature. */ #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 =20 -/* Controls how the literals are compressed (default is auto). - * The value must be of type ZSTD_literalCompressionMode_e. - * See ZSTD_literalCompressionMode_t enum definition for details. 
+/* Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed = literals + * may still be emitted if huffman is not beneficial to use.) + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * literals compression based on the compression parameters - specifically, + * negative compression levels do not use literal compression. */ #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 =20 @@ -1728,7 +1796,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_= CCtx* cctx, const void* pre * * Note that this means that the CDict tables can no longer be copied into= the * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be - * useable. The dictionary can only be attached or reloaded. + * usable. The dictionary can only be attached or reloaded. * * In general, you should expect compression to be faster--sometimes very = much * so--and CDict creation to be slightly slower. Eventually, we will proba= bly @@ -1817,12 +1885,55 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZST= D_CCtx* cctx, const void* pre */ #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 =20 +/* ZSTD_c_useBlockSplitter + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. + * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * block splitting based on the compression parameters. + */ +#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 + +/* ZSTD_c_useRowMatchFinder + * Controlled with ZSTD_paramSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. 
+ * + * By default, in ZSTD_ps_auto, the library will decide at runtime whether= to use + * the row-based matchfinder based on support for SIMD instructions and th= e window log. + * Note that this only pertains to compression strategies: greedy, lazy, a= nd lazy2 + */ +#define ZSTD_c_useRowMatchFinder ZSTD_c_experimentalParam14 + +/* ZSTD_c_deterministicRefPrefix + * Default is 0 =3D=3D disabled. Set to 1 to enable. + * + * Zstd produces different results for prefix compression when the prefix = is + * directly adjacent to the data about to be compressed vs. when it isn't. + * This is because zstd detects that the two buffers are contiguous and it= can + * use a more efficient match finding algorithm. However, this produces di= fferent + * results than when the two buffers are non-contiguous. This flag forces = zstd + * to always load the prefix in non-contiguous mode, even if it happens to= be + * adjacent to the data, to guarantee determinism. + * + * If you really care about determinism when using a dictionary or prefix, + * like when doing delta compression, you should select this option. It co= mes + * at a speed penalty of about ~2.5% if the dictionary and data happened t= o be + * contiguous, and is free if they weren't contiguous. We don't expect that + * intentionally making the dictionary and data contiguous will be worth t= he + * cost to memcpy() the data. + */ +#define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_c= Parameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZSTD_cPar= ameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_getParameter(const ZSTD_CCtx* cctx, ZS= TD_cParameter param, int* value); =20 =20 /*! 
ZSTD_CCtx_params : @@ -1842,27 +1953,27 @@ ZSTDLIB_API size_t ZSTD_CCtx_getParameter(const ZST= D_CCtx* cctx, ZSTD_cParameter * This can be used with ZSTD_estimateCCtxSize_advanced_usingCCtxParams() * for static allocation of CCtx for single-threaded compression. */ -ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); -ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); /* acce= pt NULL pointer */ +ZSTDLIB_STATIC_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void); +ZSTDLIB_STATIC_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params); = /* accept NULL pointer */ =20 /*! ZSTD_CCtxParams_reset() : * Reset params to default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params); =20 /*! ZSTD_CCtxParams_init() : * Initializes the compression parameters of cctxParams according to * compression level. All other parameters are reset to their default val= ues. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int = compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParam= s, int compressionLevel); =20 /*! ZSTD_CCtxParams_init_advanced() : * Initializes the compression and frame parameters of cctxParams accordi= ng to * params. All other parameters are reset to their default values. */ -ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxPar= ams, ZSTD_parameters params); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* = cctxParams, ZSTD_parameters params); =20 -/*! ZSTD_CCtxParams_setParameter() : +/*! ZSTD_CCtxParams_setParameter() : Requires v1.4.0+ * Similar to ZSTD_CCtx_setParameter. * Set one compression parameter, selected by enum ZSTD_cParameter. 
* Parameters must be applied to a ZSTD_CCtx using @@ -1870,14 +1981,14 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZS= TD_CCtx_params* cctxParams, Z * @result : a code representing success or failure (which can be tested w= ith * ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, = ZSTD_cParameter param, int value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* p= arams, ZSTD_cParameter param, int value); =20 /*! ZSTD_CCtxParams_getParameter() : * Similar to ZSTD_CCtx_getParameter. * Get the requested value of one compression parameter, selected by enum = ZSTD_cParameter. * @result : 0, or an error code (which can be tested with ZSTD_isError()). */ -ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_params* pa= rams, ZSTD_cParameter param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_CCtxParams_getParameter(const ZSTD_CCtx_par= ams* params, ZSTD_cParameter param, int* value); =20 /*! ZSTD_CCtx_setParametersUsingCCtxParams() : * Apply a set of ZSTD_CCtx_params to the compression context. @@ -1886,7 +1997,7 @@ ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(const= ZSTD_CCtx_params* params, * if nbWorkers>=3D1, new parameters will be picked up at next job, * with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jo= bSize, and overlapLog are not updated). */ -ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( +ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParametersUsingCCtxParams( ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params); =20 /*! ZSTD_compressStream2_simpleArgs() : @@ -1895,7 +2006,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxPa= rams( * This variant might be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. 
*/ -ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_compressStream2_simpleArgs ( ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos, @@ -1911,33 +2022,33 @@ ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs ( * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always= be 0. * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy = Support is enabled. * Note 3 : Skippable Frame Identifiers are considered valid. */ -ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size); +ZSTDLIB_STATIC_API unsigned ZSTD_isFrame(const void* buffer, size_t size); =20 /*! ZSTD_createDDict_byReference() : * Create a digested dictionary, ready to start decompression operation w= ithout startup delay. * Dictionary content is referenced, and therefore stays in dictBuffer. * It is important that dictBuffer outlives DDict, * it must remain read accessible throughout the lifetime of DDict */ -ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffe= r, size_t dictSize); +ZSTDLIB_STATIC_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* di= ctBuffer, size_t dictSize); =20 /*! ZSTD_DCtx_loadDictionary_byReference() : * Same as ZSTD_DCtx_loadDictionary(), * but references `dict` content instead of copying it into `dctx`. * This saves memory if `dict` remains around., * However, it's imperative that `dict` remains accessible (and unmodifie= d) while being used, so it must outlive decompression. */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, c= onst void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* = dctx, const void* dict, size_t dictSize); =20 /*! ZSTD_DCtx_loadDictionary_advanced() : * Same as ZSTD_DCtx_loadDictionary(), * but gives direct control over * how to load the dictionary (by copy ? by reference ?) * and how to interpret it (automatic ? 
force raw mode ? full mode only ?= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, cons= t void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_d= ictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dct= x, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod,= ZSTD_dictContentType_e dictContentType); =20 /*! ZSTD_DCtx_refPrefix_advanced() : * Same as ZSTD_DCtx_refPrefix(), but gives finer control over * how to interpret prefix content (automatic ? force raw mode (default) = ? full mode only ?) */ -ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const voi= d* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, co= nst void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType= ); =20 /*! ZSTD_DCtx_setMaxWindowSize() : * Refuses allocating internal buffers for frames requiring a window size= larger than provided limit. @@ -1946,14 +2057,14 @@ ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZST= D_DCtx* dctx, const void* pre * By default, a decompression context accepts all window sizes <=3D (1 <= < ZSTD_WINDOWLOG_LIMIT_DEFAULT) * @return : 0, or an error code (which can be tested using ZSTD_isError()= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxW= indowSize); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size= _t maxWindowSize); =20 /*! ZSTD_DCtx_getParameter() : * Get the requested decompression parameter value, selected by enum ZSTD= _dParameter, * and store it into int* value. * @return : 0, or an error code (which can be tested with ZSTD_isError()). 
*/ -ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter= param, int* value); +ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dPa= rameter param, int* value); =20 /* ZSTD_d_format * experimental parameter, @@ -2028,11 +2139,13 @@ ZSTDLIB_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx= * dctx, ZSTD_dParameter param =20 =20 /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). * Instruct the decoder context about what kind of data to decode next. * This instruction is mandatory to decode data without a fully-formed he= ader, * such ZSTD_f_zstd1_magicless for example. * @return : 0, or an error code (which can be tested using ZSTD_isError()= ). */ -ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e form= at); +ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); =20 /*! ZSTD_decompressStream_simpleArgs() : * Same as ZSTD_decompressStream(), @@ -2040,7 +2153,7 @@ ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dct= x, ZSTD_format_e format); * This can be helpful for binders from dynamic languages * which have troubles handling structures containing memory pointers. */ -ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( +ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, size_t* dstPos, const void* src, size_t srcSize, size_t* srcPos); @@ -2056,7 +2169,7 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs ( /*=3D=3D=3D=3D=3D Advanced Streaming compression functions =3D=3D=3D=3D= =3D*/ =20 /*! 
ZSTD_initCStream_srcSize() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLev= el); @@ -2065,15 +2178,15 @@ ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs= ( * pledgedSrcSize must be correct. If it is not known at init time, use * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older progr= ams, * "0" also disables frame content size field. It may be enabled in the fu= ture. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pledgedSrcSize); =20 /*! ZSTD_initCStream_usingDict() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLev= el); * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); @@ -2082,15 +2195,15 @@ ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, * dict =3D=3D NULL or dictSize < 8, in which case no dict is used. * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd diction= ary if * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm= _byCopy. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t -ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel); =20 /*! ZSTD_initCStream_advanced() : - * This function is deprecated, and is approximately equivalent to: + * This function is DEPRECATED, and is approximately equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * // Pseudocode: Set each zstd parameter and leave the rest as-is. * for ((param, value) : params) { @@ -2102,23 +2215,24 @@ ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy. * pledgedSrcSize must be correct. * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOW= N. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); =20 /*! ZSTD_initCStream_usingCDict() : - * This function is deprecated, and equivalent to: + * This function is DEPRECATED, and equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_refCDict(zcs, cdict); * * note : cdict will just be referenced, and must outlive compression sess= ion - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. 
*/ -ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZS= TD_CDict* cdict); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h fo= r detailed instructions") +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cd= ict); =20 /*! ZSTD_initCStream_usingCDict_advanced() : * This function is DEPRECATED, and is approximately equivalent to: @@ -2133,18 +2247,21 @@ ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD= _CStream* zcs, const ZSTD_CDi * same as ZSTD_initCStream_usingCDict(), with control over frame paramete= rs. * pledgedSrcSize must be correct. If srcSize is not known at init time, u= se * value ZSTD_CONTENTSIZE_UNKNOWN. - * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t -ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, +ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h fo= r detailed instructions") +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, const ZSTD_CDict* cdict, ZSTD_frameParameters fParams, unsigned long long pledgedSrcSize); =20 /*! ZSTD_resetCStream() : - * This function is deprecated, and is equivalent to: + * This function is DEPRECATED, and is equivalent to: * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * Note: ZSTD_resetCStream() interprets pledgedSrcSize =3D=3D 0 as ZSTD_CO= NTENTSIZE_UNKNOWN, but + * ZSTD_CCtx_setPledgedSrcSize() does not do the same, so ZSTD_CONTE= NTSIZE_UNKNOWN must be + * explicitly specified. * * start a new frame, using same parameters from previous frame. * This is typically useful to skip dictionary loading stage, since it wi= ll re-use it in-place. 
@@ -2154,9 +2271,10 @@ ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* z= cs, * For the time being, pledgedSrcSize=3D=3D0 is interpreted as "srcSize u= nknown" for compatibility with older programs, * but it will change to mean "empty" in future version, so use macro ZST= D_CONTENTSIZE_UNKNOWN instead. * @return : 0, or an error code (which can be tested using ZSTD_isError()) - * Note : this prototype will be marked as deprecated and generate compil= ation warnings on reaching v1.5.x + * This prototype will generate compilation warnings. */ -ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long= pledgedSrcSize); +ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions= ") +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcS= ize); =20 =20 typedef struct { @@ -2174,7 +2292,7 @@ typedef struct { * Note : (ingested - consumed) is amount of input data buffered internall= y, not yet compressed. * Aggregates progression inside active worker threads. */ -ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx= * cctx); +ZSTDLIB_STATIC_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZS= TD_CCtx* cctx); =20 /*! ZSTD_toFlushNow() : * Tell how many bytes are ready to be flushed immediately. @@ -2189,7 +2307,7 @@ ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgre= ssion(const ZSTD_CCtx* cctx * therefore flush speed is limited by production speed of oldest job * irrespective of the speed of concurrent (and newer) jobs. 
*/ -ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); =20 =20 /*=3D=3D=3D=3D=3D Advanced Streaming decompression functions =3D=3D=3D= =3D=3D*/ @@ -2203,7 +2321,7 @@ ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); * note: no dictionary will be used if dict =3D=3D NULL or dictSize < 8 * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const voi= d* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, co= nst void* dict, size_t dictSize); =20 /*! * This function is deprecated, and is equivalent to: @@ -2214,7 +2332,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DS= tream* zds, const void* dic * note : ddict is referenced, it must outlive decompression session * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZS= TD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, c= onst ZSTD_DDict* ddict); =20 /*! * This function is deprecated, and is equivalent to: @@ -2224,7 +2342,7 @@ ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_D= Stream* zds, const ZSTD_DDi * re-use decompression parameters from previous init; saves dictionary lo= ading * Note : this prototype will be marked as deprecated and generate compila= tion warnings on reaching v1.5.x */ -ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); +ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); =20 =20 /* ******************************************************************* @@ -2243,8 +2361,7 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zd= s); ZSTD_CCtx object can be re-used multiple times within successive compres= sion operations. =20 Start by initializing a context. 
- Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictiona= ry compression, - or ZSTD_compressBegin_advanced(), for finer parameter control. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictiona= ry compression. It's also possible to duplicate a reference context which has already be= en initialized, using ZSTD_copyCCtx() =20 Then, consume your input using ZSTD_compressContinue(). @@ -2267,17 +2384,19 @@ ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* = zds); */ =20 /*=3D=3D=3D=3D=3D Buffer-less streaming compression functions =3D=3D=3D= =3D=3D*/ -ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLeve= l); -ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const voi= d* dict, size_t dictSize, int compressionLevel); -ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void= * dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledged= SrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZS= TD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZS= TD_CDict* cdict); /*< note: fails if cdict=3D=3DNULL */ -ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const= cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, u= nsigned long long const pledgedSrcSize); /* compression parameters are al= ready set within cdict. pledgedSrcSize must be correct. 
If srcSize is not k= nown, use macro ZSTD_CONTENTSIZE_UNKNOWN */ -ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* prepare= dCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is = not known, use ZSTD_CONTENTSIZE_UNKNOWN */ - -ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_= t dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dst= Capacity, const void* src, size_t srcSize); - - +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compress= ionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, co= nst void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, c= onst ZSTD_CDict* cdict); /*< note: fails if cdict=3D=3DNULL */ +ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* = preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcS= ize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst= , size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, siz= e_t dstCapacity, const void* src, size_t srcSize); + +/* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_adv= anced() are now DEPRECATED and will generate a compiler warning */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size= _t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*= < pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSI= ZE_UNKNOWN */ +ZSTD_DEPRECATED("use advanced API to access custom parameters") +size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const= ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, 
unsigned long= long const pledgedSrcSize); /* compression parameters are already set wi= thin cdict. pledgedSrcSize must be correct. If srcSize is not known, use ma= cro ZSTD_CONTENTSIZE_UNKNOWN */ /* Buffer-less streaming decompression (synchronous mode) =20 @@ -2368,24 +2487,24 @@ typedef struct { * @return : 0, `zfhPtr` is correctly filled, * >0, `srcSize` is too small, value is wanted `srcSize` amount, * or an error code, which can be tested using ZSTD_isError() */ -ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const voi= d* src, size_t srcSize); /*< doesn't consume input */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, co= nst void* src, size_t srcSize); /*< doesn't consume input */ /*! ZSTD_getFrameHeader_advanced() : * same as ZSTD_getFrameHeader(), * with added capability to select a format (like ZSTD_f_zstd1_magicless)= */ -ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, = const void* src, size_t srcSize, ZSTD_format_e format); -ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSi= ze, unsigned long long frameContentSize); /*< when frame content size is n= ot known, pass in frameContentSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* z= fhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long w= indowSize, unsigned long long frameContentSize); /*< when frame content si= ze is not known, pass in frameContentSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN */ =20 -ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const v= oid* dict, size_t dictSize); -ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const = ZSTD_DDict* ddict); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t 
ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, = const void* dict, size_t dictSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx,= const ZSTD_DDict* ddict); =20 -ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); -ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, siz= e_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* d= st, size_t dstCapacity, const void* src, size_t srcSize); =20 /* misc */ -ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* prepare= dDCtx); +ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* = preparedDCtx); typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZS= TDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputT= ype_e; -ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx= ); =20 =20 =20 @@ -2422,10 +2541,10 @@ ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType= (ZSTD_DCtx* dctx); */ =20 /*=3D=3D=3D=3D=3D Raw zstd block functions =3D=3D=3D=3D=3D*/ -ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); -ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t= dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t= dstCapacity, const void* src, size_t srcSize); -ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* block= Start, size_t blockSize); /*< insert uncompressed block into `dctx` histor= y. Useful for multi-blocks decompression. 
*/ +ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst,= size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst,= size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void= * blockStart, size_t blockSize); /*< insert uncompressed block into `dctx`= history. Useful for multi-blocks decompression. */ =20 =20 #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h index 28248abe8612..feef3a1b1d60 100644 --- a/lib/zstd/common/bitstream.h +++ b/lib/zstd/common/bitstream.h @@ -313,7 +313,16 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(= size_t bitContainer, U32 c U32 const regMask =3D sizeof(bitContainer)*8 - 1; /* if start > regMask, bitstream is corrupted, and result is undefined= */ assert(nbBits < BIT_MASK_SIZE); + /* x86 transform & ((1 << nbBits) - 1) to bzhi instruction, it is bett= er + * than accessing memory. When bmi2 instruction is not present, we con= sider + * such cpus old (pre-Haswell, 2013) and their performance is not of t= hat + * importance. 
+ */ +#if defined(__x86_64__) || defined(_M_X86) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1= ); +#else return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +#endif } =20 MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, = U32 const nbBits) diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h index f5a9c70a228a..c42d39faf9bd 100644 --- a/lib/zstd/common/compiler.h +++ b/lib/zstd/common/compiler.h @@ -11,6 +11,8 @@ #ifndef ZSTD_COMPILER_H #define ZSTD_COMPILER_H =20 +#include "portability_macros.h" + /*-******************************************************* * Compiler specifics *********************************************************/ @@ -34,7 +36,7 @@ =20 /* On MSVC qsort requires that functions passed into it use the __cdecl cal= ling conversion(CC). - This explictly marks such functions as __cdecl so that the code will sti= ll compile + This explicitly marks such functions as __cdecl so that the code will st= ill compile if a CC other than __cdecl has been made the default. */ #define WIN_CDECL @@ -70,25 +72,13 @@ =20 =20 /* target attribute */ -#ifndef __has_attribute - #define __has_attribute(x) 0 /* Compatibility with non-clang compilers.= */ -#endif #define TARGET_ATTRIBUTE(target) __attribute__((__target__(target))) =20 -/* Enable runtime BMI2 dispatch based on the CPU. - * Enabled for clang & gcc >=3D4.8 on x86 when BMI2 isn't enabled by defau= lt. +/* Target attribute for BMI2 dynamic dispatch. + * Enable lzcnt, bmi, and bmi2. + * We test for bmi1 & bmi2. lzcnt is included in bmi1. 
*/ -#ifndef DYNAMIC_BMI2 - #if ((defined(__clang__) && __has_attribute(__target__)) \ - || (defined(__GNUC__) \ - && (__GNUC__ >=3D 5 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D= 8)))) \ - && (defined(__x86_64__) || defined(_M_X86)) \ - && !defined(__BMI2__) - # define DYNAMIC_BMI2 1 - #else - # define DYNAMIC_BMI2 0 - #endif -#endif +#define BMI2_TARGET_ATTRIBUTE TARGET_ATTRIBUTE("lzcnt,bmi,bmi2") =20 /* prefetch * can be disabled, by declaring NO_PREFETCH build macro */ @@ -115,8 +105,9 @@ } =20 /* vectorization - * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */ -#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__) + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, + * and some compilers, like Intel ICC and MCST LCC, do not support it at a= ll. */ +#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)= && !defined(__LCC__) # if (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >=3D 5) # define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize"))) # else @@ -134,20 +125,18 @@ #define LIKELY(x) (__builtin_expect((x), 1)) #define UNLIKELY(x) (__builtin_expect((x), 0)) =20 +#if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC_= _ > 4 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D 5))) +# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } +#else +# define ZSTD_UNREACHABLE { assert(0); } +#endif + /* disable warnings */ =20 /*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ =20 =20 -/* compat. with non-clang compilers */ -#ifndef __has_builtin -# define __has_builtin(x) 0 -#endif - -/* compat. with non-clang compilers */ -#ifndef __has_feature -# define __has_feature(x) 0 -#endif +/* compile time determination of SIMD support */ =20 /* C-language Attributes are added in C23. 
*/ #if defined(__STDC_VERSION__) && (__STDC_VERSION__ > 201710L) && defined(_= _has_c_attribute) @@ -168,10 +157,28 @@ */ #define ZSTD_FALLTHROUGH fallthrough =20 -/* detects whether we are being compiled under msan */ +/*-************************************************************** +* Alignment check +*****************************************************************/ + +/* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, + * which remains valid for both user & kernel spaces. + */ + +#ifndef ZSTD_ALIGNOF +/* covers gcc, clang & MSVC */ +/* note : this section must come first, before C11, + * due to a limitation in the kernel source generator */ +# define ZSTD_ALIGNOF(T) __alignof(T) + +#endif /* ZSTD_ALIGNOF */ =20 +/*-************************************************************** +* Sanitizer +*****************************************************************/ =20 -/* detects whether we are being compiled under asan */ =20 =20 #endif /* ZSTD_COMPILER_H */ diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_com= mon.c index 6353249de614..fef67056f052 100644 --- a/lib/zstd/common/entropy_common.c +++ b/lib/zstd/common/entropy_common.c @@ -212,7 +212,7 @@ static size_t FSE_readNCount_body_default( } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static size_t FSE_readNCount_body_bmi2( +BMI2_TARGET_ATTRIBUTE static size_t FSE_readNCount_body_bmi2( short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPt= r, const void* headerBuffer, size_t hbSize) { @@ -240,6 +240,7 @@ size_t FSE_readNCount( return FSE_readNCount_bmi2(normalizedCounter, maxSVPtr, tableLogPtr, h= eaderBuffer, hbSize, /* bmi2 */ 0); } =20 + /*! HUF_readStats() : Read compact Huffman tree, saved by HUF_writeCTable(). `huffWeight` is destination buffer. 
@@ -293,7 +294,7 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32= * rankStats, ZSTD_memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32)); weightTotal =3D 0; { U32 n; for (n=3D0; n=3D HUF_TABLELOG_MAX) return ERROR(corrupti= on_detected); + if (huffWeight[n] > HUF_TABLELOG_MAX) return ERROR(corruption_= detected); rankStats[huffWeight[n]]++; weightTotal +=3D (1 << huffWeight[n]) >> 1; } } @@ -331,7 +332,7 @@ static size_t HUF_readStats_body_default(BYTE* huffWeig= ht, size_t hwSize, U32* r } =20 #if DYNAMIC_BMI2 -static TARGET_ATTRIBUTE("bmi2") size_t HUF_readStats_body_bmi2(BYTE* huffW= eight, size_t hwSize, U32* rankStats, +static BMI2_TARGET_ATTRIBUTE size_t HUF_readStats_body_bmi2(BYTE* huffWeig= ht, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_privat= e.h index d14e686adf95..ca5101e542fa 100644 --- a/lib/zstd/common/error_private.h +++ b/lib/zstd/common/error_private.h @@ -18,8 +18,10 @@ /* **************************************** * Dependencies ******************************************/ -#include "zstd_deps.h" /* size_t */ #include /* enum list */ +#include "compiler.h" +#include "debug.h" +#include "zstd_deps.h" /* size_t */ =20 =20 /* **************************************** @@ -62,5 +64,82 @@ ERR_STATIC const char* ERR_getErrorName(size_t code) return ERR_getErrorString(ERR_getErrorCode(code)); } =20 +/* + * Ignore: this is an internal helper. + * + * This is a helper function to help force C99-correctness during compilat= ion. + * Under strict compilation modes, variadic macro arguments can't be empty. + * However, variadic function arguments can be. Using a function therefore= lets + * us statically check that at least one (string) argument was passed, + * independent of the compilation flags. 
+ */ +static INLINE_KEYWORD UNUSED_ATTR +void _force_has_format_string(const char *format, ...) { + (void)format; +} + +/* + * Ignore: this is an internal helper. + * + * We want to force this function invocation to be syntactically correct, = but + * we don't want to force runtime evaluation of its arguments. + */ +#define _FORCE_HAS_FORMAT_STRING(...) \ + if (0) { \ + _force_has_format_string(__VA_ARGS__); \ + } + +#define ERR_QUOTE(str) #str + +/* + * Return the specified error if the condition evaluates to true. + * + * In debug modes, prints additional information. + * In order to do that (particularly, printing the conditional that failed= ), + * this can't just wrap RETURN_ERROR(). + */ +#define RETURN_ERROR_IF(cond, err, ...) \ + if (cond) { \ + RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } + +/* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +#define RETURN_ERROR(err, ...) \ + do { \ + RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ + __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return ERROR(err); \ + } while(0); + +/* + * If the provided expression evaluates to an error code, returns that err= or code. + * + * In debug modes, prints additional information. + */ +#define FORWARD_IF_ERROR(err, ...) 
\ + do { \ + size_t const err_code =3D (err); \ + if (ERR_isError(err_code)) { \ + RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ + __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code= )); \ + _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ + RAWLOG(3, ": " __VA_ARGS__); \ + RAWLOG(3, "\n"); \ + return err_code; \ + } \ + } while(0); + =20 #endif /* ERROR_H_MODULE */ diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h index 0bb174c2c367..4507043b2287 100644 --- a/lib/zstd/common/fse.h +++ b/lib/zstd/common/fse.h @@ -333,8 +333,9 @@ size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned ch= ar symbolValue); /* FSE_buildCTable_wksp() : * Same as FSE_buildCTable(), but using an externally allocated scratch bu= ffer (`workSpace`). * `wkspSize` must be >=3D `FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolV= alue, tableLog)` of `unsigned`. + * See FSE_buildCTable_wksp() for breakdown of workspace usage. */ -#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (max= SymbolValue + 2 + (1ull << (tableLog - 2))) +#define FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog) (((m= axSymbolValue + 2) + (1ull << (tableLog)))/2 + sizeof(U64)/sizeof(U32) /* a= dditional 8 bytes for potential table overwrite */) #define FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) (sizeof(= unsigned) * FSE_BUILD_CTABLE_WORKSPACE_SIZE_U32(maxSymbolValue, tableLog)) size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter= , unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspS= ize); =20 diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompr= ess.c index 2c8bbe3e4c14..a0d06095be83 100644 --- a/lib/zstd/common/fse_decompress.c +++ b/lib/zstd/common/fse_decompress.c @@ -365,7 +365,7 @@ static size_t FSE_decompress_wksp_body_default(void* ds= t, size_t dstCapacity, co } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static size_t FSE_decompress_wksp_body_bmi2(void*= dst, size_t dstCapacity, 
const void* cSrc, size_t cSrcSize, unsigned maxLo= g, void* workSpace, size_t wkspSize) +BMI2_TARGET_ATTRIBUTE static size_t FSE_decompress_wksp_body_bmi2(void* ds= t, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, = void* workSpace, size_t wkspSize) { return FSE_decompress_wksp_body(dst, dstCapacity, cSrc, cSrcSize, maxL= og, workSpace, wkspSize, 1); } diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h index 88c5586646aa..5042ff870308 100644 --- a/lib/zstd/common/huf.h +++ b/lib/zstd/common/huf.h @@ -86,9 +86,9 @@ HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t ds= tCapacity, =20 /* HUF_compress4X_wksp() : * Same as HUF_compress2(), but uses externally allocated `workSpace`. - * `workspace` must have minimum alignment of 4, and be at least as large = as HUF_WORKSPACE_SIZE */ -#define HUF_WORKSPACE_SIZE ((6 << 10) + 256) -#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) + * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) +#define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tab= leLog, @@ -113,11 +113,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst,= size_t dstCapacity, =20 =20 /* *** Constants *** */ -#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (du= e to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (du= e to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ #define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none = specified */ #define HUF_SYMBOLVALUE_MAX 255 =20 -#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELO= G. 
Beyond that value, code does not work */ +#define HUF_TABLELOG_ABSOLUTEMAX 12 /* absolute limit of HUF_MAX_TABLELO= G. Beyond that value, code does not work */ #if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) # error "HUF_TABLELOG_MAX is too large !" #endif @@ -133,15 +133,11 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst,= size_t dstCapacity, =20 /* static allocation of HUF's Compression Table */ /* this is a private definition, just exposed for allocation and strict al= iasing purpose. never EVER access its members directly */ -struct HUF_CElt_s { - U16 val; - BYTE nbBits; -}; /* typedef'd to HUF_CElt */ -typedef struct HUF_CElt_s HUF_CElt; /* consider it an incomplete type */ -#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Us= e tables of U32, for proper alignment */ -#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymb= olValue) * sizeof(U32)) +typedef size_t HUF_CElt; /* consider it an incomplete type */ +#define HUF_CTABLE_SIZE_ST(maxSymbolValue) ((maxSymbolValue)+2) /* Use= tables of size_t, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_ST(maxSymbo= lValue) * sizeof(size_t)) #define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ - HUF_CElt name[HUF_CTABLE_SIZE_U32(maxSymbolValue)] /* no final ; */ + HUF_CElt name[HUF_CTABLE_SIZE_ST(maxSymbolValue)] /* no final ; */ =20 /* static allocation of HUF's DTable */ typedef U32 HUF_DTable; @@ -191,6 +187,7 @@ size_t HUF_buildCTable (HUF_CElt* CTable, const unsigne= d* count, unsigned maxSym size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTab= le, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* = CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t = workspaceSize); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable); +size_t 
HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* = count, unsigned maxSymbolValue); int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsi= gned maxSymbolValue); =20 @@ -203,12 +200,13 @@ typedef enum { * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat= !=3D HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat =3D HUF_repeat_none, and it sets hufTab= le to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to pote= ntially skip huffman coding */ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace= ` must be aligned on 4-bytes boundaries, `wkspSize` must be >=3D HUF_WORKSP= ACE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2, unsigned suspectUncompressible); =20 /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buff= er. 
@@ -246,11 +244,10 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hw= Size, * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, cons= t void* src, size_t srcSize, unsigned *hasZeroWeights); =20 -/* HUF_getNbBits() : +/* HUF_getNbBitsFromCTable() : * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed= <=3D HUF_SYMBOLVALUE_MAX - * Note 1 : is not inlined, as HUF_CElt definition is private - * Note 2 : const void* used, so that it can provide a statically allocat= ed table as argument (which uses type U32) */ -U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + * Note 1 : is not inlined, as HUF_CElt definition is private */ +U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); =20 /* * HUF_decompress() does the following: @@ -302,18 +299,20 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_= t maxDstSize, const void* c /* =3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D=3D */ =20 size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t = srcSize, unsigned maxSymbolValue, unsigned tableLog); -size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, si= ze_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, = size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPAC= E_SIZE_U32 unsigned */ +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, si= ze_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, = size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPAC= E_SIZE_U64 U64 */ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); /* HUF_compress1X_repeat() : * Same as HUF_compress1X_wksp(), but 
considers using hufTable if *repeat= !=3D HUF_repeat_none. * If it uses hufTable it does not modify hufTable or repeat. * If it doesn't, it sets *repeat =3D HUF_repeat_none, and it sets hufTab= le to the table used. - * If preferRepeat then the old table will always be used if valid. */ + * If preferRepeat then the old table will always be used if valid. + * If suspectUncompressible then some sampling checks will be run to pote= ntially skip huffman coding */ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, /*< `workSpace`= must be aligned on 4-bytes boundaries, `wkspSize` must be >=3D HUF_WORKSPA= CE_SIZE */ - HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2); + HUF_CElt* hufTable, HUF_repeat* repeat, int preferR= epeat, int bmi2, unsigned suspectUncompressible); =20 size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, siz= e_t cSrcSize); /* single-symbol decoder */ #ifndef HUF_FORCE_DECOMPRESS_X1 @@ -351,6 +350,9 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* d= ctx, void* dst, size_t ds #ifndef HUF_FORCE_DECOMPRESS_X2 size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, siz= e_t srcSize, void* workSpace, size_t wkspSize, int bmi2); #endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, siz= e_t srcSize, void* workSpace, size_t wkspSize, int bmi2); +#endif =20 #endif /* HUF_STATIC_LINKING_ONLY */ =20 diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h index dcdd586a9fd9..1d9cc03924ca 100644 --- a/lib/zstd/common/mem.h +++ b/lib/zstd/common/mem.h @@ -30,6 +30,8 @@ * Basic Types *****************************************************************/ typedef uint8_t BYTE; +typedef uint8_t U8; +typedef int8_t S8; typedef uint16_t U16; typedef int16_t S16; typedef uint32_t U32; diff --git a/lib/zstd/common/portability_macros.h 
b/lib/zstd/common/portabi= lity_macros.h new file mode 100644 index 000000000000..0e3b2c0a527d --- /dev/null +++ b/lib/zstd/common/portability_macros.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_PORTABILITY_MACROS_H +#define ZSTD_PORTABILITY_MACROS_H + +/* + * This header file contains macro defintions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * + * This header ONLY defines macros to detect platforms/feature support. + * + */ + + +/* compat. with non-clang compilers */ +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_builtin +# define __has_builtin(x) 0 +#endif + +/* compat. with non-clang compilers */ +#ifndef __has_feature +# define __has_feature(x) 0 +#endif + +/* detects whether we are being compiled under msan */ + +/* detects whether we are being compiled under asan */ + +/* detects whether we are being compiled under dfsan */ + +/* Mark the internal assembly functions as hidden */ +#ifdef __ELF__ +# define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func +#else +# define ZSTD_HIDE_ASM_FUNCTION(func) +#endif + +/* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=3D4.8 on x86 when BMI2 isn't enabled by defau= lt. 
+ */ +#ifndef DYNAMIC_BMI2 + #if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >=3D 5 || (__GNUC__ =3D=3D 4 && __GNUC_MINOR__ >=3D= 8)))) \ + && (defined(__x86_64__) || defined(_M_X64)) \ + && !defined(__BMI2__) + # define DYNAMIC_BMI2 1 + #else + # define DYNAMIC_BMI2 0 + #endif +#endif + +/* + * Only enable assembly for GNUC comptabile compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * + * Disable assembly when MSAN is enabled, because MSAN requires + * 100% of code to be instrumented to work. + */ +#define ZSTD_ASM_SUPPORTED 1 + +/* + * Determines whether we should enable assembly for x86-64 + * with BMI2. + * + * Enable if all of the following conditions hold: + * - ASM hasn't been explicitly disabled by defining ZSTD_DISABLE_ASM + * - Assembly is supported + * - We are compiling for x86-64 and either: + * - DYNAMIC_BMI2 is enabled + * - BMI2 is supported at compile time + */ +#define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + +#endif /* ZSTD_PORTABILITY_MACROS_H */ diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_interna= l.h index fc6f3a9b40c0..93305d9b41bb 100644 --- a/lib/zstd/common/zstd_internal.h +++ b/lib/zstd/common/zstd_internal.h @@ -20,6 +20,7 @@ * Dependencies ***************************************/ #include "compiler.h" +#include "cpu.h" #include "mem.h" #include "debug.h" /* assert, DEBUGLOG, RAWLOG, g_debuglev= el */ #include "error_private.h" @@ -47,81 +48,7 @@ #undef MAX #define MIN(a,b) ((a)<(b) ? (a) : (b)) #define MAX(a,b) ((a)>(b) ? (a) : (b)) - -/* - * Ignore: this is an internal helper. - * - * This is a helper function to help force C99-correctness during compilat= ion. - * Under strict compilation modes, variadic macro arguments can't be empty. - * However, variadic function arguments can be. 
Using a function therefore= lets - * us statically check that at least one (string) argument was passed, - * independent of the compilation flags. - */ -static INLINE_KEYWORD UNUSED_ATTR -void _force_has_format_string(const char *format, ...) { - (void)format; -} - -/* - * Ignore: this is an internal helper. - * - * We want to force this function invocation to be syntactically correct, = but - * we don't want to force runtime evaluation of its arguments. - */ -#define _FORCE_HAS_FORMAT_STRING(...) \ - if (0) { \ - _force_has_format_string(__VA_ARGS__); \ - } - -/* - * Return the specified error if the condition evaluates to true. - * - * In debug modes, prints additional information. - * In order to do that (particularly, printing the conditional that failed= ), - * this can't just wrap RETURN_ERROR(). - */ -#define RETURN_ERROR_IF(cond, err, ...) \ - if (cond) { \ - RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } - -/* - * Unconditionally return the specified error. - * - * In debug modes, prints additional information. - */ -#define RETURN_ERROR(err, ...) \ - do { \ - RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return ERROR(err); \ - } while(0); - -/* - * If the provided expression evaluates to an error code, returns that err= or code. - * - * In debug modes, prints additional information. - */ -#define FORWARD_IF_ERROR(err, ...) 
\ - do { \ - size_t const err_code =3D (err); \ - if (ERR_isError(err_code)) { \ - RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ - __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_cod= e)); \ - _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ - RAWLOG(3, ": " __VA_ARGS__); \ - RAWLOG(3, "\n"); \ - return err_code; \ - } \ - } while(0); +#define BOUNDED(min,val,max) (MAX(min,MIN(val,max))) =20 =20 /*-************************************* @@ -130,7 +57,6 @@ void _force_has_format_string(const char *format, ...) { #define ZSTD_OPT_NUM (1<<12) =20 #define ZSTD_REP_NUM 3 /* number of repcodes */ -#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) static UNUSED_ATTR const U32 repStartValue[ZSTD_REP_NUM] =3D { 1, 4, 8 }; =20 #define KB *(1 <<10) @@ -182,7 +108,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_= repeat } symbolEncodingTy /* Each table cannot take more than #symbols * FSELog bits */ #define ZSTD_MAX_FSE_HEADERS_SIZE (((MaxML + 1) * MLFSELog + (MaxLL + 1) *= LLFSELog + (MaxOff + 1) * OffFSELog + 7) / 8) =20 -static UNUSED_ATTR const U32 LL_bits[MaxLL+1] =3D { +static UNUSED_ATTR const U8 LL_bits[MaxLL+1] =3D { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 3, 3, @@ -199,7 +125,7 @@ static UNUSED_ATTR const S16 LL_defaultNorm[MaxLL+1] = =3D { #define LL_DEFAULTNORMLOG 6 /* for static allocation */ static UNUSED_ATTR const U32 LL_defaultNormLog =3D LL_DEFAULTNORMLOG; =20 -static UNUSED_ATTR const U32 ML_bits[MaxML+1] =3D { +static UNUSED_ATTR const U8 ML_bits[MaxML+1] =3D { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -234,12 +160,31 @@ static UNUSED_ATTR const U32 OF_defaultNormLog =3D OF= _DEFAULTNORMLOG; * Shared functions to include for inlining *********************************************/ static void ZSTD_copy8(void* dst, const void* src) { +#if defined(ZSTD_ARCH_ARM_NEON) + vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src)); +#else ZSTD_memcpy(dst, src, 8); +#endif } - #define COPY8(d,s) { 
ZSTD_copy8(d,s); d+=3D8; s+=3D8; } + +/* Need to use memmove here since the literal buffer can now be located wi= thin + the dst buffer. In circumstances where the op "catches up" to where the + literal buffer is, there can be partial overlaps in this call on the fi= nal + copy if the literal is being shifted by less than 16 bytes. */ static void ZSTD_copy16(void* dst, const void* src) { - ZSTD_memcpy(dst, src, 16); +#if defined(ZSTD_ARCH_ARM_NEON) + vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src)); +#elif defined(ZSTD_ARCH_X86_SSE2) + _mm_storeu_si128((__m128i*)dst, _mm_loadu_si128((const __m128i*)src)); +#elif defined(__clang__) + ZSTD_memmove(dst, src, 16); +#else + /* ZSTD_memmove is not inlined properly by gcc */ + BYTE copy16_buf[16]; + ZSTD_memcpy(copy16_buf, src, 16); + ZSTD_memcpy(dst, copy16_buf, 16); +#endif } #define COPY16(d,s) { ZSTD_copy16(d,s); d+=3D16; s+=3D16; } =20 @@ -267,8 +212,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_= t length, ZSTD_overlap_e BYTE* op =3D (BYTE*)dst; BYTE* const oend =3D op + length; =20 - assert(diff >=3D 8 || (ovtype =3D=3D ZSTD_no_overlap && diff <=3D -WIL= DCOPY_VECLEN)); - if (ovtype =3D=3D ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLE= N) { /* Handle short offset copies. */ do { @@ -331,11 +274,18 @@ typedef enum { * Private declarations *********************************************/ typedef struct seqDef_s { - U32 offset; /* Offset code of the sequence */ + U32 offBase; /* offBase =3D=3D Offset + ZSTD_REP_NUM, or repcode 1,2= ,3 */ U16 litLength; - U16 matchLength; + U16 mlBase; /* mlBase =3D=3D matchLength - MINMATCH */ } seqDef; =20 +/* Controls whether seqStore has a single "long" litLength or matchLength.= See seqStore_t. 
*/ +typedef enum { + ZSTD_llt_none =3D 0, /* no longLengthType */ + ZSTD_llt_literalLength =3D 1, /* represents a long literal */ + ZSTD_llt_matchLength =3D 2 /* represents a long match */ +} ZSTD_longLengthType_e; + typedef struct { seqDef* sequencesStart; seqDef* sequences; /* ptr to end of sequences */ @@ -347,12 +297,12 @@ typedef struct { size_t maxNbSeq; size_t maxNbLit; =20 - /* longLengthPos and longLengthID to allow us to represent either a si= ngle litLength or matchLength + /* longLengthPos and longLengthType to allow us to represent either a = single litLength or matchLength * in the seqStore that has a value larger than U16 (if it exists). To= do so, we increment * the existing value of the litLength or matchLength by 0x10000. */ - U32 longLengthID; /* 0 =3D=3D no longLength; 1 =3D=3D Represent th= e long literal; 2 =3D=3D Represent the long match; */ - U32 longLengthPos; /* Index of the sequence to apply long length mo= dification to */ + ZSTD_longLengthType_e longLengthType; + U32 longLengthPos; /* Index of the sequence to ap= ply long length modification to */ } seqStore_t; =20 typedef struct { @@ -362,18 +312,18 @@ typedef struct { =20 /* * Returns the ZSTD_sequenceLength for the given sequences. It handles the= decoding of long sequences - * indicated by longLengthPos and longLengthID, and adds MINMATCH back to = matchLength. + * indicated by longLengthPos and longLengthType, and adds MINMATCH back t= o matchLength. 
*/ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* se= qStore, seqDef const* seq) { ZSTD_sequenceLength seqLen; seqLen.litLength =3D seq->litLength; - seqLen.matchLength =3D seq->matchLength + MINMATCH; + seqLen.matchLength =3D seq->mlBase + MINMATCH; if (seqStore->longLengthPos =3D=3D (U32)(seq - seqStore->sequencesStar= t)) { - if (seqStore->longLengthID =3D=3D 1) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_literalLength) { seqLen.litLength +=3D 0xFFFF; } - if (seqStore->longLengthID =3D=3D 2) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_matchLength) { seqLen.matchLength +=3D 0xFFFF; } } @@ -419,6 +369,41 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, = dictBuilder, decodeCorpus } } =20 +/* + * Counts the number of trailing zeros of a `size_t`. + * Most compilers should support CTZ as a builtin. A backup + * implementation is provided if the builtin isn't supported, but + * it may not be terribly efficient. + */ +MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +{ + if (MEM_64bits()) { +# if (__GNUC__ >=3D 4) + return __builtin_ctzll((U64)val); +# else + static const int DeBruijnBytePos[64] =3D { 0, 1, 2, 7, 3,= 13, 8, 19, + 4, 25, 14, 28, 9, 3= 4, 20, 56, + 5, 17, 26, 54, 15, 4= 1, 29, 43, + 10, 31, 38, 35, 21, = 45, 49, 57, + 63, 6, 12, 18, 24, = 27, 33, 55, + 16, 53, 40, 42, 30, = 37, 44, 48, + 62, 11, 23, 32, 52, = 39, 36, 47, + 61, 22, 51, 46, 60, = 50, 59, 58 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218= A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if (__GNUC__ >=3D 3) + return __builtin_ctz((U32)val); +# else + static const int DeBruijnBytePos[32] =3D { 0, 1, 28, 2, 29,= 14, 24, 3, + 30, 22, 20, 15, 25, 1= 7, 4, 8, + 31, 27, 13, 23, 21, 1= 9, 16, 7, + 26, 12, 18, 6, 11, = 5, 10, 9 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)= ) >> 27]; +# endif + } +} + =20 /* ZSTD_invalidateRepCodes() : * ensures next compression will not use repcodes from 
previous block. @@ -445,6 +430,14 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcS= ize, size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, const void* src, size_t srcSize); =20 +/* + * @returns true iff the CPU supports dynamic BMI2 dispatch. + */ +MEM_STATIC int ZSTD_cpuSupportsBmi2(void) +{ + ZSTD_cpuid_t cpuid =3D ZSTD_cpuid(); + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); +} =20 =20 #endif /* ZSTD_CCOMMON_H_MODULE */ diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h new file mode 100644 index 000000000000..d9a76112ec3a --- /dev/null +++ b/lib/zstd/compress/clevels.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in= the + * LICENSE file in the root directory of this source tree) and the GPLv2 (= found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_CLEVELS_H +#define ZSTD_CLEVELS_H + +#define ZSTD_STATIC_LINKING_ONLY /* ZSTD_compressionParameters */ +#include + +/*-=3D=3D=3D=3D=3D Pre-defined compression levels =3D=3D=3D=3D=3D-*/ + +#define ZSTD_MAX_CLEVEL 22 + +__attribute__((__unused__)) + +static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MA= X_CLEVEL+1] =3D { +{ /* "default" - for any srcSize > 256 KB */ + /* W, C, H, S, L, TL, strat */ + { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels= */ + { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ + { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ + { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ + { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ + { 21, 18, 19, 3, 5, 2, ZSTD_greedy }, /* level 5 */ + { 21, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6 */ + { 21, 19, 20, 4, 5, 8, ZSTD_lazy }, /* level 7 */ + { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 8 */ + { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ + { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 10 */ + { 22, 21, 22, 6, 5, 16, ZSTD_lazy2 }, /* level 11 */ + { 22, 22, 23, 6, 5, 32, ZSTD_lazy2 }, /* level 12 */ + { 22, 22, 22, 4, 5, 32, ZSTD_btlazy2 }, /* level 13 */ + { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ + { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ + { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ + { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ + { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ + { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ + { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ + { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ + { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */ +}, +{ /* for srcSize <=3D 256 KB */ + /* W, C, H, S, L, T, strat */ + { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ + { 18, 16, 16, 1, 4, 0, ZSTD_dfast 
}, /* level 3 */ + { 18, 16, 17, 3, 5, 2, ZSTD_greedy }, /* level 4.*/ + { 18, 17, 18, 5, 5, 2, ZSTD_greedy }, /* level 5.*/ + { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ + { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ + { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ + { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ + { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ + { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ + { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <=3D 128 KB */ + /* W, C, H, S, L, T, strat */ + { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ + { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ + { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ + { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ + { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ + { 17, 16, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ + { 17, 16, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 17, 16, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ + { 17, 16, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ + { 17, 16, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ + { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ + { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ + { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ + { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ + { 17, 18, 17, 6, 3,256, ZSTD_btopt 
}, /* level 15.*/ + { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ + { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ + { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ + { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +{ /* for srcSize <=3D 16 KB */ + /* W, C, H, S, L, T, strat */ + { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ + { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ + { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ + { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ + { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ + { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ + { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ + { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ + { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ + { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ + { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ + { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ + { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ + { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ + { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ + { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ + { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ + { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ + { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ + { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ + { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ + { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ +}, +}; + + + +#endif /* ZSTD_CLEVELS_H */ diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compr= ess.c index 436985b620e5..ec5b1ca6d71a 100644 --- a/lib/zstd/compress/fse_compress.c +++ 
b/lib/zstd/compress/fse_compress.c @@ -75,13 +75,14 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, void* const FSCT =3D ((U32*)ptr) + 1 /* header */ + (tableLog ? tableS= ize>>1 : 1) ; FSE_symbolCompressionTransform* const symbolTT =3D (FSE_symbolCompress= ionTransform*) (FSCT); U32 const step =3D FSE_TABLESTEP(tableSize); + U32 const maxSV1 =3D maxSymbolValue+1; =20 - U32* cumul =3D (U32*)workSpace; - FSE_FUNCTION_TYPE* tableSymbol =3D (FSE_FUNCTION_TYPE*)(cumul + (maxSy= mbolValue + 2)); + U16* cumul =3D (U16*)workSpace; /* size =3D maxSV1 */ + FSE_FUNCTION_TYPE* const tableSymbol =3D (FSE_FUNCTION_TYPE*)(cumul + = (maxSV1+1)); /* size =3D tableSize */ =20 U32 highThreshold =3D tableSize-1; =20 - if ((size_t)workSpace & 3) return ERROR(GENERIC); /* Must be 4 byte al= igned */ + assert(((size_t)workSpace & 1) =3D=3D 0); /* Must be 2 bytes-aligned = */ if (FSE_BUILD_CTABLE_WORKSPACE_SIZE(maxSymbolValue, tableLog) > wkspSi= ze) return ERROR(tableLog_tooLarge); /* CTable header */ tableU16[-2] =3D (U16) tableLog; @@ -98,20 +99,61 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, /* symbol start positions */ { U32 u; cumul[0] =3D 0; - for (u=3D1; u <=3D maxSymbolValue+1; u++) { + for (u=3D1; u <=3D maxSV1; u++) { if (normalizedCounter[u-1]=3D=3D-1) { /* Low proba symbol */ cumul[u] =3D cumul[u-1] + 1; tableSymbol[highThreshold--] =3D (FSE_FUNCTION_TYPE)(u-1); } else { - cumul[u] =3D cumul[u-1] + normalizedCounter[u-1]; + assert(normalizedCounter[u-1] >=3D 0); + cumul[u] =3D cumul[u-1] + (U16)normalizedCounter[u-1]; + assert(cumul[u] >=3D cumul[u-1]); /* no overflow */ } } - cumul[maxSymbolValue+1] =3D tableSize+1; + cumul[maxSV1] =3D (U16)(tableSize+1); } =20 /* Spread symbols */ - { U32 position =3D 0; + if (highThreshold =3D=3D tableSize - 1) { + /* Case for no low prob count symbols. 
Lay down 8 bytes at a time + * to reduce branch misses since we are operating on a small block + */ + BYTE* const spread =3D tableSymbol + tableSize; /* size =3D tableS= ize + 8 (may write beyond tableSize) */ + { U64 const add =3D 0x0101010101010101ull; + size_t pos =3D 0; + U64 sv =3D 0; + U32 s; + for (s=3D0; s=3D0); + pos +=3D (size_t)n; + } + } + /* Spread symbols across the table. Lack of lowprob symbols means = that + * we don't need variable sized inner loop, so we can unroll the l= oop and + * reduce branch misses. + */ + { size_t position =3D 0; + size_t s; + size_t const unroll =3D 2; /* Experimentally determined optima= l unroll */ + assert(tableSize % unroll =3D=3D 0); /* FSE_MIN_TABLELOG is 5 = */ + for (s =3D 0; s < (size_t)tableSize; s +=3D unroll) { + size_t u; + for (u =3D 0; u < unroll; ++u) { + size_t const uPosition =3D (position + (u * step)) & t= ableMask; + tableSymbol[uPosition] =3D spread[s + u]; + } + position =3D (position + (unroll * step)) & tableMask; + } + assert(position =3D=3D 0); /* Must have initialized all posi= tions */ + } + } else { + U32 position =3D 0; U32 symbol; - for (symbol=3D0; symbol<=3DmaxSymbolValue; symbol++) { + for (symbol=3D0; symbol highThreshold) position =3D (position + step) & tableMask; /* Low p= roba area */ } } - assert(position=3D=3D0); /* Must have initialized all positions */ } =20 @@ -144,16 +185,17 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, case -1: case 1: symbolTT[s].deltaNbBits =3D (tableLog << 16) - (1< 1); + { U32 const maxBitsOut =3D tableLog - BIT_highbit32 ((U3= 2)normalizedCounter[s]-1); + U32 const minStatePlus =3D (U32)normalizedCounter[s] <= < maxBitsOut; symbolTT[s].deltaNbBits =3D (maxBitsOut << 16) - minSt= atePlus; - symbolTT[s].deltaFindState =3D total - normalizedCount= er[s]; - total +=3D normalizedCounter[s]; + symbolTT[s].deltaFindState =3D (int)(total - (unsigned= )normalizedCounter[s]); + total +=3D (unsigned)normalizedCounter[s]; } } } } =20 #if 0 /* debug : symbol costs 
*/ @@ -164,8 +206,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, symbol, normalizedCounter[symbol], FSE_getMaxNbBits(symbolTT, symbol), (double)FSE_bitCost(symbolTT, tableLog, symbol, 8) / 256); - } - } + } } #endif =20 return 0; @@ -173,16 +214,18 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, =20 =20 =20 - #ifndef FSE_COMMONDEFS_ONLY =20 - /*-************************************************************** * FSE NCount encoding ****************************************************************/ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) { - size_t const maxHeaderSize =3D (((maxSymbolValue+1) * tableLog) >> 3) = + 3; + size_t const maxHeaderSize =3D (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one ad= ditional bit each */) / 8) + + 1 /* round up to whole nb bytes */ + + 2 /* additional two bytes for bitstr= eam flush */; return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbol= Value=3D=3D0 ? 
use default */ } =20 diff --git a/lib/zstd/compress/huf_compress.c b/lib/zstd/compress/huf_compr= ess.c index f76a526bfa54..74ef0db47621 100644 --- a/lib/zstd/compress/huf_compress.c +++ b/lib/zstd/compress/huf_compress.c @@ -50,6 +50,28 @@ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_= t srcSize, unsigned maxS /* ******************************************************* * HUF : Huffman block compression *********************************************************/ +#define HUF_WORKSPACE_MAX_ALIGNMENT 8 + +static void* HUF_alignUpWorkspace(void* workspace, size_t* workspaceSizePt= r, size_t align) +{ + size_t const mask =3D align - 1; + size_t const rem =3D (size_t)workspace & mask; + size_t const add =3D (align - rem) & mask; + BYTE* const aligned =3D (BYTE*)workspace + add; + assert((align & (align - 1)) =3D=3D 0); /* pow 2 */ + assert(align <=3D HUF_WORKSPACE_MAX_ALIGNMENT); + if (*workspaceSizePtr >=3D add) { + assert(add < align); + assert(((size_t)aligned & mask) =3D=3D 0); + *workspaceSizePtr -=3D add; + return aligned; + } else { + *workspaceSizePtr =3D 0; + return NULL; + } +} + + /* HUF_compressWeights() : * Same as FSE_compress(), but dedicated to huff0's weights compression. * The use case needs much less stack memory. 
@@ -72,7 +94,7 @@ static size_t HUF_compressWeights(void* dst, size_t dstSi= ze, const void* weightT =20 unsigned maxSymbolValue =3D HUF_TABLELOG_MAX; U32 tableLog =3D MAX_FSE_TABLELOG_FOR_HUFF_HEADER; - HUF_CompressWeightsWksp* wksp =3D (HUF_CompressWeightsWksp*)workspace; + HUF_CompressWeightsWksp* wksp =3D (HUF_CompressWeightsWksp*)HUF_alignU= pWorkspace(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); =20 if (workspaceSize < sizeof(HUF_CompressWeightsWksp)) return ERROR(GENE= RIC); =20 @@ -103,6 +125,40 @@ static size_t HUF_compressWeights(void* dst, size_t ds= tSize, const void* weightT return (size_t)(op-ostart); } =20 +static size_t HUF_getNbBits(HUF_CElt elt) +{ + return elt & 0xFF; +} + +static size_t HUF_getNbBitsFast(HUF_CElt elt) +{ + return elt; +} + +static size_t HUF_getValue(HUF_CElt elt) +{ + return elt & ~0xFF; +} + +static size_t HUF_getValueFast(HUF_CElt elt) +{ + return elt; +} + +static void HUF_setNbBits(HUF_CElt* elt, size_t nbBits) +{ + assert(nbBits <=3D HUF_TABLELOG_ABSOLUTEMAX); + *elt =3D nbBits; +} + +static void HUF_setValue(HUF_CElt* elt, size_t value) +{ + size_t const nbBits =3D HUF_getNbBits(*elt); + if (nbBits > 0) { + assert((value >> nbBits) =3D=3D 0); + *elt |=3D value << (sizeof(HUF_CElt) * 8 - nbBits); + } +} =20 typedef struct { HUF_CompressWeightsWksp wksp; @@ -114,9 +170,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSi= ze, const HUF_CElt* CTable, unsigned maxSymbolValu= e, unsigned huffLog, void* workspace, size_t workspaceSize) { + HUF_CElt const* const ct =3D CTable + 1; BYTE* op =3D (BYTE*)dst; U32 n; - HUF_WriteCTableWksp* wksp =3D (HUF_WriteCTableWksp*)workspace; + HUF_WriteCTableWksp* wksp =3D (HUF_WriteCTableWksp*)HUF_alignUpWorkspa= ce(workspace, &workspaceSize, ZSTD_ALIGNOF(U32)); =20 /* check conditions */ if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); @@ -127,9 +184,10 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSi= ze, for (n=3D1; nbitsToWeight[n] =3D 
(BYTE)(huffLog + 1 - n); for (n=3D0; nhuffWeight[n] =3D wksp->bitsToWeight[CTable[n].nbBits]; + wksp->huffWeight[n] =3D wksp->bitsToWeight[HUF_getNbBits(ct[n])]; =20 /* attempt weights compression by FSE */ + if (maxDstSize < 1) return ERROR(dstSize_tooSmall); { CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, wksp->huf= fWeight, maxSymbolValue, &wksp->wksp, sizeof(wksp->wksp)) ); if ((hSize>1) & (hSize < maxSymbolValue/2)) { /* FSE compressed = */ op[0] =3D (BYTE)hSize; @@ -163,6 +221,7 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxS= ymbolValuePtr, const void U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; /* large enough for value= s from 0 to 16 */ U32 tableLog =3D 0; U32 nbSymbols =3D 0; + HUF_CElt* const ct =3D CTable + 1; =20 /* get symbol weights */ CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, r= ankVal, &nbSymbols, &tableLog, src, srcSize)); @@ -172,6 +231,8 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxS= ymbolValuePtr, const void if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooS= mall); =20 + CTable[0] =3D tableLog; + /* Prepare base value per rank */ { U32 n, nextRankStart =3D 0; for (n=3D1; n<=3DtableLog; n++) { @@ -183,13 +244,13 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* ma= xSymbolValuePtr, const void /* fill nbBits */ { U32 n; for (n=3D0; nn= =3DtableLog+1 */ U16 valPerRank[HUF_TABLELOG_MAX+2] =3D {0}; - { U32 n; for (n=3D0; n>=3D 1; } } /* assign value within rank, symbol order */ - { U32 n; for (n=3D0; n huffNode[i-1].count) { + return 0; + } + } + return 1; +} + +/* Insertion sort by descending order */ +HINT_INLINE void HUF_insertionSort(nodeElt huffNode[], int const low, int = const high) { + int i; + int const size =3D high-low+1; + huffNode +=3D low; + for (i =3D 1; i < size; ++i) { + nodeElt const key =3D huffNode[i]; + int j =3D i - 1; + while (j >=3D 0 && huffNode[j].count < key.count) { + 
huffNode[j + 1] =3D huffNode[j]; + j--; + } + huffNode[j + 1] =3D key; + } +} + +/* Pivot helper function for quicksort. */ +static int HUF_quickSortPartition(nodeElt arr[], int const low, int const = high) { + /* Simply select rightmost element as pivot. "Better" selectors like + * median-of-three don't experimentally appear to have any benefit. + */ + U32 const pivot =3D arr[high].count; + int i =3D low - 1; + int j =3D low; + for ( ; j < high; j++) { + if (arr[j].count > pivot) { + i++; + HUF_swapNodes(&arr[i], &arr[j]); + } + } + HUF_swapNodes(&arr[i + 1], &arr[high]); + return i + 1; +} + +/* Classic quicksort by descending with partially iterative calls + * to reduce worst case callstack size. + */ +static void HUF_simpleQuickSort(nodeElt arr[], int low, int high) { + int const kInsertionSortThreshold =3D 8; + if (high - low < kInsertionSortThreshold) { + HUF_insertionSort(arr, low, high); + return; + } + while (low < high) { + int const idx =3D HUF_quickSortPartition(arr, low, high); + if (idx - low < high - idx) { + HUF_simpleQuickSort(arr, low, idx - 1); + low =3D idx + 1; + } else { + HUF_simpleQuickSort(arr, idx + 1, high); + high =3D idx - 1; + } + } +} + /* * HUF_sort(): * Sorts the symbols [0, maxSymbolValue] by count[symbol] in decreasing or= der. + * This is a typical bucket sorting strategy that uses either quicksort or= insertion sort to sort each bucket. * * @param[out] huffNode Sorted symbols by decreasing count. Only mem= bers `.count` and `.byte` are filled. * Must have (maxSymbolValue + 1) entries. @@ -387,44 +544,52 @@ typedef struct { * @param[in] maxSymbolValue Maximum symbol value. * @param rankPosition This is a scratch workspace. Must have RANK_= POSITION_TABLE_SIZE entries. 
*/ -static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymb= olValue, rankPos* rankPosition) -{ - int n; - int const maxSymbolValue1 =3D (int)maxSymbolValue + 1; +static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const= maxSymbolValue, rankPos rankPosition[]) { + U32 n; + U32 const maxSymbolValue1 =3D maxSymbolValue+1; =20 /* Compute base and set curr to base. - * For symbol s let lowerRank =3D BIT_highbit32(count[n]+1) and rank = =3D lowerRank + 1. - * Then 2^lowerRank <=3D count[n]+1 <=3D 2^rank. + * For symbol s let lowerRank =3D HUF_getIndex(count[n]) and rank =3D = lowerRank + 1. + * See HUF_getIndex to see bucketing strategy. * We attribute each symbol to lowerRank's base value, because we want= to know where * each rank begins in the output, so for rank R we want to count rank= s R+1 and above. */ ZSTD_memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TAB= LE_SIZE); for (n =3D 0; n < maxSymbolValue1; ++n) { - U32 lowerRank =3D BIT_highbit32(count[n] + 1); + U32 lowerRank =3D HUF_getIndex(count[n]); + assert(lowerRank < RANK_POSITION_TABLE_SIZE - 1); rankPosition[lowerRank].base++; } + assert(rankPosition[RANK_POSITION_TABLE_SIZE - 1].base =3D=3D 0); + /* Set up the rankPosition table */ for (n =3D RANK_POSITION_TABLE_SIZE - 1; n > 0; --n) { rankPosition[n-1].base +=3D rankPosition[n].base; rankPosition[n-1].curr =3D rankPosition[n-1].base; } - /* Sort */ + + /* Insert each symbol into their appropriate bucket, setting up rankPo= sition table. */ for (n =3D 0; n < maxSymbolValue1; ++n) { U32 const c =3D count[n]; - U32 const r =3D BIT_highbit32(c+1) + 1; - U32 pos =3D rankPosition[r].curr++; - /* Insert into the correct position in the rank. - * We have at most 256 symbols, so this insertion should be fine. 
- */ - while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)= ) { - huffNode[pos] =3D huffNode[pos-1]; - pos--; - } + U32 const r =3D HUF_getIndex(c) + 1; + U32 const pos =3D rankPosition[r].curr++; + assert(pos < maxSymbolValue1); huffNode[pos].count =3D c; huffNode[pos].byte =3D (BYTE)n; } -} =20 + /* Sort each bucket. */ + for (n =3D RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABL= E_SIZE - 1; ++n) { + U32 const bucketSize =3D rankPosition[n].curr-rankPosition[n].base; + U32 const bucketStartIdx =3D rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); + HUF_simpleQuickSort(huffNode + bucketStartIdx, 0, bucketSize-1= ); + } + } + + assert(HUF_isSorted(huffNode, maxSymbolValue1)); +} =20 /* HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buff= er. @@ -487,6 +652,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymb= olValue) */ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffN= ode, int nonNullRank, U32 maxSymbolValue, U32 maxNbBits) { + HUF_CElt* const ct =3D CTable + 1; /* fill result into ctable (val, nbBits) */ int n; U16 nbPerRank[HUF_TABLELOG_MAX+1] =3D {0}; @@ -502,20 +668,20 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable,= nodeElt const* huffNode, i min >>=3D 1; } } for (n=3D0; nhuffNodeTbl; nodeElt* const huffNode =3D huffNode0+1; int nonNullRank; =20 /* safety checks */ - if (((size_t)workSpace & 3) !=3D 0) return ERROR(GENERIC); /* must be= aligned on 4-bytes boundaries */ if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) return ERROR(workSpace_tooSmall); if (maxNbBits =3D=3D 0) maxNbBits =3D HUF_TABLELOG_DEFAULT; @@ -533,99 +699,334 @@ size_t HUF_buildCTable_wksp (HUF_CElt* tree, const u= nsigned* count, U32 maxSymbo maxNbBits =3D HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fi= t into table */ =20 - 
HUF_buildCTableFromTree(tree, huffNode, nonNullRank, maxSymbolValue, m= axNbBits); + HUF_buildCTableFromTree(CTable, huffNode, nonNullRank, maxSymbolValue,= maxNbBits); =20 return maxNbBits; } =20 size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* = count, unsigned maxSymbolValue) { + HUF_CElt const* ct =3D CTable + 1; size_t nbBits =3D 0; int s; for (s =3D 0; s <=3D (int)maxSymbolValue; ++s) { - nbBits +=3D CTable[s].nbBits * count[s]; + nbBits +=3D HUF_getNbBits(ct[s]) * count[s]; } return nbBits >> 3; } =20 int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsi= gned maxSymbolValue) { + HUF_CElt const* ct =3D CTable + 1; int bad =3D 0; int s; for (s =3D 0; s <=3D (int)maxSymbolValue; ++s) { - bad |=3D (count[s] !=3D 0) & (CTable[s].nbBits =3D=3D 0); + bad |=3D (count[s] !=3D 0) & (HUF_getNbBits(ct[s]) =3D=3D 0); } return !bad; } =20 size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } =20 +/* HUF_CStream_t: + * Huffman uses its own BIT_CStream_t implementation. + * There are three major differences from BIT_CStream_t: + * 1. HUF_addBits() takes a HUF_CElt (size_t) which is + * the pair (nbBits, value) in the format: + * format: + * - Bits [0, 4) =3D nbBits + * - Bits [4, 64 - nbBits) =3D 0 + * - Bits [64 - nbBits, 64) =3D value + * 2. The bitContainer is built from the upper bits and + * right shifted. E.g. to add a new value of N bits + * you right shift the bitContainer by N, then or in + * the new value into the N upper bits. + * 3. The bitstream has two bit containers. You can add + * bits to the second container and merge them into + * the first container. + */ + +#define HUF_BITS_IN_CONTAINER (sizeof(size_t) * 8) + +typedef struct { + size_t bitContainer[2]; + size_t bitPos[2]; + + BYTE* startPtr; + BYTE* ptr; + BYTE* endPtr; +} HUF_CStream_t; + +/*! HUF_initCStream(): + * Initializes the bitstream. + * @returns 0 or an error code. 
+ */ +static size_t HUF_initCStream(HUF_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + ZSTD_memset(bitC, 0, sizeof(*bitC)); + bitC->startPtr =3D (BYTE*)startPtr; + bitC->ptr =3D bitC->startPtr; + bitC->endPtr =3D bitC->startPtr + dstCapacity - sizeof(bitC->bitContai= ner[0]); + if (dstCapacity <=3D sizeof(bitC->bitContainer[0])) return ERROR(dstSi= ze_tooSmall); + return 0; +} + +/*! HUF_addBits(): + * Adds the symbol stored in HUF_CElt elt to the bitstream. + * + * @param elt The element we're adding. This is a (nbBits, value) pair. + * See the HUF_CStream_t docs for the format. + * @param idx Insert into the bitstream at this idx. + * @param kFast This is a template parameter. If the bitstream is guarante= ed + * to have at least 4 unused bits after this call it may be 1, + * otherwise it must be 0. HUF_addBits() is faster when fast = is set. + */ +FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, = int idx, int kFast) +{ + assert(idx <=3D 1); + assert(HUF_getNbBits(elt) <=3D HUF_TABLELOG_ABSOLUTEMAX); + /* This is efficient on x86-64 with BMI2 because shrx + * only reads the low 6 bits of the register. The compiler + * knows this and elides the mask. When fast is set, + * every operation can use the same value loaded from elt. + */ + bitC->bitContainer[idx] >>=3D HUF_getNbBits(elt); + bitC->bitContainer[idx] |=3D kFast ? HUF_getValueFast(elt) : HUF_getVa= lue(elt); + /* We only read the low 8 bits of bitC->bitPos[idx] so it + * doesn't matter that the high bits have noise from the value. + */ + bitC->bitPos[idx] +=3D HUF_getNbBitsFast(elt); + assert((bitC->bitPos[idx] & 0xFF) <=3D HUF_BITS_IN_CONTAINER); + /* The last 4-bits of elt are dirty if fast is set, + * so we must not be overwriting bits that have already been + * inserted into the bit container. + */ +#if DEBUGLEVEL >=3D 1 + { + size_t const nbBits =3D HUF_getNbBits(elt); + size_t const dirtyBits =3D nbBits =3D=3D 0 ? 
0 : BIT_highbit32((U3= 2)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) =3D=3D 0); + /* We didn't overwrite any bits in the bit container. */ + assert(!kFast || (bitC->bitPos[idx] & 0xFF) <=3D HUF_BITS_IN_CONTA= INER); + (void)dirtyBits; + } +#endif +} + +FORCE_INLINE_TEMPLATE void HUF_zeroIndex1(HUF_CStream_t* bitC) +{ + bitC->bitContainer[1] =3D 0; + bitC->bitPos[1] =3D 0; +} + +/*! HUF_mergeIndex1() : + * Merges the bit container @ index 1 into the bit container @ index 0 + * and zeros the bit container @ index 1. + */ +FORCE_INLINE_TEMPLATE void HUF_mergeIndex1(HUF_CStream_t* bitC) +{ + assert((bitC->bitPos[1] & 0xFF) < HUF_BITS_IN_CONTAINER); + bitC->bitContainer[0] >>=3D (bitC->bitPos[1] & 0xFF); + bitC->bitContainer[0] |=3D bitC->bitContainer[1]; + bitC->bitPos[0] +=3D bitC->bitPos[1]; + assert((bitC->bitPos[0] & 0xFF) <=3D HUF_BITS_IN_CONTAINER); +} + +/*! HUF_flushBits() : +* Flushes the bits in the bit container @ index 0. +* +* @post bitPos will be < 8. +* @param kFast If kFast is set then we must know a-priori that +* the bit container will not overflow. +*/ +FORCE_INLINE_TEMPLATE void HUF_flushBits(HUF_CStream_t* bitC, int kFast) +{ + /* The upper bits of bitPos are noisy, so we must mask by 0xFF. */ + size_t const nbBits =3D bitC->bitPos[0] & 0xFF; + size_t const nbBytes =3D nbBits >> 3; + /* The top nbBits bits of bitContainer are the ones we need. */ + size_t const bitContainer =3D bitC->bitContainer[0] >> (HUF_BITS_IN_CO= NTAINER - nbBits); + /* Mask bitPos to account for the bytes we consumed. 
*/ + bitC->bitPos[0] &=3D 7; + assert(nbBits > 0); + assert(nbBits <=3D sizeof(bitC->bitContainer[0]) * 8); + assert(bitC->ptr <=3D bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitContainer); + bitC->ptr +=3D nbBytes; + assert(!kFast || bitC->ptr <=3D bitC->endPtr); + if (!kFast && bitC->ptr > bitC->endPtr) bitC->ptr =3D bitC->endPtr; + /* bitContainer doesn't need to be modified because the leftover + * bits are already the top bitPos bits. And we don't care about + * noise in the lower values. + */ +} + +/*! HUF_endMark() + * @returns The Huffman stream end mark: A 1-bit value =3D 1. + */ +static HUF_CElt HUF_endMark(void) +{ + HUF_CElt endMark; + HUF_setNbBits(&endMark, 1); + HUF_setValue(&endMark, 1); + return endMark; +} + +/*! HUF_closeCStream() : + * @return Size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +static size_t HUF_closeCStream(HUF_CStream_t* bitC) +{ + HUF_addBits(bitC, HUF_endMark(), /* idx */ 0, /* kFast */ 0); + HUF_flushBits(bitC, /* kFast */ 0); + { + size_t const nbBits =3D bitC->bitPos[0] & 0xFF; + if (bitC->ptr >=3D bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (nbBits > 0); + } +} + FORCE_INLINE_TEMPLATE void -HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTabl= e) +HUF_encodeSymbol(HUF_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTabl= e, int idx, int fast) { - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); + HUF_addBits(bitCPtr, CTable[symbol], idx, fast); } =20 -#define HUF_FLUSHBITS(s) BIT_flushBits(s) +FORCE_INLINE_TEMPLATE void +HUF_compress1X_usingCTable_internal_body_loop(HUF_CStream_t* bitC, + const BYTE* ip, size_t srcSize, + const HUF_CElt* ct, + int kUnroll, int kFastFlush, int kLastF= ast) +{ + /* Join to kUnroll */ + int n =3D (int)srcSize; + int rem =3D n % kUnroll; + if (rem > 0) { + for (; rem > 0; --rem) { + HUF_encodeSymbol(bitC, ip[--n], ct, 0, /* fast */ 0); + } + HUF_flushBits(bitC, kFastFlush); + } + 
assert(n % kUnroll =3D=3D 0); + + /* Join to 2 * kUnroll */ + if (n % (2 * kUnroll)) { + int u; + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, 0, 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, 0, kLastFast); + HUF_flushBits(bitC, kFastFlush); + n -=3D kUnroll; + } + assert(n % (2 * kUnroll) =3D=3D 0); + + for (; n>0; n-=3D 2 * kUnroll) { + /* Encode kUnroll symbols into the bitstream @ index 0. */ + int u; + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - u], ct, /* idx */ 0, /* fast */ = 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll], ct, /* idx */ 0, /* fast *= / kLastFast); + HUF_flushBits(bitC, kFastFlush); + /* Encode kUnroll symbols into the bitstream @ index 1. + * This allows us to start filling the bit container + * without any data dependencies. + */ + HUF_zeroIndex1(bitC); + for (u =3D 1; u < kUnroll; ++u) { + HUF_encodeSymbol(bitC, ip[n - kUnroll - u], ct, /* idx */ 1, /= * fast */ 1); + } + HUF_encodeSymbol(bitC, ip[n - kUnroll - kUnroll], ct, /* idx */ 1,= /* fast */ kLastFast); + /* Merge bitstream @ index 1 into the bitstream @ index 0 */ + HUF_mergeIndex1(bitC); + HUF_flushBits(bitC, kFastFlush); + } + assert(n =3D=3D 0); + +} =20 -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSH= BITS(stream) +/* + * Returns a tight upper bound on the output space needed by Huffman + * with 8 bytes buffer to handle over-writes. If the output is at least + * this large we don't need to do bounds checks during Huffman encoding. 
+ */ +static size_t HUF_tightCompressBound(size_t srcSize, size_t tableLog) +{ + return ((srcSize * tableLog) >> 3) + 8; +} =20 -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSH= BITS(stream) =20 FORCE_INLINE_TEMPLATE size_t HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { + U32 const tableLog =3D (U32)CTable[0]; + HUF_CElt const* ct =3D CTable + 1; const BYTE* ip =3D (const BYTE*) src; BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstSize; BYTE* op =3D ostart; - size_t n; - BIT_CStream_t bitC; + HUF_CStream_t bitC; =20 /* init */ if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr =3D BIT_initCStream(&bitC, op, (size_t)(oend-op= )); + { size_t const initErr =3D HUF_initCStream(&bitC, op, (size_t)(oend-op= )); if (HUF_isError(initErr)) return 0; } =20 - n =3D srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3: - HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - ZSTD_FALLTHROUGH; - case 2: - HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - ZSTD_FALLTHROUGH; - case 1: - HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - ZSTD_FALLTHROUGH; - case 0: ZSTD_FALLTHROUGH; - default: break; - } - - for (; n>0; n-=3D4) { /* note : n&3=3D=3D0 at this stage */ - HUF_encodeSymbol(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - HUF_encodeSymbol(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); - } - - return BIT_closeCStream(&bitC); + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tab= leLog > 11) + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, srcSize, = ct, /* kUnroll */ MEM_32bits() ? 
2 : 4, /* kFast */ 0, /* kLastFast */ 0); + else { + if (MEM_32bits()) { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: ZSTD_FALLTHROUGH; + case 9: ZSTD_FALLTHROUGH; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 2, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 7: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 3, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } else { + switch (tableLog) { + case 11: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 10: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 5, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + case 9: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 6, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 8: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 7, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 7: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 8, /* kFastFlush */ 1, /* kLastFast */ 0); + break; + case 6: ZSTD_FALLTHROUGH; + default: + HUF_compress1X_usingCTable_internal_body_loop(&bitC, ip, s= rcSize, ct, /* kUnroll */ 9, /* kFastFlush */ 1, /* kLastFast */ 1); + break; + } + } + } + assert(bitC.ptr <=3D bitC.endPtr); + + return HUF_closeCStream(&bitC); } =20 #if DYNAMIC_BMI2 =20 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) @@ -667,9 +1068,13 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t= dstSize, 
=20 size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, /* bmi2 */ 0); + return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTa= ble, /* bmi2 */ 0); } =20 +size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, bmi2); +} =20 static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, @@ -689,8 +1094,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, =20 assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart, (U16)cSize); op +=3D cSize; } @@ -698,8 +1102,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, ip +=3D segmentSize; assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+2, (U16)cSize); op +=3D cSize; } @@ -707,8 +1110,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, ip +=3D segmentSize; assert(op <=3D oend); { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, segmentSize, CTable, bmi2) ); - if (cSize=3D=3D0) return 0; - assert(cSize <=3D 65535); + if (cSize =3D=3D 0 || cSize > 65535) return 0; MEM_writeLE16(ostart+4, (U16)cSize); op +=3D cSize; } @@ -717,7 +1119,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t = dstSize, assert(op <=3D oend); assert(ip <=3D iend); { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(= oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); - if (cSize=3D=3D0) return 0; + if (cSize =3D=3D 0 || cSize > 65535) return 0; op +=3D cSize; } =20 @@ -726,7 +1128,12 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t= dstSize, =20 size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* s= rc, size_t srcSize, const HUF_CElt* CTable) { - return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, /* bmi2 */ 0); + return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTa= ble, /* bmi2 */ 0); +} + +size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const vo= id* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize,= CTable, bmi2); } =20 typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; @@ -750,35 +1157,38 @@ static size_t HUF_compressCTable_internal( =20 typedef struct { unsigned count[HUF_SYMBOLVALUE_MAX + 1]; - HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(HUF_SYMBOLVALUE_MAX)]; union { HUF_buildCTable_wksp_tables buildCTable_wksp; HUF_WriteCTableWksp writeCTable_wksp; + U32 hist_wksp[HIST_WKSP_SIZE_U32]; } wksps; } HUF_compress_tables_t; =20 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 +#define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >=3D 2 */ + /* HUF_compress_internal() : * `workSpace_align4` must be aligned on 4-bytes boundaries, - * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U32 unsign= ed */ + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsign= ed */ static size_t HUF_compress_internal (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, HUF_nbStreams_e nbStreams, - void* workSpace_align4, size_t wkspSize, + void* workSpace, size_t wkspSize, HUF_CElt* oldHufTable, HUF_repeat* repeat, int pref= erRepeat, - 
const int bmi2) + const int bmi2, unsigned suspectUncompressible) { - HUF_compress_tables_t* const table =3D (HUF_compress_tables_t*)workSpa= ce_align4; + HUF_compress_tables_t* const table =3D (HUF_compress_tables_t*)HUF_ali= gnUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstSize; BYTE* op =3D ostart; =20 - HUF_STATIC_ASSERT(sizeof(*table) <=3D HUF_WORKSPACE_SIZE); - assert(((size_t)workSpace_align4 & 3) =3D=3D 0); /* must be aligned = on 4-bytes boundaries */ + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <=3D HU= F_WORKSPACE_SIZE); =20 /* checks & inits */ - if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (wkspSize < sizeof(*table)) return ERROR(workSpace_tooSmall); if (!srcSize) return 0; /* Uncompressed */ if (!dstSize) return 0; /* cannot fit anything within dst budget */ if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* cur= rent block size limit */ @@ -794,8 +1204,23 @@ HUF_compress_internal (void* dst, size_t dstSize, nbStreams, oldHufTable, bmi2); } =20 + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >=3D 2); + if (suspectUncompressible && srcSize >=3D (SUSPECT_INCOMPRESSIBLE_SAMP= LE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal =3D 0; + { unsigned maxSymbolValueBegin =3D maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxS= ymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal +=3D largestBegin; + } + { unsigned maxSymbolValueEnd =3D maxSymbolValue; + CHECK_V_F(largestEnd, HIST_count_simple (table->count, &maxSym= bolValueEnd, (const BYTE*)src + srcSize - SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZ= E, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal +=3D largestEnd; + } + if (largestTotal <=3D ((2 * SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) >>= 7)+4) return 0; /* 
heuristic : probably not compressible enough */ + } + /* Scan input and build symbol stats */ - { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue,= (const BYTE*)src, srcSize, workSpace_align4, wkspSize) ); + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue,= (const BYTE*)src, srcSize, table->wksps.hist_wksp, sizeof(table->wksps.his= t_wksp)) ); if (largest =3D=3D srcSize) { *ostart =3D ((const BYTE*)src)[0]; r= eturn 1; } /* single symbol, rle */ if (largest <=3D (srcSize >> 7)+4) return 0; /* heuristic : prob= ably not compressible enough */ } @@ -820,9 +1245,12 @@ HUF_compress_internal (void* dst, size_t dstSize, &table->wksps.buildCTable_wksp= , sizeof(table->wksps.buildCTable_wksp)); CHECK_F(maxBits); huffLog =3D (U32)maxBits; - /* Zero unused symbols in CTable, so we can check it for validity = */ - ZSTD_memset(table->CTable + (maxSymbolValue + 1), 0, - sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_= CElt))); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { + size_t const ctableSize =3D HUF_CTABLE_SIZE_ST(maxSymbolValue); + size_t const unusedSize =3D sizeof(table->CTable) - ctableSize * s= izeof(HUF_CElt); + ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); } =20 /* Write table description header */ @@ -859,19 +1287,20 @@ size_t HUF_compress1X_wksp (void* dst, size_t dstSiz= e, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } =20 size_t HUF_compress1X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, + int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, 
maxSymbolValue, huffLog, HUF_singleStream, workSpace, wkspSize, hufTable, - repeat, preferRepeat, bmi2); + repeat, preferRepeat, bmi2, suspectUncomp= ressible); } =20 /* HUF_compress4X_repeat(): @@ -885,21 +1314,22 @@ size_t HUF_compress4X_wksp (void* dst, size_t dstSiz= e, return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - NULL, NULL, 0, 0 /*bmi2*/); + NULL, NULL, 0, 0 /*bmi2*/, 0); } =20 /* HUF_compress4X_repeat(): * compress input using 4 streams. + * consider skipping quickly * re-use an existing huffman compression table */ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned huffLog, void* workSpace, size_t wkspSize, - HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2) + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRe= peat, int bmi2, unsigned suspectUncompressible) { return HUF_compress_internal(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, HUF_fourStreams, workSpace, wkspSize, - hufTable, repeat, preferRepeat, bmi2); + hufTable, repeat, preferRepeat, bmi2, sus= pectUncompressible); } =20 diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_com= press.c index a4e916008b3a..f620cafca633 100644 --- a/lib/zstd/compress/zstd_compress.c +++ b/lib/zstd/compress/zstd_compress.c @@ -12,7 +12,6 @@ * Dependencies ***************************************/ #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ -#include "../common/cpu.h" #include "../common/mem.h" #include "hist.h" /* HIST_countFast_wksp */ #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ @@ -39,6 +38,18 @@ * Note that functions with explicit context such as ZSTD_compressCCtx() a= re unaffected. */ =20 +/*! + * ZSTD_HASHLOG3_MAX : + * Maximum size of the hash table dedicated to find 3-bytes matches, + * in log format, aka 17 =3D> 1 << 17 =3D=3D 128Ki positions. 
+ * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known = here. + * The actual (selected) size of the hash table is then stored in ZSTD_mat= chState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ +#ifndef ZSTD_HASHLOG3_MAX +# define ZSTD_HASHLOG3_MAX 17 +#endif =20 /*-************************************* * Helper functions @@ -69,6 +80,10 @@ struct ZSTD_CDict_s { ZSTD_customMem customMem; U32 dictID; int compressionLevel; /* 0 indicates that advanced API was used to sel= ect CDict params */ + ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict w= as created with params that would use + * row-based matchfinder. Unless= the cdict is reloaded, we will use + * the same greedy/lazy matchfin= der at compression time. + */ }; /* typedef'd to ZSTD_CDict within "zstd.h" */ =20 ZSTD_CCtx* ZSTD_createCCtx(void) @@ -81,7 +96,7 @@ static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem= memManager) assert(cctx !=3D NULL); ZSTD_memset(cctx, 0, sizeof(*cctx)); cctx->customMem =3D memManager; - cctx->bmi2 =3D ZSTD_cpuid_bmi2(ZSTD_cpuid()); + cctx->bmi2 =3D ZSTD_cpuSupportsBmi2(); { size_t const err =3D ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); assert(!ZSTD_isError(err)); (void)err; @@ -192,12 +207,64 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) /* private API call, for dictBuilder only */ const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->s= eqStore); } =20 +/* Returns true if the strategy supports using a row based matchfinder */ +static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + return (strategy >=3D ZSTD_greedy && strategy <=3D ZSTD_lazy2); +} + +/* Returns true if the strategy and useRowMatchFinder mode indicate that w= e will use the row based matchfinder + * for this compression. 
+ */ +static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZST= D_paramSwitch_e mode) { + assert(mode !=3D ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode =3D=3D ZSTD_ps_= enable); +} + +/* Returns row matchfinder usage given an initial mode and cParams */ +static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_= e mode, + const ZSTD_compre= ssionParameters* const cParams) { +#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) + int const kHasSIMD128 =3D 1; +#else + int const kHasSIMD128 =3D 0; +#endif + if (mode !=3D ZSTD_ps_auto) return mode; /* if requested enabled, but = no SIMD, we still will use row matchfinder */ + mode =3D ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; + if (kHasSIMD128) { + if (cParams->windowLog > 14) mode =3D ZSTD_ps_enable; + } else { + if (cParams->windowLog > 17) mode =3D ZSTD_ps_enable; + } + return mode; +} + +/* Returns block splitter usage (generally speaking, when using slower/str= onger compression modes) */ +static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e= mode, + const ZSTD_compres= sionParameters* const cParams) { + if (mode !=3D ZSTD_ps_auto) return mode; + return (cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 1= 7) ? ZSTD_ps_enable : ZSTD_ps_disable; +} + +/* Returns 1 if the arguments indicate that we should allocate a chainTabl= e, 0 otherwise */ +static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + const ZSTD_paramSwitch_e useRowMatchFin= der, + const U32 forDDSDict) { + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a match= state for a DDS dictionary matchstate. + * We do not allocate a chaintable if we are using ZSTD_fast, or are u= sing the row-based matchfinder. 
+ */ + return forDDSDict || ((strategy !=3D ZSTD_fast) && !ZSTD_rowMatchFinde= rUsed(strategy, useRowMatchFinder)); +} + /* Returns 1 if compression parameters are such that we should * enable long distance matching (wlog >=3D 27, strategy >=3D btopt). * Returns 0 otherwise. */ -static U32 ZSTD_CParams_shouldEnableLdm(const ZSTD_compressionParameters* = const cParams) { - return cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 27; +static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const c= Params) { + if (mode !=3D ZSTD_ps_auto) return mode; + return (cParams->strategy >=3D ZSTD_btopt && cParams->windowLog >=3D 2= 7) ? ZSTD_ps_enable : ZSTD_ps_disable; } =20 static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( @@ -208,15 +275,15 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParam= s( ZSTD_CCtxParams_init(&cctxParams, ZSTD_CLEVEL_DEFAULT); cctxParams.cParams =3D cParams; =20 - if (ZSTD_CParams_shouldEnableLdm(&cParams)) { - DEBUGLOG(4, "ZSTD_makeCCtxParamsFromCParams(): Including LDM into = cctx params"); - cctxParams.ldmParams.enableLdm =3D 1; - /* LDM is enabled by default for optimal parser and window size >= =3D 128MB */ + /* Adjust advanced params according to cParams */ + cctxParams.ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(cctxParams.ld= mParams.enableLdm, &cParams); + if (cctxParams.ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { ZSTD_ldm_adjustParameters(&cctxParams.ldmParams, &cParams); assert(cctxParams.ldmParams.hashLog >=3D cctxParams.ldmParams.buck= etSizeLog); assert(cctxParams.ldmParams.hashRateLog < 32); } - + cctxParams.useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(cctxPara= ms.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxPa= rams.useRowMatchFinder, &cParams); assert(!ZSTD_checkCParams(cParams)); return cctxParams; } @@ -275,6 +342,11 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_pa= rams* cctxParams, 
ZSTD_par * But, set it for tracing anyway. */ cctxParams->compressionLevel =3D compressionLevel; + cctxParams->useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxP= arams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(cctxPar= ams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(cctxParams->= ldmParams.enableLdm, ¶ms->cParams); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=3D%d, us= eBlockSplitter=3D%d ldm=3D%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitte= r, cctxParams->ldmParams.enableLdm); } =20 size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_pa= rameters params) @@ -431,9 +503,9 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) return bounds; =20 case ZSTD_c_literalCompressionMode: - ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_hu= ffman < ZSTD_lcm_uncompressed); - bounds.lowerBound =3D ZSTD_lcm_auto; - bounds.upperBound =3D ZSTD_lcm_uncompressed; + ZSTD_STATIC_ASSERT(ZSTD_ps_auto < ZSTD_ps_enable && ZSTD_ps_enable= < ZSTD_ps_disable); + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; return bounds; =20 case ZSTD_c_targetCBlockSize: @@ -462,6 +534,21 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter para= m) bounds.upperBound =3D 1; return bounds; =20 + case ZSTD_c_useBlockSplitter: + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound =3D (int)ZSTD_ps_auto; + bounds.upperBound =3D (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_deterministicRefPrefix: + bounds.lowerBound =3D 0; + bounds.upperBound =3D 1; + return bounds; + default: bounds.error =3D ERROR(parameter_unsupported); return bounds; @@ -523,6 +610,9 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter para= m) case ZSTD_c_stableOutBuffer: case 
ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: default: return 0; } @@ -575,6 +665,9 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cPa= rameter param, int value) case ZSTD_c_stableOutBuffer: case ZSTD_c_blockDelimiters: case ZSTD_c_validateSequences: + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: break; =20 default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); @@ -672,7 +765,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* C= CtxParams, } =20 case ZSTD_c_literalCompressionMode : { - const ZSTD_literalCompressionMode_e lcm =3D (ZSTD_literalCompressi= onMode_e)value; + const ZSTD_paramSwitch_e lcm =3D (ZSTD_paramSwitch_e)value; BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); CCtxParams->literalCompressionMode =3D lcm; return CCtxParams->literalCompressionMode; @@ -699,7 +792,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* C= CtxParams, return CCtxParams->enableDedicatedDictSearch; =20 case ZSTD_c_enableLongDistanceMatching : - CCtxParams->ldmParams.enableLdm =3D (value!=3D0); + CCtxParams->ldmParams.enableLdm =3D (ZSTD_paramSwitch_e)value; return CCtxParams->ldmParams.enableLdm; =20 case ZSTD_c_ldmHashLog : @@ -758,6 +851,21 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* = CCtxParams, CCtxParams->validateSequences =3D value; return CCtxParams->validateSequences; =20 + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); + CCtxParams->useBlockSplitter =3D (ZSTD_paramSwitch_e)value; + return CCtxParams->useBlockSplitter; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); + CCtxParams->useRowMatchFinder =3D (ZSTD_paramSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix 
=3D !!value; + return CCtxParams->deterministicRefPrefix; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -863,6 +971,15 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_validateSequences : *value =3D (int)CCtxParams->validateSequences; break; + case ZSTD_c_useBlockSplitter : + *value =3D (int)CCtxParams->useBlockSplitter; + break; + case ZSTD_c_useRowMatchFinder : + *value =3D (int)CCtxParams->useRowMatchFinder; + break; + case ZSTD_c_deterministicRefPrefix: + *value =3D (int)CCtxParams->deterministicRefPrefix; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -889,7 +1006,7 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( return 0; } =20 -ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned l= ong long pledgedSrcSize) +size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long ple= dgedSrcSize) { DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrc= Size); RETURN_ERROR_IF(cctx->streamStage !=3D zcss_init, stage_wrong, @@ -969,14 +1086,14 @@ size_t ZSTD_CCtx_loadDictionary_advanced( return 0; } =20 -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( +size_t ZSTD_CCtx_loadDictionary_byReference( ZSTD_CCtx* cctx, const void* dict, size_t dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); } =20 -ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* d= ict, size_t dictSize) +size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t = dictSize) { return ZSTD_CCtx_loadDictionary_advanced( cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); @@ -1146,7 +1263,7 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameter= s cPar, break; case ZSTD_cpm_createCDict: /* Assume a small source size when creating a dictionary - * with an unkown source size. + * with an unknown source size. 
*/ if (dictSize && srcSize =3D=3D ZSTD_CONTENTSIZE_UNKNOWN) srcSize =3D minSrcSize; @@ -1220,7 +1337,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxPar= ams( srcSizeHint =3D CCtxParams->srcSizeHint; } cParams =3D ZSTD_getCParams_internal(CCtxParams->compressionLevel, src= SizeHint, dictSize, mode); - if (CCtxParams->ldmParams.enableLdm) cParams.windowLog =3D ZSTD_LDM_DE= FAULT_WINDOW_LOG; + if (CCtxParams->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) cParams.win= dowLog =3D ZSTD_LDM_DEFAULT_WINDOW_LOG; ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); assert(!ZSTD_checkCParams(cParams)); /* srcSizeHint =3D=3D 0 means 0 */ @@ -1229,9 +1346,14 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxPa= rams( =20 static size_t ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, + const U32 enableDedicatedDictSearch, const U32 forCCtx) { - size_t const chainSize =3D (cParams->strategy =3D=3D ZSTD_fast) ? 0 : = ((size_t)1 << cParams->chainLog); + /* chain table size should be 0 for fast or row-hash strategies */ + size_t const chainSize =3D ZSTD_allocateChainTable(cParams->strategy, = useRowMatchFinder, enableDedicatedDictSearch && !forCCtx) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize =3D ((size_t)1) << cParams->hashLog; U32 const hashLog3 =3D (forCCtx && cParams->minMatch=3D=3D3) ? MIN(= ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size =3D hashLog3 ? ((size_t)1) << hashLog3 : 0; @@ -1241,43 +1363,53 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParame= ters* const cParams, + hSize * sizeof(U32) + h3Size * sizeof(U32); size_t const optPotentialSpace =3D - ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) - + ZSTD_cwksp_alloc_size((1<= strategy, useRowMatchFinder) + ? 
ZSTD_cwksp_aligned_alloc_siz= e(hSize*sizeof(U16)) + : 0; size_t const optSpace =3D (forCCtx && (cParams->strategy >=3D ZSTD_bto= pt)) ? optPotentialSpace : 0; + size_t const slackSpace =3D ZSTD_cwksp_slack_space_required(); + + /* tables are guaranteed to be sized in multiples of 64 bytes (or 16 u= int32_t) */ + ZSTD_STATIC_ASSERT(ZSTD_HASHLOG_MIN >=3D 4 && ZSTD_WINDOWLOG_MIN >=3D = 4 && ZSTD_CHAINLOG_MIN >=3D 4); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", (U32)chainSize, (U32)hSize, (U32)h3Size); - return tableSpace + optSpace; + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; } =20 static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( const ZSTD_compressionParameters* cParams, const ldmParams_t* ldmParams, const int isStatic, + const ZSTD_paramSwitch_e useRowMatchFinder, const size_t buffInSize, const size_t buffOutSize, const U64 pledgedSrcSize) { - size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << cParams->win= dowLog), pledgedSrcSize)); + size_t const windowSize =3D (size_t) BOUNDED(1ULL, 1ULL << cParams->wi= ndowLog, pledgedSrcSize); size_t const blockSize =3D MIN(ZSTD_BLOCKSIZE_MAX, windowSize); U32 const divider =3D (cParams->minMatch=3D=3D3) ? 
3 : 4; size_t const maxNbSeq =3D blockSize / divider; size_t const tokenSpace =3D ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH = + blockSize) - + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqD= ef)) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * siz= eof(seqDef)) + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(= BYTE)); size_t const entropySpace =3D ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_= SIZE); size_t const blockStateSpace =3D 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD= _compressedBlockState_t)); - size_t const matchStateSize =3D ZSTD_sizeof_matchState(cParams, /* for= CCtx */ 1); + size_t const matchStateSize =3D ZSTD_sizeof_matchState(cParams, useRow= MatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); =20 size_t const ldmSpace =3D ZSTD_ldm_getTableSize(*ldmParams); size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSiz= e); - size_t const ldmSeqSpace =3D ldmParams->enableLdm ? - ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + size_t const ldmSeqSpace =3D ldmParams->enableLdm =3D=3D ZSTD_ps_enabl= e ? + ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; =20 =20 size_t const bufferSpace =3D ZSTD_cwksp_alloc_size(buffInSize) @@ -1303,19 +1435,32 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const = ZSTD_CCtx_params* params) { ZSTD_compressionParameters const cParams =3D ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNK= NOWN, 0, ZSTD_cpm_noAttachDict); + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatchFin= derMode(params->useRowMatchFinder, + = &cParams); =20 RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is= supported for single-threaded compression only."); /* estimateCCtxSize is for one-shot compression. So no buffers should * be needed. However, we still allocate two 0-sized buffers, which can * take space under ASAN. 
*/ return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CON= TENTSIZE_UNKNOWN); } =20 size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cPara= ms) { - ZSTD_CCtx_params const params =3D ZSTD_makeCCtxParamsFromCParams(cPara= ms); - return ZSTD_estimateCCtxSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams =3D ZSTD_makeCCtxParamsFromCParams(cPar= ams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for gr= eedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder =3D ZSTD_ps_disable; + noRowCCtxSize =3D ZSTD_estimateCCtxSize_usingCCtxParams(&initialPa= rams); + initialParams.useRowMatchFinder =3D ZSTD_ps_enable; + rowCCtxSize =3D ZSTD_estimateCCtxSize_usingCCtxParams(&initialPara= ms); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCCtxSize_usingCCtxParams(&initialParams); + } } =20 static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) @@ -1355,17 +1500,29 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(con= st ZSTD_CCtx_params* params) size_t const outBuffSize =3D (params->outBufferMode =3D=3D ZSTD_bm= _buffered) ? 
ZSTD_compressBound(blockSize) + 1 : 0; + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatc= hFinderMode(params->useRowMatchFinder, ¶ms->cParams); =20 return ZSTD_estimateCCtxSize_usingCCtxParams_internal( - &cParams, ¶ms->ldmParams, 1, inBuffSize, outBuffSize, + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize= , outBuffSize, ZSTD_CONTENTSIZE_UNKNOWN); } } =20 size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cP= arams) { - ZSTD_CCtx_params const params =3D ZSTD_makeCCtxParamsFromCParams(cPara= ms); - return ZSTD_estimateCStreamSize_usingCCtxParams(¶ms); + ZSTD_CCtx_params initialParams =3D ZSTD_makeCCtxParamsFromCParams(cPar= ams); + if (ZSTD_rowMatchFinderSupported(cParams.strategy)) { + /* Pick bigger of not using and using row-based matchfinder for gr= eedy and lazy strategies */ + size_t noRowCCtxSize; + size_t rowCCtxSize; + initialParams.useRowMatchFinder =3D ZSTD_ps_disable; + noRowCCtxSize =3D ZSTD_estimateCStreamSize_usingCCtxParams(&initia= lParams); + initialParams.useRowMatchFinder =3D ZSTD_ps_enable; + rowCCtxSize =3D ZSTD_estimateCStreamSize_usingCCtxParams(&initialP= arams); + return MAX(noRowCCtxSize, rowCCtxSize); + } else { + return ZSTD_estimateCStreamSize_usingCCtxParams(&initialParams); + } } =20 static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) @@ -1480,20 +1637,27 @@ typedef enum { ZSTD_resetTarget_CCtx } ZSTD_resetTarget_e; =20 + static size_t ZSTD_reset_matchState(ZSTD_matchState_t* ms, ZSTD_cwksp* ws, const ZSTD_compressionParameters* cParams, + const ZSTD_paramSwitch_e useRowMatchFinder, const ZSTD_compResetPolicy_e crp, const ZSTD_indexResetPolicy_e forceResetIndex, const ZSTD_resetTarget_e forWho) { - size_t const chainSize =3D (cParams->strategy =3D=3D ZSTD_fast) ? 
0 : = ((size_t)1 << cParams->chainLog); + /* disable chain table allocation for fast or row-based strategies */ + size_t const chainSize =3D ZSTD_allocateChainTable(cParams->strategy, = useRowMatchFinder, + ms->dedicatedDictSear= ch && (forWho =3D=3D ZSTD_resetTarget_CDict)) + ? ((size_t)1 << cParams->chainLog) + : 0; size_t const hSize =3D ((size_t)1) << cParams->hashLog; U32 const hashLog3 =3D ((forWho =3D=3D ZSTD_resetTarget_CCtx) && cP= arams->minMatch=3D=3D3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; size_t const h3Size =3D hashLog3 ? ((size_t)1) << hashLog3 : 0; =20 DEBUGLOG(4, "reset indices : %u", forceResetIndex =3D=3D ZSTDirp_reset= ); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); if (forceResetIndex =3D=3D ZSTDirp_reset) { ZSTD_window_init(&ms->window); ZSTD_cwksp_mark_tables_dirty(ws); @@ -1532,11 +1696,23 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, ms->opt.priceTable =3D (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned= (ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); } =20 + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { + { /* Row match finder needs an additional table of hashes ("tags= ") */ + size_t const tagTableSize =3D hSize*sizeof(U16); + ms->tagTable =3D (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTable= Size); + if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog =3D BOUNDED(4, cParams->searchLog, 6); + assert(cParams->hashLog >=3D rowLog); + ms->rowHashLog =3D cParams->hashLog - rowLog; + } + } + ms->cParams =3D *cParams; =20 RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, "failed a workspace allocation in ZSTD_reset_matchStat= e"); - return 0; } =20 @@ -1553,61 +1729,87 @@ static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOV= ERFLOW_MARGIN); } =20 +/* ZSTD_dictTooBig(): + * When dictionaries are larger than ZSTD_CHUNKSIZE_MAX they 
can't be load= ed in + * one go generically. So we ensure that in that case we reset the tables = to zero, + * so that we can load as much of the dictionary as possible. + */ +static int ZSTD_dictTooBig(size_t const loadedDictSize) +{ + return loadedDictSize > ZSTD_CHUNKSIZE_MAX; +} + /*! ZSTD_resetCCtx_internal() : - note : `params` are assumed fully validated at this stage */ + * @param loadedDictSize The size of the dictionary to be loaded + * into the context, if any. If no dictionary is used, or the + * dictionary is being attached / copied, then pass 0. + * note : `params` are assumed fully validated at this stage. + */ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, - ZSTD_CCtx_params params, + ZSTD_CCtx_params const* params, U64 const pledgedSrcSize, + size_t const loadedDictSize, ZSTD_compResetPolicy_e const crp, ZSTD_buffered_policy_e const zbuff) { ZSTD_cwksp* const ws =3D &zc->workspace; - DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=3D%u, wlog=3D%u", - (U32)pledgedSrcSize, params.cParams.windowLog); - assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=3D%u, wlog=3D%u, = useRowMatchFinder=3D%d useBlockSplitter=3D%d", + (U32)pledgedSrcSize, params->cParams.windowLog, (int)param= s->useRowMatchFinder, (int)params->useBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); =20 zc->isFirstBlock =3D 1; =20 - if (params.ldmParams.enableLdm) { + /* Set applied params early so we can modify them for LDM, + * and point params at the applied params. 
+ */ + zc->appliedParams =3D *params; + params =3D &zc->appliedParams; + + assert(params->useRowMatchFinder !=3D ZSTD_ps_auto); + assert(params->useBlockSplitter !=3D ZSTD_ps_auto); + assert(params->ldmParams.enableLdm !=3D ZSTD_ps_auto); + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* Adjust long distance matching parameters */ - ZSTD_ldm_adjustParameters(¶ms.ldmParams, ¶ms.cParams); - assert(params.ldmParams.hashLog >=3D params.ldmParams.bucketSizeLo= g); - assert(params.ldmParams.hashRateLog < 32); + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->c= Params); + assert(params->ldmParams.hashLog >=3D params->ldmParams.bucketSize= Log); + assert(params->ldmParams.hashRateLog < 32); } =20 - { size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << params.c= Params.windowLog), pledgedSrcSize)); + { size_t const windowSize =3D MAX(1, (size_t)MIN(((U64)1 << params->= cParams.windowLog), pledgedSrcSize)); size_t const blockSize =3D MIN(ZSTD_BLOCKSIZE_MAX, windowSize); - U32 const divider =3D (params.cParams.minMatch=3D=3D3) ? 3 : 4; + U32 const divider =3D (params->cParams.minMatch=3D=3D3) ? 3 : 4; size_t const maxNbSeq =3D blockSize / divider; - size_t const buffOutSize =3D (zbuff =3D=3D ZSTDb_buffered && param= s.outBufferMode =3D=3D ZSTD_bm_buffered) + size_t const buffOutSize =3D (zbuff =3D=3D ZSTDb_buffered && param= s->outBufferMode =3D=3D ZSTD_bm_buffered) ? ZSTD_compressBound(blockSize) + 1 : 0; - size_t const buffInSize =3D (zbuff =3D=3D ZSTDb_buffered && params= .inBufferMode =3D=3D ZSTD_bm_buffered) + size_t const buffInSize =3D (zbuff =3D=3D ZSTDb_buffered && params= ->inBufferMode =3D=3D ZSTD_bm_buffered) ? 
windowSize + blockSize : 0; - size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(params.ldmParams= , blockSize); + size_t const maxNbLdmSeq =3D ZSTD_ldm_getMaxNbSeq(params->ldmParam= s, blockSize); =20 int const indexTooClose =3D ZSTD_indexTooCloseToMax(zc->blockState= .matchState.window); + int const dictTooBig =3D ZSTD_dictTooBig(loadedDictSize); ZSTD_indexResetPolicy_e needsIndexReset =3D - (!indexTooClose && zc->initialized) ? ZSTDirp_continue : ZSTDi= rp_reset; + (indexTooClose || dictTooBig || !zc->initialized) ? ZSTDirp_re= set : ZSTDirp_continue; =20 size_t const neededSpace =3D ZSTD_estimateCCtxSize_usingCCtxParams_internal( - ¶ms.cParams, ¶ms.ldmParams, zc->staticSize !=3D 0, + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize !=3D = 0, params->useRowMatchFinder, buffInSize, buffOutSize, pledgedSrcSize); + int resizeWorkspace; + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); =20 if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); =20 - /* Check if workspace is large enough, alloc a new one if needed */ - { + { /* Check if workspace is large enough, alloc a new one if need= ed */ int const workspaceTooSmall =3D ZSTD_cwksp_sizeof(ws) < needed= Space; int const workspaceWasteful =3D ZSTD_cwksp_check_wasteful(ws, = neededSpace); - + resizeWorkspace =3D workspaceTooSmall || workspaceWasteful; DEBUGLOG(4, "Need %zu B workspace", neededSpace); DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, bl= ockSize); =20 - if (workspaceTooSmall || workspaceWasteful) { + if (resizeWorkspace) { DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", ZSTD_cwksp_sizeof(ws) >> 10, neededSpace >> 10); @@ -1629,14 +1831,13 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->blockState.nextCBlock =3D (ZSTD_compressedBlockState_t= *) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); RETURN_ERROR_IF(zc->blockState.nextCBlock =3D=3D NULL, mem= ory_allocation, "couldn't allocate nextCBlock"); zc->entropyWorkspace =3D (U32*) 
ZSTD_cwksp_reserve_object(= ws, ENTROPY_WORKSPACE_SIZE); - RETURN_ERROR_IF(zc->blockState.nextCBlock =3D=3D NULL, mem= ory_allocation, "couldn't allocate entropyWorkspace"); + RETURN_ERROR_IF(zc->entropyWorkspace =3D=3D NULL, memory_a= llocation, "couldn't allocate entropyWorkspace"); } } =20 ZSTD_cwksp_clear(ws); =20 /* init params */ - zc->appliedParams =3D params; - zc->blockState.matchState.cParams =3D params.cParams; + zc->blockState.matchState.cParams =3D params->cParams; zc->pledgedSrcSizePlusOne =3D pledgedSrcSize+1; zc->consumedSrcSize =3D 0; zc->producedCSize =3D 0; @@ -1667,11 +1868,11 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, zc->outBuff =3D (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); =20 /* ldm bucketOffsets table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* TODO: avoid memset? */ size_t const numBuckets =3D - ((size_t)1) << (params.ldmParams.hashLog - - params.ldmParams.bucketSizeLog); + ((size_t)1) << (params->ldmParams.hashLog - + params->ldmParams.bucketSizeLog); zc->ldmState.bucketOffsets =3D ZSTD_cwksp_reserve_buffer(ws, n= umBuckets); ZSTD_memset(zc->ldmState.bucketOffsets, 0, numBuckets); } @@ -1687,32 +1888,28 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, FORWARD_IF_ERROR(ZSTD_reset_matchState( &zc->blockState.matchState, ws, - ¶ms.cParams, + ¶ms->cParams, + params->useRowMatchFinder, crp, needsIndexReset, ZSTD_resetTarget_CCtx), ""); =20 /* ldm hash table */ - if (params.ldmParams.enableLdm) { + if (params->ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { /* TODO: avoid memset? 
*/ - size_t const ldmHSize =3D ((size_t)1) << params.ldmParams.hash= Log; + size_t const ldmHSize =3D ((size_t)1) << params->ldmParams.has= hLog; zc->ldmState.hashTable =3D (ldmEntry_t*)ZSTD_cwksp_reserve_ali= gned(ws, ldmHSize * sizeof(ldmEntry_t)); ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEn= try_t)); zc->ldmSequences =3D (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, m= axNbLdmSeq * sizeof(rawSeq)); zc->maxNbLdmSequences =3D maxNbLdmSeq; =20 ZSTD_window_init(&zc->ldmState.window); - ZSTD_window_clear(&zc->ldmState.window); zc->ldmState.loadedDictEnd =3D 0; } =20 - /* Due to alignment, when reusing a workspace, we can actually con= sume - * up to 3 extra bytes for alignment. See the comments in zstd_cwk= sp.h - */ - assert(ZSTD_cwksp_used(ws) >=3D neededSpace && - ZSTD_cwksp_used(ws) <=3D neededSpace + 3); - DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available= ", ZSTD_cwksp_available_space(ws)); + assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, r= esizeWorkspace)); + zc->initialized =3D 1; =20 return 0; @@ -1768,6 +1965,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { + DEBUGLOG(4, "ZSTD_resetCCtx_byAttachingCDict() pledgedSrcSize=3D%llu", + (unsigned long long)pledgedSrcSize); { ZSTD_compressionParameters adjusted_cdict_cParams =3D cdict->match= State.cParams; unsigned const windowLog =3D params.cParams.windowLog; @@ -1783,7 +1982,9 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, params.cParams =3D ZSTD_adjustCParams_internal(adjusted_cdict_cPar= ams, pledgedSrcSize, cdict->dictContentSiz= e, ZSTD_cpm_attachDict); params.cParams.windowLog =3D windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcS= ize, + params.useRowMatchFinder =3D cdict->useRowMatchFinder; /* cdict= overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrc= Size, + /* loadedDictSize */ 0, ZSTDcrp_makeClean, zbuff)= , ""); 
assert(cctx->appliedParams.cParams.strategy =3D=3D adjusted_cdict_= cParams.strategy); } @@ -1827,15 +2028,17 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CC= tx* cctx, const ZSTD_compressionParameters *cdict_cParams =3D &cdict->matchState= .cParams; =20 assert(!cdict->matchState.dedicatedDictSearch); - - DEBUGLOG(4, "copying dictionary into context"); + DEBUGLOG(4, "ZSTD_resetCCtx_byCopyingCDict() pledgedSrcSize=3D%llu", + (unsigned long long)pledgedSrcSize); =20 { unsigned const windowLog =3D params.cParams.windowLog; assert(windowLog !=3D 0); /* Copy only compression parameters related to tables. */ params.cParams =3D *cdict_cParams; params.cParams.windowLog =3D windowLog; - FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcS= ize, + params.useRowMatchFinder =3D cdict->useRowMatchFinder; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrc= Size, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff= ), ""); assert(cctx->appliedParams.cParams.strategy =3D=3D cdict_cParams->= strategy); assert(cctx->appliedParams.cParams.hashLog =3D=3D cdict_cParams->h= ashLog); @@ -1843,17 +2046,30 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CC= tx* cctx, } =20 ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + assert(params.useRowMatchFinder !=3D ZSTD_ps_auto); =20 /* copy tables */ - { size_t const chainSize =3D (cdict_cParams->strategy =3D=3D ZSTD_fa= st) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + { size_t const chainSize =3D ZSTD_allocateChainTable(cdict_cParams->= strategy, cdict->useRowMatchFinder, 0 /* DDS guaranteed disabled */) + ? 
((size_t)1 <= < cdict_cParams->chainLog) + : 0; size_t const hSize =3D (size_t)1 << cdict_cParams->hashLog; =20 ZSTD_memcpy(cctx->blockState.matchState.hashTable, cdict->matchState.hashTable, hSize * sizeof(U32)); - ZSTD_memcpy(cctx->blockState.matchState.chainTable, + /* Do not copy cdict's chainTable if cctx has parameters such that= it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, = cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { + ZSTD_memcpy(cctx->blockState.matchState.chainTable, cdict->matchState.chainTable, chainSize * sizeof(U32)); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRow= MatchFinder)) { + size_t const tagTableSize =3D hSize*sizeof(U16); + ZSTD_memcpy(cctx->blockState.matchState.tagTable, + cdict->matchState.tagTable, + tagTableSize); + } } =20 /* Zero the hashTable3, since the cdict never fills it */ @@ -1917,16 +2133,22 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dst= CCtx, U64 pledgedSrcSize, ZSTD_buffered_policy_e zbuff) { - DEBUGLOG(5, "ZSTD_copyCCtx_internal"); RETURN_ERROR_IF(srcCCtx->stage!=3DZSTDcs_init, stage_wrong, "Can't copy a ctx that's not in init stage."); - + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); ZSTD_memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_cust= omMem)); { ZSTD_CCtx_params params =3D dstCCtx->requestedParams; /* Copy only compression parameters related to tables. 
*/ params.cParams =3D srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder !=3D ZSTD_ps_auto); + assert(srcCCtx->appliedParams.useBlockSplitter !=3D ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm !=3D ZSTD_ps_aut= o); + params.useRowMatchFinder =3D srcCCtx->appliedParams.useRowMatchFin= der; + params.useBlockSplitter =3D srcCCtx->appliedParams.useBlockSplitte= r; + params.ldmParams =3D srcCCtx->appliedParams.ldmParams; params.fParams =3D fParams; - ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, ZSTDcrp_leaveDirty, zbuff); assert(dstCCtx->appliedParams.cParams.windowLog =3D=3D srcCCtx->ap= pliedParams.cParams.windowLog); assert(dstCCtx->appliedParams.cParams.strategy =3D=3D srcCCtx->app= liedParams.cParams.strategy); @@ -1938,7 +2160,11 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstC= Ctx, ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); =20 /* copy tables */ - { size_t const chainSize =3D (srcCCtx->appliedParams.cParams.strateg= y =3D=3D ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chai= nLog); + { size_t const chainSize =3D ZSTD_allocateChainTable(srcCCtx->applie= dParams.cParams.strategy, + srcCCtx->appliedP= arams.useRowMatchFinder, + 0 /* forDDSDict *= /) + ? ((size_t)1 << srcCCtx->appliedParams= .cParams.chainLog) + : 0; size_t const hSize =3D (size_t)1 << srcCCtx->appliedParams.cParam= s.hashLog; int const h3log =3D srcCCtx->blockState.matchState.hashLog3; size_t const h3Size =3D h3log ? ((size_t)1 << h3log) : 0; @@ -2005,6 +2231,8 @@ ZSTD_reduceTable_internal (U32* const table, U32 cons= t size, U32 const reducerVa int const nbRows =3D (int)size / ZSTD_ROWSIZE; int cellNb =3D 0; int rowNb; + /* Protect special index values < ZSTD_WINDOW_START_INDEX. 
*/ + U32 const reducerThreshold =3D reducerValue + ZSTD_WINDOW_START_INDEX; assert((size & (ZSTD_ROWSIZE-1)) =3D=3D 0); /* multiple of ZSTD_ROWSI= ZE */ assert(size < (1U<<31)); /* can be casted to int */ =20 @@ -2012,12 +2240,17 @@ ZSTD_reduceTable_internal (U32* const table, U32 co= nst size, U32 const reducerVa for (rowNb=3D0 ; rowNb < nbRows ; rowNb++) { int column; for (column=3D0; columnhashTable, hSize, reducerValue); } =20 - if (params->cParams.strategy !=3D ZSTD_fast) { + if (ZSTD_allocateChainTable(params->cParams.strategy, params->useRowMa= tchFinder, (U32)ms->dedicatedDictSearch)) { U32 const chainSize =3D (U32)1 << params->cParams.chainLog; if (params->cParams.strategy =3D=3D ZSTD_btlazy2) ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerVal= ue); @@ -2072,14 +2305,14 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) assert(nbSeq <=3D seqStorePtr->maxNbSeq); for (u=3D0; ulongLengthID=3D=3D1) + if (seqStorePtr->longLengthType=3D=3DZSTD_llt_literalLength) llCodeTable[seqStorePtr->longLengthPos] =3D MaxLL; - if (seqStorePtr->longLengthID=3D=3D2) + if (seqStorePtr->longLengthType=3D=3DZSTD_llt_matchLength) mlCodeTable[seqStorePtr->longLengthPos] =3D MaxML; } =20 @@ -2093,10 +2326,161 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCt= x_params* cctxParams) return (cctxParams->targetCBlockSize !=3D 0); } =20 -/* ZSTD_entropyCompressSequences_internal(): - * actually compresses both literals and sequences */ +/* ZSTD_blockSplitterEnabled(): + * Returns if block splitting param is being used + * If used, compression will do best effort to split a block in order to i= mprove compression ratio. + * At the time this function is called, the parameter must be finalized. + * Returns 1 if true, 0 otherwise. 
*/ +static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=3D%d)", cctxP= arams->useBlockSplitter); + assert(cctxParams->useBlockSplitter !=3D ZSTD_ps_auto); + return (cctxParams->useBlockSplitter =3D=3D ZSTD_ps_enable); +} + +/* Type returned by ZSTD_buildSequencesStatistics containing finalized sym= bol encoding types + * and size of the sequences statistics + */ +typedef struct { + U32 LLtype; + U32 Offtype; + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZST= D_entropyCompressSeqStore_internal() */ +} ZSTD_symbolEncodingTypeStats_t; + +/* ZSTD_buildSequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t, or a zstd error code in the `= size` field. + * Modifies `nextEntropy` to have the appropriate values as a side effect. + * nbSeq must be greater than 0. + * + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxS= eq + 1)*sizeof(U32) + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTab= les_t* nextEntropy, + BYTE* dst, const BYTE* const dstEnd, + ZSTD_strategy strategy, unsigned* countWorks= pace, + void* entropyWorkspace, size_t entropyWkspSi= ze) { + BYTE* const ostart =3D dst; + const BYTE* const oend =3D dstEnd; + BYTE* op =3D ostart; + FSE_CTable* CTable_LitLength =3D nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits =3D nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength =3D nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; + const BYTE* const llCodeTable =3D seqStorePtr->llCode; + const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; + ZSTD_symbolEncodingTypeStats_t stats; + + stats.lastCountSize =3D 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + assert(op <=3D oend); + assert(nbSeq !=3D 0); /* 
ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ + { unsigned max =3D MaxLL; + size_t const mostFrequent =3D HIST_countFast_wksp(countWorkspace, = &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fa= il */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode =3D prevEntropy->litlength_repea= tMode; + stats.LLtype =3D ZSTD_selectEncodingType(&nextEntropy->litlength_r= epeatMode, + countWorkspace, max, mostFrequent,= nbSeq, + LLFSELog, prevEntropy->litlengthCT= able, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_r= epeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LL= type, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, + sizeof(prevEntropy->litlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for LitLens failed"); + stats.size =3D countSize; + return stats; + } + if (stats.LLtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + /* build CTable for Offsets */ + { unsigned max =3D MaxOff; + size_t const mostFrequent =3D HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, en= tropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode =3D prevEntropy->offcode_repeatMod= e; + stats.Offtype =3D ZSTD_selectEncodingType(&nextEntropy->offcode_re= peatMode, + countWorkspace, max, mostFrequent,= nbSeq, + OffFSELog, prevEntropy->offcodeCTa= ble, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_re= peatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.= Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, + sizeof(prevEntropy->offcodeCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for Offsets failed"); + stats.size =3D countSize; + return stats; + } + if (stats.Offtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + /* build CTable for MatchLengths */ + { unsigned max =3D MaxML; + size_t const mostFrequent =3D HIST_countFast_wksp( + countWorkspace, &max, mlCodeTable, nbSeq, entropyWorkspace, en= tropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); + nextEntropy->matchlength_repeatMode =3D prevEntropy->matchlength_r= epeatMode; + stats.MLtype =3D ZSTD_selectEncodingType(&nextEntropy->matchlength= _repeatMode, + countWorkspace, max, mostFrequent,= nbSeq, + MLFSELog, prevEntropy->matchlength= CTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength= _repeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize =3D ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, 
(symbolEncodingType_e)stats.= MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, + sizeof(prevEntropy->matchlengthCTable), + entropyWorkspace, entropyWkspSize); + if (ZSTD_isError(countSize)) { + DEBUGLOG(3, "ZSTD_buildCTable for MatchLengths failed"); + stats.size =3D countSize; + return stats; + } + if (stats.MLtype =3D=3D set_compressed) + stats.lastCountSize =3D countSize; + op +=3D countSize; + assert(op <=3D oend); + } } + stats.size =3D (size_t)(op-ostart); + return stats; +} + +/* ZSTD_entropyCompressSeqStore_internal(): + * compresses both literals and sequences + * Returns compressed size of block, or a zstd error. + */ +#define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 MEM_STATIC size_t -ZSTD_entropyCompressSequences_internal(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2110,36 +2494,38 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* = seqStorePtr, FSE_CTable* CTable_LitLength =3D nextEntropy->fse.litlengthCTable; FSE_CTable* CTable_OffsetBits =3D nextEntropy->fse.offcodeCTable; FSE_CTable* CTable_MatchLength =3D nextEntropy->fse.matchlengthCTable; - U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ const seqDef* const sequences =3D seqStorePtr->sequencesStart; + const size_t nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; const BYTE* const llCodeTable =3D seqStorePtr->llCode; const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; BYTE* const ostart =3D (BYTE*)dst; BYTE* const oend =3D ostart + dstCapacity; BYTE* op =3D ostart; - size_t const nbSeq =3D (size_t)(seqStorePtr->sequences - seqStorePtr->= sequencesStart); - BYTE* seqHead; - BYTE* lastNCount =3D NULL; + size_t lastCountSize; =20 entropyWorkspace =3D count + (MaxSeq + 1); 
entropyWkspSize -=3D (MaxSeq + 1) * sizeof(*count); =20 - DEBUGLOG(4, "ZSTD_entropyCompressSequences_internal (nbSeq=3D%zu)", nb= Seq); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=3D%zu)", nbS= eq); ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >=3D (1<=3D HUF_WORKSPACE_SIZE); =20 /* Compress literals */ { const BYTE* const literals =3D seqStorePtr->litStart; + size_t const numSequences =3D seqStorePtr->sequences - seqStorePtr= ->sequencesStart; + size_t const numLiterals =3D seqStorePtr->lit - seqStorePtr->litSt= art; + /* Base suspicion of uncompressibility on ratio of literals to seq= uences */ + unsigned const suspectUncompressible =3D (numSequences =3D=3D 0) |= | (numLiterals / numSequences >=3D SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); size_t const litSize =3D (size_t)(seqStorePtr->lit - literals); size_t const cSize =3D ZSTD_compressLiterals( &prevEntropy->huf, &nextEntropy->huf, cctxParams->cParams.strategy, - ZSTD_disableLiteralsCompression(cctxPa= rams), + ZSTD_literalsCompressionIsDisabled(cct= xParams), op, dstCapacity, literals, litSize, entropyWorkspace, entropyWkspSize, - bmi2); + bmi2, suspectUncompressible); FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); assert(cSize <=3D dstCapacity); op +=3D cSize; @@ -2165,95 +2551,20 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* = seqStorePtr, ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntro= py->fse)); return (size_t)(op - ostart); } - - /* seqHead : flags for FSE encoding type */ - seqHead =3D op++; - assert(op <=3D oend); - - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { unsigned max =3D MaxLL; - size_t const mostFrequent =3D HIST_countFast_wksp(count, &max, llC= odeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->fse.litlength_repeatMode =3D prevEntropy->fse.litleng= th_repeatMode; - LLtype =3D 
ZSTD_selectEncodingType(&nextEntropy->fse.litlength_rep= eatMode, - count, max, mostFrequent, nbSeq, - LLFSELog, prevEntropy->fse.litleng= thCTable, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_rep= eatMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - count, max, llCodeTable, nbSeq, - LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->fse.litlengthCTable, - sizeof(prevEntropy->fse.litlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens fail= ed"); - if (LLtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - /* build CTable for Offsets */ - { unsigned max =3D MaxOff; - size_t const mostFrequent =3D HIST_countFast_wksp( - count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWksp= Size); /* can't fail */ - /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->fse.offcode_repeatMode =3D prevEntropy->fse.offcode_r= epeatMode; - Offtype =3D ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repe= atMode, - count, max, mostFrequent, nbSeq, - OffFSELog, prevEntropy->fse.offcod= eCTable, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repe= atMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtyp= e, - count, max, ofCodeTable, nbSeq, - OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->fse.offcodeCTable, - sizeof(prevEntropy->fse.offcodeCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets fail= ed"); - if (Offtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - /* build CTable for MatchLengths */ - { unsigned max =3D MaxML; - size_t const mostFrequent =3D HIST_countFast_wksp( - count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWksp= Size); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); - nextEntropy->fse.matchlength_repeatMode =3D prevEntropy->fse.match= length_repeatMode; - MLtype =3D ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_r= epeatMode, - count, max, mostFrequent, nbSeq, - MLFSELog, prevEntropy->fse.matchle= ngthCTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_r= epeatMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable( - op, (size_t)(oend - op), - CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - count, max, mlCodeTable, nbSeq, - ML_defaultNorm, ML_defaultNormLog, MaxML, - 
prevEntropy->fse.matchlengthCTable, - sizeof(prevEntropy->fse.matchlengthCTable), - entropyWorkspace, entropyWkspSize); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths= failed"); - if (MLtype =3D=3D set_compressed) - lastNCount =3D op; - op +=3D countSize; - assert(op <=3D oend); - } } - - *seqHead =3D (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + { + ZSTD_symbolEncodingTypeStats_t stats; + BYTE* seqHead =3D op++; + /* build stats for sequences */ + stats =3D ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntro= py->fse, + op, oend, + strategy, count, + entropyWorkspace, entropyWks= pSize); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed= !"); + *seqHead =3D (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stat= s.MLtype<<2)); + lastCountSize =3D stats.lastCountSize; + op +=3D stats.size; + } =20 { size_t const bitstreamSize =3D ZSTD_encodeSequences( op, (size_t)(oend - op), @@ -2273,9 +2584,9 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* se= qStorePtr, * In this exceedingly rare case, we will simply emit an uncompres= sed * block, since it isn't worth optimizing. 
*/ - if (lastNCount && (op - lastNCount) < 4) { - /* NCountSize >=3D 2 && bitstreamSize > 0 =3D=3D> lastCountSiz= e =3D=3D 3 */ - assert(op - lastNCount =3D=3D 3); + if (lastCountSize && (lastCountSize + bitstreamSize) < 4) { + /* lastCountSize >=3D 2 && bitstreamSize > 0 =3D=3D> lastCount= Size =3D=3D 3 */ + assert(lastCountSize + bitstreamSize =3D=3D 3); DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <=3D 1.3= .4 by " "emitting an uncompressed block."); return 0; @@ -2287,7 +2598,7 @@ ZSTD_entropyCompressSequences_internal(seqStore_t* se= qStorePtr, } =20 MEM_STATIC size_t -ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, +ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, const ZSTD_entropyCTables_t* prevEntropy, ZSTD_entropyCTables_t* nextEntropy, const ZSTD_CCtx_params* cctxParams, @@ -2296,7 +2607,7 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStorePtr, void* entropyWorkspace, size_t entropyWkspSiz= e, int bmi2) { - size_t const cSize =3D ZSTD_entropyCompressSequences_internal( + size_t const cSize =3D ZSTD_entropyCompressSeqStore_internal( seqStorePtr, prevEntropy, nextEntropy, cctxPar= ams, dst, dstCapacity, entropyWorkspace, entropyWkspSize, bmi2); @@ -2306,20 +2617,20 @@ ZSTD_entropyCompressSequences(seqStore_t* seqStoreP= tr, */ if ((cSize =3D=3D ERROR(dstSize_tooSmall)) & (srcSize <=3D dstCapacity= )) return 0; /* block not compressed */ - FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSequences_internal failed= "); + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"= ); =20 /* Check compressibility */ { size_t const maxCSize =3D srcSize - ZSTD_minGain(srcSize, cctxPara= ms->cParams.strategy); if (cSize >=3D maxCSize) return 0; /* block not compressed */ } - DEBUGLOG(4, "ZSTD_entropyCompressSequences() cSize: %zu\n", cSize); + DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); return cSize; } =20 /* ZSTD_selectBlockCompressor() : * Not static, but internal use only (used by long distance matcher) * 
assumption : strat is a valid strategy */ -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= dictMode_e dictMode) +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) { static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX= +1] =3D { { ZSTD_compressBlock_fast /* default for 0 */, @@ -2367,7 +2678,28 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD= _strategy strat, ZSTD_dictMo ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast =3D=3D 1); =20 assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); - selectedCompressor =3D blockCompressor[(int)dictMode][(int)strat]; + DEBUGLOG(4, "Selected block compressor: dictMode=3D%d strat=3D%d rowMa= tchfinder=3D%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = =3D { + { ZSTD_compressBlock_greedy_row, + ZSTD_compressBlock_lazy_row, + ZSTD_compressBlock_lazy2_row }, + { ZSTD_compressBlock_greedy_extDict_row, + ZSTD_compressBlock_lazy_extDict_row, + ZSTD_compressBlock_lazy2_extDict_row }, + { ZSTD_compressBlock_greedy_dictMatchState_row, + ZSTD_compressBlock_lazy_dictMatchState_row, + ZSTD_compressBlock_lazy2_dictMatchState_row }, + { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy_dedicatedDictSearch_row, + ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder !=3D ZSTD_ps_auto); + selectedCompressor =3D rowBasedBlockCompressors[(int)dictMode][(in= t)strat - (int)ZSTD_greedy]; + } else { + selectedCompressor =3D blockCompressor[(int)dictMode][(int)strat]; + } assert(selectedCompressor !=3D NULL); return selectedCompressor; } @@ -2383,7 +2715,7 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) { ssPtr->lit =3D ssPtr->litStart; ssPtr->sequences =3D ssPtr->sequencesStart; - 
ssPtr->longLengthID =3D 0; + ssPtr->longLengthType =3D ZSTD_llt_none; } =20 typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; @@ -2430,15 +2762,16 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, con= st void* src, size_t srcSize) zc->blockState.nextCBlock->rep[i] =3D zc->blockState.prevC= Block->rep[i]; } if (zc->externSeqStore.pos < zc->externSeqStore.size) { - assert(!zc->appliedParams.ldmParams.enableLdm); + assert(zc->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_di= sable); /* Updates ldmSeqStore.pos */ lastLLSize =3D ZSTD_ldm_blockCompress(&zc->externSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(zc->externSeqStore.pos <=3D zc->externSeqStore.size); - } else if (zc->appliedParams.ldmParams.enableLdm) { + } else if (zc->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_en= able) { rawSeqStore_t ldmSeqStore =3D kNullRawSeqStore; =20 ldmSeqStore.seq =3D zc->ldmSequences; @@ -2452,10 +2785,13 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, con= st void* src, size_t srcSize) ZSTD_ldm_blockCompress(&ldmSeqStore, ms, &zc->seqStore, zc->blockState.nextCBlock->rep, + zc->appliedParams.useRowMatchFinder, src, srcSize); assert(ldmSeqStore.pos =3D=3D ldmSeqStore.size); } else { /* not long range mode */ - ZSTD_blockCompressor const blockCompressor =3D ZSTD_selectBloc= kCompressor(zc->appliedParams.cParams.strategy, dictMode); + ZSTD_blockCompressor const blockCompressor =3D ZSTD_selectBloc= kCompressor(zc->appliedParams.cParams.strategy, + = zc->appliedParams.useRowMatchFinder, + = dictMode); ms->ldmSeqStore =3D NULL; lastLLSize =3D blockCompressor(ms, &zc->seqStore, zc->blockSta= te.nextCBlock->rep, src, srcSize); } @@ -2483,22 +2819,22 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) assert(zc->seqCollector.maxSequences >=3D seqStoreSeqSize + 1); ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeo= f(repcodes_t)); for (i =3D 0; i < 
seqStoreSeqSize; ++i) { - U32 rawOffset =3D seqStoreSeqs[i].offset - ZSTD_REP_NUM; + U32 rawOffset =3D seqStoreSeqs[i].offBase - ZSTD_REP_NUM; outSeqs[i].litLength =3D seqStoreSeqs[i].litLength; - outSeqs[i].matchLength =3D seqStoreSeqs[i].matchLength + MINMATCH; + outSeqs[i].matchLength =3D seqStoreSeqs[i].mlBase + MINMATCH; outSeqs[i].rep =3D 0; =20 if (i =3D=3D seqStore->longLengthPos) { - if (seqStore->longLengthID =3D=3D 1) { + if (seqStore->longLengthType =3D=3D ZSTD_llt_literalLength) { outSeqs[i].litLength +=3D 0x10000; - } else if (seqStore->longLengthID =3D=3D 2) { + } else if (seqStore->longLengthType =3D=3D ZSTD_llt_matchLengt= h) { outSeqs[i].matchLength +=3D 0x10000; } } =20 - if (seqStoreSeqs[i].offset <=3D ZSTD_REP_NUM) { + if (seqStoreSeqs[i].offBase <=3D ZSTD_REP_NUM) { /* Derive the correct offset corresponding to a repcode */ - outSeqs[i].rep =3D seqStoreSeqs[i].offset; + outSeqs[i].rep =3D seqStoreSeqs[i].offBase; if (outSeqs[i].litLength !=3D 0) { rawOffset =3D updatedRepcodes.rep[outSeqs[i].rep - 1]; } else { @@ -2512,9 +2848,9 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) outSeqs[i].offset =3D rawOffset; /* seqStoreSeqs[i].offset =3D=3D offCode+1, and ZSTD_updateRep() e= xpects offCode so we provide seqStoreSeqs[i].offset - 1 */ - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, - seqStoreSeqs[i].offset - 1, - seqStoreSeqs[i].litLength =3D=3D = 0); + ZSTD_updateRep(updatedRepcodes.rep, + seqStoreSeqs[i].offBase - 1, + seqStoreSeqs[i].litLength =3D=3D 0); literalsRead +=3D outSeqs[i].litLength; } /* Insert last literals (if any exist) in the block as a sequence with= ml =3D=3D off =3D=3D 0. 
@@ -2602,16 +2938,740 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStor= e) return nbSeqs < 4 && nbLits < 10; } =20 -static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockStat= e_t* const bs) +{ + ZSTD_compressedBlockState_t* const tmp =3D bs->prevCBlock; + bs->prevCBlock =3D bs->nextCBlock; + bs->nextCBlock =3D tmp; +} + +/* Writes the block header */ +static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32= lastBlock) { + U32 const cBlockHeader =3D cSize =3D=3D 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize <= < 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSiz= e << 3); + MEM_writeLE24(op, cBlockHeader); + DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u= ", cSize, blockSize, lastBlock); +} + +/* ZSTD_buildBlockEntropyStats_literals() : + * Builds entropy for the literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
+ * Requires ENTROPY_WORKSPACE_SIZE workspace + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t= srcSize, + const ZSTD_hufCTables_t* prevH= uf, + ZSTD_hufCTables_t* nextH= uf, + ZSTD_hufCTablesMetadata_= t* hufMetadata, + const int literalsCompre= ssionIsDisabled, + void* workspace, size_t = wkspSize) +{ + BYTE* const wkspStart =3D (BYTE*)workspace; + BYTE* const wkspEnd =3D wkspStart + wkspSize; + BYTE* const countWkspStart =3D wkspStart; + unsigned* const countWksp =3D (unsigned*)workspace; + const size_t countWkspSize =3D (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsi= gned); + BYTE* const nodeWksp =3D countWkspStart + countWkspSize; + const size_t nodeWkspSize =3D wkspEnd-nodeWksp; + unsigned maxSymbolValue =3D HUF_SYMBOLVALUE_MAX; + unsigned huffLog =3D HUF_TABLELOG_DEFAULT; + HUF_repeat repeat =3D prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=3D%zu)", sr= cSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (literalsCompressionIsDisabled) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType =3D set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +#ifndef COMPRESS_LITERALS_SIZE_MIN +#define COMPRESS_LITERALS_SIZE_MIN 63 +#endif + { size_t const minLitSize =3D (prevHuf->repeatMode =3D=3D HUF_repeat= _valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <=3D minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType =3D set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest =3D HIST_count_wksp (countWksp, &maxSymbolVal= ue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest =3D=3D srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType =3D set_rle; + return 0; + } + if (largest <=3D (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType =3D set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat =3D=3D HUF_repeat_check && !HUF_validateCTable((HUF_CElt co= nst*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat =3D HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog =3D HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits =3D HUF_buildCTable_wksp((HUF_CElt*)nextHuf->= CTable, countWksp, + maxSymbolValue, huffLo= g, + nodeWksp, nodeWkspSize= ); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog =3D (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize =3D HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize =3D HUF_writeCTable_wksp( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesB= uffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + /* Check against repeating the previous CTable */ + if (repeat !=3D HUF_repeat_none) { + size_t const oldCSize =3D HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSy= mbolValue); + if (oldCSize < srcSize && (oldCSize <=3D hSize + newCSize = || hSize + 12 >=3D srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType =3D set_repeat; + 
return 0; + } + } + if (newCSize + hSize >=3D srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType =3D set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=3D%u)", (U32)hSize); + hufMetadata->hType =3D set_compressed; + nextHuf->repeatMode =3D HUF_repeat_check; + return hSize; + } + } +} + + +/* ZSTD_buildDummySequencesStatistics(): + * Returns a ZSTD_symbolEncodingTypeStats_t with all encoding types as set= _basic, + * and updates nextEntropy to the appropriate repeatMode. + */ +static ZSTD_symbolEncodingTypeStats_t +ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + ZSTD_symbolEncodingTypeStats_t stats =3D {set_basic, set_basic, set_ba= sic, 0, 0}; + nextEntropy->litlength_repeatMode =3D FSE_repeat_none; + nextEntropy->offcode_repeatMode =3D FSE_repeat_none; + nextEntropy->matchlength_repeatMode =3D FSE_repeat_none; + return stats; +} + +/* ZSTD_buildBlockEntropyStats_sequences() : + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. 
+ * @return : size of fse tables or error code */ +static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePt= r, + const ZSTD_fseCTables_t* pre= vEntropy, + ZSTD_fseCTables_t* nex= tEntropy, + const ZSTD_CCtx_params* cctx= Params, + ZSTD_fseCTablesMetadat= a_t* fseMetadata, + void* workspace, size_= t wkspSize) +{ + ZSTD_strategy const strategy =3D cctxParams->cParams.strategy; + size_t const nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; + BYTE* const ostart =3D fseMetadata->fseTablesBuffer; + BYTE* const oend =3D ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op =3D ostart; + unsigned* countWorkspace =3D (unsigned*)workspace; + unsigned* entropyWorkspace =3D countWorkspace + (MaxSeq + 1); + size_t entropyWorkspaceSize =3D wkspSize - (MaxSeq + 1) * sizeof(*coun= tWorkspace); + ZSTD_symbolEncodingTypeStats_t stats; + + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_sequences (nbSeq=3D%zu)", nbS= eq); + stats =3D nbSeq !=3D 0 ? ZSTD_buildSequencesStatistics(seqStorePtr, nb= Seq, + prevEntropy, nextEntropy, op, oe= nd, + strategy, countWorkspace, + entropyWorkspace, entropyWorkspa= ceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); + fseMetadata->llType =3D (symbolEncodingType_e) stats.LLtype; + fseMetadata->ofType =3D (symbolEncodingType_e) stats.Offtype; + fseMetadata->mlType =3D (symbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize =3D stats.lastCountSize; + return stats.size; +} + + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE + * + * @return : 0 on success or error code + */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyM= etadata, + void* workspace, size_t wkspSize) +{ + size_t const litSize =3D seqStorePtr->lit - seqStorePtr->litStart; + entropyMetadata->hufMetadata.hufDesSize =3D + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSiz= e, + &prevEntropy->huf, &nextEntrop= y->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisa= bled(cctxParams), + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildB= lockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize =3D + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, + &prevEntropy->fse, &nextEntr= opy->fse, + cctxParams, + &entropyMetadata->fseMetadat= a, + workspace, wkspSize); + FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_bui= ldBlockEntropyStats_sequences failed"); + return 0; +} + +/* Returns the size estimate for the literals section (header + content) o= f a block */ +static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t = litSize, + const ZSTD_hufCTables_t* h= uf, + const ZSTD_hufCTablesMetad= ata_t* hufMetadata, + void* workspace, size_t wk= spSize, + int writeEntropy) +{ + unsigned* const countWksp =3D (unsigned*)workspace; + unsigned maxSymbolValue =3D HUF_SYMBOLVALUE_MAX; + size_t literalSectionHeaderSize =3D 3 + (litSize >=3D 1 KB) + (litSize= >=3D 16 KB); + U32 singleStream =3D litSize < 256; + + if (hufMetadata->hType =3D=3D set_basic) return litSize; + else if (hufMetadata->hType =3D=3D set_rle) return 1; + else if (hufMetadata->hType =3D=3D set_compressed || hufMetadata->hTyp= e =3D=3D set_repeat) { + size_t const largest =3D HIST_count_wksp (countWksp, 
&maxSymbolVal= ue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate =3D HUF_estimateCompressedSize((const = HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate +=3D hufMetadata->hufDesSiz= e; + if (!singleStream) cLitSizeEstimate +=3D 6; /* multi-stream hu= ffman uses 6-byte jump table */ + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +/* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) o= f a block */ +static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, size_t nbSeq, unsigned maxC= ode, + const FSE_CTable* fseCTable, + const U8* additionalBits, + short const* defaultNorm, U32 defaultNormLog, U32 = defaultMax, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp =3D (unsigned*)workspace; + const BYTE* ctp =3D codeTable; + const BYTE* const ctStart =3D ctp; + const BYTE* const ctEnd =3D ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits =3D 0; + unsigned max =3D maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wksp= Size); /* can't fail */ + if (type =3D=3D set_basic) { + /* We selected this encoding type, so it must be valid. 
*/ + assert(max <=3D defaultMax); + (void)defaultMax; + cSymbolTypeSizeEstimateInBits =3D ZSTD_crossEntropyCost(defaultNor= m, defaultNormLog, countWksp, max); + } else if (type =3D=3D set_rle) { + cSymbolTypeSizeEstimateInBits =3D 0; + } else if (type =3D=3D set_compressed || type =3D=3D set_repeat) { + cSymbolTypeSizeEstimateInBits =3D ZSTD_fseBitCost(fseCTable, count= Wksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) { + return nbSeq * 10; + } + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits +=3D additionalB= its[*ctp]; + else cSymbolTypeSizeEstimateInBits +=3D *ctp; /* for offset, offse= t code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits >> 3; +} + +/* Returns the size estimate for the sequences section (header + content) = of a block */ +static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t*= fseTables, + const ZSTD_fseCTablesMet= adata_t* fseMetadata, + void* workspace, size_t = wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize =3D 1 /* seqHead */ + 1 /* min seqSi= ze size */ + (nbSeq >=3D 128) + (nbSeq >=3D LONGNBSEQ); + size_t cSeqSizeEstimate =3D 0; + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->o= fType, ofCodeTable, nbSeq, MaxOff, + fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog= , DefaultMaxOff, + workspace, wkspSize); + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->l= lType, llCodeTable, nbSeq, MaxLL, + fseTables->litlengthCTable, LL_bi= ts, + LL_defaultNorm, LL_defaultNormLog= , MaxLL, + workspace, wkspSize); + cSeqSizeEstimate +=3D ZSTD_estimateBlockSize_symbolType(fseMetadata->m= lType, mlCodeTable, nbSeq, MaxML, + fseTables->matchlengthCTable, ML_= bits, + ML_defaultNorm, ML_defaultNormLog= , MaxML, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate 
+=3D fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +/* Returns the size estimate for a given stream of literals, of, ll, ml */ +static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* = entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntr= opy) { + size_t const literalsSize =3D ZSTD_estimateBlockSize_literal(literals,= litSize, + &entropy->huf, &e= ntropyMetadata->hufMetadata, + workspace, wkspSi= ze, writeLitEntropy); + size_t const seqSize =3D ZSTD_estimateBlockSize_sequences(ofCodeTable,= llCodeTable, mlCodeTable, + nbSeq, &entropy->= fse, &entropyMetadata->fseMetadata, + workspace, wkspSi= ze, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; +} + +/* Builds entropy statistics and uses them for blocksize estimation. + * + * Returns the estimated compressed size of the seqStore, or a zstd error. 
+ */ +static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_= t* seqStore, ZSTD_CCtx* zc) { + ZSTD_entropyCTablesMetadata_t* entropyMetadata =3D &zc->blockSplitCtx.= entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* static= ally allocated in resetCCtx */), ""); + return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->l= it - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStar= t), + &zc->blockState.nextCBlock->entropy, entropyMetadata, = zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType =3D=3D set_co= mpressed), 1); +} + +/* Returns literals bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqS= tore) { + size_t literalsBytes =3D 0; + size_t const nbSeqs =3D seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i =3D 0; i < nbSeqs; ++i) { + seqDef seq =3D seqStore->sequencesStart[i]; + literalsBytes +=3D seq.litLength; + if (i =3D=3D seqStore->longLengthPos && seqStore->longLengthType = =3D=3D ZSTD_llt_literalLength) { + literalsBytes +=3D 0x10000; + } + } + return literalsBytes; +} + +/* Returns match bytes represented in a seqStore */ +static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStor= e) { + size_t matchBytes =3D 0; + size_t const nbSeqs =3D seqStore->sequences - seqStore->sequencesStart; + size_t i; + for (i =3D 0; i < nbSeqs; ++i) { + seqDef seq =3D seqStore->sequencesStart[i]; + matchBytes +=3D seq.mlBase + MINMATCH; + if (i =3D=3D seqStore->longLengthPos && seqStore->longLengthType = =3D=3D ZSTD_llt_matchLength) { + matchBytes +=3D 0x10000; + } + } + 
return matchBytes; +} + +/* Derives the seqStore that is a chunk of the originalSeqStore from [star= tIdx, endIdx). + * Stores the result in resultSeqStore. + */ +static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, + size_t startIdx, size_t endIdx) { + BYTE* const litEnd =3D originalSeqStore->lit; + size_t literalsBytes; + size_t literalsBytesPreceding =3D 0; + + *resultSeqStore =3D *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences =3D originalSeqStore->sequencesStart + s= tartIdx; + literalsBytesPreceding =3D ZSTD_countSeqStoreLiteralsBytes(resultS= eqStore); + } + + /* Move longLengthPos into the correct position if necessary */ + if (originalSeqStore->longLengthType !=3D ZSTD_llt_none) { + if (originalSeqStore->longLengthPos < startIdx || originalSeqStore= ->longLengthPos > endIdx) { + resultSeqStore->longLengthType =3D ZSTD_llt_none; + } else { + resultSeqStore->longLengthPos -=3D (U32)startIdx; + } + } + resultSeqStore->sequencesStart =3D originalSeqStore->sequencesStart + = startIdx; + resultSeqStore->sequences =3D originalSeqStore->sequencesStart + endId= x; + literalsBytes =3D ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + resultSeqStore->litStart +=3D literalsBytesPreceding; + if (endIdx =3D=3D (size_t)(originalSeqStore->sequences - originalSeqSt= ore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk r= eaches the end of the block */ + resultSeqStore->lit =3D litEnd; + } else { + resultSeqStore->lit =3D resultSeqStore->litStart+literalsBytes; + } + resultSeqStore->llCode +=3D startIdx; + resultSeqStore->mlCode +=3D startIdx; + resultSeqStore->ofCode +=3D startIdx; +} + +/* + * Returns the raw offset represented by the combination of offCode, ll0, = and repcode history. + * offCode must represent a repcode in the numeric representation of ZSTD_= storeSeq(). 
+ */ +static U32 +ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offC= ode, const U32 ll0) +{ + U32 const adjustedOffCode =3D STORED_REPCODE(offCode) - 1 + ll0; /* [= 0 - 3 ] */ + assert(STORED_IS_REPCODE(offCode)); + if (adjustedOffCode =3D=3D ZSTD_REP_NUM) { + /* litlength =3D=3D 0 and offCode =3D=3D 2 implies selection of fi= rst repcode - 1 */ + assert(rep[0] > 0); + return rep[0] - 1; + } + return rep[adjustedOffCode]; +} + +/* + * ZSTD_seqStore_resolveOffCodes() reconciles any possible divergences in = offset history that may arise + * due to emission of RLE/raw blocks that disturb the offset history, + * and replaces any repcodes within the seqStore that may be invalid. + * + * dRepcodes are updated as would be on the decompression side. + * cRepcodes are updated exactly in accordance with the seqStore. + * + * Note : this function assumes seq->offBase respects the following number= ing scheme : + * 0 : invalid + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, rep= codes_t* const cRepcodes, + seqStore_t* const seqStore, U32 = const nbSeq) { + U32 idx =3D 0; + for (; idx < nbSeq; ++idx) { + seqDef* const seq =3D seqStore->sequencesStart + idx; + U32 const ll0 =3D (seq->litLength =3D=3D 0); + U32 const offCode =3D OFFBASE_TO_STORED(seq->offBase); + assert(seq->offBase > 0); + if (STORED_IS_REPCODE(offCode)) { + U32 const dRawOffset =3D ZSTD_resolveRepcodeToRawOffset(dRepco= des->rep, offCode, ll0); + U32 const cRawOffset =3D ZSTD_resolveRepcodeToRawOffset(cRepco= des->rep, offCode, ll0); + /* Adjust simulated decompression repcode history if we come a= cross a mismatch. Replace + * the repcode with the offset it actually references, determi= ned by the compression + * repcode history. 
+ */ + if (dRawOffset !=3D cRawOffset) { + seq->offBase =3D cRawOffset + ZSTD_REP_NUM; + } + } + /* Compression repcode history is always updated with values direc= tly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset valu= e taken from compression repcode history. + */ + ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll= 0); + ZSTD_updateRep(cRepcodes->rep, offCode, ll0); + } +} + +/* ZSTD_compressSeqStore_singleBlock(): + * Compresses a seqStore into a block with a block header, into the buffer= dst. + * + * Returns the total size of that block (including header) or a ZSTD error= code. + */ +static size_t +ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStor= e, + repcodes_t* const dRep, repcodes_t* cons= t cRep, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) { - ZSTD_compressedBlockState_t* const tmp =3D zc->blockState.prevCBlock; - zc->blockState.prevCBlock =3D zc->blockState.nextCBlock; - zc->blockState.nextCBlock =3D tmp; + const U32 rleMaxLength =3D 25; + BYTE* op =3D (BYTE*)dst; + const BYTE* ip =3D (const BYTE*)src; + size_t cSize; + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode= history must be reset */ + repcodes_t const dRepOriginal =3D *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore= ->sequences - seqStore->sequencesStart)); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, = "Block header doesn't fit"); + cSeqsSize =3D ZSTD_entropyCompressSeqStore(seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextC= Block->entropy, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderS= ize, + srcSize, + zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically= allocated in resetCCtx */, + zc->bmi2); + 
FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + + if (!zc->isFirstBlock && + cSeqsSize < rleMaxLength && + ZSTD_isRLE((BYTE const*)src, srcSize)) { + /* We don't want to emit our first block as a RLE even if it quali= fies because + * doing so will cause the decoder (cli only) to throw a "should co= nsume all input error." + * This is only an issue for zstd <=3D v1.4.3 + */ + cSeqsSize =3D 1; + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } + + if (cSeqsSize =3D=3D 0) { + cSize =3D ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastB= lock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); + DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); + *dRep =3D dRepOriginal; /* reset simulated decompression repcode h= istory */ + } else if (cSeqsSize =3D=3D 1) { + cSize =3D ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, las= tBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); + DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); + *dRep =3D dRepOriginal; /* reset simulated decompression repcode h= istory */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize =3D ZSTD_blockHeaderSize + cSeqsSize; + DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode =3D=3D F= SE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode =3D FSE_= repeat_check; + + return cSize; +} + +/* Struct to keep track of where we are in our recursive calls. */ +typedef struct { + U32* splitLocations; /* Array of split indices */ + size_t idx; /* The current index within splitLocations bei= ng worked on */ +} seqStoreSplits; + +#define MIN_SEQUENCES_BLOCK_SPLITTING 300 + +/* Helper function to perform the recursive search for block splits. 
+ * Estimates the cost of seqStore prior to split, and estimates the cost o= f splitting the sequences in half. + * If advantageous to split, then we recurse down the two sub-blocks. If n= ot, or if an error occurred in estimation, then + * we do not recurse. + * + * Note: The recursion depth is capped by a heuristic minimum number of se= quences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 =3D=3D= log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * + * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS= . At ZSTD_MAX_NB_BLOCK_SPLITS =3D=3D 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ +static void +ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size= _t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) +{ + seqStore_t* fullSeqStoreChunk =3D &zc->blockSplitCtx.fullSeqStoreChunk; + seqStore_t* firstHalfSeqStore =3D &zc->blockSplitCtx.firstHalfSeqStore; + seqStore_t* secondHalfSeqStore =3D &zc->blockSplitCtx.secondHalfSeqSto= re; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx =3D (startIdx + endIdx)/2; + + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= =3D ZSTD_MAX_NB_BLOCK_SPLITS) { + DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); + return; + } + DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=3D%zu endIdx=3D%zu= ", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, en= dIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, mi= dIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, end= Idx); + estimatedOriginalSize =3D ZSTD_buildEntropyStatisticsAndEstimateSubBlo= ckSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize =3D 
ZSTD_buildEntropyStatisticsAndEstimateSubBl= ockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize =3D ZSTD_buildEntropyStatisticsAndEstimateSubB= lockSize(secondHalfSeqStore, zc); + DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %= zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecon= dHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirst= HalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOrigin= alSize) { + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeq= Store); + splits->splitLocations[splits->idx] =3D (U32)midIdx; + splits->idx++; + ZSTD_deriveBlockSplitsHelper(splits, midIdx, endIdx, zc, origSeqSt= ore); + } +} + +/* Base recursive function. Populates a table with intra-block partition i= ndices that can improve compression ratio. + * + * Returns the number of splits made (which equals the size of the partiti= on table - 1). + */ +static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 = nbSeq) { + seqStoreSplits splits =3D {partitions, 0}; + if (nbSeq <=3D 4) { + DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } + ZSTD_deriveBlockSplitsHelper(&splits, 0, nbSeq, zc, &zc->seqStore); + splits.splitLocations[splits.idx] =3D nbSeq; + DEBUGLOG(5, "ZSTD_deriveBlockSplits: final nb partitions: %zu", splits= .idx+1); + return splits.idx; +} + +/* ZSTD_compressBlock_splitBlock(): + * Attempts to split a given block into multiple blocks to improve compres= sion ratio. + * + * Returns combined size of all blocks (which includes headers), or a ZSTD= error code. 
+ */ +static size_t +ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t ds= tCapacity, + const void* src, size_t blockSize, = U32 lastBlock, U32 nbSeq) +{ + size_t cSize =3D 0; + const BYTE* ip =3D (const BYTE*)src; + BYTE* op =3D (BYTE*)dst; + size_t i =3D 0; + size_t srcBytesTotal =3D 0; + U32* partitions =3D zc->blockSplitCtx.partitions; /* size =3D=3D ZSTD_= MAX_NB_BLOCK_SPLITS */ + seqStore_t* nextSeqStore =3D &zc->blockSplitCtx.nextSeqStore; + seqStore_t* currSeqStore =3D &zc->blockSplitCtx.currSeqStore; + size_t numSplits =3D ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompre= ssed, then repcode history + * may become invalid. In order to reconcile potentially invalid repco= des, we keep track of two + * separate repcode histories that simulate repcode history on compres= sion and decompression side, + * and use the histories to determine whether we must replace a partic= ular repcode with its raw offset. + * + * 1) cRep gets updated for each partition, regardless of whether the = block was emitted as uncompressed + * or RLE. This allows us to retrieve the offset value that an inva= lid repcode references within + * a nocompress/RLE block. + * 2) dRep gets updated only for compressed partitions, and when a rep= code gets replaced, will use + * the replacement offset value rather than the original repcode to= update the repcode history. + * dRep also will be the final repcode history sent to the next blo= ck. + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ + repcodes_t dRep; + repcodes_t cRep; + ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_= t)); + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_= t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=3D%u,= dictLimit=3D%u, nextToUpdate=3D%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState= .window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits =3D=3D 0) { + size_t cSizeSingleBlock =3D ZSTD_compressSeqStore_singleBlock(zc, = &zc->seqStore, + &dRep, = &cRep, + op, ds= tCapacity, + ip, bl= ockSize, + lastBl= ock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from = splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); + assert(cSizeSingleBlock <=3D ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeader= Size); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]= ); + for (i =3D 0; i <=3D numSplits; ++i) { + size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition =3D (i =3D=3D numSplits); + U32 lastBlockEntireSrc =3D 0; + + srcBytes =3D ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_= countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal +=3D srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible l= ast literals */ + srcBytes +=3D blockSize - srcBytesTotal; + lastBlockEntireSrc =3D lastBlock; + } else { + ZSTD_deriveSeqStoreChunk(nextSeqStore, &zc->seqStore, partitio= ns[i], partitions[i+1]); + } + + cSizeChunk =3D ZSTD_compressSeqStore_singleBlock(zc, currSeqStore, + &dRep, &cRep, + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc,= 1 /* isPartition */); + DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntr= opyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + 
FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip +=3D srcBytes; + op +=3D cSizeChunk; + dstCapacity -=3D cSizeChunk; + cSize +=3D cSizeChunk; + *currSeqStore =3D *nextSeqStore; + assert(cSizeChunk <=3D ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); + } + /* cRep and dRep may have diverged during the compression. If so, we u= se the dRep repcodes + * for the next block. + */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_= t)); + return cSize; +} + +static size_t +ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlo= ck) +{ + const BYTE* ip =3D (const BYTE*)src; + BYTE* op =3D (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); + assert(zc->appliedParams.useBlockSplitter =3D=3D ZSTD_ps_enable); + + { const size_t bss =3D ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss =3D=3D ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = =3D=3D FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = =3D FSE_repeat_check; + cSize =3D ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, l= astBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq =3D (U32)(zc->seqStore.sequences - zc->seqStore.sequencesSta= rt); + } + + cSize =3D ZSTD_compressBlock_splitBlock_internal(zc, dst, dstCapacity,= src, srcSize, lastBlock, nbSeq); + FORWARD_IF_ERROR(cSize, "Splitting blocks failed!"); + return cSize; } =20 -static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, - void* dst, size_t dstCapacity, - const void* src, size_t srcSize, U= 32 frame) +static size_t +ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) { /* This the upper bound for the length of an rle 
block. * This isn't the actual upper bound. Finding the real threshold @@ -2632,12 +3692,12 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx= * zc, =20 if (zc->seqCollector.collectSequences) { ZSTD_copyBlockSequences(zc); - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); return 0; } =20 /* encode sequences and literals */ - cSize =3D ZSTD_entropyCompressSequences(&zc->seqStore, + cSize =3D ZSTD_entropyCompressSeqStore(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBloc= k->entropy, &zc->appliedParams, dst, dstCapacity, @@ -2645,12 +3705,6 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx*= zc, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically all= ocated in resetCCtx */, zc->bmi2); =20 - if (zc->seqCollector.collectSequences) { - ZSTD_copyBlockSequences(zc); - return 0; - } - - if (frame && /* We don't want to emit our first block as a RLE even if it quali= fies because * doing so will cause the decoder (cli only) to throw a "should c= onsume all input error." @@ -2666,7 +3720,7 @@ static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* = zc, =20 out: if (!ZSTD_isError(cSize) && cSize > 1) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); } /* We check that dictionaries have offset codes available for the first * block. 
After the first block, the offcode table might not have large @@ -2719,7 +3773,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_bod= y(ZSTD_CCtx* zc, size_t const maxCSize =3D srcSize - ZSTD_minGain(srcSize, = zc->appliedParams.cParams.strategy); FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); if (cSize !=3D 0 && cSize < maxCSize + ZSTD_blockHeaderSiz= e) { - ZSTD_confirmRepcodesAndEntropyTables(zc); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->b= lockState); return cSize; } } @@ -2759,9 +3813,9 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchSt= ate_t* ms, void const* ip, void const* iend) { - if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { - U32 const maxDist =3D (U32)1 << params->cParams.windowLog; - U32 const cycleLog =3D ZSTD_cycleLog(params->cParams.chainLog, par= ams->cParams.strategy); + U32 const cycleLog =3D ZSTD_cycleLog(params->cParams.chainLog, params-= >cParams.strategy); + U32 const maxDist =3D (U32)1 << params->cParams.windowLog; + if (ZSTD_window_needOverflowCorrection(ms->window, cycleLog, maxDist, = ms->loadedDictEnd, ip, iend)) { U32 const correction =3D ZSTD_window_correctOverflow(&ms->window, = cycleLog, maxDist, ip); ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <=3D 30); ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <=3D 30); @@ -2784,7 +3838,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchSt= ate_t* ms, * Frame is supposed already started (header already produced) * @return : compressed size, or an error code */ -static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, +static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastFrameChunk) @@ -2814,6 +3868,7 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cc= tx, ZSTD_overflowCorrectIfNeeded( ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize= ); ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->= loadedDictEnd, &ms->dictMatchState); + 
ZSTD_window_enforceMaxDist(&ms->window, ip, maxDist, &ms->loadedDi= ctEnd, &ms->dictMatchState); =20 /* Ensure hash/chain table insertion resumes no sooner than lowlim= it */ if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate =3D m= s->window.lowLimit; @@ -2824,6 +3879,10 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* c= ctx, FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSi= ze failed"); assert(cSize > 0); assert(cSize <=3D blockSize + ZSTD_blockHeaderSize); + } else if (ZSTD_blockSplitterEnabled(&cctx->appliedParams)) { + cSize =3D ZSTD_compressBlock_splitBlock(cctx, op, dstCapac= ity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_splitBlock fai= led"); + assert(cSize > 0 || cctx->seqCollector.collectSequences = =3D=3D 1); } else { cSize =3D ZSTD_compressBlock_internal(cctx, op+ZSTD_blockHeaderSize, dstCapaci= ty-ZSTD_blockHeaderSize, @@ -2946,7 +4005,7 @@ size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cct= x, rawSeq* seq, size_t nbSe { RETURN_ERROR_IF(cctx->stage !=3D ZSTDcs_init, stage_wrong, "wrong cctx stage"); - RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps= _enable, parameter_unsupported, "incompatible with ldm"); cctx->externSeqStore.seq =3D seq; @@ -2983,11 +4042,12 @@ static size_t ZSTD_compressContinue_internal (ZSTD_= CCtx* cctx, =20 if (!srcSize) return fhSize; /* do not generate an empty block if no = input */ =20 - if (!ZSTD_window_update(&ms->window, src, srcSize)) { + if (!ZSTD_window_update(&ms->window, src, srcSize, ms->forceNonContigu= ous)) { + ms->forceNonContiguous =3D 0; ms->nextToUpdate =3D ms->window.dictLimit; } - if (cctx->appliedParams.ldmParams.enableLdm) { - ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + if (cctx->appliedParams.ldmParams.enableLdm =3D=3D ZSTD_ps_enable) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize, /* forceN= onContiguous */ 0); } =20 if (!frame) { @@ 
-3055,63 +4115,86 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_match= State_t* ms, { const BYTE* ip =3D (const BYTE*) src; const BYTE* const iend =3D ip + srcSize; + int const loadLdmDict =3D params->ldmParams.enableLdm =3D=3D ZSTD_ps_e= nable && ls !=3D NULL; =20 - ZSTD_window_update(&ms->window, src, srcSize); + /* Assert that we the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize > ZSTD_CHUNKSIZE_MAX) { + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_= MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ + U32 const maxDictSize =3D ZSTD_CURRENT_MAX - 1; + /* We must have cleared our windows when our source is this large.= */ + assert(ZSTD_window_isEmpty(ms->window)); + if (loadLdmDict) + assert(ZSTD_window_isEmpty(ls->window)); + /* If the dictionary is too large, only load the suffix of the dic= tionary. */ + if (srcSize > maxDictSize) { + ip =3D iend - maxDictSize; + src =3D ip; + srcSize =3D maxDictSize; + } + } + + DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=3D%d", (i= nt)params->useRowMatchFinder); + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */= 0); ms->loadedDictEnd =3D params->forceWindow ? 0 : (U32)(iend - ms->windo= w.base); + ms->forceNonContiguous =3D params->deterministicRefPrefix; =20 - if (params->ldmParams.enableLdm && ls !=3D NULL) { - ZSTD_window_update(&ls->window, src, srcSize); + if (loadLdmDict) { + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguou= s */ 0); ls->loadedDictEnd =3D params->forceWindow ? 
0 : (U32)(iend - ls->w= indow.base); } =20 - /* Assert that we the ms params match the params we're being given */ - ZSTD_assertEqualCParams(params->cParams, ms->cParams); - if (srcSize <=3D HASH_READ_SIZE) return 0; =20 - while (iend - ip > HASH_READ_SIZE) { - size_t const remaining =3D (size_t)(iend - ip); - size_t const chunk =3D MIN(remaining, ZSTD_CHUNKSIZE_MAX); - const BYTE* const ichunk =3D ip + chunk; - - ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); =20 - if (params->ldmParams.enableLdm && ls !=3D NULL) - ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src = + srcSize, ¶ms->ldmParams); + if (loadLdmDict) + ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); =20 - switch(params->cParams.strategy) - { - case ZSTD_fast: - ZSTD_fillHashTable(ms, ichunk, dtlm); - break; - case ZSTD_dfast: - ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); - break; + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, iend, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, iend, dtlm); + break; =20 - case ZSTD_greedy: - case ZSTD_lazy: - case ZSTD_lazy2: - if (chunk >=3D HASH_READ_SIZE && ms->dedicatedDictSearch) { - assert(chunk =3D=3D remaining); /* must load everything in= one go */ - ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HA= SH_READ_SIZE); - } else if (chunk >=3D HASH_READ_SIZE) { - ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + assert(srcSize >=3D HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable !=3D NULL); + ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, iend-HASH_REA= D_SIZE); + } else { + assert(params->useRowMatchFinder !=3D ZSTD_ps_auto); + if (params->useRowMatchFinder =3D=3D ZSTD_ps_enable) { + size_t const tagTableSize =3D ((size_t)1 << params->cParam= s.hashLog) * sizeof(U16); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + 
ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); + } else { + ZSTD_insertAndFindFirstIndex(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); } - break; - - case ZSTD_btlazy2: /* we want the dictionary table fully sorted = */ - case ZSTD_btopt: - case ZSTD_btultra: - case ZSTD_btultra2: - if (chunk >=3D HASH_READ_SIZE) - ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); - break; - - default: - assert(0); /* not possible : not a valid strategy id */ } + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + assert(srcSize >=3D HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); + break; =20 - ip =3D ichunk; + default: + assert(0); /* not possible : not a valid strategy id */ } =20 ms->nextToUpdate =3D (U32)(iend - ms->window.base); @@ -3250,7 +4333,6 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressed= BlockState_t* bs, const BYTE* const dictEnd =3D dictPtr + dictSize; size_t dictID; size_t eSize; - ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >=3D (1<=3D 8); assert(MEM_readLE32(dictPtr) =3D=3D ZSTD_MAGIC_DICTIONARY); @@ -3321,6 +4403,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, const ZSTD_CCtx_params* params, U64 pl= edgedSrcSize, ZSTD_buffered_policy_e zbuff) { + size_t const dictContentSize =3D cdict ? 
cdict->dictContentSize : dict= Size; DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=3D%u", params->cParams.= windowLog); /* params are supposed to be fully validated at this point */ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); @@ -3335,7 +4418,8 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSi= ze, zbuff); } =20 - FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSiz= e, + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + dictContentSize, ZSTDcrp_makeClean, zbuff) , ""); { size_t const dictID =3D cdict ? ZSTD_compress_insertDictionary( @@ -3350,7 +4434,7 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* = cctx, FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); assert(dictID <=3D UINT_MAX); cctx->dictID =3D (U32)dictID; - cctx->dictContentSize =3D cdict ? cdict->dictContentSize : dictSiz= e; + cctx->dictContentSize =3D dictContentSize; } return 0; } @@ -3485,15 +4569,14 @@ size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, const void* dict,size_t dictSize, ZSTD_parameters params) { - ZSTD_CCtx_params cctxParams; DEBUGLOG(4, "ZSTD_compress_advanced"); FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, ZSTD_NO_CLEVEL); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, ZSTD_NO= _CLEVEL); return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, - &cctxParams); + &cctx->simpleApiParams); } =20 /* Internal */ @@ -3517,14 +4600,13 @@ size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) { - ZSTD_CCtx_params cctxParams; { ZSTD_parameters const params =3D ZSTD_getParams_internal(compressi= onLevel, srcSize, dict ? 
dictSize : 0, ZSTD_cpm_noAttachDict); assert(params.fParams.contentSizeFlag =3D=3D 1); - ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLe= vel =3D=3D 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); + ZSTD_CCtxParams_init_internal(&cctx->simpleApiParams, ¶ms, (co= mpressionLevel =3D=3D 0) ? ZSTD_CLEVEL_DEFAULT: compressionLevel); } DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=3D%u)", (unsigned)srcSiz= e); - return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, sr= cSize, dict, dictSize, &cctxParams); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, sr= cSize, dict, dictSize, &cctx->simpleApiParams); } =20 size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, @@ -3561,7 +4643,10 @@ size_t ZSTD_estimateCDictSize_advanced( DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) - + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + /* enableDedicatedDictSearch =3D=3D 1 ensures that CDict estimati= on will not be too small + * in case we are using DDS with row-hash. */ + + ZSTD_sizeof_matchState(&cParams, ZSTD_resolveRowMatchFinderMode= (ZSTD_ps_auto, &cParams), + /* enableDedicatedDictSearch */ 1, /* fo= rCCtx */ 0) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 
0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void= *)))); } @@ -3592,9 +4677,6 @@ static size_t ZSTD_initCDict_internal( assert(!ZSTD_checkCParams(params.cParams)); cdict->matchState.cParams =3D params.cParams; cdict->matchState.dedicatedDictSearch =3D params.enableDedicatedDictSe= arch; - if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE= _MAX) { - cdict->matchState.dedicatedDictSearch =3D 0; - } if ((dictLoadMethod =3D=3D ZSTD_dlm_byRef) || (!dictBuffer) || (!dictS= ize)) { cdict->dictContent =3D dictBuffer; } else { @@ -3615,6 +4697,7 @@ static size_t ZSTD_initCDict_internal( &cdict->matchState, &cdict->workspace, ¶ms.cParams, + params.useRowMatchFinder, ZSTDcrp_makeClean, ZSTDirp_reset, ZSTD_resetTarget_CDict), ""); @@ -3638,14 +4721,17 @@ static size_t ZSTD_initCDict_internal( =20 static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, - ZSTD_compressionParameters cParams, = ZSTD_customMem customMem) + ZSTD_compressionParameters cParams, + ZSTD_paramSwitch_e useRowMatchFinder, + U32 enableDedicatedDictSearch, + ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; =20 { size_t const workspaceSize =3D ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + - ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, enableDedi= catedDictSearch, /* forCCtx */ 0) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 
0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(voi= d*)))); void* const workspace =3D ZSTD_customMalloc(workspaceSize, customM= em); @@ -3664,7 +4750,7 @@ static ZSTD_CDict* ZSTD_createCDict_advanced_internal= (size_t dictSize, ZSTD_cwksp_move(&cdict->workspace, &ws); cdict->customMem =3D customMem; cdict->compressionLevel =3D ZSTD_NO_CLEVEL; /* signals advanced AP= I usage */ - + cdict->useRowMatchFinder =3D useRowMatchFinder; return cdict; } } @@ -3686,7 +4772,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dic= tBuffer, size_t dictSize, &cctxParams, customMem); } =20 -ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( +ZSTD_CDict* ZSTD_createCDict_advanced2( const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType, @@ -3716,10 +4802,13 @@ ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced2( &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_crea= teCDict); } =20 + DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDe= dicatedDictSearch); cctxParams.cParams =3D cParams; + cctxParams.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(cctxPa= rams.useRowMatchFinder, &cParams); =20 cdict =3D ZSTD_createCDict_advanced_internal(dictSize, dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDed= icatedDictSearch, customMem); =20 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, @@ -3788,7 +4877,9 @@ const ZSTD_CDict* ZSTD_initStaticCDict( ZSTD_dictContentType_e dictContentType, ZSTD_compressionParameters cParams) { - size_t const matchStateSize =3D ZSTD_sizeof_matchState(&cParams, /* fo= rCCtx */ 0); + ZSTD_paramSwitch_e const useRowMatchFinder =3D ZSTD_resolveRowMatchFin= derMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch =3D=3D 1 ensures matchstate is not too sm= all in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize =3D ZSTD_sizeof_matchState(&cParams, useRo= wMatchFinder, /* 
enableDedicatedDictSearch */ 1, /* forCCtx */ 0); size_t const neededSize =3D ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + (dictLoadMethod =3D=3D ZSTD_dlm_byRef ? 0 : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(di= ctSize, sizeof(void*)))) @@ -3813,6 +4904,8 @@ const ZSTD_CDict* ZSTD_initStaticCDict( =20 ZSTD_CCtxParams_init(¶ms, 0); params.cParams =3D cParams; + params.useRowMatchFinder =3D useRowMatchFinder; + cdict->useRowMatchFinder =3D useRowMatchFinder; =20 if (ZSTD_isError( ZSTD_initCDict_internal(cdict, dict, dictSize, @@ -3839,15 +4932,15 @@ unsigned ZSTD_getDictID_fromCDict(const ZSTD_CDict*= cdict) return cdict->dictID; } =20 - -/* ZSTD_compressBegin_usingCDict_advanced() : - * cdict must be !=3D NULL */ -size_t ZSTD_compressBegin_usingCDict_advanced( +/* ZSTD_compressBegin_usingCDict_internal() : + * Implementation of various ZSTD_compressBegin_usingCDict* functions. + */ +static size_t ZSTD_compressBegin_usingCDict_internal( ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSr= cSize) { ZSTD_CCtx_params cctxParams; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_internal"); RETURN_ERROR_IF(cdict=3D=3DNULL, dictionary_wrong, "NULL pointer!"); /* Initialize the cctxParams from the cdict */ { @@ -3879,25 +4972,48 @@ size_t ZSTD_compressBegin_usingCDict_advanced( ZSTDb_not_buffered); } =20 + +/* ZSTD_compressBegin_usingCDict_advanced() : + * This function is DEPRECATED. 
+ * cdict must be !=3D NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSr= cSize) +{ + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, pl= edgedSrcSize); +} + /* ZSTD_compressBegin_usingCDict() : - * pledgedSrcSize=3D0 means "unknown" - * if pledgedSrcSize>0, it will enable contentSizeFlag */ + * cdict must be !=3D NULL */ size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cd= ict) { ZSTD_frameParameters const fParams =3D { 0 /*content*/, 0 /*checksum*/= , 0 /*noDictID*/ }; - DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag =3D=3D %u", !f= Params.noDictIDFlag); - return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZS= TD_CONTENTSIZE_UNKNOWN); + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZS= TD_CONTENTSIZE_UNKNOWN); } =20 -size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, +/*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize, const ZSTD_CDict* cdict, ZSTD_frameParamet= ers fParams) { - FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, f= Params, srcSize), ""); /* will check if cdict !=3D NULL */ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, f= Params, srcSize), ""); /* will check if cdict !=3D NULL */ return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); } =20 +/*! ZSTD_compress_usingCDict_advanced(): + * This function is DEPRECATED. 
+ */ +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParamet= ers fParams) +{ + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); +} + /*! ZSTD_compress_usingCDict() : * Compression using a digested Dictionary. * Faster startup than ZSTD_compress_usingDict(), recommended when same d= ictionary is used multiple times. @@ -3909,7 +5025,7 @@ size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) { ZSTD_frameParameters const fParams =3D { 1 /*content*/, 0 /*checksum*/= , 0 /*noDictID*/ }; - return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); + return ZSTD_compress_usingCDict_internal(cctx, dst, dstCapacity, src, = srcSize, cdict, fParams); } =20 =20 @@ -4313,8 +5429,13 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CC= tx* cctx, FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local di= ct if present. */ ZSTD_memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* sing= le usage */ assert(prefixDict.dict=3D=3DNULL || cctx->cdict=3D=3DNULL); /* only= one can be set */ - if (cctx->cdict) - params.compressionLevel =3D cctx->cdict->compressionLevel; /* let = cdict take priority in terms of compression level */ + if (cctx->cdict && !cctx->localDict.cdict) { + /* Let the cdict's compression level take priority over the reques= ted params. + * But do not take the cdict's compression level if the "cdict" is= actually a localDict + * generated from ZSTD_initLocalDict(). 
+ */ + params.compressionLevel =3D cctx->cdict->compressionLevel; + } DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); if (endOp =3D=3D ZSTD_e_end) cctx->pledgedSrcSizePlusOne =3D inSize + = 1; /* auto-fix pledgedSrcSize */ { @@ -4327,11 +5448,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CC= tx* cctx, dictSize, mode); } =20 - if (ZSTD_CParams_shouldEnableLdm(¶ms.cParams)) { - /* Enable LDM by default for optimal parser and window size >=3D 1= 28MB */ - DEBUGLOG(4, "LDM enabled by default (window size >=3D 128MB, strat= egy >=3D btopt)"); - params.ldmParams.enableLdm =3D 1; - } + params.useBlockSplitter =3D ZSTD_resolveBlockSplitterMode(params.useBl= ockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm =3D ZSTD_resolveEnableLdm(params.ldmParams.= enableLdm, ¶ms.cParams); + params.useRowMatchFinder =3D ZSTD_resolveRowMatchFinderMode(params.use= RowMatchFinder, ¶ms.cParams); =20 { U64 const pledgedSrcSize =3D cctx->pledgedSrcSizePlusOne - 1; assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); @@ -4436,39 +5555,39 @@ typedef struct { size_t posInSrc; /* Number of bytes given by sequences provided= so far */ } ZSTD_sequencePosition; =20 -/* Returns a ZSTD error code if sequence is not valid */ -static size_t ZSTD_validateSequence(U32 offCode, U32 matchLength, - size_t posInSrc, U32 windowLog, size_t= dictSize, U32 minMatch) { - size_t offsetBound; - U32 windowSize =3D 1 << windowLog; - /* posInSrc represents the amount of data the the decoder would decode= up to this point. +/* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ +static size_t +ZSTD_validateSequence(U32 offCode, U32 matchLength, + size_t posInSrc, U32 windowLog, size_t dictSize) +{ + U32 const windowSize =3D 1 << windowLog; + /* posInSrc represents the amount of data the decoder would decode up = to this point. 
* As long as the amount of data decoded is less than or equal to wind= ow size, offsets may be * larger than the total length of output decoded in order to referenc= e the dict, even larger than * window size. After output surpasses windowSize, we're limited to wi= ndowSize offsets again. */ - offsetBound =3D posInSrc > windowSize ? (size_t)windowSize : posInSrc = + (size_t)dictSize; - RETURN_ERROR_IF(offCode > offsetBound + ZSTD_REP_MOVE, corruption_dete= cted, "Offset too large!"); - RETURN_ERROR_IF(matchLength < minMatch, corruption_detected, "Matchlen= gth too small"); + size_t const offsetBound =3D posInSrc > windowSize ? (size_t)windowSiz= e : posInSrc + (size_t)dictSize; + RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detect= ed, "Offset too large!"); + RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlen= gth too small"); return 0; } =20 /* Returns an offset code, given a sequence's raw offset, the ongoing repc= ode array, and whether litLength =3D=3D 0 */ -static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM]= , U32 ll0) { - U32 offCode =3D rawOffset + ZSTD_REP_MOVE; - U32 repCode =3D 0; +static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM]= , U32 ll0) +{ + U32 offCode =3D STORE_OFFSET(rawOffset); =20 if (!ll0 && rawOffset =3D=3D rep[0]) { - repCode =3D 1; + offCode =3D STORE_REPCODE_1; } else if (rawOffset =3D=3D rep[1]) { - repCode =3D 2 - ll0; + offCode =3D STORE_REPCODE(2 - ll0); } else if (rawOffset =3D=3D rep[2]) { - repCode =3D 3 - ll0; + offCode =3D STORE_REPCODE(3 - ll0); } else if (ll0 && rawOffset =3D=3D rep[0] - 1) { - repCode =3D 3; - } - if (repCode) { - /* ZSTD_storeSeq expects a number in the range [0, 2] to represent= a repcode */ - offCode =3D repCode - 1; + offCode =3D STORE_REPCODE_3; } return offCode; } @@ -4476,18 +5595,17 @@ static U32 ZSTD_finalizeOffCode(U32 rawOffset, cons= t U32 rep[ZSTD_REP_NUM], U32 /* Returns 0 on success, and a ZSTD_error 
otherwise. This function scans t= hrough an array of * ZSTD_Sequence, storing the sequences it finds, until it reaches a block= delimiter. */ -static size_t ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cc= tx, ZSTD_sequencePosition* seqPos, - const ZSTD_Se= quence* const inSeqs, size_t inSeqsSize, - const void* s= rc, size_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPo= s, + const ZSTD_Sequence* const inSeqs,= size_t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx =3D seqPos->idx; BYTE const* ip =3D (BYTE const*)(src); const BYTE* const iend =3D ip + blockSize; repcodes_t updatedRepcodes; U32 dictSize; - U32 litLength; - U32 matchLength; - U32 ll0; - U32 offCode; =20 if (cctx->cdict) { dictSize =3D (U32)cctx->cdict->dictContentSize; @@ -4498,23 +5616,22 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitB= lockDelim(ZSTD_CCtx* cctx, ZS } ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, siz= eof(repcodes_t)); for (; (inSeqs[idx].matchLength !=3D 0 || inSeqs[idx].offset !=3D 0) &= & idx < inSeqsSize; ++idx) { - litLength =3D inSeqs[idx].litLength; - matchLength =3D inSeqs[idx].matchLength; - ll0 =3D litLength =3D=3D 0; - offCode =3D ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcod= es.rep, ll0); - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, offCode, l= l0); + U32 const litLength =3D inSeqs[idx].litLength; + U32 const ll0 =3D (litLength =3D=3D 0); + U32 const matchLength =3D inSeqs[idx].matchLength; + U32 const offCode =3D ZSTD_finalizeOffCode(inSeqs[idx].offset, upd= atedRepcodes.rep, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); =20 DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode,= matchLength, litLength); if (cctx->appliedParams.validateSequences) { seqPos->posInSrc +=3D litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, s= eqPos->posInSrc, - 
cctx->appliedParams.cParam= s.windowLog, dictSize, - cctx->appliedParams.cParam= s.minMatch), + cctx->appliedParams.cParam= s.windowLog, dictSize), "Sequence validation faile= d"); } RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memor= y_allocation, "Not enough memory allocated. Try adjusting ZSTD_c= _minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length); ip +=3D matchLength + litLength; } ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, siz= eof(repcodes_t)); @@ -4541,9 +5658,11 @@ static size_t ZSTD_copySequencesToSeqStoreExplicitBl= ockDelim(ZSTD_CCtx* cctx, ZS * avoid splitting a match, or to avoid splitting a match such that it wou= ld produce a match * smaller than MINMATCH. In this case, we return the number of bytes that= we didn't read from this block. */ -static size_t ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZS= TD_sequencePosition* seqPos, - const ZSTD_Sequence= * const inSeqs, size_t inSeqsSize, - const void* src, si= ze_t blockSize) { +static size_t +ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePos= ition* seqPos, + const ZSTD_Sequence* const inSeqs, size= _t inSeqsSize, + const void* src, size_t blockSize) +{ U32 idx =3D seqPos->idx; U32 startPosInSequence =3D seqPos->posInSequence; U32 endPosInSequence =3D seqPos->posInSequence + (U32)blockSize; @@ -4553,10 +5672,6 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDel= im(ZSTD_CCtx* cctx, ZSTD_seq repcodes_t updatedRepcodes; U32 bytesAdjustment =3D 0; U32 finalMatchSplit =3D 0; - U32 litLength; - U32 matchLength; - U32 rawOffset; - U32 offCode; =20 if (cctx->cdict) { dictSize =3D cctx->cdict->dictContentSize; @@ -4570,9 +5685,10 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDel= im(ZSTD_CCtx* cctx, ZSTD_seq ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, siz= 
eof(repcodes_t)); while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { const ZSTD_Sequence currSeq =3D inSeqs[idx]; - litLength =3D currSeq.litLength; - matchLength =3D currSeq.matchLength; - rawOffset =3D currSeq.offset; + U32 litLength =3D currSeq.litLength; + U32 matchLength =3D currSeq.matchLength; + U32 const rawOffset =3D currSeq.offset; + U32 offCode; =20 /* Modify the sequence depending on where endPosInSequence lies */ if (endPosInSequence >=3D currSeq.litLength + currSeq.matchLength)= { @@ -4625,22 +5741,21 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDe= lim(ZSTD_CCtx* cctx, ZSTD_seq } } /* Check if this offset can be represented with a repcode */ - { U32 ll0 =3D (litLength =3D=3D 0); + { U32 const ll0 =3D (litLength =3D=3D 0); offCode =3D ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.re= p, ll0); - updatedRepcodes =3D ZSTD_updateRep(updatedRepcodes.rep, offCod= e, ll0); + ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); } =20 if (cctx->appliedParams.validateSequences) { seqPos->posInSrc +=3D litLength + matchLength; FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, s= eqPos->posInSrc, - cctx->appliedParams.cPa= rams.windowLog, dictSize, - cctx->appliedParams.cPa= rams.minMatch), + cctx->appliedParams.cPa= rams.windowLog, dictSize), "Sequence validation fa= iled"); } DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode,= matchLength, litLength); RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memor= y_allocation, "Not enough memory allocated. 
Try adjusting ZSTD_c= _minMatch."); - ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length - MINMATCH); + ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, match= Length); ip +=3D matchLength + litLength; } DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[= idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); @@ -4665,7 +5780,8 @@ static size_t ZSTD_copySequencesToSeqStoreNoBlockDeli= m(ZSTD_CCtx* cctx, ZSTD_seq typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosit= ion* seqPos, const ZSTD_Sequence* const inSeqs, = size_t inSeqsSize, const void* src, size_t blockSize); -static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e= mode) { +static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e= mode) +{ ZSTD_sequenceCopier sequenceCopier =3D NULL; assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); if (mode =3D=3D ZSTD_sf_explicitBlockDelimiters) { @@ -4679,12 +5795,15 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopie= r(ZSTD_sequenceFormat_e mode) =20 /* Compress, block-by-block, all of the sequences given. * - * Returns the cumulative size of all compressed blocks (including their h= eaders), otherwise a ZSTD error. + * Returns the cumulative size of all compressed blocks (including their h= eaders), + * otherwise a ZSTD error. 
*/ -static size_t ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, - void* dst, size_t dstCapacit= y, - const ZSTD_Sequence* inSeqs,= size_t inSeqsSize, - const void* src, size_t srcS= ize) { +static size_t +ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) +{ size_t cSize =3D 0; U32 lastBlock; size_t blockSize; @@ -4694,7 +5813,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, =20 BYTE const* ip =3D (BYTE const*)src; BYTE* op =3D (BYTE*)dst; - ZSTD_sequenceCopier sequenceCopier =3D ZSTD_selectSequenceCopier(cctx-= >appliedParams.blockDelimiters); + ZSTD_sequenceCopier const sequenceCopier =3D ZSTD_selectSequenceCopier= (cctx->appliedParams.blockDelimiters); =20 DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize:= %zu", srcSize, inSeqsSize); /* Special case: empty frame */ @@ -4732,7 +5851,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, continue; } =20 - compressedSeqsSize =3D ZSTD_entropyCompressSequences(&cctx->seqSto= re, + compressedSeqsSize =3D ZSTD_entropyCompressSeqStore(&cctx->seqStor= e, &cctx->blockState.prevCBlock->entropy, &cc= tx->blockState.nextCBlock->entropy, &cctx->appliedParams, op + ZSTD_blockHeaderSize /* Leave space f= or block header */, dstCapacity - ZSTD_blockHeaderSize, @@ -4764,7 +5883,7 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, } else { U32 cBlockHeader; /* Error checking and repcodes update */ - ZSTD_confirmRepcodesAndEntropyTables(cctx); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockSt= ate); if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMod= e =3D=3D FSE_repeat_valid) cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMod= e =3D FSE_repeat_check; =20 @@ -4794,7 +5913,8 @@ static size_t ZSTD_compressSequences_internal(ZSTD_CC= tx* cctx, =20 size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, 
size_t dst= Capacity, const ZSTD_Sequence* inSeqs, size_t inSeqsSi= ze, - const void* src, size_t srcSize) { + const void* src, size_t srcSize) +{ BYTE* op =3D (BYTE*)dst; size_t cSize =3D 0; size_t compressedBlocksSize =3D 0; @@ -4861,117 +5981,11 @@ size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outB= uffer* output) =20 =20 /*-=3D=3D=3D=3D=3D Pre-defined compression levels =3D=3D=3D=3D=3D-*/ +#include "clevels.h" =20 -#define ZSTD_MAX_CLEVEL 22 int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; } int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; } - -static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MA= X_CLEVEL+1] =3D { -{ /* "default" - for any srcSize > 256 KB */ - /* W, C, H, S, L, TL, strat */ - { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels= */ - { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */ - { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */ - { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */ - { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */ - { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */ - { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */ - { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */ - { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */ - { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */ - { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */ - { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */ - { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */ - { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */ - { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */ - { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */ - { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */ - { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */ - { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */ - { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */ - { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */ - { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */ - { 27, 27, 25, 9, 3,999, 
ZSTD_btultra2}, /* level 22 */ -}, -{ /* for srcSize <=3D 256 KB */ - /* W, C, H, S, L, T, strat */ - { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */ - { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */ - { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/ - { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/ - { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/ - { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */ - { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/ - { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/ - { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */ - { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/ - { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <=3D 128 KB */ - /* W, C, H, S, L, T, strat */ - { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */ - { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */ - { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */ - { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */ - { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */ - { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */ - { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */ - { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */ - { 
17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */ - { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */ - { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */ - { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/ - { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/ - { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/ - { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/ - { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/ - { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/ - { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/ - { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -{ /* for srcSize <=3D 16 KB */ - /* W, C, H, S, L, T, strat */ - { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels= */ - { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */ - { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */ - { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */ - { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */ - { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/ - { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */ - { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */ - { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/ - { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/ - { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/ - { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/ - { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/ - { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/ - { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/ - { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/ - { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/ - { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/ - { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/ - { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/ - { 14, 15, 15, 9, 3,512, 
ZSTD_btultra2}, /* level 21.*/ - { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/ -}, -}; +int ZSTD_defaultCLevel(void) { return ZSTD_CLEVEL_DEFAULT; } =20 static ZSTD_compressionParameters ZSTD_dedicatedDictSearch_getCParams(int = const compressionLevel, size_t const dictSize) { @@ -4999,7 +6013,7 @@ static int ZSTD_dedicatedDictSearch_isSupported( { return (cParams->strategy >=3D ZSTD_greedy) && (cParams->strategy <=3D ZSTD_lazy2) - && (cParams->hashLog >=3D cParams->chainLog) + && (cParams->hashLog > cParams->chainLog) && (cParams->chainLog <=3D 24); } =20 @@ -5018,6 +6032,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( case ZSTD_lazy: case ZSTD_lazy2: cParams->hashLog -=3D ZSTD_LAZY_DDSS_BUCKET_LOG; + if (cParams->hashLog < ZSTD_HASHLOG_MIN) { + cParams->hashLog =3D ZSTD_HASHLOG_MIN; + } break; case ZSTD_btlazy2: case ZSTD_btopt: @@ -5066,6 +6083,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_int= ernal(int compressionLevel, else row =3D compressionLevel; =20 { ZSTD_compressionParameters cp =3D ZSTD_defaultCParameters[tableID]= [row]; + DEBUGLOG(5, "ZSTD_getCParams_internal selected tableID: %u row: %u= strat: %u", tableID, row, (U32)cp.strategy); /* acceleration factor */ if (compressionLevel < 0) { int const clampedCompressionLevel =3D MAX(ZSTD_minCLevel(), co= mpressionLevel); diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress= /zstd_compress_internal.h index 685d2f996cc2..71697a11ae30 100644 --- a/lib/zstd/compress/zstd_compress_internal.h +++ b/lib/zstd/compress/zstd_compress_internal.h @@ -57,7 +57,7 @@ typedef struct { } ZSTD_localDict; =20 typedef struct { - HUF_CElt CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_CElt CTable[HUF_CTABLE_SIZE_ST(255)]; HUF_repeat repeatMode; } ZSTD_hufCTables_t; =20 @@ -75,8 +75,55 @@ typedef struct { ZSTD_fseCTables_t fse; } ZSTD_entropyCTables_t; =20 +/* ********************************************* +* Entropy buffer statistics structs and funcs * 
+***********************************************/ +/* ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ typedef struct { - U32 off; /* Offset code (offset + ZSTD_REP_MOVE) for the ma= tch */ + symbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/* ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type,= and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() = */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More det= ail in ZSTD_entropyCompressSeqStore_internal() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + +/* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * @return : 0 on success or error code */ +size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_entropyCTablesMetadata_t* entropyM= etadata, + void* workspace, size_t wkspSize); + +/* ******************************* +* Compression internals structs * +*********************************/ + +typedef struct { + U32 off; /* Offset sumtype code for the match, using ZSTD_s= toreSeq() format */ U32 len; /* Raw length of match */ } ZSTD_match_t; =20 @@ -126,7 +173,7 @@ typedef struct { U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, = or follow a pre-defined cost structure */ const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionar= y statistics */ - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; } optState_t; =20 typedef struct { @@ -135,14 +182,23 @@ typedef struct { } ZSTD_compressedBlockState_t; =20 typedef struct { - BYTE const* nextSrc; /* next block here to continue on current pref= ix */ - BYTE const* base; /* All regular indexes relative to this positi= on */ - BYTE const* dictBase; /* extDict indexes relative to this position */ - U32 dictLimit; /* below that point, need extDict */ - U32 lowLimit; /* below that point, no more valid data */ + BYTE const* nextSrc; /* next block here to continue on current p= refix */ + BYTE const* base; /* All regular indexes relative to this pos= ition */ + BYTE const* dictBase; /* extDict indexes relative to this positio= n */ + U32 dictLimit; /* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ + U32 nbOverflowCorrections; /* Number of times overflow correction has = run since + * ZSTD_window_init(). Useful for debugging= coredumps + * and for ZSTD_WINDOW_OVERFLOW_CORRECT_FRE= QUENTLY. 
+ */ } ZSTD_window_t; =20 +#define ZSTD_WINDOW_START_INDEX 2 + typedef struct ZSTD_matchState_t ZSTD_matchState_t; + +#define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache= for row-based matchfinder */ + struct ZSTD_matchState_t { ZSTD_window_t window; /* State for window round buffer management */ U32 loadedDictEnd; /* index of end of dictionary, within context'= s referential. @@ -154,9 +210,17 @@ struct ZSTD_matchState_t { */ U32 nextToUpdate; /* index from which to continue table update */ U32 hashLog3; /* dispatch table for matches of len=3D=3D3 : = larger =3D=3D faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder:= Hashlog based on nb of rows in the hashTable.*/ + U16* tagTable; /* For row-based matchFinder:= A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder:= a cache of hashes to improve speed */ + U32* hashTable; U32* hashTable3; U32* chainTable; + + U32 forceNonContiguous; /* Non-zero if we should force non-contiguous = load for the next window update. */ + int dedicatedDictSearch; /* Indicates whether this matchState is usin= g the * dedicated dictionary search structure. */ @@ -196,7 +260,7 @@ typedef struct { } ldmState_t; =20 typedef struct { - U32 enableLdm; /* 1 if enable long distance matching */ + ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps= _auto by default */ U32 hashLog; /* Log size of hashTable */ U32 bucketSizeLog; /* Log bucket size for collision resolution, a= t most 8 */ U32 minMatchLength; /* Minimum match length */ @@ -227,7 +291,7 @@ struct ZSTD_CCtx_params_s { * There is no guarantee that hint is close= to actual source size */ =20 ZSTD_dictAttachPref_e attachDictPref; - ZSTD_literalCompressionMode_e literalCompressionMode; + ZSTD_paramSwitch_e literalCompressionMode; =20 /* Multithreading: used to pass parameters to mtctx */ int nbWorkers; @@ -249,6 +313,15 @@ struct ZSTD_CCtx_params_s { ZSTD_sequenceFormat_e blockDelimiters; int validateSequences; =20 + /* Block splitting */ + ZSTD_paramSwitch_e useBlockSplitter; + + /* Param for deciding whether to use row-based matchfinder */ + ZSTD_paramSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ ZSTD_customMem customMem; }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ @@ -266,12 +339,29 @@ typedef enum { ZSTDb_buffered } ZSTD_buffered_policy_e; =20 +/* + * Struct that contains all elements of block splitter that should be allo= cated + * in a wksp. + */ +#define ZSTD_MAX_NB_BLOCK_SPLITS 196 +typedef struct { + seqStore_t fullSeqStoreChunk; + seqStore_t firstHalfSeqStore; + seqStore_t secondHalfSeqStore; + seqStore_t currSeqStore; + seqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +} ZSTD_blockSplitCtx; + struct ZSTD_CCtx_s { ZSTD_compressionStage_e stage; int cParamsChanged; /* =3D=3D 1 if cParams(except wlo= g) or compression level are changed in requestedParams. Triggers transmissi= on of new params to ZSTDMT (if available) then reset to 0. */ int bmi2; /* =3D=3D 1 if the CPU supports B= MI2 and 0 otherwise. CPU support is determined dynamically once per context= lifetime. 
*/ ZSTD_CCtx_params requestedParams; ZSTD_CCtx_params appliedParams; + ZSTD_CCtx_params simpleApiParams; /* Param storage used by the simp= le API - not sticky. Must only be used in top-level simple API functions fo= r storage. */ U32 dictID; size_t dictContentSize; =20 @@ -296,7 +386,7 @@ struct ZSTD_CCtx_s { ZSTD_blockState_t blockState; U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE= bytes */ =20 - /* Wether we are streaming or not */ + /* Whether we are streaming or not */ ZSTD_buffered_policy_e bufferedPolicy; =20 /* streaming */ @@ -324,6 +414,9 @@ struct ZSTD_CCtx_s { /* Multi-threading */ =20 /* Tracing */ + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; }; =20 typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; @@ -358,7 +451,7 @@ typedef enum { typedef size_t (*ZSTD_blockCompressor) ( ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); -ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= dictMode_e dictMode); +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_= paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); =20 =20 MEM_STATIC U32 ZSTD_LLcode(U32 litLength) @@ -392,31 +485,6 @@ MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Cod= e[mlBase]; } =20 -typedef struct repcodes_s { - U32 rep[3]; -} repcodes_t; - -MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U= 32 const ll0) -{ - repcodes_t newReps; - if (offset >=3D ZSTD_REP_NUM) { /* full offset */ - newReps.rep[2] =3D rep[1]; - newReps.rep[1] =3D rep[0]; - newReps.rep[0] =3D offset - ZSTD_REP_MOVE; - } else { /* repcode */ - U32 const repCode =3D offset + ll0; - if (repCode > 0) { /* note : if repCode=3D=3D0, no change */ - U32 const currentOffset =3D (repCode=3D=3DZSTD_REP_NUM) ? 
(rep= [0] - 1) : rep[repCode]; - newReps.rep[2] =3D (repCode >=3D 2) ? rep[1] : rep[2]; - newReps.rep[1] =3D rep[0]; - newReps.rep[0] =3D currentOffset; - } else { /* repCode =3D=3D 0 */ - ZSTD_memcpy(&newReps, rep, sizeof(newReps)); - } - } - return newReps; -} - /* ZSTD_cParam_withinBounds: * @return 1 if value is within cParam bounds, * 0 otherwise */ @@ -465,17 +533,17 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_s= trategy strat) return (srcSize >> minlog) + 2; } =20 -MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cct= xParams) +MEM_STATIC int ZSTD_literalsCompressionIsDisabled(const ZSTD_CCtx_params* = cctxParams) { switch (cctxParams->literalCompressionMode) { - case ZSTD_lcm_huffman: + case ZSTD_ps_enable: return 0; - case ZSTD_lcm_uncompressed: + case ZSTD_ps_disable: return 1; default: assert(0 /* impossible: pre-validated */); ZSTD_FALLTHROUGH; - case ZSTD_lcm_auto: + case ZSTD_ps_auto: return (cctxParams->cParams.strategy =3D=3D ZSTD_fast) && (cctxPar= ams->cParams.targetLength > 0); } } @@ -485,7 +553,9 @@ MEM_STATIC int ZSTD_disableLiteralsCompression(const ZS= TD_CCtx_params* cctxParam * Only called when the sequence ends past ilimit_w, so it only needs to = be optimized for single * large copies. 
*/ -static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* co= nst iend, BYTE const* ilimit_w) { +static void +ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BY= TE const* ilimit_w) +{ assert(iend > ilimit_w); if (ip <=3D ilimit_w) { ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); @@ -495,14 +565,30 @@ static void ZSTD_safecopyLiterals(BYTE* op, BYTE cons= t* ip, BYTE const* const ie while (ip < iend) *op++ =3D *ip++; } =20 +#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +#define STORE_REPCODE_1 STORE_REPCODE(1) +#define STORE_REPCODE_2 STORE_REPCODE(2) +#define STORE_REPCODE_3 STORE_REPCODE(3) +#define STORE_REPCODE(r) (assert((r)>=3D1), assert((r)<=3D3), (r)-1) +#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +#define STORED_IS_REPCODE(o) ((o) <=3D ZSTD_REP_MOVE) +#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* return= s ID 1,2,3 */ +#define STORED_TO_OFFBASE(o) ((o)+1) +#define OFFBASE_TO_STORED(o) ((o)-1) + /*! ZSTD_storeSeq() : - * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. - * `offCode` : distance to match + ZSTD_REP_MOVE (values <=3D ZSTD_REP_MO= VE are repCodes). - * `mlBase` : matchLength - MINMATCH + * Store a sequence (litlen, litPtr, offCode and matchLength) into seqSto= re_t. + * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and S= TORE_OFFSET(). + * @matchLength : must be >=3D MINMATCH * Allowed to overread literals up to litLimit. 
*/ -HINT_INLINE UNUSED_ATTR -void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* = literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +HINT_INLINE UNUSED_ATTR void +ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, + U32 offBase_minus1, + size_t matchLength) { BYTE const* const litLimit_w =3D litLimit - WILDCOPY_OVERLENGTH; BYTE const* const litEnd =3D literals + litLength; @@ -511,7 +597,7 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litL= ength, const BYTE* litera if (g_start=3D=3DNULL) g_start =3D (const BYTE*)literals; /* note : i= ndex only works for compression within a single segment */ { U32 const pos =3D (U32)((const BYTE*)literals - g_start); DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", - pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); } #endif assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) = < seqStorePtr->maxNbSeq); @@ -535,26 +621,66 @@ void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t li= tLength, const BYTE* litera =20 /* literal Length */ if (litLength>0xFFFF) { - assert(seqStorePtr->longLengthID =3D=3D 0); /* there can only be a= single long length */ - seqStorePtr->longLengthID =3D 1; + assert(seqStorePtr->longLengthType =3D=3D ZSTD_llt_none); /* there= can only be a single long length */ + seqStorePtr->longLengthType =3D ZSTD_llt_literalLength; seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - seqS= torePtr->sequencesStart); } seqStorePtr->sequences[0].litLength =3D (U16)litLength; =20 /* match offset */ - seqStorePtr->sequences[0].offset =3D offCode + 1; + seqStorePtr->sequences[0].offBase =3D STORED_TO_OFFBASE(offBase_minus1= ); =20 /* match Length */ - if (mlBase>0xFFFF) { - assert(seqStorePtr->longLengthID =3D=3D 0); /* there can only be a= single long length */ - seqStorePtr->longLengthID =3D 2; - 
seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - seqS= torePtr->sequencesStart); + assert(matchLength >=3D MINMATCH); + { size_t const mlBase =3D matchLength - MINMATCH; + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthType =3D=3D ZSTD_llt_none); /* t= here can only be a single long length */ + seqStorePtr->longLengthType =3D ZSTD_llt_matchLength; + seqStorePtr->longLengthPos =3D (U32)(seqStorePtr->sequences - = seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].mlBase =3D (U16)mlBase; } - seqStorePtr->sequences[0].matchLength =3D (U16)mlBase; =20 seqStorePtr->sequences++; } =20 +/* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) + * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_st= oreSeq() + */ +MEM_STATIC void +ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const = ll0) +{ + if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ + rep[2] =3D rep[1]; + rep[1] =3D rep[0]; + rep[0] =3D STORED_OFFSET(offBase_minus1); + } else { /* repcode */ + U32 const repCode =3D STORED_REPCODE(offBase_minus1) - 1 + ll0; + if (repCode > 0) { /* note : if repCode=3D=3D0, no change */ + U32 const currentOffset =3D (repCode=3D=3DZSTD_REP_NUM) ? (rep= [0] - 1) : rep[repCode]; + rep[2] =3D (repCode >=3D 2) ? 
rep[1] : rep[2]; + rep[1] =3D rep[0]; + rep[0] =3D currentOffset; + } else { /* repCode =3D=3D 0 */ + /* nothing to do */ + } + } +} + +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t +ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 con= st ll0) +{ + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); + ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); + return newReps; +} + =20 /*-************************************* * Match length counter @@ -778,6 +904,13 @@ MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* windo= w) window->dictLimit =3D end; } =20 +MEM_STATIC U32 ZSTD_window_isEmpty(ZSTD_window_t const window) +{ + return window.dictLimit =3D=3D ZSTD_WINDOW_START_INDEX && + window.lowLimit =3D=3D ZSTD_WINDOW_START_INDEX && + (window.nextSrc - window.base) =3D=3D ZSTD_WINDOW_START_INDEX; +} + /* * ZSTD_window_hasExtDict(): * Returns non-zero if the window has a non-empty extDict. @@ -801,15 +934,71 @@ MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(c= onst ZSTD_matchState_t *ms) ZSTD_noDict; } =20 +/* Defining this macro to non-zero tells zstd to run the overflow correcti= on + * code much more frequently. This is very inefficient, and should only be + * used for tests and fuzzers. + */ +#ifndef ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY +# ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 1 +# else +# define ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY 0 +# endif +#endif + +/* + * ZSTD_window_canOverflowCorrect(): + * Returns non-zero if the indices are large enough for overflow correction + * to work correctly without impacting compression ratio. 
+ */ +MEM_STATIC U32 ZSTD_window_canOverflowCorrect(ZSTD_window_t const window, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src) +{ + U32 const cycleSize =3D 1u << cycleLog; + U32 const curr =3D (U32)((BYTE const*)src - window.base); + U32 const minIndexToOverflowCorrect =3D cycleSize + + MAX(maxDist, cycleSize) + + ZSTD_WINDOW_START_INDEX; + + /* Adjust the min index to backoff the overflow correction frequency, + * so we don't waste too much CPU in overflow correction. If this + * computation overflows we don't really care, we just need to make + * sure it is at least minIndexToOverflowCorrect. + */ + U32 const adjustment =3D window.nbOverflowCorrections + 1; + U32 const adjustedIndex =3D MAX(minIndexToOverflowCorrect * adjustment, + minIndexToOverflowCorrect); + U32 const indexLargeEnough =3D curr > adjustedIndex; + + /* Only overflow correct early if the dictionary is invalidated alread= y, + * so we don't hurt compression ratio. + */ + U32 const dictionaryInvalidated =3D curr > maxDist + loadedDictEnd; + + return indexLargeEnough && dictionaryInvalidated; +} + /* * ZSTD_window_needOverflowCorrection(): * Returns non-zero if the indices are getting too large and need overflow * protection. */ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const wind= ow, + U32 cycleLog, + U32 maxDist, + U32 loadedDictEnd, + void const* src, void const* srcEnd) { U32 const curr =3D (U32)((BYTE const*)srcEnd - window.base); + if (ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + if (ZSTD_window_canOverflowCorrect(window, cycleLog, maxDist, load= edDictEnd, src)) { + return 1; + } + } return curr > ZSTD_CURRENT_MAX; } =20 @@ -821,7 +1010,6 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD= _window_t const window, * * The least significant cycleLog bits of the indices must remain the same, * which may be 0. Every index up to maxDist in the past must be valid. - * NOTE: (maxDist & cycleMask) must be zero. 
*/ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycl= eLog, U32 maxDist, void const* src) @@ -845,32 +1033,52 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_wind= ow_t* window, U32 cycleLog, * 3. (cctx->lowLimit + 1< 3<<29 + 1<base); - U32 const currentCycle0 =3D curr & cycleMask; - /* Exclude zero so that newCurrent - maxDist >=3D 1. */ - U32 const currentCycle1 =3D currentCycle0 =3D=3D 0 ? (1U << cycleLog) = : currentCycle0; - U32 const newCurrent =3D currentCycle1 + maxDist; + U32 const currentCycle =3D curr & cycleMask; + /* Ensure newCurrent - maxDist >=3D ZSTD_WINDOW_START_INDEX. */ + U32 const currentCycleCorrection =3D currentCycle < ZSTD_WINDOW_START_= INDEX + ? MAX(cycleSize, ZSTD_WINDOW_START_IN= DEX) + : 0; + U32 const newCurrent =3D currentCycle + + currentCycleCorrection + + MAX(maxDist, cycleSize); U32 const correction =3D curr - newCurrent; - assert((maxDist & cycleMask) =3D=3D 0); + /* maxDist must be a power of two so that: + * (newCurrent & cycleMask) =3D=3D (curr & cycleMask) + * This is required to not corrupt the chains / binary tree. 
+ */ + assert((maxDist & (maxDist - 1)) =3D=3D 0); + assert((curr & cycleMask) =3D=3D (newCurrent & cycleMask)); assert(curr > newCurrent); - /* Loose bound, should be around 1<<29 (see above) */ - assert(correction > 1<<28); + if (!ZSTD_WINDOW_OVERFLOW_CORRECT_FREQUENTLY) { + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + } =20 window->base +=3D correction; window->dictBase +=3D correction; - if (window->lowLimit <=3D correction) window->lowLimit =3D 1; - else window->lowLimit -=3D correction; - if (window->dictLimit <=3D correction) window->dictLimit =3D 1; - else window->dictLimit -=3D correction; + if (window->lowLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->lowLimit =3D ZSTD_WINDOW_START_INDEX; + } else { + window->lowLimit -=3D correction; + } + if (window->dictLimit < correction + ZSTD_WINDOW_START_INDEX) { + window->dictLimit =3D ZSTD_WINDOW_START_INDEX; + } else { + window->dictLimit -=3D correction; + } =20 /* Ensure we can still reference the full window. */ assert(newCurrent >=3D maxDist); - assert(newCurrent - maxDist >=3D 1); + assert(newCurrent - maxDist >=3D ZSTD_WINDOW_START_INDEX); /* Ensure that lowLimit and dictLimit didn't underflow. 
*/ assert(window->lowLimit <=3D newCurrent); assert(window->dictLimit <=3D newCurrent); =20 + ++window->nbOverflowCorrections; + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=3D0x%x", correction, window->lowLimit); return correction; @@ -975,11 +1183,13 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, =20 MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { ZSTD_memset(window, 0, sizeof(*window)); - window->base =3D (BYTE const*)""; - window->dictBase =3D (BYTE const*)""; - window->dictLimit =3D 1; /* start from 1, so that 1st position is v= alid */ - window->lowLimit =3D 1; /* it ensures first and later CCtx usages = compress the same */ - window->nextSrc =3D window->base + 1; /* see issue #1241 */ + window->base =3D (BYTE const*)" "; + window->dictBase =3D (BYTE const*)" "; + ZSTD_STATIC_ASSERT(ZSTD_DUBT_UNSORTED_MARK < ZSTD_WINDOW_START_INDEX);= /* Start above ZSTD_DUBT_UNSORTED_MARK */ + window->dictLimit =3D ZSTD_WINDOW_START_INDEX; /* start from >0, so= that 1st position is valid */ + window->lowLimit =3D ZSTD_WINDOW_START_INDEX; /* it ensures first = and later CCtx usages compress the same */ + window->nextSrc =3D window->base + ZSTD_WINDOW_START_INDEX; /* see i= ssue #1241 */ + window->nbOverflowCorrections =3D 0; } =20 /* @@ -990,7 +1200,8 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window= ) { * Returns non-zero if the segment is contiguous. 
*/ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, - void const* src, size_t srcSize) + void const* src, size_t srcSize, + int forceNonContiguous) { BYTE const* const ip =3D (BYTE const*)src; U32 contiguous =3D 1; @@ -1000,7 +1211,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* wind= ow, assert(window->base !=3D NULL); assert(window->dictBase !=3D NULL); /* Check if blocks follow each other */ - if (src !=3D window->nextSrc) { + if (src !=3D window->nextSrc || forceNonContiguous) { /* not contiguous */ size_t const distanceFromBase =3D (size_t)(window->nextSrc - windo= w->base); DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", win= dow->dictLimit); @@ -1030,15 +1241,15 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* wi= ndow, */ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 c= urr, unsigned windowLog) { - U32 const maxDistance =3D 1U << windowLog; - U32 const lowestValid =3D ms->window.lowLimit; - U32 const withinWindow =3D (curr - lowestValid > maxDistance) ? cur= r - maxDistance : lowestValid; - U32 const isDictionary =3D (ms->loadedDictEnd !=3D 0); + U32 const maxDistance =3D 1U << windowLog; + U32 const lowestValid =3D ms->window.lowLimit; + U32 const withinWindow =3D (curr - lowestValid > maxDistance) ? curr -= maxDistance : lowestValid; + U32 const isDictionary =3D (ms->loadedDictEnd !=3D 0); /* When using a dictionary the entire dictionary is valid if a single = byte of the dictionary * is within the window. We invalidate the dictionary (and set loadedD= ictEnd to 0) when it isn't * valid for the entire block. So this check is sufficient to find the= lowest valid match index. */ - U32 const matchLowest =3D isDictionary ? lowestValid : withinWindow; + U32 const matchLowest =3D isDictionary ? 
lowestValid : withinWindow; return matchLowest; } =20 diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress= /zstd_compress_literals.c index 655bcda4d1f1..52b0a8059aba 100644 --- a/lib/zstd/compress/zstd_compress_literals.c +++ b/lib/zstd/compress/zstd_compress_literals.c @@ -73,7 +73,8 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* pr= evHuf, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorksp= aceSize, - const int bmi2) + const int bmi2, + unsigned suspectUncompressible) { size_t const minGain =3D ZSTD_minGain(srcSize, strategy); size_t const lhSize =3D 3 + (srcSize >=3D 1 KB) + (srcSize >=3D 16 KB); @@ -105,11 +106,11 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const= * prevHuf, HUF_compress1X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspac= e, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, s= uspectUncompressible) : HUF_compress4X_repeat( ostart+lhSize, dstCapacity-lhSize, src, srcSize, HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspac= e, entropyWorkspaceSize, - (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, s= uspectUncompressible); if (repeat !=3D HUF_repeat_none) { /* reused the existing table */ DEBUGLOG(5, "Reusing previous huffman table"); @@ -117,7 +118,7 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* = prevHuf, } } =20 - if ((cLitSize=3D=3D0) | (cLitSize >=3D srcSize - minGain) | ERR_isErro= r(cLitSize)) { + if ((cLitSize=3D=3D0) || (cLitSize >=3D srcSize - minGain) || ERR_isEr= ror(cLitSize)) { ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress= /zstd_compress_literals.h 
index 9904c0cd30a0..9775fb97cb70 100644 --- a/lib/zstd/compress/zstd_compress_literals.h +++ b/lib/zstd/compress/zstd_compress_literals.h @@ -18,12 +18,14 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCa= pacity, const void* src, =20 size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const= void* src, size_t srcSize); =20 +/* If suspectUncompressible then some sampling checks will be run to poten= tially skip huffman coding */ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, ZSTD_hufCTables_t* nextHuf, ZSTD_strategy strategy, int disableLiteralCo= mpression, void* dst, size_t dstCapacity, const void* src, size_t srcSize, void* entropyWorkspace, size_t entropyWorksp= aceSize, - const int bmi2); + const int bmi2, + unsigned suspectUncompressible); =20 #endif /* ZSTD_COMPRESS_LITERALS_H */ diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compres= s/zstd_compress_sequences.c index dcfcdc9cc5e8..21ddc1b37acf 100644 --- a/lib/zstd/compress/zstd_compress_sequences.c +++ b/lib/zstd/compress/zstd_compress_sequences.c @@ -85,6 +85,8 @@ static size_t ZSTD_entropyCost(unsigned const* count, uns= igned const max, size_t { unsigned cost =3D 0; unsigned s; + + assert(total > 0); for (s =3D 0; s <=3D max; ++s) { unsigned norm =3D (unsigned)((256 * count[s]) / total); if (count[s] !=3D 0 && norm =3D=3D 0) @@ -273,10 +275,11 @@ ZSTD_buildCTable(void* dst, size_t dstCapacity, assert(nbSeq_1 > 1); assert(entropyWorkspaceSize >=3D sizeof(ZSTD_BuildCTableWksp)); (void)entropyWorkspaceSize; - FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, n= bSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), ""); - { size_t const NCountSize =3D FSE_writeNCount(op, oend - op, wks= p->norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(FSE_normalizeCount(wksp->norm, tableLog, count, n= bSeq_1, max, ZSTD_useLowProbCount(nbSeq_1)), "FSE_normalizeCount failed"); + assert(oend >=3D op); + { size_t const NCountSize =3D 
FSE_writeNCount(op, (size_t)(oend = - op), wksp->norm, max, tableLog); /* overflow protected */ FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); - FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, = max, tableLog, wksp->wksp, sizeof(wksp->wksp)), ""); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, wksp->norm, = max, tableLog, wksp->wksp, sizeof(wksp->wksp)), "FSE_buildCTable_wksp faile= d"); return NCountSize; } } @@ -310,19 +313,19 @@ ZSTD_encodeSequences_body( FSE_initCState2(&stateLitLength, CTable_LitLength, llCodeTable[nbS= eq-1]); BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCode= Table[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); - BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCo= deTable[nbSeq-1]]); + BIT_addBits(&blockStream, sequences[nbSeq-1].mlBase, ML_bits[mlCodeTab= le[nbSeq-1]]); if (MEM_32bits()) BIT_flushBits(&blockStream); if (longOffsets) { U32 const ofBits =3D ofCodeTable[nbSeq-1]; unsigned const extraBits =3D ofBits - MIN(ofBits, STREAM_ACCUMULAT= OR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits= ); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, extraBit= s); BIT_flushBits(&blockStream); } - BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits, + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase >> extraBits, ofBits - extraBits); } else { - BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[n= bSeq-1]); + BIT_addBits(&blockStream, sequences[nbSeq-1].offBase, ofCodeTable[= nbSeq-1]); } BIT_flushBits(&blockStream); =20 @@ -336,8 +339,8 @@ ZSTD_encodeSequences_body( U32 const mlBits =3D ML_bits[mlCode]; DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u= ", (unsigned)sequences[n].litLength, - (unsigned)sequences[n].matchLength + MINMATCH, - (unsigned)sequences[n].offset); + (unsigned)sequences[n].mlBase + MINMATCH, + (unsigned)sequences[n].offBase); = /* 32b*/ /* 64b*/ = /* 
(7)*/ /* (7)*/ FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode); = /* 15 */ /* 15 */ @@ -348,18 +351,18 @@ ZSTD_encodeSequences_body( BIT_flushBits(&blockStream); = /* (7)*/ BIT_addBits(&blockStream, sequences[n].litLength, llBits); if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&block= Stream); - BIT_addBits(&blockStream, sequences[n].matchLength, mlBits); + BIT_addBits(&blockStream, sequences[n].mlBase, mlBits); if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits= (&blockStream); if (longOffsets) { unsigned const extraBits =3D ofBits - MIN(ofBits, STREAM_A= CCUMULATOR_MIN-1); if (extraBits) { - BIT_addBits(&blockStream, sequences[n].offset, extraBi= ts); + BIT_addBits(&blockStream, sequences[n].offBase, extraB= its); BIT_flushBits(&blockStream); = /* (7)*/ } - BIT_addBits(&blockStream, sequences[n].offset >> extraBits, + BIT_addBits(&blockStream, sequences[n].offBase >> extraBit= s, ofBits - extraBits); = /* 31 */ } else { - BIT_addBits(&blockStream, sequences[n].offset, ofBits); = /* 31 */ + BIT_addBits(&blockStream, sequences[n].offBase, ofBits); = /* 31 */ } BIT_flushBits(&blockStream); = /* (7)*/ DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr -= blockStream.ptr)); @@ -396,7 +399,7 @@ ZSTD_encodeSequences_default( =20 #if DYNAMIC_BMI2 =20 -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_encodeSequences_bmi2( void* dst, size_t dstCapacity, FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compre= ss/zstd_compress_superblock.c index b0610b255653..17d836cc84e8 100644 --- a/lib/zstd/compress/zstd_compress_superblock.c +++ b/lib/zstd/compress/zstd_compress_superblock.c @@ -15,289 +15,10 @@ =20 #include "../common/zstd_internal.h" /* ZSTD_getSequenceLength */ #include "hist.h" /* HIST_countFast_wksp */ -#include "zstd_compress_internal.h" +#include "zstd_compress_internal.h" /* ZSTD_[huf|fse|entropy]CTablesMeta= 
data_t */ #include "zstd_compress_sequences.h" #include "zstd_compress_literals.h" =20 -/*-************************************* -* Superblock entropy buffer structs -***************************************/ -/* ZSTD_hufCTablesMetadata_t : - * Stores Literals Block Type for a super-block in hType, and - * huffman tree description in hufDesBuffer. - * hufDesSize refers to the size of huffman tree description in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ -typedef struct { - symbolEncodingType_e hType; - BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; - size_t hufDesSize; -} ZSTD_hufCTablesMetadata_t; - -/* ZSTD_fseCTablesMetadata_t : - * Stores symbol compression modes for a super-block in {ll, ol, ml}Type,= and - * fse tables in fseTablesBuffer. - * fseTablesSize refers to the size of fse tables in bytes. - * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() = */ -typedef struct { - symbolEncodingType_e llType; - symbolEncodingType_e ofType; - symbolEncodingType_e mlType; - BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; - size_t fseTablesSize; - size_t lastCountSize; /* This is to account for bug in 1.3.4. More det= ail in ZSTD_compressSubBlock_sequences() */ -} ZSTD_fseCTablesMetadata_t; - -typedef struct { - ZSTD_hufCTablesMetadata_t hufMetadata; - ZSTD_fseCTablesMetadata_t fseMetadata; -} ZSTD_entropyCTablesMetadata_t; - - -/* ZSTD_buildSuperBlockEntropy_literal() : - * Builds entropy for the super-block literals. - * Stores literals block type (raw, rle, compressed, repeat) and - * huffman description table to hufMetadata. 
- * @return : size of huffman description table or error code */ -static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t = srcSize, - const ZSTD_hufCTables_t* prevH= uf, - ZSTD_hufCTables_t* nextH= uf, - ZSTD_hufCTablesMetadata_= t* hufMetadata, - const int disableLiteral= sCompression, - void* workspace, size_t = wkspSize) -{ - BYTE* const wkspStart =3D (BYTE*)workspace; - BYTE* const wkspEnd =3D wkspStart + wkspSize; - BYTE* const countWkspStart =3D wkspStart; - unsigned* const countWksp =3D (unsigned*)workspace; - const size_t countWkspSize =3D (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsi= gned); - BYTE* const nodeWksp =3D countWkspStart + countWkspSize; - const size_t nodeWkspSize =3D wkspEnd-nodeWksp; - unsigned maxSymbolValue =3D 255; - unsigned huffLog =3D HUF_TABLELOG_DEFAULT; - HUF_repeat repeat =3D prevHuf->repeatMode; - - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=3D%zu)", src= Size); - - /* Prepare nextEntropy assuming reusing the existing table */ - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - - if (disableLiteralsCompression) { - DEBUGLOG(5, "set_basic - disabled"); - hufMetadata->hType =3D set_basic; - return 0; - } - - /* small ? don't even attempt compression (speed opt) */ -# define COMPRESS_LITERALS_SIZE_MIN 63 - { size_t const minLitSize =3D (prevHuf->repeatMode =3D=3D HUF_repeat= _valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; - if (srcSize <=3D minLitSize) { - DEBUGLOG(5, "set_basic - too small"); - hufMetadata->hType =3D set_basic; - return 0; - } - } - - /* Scan input and build symbol stats */ - { size_t const largest =3D HIST_count_wksp (countWksp, &maxSymbolVal= ue, (const BYTE*)src, srcSize, workspace, wkspSize); - FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); - if (largest =3D=3D srcSize) { - DEBUGLOG(5, "set_rle"); - hufMetadata->hType =3D set_rle; - return 0; - } - if (largest <=3D (srcSize >> 7)+4) { - DEBUGLOG(5, "set_basic - no gain"); - hufMetadata->hType =3D set_basic; - return 0; - } - } - - /* Validate the previous Huffman table */ - if (repeat =3D=3D HUF_repeat_check && !HUF_validateCTable((HUF_CElt co= nst*)prevHuf->CTable, countWksp, maxSymbolValue)) { - repeat =3D HUF_repeat_none; - } - - /* Build Huffman Tree */ - ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); - huffLog =3D HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); - { size_t const maxBits =3D HUF_buildCTable_wksp((HUF_CElt*)nextHuf->= CTable, countWksp, - maxSymbolValue, huffLo= g, - nodeWksp, nodeWkspSize= ); - FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); - huffLog =3D (U32)maxBits; - { /* Build and write the CTable */ - size_t const newCSize =3D HUF_estimateCompressedSize( - (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); - size_t const hSize =3D HUF_writeCTable_wksp( - hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesB= uffer), - (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, - nodeWksp, nodeWkspSize); - /* Check against repeating the previous CTable */ - if (repeat !=3D HUF_repeat_none) { - size_t const oldCSize =3D HUF_estimateCompressedSize( - (HUF_CElt const*)prevHuf->CTable, countWksp, maxSy= mbolValue); - if (oldCSize < srcSize && (oldCSize <=3D hSize + newCSize = || hSize + 12 >=3D srcSize)) { - DEBUGLOG(5, "set_repeat - smaller"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType =3D set_repeat; - 
return 0; - } - } - if (newCSize + hSize >=3D srcSize) { - DEBUGLOG(5, "set_basic - no gains"); - ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); - hufMetadata->hType =3D set_basic; - return 0; - } - DEBUGLOG(5, "set_compressed (hSize=3D%u)", (U32)hSize); - hufMetadata->hType =3D set_compressed; - nextHuf->repeatMode =3D HUF_repeat_check; - return hSize; - } - } -} - -/* ZSTD_buildSuperBlockEntropy_sequences() : - * Builds entropy for the super-block sequences. - * Stores symbol compression modes and fse table to fseMetadata. - * @return : size of fse tables or error code */ -static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePt= r, - const ZSTD_fseCTables_t* pre= vEntropy, - ZSTD_fseCTables_t* nex= tEntropy, - const ZSTD_CCtx_params* cctx= Params, - ZSTD_fseCTablesMetadat= a_t* fseMetadata, - void* workspace, size_= t wkspSize) -{ - BYTE* const wkspStart =3D (BYTE*)workspace; - BYTE* const wkspEnd =3D wkspStart + wkspSize; - BYTE* const countWkspStart =3D wkspStart; - unsigned* const countWksp =3D (unsigned*)workspace; - const size_t countWkspSize =3D (MaxSeq + 1) * sizeof(unsigned); - BYTE* const cTableWksp =3D countWkspStart + countWkspSize; - const size_t cTableWkspSize =3D wkspEnd-cTableWksp; - ZSTD_strategy const strategy =3D cctxParams->cParams.strategy; - FSE_CTable* CTable_LitLength =3D nextEntropy->litlengthCTable; - FSE_CTable* CTable_OffsetBits =3D nextEntropy->offcodeCTable; - FSE_CTable* CTable_MatchLength =3D nextEntropy->matchlengthCTable; - const BYTE* const ofCodeTable =3D seqStorePtr->ofCode; - const BYTE* const llCodeTable =3D seqStorePtr->llCode; - const BYTE* const mlCodeTable =3D seqStorePtr->mlCode; - size_t const nbSeq =3D seqStorePtr->sequences - seqStorePtr->sequences= Start; - BYTE* const ostart =3D fseMetadata->fseTablesBuffer; - BYTE* const oend =3D ostart + sizeof(fseMetadata->fseTablesBuffer); - BYTE* op =3D ostart; - - assert(cTableWkspSize >=3D (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE= )); - DEBUGLOG(5, 
"ZSTD_buildSuperBlockEntropy_sequences (nbSeq=3D%zu)", nbS= eq); - ZSTD_memset(workspace, 0, wkspSize); - - fseMetadata->lastCountSize =3D 0; - /* convert length/distances into codes */ - ZSTD_seqToCodes(seqStorePtr); - /* build CTable for Literal Lengths */ - { U32 LLtype; - unsigned max =3D MaxLL; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building LL table"); - nextEntropy->litlength_repeatMode =3D prevEntropy->litlength_repea= tMode; - LLtype =3D ZSTD_selectEncodingType(&nextEntropy->litlength_repeatM= ode, - countWksp, max, mostFrequent, nbSe= q, - LLFSELog, prevEntropy->litlengthCT= able, - LL_defaultNorm, LL_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(set_basic < set_compressed && set_rle < set_compressed); - assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatM= ode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, - countWksp, max, llCode= Table, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, - prevEntropy->litlength= CTable, sizeof(prevEntropy->litlengthCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens fail= ed"); - if (LLtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->llType =3D (symbolEncodingType_e) LLtype; - } } - /* build CTable for Offsets */ - { U32 Offtype; - unsigned max =3D MaxOff; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - /* We can only use the basic table if max <=3D DefaultMaxOff, othe= rwise the offsets are too large */ - ZSTD_defaultPolicy_e const defaultPolicy =3D (max <=3D DefaultMaxO= ff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; - DEBUGLOG(5, "Building OF table"); - nextEntropy->offcode_repeatMode =3D prevEntropy->offcode_repeatMod= e; - Offtype =3D ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMo= de, - countWksp, max, mostFrequent, nbSe= q, - OffFSELog, prevEntropy->offcodeCTa= ble, - OF_defaultNorm, OF_defaultNormLog, - defaultPolicy, strategy); - assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMo= de !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, - countWksp, max, ofCode= Table, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, - prevEntropy->offcodeCT= able, sizeof(prevEntropy->offcodeCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets fail= ed"); - if (Offtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->ofType =3D (symbolEncodingType_e) Offtype; - } } - /* build CTable for MatchLengths */ - { U32 MLtype; - unsigned max =3D MaxML; - size_t const mostFrequent =3D HIST_countFast_wksp(countWksp, &max,= mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ - DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend= -op)); - nextEntropy->matchlength_repeatMode =3D prevEntropy->matchlength_r= epeatMode; - MLtype =3D ZSTD_selectEncodingType(&nextEntropy->matchlength_repea= tMode, - countWksp, max, mostFrequent, nbSe= q, - MLFSELog, prevEntropy->matchlength= CTable, - ML_defaultNorm, ML_defaultNormLog, - ZSTD_defaultAllowed, strategy); - assert(!(MLtype < set_compressed && nextEntropy->matchlength_repea= tMode !=3D FSE_repeat_none)); /* We don't copy tables */ - { size_t const countSize =3D ZSTD_buildCTable(op, oend - op, CTa= ble_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, - countWksp, max, mlCode= Table, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, - 
prevEntropy->matchleng= thCTable, sizeof(prevEntropy->matchlengthCTable), - cTableWksp, cTableWksp= Size); - FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths= failed"); - if (MLtype =3D=3D set_compressed) - fseMetadata->lastCountSize =3D countSize; - op +=3D countSize; - fseMetadata->mlType =3D (symbolEncodingType_e) MLtype; - } } - assert((size_t) (op-ostart) <=3D sizeof(fseMetadata->fseTablesBuffer)); - return op-ostart; -} - - -/* ZSTD_buildSuperBlockEntropy() : - * Builds entropy for the super-block. - * @return : 0 on success or error code */ -static size_t -ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr, - const ZSTD_entropyCTables_t* prevEntropy, - ZSTD_entropyCTables_t* nextEntropy, - const ZSTD_CCtx_params* cctxParams, - ZSTD_entropyCTablesMetadata_t* entropyMetadata, - void* workspace, size_t wkspSize) -{ - size_t const litSize =3D seqStorePtr->lit - seqStorePtr->litStart; - DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy"); - entropyMetadata->hufMetadata.hufDesSize =3D - ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize, - &prevEntropy->huf, &nextEntrop= y->huf, - &entropyMetadata->hufMetadata, - ZSTD_disableLiteralsCompressio= n(cctxParams), - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildS= uperBlockEntropy_literal failed"); - entropyMetadata->fseMetadata.fseTablesSize =3D - ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr, - &prevEntropy->fse, &nextEntr= opy->fse, - cctxParams, - &entropyMetadata->fseMetadat= a, - workspace, wkspSize); - FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_bui= ldSuperBlockEntropy_sequences failed"); - return 0; -} - /* ZSTD_compressSubBlock_literal() : * Compresses literals section for a sub-block. 
* When we have to write the Huffman table we will sometimes choose a hea= der @@ -411,8 +132,7 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const= * seqStore, const seqDef* const seqDef* sp =3D sstart; size_t matchLengthSum =3D 0; size_t litLengthSum =3D 0; - /* Only used by assert(), suppress unused variable warnings in product= ion. */ - (void)litLengthSum; + (void)(litLengthSum); /* suppress unused variable warning on some envi= ronments */ while (send-sp > 0) { ZSTD_sequenceLength const seqLen =3D ZSTD_getSequenceLength(seqSto= re, sp); litLengthSum +=3D seqLen.litLength; @@ -605,7 +325,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const B= YTE* literals, size_t lit static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e ty= pe, const BYTE* codeTable, unsigned maxCode, size_t nbSeq, const FSE_CTable* fseCTable, - const U32* additionalBits, + const U8* additionalBits, short const* defaultNorm, U32 defaultNormLog, U32 = defaultMax, void* workspace, size_t wkspSize) { @@ -646,8 +366,9 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const= BYTE* ofCodeTable, void* workspace, size_t = wkspSize, int writeEntropy) { - size_t sequencesSectionHeaderSize =3D 3; /* Use hard coded size of 3 b= ytes */ + size_t const sequencesSectionHeaderSize =3D 3; /* Use hard coded size = of 3 bytes */ size_t cSeqSizeEstimate =3D 0; + if (nbSeq =3D=3D 0) return sequencesSectionHeaderSize; cSeqSizeEstimate +=3D ZSTD_estimateSubBlockSize_symbolType(fseMetadata= ->ofType, ofCodeTable, MaxOff, nbSeq, fseTables->offcodeCTable, = NULL, OF_defaultNorm, OF_defaultNormLog= , DefaultMaxOff, @@ -754,7 +475,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStor= e_t* seqStorePtr, /* I think there is an optimization opportunity here. * Calling ZSTD_estimateSubBlockSize for every sequence can be was= teful * since it recalculates estimate from scratch. - * For example, it would recount literal distribution and symbol c= odes everytime. 
+ * For example, it would recount literal distribution and symbol c= odes every time. */ cBlockSizeEstimate =3D ZSTD_estimateSubBlockSize(lp, litSize, ofCo= dePtr, llCodePtr, mlCodePtr, seqCount, &nextCBlock->entrop= y, entropyMetadata, @@ -818,7 +539,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStor= e_t* seqStorePtr, repcodes_t rep; ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); for (seq =3D sstart; seq < sp; ++seq) { - rep =3D ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getS= equenceLength(seqStorePtr, seq).litLength =3D=3D 0); + ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequence= Length(seqStorePtr, seq).litLength =3D=3D 0); } ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); } @@ -833,7 +554,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, unsigned lastBlock) { ZSTD_entropyCTablesMetadata_t entropyMetadata; =20 - FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, &zc->appliedParams, diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h index 98e359adf5d4..349fc923c355 100644 --- a/lib/zstd/compress/zstd_cwksp.h +++ b/lib/zstd/compress/zstd_cwksp.h @@ -32,6 +32,10 @@ #define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 #endif =20 + +/* Set our tables and aligneds to align by 64 bytes */ +#define ZSTD_CWKSP_ALIGNMENT_BYTES 64 + /*-************************************* * Structures ***************************************/ @@ -114,10 +118,11 @@ typedef enum { * - Tables: these are any of several different datastructures (hash table= s, * chain tables, binary trees) that all respect a common format: they are * uint32_t arrays, all of whose values are between 0 and (nextSrc - bas= e). - * Their sizes depend on the cparams. + * Their sizes depend on the cparams. These tables are 64-byte aligned. 
* * - Aligned: these buffers are used for various purposes that require 4 b= yte - * alignment, but don't require any initialization before they're used. + * alignment, but don't require any initialization before they're used. = These + * buffers are each aligned to 64 bytes. * * - Buffers: these buffers are used for various purposes that don't requi= re * any alignment or initialization before they're used. This means they = can @@ -130,8 +135,7 @@ typedef enum { * * 1. Objects * 2. Buffers - * 3. Aligned - * 4. Tables + * 3. Aligned/Tables * * Attempts to reserve objects of different types out of order will fail. */ @@ -184,6 +188,8 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t = const align) { * Since tables aren't currently redzoned, you don't need to call through = this * to figure out how much space you need for the matchState tables. Everyt= hing * else is though. + * + * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_= alloc_size(). */ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { if (size =3D=3D 0) @@ -191,66 +197,139 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size)= { return size; } =20 -MEM_STATIC void ZSTD_cwksp_internal_advance_phase( - ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { +/* + * Returns an adjusted alloc size that is the nearest larger multiple of 6= 4 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNME= NT_BYTES)); +} + +/* + * Returns the amount of additional space the cwksp must allocate + * for internal purposes (currently only alignment). 
+ */ +MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + /* For alignment, the wksp will always allocate an additional n_1=3D[1= , 64] bytes + * to align the beginning of tables section, as well as another n_2=3D= [0, 63] bytes + * to align the beginning of the aligned section. + * + * n_1 + n_2 =3D=3D 64 bytes if the cwksp is freshly allocated, due to= tables and + * aligneds being sized in multiples of 64 bytes. + */ + size_t const slackSpace =3D ZSTD_CWKSP_ALIGNMENT_BYTES; + return slackSpace; +} + + +/* + * Return the number of additional bytes required to align a pointer to th= e given number of bytes. + * alignBytes must be a power of two. + */ +MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t al= ignBytes) { + size_t const alignBytesMask =3D alignBytes - 1; + size_t const bytes =3D (alignBytes - ((size_t)ptr & (alignBytesMask)))= & alignBytesMask; + assert((alignBytes & alignBytesMask) =3D=3D 0); + assert(bytes !=3D ZSTD_CWKSP_ALIGNMENT_BYTES); + return bytes; +} + +/* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of= the wksp, + * which counts from the end of the wksp (as opposed to the object/table s= egment). + * + * Returns a pointer to the beginning of that space. + */ +MEM_STATIC void* +ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const byte= s) +{ + void* const alloc =3D (BYTE*)ws->allocStart - bytes; + void* const bottom =3D ws->tableEnd; + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >=3D bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed =3D 1; + return NULL; + } + /* the area is reserved from the end of wksp. 
+ * If it overlaps with tableValidEnd, it voids guarantees on values' r= ange */ + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd =3D alloc; + } + ws->allocStart =3D alloc; + return alloc; +} + +/* + * Moves the cwksp to the next phase, and does any necessary allocations. + * cwksp initialization must necessarily go through each phase in order. + * Returns a 0 on success, or zstd error + */ +MEM_STATIC size_t +ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e= phase) +{ assert(phase >=3D ws->phase); if (phase > ws->phase) { + /* Going from allocating objects to allocating buffers */ if (ws->phase < ZSTD_cwksp_alloc_buffers && phase >=3D ZSTD_cwksp_alloc_buffers) { ws->tableValidEnd =3D ws->objectEnd; } + + /* Going from allocating buffers to allocating aligneds/tables */ if (ws->phase < ZSTD_cwksp_alloc_aligned && phase >=3D ZSTD_cwksp_alloc_aligned) { - /* If unaligned allocations down from a too-large top have lef= t us - * unaligned, we need to realign our alloc ptr. Technically, t= his - * can consume space that is unaccounted for in the neededSpace - * calculation. However, I believe this can only happen when t= he - * workspace is too large, and specifically when it is too lar= ge - * by a larger margin than the space that will be consumed. */ - /* TODO: cleaner, compiler warning friendly way to do this??? = */ - ws->allocStart =3D (BYTE*)ws->allocStart - ((size_t)ws->allocS= tart & (sizeof(U32)-1)); - if (ws->allocStart < ws->tableValidEnd) { - ws->tableValidEnd =3D ws->allocStart; + { /* Align the start of the "aligned" to 64 bytes. Use [1, 6= 4] bytes. 
*/ + size_t const bytesToAlign =3D + ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align= _ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); + DEBUGLOG(5, "reserving aligned alignment addtl space: %zu"= , bytesToAlign); + ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWK= SP_ALIGNMENT_BYTES - 1)) =3D=3D 0); /* power of 2 */ + RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(= ws, bytesToAlign), + memory_allocation, "aligned phase - alignm= ent initial allocation failed!"); } - } + { /* Align the start of the tables to 64 bytes. Use [0, 63] = bytes */ + void* const alloc =3D ws->objectEnd; + size_t const bytesToAlign =3D ZSTD_cwksp_bytes_to_align_pt= r(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); + void* const objectEnd =3D (BYTE*)alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", = bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_alloc= ation, + "table phase - alignment initial allocatio= n failed!"); + ws->objectEnd =3D objectEnd; + ws->tableEnd =3D objectEnd; /* table area starts being em= pty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd =3D ws->tableEnd; + } } } ws->phase =3D phase; + ZSTD_cwksp_assert_internal_consistency(ws); } + return 0; } =20 /* * Returns whether this object/buffer/etc was allocated in this workspace. */ -MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* pt= r) { +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* pt= r) +{ return (ptr !=3D NULL) && (ws->workspace <=3D ptr) && (ptr <=3D ws->wo= rkspaceEnd); } =20 /* * Internal function. Do not use directly. 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_internal( - ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { +MEM_STATIC void* +ZSTD_cwksp_reserve_internal(ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc= _phase_e phase) +{ void* alloc; - void* bottom =3D ws->tableEnd; - ZSTD_cwksp_internal_advance_phase(ws, phase); - alloc =3D (BYTE *)ws->allocStart - bytes; - - if (bytes =3D=3D 0) + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase)) || byte= s =3D=3D 0) { return NULL; + } =20 =20 - DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", - alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); - ZSTD_cwksp_assert_internal_consistency(ws); - assert(alloc >=3D bottom); - if (alloc < bottom) { - DEBUGLOG(4, "cwksp: alloc failed!"); - ws->allocFailed =3D 1; - return NULL; - } - if (alloc < ws->tableValidEnd) { - ws->tableValidEnd =3D alloc; - } - ws->allocStart =3D alloc; + alloc =3D ZSTD_cwksp_reserve_internal_buffer_space(ws, bytes); =20 =20 return alloc; @@ -259,33 +338,44 @@ MEM_STATIC void* ZSTD_cwksp_reserve_internal( /* * Reserves and returns unaligned memory. */ -MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) +{ return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_= buffers); } =20 /* - * Reserves and returns memory sized on and aligned on sizeof(unsigned). + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMEN= T_BYTES (64 bytes). 
*/ -MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { - assert((bytes & (sizeof(U32)-1)) =3D=3D 0); - return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(= U32)), ZSTD_cwksp_alloc_aligned); +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) +{ + void* ptr =3D ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, = ZSTD_CWKSP_ALIGNMENT_BYTES), + ZSTD_cwksp_alloc_aligned); + assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))=3D=3D 0); + return ptr; } =20 /* - * Aligned on sizeof(unsigned). These buffers have the special property th= at + * Aligned on 64 bytes. These buffers have the special property that * their values remain constrained, allowing us to re-use them without * memset()-ing them. */ -MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) +{ const ZSTD_cwksp_alloc_phase_e phase =3D ZSTD_cwksp_alloc_aligned; - void* alloc =3D ws->tableEnd; - void* end =3D (BYTE *)alloc + bytes; - void* top =3D ws->allocStart; + void* alloc; + void* end; + void* top; + + if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { + return NULL; + } + alloc =3D ws->tableEnd; + end =3D (BYTE *)alloc + bytes; + top =3D ws->allocStart; =20 DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); assert((bytes & (sizeof(U32)-1)) =3D=3D 0); - ZSTD_cwksp_internal_advance_phase(ws, phase); ZSTD_cwksp_assert_internal_consistency(ws); assert(end <=3D top); if (end > top) { @@ -296,27 +386,31 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp*= ws, size_t bytes) { ws->tableEnd =3D end; =20 =20 + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) =3D=3D 0); + assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))=3D=3D 0); return alloc; } =20 /* * Aligned on sizeof(void*). 
+ * Note : should happen only once, at workspace first initialization */ -MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { - size_t roundedBytes =3D ZSTD_cwksp_align(bytes, sizeof(void*)); +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) +{ + size_t const roundedBytes =3D ZSTD_cwksp_align(bytes, sizeof(void*)); void* alloc =3D ws->objectEnd; void* end =3D (BYTE*)alloc + roundedBytes; =20 =20 - DEBUGLOG(5, + DEBUGLOG(4, "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes = remaining", alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - round= edBytes); - assert(((size_t)alloc & (sizeof(void*)-1)) =3D=3D 0); - assert((bytes & (sizeof(void*)-1)) =3D=3D 0); + assert((size_t)alloc % ZSTD_ALIGNOF(void*) =3D=3D 0); + assert(bytes % ZSTD_ALIGNOF(void*) =3D=3D 0); ZSTD_cwksp_assert_internal_consistency(ws); /* we must be in the first phase, no advance is possible */ if (ws->phase !=3D ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd)= { - DEBUGLOG(4, "cwksp: object alloc failed!"); + DEBUGLOG(3, "cwksp: object alloc failed!"); ws->allocFailed =3D 1; return NULL; } @@ -328,7 +422,8 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* = ws, size_t bytes) { return alloc; } =20 -MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) +{ DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); =20 =20 @@ -451,6 +546,24 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cw= ksp* ws) { * Functions Checking Free Space ***************************************/ =20 +/* ZSTD_alignmentSpaceWithinBounds() : + * Returns if the estimated space needed for a wksp is within an acceptabl= e limit of the + * actual amount of space used. 
+ */ +MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* = const ws, + size_t const estim= atedSpace, int resizedWorkspace) { + if (resizedWorkspace) { + /* Resized/newly allocated wksp should have exact bounds */ + return ZSTD_cwksp_used(ws) =3D=3D estimatedSpace; + } else { + /* Due to alignment, when reusing a workspace, we can actually con= sume 63 fewer or more bytes + * than estimatedSpace. See the comments in zstd_cwksp.h for detai= ls. + */ + return (ZSTD_cwksp_used(ws) >=3D estimatedSpace - 63) && (ZSTD_cwk= sp_used(ws) <=3D estimatedSpace + 63); + } +} + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) { return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd); } diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_= double_fast.c index b0424d23ac57..76933dea2624 100644 --- a/lib/zstd/compress/zstd_double_fast.c +++ b/lib/zstd/compress/zstd_double_fast.c @@ -48,10 +48,216 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, =20 =20 FORCE_INLINE_TEMPLATE -size_t ZSTD_compressBlock_doubleFast_generic( +size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +{ + ZSTD_compressionParameters const* cParams =3D &ms->cParams; + U32* const hashLong =3D ms->hashTable; + const U32 hBitsL =3D cParams->hashLog; + U32* const hashSmall =3D ms->chainTable; + const U32 hBitsS =3D cParams->chainLog; + const BYTE* const base =3D ms->window.base; + const BYTE* const istart =3D (const BYTE*)src; + const BYTE* anchor =3D istart; + const U32 endIndex =3D (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mo= de */ + const U32 prefixLowestIndex =3D ZSTD_getLowestPrefixIndex(ms, endIndex= , cParams->windowLog); + const BYTE* const prefixLowest =3D base + prefixLowestIndex; + const BYTE* const iend =3D istart + srcSize; + 
const BYTE* const ilimit =3D iend - HASH_READ_SIZE; + U32 offset_1=3Drep[0], offset_2=3Drep[1]; + U32 offsetSaved =3D 0; + + size_t mLength; + U32 offset; + U32 curr; + + /* how many positions to search before increasing step size */ + const size_t kStepIncr =3D 1 << kSearchStrength; + /* the position at which to increment the step size if no match is fou= nd */ + const BYTE* nextStep; + size_t step; /* the current step size */ + + size_t hl0; /* the long hash at ip */ + size_t hl1; /* the long hash at ip1 */ + + U32 idxl0; /* the long match index for ip */ + U32 idxl1; /* the long match index for ip1 */ + + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ + + const BYTE* ip =3D istart; /* the current position */ + const BYTE* ip1; /* the next position */ + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + + /* init */ + ip +=3D ((ip - prefixLowest) =3D=3D 0); + { + U32 const current =3D (U32)(ip - base); + U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, current, cPa= rams->windowLog); + U32 const maxRep =3D current - windowLow; + if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; + if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; + } + + /* Outer Loop: one iteration per match found and stored */ + while (1) { + step =3D 1; + nextStep =3D ip + kStepIncr; + ip1 =3D ip + step; + + if (ip1 > ilimit) { + goto _cleanup; + } + + hl0 =3D ZSTD_hashPtr(ip, hBitsL, 8); + idxl0 =3D hashLong[hl0]; + matchl0 =3D base + idxl0; + + /* Inner Loop: one iteration per search / position */ + do { + const size_t hs0 =3D ZSTD_hashPtr(ip, hBitsS, mls); + const U32 idxs0 =3D hashSmall[hs0]; + curr =3D (U32)(ip-base); + matchs0 =3D base + idxs0; + + hashLong[hl0] =3D hashSmall[hs0] =3D curr; /* update hash ta= bles */ + + /* check noDict repcode */ + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) =3D=3D MEM_rea= d32(ip+1))) { + mLength 
=3D ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_REPCODE_1, mLength); + goto _match_stored; + } + + hl1 =3D ZSTD_hashPtr(ip1, hBitsL, 8); + + if (idxl0 > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchl0) =3D=3D MEM_read64(ip)) { + mLength =3D ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset =3D (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-= 1] =3D=3D matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; + } + } + + idxl1 =3D hashLong[hl1]; + matchl1 =3D base + idxl1; + + if (idxs0 > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(matchs0) =3D=3D MEM_read32(ip)) { + goto _search_next_long; + } + } + + if (ip1 >=3D nextStep) { + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + step++; + nextStep +=3D kStepIncr; + } + ip =3D ip1; + ip1 +=3D step; + + hl0 =3D hl1; + idxl0 =3D idxl1; + matchl0 =3D matchl1; + #if defined(__aarch64__) + PREFETCH_L1(ip+256); + #endif + } while (ip1 <=3D ilimit); + +_cleanup: + /* save reps for next block */ + rep[0] =3D offset_1 ? offset_1 : offsetSaved; + rep[1] =3D offset_2 ? 
offset_2 : offsetSaved; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + +_search_next_long: + + /* check prefix long +1 match */ + if (idxl1 > prefixLowestIndex) { + if (MEM_read64(matchl1) =3D=3D MEM_read64(ip1)) { + ip =3D ip1; + mLength =3D ZSTD_count(ip+8, matchl1+8, iend) + 8; + offset =3D (U32)(ip-matchl1); + while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] = =3D=3D matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ + goto _match_found; + } + } + + /* if no long +1 match, explore the short match we found */ + mLength =3D ZSTD_count(ip+4, matchs0+4, iend) + 4; + offset =3D (U32)(ip - matchs0); + while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] =3D=3D ma= tchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ + + /* fall-through */ + +_match_found: /* requires ip, offset, mLength */ + offset_2 =3D offset_1; + offset_1 =3D offset; + + if (step < 4) { + /* It is unsafe to write this value back to the hashtable when= ip1 is + * greater than or equal to the new ip we will have after we'r= e done + * processing this match. Rather than perform that test direct= ly + * (ip1 >=3D ip + mLength), which costs speed in practice, we = do a simpler + * more predictable test. The minmatch even if we take a short= match is + * 4 bytes, so as long as step, the distance between ip and ip1 + * (initially) is less than 4, we know ip1 < new ip. 
*/ + hashLong[hl1] =3D (U32)(ip1 - base); + } + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_O= FFSET(offset), mLength); + +_match_stored: + /* match found */ + ip +=3D mLength; + anchor =3D ip; + + if (ip <=3D ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert =3D curr+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] =3D = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] =3D (U32)(ip-2-bas= e); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = =3D indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] =3D (U32)(ip-1-= base); + } + + /* check immediate repcode */ + while ( (ip <=3D ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)) ))= { + /* store sequence */ + size_t const rLength =3D ZSTD_count(ip+4, ip+4-offset_2, i= end) + 4; + U32 const tmpOff =3D offset_2; offset_2 =3D offset_1; offs= et_1 =3D tmpOff; /* swap offset_2 <=3D> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D (U32)(ip-base= ); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = rLength); + ip +=3D rLength; + anchor =3D ip; + continue; /* faster when present ... (?) */ + } + } + } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls /* template */, ZSTD_dictMode_e const dictMode) + U32 const mls /* template */) { ZSTD_compressionParameters const* cParams =3D &ms->cParams; U32* const hashLong =3D ms->hashTable; @@ -72,54 +278,30 @@ size_t ZSTD_compressBlock_doubleFast_generic( U32 offsetSaved =3D 0; =20 const ZSTD_matchState_t* const dms =3D ms->dictMatchState; - const ZSTD_compressionParameters* const dictCParams =3D - dictMode =3D=3D ZSTD_dictMatchState ? 
- &dms->cParams : NULL; - const U32* const dictHashLong =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->hashTable : NULL; - const U32* const dictHashSmall =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->chainTable : NULL; - const U32 dictStartIndex =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.dictLimit : 0; - const BYTE* const dictBase =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.base : NULL; - const BYTE* const dictStart =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictBase + dictStartIndex : NULL; - const BYTE* const dictEnd =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dms->window.nextSrc : NULL; - const U32 dictIndexDelta =3D dictMode =3D=3D ZSTD_dictMatchState= ? - prefixLowestIndex - (U32)(dictEnd - d= ictBase) : - 0; - const U32 dictHBitsL =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictCParams->hashLog : hBitsL; - const U32 dictHBitsS =3D dictMode =3D=3D ZSTD_dictMatchState= ? - dictCParams->chainLog : hBitsS; + const ZSTD_compressionParameters* const dictCParams =3D &dms->cParams; + const U32* const dictHashLong =3D dms->hashTable; + const U32* const dictHashSmall =3D dms->chainTable; + const U32 dictStartIndex =3D dms->window.dictLimit; + const BYTE* const dictBase =3D dms->window.base; + const BYTE* const dictStart =3D dictBase + dictStartIndex; + const BYTE* const dictEnd =3D dms->window.nextSrc; + const U32 dictIndexDelta =3D prefixLowestIndex - (U32)(dictEnd -= dictBase); + const U32 dictHBitsL =3D dictCParams->hashLog; + const U32 dictHBitsS =3D dictCParams->chainLog; const U32 dictAndPrefixLength =3D (U32)((ip - prefixLowest) + (dictEn= d - dictStart)); =20 - DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); - - assert(dictMode =3D=3D ZSTD_noDict || dictMode =3D=3D ZSTD_dictMatchSt= ate); + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); =20 /* if a dictionary is attached, it must be within window range */ - if (dictMode =3D=3D ZSTD_dictMatchState) { - assert(ms->window.dictLimit + (1U << 
cParams->windowLog) >=3D endI= ndex); - } + assert(ms->window.dictLimit + (1U << cParams->windowLog) >=3D endIndex= ); =20 /* init */ ip +=3D (dictAndPrefixLength =3D=3D 0); - if (dictMode =3D=3D ZSTD_noDict) { - U32 const curr =3D (U32)(ip - base); - U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, curr, cParam= s->windowLog); - U32 const maxRep =3D curr - windowLow; - if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; - if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; - } - if (dictMode =3D=3D ZSTD_dictMatchState) { - /* dictMatchState repCode checks don't currently handle repCode = =3D=3D 0 - * disabling. */ - assert(offset_1 <=3D dictAndPrefixLength); - assert(offset_2 <=3D dictAndPrefixLength); - } + + /* dictMatchState repCode checks don't currently handle repCode =3D=3D= 0 + * disabling. */ + assert(offset_1 <=3D dictAndPrefixLength); + assert(offset_2 <=3D dictAndPrefixLength); =20 /* Main Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because repcode check at= (ip+1) */ @@ -135,29 +317,18 @@ size_t ZSTD_compressBlock_doubleFast_generic( const BYTE* matchLong =3D base + matchIndexL; const BYTE* match =3D base + matchIndexS; const U32 repIndex =3D curr + 1 - offset_1; - const BYTE* repMatch =3D (dictMode =3D=3D ZSTD_dictMatchState - && repIndex < prefixLowestIndex) ? + const BYTE* repMatch =3D (repIndex < prefixLowestIndex) ? dictBase + (repIndex - dictIndexDelta) : base + repIndex; hashLong[h2] =3D hashSmall[h] =3D curr; /* update hash tables */ =20 - /* check dictMatchState repcode */ - if (dictMode =3D=3D ZSTD_dictMatchState - && ((U32)((prefixLowestIndex-1) - repIndex) >=3D 3 /* intentio= nal underflow */) + /* check repcode */ + if (((U32)((prefixLowestIndex-1) - repIndex) >=3D 3 /* intentional= underflow */) && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* repMatchEnd =3D repIndex < prefixLowestIndex ? 
dic= tEnd : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixLowest) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); - goto _match_stored; - } - - /* check noDict repcode */ - if ( dictMode =3D=3D ZSTD_noDict - && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) =3D=3D MEM_read3= 2(ip+1)))) { - mLength =3D ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; - ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); goto _match_stored; } =20 @@ -169,7 +340,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1]= =3D=3D matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dictMatchState long match */ U32 const dictMatchIndexL =3D dictHashLong[dictHL]; const BYTE* dictMatchL =3D dictBase + dictMatchIndexL; @@ -187,7 +358,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( if (MEM_read32(match) =3D=3D MEM_read32(ip)) { goto _search_next_long; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dictMatchState short match */ U32 const dictMatchIndexS =3D dictHashSmall[dictHS]; match =3D dictBase + dictMatchIndexS; @@ -220,7 +391,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-= 1] =3D=3D matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ goto _match_found; } - } else if (dictMode =3D=3D ZSTD_dictMatchState) { + } else { /* check dict long +1 match */ U32 const dictMatchIndexL3 =3D dictHashLong[dictHLNext]; const BYTE* dictMatchL3 =3D dictBase + dictMatchIndexL3; @@ -234,7 +405,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( } } } =20 /* if no long +1 match, explore the short match we found */ - if (dictMode =3D=3D 
ZSTD_dictMatchState && matchIndexS < prefixLow= estIndex) { + if (matchIndexS < prefixLowestIndex) { mLength =3D ZSTD_count_2segments(ip+4, match+4, iend, dictEnd,= prefixLowest) + 4; offset =3D (U32)(curr - matchIndexS); while (((ip>anchor) & (match>dictStart)) && (ip[-1] =3D=3D mat= ch[-1])) { ip--; match--; mLength++; } /* catch up */ @@ -248,7 +419,7 @@ size_t ZSTD_compressBlock_doubleFast_generic( offset_2 =3D offset_1; offset_1 =3D offset; =20 - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset = + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_O= FFSET(offset), mLength); =20 _match_stored: /* match found */ @@ -266,43 +437,27 @@ size_t ZSTD_compressBlock_doubleFast_generic( } =20 /* check immediate repcode */ - if (dictMode =3D=3D ZSTD_dictMatchState) { - while (ip <=3D ilimit) { - U32 const current2 =3D (U32)(ip-base); - U32 const repIndex2 =3D current2 - offset_2; - const BYTE* repMatch2 =3D dictMode =3D=3D ZSTD_dictMat= chState - && repIndex2 < prefixLowestIndex ? - dictBase + repIndex2 - dictIndexDelta : - base + repIndex2; - if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= =3D 3 /* intentional overflow */) - && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { - const BYTE* const repEnd2 =3D repIndex2 < prefixLo= westIndex ? 
dictEnd : iend; - size_t const repLength2 =3D ZSTD_count_2segments(i= p+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; - U32 tmpOffset =3D offset_2; offset_2 =3D offset_1;= offset_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLen= gth2-MINMATCH); - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D curre= nt2; - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; - ip +=3D repLength2; - anchor =3D ip; - continue; - } - break; - } } - - if (dictMode =3D=3D ZSTD_noDict) { - while ( (ip <=3D ilimit) - && ( (offset_2>0) - & (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)= ) )) { - /* store sequence */ - size_t const rLength =3D ZSTD_count(ip+4, ip+4-offset_= 2, iend) + 4; - U32 const tmpOff =3D offset_2; offset_2 =3D offset_1; = offset_1 =3D tmpOff; /* swap offset_2 <=3D> offset_1 */ - hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D (U32)(ip-= base); - hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D (U32)(ip-bas= e); - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MI= NMATCH); - ip +=3D rLength; + while (ip <=3D ilimit) { + U32 const current2 =3D (U32)(ip-base); + U32 const repIndex2 =3D current2 - offset_2; + const BYTE* repMatch2 =3D repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >=3D 3= /* intentional overflow */) + && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { + const BYTE* const repEnd2 =3D repIndex2 < prefixLowest= Index ? 
dictEnd : iend; + size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset =3D offset_2; offset_2 =3D offset_1; off= set_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; + ip +=3D repLength2; anchor =3D ip; - continue; /* faster when present ... (?) */ - } } } + continue; + } + break; + } + } } /* while (ip < ilimit) */ =20 /* save reps for next block */ @@ -313,6 +468,24 @@ size_t ZSTD_compressBlock_doubleFast_generic( return (size_t)(iend - anchor); } =20 +#define ZSTD_GEN_DFAST_FN(dictMode, mls) = \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( = \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], \ + void const* src, size_t srcSize) = \ + { = \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqS= tore, rep, src, srcSize, mls); \ + } + +ZSTD_GEN_DFAST_FN(noDict, 4) +ZSTD_GEN_DFAST_FN(noDict, 5) +ZSTD_GEN_DFAST_FN(noDict, 6) +ZSTD_GEN_DFAST_FN(noDict, 7) + +ZSTD_GEN_DFAST_FN(dictMatchState, 4) +ZSTD_GEN_DFAST_FN(dictMatchState, 5) +ZSTD_GEN_DFAST_FN(dictMatchState, 6) +ZSTD_GEN_DFAST_FN(dictMatchState, 7) + =20 size_t ZSTD_compressBlock_doubleFast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -323,13 +496,13 @@ size_t ZSTD_compressBlock_doubleFast( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 4, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_4(ms, seqStore, rep, s= rc, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 5, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_5(ms, seqStore, rep, s= rc, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, 
sr= c, srcSize, 6, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_6(ms, seqStore, rep, s= rc, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 7, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast_noDict_7(ms, seqStore, rep, s= rc, srcSize); } } =20 @@ -343,13 +516,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 4, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_4(ms, seqStore= , rep, src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 5, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_5(ms, seqStore= , rep, src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 6, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_6(ms, seqStore= , rep, src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, 7, ZSTD_dictMatchState); + return ZSTD_compressBlock_doubleFast_dictMatchState_7(ms, seqStore= , rep, src, srcSize); } } =20 @@ -385,7 +558,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( =20 /* if extDict is invalidated due to maxDistance, switch to "regular" v= ariant */ if (prefixStartIndex =3D=3D dictStartIndex) - return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, sr= c, srcSize, mls, ZSTD_noDict); + return ZSTD_compressBlock_doubleFast(ms, seqStore, rep, src, srcSi= ze); =20 /* Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because (ip+1) */ @@ -407,12 +580,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_g= eneric( hashSmall[hSmall] =3D hashLong[hLong] =3D curr; /* update hash t= able */ =20 if ((((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intentiona= l underflow : ensure repIndex 
doesn't overlap dict + prefix */ - & (repIndex > dictStartIndex)) + & (offset_1 <=3D curr+1 - dictStartIndex)) /* note: we are sea= rching at curr+1 */ && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* repMatchEnd =3D repIndex < prefixStartIndex ? dict= End : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); } else { if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong= ) =3D=3D MEM_read64(ip))) { const BYTE* const matchEnd =3D matchLongIndex < prefixStar= tIndex ? dictEnd : iend; @@ -423,7 +596,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] = =3D=3D matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); =20 } else if ((matchIndex > dictStartIndex) && (MEM_read32(match)= =3D=3D MEM_read32(ip))) { size_t const h3 =3D ZSTD_hashPtr(ip+1, hBitsL, 8); @@ -448,7 +621,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_gen= eric( } offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); =20 } else { ip +=3D ((ip-anchor) >> kSearchStrength) + 1; @@ -475,12 +648,12 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_g= eneric( U32 const repIndex2 =3D current2 - offset_2; const BYTE* repMatch2 =3D repIndex2 < prefixStartIndex ? 
d= ictBase + repIndex2 : base + repIndex2; if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) /= * intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ - & (repIndex2 > dictStartIndex)) + & (offset_2 <=3D current2 - dictStartIndex)) && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 const tmpOffset =3D offset_2; offset_2 =3D offset_= 1; offset_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2= -MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] =3D current2; hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] =3D current2; ip +=3D repLength2; @@ -498,6 +671,10 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_ge= neric( return (size_t)(iend - anchor); } =20 +ZSTD_GEN_DFAST_FN(extDict, 4) +ZSTD_GEN_DFAST_FN(extDict, 5) +ZSTD_GEN_DFAST_FN(extDict, 6) +ZSTD_GEN_DFAST_FN(extDict, 7) =20 size_t ZSTD_compressBlock_doubleFast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -508,12 +685,12 @@ size_t ZSTD_compressBlock_doubleFast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 4); + return ZSTD_compressBlock_doubleFast_extDict_4(ms, seqStore, rep, = src, srcSize); case 5 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 5); + return ZSTD_compressBlock_doubleFast_extDict_5(ms, seqStore, rep, = src, srcSize); case 6 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, srcSize, 6); + return ZSTD_compressBlock_doubleFast_extDict_6(ms, seqStore, rep, = src, srcSize); case 7 : - return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore,= rep, src, 
srcSize, 7); + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, = src, srcSize); } } diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c index 96b7d48e2868..a752e6beab52 100644 --- a/lib/zstd/compress/zstd_fast.c +++ b/lib/zstd/compress/zstd_fast.c @@ -43,145 +43,294 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, } =20 =20 +/* + * If you squint hard enough (and ignore repcodes), the search operation a= t any + * given position is broken into 4 stages: + * + * 1. Hash (map position to hash value via input read) + * 2. Lookup (map hash val to index via hashtable read) + * 3. Load (map index to value at that position via input read) + * 4. Compare + * + * Each of these steps involves a memory read at an address which is compu= ted + * from the previous step. This means these steps must be sequenced and th= eir + * latencies are cumulative. + * + * Rather than do 1->2->3->4 sequentially for a single position before mov= ing + * onto the next, this implementation interleaves these operations across = the + * next few positions: + * + * R =3D Repcode Read & Compare + * H =3D Hash + * T =3D Table Lookup + * M =3D Match Read & Compare + * + * Pos | Time --> + * ----+------------------- + * N | ... M + * N+1 | ... TM + * N+2 | R H T M + * N+3 | H TM + * N+4 | R H T M + * N+5 | H ... + * N+6 | R ... + * + * This is very much analogous to the pipelining of execution in a CPU. An= d just + * like a CPU, we have to dump the pipeline when we find a match (i.e., ta= ke a + * branch). + * + * When this happens, we throw away our current state, and do the followin= g prep + * to re-enter the loop: + * + * Pos | Time --> + * ----+------------------- + * N | H T + * N+1 | H + * + * This is also the work we do at the beginning to enter the loop initiall= y. 
+ */ FORCE_INLINE_TEMPLATE size_t -ZSTD_compressBlock_fast_generic( +ZSTD_compressBlock_fast_noDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize, - U32 const mls) + U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; U32 const hlog =3D cParams->hashLog; /* support stepSize of 0 */ - size_t const stepSize =3D cParams->targetLength + !(cParams->targetLen= gth) + 1; + size_t const stepSize =3D hasStep ? (cParams->targetLength + !(cParams= ->targetLength) + 1) : 2; const BYTE* const base =3D ms->window.base; const BYTE* const istart =3D (const BYTE*)src; - /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ - const BYTE* ip0 =3D istart; - const BYTE* ip1; - const BYTE* anchor =3D istart; const U32 endIndex =3D (U32)((size_t)(istart - base) + srcSize); const U32 prefixStartIndex =3D ZSTD_getLowestPrefixIndex(ms, endInde= x, cParams->windowLog); const BYTE* const prefixStart =3D base + prefixStartIndex; const BYTE* const iend =3D istart + srcSize; const BYTE* const ilimit =3D iend - HASH_READ_SIZE; - U32 offset_1=3Drep[0], offset_2=3Drep[1]; + + const BYTE* anchor =3D istart; + const BYTE* ip0 =3D istart; + const BYTE* ip1; + const BYTE* ip2; + const BYTE* ip3; + U32 current0; + + U32 rep_offset1 =3D rep[0]; + U32 rep_offset2 =3D rep[1]; U32 offsetSaved =3D 0; =20 - /* init */ + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ + U32 idx; /* match idx for ip0 */ + U32 mval; /* src value at match idx */ + + U32 offcode; + const BYTE* match0; + size_t mLength; + + /* ip0 and ip1 are always adjacent. The targetLength skipping and + * uncompressibility acceleration is applied to every other position, + * matching the behavior of #1562. step therefore represents the gap + * between pairs of positions, from ip0 to ip2 or ip1 to ip3. 
*/ + size_t step; + const BYTE* nextStep; + const size_t kStepIncr =3D (1 << (kSearchStrength - 1)); + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); ip0 +=3D (ip0 =3D=3D prefixStart); - ip1 =3D ip0 + 1; { U32 const curr =3D (U32)(ip0 - base); U32 const windowLow =3D ZSTD_getLowestPrefixIndex(ms, curr, cParam= s->windowLog); U32 const maxRep =3D curr - windowLow; - if (offset_2 > maxRep) offsetSaved =3D offset_2, offset_2 =3D 0; - if (offset_1 > maxRep) offsetSaved =3D offset_1, offset_1 =3D 0; + if (rep_offset2 > maxRep) offsetSaved =3D rep_offset2, rep_offset2= =3D 0; + if (rep_offset1 > maxRep) offsetSaved =3D rep_offset1, rep_offset1= =3D 0; } =20 - /* Main Search Loop */ -#ifdef __INTEL_COMPILER - /* From intel 'The vector pragma indicates that the loop should be - * vectorized if it is legal to do so'. Can be used together with - * #pragma ivdep (but have opted to exclude that because intel - * warns against using it).*/ - #pragma vector always -#endif - while (ip1 < ilimit) { /* < instead of <=3D, because check at ip0+2 = */ - size_t mLength; - BYTE const* ip2 =3D ip0 + 2; - size_t const h0 =3D ZSTD_hashPtr(ip0, hlog, mls); - U32 const val0 =3D MEM_read32(ip0); - size_t const h1 =3D ZSTD_hashPtr(ip1, hlog, mls); - U32 const val1 =3D MEM_read32(ip1); - U32 const current0 =3D (U32)(ip0-base); - U32 const current1 =3D (U32)(ip1-base); - U32 const matchIndex0 =3D hashTable[h0]; - U32 const matchIndex1 =3D hashTable[h1]; - BYTE const* repMatch =3D ip2 - offset_1; - const BYTE* match0 =3D base + matchIndex0; - const BYTE* match1 =3D base + matchIndex1; - U32 offcode; - -#if defined(__aarch64__) - PREFETCH_L1(ip0+256); -#endif - - hashTable[h0] =3D current0; /* update hash table */ - hashTable[h1] =3D current1; /* update hash table */ - - assert(ip0 + 1 =3D=3D ip1); - - if ((offset_1 > 0) & (MEM_read32(repMatch) =3D=3D MEM_read32(ip2))= ) { - mLength =3D (ip2[-1] =3D=3D repMatch[-1]) ? 
1 : 0; - ip0 =3D ip2 - mLength; - match0 =3D repMatch - mLength; + /* start each op */ +_start: /* Requires: ip0 */ + + step =3D stepSize; + nextStep =3D ip0 + kStepIncr; + + /* calculate positions, ip0 - anchor =3D=3D 0, so we skip step calc */ + ip1 =3D ip0 + 1; + ip2 =3D ip0 + step; + ip3 =3D ip2 + 1; + + if (ip3 >=3D ilimit) { + goto _cleanup; + } + + hash0 =3D ZSTD_hashPtr(ip0, hlog, mls); + hash1 =3D ZSTD_hashPtr(ip1, hlog, mls); + + idx =3D hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ + const U32 rval =3D MEM_read32(ip2 - rep_offset1); + + /* write back hash table entry */ + current0 =3D (U32)(ip0 - base); + hashTable[hash0] =3D current0; + + /* check repcode at ip[2] */ + if ((MEM_read32(ip2) =3D=3D rval) & (rep_offset1 > 0)) { + ip0 =3D ip2; + match0 =3D ip0 - rep_offset1; + mLength =3D ip0[-1] =3D=3D match0[-1]; + ip0 -=3D mLength; + match0 -=3D mLength; + offcode =3D STORE_REPCODE_1; mLength +=3D 4; - offcode =3D 0; goto _match; } - if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) =3D=3D = val0) { - /* found a regular match */ - goto _offset; + + /* load match for ip[0] */ + if (idx >=3D prefixStartIndex) { + mval =3D MEM_read32(base + idx); + } else { + mval =3D MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } - if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) =3D=3D = val1) { - /* found a regular match after one literal */ - ip0 =3D ip1; - match0 =3D match1; + + /* check match at ip[0] */ + if (MEM_read32(ip0) =3D=3D mval) { + /* found a match! 
*/ goto _offset; } - { size_t const step =3D ((size_t)(ip0-anchor) >> (kSearchStrengt= h - 1)) + stepSize; - assert(step >=3D 2); - ip0 +=3D step; - ip1 +=3D step; - continue; + + /* lookup ip[1] */ + idx =3D hashTable[hash1]; + + /* hash ip[2] */ + hash0 =3D hash1; + hash1 =3D ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 =3D ip1; + ip1 =3D ip2; + ip2 =3D ip3; + + /* write back hash table entry */ + current0 =3D (U32)(ip0 - base); + hashTable[hash0] =3D current0; + + /* load match for ip[0] */ + if (idx >=3D prefixStartIndex) { + mval =3D MEM_read32(base + idx); + } else { + mval =3D MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ } -_offset: /* Requires: ip0, match0 */ - /* Compute the offset code */ - offset_2 =3D offset_1; - offset_1 =3D (U32)(ip0-match0); - offcode =3D offset_1 + ZSTD_REP_MOVE; - mLength =3D 4; - /* Count the backwards match length */ - while (((ip0>anchor) & (match0>prefixStart)) - && (ip0[-1] =3D=3D match0[-1])) { ip0--; match0--; mLength++;= } /* catch up */ =20 -_match: /* Requires: ip0, match0, offcode */ - /* Count the forward length */ - mLength +=3D ZSTD_count(ip0+mLength, match0+mLength, iend); - ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcod= e, mLength-MINMATCH); - /* match found */ - ip0 +=3D mLength; - anchor =3D ip0; + /* check match at ip[0] */ + if (MEM_read32(ip0) =3D=3D mval) { + /* found a match! 
*/ + goto _offset; + } =20 - if (ip0 <=3D ilimit) { - /* Fill Table */ - assert(base+current0+2 > istart); /* check base overflow */ - hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] =3D curren= t0+2; /* here because current+2 could be > iend-8 */ - hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] =3D (U32)(ip0-2-base= ); - - if (offset_2 > 0) { /* offset_2=3D=3D0 means offset_2 is inval= idated */ - while ( (ip0 <=3D ilimit) && (MEM_read32(ip0) =3D=3D MEM_r= ead32(ip0 - offset_2)) ) { - /* store sequence */ - size_t const rLength =3D ZSTD_count(ip0+4, ip0+4-offse= t_2, iend) + 4; - { U32 const tmpOff =3D offset_2; offset_2 =3D offset_1= ; offset_1 =3D tmpOff; } /* swap offset_2 <=3D> offset_1 */ - hashTable[ZSTD_hashPtr(ip0, hlog, mls)] =3D (U32)(ip0-= base); - ip0 +=3D rLength; - ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 = /*offCode*/, rLength-MINMATCH); - anchor =3D ip0; - continue; /* faster when present (confirmed on gcc-8= ) ... (?) */ - } } } - ip1 =3D ip0 + 1; - } + /* lookup ip[1] */ + idx =3D hashTable[hash1]; + + /* hash ip[2] */ + hash0 =3D hash1; + hash1 =3D ZSTD_hashPtr(ip2, hlog, mls); + + /* advance to next positions */ + ip0 =3D ip1; + ip1 =3D ip2; + ip2 =3D ip0 + step; + ip3 =3D ip1 + step; + + /* calculate step */ + if (ip2 >=3D nextStep) { + step++; + PREFETCH_L1(ip1 + 64); + PREFETCH_L1(ip1 + 128); + nextStep +=3D kStepIncr; + } + } while (ip3 < ilimit); + +_cleanup: + /* Note that there are probably still a couple positions we could sear= ch. + * However, it seems to be a meaningful performance hit to try to sear= ch + * them. So let's not. */ =20 /* save reps for next block */ - rep[0] =3D offset_1 ? offset_1 : offsetSaved; - rep[1] =3D offset_2 ? offset_2 : offsetSaved; + rep[0] =3D rep_offset1 ? rep_offset1 : offsetSaved; + rep[1] =3D rep_offset2 ? rep_offset2 : offsetSaved; =20 /* Return the last literals size */ return (size_t)(iend - anchor); + +_offset: /* Requires: ip0, idx */ + + /* Compute the offset code. 
*/ + match0 =3D base + idx; + rep_offset2 =3D rep_offset1; + rep_offset1 =3D (U32)(ip0-match0); + offcode =3D STORE_OFFSET(rep_offset1); + mLength =3D 4; + + /* Count the backwards match length. */ + while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-1] =3D=3D match0= [-1])) { + ip0--; + match0--; + mLength++; + } + +_match: /* Requires: ip0, match0, offcode */ + + /* Count the forward length. */ + mLength +=3D ZSTD_count(ip0 + mLength, match0 + mLength, iend); + + ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode,= mLength); + + ip0 +=3D mLength; + anchor =3D ip0; + + /* write next hash table entry */ + if (ip1 < ip0) { + hashTable[hash1] =3D (U32)(ip1 - base); + } + + /* Fill table and check for immediate repcode. */ + if (ip0 <=3D ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] =3D current0+2= ; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] =3D (U32)(ip0-2-base); + + if (rep_offset2 > 0) { /* rep_offset2=3D=3D0 means rep_offset2 is = invalidated */ + while ( (ip0 <=3D ilimit) && (MEM_read32(ip0) =3D=3D MEM_read3= 2(ip0 - rep_offset2)) ) { + /* store sequence */ + size_t const rLength =3D ZSTD_count(ip0+4, ip0+4-rep_offse= t2, iend) + 4; + { U32 const tmpOff =3D rep_offset2; rep_offset2 =3D rep_of= fset1; rep_offset1 =3D tmpOff; } /* swap rep_offset2 <=3D> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] =3D (U32)(ip0-base= ); + ip0 +=3D rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_= REPCODE_1, rLength); + anchor =3D ip0; + continue; /* faster when present (confirmed on gcc-8) ..= . (?) 
*/ + } } } + + goto _start; } =20 +#define ZSTD_GEN_FAST_FN(dictMode, mls, step) = \ + static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( = \ + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], \ + void const* src, size_t srcSize) = \ + { = \ + return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, = rep, src, srcSize, mls, step); \ + } + +ZSTD_GEN_FAST_FN(noDict, 4, 1) +ZSTD_GEN_FAST_FN(noDict, 5, 1) +ZSTD_GEN_FAST_FN(noDict, 6, 1) +ZSTD_GEN_FAST_FN(noDict, 7, 1) + +ZSTD_GEN_FAST_FN(noDict, 4, 0) +ZSTD_GEN_FAST_FN(noDict, 5, 0) +ZSTD_GEN_FAST_FN(noDict, 6, 0) +ZSTD_GEN_FAST_FN(noDict, 7, 0) =20 size_t ZSTD_compressBlock_fast( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -189,24 +338,40 @@ size_t ZSTD_compressBlock_fast( { U32 const mls =3D ms->cParams.minMatch; assert(ms->dictMatchState =3D=3D NULL); - switch(mls) - { - default: /* includes case 3 */ - case 4 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 4); - case 5 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 5); - case 6 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 6); - case 7 : - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, 7); + if (ms->cParams.targetLength > 1) { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, s= rc, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, s= rc, srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, s= rc, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, s= rc, srcSize); + } + } else { + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, s= rc, srcSize); + case 5 : + return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, s= rc, 
srcSize); + case 6 : + return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, s= rc, srcSize); + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, s= rc, srcSize); + } + } } =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_fast_dictMatchState_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; @@ -242,6 +407,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( assert(endIndex - prefixStartIndex <=3D maxDistance); (void)maxDistance; (void)endIndex; /* these variables are not used w= hen assert() is disabled */ =20 + (void)hasStep; /* not currently specialized on whether it's accelerate= d */ + /* ensure there will be no underflow * when translating a dict index into a local index */ assert(prefixStartIndex >=3D (U32)(dictEnd - dictBase)); @@ -272,7 +439,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repMatchEnd =3D repIndex < prefixStartIndex = ? 
dictEnd : iend; mLength =3D ZSTD_count_2segments(ip+1+4, repMatch+4, iend, rep= MatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, mLength); } else if ( (matchIndex <=3D prefixStartIndex) ) { size_t const dictHash =3D ZSTD_hashPtr(ip, dictHLog, mls); U32 const dictMatchIndex =3D dictHashTable[dictHash]; @@ -292,7 +459,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); } } else if (MEM_read32(match) !=3D MEM_read32(ip)) { /* it's not a match, and we're not going to check the dictiona= ry */ @@ -307,7 +474,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( && (ip[-1] =3D=3D match[-1])) { ip--; match--; mLength++;= } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, off= set + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_OFFSET(offset), mLength); } =20 /* match found */ @@ -332,7 +499,7 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? 
dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; U32 tmpOffset =3D offset_2; offset_2 =3D offset_1; off= set_1 =3D tmpOffset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2= -MINMATCH); + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] =3D current2; ip +=3D repLength2; anchor =3D ip; @@ -351,6 +518,12 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( return (size_t)(iend - anchor); } =20 + +ZSTD_GEN_FAST_FN(dictMatchState, 4, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 5, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) +ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + size_t ZSTD_compressBlock_fast_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize) @@ -361,20 +534,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 4); + return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, re= p, src, srcSize); case 5 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 5); + return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, re= p, src, srcSize); case 6 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 6); + return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, re= p, src, srcSize); case 7 : - return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore= , rep, src, srcSize, 7); + return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, re= p, src, srcSize); } } =20 =20 static size_t ZSTD_compressBlock_fast_extDict_generic( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], - void const* src, size_t srcSize, U32 const mls) + void const* src, size_t srcSize, U32 const mls, U32 
const hasStep) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; U32* const hashTable =3D ms->hashTable; @@ -398,11 +571,13 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const ilimit =3D iend - 8; U32 offset_1=3Drep[0], offset_2=3Drep[1]; =20 + (void)hasStep; /* not currently specialized on whether it's accelerate= d */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=3D%u)",= offset_1); =20 /* switch to "regular" variant if extDict is invalidated due to maxDis= tance */ if (prefixStartIndex =3D=3D dictStartIndex) - return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, src= Size, mls); + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); =20 /* Search Loop */ while (ip < ilimit) { /* < instead of <=3D, because (ip+1) */ @@ -416,14 +591,14 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( const BYTE* const repMatch =3D repBase + repIndex; hashTable[h] =3D curr; /* update hash table */ DEBUGLOG(7, "offset_1 =3D %u , curr =3D %u", offset_1, curr); - assert(offset_1 <=3D curr +1); /* check repIndex */ =20 - if ( (((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intention= al underflow */ & (repIndex > dictStartIndex)) + if ( ( ((U32)((prefixStartIndex-1) - repIndex) >=3D 3) /* intentio= nal underflow */ + & (offset_1 <=3D curr+1 - dictStartIndex) ) /* note: we are s= earching at curr+1 */ && (MEM_read32(repMatch) =3D=3D MEM_read32(ip+1)) ) { const BYTE* const repMatchEnd =3D repIndex < prefixStartIndex = ? 
dictEnd : iend; size_t const rLength =3D ZSTD_count_2segments(ip+1 +4, repMatc= h +4, iend, repMatchEnd, prefixStart) + 4; ip++; - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, = rLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STO= RE_REPCODE_1, rLength); ip +=3D rLength; anchor =3D ip; } else { @@ -439,7 +614,7 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( size_t mLength =3D ZSTD_count_2segments(ip+4, match+4, ien= d, matchEnd, prefixStart) + 4; while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] =3D= =3D match[-1])) { ip--; match--; mLength++; } /* catch up */ offset_2 =3D offset_1; offset_1 =3D offset; /* update off= set history */ - ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend,= STORE_OFFSET(offset), mLength); ip +=3D mLength; anchor =3D ip; } } @@ -453,12 +628,12 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( U32 const current2 =3D (U32)(ip-base); U32 const repIndex2 =3D current2 - offset_2; const BYTE* const repMatch2 =3D repIndex2 < prefixStartInd= ex ? dictBase + repIndex2 : base + repIndex2; - if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) & (= repIndex2 > dictStartIndex)) /* intentional overflow */ + if ( (((U32)((prefixStartIndex-1) - repIndex2) >=3D 3) & (= offset_2 <=3D curr - dictStartIndex)) /* intentional overflow */ && (MEM_read32(repMatch2) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex2 < prefixStartI= ndex ? 
dictEnd : iend; size_t const repLength2 =3D ZSTD_count_2segments(ip+4,= repMatch2+4, iend, repEnd2, prefixStart) + 4; { U32 const tmpOffset =3D offset_2; offset_2 =3D offse= t_1; offset_1 =3D tmpOffset; } /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 = /*offcode*/, repLength2-MINMATCH); + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, ST= ORE_REPCODE_1, repLength2); hashTable[ZSTD_hashPtr(ip, hlog, mls)] =3D current2; ip +=3D repLength2; anchor =3D ip; @@ -475,6 +650,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( return (size_t)(iend - anchor); } =20 +ZSTD_GEN_FAST_FN(extDict, 4, 0) +ZSTD_GEN_FAST_FN(extDict, 5, 0) +ZSTD_GEN_FAST_FN(extDict, 6, 0) +ZSTD_GEN_FAST_FN(extDict, 7, 0) =20 size_t ZSTD_compressBlock_fast_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -485,12 +664,12 @@ size_t ZSTD_compressBlock_fast_extDict( { default: /* includes case 3 */ case 4 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 4); + return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src,= srcSize); case 5 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 5); + return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src,= srcSize); case 6 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 6); + return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src,= srcSize); case 7 : - return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, = src, srcSize, 7); + return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src,= srcSize); } } diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c index fb54d4e28a2b..0298a01a7504 100644 --- a/lib/zstd/compress/zstd_lazy.c +++ b/lib/zstd/compress/zstd_lazy.c @@ -61,7 +61,7 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, * assumption : curr >=3D btlow =3D=3D (curr - btmask) * doesn't fail */ static void 
-ZSTD_insertDUBT1(ZSTD_matchState_t* ms, +ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, U32 curr, const BYTE* inputEnd, U32 nbCompares, U32 btLow, const ZSTD_dictMode_e dictMode) @@ -151,7 +151,7 @@ ZSTD_insertDUBT1(ZSTD_matchState_t* ms, =20 static size_t ZSTD_DUBT_findBetterDictMatch ( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, size_t* offsetPtr, size_t bestLength, @@ -197,8 +197,8 @@ ZSTD_DUBT_findBetterDictMatch ( U32 matchIndex =3D dictMatchIndex + dictIndexDelta; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(= curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found bet= ter match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, match= Index %u)", - curr, (U32)bestLength, (U32)matchLength, (U32)*offsetP= tr, ZSTD_REP_MOVE + curr - matchIndex, dictMatchIndex, matchIndex); - bestLength =3D matchLength, *offsetPtr =3D ZSTD_REP_MOVE += curr - matchIndex; + curr, (U32)bestLength, (U32)matchLength, (U32)*offsetP= tr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); + bestLength =3D matchLength, *offsetPtr =3D STORE_OFFSET(cu= rr - matchIndex); } if (ip+matchLength =3D=3D iend) { /* reached end of input : = ip[matchLength] is not valid, no way to know if it's larger or smaller than= match */ break; /* drop, to guarantee consistency (miss a little = bit of compression) */ @@ -218,7 +218,7 @@ ZSTD_DUBT_findBetterDictMatch ( } =20 if (bestLength >=3D MINMATCH) { - U32 const mIndex =3D curr - ((U32)*offsetPtr - ZSTD_REP_MOVE); (vo= id)mIndex; + U32 const mIndex =3D curr - (U32)STORED_OFFSET(*offsetPtr); (void)= mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of le= ngth %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -328,7 +328,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + 
(U32)matchLength; if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbi= t32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) - bestLength =3D matchLength, *offsetPtr =3D ZSTD_REP_MO= VE + curr - matchIndex; + bestLength =3D matchLength, *offsetPtr =3D STORE_OFFSE= T(curr - matchIndex); if (ip+matchLength =3D=3D iend) { /* equal : no way to k= now if inf or sup */ if (dictMode =3D=3D ZSTD_dictMatchState) { nbCompares =3D 0; /* in addition to avoiding check= ing any @@ -368,7 +368,7 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased = */ ms->nextToUpdate =3D matchEndIdx - 8; /* skip repetitive pattern= s */ if (bestLength >=3D MINMATCH) { - U32 const mIndex =3D curr - ((U32)*offsetPtr - ZSTD_REP_MOVE);= (void)mIndex; + U32 const mIndex =3D curr - (U32)STORED_OFFSET(*offsetPtr); (v= oid)mIndex; DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of leng= th %u and offsetCode %u (pos %u)", curr, (U32)bestLength, (U32)*offsetPtr, mIndex); } @@ -391,91 +391,9 @@ ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMod= e); } =20 - -static size_t -ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_noDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_noDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_noDict); - } -} - - -static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_dictMatchState); - case 5 : 
return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_dictMatchState); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_dictMatchState); - } -} - - -static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) -{ - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZST= D_extDict); - case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZST= D_extDict); - case 7 : - case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZST= D_extDict); - } -} - - - /* ********************************* -* Hash Chain +* Dedicated dict search ***********************************/ -#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] - -/* Update chains up to ip (excluded) - Assumption : always within prefix (i.e. not within extDict) */ -FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( - ZSTD_matchState_t* ms, - const ZSTD_compressionParameters* const cParams, - const BYTE* ip, U32 const mls) -{ - U32* const hashTable =3D ms->hashTable; - const U32 hashLog =3D cParams->hashLog; - U32* const chainTable =3D ms->chainTable; - const U32 chainMask =3D (1 << cParams->chainLog) - 1; - const BYTE* const base =3D ms->window.base; - const U32 target =3D (U32)(ip - base); - U32 idx =3D ms->nextToUpdate; - - while(idx < target) { /* catch up */ - size_t const h =3D ZSTD_hashPtr(base+idx, hashLog, mls); - NEXT_IN_CHAIN(idx, chainMask) =3D hashTable[h]; - hashTable[h] =3D idx; - idx++; - } - - ms->nextToUpdate =3D target; - return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; -} - -U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { - const ZSTD_compressionParameters* const cParams =3D &ms->cParams; - return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cPar= ams.minMatch); -} =20 void 
ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, c= onst BYTE* const ip) { @@ -485,7 +403,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_= matchState_t* ms, const B U32* const chainTable =3D ms->chainTable; U32 const chainSize =3D 1 << ms->cParams.chainLog; U32 idx =3D ms->nextToUpdate; - U32 const minChain =3D chainSize < target ? target - chainSize : idx; + U32 const minChain =3D chainSize < target - idx ? target - chainSize := idx; U32 const bucketSize =3D 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; U32 const cacheSize =3D bucketSize - 1; U32 const chainAttempts =3D (1 << ms->cParams.searchLog) - cacheSize; @@ -499,13 +417,12 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZST= D_matchState_t* ms, const B U32 const hashLog =3D ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; U32* const tmpHashTable =3D hashTable; U32* const tmpChainTable =3D hashTable + ((size_t)1 << hashLog); - U32 const tmpChainSize =3D ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << h= ashLog; + U32 const tmpChainSize =3D (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1)= << hashLog; U32 const tmpMinChain =3D tmpChainSize < target ? 
target - tmpChainSiz= e : idx; - U32 hashIdx; =20 assert(ms->cParams.chainLog <=3D 24); - assert(ms->cParams.hashLog >=3D ms->cParams.chainLog); + assert(ms->cParams.hashLog > ms->cParams.chainLog); assert(idx !=3D 0); assert(tmpMinChain <=3D minChain); =20 @@ -536,7 +453,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_= matchState_t* ms, const B if (count =3D=3D cacheSize) { for (count =3D 0; count < chainLimit;) { if (i < minChain) { - if (!i || countBeyondMinChain++ > cacheSize) { + if (!i || ++countBeyondMinChain > cacheSize) { /* only allow pulling `cacheSize` number of en= tries * into the cache or chainTable beyond `minCha= in`, * to replace the entries pulled out of the @@ -592,10 +509,143 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZS= TD_matchState_t* ms, const B ms->nextToUpdate =3D target; } =20 +/* Returns the longest match length found in the dedicated dict search str= ucture. + * If none are longer than the argument ml, then ml will be returned. + */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, = U32 nbAttempts, + const ZSTD_matchState_t* const= dms, + const BYTE* const ip, const BY= TE* const iLimit, + const BYTE* const prefixStart,= const U32 curr, + const U32 dictLimit, const siz= e_t ddsIdx) { + const U32 ddsLowestIndex =3D dms->window.dictLimit; + const BYTE* const ddsBase =3D dms->window.base; + const BYTE* const ddsEnd =3D dms->window.nextSrc; + const U32 ddsSize =3D (U32)(ddsEnd - ddsBase); + const U32 ddsIndexDelta =3D dictLimit - ddsSize; + const U32 bucketSize =3D (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); + const U32 bucketLimit =3D nbAttempts < bucketSize - 1 ? 
nbAttempts= : bucketSize - 1; + U32 ddsAttempt; + U32 matchIndex; + + for (ddsAttempt =3D 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { + PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); + } + + { + U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucketSiz= e - 1]; + U32 const chainIndex =3D chainPackedPointer >> 8; + + PREFETCH_L1(&dms->chainTable[chainIndex]); + } + + for (ddsAttempt =3D 0; ddsAttempt < bucketLimit; ddsAttempt++) { + size_t currentMl=3D0; + const BYTE* match; + matchIndex =3D dms->hashTable[ddsIdx + ddsAttempt]; + match =3D ddsBase + matchIndex; + + if (!matchIndex) { + return ml; + } + + /* guaranteed by table construction */ + (void)ddsLowestIndex; + assert(matchIndex >=3D ddsLowestIndex); + assert(match+4 <=3D ddsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) { + /* assumption : matchIndex <=3D dictLimit-4 (by table construc= tion) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, ddsE= nd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + ddsIndexDelta= )); + if (ip+currentMl =3D=3D iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; + } + } + } + + { + U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucketSiz= e - 1]; + U32 chainIndex =3D chainPackedPointer >> 8; + U32 const chainLength =3D chainPackedPointer & 0xFF; + U32 const chainAttempts =3D nbAttempts - ddsAttempt; + U32 const chainLimit =3D chainAttempts > chainLength ? 
chainLength= : chainAttempts; + U32 chainAttempt; + + for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAttempt+= +) { + PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttemp= t]); + } + + for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAttempt+= +, chainIndex++) { + size_t currentMl=3D0; + const BYTE* match; + matchIndex =3D dms->chainTable[chainIndex]; + match =3D ddsBase + matchIndex; + + /* guaranteed by table construction */ + assert(matchIndex >=3D ddsLowestIndex); + assert(match+4 <=3D ddsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) { + /* assumption : matchIndex <=3D dictLimit-4 (by table cons= truction) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, = ddsEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + ddsIndexD= elta)); + if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ + } + } + } + return ml; +} + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable =3D ms->hashTable; + const U32 hashLog =3D cParams->hashLog; + U32* const chainTable =3D ms->chainTable; + const U32 chainMask =3D (1 << cParams->chainLog) - 1; + const BYTE* const base =3D ms->window.base; + const U32 target =3D (U32)(ip - base); + U32 idx =3D ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h =3D ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) =3D hashTable[h]; + hashTable[h] =3D idx; + idx++; + } + + ms->nextToUpdate =3D target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams =3D &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cPar= ams.minMatch); +} =20 /* inlining is important to hardwire a hot branch (template emulation) */ FORCE_INLINE_TEMPLATE -size_t ZSTD_HcFindBestMatch_generic ( +size_t ZSTD_HcFindBestMatch( ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iLimit, size_t* offsetPtr, @@ -653,7 +703,7 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml =3D currentMl; - *offsetPtr =3D curr - matchIndex + ZSTD_REP_MOVE; + *offsetPtr =3D STORE_OFFSET(curr - matchIndex); if (ip+currentMl =3D=3D iLimit) break; /* best possible, avoid= s read overflow on next attempt */ } =20 @@ -663,90 +713,8 @@ size_t ZSTD_HcFindBestMatch_generic ( =20 assert(nbAttempts <=3D (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven'= t underflowed. 
*/ if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { - const U32 ddsLowestIndex =3D dms->window.dictLimit; - const BYTE* const ddsBase =3D dms->window.base; - const BYTE* const ddsEnd =3D dms->window.nextSrc; - const U32 ddsSize =3D (U32)(ddsEnd - ddsBase); - const U32 ddsIndexDelta =3D dictLimit - ddsSize; - const U32 bucketSize =3D (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); - const U32 bucketLimit =3D nbAttempts < bucketSize - 1 ? nbAtte= mpts : bucketSize - 1; - U32 ddsAttempt; - - for (ddsAttempt =3D 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { - PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); - } - - { - U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucke= tSize - 1]; - U32 const chainIndex =3D chainPackedPointer >> 8; - - PREFETCH_L1(&dms->chainTable[chainIndex]); - } - - for (ddsAttempt =3D 0; ddsAttempt < bucketLimit; ddsAttempt++) { - size_t currentMl=3D0; - const BYTE* match; - matchIndex =3D dms->hashTable[ddsIdx + ddsAttempt]; - match =3D ddsBase + matchIndex; - - if (!matchIndex) { - return ml; - } - - /* guaranteed by table construction */ - (void)ddsLowestIndex; - assert(matchIndex >=3D ddsLowestIndex); - assert(match+4 <=3D ddsEnd); - if (MEM_read32(match) =3D=3D MEM_read32(ip)) { - /* assumption : matchIndex <=3D dictLimit-4 (by table cons= truction) */ - currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLimit, = ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + ddsIndexDelta) + ZSTD_= REP_MOVE; - if (ip+currentMl =3D=3D iLimit) { - /* best possible, avoids read overflow on next attempt= */ - return ml; - } - } - } - - { - U32 const chainPackedPointer =3D dms->hashTable[ddsIdx + bucke= tSize - 1]; - U32 chainIndex =3D chainPackedPointer >> 8; - U32 const chainLength =3D chainPackedPointer & 0xFF; - U32 const chainAttempts =3D nbAttempts - ddsAttempt; - U32 const chainLimit =3D chainAttempts > chainLength ? 
chainLe= ngth : chainAttempts; - U32 chainAttempt; - - for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAtte= mpt++) { - PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAt= tempt]); - } - - for (chainAttempt =3D 0 ; chainAttempt < chainLimit; chainAtte= mpt++, chainIndex++) { - size_t currentMl=3D0; - const BYTE* match; - matchIndex =3D dms->chainTable[chainIndex]; - match =3D ddsBase + matchIndex; - - /* guaranteed by table construction */ - assert(matchIndex >=3D ddsLowestIndex); - assert(match+4 <=3D ddsEnd); - if (MEM_read32(match) =3D=3D MEM_read32(ip)) { - /* assumption : matchIndex <=3D dictLimit-4 (by table = construction) */ - currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLim= it, ddsEnd, prefixStart) + 4; - } - - /* save best solution */ - if (currentMl > ml) { - ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + ddsIndexDelta) + Z= STD_REP_MOVE; - if (ip+currentMl =3D=3D iLimit) break; /* best possibl= e, avoids read overflow on next attempt */ - } - } - } + ml =3D ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttem= pts, dms, + ip, iLimit, prefixStart,= curr, dictLimit, ddsIdx); } else if (dictMode =3D=3D ZSTD_dictMatchState) { const U32* const dmsChainTable =3D dms->chainTable; const U32 dmsChainSize =3D (1 << dms->cParams.chainLog); @@ -770,7 +738,8 @@ size_t ZSTD_HcFindBestMatch_generic ( /* save best solution */ if (currentMl > ml) { ml =3D currentMl; - *offsetPtr =3D curr - (matchIndex + dmsIndexDelta) + ZSTD_= REP_MOVE; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + dmsIndexD= elta)); if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ } =20 @@ -783,75 +752,725 @@ size_t ZSTD_HcFindBestMatch_generic ( return ml; } =20 +/* ********************************* +* (SIMD) Row-based matchfinder +***********************************/ +/* Constants for row-based hash */ +#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of 
hashes in the ma= tch state's tagTable from the beginning of a row */ +#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ +#define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) +#define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entr= ies per row, for all configurations */ + +#define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) =20 -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U6= 4 representing a mask of matches */ + +/* ZSTD_VecMask_next(): + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { + assert(val !=3D 0); +# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ =3D=3D 3) && (_= _GNUC_MINOR__ >=3D 4)))) + if (sizeof(size_t) =3D=3D 4) { + U32 mostSignificantWord =3D (U32)(val >> 32); + U32 leastSignificantWord =3D (U32)val; + if (leastSignificantWord =3D=3D 0) { + return 32 + (U32)__builtin_ctz(mostSignificantWord); + } else { + return (U32)__builtin_ctz(leastSignificantWord); + } + } else { + return (U32)__builtin_ctzll(val); + } +# else + /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%2= 0Count + * and: https://stackoverflow.com/questions/2709430/count-number-of-bi= ts-in-a-64-bit-long-big-integer + */ + val =3D ~val & (val - 1ULL); /* Lowest set bit mask */ + val =3D val - ((val >> 1) & 0x5555555555555555); + val =3D (val & 0x3333333333333333ULL) + ((val >> 2) & 0x33333333333333= 33ULL); + return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x10101010= 1010101ULL) >> 56); +# endif +} + +/* ZSTD_rotateRight_*(): + * Rotates a bitfield to the right by "count" bits. 
+ * https://en.wikipedia.org/w/index.php?title=3DCircular_shift&oldid=3D991= 635599#Implementing_circular_shifts + */ +FORCE_INLINE_TEMPLATE +U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { + assert(count < 64); + count &=3D 0x3F; /* for fickle pattern recognition */ + return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { + assert(count < 32); + count &=3D 0x1F; /* for fickle pattern recognition */ + return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +} + +FORCE_INLINE_TEMPLATE +U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { + assert(count < 16); + count &=3D 0x0F; /* for fickle pattern recognition */ + return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); +} + +/* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates = the "head" + * value to reflect the update. Essentially cycles backwards from [0, {ent= ries per row}) + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const= rowMask) { + U32 const next =3D (*tagRow - 1) & rowMask; + *tagRow =3D (BYTE)next; + return next; +} + +/* ZSTD_isAligned(): + * Checks that a pointer is aligned to "align" bytes which must be a power= of 2. + */ +MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + assert((align & (align - 1)) =3D=3D 0); + return (((size_t)ptr) & (align - 1)) =3D=3D 0; +} + +/* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 con= st* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >=3D 5) { + PREFETCH_L1(hashTable + relRow + 16); + /* Note: prefetching more of the hash table does not appear to be = beneficial for 128-entry rows */ + } + PREFETCH_L1(tagTable + relRow); + if (rowLog =3D=3D 6) { + PREFETCH_L1(tagTable + relRow + 32); + } + assert(rowLog =3D=3D 4 || rowLog =3D=3D 5 || rowLog =3D=3D 6); + assert(ZSTD_isAligned(hashTable + relRow, 64)); /* pre= fetched hash row always 64-byte aligned */ + assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* pre= fetched tagRow sits on correct multiple of bytes (32,64,128) */ +} + +/* ZSTD_row_fillHashCache(): + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH= _CACHE_SIZE entries, + * but not beyond iLimit. + */ +FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, c= onst BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_noDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_noDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_noDict); + U32 const* const hashTable =3D ms->hashTable; + U16 const* const tagTable =3D ms->tagTable; + U32 const hashLog =3D ms->rowHashLog; + U32 const maxElemsToPrefetch =3D (base + idx) > iLimit ? 
0 : (U32)(iLi= mit - (base + idx) + 1); + U32 const lim =3D idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefet= ch); + + for (; idx < lim; ++idx) { + U32 const hash =3D (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_RO= W_HASH_TAG_BITS, mls); + U32 const row =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] =3D hash; } + + DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms-= >hashCache[0], ms->hashCache[1], + ms->hashCache[2], ms-= >hashCache[3], ms->hashCache[4], + ms->hashCache[5], ms-= >hashCache[6], ms->hashCache[7]); } =20 +/* ZSTD_row_nextCachedHash(): + * Returns the hash of base + idx, and replaces the hash in the hash cache= with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate = rows from hashTable and tagTable. + */ +FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* h= ashTable, + U16 const* tagTable, BYT= E const* base, + U32 idx, U32 const hashL= og, + U32 const rowLog, U32 co= nst mls) +{ + U32 const newHash =3D (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_S= IZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const row =3D (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash =3D cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; + cache[idx & ZSTD_ROW_HASH_CACHE_MASK] =3D newHash; + return hash; + } +} =20 -static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx unti= l updateEndIdx. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t*= ms, + U32 updateStartIdx= , U32 const updateEndIdx, + U32 const mls, U32= const rowLog, + U32 const rowMask,= U32 const useCache) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_dictMatchState); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_dictMatchState); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_dictMatchState); + U32* const hashTable =3D ms->hashTable; + U16* const tagTable =3D ms->tagTable; + U32 const hashLog =3D ms->rowHashLog; + const BYTE* const base =3D ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=3D%u, upda= teEndIdx=3D%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { + U32 const hash =3D useCache ? ZSTD_row_nextCachedHash(ms->hashCach= e, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) + : (U32)ZSTD_hashPtr(base + updateStartId= x, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const relRow =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row =3D hashTable + relRow; + BYTE* tagRow =3D (BYTE*)(tagTable + relRow); /* Though tagTable i= s laid out as a table of U16, each tag is only 1 byte. + Explicit cast allow= s us to get exact desired position within each row */ + U32 const pos =3D ZSTD_row_nextIndex(tagRow, rowMask); + + assert(hash =3D=3D ZSTD_hashPtr(base + updateStartIdx, hashLog + Z= STD_ROW_HASH_TAG_BITS, mls)); + ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] =3D hash & ZSTD_RO= W_HASH_TAG_MASK; + row[pos] =3D updateStartIdx; } } =20 +/* ZSTD_row_update_internal(): + * Inserts the byte at ip into the appropriate position in the hash table,= and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. 
+ */ +FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms,= const BYTE* ip, + U32 const mls, U32 con= st rowLog, + U32 const rowMask, U32= const useCache) +{ + U32 idx =3D ms->nextToUpdate; + const BYTE* const base =3D ms->window.base; + const U32 target =3D (U32)(ip - base); + const U32 kSkipThreshold =3D 384; + const U32 kMaxMatchStartPositionsToUpdate =3D 96; + const U32 kMaxMatchEndPositionsToUpdate =3D 32; + + if (useCache) { + /* Only skip positions when using hash cache, i.e. + * if we are loading a dict, don't skip anything. + * If we decide to skip, then we only update a set number + * of positions at the beginning and end of the match. + */ + if (UNLIKELY(target - idx > kSkipThreshold)) { + U32 const bound =3D idx + kMaxMatchStartPositionsToUpdate; + ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowM= ask, useCache); + idx =3D target - kMaxMatchEndPositionsToUpdate; + ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); + } + } + assert(target >=3D idx); + ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, us= eCache); + ms->nextToUpdate =3D target; +} =20 -static size_t ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS ( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) +/* ZSTD_row_update(): + * External wrapper for ZSTD_row_update_internal(). Used for filling the h= ashtable during dictionary + * processing. 
+ */ +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask =3D (1u << rowLog) - 1; + const U32 mls =3D MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=3D%u", rowLog); + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use c= ache */); +} + +#if defined(ZSTD_ARCH_X86_SSE2) +FORCE_INLINE_TEMPLATE ZSTD_VecMask +ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, c= onst U32 head) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_dedicatedDictSearch); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_dedicatedDictSearch); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_dedicatedDictSearch); + const __m128i comparisonMask =3D _mm_set1_epi8((char)tag); + int matches[4] =3D {0}; + int i; + assert(nbChunks =3D=3D 1 || nbChunks =3D=3D 2 || nbChunks =3D=3D 4); + for (i=3D0; i> chunkSize; + do { + size_t chunk =3D MEM_readST(&src[i]); + chunk ^=3D splatChar; + chunk =3D (((chunk | x80) - x01) | chunk) & x80; + matches <<=3D chunkSize; + matches |=3D (chunk * extractMagic) >> shiftAmount; + i -=3D chunkSize; + } while (i >=3D 0); + } else { /* big endian: reverse bits during extraction */ + const size_t msb =3D xFF ^ (xFF >> 1); + const size_t extractMagic =3D (msb / 0x1FF) | msb; + do { + size_t chunk =3D MEM_readST(&src[i]); + chunk ^=3D splatChar; + chunk =3D (((chunk | x80) - x01) | chunk) & x80; + matches <<=3D chunkSize; + matches |=3D ((chunk >> 7) * extractMagic) >> shiftAmount; + i -=3D chunkSize; + } while (i >=3D 0); + } + matches =3D ~matches; + if (rowEntries =3D=3D 16) { + return ZSTD_rotateRight_U16((U16)matches, head); + } else if (rowEntries =3D=3D 32) { + return ZSTD_rotateRight_U32((U32)matches, 
head); + } else { + return ZSTD_rotateRight_U64((U64)matches, head); + } + } +#endif +} =20 -FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS ( +/* The high-level approach of the SIMD row based match finder is as follow= s: + * - Figure out where to insert the new entry: + * - Generate a hash from a byte along with an additional 1-byte "sho= rt hash". The additional byte is our "tag" + * - The hashTable is effectively split into groups or "rows" of 16 o= r 32 entries of U32, and the hash determines + * which row to insert into. + * - Determine the correct position within the row to insert the entr= y into. Each row of 16 or 32 can + * be considered as a circular buffer with a "head" index that resi= des in the tagTable. + * - Also insert the "tag" into the equivalent row and position in th= e tagTable. + * - Note: The tagTable has 17 or 33 1-byte entries per row, due = to 16 or 32 tags, and 1 "head" entry. + * The 17 or 33 entry rows are spaced out to occur every = 32 or 64 bytes, respectively, + * for alignment/performance reasons, leaving some bytes = unused. + * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byt= e "short hash" and + * generate a bitfield that we can cycle through to check the collisions= in the hash table. + * - Pick the longest match. 
+ */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_RowFindBestMatch( ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* const iLimit, - size_t* offsetPtr) + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) { - switch(ms->cParams.minMatch) - { - default : /* includes case 3 */ - case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 4, ZSTD_extDict); - case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 5, ZSTD_extDict); - case 7 : - case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr= , 6, ZSTD_extDict); + U32* const hashTable =3D ms->hashTable; + U16* const tagTable =3D ms->tagTable; + U32* const hashCache =3D ms->hashCache; + const U32 hashLog =3D ms->rowHashLog; + const ZSTD_compressionParameters* const cParams =3D &ms->cParams; + const BYTE* const base =3D ms->window.base; + const BYTE* const dictBase =3D ms->window.dictBase; + const U32 dictLimit =3D ms->window.dictLimit; + const BYTE* const prefixStart =3D base + dictLimit; + const BYTE* const dictEnd =3D dictBase + dictLimit; + const U32 curr =3D (U32)(ip-base); + const U32 maxDistance =3D 1U << cParams->windowLog; + const U32 lowestValid =3D ms->window.lowLimit; + const U32 withinMaxDistance =3D (curr - lowestValid > maxDistance) ? c= urr - maxDistance : lowestValid; + const U32 isDictionary =3D (ms->loadedDictEnd !=3D 0); + const U32 lowLimit =3D isDictionary ? 
lowestValid : withinMaxDistance; + const U32 rowEntries =3D (1U << rowLog); + const U32 rowMask =3D rowEntries - 1; + const U32 cappedSearchLog =3D MIN(cParams->searchLog, rowLog); /* nb o= f searches is capped at nb entries per row */ + U32 nbAttempts =3D 1U << cappedSearchLog; + size_t ml=3D4-1; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms =3D ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx =3D 0; + U32 ddsExtraAttempts =3D 0; /* cctx hash tables are limited in searche= s, but allow extra searches into DDS */ + U32 dmsTag =3D 0; + U32* dmsRow =3D NULL; + BYTE* dmsTagRow =3D NULL; + + if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + const U32 ddsHashLog =3D dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUC= KET_LOG; + { /* Prefetch DDS hashtable entry */ + ddsIdx =3D ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS= _BUCKET_LOG; + PREFETCH_L1(&dms->hashTable[ddsIdx]); + } + ddsExtraAttempts =3D cParams->searchLog > rowLog ? 
1U << (cParams-= >searchLog - rowLog) : 0; + } + + if (dictMode =3D=3D ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable =3D dms->hashTable; + U16* const dmsTagTable =3D dms->tagTable; + U32 const dmsHash =3D (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD= _ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow =3D (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << row= Log; + dmsTag =3D dmsHash & ZSTD_ROW_HASH_TAG_MASK; + dmsTagRow =3D (BYTE*)(dmsTagTable + dmsRelRow); + dmsRow =3D dmsHashTable + dmsRelRow; + ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); + } + + /* Update the hashTable and tagTable up to (but not including) ip */ + ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache *= /); + { /* Get the hash for ip, compute the appropriate row */ + U32 const hash =3D ZSTD_row_nextCachedHash(hashCache, hashTable, t= agTable, base, curr, hashLog, rowLog, mls); + U32 const relRow =3D (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag =3D hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row =3D hashTable + relRow; + BYTE* tagRow =3D (BYTE*)(tagTable + relRow); + U32 const head =3D *tagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches =3D 0; + size_t currMatch =3D 0; + ZSTD_VecMask matches =3D ZSTD_row_getMatchMask(tagRow, (BYTE)tag, = head, rowEntries); + + /* Cycle through the matches and prefetch */ + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= =3D (matches - 1)) { + U32 const matchPos =3D (head + ZSTD_VecMask_next(matches)) & r= owMask; + U32 const matchIndex =3D row[matchPos]; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; + if ((dictMode !=3D ZSTD_extDict) || matchIndex >=3D dictLimit)= { + PREFETCH_L1(base + matchIndex); + } else { + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] =3D matchIndex; + } + + /* Speed opt: insert current byte into hashtable too. 
This allows = us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos =3D ZSTD_row_nextIndex(tagRow, rowMask); + tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] =3D (BYTE)tag; + row[pos] =3D ms->nextToUpdate++; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex =3D matchBuffer[currMatch]; + size_t currentMl=3D0; + assert(matchIndex < curr); + assert(matchIndex >=3D lowLimit); + + if ((dictMode !=3D ZSTD_extDict) || matchIndex >=3D dictLimit)= { + const BYTE* const match =3D base + matchIndex; + assert(matchIndex >=3D dictLimit); /* ensures this is tr= ue if dictMode !=3D ZSTD_extDict */ + if (match[ml] =3D=3D ip[ml]) /* potentially better */ + currentMl =3D ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match =3D dictBase + matchIndex; + assert(match+4 <=3D dictEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) /* assumpti= on : matchIndex <=3D dictLimit-4 (by table construction) */ + currentMl =3D ZSTD_count_2segments(ip+4, match+4, iLim= it, dictEnd, prefixStart) + 4; + } + + /* Save best solution */ + if (currentMl > ml) { + ml =3D currentMl; + *offsetPtr =3D STORE_OFFSET(curr - matchIndex); + if (ip+currentMl =3D=3D iLimit) break; /* best possible, a= voids read overflow on next attempt */ + } + } + } + + assert(nbAttempts <=3D (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven'= t underflowed. 
*/ + if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + ml =3D ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttem= pts + ddsExtraAttempts, dms, + ip, iLimit, prefixStart,= curr, dictLimit, ddsIdx); + } else if (dictMode =3D=3D ZSTD_dictMatchState) { + /* TODO: Measure and potentially add prefetching to DMS */ + const U32 dmsLowestIndex =3D dms->window.dictLimit; + const BYTE* const dmsBase =3D dms->window.base; + const BYTE* const dmsEnd =3D dms->window.nextSrc; + const U32 dmsSize =3D (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta =3D dictLimit - dmsSize; + + { U32 const head =3D *dmsTagRow & rowMask; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches =3D 0; + size_t currMatch =3D 0; + ZSTD_VecMask matches =3D ZSTD_row_getMatchMask(dmsTagRow, (BYT= E)dmsTag, head, rowEntries); + + for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matche= s &=3D (matches - 1)) { + U32 const matchPos =3D (head + ZSTD_VecMask_next(matches))= & rowMask; + U32 const matchIndex =3D dmsRow[matchPos]; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] =3D matchIndex; + } + + /* Return the longest match */ + for (; currMatch < numMatches; ++currMatch) { + U32 const matchIndex =3D matchBuffer[currMatch]; + size_t currentMl=3D0; + assert(matchIndex >=3D dmsLowestIndex); + assert(matchIndex < curr); + + { const BYTE* const match =3D dmsBase + matchIndex; + assert(match+4 <=3D dmsEnd); + if (MEM_read32(match) =3D=3D MEM_read32(ip)) + currentMl =3D ZSTD_count_2segments(ip+4, match+4, = iLimit, dmsEnd, prefixStart) + 4; + } + + if (currentMl > ml) { + ml =3D currentMl; + assert(curr > matchIndex + dmsIndexDelta); + *offsetPtr =3D STORE_OFFSET(curr - (matchIndex + dmsIn= dexDelta)); + if (ip+currentMl =3D=3D iLimit) break; + } + } + } } + return ml; } =20 =20 +/* + * Generate search functions templated on (dictMode, mls, rowLog). + * These functions are outlined for code size & compilation time. 
+ * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating= a + * bunch of constants from the ZSTD_matchState_t. These computations could= be + * done in an initialization function, and saved somewhere in the match st= ate. + * Then we could pass a pointer to the saved state instead of the match st= ate, + * and avoid duplicate computations. + * + * TODO: Move the match re-winding into searchMax. This improves compressi= on + * ratio, and unlocks further simplifications with the next TODO. + * + * TODO: Try moving the repcode search into searchMax. After the re-winding + * and repcode search are in searchMax, there is no more logic in the match + * finder loop that requires knowledge about the dictMode. So we should be + * able to avoid force inlining it, and we can join the extDict loop with + * the single segment loop. It should go in searchMax instead of its own + * function to avoid having multiple virtual function calls per search. 
+ */ + +#define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##= _##mls +#define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##= _##mls +#define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##= dictMode##_##mls##_##rowLog + +#define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE + +#define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( = \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offBasePtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_= ##dictMode); \ + } = \ + +#define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( = \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offsetPtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_#= #dictMode); \ + } = \ + +#define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) = \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(= \ + ZSTD_matchState_t* ms, = \ + const BYTE* ip, const BYTE* const iLimit, = \ + size_t* offsetPtr) = \ + { = \ + assert(MAX(4, MIN(6, ms->cParams.minMatch)) =3D=3D mls); = \ + assert(MAX(4, MIN(6, ms->cParams.searchLog)) =3D=3D rowLog); = \ + return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_= ##dictMode, rowLog); \ + } = \ + +#define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ + X(dictMode, mls, 4) \ + X(dictMode, mls, 5) \ + X(dictMode, mls, 6) + +#define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ + ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) + +#define ZSTD_FOR_EACH_MLS(X, dictMode) \ + X(dictMode, 4) \ + X(dictMode, 5) \ + X(dictMode, 6) + +#define 
ZSTD_FOR_EACH_DICT_MODE(X, ...) \ + X(__VA_ARGS__, noDict) \ + X(__VA_ARGS__, extDict) \ + X(__VA_ARGS__, dictMatchState) \ + X(__VA_ARGS__, dedicatedDictSearch) + +/* Generate row search fns for each combination of (dictMode, mls, rowLog)= */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) +/* Generate binary Tree search fns for each combination of (dictMode, mls)= */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) +/* Generate hash chain search fns for each combination of (dictMode, mls) = */ +ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) + +typedef enum { search_hashChain=3D0, search_binaryTree=3D1, search_rowHash= =3D2 } searchMethod_e; + +#define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ + case mls: \ + return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); +#define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) = \ + case rowLog: = \ + return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, off= setPtr); + +#define ZSTD_SWITCH_MLS(X, dictMode) \ + switch (mls) { \ + ZSTD_FOR_EACH_MLS(X, dictMode) \ + } + +#define ZSTD_SWITCH_ROWLOG(dictMode, mls) = \ + case mls: = \ + switch (rowLog) { = \ + ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, ml= s) \ + } = \ + ZSTD_UNREACHABLE; = \ + break; + +#define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ + switch (searchMethod) { \ + case search_hashChain: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ + break; \ + case search_binaryTree: \ + ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ + break; \ + case search_rowHash: \ + ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ + break; \ + } \ + ZSTD_UNREACHABLE; + +/* + * Searches for the longest match at @p ip. + * Dispatches to the correct implementation function based on the + * (searchMethod, dictMode, mls, rowLog). 
We use switch statements + * here instead of using an indirect function call through a function + * pointer because after Spectre and Meltdown mitigations, indirect + * function calls can be very costly, especially in the kernel. + * + * NOTE: dictMode and searchMethod should be templated, so those switch + * statements should be optimized out. Only the mls & rowLog switches + * should be left. + * + * @param ms The match state. + * @param ip The position to search at. + * @param iend The end of the input data. + * @param[out] offsetPtr Stores the match offset into this pointer. + * @param mls The minimum search length, in the range [4, 6]. + * @param rowLog The row log (if applicable), in the range [4, 6]. + * @param searchMethod The search method to use (templated). + * @param dictMode The dictMode (templated). + * + * @returns The length of the longest match found, or < mls if no match is= found. + * If a match is found its offset is stored in @p offsetPtr. + */ +FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + ZSTD_matchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, + U32 const mls, + U32 const rowLog, + searchMethod_e const searchMethod, + ZSTD_dictMode_e const dictMode) +{ + if (dictMode =3D=3D ZSTD_noDict) { + ZSTD_SWITCH_SEARCH_METHOD(noDict) + } else if (dictMode =3D=3D ZSTD_extDict) { + ZSTD_SWITCH_SEARCH_METHOD(extDict) + } else if (dictMode =3D=3D ZSTD_dictMatchState) { + ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) + } else if (dictMode =3D=3D ZSTD_dedicatedDictSearch) { + ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) + } + ZSTD_UNREACHABLE; + return 0; +} + /* ******************************* * Common parser - lazy strategy *********************************/ -typedef enum { search_hashChain, search_binaryTree } searchMethod_e; =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_generic( @@ -865,41 +1484,13 @@ ZSTD_compressBlock_lazy_generic( const BYTE* ip =3D istart; const BYTE* anchor =3D istart; const BYTE* const iend =3D 
istart + srcSize; - const BYTE* const ilimit =3D iend - 8; + const BYTE* const ilimit =3D (searchMethod =3D=3D search_rowHash) ? ie= nd - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base =3D ms->window.base; const U32 prefixLowestIndex =3D ms->window.dictLimit; const BYTE* const prefixLowest =3D base + prefixLowestIndex; + const U32 mls =3D BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); =20 - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offset= Ptr); - - /* - * This table is indexed first by the four ZSTD_dictMode_e values, and= then - * by the two searchMethod_e values. NULLs are placed for configuratio= ns - * that should never occur (extDict modes go to the other implementati= on - * below and there is no DDSS for binary tree search yet). - */ - const searchMax_f searchFuncs[4][2] =3D { - { - ZSTD_HcFindBestMatch_selectMLS, - ZSTD_BtFindBestMatch_selectMLS - }, - { - NULL, - NULL - }, - { - ZSTD_HcFindBestMatch_dictMatchState_selectMLS, - ZSTD_BtFindBestMatch_dictMatchState_selectMLS - }, - { - ZSTD_HcFindBestMatch_dedicatedDictSearch_selectMLS, - NULL - } - }; - - searchMax_f const searchMax =3D searchFuncs[dictMode][searchMethod =3D= =3D search_binaryTree]; U32 offset_1 =3D rep[0], offset_2 =3D rep[1], savedOffset=3D0; =20 const int isDMS =3D dictMode =3D=3D ZSTD_dictMatchState; @@ -915,11 +1506,7 @@ ZSTD_compressBlock_lazy_generic( 0; const U32 dictAndPrefixLength =3D (U32)((ip - prefixLowest) + (dictEnd= - dictLowest)); =20 - assert(searchMax !=3D NULL); - - DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=3D%u)", (U32)di= ctMode); - - /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=3D%u) (searchFu= nc=3D%u)", (U32)dictMode, (U32)searchMethod); ip +=3D (dictAndPrefixLength =3D=3D 0); if (dictMode =3D=3D ZSTD_noDict) { U32 const curr =3D (U32)(ip - base); @@ -935,6 +1522,12 @@ ZSTD_compressBlock_lazy_generic( 
assert(offset_2 <=3D dictAndPrefixLength); } =20 + if (searchMethod =3D=3D search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out at= 6 */), + ms->nextToUpdate, ilimit); + } + /* Match Loop */ #if defined(__x86_64__) /* I've measured random a 5% speed loss on levels 5 & 6 (greedy) when = the @@ -944,8 +1537,9 @@ ZSTD_compressBlock_lazy_generic( #endif while (ip < ilimit) { size_t matchLength=3D0; - size_t offset=3D0; + size_t offcode=3DSTORE_REPCODE_1; const BYTE* start=3Dip+1; + DEBUGLOG(7, "search baseline (depth 0)"); =20 /* check repCode */ if (isDxS) { @@ -969,9 +1563,9 @@ ZSTD_compressBlock_lazy_generic( =20 /* first search (depth 0) */ { size_t offsetFound =3D 999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offsetFound= , mls, rowLog, searchMethod, dictMode); if (ml2 > matchLength) - matchLength =3D ml2, start =3D ip, offset=3DoffsetFound; + matchLength =3D ml2, start =3D ip, offcode=3DoffsetFound; } =20 if (matchLength < 4) { @@ -982,14 +1576,15 @@ ZSTD_compressBlock_lazy_generic( /* let's try to find a better solution */ if (depth>=3D1) while (ip0) & (MEM_read32(ip) =3D=3D MEM_re= ad32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) =3D=3D MEM_r= ead32(ip - offset_1)))) { size_t const mlRep =3D ZSTD_count(ip+4, ip+4-offset_1, ien= d) + 4; int const gain2 =3D (int)(mlRep * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit32((= U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1, st= art =3D ip; } if (isDxS) { const U32 repIndex =3D (U32)(ip - base) - offset_1; @@ -1001,30 +1596,31 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd =3D repIndex < prefixLowestInd= ex ? 
dictEnd : iend; size_t const mlRep =3D ZSTD_count_2segments(ip+4, repM= atch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 =3D (int)(mlRep * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1= , start =3D ip; } } { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)offs= et2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)offset+1) + 4); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offset2= , mls, rowLog, searchMethod, dictMode); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)STOR= ED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D ip; + matchLength =3D ml2, offcode =3D offset2, start =3D ip; continue; /* search a better one */ } } =20 /* let's find an even better one */ if ((depth=3D=3D2) && (ip0) & (MEM_read32(ip) =3D=3D ME= M_read32(ip - offset_1)))) { + && (offcode) && ((offset_1>0) & (MEM_read32(ip) =3D=3D M= EM_read32(ip - offset_1)))) { size_t const mlRep =3D ZSTD_count(ip+4, ip+4-offset_1,= iend) + 4; int const gain2 =3D (int)(mlRep * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D ip; + matchLength =3D mlRep, offcode =3D STORE_REPCODE_1= , start =3D ip; } if (isDxS) { const U32 repIndex =3D (U32)(ip - base) - offset_1; @@ 
-1036,46 +1632,45 @@ ZSTD_compressBlock_lazy_generic( const BYTE* repMatchEnd =3D repIndex < prefixLowes= tIndex ? dictEnd : iend; size_t const mlRep =3D ZSTD_count_2segments(ip+4, = repMatch+4, iend, repMatchEnd, prefixLowest) + 4; int const gain2 =3D (int)(mlRep * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((mlRep >=3D 4) && (gain2 > gain1)) - matchLength =3D mlRep, offset =3D 0, start =3D= ip; + matchLength =3D mlRep, offcode =3D STORE_REPCO= DE_1, start =3D ip; } } { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= offset2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 7); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &off= set2, mls, rowLog, searchMethod, dictMode); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D= ip; + matchLength =3D ml2, offcode =3D offset2, start = =3D ip; continue; } } } break; /* nothing found : store previous solution */ } =20 /* NOTE: - * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. - * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, w= hich - * overflows the pointer, which is undefined behavior. + * Pay attention that `start[-value]` can lead to strange undefine= d behavior + * notably if `value` is unsigned, resulting in a large positive `= -value`. 
*/ /* catch up */ - if (offset) { + if (STORED_IS_OFFSET(offcode)) { if (dictMode =3D=3D ZSTD_noDict) { - while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE= ) > prefixLowest)) - && (start[-1] =3D=3D (start-(offset-ZSTD_REP_MOVE))[-= 1]) ) /* only search for offset within prefix */ + while ( ((start > anchor) & (start - STORED_OFFSET(offcode= ) > prefixLowest)) + && (start[-1] =3D=3D (start-STORED_OFFSET(offcode))[-= 1]) ) /* only search for offset within prefix */ { start--; matchLength++; } } if (isDxS) { - U32 const matchIndex =3D (U32)((start-base) - (offset - ZS= TD_REP_MOVE)); + U32 const matchIndex =3D (U32)((size_t)(start-base) - STOR= ED_OFFSET(offcode)); const BYTE* match =3D (matchIndex < prefixLowestIndex) ? d= ictBase + matchIndex - dictIndexDelta : base + matchIndex; const BYTE* const mStart =3D (matchIndex < prefixLowestInd= ex) ? dictLowest : prefixLowest; while ((start>anchor) && (match>mStart) && (start[-1] =3D= =3D match[-1])) { start--; match--; matchLength++; } /* catch up */ } - offset_2 =3D offset_1; offset_1 =3D (U32)(offset - ZSTD_REP_MO= VE); + offset_2 =3D offset_1; offset_1 =3D (U32)STORED_OFFSET(offcode= ); } /* store sequence */ _storeSequence: - { size_t const litLength =3D start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, = matchLength-MINMATCH); + { size_t const litLength =3D (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode,= matchLength); anchor =3D ip =3D start + matchLength; } =20 @@ -1091,8 +1686,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(repMatch) =3D=3D MEM_read32(ip)) ) { const BYTE* const repEnd2 =3D repIndex < prefixLowestI= ndex ? 
dictEnd : iend; matchLength =3D ZSTD_count_2segments(ip+4, repMatch+4,= iend, repEnd2, prefixLowest) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 = =3D (U32)offset; /* swap offset_2 <=3D> offset_1 */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLengt= h-MINMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 = =3D (U32)offcode; /* swap offset_2 <=3D> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE= _1, matchLength); ip +=3D matchLength; anchor =3D ip; continue; @@ -1106,8 +1701,8 @@ ZSTD_compressBlock_lazy_generic( && (MEM_read32(ip) =3D=3D MEM_read32(ip - offset_2)) ) { /* store sequence */ matchLength =3D ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 =3D (= U32)offset; /* swap repcodes */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MI= NMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 =3D = (U32)offcode; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = matchLength); ip +=3D matchLength; anchor =3D ip; continue; /* faster when present ... (?) 
*/ @@ -1200,6 +1795,70 @@ size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_hashChain, 0, ZSTD_dedicatedDictSearch); } =20 +/* Row-based matchfinder */ +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_dictMatchState); +} + + +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + 
void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 2, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 1, ZSTD_dedicatedDictSearch); +} + +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize= , search_rowHash, 0, ZSTD_dedicatedDictSearch); +} =20 FORCE_INLINE_TEMPLATE size_t ZSTD_compressBlock_lazy_extDict_generic( @@ -1212,7 +1871,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* ip =3D istart; const BYTE* anchor =3D istart; const BYTE* const iend =3D istart + srcSize; - const BYTE* const ilimit =3D iend - 8; + const BYTE* const ilimit =3D searchMethod =3D=3D search_rowHash ? iend= - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; const BYTE* const base =3D ms->window.base; const U32 dictLimit =3D ms->window.dictLimit; const BYTE* const prefixStart =3D base + dictLimit; @@ -1220,18 +1879,20 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const BYTE* const dictEnd =3D dictBase + dictLimit; const BYTE* const dictStart =3D dictBase + ms->window.lowLimit; const U32 windowLog =3D ms->cParams.windowLog; - - typedef size_t (*searchMax_f)( - ZSTD_matchState_t* ms, - const BYTE* ip, const BYTE* iLimit, size_t* offset= Ptr); - searchMax_f searchMax =3D searchMethod=3D=3Dsearch_binaryTree ? 
ZSTD_B= tFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS; + const U32 mls =3D BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog =3D BOUNDED(4, ms->cParams.searchLog, 6); =20 U32 offset_1 =3D rep[0], offset_2 =3D rep[1]; =20 - DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic"); + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=3D%u)= ", (U32)searchMethod); =20 /* init */ ip +=3D (ip =3D=3D prefixStart); + if (searchMethod =3D=3D search_rowHash) { + ZSTD_row_fillHashCache(ms, base, rowLog, + MIN(ms->cParams.minMatch, 6 /* mls caps out= at 6 */), + ms->nextToUpdate, ilimit); + } =20 /* Match Loop */ #if defined(__x86_64__) @@ -1242,7 +1903,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( #endif while (ip < ilimit) { size_t matchLength=3D0; - size_t offset=3D0; + size_t offcode=3DSTORE_REPCODE_1; const BYTE* start=3Dip+1; U32 curr =3D (U32)(ip-base); =20 @@ -1251,7 +1912,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex =3D (U32)(curr+1 - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? dictBase = : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex > win= dowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intentional o= verflow */ + & (offset_1 <=3D curr+1 - windowLow) ) /* note: we are sear= ching at curr+1 */ if (MEM_read32(ip+1) =3D=3D MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
dictEn= d : iend; @@ -1261,9 +1923,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( =20 /* first search (depth 0) */ { size_t offsetFound =3D 999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offsetFound); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offsetFound= , mls, rowLog, searchMethod, ZSTD_extDict); if (ml2 > matchLength) - matchLength =3D ml2, start =3D ip, offset=3DoffsetFound; + matchLength =3D ml2, start =3D ip, offcode=3DoffsetFound; } =20 if (matchLength < 4) { @@ -1277,29 +1939,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow =3D ZSTD_getLowestMatchIndex(ms, curr,= windowLog); const U32 repIndex =3D (U32)(curr - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? dictB= ase : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex >= windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intention= al overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <=3D curr - windowLow) ) /* equivalent to `= curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
di= ctEnd : iend; size_t const repLength =3D ZSTD_count_2segments(ip+4, = repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 =3D (int)(repLength * 3); - int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*3 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >=3D 4) && (gain2 > gain1)) - matchLength =3D repLength, offset =3D 0, start =3D= ip; + matchLength =3D repLength, offcode =3D STORE_REPCO= DE_1, start =3D ip; } } =20 /* search match, depth 1 */ { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)offs= et2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)offset+1) + 4); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &offset2= , mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)STOR= ED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit32((= U32)STORED_TO_OFFBASE(offcode)) + 4); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D ip; + matchLength =3D ml2, offcode =3D offset2, start =3D ip; continue; /* search a better one */ } } =20 @@ -1308,47 +1971,48 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( ip ++; curr++; /* check repCode */ - if (offset) { + if (offcode) { const U32 windowLow =3D ZSTD_getLowestMatchIndex(ms, c= urr, windowLog); const U32 repIndex =3D (U32)(curr - offset_1); const BYTE* const repBase =3D repIndex < dictLimit ? 
d= ictBase : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repInd= ex > windowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* inten= tional overflow : do not test positions overlapping 2 memory segments */ + & (offset_1 <=3D curr - windowLow) ) /* equivalent = to `curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected */ const BYTE* const repEnd =3D repIndex < dictLimit = ? dictEnd : iend; size_t const repLength =3D ZSTD_count_2segments(ip= +4, repMatch+4, iend, repEnd, prefixStart) + 4; int const gain2 =3D (int)(repLength * 4); - int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)offset+1) + 1); + int const gain1 =3D (int)(matchLength*4 - ZSTD_hig= hbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); if ((repLength >=3D 4) && (gain2 > gain1)) - matchLength =3D repLength, offset =3D 0, start= =3D ip; + matchLength =3D repLength, offcode =3D STORE_R= EPCODE_1, start =3D ip; } } =20 /* search match, depth 2 */ { size_t offset2=3D999999999; - size_t const ml2 =3D searchMax(ms, ip, iend, &offset2); - int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= offset2+1)); /* raw approx */ - int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)offset+1) + 7); + size_t const ml2 =3D ZSTD_searchMax(ms, ip, iend, &off= set2, mls, rowLog, searchMethod, ZSTD_extDict); + int const gain2 =3D (int)(ml2*4 - ZSTD_highbit32((U32)= STORED_TO_OFFBASE(offset2))); /* raw approx */ + int const gain1 =3D (int)(matchLength*4 - ZSTD_highbit= 32((U32)STORED_TO_OFFBASE(offcode)) + 7); if ((ml2 >=3D 4) && (gain2 > gain1)) { - matchLength =3D ml2, offset =3D offset2, start =3D= ip; + matchLength =3D ml2, offcode =3D offset2, start = =3D ip; continue; } } } break; /* nothing found : store previous solution */ } =20 /* catch up */ - if (offset) { - U32 const matchIndex =3D (U32)((start-base) - (offset - ZSTD_R= EP_MOVE)); + if 
(STORED_IS_OFFSET(offcode)) { + U32 const matchIndex =3D (U32)((size_t)(start-base) - STORED_O= FFSET(offcode)); const BYTE* match =3D (matchIndex < dictLimit) ? dictBase + ma= tchIndex : base + matchIndex; const BYTE* const mStart =3D (matchIndex < dictLimit) ? dictSt= art : prefixStart; while ((start>anchor) && (match>mStart) && (start[-1] =3D=3D m= atch[-1])) { start--; match--; matchLength++; } /* catch up */ - offset_2 =3D offset_1; offset_1 =3D (U32)(offset - ZSTD_REP_MO= VE); + offset_2 =3D offset_1; offset_1 =3D (U32)STORED_OFFSET(offcode= ); } =20 /* store sequence */ _storeSequence: - { size_t const litLength =3D start - anchor; - ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, = matchLength-MINMATCH); + { size_t const litLength =3D (size_t)(start - anchor); + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode,= matchLength); anchor =3D ip =3D start + matchLength; } =20 @@ -1359,13 +2023,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( const U32 repIndex =3D repCurrent - offset_2; const BYTE* const repBase =3D repIndex < dictLimit ? dictBase = : base; const BYTE* const repMatch =3D repBase + repIndex; - if (((U32)((dictLimit-1) - repIndex) >=3D 3) & (repIndex > win= dowLow)) /* intentional overflow */ + if ( ((U32)((dictLimit-1) - repIndex) >=3D 3) /* intentional o= verflow : do not test positions overlapping 2 memory segments */ + & (offset_2 <=3D repCurrent - windowLow) ) /* equivalent to= `curr > repIndex >=3D windowLow` */ if (MEM_read32(ip) =3D=3D MEM_read32(repMatch)) { /* repcode detected we should take it */ const BYTE* const repEnd =3D repIndex < dictLimit ? 
dictEn= d : iend; matchLength =3D ZSTD_count_2segments(ip+4, repMatch+4, ien= d, repEnd, prefixStart) + 4; - offset =3D offset_2; offset_2 =3D offset_1; offset_1 =3D (= U32)offset; /* swap offset history */ - ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MI= NMATCH); + offcode =3D offset_2; offset_2 =3D offset_1; offset_1 =3D = (U32)offcode; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, = matchLength); ip +=3D matchLength; anchor =3D ip; continue; /* faster when present ... (?) */ @@ -1412,3 +2077,26 @@ size_t ZSTD_compressBlock_btlazy2_extDict( { return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_binaryTree, 2); } + +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src,= srcSize, search_rowHash, 2); +} diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h index 2fc5a6182134..e5bdf4df8dde 100644 --- a/lib/zstd/compress/zstd_lazy.h +++ b/lib/zstd/compress/zstd_lazy.h @@ -23,6 +23,7 @@ #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 =20 U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); =20 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, c= onst BYTE* const ip); =20 @@ 
-40,6 +41,15 @@ size_t ZSTD_compressBlock_lazy( size_t ZSTD_compressBlock_greedy( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_btlazy2_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -53,6 +63,15 @@ size_t ZSTD_compressBlock_lazy_dictMatchState( size_t ZSTD_compressBlock_greedy_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -63,6 +82,15 @@ size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); =20 size_t ZSTD_compressBlock_greedy_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], @@ -73,9 +101,19 @@ size_t ZSTD_compressBlock_lazy_extDict( size_t ZSTD_compressBlock_lazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); size_t ZSTD_compressBlock_btlazy2_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], void const* src, size_t srcSize); + =20 =20 =20 #endif /* ZSTD_LAZY_H */ diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c index 8ef7e88a5add..dd86fc83e7dd 100644 --- a/lib/zstd/compress/zstd_ldm.c +++ b/lib/zstd/compress/zstd_ldm.c @@ -57,6 +57,33 @@ static void ZSTD_ldm_gear_init(ldmRollingHashState_t* st= ate, ldmParams_t const* } } =20 +/* ZSTD_ldm_gear_reset() + * Feeds [data, data + minMatchLength) into the hash without registering a= ny + * splits. This effectively resets the hash state. This is used when skipp= ing + * over data, either at the beginning of a block, or skipping sections. 
+ */ +static void ZSTD_ldm_gear_reset(ldmRollingHashState_t* state, + BYTE const* data, size_t minMatchLength) +{ + U64 hash =3D state->rolling; + size_t n =3D 0; + +#define GEAR_ITER_ONCE() do { \ + hash =3D (hash << 1) + ZSTD_ldm_gearTab[data[n] & 0xff]; \ + n +=3D 1; \ + } while (0) + while (n + 3 < minMatchLength) { + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + GEAR_ITER_ONCE(); + } + while (n < minMatchLength) { + GEAR_ITER_ONCE(); + } +#undef GEAR_ITER_ONCE +} + /* ZSTD_ldm_gear_feed(): * * Registers in the splits array all the split points found in the first @@ -132,12 +159,12 @@ size_t ZSTD_ldm_getTableSize(ldmParams_t params) size_t const ldmBucketSize =3D ((size_t)1) << (params.hashLog - ldmBuc= ketSizeLog); size_t const totalSize =3D ZSTD_cwksp_alloc_size(ldmBucketSize) + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEn= try_t)); - return params.enableLdm ? totalSize : 0; + return params.enableLdm =3D=3D ZSTD_ps_enable ? totalSize : 0; } =20 size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) { - return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0; + return params.enableLdm =3D=3D ZSTD_ps_enable ? 
(maxChunkSize / params= .minMatchLength) : 0; } =20 /* ZSTD_ldm_getBucket() : @@ -255,7 +282,7 @@ void ZSTD_ldm_fillHashTable( while (ip < iend) { size_t hashed; unsigned n; - =20 + numSplits =3D 0; hashed =3D ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &= numSplits); =20 @@ -327,16 +354,8 @@ static size_t ZSTD_ldm_generateSequences_internal( =20 /* Initialize the rolling hash state with the first minMatchLength byt= es */ ZSTD_ldm_gear_init(&hashState, params); - { - size_t n =3D 0; - - while (n < minMatchLength) { - numSplits =3D 0; - n +=3D ZSTD_ldm_gear_feed(&hashState, ip + n, minMatchLength -= n, - splits, &numSplits); - } - ip +=3D minMatchLength; - } + ZSTD_ldm_gear_reset(&hashState, ip, minMatchLength); + ip +=3D minMatchLength; =20 while (ip < ilimit) { size_t hashed; @@ -361,6 +380,7 @@ static size_t ZSTD_ldm_generateSequences_internal( for (n =3D 0; n < numSplits; n++) { size_t forwardMatchLength =3D 0, backwardMatchLength =3D 0, bestMatchLength =3D 0, mLength; + U32 offset; BYTE const* const split =3D candidates[n].split; U32 const checksum =3D candidates[n].checksum; U32 const hash =3D candidates[n].hash; @@ -428,9 +448,9 @@ static size_t ZSTD_ldm_generateSequences_internal( } =20 /* Match found */ + offset =3D (U32)(split - base) - bestEntry->offset; mLength =3D forwardMatchLength + backwardMatchLength; { - U32 const offset =3D (U32)(split - base) - bestEntry->offs= et; rawSeq* const seq =3D rawSeqStore->seq + rawSeqStore->size; =20 /* Out of sequence storage */ @@ -447,6 +467,21 @@ static size_t ZSTD_ldm_generateSequences_internal( ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); =20 anchor =3D split + forwardMatchLength; + + /* If we find a match that ends after the data that we've hash= ed + * then we have a repeating, overlapping, pattern. E.g. all ze= ros. + * If one repetition of the pattern matches our `stopMask` the= n all + * repetitions will. We don't need to insert them all into out= table, + * only the first one. 
So skip over overlapping matches. + * This is a major speed boost (20x) for compressing a single = byte + * repeated, when that byte ends up in the table. + */ + if (anchor > ip + hashed) { + ZSTD_ldm_gear_reset(&hashState, anchor - minMatchLength, m= inMatchLength); + /* Continue the outer loop at anchor (ip + hashed =3D=3D a= nchor). */ + ip =3D anchor - hashed; + break; + } } =20 ip +=3D hashed; @@ -500,7 +535,7 @@ size_t ZSTD_ldm_generateSequences( =20 assert(chunkStart < iend); /* 1. Perform overflow correction if necessary. */ - if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)= ) { + if (ZSTD_window_needOverflowCorrection(ldmState->window, 0, maxDis= t, ldmState->loadedDictEnd, chunkStart, chunkEnd)) { U32 const ldmHSize =3D 1U << params->hashLog; U32 const correction =3D ZSTD_window_correctOverflow( &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); @@ -544,7 +579,9 @@ size_t ZSTD_ldm_generateSequences( return 0; } =20 -void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U3= 2 const minMatch) { +void +ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 con= st minMatch) +{ while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { rawSeq* seq =3D rawSeqStore->seq + rawSeqStore->pos; if (srcSize <=3D seq->litLength) { @@ -622,12 +659,13 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* raw= SeqStore, size_t nbBytes) { =20 size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; unsigned const minMatch =3D cParams->minMatch; ZSTD_blockCompressor const blockCompressor =3D - ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dict= Mode(ms)); + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, Z= STD_matchState_dictMode(ms)); /* Input bounds */ BYTE const* const 
istart =3D (BYTE const*)src; BYTE const* const iend =3D istart + srcSize; @@ -673,8 +711,8 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStor= e, rep[0] =3D sequence.offset; /* Store the sequence */ ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, - sequence.offset + ZSTD_REP_MOVE, - sequence.matchLength - MINMATCH); + STORE_OFFSET(sequence.offset), + sequence.matchLength); ip +=3D sequence.matchLength; } } diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h index 25b25270b72e..fbc6a5e88fd7 100644 --- a/lib/zstd/compress/zstd_ldm.h +++ b/lib/zstd/compress/zstd_ldm.h @@ -63,6 +63,7 @@ size_t ZSTD_ldm_generateSequences( */ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_= NUM], + ZSTD_paramSwitch_e useRowMatchFinder, void const* src, size_t srcSize); =20 /* diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_= ldm_geartab.h index e5c24d856b0a..647f865be290 100644 --- a/lib/zstd/compress/zstd_ldm_geartab.h +++ b/lib/zstd/compress/zstd_ldm_geartab.h @@ -11,7 +11,10 @@ #ifndef ZSTD_LDM_GEARTAB_H #define ZSTD_LDM_GEARTAB_H =20 -static U64 ZSTD_ldm_gearTab[256] =3D { +#include "../common/compiler.h" /* UNUSED_ATTR */ +#include "../common/mem.h" /* U64 */ + +static UNUSED_ATTR const U64 ZSTD_ldm_gearTab[256] =3D { 0xf5b8f72c5f77775c, 0x84935f266b7ac412, 0xb647ada9ca730ccc, 0xb065bb4b114fb1de, 0x34584e7e8c3a9fd0, 0x4e97e17c6ae26b05, 0x3a03d743bc99a604, 0xcecd042422c4044f, 0x76de76c58524259e, diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c index dfc55e3e8119..fd82acfda62f 100644 --- a/lib/zstd/compress/zstd_opt.c +++ b/lib/zstd/compress/zstd_opt.c @@ -8,25 +8,12 @@ * You may select, at your option, one of the above-listed licenses. */ =20 -/* - * Disable inlining for the optimal parser for the kernel build. 
- * It is unlikely to be used in the kernel, and where it is used - * latency shouldn't matter because it is very slow to begin with. - * We prefer a ~180KB binary size win over faster optimal parsing. - * - * TODO(https://github.com/facebook/zstd/issues/2862): - * Improve the code size of the optimal parser in general, so we - * don't need this hack for the kernel build. - */ -#define ZSTD_NO_INLINE 1 - #include "zstd_compress_internal.h" #include "hist.h" #include "zstd_opt.h" =20 =20 #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that fre= quencies adapt faster to new stats */ -#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to= init next stats */ #define ZSTD_MAX_PRICE (1<<30) =20 #define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD= , symbols' cost is assumed static, directly determined by pre-defined distr= ibutions */ @@ -36,11 +23,11 @@ * Price functions for optimal parser ***************************************/ =20 -#if 0 /* approximation at bit level */ +#if 0 /* approximation at bit level (for tests) */ # define BITCOST_ACCURACY 0 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) -# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) -#elif 0 /* fractional bit accuracy */ +# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy (for tests) */ # define BITCOST_ACCURACY 8 # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) # define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) @@ -78,7 +65,7 @@ MEM_STATIC double ZSTD_fCost(U32 price) =20 static int ZSTD_compressedLiterals(optState_t const* const optPtr) { - return optPtr->literalCompressionMode !=3D ZSTD_lcm_uncompressed; + return optPtr->literalCompressionMode !=3D ZSTD_ps_disable; } =20 static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) @@ -91,25 +78,46 @@ static void ZSTD_setBasePrices(optState_t* optPtr, int = optLevel) } =20 =20 -/* ZSTD_downscaleStat() : - * reduce all elements in table 
by a factor 2^(ZSTD_FREQ_DIV+malus) - * return the resulting sum of elements */ -static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +static U32 sum_u32(const unsigned table[], size_t nbElts) +{ + size_t n; + U32 total =3D 0; + for (n=3D0; n 0 && ZSTD_FREQ_DIV+malus < 31); + DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=3D%u, shift=3D%u)", (unsigned= )lastEltIndex+1, (unsigned)shift); + assert(shift < 30); for (s=3D0; s> (ZSTD_FREQ_DIV+malus)); + table[s] =3D 1 + (table[s] >> shift); sum +=3D table[s]; } return sum; } =20 +/* ZSTD_scaleStats() : + * reduce all elements in table is sum too large + * return the resulting sum of elements */ +static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarge= t) +{ + U32 const prevsum =3D sum_u32(table, lastEltIndex+1); + U32 const factor =3D prevsum >> logTarget; + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=3D%u, target=3D%u)", (unsigned)la= stEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <=3D 1) return prevsum; + return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)= ); +} + /* ZSTD_rescaleFreqs() : * if first block (detected by optPtr->litLengthSum =3D=3D 0) : init stati= stics * take hints from dictionary if there is one - * or init from zero, using src for literals stats, or flat 1 for match= symbols + * and init from zero if there is none, + * using src for literals stats, and baseline stats for sequence symbols * otherwise downscale existing stats, to be used as seed for next block. */ static void @@ -138,7 +146,7 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, optPtr->litSum =3D 0; for (lit=3D0; lit<=3DMaxLit; lit++) { U32 const scaleLog =3D 11; /* scale to 2K */ - U32 const bitCost =3D HUF_getNbBits(optPtr->symbolCost= s->huf.CTable, lit); + U32 const bitCost =3D HUF_getNbBitsFromCTable(optPtr->= symbolCosts->huf.CTable, lit); assert(bitCost <=3D scaleLog); optPtr->litFreq[lit] =3D bitCost ? 
1 << (scaleLog-bitC= ost) : 1 /*minimum to calculate cost*/; optPtr->litSum +=3D optPtr->litFreq[lit]; @@ -186,14 +194,19 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, if (compressedLiterals) { unsigned lit =3D MaxLit; HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); = /* use raw first block to init statistics */ - optPtr->litSum =3D ZSTD_downscaleStat(optPtr->litFreq, Max= Lit, 1); + optPtr->litSum =3D ZSTD_downscaleStats(optPtr->litFreq, Ma= xLit, 8); } =20 - { unsigned ll; - for (ll=3D0; ll<=3DMaxLL; ll++) - optPtr->litLengthFreq[ll] =3D 1; + { unsigned const baseLLfreqs[MaxLL+1] =3D { + 4, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->litLengthFreq, baseLLfreqs, sizeof(bas= eLLfreqs)); + optPtr->litLengthSum =3D sum_u32(baseLLfreqs, MaxLL+1); } - optPtr->litLengthSum =3D MaxLL+1; =20 { unsigned ml; for (ml=3D0; ml<=3DMaxML; ml++) @@ -201,21 +214,26 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, } optPtr->matchLengthSum =3D MaxML+1; =20 - { unsigned of; - for (of=3D0; of<=3DMaxOff; of++) - optPtr->offCodeFreq[of] =3D 1; + { unsigned const baseOFCfreqs[MaxOff+1] =3D { + 6, 2, 1, 1, 2, 3, 4, 4, + 4, 3, 2, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1 + }; + ZSTD_memcpy(optPtr->offCodeFreq, baseOFCfreqs, sizeof(base= OFCfreqs)); + optPtr->offCodeSum =3D sum_u32(baseOFCfreqs, MaxOff+1); } - optPtr->offCodeSum =3D MaxOff+1; + =20 } =20 } else { /* new block : re-use previous statistics, scaled down */ =20 if (compressedLiterals) - optPtr->litSum =3D ZSTD_downscaleStat(optPtr->litFreq, MaxLit,= 1); - optPtr->litLengthSum =3D ZSTD_downscaleStat(optPtr->litLengthFreq,= MaxLL, 0); - optPtr->matchLengthSum =3D ZSTD_downscaleStat(optPtr->matchLengthF= req, MaxML, 0); - optPtr->offCodeSum =3D ZSTD_downscaleStat(optPtr->offCodeFreq, Max= Off, 0); + optPtr->litSum =3D ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12= ); + optPtr->litLengthSum =3D 
ZSTD_scaleStats(optPtr->litLengthFreq, Ma= xLL, 11); + optPtr->matchLengthSum =3D ZSTD_scaleStats(optPtr->matchLengthFreq= , MaxML, 11); + optPtr->offCodeSum =3D ZSTD_scaleStats(optPtr->offCodeFreq, MaxOff= , 11); } =20 ZSTD_setBasePrices(optPtr, optLevel); @@ -251,7 +269,16 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const lite= rals, U32 const litLength, * cost of literalLength symbol */ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* cons= t optPtr, int optLevel) { - if (optPtr->priceType =3D=3D zop_predef) return WEIGHT(litLength, optL= evel); + assert(litLength <=3D ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType =3D=3D zop_predef) + return WEIGHT(litLength, optLevel); + /* We can't compute the litLength price for sizes >=3D ZSTD_BLOCKSIZE_= MAX + * because it isn't representable in the zstd format. So instead just + * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the bl= ock + * would be all literals. + */ + if (litLength =3D=3D ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX= - 1, optPtr, optLevel); =20 /* dynamic statistics */ { U32 const llCode =3D ZSTD_LLcode(litLength); @@ -264,15 +291,17 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, c= onst optState_t* const optP /* ZSTD_getMatchPrice() : * Provides the cost of the match part (offset + matchLength) of a sequence * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a= sequence. 
- * optLevel: when <2, favors small offset for decompression speed (improve= d cache efficiency) */ + * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are rea= l_offsets+2 + * @optLevel: when <2, favors small offset for decompression speed (improv= ed cache efficiency) + */ FORCE_INLINE_TEMPLATE U32 -ZSTD_getMatchPrice(U32 const offset, +ZSTD_getMatchPrice(U32 const offcode, U32 const matchLength, const optState_t* const optPtr, int const optLevel) { U32 price; - U32 const offCode =3D ZSTD_highbit32(offset+1); + U32 const offCode =3D ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); U32 const mlBase =3D matchLength - MINMATCH; assert(matchLength >=3D MINMATCH); =20 @@ -315,8 +344,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, optPtr->litLengthSum++; } =20 - /* match offset code (0-2=3D>repCode; 3+=3D>offset+2) */ - { U32 const offCode =3D ZSTD_highbit32(offsetCode+1); + /* offset code : expected to follow storeSeq() numeric representation = */ + { U32 const offCode =3D ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)= ); assert(offCode <=3D MaxOff); optPtr->offCodeFreq[offCode]++; optPtr->offCodeSum++; @@ -350,7 +379,7 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U3= 2 length) =20 /* Update hashTable3 up to ip (excluded) Assumption : always within prefix (i.e. not within extDict) */ -static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, +static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, U32* nextToUpdate3, const BYTE* const ip) { @@ -376,11 +405,13 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_ma= tchState_t* ms, * Binary Tree search ***************************************/ /* ZSTD_insertBt1() : add one or multiple positions to tree. - * ip : assumed <=3D iend-8 . + * @param ip assumed <=3D iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling= to this position * @return : nb of positions added */ static U32 ZSTD_insertBt1( - ZSTD_matchState_t* ms, + const ZSTD_matchState_t* ms, const BYTE* const ip, const BYTE* const iend, + U32 const target, U32 const mls, const int extDict) { const ZSTD_compressionParameters* const cParams =3D &ms->cParams; @@ -403,7 +434,10 @@ static U32 ZSTD_insertBt1( U32* smallerPtr =3D bt + 2*(curr&btMask); U32* largerPtr =3D smallerPtr + 1; U32 dummy32; /* to be nullified at the end */ - U32 const windowLow =3D ms->window.lowLimit; + /* windowLow is based on target because + * we only need positions that will be in the window at the end of the= tree update. + */ + U32 const windowLow =3D ZSTD_getLowestMatchIndex(ms, target, cParams->= windowLog); U32 matchEndIdx =3D curr+8+1; size_t bestLength =3D 8; U32 nbCompares =3D 1U << cParams->searchLog; @@ -416,6 +450,7 @@ static U32 ZSTD_insertBt1( =20 DEBUGLOG(8, "ZSTD_insertBt1 (%u)", curr); =20 + assert(curr <=3D target); assert(ip <=3D iend-8); /* required for h calculation */ hashTable[h] =3D curr; /* Update Hash Table */ =20 @@ -504,7 +539,7 @@ void ZSTD_updateTree_internal( idx, target, dictMode); =20 while(idx < target) { - U32 const forward =3D ZSTD_insertBt1(ms, base+idx, iend, mls, dict= Mode =3D=3D ZSTD_extDict); + U32 const forward =3D ZSTD_insertBt1(ms, base+idx, iend, target, m= ls, dictMode =3D=3D ZSTD_extDict); assert(idx < (U32)(idx + forward)); idx +=3D forward; } @@ -609,7 +644,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of lengt= h %u", repCode, ll0, repOffset, repLen); bestLength =3D repLen; - matches[mnum].off =3D repCode - ll0; + matches[mnum].off =3D STORE_REPCODE(repCode - ll0 + 1); /= * expect value between 1 and 3 */ matches[mnum].len =3D (U32)repLen; mnum++; if ( (repLen > sufficient_len) @@ -638,7 +673,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( bestLength =3D mlen; assert(curr > 
matchIndex3); assert(mnum=3D=3D0); /* no prior solution */ - matches[0].off =3D (curr - matchIndex3) + ZSTD_REP_MOVE; + matches[0].off =3D STORE_OFFSET(curr - matchIndex3); matches[0].len =3D (U32)mlen; mnum =3D 1; if ( (mlen > sufficient_len) | @@ -647,7 +682,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( return 1; } } } /* no dictMatchState lookup: dicts don't have a populated HC3 tabl= e */ - } + } /* if (mls =3D=3D 3) */ =20 hashTable[h] =3D curr; /* Update Hash Table */ =20 @@ -672,20 +707,19 @@ U32 ZSTD_insertBtAndGetAllMatches ( =20 if (matchLength > bestLength) { DEBUGLOG(8, "found match of length %u at distance %u (offCode= =3D%u)", - (U32)matchLength, curr - matchIndex, curr - matchIndex= + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr= - matchIndex)); assert(matchEndIdx > matchIndex); if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + (U32)matchLength; bestLength =3D matchLength; - matches[mnum].off =3D (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off =3D STORE_OFFSET(curr - matchIndex); matches[mnum].len =3D (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength =3D=3D iLimit) /* equal : no way to know = if inf or sup */) { if (dictMode =3D=3D ZSTD_dictMatchState) nbCompares =3D 0;= /* break should also skip searching dms */ break; /* drop, to preserve bt consistency (miss a little = bit of compression) */ - } - } + } } =20 if (match[matchLength] < ip[matchLength]) { /* match smaller than current */ @@ -721,18 +755,17 @@ U32 ZSTD_insertBtAndGetAllMatches ( if (matchLength > bestLength) { matchIndex =3D dictMatchIndex + dmsIndexDelta; DEBUGLOG(8, "found dms match of length %u at distance %u (= offCode=3D%u)", - (U32)matchLength, curr - matchIndex, curr - matchI= ndex + ZSTD_REP_MOVE); + (U32)matchLength, curr - matchIndex, STORE_OFFSET(= curr - matchIndex)); if (matchLength > matchEndIdx - matchIndex) matchEndIdx =3D matchIndex + (U32)matchLength; bestLength =3D matchLength; - 
matches[mnum].off =3D (curr - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].off =3D STORE_OFFSET(curr - matchIndex); matches[mnum].len =3D (U32)matchLength; mnum++; if ( (matchLength > ZSTD_OPT_NUM) | (ip+matchLength =3D=3D iLimit) /* equal : no way to k= now if inf or sup */) { break; /* drop, to guarantee consistency (miss a lit= tle bit of compression) */ - } - } + } } =20 if (dictMatchIndex <=3D dmsBtLow) { break; } /* beyond tree = size, stop the search */ if (match[matchLength] < ip[matchLength]) { @@ -742,39 +775,91 @@ U32 ZSTD_insertBtAndGetAllMatches ( /* match is larger than current */ commonLengthLarger =3D matchLength; dictMatchIndex =3D nextPtr[0]; - } - } - } + } } } /* if (dictMode =3D=3D ZSTD_dictMatchState) */ =20 assert(matchEndIdx > curr+8); ms->nextToUpdate =3D matchEndIdx - 8; /* skip repetitive patterns */ return mnum; } =20 - -FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( - ZSTD_match_t* matches, /* store result (match fo= und, increasing size) in this table */ - ZSTD_matchState_t* ms, - U32* nextToUpdate3, - const BYTE* ip, const BYTE* const iHighLimit, cons= t ZSTD_dictMode_e dictMode, - const U32 rep[ZSTD_REP_NUM], - U32 const ll0, - U32 const lengthToBeat) +typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, + ZSTD_matchState_t*, + U32*, + const BYTE*, + const BYTE*, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat); + +FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat, + const ZSTD_dictMode_e dictMode, + const U32 mls) { - const ZSTD_compressionParameters* const cParams =3D &ms->cParams; - U32 const matchLengthSearch =3D cParams->minMatch; - DEBUGLOG(8, "ZSTD_BtGetAllMatches"); - if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped ar= ea */ - ZSTD_updateTree_internal(ms, ip, iHighLimit, 
matchLengthSearch, dictMo= de); - switch(matchLengthSearch) - { - case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); - default : - case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); - case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); - case 7 : - case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdat= e3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); + assert(BOUNDED(3, ms->cParams.minMatch, 6) =3D=3D mls); + DEBUGLOG(8, "ZSTD_BtGetAllMatches(dictMode=3D%d, mls=3D%u)", (int)dict= Mode, mls); + if (ip < ms->window.base + ms->nextToUpdate) + return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, mls, dictMode); + return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, i= HighLimit, dictMode, rep, ll0, lengthToBeat, mls); +} + +#define ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls) ZSTD_btGetAllMatches_##d= ictMode##_##mls + +#define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ + ZSTD_matchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ + const U32 rep[ZSTD_REP_NUM], \ + U32 const ll0, \ + U32 const lengthToBeat) \ + { \ + return ZSTD_btGetAllMatches_internal( \ + matches, ms, nextToUpdate3, ip, iHighLimit, \ + rep, ll0, lengthToBeat, ZSTD_##dictMode, mls); \ + } + +#define GEN_ZSTD_BT_GET_ALL_MATCHES(dictMode) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 3) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 4) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 5) \ + GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, 6) + +GEN_ZSTD_BT_GET_ALL_MATCHES(noDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(extDict) +GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + +#define 
ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMode) \ + { \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 3), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 4), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 5), \ + ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, 6) \ } + +static ZSTD_getAllMatchesFn +ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e co= nst dictMode) +{ + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] =3D { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(extDict), + ZSTD_BT_GET_ALL_MATCHES_ARRAY(dictMatchState) + }; + U32 const mls =3D BOUNDED(3, ms->cParams.minMatch, 6); + assert((U32)dictMode < 3); + assert(mls - 3 < 4); + return getAllMatchesFns[(int)dictMode][mls - 3]; } =20 /* *********************** @@ -783,16 +868,18 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( =20 /* Struct containing info needed to make decision about ldm inclusion */ typedef struct { - rawSeqStore_t seqStore; /* External match candidates store for= this block */ - U32 startPosInBlock; /* Start position of the current match= candidate */ - U32 endPosInBlock; /* End position of the current match c= andidate */ - U32 offset; /* Offset of the match candidate */ + rawSeqStore_t seqStore; /* External match candidates store for this = block */ + U32 startPosInBlock; /* Start position of the current match candi= date */ + U32 endPosInBlock; /* End position of the current match candida= te */ + U32 offset; /* Offset of the match candidate */ } ZSTD_optLdm_t; =20 /* ZSTD_optLdm_skipRawSeqStoreBytes(): - * Moves forward in rawSeqStore by nbBytes, which will update the fields '= pos' and 'posInSequence'. + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. 
*/ -static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, s= ize_t nbBytes) { +static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, s= ize_t nbBytes) +{ U32 currPos =3D (U32)(rawSeqStore->posInSequence + nbBytes); while (currPos && rawSeqStore->pos < rawSeqStore->size) { rawSeq currSeq =3D rawSeqStore->seq[rawSeqStore->pos]; @@ -813,8 +900,10 @@ static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqSto= re_t* rawSeqStore, size_t * Calculates the beginning and end of the next match in the current block. * Updates 'pos' and 'posInSequence' of the ldmSeqStore. */ -static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, = U32 currPosInBlock, - U32 blockBytesRemaining= ) { +static void +ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosI= nBlock, + U32 blockBytesRemaining) +{ rawSeq currSeq; U32 currBlockEndPos; U32 literalsBytesRemaining; @@ -826,8 +915,8 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD= _optLdm_t* optLdm, U32 cu optLdm->endPosInBlock =3D UINT_MAX; return; } - /* Calculate appropriate bytes left in matchLength and litLength after= adjusting - based on ldmSeqStore->posInSequence */ + /* Calculate appropriate bytes left in matchLength and litLength + * after adjusting based on ldmSeqStore->posInSequence */ currSeq =3D optLdm->seqStore.seq[optLdm->seqStore.pos]; assert(optLdm->seqStore.posInSequence <=3D currSeq.litLength + currSeq= .matchLength); currBlockEndPos =3D currPosInBlock + blockBytesRemaining; @@ -863,15 +952,16 @@ static void ZSTD_opt_getNextMatchAndUpdateSeqStore(ZS= TD_optLdm_t* optLdm, U32 cu } =20 /* ZSTD_optLdm_maybeAddMatch(): - * Adds a match if it's long enough, based on it's 'matchStartPosInBlock' - * and 'matchEndPosInBlock', into 'matches'. Maintains the correct orderin= g of 'matches' + * Adds a match if it's long enough, + * based on it's 'matchStartPosInBlock' and 'matchEndPosInBlock', + * into 'matches'. 
Maintains the correct ordering of 'matches'. */ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatche= s, - ZSTD_optLdm_t* optLdm, U32 currPosIn= Block) { - U32 posDiff =3D currPosInBlock - optLdm->startPosInBlock; + const ZSTD_optLdm_t* optLdm, U32 cur= rPosInBlock) +{ + U32 const posDiff =3D currPosInBlock - optLdm->startPosInBlock; /* Note: ZSTD_match_t actually contains offCode and matchLength (befor= e subtracting MINMATCH) */ - U32 candidateMatchLength =3D optLdm->endPosInBlock - optLdm->startPosI= nBlock - posDiff; - U32 candidateOffCode =3D optLdm->offset + ZSTD_REP_MOVE; + U32 const candidateMatchLength =3D optLdm->endPosInBlock - optLdm->sta= rtPosInBlock - posDiff; =20 /* Ensure that current block position is not outside of the match */ if (currPosInBlock < optLdm->startPosInBlock @@ -881,6 +971,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* mat= ches, U32* nbMatches, } =20 if (*nbMatches =3D=3D 0 || ((candidateMatchLength > matches[*nbMatches= -1].len) && *nbMatches < ZSTD_OPT_NUM)) { + U32 const candidateOffCode =3D STORE_OFFSET(optLdm->offset); DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate mat= ch (offCode: %u matchLength %u) at block position=3D%u", candidateOffCode, candidateMatchLength, currPosInBlock); matches[*nbMatches].len =3D candidateMatchLength; @@ -892,8 +983,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* ma= tches, U32* nbMatches, /* ZSTD_optLdm_processMatchCandidate(): * Wrapper function to update ldm seq store and call ldm functions as nece= ssary. 
*/ -static void ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, ZSTD_= match_t* matches, U32* nbMatches, - U32 currPosInBlock, U32 rema= iningBytes) { +static void +ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, + U32 currPosInBlock, U32 remainingBytes) +{ if (optLdm->seqStore.size =3D=3D 0 || optLdm->seqStore.pos >=3D optLdm= ->seqStore.size) { return; } @@ -904,19 +998,19 @@ static void ZSTD_optLdm_processMatchCandidate(ZSTD_op= tLdm_t* optLdm, ZSTD_match_ * at the end of a match from the ldm seq store, and will ofte= n be some bytes * over beyond matchEndPosInBlock. As such, we need to correct= for these "overshoots" */ - U32 posOvershoot =3D currPosInBlock - optLdm->endPosInBlock; + U32 const posOvershoot =3D currPosInBlock - optLdm->endPosInBl= ock; ZSTD_optLdm_skipRawSeqStoreBytes(&optLdm->seqStore, posOversho= ot); - }=20 + } ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, rem= ainingBytes); } ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); } =20 + /*-******************************* * Optimal parser *********************************/ =20 - static U32 ZSTD_totalLen(ZSTD_optimal_t sol) { return sol.litlen + sol.mlen; @@ -957,6 +1051,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, const BYTE* const prefixStart =3D base + ms->window.dictLimit; const ZSTD_compressionParameters* const cParams =3D &ms->cParams; =20 + ZSTD_getAllMatchesFn getAllMatches =3D ZSTD_selectBtGetAllMatches(ms, = dictMode); + U32 const sufficient_len =3D MIN(cParams->targetLength, ZSTD_OPT_NUM -= 1); U32 const minMatch =3D (cParams->minMatch =3D=3D 3) ? 
3 : 4; U32 nextToUpdate3 =3D ms->nextToUpdate; @@ -984,7 +1080,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* find first match */ { U32 const litlen =3D (U32)(ip - anchor); U32 const ll0 =3D !litlen; - U32 nbMatches =3D ZSTD_BtGetAllMatches(matches, ms, &nextToUpd= ate3, ip, iend, dictMode, rep, ll0, minMatch); + U32 nbMatches =3D getAllMatches(matches, ms, &nextToUpdate3, i= p, iend, rep, ll0, minMatch); ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, (U32)(ip-istart), (U32)(iend= - ip)); if (!nbMatches) { ip++; continue; } @@ -998,18 +1094,18 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, * in every price. We include the literal length to avoid nega= tive * prices when we subtract the previous literal length. */ - opt[0].price =3D ZSTD_litLengthPrice(litlen, optStatePtr, optL= evel); + opt[0].price =3D (int)ZSTD_litLengthPrice(litlen, optStatePtr,= optLevel); =20 /* large match -> immediate encoding */ { U32 const maxML =3D matches[nbMatches-1].len; - U32 const maxOffset =3D matches[nbMatches-1].off; + U32 const maxOffcode =3D matches[nbMatches-1].off; DEBUGLOG(6, "found %u matches of maxLength=3D%u and maxOff= Code=3D%u at cPos=3D%u =3D> start new series", - nbMatches, maxML, maxOffset, (U32)(ip-prefixSt= art)); + nbMatches, maxML, maxOffcode, (U32)(ip-prefixS= tart)); =20 if (maxML > sufficient_len) { lastSequence.litlen =3D litlen; lastSequence.mlen =3D maxML; - lastSequence.off =3D maxOffset; + lastSequence.off =3D maxOffcode; DEBUGLOG(6, "large match (%u>%u), immediate encoding", maxML, sufficient_len); cur =3D 0; @@ -1018,24 +1114,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, } } =20 /* set prices for first matches starting position =3D=3D 0 */ - { U32 const literalsPrice =3D opt[0].price + ZSTD_litLengthP= rice(0, optStatePtr, optLevel); + assert(opt[0].price >=3D 0); + { U32 const literalsPrice =3D (U32)opt[0].price + ZSTD_litLe= ngthPrice(0, optStatePtr, optLevel); U32 pos; U32 matchNb; for (pos 
=3D 1; pos < minMatch; pos++) { opt[pos].price =3D ZSTD_MAX_PRICE; /* mlen, litlen a= nd price will be fixed during forward scanning */ } for (matchNb =3D 0; matchNb < nbMatches; matchNb++) { - U32 const offset =3D matches[matchNb].off; + U32 const offcode =3D matches[matchNb].off; U32 const end =3D matches[matchNb].len; for ( ; pos <=3D end ; pos++ ) { - U32 const matchPrice =3D ZSTD_getMatchPrice(offset= , pos, optStatePtr, optLevel); + U32 const matchPrice =3D ZSTD_getMatchPrice(offcod= e, pos, optStatePtr, optLevel); U32 const sequencePrice =3D literalsPrice + matchP= rice; DEBUGLOG(7, "rPos:%u =3D> set initial price : %.2f= ", pos, ZSTD_fCost(sequencePrice)); opt[pos].mlen =3D pos; - opt[pos].off =3D offset; + opt[pos].off =3D offcode; opt[pos].litlen =3D litlen; - opt[pos].price =3D sequencePrice; + opt[pos].price =3D (int)sequencePrice; } } last_pos =3D pos-1; } @@ -1050,9 +1147,9 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, /* Fix current position with one literal if cheaper */ { U32 const litlen =3D (opt[cur-1].mlen =3D=3D 0) ? 
opt[cur-= 1].litlen + 1 : 1; int const price =3D opt[cur-1].price - + ZSTD_rawLiteralsCost(ip+cur-1, 1, optSta= tePtr, optLevel) - + ZSTD_litLengthPrice(litlen, optStatePtr,= optLevel) - - ZSTD_litLengthPrice(litlen-1, optStatePt= r, optLevel); + + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, o= ptStatePtr, optLevel) + + (int)ZSTD_litLengthPrice(litlen, optStat= ePtr, optLevel) + - (int)ZSTD_litLengthPrice(litlen-1, optSt= atePtr, optLevel); assert(price < 1000000000); /* overflow check */ if (price <=3D opt[cur].price) { DEBUGLOG(7, "cPos:%zi=3D=3DrPos:%u : better price (%.2= f<=3D%.2f) using literal (ll=3D=3D%u) (hist:%u,%u,%u)", @@ -1078,7 +1175,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, assert(cur >=3D opt[cur].mlen); if (opt[cur].mlen !=3D 0) { U32 const prev =3D cur - opt[cur].mlen; - repcodes_t newReps =3D ZSTD_updateRep(opt[prev].rep, opt[c= ur].off, opt[cur].litlen=3D=3D0); + repcodes_t const newReps =3D ZSTD_newRep(opt[prev].rep, op= t[cur].off, opt[cur].litlen=3D=3D0); ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); } else { ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcode= s_t)); @@ -1095,11 +1192,12 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, continue; /* skip unpromising positions; about ~+6% speed= , -0.01 ratio */ } =20 + assert(opt[cur].price >=3D 0); { U32 const ll0 =3D (opt[cur].mlen !=3D 0); U32 const litlen =3D (opt[cur].mlen =3D=3D 0) ? 
opt[cur].l= itlen : 0; - U32 const previousPrice =3D opt[cur].price; + U32 const previousPrice =3D (U32)opt[cur].price; U32 const basePrice =3D previousPrice + ZSTD_litLengthPric= e(0, optStatePtr, optLevel); - U32 nbMatches =3D ZSTD_BtGetAllMatches(matches, ms, &nextT= oUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + U32 nbMatches =3D getAllMatches(matches, ms, &nextToUpdate= 3, inr, iend, opt[cur].rep, ll0, minMatch); U32 matchNb; =20 ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMat= ches, @@ -1137,7 +1235,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, =20 for (mlen =3D lastML; mlen >=3D startML; mlen--) { /*= scan downward */ U32 const pos =3D cur + mlen; - int const price =3D basePrice + ZSTD_getMatchPrice= (offset, mlen, optStatePtr, optLevel); + int const price =3D (int)basePrice + (int)ZSTD_get= MatchPrice(offset, mlen, optStatePtr, optLevel); =20 if ((pos > last_pos) || (price < opt[pos].price)) { DEBUGLOG(7, "rPos:%u (ml=3D%2u) =3D> new bette= r price (%.2f<%.2f)", @@ -1167,7 +1265,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, * update them while traversing the sequences. 
*/ if (lastSequence.mlen !=3D 0) { - repcodes_t reps =3D ZSTD_updateRep(opt[cur].rep, lastSequence.= off, lastSequence.litlen=3D=3D0); + repcodes_t const reps =3D ZSTD_newRep(opt[cur].rep, lastSequen= ce.off, lastSequence.litlen=3D=3D0); ZSTD_memcpy(rep, &reps, sizeof(reps)); } else { ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); @@ -1211,7 +1309,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, =20 assert(anchor + llen <=3D iend); ZSTD_updateStats(optStatePtr, llen, anchor, offCode, m= len); - ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, m= len-MINMATCH); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, m= len); anchor +=3D advance; ip =3D anchor; } } @@ -1223,38 +1321,30 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* m= s, return (size_t)(iend - anchor); } =20 +static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /* optLevel */, dictMode); +} + +static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /* optLevel */, dictMode); +} =20 size_t ZSTD_compressBlock_btopt( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btopt"); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 =20 -/* used in 2-pass strategy */ -static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) -{ - U32 s, sum=3D0; - assert(ZSTD_FREQ_DIV+bonus >=3D 0); - for (s=3D0; slitSum =3D ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); - 
optPtr->litLengthSum =3D ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL= , 0); - optPtr->matchLengthSum =3D ZSTD_upscaleStat(optPtr->matchLengthFreq, M= axML, 0); - optPtr->offCodeSum =3D ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0= ); -} =20 /* ZSTD_initStats_ultra(): * make a first compression pass, just to seed stats with more accurate st= arting values. @@ -1276,7 +1366,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, assert(ms->window.dictLimit =3D=3D ms->window.lowLimit); /* no dicti= onary */ assert(ms->window.dictLimit - ms->nextToUpdate <=3D 1); /* no prefix = (note: intentional overflow, defined as 2-complement) */ =20 - ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /= *optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDic= t); /* generate stats into ms->opt*/ =20 /* invalidate first scan from history */ ZSTD_resetSeqStore(seqStore); @@ -1285,8 +1375,6 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ms->window.lowLimit =3D ms->window.dictLimit; ms->nextToUpdate =3D ms->window.dictLimit; =20 - /* re-inforce weight of collected statistics */ - ZSTD_upscaleStats(&ms->opt); } =20 size_t ZSTD_compressBlock_btultra( @@ -1294,7 +1382,7 @@ size_t ZSTD_compressBlock_btultra( const void* src, size_t srcSize) { DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=3D%zu)", srcSize); - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 size_t ZSTD_compressBlock_btultra2( @@ -1322,35 +1410,35 @@ size_t ZSTD_compressBlock_btultra2( ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); } =20 - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_noDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_n= oDict); } =20 size_t ZSTD_compressBlock_btopt_dictMatchState( 
ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_d= ictMatchState); } =20 size_t ZSTD_compressBlock_btultra_dictMatchState( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_dictMatchState); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_d= ictMatchState); } =20 size_t ZSTD_compressBlock_btopt_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 0 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_e= xtDict); } =20 size_t ZSTD_compressBlock_btultra_extDict( ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], const void* src, size_t srcSize) { - return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize,= 2 /*optLevel*/, ZSTD_extDict); + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_e= xtDict); } =20 /* note : no btultra2 variant for extDict nor dictMatchState, diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf= _decompress.c index 5105e59ac04a..89b269a641c7 100644 --- a/lib/zstd/decompress/huf_decompress.c +++ b/lib/zstd/decompress/huf_decompress.c @@ -22,6 +22,13 @@ #define HUF_STATIC_LINKING_ONLY #include "../common/huf.h" #include "../common/error_private.h" +#include "../common/zstd_internal.h" + +/* ************************************************************** +* Constants +****************************************************************/ + +#define HUF_DECODER_FAST_TABLELOG 11 =20 /* 
************************************************************** * Macros @@ -36,6 +43,26 @@ #error "Cannot force the use of the X1 and X2 decoders at the same time!" #endif =20 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE +#else +# define HUF_ASM_X86_64_BMI2_ATTRS +#endif + +#define HUF_EXTERN_C +#define HUF_ASM_DECL HUF_EXTERN_C + +#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_BMI2_FUNCTION 1 +#else +# define HUF_NEED_BMI2_FUNCTION 0 +#endif + +#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +# define HUF_NEED_DEFAULT_FUNCTION 1 +#else +# define HUF_NEED_DEFAULT_FUNCTION 0 +#endif =20 /* ************************************************************** * Error Management @@ -65,7 +92,7 @@ return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); = \ } = \ = \ - static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( = \ + static BMI2_TARGET_ATTRIBUTE size_t fn##_bmi2( = \ void* dst, size_t dstSize, = \ const void* cSrc, size_t cSrcSize, = \ const HUF_DTable* DTable) = \ @@ -107,13 +134,147 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable= * table) return dtd; } =20 +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +static size_t HUF_initDStream(BYTE const* ip) { + BYTE const lastByte =3D ip[7]; + size_t const bitsConsumed =3D lastByte ? 8 - BIT_highbit32(lastByte) := 0; + size_t const value =3D MEM_readLEST(ip) | 1; + assert(bitsConsumed <=3D 8); + return value << bitsConsumed; +} +typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +} HUF_DecompressAsmArgs; + +/* + * Initializes args for the asm decoding loop. + * @returns 0 on success + * 1 if the fallback implementation should be used. + * Or an error code on failure. 
+ */ +static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void= * dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* D= Table) +{ + void const* dt =3D DTable + 1; + U32 const dtLog =3D HUF_getDTableDesc(DTable).tableLog; + + const BYTE* const ilimit =3D (const BYTE*)src + 6 + 8; + + BYTE* const oend =3D (BYTE*)dst + dstSize; + + /* The following condition is false on x32 platform, + * but HUF_asm is not compatible with this ABI */ + if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) + return ERROR(corruption_detected); + + /* Must have at least 8 bytes per stream because we don't handle initi= alizing smaller bit containers. + * If table log is not correct at this point, fallback to the old deco= der. + * On small inputs we don't have enough data to trigger the fast loop,= so use the old decoder. + */ + if (dtLog !=3D HUF_DECODER_FAST_TABLELOG) + return 1; + + /* Read the jump table. */ + { + const BYTE* const istart =3D (const BYTE*)src; + size_t const length1 =3D MEM_readLE16(istart); + size_t const length2 =3D MEM_readLE16(istart+2); + size_t const length3 =3D MEM_readLE16(istart+4); + size_t const length4 =3D srcSize - (length1 + length2 + length3 + = 6); + args->iend[0] =3D istart + 6; /* jumpTable */ + args->iend[1] =3D args->iend[0] + length1; + args->iend[2] =3D args->iend[1] + length2; + args->iend[3] =3D args->iend[2] + length3; + + /* HUF_initDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >=3D 16 so that ip[0] >=3D ilimit before the lo= op + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) + return 1; + if (length4 > srcSize) return ERROR(corruption_detected); /* ove= rflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. 
*/ + args->ip[0] =3D args->iend[1] - sizeof(U64); + args->ip[1] =3D args->iend[2] - sizeof(U64); + args->ip[2] =3D args->iend[3] - sizeof(U64); + args->ip[3] =3D (BYTE const*)src + srcSize - sizeof(U64); + + /* op[] contains the output pointers. */ + args->op[0] =3D (BYTE*)dst; + args->op[1] =3D args->op[0] + (dstSize+3)/4; + args->op[2] =3D args->op[1] + (dstSize+3)/4; + args->op[3] =3D args->op[2] + (dstSize+3)/4; + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >=3D oend) + return 1; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. + * It is shifted left as it is read, and zeros are + * shifted in. After the lowest valid bit a 1 is + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ + args->bits[0] =3D HUF_initDStream(args->ip[0]); + args->bits[1] =3D HUF_initDStream(args->ip[1]); + args->bits[2] =3D HUF_initDStream(args->ip[2]); + args->bits[3] =3D HUF_initDStream(args->ip[3]); + + /* If ip[] >=3D ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is + * guaranteed to be valid (>=3D istart). + */ + args->ilimit =3D ilimit; + + args->oend =3D oend; + args->dt =3D dt; + + return 0; +} + +static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressA= smArgs const* args, int stream, BYTE* segmentEnd) +{ + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) + return ERROR(corruption_detected); + /* Validate that we haven't read beyond iend[]. + * Note that ip[] may be < iend[] because the MSB is + * the next bit to read, and we may have consumed 100% + * of the stream, so down to iend[i] - 8 is valid. + */ + if (args->ip[stream] < args->iend[stream] - 8) + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. 
*/ + bit->bitContainer =3D MEM_readLE64(args->ip[stream]); + bit->bitsConsumed =3D ZSTD_countTrailingZeros((size_t)args->bits[strea= m]); + bit->start =3D (const char*)args->iend[0]; + bit->limitPtr =3D bit->start + sizeof(size_t); + bit->ptr =3D (const char*)args->ip[stream]; + + return 0; +} +#endif + =20 #ifndef HUF_FORCE_DECOMPRESS_X2 =20 /*-***************************/ /* single-symbol decoding */ /*-***************************/ -typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol = decoding */ +typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol = decoding */ =20 /* * Packs 4 HUF_DEltX1 structs into a U64. This is used to lay down 4 entri= es at @@ -122,14 +283,44 @@ typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1= ; /* single-symbol decodi static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { U64 D4; if (MEM_isLittleEndian()) { - D4 =3D symbol + (nbBits << 8); - } else { D4 =3D (symbol << 8) + nbBits; + } else { + D4 =3D symbol + (nbBits << 8); } D4 *=3D 0x0001000100010001ULL; return D4; } =20 +/* + * Increase the tableLog to targetTableLog and rescales the stats. + * If tableLog > targetTableLog this is a no-op. + * @returns New tableLog + */ +static U32 HUF_rescaleStats(BYTE* huffWeight, U32* rankVal, U32 nbSymbols,= U32 tableLog, U32 targetTableLog) +{ + if (tableLog > targetTableLog) + return tableLog; + if (tableLog < targetTableLog) { + U32 const scale =3D targetTableLog - tableLog; + U32 s; + /* Increase the weight for all non-zero probability symbols by sca= le. */ + for (s =3D 0; s < nbSymbols; ++s) { + huffWeight[s] +=3D (BYTE)((huffWeight[s] =3D=3D 0) ? 0 : scale= ); + } + /* Update rankVal to reflect the new weights. + * All weights except 0 get moved to weight + scale. + * Weights [1, scale] are empty. 
+ */ + for (s =3D targetTableLog; s > scale; --s) { + rankVal[s] =3D rankVal[s - scale]; + } + for (s =3D scale; s > 0; --s) { + rankVal[s] =3D 0; + } + } + return targetTableLog; +} + typedef struct { U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1]; U32 rankStart[HUF_TABLELOG_ABSOLUTEMAX + 1]; @@ -162,8 +353,12 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, = const void* src, size_t sr iSize =3D HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, size= of(wksp->statsWksp), bmi2); if (HUF_isError(iSize)) return iSize; =20 + /* Table header */ { DTableDesc dtd =3D HUF_getDTableDesc(DTable); + U32 const maxTableLog =3D dtd.maxTableLog + 1; + U32 const targetTableLog =3D MIN(maxTableLog, HUF_DECODER_FAST_TAB= LELOG); + tableLog =3D HUF_rescaleStats(wksp->huffWeight, wksp->rankVal, nbS= ymbols, tableLog, targetTableLog); if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_too= Large); /* DTable too small, Huffman tree cannot fit in */ dtd.tableType =3D 0; dtd.tableLog =3D (BYTE)tableLog; @@ -207,7 +402,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, c= onst void* src, size_t sr =20 /* fill DTable * We fill all entries of each weight in order. - * That way length is a constant for each iteration of the outter loop. + * That way length is a constant for each iteration of the outer loop. * We can switch based on the length to a different inner loop which is * optimized for that particular case. 
*/ @@ -304,11 +499,15 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitD= Ptr, BYTE* const pEnd, cons BYTE* const pStart =3D p; =20 /* up to 4 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= < pEnd-3)) { - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_1(p, bitDPtr); - HUF_DECODE_SYMBOLX1_2(p, bitDPtr); - HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + if ((pEnd - p) > 3) { + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) = & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + } else { + BIT_reloadDStream(bitDPtr); } =20 /* [0-3] symbols remaining */ @@ -388,33 +587,36 @@ HUF_decompress4X1_usingDTable_internal_body( U32 endSignal =3D 1; =20 if (length4 > cSrcSize) return ERROR(corruption_detected); /* ov= erflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* ov= erflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); =20 /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode= */ - for ( ; (endSignal) & (op4 < olimit) ; ) { - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_1(op1, &bitD1); - HUF_DECODE_SYMBOLX1_1(op2, &bitD2); - HUF_DECODE_SYMBOLX1_1(op3, &bitD3); - HUF_DECODE_SYMBOLX1_1(op4, &bitD4); - HUF_DECODE_SYMBOLX1_2(op1, &bitD1); - HUF_DECODE_SYMBOLX1_2(op2, &bitD2); - HUF_DECODE_SYMBOLX1_2(op3, &bitD3); - HUF_DECODE_SYMBOLX1_2(op4, &bitD4); - HUF_DECODE_SYMBOLX1_0(op1, &bitD1); - HUF_DECODE_SYMBOLX1_0(op2, &bitD2); - HUF_DECODE_SYMBOLX1_0(op3, &bitD3); - HUF_DECODE_SYMBOLX1_0(op4, &bitD4); - endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStrea= 
m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished; + if ((size_t)(oend - op4) >=3D sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DS= tream_unfinished; + } } =20 /* check corruption */ @@ -440,6 +642,79 @@ HUF_decompress4X1_usingDTable_internal_body( } } =20 +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSi= ze, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t ds= tSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if 
ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF= _DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS +size_t +HUF_decompress4X1_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + void const* dt =3D DTable + 1; + const BYTE* const iend =3D (const BYTE*)cSrc + 6; + BYTE* const oend =3D (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret =3D HUF_DecompressAsmArgs_init(&args, dst, dstSiz= e, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret !=3D 0) + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSiz= e, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >=3D args.ilimit); + HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); + + /* Our loop guarantees that ip[] >=3D ilimit and that we haven't + * overwritten any op[]. + */ + assert(args.ip[0] >=3D iend); + assert(args.ip[1] >=3D iend); + assert(args.ip[2] >=3D iend); + assert(args.ip[3] >=3D iend); + assert(args.op[3] <=3D oend); + (void)iend; + + /* finish bit streams one by one. */ + { + size_t const segmentSize =3D (dstSize+3) / 4; + BYTE* segmentEnd =3D (BYTE*)dst; + int i; + for (i =3D 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <=3D (size_t)(oend - segmentEnd)) + segmentEnd +=3D segmentSize; + else + segmentEnd =3D oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segm= entEnd), "corruption"); + /* Decompress and validate that we've produced exactly the exp= ected length. 
*/ + args.op[i] +=3D HUF_decodeStreamX1(args.op[i], &bit, segmentEn= d, (HUF_DEltX1 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] !=3D segmentEnd) return ERROR(corruption_detect= ed); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ =20 typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, const void *cSrc, @@ -447,8 +722,28 @@ typedef size_t (*HUF_decompress_usingDTable_t)(void *d= st, size_t dstSize, const HUF_DTable *DTable); =20 HUF_DGEN(HUF_decompress1X1_usingDTable_internal) -HUF_DGEN(HUF_decompress4X1_usingDTable_internal) =20 +static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dst= Size, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSiz= e, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, c= Src, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, c= Src, cSrcSize, DTable); +#else + return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cS= rc, cSrcSize, DTable); +#endif +} =20 =20 size_t HUF_decompress1X1_usingDTable( @@ -518,106 +813,226 @@ size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx= , void* dst, size_t dstSize, /* *************************/ =20 typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /*= double-symbols decoding */ -typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef struct { BYTE symbol; } sortedSymbol_t; typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; =20 +/* + * Constructs a HUF_DEltX2 in a U32. 
+ */ +static U32 HUF_buildDEltX2U32(U32 symbol, U32 nbBits, U32 baseSeq, int lev= el) +{ + U32 seq; + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, sequence) =3D=3D 0); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, nbBits) =3D=3D 2); + DEBUG_STATIC_ASSERT(offsetof(HUF_DEltX2, length) =3D=3D 3); + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) =3D=3D sizeof(U32)); + if (MEM_isLittleEndian()) { + seq =3D level =3D=3D 1 ? symbol : (baseSeq + (symbol << 8)); + return seq + (nbBits << 16) + ((U32)level << 24); + } else { + seq =3D level =3D=3D 1 ? (symbol << 8) : ((baseSeq << 8) + symbol); + return (seq << 16) + (nbBits << 8) + (U32)level; + } +} =20 -/* HUF_fillDTableX2Level2() : - * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 = */ -static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const = U32 consumed, - const U32* rankValOrigin, const int minWeight, - const sortedSymbol_t* sortedSymbols, const U32 = sortedListSize, - U32 nbBitsBaseline, U16 baseSeq, U32* wksp, siz= e_t wkspSize) +/* + * Constructs a HUF_DEltX2. + */ +static HUF_DEltX2 HUF_buildDEltX2(U32 symbol, U32 nbBits, U32 baseSeq, int= level) { HUF_DEltX2 DElt; - U32* rankVal =3D wksp; + U32 const val =3D HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + DEBUG_STATIC_ASSERT(sizeof(DElt) =3D=3D sizeof(val)); + ZSTD_memcpy(&DElt, &val, sizeof(val)); + return DElt; +} + +/* + * Constructs 2 HUF_DEltX2s and packs them into a U64. + */ +static U64 HUF_buildDEltX2U64(U32 symbol, U32 nbBits, U16 baseSeq, int lev= el) +{ + U32 DElt =3D HUF_buildDEltX2U32(symbol, nbBits, baseSeq, level); + return (U64)DElt + ((U64)DElt << 32); +} =20 - assert(wkspSize >=3D HUF_TABLELOG_MAX + 1); - (void)wkspSize; - /* get pre-calculated rankVal */ - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + = 1)); +/* + * Fills the DTable rank with all the symbols from [begin, end) that are e= ach + * nbBits long. + * + * @param DTableRank The start of the rank in the DTable. 
+ * @param begin The first symbol to fill (inclusive). + * @param end The last symbol to fill (exclusive). + * @param nbBits Each symbol is nbBits long. + * @param tableLog The table log. + * @param baseSeq If level =3D=3D 1 { 0 } else { the first level symbol } + * @param level The level in the table. Must be 1 or 2. + */ +static void HUF_fillDTableX2ForWeight( + HUF_DEltX2* DTableRank, + sortedSymbol_t const* begin, sortedSymbol_t const* end, + U32 nbBits, U32 tableLog, + U16 baseSeq, int const level) +{ + U32 const length =3D 1U << ((tableLog - nbBits) & 0x1F /* quiet static= -analyzer */); + const sortedSymbol_t* ptr; + assert(level >=3D 1 && level <=3D 2); + switch (length) { + case 1: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + HUF_DEltX2 const DElt =3D HUF_buildDEltX2(ptr->symbol, nbBits,= baseSeq, level); + *DTableRank++ =3D DElt; + } + break; + case 2: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + HUF_DEltX2 const DElt =3D HUF_buildDEltX2(ptr->symbol, nbBits,= baseSeq, level); + DTableRank[0] =3D DElt; + DTableRank[1] =3D DElt; + DTableRank +=3D 2; + } + break; + case 4: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + DTableRank +=3D 4; + } + break; + case 8: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + DTableRank +=3D 8; + } + break; + default: + for (ptr =3D begin; ptr !=3D end; ++ptr) { + U64 const DEltX2 =3D HUF_buildDEltX2U64(ptr->symbol, nbBits, b= aseSeq, level); + HUF_DEltX2* const DTableRankEnd =3D DTableRank + length; + for (; DTableRank !=3D 
DTableRankEnd; DTableRank +=3D 8) { + ZSTD_memcpy(DTableRank + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTableRank + 6, &DEltX2, sizeof(DEltX2)); + } + } + break; + } +} =20 - /* fill skipped values */ +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 = */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, cons= t U32 consumedBits, + const U32* rankVal, const int minWeight, const = int maxWeight1, + const sortedSymbol_t* sortedSymbols, U32 const*= rankStart, + U32 nbBitsBaseline, U16 baseSeq) +{ + /* Fill skipped values (all positions up to rankVal[minWeight]). + * These are positions only get a single symbol because the combined w= eight + * is too large. + */ if (minWeight>1) { - U32 i, skipSize =3D rankVal[minWeight]; - MEM_writeLE16(&(DElt.sequence), baseSeq); - DElt.nbBits =3D (BYTE)(consumed); - DElt.length =3D 1; - for (i =3D 0; i < skipSize; i++) - DTable[i] =3D DElt; + U32 const length =3D 1U << ((targetLog - consumedBits) & 0x1F /* q= uiet static-analyzer */); + U64 const DEltX2 =3D HUF_buildDEltX2U64(baseSeq, consumedBits, /* = baseSeq */ 0, /* level */ 1); + int const skipSize =3D rankVal[minWeight]; + assert(length > 1); + assert((U32)skipSize < length); + switch (length) { + case 2: + assert(skipSize =3D=3D 1); + ZSTD_memcpy(DTable, &DEltX2, sizeof(DEltX2)); + break; + case 4: + assert(skipSize <=3D 4); + ZSTD_memcpy(DTable + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + 2, &DEltX2, sizeof(DEltX2)); + break; + default: + { + int i; + for (i =3D 0; i < skipSize; i +=3D 8) { + ZSTD_memcpy(DTable + i + 0, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 2, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 4, &DEltX2, sizeof(DEltX2)); + ZSTD_memcpy(DTable + i + 6, &DEltX2, sizeof(DEltX2)); + } + } + } } =20 - /* fill DTable */ - { U32 s; for (s=3D0; s=3D 1 
*/ - - rankVal[weight] +=3D length; - } } + /* Fill each of the second level symbols by weight. */ + { + int w; + for (w =3D minWeight; w < maxWeight1; ++w) { + int const begin =3D rankStart[w]; + int const end =3D rankStart[w+1]; + U32 const nbBits =3D nbBitsBaseline - w; + U32 const totalBits =3D nbBits + consumedBits; + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedSymbols + begin, sortedSymbols + end, + totalBits, targetLog, + baseSeq, /* level */ 2); + } + } } =20 - static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, - const sortedSymbol_t* sortedList, const U32 sor= tedListSize, + const sortedSymbol_t* sortedList, const U32* rankStart, rankVal_t rankValOrigin, = const U32 maxWeight, - const U32 nbBitsBaseline, U32* wksp, size_t wks= pSize) + const U32 nbBitsBaseline) { - U32* rankVal =3D wksp; + U32* const rankVal =3D rankValOrigin[0]; const int scaleLog =3D nbBitsBaseline - targetLog; /* note : targetL= og >=3D srcLog, hence scaleLog <=3D 1 */ const U32 minBits =3D nbBitsBaseline - maxWeight; - U32 s; - - assert(wkspSize >=3D HUF_TABLELOG_MAX + 1); - wksp +=3D HUF_TABLELOG_MAX + 1; - wkspSize -=3D HUF_TABLELOG_MAX + 1; - - ZSTD_memcpy(rankVal, rankValOrigin, sizeof(U32) * (HUF_TABLELOG_MAX + = 1)); - - /* fill DTable */ - for (s=3D0; s=3D minBits) { /* enough room for a second= symbol */ - U32 sortedRank; + int w; + int const wEnd =3D (int)maxWeight + 1; + + /* Fill DTable in order of weight. */ + for (w =3D 1; w < wEnd; ++w) { + int const begin =3D (int)rankStart[w]; + int const end =3D (int)rankStart[w+1]; + U32 const nbBits =3D nbBitsBaseline - w; + + if (targetLog-nbBits >=3D minBits) { + /* Enough room for a second symbol. 
*/ + int start =3D rankVal[w]; + U32 const length =3D 1U << ((targetLog - nbBits) & 0x1F /* qui= et static-analyzer */); int minWeight =3D nbBits + scaleLog; + int s; if (minWeight < 1) minWeight =3D 1; - sortedRank =3D rankStart[minWeight]; - HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, - rankValOrigin[nbBits], minWeight, - sortedList+sortedRank, sortedListSize-sortedRan= k, - nbBitsBaseline, symbol, wksp, wkspSize); + /* Fill the DTable for every symbol of weight w. + * These symbols get at least 1 second symbol. + */ + for (s =3D begin; s !=3D end; ++s) { + HUF_fillDTableX2Level2( + DTable + start, targetLog, nbBits, + rankValOrigin[nbBits], minWeight, wEnd, + sortedList, rankStart, + nbBitsBaseline, sortedList[s].symbol); + start +=3D length; + } } else { - HUF_DEltX2 DElt; - MEM_writeLE16(&(DElt.sequence), symbol); - DElt.nbBits =3D (BYTE)(nbBits); - DElt.length =3D 1; - { U32 const end =3D start + length; - U32 u; - for (u =3D start; u < end; u++) DTable[u] =3D DElt; - } } - rankVal[weight] +=3D length; + /* Only a single symbol. 
*/ + HUF_fillDTableX2ForWeight( + DTable + rankVal[w], + sortedList + begin, sortedList + end, + nbBits, targetLog, + /* baseSeq */ 0, /* level */ 1); + } } } =20 typedef struct { rankValCol_t rankVal[HUF_TABLELOG_MAX]; U32 rankStats[HUF_TABLELOG_MAX + 1]; - U32 rankStart0[HUF_TABLELOG_MAX + 2]; + U32 rankStart0[HUF_TABLELOG_MAX + 3]; sortedSymbol_t sortedSymbol[HUF_SYMBOLVALUE_MAX + 1]; BYTE weightList[HUF_SYMBOLVALUE_MAX + 1]; U32 calleeWksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; @@ -627,9 +1042,16 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) { - U32 tableLog, maxW, sizeOfSort, nbSymbols; + return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wks= pSize, /* bmi2 */ 0); +} + +size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + U32 tableLog, maxW, nbSymbols; DTableDesc dtd =3D HUF_getDTableDesc(DTable); - U32 const maxTableLog =3D dtd.maxTableLog; + U32 maxTableLog =3D dtd.maxTableLog; size_t iSize; void* dtPtr =3D DTable+1; /* force compiler to avoid strict-aliasing= */ HUF_DEltX2* const dt =3D (HUF_DEltX2*)dtPtr; @@ -647,11 +1069,12 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not neces= sary, even though some analyzer complain ... 
*/ =20 - iSize =3D HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, s= izeof(wksp->calleeWksp), /* bmi2 */ 0); + iSize =3D HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1= , wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, s= izeof(wksp->calleeWksp), bmi2); if (HUF_isError(iSize)) return iSize; =20 /* check result */ if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTab= le can't fit code depth */ + if (tableLog <=3D HUF_DECODER_FAST_TABLELOG && maxTableLog > HUF_DECOD= ER_FAST_TABLELOG) maxTableLog =3D HUF_DECODER_FAST_TABLELOG; =20 /* find maxWeight */ for (maxW =3D tableLog; wksp->rankStats[maxW]=3D=3D0; maxW--) {} /* n= ecessarily finds a solution before 0 */ @@ -664,7 +1087,7 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, rankStart[w] =3D curr; } rankStart[0] =3D nextRankStart; /* put all 0w symbols at the end= of sorted list*/ - sizeOfSort =3D nextRankStart; + rankStart[maxW+1] =3D nextRankStart; } =20 /* sort symbols by weight */ @@ -673,7 +1096,6 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, U32 const w =3D wksp->weightList[s]; U32 const r =3D rankStart[w]++; wksp->sortedSymbol[r].symbol =3D (BYTE)s; - wksp->sortedSymbol[r].weight =3D (BYTE)w; } rankStart[0] =3D 0; /* forget 0w symbols; this is beginning of w= eight(1) */ } @@ -698,10 +1120,9 @@ size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, } } } } =20 HUF_fillDTableX2(dt, maxTableLog, - wksp->sortedSymbol, sizeOfSort, + wksp->sortedSymbol, wksp->rankStart0, wksp->rankVal, maxW, - tableLog+1, - wksp->calleeWksp, sizeof(wksp->calleeWksp) / sizeof(U32= )); + tableLog+1); =20 dtd.tableLog =3D (BYTE)maxTableLog; dtd.tableType =3D 1; @@ -714,7 +1135,7 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt,= const U32 dtLog) { size_t const val =3D BIT_lookBitsFast(DStream, dtLog); /* note : dtL= og >=3D 1 */ - 
ZSTD_memcpy(op, dt+val, 2); + ZSTD_memcpy(op, &dt[val].sequence, 2); BIT_skipBits(DStream, dt[val].nbBits); return dt[val].length; } @@ -723,15 +1144,17 @@ FORCE_INLINE_TEMPLATE U32 HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2*= dt, const U32 dtLog) { size_t const val =3D BIT_lookBitsFast(DStream, dtLog); /* note : dtL= og >=3D 1 */ - ZSTD_memcpy(op, dt+val, 1); - if (dt[val].length=3D=3D1) BIT_skipBits(DStream, dt[val].nbBits); - else { + ZSTD_memcpy(op, &dt[val].sequence, 1); + if (dt[val].length=3D=3D1) { + BIT_skipBits(DStream, dt[val].nbBits); + } else { if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { BIT_skipBits(DStream, dt[val].nbBits); if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) /* ugly hack; works only because it's the last symbol. Not= e : can't easily extract nbBits from just this symbol */ DStream->bitsConsumed =3D (sizeof(DStream->bitContainer)*8= ); - } } + } + } return 1; } =20 @@ -753,19 +1176,37 @@ HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, = BYTE* const pEnd, BYTE* const pStart =3D p; =20 /* up to 8 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_1(p, bitDPtr); - HUF_DECODE_SYMBOLX2_2(p, bitDPtr); - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >=3D sizeof(bitDPtr->bitContainer)) { + if (dtLog <=3D 11 && MEM_64bits()) { + /* up to 10 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinish= ed) & (p < pEnd-9)) { + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } else { + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinish= ed) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + 
HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + } + } else { + BIT_reloadDStream(bitDPtr); } =20 /* closer to end : up to 2 symbols at a time */ - while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) & (p= <=3D pEnd-2)) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + if ((size_t)(pEnd - p) >=3D 2) { + while ((BIT_reloadDStream(bitDPtr) =3D=3D BIT_DStream_unfinished) = & (p <=3D pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); =20 - while (p <=3D pEnd-2) - HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reache= d the end of DStream */ + while (p <=3D pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : re= ached the end of DStream */ + } =20 if (p < pEnd) p +=3D HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); @@ -799,7 +1240,6 @@ HUF_decompress1X2_usingDTable_internal_body( /* decoded size */ return dstSize; } - FORCE_INLINE_TEMPLATE size_t HUF_decompress4X2_usingDTable_internal_body( void* dst, size_t dstSize, @@ -841,57 +1281,60 @@ HUF_decompress4X2_usingDTable_internal_body( U32 const dtLog =3D dtd.tableLog; =20 if (length4 > cSrcSize) return ERROR(corruption_detected); /* ov= erflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* ov= erflow */ CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); =20 /* 16-32 symbols per loop (4-8 symbols per stream) */ - for ( ; (endSignal) & (op4 < olimit); ) { + if ((size_t)(oend - op4) >=3D sizeof(size_t)) { + for ( ; (endSignal) & (op4 < olimit); ) { #if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - 
HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished; - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished; - endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished; + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &=3D BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DS= tream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &=3D BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DS= tream_unfinished; + endSignal &=3D BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DS= tream_unfinished; #else - HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_1(op1, &bitD1); - HUF_DECODE_SYMBOLX2_1(op2, &bitD2); - HUF_DECODE_SYMBOLX2_1(op3, &bitD3); - HUF_DECODE_SYMBOLX2_1(op4, &bitD4); - 
HUF_DECODE_SYMBOLX2_2(op1, &bitD1); - HUF_DECODE_SYMBOLX2_2(op2, &bitD2); - HUF_DECODE_SYMBOLX2_2(op3, &bitD3); - HUF_DECODE_SYMBOLX2_2(op4, &bitD4); - HUF_DECODE_SYMBOLX2_0(op1, &bitD1); - HUF_DECODE_SYMBOLX2_0(op2, &bitD2); - HUF_DECODE_SYMBOLX2_0(op3, &bitD3); - HUF_DECODE_SYMBOLX2_0(op4, &bitD4); - endSignal =3D (U32)LIKELY((U32) - (BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStream_= unfinished) - & (BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStream_= unfinished)); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal =3D (U32)LIKELY((U32) + (BIT_reloadDStreamFast(&bitD1) =3D=3D BIT_DStr= eam_unfinished) + & (BIT_reloadDStreamFast(&bitD2) =3D=3D BIT_DStrea= m_unfinished) + & (BIT_reloadDStreamFast(&bitD3) =3D=3D BIT_DStrea= m_unfinished) + & (BIT_reloadDStreamFast(&bitD4) =3D=3D BIT_DStrea= m_unfinished)); #endif + } } =20 /* check corruption */ @@ -915,8 +1358,99 @@ HUF_decompress4X2_usingDTable_internal_body( } } =20 +#if HUF_NEED_BMI2_FUNCTION +static BMI2_TARGET_ATTRIBUTE +size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSi= ze, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if HUF_NEED_DEFAULT_FUNCTION +static +size_t 
HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t ds= tSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc,= cSrcSize, DTable); +} +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 + +HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF= _DecompressAsmArgs* args) ZSTDLIB_HIDDEN; + +static HUF_ASM_X86_64_BMI2_ATTRS size_t +HUF_decompress4X2_usingDTable_internal_bmi2_asm( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) { + void const* dt =3D DTable + 1; + const BYTE* const iend =3D (const BYTE*)cSrc + 6; + BYTE* const oend =3D (BYTE*)dst + dstSize; + HUF_DecompressAsmArgs args; + { + size_t const ret =3D HUF_DecompressAsmArgs_init(&args, dst, dstSiz= e, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); + if (ret !=3D 0) + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSiz= e, cSrc, cSrcSize, DTable); + } + + assert(args.ip[0] >=3D args.ilimit); + HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >=3D iend); + assert(args.ip[1] >=3D iend); + assert(args.ip[2] >=3D iend); + assert(args.ip[3] >=3D iend); + assert(args.op[3] <=3D oend); + (void)iend; + + /* finish bitStreams one by one */ + { + size_t const segmentSize =3D (dstSize+3) / 4; + BYTE* segmentEnd =3D (BYTE*)dst; + int i; + for (i =3D 0; i < 4; ++i) { + BIT_DStream_t bit; + if (segmentSize <=3D (size_t)(oend - segmentEnd)) + segmentEnd +=3D segmentSize; + else + segmentEnd =3D oend; + FORWARD_IF_ERROR(HUF_initRemainingDStream(&bit, &args, i, segm= entEnd), "corruption"); + args.op[i] +=3D HUF_decodeStreamX2(args.op[i], &bit, segmentEn= d, (HUF_DEltX2 const*)dt, HUF_DECODER_FAST_TABLELOG); + if (args.op[i] !=3D segmentEnd) + return ERROR(corruption_detected); + } + } + + /* decoded size */ + return dstSize; +} +#endif /* 
ZSTD_ENABLE_ASM_X86_64_BMI2 */ + +static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dst= Size, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) +{ +#if DYNAMIC_BMI2 + if (bmi2) { +# if ZSTD_ENABLE_ASM_X86_64_BMI2 + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSiz= e, cSrc, cSrcSize, DTable); +# else + return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, c= Src, cSrcSize, DTable); +# endif + } +#else + (void)bmi2; +#endif + +#if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) + return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, c= Src, cSrcSize, DTable); +#else + return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cS= rc, cSrcSize, DTable); +#endif +} + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) -HUF_DGEN(HUF_decompress4X2_usingDTable_internal) =20 size_t HUF_decompress1X2_usingDTable( void* dst, size_t dstSize, @@ -1025,25 +1559,25 @@ size_t HUF_decompress4X_usingDTable(void* dst, size= _t maxDstSize, =20 #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; -static const algo_time_t algoTime[16 /* Quantization */][3 /* single, doub= le, quad */] =3D +static const algo_time_t algoTime[16 /* Quantization */][2 /* single, doub= le */] =3D { /* single, double, quad */ - {{0,0}, {1,1}, {2,2}}, /* Q=3D=3D0 : impossible */ - {{0,0}, {1,1}, {2,2}}, /* Q=3D=3D1 : impossible */ - {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q =3D=3D 2 : 12-18% */ - {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q =3D=3D 3 : 18-25% */ - {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q =3D=3D 4 : 25-32% */ - {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q =3D=3D 5 : 32-38% */ - {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q =3D=3D 6 : 38-44% */ - {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q =3D=3D 7 : 44-50% */ - {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q =3D=3D 8 : 50-56% */ - {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q 
=3D=3D 9 : 56-62% */ - {{1107,128}, {2083, 81}, {4006, 84}}, /* Q =3D=3D10 : 62-69% */ - {{1177,128}, {2379, 87}, {4785, 88}}, /* Q =3D=3D11 : 69-75% */ - {{1242,128}, {2415, 93}, {5155, 84}}, /* Q =3D=3D12 : 75-81% */ - {{1349,128}, {2644,106}, {5260,106}}, /* Q =3D=3D13 : 81-87% */ - {{1455,128}, {2422,124}, {4174,124}}, /* Q =3D=3D14 : 87-93% */ - {{ 722,128}, {1891,145}, {1936,146}}, /* Q =3D=3D15 : 93-99% */ + {{0,0}, {1,1}}, /* Q=3D=3D0 : impossible */ + {{0,0}, {1,1}}, /* Q=3D=3D1 : impossible */ + {{ 150,216}, { 381,119}}, /* Q =3D=3D 2 : 12-18% */ + {{ 170,205}, { 514,112}}, /* Q =3D=3D 3 : 18-25% */ + {{ 177,199}, { 539,110}}, /* Q =3D=3D 4 : 25-32% */ + {{ 197,194}, { 644,107}}, /* Q =3D=3D 5 : 32-38% */ + {{ 221,192}, { 735,107}}, /* Q =3D=3D 6 : 38-44% */ + {{ 256,189}, { 881,106}}, /* Q =3D=3D 7 : 44-50% */ + {{ 359,188}, {1167,109}}, /* Q =3D=3D 8 : 50-56% */ + {{ 582,187}, {1570,114}}, /* Q =3D=3D 9 : 56-62% */ + {{ 688,187}, {1712,122}}, /* Q =3D=3D10 : 62-69% */ + {{ 825,186}, {1965,136}}, /* Q =3D=3D11 : 69-75% */ + {{ 976,185}, {2131,150}}, /* Q =3D=3D12 : 75-81% */ + {{1180,186}, {2070,175}}, /* Q =3D=3D13 : 81-87% */ + {{1377,185}, {1731,202}}, /* Q =3D=3D14 : 87-93% */ + {{1412,185}, {1695,202}}, /* Q =3D=3D15 : 93-99% */ }; #endif =20 @@ -1070,7 +1604,7 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSiz= e) U32 const D256 =3D (U32)(dstSize >> 8); U32 const DTime0 =3D algoTime[Q][0].tableTime + (algoTime[Q][0].de= code256Time * D256); U32 DTime1 =3D algoTime[Q][1].tableTime + (algoTime[Q][1].decode25= 6Time * D256); - DTime1 +=3D DTime1 >> 3; /* advantage to algorithm using less mem= ory, to reduce cache eviction */ + DTime1 +=3D DTime1 >> 5; /* small advantage to algorithm using le= ss memory, to reduce cache eviction */ return DTime1 < DTime0; } #endif diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zs= td_decompress.c index b4d81d84479a..b9b935a9f5c0 100644 --- a/lib/zstd/decompress/zstd_decompress.c +++ 
b/lib/zstd/decompress/zstd_decompress.c @@ -53,7 +53,6 @@ * Dependencies *********************************************************/ #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_mems= et */ -#include "../common/cpu.h" /* bmi2 */ #include "../common/mem.h" /* low level memory routines */ #define FSE_STATIC_LINKING_ONLY #include "../common/fse.h" @@ -252,11 +251,11 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) dctx->inBuffSize =3D 0; dctx->outBuffSize =3D 0; dctx->streamStage =3D zdss_init; - dctx->legacyContext =3D NULL; - dctx->previousLegacyVersion =3D 0; dctx->noForwardProgress =3D 0; dctx->oversizedDuration =3D 0; - dctx->bmi2 =3D ZSTD_cpuid_bmi2(ZSTD_cpuid()); +#if DYNAMIC_BMI2 + dctx->bmi2 =3D ZSTD_cpuSupportsBmi2(); +#endif dctx->ddictSet =3D NULL; ZSTD_DCtx_resetParameters(dctx); #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION @@ -277,8 +276,7 @@ ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t = workspaceSize) return dctx; } =20 -ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) -{ +static ZSTD_DCtx* ZSTD_createDCtx_internal(ZSTD_customMem customMem) { if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; =20 { ZSTD_DCtx* const dctx =3D (ZSTD_DCtx*)ZSTD_customMalloc(sizeof(*dc= tx), customMem); @@ -289,10 +287,15 @@ ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem cu= stomMem) } } =20 +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_internal(customMem); +} + ZSTD_DCtx* ZSTD_createDCtx(void) { DEBUGLOG(3, "ZSTD_createDCtx"); - return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } =20 static void ZSTD_clearDict(ZSTD_DCtx* dctx) @@ -370,6 +373,19 @@ unsigned ZSTD_isFrame(const void* buffer, size_t size) return 0; } =20 +/*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier = for a skippable frame. + * Note : Frame Identifier is 4 bytes. 
If `size < 4`, @return will always= be 0. + */ +unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic =3D MEM_readLE32(buffer); + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) =3D=3D ZSTD_MAGIC_SKIPPABL= E_START) return 1; + } + return 0; +} + /* ZSTD_frameHeaderSize_internal() : * srcSize must be large enough to reach header size fields. * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. @@ -497,7 +513,6 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, co= nst void* src, size_t src return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1= ); } =20 - /* ZSTD_getFrameContentSize() : * compatible with legacy mode * @return : decompressed size of the single frame pointed to be `src` if = known, otherwise @@ -532,6 +547,37 @@ static size_t readSkippableFrameSize(void const* src, = size_t srcSize) } } =20 +/*! ZSTD_readSkippableFrame() : + * Retrieves a zstd skippable frame containing data given by src, and writ= es it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was suppl= ied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the= caller is not interested + * in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the f= rame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, = unsigned* magicVariant, + const void* src, size_t srcSiz= e) +{ + U32 const magicNumber =3D MEM_readLE32(src); + size_t skippableFrameSize =3D readSkippableFrameSize(src, srcSize); + size_t skippableContentSize =3D skippableFrameSize - ZSTD_SKIPPABLEHEA= DERSIZE; + + /* check input validity */ + RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_u= nsupported, ""); + RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skipp= ableFrameSize > srcSize, srcSize_wrong, ""); + RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, = ""); + + /* deliver payload */ + if (skippableContentSize > 0 && dst !=3D NULL) + ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, ski= ppableContentSize); + if (magicVariant !=3D NULL) + *magicVariant =3D magicNumber - ZSTD_MAGIC_SKIPPABLE_START; + return skippableContentSize; +} + /* ZSTD_findDecompressedSize() : * compatible with legacy mode * `srcSize` must be the exact length of some number of ZSTD compressed a= nd/or @@ -824,7 +870,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, switch(blockProperties.blockType) { case bt_compressed: - decodedSize =3D ZSTD_decompressBlock_internal(dctx, op, (size_= t)(oend-op), ip, cBlockSize, /* frame */ 1); + decodedSize =3D ZSTD_decompressBlock_internal(dctx, op, (size_= t)(oend-op), ip, cBlockSize, /* frame */ 1, not_streaming); break; case bt_raw : decodedSize =3D ZSTD_copyRawBlock(op, (size_t)(oend-op), ip, c= BlockSize); @@ -976,7 +1022,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, = const void* src, size_t sr { #if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=3D1) size_t regenSize; - ZSTD_DCtx* const dctx =3D ZSTD_createDCtx(); + ZSTD_DCtx* const dctx =3D ZSTD_createDCtx_internal(ZSTD_defaultCMem); RETURN_ERROR_IF(dctx=3D=3DNULL, memory_allocation, "NULL pointer!"); regenSize =3D ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize= 
); ZSTD_freeDCtx(dctx); @@ -996,7 +1042,7 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, = const void* src, size_t sr size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expect= ed; } =20 /* - * Similar to ZSTD_nextSrcSizeToDecompress(), but when when a block input = can be streamed, + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can b= e streamed, * we allow taking a partial block as the input. Currently only raw uncomp= ressed blocks can * be streamed. * @@ -1010,7 +1056,7 @@ static size_t ZSTD_nextSrcSizeToDecompressWithInputSi= ze(ZSTD_DCtx* dctx, size_t return dctx->expected; if (dctx->bType !=3D bt_raw) return dctx->expected; - return MIN(MAX(inputSize, 1), dctx->expected); + return BOUNDED(1, inputSize, dctx->expected); } =20 ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { @@ -1116,7 +1162,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void*= dst, size_t dstCapacity, c { case bt_compressed: DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); - rSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapa= city, src, srcSize, /* frame */ 1); + rSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapa= city, src, srcSize, /* frame */ 1, is_streaming); dctx->expected =3D 0; /* Streaming not supported */ break; case bt_raw : @@ -1438,7 +1484,7 @@ size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, ZSTD_DStream* ZSTD_createDStream(void) { DEBUGLOG(3, "ZSTD_createDStream"); - return ZSTD_createDStream_advanced(ZSTD_defaultCMem); + return ZSTD_createDCtx_internal(ZSTD_defaultCMem); } =20 ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) @@ -1448,7 +1494,7 @@ ZSTD_DStream* ZSTD_initStaticDStream(void *workspace,= size_t workspaceSize) =20 ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) { - return ZSTD_createDCtx_advanced(customMem); + return ZSTD_createDCtx_internal(customMem); } =20 size_t ZSTD_freeDStream(ZSTD_DStream* zds) @@ -1708,7 +1754,8 @@ 
size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned= long long frameContentSize) { size_t const blockSize =3D (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX= ); - unsigned long long const neededRBSize =3D windowSize + blockSize + (WI= LDCOPY_OVERLENGTH * 2); + /* space is needed to store the litbuffer after the output of a given = block without stomping the extDict of a previous run, as well as to cover b= oth windows against wildcopy*/ + unsigned long long const neededRBSize =3D windowSize + blockSize + ZST= D_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); unsigned long long const neededSize =3D MIN(frameContentSize, neededRB= Size); size_t const minRBSize =3D (size_t) neededSize; RETURN_ERROR_IF((unsigned long long)minRBSize !=3D neededSize, @@ -1842,7 +1889,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_= outBuffer* output, ZSTD_inB DEBUGLOG(5, "stage zdss_init =3D> transparent reset "); zds->streamStage =3D zdss_loadHeader; zds->lhSize =3D zds->inPos =3D zds->outStart =3D zds->outEnd = =3D 0; - zds->legacyVersion =3D 0; zds->hostageByte =3D 0; zds->expectedOutBuffer =3D *output; ZSTD_FALLTHROUGH; diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompr= ess/zstd_decompress_block.c index 2d101d9a842e..c1913b8e7c89 100644 --- a/lib/zstd/decompress/zstd_decompress_block.c +++ b/lib/zstd/decompress/zstd_decompress_block.c @@ -69,15 +69,56 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSi= ze, } } =20 +/* Allocate buffer for literals, either overlapping current dst, or split = between dst and litExtraBuffer, or stored entirely within litExtraBuffer */ +static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, = const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, c= onst unsigned splitImmediately) +{ + if (streaming =3D=3D not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX= + 
WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) + { + /* room for litbuffer to fit without read faulting */ + dctx->litBuffer =3D (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVE= RLENGTH; + dctx->litBufferEnd =3D dctx->litBuffer + litSize; + dctx->litBufferLocation =3D ZSTD_in_dst; + } + else if (litSize > ZSTD_LITBUFFEREXTRASIZE) + { + /* won't fit in litExtraBuffer, so it will be split between end of= dst and extra buffer */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between en= d of dst and extra buffer */ + dctx->litBuffer =3D (BYTE*)dst + expectedWriteSize - litSize += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd =3D dctx->litBuffer + litSize - ZSTD_LITBUF= FEREXTRASIZE; + } + else { + /* initially this will be stored entirely in dst during huffma= n decoding, it will partially shifted to litExtraBuffer after */ + dctx->litBuffer =3D (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd =3D (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation =3D ZSTD_split; + } + else + { + /* fits entirely within litExtraBuffer, so no split is necessary */ + dctx->litBuffer =3D dctx->litExtraBuffer; + dctx->litBufferEnd =3D dctx->litBuffer + litSize; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + } +} =20 /* Hidden declaration for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize); + const void* src, size_t srcSize, + void* dst, size_t dstCapacity, const streaming_o= peration streaming); /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output durin= g decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full= in the excess dst space after where the current + * block will be output. 
Otherwise it will be stored at the end of the cu= rrent dst blockspace, with a small portion being + * stored in dctx->litExtraBuffer to help keep it "ahead" of the current o= utput write. + * * @return : nb of bytes read from src (< srcSize ) * note : symbol not declared but exposed for fullbench */ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, - const void* src, size_t srcSize) /* note : src= Size < BLOCKSIZE */ + const void* src, size_t srcSize, /* note : src= Size < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_o= peration streaming) { DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); @@ -99,6 +140,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, U32 const lhlCode =3D (istart[0] >> 2) & 3; U32 const lhc =3D MEM_readLE32(istart); size_t hufSuccess; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 1: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -121,8 +163,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, litCSize =3D (lhc >> 22) + ((size_t)istart[4] << 10); break; } + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_d= etected, ""); RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_de= tected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooS= mall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, expectedWriteSize, 0); =20 /* prefetch huffman table if cold */ if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { @@ -133,11 +178,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, if (singleStream) { hufSuccess =3D HUF_decompress1X_usingDTable_bmi2( dctx->litBuffer, litSize, istart+lhSize, litCS= ize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } else { hufSuccess =3D HUF_decompress4X_usingDTable_bmi2( 
dctx->litBuffer, litSize, istart+lhSize, litCS= ize, - dctx->HUFptr, dctx->bmi2); + dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); } } else { if (singleStream) { @@ -150,15 +195,22 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, hufSuccess =3D HUF_decompress1X1_DCtx_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSi= ze, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dc= tx)); #endif } else { hufSuccess =3D HUF_decompress4X_hufOnly_wksp_bmi2( dctx->entropy.hufTable, dctx->litBuffer, litSi= ze, istart+lhSize, litCSize, dctx->workspace, - sizeof(dctx->workspace), dctx->bmi2); + sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dc= tx)); } } + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd -= ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE= - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer +=3D ZSTD_LITBUFFEREXTRASIZE - WILDCOP= Y_OVERLENGTH; + dctx->litBufferEnd -=3D WILDCOPY_OVERLENGTH; + } =20 RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detect= ed, ""); =20 @@ -166,13 +218,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, dctx->litSize =3D litSize; dctx->litEntropy =3D 1; if (litEncType=3D=3Dset_compressed) dctx->HUFptr =3D dctx-= >entropy.hufTable; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_O= VERLENGTH); return litCSize + lhSize; } =20 case set_basic: { size_t litSize, lhSize; U32 const lhlCode =3D ((istart[0]) >> 2) & 3; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -189,23 +241,36 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, break; } =20 + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); + 
RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSm= all, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, expectedWriteSize, 1); if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* ri= sk reading beyond src buffer with wildcopy */ RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_d= etected, ""); - ZSTD_memcpy(dctx->litBuffer, istart+lhSize, litSize); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litS= ize - ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, istart + lhSize = + litSize - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memcpy(dctx->litBuffer, istart + lhSize, litS= ize); + } dctx->litPtr =3D dctx->litBuffer; dctx->litSize =3D litSize; - ZSTD_memset(dctx->litBuffer + dctx->litSize, 0, WILDCO= PY_OVERLENGTH); return lhSize+litSize; } /* direct reference into compressed stream */ dctx->litPtr =3D istart+lhSize; dctx->litSize =3D litSize; + dctx->litBufferEnd =3D dctx->litPtr + litSize; + dctx->litBufferLocation =3D ZSTD_not_in_dst; return lhSize+litSize; } =20 case set_rle: { U32 const lhlCode =3D ((istart[0]) >> 2) & 3; size_t litSize, lhSize; + size_t expectedWriteSize =3D MIN(ZSTD_BLOCKSIZE_MAX, dstCa= pacity); switch(lhlCode) { case 0: case 2: default: /* note : default is impossible= , since lhlCode into [0..3] */ @@ -222,8 +287,19 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSi= ze >=3D MIN_CBLOCK_SIZE =3D=3D 3; here we need lhSize+1 =3D 4"); break; } + RETURN_ERROR_IF(litSize > 0 && dst =3D=3D NULL, dstSize_to= oSmall, "NULL not handled"); RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_d= etected, ""); - ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize + WIL= DCOPY_OVERLENGTH); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSm= all, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSiz= e, streaming, 
expectedWriteSize, 1); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize -= ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memset(dctx->litExtraBuffer, istart[lhSize], ZSTD= _LITBUFFEREXTRASIZE); + } + else + { + ZSTD_memset(dctx->litBuffer, istart[lhSize], litSize); + } dctx->litPtr =3D dctx->litBuffer; dctx->litSize =3D litSize; return lhSize+1; @@ -343,7 +419,7 @@ static const ZSTD_seqSymbol ML_defaultDTable[(1<nbBits =3D 0; cell->nextState =3D 0; assert(nbAddBits < 255); - cell->nbAdditionalBits =3D (BYTE)nbAddBits; + cell->nbAdditionalBits =3D nbAddBits; cell->baseValue =3D baseValue; } =20 @@ -367,7 +443,7 @@ static void ZSTD_buildSeqTable_rle(ZSTD_seqSymbol* dt, = U32 baseValue, U32 nbAddB FORCE_INLINE_TEMPLATE void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_seqSymbol* const tableDecode =3D dt+1; @@ -478,7 +554,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, tableDecode[u].nbBits =3D (BYTE) (tableLog - BIT_highbit32(nex= tState) ); tableDecode[u].nextState =3D (U16) ( (nextState << tableDecode= [u].nbBits) - tableSize); assert(nbAdditionalBits[symbol] < 255); - tableDecode[u].nbAdditionalBits =3D (BYTE)nbAdditionalBits[sym= bol]; + tableDecode[u].nbAdditionalBits =3D nbAdditionalBits[symbol]; tableDecode[u].baseValue =3D baseValue[symbol]; } } @@ -487,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, /* Avoids the FORCE_INLINE of the _body() function. 
*/ static void ZSTD_buildFSETable_body_default(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -495,9 +571,9 @@ static void ZSTD_buildFSETable_body_default(ZSTD_seqSym= bol* dt, } =20 #if DYNAMIC_BMI2 -TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable_body_bmi2(ZSTD_seq= Symbol* dt, +BMI2_TARGET_ATTRIBUTE static void ZSTD_buildFSETable_body_bmi2(ZSTD_seqSym= bol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize) { ZSTD_buildFSETable_body(dt, normalizedCounter, maxSymbolValue, @@ -507,7 +583,7 @@ TARGET_ATTRIBUTE("bmi2") static void ZSTD_buildFSETable= _body_bmi2(ZSTD_seqSymbol =20 void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2) { #if DYNAMIC_BMI2 @@ -529,7 +605,7 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTableSpace, const ZSTD_s= eqSymbol** DTablePtr, symbolEncodingType_e type, unsigned max, = U32 maxLog, const void* src, size_t srcSize, - const U32* baseValue, const U32* nbAdditi= onalBits, + const U32* baseValue, const U8* nbAdditio= nalBits, const ZSTD_seqSymbol* defaultTable, U32 f= lagRepeatTable, int ddictIsCold, int nbSeq, U32* wksp, si= ze_t wkspSize, int bmi2) @@ -541,7 +617,7 @@ static size_t ZSTD_buildSeqTable(ZSTD_seqSymbol* DTable= Space, const ZSTD_seqSymb RETURN_ERROR_IF((*(const BYTE*)src) > max, corruption_detected, ""= ); { U32 const symbol =3D *(const BYTE*)src; U32 
const baseline =3D baseValue[symbol]; - U32 const nbBits =3D nbAdditionalBits[symbol]; + U8 const nbBits =3D nbAdditionalBits[symbol]; ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); } *DTablePtr =3D DTableSpace; @@ -620,7 +696,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, LL_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D llhSize; } @@ -632,7 +708,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, OF_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D ofhSize; } @@ -644,7 +720,7 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSe= qPtr, ML_defaultDTable, dc= tx->fseEntropy, dctx->ddictIsCold, n= bSeq, dctx->workspace, siz= eof(dctx->workspace), - dctx->bmi2); + ZSTD_DCtx_get_bmi2(d= ctx)); RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "Z= STD_buildSeqTable failed"); ip +=3D mlhSize; } @@ -658,7 +734,6 @@ typedef struct { size_t litLength; size_t matchLength; size_t offset; - const BYTE* match; } seq_t; =20 typedef struct { @@ -672,9 +747,6 @@ typedef struct { ZSTD_fseState stateOffb; ZSTD_fseState stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* prefixStart; - const BYTE* dictEnd; - size_t pos; } seqState_t; =20 /*! ZSTD_overlapCopy8() : @@ -717,7 +789,7 @@ HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE cons= t** ip, size_t offset) { * - ZSTD_overlap_src_before_dst: The src and dst may overlap and = may be any distance apart. * The src buffer must be before the dst buffer. 
*/ -static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, pt= rdiff_t length, ZSTD_overlap_e ovtype) { +static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* = ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { ptrdiff_t const diff =3D op - ip; BYTE* const oend =3D op + length; =20 @@ -733,6 +805,7 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w,= BYTE const* ip, ptrdiff_ /* Copy 8 bytes and ensure the offset >=3D 8 when there can be ove= rlap. */ assert(length >=3D 8); ZSTD_overlapCopy8(&op, &ip, diff); + length -=3D 8; assert(op - ip >=3D 8); assert(op <=3D oend); } @@ -747,8 +820,31 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w= , BYTE const* ip, ptrdiff_ assert(oend > oend_w); ZSTD_wildcopy(op, ip, oend_w - op, ovtype); ip +=3D oend_w - op; - op =3D oend_w; + op +=3D oend_w - op; + } + /* Handle the leftovers. */ + while (op < oend) *op++ =3D *ip++; +} + +/* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-ove= rlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance = impact to the safecopy common case */ +static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t = length) { + ptrdiff_t const diff =3D op - ip; + BYTE* const oend =3D op + length; + + if (length < 8 || diff > -8) { + /* Handle short lengths, close overlaps, and dst not before src. */ + while (op < oend) *op++ =3D *ip++; + return; + } + + if (op <=3D oend - WILDCOPY_OVERLENGTH && diff < -WILDCOPY_VECLEN) { + ZSTD_wildcopy(op, ip, oend - WILDCOPY_OVERLENGTH - op, ZSTD_no_ove= rlap); + ip +=3D oend - WILDCOPY_OVERLENGTH - op; + op +=3D oend - WILDCOPY_OVERLENGTH - op; } + /* Handle the leftovers. 
*/ while (op < oend) *op++ =3D *ip++; } @@ -763,9 +859,9 @@ static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w,= BYTE const* ip, ptrdiff_ */ FORCE_NOINLINE size_t ZSTD_execSequenceEnd(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimi= t, - const BYTE* const prefixStart, const BYTE* con= st virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) { BYTE* const oLitEnd =3D op + sequence.litLength; size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; @@ -788,27 +884,76 @@ size_t ZSTD_execSequenceEnd(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { /* offset beyond prefix */ RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart)= , corruption_detected, ""); - match =3D dictEnd - (prefixStart-match); + match =3D dictEnd - (prefixStart - match); if (match + sequence.matchLength <=3D dictEnd) { ZSTD_memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; } /* span extDict & currentPrefixSegment */ { size_t const length1 =3D dictEnd - match; - ZSTD_memmove(oLitEnd, match, length1); - op =3D oLitEnd + length1; - sequence.matchLength -=3D length1; - match =3D prefixStart; - } } + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_sr= c_before_dst); + return sequenceLength; +} + +/* ZSTD_execSequenceEndSplitLitBuffer(): + * This version is intended to be used during instances where the litBuffe= r is still split. It is kept separate to avoid performance impact for the = good case. 
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) +{ + BYTE* const oLitEnd =3D op + sequence.litLength; + size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; + const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; + const BYTE* match =3D oLitEnd - sequence.offset; + + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall= , "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), cor= ruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + RETURN_ERROR_IF(op > *litPtr && op < *litPtr + sequence.litLength, dst= Size_tooSmall, "output should not catch up to and overwrite literal buffer"= ); + ZSTD_safecopyDstBeforeSrc(op, *litPtr, sequence.litLength); + op =3D oLitEnd; + *litPtr =3D iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart)= , corruption_detected, ""); + match =3D dictEnd - (prefixStart - match); + if (match + sequence.matchLength <=3D dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 =3D dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_sr= c_before_dst); return sequenceLength; } =20 HINT_INLINE size_t ZSTD_execSequence(BYTE* op, - BYTE* 
const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const prefixStart, const BYTE* const = virtualStart, const BYTE* const dictEnd) + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) { BYTE* const oLitEnd =3D op + sequence.litLength; size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; @@ -817,6 +962,98 @@ size_t ZSTD_execSequence(BYTE* op, const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; const BYTE* match =3D oLitEnd - sequence.offset; =20 + assert(op !=3D NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_O= VERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, = prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <=3D oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <=3D oend /* No underflow */); + assert(iLitEnd <=3D litLimit /* Literal length is in bounds */); + assert(oLitEnd <=3D oend_w /* Can wildcopy literals */); + assert(oMatchEnd <=3D oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <=3D 16 since it is nearly always true. +1.6% o= n gcc-9. + * We likely don't need the full 32-byte wildcopy. 
+ */ + assert(WILDCOPY_OVERLENGTH >=3D 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op + 16, (*litPtr) + 16, sequence.litLength - 16, ZS= TD_no_overlap); + } + op =3D oLitEnd; + *litPtr =3D iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virt= ualStart)), corruption_detected, ""); + match =3D dictEnd + (match - prefixStart); + if (match + sequence.matchLength <=3D dictEnd) { + ZSTD_memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 =3D dictEnd - match; + ZSTD_memmove(oLitEnd, match, length1); + op =3D oLitEnd + length1; + sequence.matchLength -=3D length1; + match =3D prefixStart; + } + } + /* Match within prefix of 1 or more bytes */ + assert(op <=3D oMatchEnd); + assert(oMatchEnd <=3D oend_w); + assert(match >=3D prefixStart); + assert(sequence.matchLength >=3D 1); + + /* Nearly all offsets are >=3D WILDCOPY_VECLEN bytes, which means we c= an use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >=3D WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches = to be + * longer than literals (in general). In silesia, ~10% of matches = are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_= overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >=3D 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. 
= */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength - 8, ZSTD= _overlap_src_before_dst); + } + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const B= YTE* const dictEnd) +{ + BYTE* const oLitEnd =3D op + sequence.litLength; + size_t const sequenceLength =3D sequence.litLength + sequence.matchLen= gth; + BYTE* const oMatchEnd =3D op + sequenceLength; /* risk : address spa= ce overflow (32-bits) */ + const BYTE* const iLitEnd =3D *litPtr + sequence.litLength; + const BYTE* match =3D oLitEnd - sequence.offset; + assert(op !=3D NULL /* Precondition */); assert(oend_w < oend /* No underflow */); /* Handle edge cases in a slow path: @@ -828,7 +1065,7 @@ size_t ZSTD_execSequence(BYTE* op, iLitEnd > litLimit || oMatchEnd > oend_w || (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCO= PY_OVERLENGTH))) - return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, = prefixStart, virtualStart, dictEnd); + return ZSTD_execSequenceEndSplitLitBuffer(op, oend, oend_w, sequen= ce, litPtr, litLimit, prefixStart, virtualStart, dictEnd); =20 /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ assert(op <=3D oLitEnd /* No overflow */); @@ -896,6 +1133,7 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } =20 + static void ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZST= D_seqSymbol* dt) { @@ -909,20 +1147,10 @@ ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStr= eam_t* bitD, const ZSTD_seqS } =20 FORCE_INLINE_TEMPLATE void -ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) -{ - ZSTD_seqSymbol const DInfo =3D DStatePtr->table[DStatePtr->state]; - U32 const nbBits =3D DInfo.nbBits; - size_t const lowBits =3D 
BIT_readBits(bitD, nbBits); - DStatePtr->state =3D DInfo.nextState + lowBits; -} - -FORCE_INLINE_TEMPLATE void -ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD= , ZSTD_seqSymbol const DInfo) +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD= , U16 nextState, U32 nbBits) { - U32 const nbBits =3D DInfo.nbBits; size_t const lowBits =3D BIT_readBits(bitD, nbBits); - DStatePtr->state =3D DInfo.nextState + lowBits; + DStatePtr->state =3D nextState + lowBits; } =20 /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the max= imum @@ -936,116 +1164,105 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DState= Ptr, BIT_DStream_t* bitD, ZSTD : 0) =20 typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=3D1 } ZSTD_lo= ngOffset_e; -typedef enum { ZSTD_p_noPrefetch=3D0, ZSTD_p_prefetch=3D1 } ZSTD_prefetch_= e; =20 FORCE_INLINE_TEMPLATE seq_t -ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffs= ets, const ZSTD_prefetch_e prefetch) +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffs= ets) { seq_t seq; - ZSTD_seqSymbol const llDInfo =3D seqState->stateLL.table[seqState->sta= teLL.state]; - ZSTD_seqSymbol const mlDInfo =3D seqState->stateML.table[seqState->sta= teML.state]; - ZSTD_seqSymbol const ofDInfo =3D seqState->stateOffb.table[seqState->s= tateOffb.state]; - U32 const llBase =3D llDInfo.baseValue; - U32 const mlBase =3D mlDInfo.baseValue; - U32 const ofBase =3D ofDInfo.baseValue; - BYTE const llBits =3D llDInfo.nbAdditionalBits; - BYTE const mlBits =3D mlDInfo.nbAdditionalBits; - BYTE const ofBits =3D ofDInfo.nbAdditionalBits; - BYTE const totalBits =3D llBits+mlBits+ofBits; - - /* sequence */ - { size_t offset; - if (ofBits > 1) { - ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset =3D=3D 1); - ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 =3D=3D 5); - assert(ofBits <=3D MaxOff); - if (MEM_32bits() && longOffsets && (ofBits >=3D STREAM_ACCUMUL= ATOR_MIN_32)) { 
- U32 const extraBits =3D ofBits - MIN(ofBits, 32 - seqState= ->DStream.bitsConsumed); - offset =3D ofBase + (BIT_readBitsFast(&seqState->DStream, = ofBits - extraBits) << extraBits); - BIT_reloadDStream(&seqState->DStream); - if (extraBits) offset +=3D BIT_readBitsFast(&seqState->DSt= ream, extraBits); - assert(extraBits <=3D LONG_OFFSETS_MAX_EXTRA_BITS_32); /= * to avoid another reload */ - } else { - offset =3D ofBase + BIT_readBitsFast(&seqState->DStream, o= fBits/*>0*/); /* <=3D (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - seqState->prevOffset[2] =3D seqState->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset; - } else { - U32 const ll0 =3D (llBase =3D=3D 0); - if (LIKELY((ofBits =3D=3D 0))) { - if (LIKELY(!ll0)) - offset =3D seqState->prevOffset[0]; - else { - offset =3D seqState->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset; + const ZSTD_seqSymbol* const llDInfo =3D seqState->stateLL.table + seqS= tate->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo =3D seqState->stateML.table + seqS= tate->stateML.state; + const ZSTD_seqSymbol* const ofDInfo =3D seqState->stateOffb.table + se= qState->stateOffb.state; + seq.matchLength =3D mlDInfo->baseValue; + seq.litLength =3D llDInfo->baseValue; + { U32 const ofBase =3D ofDInfo->baseValue; + BYTE const llBits =3D llDInfo->nbAdditionalBits; + BYTE const mlBits =3D mlDInfo->nbAdditionalBits; + BYTE const ofBits =3D ofDInfo->nbAdditionalBits; + BYTE const totalBits =3D llBits+mlBits+ofBits; + + U16 const llNext =3D llDInfo->nextState; + U16 const mlNext =3D mlDInfo->nextState; + U16 const ofNext =3D ofDInfo->nextState; + U32 const llnbBits =3D llDInfo->nbBits; + U32 const mlnbBits =3D mlDInfo->nbBits; + U32 const ofnbBits =3D ofDInfo->nbBits; + /* + * As gcc has better branch and block analyzers, sometimes it is o= nly + * valuable to mark likelyness 
for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; + #if defined(__clang__) + if (LIKELY(ofBits > 1)) { + #else + if (ofBits > 1) { + #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset =3D=3D 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 =3D=3D 5= ); + assert(ofBits <=3D MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >=3D STREAM_ACC= UMULATOR_MIN_32)) { + U32 const extraBits =3D ofBits - MIN(ofBits, 32 - seqS= tate->DStream.bitsConsumed); + offset =3D ofBase + (BIT_readBitsFast(&seqState->DStre= am, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset +=3D BIT_readBitsFast(&seqState-= >DStream, extraBits); + assert(extraBits <=3D LONG_OFFSETS_MAX_EXTRA_BITS_32);= /* to avoid another reload */ + } else { + offset =3D ofBase + BIT_readBitsFast(&seqState->DStrea= m, ofBits/*>0*/); /* <=3D (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream= ); } + seqState->prevOffset[2] =3D seqState->prevOffset[1]; + seqState->prevOffset[1] =3D seqState->prevOffset[0]; + seqState->prevOffset[0] =3D offset; } else { - offset =3D ofBase + ll0 + BIT_readBitsFast(&seqState->DStr= eam, 1); - { size_t temp =3D (offset=3D=3D3) ? 
seqState->prevOffset= [0] - 1 : seqState->prevOffset[offset]; - temp +=3D !temp; /* 0 is not valid; input is corrupt= ed; force offset to 1 */ - if (offset !=3D 1) seqState->prevOffset[2] =3D seqStat= e->prevOffset[1]; - seqState->prevOffset[1] =3D seqState->prevOffset[0]; - seqState->prevOffset[0] =3D offset =3D temp; - } } } - seq.offset =3D offset; - } - - seq.matchLength =3D mlBase; - if (mlBits > 0) - seq.matchLength +=3D BIT_readBitsFast(&seqState->DStream, mlBits/*= >0*/); - - if (MEM_32bits() && (mlBits+llBits >=3D STREAM_ACCUMULATOR_MIN_32-LONG= _OFFSETS_MAX_EXTRA_BITS_32)) - BIT_reloadDStream(&seqState->DStream); - if (MEM_64bits() && UNLIKELY(totalBits >=3D STREAM_ACCUMULATOR_MIN_64-= (LLFSELog+MLFSELog+OffFSELog))) - BIT_reloadDStream(&seqState->DStream); - /* Ensure there are enough bits to read the rest of data in 64-bit mod= e. */ - ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR= _MIN_64); - - seq.litLength =3D llBase; - if (llBits > 0) - seq.litLength +=3D BIT_readBitsFast(&seqState->DStream, llBits/*>0= */); - - if (MEM_32bits()) - BIT_reloadDStream(&seqState->DStream); - - DEBUGLOG(6, "seq: litL=3D%u, matchL=3D%u, offset=3D%u", - (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); - - if (prefetch =3D=3D ZSTD_p_prefetch) { - size_t const pos =3D seqState->pos + seq.litLength; - const BYTE* const matchBase =3D (seq.offset > pos) ? seqState->dic= tEnd : seqState->prefixStart; - seq.match =3D matchBase + pos - seq.offset; /* note : this operat= ion can overflow when seq.offset is really too large, which can only happen= when input is corrupted. - * No consequence thoug= h : no memory access will occur, offset is only used for prefetching */ - seqState->pos =3D pos + seq.matchLength; - } - - /* ANS state update - * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). - * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). 
- * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the - * better option, so it is the default for other compilers. But, if you - * measure that it is worse, please put up a pull request. - */ - { -#if !defined(__clang__) - const int kUseUpdateFseState =3D 1; -#else - const int kUseUpdateFseState =3D 0; -#endif - if (kUseUpdateFseState) { - ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); = /* <=3D 9 bits */ - ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); = /* <=3D 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /*= <=3D 18 bits */ - ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream);= /* <=3D 8 bits */ - } else { - ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DS= tream, llDInfo); /* <=3D 9 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DS= tream, mlDInfo); /* <=3D 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /*= <=3D 18 bits */ - ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->= DStream, ofDInfo); /* <=3D 8 bits */ + U32 const ll0 =3D (llDInfo->baseValue =3D=3D 0); + if (LIKELY((ofBits =3D=3D 0))) { + offset =3D seqState->prevOffset[ll0]; + seqState->prevOffset[1] =3D seqState->prevOffset[!ll0]; + seqState->prevOffset[0] =3D offset; + } else { + offset =3D ofBase + ll0 + BIT_readBitsFast(&seqState->= DStream, 1); + { size_t temp =3D (offset=3D=3D3) ? 
seqState->prevOf= fset[0] - 1 : seqState->prevOffset[offset]; + temp +=3D !temp; /* 0 is not valid; input is cor= rupted; force offset to 1 */ + if (offset !=3D 1) seqState->prevOffset[2] =3D seq= State->prevOffset[1]; + seqState->prevOffset[1] =3D seqState->prevOffset[0= ]; + seqState->prevOffset[0] =3D offset =3D temp; + } } } + seq.offset =3D offset; } + + #if defined(__clang__) + if (UNLIKELY(mlBits > 0)) + #else + if (mlBits > 0) + #endif + seq.matchLength +=3D BIT_readBitsFast(&seqState->DStream, mlBi= ts/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >=3D STREAM_ACCUMULATOR_MIN_32-= LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >=3D STREAM_ACCUMULATOR_MIN= _64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit= mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMUL= ATOR_MIN_64); + + #if defined(__clang__) + if (UNLIKELY(llBits > 0)) + #else + if (llBits > 0) + #endif + seq.litLength +=3D BIT_readBitsFast(&seqState->DStream, llBits= /*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=3D%u, matchL=3D%u, offset=3D%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.off= set); + + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStrea= m, llNext, llnbBits); /* <=3D 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStrea= m, mlNext, mlnbBits); /* <=3D 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= =3D 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStr= eam, ofNext, ofnbBits); /* <=3D 8 bits */ } =20 return seq; @@ -1098,9 +1315,11 @@ MEM_STATIC void ZSTD_assertValidSequence( #endif =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + + FORCE_INLINE_TEMPLATE size_t DONT_VECTORIZE -ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, 
+ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, const ZSTD_longOffset_e isLongOffset, @@ -1112,17 +1331,16 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BYTE* const oend =3D ostart + maxDstSize; BYTE* op =3D ostart; const BYTE* litPtr =3D dctx->litPtr; - const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* litBufferEnd =3D dctx->litBufferEnd; const BYTE* const prefixStart =3D (const BYTE*) (dctx->prefixStart); const BYTE* const vBase =3D (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd =3D (const BYTE*) (dctx->dictEnd); - DEBUGLOG(5, "ZSTD_decompressSequences_body"); + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); (void)frame; =20 /* Regen sequences */ if (nbSeq) { seqState_t seqState; - size_t error =3D 0; dctx->fseEntropy =3D 1; { U32 i; for (i=3D0; ientropy.rep[i]; } RETURN_ERROR_IF( @@ -1138,70 +1356,255 @@ ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, BIT_DStream_endOfBuffer < BIT_DStream_completed && BIT_DStream_completed < BIT_DStream_overflow); =20 + /* decompress without overrunning litPtr begins */ + { + seq_t sequence =3D ZSTD_decodeSequence(&seqState, isLongOffset= ); + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% = decompression + * speed swings based on the alignment of the decompression= loop. This + * performance swing is caused by parts of the decompressio= n loop falling + * out of the DSB. The entire decompression loop should fit= in the DSB, + * when it can't we get much worse performance. You can mea= sure if you've + * hit the good case or the bad case with this perf command= for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cyc= les_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -t= q test.zst + * + * If you see most cycles served out of the MITE you've hit= the bad case. 
+ * If you see most cycles served out of the DSB you've hit = the good case. + * If it is pretty even then you may be in an okay case. + * + * This issue has been reproduced on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel = Core i9 + * Use Instruments->Counters to get DSB/MITE = cycles. + * I never got performance swings, but I was = able to + * go from the good case of mostly DSB to hal= f of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * - Coffeelake: Intel i7-9700k + * + * I haven't been able to reproduce the instability or DSB = misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * Alignment is done for each of the three major decompress= ion loops: + * - ZSTD_decompressSequences_bodySplitLitBuffer - prespl= it section of the literal buffer + * - ZSTD_decompressSequences_bodySplitLitBuffer - postsp= lit section of the literal buffer + * - ZSTD_decompressSequences_body + * Alignment choices are made to minimize large swings on b= ad cases and influence on performance + * from changes external to this code, rather than to overo= ptimize on the current commit. + * + * If you are seeing performance stability this script can = help test. + * It tests on 4 commits in zstd where I saw performance ch= ange. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99= 351564473f4 + */ #if defined(__x86_64__) - /* Align the decompression loop to 32 + 16 bytes. - * - * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompr= ession - * speed swings based on the alignment of the decompression loop. = This - * performance swing is caused by parts of the decompression loop = falling - * out of the DSB. The entire decompression loop should fit in the= DSB, - * when it can't we get much worse performance. 
You can measure if= you've - * hit the good case or the bad case with this perf command for so= me - * compressed file test.zst: - * - * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any= _uops \ - * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.= zst - * - * If you see most cycles served out of the MITE you've hit the ba= d case. - * If you see most cycles served out of the DSB you've hit the goo= d case. - * If it is pretty even then you may be in an okay case. - * - * I've been able to reproduce this issue on the following CPUs: - * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 - * Use Instruments->Counters to get DSB/MITE cycles. - * I never got performance swings, but I was able to - * go from the good case of mostly DSB to half of the - * cycles served from MITE. - * - Coffeelake: Intel i9-9900k - * - * I haven't been able to reproduce the instability or DSB misses = on any - * of the following CPUS: - * - Haswell - * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH - * - Skylake - * - * If you are seeing performance stability this script can help te= st. - * It tests on 4 commits in zstd where I saw performance change. 
- * - * https://gist.github.com/terrelln/9889fc06a423fd5ca6e993515644= 73f4 - */ - __asm__(".p2align 5"); - __asm__("nop"); - __asm__(".p2align 4"); + __asm__(".p2align 6"); +# if __GNUC__ >=3D 7 + /* good for gcc-7, gcc-9, and gcc-11 */ + __asm__("nop"); + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +# if __GNUC__ =3D=3D 8 || __GNUC__ =3D=3D 10 + /* good for gcc-8 and gcc-10 */ + __asm__("nop"); + __asm__(".p2align 3"); +# endif +# endif +#endif + + /* Handle the initial state where litBuffer is currently split= between dst and litExtraBuffer */ + for (; litPtr + sequence.litLength <=3D dctx->litBufferEnd; ) { + size_t const oneSeqSize =3D ZSTD_execSequenceSplitLitBuffe= r(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &l= itPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ce, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqS= ize); + op +=3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + sequence =3D ZSTD_decodeSequence(&seqState, isLongOffset); + } + + /* If there are more sequences, they will need to read literal= s from litExtraBuffer; copy over the remainder from dst and update litPtr a= nd litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D 
ZSTD_not_in_dst; + { + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend= , sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, se= quence, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)one= SeqSize); + op +=3D oneSeqSize; + if (--nbSeq) + BIT_reloadDStream(&(seqState.DStream)); + } + } + } + + if (nbSeq > 0) /* there is remaining lit from extra buffer */ + { + +#if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ !=3D 7 + /* worse for gcc-7 better for gcc-8, gcc-9, and gcc-10 and cla= ng */ + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# elif __GNUC__ >=3D 11 + __asm__(".p2align 3"); +# else + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif +#endif + + for (; ; ) { + seq_t const sequence =3D ZSTD_decodeSequence(&seqState, is= LongOffset); + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend, se= quence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ce, prefixStart, vBase); +#endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqS= ize); + op +=3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) + break; + BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after de= code loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream= 
_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=3D0; ientropy.rep[i] =3D= (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + if (dctx->litBufferLocation =3D=3D ZSTD_split) /* split hasn't been r= eached yet, first get dst then copy litExtraBuffer */ + { + size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall= , ""); + if (op !=3D NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + } + { size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, = ""); + if (op !=3D NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + } + + return op-ostart; +} + +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip =3D (const BYTE*)seqStart; + const BYTE* const iend =3D ip + seqSize; + BYTE* const ostart =3D (BYTE*)dst; + BYTE* const oend =3D dctx->litBufferLocation =3D=3D ZSTD_not_in_dst ? 
= ostart + maxDstSize : dctx->litBuffer; + BYTE* op =3D ostart; + const BYTE* litPtr =3D dctx->litPtr; + const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* const prefixStart =3D (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase =3D (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd =3D (const BYTE*)(dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy =3D 1; + { U32 i; for (i =3D 0; i < ZSTD_REP_NUM; i++) seqState.prevOffset[= i] =3D dctx->entropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend - ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTp= tr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OF= Tptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTp= tr); + assert(dst !=3D NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +# if __GNUC__ >=3D 7 + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 3"); +# else + __asm__(".p2align 4"); + __asm__("nop"); + __asm__(".p2align 3"); +# endif #endif + for ( ; ; ) { - seq_t const sequence =3D ZSTD_decodeSequence(&seqState, isLong= Offset, ZSTD_p_noPrefetch); + seq_t const sequence =3D ZSTD_decodeSequence(&seqState, isLong= Offset); size_t const oneSeqSize =3D ZSTD_execSequence(op, oend, sequen= ce, &litPtr, litEnd, prefixStart, vBase, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) assert(!ZSTD_isError(oneSeqSize)); if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, = prefixStart, vBase); #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; 
DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); - BIT_reloadDStream(&(seqState.DStream)); op +=3D oneSeqSize; - /* gcc and clang both don't like early returns in this loop. - * Instead break and check for an error at the end of the loop. - */ - if (UNLIKELY(ZSTD_isError(oneSeqSize))) { - error =3D oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; - } - if (UNLIKELY(!--nbSeq)) break; + BIT_reloadDStream(&(seqState.DStream)); } =20 /* check if reached exact end */ DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, rem= aining nbSeq : %i", nbSeq); - if (ZSTD_isError(error)) return error; RETURN_ERROR_IF(nbSeq, corruption_detected, ""); RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream= _completed, corruption_detected, ""); /* save reps for next block */ @@ -1229,9 +1632,37 @@ ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, = seqSize, nbSeq, isLongOffset, frame); } + +static size_t +ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSiz= e, + const void* seqStart, size_t seqS= ize, int nbSeq, + const ZSTD_longOffset_e isLongOff= set, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSi= ze, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +FORCE_INLINE_TEMPLATE size_t +ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEn= d) +{ + prefetchPos +=3D sequence.litLength; + { const BYTE* const matchBase =3D (sequence.offset > prefetchPos) ? = dictEnd : prefixStart; + const BYTE* const match =3D matchBase + prefetchPos - sequence.off= set; /* note : this operation can overflow when seq.offset is really too la= rge, which can only happen when input is corrupted. 
+ = * No consequence though : memory address is only used for prefetching, = not for dereferencing */ + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note := it's safe to invoke PREFETCH() on any memory address, including invalid on= es */ + } + return prefetchPos + sequence.matchLength; +} + +/* This decoding function employs prefetching + * to reduce latency impact of cache misses. + * It's generally employed when block contains a significant portion of lo= ng-distance matches + * or when coupled with a "cold" dictionary */ FORCE_INLINE_TEMPLATE size_t ZSTD_decompressSequencesLong_body( ZSTD_DCtx* dctx, @@ -1243,10 +1674,10 @@ ZSTD_decompressSequencesLong_body( const BYTE* ip =3D (const BYTE*)seqStart; const BYTE* const iend =3D ip + seqSize; BYTE* const ostart =3D (BYTE*)dst; - BYTE* const oend =3D ostart + maxDstSize; + BYTE* const oend =3D dctx->litBufferLocation =3D=3D ZSTD_in_dst ? dctx= ->litBuffer : ostart + maxDstSize; BYTE* op =3D ostart; const BYTE* litPtr =3D dctx->litPtr; - const BYTE* const litEnd =3D litPtr + dctx->litSize; + const BYTE* litBufferEnd =3D dctx->litBufferEnd; const BYTE* const prefixStart =3D (const BYTE*) (dctx->prefixStart); const BYTE* const dictStart =3D (const BYTE*) (dctx->virtualStart); const BYTE* const dictEnd =3D (const BYTE*) (dctx->dictEnd); @@ -1254,18 +1685,17 @@ ZSTD_decompressSequencesLong_body( =20 /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 4 +#define STORED_SEQS 8 #define STORED_SEQS_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 +#define ADVANCED_SEQS STORED_SEQS seq_t sequences[STORED_SEQS]; int const seqAdvance =3D MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; int seqNb; + size_t prefetchPos =3D (size_t)(op-prefixStart); /* track position= relative to prefixStart */ + dctx->fseEntropy =3D 1; { int i; for (i=3D0; ientropy.rep[i]; } - seqState.prefixStart =3D prefixStart; - seqState.pos =3D (size_t)(op-prefixStart); - seqState.dictEnd =3D dictEnd; assert(dst !=3D NULL); assert(iend >=3D 
ip); RETURN_ERROR_IF( @@ -1277,36 +1707,100 @@ ZSTD_decompressSequencesLong_body( =20 /* prepare in advance */ for (seqNb=3D0; (BIT_reloadDStream(&seqState.DStream) <=3D BIT_DSt= ream_completed) && (seqNblitBufferLocation =3D=3D ZSTD_split && litPtr + sequ= ences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBuff= erEnd) + { + /* lit buffer is reaching split point, empty out the first= buffer and transition to litExtraBuffer */ + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].= litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + oneSeqSize =3D ZSTD_execSequence(op, oend, sequences[(seqN= b - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart,= dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) - assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[= (seqNb-ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - PREFETCH_L1(sequence.match); PREFETCH_L1(sequence.match + sequ= ence.matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memo= ry address, including invalid ones */ - sequences[seqNb & STORED_SEQS_MASK] =3D sequence; - op +=3D oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos =3D ZSTD_prefetchMatch(prefetchPos, sequence, = 
prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] =3D sequence; + op +=3D oneSeqSize; + } + else + { + /* lit buffer is either wholly contained in first or secon= d split, or not split at all*/ + oneSeqSize =3D dctx->litBufferLocation =3D=3D ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + seq= uences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVE= RLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, li= tBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCE= D_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart,= dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + + prefetchPos =3D ZSTD_prefetchMatch(prefetchPos, sequence, = prefixStart, dictEnd); + sequences[seqNb & STORED_SEQS_MASK] =3D sequence; + op +=3D oneSeqSize; + } } RETURN_ERROR_IF(seqNblitBufferLocation =3D=3D ZSTD_split && litPtr + sequ= ence->litLength > dctx->litBufferEnd) + { + const size_t leftoverLit =3D dctx->litBufferEnd - litPtr; + if (leftoverLit) + { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dst= Size_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -=3D leftoverLit; + op +=3D leftoverLit; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTR= ASIZE; + dctx->litBufferLocation =3D ZSTD_not_in_dst; + { + size_t const oneSeqSize =3D ZSTD_execSequence(op, oend= , *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) - 
assert(!ZSTD_isError(oneSeqSize)); - if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[= seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, se= quences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); #endif - if (ZSTD_isError(oneSeqSize)) return oneSeqSize; - op +=3D oneSeqSize; + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op +=3D oneSeqSize; + } + } + else + { + size_t const oneSeqSize =3D dctx->litBufferLocation =3D=3D= ZSTD_split ? + ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + seq= uence->litLength - WILDCOPY_OVERLENGTH, *sequence, &litPtr, litBufferEnd, p= refixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBuf= ferEnd, prefixStart, dictStart, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_A= SSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequen= ces[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); +#endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op +=3D oneSeqSize; + } } =20 /* save reps for next block */ @@ -1314,10 +1808,21 @@ ZSTD_decompressSequencesLong_body( } =20 /* last literal segment */ - { size_t const lastLLSize =3D litEnd - litPtr; + if (dctx->litBufferLocation =3D=3D ZSTD_split) /* first deplete liter= al buffer in dst, then copy litExtraBuffer */ + { + size_t const lastLLSize =3D litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall= , ""); + if (op !=3D NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); + op +=3D lastLLSize; + } + litPtr =3D dctx->litExtraBuffer; + litBufferEnd =3D dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + } + { size_t const lastLLSize =3D litBufferEnd - litPtr; RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, = ""); if (op !=3D NULL) { - ZSTD_memcpy(op, litPtr, lastLLSize); + ZSTD_memmove(op, litPtr, 
lastLLSize); op +=3D lastLLSize; } } @@ -1341,7 +1846,7 @@ ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, #if DYNAMIC_BMI2 =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t DONT_VECTORIZE ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1351,10 +1856,20 @@ ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, { return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, = seqSize, nbSeq, isLongOffset, frame); } +static BMI2_TARGET_ATTRIBUTE size_t +DONT_VECTORIZE +ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSi= ze, seqStart, seqSize, nbSeq, isLongOffset, frame); +} #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT -static TARGET_ATTRIBUTE("bmi2") size_t +static BMI2_TARGET_ATTRIBUTE size_t ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, const void* seqStart, size_t seqSize, int nbSeq, @@ -1383,11 +1898,25 @@ ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst= , size_t maxDstSize, { DEBUGLOG(5, "ZSTD_decompressSequences"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqSta= rt, seqSize, nbSeq, isLongOffset, frame); } #endif - return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart,= seqSize, nbSeq, isLongOffset, frame); + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStar= t, seqSize, nbSeq, isLongOffset, frame); +} +static size_t +ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t = maxDstSize, + const void* seqStart, size_t seqSize, int= nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int 
frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); +#if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { + return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxD= stSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDs= tSize, seqStart, seqSize, nbSeq, isLongOffset, frame); } #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ =20 @@ -1407,7 +1936,7 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, { DEBUGLOG(5, "ZSTD_decompressSequencesLong"); #if DYNAMIC_BMI2 - if (dctx->bmi2) { + if (ZSTD_DCtx_get_bmi2(dctx)) { return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, se= qStart, seqSize, nbSeq, isLongOffset, frame); } #endif @@ -1448,7 +1977,7 @@ ZSTD_getLongOffsetsShare(const ZSTD_seqSymbol* offTab= le) size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame) + const void* src, size_t srcSize, const int frame, = const streaming_operation streaming) { /* blockType =3D=3D blockCompressed */ const BYTE* ip =3D (const BYTE*)src; /* isLongOffset must be true if there are long offsets. 
@@ -1463,7 +1992,7 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, RETURN_ERROR_IF(srcSize >=3D ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); =20 /* Decode literals section */ - { size_t const litCSize =3D ZSTD_decodeLiteralsBlock(dctx, src, srcS= ize); + { size_t const litCSize =3D ZSTD_decodeLiteralsBlock(dctx, src, srcS= ize, dst, dstCapacity, streaming); DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); if (ZSTD_isError(litCSize)) return litCSize; ip +=3D litCSize; @@ -1511,7 +2040,10 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, =20 #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG /* else */ - return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSiz= e, nbSeq, isLongOffset, frame); + if (dctx->litBufferLocation =3D=3D ZSTD_split) + return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCa= pacity, ip, srcSize, nbSeq, isLongOffset, frame); + else + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, sr= cSize, nbSeq, isLongOffset, frame); #endif } } @@ -1534,7 +2066,7 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, { size_t dSize; ZSTD_checkContinuity(dctx, dst, dstCapacity); - dSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, s= rcSize, /* frame */ 0); + dSize =3D ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, s= rcSize, /* frame */ 0, not_streaming); dctx->previousDstEnd =3D (char*)dst + dSize; return dSize; } diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompr= ess/zstd_decompress_block.h index e7f5f6689459..3d2d57a5d25a 100644 --- a/lib/zstd/decompress/zstd_decompress_block.h +++ b/lib/zstd/decompress/zstd_decompress_block.h @@ -33,6 +33,12 @@ */ =20 =20 + /* Streaming state is used to inform allocation of the literal buffer */ +typedef enum { + not_streaming =3D 0, + is_streaming =3D 1 +} streaming_operation; + /* ZSTD_decompressBlock_internal() : * decompress block, starting at `src`, * into destination buffer `dst`. 
@@ -41,7 +47,7 @@ */ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, - const void* src, size_t srcSize, const int frame); + const void* src, size_t srcSize, const int frame,= const streaming_operation streaming); =20 /* ZSTD_buildFSETable() : * generate FSE decoding table for one symbol (ll, ml or off) @@ -54,7 +60,7 @@ size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, */ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, const short* normalizedCounter, unsigned maxSymbolValue, - const U32* baseValue, const U32* nbAdditionalBits, + const U32* baseValue, const U8* nbAdditionalBits, unsigned tableLog, void* wksp, size_t wkspSize, int bmi2); =20 diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/deco= mpress/zstd_decompress_internal.h index 4b9052f68755..98102edb6a83 100644 --- a/lib/zstd/decompress/zstd_decompress_internal.h +++ b/lib/zstd/decompress/zstd_decompress_internal.h @@ -20,7 +20,7 @@ * Dependencies *********************************************************/ #include "../common/mem.h" /* BYTE, U16, U32 */ -#include "../common/zstd_internal.h" /* ZSTD_seqSymbol */ +#include "../common/zstd_internal.h" /* constants : MaxLL, MaxML, MaxOff= , LLFSELog, etc. 
*/ =20 =20 =20 @@ -40,7 +40,7 @@ static UNUSED_ATTR const U32 OF_base[MaxOff+1] =3D { 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3= FFFFD, 0x7FFFFD, 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1= FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; =20 -static UNUSED_ATTR const U32 OF_bits[MaxOff+1] =3D { +static UNUSED_ATTR const U8 OF_bits[MaxOff+1] =3D { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -106,6 +106,22 @@ typedef struct { size_t ddictPtrCount; } ZSTD_DDictHashSet; =20 +#ifndef ZSTD_DECODER_INTERNAL_BUFFER +# define ZSTD_DECODER_INTERNAL_BUFFER (1 << 16) +#endif + +#define ZSTD_LBMIN 64 +#define ZSTD_LBMAX (128 << 10) + +/* extra buffer, compensates when dst is not large enough to store litBuff= er */ +#define ZSTD_LITBUFFEREXTRASIZE BOUNDED(ZSTD_LBMIN, ZSTD_DECODER_INTERNAL= _BUFFER, ZSTD_LBMAX) + +typedef enum { + ZSTD_not_in_dst =3D 0, /* Stored entirely within litExtraBuffer */ + ZSTD_in_dst =3D 1, /* Stored entirely within dst (in memory = after current output write) */ + ZSTD_split =3D 2 /* Split between litExtraBuffer and dst */ +} ZSTD_litLocation_e; + struct ZSTD_DCtx_s { const ZSTD_seqSymbol* LLTptr; @@ -136,7 +152,9 @@ struct ZSTD_DCtx_s size_t litSize; size_t rleSize; size_t staticSize; +#if DYNAMIC_BMI2 !=3D 0 int bmi2; /* =3D=3D 1 if the CPU supports BMI2 and= 0 otherwise. CPU support is determined dynamically once per context lifeti= me. 
*/ +#endif =20 /* dictionary */ ZSTD_DDict* ddictLocal; @@ -158,16 +176,16 @@ struct ZSTD_DCtx_s size_t outStart; size_t outEnd; size_t lhSize; - void* legacyContext; - U32 previousLegacyVersion; - U32 legacyVersion; U32 hostageByte; int noForwardProgress; ZSTD_bufferMode_e outBufferMode; ZSTD_outBuffer expectedOutBuffer; =20 /* workspace */ - BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE* litBuffer; + const BYTE* litBufferEnd; + ZSTD_litLocation_e litBufferLocation; + BYTE litExtraBuffer[ZSTD_LITBUFFEREXTRASIZE + WILDCOPY_OVERLENGTH]; /*= literal buffer can be split between storage within dst and within this scr= atch buffer */ BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; =20 size_t oversizedDuration; @@ -180,6 +198,14 @@ struct ZSTD_DCtx_s /* Tracing */ }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ =20 +MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +#if DYNAMIC_BMI2 !=3D 0 + return dctx->bmi2; +#else + (void)dctx; + return 0; +#endif +} =20 /*-******************************************************* * Shared internal functions diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h index 0fbec508f285..a06ca187aab5 100644 --- a/lib/zstd/decompress_sources.h +++ b/lib/zstd/decompress_sources.h @@ -16,6 +16,12 @@ * decompression. */ =20 +/* + * Disable the ASM Huffman implementation because we need to + * include all the sources. 
+ */ +#define ZSTD_DISABLE_ASM 1 + #include "common/debug.c" #include "common/entropy_common.c" #include "common/error_private.c" diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_modul= e.c index 65548a4bb934..04e1b5c01d9b 100644 --- a/lib/zstd/zstd_compress_module.c +++ b/lib/zstd/zstd_compress_module.c @@ -133,7 +133,11 @@ EXPORT_SYMBOL(zstd_init_cstream); size_t zstd_reset_cstream(zstd_cstream *cstream, unsigned long long pledged_src_size) { - return ZSTD_resetCStream(cstream, pledged_src_size); + if (pledged_src_size =3D=3D 0) + pledged_src_size =3D ZSTD_CONTENTSIZE_UNKNOWN; + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_reset(cstream, ZSTD_reset_session_only) ); + ZSTD_FORWARD_IF_ERR( ZSTD_CCtx_setPledgedSrcSize(cstream, pledged_src_siz= e) ); + return 0; } EXPORT_SYMBOL(zstd_reset_cstream); =20 --=20 2.38.1