From c70605e672f819d3d11fd06cd973b075c71b8995 Mon Sep 17 00:00:00 2001 From: Ziemas Date: Tue, 29 Dec 2020 15:23:11 +0100 Subject: [PATCH] libdeflate: delete libdeflate --- DobieStation/libdeflate/libdeflate.pro | 67 - DobieStation/libdeflate/libdeflate.vcxproj | 94 - .../libdeflate/libdeflate.vcxproj.filters | 126 - ext/libdeflate/CMakeLists.txt | 84 - ext/libdeflate/COPYING | 21 - ext/libdeflate/common/common_defs.h | 366 --- ext/libdeflate/common/compiler_gcc.h | 159 - ext/libdeflate/common/compiler_msc.h | 96 - ext/libdeflate/lib/adler32.c | 130 - ext/libdeflate/lib/adler32_vec_template.h | 124 - ext/libdeflate/lib/aligned_malloc.c | 57 - ext/libdeflate/lib/aligned_malloc.h | 13 - ext/libdeflate/lib/arm/adler32_impl.h | 120 - ext/libdeflate/lib/arm/cpu_features.c | 119 - ext/libdeflate/lib/arm/cpu_features.h | 37 - ext/libdeflate/lib/arm/crc32_impl.h | 166 - ext/libdeflate/lib/arm/matchfinder_impl.h | 93 - ext/libdeflate/lib/bt_matchfinder.h | 355 --- ext/libdeflate/lib/crc32.c | 313 -- ext/libdeflate/lib/crc32_table.h | 526 --- ext/libdeflate/lib/crc32_vec_template.h | 61 - ext/libdeflate/lib/decompress_template.h | 421 --- ext/libdeflate/lib/deflate_compress.c | 2826 ----------------- ext/libdeflate/lib/deflate_compress.h | 14 - ext/libdeflate/lib/deflate_constants.h | 66 - ext/libdeflate/lib/deflate_decompress.c | 997 ------ ext/libdeflate/lib/gzip_compress.c | 95 - ext/libdeflate/lib/gzip_constants.h | 45 - ext/libdeflate/lib/gzip_decompress.c | 148 - ext/libdeflate/lib/hc_matchfinder.h | 403 --- ext/libdeflate/lib/lib_common.h | 35 - ext/libdeflate/lib/matchfinder_common.h | 168 - ext/libdeflate/lib/unaligned.h | 202 -- ext/libdeflate/lib/x86/adler32_impl.h | 332 -- ext/libdeflate/lib/x86/cpu_features.c | 139 - ext/libdeflate/lib/x86/cpu_features.h | 41 - ext/libdeflate/lib/x86/crc32_impl.h | 87 - .../lib/x86/crc32_pclmul_template.h | 262 -- ext/libdeflate/lib/x86/decompress_impl.h | 26 - ext/libdeflate/lib/x86/matchfinder_impl.h | 164 - ext/libdeflate/lib/zlib_compress.c | 87 - ext/libdeflate/lib/zlib_constants.h | 21 - ext/libdeflate/lib/zlib_decompress.c | 91 - ext/libdeflate/libdeflate.h | 323 -- 44 files changed, 10120 deletions(-) delete mode 100644 DobieStation/libdeflate/libdeflate.pro delete mode 100644 DobieStation/libdeflate/libdeflate.vcxproj delete mode 100644 DobieStation/libdeflate/libdeflate.vcxproj.filters delete mode 100644 ext/libdeflate/CMakeLists.txt delete mode 100644 ext/libdeflate/COPYING delete mode 100644 ext/libdeflate/common/common_defs.h delete mode 100644 ext/libdeflate/common/compiler_gcc.h delete mode 100644 ext/libdeflate/common/compiler_msc.h delete mode 100644 ext/libdeflate/lib/adler32.c delete mode 100644 ext/libdeflate/lib/adler32_vec_template.h delete mode 100644 ext/libdeflate/lib/aligned_malloc.c delete mode 100644 ext/libdeflate/lib/aligned_malloc.h delete mode 100644 ext/libdeflate/lib/arm/adler32_impl.h delete mode 100644 ext/libdeflate/lib/arm/cpu_features.c delete mode 100644 ext/libdeflate/lib/arm/cpu_features.h delete mode 100644 ext/libdeflate/lib/arm/crc32_impl.h delete mode 100644 ext/libdeflate/lib/arm/matchfinder_impl.h delete mode 100644 ext/libdeflate/lib/bt_matchfinder.h delete mode 100644 ext/libdeflate/lib/crc32.c delete mode 100644 ext/libdeflate/lib/crc32_table.h delete mode 100644 ext/libdeflate/lib/crc32_vec_template.h delete mode 100644 ext/libdeflate/lib/decompress_template.h delete mode 100644 ext/libdeflate/lib/deflate_compress.c delete mode 100644 ext/libdeflate/lib/deflate_compress.h delete mode 100644 
ext/libdeflate/lib/deflate_constants.h delete mode 100644 ext/libdeflate/lib/deflate_decompress.c delete mode 100644 ext/libdeflate/lib/gzip_compress.c delete mode 100644 ext/libdeflate/lib/gzip_constants.h delete mode 100644 ext/libdeflate/lib/gzip_decompress.c delete mode 100644 ext/libdeflate/lib/hc_matchfinder.h delete mode 100644 ext/libdeflate/lib/lib_common.h delete mode 100644 ext/libdeflate/lib/matchfinder_common.h delete mode 100644 ext/libdeflate/lib/unaligned.h delete mode 100644 ext/libdeflate/lib/x86/adler32_impl.h delete mode 100644 ext/libdeflate/lib/x86/cpu_features.c delete mode 100644 ext/libdeflate/lib/x86/cpu_features.h delete mode 100644 ext/libdeflate/lib/x86/crc32_impl.h delete mode 100644 ext/libdeflate/lib/x86/crc32_pclmul_template.h delete mode 100644 ext/libdeflate/lib/x86/decompress_impl.h delete mode 100644 ext/libdeflate/lib/x86/matchfinder_impl.h delete mode 100644 ext/libdeflate/lib/zlib_compress.c delete mode 100644 ext/libdeflate/lib/zlib_constants.h delete mode 100644 ext/libdeflate/lib/zlib_decompress.c delete mode 100644 ext/libdeflate/libdeflate.h diff --git a/DobieStation/libdeflate/libdeflate.pro b/DobieStation/libdeflate/libdeflate.pro deleted file mode 100644 index 63f79854..00000000 --- a/DobieStation/libdeflate/libdeflate.pro +++ /dev/null @@ -1,67 +0,0 @@ -win32:TARGET = libdeflate -else:TARGET = deflate - -TEMPLATE = lib -CONFIG += staticlib c99 - -win32-msvc: QMAKE_CFLAGS += /MD /O2 -else { - QMAKE_CFLAGS += -O2 \ - -fomit-frame-pointer \ - -Wall -Wundef \ - -Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla \ - -fvisibility=hidden -D_ANSI_SOURCE - - mingw: QMAKE_CFLAGS += -Wno-pedantic-ms-format -} - -INCLUDEPATH += \ - ../../ext/libdeflate \ - ../../ext/libdeflate/common - -HEADERS += \ -# common headers - ../../ext/libdeflate/libdeflate.h \ - ../../ext/libdeflate/common/common_defs.h \ - ../../ext/libdeflate/common/compiler_gcc.h \ - ../../ext/libdeflate/common/compiler_msc.h \ -# library headers - ../../ext/libdeflate/lib/adler32_vec_template.h \ - ../../ext/libdeflate/lib/aligned_malloc.h \ - ../../ext/libdeflate/lib/bt_matchfinder.h \ - ../../ext/libdeflate/lib/crc32_table.h \ - ../../ext/libdeflate/lib/crc32_vec_template.h \ - ../../ext/libdeflate/lib/decompress_template.h \ - ../../ext/libdeflate/lib/deflate_compress.h \ - ../../ext/libdeflate/lib/deflate_constants.h \ - ../../ext/libdeflate/lib/gzip_constants.h \ - ../../ext/libdeflate/lib/hc_matchfinder.h \ - ../../ext/libdeflate/lib/lib_common.h \ - ../../ext/libdeflate/lib/matchfinder_common.h \ - ../../ext/libdeflate/lib/unaligned.h \ - ../../ext/libdeflate/lib/zlib_constants.h \ - ../../ext/libdeflate/lib/arm/adler32_impl.h \ - ../../ext/libdeflate/lib/arm/cpu_features.h \ - ../../ext/libdeflate/lib/arm/crc32_impl.h \ - ../../ext/libdeflate/lib/arm/matchfinder_impl.h \ - ../../ext/libdeflate/lib/x86/adler32_impl.h \ - ../../ext/libdeflate/lib/x86/cpu_features.h \ - ../../ext/libdeflate/lib/x86/crc32_impl.h \ - ../../ext/libdeflate/lib/x86/crc32_pclmul_template.h \ - ../../ext/libdeflate/lib/x86/decompress_impl.h \ - ../../ext/libdeflate/lib/x86/matchfinder_impl.h - -SOURCES += \ - ../../ext/libdeflate/lib/aligned_malloc.c \ - ../../ext/libdeflate/lib/deflate_decompress.c \ -# uncomment for compression support - #../../ext/libdeflate/lib/deflate_compress.c \ -# uncomment for zlib format support - #../../ext/libdeflate/lib/adler32.c \ - #../../ext/libdeflate/lib/zlib_decompress.c \ - #../../ext/libdeflate/lib/zlib_compress.c \ -# uncomment for 
gzip support - #../../ext/libdeflate/lib/gzip_decompress.c \ - #../../ext/libdeflate/lib/gzip_compress.c \ - ../../ext/libdeflate/lib/arm/cpu_features.c \ - ../../ext/libdeflate/lib/x86/cpu_features.c diff --git a/DobieStation/libdeflate/libdeflate.vcxproj b/DobieStation/libdeflate/libdeflate.vcxproj deleted file mode 100644 index 2fbaac94..00000000 --- a/DobieStation/libdeflate/libdeflate.vcxproj +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - Release Optimized - x64 - - - Release - x64 - - - Devel - x64 - - - Debug - x64 - - - - - $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) - $(LatestTargetPlatformVersion) - $(WindowsTargetPlatformVersion) - - - - {A77564F4-56BB-3D08-8126-3FD5FC44F217} - - - - - - StaticLibrary - Unicode - v141 - - - - - - - - - - - $(ProjectName)$(Postfix) - - - - $(BinDir)\$(ProjectName)$(Postfix).lib - - - - - - 4127;%(DisableSpecificWarnings) - - - 4127;%(DisableSpecificWarnings) - - - 4127;4245;4100;4018;%(DisableSpecificWarnings) - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/DobieStation/libdeflate/libdeflate.vcxproj.filters b/DobieStation/libdeflate/libdeflate.vcxproj.filters deleted file mode 100644 index 1a760763..00000000 --- a/DobieStation/libdeflate/libdeflate.vcxproj.filters +++ /dev/null @@ -1,126 +0,0 @@ - - - - - {71ED8ED8-ACB9-4CE9-BBE1-E00B30144E11} - cpp;c;cxx;moc;h;def;odl;idl;res; - - - {71ED8ED8-ACB9-4CE9-BBE1-E00B30144E11} - cpp;c;cxx;moc;h;def;odl;idl;res; - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx - - - - - Source Files - - - Source Files - - - Source Files - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - \ No newline at end of file diff --git a/ext/libdeflate/CMakeLists.txt b/ext/libdeflate/CMakeLists.txt deleted file mode 100644 index d1c08983..00000000 --- a/ext/libdeflate/CMakeLists.txt +++ /dev/null @@ -1,84 +0,0 @@ -project(libdeflate C) -set(TARGET libdeflate) -set(CMAKE_C_STANDARD 99) - -if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR - ${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR - ${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang") - - set(FLAGS ${FLAGS} -fomit-frame-pointer) - set(FLAGS ${FLAGS} -Wall -Wundef) - set(FLAGS ${FLAGS} -Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla) - set(FLAGS ${FLAGS} -fvisibility=hidden -D_ANSI_SOURCE) - - if (MINGW) - set(FLAGS ${FLAGS} -Wno-pedantic-ms-format) - endif() -endif() - -set(COMMON_HEADERS - libdeflate.h - - common/common_defs.h - common/compiler_gcc.h - common/compiler_msc.h) - -set(LIB_HEADERS - lib/adler32_vec_template.h - lib/aligned_malloc.h - lib/bt_matchfinder.h - lib/crc32_table.h - lib/crc32_vec_template.h - lib/decompress_template.h - lib/deflate_compress.h - lib/deflate_constants.h - 
lib/gzip_constants.h - lib/hc_matchfinder.h - lib/lib_common.h - lib/matchfinder_common.h - lib/unaligned.h - lib/zlib_constants.h - - lib/arm/adler32_impl.h - lib/arm/cpu_features.h - lib/arm/crc32_impl.h - lib/arm/matchfinder_impl.h - - lib/x86/adler32_impl.h - lib/x86/cpu_features.h - lib/x86/crc32_impl.h - lib/x86/crc32_pclmul_template.h - lib/x86/decompress_impl.h - lib/x86/matchfinder_impl.h) - -set(LIB_SRC - lib/aligned_malloc.c - lib/deflate_decompress.c - - # uncomment for compression support - #lib/deflate_compress.c - - # uncomment for zlib format support - #lib/adler32.c - #lib/zlib_decompress.c - #lib/zlib_compress.c - - # uncomment for gzip support - #lib/gzip_decompress.c - #lib/gzip_compress.c - - lib/arm/cpu_features.c - lib/x86/cpu_features.c) - -add_library(${TARGET} STATIC ${LIB_SRC} ${LIB_HEADERS} ${COMMON_HEADERS}) -add_library(Ext::libdeflate ALIAS ${TARGET}) -set_target_properties(${TARGET} PROPERTIES PREFIX "") -set_property(TARGET ${TARGET} PROPERTY FOLDER External) - -target_include_directories(${TARGET} - PUBLIC ${CMAKE_CURRENT_SOURCE_DIR} - PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common) - -if (FLAGS) - target_compile_options(${TARGET} PRIVATE ${FLAGS}) -endif() diff --git a/ext/libdeflate/COPYING b/ext/libdeflate/COPYING deleted file mode 100644 index 1f1b81cd..00000000 --- a/ext/libdeflate/COPYING +++ /dev/null @@ -1,21 +0,0 @@ -Copyright 2016 Eric Biggers - -Permission is hereby granted, free of charge, to any person -obtaining a copy of this software and associated documentation files -(the "Software"), to deal in the Software without restriction, -including without limitation the rights to use, copy, modify, merge, -publish, distribute, sublicense, and/or sell copies of the Software, -and to permit persons to whom the Software is furnished to do so, -subject to the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS -BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN -ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. diff --git a/ext/libdeflate/common/common_defs.h b/ext/libdeflate/common/common_defs.h deleted file mode 100644 index 80623085..00000000 --- a/ext/libdeflate/common/common_defs.h +++ /dev/null @@ -1,366 +0,0 @@ -/* - * common_defs.h - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef COMMON_COMMON_DEFS_H
-#define COMMON_COMMON_DEFS_H
-
-#ifdef __GNUC__
-# include "compiler_gcc.h"
-#elif defined(_MSC_VER)
-# include "compiler_msc.h"
-#else
-# pragma message("Unrecognized compiler. Please add a header file for your compiler. Compilation will proceed, but performance may suffer!")
-#endif
-
-/* ========================================================================== */
-/* Type definitions */
-/* ========================================================================== */
-
-#include <stddef.h> /* size_t */
-
-#ifndef __bool_true_false_are_defined
-# include <stdbool.h> /* bool */
-#endif
-
-/* Fixed-width integer types */
-#ifndef PRIu32
-# include <inttypes.h>
-#endif
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-typedef int8_t s8;
-typedef int16_t s16;
-typedef int32_t s32;
-typedef int64_t s64;
-
-/*
- * Word type of the target architecture. Use 'size_t' instead of 'unsigned
- * long' to account for platforms such as Windows that use 32-bit 'unsigned
- * long' on 64-bit architectures.
- */
-typedef size_t machine_word_t;
-
-/* Number of bytes in a word */
-#define WORDBYTES ((int)sizeof(machine_word_t))
-
-/* Number of bits in a word */
-#define WORDBITS (8 * WORDBYTES)
-
-/* ========================================================================== */
-/* Optional compiler features */
-/* ========================================================================== */
-
-/* LIBEXPORT - export a function from a shared library */
-#ifndef LIBEXPORT
-# define LIBEXPORT
-#endif
-
-/* inline - suggest that a function be inlined */
-#ifndef inline
-# define inline
-#endif
-
-/* forceinline - force a function to be inlined, if possible */
-#ifndef forceinline
-# define forceinline inline
-#endif
-
-/* restrict - annotate a non-aliased pointer */
-#ifndef restrict
-# define restrict
-#endif
-
-/* likely(expr) - hint that an expression is usually true */
-#ifndef likely
-# define likely(expr) (expr)
-#endif
-
-/* unlikely(expr) - hint that an expression is usually false */
-#ifndef unlikely
-# define unlikely(expr) (expr)
-#endif
-
-/* prefetchr(addr) - prefetch into L1 cache for read */
-#ifndef prefetchr
-# define prefetchr(addr)
-#endif
-
-/* prefetchw(addr) - prefetch into L1 cache for write */
-#ifndef prefetchw
-# define prefetchw(addr)
-#endif
-
-/* Does the compiler support the 'target' function attribute? */
-#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
-#endif
-
-/* Which targets are supported with the 'target' function attribute? */
-#ifndef COMPILER_SUPPORTS_BMI2_TARGET
-# define COMPILER_SUPPORTS_BMI2_TARGET 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX_TARGET
-# define COMPILER_SUPPORTS_AVX_TARGET 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET
-# define COMPILER_SUPPORTS_AVX512BW_TARGET 0
-#endif
-
-/*
- * Which targets are supported with the 'target' function attribute and have
- * intrinsics that work within 'target'-ed functions?
- */ -#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0 -#endif -#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0 -#endif -#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0 -#endif -#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0 -#endif -#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0 -#endif -#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS -# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0 -#endif - -/* _aligned_attribute(n) - declare that the annotated variable, or variables of - * the annotated type, are to be aligned on n-byte boundaries */ -#ifndef _aligned_attribute -#endif - -/* ========================================================================== */ -/* Miscellaneous macros */ -/* ========================================================================== */ - -#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0])) -#define MIN(a, b) ((a) <= (b) ? (a) : (b)) -#define MAX(a, b) ((a) >= (b) ? (a) : (b)) -#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) -#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) -#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1)) - -/* ========================================================================== */ -/* Endianness handling */ -/* ========================================================================== */ - -/* - * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little - * endian or 0 if it is big endian. The macro should be defined in a way such - * that the compiler can evaluate it at compilation time. If not defined, a - * fallback is used. - */ -#ifndef CPU_IS_LITTLE_ENDIAN -static forceinline int CPU_IS_LITTLE_ENDIAN(void) -{ - union { - unsigned int v; - unsigned char b; - } u; - u.v = 1; - return u.b; -} -#endif - -/* bswap16(n) - swap the bytes of a 16-bit integer */ -#ifndef bswap16 -static forceinline u16 bswap16(u16 n) -{ - return (n << 8) | (n >> 8); -} -#endif - -/* bswap32(n) - swap the bytes of a 32-bit integer */ -#ifndef bswap32 -static forceinline u32 bswap32(u32 n) -{ - return ((n & 0x000000FF) << 24) | - ((n & 0x0000FF00) << 8) | - ((n & 0x00FF0000) >> 8) | - ((n & 0xFF000000) >> 24); -} -#endif - -/* bswap64(n) - swap the bytes of a 64-bit integer */ -#ifndef bswap64 -static forceinline u64 bswap64(u64 n) -{ - return ((n & 0x00000000000000FF) << 56) | - ((n & 0x000000000000FF00) << 40) | - ((n & 0x0000000000FF0000) << 24) | - ((n & 0x00000000FF000000) << 8) | - ((n & 0x000000FF00000000) >> 8) | - ((n & 0x0000FF0000000000) >> 24) | - ((n & 0x00FF000000000000) >> 40) | - ((n & 0xFF00000000000000) >> 56); -} -#endif - -#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n)) -#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n)) -#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n)) -#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n)) -#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n)) -#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? 
bswap64(n) : (n)) - -/* ========================================================================== */ -/* Unaligned memory accesses */ -/* ========================================================================== */ - -/* - * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses - * can be performed efficiently on the target platform. - */ -#ifndef UNALIGNED_ACCESS_IS_FAST -# define UNALIGNED_ACCESS_IS_FAST 0 -#endif - -/* - * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type', - * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions - * which load and store variables of type 'type' from/to unaligned memory - * addresses. If not defined, a fallback is used. - */ -#ifndef DEFINE_UNALIGNED_TYPE - -/* - * Although memcpy() may seem inefficient, it *usually* gets optimized - * appropriately by modern compilers. It's portable and may be the best we can - * do for a fallback... - */ -#include - -#define DEFINE_UNALIGNED_TYPE(type) \ - \ -static forceinline type \ -load_##type##_unaligned(const void *p) \ -{ \ - type v; \ - memcpy(&v, p, sizeof(v)); \ - return v; \ -} \ - \ -static forceinline void \ -store_##type##_unaligned(type v, void *p) \ -{ \ - memcpy(p, &v, sizeof(v)); \ -} - -#endif /* !DEFINE_UNALIGNED_TYPE */ - -/* ========================================================================== */ -/* Bit scan functions */ -/* ========================================================================== */ - -/* - * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least - * significant end) of the *most* significant 1 bit in the input value. The - * input value must be nonzero! - */ - -#ifndef bsr32 -static forceinline unsigned -bsr32(u32 n) -{ - unsigned i = 0; - while ((n >>= 1) != 0) - i++; - return i; -} -#endif - -#ifndef bsr64 -static forceinline unsigned -bsr64(u64 n) -{ - unsigned i = 0; - while ((n >>= 1) != 0) - i++; - return i; -} -#endif - -static forceinline unsigned -bsrw(machine_word_t n) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - return bsr32(n); - else - return bsr64(n); -} - -/* - * Bit Scan Forward (BSF) - find the 0-based index (relative to the least - * significant end) of the *least* significant 1 bit in the input value. The - * input value must be nonzero! - */ - -#ifndef bsf32 -static forceinline unsigned -bsf32(u32 n) -{ - unsigned i = 0; - while ((n & 1) == 0) { - i++; - n >>= 1; - } - return i; -} -#endif - -#ifndef bsf64 -static forceinline unsigned -bsf64(u64 n) -{ - unsigned i = 0; - while ((n & 1) == 0) { - i++; - n >>= 1; - } - return i; -} -#endif - -static forceinline unsigned -bsfw(machine_word_t n) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - return bsf32(n); - else - return bsf64(n); -} - -#endif /* COMMON_COMMON_DEFS_H */ diff --git a/ext/libdeflate/common/compiler_gcc.h b/ext/libdeflate/common/compiler_gcc.h deleted file mode 100644 index 17ca18cd..00000000 --- a/ext/libdeflate/common/compiler_gcc.h +++ /dev/null @@ -1,159 +0,0 @@ -/* - * compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang - * and the Intel C Compiler (icc). - * - * TODO: icc is not well tested, so some things are currently disabled even - * though they maybe can be enabled on some icc versions. 
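
A quick illustration of the memcpy-based DEFINE_UNALIGNED_TYPE fallback from common_defs.h above. This is a hypothetical standalone program, not part of the patch; libdeflate itself instantiates the macro via lib/unaligned.h. The memcpy() calls compile down to single loads/stores on targets where unaligned access is fast:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

typedef uint32_t u32;

/* The portable fallback from common_defs.h above. */
#define DEFINE_UNALIGNED_TYPE(type)                             \
static inline type                                              \
load_##type##_unaligned(const void *p)                          \
{                                                               \
        type v;                                                 \
        memcpy(&v, p, sizeof(v));                               \
        return v;                                               \
}                                                               \
                                                                \
static inline void                                              \
store_##type##_unaligned(type v, void *p)                       \
{                                                               \
        memcpy(p, &v, sizeof(v));                               \
}

DEFINE_UNALIGNED_TYPE(u32) /* defines load_u32_unaligned() etc. */

int main(void)
{
        unsigned char buf[5] = { 0x00, 0x78, 0x56, 0x34, 0x12 };

        /* buf + 1 is misaligned for u32; this is still well-defined. */
        printf("0x%08x\n", (unsigned)load_u32_unaligned(buf + 1));
        /* prints 0x12345678 on a little-endian machine */
        return 0;
}

The GCC path further on replaces this with an __attribute__((packed)) struct, which has the same effect without relying on memcpy() being optimized away.
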
- */ - -#if !defined(__clang__) && !defined(__INTEL_COMPILER) -# define GCC_PREREQ(major, minor) \ - (__GNUC__ > (major) || \ - (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor))) -#else -# define GCC_PREREQ(major, minor) 0 -#endif - -/* Note: only check the clang version when absolutely necessary! - * "Vendors" such as Apple can use different version numbers. */ -#ifdef __clang__ -# ifdef __apple_build_version__ -# define CLANG_PREREQ(major, minor, apple_version) \ - (__apple_build_version__ >= (apple_version)) -# else -# define CLANG_PREREQ(major, minor, apple_version) \ - (__clang_major__ > (major) || \ - (__clang_major__ == (major) && __clang_minor__ >= (minor))) -# endif -#else -# define CLANG_PREREQ(major, minor, apple_version) 0 -#endif - -#ifndef __has_attribute -# define __has_attribute(attribute) 0 -#endif -#ifndef __has_feature -# define __has_feature(feature) 0 -#endif -#ifndef __has_builtin -# define __has_builtin(builtin) 0 -#endif - -#ifdef _WIN32 -# define LIBEXPORT __declspec(dllexport) -#else -# define LIBEXPORT __attribute__((visibility("default"))) -#endif - -#define inline inline -#define forceinline inline __attribute__((always_inline)) -#define restrict __restrict__ -#define likely(expr) __builtin_expect(!!(expr), 1) -#define unlikely(expr) __builtin_expect(!!(expr), 0) -#define prefetchr(addr) __builtin_prefetch((addr), 0) -#define prefetchw(addr) __builtin_prefetch((addr), 1) -#define _aligned_attribute(n) __attribute__((aligned(n))) - -#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \ - (GCC_PREREQ(4, 4) || __has_attribute(target)) - -#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE - -# if defined(__i386__) || defined(__x86_64__) - -# define COMPILER_SUPPORTS_PCLMUL_TARGET \ - (GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128)) - -# define COMPILER_SUPPORTS_AVX_TARGET \ - (GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256)) - -# define COMPILER_SUPPORTS_BMI2_TARGET \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di)) - -# define COMPILER_SUPPORTS_AVX2_TARGET \ - (GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256)) - -# define COMPILER_SUPPORTS_AVX512BW_TARGET \ - (GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512)) - - /* - * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics - * not available in the main target could not be used in 'target' - * attribute functions. Unfortunately clang has no feature test macro - * for this so we have to check its version. - */ -# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000) -# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1 -# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \ - COMPILER_SUPPORTS_PCLMUL_TARGET -# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \ - COMPILER_SUPPORTS_AVX2_TARGET -# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS \ - COMPILER_SUPPORTS_AVX512BW_TARGET -# endif -# elif (defined(__arm__) && defined(__ARM_FP)) || defined(__aarch64__) - /* arm: including arm_neon.h requires hardware fp support */ - - /* - * Prior to gcc 6.1 (r230411 for arm, r226563 for aarch64), NEON - * and crypto intrinsics not available in the main target could not be - * used in 'target' attribute functions. - * - * clang as of 5.0.1 still doesn't allow it. But, it does seem to allow - * the pmull intrinsics if only __ARM_NEON is enabled. 
- */ -# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS GCC_PREREQ(6, 1) -# ifdef __ARM_NEON -# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS \ - (GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64)) -# else -# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS \ - (GCC_PREREQ(6, 1)) -# endif -# endif -#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */ - -/* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. */ -#ifdef __BYTE_ORDER__ -# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -#endif - -#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) -# define bswap16 __builtin_bswap16 -#endif - -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) -# define bswap32 __builtin_bswap32 -#endif - -#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) -# define bswap64 __builtin_bswap64 -#endif - -#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) -# define UNALIGNED_ACCESS_IS_FAST 1 -#endif - -/* With gcc, we can access unaligned memory through 'packed' structures. */ -#define DEFINE_UNALIGNED_TYPE(type) \ - \ -struct type##unaligned { \ - type v; \ -} __attribute__((packed)); \ - \ -static forceinline type \ -load_##type##_unaligned(const void *p) \ -{ \ - return ((const struct type##unaligned *)p)->v; \ -} \ - \ -static forceinline void \ -store_##type##_unaligned(type v, void *p) \ -{ \ - ((struct type##unaligned *)p)->v = v; \ -} - -#define bsr32(n) (31 - __builtin_clz(n)) -#define bsr64(n) (63 - __builtin_clzll(n)) -#define bsf32(n) __builtin_ctz(n) -#define bsf64(n) __builtin_ctzll(n) diff --git a/ext/libdeflate/common/compiler_msc.h b/ext/libdeflate/common/compiler_msc.h deleted file mode 100644 index 0b40d3fc..00000000 --- a/ext/libdeflate/common/compiler_msc.h +++ /dev/null @@ -1,96 +0,0 @@ -/* - * compiler_msc.h - definitions for the Microsoft C Compiler - */ - -#define LIBEXPORT __declspec(dllexport) - -/* - * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h. - * Beware: the below replacement isn't fully standard, since normally any value - * != 0 should be implicitly cast to a bool with value 1... but that doesn't - * happen if bool is really just an 'int'. - */ -typedef int bool; -#define true 1 -#define false 0 -#define __bool_true_false_are_defined 1 - -/* Define ssize_t */ -#ifdef _WIN64 -typedef long long ssize_t; -#else -typedef int ssize_t; -#endif - -/* - * Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header - * inttypes.h. Work around this by defining the PRI* macros ourselves. - */ -#include -#define PRIu8 "hhu" -#define PRIu16 "hu" -#define PRIu32 "u" -#define PRIu64 "llu" -#define PRIi8 "hhi" -#define PRIi16 "hi" -#define PRIi32 "i" -#define PRIi64 "lli" -#define PRIx8 "hhx" -#define PRIx16 "hx" -#define PRIx32 "x" -#define PRIx64 "llx" - -/* Assume a little endian architecture with fast unaligned access */ -#define CPU_IS_LITTLE_ENDIAN() 1 -#define UNALIGNED_ACCESS_IS_FAST 1 - -/* __restrict has nonstandard behavior; don't use it */ -#define restrict - -/* ... 
but we can use __inline and __forceinline */ -#define inline __inline -#define forceinline __forceinline - -/* Byte swap functions */ -#include -#define bswap16 _byteswap_ushort -#define bswap32 _byteswap_ulong -#define bswap64 _byteswap_uint64 - -/* Bit scan functions (32-bit) */ - -static forceinline unsigned -bsr32(uint32_t n) -{ - _BitScanReverse(&n, n); - return n; -} -#define bsr32 bsr32 - -static forceinline unsigned -bsf32(uint32_t n) -{ - _BitScanForward(&n, n); - return n; -} -#define bsf32 bsf32 - -#ifdef _M_X64 /* Bit scan functions (64-bit) */ - -static forceinline unsigned -bsr64(uint64_t n) -{ - _BitScanReverse64(&n, n); - return n; -} -#define bsr64 bsr64 - -static forceinline unsigned -bsf64(uint64_t n) -{ - _BitScanForward64(&n, n); - return n; -} -#define bsf64 bsf64 - -#endif /* _M_X64 */ diff --git a/ext/libdeflate/lib/adler32.c b/ext/libdeflate/lib/adler32.c deleted file mode 100644 index 185575a2..00000000 --- a/ext/libdeflate/lib/adler32.c +++ /dev/null @@ -1,130 +0,0 @@ -/* - * adler32.c - Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "lib_common.h" -#include "libdeflate.h" - -/* The Adler-32 divisor, or "base", value. */ -#define DIVISOR 65521 - -/* - * MAX_CHUNK_SIZE is the most bytes that can be processed without the - * possibility of s2 overflowing when it is represented as an unsigned 32-bit - * integer. This value was computed using the following Python script: - * - * divisor = 65521 - * count = 0 - * s1 = divisor - 1 - * s2 = divisor - 1 - * while True: - * s1 += 0xFF - * s2 += s1 - * if s2 > 0xFFFFFFFF: - * break - * count += 1 - * print(count) - * - * Note that to get the correct worst-case value, we must assume that every byte - * has value 0xFF and that s1 and s2 started with the highest possible values - * modulo the divisor. 
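
The Python script above transliterates directly to C, which gives a quick way to confirm the constant defined just below (a standalone check program, not part of the library):

#include <stdio.h>
#include <stdint.h>

/* Recompute MAX_CHUNK_SIZE: the most 0xFF bytes that can be added,
 * starting from the worst case s1 = s2 = 65520, before s2 can exceed
 * the unsigned 32-bit range. Mirrors the script in adler32.c. */
int main(void)
{
        const uint64_t divisor = 65521;
        uint64_t s1 = divisor - 1;
        uint64_t s2 = divisor - 1;
        unsigned count = 0;

        for (;;) {
                s1 += 0xFF;
                s2 += s1;
                if (s2 > 0xFFFFFFFFull)
                        break;
                count++;
        }
        printf("%u\n", count); /* prints 5552 */
        return 0;
}
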
- */ -#define MAX_CHUNK_SIZE 5552 - -typedef u32 (*adler32_func_t)(u32, const u8 *, size_t); - -/* Include architecture-specific implementations if available */ -#undef DEFAULT_IMPL -#undef DISPATCH -#if defined(__arm__) || defined(__aarch64__) -# include "arm/adler32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) -# include "x86/adler32_impl.h" -#endif - -/* Define a generic implementation if needed */ -#ifndef DEFAULT_IMPL -#define DEFAULT_IMPL adler32_generic -static u32 adler32_generic(u32 adler, const u8 *p, size_t size) -{ - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + size; - - while (p != end) { - size_t chunk_size = MIN(end - p, MAX_CHUNK_SIZE); - const u8 *chunk_end = p + chunk_size; - size_t num_unrolled_iterations = chunk_size / 4; - - while (num_unrolled_iterations--) { - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - } - while (p != chunk_end) { - s1 += *p++; - s2 += s1; - } - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; -} -#endif /* !DEFAULT_IMPL */ - -#ifdef DISPATCH -static u32 dispatch(u32, const u8 *, size_t); - -static volatile adler32_func_t adler32_impl = dispatch; - -/* Choose the fastest implementation at runtime */ -static u32 dispatch(u32 adler, const u8 *buffer, size_t size) -{ - adler32_func_t f = arch_select_adler32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - adler32_impl = f; - return adler32_impl(adler, buffer, size); -} -#else -# define adler32_impl DEFAULT_IMPL /* only one implementation, use it */ -#endif - -LIBDEFLATEAPI u32 -libdeflate_adler32(u32 adler, const void *buffer, size_t size) -{ - if (buffer == NULL) /* return initial value */ - return 1; - return adler32_impl(adler, buffer, size); -} diff --git a/ext/libdeflate/lib/adler32_vec_template.h b/ext/libdeflate/lib/adler32_vec_template.h deleted file mode 100644 index 4eb8c2a8..00000000 --- a/ext/libdeflate/lib/adler32_vec_template.h +++ /dev/null @@ -1,124 +0,0 @@ -/* - * adler32_vec_template.h - template for vectorized Adler-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file contains a template for vectorized Adler-32 implementations. 
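
One detail worth noting from adler32.c above: the DISPATCH machinery is a self-patching trampoline. The volatile function pointer starts out pointing at dispatch(), which selects the best implementation once, overwrites the pointer, and calls through it; every later call bypasses the selection entirely. A stripped-down sketch of the same idiom (the checksum names here are illustrative, not libdeflate's):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

typedef uint32_t (*checksum_func_t)(const uint8_t *, size_t);

static uint32_t checksum_portable(const uint8_t *p, size_t n)
{
        uint32_t sum = 0;
        while (n--)
                sum += *p++;
        return sum;
}

static uint32_t dispatch(const uint8_t *, size_t);

/* First call lands in dispatch(); afterwards this points at the
 * chosen implementation. */
static volatile checksum_func_t checksum_impl = dispatch;

static uint32_t dispatch(const uint8_t *p, size_t n)
{
        /* A real build would query CPU features here (see adler32.c). */
        checksum_func_t f = checksum_portable;

        checksum_impl = f; /* patch the pointer: later calls skip this */
        return f(p, n);
}

int main(void)
{
        uint8_t data[4] = { 1, 2, 3, 4 };

        printf("%u\n", checksum_impl(data, sizeof(data)));
        return 0;
}

If two threads hit dispatch() concurrently the race is benign, since both store the same value.
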
- * - * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 - * implementation looks something like this: - * - * do { - * s1 += *p; - * s2 += s1; - * } while (++p != chunk_end); - * - * For vectorized calculation of s1, we only need to sum the input bytes. They - * can be accumulated into multiple counters which are eventually summed - * together. - * - * For vectorized calculation of s2, the basic idea is that for each iteration - * that processes N bytes, we can perform the following vectorizable - * calculation: - * - * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N - * - * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N - * separate counters, then do the multiplications by N...1 just once at the end - * rather than once per iteration. - * - * Also, we must account for how previous bytes will affect s2 by doing the - * following at beginning of each iteration: - * - * s2 += s1 * N - * - * Furthermore, like s1, "s2" can actually be multiple counters which are - * eventually summed together. - */ - -static u32 ATTRIBUTES -FUNCNAME(u32 adler, const u8 *p, size_t size) -{ - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + size; - const u8 *vend; - const size_t max_chunk_size = - MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) - - (MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) % - IMPL_SEGMENT_SIZE); - - /* Process a byte at a time until the needed alignment is reached */ - if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { - do { - s1 += *p++; - s2 += s1; - } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* - * Process "chunks" of bytes using vector instructions. Chunk sizes are - * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never - * overflow before being reduced modulo DIVISOR. For vector processing, - * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and - * may be further limited to IMPL_MAX_CHUNK_SIZE. - */ - STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0); - vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE); - while (p != vend) { - size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size); - - s2 += s1 * chunk_size; - - FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size), - &s1, &s2); - - p += chunk_size; - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* Process any remaining bytes */ - if (p != end) { - do { - s1 += *p++; - s2 += s1; - } while (p != end); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; -} - -#undef FUNCNAME -#undef FUNCNAME_CHUNK -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_SIZE -#undef IMPL_MAX_CHUNK_SIZE diff --git a/ext/libdeflate/lib/aligned_malloc.c b/ext/libdeflate/lib/aligned_malloc.c deleted file mode 100644 index e714dc79..00000000 --- a/ext/libdeflate/lib/aligned_malloc.c +++ /dev/null @@ -1,57 +0,0 @@ -/* - * aligned_malloc.c - aligned memory allocation - * - * Originally public domain; changes after 2016-09-07 are copyrighted. 
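
The s2 decomposition described in the template comment above is easy to verify in scalar code: running the serial loop over N bytes must yield s1 + sum(bytes) and s2 + N*s1 + (N*b[0] + (N-1)*b[1] + ... + 1*b[N-1]). A standalone self-check (N = 8 and the starting values are arbitrary):

#include <stdio.h>
#include <stdint.h>

#define N 8

int main(void)
{
        const uint8_t b[N] = { 3, 1, 4, 1, 5, 9, 2, 6 };
        const uint32_t s1_0 = 123, s2_0 = 456; /* arbitrary start state */
        uint32_t s1 = s1_0, s2 = s2_0;
        uint32_t byte_sum = 0, weighted = 0;
        unsigned i;

        /* Serial form: the inner loop shown in the comment above. */
        for (i = 0; i < N; i++) {
                s1 += b[i];
                s2 += s1;
        }

        /* Vector-friendly form: per-byte weights applied once at the end. */
        for (i = 0; i < N; i++) {
                byte_sum += b[i];
                weighted += (N - i) * b[i]; /* N*b[0] + (N-1)*b[1] + ... */
        }

        printf("s1: %s, s2: %s\n",
               s1 == s1_0 + byte_sum ? "ok" : "BAD",
               s2 == s2_0 + N * s1_0 + weighted ? "ok" : "BAD");
        return 0;
}
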
- * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file provides portable aligned memory allocation functions that only - * use malloc() and free(). This avoids portability problems with - * posix_memalign(), aligned_alloc(), etc. - */ - -#include - -#include "aligned_malloc.h" - -void * -aligned_malloc(size_t alignment, size_t size) -{ - void *ptr = malloc(sizeof(void *) + alignment - 1 + size); - if (ptr) { - void *orig_ptr = ptr; - ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); - ((void **)ptr)[-1] = orig_ptr; - } - return ptr; -} - -void -aligned_free(void *ptr) -{ - if (ptr) - free(((void **)ptr)[-1]); -} diff --git a/ext/libdeflate/lib/aligned_malloc.h b/ext/libdeflate/lib/aligned_malloc.h deleted file mode 100644 index ee6e7b89..00000000 --- a/ext/libdeflate/lib/aligned_malloc.h +++ /dev/null @@ -1,13 +0,0 @@ -/* - * aligned_malloc.c - aligned memory allocation - */ - -#ifndef LIB_ALIGNED_MALLOC_H -#define LIB_ALIGNED_MALLOC_H - -#include "lib_common.h" - -extern void *aligned_malloc(size_t alignment, size_t size); -extern void aligned_free(void *ptr); - -#endif /* LIB_ALIGNED_MALLOC_H */ diff --git a/ext/libdeflate/lib/arm/adler32_impl.h b/ext/libdeflate/lib/arm/adler32_impl.h deleted file mode 100644 index de52d81c..00000000 --- a/ext/libdeflate/lib/arm/adler32_impl.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "cpu_features.h" - -/* NEON implementation */ -#undef DISPATCH_NEON -#if !defined(DEFAULT_IMPL) && \ - (defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS)) -# define FUNCNAME adler32_neon -# define FUNCNAME_CHUNK adler32_neon_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_SIZE 32 -/* Prevent unsigned overflow of the 16-bit precision byte counters */ -# define IMPL_MAX_CHUNK_SIZE (32 * (0xFFFF / 0xFF)) -# ifdef __ARM_NEON -# define ATTRIBUTES -# define DEFAULT_IMPL adler32_neon -# else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=neon"))) -# else -# define ATTRIBUTES __attribute__((target("+simd"))) -# endif -# define DISPATCH 1 -# define DISPATCH_NEON 1 -# endif -# include -static forceinline ATTRIBUTES void -adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) -{ - uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 }; - uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; - uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 }; - - do { - const uint8x16_t bytes1 = *p++; - const uint8x16_t bytes2 = *p++; - uint16x8_t tmp; - - v_s2 += v_s1; - - /* Vector Pairwise Add Long (u8 => u16) */ - tmp = vpaddlq_u8(bytes1); - - /* Vector Pairwise Add and Accumulate Long (u8 => u16) */ - tmp = vpadalq_u8(tmp, bytes2); - - /* Vector Pairwise Add and Accumulate Long (u16 => u32) */ - v_s1 = vpadalq_u16(v_s1, tmp); - - /* Vector Add Wide (u8 => u16) */ - v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1)); - v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1)); - v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2)); - v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2)); - - } while (p != end); - - /* Vector Shift Left (u32) */ - v_s2 = vqshlq_n_u32(v_s2, 5); - - /* Vector Multiply Accumulate Long (u16 => u32) */ - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 }); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 }); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 }); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 }); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 }); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 }); - v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 }); - v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 }); - - *s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3]; - *s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3]; -} -# include "../adler32_vec_template.h" -#endif /* NEON implementation */ - -#ifdef DISPATCH -static inline adler32_func_t -arch_select_adler32_func(void) -{ - u32 features = get_cpu_features(); - -#ifdef DISPATCH_NEON - if (features & ARM_CPU_FEATURE_NEON) - return adler32_neon; -#endif - return NULL; -} -#endif /* DISPATCH */ diff --git 
a/ext/libdeflate/lib/arm/cpu_features.c b/ext/libdeflate/lib/arm/cpu_features.c deleted file mode 100644 index 8d1facc1..00000000 --- a/ext/libdeflate/lib/arm/cpu_features.c +++ /dev/null @@ -1,119 +0,0 @@ -/* - * arm/cpu_features.c - feature detection for ARM processors - * - * Copyright 2018 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * ARM processors don't have a standard way for unprivileged programs to detect - * processor features. But, on Linux we can read the AT_HWCAP and AT_HWCAP2 - * values from /proc/self/auxv. - * - * Ideally we'd use the C library function getauxval(), but it's not guaranteed - * to be available: it was only added to glibc in 2.16, and in Android it was - * added to API level 18 for ARM and level 21 for AArch64. 
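
For comparison, where getauxval() is known to be available the hand-rolled parser below reduces to two calls. A sketch assuming glibc 2.16+ or a new enough Android; the bit positions are the same HWCAP bits tested in setup_cpu_features() below:

#include <stdio.h>
#include <sys/auxv.h> /* getauxval, AT_HWCAP, AT_HWCAP2 */

int main(void)
{
        unsigned long hwcap = getauxval(AT_HWCAP);
        unsigned long hwcap2 = getauxval(AT_HWCAP2);

        /* 32-bit ARM bits, as in setup_cpu_features():
         * HWCAP_NEON = 1 << 12, HWCAP2_PMULL = 1 << 1.
         * (AArch64 instead uses HWCAP_ASIMD = 1 << 1 and
         * HWCAP_PMULL = 1 << 4.) */
        printf("NEON:  %d\n", (hwcap & (1UL << 12)) != 0);
        printf("PMULL: %d\n", (hwcap2 & (1UL << 1)) != 0);
        return 0;
}
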
- */
-
-#include "cpu_features.h"
-
-#if ARM_CPU_FEATURES_ENABLED
-
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-
-#define AT_HWCAP 16
-#define AT_HWCAP2 26
-
-volatile u32 _cpu_features = 0;
-
-static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
-{
-        int fd;
-        unsigned long auxbuf[32];
-        int filled = 0;
-        int i;
-
-        fd = open("/proc/self/auxv", O_RDONLY);
-        if (fd < 0)
-                return;
-
-        for (;;) {
-                do {
-                        int ret = read(fd, &((char *)auxbuf)[filled],
-                                       sizeof(auxbuf) - filled);
-                        if (ret <= 0) {
-                                if (ret < 0 && errno == EINTR)
-                                        continue;
-                                goto out;
-                        }
-                        filled += ret;
-                } while (filled < 2 * sizeof(long));
-
-                i = 0;
-                do {
-                        unsigned long type = auxbuf[i];
-                        unsigned long value = auxbuf[i + 1];
-
-                        if (type == AT_HWCAP)
-                                *hwcap = value;
-                        else if (type == AT_HWCAP2)
-                                *hwcap2 = value;
-                        i += 2;
-                        filled -= 2 * sizeof(long);
-                } while (filled >= 2 * sizeof(long));
-
-                memmove(auxbuf, &auxbuf[i], filled);
-        }
-out:
-        close(fd);
-}
-
-void setup_cpu_features(void)
-{
-        u32 features = 0;
-        unsigned long hwcap = 0;
-        unsigned long hwcap2 = 0;
-
-        scan_auxv(&hwcap, &hwcap2);
-
-#ifdef __arm__
-        STATIC_ASSERT(sizeof(long) == 4);
-        if (hwcap & (1 << 12)) /* HWCAP_NEON */
-                features |= ARM_CPU_FEATURE_NEON;
-        if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */
-                features |= ARM_CPU_FEATURE_PMULL;
-#else
-        STATIC_ASSERT(sizeof(long) == 8);
-        if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
-                features |= ARM_CPU_FEATURE_NEON;
-        if (hwcap & (1 << 4)) /* HWCAP_PMULL */
-                features |= ARM_CPU_FEATURE_PMULL;
-#endif
-
-        _cpu_features = features | ARM_CPU_FEATURES_KNOWN;
-}
-
-#endif /* ARM_CPU_FEATURES_ENABLED */
diff --git a/ext/libdeflate/lib/arm/cpu_features.h b/ext/libdeflate/lib/arm/cpu_features.h
deleted file mode 100644
index 390d96c5..00000000
--- a/ext/libdeflate/lib/arm/cpu_features.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * arm/cpu_features.h - feature detection for ARM processors
- */
-
-#ifndef LIB_ARM_CPU_FEATURES_H
-#define LIB_ARM_CPU_FEATURES_H
-
-#include "../lib_common.h"
-
-#if (defined(__arm__) || defined(__aarch64__)) && \
-    defined(__linux__) && COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-# define ARM_CPU_FEATURES_ENABLED 1
-#else
-# define ARM_CPU_FEATURES_ENABLED 0
-#endif
-
-#if ARM_CPU_FEATURES_ENABLED
-
-#define ARM_CPU_FEATURE_NEON 0x00000001
-#define ARM_CPU_FEATURE_PMULL 0x00000002
-
-#define ARM_CPU_FEATURES_KNOWN 0x80000000
-
-extern volatile u32 _cpu_features;
-
-extern void setup_cpu_features(void);
-
-static inline u32 get_cpu_features(void)
-{
-        if (_cpu_features == 0)
-                setup_cpu_features();
-        return _cpu_features;
-}
-
-#endif /* ARM_CPU_FEATURES_ENABLED */
-
-#endif /* LIB_ARM_CPU_FEATURES_H */
diff --git a/ext/libdeflate/lib/arm/crc32_impl.h b/ext/libdeflate/lib/arm/crc32_impl.h
deleted file mode 100644
index e64616ff..00000000
--- a/ext/libdeflate/lib/arm/crc32_impl.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * arm/crc32_impl.h
- *
- * Copyright 2017 Jun He
- * Copyright 2018 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of
the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "cpu_features.h" - -/* - * CRC-32 folding with ARM Crypto extension-PMULL - * - * This works the same way as the x86 PCLMUL version. - * See x86/crc32_pclmul_template.h for an explanation. - */ -#undef DISPATCH_PMULL -#if (defined(__ARM_FEATURE_CRYPTO) || \ - (ARM_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \ - /* not yet tested on big endian, probably needs changes to work there */ \ - (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && \ - /* clang as of v5.0.1 doesn't allow pmull intrinsics in 32-bit mode, even - * when compiling with -mfpu=crypto-neon-fp-armv8 */ \ - !(defined(__clang__) && defined(__arm__)) -# define FUNCNAME crc32_pmull -# define FUNCNAME_ALIGNED crc32_pmull_aligned -# ifdef __ARM_FEATURE_CRYPTO -# define ATTRIBUTES -# define DEFAULT_IMPL crc32_pmull -# else -# ifdef __arm__ -# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8"))) -# else -# ifdef __clang__ -# define ATTRIBUTES __attribute__((target("crypto"))) -# else -# define ATTRIBUTES __attribute__((target("+crypto"))) -# endif -# endif -# define DISPATCH 1 -# define DISPATCH_PMULL 1 -# endif - -#include - -static forceinline ATTRIBUTES uint8x16_t -clmul_00(uint8x16_t a, uint8x16_t b) -{ - return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a), - (poly64_t)vget_low_u8(b)); -} - -static forceinline ATTRIBUTES uint8x16_t -clmul_10(uint8x16_t a, uint8x16_t b) -{ - return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a), - (poly64_t)vget_high_u8(b)); -} - -static forceinline ATTRIBUTES uint8x16_t -clmul_11(uint8x16_t a, uint8x16_t b) -{ - return (uint8x16_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b); -} - -static forceinline ATTRIBUTES uint8x16_t -fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers) -{ - return dst ^ clmul_00(src, multipliers) ^ clmul_11(src, multipliers); -} - -static forceinline ATTRIBUTES u32 -crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs) -{ - /* Constants precomputed by gen_crc32_multipliers.c. Do not edit! 
*/ - const uint8x16_t multipliers_4 = - (uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 }; - const uint8x16_t multipliers_1 = - (uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E }; - const uint8x16_t final_multiplier = - (uint8x16_t)(uint64x2_t){ 0xB8BC6765 }; - const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF }; - const uint8x16_t barrett_reduction_constants = - (uint8x16_t)(uint64x2_t){ 0x00000001F7011641, - 0x00000001DB710641 }; - const uint8x16_t zeroes = (uint8x16_t){ 0 }; - - const uint8x16_t * const end = p + nr_segs; - const uint8x16_t * const end512 = p + (nr_segs & ~3); - uint8x16_t x0, x1, x2, x3; - - x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder }; - if (nr_segs >= 4) { - x1 = *p++; - x2 = *p++; - x3 = *p++; - - /* Fold 512 bits at a time */ - while (p != end512) { - x0 = fold_128b(*p++, x0, multipliers_4); - x1 = fold_128b(*p++, x1, multipliers_4); - x2 = fold_128b(*p++, x2, multipliers_4); - x3 = fold_128b(*p++, x3, multipliers_4); - } - - /* Fold 512 bits => 128 bits */ - x1 = fold_128b(x1, x0, multipliers_1); - x2 = fold_128b(x2, x1, multipliers_1); - x0 = fold_128b(x3, x2, multipliers_1); - } - - /* Fold 128 bits at a time */ - while (p != end) - x0 = fold_128b(*p++, x0, multipliers_1); - - /* Fold 128 => 96 bits, implicitly appending 32 zeroes */ - x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1); - - /* Fold 96 => 64 bits */ - x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier); - - /* Reduce 64 => 32 bits using Barrett reduction */ - x1 = x0; - x0 = clmul_00(x0 & mask32, barrett_reduction_constants); - x0 = clmul_10(x0 & mask32, barrett_reduction_constants); - return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1); -} -#define IMPL_ALIGNMENT 16 -#define IMPL_SEGMENT_SIZE 16 -#include "../crc32_vec_template.h" -#endif /* PMULL implementation */ - -#ifdef DISPATCH -static inline crc32_func_t -arch_select_crc32_func(void) -{ - u32 features = get_cpu_features(); - -#ifdef DISPATCH_PMULL - if (features & ARM_CPU_FEATURE_PMULL) - return crc32_pmull; -#endif - return NULL; -} -#endif /* DISPATCH */ diff --git a/ext/libdeflate/lib/arm/matchfinder_impl.h b/ext/libdeflate/lib/arm/matchfinder_impl.h deleted file mode 100644 index aa1a0c72..00000000 --- a/ext/libdeflate/lib/arm/matchfinder_impl.h +++ /dev/null @@ -1,93 +0,0 @@ -/* - * arm/matchfinder_impl.h - ARM implementations of matchfinder functions - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
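
As a ground-truth reference for what the folded PMULL code above computes: the gzip CRC-32 is, by definition, a bit-at-a-time loop over the reflected polynomial 0xEDB88320, and the vectorized variants must produce identical output. This reference implementation is standard knowledge, not taken from the deleted files:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Bit-at-a-time CRC-32 (gzip/zlib polynomial, reflected form). */
static uint32_t crc32_bitwise(uint32_t crc, const uint8_t *p, size_t n)
{
        int k;

        crc = ~crc;
        while (n--) {
                crc ^= *p++;
                for (k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
        }
        return ~crc;
}

int main(void)
{
        /* CRC-32 of "123456789" is the classic check value 0xCBF43926. */
        const uint8_t msg[] = "123456789";

        printf("0x%08X\n", crc32_bitwise(0, msg, 9));
        return 0;
}
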
- */ - -#ifdef __ARM_NEON -# if MATCHFINDER_ALIGNMENT < 16 -# undef MATCHFINDER_ALIGNMENT -# define MATCHFINDER_ALIGNMENT 16 -# endif -# include <arm_neon.h> -static forceinline bool -matchfinder_init_neon(mf_pos_t *data, size_t size) -{ - int16x8_t v, *p; - size_t n; - - if (size % (sizeof(int16x8_t) * 4) != 0) - return false; - - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = (int16x8_t) { - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, - }; - p = (int16x8_t *)data; - n = size / (sizeof(int16x8_t) * 4); - do { - p[0] = v; - p[1] = v; - p[2] = v; - p[3] = v; - p += 4; - } while (--n); - return true; -} -#undef arch_matchfinder_init -#define arch_matchfinder_init matchfinder_init_neon - -static forceinline bool -matchfinder_rebase_neon(mf_pos_t *data, size_t size) -{ - int16x8_t v, *p; - size_t n; - - if (size % (sizeof(int16x8_t) * 4) != 0) - return false; - - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - v = (int16x8_t) { - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - (u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE, - }; - p = (int16x8_t *)data; - n = size / (sizeof(int16x8_t) * 4); - do { - p[0] = vqaddq_s16(p[0], v); - p[1] = vqaddq_s16(p[1], v); - p[2] = vqaddq_s16(p[2], v); - p[3] = vqaddq_s16(p[3], v); - p += 4; - } while (--n); - return true; -} -#undef arch_matchfinder_rebase -#define arch_matchfinder_rebase matchfinder_rebase_neon - -#endif /* __ARM_NEON */ diff --git a/ext/libdeflate/lib/bt_matchfinder.h b/ext/libdeflate/lib/bt_matchfinder.h deleted file mode 100644 index 49fc0bf4..00000000 --- a/ext/libdeflate/lib/bt_matchfinder.h +++ /dev/null @@ -1,355 +0,0 @@ -/* - * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees - * - * Originally public domain; changes after 2016-09-07 are copyrighted. - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * ---------------------------------------------------------------------------- - * - * This is a Binary Trees (bt) based matchfinder. - * - * The main data structure is a hash table where each hash bucket contains a - * binary tree of sequences whose first 4 bytes share the same hash code. Each - * sequence is identified by its starting position in the input buffer.
Each - * binary tree is always sorted such that each left child represents a sequence - * lexicographically lesser than its parent and each right child represents a - * sequence lexicographically greater than its parent. - * - * The algorithm processes the input buffer sequentially. At each byte - * position, the hash code of the first 4 bytes of the sequence beginning at - * that position (the sequence being matched against) is computed. This - * identifies the hash bucket to use for that position. Then, a new binary tree - * node is created to represent the current sequence. Then, in a single tree - * traversal, the hash bucket's binary tree is searched for matches and is - * re-rooted at the new node. - * - * Compared to the simpler algorithm that uses linked lists instead of binary - * trees (see hc_matchfinder.h), the binary tree version gains more information - * at each node visitation. Ideally, the binary tree version will examine only - * 'log(n)' nodes to find the same matches that the linked list version will - * find by examining 'n' nodes. In addition, the binary tree version can - * examine fewer bytes at each node by taking advantage of the common prefixes - * that result from the sort order, whereas the linked list version may have to - * examine up to the full length of the match at each node. - * - * However, it is not always best to use the binary tree version. It requires - * nearly twice as much memory as the linked list version, and it takes time to - * keep the binary trees sorted, even at positions where the compressor does not - * need matches. Generally, when doing fast compression on small buffers, - * binary trees are the wrong approach. They are best suited for thorough - * compression and/or large buffers. - * - * ---------------------------------------------------------------------------- - */ - - -#include "matchfinder_common.h" - -#define BT_MATCHFINDER_HASH3_ORDER 16 -#define BT_MATCHFINDER_HASH3_WAYS 2 -#define BT_MATCHFINDER_HASH4_ORDER 16 - -#define BT_MATCHFINDER_TOTAL_HASH_LENGTH \ - ((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \ - (1UL << BT_MATCHFINDER_HASH4_ORDER)) - -/* Representation of a match found by the bt_matchfinder */ -struct lz_match { - - /* The number of bytes matched. */ - u16 length; - - /* The offset back from the current position that was matched. */ - u16 offset; -}; - -struct bt_matchfinder { - - /* The hash table for finding length 3 matches */ - mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; - - /* The hash table which contains the roots of the binary trees for - * finding length 4+ matches */ - mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; - - /* The child node references for the binary trees. The left and right - * children of the node for the sequence with position 'pos' are - * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ - mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE]; - -} -#ifdef _aligned_attribute -_aligned_attribute(MATCHFINDER_ALIGNMENT) -#endif -; - -/* Prepare the matchfinder for a new input buffer. 
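For orientation: matchfinder_init() and matchfinder_rebase(), called by the functions below, are the generic entry points from matchfinder_common.h; on NEON builds they resolve to the vectorized fills in arm/matchfinder_impl.h above. Stripped of vectorization, initialization is just a fill. The following is a sketch under that assumption, not the library's actual generic fallback:

        static void matchfinder_init_scalar(mf_pos_t *data, size_t num_entries)
        {
                size_t i;

                /* Every slot starts out as "no position in the window",
                 * so lookups in fresh hash buckets fail the cutoff test. */
                for (i = 0; i < num_entries; i++)
                        data[i] = MATCHFINDER_INITVAL;
        }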
*/ -static forceinline void -bt_matchfinder_init(struct bt_matchfinder *mf) -{ - matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH); -} - -static forceinline void -bt_matchfinder_slide_window(struct bt_matchfinder *mf) -{ - matchfinder_rebase((mf_pos_t *)mf, - sizeof(struct bt_matchfinder) / sizeof(mf_pos_t)); -} - -static forceinline mf_pos_t * -bt_left_child(struct bt_matchfinder *mf, s32 node) -{ - return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0]; -} - -static forceinline mf_pos_t * -bt_right_child(struct bt_matchfinder *mf, s32 node) -{ - return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1]; -} - -/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() - * and bt_matchfinder_skip_position(). There must be sufficiently many bytes - * remaining to load a 32-bit integer from the *next* position. */ -#define BT_MATCHFINDER_REQUIRED_NBYTES 5 - -/* Advance the binary tree matchfinder by one byte, optionally recording - * matches. @record_matches should be a compile-time constant. */ -static forceinline struct lz_match * -bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf, - const u8 * const restrict in_base, - const ptrdiff_t cur_pos, - const u32 max_len, - const u32 nice_len, - const u32 max_search_depth, - u32 * const restrict next_hashes, - u32 * const restrict best_len_ret, - struct lz_match * restrict lz_matchptr, - const bool record_matches) -{ - const u8 *in_next = in_base + cur_pos; - u32 depth_remaining = max_search_depth; - const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; - u32 next_hashseq; - u32 hash3; - u32 hash4; - s32 cur_node; -#if BT_MATCHFINDER_HASH3_WAYS >= 2 - s32 cur_node_2; -#endif - const u8 *matchptr; - mf_pos_t *pending_lt_ptr, *pending_gt_ptr; - u32 best_lt_len, best_gt_len; - u32 len; - u32 best_len = 3; - - STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && - BT_MATCHFINDER_HASH3_WAYS <= 2); - - next_hashseq = get_unaligned_le32(in_next + 1); - - hash3 = next_hashes[0]; - hash4 = next_hashes[1]; - - next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); - next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); - prefetchw(&mf->hash3_tab[next_hashes[0]]); - prefetchw(&mf->hash4_tab[next_hashes[1]]); - - cur_node = mf->hash3_tab[hash3][0]; - mf->hash3_tab[hash3][0] = cur_pos; -#if BT_MATCHFINDER_HASH3_WAYS >= 2 - cur_node_2 = mf->hash3_tab[hash3][1]; - mf->hash3_tab[hash3][1] = cur_node; -#endif - if (record_matches && cur_node > cutoff) { - u32 seq3 = load_u24_unaligned(in_next); - if (seq3 == load_u24_unaligned(&in_base[cur_node])) { - lz_matchptr->length = 3; - lz_matchptr->offset = in_next - &in_base[cur_node]; - lz_matchptr++; - } - #if BT_MATCHFINDER_HASH3_WAYS >= 2 - else if (cur_node_2 > cutoff && - seq3 == load_u24_unaligned(&in_base[cur_node_2])) - { - lz_matchptr->length = 3; - lz_matchptr->offset = in_next - &in_base[cur_node_2]; - lz_matchptr++; - } - #endif - } - - cur_node = mf->hash4_tab[hash4]; - mf->hash4_tab[hash4] = cur_pos; - - pending_lt_ptr = bt_left_child(mf, cur_pos); - pending_gt_ptr = bt_right_child(mf, cur_pos); - - if (cur_node <= cutoff) { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - *best_len_ret = best_len; - return lz_matchptr; - } - - best_lt_len = 0; - best_gt_len = 0; - len = 0; - - for (;;) { - matchptr = &in_base[cur_node]; - - if (matchptr[len] == in_next[len]) { - len = lz_extend(in_next, matchptr, len + 1, max_len); - if (!record_matches || len > 
best_len) { - if (record_matches) { - best_len = len; - lz_matchptr->length = len; - lz_matchptr->offset = in_next - matchptr; - lz_matchptr++; - } - if (len >= nice_len) { - *pending_lt_ptr = *bt_left_child(mf, cur_node); - *pending_gt_ptr = *bt_right_child(mf, cur_node); - *best_len_ret = best_len; - return lz_matchptr; - } - } - } - - if (matchptr[len] < in_next[len]) { - *pending_lt_ptr = cur_node; - pending_lt_ptr = bt_right_child(mf, cur_node); - cur_node = *pending_lt_ptr; - best_lt_len = len; - if (best_gt_len < len) - len = best_gt_len; - } else { - *pending_gt_ptr = cur_node; - pending_gt_ptr = bt_left_child(mf, cur_node); - cur_node = *pending_gt_ptr; - best_gt_len = len; - if (best_lt_len < len) - len = best_lt_len; - } - - if (cur_node <= cutoff || !--depth_remaining) { - *pending_lt_ptr = MATCHFINDER_INITVAL; - *pending_gt_ptr = MATCHFINDER_INITVAL; - *best_len_ret = best_len; - return lz_matchptr; - } - } -} - -/* - * Retrieve a list of matches with the current position. - * - * @mf - * The matchfinder structure. - * @in_base - * Pointer to the next byte in the input buffer to process _at the last - * time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_. - * @cur_pos - * The current position in the input buffer relative to @in_base (the - * position of the sequence being matched against). - * @max_len - * The maximum permissible match length at this position. Must be >= - * BT_MATCHFINDER_REQUIRED_NBYTES. - * @nice_len - * Stop searching if a match of at least this length is found. - * Must be <= @max_len. - * @max_search_depth - * Limit on the number of potential matches to consider. Must be >= 1. - * @next_hashes - * The precomputed hash codes for the sequence beginning at @in_next. - * These will be used and then updated with the precomputed hashcodes for - * the sequence beginning at @in_next + 1. - * @best_len_ret - * If a match of length >= 4 was found, then the length of the longest such - * match is written here; otherwise 3 is written here. (Note: this is - * redundant with the 'struct lz_match' array, but this is easier for the - * compiler to optimize when inlined and the caller immediately does a - * check against 'best_len'.) - * @lz_matchptr - * An array in which this function will record the matches. The recorded - * matches will be sorted by strictly increasing length and (non-strictly) - * increasing offset. The maximum number of matches that may be found is - * 'nice_len - 2'. - * - * The return value is a pointer to the next available slot in the @lz_matchptr - * array. (If no matches were found, this will be the same as @lz_matchptr.) - */ -static forceinline struct lz_match * -bt_matchfinder_get_matches(struct bt_matchfinder *mf, - const u8 *in_base, - ptrdiff_t cur_pos, - u32 max_len, - u32 nice_len, - u32 max_search_depth, - u32 next_hashes[2], - u32 *best_len_ret, - struct lz_match *lz_matchptr) -{ - return bt_matchfinder_advance_one_byte(mf, - in_base, - cur_pos, - max_len, - nice_len, - max_search_depth, - next_hashes, - best_len_ret, - lz_matchptr, - true); -} - -/* - * Advance the matchfinder, but don't record any matches. - * - * This is very similar to bt_matchfinder_get_matches() because both functions - * must do hashing and tree re-rooting. 
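To make the calling convention concrete, here is a minimal driver loop in the style a compressor might use around these two entry points. It is a sketch only: 'in', 'in_nbytes', 'max_len', 'nice_len', 'depth' and the emit_literal()/emit_match() calls are hypothetical stand-ins, and the end-of-buffer trimming of max_len that a real caller must do is omitted.

        struct lz_match matches[DEFLATE_MAX_MATCH_LEN], *end;
        u32 next_hashes[2] = { 0, 0 };
        u32 best_len;
        ptrdiff_t pos = 0;

        bt_matchfinder_init(mf);
        while (pos + BT_MATCHFINDER_REQUIRED_NBYTES <= (ptrdiff_t)in_nbytes) {
                end = bt_matchfinder_get_matches(mf, in, pos, max_len,
                                                 nice_len, depth, next_hashes,
                                                 &best_len, matches);
                if (end == matches) {
                        emit_literal(in[pos++]);   /* no match recorded */
                        continue;
                }
                /* Matches are sorted by increasing length; take the last
                 * (longest) one, then advance the trees through the bytes
                 * it covers without recording further matches. */
                emit_match(end[-1].length, end[-1].offset);
                for (u32 n = end[-1].length - 1; n--; )
                        bt_matchfinder_skip_position(mf, in, ++pos, nice_len,
                                                     depth, next_hashes);
                pos++;
        }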
- */ -static forceinline void -bt_matchfinder_skip_position(struct bt_matchfinder *mf, - const u8 *in_base, - ptrdiff_t cur_pos, - u32 nice_len, - u32 max_search_depth, - u32 next_hashes[2]) -{ - u32 best_len; - bt_matchfinder_advance_one_byte(mf, - in_base, - cur_pos, - nice_len, - nice_len, - max_search_depth, - next_hashes, - &best_len, - NULL, - false); -} diff --git a/ext/libdeflate/lib/crc32.c b/ext/libdeflate/lib/crc32.c deleted file mode 100644 index 129149a1..00000000 --- a/ext/libdeflate/lib/crc32.c +++ /dev/null @@ -1,313 +0,0 @@ -/* - * crc32.c - CRC-32 checksum algorithm for the gzip format - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * High-level description of CRC - * ============================= - * - * Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message" - * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), - * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: - * - * R(x) = M(x)*x^n mod G(x) - * - * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder - * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) - * interpreted as a bitstring of length 'n'. - * - * CRC used in gzip - * ================ - * - * In the gzip format (RFC 1952): - * - * - The bitstring to checksum is formed from the bytes of the uncompressed - * data by concatenating the bits from the bytes in order, proceeding - * from the low-order bit to the high-order bit within each byte. - * - * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + - * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. - * Consequently, the CRC length is 32 bits ("CRC-32"). - * - * - The highest order 32 coefficients of M(x)*x^n are inverted. - * - * - All 32 coefficients of R(x) are inverted. - * - * The two inversions cause added leading and trailing zero bits to affect the - * resulting CRC, whereas with a regular CRC such bits would have no effect on - * the CRC. - * - * Computation and optimizations - * ============================= - * - * We can compute R(x) through "long division", maintaining only 32 bits of - * state at any given time. 
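(Connecting the two sections above: with the reflected bit order gzip uses, G(x) minus its x^32 term encodes as the constant 0xEDB88320, the bit-reversal of the conventional encoding 0x04C11DB7, and that is exactly the 'divisor' appearing in the code below.)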
Multiplication by 'x' can be implemented as - * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the - * highest order bit represents the coefficient of x^0), and both addition and - * subtraction can be implemented as bitwise exclusive OR (since we are working - * in GF(2)). Here is an unoptimized implementation: - * - * static u32 crc32_gzip(const u8 *buffer, size_t size) - * { - * u32 remainder = 0; - * const u32 divisor = 0xEDB88320; - * - * for (size_t i = 0; i < size * 8 + 32; i++) { - * int bit; - * u32 multiple; - * - * if (i < size * 8) - * bit = (buffer[i / 8] >> (i % 8)) & 1; - * else - * bit = 0; // one of the 32 appended 0 bits - * - * if (i < 32) // the first 32 bits are inverted - * bit ^= 1; - * - * if (remainder & 1) - * multiple = divisor; - * else - * multiple = 0; - * - * remainder >>= 1; - * remainder |= (u32)bit << 31; - * remainder ^= multiple; - * } - * - * return ~remainder; - * } - * - * In this implementation, the 32-bit integer 'remainder' maintains the - * remainder of the currently processed portion of the message (with 32 zero - * bits appended) when divided by the generator polynomial. 'remainder' is the - * representation of R(x), and 'divisor' is the representation of G(x) excluding - * the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1', - * then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero - * x^32 term, then we subtract G(x) from R(x). - * - * We can speed this up by taking advantage of the fact that XOR is commutative - * and associative, so the order in which we combine the inputs into 'remainder' - * is unimportant. And since each message bit we add doesn't affect the choice - * of 'multiple' until 32 bits later, we need not actually add each message bit - * until that point: - * - * static u32 crc32_gzip(const u8 *buffer, size_t size) - * { - * u32 remainder = ~0; - * const u32 divisor = 0xEDB88320; - * - * for (size_t i = 0; i < size * 8; i++) { - * int bit; - * u32 multiple; - * - * bit = (buffer[i / 8] >> (i % 8)) & 1; - * remainder ^= bit; - * if (remainder & 1) - * multiple = divisor; - * else - * multiple = 0; - * remainder >>= 1; - * remainder ^= multiple; - * } - * - * return ~remainder; - * } - * - * With the above implementation we get the effect of 32 appended 0 bits for - * free; they never affect the choice of a divisor, nor would they change the - * value of 'remainder' if they were to be actually XOR'ed in. And by starting - * with a remainder of all 1 bits, we get the effect of complementing the first - * 32 message bits. - * - * The next optimization is to process the input in multi-bit units. Suppose - * that we insert the next 'n' message bits into the remainder. Then we get an - * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n' - * bits is the amount by which the low 32 bits of the remainder will change as a - * result of cancelling out those 'n' bits. Taking n=8 (one byte) and - * precomputing a table containing the CRC of each possible byte, we get - * crc32_slice1() defined below. - * - * As a further optimization, we could increase the multi-bit unit size to 16. - * However, that is inefficient because the table size explodes from 256 entries - * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't - * fit in L1 cache on typical processors. - * - * However, we can actually process 4 bytes at a time using 4 different tables - * with 256 entries each. 
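As a quick sanity check on the two reference implementations above: the standard CRC-32 test vector is the ASCII string "123456789", whose gzip CRC is 0xCBF43926. A throwaway harness, assuming crc32_gzip() is compiled as written above and the library's u8 typedef is in scope:

        #include <assert.h>
        #include <string.h>

        int main(void)
        {
                const char *s = "123456789";

                /* Known check value for the CRC-32 used by gzip/zlib. */
                assert(crc32_gzip((const u8 *)s, strlen(s)) == 0xCBF43926);
                return 0;
        }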
Logically, we form a 64-bit intermediate remainder - * and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled - * out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the - * CRC of those bits with 8 zero bits appended, and so on. This method is - * implemented in crc32_slice4(), defined below. - * - * In crc32_slice8(), this method is extended to 8 bytes at a time. The - * intermediate remainder (which we never actually store explicitly) is 96 bits. - * - * On CPUs that support fast carryless multiplication, CRCs can be computed even - * more quickly via "folding". See e.g. the x86 PCLMUL implementation. - */ - -#include "lib_common.h" -#include "libdeflate.h" - -typedef u32 (*crc32_func_t)(u32, const u8 *, size_t); - -/* Include architecture-specific implementations if available */ -#undef CRC32_SLICE1 -#undef CRC32_SLICE4 -#undef CRC32_SLICE8 -#undef DEFAULT_IMPL -#undef DISPATCH -#if defined(__arm__) || defined(__aarch64__) -# include "arm/crc32_impl.h" -#elif defined(__i386__) || defined(__x86_64__) -# include "x86/crc32_impl.h" -#endif - -/* - * Define a generic implementation (crc32_slice8()) if needed. crc32_slice1() - * may also be needed as a fallback for architecture-specific implementations. - */ - -#ifndef DEFAULT_IMPL -# define CRC32_SLICE8 1 -# define DEFAULT_IMPL crc32_slice8 -#endif - -#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8) -#include "crc32_table.h" -static forceinline u32 -crc32_update_byte(u32 remainder, u8 next_byte) -{ - return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte]; -} -#endif - -#ifdef CRC32_SLICE1 -static u32 -crc32_slice1(u32 remainder, const u8 *buffer, size_t size) -{ - size_t i; - - STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100); - - for (i = 0; i < size; i++) - remainder = crc32_update_byte(remainder, buffer[i]); - return remainder; -} -#endif /* CRC32_SLICE1 */ - -#ifdef CRC32_SLICE4 -static u32 -crc32_slice4(u32 remainder, const u8 *buffer, size_t size) -{ - const u8 *p = buffer; - const u8 *end = buffer + size; - const u8 *end32; - - STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400); - - for (; ((uintptr_t)p & 3) && p != end; p++) - remainder = crc32_update_byte(remainder, *p); - - end32 = p + ((end - p) & ~3); - for (; p != end32; p += 4) { - u32 v = le32_bswap(*(const u32 *)p); - remainder = - crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^ - crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^ - crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^ - crc32_table[0x000 + (u8)((remainder ^ v) >> 24)]; - } - - for (; p != end; p++) - remainder = crc32_update_byte(remainder, *p); - - return remainder; -} -#endif /* CRC32_SLICE4 */ - -#ifdef CRC32_SLICE8 -static u32 -crc32_slice8(u32 remainder, const u8 *buffer, size_t size) -{ - const u8 *p = buffer; - const u8 *end = buffer + size; - const u8 *end64; - - STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800); - - for (; ((uintptr_t)p & 7) && p != end; p++) - remainder = crc32_update_byte(remainder, *p); - - end64 = p + ((end - p) & ~7); - for (; p != end64; p += 8) { - u32 v1 = le32_bswap(*(const u32 *)(p + 0)); - u32 v2 = le32_bswap(*(const u32 *)(p + 4)); - remainder = - crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^ - crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^ - crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^ - crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^ - crc32_table[0x300 + (u8)(v2 >> 0)] ^ - crc32_table[0x200 + (u8)(v2 >> 8)] ^ - crc32_table[0x100 + (u8)(v2 >> 16)] ^ - crc32_table[0x000 + (u8)(v2 >> 
24)]; - } - - for (; p != end; p++) - remainder = crc32_update_byte(remainder, *p); - - return remainder; -} -#endif /* CRC32_SLICE8 */ - -#ifdef DISPATCH -static u32 dispatch(u32, const u8 *, size_t); - -static volatile crc32_func_t crc32_impl = dispatch; - -/* Choose the fastest implementation at runtime */ -static u32 dispatch(u32 remainder, const u8 *buffer, size_t size) -{ - crc32_func_t f = arch_select_crc32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - crc32_impl = f; - return crc32_impl(remainder, buffer, size); -} -#else -# define crc32_impl DEFAULT_IMPL /* only one implementation, use it */ -#endif - -LIBDEFLATEAPI u32 -libdeflate_crc32(u32 remainder, const void *buffer, size_t size) -{ - if (buffer == NULL) /* return initial value */ - return 0; - return ~crc32_impl(~remainder, buffer, size); -} diff --git a/ext/libdeflate/lib/crc32_table.h b/ext/libdeflate/lib/crc32_table.h deleted file mode 100644 index 05421b98..00000000 --- a/ext/libdeflate/lib/crc32_table.h +++ /dev/null @@ -1,526 +0,0 @@ -/* - * crc32_table.h - data table to accelerate CRC-32 computation - * - * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT. - */ - -#include - -static const uint32_t crc32_table[] = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 
0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, -#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8) - 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, - 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, - 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, - 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, - 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, - 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, - 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, - 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, - 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, - 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, - 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, - 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, - 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, - 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, - 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, - 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, - 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, - 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, - 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, - 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, - 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, - 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, - 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, - 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, - 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, - 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, - 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, - 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, - 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, - 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, - 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, - 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, - 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, - 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, - 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, - 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, - 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, - 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, - 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, - 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, - 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, - 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, - 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, - 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, - 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, - 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, - 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, - 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, - 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, - 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, - 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, - 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, - 0x8138c51f, 
0x9823f45e, 0xb30ea79d, 0xaa1596dc, - 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, - 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, - 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, - 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, - 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, - 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, - 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, - 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, - 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, - 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, - 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, - 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, - 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, - 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, - 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, - 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, - 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, - 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, - 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, - 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, - 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, - 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, - 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, - 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, - 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, - 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, - 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, - 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, - 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, - 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, - 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, - 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, - 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, - 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, - 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, - 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, - 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, - 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, - 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, - 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, - 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, - 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, - 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, - 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, - 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, - 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, - 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, - 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, - 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, - 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, - 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, - 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, - 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, - 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, - 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, - 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, - 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, - 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, - 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, - 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, - 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, - 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, - 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, - 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, - 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, - 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, - 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, - 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, - 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, - 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, - 0xa0f829c4, 
0xa13a43f3, 0xa37cfdaa, 0xa2be979d, - 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, - 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, - 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, - 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, - 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, - 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, - 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, - 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, - 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, - 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, - 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, - 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, - 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, - 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, - 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, - 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, - 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, - 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, - 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, - 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, - 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, - 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, - 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, - 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, - 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, - 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, - 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, - 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, - 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, - 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, - 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, - 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, - 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, - 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, - 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, - 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, - 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, - 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, - 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, - 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, - 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, - 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, - 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, - 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, - 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, - 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, - 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, - 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, - 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, - 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, - 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, - 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, - 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, - 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, - 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, - 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, - 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, - 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, - 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, - 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, - 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, - 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, - 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, - 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, - 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, - 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, - 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, - 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, -#endif /* CRC32_SLICE4 || CRC32_SLICE8 */ -#if defined(CRC32_SLICE8) - 0x00000000, 0x3d6029b0, 0x7ac05360, 
0x47a07ad0, - 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, - 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, - 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, - 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, - 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, - 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, - 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, - 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, - 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, - 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, - 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, - 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, - 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, - 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, - 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, - 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, - 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, - 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, - 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, - 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, - 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, - 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, - 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, - 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, - 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, - 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, - 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, - 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, - 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, - 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, - 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, - 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, - 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, - 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, - 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, - 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, - 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, - 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, - 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, - 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, - 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, - 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, - 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, - 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, - 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, - 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, - 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, - 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, - 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, - 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, - 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, - 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, - 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, - 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, - 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, - 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, - 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, - 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, - 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, - 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, - 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, - 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, - 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, - 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, - 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, - 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, - 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, - 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, - 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, - 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, - 0x7414c2e0, 0xbf481145, 0x39dc63eb, 
0xf280b04e, - 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, - 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, - 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, - 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, - 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, - 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, - 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, - 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, - 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, - 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, - 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, - 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, - 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, - 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, - 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, - 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, - 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, - 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, - 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, - 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, - 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, - 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, - 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, - 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, - 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, - 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, - 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, - 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, - 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, - 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, - 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, - 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, - 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, - 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, - 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, - 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, - 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, - 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, - 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, - 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, - 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, - 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, - 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, - 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, - 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, - 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, - 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, - 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, - 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, - 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, - 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, - 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, - 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, - 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, - 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, - 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, - 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, - 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, - 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, - 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, - 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, - 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, - 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, - 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, - 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, - 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, - 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, - 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, - 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, - 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, - 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 
0xaa64f1a8, - 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, - 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, - 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, - 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, - 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, - 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, - 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, - 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, - 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, - 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, - 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, - 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, - 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, - 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, - 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, - 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, - 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, - 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, - 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, - 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, - 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, - 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, - 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, - 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, - 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, - 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, - 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, - 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, - 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, - 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, - 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, - 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, - 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, - 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, - 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, - 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, - 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, - 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, - 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, - 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, - 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, - 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, - 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, - 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, - 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, - 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, - 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, - 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, - 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, - 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, - 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, - 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, - 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, - 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, - 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, - 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, - 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, - 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, - 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, - 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, - 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, - 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, - 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, - 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, - 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, - 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, - 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, - 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, - 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, - 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, - 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 
0x5fe3fadf, - 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, - 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, - 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, - 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, - 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, - 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, - 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, - 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, - 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, - 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, - 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, - 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, - 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, - 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, - 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, - 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, - 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, - 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, - 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, - 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, - 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, - 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, - 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, - 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, - 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, - 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, - 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, - 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, - 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, - 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, - 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, - 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, - 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, - 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, - 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, - 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, - 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, - 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, - 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, - 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, - 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, - 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, -#endif /* CRC32_SLICE8 */ -}; diff --git a/ext/libdeflate/lib/crc32_vec_template.h b/ext/libdeflate/lib/crc32_vec_template.h deleted file mode 100644 index 9a2ad5bd..00000000 --- a/ext/libdeflate/lib/crc32_vec_template.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * crc32_vec_template.h - template for vectorized CRC-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#define CRC32_SLICE1 1 -static u32 crc32_slice1(u32, const u8 *, size_t); - -/* - * Template for vectorized CRC-32 implementations. - * - * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead - * of crc32_slice8() because only a few bytes need to be processed, so a smaller - * table is preferable. - */ -static u32 ATTRIBUTES -FUNCNAME(u32 remainder, const u8 *p, size_t size) -{ - if ((uintptr_t)p % IMPL_ALIGNMENT) { - size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT); - - remainder = crc32_slice1(remainder, p, n); - p += n; - size -= n; - } - if (size >= IMPL_SEGMENT_SIZE) { - remainder = FUNCNAME_ALIGNED(remainder, (const void *)p, - size / IMPL_SEGMENT_SIZE); - p += size - (size % IMPL_SEGMENT_SIZE); - size %= IMPL_SEGMENT_SIZE; - } - return crc32_slice1(remainder, p, size); -} - -#undef FUNCNAME -#undef FUNCNAME_ALIGNED -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_SIZE diff --git a/ext/libdeflate/lib/decompress_template.h b/ext/libdeflate/lib/decompress_template.h deleted file mode 100644 index c6bcf9f5..00000000 --- a/ext/libdeflate/lib/decompress_template.h +++ /dev/null @@ -1,421 +0,0 @@ -/* - * decompress_template.h - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This is the actual DEFLATE decompression routine, lifted out of - * deflate_decompress.c so that it can be compiled multiple times with different - * target instruction sets. - */ - -static enum libdeflate_result ATTRIBUTES -FUNCNAME(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) -{ - u8 *out_next = out; - u8 * const out_end = out_next + out_nbytes_avail; - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - bitbuf_t bitbuf = 0; - unsigned bitsleft = 0; - size_t overrun_count = 0; - unsigned i; - unsigned is_final_block; - unsigned block_type; - u16 len; - u16 nlen; - unsigned num_litlen_syms; - unsigned num_offset_syms; - u16 tmp16; - u32 tmp32; - -next_block: - /* Starting to read the next block. 
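The block-parsing code that follows drives everything through a handful of bit-reading macros (ENSURE_BITS, BITS, POP_BITS, REMOVE_BITS, plus ALIGN_INPUT and READ_U16 for stored blocks) defined in deflate_decompress.c. Simplified models of the core four are given here for orientation only; the real macros refill a machine word at a time and track 'overrun_count' once the input is exhausted:

        /* 'bitbuf' holds 'bitsleft' valid low-order bits of the stream. */
        #define ENSURE_BITS(n)                                            \
                while (bitsleft < (n)) {                                  \
                        bitbuf |= (bitbuf_t)*in_next++ << bitsleft;       \
                        bitsleft += 8;                                    \
                }
        #define BITS(n)        ((u32)bitbuf & (((u32)1 << (n)) - 1))
        #define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
        #define POP_BITS(n)    (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)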
*/ - ; - - STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4)); - ENSURE_BITS(1 + 2 + 5 + 5 + 4); - - /* BFINAL: 1 bit */ - is_final_block = POP_BITS(1); - - /* BTYPE: 2 bits */ - block_type = POP_BITS(2); - - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { - - /* Dynamic Huffman block. */ - - /* The order in which precode lengths are stored. */ - static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - }; - - unsigned num_explicit_precode_lens; - - /* Read the codeword length counts. */ - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257); - num_litlen_syms = POP_BITS(5) + 257; - - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1); - num_offset_syms = POP_BITS(5) + 1; - - STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4); - num_explicit_precode_lens = POP_BITS(4) + 4; - - d->static_codes_loaded = false; - - /* Read the precode codeword lengths. */ - STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); - for (i = 0; i < num_explicit_precode_lens; i++) { - ENSURE_BITS(3); - d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3); - } - - for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) - d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; - - /* Build the decode table for the precode. */ - SAFETY_CHECK(build_precode_decode_table(d)); - - /* Expand the literal/length and offset codeword lengths. */ - for (i = 0; i < num_litlen_syms + num_offset_syms; ) { - u32 entry; - unsigned presym; - u8 rep_val; - unsigned rep_count; - - ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7); - - /* (The code below assumes that the precode decode table - * does not have any subtables.) */ - STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); - - /* Read the next precode symbol. */ - entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)]; - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - presym = entry >> HUFFDEC_RESULT_SHIFT; - - if (presym < 16) { - /* Explicit codeword length */ - d->u.l.lens[i++] = presym; - continue; - } - - /* Run-length encoded codeword lengths */ - - /* Note: we don't need verify that the repeat count - * doesn't overflow the number of elements, since we - * have enough extra spaces to allow for the worst-case - * overflow (138 zeroes when only 1 length was - * remaining). - * - * In the case of the small repeat counts (presyms 16 - * and 17), it is fastest to always write the maximum - * number of entries. That gets rid of branches that - * would otherwise be required. - * - * It is not just because of the numerical order that - * our checks go in the order 'presym < 16', 'presym == - * 16', and 'presym == 17'. For typical data this is - * ordered from most frequent to least frequent case. 
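A worked instance of the bound referenced above: if only one codeword length remains to be filled and presym 18 arrives with all seven extra bits set, rep_count is 11 + 127 = 138, so 137 entries land past the logical end of the array. That is exactly the slack the STATIC_ASSERT below pins down: DEFLATE_MAX_LENS_OVERRUN == 138 - 1.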
- */ - STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); - - if (presym == 16) { - /* Repeat the previous length 3 - 6 times */ - SAFETY_CHECK(i != 0); - rep_val = d->u.l.lens[i - 1]; - STATIC_ASSERT(3 + ((1 << 2) - 1) == 6); - rep_count = 3 + POP_BITS(2); - d->u.l.lens[i + 0] = rep_val; - d->u.l.lens[i + 1] = rep_val; - d->u.l.lens[i + 2] = rep_val; - d->u.l.lens[i + 3] = rep_val; - d->u.l.lens[i + 4] = rep_val; - d->u.l.lens[i + 5] = rep_val; - i += rep_count; - } else if (presym == 17) { - /* Repeat zero 3 - 10 times */ - STATIC_ASSERT(3 + ((1 << 3) - 1) == 10); - rep_count = 3 + POP_BITS(3); - d->u.l.lens[i + 0] = 0; - d->u.l.lens[i + 1] = 0; - d->u.l.lens[i + 2] = 0; - d->u.l.lens[i + 3] = 0; - d->u.l.lens[i + 4] = 0; - d->u.l.lens[i + 5] = 0; - d->u.l.lens[i + 6] = 0; - d->u.l.lens[i + 7] = 0; - d->u.l.lens[i + 8] = 0; - d->u.l.lens[i + 9] = 0; - i += rep_count; - } else { - /* Repeat zero 11 - 138 times */ - STATIC_ASSERT(11 + ((1 << 7) - 1) == 138); - rep_count = 11 + POP_BITS(7); - memset(&d->u.l.lens[i], 0, - rep_count * sizeof(d->u.l.lens[i])); - i += rep_count; - } - } - } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - - /* Uncompressed block: copy 'len' bytes literally from the input - * buffer to the output buffer. */ - - ALIGN_INPUT(); - - SAFETY_CHECK(in_end - in_next >= 4); - - len = READ_U16(); - nlen = READ_U16(); - - SAFETY_CHECK(len == (u16)~nlen); - if (unlikely(len > out_end - out_next)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - SAFETY_CHECK(len <= in_end - in_next); - - memcpy(out_next, in_next, len); - in_next += len; - out_next += len; - - goto block_done; - - } else { - SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); - - /* - * Static Huffman block: build the decode tables for the static - * codes. Skip doing so if the tables are already set up from - * an earlier static block; this speeds up decompression of - * degenerate input of many empty or very short static blocks. - * - * Afterwards, the remainder is the same as decompressing a - * dynamic Huffman block. - */ - - if (d->static_codes_loaded) - goto have_decode_tables; - - d->static_codes_loaded = true; - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); - - for (i = 0; i < 144; i++) - d->u.l.lens[i] = 8; - for (; i < 256; i++) - d->u.l.lens[i] = 9; - for (; i < 280; i++) - d->u.l.lens[i] = 7; - for (; i < 288; i++) - d->u.l.lens[i] = 8; - - for (; i < 288 + 32; i++) - d->u.l.lens[i] = 5; - - num_litlen_syms = 288; - num_offset_syms = 32; - } - - /* Decompressing a Huffman block (either dynamic or static) */ - - SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); - SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); -have_decode_tables: - - /* The main DEFLATE decode loop */ - for (;;) { - u32 entry; - u32 length; - u32 offset; - const u8 *src; - u8 *dst; - - /* Decode a litlen symbol. 
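The stored-block path above validates the header with SAFETY_CHECK(len == (u16)~nlen), i.e. NLEN must be the ones' complement of LEN. A minimal sketch of that check, assuming little-endian 16-bit fields as READ_U16() produces; stored_header_ok() is an illustrative helper, not a libdeflate function.

#include <stdint.h>
#include <stdio.h>

/* A stored (BTYPE = 0) block begins with LEN and NLEN, where NLEN is the
 * ones' complement of LEN.  Checking len == (u16)~nlen rejects most
 * corrupted headers. */
static int stored_header_ok(const uint8_t *p, uint16_t *len_out)
{
	uint16_t len  = (uint16_t)(p[0] | (p[1] << 8));
	uint16_t nlen = (uint16_t)(p[2] | (p[3] << 8));

	if (len != (uint16_t)~nlen)
		return 0;
	*len_out = len;
	return 1;
}

int main(void)
{
	static const uint8_t hdr[4] = { 0x05, 0x00, 0xFA, 0xFF };
	uint16_t len = 0;
	int ok = stored_header_ok(hdr, &len);

	printf("valid=%d len=%u\n", ok, len);	/* valid=1 len=5 */
	return 0;
}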
*/ - ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN); - entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)]; - if (entry & HUFFDEC_SUBTABLE_POINTER) { - /* Litlen subtable required (uncommon case) */ - REMOVE_BITS(LITLEN_TABLEBITS); - entry = d->u.litlen_decode_table[ - ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + - BITS(entry & HUFFDEC_LENGTH_MASK)]; - } - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - if (entry & HUFFDEC_LITERAL) { - /* Literal */ - if (unlikely(out_next == out_end)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - *out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT); - continue; - } - - /* Match or end-of-block */ - - entry >>= HUFFDEC_RESULT_SHIFT; - ENSURE_BITS(MAX_ENSURE); - - /* Pop the extra length bits and add them to the length base to - * produce the full length. */ - length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) + - POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK); - - /* The match destination must not end after the end of the - * output buffer. For efficiency, combine this check with the - * end-of-block check. We're using 0 for the special - * end-of-block length, so subtract 1 to turn it into - * SIZE_MAX. */ - STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0); - if (unlikely((size_t)length - 1 >= out_end - out_next)) { - if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - goto block_done; - } - - /* Decode the match offset. */ - - entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)]; - if (entry & HUFFDEC_SUBTABLE_POINTER) { - /* Offset subtable required (uncommon case) */ - REMOVE_BITS(OFFSET_TABLEBITS); - entry = d->offset_decode_table[ - ((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) + - BITS(entry & HUFFDEC_LENGTH_MASK)]; - } - REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK); - entry >>= HUFFDEC_RESULT_SHIFT; - - STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + - DEFLATE_MAX_OFFSET_CODEWORD_LEN) && - CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS)); - if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS + - DEFLATE_MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS); - - /* Pop the extra offset bits and add them to the offset base to - * produce the full offset. */ - offset = (entry & HUFFDEC_OFFSET_BASE_MASK) + - POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT); - - /* The match source must not begin before the beginning of the - * output buffer. */ - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - - /* - * Copy the match: 'length' bytes at 'out_next - offset' to - * 'out_next', possibly overlapping. If the match doesn't end - * too close to the end of the buffer and (offset >= WORDBYTES || - * offset == 1), take a fast path which copies a word at a time - * -- potentially more than the length of the match, but that's - * fine as long as we check for enough extra space. - * - * The remaining cases are not performance-critical so are - * handled by a simple byte-by-byte copy. - */ - - src = out_next - offset; - dst = out_next; - out_next += length; - - if (UNALIGNED_ACCESS_IS_FAST && - /* max overrun is writing 3 words for a min length match */ - likely(out_end - out_next >= - 3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) { - if (offset >= WORDBYTES) { /* words don't overlap?
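The combined bounds/end-of-block test above is worth spelling out: since HUFFDEC_END_OF_BLOCK_LENGTH is 0, subtracting 1 in unsigned arithmetic wraps it to SIZE_MAX, so one comparison covers all three outcomes. A small demonstration under the same convention (0 = end of block):

#include <stdio.h>
#include <stddef.h>

/* One unsigned comparison, as in the decode loop above:
 * - length == 0 (end of block) wraps to SIZE_MAX and always triggers;
 * - any real length larger than the remaining space also triggers;
 * - everything else falls through to the match copy. */
static const char *classify(size_t length, size_t space_left)
{
	if ((size_t)(length - 1) >= space_left)
		return length == 0 ? "end of block" : "insufficient space";
	return "match fits";
}

int main(void)
{
	printf("%s\n", classify(0, 100));	/* end of block */
	printf("%s\n", classify(50, 100));	/* match fits */
	printf("%s\n", classify(258, 100));	/* insufficient space */
	return 0;
}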
*/ - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - do { - copy_word_unaligned(src, dst); - src += WORDBYTES; - dst += WORDBYTES; - } while (dst < out_next); - } else if (offset == 1) { - /* RLE encoding of previous byte, common if the - * data contains many repeated bytes */ - machine_word_t v = repeat_byte(*src); - - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - do { - store_word_unaligned(v, dst); - dst += WORDBYTES; - } while (dst < out_next); - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - } else { - STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - } - -block_done: - /* Finished decoding a block. */ - - if (!is_final_block) - goto next_block; - - /* That was the last block. */ - - /* Discard any readahead bits and check for excessive overread */ - ALIGN_INPUT(); - - /* Optionally return the actual number of bytes read */ - if (actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - /* Optionally return the actual number of bytes written */ - if (actual_out_nbytes_ret) { - *actual_out_nbytes_ret = out_next - (u8 *)out; - } else { - if (out_next != out_end) - return LIBDEFLATE_SHORT_OUTPUT; - } - return LIBDEFLATE_SUCCESS; -} - -#undef FUNCNAME -#undef ATTRIBUTES diff --git a/ext/libdeflate/lib/deflate_compress.c b/ext/libdeflate/lib/deflate_compress.c deleted file mode 100644 index 5049b13e..00000000 --- a/ext/libdeflate/lib/deflate_compress.c +++ /dev/null @@ -1,2826 +0,0 @@ -/* - * deflate_compress.c - a compressor for DEFLATE - * - * Originally public domain; changes after 2016-09-07 are copyrighted. - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include -#include - -#include "aligned_malloc.h" -#include "deflate_compress.h" -#include "deflate_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -/* - * By default, the near-optimal parsing algorithm is enabled at compression - * level 8 and above. The near-optimal parsing algorithm produces a compression - * ratio significantly better than the greedy and lazy algorithms implemented - * here, and also the algorithm used by zlib at level 9. However, it is slow. 
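The word-at-a-time paths in the copy loop above must preserve the semantics of a byte-by-byte, front-to-back LZ77 copy, where the source may overlap the destination. The reference sketch below shows why offset == 1 degenerates into repeating the previous byte, which is what the repeat_byte() path reproduces; lz_copy() is illustrative, not libdeflate code.

#include <stdio.h>
#include <string.h>

/* Reference LZ77 match copy: byte by byte, front to back.  Because the
 * source may overlap the destination, each copied byte can itself become
 * a source byte for a later one. */
static void lz_copy(char *out, size_t pos, size_t offset, size_t length)
{
	while (length--) {
		out[pos] = out[pos - offset];
		pos++;
	}
}

int main(void)
{
	char buf[32] = "ab";

	lz_copy(buf, 2, 1, 6);	/* offset 1: repeat 'b' six times */
	buf[8] = '\0';
	printf("%s\n", buf);	/* abbbbbbb */

	memcpy(buf, "abc", 3);
	lz_copy(buf, 3, 3, 6);	/* offset 3: period-3 repetition */
	buf[9] = '\0';
	printf("%s\n", buf);	/* abcabcabc */
	return 0;
}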
- */ -#define SUPPORT_NEAR_OPTIMAL_PARSING 1 - - /* - * Define to 1 to maintain the full map from match offsets to offset slots. - * This slightly speeds up translations of match offsets to offset slots, but it - * uses 32769 bytes of memory rather than the 512 bytes used by the condensed - * map. The speedup provided by the larger map is most helpful when the - * near-optimal parsing algorithm is being used. - */ -#define USE_FULL_OFFSET_SLOT_FAST SUPPORT_NEAR_OPTIMAL_PARSING - - /* - * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters - * appropriately. - */ -#define MATCHFINDER_WINDOW_ORDER 15 - -#include "hc_matchfinder.h" -#if SUPPORT_NEAR_OPTIMAL_PARSING -# include "bt_matchfinder.h" -#endif - - /* - * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes, - * except if the last block has to be shorter. - */ -#define MIN_BLOCK_LENGTH 10000 - - /* - * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but - * the final length might be slightly longer due to matches extending beyond - * this limit. - */ -#define SOFT_MAX_BLOCK_LENGTH 300000 - - /* - * The number of observed matches or literals that represents sufficient data to - * decide whether the current block should be terminated or not. - */ -#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 - - -#if SUPPORT_NEAR_OPTIMAL_PARSING -/* Constants specific to the near-optimal parsing algorithm */ - - /* - * The maximum number of matches the matchfinder can find at a single position. - * Since the matchfinder never finds more than one match for the same length, - * presuming one of each possible length is sufficient for an upper bound. - * (This says nothing about whether it is worthwhile to consider so many - * matches; this is just defining the worst case.) - */ -# define MAX_MATCHES_PER_POS (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) - - /* - * The number of lz_match structures in the match cache, excluding the extra - * "overflow" entries. This value should be high enough so that nearly all the - * time, all matches found in a given block can fit in the match cache. - * However, fallback behavior (immediately terminating the block) on cache - * overflow is still required. - */ -# define CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - - /* - * These are the compressor-side limits on the codeword lengths for each Huffman - * code. To make outputting bits slightly faster, some of these limits are - * lower than the limits defined by the DEFLATE format. This does not - * significantly affect the compression ratio, at least for the block lengths we - * use.
- */ -#define MAX_LITLEN_CODEWORD_LEN 14 -#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN -#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN - -/* Table: length slot => length slot base value */ -static const unsigned deflate_length_slot_base[] = { - 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , - 11 , 13 , 15 , 17 , 19 , 23 , 27 , 31 , - 35 , 43 , 51 , 59 , 67 , 83 , 99 , 115 , - 131 , 163 , 195 , 227 , 258 , -}; - -/* Table: length slot => number of extra length bits */ -static const u8 deflate_extra_length_bits[] = { - 0 , 0 , 0 , 0 , 0 , 0 , 0 , 0 , - 1 , 1 , 1 , 1 , 2 , 2 , 2 , 2 , - 3 , 3 , 3 , 3 , 4 , 4 , 4 , 4 , - 5 , 5 , 5 , 5 , 0 , -}; - -/* Table: offset slot => offset slot base value */ -static const unsigned deflate_offset_slot_base[] = { - 1 , 2 , 3 , 4 , 5 , 7 , 9 , 13 , - 17 , 25 , 33 , 49 , 65 , 97 , 129 , 193 , - 257 , 385 , 513 , 769 , 1025 , 1537 , 2049 , 3073 , - 4097 , 6145 , 8193 , 12289 , 16385 , 24577 , -}; - -/* Table: offset slot => number of extra offset bits */ -static const u8 deflate_extra_offset_bits[] = { - 0 , 0 , 0 , 0 , 1 , 1 , 2 , 2 , - 3 , 3 , 4 , 4 , 5 , 5 , 6 , 6 , - 7 , 7 , 8 , 8 , 9 , 9 , 10 , 10 , - 11 , 11 , 12 , 12 , 13 , 13 , -}; - -/* Table: length => length slot */ -static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, - 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, - 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, - 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, - 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 28, -}; - -/* The order in which precode codeword lengths are stored */ -static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 -}; - -/* Codewords for the DEFLATE Huffman codes. */ -struct deflate_codewords { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* Codeword lengths (in bits) for the DEFLATE Huffman codes. - * A zero length means the corresponding symbol had zero frequency. */ -struct deflate_lens { - u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u8 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* Codewords and lengths for the DEFLATE Huffman codes. */ -struct deflate_codes { - struct deflate_codewords codewords; - struct deflate_lens lens; -}; - -/* Symbol frequency counters for the DEFLATE Huffman codes. */ -struct deflate_freqs { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; -}; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* Costs for the near-optimal parsing algorithm. */ -struct deflate_costs { - - /* The cost to output each possible literal. 
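The three tables above interlock as follows: deflate_length_slot[] maps a match length to its slot, and the length is reconstructed as the slot's base value plus the extra-bits value. A standalone round-trip check over all legal lengths; the linear slot search below is exactly what the precomputed deflate_length_slot[] table avoids.

#include <stdio.h>

/* Length-slot base values and extra-bit counts, copied from the tables
 * above. */
static const unsigned base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
	35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
};
static const unsigned extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0,
};

/* Slot lookup by search; only valid for len in [3, 258]. */
static unsigned length_to_slot(unsigned len)
{
	unsigned slot = 28;

	while (base[slot] > len)
		slot--;
	return slot;
}

int main(void)
{
	unsigned len;

	for (len = 3; len <= 258; len++) {
		unsigned slot = length_to_slot(len);
		unsigned extra_val = len - base[slot];

		if (extra_val >= (1U << extra[slot])) {
			printf("length %u does not fit slot %u\n", len, slot);
			return 1;
		}
	}
	printf("all lengths 3..258 round-trip\n");
	return 0;
}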
*/ - u32 literal[DEFLATE_NUM_LITERALS]; - - /* The cost to output each possible match length. */ - u32 length[DEFLATE_MAX_MATCH_LEN + 1]; - - /* The cost to output a match offset of each possible offset slot. */ - u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; -}; - -/* - * COST_SHIFT is a scaling factor that makes it possible to consider fractional - * bit costs. A token requiring 'n' bits to represent has cost n << COST_SHIFT. - * - * Note: this is only useful as a statistical trick for when the true costs are - * unknown. In reality, each token in DEFLATE requires a whole number of bits - * to output. - */ -#define COST_SHIFT 3 - -/* - * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to - * be needed to output a symbol that was unused in the previous optimization - * pass. Assigning a default cost allows the symbol to be used in the next - * optimization pass. However, the cost should be relatively high because the - * symbol probably won't be used very many times (if at all). - */ -#define LITERAL_NOSTAT_BITS 13 -#define LENGTH_NOSTAT_BITS 13 -#define OFFSET_NOSTAT_BITS 10 - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* - * Represents a run of literals followed by a match or end-of-block. This - * struct is needed to temporarily store items chosen by the parser, since items - * cannot be written until all items for the block have been chosen and the - * block's Huffman codes have been computed. - */ -struct deflate_sequence { - - /* Bits 0..22: the number of literals in this run. This may be 0 and - * can be at most about SOFT_MAX_BLOCK_LENGTH. The literals are not - * stored explicitly in this structure; instead, they are read directly - * from the uncompressed data. - * - * Bits 23..31: the length of the match which follows the literals, or 0 - * if this literal run was the last in the block, so there is no match - * which follows it. */ - u32 litrunlen_and_length; - - /* If 'length' doesn't indicate end-of-block, then this is the offset of - * the match which follows the literals. */ - u16 offset; - - /* If 'length' doesn't indicate end-of-block, then this is the offset - * symbol of the match which follows the literals. */ - u8 offset_symbol; - - /* If 'length' doesn't indicate end-of-block, then this is the length - * slot of the match which follows the literals. */ - u8 length_slot; -}; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* - * This structure represents a byte position in the input data and a node in the - * graph of possible match/literal choices for the current block. - * - * Logically, each incoming edge to this node is labeled with a literal or a - * match that can be taken to reach this position from an earlier position; and - * each outgoing edge from this node is labeled with a literal or a match that - * can be taken to advance from this position to a later position. - * - * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we - * associate with each node just two pieces of information: - * - * 'cost_to_end' is the minimum cost to reach the end of the block from - * this position. - * - * 'item' represents the literal or match that must be chosen from here to - * reach the end of the block with the minimum cost. Equivalently, this - * can be interpreted as the label of the outgoing edge on the minimum-cost - * path to the "end of block" node from this node. 
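A minimal sketch of the litrunlen_and_length packing described above: 2^23 - 1 comfortably exceeds SOFT_MAX_BLOCK_LENGTH, and 9 bits hold the maximum match length of 258. The pack() helper is illustrative only, not libdeflate API.

#include <stdint.h>
#include <stdio.h>

/* Bits 0..22: literal-run length; bits 23..31: match length (0 = none). */
#define LITRUNLEN_BITS	23
#define LITRUNLEN_MASK	((1U << LITRUNLEN_BITS) - 1)

static uint32_t pack(uint32_t litrunlen, uint32_t length)
{
	return litrunlen | (length << LITRUNLEN_BITS);
}

int main(void)
{
	uint32_t v = pack(300000, 258);

	printf("litrunlen=%u length=%u\n",
	       (unsigned)(v & LITRUNLEN_MASK),
	       (unsigned)(v >> LITRUNLEN_BITS));
	return 0;
}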
- */ -struct deflate_optimum_node { - - u32 cost_to_end; - - /* - * Notes on the match/literal representation used here: - * - * The low bits of 'item' are the length: 1 if this is a literal, - * or the match length if this is a match. - * - * The high bits of 'item' are the actual literal byte if this is a - * literal, or the match offset if this is a match. - */ -#define OPTIMUM_OFFSET_SHIFT 9 -#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) - u32 item; - -}; - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Block split statistics. See "Block splitting algorithm" below. */ -#define NUM_LITERAL_OBSERVATION_TYPES 8 -#define NUM_MATCH_OBSERVATION_TYPES 2 -#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES) -struct block_split_stats { - u32 new_observations[NUM_OBSERVATION_TYPES]; - u32 observations[NUM_OBSERVATION_TYPES]; - u32 num_new_observations; - u32 num_observations; -}; - -/* The main DEFLATE compressor structure */ -struct libdeflate_compressor { - - /* Pointer to the compress() implementation chosen at allocation time */ - size_t (*impl)(struct libdeflate_compressor *, - const u8 *, size_t, u8 *, size_t); - - /* Frequency counters for the current block */ - struct deflate_freqs freqs; - - /* Dynamic Huffman codes for the current block */ - struct deflate_codes codes; - - /* Static Huffman codes */ - struct deflate_codes static_codes; - - /* Block split statistics for the currently pending block */ - struct block_split_stats split_stats; - - /* A table for fast lookups of offset slot by match offset. - * - * If the full table is being used, it is a direct mapping from offset - * to offset slot. - * - * If the condensed table is being used, the first 256 entries map - * directly to the offset slots of offsets 1 through 256. The next 256 - * entries map to the offset slots for the remaining offsets, stepping - * through the offsets with a stride of 128. This relies on the fact - * that each of the remaining offset slots contains at least 128 offsets - * and has an offset base that is a multiple of 128. */ -#if USE_FULL_OFFSET_SLOT_FAST - u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1]; -#else - u8 offset_slot_fast[512]; -#endif - - /* The "nice" match length: if a match of this length is found, choose - * it immediately without further consideration. */ - unsigned nice_match_length; - - /* The maximum search depth: consider at most this many potential - * matches at each position. */ - unsigned max_search_depth; - - /* The compression level with which this compressor was created. */ - unsigned compression_level; - - /* Temporary space for Huffman code output */ - u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS]; - u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS]; - unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS]; - unsigned num_litlen_syms; - unsigned num_offset_syms; - unsigned num_explicit_lens; - unsigned num_precode_items; - - union { - /* Data for greedy or lazy parsing */ - struct { - /* Hash chain matchfinder */ - struct hc_matchfinder hc_mf; - - /* The matches and literals that the parser has chosen - * for the current block. The required length of this - * array is limited by the maximum number of matches - * that can ever be chosen for a single block, plus one - * for the special entry at the end. 
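A sketch of how the condensed 512-byte offset-slot table can be built and queried, relying on the two properties noted above: every slot past offset 256 spans at least 128 offsets and starts at a multiple of 128 plus 1, so (offset - 1) >> 7 is constant within a slot. The build loop is an illustrative reconstruction, not libdeflate's initialization code.

#include <stdio.h>

/* Offset-slot base values and extra-bit counts, copied from the tables
 * above. */
static const unsigned obase[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
	257, 385, 513, 769, 1025, 1537, 2049, 3073,
	4097, 6145, 8193, 12289, 16385, 24577,
};
static const unsigned oextra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
};

/* Entries 0..255 cover offsets 1..256 directly; entries 256..511 cover
 * the remaining offsets with a stride of 128. */
static unsigned char slot_fast[512];

static void build_table(void)
{
	unsigned slot, off;

	for (slot = 0; slot < 30; slot++)
		for (off = obase[slot];
		     off < obase[slot] + (1U << oextra[slot]); off++)
			if (off <= 256)
				slot_fast[off - 1] = slot;
			else
				slot_fast[256 + ((off - 1) >> 7)] = slot;
}

static unsigned lookup(unsigned offset)
{
	return offset <= 256 ? slot_fast[offset - 1]
			     : slot_fast[256 + ((offset - 1) >> 7)];
}

int main(void)
{
	build_table();
	printf("offset 1 -> slot %u\n", lookup(1));	    /* 0 */
	printf("offset 300 -> slot %u\n", lookup(300));	    /* 16 */
	printf("offset 32768 -> slot %u\n", lookup(32768)); /* 29 */
	return 0;
}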
*/ - struct deflate_sequence sequences[ - DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH, - DEFLATE_MIN_MATCH_LEN) + 1]; - } g; /* (g)reedy */ - - #if SUPPORT_NEAR_OPTIMAL_PARSING - /* Data for near-optimal parsing */ - struct { - - /* Binary tree matchfinder */ - struct bt_matchfinder bt_mf; - - /* - * Cached matches for the current block. This array - * contains the matches that were found at each position - * in the block. Specifically, for each position, there - * is a list of matches found at that position, if any, - * sorted by strictly increasing length. In addition, - * following the matches for each position, there is a - * special 'struct lz_match' whose 'length' member - * contains the number of matches found at that - * position, and whose 'offset' member contains the - * literal at that position. - * - * Note: in rare cases, there will be a very high number - * of matches in the block and this array will overflow. - * If this happens, we force the end of the current - * block. CACHE_LENGTH is the length at which we - * actually check for overflow. The extra slots beyond - * this are enough to absorb the worst case overflow, - * which occurs if starting at &match_cache[CACHE_LENGTH - * - 1], we write MAX_MATCHES_PER_POS matches and a - * match count header, then skip searching for matches - * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write - * the match count header for each. - */ - struct lz_match match_cache[CACHE_LENGTH + - MAX_MATCHES_PER_POS + - DEFLATE_MAX_MATCH_LEN - 1]; - - /* - * Array of nodes, one per position, for running the - * minimum-cost path algorithm. - * - * This array must be large enough to accommodate the - * worst-case number of nodes, which occurs if we find a - * match of length DEFLATE_MAX_MATCH_LEN at position - * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of - * length SOFT_MAX_BLOCK_LENGTH - 1 + - * DEFLATE_MAX_MATCH_LEN. Add one for the end-of-block - * node. - */ - struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 + - DEFLATE_MAX_MATCH_LEN + 1]; - - /* The current cost model being used. */ - struct deflate_costs costs; - - unsigned num_optim_passes; - } n; /* (n)ear-optimal */ - #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - - } p; /* (p)arser */ -}; - -/* - * The type for the bitbuffer variable, which temporarily holds bits that are - * being packed into bytes and written to the output buffer. For best - * performance, this should have size equal to a machine word. - */ -typedef machine_word_t bitbuf_t; -#define BITBUF_NBITS (8 * sizeof(bitbuf_t)) - -/* Can the specified number of bits always be added to 'bitbuf' after any - * pending bytes have been flushed? */ -#define CAN_BUFFER(n) ((n) <= BITBUF_NBITS - 7) - -/* - * Structure to keep track of the current state of sending bits to the - * compressed output buffer. - */ -struct deflate_output_bitstream { - - /* Bits that haven't yet been written to the output buffer. */ - bitbuf_t bitbuf; - - /* Number of bits currently held in @bitbuf. */ - unsigned bitcount; - - /* Pointer to the beginning of the output buffer. */ - u8 *begin; - - /* Pointer to the position in the output buffer at which the next byte - * should be written. */ - u8 *next; - - /* Pointer just past the end of the output buffer. */ - u8 *end; -}; - -/* - * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be - * present following os->end, in order to not overrun the buffer when generating - * output. When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t) - * bytes for put_unaligned_leword(). 
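The 7 in CAN_BUFFER() comes from the flushing granularity: flushes write whole bytes only, so up to 7 bits can remain in the bit buffer afterwards, leaving BITBUF_NBITS - 7 bits guaranteed free. Worked numbers, as a small sketch:

#include <stdio.h>

/* Worst case in flight before the next flush is 7 leftover bits plus the
 * n bits being added, hence CAN_BUFFER(n) is (n <= BITBUF_NBITS - 7). */
int main(void)
{
	unsigned wordbits[] = { 64, 32 };
	unsigned i;

	for (i = 0; i < 2; i++) {
		unsigned limit = wordbits[i] - 7;

		printf("%2u-bit bitbuf: up to %u bits per flush\n",
		       wordbits[i], limit);
		/* A litlen codeword (<= 14 bits) plus extra length bits
		 * (<= 5), an offset codeword (<= 15) and extra offset bits
		 * (<= 13) total 47 bits: fits on 64-bit, not on 32-bit,
		 * which is what the CAN_BUFFER() tests later decide. */
		printf("  47-bit match worst case fits: %s\n",
		       47 <= limit ? "yes" : "no");
	}
	return 0;
}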
Otherwise we need only 1 byte. However, - * to make the compression algorithm produce the same result on all CPU - * architectures (which is sometimes desirable), we have to unconditionally use - * the maximum for any CPU, which is sizeof(bitbuf_t) == 8. - */ -#define OUTPUT_END_PADDING 8 - -/* Initialize the output bitstream. 'size' is assumed to be at least - * OUTPUT_END_PADDING. */ -static void -deflate_init_output(struct deflate_output_bitstream *os, - void *buffer, size_t size) -{ - os->bitbuf = 0; - os->bitcount = 0; - os->begin = buffer; - os->next = os->begin; - os->end = os->begin + size - OUTPUT_END_PADDING; -} - -/* Add some bits to the bitbuffer variable of the output bitstream. The caller - * must make sure there is enough room. */ -static forceinline void -deflate_add_bits(struct deflate_output_bitstream *os, - const bitbuf_t bits, const unsigned num_bits) -{ - os->bitbuf |= bits << os->bitcount; - os->bitcount += num_bits; -} - -/* Flush bits from the bitbuffer variable to the output buffer. */ -static forceinline void -deflate_flush_bits(struct deflate_output_bitstream *os) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - /* Flush a whole word (branchlessly). */ - put_unaligned_leword(os->bitbuf, os->next); - os->bitbuf >>= os->bitcount & ~7; - os->next += MIN(os->end - os->next, os->bitcount >> 3); - os->bitcount &= 7; - } else { - /* Flush a byte at a time. */ - while (os->bitcount >= 8) { - *os->next = os->bitbuf; - if (os->next != os->end) - os->next++; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - } -} - -/* Align the bitstream on a byte boundary. */ -static forceinline void -deflate_align_bitstream(struct deflate_output_bitstream *os) -{ - os->bitcount += -os->bitcount & 7; - deflate_flush_bits(os); -} - -/* - * Flush any remaining bits to the output buffer if needed. Return the total - * number of bytes written to the output buffer, or 0 if an overflow occurred. - */ -static u32 -deflate_flush_output(struct deflate_output_bitstream *os) -{ - if (os->next == os->end) /* overflow? */ - return 0; - - while ((int)os->bitcount > 0) { - *os->next++ = os->bitbuf; - os->bitcount -= 8; - os->bitbuf >>= 8; - } - - return os->next - os->begin; -} - -/* Given the binary tree node A[subtree_idx] whose children already - * satisfy the maxheap property, swap the node with its greater child - * until it is greater than both its children, so that the maxheap - * property is satisfied in the subtree rooted at A[subtree_idx]. */ -static void -heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) -{ - unsigned parent_idx; - unsigned child_idx; - u32 v; - - v = A[subtree_idx]; - parent_idx = subtree_idx; - while ((child_idx = parent_idx * 2) <= length) { - if (child_idx < length && A[child_idx + 1] > A[child_idx]) - child_idx++; - if (v >= A[child_idx]) - break; - A[parent_idx] = A[child_idx]; - parent_idx = child_idx; - } - A[parent_idx] = v; -} - -/* Rearrange the array 'A' so that it satisfies the maxheap property. - * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. - */ -static void -heapify_array(u32 A[], unsigned length) -{ - unsigned subtree_idx; - - for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) - heapify_subtree(A, length, subtree_idx); -} - -/* - * Sort the array 'A', which contains 'length' unsigned 32-bit integers. 
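A standalone copy of the heapsort above, for experimenting with the 1-based indexing trick: decrementing the array pointer makes the children of A[i] sit at A[2*i] and A[2*i + 1] with no index fix-ups. (Forming the A - 1 pointer is the same idiom the original code uses.)

#include <stdio.h>
#include <stdint.h>

/* Sift-down, identical in structure to heapify_subtree() above. */
static void sift_down(uint32_t A[], unsigned n, unsigned i)
{
	uint32_t v = A[i];
	unsigned child;

	while ((child = i * 2) <= n) {
		if (child < n && A[child + 1] > A[child])
			child++;
		if (v >= A[child])
			break;
		A[i] = A[child];
		i = child;
	}
	A[i] = v;
}

static void heap_sort(uint32_t A[], unsigned n)
{
	unsigned i;

	A--;				/* switch to 1-based indices */
	for (i = n / 2; i >= 1; i--)	/* heapify */
		sift_down(A, n, i);
	while (n >= 2) {		/* repeatedly move the max to the end */
		uint32_t tmp = A[n];

		A[n] = A[1];
		A[1] = tmp;
		sift_down(A, --n, 1);
	}
}

int main(void)
{
	uint32_t a[] = { 5, 1, 4, 1, 5, 9, 2, 6 };
	unsigned i;

	heap_sort(a, 8);
	for (i = 0; i < 8; i++)
		printf("%u ", (unsigned)a[i]);	/* 1 1 2 4 5 5 6 9 */
	printf("\n");
	return 0;
}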
- * - * Note: name this function heap_sort() instead of heapsort() to avoid colliding - * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't - * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. - */ -static void -heap_sort(u32 A[], unsigned length) -{ - A--; /* Use 1-based indices */ - - heapify_array(A, length); - - while (length >= 2) { - u32 tmp = A[length]; - A[length] = A[1]; - A[1] = tmp; - length--; - heapify_subtree(A, length, 1); - } -} - -#define NUM_SYMBOL_BITS 10 -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) - -#define GET_NUM_COUNTERS(num_syms) ((((num_syms) + 3 / 4) + 3) & ~3) -/* - * Sort the symbols primarily by frequency and secondarily by symbol - * value. Discard symbols with zero frequency and fill in an array with - * the remaining symbols, along with their frequencies. The low - * NUM_SYMBOL_BITS bits of each array entry will contain the symbol - * value, and the remaining bits will contain the frequency. - * - * @num_syms - * Number of symbols in the alphabet. - * Can't be greater than (1 << NUM_SYMBOL_BITS). - * - * @freqs[num_syms] - * The frequency of each symbol. - * - * @lens[num_syms] - * An array that eventually will hold the length of each codeword. - * This function only fills in the codeword lengths for symbols that - * have zero frequency, which are not well defined per se but will - * be set to 0. - * - * @symout[num_syms] - * The output array, described above. - * - * Returns the number of entries in 'symout' that were filled. This is - * the number of symbols that have nonzero frequency. - */ -static unsigned -sort_symbols(unsigned num_syms, const u32 freqs[restrict], - u8 lens[restrict], u32 symout[restrict]) -{ - unsigned sym; - unsigned i; - unsigned num_used_syms; - unsigned num_counters; - unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; - - /* We rely on heapsort, but with an added optimization. Since - * it's common for most symbol frequencies to be low, we first do - * a count sort using a limited number of counters. High - * frequencies will be counted in the last counter, and only they - * will be sorted with heapsort. - * - * Note: with more symbols, it is generally beneficial to have more - * counters. About 1 counter per 4 symbols seems fast. - * - * Note: I also tested radix sort, but even for large symbol - * counts (> 255) and frequencies bounded at 16 bits (enabling - * radix sort by just two base-256 digits), it didn't seem any - * faster than the method implemented here. - * - * Note: I tested the optimized quicksort implementation from - * glibc (with indirection overhead removed), but it was only - * marginally faster than the simple heapsort implemented here. - * - * Tests were done with building the codes for LZX. Results may - * vary for different compression algorithms...! */ - - num_counters = GET_NUM_COUNTERS(num_syms); - - memset(counters, 0, num_counters * sizeof(counters[0])); - - /* Count the frequencies. */ - for (sym = 0; sym < num_syms; sym++) - counters[MIN(freqs[sym], num_counters - 1)]++; - - /* Make the counters cumulative, ignoring the zero-th, which - * counted symbols with zero frequency. As a side effect, this - * calculates the number of symbols with nonzero frequency. */ - num_used_syms = 0; - for (i = 1; i < num_counters; i++) { - unsigned count = counters[i]; - counters[i] = num_used_syms; - num_used_syms += count; - } - - /* Sort nonzero-frequency symbols using the counters. 
At the - * same time, set the codeword lengths of zero-frequency symbols - * to 0. */ - for (sym = 0; sym < num_syms; sym++) { - u32 freq = freqs[sym]; - if (freq != 0) { - symout[counters[MIN(freq, num_counters - 1)]++] = - sym | (freq << NUM_SYMBOL_BITS); - } else { - lens[sym] = 0; - } - } - - /* Sort the symbols counted in the last counter. */ - heap_sort(symout + counters[num_counters - 2], - counters[num_counters - 1] - counters[num_counters - 2]); - - return num_used_syms; -} - -/* - * Build the Huffman tree. - * - * This is an optimized implementation that - * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a - * Huffman tree are sufficient to generate a canonical code; - * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input - * frequency information. - * - * Array 'A', which contains 'sym_count' entries, is used for both input - * and output. For this function, 'sym_count' must be at least 2. - * - * For input, the array must contain the frequencies of the symbols, - * sorted in increasing order. Specifically, each entry must contain a - * frequency left shifted by NUM_SYMBOL_BITS bits. Any data in the low - * NUM_SYMBOL_BITS bits of the entries will be ignored by this function. - * Although these bits will, in fact, contain the symbols that correspond - * to the frequencies, this function is concerned with frequencies only - * and keeps the symbols as-is. - * - * For output, this function will produce the non-leaf nodes of the - * Huffman tree. These nodes will be stored in the first (sym_count - 1) - * entries of the array. Entry A[sym_count - 2] will represent the root - * node. Each other node will contain the zero-based index of its parent - * node in 'A', left shifted by NUM_SYMBOL_BITS bits. The low - * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is. Again, - * note that although these low bits will, in fact, contain a symbol - * value, this symbol will have *no relationship* with the Huffman tree - * node that happens to occupy the same slot. This is because this - * implementation only generates the non-leaf nodes of the tree. - */ -static void -build_tree(u32 A[], unsigned sym_count) -{ - /* Index, in 'A', of next lowest frequency symbol that has not - * yet been processed. */ - unsigned i = 0; - - /* Index, in 'A', of next lowest frequency parentless non-leaf - * node; or, if equal to 'e', then no such node exists yet. */ - unsigned b = 0; - - /* Index, in 'A', of next node to allocate as a non-leaf. */ - unsigned e = 0; - - do { - unsigned m, n; - u32 freq_shifted; - - /* Choose the two next lowest frequency entries. */ - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - m = i++; - else - m = b++; - - if (i != sym_count && - (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS))) - n = i++; - else - n = b++; - - /* Allocate a non-leaf node and link the entries to it. - * - * If we link an entry that we're visiting for the first - * time (via index 'i'), then we're actually linking a - * leaf node and it will have no effect, since the leaf - * will be overwritten with a non-leaf when index 'e' - * catches up to it. But it's not any slower to - * unconditionally set the parent index. - * - * We also compute the frequency of the non-leaf node as - * the sum of its two children's frequencies. 
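The selection logic in build_tree() below is the classic two-queue Huffman construction: the pre-sorted leaves form one non-decreasing queue, and internal nodes are produced in non-decreasing frequency order, so they form a second; the minimum is always at one of the two heads, so no heap is needed. A sketch with the queues made explicit (build_tree() keeps everything in one array instead):

#include <stdio.h>

int main(void)
{
	unsigned leaves[] = { 1, 2, 4, 4, 9 };	/* sorted frequencies */
	unsigned internal[8];
	unsigned li = 0, ii = 0, ie = 0;	/* queue heads and tail */
	unsigned nleaves = 5;

	while (nleaves - li + ie - ii > 1) {
		unsigned sum = 0, k;

		for (k = 0; k < 2; k++) {	/* take the two smallest */
			if (li < nleaves &&
			    (ii == ie || leaves[li] <= internal[ii]))
				sum += leaves[li++];
			else
				sum += internal[ii++];
		}
		internal[ie++] = sum;
		printf("new internal node: freq %u\n", sum);
	}
	return 0;	/* prints nodes 3, 7, 11, 20 for this input */
}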
*/ - - freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK); - - A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS); - A[e] = (A[e] & SYMBOL_MASK) | freq_shifted; - e++; - } while (sym_count - e > 1); - /* When just one entry remains, it is a "leaf" that was - * linked to some other node. We ignore it, since the - * rest of the array contains the non-leaves which we - * need. (Note that we're assuming the cases with 0 or 1 - * symbols were handled separately.) */ -} - -/* - * Given the stripped-down Huffman tree constructed by build_tree(), - * determine the number of codewords that should be assigned each - * possible length, taking into account the length-limited constraint. - * - * @A - * The array produced by build_tree(), containing parent index - * information for the non-leaf nodes of the Huffman tree. Each - * entry in this array is a node; a node's parent always has a - * greater index than that node itself. This function will - * overwrite the parent index information in this array, so - * essentially it will destroy the tree. However, the data in the - * low NUM_SYMBOL_BITS of each entry will be preserved. - * - * @root_idx - * The 0-based index of the root node in 'A', and consequently one - * less than the number of tree node entries in 'A'. (Or, really 2 - * less than the actual length of 'A'.) - * - * @len_counts - * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be - * returned. - * - * @max_codeword_len - * The maximum permissible codeword length. - */ -static void -compute_length_counts(u32 A[restrict], unsigned root_idx, - unsigned len_counts[restrict], unsigned max_codeword_len) -{ - unsigned len; - int node; - - /* The key observations are: - * - * (1) We can traverse the non-leaf nodes of the tree, always - * visiting a parent before its children, by simply iterating - * through the array in reverse order. Consequently, we can - * compute the depth of each node in one pass, overwriting the - * parent indices with depths. - * - * (2) We can initially assume that in the real Huffman tree, - * both children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) - * node during the traversal, we modify this assumption to - * account for the current node *not* being a leaf, but rather - * its two children being leaves. This causes the loss of one - * codeword for the current depth and the addition of two - * codewords for the current depth plus one. - * - * (3) We can handle the length-limited constraint fairly easily - * by simply using the largest length available when a depth - * exceeds max_codeword_len. - */ - - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - len_counts[1] = 2; - - /* Set the root node's depth to 0. */ - A[root_idx] &= SYMBOL_MASK; - - for (node = root_idx - 1; node >= 0; node--) { - - /* Calculate the depth of this node. */ - - unsigned parent = A[node] >> NUM_SYMBOL_BITS; - unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; - unsigned depth = parent_depth + 1; - unsigned len = depth; - - /* Set the depth of this node so that it is available - * when its children (if any) are processed. */ - - A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - - /* If needed, decrease the length to meet the - * length-limited constraint. This is not the optimal - * method for generating length-limited Huffman codes! 
- * But it should be good enough. */ - if (len >= max_codeword_len) { - len = max_codeword_len; - do { - len--; - } while (len_counts[len] == 0); - } - - /* Account for the fact that we have a non-leaf node at - * the current depth. */ - len_counts[len]--; - len_counts[len + 1] += 2; - } -} - -/* - * Generate the codewords for a canonical Huffman code. - * - * @A - * The output array for codewords. In addition, initially this - * array must contain the symbols, sorted primarily by frequency and - * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of - * each entry. - * - * @len - * Output array for codeword lengths. - * - * @len_counts - * An array that provides the number of codewords that will have - * each possible length <= max_codeword_len. - * - * @max_codeword_len - * Maximum length, in bits, of each codeword. - * - * @num_syms - * Number of symbols in the alphabet, including symbols with zero - * frequency. This is the length of the 'A' and 'len' arrays. - */ -static void -gen_codewords(u32 A[restrict], u8 lens[restrict], - const unsigned len_counts[restrict], - unsigned max_codeword_len, unsigned num_syms) -{ - u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned i; - unsigned len; - unsigned sym; - - /* Given the number of codewords that will have each length, - * assign codeword lengths to symbols. We do this by assigning - * the lengths in decreasing order to the symbols sorted - * primarily by increasing frequency and secondarily by - * increasing symbol value. */ - for (i = 0, len = max_codeword_len; len >= 1; len--) { - unsigned count = len_counts[len]; - while (count--) - lens[A[i++] & SYMBOL_MASK] = len; - } - - /* Generate the codewords themselves. We initialize the - * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol - * order. This produces a canonical code. */ - next_codewords[0] = 0; - next_codewords[1] = 0; - for (len = 2; len <= max_codeword_len; len++) - next_codewords[len] = - (next_codewords[len - 1] + len_counts[len - 1]) << 1; - - for (sym = 0; sym < num_syms; sym++) - A[sym] = next_codewords[lens[sym]]++; -} - -/* - * --------------------------------------------------------------------- - * make_canonical_huffman_code() - * --------------------------------------------------------------------- - * - * Given an alphabet and the frequency of each symbol in it, construct a - * length-limited canonical Huffman code. - * - * @num_syms - * The number of symbols in the alphabet. The symbols are the - * integers in the range [0, num_syms - 1]. This parameter must be - * at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS). - * - * @max_codeword_len - * The maximum permissible codeword length. - * - * @freqs - * An array of @num_syms entries, each of which specifies the - * frequency of the corresponding symbol. It is valid for some, - * none, or all of the frequencies to be 0. - * - * @lens - * An array of @num_syms entries in which this function will return - * the length, in bits, of the codeword assigned to each symbol. - * Symbols with 0 frequency will not have codewords per se, but - * their entries in this array will be set to 0. No lengths greater - * than @max_codeword_len will be assigned. - * - * @codewords - * An array of @num_syms entries in which this function will return - * the codeword for each symbol, right-justified and padded on the - * left with zeroes. Codewords for symbols with 0 frequency will be - * undefined. 
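gen_codewords()'s canonical assignment is easiest to see on a toy alphabet. The sketch below derives len_counts from the lengths {2, 1, 3, 3} and assigns the codewords B=0, A=10, C=110, D=111 (in binary), the canonical ordering described above:

#include <stdio.h>

int main(void)
{
	unsigned char lens[] = { 2, 1, 3, 3 };	/* symbols A..D */
	unsigned len_counts[4] = { 0 };
	unsigned next_codewords[4];
	unsigned codewords[4];
	unsigned sym, len;

	for (sym = 0; sym < 4; sym++)
		len_counts[lens[sym]]++;

	/* Lexicographically first codeword of each length, as above. */
	next_codewords[0] = 0;
	next_codewords[1] = 0;
	for (len = 2; len <= 3; len++)
		next_codewords[len] =
			(next_codewords[len - 1] + len_counts[len - 1]) << 1;

	/* Assigning in symbol order makes the code canonical. */
	for (sym = 0; sym < 4; sym++)
		codewords[sym] = next_codewords[lens[sym]]++;

	for (sym = 0; sym < 4; sym++)
		printf("sym %c: len %u codeword %u\n",
		       'A' + (int)sym, lens[sym], codewords[sym]);
	return 0;
}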
- * - * --------------------------------------------------------------------- - * - * This function builds a length-limited canonical Huffman code. - * - * A length-limited Huffman code contains no codewords longer than some - * specified length, and has exactly (with some algorithms) or - * approximately (with the algorithm used here) the minimum weighted path - * length from the root, given this constraint. - * - * A canonical Huffman code satisfies the properties that a longer - * codeword never lexicographically precedes a shorter codeword, and the - * lexicographic ordering of codewords of the same length is the same as - * the lexicographic ordering of the corresponding symbols. A canonical - * Huffman code, or more generally a canonical prefix code, can be - * reconstructed from only a list containing the codeword length of each - * symbol. - * - * The classic algorithm to generate a Huffman code creates a node for - * each symbol, then inserts these nodes into a min-heap keyed by symbol - * frequency. Then, repeatedly, the two lowest-frequency nodes are - * removed from the min-heap and added as the children of a new node - * having frequency equal to the sum of its two children, which is then - * inserted into the min-heap. When only a single node remains in the - * min-heap, it is the root of the Huffman tree. The codeword for each - * symbol is determined by the path needed to reach the corresponding - * node from the root. Descending to the left child appends a 0 bit, - * whereas descending to the right child appends a 1 bit. - * - * The classic algorithm is relatively easy to understand, but it is - * subject to a number of inefficiencies. In practice, it is fastest to - * first sort the symbols by frequency. (This itself can be subject to - * an optimization based on the fact that most frequencies tend to be - * low.) At the same time, we sort secondarily by symbol value, which - * aids the process of generating a canonical code. Then, during tree - * construction, no heap is necessary because both the leaf nodes and the - * unparented non-leaf nodes can be easily maintained in sorted order. - * Consequently, there can never be more than two possibilities for the - * next-lowest-frequency node. - * - * In addition, because we're generating a canonical code, we actually - * don't need the leaf nodes of the tree at all, only the non-leaf nodes. - * This is because for canonical code generation we don't need to know - * where the symbols are in the tree. Rather, we only need to know how - * many leaf nodes have each depth (codeword length). And this - * information can, in fact, be quickly generated from the tree of - * non-leaves only. - * - * Furthermore, we can build this stripped-down Huffman tree directly in - * the array in which the codewords are to be generated, provided that - * these array slots are large enough to hold a symbol and frequency - * value. - * - * Still furthermore, we don't even need to maintain explicit child - * pointers. We only need the parent pointers, and even those can be - * overwritten in-place with depth information as part of the process of - * extracting codeword lengths from the tree. So in summary, we do NOT - * need a big structure like: - * - * struct huffman_tree_node { - * unsigned int symbol; - * unsigned int frequency; - * unsigned int depth; - * struct huffman_tree_node *left_child; - * struct huffman_tree_node *right_child; - * }; - * - * - * ... which often gets used in "naive" implementations of Huffman code - * generation. 
- * - * Many of these optimizations are based on the implementation in 7-Zip - * (source file: C/HuffEnc.c), which has been placed in the public domain - * by Igor Pavlov. - */ -static void -make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[restrict], - u8 lens[restrict], u32 codewords[restrict]) -{ - u32 *A = codewords; - unsigned num_used_syms; - - STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); - - /* We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array - * used for this purpose ('A') shares storage with the space in - * which we will eventually return the codewords. */ - - num_used_syms = sort_symbols(num_syms, freqs, lens, A); - - /* 'num_used_syms' is the number of symbols with nonzero - * frequency. This may be less than @num_syms. 'num_used_syms' - * is also the number of entries in 'A' that are valid. Each - * entry consists of a distinct symbol and a nonzero frequency - * packed into a 32-bit integer. */ - - /* Handle special cases where only 0 or 1 symbols were used (had - * nonzero frequency). */ - - if (unlikely(num_used_syms == 0)) { - /* Code is empty. sort_symbols() already set all lengths - * to 0, so there is nothing more to do. */ - return; - } - - if (unlikely(num_used_syms == 1)) { - /* Only one symbol was used, so we only need one - * codeword. But two codewords are needed to form the - * smallest complete Huffman code, which uses codewords 0 - * and 1. Therefore, we choose another symbol to which - * to assign a codeword. We use 0 (if the used symbol is - * not 0) or 1 (if the used symbol is 0). In either - * case, the lesser-valued symbol must be assigned - * codeword 0 so that the resulting code is canonical. */ - - unsigned sym = A[0] & SYMBOL_MASK; - unsigned nonzero_idx = sym ? sym : 1; - - codewords[0] = 0; - lens[0] = 1; - codewords[nonzero_idx] = 1; - lens[nonzero_idx] = 1; - return; - } - - /* Build a stripped-down version of the Huffman tree, sharing the - * array 'A' with the symbol values. Then extract length counts - * from the tree and use them to generate the final codewords. */ - - build_tree(A, num_used_syms); - - { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - - compute_length_counts(A, num_used_syms - 2, - len_counts, max_codeword_len); - - gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); - } -} - -/* - * Clear the Huffman symbol frequency counters. - * This must be called when starting a new DEFLATE block. - */ -static void -deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) -{ - memset(&c->freqs, 0, sizeof(c->freqs)); -} - -/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length. */ -static u32 -deflate_reverse_codeword(u32 codeword, u8 len) -{ - /* The following branchless algorithm is faster than going bit by bit. - * Note: since no codewords are longer than 16 bits, we only need to - * reverse the low 16 bits of the 'u32'. */ - STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - - /* Flip adjacent 1-bit fields */ - codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1); - - /* Flip adjacent 2-bit fields */ - codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2); - - /* Flip adjacent 4-bit fields */ - codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4); - - /* Flip adjacent 8-bit fields */ - codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8); - - /* Return the high 'len' bits of the bit-reversed 16 bit value. 
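The branchless reversal can be checked exhaustively against a naive bit loop; a standalone sketch over every value/length pair up to 15 bits:

#include <stdint.h>
#include <stdio.h>

/* Same SWAR steps as deflate_reverse_codeword() above. */
static uint32_t reverse_swar(uint32_t w, unsigned len)
{
	w = ((w & 0x5555) << 1) | ((w & 0xAAAA) >> 1);
	w = ((w & 0x3333) << 2) | ((w & 0xCCCC) >> 2);
	w = ((w & 0x0F0F) << 4) | ((w & 0xF0F0) >> 4);
	w = ((w & 0x00FF) << 8) | ((w & 0xFF00) >> 8);
	return w >> (16 - len);
}

/* Reference: reverse bit by bit. */
static uint32_t reverse_naive(uint32_t w, unsigned len)
{
	uint32_t r = 0;
	unsigned i;

	for (i = 0; i < len; i++)
		r |= ((w >> i) & 1) << (len - 1 - i);
	return r;
}

int main(void)
{
	unsigned len;
	uint32_t w;

	for (len = 1; len <= 15; len++)
		for (w = 0; w < (1U << len); w++)
			if (reverse_swar(w, len) != reverse_naive(w, len)) {
				printf("mismatch: len=%u w=%u\n", len,
				       (unsigned)w);
				return 1;
			}
	printf("all codewords up to 15 bits reverse correctly\n");
	return 0;
}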
*/ - return codeword >> (16 - len); -} - -/* Make a canonical Huffman code with bit-reversed codewords. */ -static void -deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[], u8 lens[], u32 codewords[]) -{ - unsigned sym; - - make_canonical_huffman_code(num_syms, max_codeword_len, - freqs, lens, codewords); - - for (sym = 0; sym < num_syms; sym++) - codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]); -} - -/* - * Build the literal/length and offset Huffman codes for a DEFLATE block. - * - * This takes as input the frequency tables for each code and produces as output - * a set of tables that map symbols to codewords and codeword lengths. - */ -static void -deflate_make_huffman_codes(const struct deflate_freqs *freqs, - struct deflate_codes *codes) -{ - STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); - STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); - - deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, - MAX_LITLEN_CODEWORD_LEN, - freqs->litlen, - codes->lens.litlen, - codes->codewords.litlen); - - deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, - MAX_OFFSET_CODEWORD_LEN, - freqs->offset, - codes->lens.offset, - codes->codewords.offset); -} - -/* Initialize c->static_codes. */ -static void -deflate_init_static_codes(struct libdeflate_compressor *c) -{ - unsigned i; - - for (i = 0; i < 144; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - for (; i < 256; i++) - c->freqs.litlen[i] = 1 << (9 - 9); - for (; i < 280; i++) - c->freqs.litlen[i] = 1 << (9 - 7); - for (; i < 288; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - - for (i = 0; i < 32; i++) - c->freqs.offset[i] = 1 << (5 - 5); - - deflate_make_huffman_codes(&c->freqs, &c->static_codes); -} - -/* Return the offset slot for the specified match offset. */ -static forceinline unsigned -deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset) -{ -#if USE_FULL_OFFSET_SLOT_FAST - return c->offset_slot_fast[offset]; -#else - if (offset <= 256) - return c->offset_slot_fast[offset - 1]; - else - return c->offset_slot_fast[256 + ((offset - 1) >> 7)]; -#endif -} - -/* Write the header fields common to all DEFLATE block types. */ -static void -deflate_write_block_header(struct deflate_output_bitstream *os, - bool is_final_block, unsigned block_type) -{ - deflate_add_bits(os, is_final_block, 1); - deflate_add_bits(os, block_type, 2); - deflate_flush_bits(os); -} - -static unsigned -deflate_compute_precode_items(const u8 lens[restrict], - const unsigned num_lens, - u32 precode_freqs[restrict], - unsigned precode_items[restrict]) -{ - unsigned *itemptr; - unsigned run_start; - unsigned run_end; - unsigned extra_bits; - u8 len; - - memset(precode_freqs, 0, - DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); - - itemptr = precode_items; - run_start = 0; - do { - /* Find the next run of codeword lengths. */ - - /* len = the length being repeated */ - len = lens[run_start]; - - /* Extend the run. */ - run_end = run_start; - do { - run_end++; - } while (run_end != num_lens && len == lens[run_end]); - - if (len == 0) { - /* Run of zeroes. */ - - /* Symbol 18: RLE 11 to 138 zeroes at a time. */ - while ((run_end - run_start) >= 11) { - extra_bits = MIN((run_end - run_start) - 11, 0x7F); - precode_freqs[18]++; - *itemptr++ = 18 | (extra_bits << 5); - run_start += 11 + extra_bits; - } - - /* Symbol 17: RLE 3 to 10 zeroes at a time. 
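Precode items pack the symbol into the low 5 bits (values 0..18 fit) with the extra-bits value above them. A minimal unpacking sketch matching what deflate_write_huffman_header() does later:

#include <stdio.h>

int main(void)
{
	unsigned item = 18 | (127 << 5);	/* RLE: 11 + 127 = 138 zeroes */
	unsigned sym = item & 0x1F;
	unsigned extra = item >> 5;

	printf("sym=%u extra=%u run=%u\n", sym, extra, 11 + extra);
	return 0;
}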
*/ - if ((run_end - run_start) >= 3) { - extra_bits = MIN((run_end - run_start) - 3, 0x7); - precode_freqs[17]++; - *itemptr++ = 17 | (extra_bits << 5); - run_start += 3 + extra_bits; - } - } else { - - /* A run of nonzero lengths. */ - - /* Symbol 16: RLE 3 to 6 of the previous length. */ - if ((run_end - run_start) >= 4) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - do { - extra_bits = MIN((run_end - run_start) - 3, 0x3); - precode_freqs[16]++; - *itemptr++ = 16 | (extra_bits << 5); - run_start += 3 + extra_bits; - } while ((run_end - run_start) >= 3); - } - } - - /* Output any remaining lengths without RLE. */ - while (run_start != run_end) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - } - } while (run_start != num_lens); - - return itemptr - precode_items; -} - -/* - * Huffman codeword lengths for dynamic Huffman blocks are compressed using a - * separate Huffman code, the "precode", which contains a symbol for each - * possible codeword length in the larger code as well as several special - * symbols to represent repeated codeword lengths (a form of run-length - * encoding). The precode is itself constructed in canonical form, and its - * codeword lengths are represented literally in 19 3-bit fields that - * immediately precede the compressed codeword lengths of the larger code. - */ - -/* Precompute the information needed to output Huffman codes. */ -static void -deflate_precompute_huffman_header(struct libdeflate_compressor *c) -{ - /* Compute how many litlen and offset symbols are needed. */ - - for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; - c->num_litlen_syms > 257; - c->num_litlen_syms--) - if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0) - break; - - for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; - c->num_offset_syms > 1; - c->num_offset_syms--) - if (c->codes.lens.offset[c->num_offset_syms - 1] != 0) - break; - - /* If we're not using the full set of literal/length codeword lengths, - * then temporarily move the offset codeword lengths over so that the - * literal/length and offset codeword lengths are contiguous. */ - - STATIC_ASSERT(offsetof(struct deflate_lens, offset) == - DEFLATE_NUM_LITLEN_SYMS); - - if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + c->num_litlen_syms, - (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - c->num_offset_syms); - } - - /* Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. */ - c->num_precode_items = - deflate_compute_precode_items((u8 *)&c->codes.lens, - c->num_litlen_syms + - c->num_offset_syms, - c->precode_freqs, - c->precode_items); - - /* Build the precode. */ - STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); - deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, - MAX_PRE_CODEWORD_LEN, - c->precode_freqs, c->precode_lens, - c->precode_codewords); - - /* Count how many precode lengths we actually need to output. */ - for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; - c->num_explicit_lens > 4; - c->num_explicit_lens--) - if (c->precode_lens[deflate_precode_lens_permutation[ - c->num_explicit_lens - 1]] != 0) - break; - - /* Restore the offset codeword lengths if needed. */ - if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - (u8 *)&c->codes.lens + c->num_litlen_syms, - c->num_offset_syms); - } -} - -/* Output the Huffman codes. 
*/ -static void -deflate_write_huffman_header(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os) -{ - unsigned i; - - deflate_add_bits(os, c->num_litlen_syms - 257, 5); - deflate_add_bits(os, c->num_offset_syms - 1, 5); - deflate_add_bits(os, c->num_explicit_lens - 4, 4); - deflate_flush_bits(os); - - /* Output the lengths of the codewords in the precode. */ - for (i = 0; i < c->num_explicit_lens; i++) { - deflate_add_bits(os, c->precode_lens[ - deflate_precode_lens_permutation[i]], 3); - deflate_flush_bits(os); - } - - /* Output the encoded lengths of the codewords in the larger code. */ - for (i = 0; i < c->num_precode_items; i++) { - unsigned precode_item = c->precode_items[i]; - unsigned precode_sym = precode_item & 0x1F; - deflate_add_bits(os, c->precode_codewords[precode_sym], - c->precode_lens[precode_sym]); - if (precode_sym >= 16) { - if (precode_sym == 16) - deflate_add_bits(os, precode_item >> 5, 2); - else if (precode_sym == 17) - deflate_add_bits(os, precode_item >> 5, 3); - else - deflate_add_bits(os, precode_item >> 5, 7); - } - STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7)); - deflate_flush_bits(os); - } -} - -static void -deflate_write_sequences(struct deflate_output_bitstream * restrict os, - const struct deflate_codes * restrict codes, - const struct deflate_sequence sequences[restrict], - const u8 * restrict in_next) -{ - const struct deflate_sequence *seq = sequences; - - for (;;) { - u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF; - unsigned length = seq->litrunlen_and_length >> 23; - unsigned length_slot; - unsigned litlen_symbol; - unsigned offset_symbol; - - if (litrunlen) { - #if 1 - while (litrunlen >= 4) { - unsigned lit0 = in_next[0]; - unsigned lit1 = in_next[1]; - unsigned lit2 = in_next[2]; - unsigned lit3 = in_next[3]; - - deflate_add_bits(os, codes->codewords.litlen[lit0], - codes->lens.litlen[lit0]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit1], - codes->lens.litlen[lit1]); - if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit2], - codes->lens.litlen[lit2]); - if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - - deflate_add_bits(os, codes->codewords.litlen[lit3], - codes->lens.litlen[lit3]); - deflate_flush_bits(os); - in_next += 4; - litrunlen -= 4; - } - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - if (litrunlen-- != 0) { - deflate_add_bits(os, codes->codewords.litlen[*in_next], - codes->lens.litlen[*in_next]); - if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - in_next++; - } - } - if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN)) - deflate_flush_bits(os); - } - #else - do { - unsigned lit = *in_next++; - deflate_add_bits(os, codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - deflate_flush_bits(os); - } while (--litrunlen); - #endif - } - - if (length == 0) - return; - - in_next += length; - - length_slot = seq->length_slot; - litlen_symbol = 257 + length_slot; - - /* Litlen symbol */ - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - 
codes->lens.litlen[litlen_symbol]); - - /* Extra length bits */ - STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS)); - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Offset symbol */ - offset_symbol = seq->offset_symbol; - deflate_add_bits(os, codes->codewords.offset[offset_symbol], - codes->lens.offset[offset_symbol]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - /* Extra offset bits */ - deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol], - deflate_extra_offset_bits[offset_symbol]); - - deflate_flush_bits(os); - - seq++; - } -} - -#if SUPPORT_NEAR_OPTIMAL_PARSING -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and write out the matches/literals using the specified - * Huffman codes. - * - * Note: this is slightly duplicated with deflate_write_sequences(), the reason - * being that we don't want to waste time translating between intermediate - * match/literal representations. - */ -static void -deflate_write_item_list(struct deflate_output_bitstream *os, - const struct deflate_codes *codes, - struct libdeflate_compressor *c, - u32 block_length) -{ - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - unsigned litlen_symbol; - unsigned length_slot; - unsigned offset_slot; - - if (length == 1) { - /* Literal */ - litlen_symbol = offset; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - deflate_flush_bits(os); - } else { - /* Match length */ - length_slot = deflate_length_slot[length]; - litlen_symbol = 257 + length_slot; - deflate_add_bits(os, codes->codewords.litlen[litlen_symbol], - codes->lens.litlen[litlen_symbol]); - - deflate_add_bits(os, length - deflate_length_slot_base[length_slot], - deflate_extra_length_bits[length_slot]); - - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS + - MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - - /* Match offset */ - offset_slot = deflate_get_offset_slot(c, offset); - deflate_add_bits(os, codes->codewords.offset[offset_slot], - codes->lens.offset[offset_slot]); - - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_OFFSET_BITS)) - deflate_flush_bits(os); - - deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot], - deflate_extra_offset_bits[offset_slot]); - - deflate_flush_bits(os); - } - cur_node += length; - } while (cur_node != end_node); -} -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Output the end-of-block symbol. 
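- * That is, the codeword for litlen symbol 256 (DEFLATE_END_OF_BLOCK); in
- * a static Huffman block this works out to the 7-bit codeword 0000000.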
*/ -static void -deflate_write_end_of_block(struct deflate_output_bitstream *os, - const struct deflate_codes *codes) -{ - deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK], - codes->lens.litlen[DEFLATE_END_OF_BLOCK]); - deflate_flush_bits(os); -} - -static void -deflate_write_uncompressed_block(struct deflate_output_bitstream *os, - const u8 *data, u16 len, - bool is_final_block) -{ - deflate_write_block_header(os, is_final_block, - DEFLATE_BLOCKTYPE_UNCOMPRESSED); - deflate_align_bitstream(os); - - if (4 + (u32)len >= os->end - os->next) { - os->next = os->end; - return; - } - - put_unaligned_le16(len, os->next); - os->next += 2; - put_unaligned_le16(~len, os->next); - os->next += 2; - memcpy(os->next, data, len); - os->next += len; -} - -static void -deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os, - const u8 *data, u32 data_length, - bool is_final_block) -{ - do { - u16 len = MIN(data_length, UINT16_MAX); - - deflate_write_uncompressed_block(os, data, len, - is_final_block && len == data_length); - data += len; - data_length -= len; - } while (data_length != 0); -} - -/* - * Choose the best type of block to use (dynamic Huffman, static Huffman, or - * uncompressed), then output it. - */ -static void -deflate_flush_block(struct libdeflate_compressor * restrict c, - struct deflate_output_bitstream * restrict os, - const u8 * restrict block_begin, u32 block_length, - bool is_final_block, bool use_item_list) -{ - static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7, - }; - - /* Costs are measured in bits */ - u32 dynamic_cost = 0; - u32 static_cost = 0; - u32 uncompressed_cost = 0; - struct deflate_codes *codes; - int block_type; - unsigned sym; - - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - /* Build dynamic Huffman codes. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - - /* Account for the cost of sending dynamic Huffman codes. */ - deflate_precompute_huffman_header(c); - dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - u32 extra = deflate_extra_precode_bits[sym]; - dynamic_cost += c->precode_freqs[sym] * - (extra + c->precode_lens[sym]); - } - - /* Account for the cost of encoding literals. */ - for (sym = 0; sym < 256; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - } - for (sym = 0; sym < 144; sym++) - static_cost += c->freqs.litlen[sym] * 8; - for (; sym < 256; sym++) - static_cost += c->freqs.litlen[sym] * 9; - - /* Account for the cost of encoding the end-of-block symbol. */ - dynamic_cost += c->codes.lens.litlen[256]; - static_cost += 7; - - /* Account for the cost of encoding lengths. */ - for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) { - u32 extra = deflate_extra_length_bits[sym - 257]; - dynamic_cost += c->freqs.litlen[sym] * - (extra + c->codes.lens.litlen[sym]); - static_cost += c->freqs.litlen[sym] * - (extra + c->static_codes.lens.litlen[sym]); - } - - /* Account for the cost of encoding offsets. */ - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { - u32 extra = deflate_extra_offset_bits[sym]; - dynamic_cost += c->freqs.offset[sym] * - (extra + c->codes.lens.offset[sym]); - static_cost += c->freqs.offset[sym] * (extra + 5); - } - - /* Compute the cost of using uncompressed blocks. 
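- *
- * The terms below are: the bits needed to reach a byte boundary after the
- * common 3-bit block header, plus 32 bits of LEN/NLEN, plus 40 bits
- * (3-bit header, alignment, LEN, NLEN) for each additional 65535-byte
- * chunk, plus 8 bits per data byte. Worked example (not in the original
- * comment): for block_length = 100000 and os->bitcount = 0, this is
- * 5 + 32 + 40 + 800000 = 800077 bits.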
*/ - uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 + - (40 * (DIV_ROUND_UP(block_length, - UINT16_MAX) - 1)) + - (8 * block_length); - - /* Choose the cheapest block type. */ - if (dynamic_cost < MIN(static_cost, uncompressed_cost)) { - block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN; - codes = &c->codes; - } else if (static_cost < uncompressed_cost) { - block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN; - codes = &c->static_codes; - } else { - block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED; - } - - /* Now actually output the block. */ - - if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - /* Note: the length being flushed may exceed the maximum length - * of an uncompressed block (65535 bytes). Therefore, more than - * one uncompressed block might be needed. */ - deflate_write_uncompressed_blocks(os, block_begin, block_length, - is_final_block); - } else { - /* Output the block header. */ - deflate_write_block_header(os, is_final_block, block_type); - - /* Output the Huffman codes (dynamic Huffman blocks only). */ - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) - deflate_write_huffman_header(c, os); - - /* Output the literals, matches, and end-of-block symbol. */ - #if SUPPORT_NEAR_OPTIMAL_PARSING - if (use_item_list) - deflate_write_item_list(os, codes, c, block_length); - else - #endif - deflate_write_sequences(os, codes, c->p.g.sequences, - block_begin); - deflate_write_end_of_block(os, codes); - } -} - -static forceinline void -deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, - u32 *litrunlen_p) -{ - c->freqs.litlen[literal]++; - ++*litrunlen_p; -} - -static forceinline void -deflate_choose_match(struct libdeflate_compressor *c, - unsigned length, unsigned offset, - u32 *litrunlen_p, struct deflate_sequence **next_seq_p) -{ - struct deflate_sequence *seq = *next_seq_p; - unsigned length_slot = deflate_length_slot[length]; - unsigned offset_slot = deflate_get_offset_slot(c, offset); - - c->freqs.litlen[257 + length_slot]++; - c->freqs.offset[offset_slot]++; - - seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p; - seq->offset = offset; - seq->length_slot = length_slot; - seq->offset_symbol = offset_slot; - - *litrunlen_p = 0; - *next_seq_p = seq + 1; -} - -static forceinline void -deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen) -{ - seq->litrunlen_and_length = litrunlen; /* length = 0 */ -} - -/******************************************************************************/ - -/* - * Block splitting algorithm. The problem is to decide when it is worthwhile to - * start a new block with new Huffman codes. There is a theoretically optimal - * solution: recursively consider every possible block split, considering the - * exact cost of each block, and choose the minimum cost approach. But this is - * far too slow. Instead, as an approximation, we can count symbols and after - * every N symbols, compare the expected distribution of symbols based on the - * previous data with the actual distribution. If they differ "by enough", then - * start a new block. - * - * As an optimization and heuristic, we don't distinguish between every symbol - * but rather we combine many symbols into a single "observation type". For - * literals we only look at the high bits and low bits, and for matches we only - * look at whether the match is long or not. 
The assumption is that for typical - * "real" data, places that are good block boundaries will tend to be noticeable - * based only on changes in these aggregate frequencies, without looking for - * subtle differences in individual symbols. For example, a change from ASCII - * bytes to non-ASCII bytes, or from few matches (generally less compressible) - * to many matches (generally more compressible), would be easily noticed based - * on the aggregates. - * - * For determining whether the frequency distributions are "different enough" to - * start a new block, the simple heuristic of splitting when the sum of absolute - * differences exceeds a constant seems to be good enough. We also add a number - * proportional to the block length so that the algorithm is more likely to end - * long blocks than short blocks. This reflects the general expectation that it - * will become increasingly beneficial to start a new block as the current - * block grows longer. - * - * Finally, for an approximation, it is not strictly necessary that the exact - * symbols being used are considered. With "near-optimal parsing", for example, - * the actual symbols that will be used are unknown until after the block - * boundary is chosen and the block has been optimized. Since the final choices - * cannot be used, we can use preliminary "greedy" choices instead. - */ - -/* Initialize the block split statistics when starting a new block. */ -static void -init_block_split_stats(struct block_split_stats *stats) -{ - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->new_observations[i] = 0; - stats->observations[i] = 0; - } - stats->num_new_observations = 0; - stats->num_observations = 0; -} - -/* Literal observation. Heuristic: use the top 2 bits and the low bit of the - * literal, for 8 possible literal observation types. */ -static forceinline void -observe_literal(struct block_split_stats *stats, u8 lit) -{ - stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; - stats->num_new_observations++; -} - -/* Match observation. Heuristic: use one observation type for "short match" and - * one observation type for "long match". */ -static forceinline void -observe_match(struct block_split_stats *stats, unsigned length) -{ - stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++; - stats->num_new_observations++; -} - -static bool -do_end_block_check(struct block_split_stats *stats, u32 block_length) -{ - int i; - - if (stats->num_observations > 0) { - - /* Note: to avoid slow divisions, we do not divide by - * 'num_observations', but rather do all math with the numbers - * multiplied by 'num_observations'. */ - u32 total_delta = 0; - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u32 expected = stats->observations[i] * stats->num_new_observations; - u32 actual = stats->new_observations[i] * stats->num_observations; - u32 delta = (actual > expected) ? actual - expected : - expected - actual; - total_delta += delta; - } - - /* Ready to end the block? 
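- *
- * Equivalently (a derivation, not in the original comment): dividing both
- * sides by num_observations * num_new_observations, and noting that
- * num_new_observations is approximately NUM_OBSERVATIONS_PER_BLOCK_CHECK
- * when this check runs, the block is ended when
- *
- *	sum_i |new_i/num_new - old_i/num_obs| + block_length/(4096*num_new)
- *		>= 200/512 ~= 0.39
- *
- * i.e. when the L1 distance between the normalized old and new frequency
- * distributions, plus a bonus that grows with the block length, exceeds
- * roughly 0.39.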
*/ - if (total_delta + (block_length / 4096) * stats->num_observations >= - NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations) - return true; - } - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->num_observations += stats->new_observations[i]; - stats->observations[i] += stats->new_observations[i]; - stats->new_observations[i] = 0; - } - stats->num_new_observations = 0; - return false; -} - -static forceinline bool -should_end_block(struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) -{ - /* Ready to check block split statistics? */ - if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK || - in_next - in_block_begin < MIN_BLOCK_LENGTH || - in_end - in_next < MIN_BLOCK_LENGTH) - return false; - - return do_end_block_check(stats, in_next - in_block_begin); -} - -/******************************************************************************/ - -/* - * This is the "greedy" DEFLATE compressor. It always chooses the longest match. - */ -static size_t -deflate_compress_greedy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block. */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - u32 litrunlen = 0; - struct deflate_sequence *next_seq = c->p.g.sequences; - - init_block_split_stats(&c->split_stats); - deflate_reset_symbol_frequencies(c); - - do { - u32 length; - u32 offset; - - /* Decrease the maximum and nice match lengths if we're - * approaching the end of the input buffer. */ - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - length = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - DEFLATE_MIN_MATCH_LEN - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &offset); - - if (length >= DEFLATE_MIN_MATCH_LEN) { - /* Match found. */ - deflate_choose_match(c, length, offset, - &litrunlen, &next_seq); - observe_match(&c->split_stats, length); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - next_hashes); - } else { - /* No match found. */ - deflate_choose_literal(c, *in_next, &litrunlen); - observe_literal(&c->split_stats, *in_next); - in_next++; - } - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - deflate_finish_sequence(next_seq, litrunlen); - deflate_flush_block(c, &os, in_block_begin, - in_next - in_block_begin, - in_next == in_end, false); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -/* - * This is the "lazy" DEFLATE compressor. Before choosing a match, it checks to - * see if there's a longer match at the next position. If yes, it outputs a - * literal and continues to the next position. If no, it outputs the match. 
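- *
- * A small illustration (not from the original comment): if position i has
- * a length-5 match but position i + 1 has a length-7 match, the lazy
- * parser emits the byte at i as a literal and takes the length-7 match,
- * whereas the greedy parser above would have committed to the length-5
- * match immediately.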
- */ -static size_t -deflate_compress_lazy(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block. */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - u32 litrunlen = 0; - struct deflate_sequence *next_seq = c->p.g.sequences; - - init_block_split_stats(&c->split_stats); - deflate_reset_symbol_frequencies(c); - - do { - unsigned cur_len; - unsigned cur_offset; - unsigned next_len; - unsigned next_offset; - - if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - /* Find the longest match at the current position. */ - cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - DEFLATE_MIN_MATCH_LEN - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &cur_offset); - in_next += 1; - - if (cur_len < DEFLATE_MIN_MATCH_LEN) { - /* No match found. Choose a literal. */ - deflate_choose_literal(c, *(in_next - 1), &litrunlen); - observe_literal(&c->split_stats, *(in_next - 1)); - continue; - } - - have_cur_match: - observe_match(&c->split_stats, cur_len); - - /* We have a match at the current position. */ - - /* If the current match is very long, choose it - * immediately. */ - if (cur_len >= nice_len) { - deflate_choose_match(c, cur_len, cur_offset, - &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 1, - next_hashes); - continue; - } - - /* - * Try to find a match at the next position. - * - * Note: since we already have a match at the *current* - * position, we use only half the 'max_search_depth' - * when checking the *next* position. This is a useful - * trade-off because it's more worthwhile to use a - * greater search depth on the initial match. - * - * Note: it's possible to structure the code such that - * there's only one call to longest_match(), which - * handles both the "find the initial match" and "try to - * find a longer match" cases. However, it is faster to - * have two call sites, with longest_match() inlined at - * each. - */ - if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf, - &in_cur_base, - in_next, - cur_len, - max_len, - nice_len, - c->max_search_depth / 2, - next_hashes, - &next_offset); - in_next += 1; - - if (next_len > cur_len) { - /* Found a longer match at the next position. - * Output a literal. Then the next match - * becomes the current match. */ - deflate_choose_literal(c, *(in_next - 2), &litrunlen); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - - /* No longer match at the next position. - * Output the current match. 
*/ - deflate_choose_match(c, cur_len, cur_offset, - &litrunlen, &next_seq); - in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 2, - next_hashes); - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - deflate_finish_sequence(next_seq, litrunlen); - deflate_flush_block(c, &os, in_block_begin, - in_next - in_block_begin, - in_next == in_end, false); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -#if SUPPORT_NEAR_OPTIMAL_PARSING - -/* - * Follow the minimum-cost path in the graph of possible match/literal choices - * for the current block and compute the frequencies of the Huffman symbols that - * would be needed to output those matches and literals. - */ -static void -deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) -{ - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - - if (length == 1) { - /* Literal */ - c->freqs.litlen[offset]++; - } else { - /* Match */ - c->freqs.litlen[257 + deflate_length_slot[length]]++; - c->freqs.offset[deflate_get_offset_slot(c, offset)]++; - } - cur_node += length; - } while (cur_node != end_node); -} - -/* Set the current cost model from the codeword lengths specified in @lens. */ -static void -deflate_set_costs_from_codes(struct libdeflate_compressor *c, - const struct deflate_lens *lens) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS); - c->p.n.costs.literal[i] = bits << COST_SHIFT; - } - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { - unsigned length_slot = deflate_length_slot[i]; - unsigned litlen_sym = 257 + length_slot; - u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); - bits += deflate_extra_length_bits[length_slot]; - c->p.n.costs.length[i] = bits << COST_SHIFT; - } - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { - u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS); - bits += deflate_extra_offset_bits[i]; - c->p.n.costs.offset_slot[i] = bits << COST_SHIFT; - } -} - -static forceinline u32 -deflate_default_literal_cost(unsigned literal) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 66 is 8.25 bits/symbol */ - return 66; -} - -static forceinline u32 -deflate_default_length_slot_cost(unsigned length_slot) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 60 is 7.5 bits/symbol */ - return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT); -} - -static forceinline u32 -deflate_default_offset_slot_cost(unsigned offset_slot) -{ - STATIC_ASSERT(COST_SHIFT == 3); - /* 39 is 4.875 bits/symbol */ - return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT); -} - -/* - * Set default symbol costs for the first block's first optimization pass. - * - * It works well to assume that each symbol is equally probable. This results - * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 << - * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding - * alphabet. 
However, we intentionally bias the parse towards matches rather - * than literals by using a slightly lower default cost for length symbols than - * for literals. This often improves the compression ratio slightly. - */ -static void -deflate_set_default_costs(struct libdeflate_compressor *c) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - c->p.n.costs.literal[i] = deflate_default_literal_cost(i); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - c->p.n.costs.length[i] = deflate_default_length_slot_cost( - deflate_length_slot[i]); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i); -} - -static forceinline void -deflate_adjust_cost(u32 *cost_p, u32 default_cost) -{ - *cost_p += ((s32)default_cost - (s32)*cost_p) >> 1; -} - -/* - * Adjust the costs when beginning a new block. - * - * Since the current costs have been optimized for the data, it's undesirable to - * throw them away and start over with the default costs. At the same time, we - * don't want to bias the parse by assuming that the next block will be similar - * to the current block. As a compromise, make the costs closer to the - * defaults, but don't simply set them to the defaults. - */ -static void -deflate_adjust_costs(struct libdeflate_compressor *c) -{ - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - deflate_adjust_cost(&c->p.n.costs.literal[i], - deflate_default_literal_cost(i)); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - deflate_adjust_cost(&c->p.n.costs.length[i], - deflate_default_length_slot_cost( - deflate_length_slot[i])); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - deflate_adjust_cost(&c->p.n.costs.offset_slot[i], - deflate_default_offset_slot_cost(i)); -} - -/* - * Find the minimum-cost path through the graph of possible match/literal - * choices for this block. - * - * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which - * represents the node at the beginning of the block, to - * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of - * the block. Edge costs are evaluated using the cost model 'c->p.n.costs'. - * - * The algorithm works backwards, starting at the end node and proceeding - * backwards one node at a time. At each node, the minimum cost to reach the - * end node is computed and the match/literal choice that begins that path is - * saved. - */ -static void -deflate_find_min_cost_path(struct libdeflate_compressor *c, - const u32 block_length, - const struct lz_match *cache_ptr) -{ - struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length]; - struct deflate_optimum_node *cur_node = end_node; - - cur_node->cost_to_end = 0; - do { - unsigned num_matches; - unsigned literal; - u32 best_cost_to_end; - - cur_node--; - cache_ptr--; - - num_matches = cache_ptr->length; - literal = cache_ptr->offset; - - /* It's always possible to choose a literal. */ - best_cost_to_end = c->p.n.costs.literal[literal] + - (cur_node + 1)->cost_to_end; - cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; - - /* Also consider matches if there are any. 
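- * The recurrence being evaluated is (a restatement of the code below, not
- * part of the original comment):
- *
- *	cost_to_end(i) = min(literal_cost(data[i]) + cost_to_end(i + 1),
- *			     min over matches (length, offset) at i of:
- *				offset_slot_cost(offset) + length_cost(length)
- *					+ cost_to_end(i + length))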
*/ - if (num_matches) { - const struct lz_match *match; - unsigned len; - unsigned offset; - unsigned offset_slot; - u32 offset_cost; - u32 cost_to_end; - - /* - * Consider each length from the minimum - * (DEFLATE_MIN_MATCH_LEN) to the length of the longest - * match found at this position. For each length, we - * consider only the smallest offset for which that - * length is available. Although this is not guaranteed - * to be optimal due to the possibility of a larger - * offset costing less than a smaller offset to code, - * this is a very useful heuristic. - */ - match = cache_ptr - num_matches; - len = DEFLATE_MIN_MATCH_LEN; - do { - offset = match->offset; - offset_slot = deflate_get_offset_slot(c, offset); - offset_cost = c->p.n.costs.offset_slot[offset_slot]; - do { - cost_to_end = offset_cost + - c->p.n.costs.length[len] + - (cur_node + len)->cost_to_end; - if (cost_to_end < best_cost_to_end) { - best_cost_to_end = cost_to_end; - cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len; - } - } while (++len <= match->length); - } while (++match != cache_ptr); - cache_ptr -= num_matches; - } - cur_node->cost_to_end = best_cost_to_end; - } while (cur_node != &c->p.n.optimum_nodes[0]); -} - -/* - * Choose the literal/match sequence to use for the current block. The basic - * algorithm finds a minimum-cost path through the block's graph of - * literal/match choices, given a cost model. However, the cost of each symbol - * is unknown until the Huffman codes have been built, but at the same time the - * Huffman codes depend on the frequencies of chosen symbols. Consequently, - * multiple passes must be used to try to approximate an optimal solution. The - * first pass uses default costs, mixed with the costs from the previous block - * if any. Later passes use the Huffman codeword lengths from the previous pass - * as the costs. - */ -static void -deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length, - const struct lz_match *cache_ptr, bool is_first_block) -{ - unsigned num_passes_remaining = c->p.n.num_optim_passes; - u32 i; - - /* Force the block to really end at the desired length, even if some - * matches extend beyond it. */ - for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, - ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) - c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - - /* Set the initial costs. */ - if (is_first_block) - deflate_set_default_costs(c); - else - deflate_adjust_costs(c); - - for (;;) { - /* Find the minimum cost path for this pass. */ - deflate_find_min_cost_path(c, block_length, cache_ptr); - - /* Compute frequencies of the chosen symbols. */ - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); - - if (--num_passes_remaining == 0) - break; - - /* At least one optimization pass remains; update the costs. */ - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_set_costs_from_codes(c, &c->codes.lens); - } -} - -/* - * This is the "near-optimal" DEFLATE compressor. It computes the optimal - * representation of each DEFLATE block using a minimum-cost path search over - * the graph of possible match/literal choices for that block, assuming a - * certain cost for each Huffman symbol. 
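- *
- * Per block, the flow is roughly (a summary of deflate_optimize_block()
- * above, not part of the original comment):
- *
- *	set default costs (or costs adjusted from the previous block)
- *	repeat num_optim_passes times:
- *		find the min-cost path under the current costs
- *		tally the symbol frequencies that path would use
- *		between passes, rebuild the Huffman codes from those
- *		frequencies and refresh the costs from the codeword lengths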
- * - * For several reasons, the end result is not guaranteed to be optimal: - * - * - Nonoptimal choice of blocks - * - Heuristic limitations on which matches are actually considered - * - Symbol costs are unknown until the symbols have already been chosen - * (so iterative optimization must be used) - */ -static size_t -deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, - const u8 * restrict in, size_t in_nbytes, - u8 * restrict out, size_t out_nbytes_avail) -{ - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - struct deflate_output_bitstream os; - const u8 *in_cur_base = in_next; - const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - deflate_init_output(&os, out, out_nbytes_avail); - bt_matchfinder_init(&c->p.n.bt_mf); - - do { - /* Starting a new DEFLATE block. */ - - struct lz_match *cache_ptr = c->p.n.match_cache; - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = - in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH); - const u8 *next_observation = in_next; - - init_block_split_stats(&c->split_stats); - - /* - * Find matches until we decide to end the block. We end the - * block if any of the following is true: - * - * (1) Maximum block length has been reached. - * (2) Match cache may overflow. - * (3) Block split heuristic says to split now. - */ - do { - struct lz_match *matches; - unsigned best_len; - - /* Slide the window forward if needed. */ - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - - /* Decrease the maximum and nice match lengths if we're - * approaching the end of the input buffer. */ - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - - /* - * Find matches at the current position using the - * binary tree matchfinder and save them in - * 'match_cache'. - * - * Note: the binary tree matchfinder is more suited for - * optimal parsing than the hash chain matchfinder. The - * reasons for this include: - * - * - The binary tree matchfinder can find more matches - * in the same number of steps. - * - One of the major advantages of hash chains is that - * skipping positions (not searching for matches at - * them) is faster; however, with optimal parsing we - * search for matches at almost all positions, so this - * advantage of hash chains is negated. - */ - matches = cache_ptr; - best_len = 0; - if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { - cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &best_len, - matches); - } - - if (in_next >= next_observation) { - if (best_len >= 4) { - observe_match(&c->split_stats, best_len); - next_observation = in_next + best_len; - } else { - observe_literal(&c->split_stats, *in_next); - next_observation = in_next + 1; - } - } - - cache_ptr->length = cache_ptr - matches; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - - /* - * If there was a very long match found, don't cache any - * matches for the bytes covered by that match. This - * avoids degenerate behavior when compressing highly - * redundant data, where the number of matches can be - * very large. 
- * - * This heuristic doesn't actually hurt the compression - * ratio very much. If there's a long match, then the - * data must be highly compressible, so it doesn't - * matter much what we do. - */ - if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) { - --best_len; - do { - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + MIN(in_end - in_next, - MATCHFINDER_WINDOW_SIZE); - } - if (unlikely(max_len > in_end - in_next)) { - max_len = in_end - in_next; - nice_len = MIN(nice_len, max_len); - } - if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) { - bt_matchfinder_skip_position(&c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - nice_len, - c->max_search_depth, - next_hashes); - } - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } - } while (in_next < in_max_block_end && - cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] && - !should_end_block(&c->split_stats, in_block_begin, in_next, in_end)); - - /* All the matches for this block have been cached. Now choose - * the sequence of items to output and flush the block. */ - deflate_optimize_block(c, in_next - in_block_begin, cache_ptr, - in_block_begin == in); - deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin, - in_next == in_end, true); - } while (in_next != in_end); - - return deflate_flush_output(&os); -} - -#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - -/* Initialize c->offset_slot_fast. */ -static void -deflate_init_offset_slot_fast(struct libdeflate_compressor *c) -{ - unsigned offset_slot; - unsigned offset; - unsigned offset_end; - - for (offset_slot = 0; - offset_slot < ARRAY_LEN(deflate_offset_slot_base); - offset_slot++) - { - offset = deflate_offset_slot_base[offset_slot]; - #if USE_FULL_OFFSET_SLOT_FAST - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[offset] = offset_slot; - } while (++offset != offset_end); - #else - if (offset <= 256) { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[offset - 1] = offset_slot; - } while (++offset != offset_end); - } else { - offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot; - } while ((offset += (1 << 7)) != offset_end); - } - #endif - } -} - -LIBDEFLATEAPI struct libdeflate_compressor * -libdeflate_alloc_compressor(int compression_level) -{ - struct libdeflate_compressor *c; - size_t size; - -#if SUPPORT_NEAR_OPTIMAL_PARSING - if (compression_level >= 8) - size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n); - else -#endif - size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g); - - c = aligned_malloc(MATCHFINDER_ALIGNMENT, size); - if (!c) - return NULL; - - switch (compression_level) { - case 1: - c->impl = deflate_compress_greedy; - c->max_search_depth = 2; - c->nice_match_length = 8; - break; - case 2: - c->impl = deflate_compress_greedy; - c->max_search_depth = 6; - c->nice_match_length = 10; - break; - case 3: - c->impl = deflate_compress_greedy; - c->max_search_depth = 12; - c->nice_match_length = 14; - break; - case 4: - c->impl = deflate_compress_greedy; - c->max_search_depth = 24; - c->nice_match_length = 24; - break; - case 5: - c->impl = deflate_compress_lazy; - c->max_search_depth = 20; - c->nice_match_length = 30; - break; - case 6: - c->impl = deflate_compress_lazy; - c->max_search_depth = 40; 
- c->nice_match_length = 65; - break; - case 7: - c->impl = deflate_compress_lazy; - c->max_search_depth = 100; - c->nice_match_length = 130; - break; -#if SUPPORT_NEAR_OPTIMAL_PARSING - case 8: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 12; - c->nice_match_length = 20; - c->p.n.num_optim_passes = 1; - break; - case 9: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 16; - c->nice_match_length = 26; - c->p.n.num_optim_passes = 2; - break; - case 10: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 30; - c->nice_match_length = 50; - c->p.n.num_optim_passes = 2; - break; - case 11: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 60; - c->nice_match_length = 80; - c->p.n.num_optim_passes = 3; - break; - case 12: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 100; - c->nice_match_length = 133; - c->p.n.num_optim_passes = 4; - break; -#else - case 8: - c->impl = deflate_compress_lazy; - c->max_search_depth = 150; - c->nice_match_length = 200; - break; - case 9: - c->impl = deflate_compress_lazy; - c->max_search_depth = 200; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; -#endif - default: - aligned_free(c); - return NULL; - } - - c->compression_level = compression_level; - - deflate_init_offset_slot_fast(c); - deflate_init_static_codes(c); - - return c; -} - -LIBDEFLATEAPI size_t -libdeflate_deflate_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) -{ - if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING)) - return 0; - - /* For extremely small inputs just use a single uncompressed block. */ - if (unlikely(in_nbytes < 16)) { - struct deflate_output_bitstream os; - deflate_init_output(&os, out, out_nbytes_avail); - if (in_nbytes == 0) - in = &os; /* Avoid passing NULL to memcpy() */ - deflate_write_uncompressed_block(&os, in, in_nbytes, true); - return deflate_flush_output(&os); - } - - return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail); -} - -LIBDEFLATEAPI void -libdeflate_free_compressor(struct libdeflate_compressor *c) -{ - aligned_free(c); -} - -unsigned int -deflate_get_compression_level(struct libdeflate_compressor *c) -{ - return c->compression_level; -} - -LIBDEFLATEAPI size_t -libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) -{ - /* - * The worst case is all uncompressed blocks where one block has length - * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH. - * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE, - * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN. - */ - size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); - return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING; -} diff --git a/ext/libdeflate/lib/deflate_compress.h b/ext/libdeflate/lib/deflate_compress.h deleted file mode 100644 index f4bb23b6..00000000 --- a/ext/libdeflate/lib/deflate_compress.h +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef LIB_DEFLATE_COMPRESS_H -#define LIB_DEFLATE_COMPRESS_H - -#include "lib_common.h" - -/* DEFLATE compression is private to deflate_compress.c, but we do need to be - * able to query the compression level for zlib and gzip header generation. 
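- *
- * For instance, the zlib header's 2-bit FLEVEL hint might be derived from
- * the compression level like this (an illustrative sketch only; the actual
- * mapping lives in zlib_compress.c):
- *
- *	unsigned level = deflate_get_compression_level(c);
- *	unsigned level_hint;
- *
- *	if (level < 2)
- *		level_hint = 0;		/* fastest */
- *	else if (level < 6)
- *		level_hint = 1;		/* fast */
- *	else if (level == 6)
- *		level_hint = 2;		/* default */
- *	else
- *		level_hint = 3;		/* maximum/slowest */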
*/ - -struct libdeflate_compressor; - -extern unsigned int -deflate_get_compression_level(struct libdeflate_compressor *c); - -#endif /* LIB_DEFLATE_COMPRESS_H */ diff --git a/ext/libdeflate/lib/deflate_constants.h b/ext/libdeflate/lib/deflate_constants.h deleted file mode 100644 index a10b57de..00000000 --- a/ext/libdeflate/lib/deflate_constants.h +++ /dev/null @@ -1,66 +0,0 @@ -/* - * deflate_constants.h - constants for the DEFLATE compression format - */ - -#ifndef LIB_DEFLATE_CONSTANTS_H -#define LIB_DEFLATE_CONSTANTS_H - -/* Valid block types */ -#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 -#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 -#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 - -/* Minimum and maximum supported match lengths (in bytes) */ -#define DEFLATE_MIN_MATCH_LEN 3 -#define DEFLATE_MAX_MATCH_LEN 258 - -/* Minimum and maximum supported match offsets (in bytes) */ -#define DEFLATE_MIN_MATCH_OFFSET 1 -#define DEFLATE_MAX_MATCH_OFFSET 32768 - -#define DEFLATE_MAX_WINDOW_SIZE 32768 - -/* Number of symbols in each Huffman code. Note: for the literal/length - * and offset codes, these are actually the maximum values; a given block - * might use fewer symbols. */ -#define DEFLATE_NUM_PRECODE_SYMS 19 -#define DEFLATE_NUM_LITLEN_SYMS 288 -#define DEFLATE_NUM_OFFSET_SYMS 32 - -/* The maximum number of symbols across all codes */ -#define DEFLATE_MAX_NUM_SYMS 288 - -/* Division of symbols in the literal/length code */ -#define DEFLATE_NUM_LITERALS 256 -#define DEFLATE_END_OF_BLOCK 256 -#define DEFLATE_NUM_LEN_SYMS 31 - -/* Maximum codeword length, in bits, within each Huffman code */ -#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 -#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 -#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 - -/* The maximum codeword length across all codes */ -#define DEFLATE_MAX_CODEWORD_LEN 15 - -/* Maximum possible overrun when decoding codeword lengths */ -#define DEFLATE_MAX_LENS_OVERRUN 137 - -/* - * Maximum number of extra bits that may be required to represent a match - * length or offset. - * - * TODO: are we going to have full DEFLATE64 support? If so, up to 16 - * length bits must be supported. - */ -#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 -#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14 - -/* The maximum number of bits in which a match can be represented. This - * is the absolute worst case, which assumes the longest possible Huffman - * codewords and the maximum numbers of extra bits. 
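- * With the constants above, that is 15 + 5 + 15 + 14 = 49 bits for a
- * worst-case match.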
*/ -#define DEFLATE_MAX_MATCH_BITS \ - (DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \ - DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS) - -#endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/ext/libdeflate/lib/deflate_decompress.c b/ext/libdeflate/lib/deflate_decompress.c deleted file mode 100644 index 19ccdb21..00000000 --- a/ext/libdeflate/lib/deflate_decompress.c +++ /dev/null @@ -1,997 +0,0 @@ -/* - * deflate_decompress.c - a decompressor for DEFLATE - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * --------------------------------------------------------------------------- - * - * This is a highly optimized DEFLATE decompressor. When compiled with gcc on - * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2 - * instructions are available). On other architectures it should still be - * significantly faster than zlib, but the difference may be smaller. - * - * Why this is faster than zlib's implementation: - * - * - Word accesses rather than byte accesses when reading input - * - Word accesses rather than byte accesses when copying matches - * - Faster Huffman decoding combined with various DEFLATE-specific tricks - * - Larger bitbuffer variable that doesn't need to be filled as often - * - Other optimizations to remove unnecessary branches - * - Only full-buffer decompression is supported, so the code doesn't need to - * support stopping and resuming decompression. - * - On x86_64, compile a version of the decompression routine using BMI2 - * instructions and use it automatically at runtime when supported. - */ - -#include -#include -#include - -#include "deflate_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -/* - * If the expression passed to SAFETY_CHECK() evaluates to false, then the - * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the - * compressed data is invalid. - * - * Theoretically, these checks could be disabled for specialized applications - * where all input to the decompressor will be trusted. - */ -#if 0 -# pragma message("UNSAFE DECOMPRESSION IS ENABLED. 
THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") -# define SAFETY_CHECK(expr) (void)(expr) -#else -# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA -#endif - -/* - * Each TABLEBITS number is the base-2 logarithm of the number of entries in the - * main portion of the corresponding decode table. Each number should be large - * enough to ensure that for typical data, the vast majority of symbols can be - * decoded by a direct lookup of the next TABLEBITS bits of compressed data. - * However, this must be balanced against the fact that a larger table requires - * more memory and requires more time to fill. - * - * Note: you cannot change a TABLEBITS number without also changing the - * corresponding ENOUGH number! - */ -#define PRECODE_TABLEBITS 7 -#define LITLEN_TABLEBITS 10 -#define OFFSET_TABLEBITS 8 - -/* - * Each ENOUGH number is the maximum number of decode table entries that may be - * required for the corresponding Huffman code, including the main table and all - * subtables. Each number depends on three parameters: - * - * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) - * (2) the number of main table bits (the TABLEBITS numbers defined above) - * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) - * - * The ENOUGH numbers were computed using the utility program 'enough' from - * zlib. This program enumerates all possible relevant Huffman codes to find - * the worst-case usage of decode table entries. - */ -#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ -#define LITLEN_ENOUGH 1334 /* enough 288 10 15 */ -#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ - -/* - * Type for codeword lengths. - */ -typedef u8 len_t; - -/* - * The main DEFLATE decompressor structure. Since this implementation only - * supports full buffer decompression, this structure does not store the entire - * decompression state, but rather only some arrays that are too large to - * comfortably allocate on the stack. - */ -struct libdeflate_decompressor { - - /* - * The arrays aren't all needed at the same time. 'precode_lens' and - * 'precode_decode_table' are unneeded after 'lens' has been filled. - * Furthermore, 'lens' need not be retained after building the litlen - * and offset decode tables. In fact, 'lens' can be in union with - * 'litlen_decode_table' provided that 'offset_decode_table' is separate - * and is built first. - */ - - union { - len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - - struct { - len_t lens[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS + - DEFLATE_MAX_LENS_OVERRUN]; - - u32 precode_decode_table[PRECODE_ENOUGH]; - } l; - - u32 litlen_decode_table[LITLEN_ENOUGH]; - } u; - - u32 offset_decode_table[OFFSET_ENOUGH]; - - /* used only during build_decode_table() */ - u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; - - bool static_codes_loaded; -}; - -/***************************************************************************** - * Input bitstream * - *****************************************************************************/ - -/* - * The state of the "input bitstream" consists of the following variables: - * - * - in_next: pointer to the next unread byte in the input buffer - * - * - in_end: pointer just past the end of the input buffer - * - * - bitbuf: a word-sized variable containing bits that have been read from - * the input buffer. The buffered bits are right-aligned - * (they're the low-order bits). - * - * - bitsleft: number of bits in 'bitbuf' that are valid. 
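- *
- * A worked illustration (not in the original comment): after reading input
- * bytes 0xB1 then 0x02 into an empty bitbuffer, bitbuf = 0x02B1 and
- * bitsleft = 16. DEFLATE is LSB-first, so BITS(3) returns 0x02B1 & 0x7 = 1,
- * and REMOVE_BITS(3) leaves bitbuf = 0x0056 and bitsleft = 13.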
- * - * To make it easier for the compiler to optimize the code by keeping variables - * in registers, these are declared as normal variables and manipulated using - * macros. - */ - -/* - * The type for the bitbuffer variable ('bitbuf' described above). For best - * performance, this should have size equal to a machine word. - * - * 64-bit platforms have a significant advantage: they get a bigger bitbuffer - * which they have to fill less often. - */ -typedef machine_word_t bitbuf_t; - -/* - * Number of bits the bitbuffer variable can hold. - * - * This is one less than the obvious value because of the optimized arithmetic - * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range - * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS]. - */ -#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) - -/* - * The maximum number of bits that can be ensured in the bitbuffer variable, - * i.e. the maximum value of 'n' that can be passed to ENSURE_BITS(n). The - * decoder only reads whole bytes from memory, so this is the lowest value of - * 'bitsleft' at which another byte cannot be read without first consuming some - * bits. - */ -#define MAX_ENSURE (BITBUF_NBITS - 7) - -/* - * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if - * 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a - * compile-time constant, then this expression will be a compile-time constant. - * Therefore, CAN_ENSURE() can be used to choose between alternative - * implementations at compile time. - */ -#define CAN_ENSURE(n) ((n) <= MAX_ENSURE) - -/* - * Fill the bitbuffer variable, reading one byte at a time. - * - * If we would overread the input buffer, we just don't read anything, leaving - * the bits zeroed but marking them filled. This simplifies the decompressor - * because it removes the need to distinguish between real overreads and - * overreads that occur only because of the decompressor's own lookahead. - * - * The disadvantage is that real overreads are not detected immediately. - * However, this is safe because the decompressor is still guaranteed to make - * forward progress when presented with never-ending 0 bits. Within an existing - * block, output will keep being generated, whereas any new block can only be an - * uncompressed block (since the type code for uncompressed blocks is 0), for - * which we do check for previous overread. But even if we didn't check, - * uncompressed blocks would fail to validate because LEN would not equal ~NLEN. - * So the decompressor will eventually either detect that the output buffer is - * full, or detect invalid input, or finish the final block. - */ -#define FILL_BITS_BYTEWISE() \ -do { \ - if (likely(in_next != in_end)) \ - bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \ - else \ - overrun_count++; \ - bitsleft += 8; \ -} while (bitsleft <= BITBUF_NBITS - 8) - -/* - * Fill the bitbuffer variable by reading the next word from the input buffer - * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits - * were filled. This can be significantly faster than FILL_BITS_BYTEWISE(). - * However, for this to work correctly, the word must be interpreted in - * little-endian format. In addition, the memory access may be unaligned. - * Therefore, this method is most efficient on little-endian architectures that - * support fast unaligned access, such as x86 and x86_64. - * - * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to - * be 1 less than the word size and therefore be all 1 bits. 
Then the number of - * bits filled is the value of the 0 bits in position >= 3 when changed to 1. - * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000 - * = 40 bits = 5 bytes. This uses only 4 operations to update 'in_next' and - * 'bitsleft': one each of +, ^, >>, and |. (Not counting operations the - * compiler optimizes out.) In contrast, the alternative of: - * - * in_next += (BITBUF_NBITS - bitsleft) >> 3; - * bitsleft += (BITBUF_NBITS - bitsleft) & ~7; - * - * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on - * average refill an extra bit, but uses 5 operations: two +, and one each of - * -, >>, and &. Also the - and & must be completed before 'bitsleft' can be - * updated, while the current solution updates 'bitsleft' with no dependencies. - */ -#define FILL_BITS_WORDWISE() \ -do { \ - /* BITBUF_NBITS must be all 1's in binary, see above */ \ - STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\ - \ - bitbuf |= get_unaligned_leword(in_next) << bitsleft; \ - in_next += (bitsleft ^ BITBUF_NBITS) >> 3; \ - bitsleft |= BITBUF_NBITS & ~7; \ -} while (0) - -/* - * Does the bitbuffer variable currently contain at least 'n' bits? - */ -#define HAVE_BITS(n) (bitsleft >= (n)) - -/* - * Load more bits from the input buffer until the specified number of bits is - * present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE - * and CAN_ENSURE(). - */ -#define ENSURE_BITS(n) \ -if (!HAVE_BITS(n)) { \ - if (CPU_IS_LITTLE_ENDIAN() && \ - UNALIGNED_ACCESS_IS_FAST && \ - likely(in_end - in_next >= sizeof(bitbuf_t))) \ - FILL_BITS_WORDWISE(); \ - else \ - FILL_BITS_BYTEWISE(); \ -} - -/* - * Return the next 'n' bits from the bitbuffer variable without removing them. - */ -#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1)) - -/* - * Remove the next 'n' bits from the bitbuffer variable. - */ -#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n)) - -/* - * Remove and return the next 'n' bits from the bitbuffer variable. - */ -#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32) - -/* - * Verify that the input buffer hasn't been overread, then align the input to - * the next byte boundary, discarding any remaining bits in the current byte. - * - * Note that if the bitbuffer variable currently contains more than 7 bits, then - * we must rewind 'in_next', effectively putting those bits back. Only the bits - * in what would be the "current" byte if we were reading one byte at a time can - * be actually discarded. - */ -#define ALIGN_INPUT() \ -do { \ - SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \ - in_next -= (bitsleft >> 3) - overrun_count; \ - overrun_count = 0; \ - bitbuf = 0; \ - bitsleft = 0; \ -} while(0) - -/* - * Read a 16-bit value from the input. This must have been preceded by a call - * to ALIGN_INPUT(), and the caller must have already checked for overrun. - */ -#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16) - -/***************************************************************************** - * Huffman decoding * - *****************************************************************************/ - -/* - * A decode table for order TABLEBITS consists of a main table of (1 << - * TABLEBITS) entries followed by a variable number of subtables. - * - * The decoding algorithm takes the next TABLEBITS bits of compressed data and - * uses them as an index into the decode table. 
- * The resulting entry is either a "direct entry", meaning that it contains
- * the value desired, or a "subtable pointer", meaning that the entry
- * references a subtable that must be indexed using more bits of the
- * compressed data to decode the symbol.
- *
- * Each decode table (a main table along with its subtables, if any) is
- * associated with a Huffman code. Logically, the result of a decode table
- * lookup is a symbol from the alphabet from which the corresponding Huffman
- * code was constructed. A symbol with codeword length n <= TABLEBITS is
- * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
- * symbol with codeword length n > TABLEBITS is associated with one or more
- * subtable entries.
- *
- * On top of this basic design, we implement several optimizations:
- *
- * - We store the length of each codeword directly in each of its decode table
- *   entries. This allows the codeword length to be produced without indexing
- *   an additional table.
- *
- * - When beneficial, we don't store the Huffman symbol itself, but instead
- *   data generated from it. For example, when decoding an offset symbol in
- *   DEFLATE, it's more efficient if we can decode the offset base and number
- *   of extra offset bits directly rather than decoding the offset symbol and
- *   then looking up both of those values in an additional table or tables.
- *
- * The size of each decode table entry is 32 bits, which provides slightly
- * better performance than 16-bit entries on 32- and 64-bit processors,
- * provided that the table doesn't get so large that it takes up too much
- * memory and starts generating cache misses. The bits of each decode table
- * entry are defined as follows:
- *
- * - Bits 30 -- 31: flags (see below)
- * - Bits 8 -- 29: decode result: a Huffman symbol or related data
- * - Bits 0 -- 7: codeword length
- */
-
-/*
- * This flag is set in all main decode table entries that represent subtable
- * pointers.
- */
-#define HUFFDEC_SUBTABLE_POINTER	0x80000000
-
-/*
- * This flag is set in all entries in the litlen decode table that represent
- * literals.
- */
-#define HUFFDEC_LITERAL			0x40000000
-
-/* Mask for extracting the codeword length from a decode table entry. */
-#define HUFFDEC_LENGTH_MASK		0xFF
-
-/* Shift to extract the decode result from a decode table entry. */
-#define HUFFDEC_RESULT_SHIFT		8
-
-/* Shift a decode result into its position in the decode table entry. */
-#define HUFFDEC_RESULT_ENTRY(result)	((u32)(result) << HUFFDEC_RESULT_SHIFT)
-
-/* The decode result for each precode symbol. There is no special optimization
- * for the precode; the decode result is simply the symbol value. */
-static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
-#define ENTRY(presym)	HUFFDEC_RESULT_ENTRY(presym)
-	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
-	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
-	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
-	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
-	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  ,
-#undef ENTRY
-};
-
-/* The decode result for each litlen symbol. For literals, this is the literal
- * value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length
- * base and the number of extra length bits.
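- * For example, litlen symbol 257 decodes to length base 3 with 0 extra bits,
- * and symbol 265 to length base 11 with 1 extra bit; these are the first and
- * ninth length ENTRY() values below.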
*/ -static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = { - - /* Literals */ -#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal)) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , - ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , - ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , - ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , - ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , - ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , - ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , - ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , - ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , - ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , - ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , - ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , - ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , - ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , - ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , - ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , - ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , - ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , - ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , - ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , - ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , - ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , - ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , - ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , - ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , - ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , - ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , - ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , - ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , - ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , - ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , - ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , - ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , - ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , - ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , - ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , - ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , - ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , - ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , - ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , - ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , - ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , - ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , - ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , - ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , - ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , - ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , - ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , - ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , - ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , - ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , - ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , - ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , - ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , - ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , - ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , - ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , - ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , - ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , - ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , -#undef ENTRY - -#define 
HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF -#define HUFFDEC_LENGTH_BASE_SHIFT 8 -#define HUFFDEC_END_OF_BLOCK_LENGTH 0 - -#define ENTRY(length_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ - ((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits)) - - /* End of block */ - ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0), - - /* Lengths */ - ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), - ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), - ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), - ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), - ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), - ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), - ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), - ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , -#undef ENTRY -}; - -/* The decode result for each offset symbol. This is the offset base and the - * number of extra offset bits. */ -static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = { - -#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16 -#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1) - -#define ENTRY(offset_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \ - ((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \ - (offset_base)) - ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , - ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , - ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , - ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , - ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , - ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , - ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , - ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) , -#undef ENTRY -}; - -/* - * Build a table for fast decoding of symbols from a Huffman code. As input, - * this function takes the codeword length of each symbol which may be used in - * the code. As output, it produces a decode table for the canonical Huffman - * code described by the codeword lengths. The decode table is built with the - * assumption that it will be indexed with "bit-reversed" codewords, where the - * low-order bit is the first bit of the codeword. This format is used for all - * Huffman codes in DEFLATE. - * - * @decode_table - * The array in which the decode table will be generated. This array must - * have sufficient length; see the definition of the ENOUGH numbers. - * @lens - * An array which provides, for each symbol, the length of the - * corresponding codeword in bits, or 0 if the symbol is unused. This may - * alias @decode_table, since nothing is written to @decode_table until all - * @lens have been consumed. All codeword lengths are assumed to be <= - * @max_codeword_len but are otherwise considered untrusted. If they do - * not form a valid Huffman code, then the decode table is not built and - * %false is returned. - * @num_syms - * The number of symbols in the code, including all unused symbols. - * @decode_results - * An array which provides, for each symbol, the actual value to store into - * the decode table. This value will be directly produced as the result of - * decoding that symbol, thereby moving the indirection out of the decode - * loop and into the table initialization. - * @table_bits - * The log base-2 of the number of main table entries to use. 
- * @max_codeword_len - * The maximum allowed codeword length for this Huffman code. - * Must be <= DEFLATE_MAX_CODEWORD_LEN. - * @sorted_syms - * A temporary array of length @num_syms. - * - * Returns %true if successful; %false if the codeword lengths do not form a - * valid Huffman code. - */ -static bool -build_decode_table(u32 decode_table[], - const len_t lens[], - const unsigned num_syms, - const u32 decode_results[], - const unsigned table_bits, - const unsigned max_codeword_len, - u16 *sorted_syms) -{ - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned sym; /* current symbol */ - unsigned codeword; /* current codeword, bit-reversed */ - unsigned len; /* current codeword length in bits */ - unsigned count; /* num codewords remaining with this length */ - u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ - unsigned cur_table_end; /* end index of current table */ - unsigned subtable_prefix; /* codeword prefix of current subtable */ - unsigned subtable_start; /* start index of current subtable */ - unsigned subtable_bits; /* log2 of current subtable length */ - - /* Count how many codewords have each length, including 0. */ - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - for (sym = 0; sym < num_syms; sym++) - len_counts[lens[sym]]++; - - /* - * Sort the symbols primarily by increasing codeword length and - * secondarily by increasing symbol value; or equivalently by their - * codewords in lexicographic order, since a canonical code is assumed. - * - * For efficiency, also compute 'codespace_used' in the same pass over - * 'len_counts[]' used to build 'offsets[]' for sorting. - */ - - /* Ensure that 'codespace_used' cannot overflow. */ - STATIC_ASSERT(sizeof(codespace_used) == 4); - STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= - DEFLATE_MAX_NUM_SYMS); - - offsets[0] = 0; - offsets[1] = len_counts[0]; - codespace_used = 0; - for (len = 1; len < max_codeword_len; len++) { - offsets[len + 1] = offsets[len] + len_counts[len]; - codespace_used = (codespace_used << 1) + len_counts[len]; - } - codespace_used = (codespace_used << 1) + len_counts[len]; - - for (sym = 0; sym < num_syms; sym++) - sorted_syms[offsets[lens[sym]]++] = sym; - - sorted_syms += offsets[0]; /* Skip unused symbols */ - - /* lens[] is done being used, so we can write to decode_table[] now. */ - - /* - * Check whether the lengths form a complete code (exactly fills the - * codespace), an incomplete code (doesn't fill the codespace), or an - * overfull code (overflows the codespace). A codeword of length 'n' - * uses proportion '1/(2^n)' of the codespace. An overfull code is - * nonsensical, so is considered invalid. An incomplete code is - * considered valid only in two specific cases; see below. - */ - - /* overfull code? */ - if (unlikely(codespace_used > (1U << max_codeword_len))) - return false; - - /* incomplete code? */ - if (unlikely(codespace_used < (1U << max_codeword_len))) { - u32 entry; - unsigned i; - - if (codespace_used == 0) { - /* - * An empty code is allowed. This can happen for the - * offset code in DEFLATE, since a dynamic Huffman block - * need not contain any matches. - */ - - /* sym=0, len=1 (arbitrary) */ - entry = decode_results[0] | 1; - } else { - /* - * Allow codes with a single used symbol, with codeword - * length 1. The DEFLATE RFC is unclear regarding this - * case. 
What zlib's decompressor does is permit this - * for the litlen and offset codes and assume the - * codeword is '0' rather than '1'. We do the same - * except we allow this for precodes too, since there's - * no convincing reason to treat the codes differently. - * We also assign both codewords '0' and '1' to the - * symbol to avoid having to handle '1' specially. - */ - if (codespace_used != (1U << (max_codeword_len - 1)) || - len_counts[1] != 1) - return false; - entry = decode_results[*sorted_syms] | 1; - } - /* - * Note: the decode table still must be fully initialized, in - * case the stream is malformed and contains bits from the part - * of the codespace the incomplete code doesn't use. - */ - for (i = 0; i < (1U << table_bits); i++) - decode_table[i] = entry; - return true; - } - - /* - * The lengths form a complete code. Now, enumerate the codewords in - * lexicographic order and fill the decode table entries for each one. - * - * First, process all codewords with len <= table_bits. Each one gets - * '2^(table_bits-len)' direct entries in the table. - * - * Since DEFLATE uses bit-reversed codewords, these entries aren't - * consecutive but rather are spaced '2^len' entries apart. This makes - * filling them naively somewhat awkward and inefficient, since strided - * stores are less cache-friendly and preclude the use of word or - * vector-at-a-time stores to fill multiple entries per instruction. - * - * To optimize this, we incrementally double the table size. When - * processing codewords with length 'len', the table is treated as - * having only '2^len' entries, so each codeword uses just one entry. - * Then, each time 'len' is incremented, the table size is doubled and - * the first half is copied to the second half. This significantly - * improves performance over naively doing strided stores. - * - * Note that some entries copied for each table doubling may not have - * been initialized yet, but it doesn't matter since they're guaranteed - * to be initialized later (because the Huffman code is complete). - */ - codeword = 0; - len = 1; - while ((count = len_counts[len]) == 0) - len++; - cur_table_end = 1U << len; - while (len <= table_bits) { - /* Process all 'count' codewords with length 'len' bits. */ - do { - unsigned bit; - - /* Fill the first entry for the current codeword. */ - decode_table[codeword] = - decode_results[*sorted_syms++] | len; - - if (codeword == cur_table_end - 1) { - /* Last codeword (all 1's) */ - for (; len < table_bits; len++) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * - sizeof(decode_table[0])); - cur_table_end <<= 1; - } - return true; - } - /* - * To advance to the lexicographically next codeword in - * the canonical code, the codeword must be incremented, - * then 0's must be appended to the codeword as needed - * to match the next codeword's length. - * - * Since the codeword is bit-reversed, appending 0's is - * a no-op. However, incrementing it is nontrivial. To - * do so efficiently, use the 'bsr' instruction to find - * the last (highest order) 0 bit in the codeword, set - * it, and clear any later (higher order) 1 bits. But - * 'bsr' actually finds the highest order 1 bit, so to - * use it first flip all bits in the codeword by XOR'ing - * it with (1U << len) - 1 == cur_table_end - 1. - */ - bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); - codeword &= bit - 1; - codeword |= bit; - } while (--count); - - /* Advance to the next codeword length. 
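-		 * Each time 'len' is incremented while it is still
-		 * <= table_bits, the table is doubled in size and its first
-		 * half is copied into the newly visible second half.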
*/ - do { - if (++len <= table_bits) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * sizeof(decode_table[0])); - cur_table_end <<= 1; - } - } while ((count = len_counts[len]) == 0); - } - - /* Process codewords with len > table_bits. These require subtables. */ - cur_table_end = 1U << table_bits; - subtable_prefix = -1; - subtable_start = 0; - for (;;) { - u32 entry; - unsigned i; - unsigned stride; - unsigned bit; - - /* - * Start a new subtable if the first 'table_bits' bits of the - * codeword don't match the prefix of the current subtable. - */ - if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { - subtable_prefix = (codeword & ((1U << table_bits) - 1)); - subtable_start = cur_table_end; - /* - * Calculate the subtable length. If the codeword has - * length 'table_bits + n', then the subtable needs - * '2^n' entries. But it may need more; if fewer than - * '2^n' codewords of length 'table_bits + n' remain, - * then the length will need to be incremented to bring - * in longer codewords until the subtable can be - * completely filled. Note that because the Huffman - * code is complete, it will always be possible to fill - * the subtable eventually. - */ - subtable_bits = len - table_bits; - codespace_used = count; - while (codespace_used < (1U << subtable_bits)) { - subtable_bits++; - codespace_used = (codespace_used << 1) + - len_counts[table_bits + subtable_bits]; - } - cur_table_end = subtable_start + (1U << subtable_bits); - - /* - * Create the entry that points from the main table to - * the subtable. This entry contains the index of the - * start of the subtable and the number of bits with - * which the subtable is indexed (the log base 2 of the - * number of entries it contains). - */ - decode_table[subtable_prefix] = - HUFFDEC_SUBTABLE_POINTER | - HUFFDEC_RESULT_ENTRY(subtable_start) | - subtable_bits; - } - - /* Fill the subtable entries for the current codeword. */ - entry = decode_results[*sorted_syms++] | (len - table_bits); - i = subtable_start + (codeword >> table_bits); - stride = 1U << (len - table_bits); - do { - decode_table[i] = entry; - i += stride; - } while (i < cur_table_end); - - /* Advance to the next codeword. */ - if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */ - return true; - bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); - codeword &= bit - 1; - codeword |= bit; - count--; - while (count == 0) - count = len_counts[++len]; - } -} - -/* Build the decode table for the precode. */ -static bool -build_precode_decode_table(struct libdeflate_decompressor *d) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); - - return build_decode_table(d->u.l.precode_decode_table, - d->u.precode_lens, - DEFLATE_NUM_PRECODE_SYMS, - precode_decode_results, - PRECODE_TABLEBITS, - DEFLATE_MAX_PRE_CODEWORD_LEN, - d->sorted_syms); -} - -/* Build the decode table for the literal/length code. */ -static bool -build_litlen_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334); - - return build_decode_table(d->u.litlen_decode_table, - d->u.l.lens, - num_litlen_syms, - litlen_decode_results, - LITLEN_TABLEBITS, - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - d->sorted_syms); -} - -/* Build the decode table for the offset code. 
*/ -static bool -build_offset_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) -{ - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); - - return build_decode_table(d->offset_decode_table, - d->u.l.lens + num_litlen_syms, - num_offset_syms, - offset_decode_results, - OFFSET_TABLEBITS, - DEFLATE_MAX_OFFSET_CODEWORD_LEN, - d->sorted_syms); -} - -static forceinline machine_word_t -repeat_byte(u8 b) -{ - machine_word_t v; - - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - - v = b; - v |= v << 8; - v |= v << 16; - v |= v << ((WORDBITS == 64) ? 32 : 0); - return v; -} - -static forceinline void -copy_word_unaligned(const void *src, void *dst) -{ - store_word_unaligned(load_word_unaligned(src), dst); -} - -/***************************************************************************** - * Main decompression routine - *****************************************************************************/ - -typedef enum libdeflate_result (*decompress_func_t) - (struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); - -#undef DEFAULT_IMPL -#undef DISPATCH -#if defined(__i386__) || defined(__x86_64__) -# include "x86/decompress_impl.h" -#endif - -#ifndef DEFAULT_IMPL -# define FUNCNAME deflate_decompress_default -# define ATTRIBUTES -# include "decompress_template.h" -# define DEFAULT_IMPL deflate_decompress_default -#endif - -#ifdef DISPATCH -static enum libdeflate_result -dispatch(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); - -static volatile decompress_func_t decompress_impl = dispatch; - -/* Choose the fastest implementation at runtime */ -static enum libdeflate_result -dispatch(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) -{ - decompress_func_t f = arch_select_decompress_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - decompress_impl = f; - return (*f)(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); -} -#else -# define decompress_impl DEFAULT_IMPL /* only one implementation, use it */ -#endif - - -/* - * This is the main DEFLATE decompression routine. See libdeflate.h for the - * documentation. - * - * Note that the real code is in decompress_template.h. The part here just - * handles calling the appropriate implementation depending on the CPU features - * at runtime. 
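- *
- * On the first call, 'decompress_impl' points at dispatch(), which picks the
- * best implementation for the current CPU, caches that choice, and forwards
- * the call; later calls go straight to the cached function pointer.
- *
- * A minimal usage sketch of the public API defined below (hypothetical buffer
- * names; error handling elided):
- *
- *	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
- *	size_t out_len;
- *	enum libdeflate_result res =
- *		libdeflate_deflate_decompress(d, in_buf, in_len,
- *					      out_buf, out_cap, &out_len);
- *	libdeflate_free_decompressor(d);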
- */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) -{ - return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); -} - -LIBDEFLATEAPI enum libdeflate_result -libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) -{ - return libdeflate_deflate_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); -} - -LIBDEFLATEAPI struct libdeflate_decompressor * -libdeflate_alloc_decompressor(void) -{ - /* - * Note that only certain parts of the decompressor actually must be - * initialized here: - * - * - 'static_codes_loaded' must be initialized to false. - * - * - The first half of the main portion of each decode table must be - * initialized to any value, to avoid reading from uninitialized - * memory during table expansion in build_decode_table(). (Although, - * this is really just to avoid warnings with dynamic tools like - * valgrind, since build_decode_table() is guaranteed to initialize - * all entries eventually anyway.) - * - * But for simplicity, we currently just zero the whole decompressor. - */ - return calloc(1, sizeof(struct libdeflate_decompressor)); -} - -LIBDEFLATEAPI void -libdeflate_free_decompressor(struct libdeflate_decompressor *d) -{ - free(d); -} diff --git a/ext/libdeflate/lib/gzip_compress.c b/ext/libdeflate/lib/gzip_compress.c deleted file mode 100644 index bfc75e2b..00000000 --- a/ext/libdeflate/lib/gzip_compress.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * gzip_compress.c - compress with a gzip wrapper - * - * Originally public domain; changes after 2016-09-07 are copyrighted. - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include "deflate_compress.h" -#include "gzip_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -LIBDEFLATEAPI size_t -libdeflate_gzip_compress(struct libdeflate_compressor *c, - const void *in, size_t in_size, - void *out, size_t out_nbytes_avail) -{ - u8 *out_next = out; - unsigned compression_level; - u8 xfl; - size_t deflate_size; - - if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) - return 0; - - /* ID1 */ - *out_next++ = GZIP_ID1; - /* ID2 */ - *out_next++ = GZIP_ID2; - /* CM */ - *out_next++ = GZIP_CM_DEFLATE; - /* FLG */ - *out_next++ = 0; - /* MTIME */ - put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); - out_next += 4; - /* XFL */ - xfl = 0; - compression_level = deflate_get_compression_level(c); - if (compression_level < 2) - xfl |= GZIP_XFL_FASTEST_COMRESSION; - else if (compression_level >= 8) - xfl |= GZIP_XFL_SLOWEST_COMRESSION; - *out_next++ = xfl; - /* OS */ - *out_next++ = GZIP_OS_UNKNOWN; /* OS */ - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next, - out_nbytes_avail - GZIP_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* CRC32 */ - put_unaligned_le32(libdeflate_crc32(0, in, in_size), out_next); - out_next += 4; - - /* ISIZE */ - put_unaligned_le32((u32)in_size, out_next); - out_next += 4; - - return out_next - (u8 *)out; -} - -LIBDEFLATEAPI size_t -libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) -{ - return GZIP_MIN_OVERHEAD + - libdeflate_deflate_compress_bound(c, in_nbytes); -} diff --git a/ext/libdeflate/lib/gzip_constants.h b/ext/libdeflate/lib/gzip_constants.h deleted file mode 100644 index 40dd4358..00000000 --- a/ext/libdeflate/lib/gzip_constants.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * gzip_constants.h - constants for the gzip wrapper format - */ - -#ifndef LIB_GZIP_CONSTANTS_H -#define LIB_GZIP_CONSTANTS_H - -#define GZIP_MIN_HEADER_SIZE 10 -#define GZIP_FOOTER_SIZE 8 -#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) - -#define GZIP_ID1 0x1F -#define GZIP_ID2 0x8B - -#define GZIP_CM_DEFLATE 8 - -#define GZIP_FTEXT 0x01 -#define GZIP_FHCRC 0x02 -#define GZIP_FEXTRA 0x04 -#define GZIP_FNAME 0x08 -#define GZIP_FCOMMENT 0x10 -#define GZIP_FRESERVED 0xE0 - -#define GZIP_MTIME_UNAVAILABLE 0 - -#define GZIP_XFL_SLOWEST_COMRESSION 0x02 -#define GZIP_XFL_FASTEST_COMRESSION 0x04 - -#define GZIP_OS_FAT 0 -#define GZIP_OS_AMIGA 1 -#define GZIP_OS_VMS 2 -#define GZIP_OS_UNIX 3 -#define GZIP_OS_VM_CMS 4 -#define GZIP_OS_ATARI_TOS 5 -#define GZIP_OS_HPFS 6 -#define GZIP_OS_MACINTOSH 7 -#define GZIP_OS_Z_SYSTEM 8 -#define GZIP_OS_CP_M 9 -#define GZIP_OS_TOPS_20 10 -#define GZIP_OS_NTFS 11 -#define GZIP_OS_QDOS 12 -#define GZIP_OS_RISCOS 13 -#define GZIP_OS_UNKNOWN 255 - -#endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/ext/libdeflate/lib/gzip_decompress.c b/ext/libdeflate/lib/gzip_decompress.c deleted file mode 100644 index 5703093e..00000000 --- a/ext/libdeflate/lib/gzip_decompress.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * gzip_decompress.c - decompress with a gzip wrapper - * - * Originally public domain; changes after 2016-09-07 are copyrighted. 
- * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "gzip_constants.h" -#include "unaligned.h" - -#include "libdeflate.h" - -LIBDEFLATEAPI enum libdeflate_result -libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) -{ - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - u8 flg; - size_t actual_in_nbytes; - size_t actual_out_nbytes; - enum libdeflate_result result; - - if (in_nbytes < GZIP_MIN_OVERHEAD) - return LIBDEFLATE_BAD_DATA; - - /* ID1 */ - if (*in_next++ != GZIP_ID1) - return LIBDEFLATE_BAD_DATA; - /* ID2 */ - if (*in_next++ != GZIP_ID2) - return LIBDEFLATE_BAD_DATA; - /* CM */ - if (*in_next++ != GZIP_CM_DEFLATE) - return LIBDEFLATE_BAD_DATA; - flg = *in_next++; - /* MTIME */ - in_next += 4; - /* XFL */ - in_next += 1; - /* OS */ - in_next += 1; - - if (flg & GZIP_FRESERVED) - return LIBDEFLATE_BAD_DATA; - - /* Extra field */ - if (flg & GZIP_FEXTRA) { - u16 xlen = get_unaligned_le16(in_next); - in_next += 2; - - if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - - in_next += xlen; - } - - /* Original file name (zero terminated) */ - if (flg & GZIP_FNAME) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* File comment (zero terminated) */ - if (flg & GZIP_FCOMMENT) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* CRC16 for gzip header */ - if (flg & GZIP_FHCRC) { - in_next += 2; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* Compressed data */ - result = libdeflate_deflate_decompress_ex(d, in_next, - in_end - GZIP_FOOTER_SIZE - in_next, - out, out_nbytes_avail, - &actual_in_nbytes, - actual_out_nbytes_ret); - if (result != LIBDEFLATE_SUCCESS) - return result; - - if (actual_out_nbytes_ret) - actual_out_nbytes = *actual_out_nbytes_ret; - else - actual_out_nbytes = out_nbytes_avail; - - in_next += actual_in_nbytes; - - /* CRC32 */ - if (libdeflate_crc32(0, out, actual_out_nbytes) != - get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - /* ISIZE */ - if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - if 
(actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - return LIBDEFLATE_SUCCESS; -} - -LIBDEFLATEAPI enum libdeflate_result -libdeflate_gzip_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) -{ - return libdeflate_gzip_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); -} diff --git a/ext/libdeflate/lib/hc_matchfinder.h b/ext/libdeflate/lib/hc_matchfinder.h deleted file mode 100644 index 8412a6fa..00000000 --- a/ext/libdeflate/lib/hc_matchfinder.h +++ /dev/null @@ -1,403 +0,0 @@ -/* - * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists - * - * Originally public domain; changes after 2016-09-07 are copyrighted. - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - * - * --------------------------------------------------------------------------- - * - * Algorithm - * - * This is a Hash Chains (hc) based matchfinder. - * - * The main data structure is a hash table where each hash bucket contains a - * linked list (or "chain") of sequences whose first 4 bytes share the same hash - * code. Each sequence is identified by its starting position in the input - * buffer. - * - * The algorithm processes the input buffer sequentially. At each byte - * position, the hash code of the first 4 bytes of the sequence beginning at - * that position (the sequence being matched against) is computed. This - * identifies the hash bucket to use for that position. Then, this hash - * bucket's linked list is searched for matches. Then, a new linked list node - * is created to represent the current sequence and is prepended to the list. - * - * This algorithm has several useful properties: - * - * - It only finds true Lempel-Ziv matches; i.e., those where the matching - * sequence occurs prior to the sequence being matched against. - * - * - The sequences in each linked list are always sorted by decreasing starting - * position. Therefore, the closest (smallest offset) matches are found - * first, which in many compression formats tend to be the cheapest to encode. - * - * - Although fast running time is not guaranteed due to the possibility of the - * lists getting very long, the worst degenerate behavior can be easily - * prevented by capping the number of nodes searched at each position. 
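- *   (The 'max_search_depth' parameter of hc_matchfinder_longest_match()
- *   serves exactly this purpose.)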
- * - * - If the compressor decides not to search for matches at a certain position, - * then that position can be quickly inserted without searching the list. - * - * - The algorithm is adaptable to sliding windows: just store the positions - * relative to a "base" value that is updated from time to time, and stop - * searching each list when the sequences get too far away. - * - * ---------------------------------------------------------------------------- - * - * Optimizations - * - * The main hash table and chains handle length 4+ matches. Length 3 matches - * are handled by a separate hash table with no chains. This works well for - * typical "greedy" or "lazy"-style compressors, where length 3 matches are - * often only helpful if they have small offsets. Instead of searching a full - * chain for length 3+ matches, the algorithm just checks for one close length 3 - * match, then focuses on finding length 4+ matches. - * - * The longest_match() and skip_positions() functions are inlined into the - * compressors that use them. This isn't just about saving the overhead of a - * function call. These functions are intended to be called from the inner - * loops of compressors, where giving the compiler more control over register - * allocation is very helpful. There is also significant benefit to be gained - * from allowing the CPU to predict branches independently at each call site. - * For example, "lazy"-style compressors can be written with two calls to - * longest_match(), each of which starts with a different 'best_len' and - * therefore has significantly different performance characteristics. - * - * Although any hash function can be used, a multiplicative hash is fast and - * works well. - * - * On some processors, it is significantly faster to extend matches by whole - * words (32 or 64 bits) instead of by individual bytes. For this to be the - * case, the processor must implement unaligned memory accesses efficiently and - * must have either a fast "find first set bit" instruction or a fast "find last - * set bit" instruction, depending on the processor's endianness. - * - * The code uses one loop for finding the first match and one loop for finding a - * longer match. Each of these loops is tuned for its respective task and in - * combination are faster than a single generalized loop that handles both - * tasks. - * - * The code also uses a tight inner loop that only compares the last and first - * bytes of a potential match. It is only when these bytes match that a full - * match extension is attempted. - * - * ---------------------------------------------------------------------------- - */ - -#include "matchfinder_common.h" - -#define HC_MATCHFINDER_HASH3_ORDER 15 -#define HC_MATCHFINDER_HASH4_ORDER 16 - -#define HC_MATCHFINDER_TOTAL_HASH_LENGTH \ - ((1UL << HC_MATCHFINDER_HASH3_ORDER) + \ - (1UL << HC_MATCHFINDER_HASH4_ORDER)) - -struct hc_matchfinder { - - /* The hash table for finding length 3 matches */ - mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; - - /* The hash table which contains the first nodes of the linked lists for - * finding length 4+ matches */ - mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; - - /* The "next node" references for the linked lists. The "next node" of - * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ - mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE]; - -} -#ifdef _aligned_attribute - _aligned_attribute(MATCHFINDER_ALIGNMENT) -#endif -; - -/* Prepare the matchfinder for a new input buffer. 
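- * This sets every entry of both hash tables to MATCHFINDER_INITVAL, i.e. it
- * marks every position as being outside the window.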
*/ -static forceinline void -hc_matchfinder_init(struct hc_matchfinder *mf) -{ - matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_LENGTH); -} - -static forceinline void -hc_matchfinder_slide_window(struct hc_matchfinder *mf) -{ - matchfinder_rebase((mf_pos_t *)mf, - sizeof(struct hc_matchfinder) / sizeof(mf_pos_t)); -} - -/* - * Find the longest match longer than 'best_len' bytes. - * - * @mf - * The matchfinder structure. - * @in_base_p - * Location of a pointer which points to the place in the input data the - * matchfinder currently stores positions relative to. This may be updated - * by this function. - * @cur_pos - * The current position in the input buffer relative to @in_base (the - * position of the sequence being matched against). - * @best_len - * Require a match longer than this length. - * @max_len - * The maximum permissible match length at this position. - * @nice_len - * Stop searching if a match of at least this length is found. - * Must be <= @max_len. - * @max_search_depth - * Limit on the number of potential matches to consider. Must be >= 1. - * @next_hashes - * The precomputed hash codes for the sequence beginning at @in_next. - * These will be used and then updated with the precomputed hashcodes for - * the sequence beginning at @in_next + 1. - * @offset_ret - * If a match is found, its offset is returned in this location. - * - * Return the length of the match found, or 'best_len' if no match longer than - * 'best_len' was found. - */ -static forceinline u32 -hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf, - const u8 ** const restrict in_base_p, - const u8 * const restrict in_next, - u32 best_len, - const u32 max_len, - const u32 nice_len, - const u32 max_search_depth, - u32 * const restrict next_hashes, - u32 * const restrict offset_ret) -{ - u32 depth_remaining = max_search_depth; - const u8 *best_matchptr = in_next; - mf_pos_t cur_node3, cur_node4; - u32 hash3, hash4; - u32 next_hashseq; - u32 seq4; - const u8 *matchptr; - u32 len; - u32 cur_pos = in_next - *in_base_p; - const u8 *in_base; - mf_pos_t cutoff; - - if (cur_pos == MATCHFINDER_WINDOW_SIZE) { - hc_matchfinder_slide_window(mf); - *in_base_p += MATCHFINDER_WINDOW_SIZE; - cur_pos = 0; - } - - in_base = *in_base_p; - cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE; - - if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */ - goto out; - - /* Get the precomputed hash codes. */ - hash3 = next_hashes[0]; - hash4 = next_hashes[1]; - - /* From the hash buckets, get the first node of each linked list. */ - cur_node3 = mf->hash3_tab[hash3]; - cur_node4 = mf->hash4_tab[hash4]; - - /* Update for length 3 matches. This replaces the singleton node in the - * 'hash3' bucket with the node for the current sequence. */ - mf->hash3_tab[hash3] = cur_pos; - - /* Update for length 4 matches. This prepends the node for the current - * sequence to the linked list in the 'hash4' bucket. */ - mf->hash4_tab[hash4] = cur_pos; - mf->next_tab[cur_pos] = cur_node4; - - /* Compute the next hash codes. */ - next_hashseq = get_unaligned_le32(in_next + 1); - next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); - next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); - prefetchw(&mf->hash3_tab[next_hashes[0]]); - prefetchw(&mf->hash4_tab[next_hashes[1]]); - - if (best_len < 4) { /* No match of length >= 4 found yet? */ - - /* Check for a length 3 match if needed. 
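-		 * Only the single most recent position is kept per
-		 * length-3 hash bucket, since that table has no chains.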
*/ - - if (cur_node3 <= cutoff) - goto out; - - seq4 = load_u32_unaligned(in_next); - - if (best_len < 3) { - matchptr = &in_base[cur_node3]; - if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { - best_len = 3; - best_matchptr = matchptr; - } - } - - /* Check for a length 4 match. */ - - if (cur_node4 <= cutoff) - goto out; - - for (;;) { - /* No length 4 match found yet. Check the first 4 bytes. */ - matchptr = &in_base[cur_node4]; - - if (load_u32_unaligned(matchptr) == seq4) - break; - - /* The first 4 bytes did not match. Keep trying. */ - cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; - if (cur_node4 <= cutoff || !--depth_remaining) - goto out; - } - - /* Found a match of length >= 4. Extend it to its full length. */ - best_matchptr = matchptr; - best_len = lz_extend(in_next, best_matchptr, 4, max_len); - if (best_len >= nice_len) - goto out; - cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; - if (cur_node4 <= cutoff || !--depth_remaining) - goto out; - } else { - if (cur_node4 <= cutoff || best_len >= nice_len) - goto out; - } - - /* Check for matches of length >= 5. */ - - for (;;) { - for (;;) { - matchptr = &in_base[cur_node4]; - - /* Already found a length 4 match. Try for a longer - * match; start by checking either the last 4 bytes and - * the first 4 bytes, or the last byte. (The last byte, - * the one which would extend the match length by 1, is - * the most important.) */ - #if UNALIGNED_ACCESS_IS_FAST - if ((load_u32_unaligned(matchptr + best_len - 3) == - load_u32_unaligned(in_next + best_len - 3)) && - (load_u32_unaligned(matchptr) == - load_u32_unaligned(in_next))) - #else - if (matchptr[best_len] == in_next[best_len]) - #endif - break; - - /* Continue to the next node in the list. */ - cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; - if (cur_node4 <= cutoff || !--depth_remaining) - goto out; - } - - #if UNALIGNED_ACCESS_IS_FAST - len = 4; - #else - len = 0; - #endif - len = lz_extend(in_next, matchptr, len, max_len); - if (len > best_len) { - /* This is the new longest match. */ - best_len = len; - best_matchptr = matchptr; - if (best_len >= nice_len) - goto out; - } - - /* Continue to the next node in the list. */ - cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)]; - if (cur_node4 <= cutoff || !--depth_remaining) - goto out; - } -out: - *offset_ret = in_next - best_matchptr; - return best_len; -} - -/* - * Advance the matchfinder, but don't search for matches. - * - * @mf - * The matchfinder structure. - * @in_base_p - * Location of a pointer which points to the place in the input data the - * matchfinder currently stores positions relative to. This may be updated - * by this function. - * @cur_pos - * The current position in the input buffer relative to @in_base. - * @end_pos - * The end position of the input buffer, relative to @in_base. - * @next_hashes - * The precomputed hash codes for the sequence beginning at @in_next. - * These will be used and then updated with the precomputed hashcodes for - * the sequence beginning at @in_next + @count. - * @count - * The number of bytes to advance. Must be > 0. - * - * Returns @in_next + @count. 
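- *
- * Greedy and lazy compressors typically use this to skip over the remaining
- * bytes of a match that has already been chosen, inserting those positions
- * without searching.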
- */ -static forceinline const u8 * -hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf, - const u8 ** const restrict in_base_p, - const u8 *in_next, - const u8 * const in_end, - const u32 count, - u32 * const restrict next_hashes) -{ - u32 cur_pos; - u32 hash3, hash4; - u32 next_hashseq; - u32 remaining = count; - - if (unlikely(count + 5 > in_end - in_next)) - return &in_next[count]; - - cur_pos = in_next - *in_base_p; - hash3 = next_hashes[0]; - hash4 = next_hashes[1]; - do { - if (cur_pos == MATCHFINDER_WINDOW_SIZE) { - hc_matchfinder_slide_window(mf); - *in_base_p += MATCHFINDER_WINDOW_SIZE; - cur_pos = 0; - } - mf->hash3_tab[hash3] = cur_pos; - mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; - mf->hash4_tab[hash4] = cur_pos; - - next_hashseq = get_unaligned_le32(++in_next); - hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); - hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); - cur_pos++; - } while (--remaining); - - prefetchw(&mf->hash3_tab[hash3]); - prefetchw(&mf->hash4_tab[hash4]); - next_hashes[0] = hash3; - next_hashes[1] = hash4; - - return in_next; -} diff --git a/ext/libdeflate/lib/lib_common.h b/ext/libdeflate/lib/lib_common.h deleted file mode 100644 index e3f33ef5..00000000 --- a/ext/libdeflate/lib/lib_common.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * lib_common.h - internal header included by all library code - */ - -#ifndef LIB_LIB_COMMON_H -#define LIB_LIB_COMMON_H - -#ifdef LIBDEFLATE_H -# error "lib_common.h must always be included before libdeflate.h" - /* because BUILDING_LIBDEFLATE must be set first */ -#endif - -#define BUILDING_LIBDEFLATE - -#include "common_defs.h" - -/* - * Prefix with "_libdeflate_" all global symbols which are not part of the API. - * This avoids exposing overly generic names when libdeflate is built as a - * static library. - * - * Note that the chosen prefix is not really important and can be changed - * without breaking library users. It was just chosen so that the resulting - * symbol names are unlikely to conflict with those from any other software. - * Also note that this fixup has no useful effect when libdeflate is built as a - * shared library, since these symbols are not exported. - */ -#define SYM_FIXUP(sym) _libdeflate_##sym -#define aligned_malloc SYM_FIXUP(aligned_malloc) -#define aligned_free SYM_FIXUP(aligned_free) -#define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level) -#define _cpu_features SYM_FIXUP(_cpu_features) -#define setup_cpu_features SYM_FIXUP(setup_cpu_features) - -#endif /* LIB_LIB_COMMON_H */ diff --git a/ext/libdeflate/lib/matchfinder_common.h b/ext/libdeflate/lib/matchfinder_common.h deleted file mode 100644 index edd9fb70..00000000 --- a/ext/libdeflate/lib/matchfinder_common.h +++ /dev/null @@ -1,168 +0,0 @@ -/* - * matchfinder_common.h - common code for Lempel-Ziv matchfinding - */ - -#ifndef LIB_MATCHFINDER_COMMON_H -#define LIB_MATCHFINDER_COMMON_H - -#include "lib_common.h" -#include "unaligned.h" - -#ifndef MATCHFINDER_WINDOW_ORDER -# error "MATCHFINDER_WINDOW_ORDER must be defined!" 
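-/* For DEFLATE's 32768-byte window this would be defined to 15; note the
- * corresponding 32768-byte special case in matchfinder_rebase() below. */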
-#endif - -#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER) - -typedef s16 mf_pos_t; - -#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) - -#define MATCHFINDER_ALIGNMENT 8 - -#define arch_matchfinder_init(data, size) false -#define arch_matchfinder_rebase(data, size) false - -#ifdef _aligned_attribute -# if defined(__arm__) || defined(__aarch64__) -# include "arm/matchfinder_impl.h" -# elif defined(__i386__) || defined(__x86_64__) -# include "x86/matchfinder_impl.h" -# endif -#endif - -/* - * Initialize the hash table portion of the matchfinder. - * - * Essentially, this is an optimized memset(). - * - * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary. - */ -static forceinline void -matchfinder_init(mf_pos_t *data, size_t num_entries) -{ - size_t i; - - if (arch_matchfinder_init(data, num_entries * sizeof(data[0]))) - return; - - for (i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; -} - -/* - * Slide the matchfinder by WINDOW_SIZE bytes. - * - * This must be called just after each WINDOW_SIZE bytes have been run through - * the matchfinder. - * - * This will subtract WINDOW_SIZE bytes from each entry in the array specified. - * The effect is that all entries are updated to be relative to the current - * position, rather than the position WINDOW_SIZE bytes prior. - * - * Underflow is detected and replaced with signed saturation. This ensures that - * once the sliding window has passed over a position, that position forever - * remains out of bounds. - * - * The array passed in must contain all matchfinder data that is - * position-relative. Concretely, this will include the hash table as well as - * the table of positions that is used to link together the sequences in each - * hash bucket. Note that in the latter table, the links are 1-ary in the case - * of "hash chains", and 2-ary in the case of "binary trees". In either case, - * the links need to be rebased in the same way. - */ -static forceinline void -matchfinder_rebase(mf_pos_t *data, size_t num_entries) -{ - size_t i; - - if (arch_matchfinder_rebase(data, num_entries * sizeof(data[0]))) - return; - - if (MATCHFINDER_WINDOW_SIZE == 32768) { - /* Branchless version for 32768 byte windows. If the value was - * already negative, clear all bits except the sign bit; this - * changes the value to -32768. Otherwise, set the sign bit; - * this is equivalent to subtracting 32768. */ - for (i = 0; i < num_entries; i++) { - u16 v = data[i]; - u16 sign_bit = v & 0x8000; - v &= sign_bit - ((sign_bit >> 15) ^ 1); - v |= 0x8000; - data[i] = v; - } - return; - } - - for (i = 0; i < num_entries; i++) { - if (data[i] >= 0) - data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - else - data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - } -} - -/* - * The hash function: given a sequence prefix held in the low-order bits of a - * 32-bit value, multiply by a carefully-chosen large constant. Discard any - * bits of the product that don't fit in a 32-bit value, but take the - * next-highest @num_bits bits of the product as the hash value, as those have - * the most randomness. - */ -static forceinline u32 -lz_hash(u32 seq, unsigned num_bits) -{ - return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); -} - -/* - * Return the number of bytes at @matchptr that match the bytes at @strptr, up - * to a maximum of @max_len. Initially, @start_len bytes are matched. 
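- *
- * On machines with fast unaligned access, the loops below compare one machine
- * word (WORDBYTES bytes) per step; when a differing word is found, the number
- * of matching bytes within it is recovered from the position of the lowest
- * set bit of the XOR (highest on big-endian), divided by 8.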
- */ -static forceinline unsigned -lz_extend(const u8 * const strptr, const u8 * const matchptr, - const unsigned start_len, const unsigned max_len) -{ - unsigned len = start_len; - machine_word_t v_word; - - if (UNALIGNED_ACCESS_IS_FAST) { - - if (likely(max_len - len >= 4 * WORDBYTES)) { - - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDBYTES; \ - - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - #undef COMPARE_WORD_STEP - } - - while (len + WORDBYTES <= max_len) { - v_word = load_word_unaligned(&matchptr[len]) ^ - load_word_unaligned(&strptr[len]); - if (v_word != 0) - goto word_differs; - len += WORDBYTES; - } - } - - while (len < max_len && matchptr[len] == strptr[len]) - len++; - return len; - -word_differs: - if (CPU_IS_LITTLE_ENDIAN()) - len += (bsfw(v_word) >> 3); - else - len += (WORDBITS - 1 - bsrw(v_word)) >> 3; - return len; -} - -#endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/ext/libdeflate/lib/unaligned.h b/ext/libdeflate/lib/unaligned.h deleted file mode 100644 index 7aeaf0c5..00000000 --- a/ext/libdeflate/lib/unaligned.h +++ /dev/null @@ -1,202 +0,0 @@ -/* - * unaligned.h - inline functions for unaligned memory accesses - */ - -#ifndef LIB_UNALIGNED_H -#define LIB_UNALIGNED_H - -#include "lib_common.h" - -/* - * Naming note: - * - * {load,store}_*_unaligned() deal with raw bytes without endianness conversion. - * {get,put}_unaligned_*() deal with a specific endianness. - */ - -DEFINE_UNALIGNED_TYPE(u16) -DEFINE_UNALIGNED_TYPE(u32) -DEFINE_UNALIGNED_TYPE(u64) -DEFINE_UNALIGNED_TYPE(machine_word_t) - -#define load_word_unaligned load_machine_word_t_unaligned -#define store_word_unaligned store_machine_word_t_unaligned - -/***** Unaligned loads *****/ - -static forceinline u16 -get_unaligned_le16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[1] << 8) | p[0]; -} - -static forceinline u16 -get_unaligned_be16(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be16_bswap(load_u16_unaligned(p)); - else - return ((u16)p[0] << 8) | p[1]; -} - -static forceinline u32 -get_unaligned_le32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[3] << 24) | ((u32)p[2] << 16) | - ((u32)p[1] << 8) | p[0]; -} - -static forceinline u32 -get_unaligned_be32(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return be32_bswap(load_u32_unaligned(p)); - else - return ((u32)p[0] << 24) | ((u32)p[1] << 16) | - ((u32)p[2] << 8) | p[3]; -} - -static forceinline u64 -get_unaligned_le64(const u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) - return le64_bswap(load_u64_unaligned(p)); - else - return ((u64)p[7] << 56) | ((u64)p[6] << 48) | - ((u64)p[5] << 40) | ((u64)p[4] << 32) | - ((u64)p[3] << 24) | ((u64)p[2] << 16) | - ((u64)p[1] << 8) | p[0]; -} - -static forceinline machine_word_t -get_unaligned_leword(const u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - return get_unaligned_le32(p); - else - return get_unaligned_le64(p); -} - -/***** Unaligned stores *****/ - -static forceinline void -put_unaligned_le16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u16_unaligned(le16_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - } -} - -static forceinline void -put_unaligned_be16(u16 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - 
store_u16_unaligned(be16_bswap(v), p); - } else { - p[0] = (u8)(v >> 8); - p[1] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(le32_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - } -} - -static forceinline void -put_unaligned_be32(u32 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u32_unaligned(be32_bswap(v), p); - } else { - p[0] = (u8)(v >> 24); - p[1] = (u8)(v >> 16); - p[2] = (u8)(v >> 8); - p[3] = (u8)(v >> 0); - } -} - -static forceinline void -put_unaligned_le64(u64 v, u8 *p) -{ - if (UNALIGNED_ACCESS_IS_FAST) { - store_u64_unaligned(le64_bswap(v), p); - } else { - p[0] = (u8)(v >> 0); - p[1] = (u8)(v >> 8); - p[2] = (u8)(v >> 16); - p[3] = (u8)(v >> 24); - p[4] = (u8)(v >> 32); - p[5] = (u8)(v >> 40); - p[6] = (u8)(v >> 48); - p[7] = (u8)(v >> 56); - } -} - -static forceinline void -put_unaligned_leword(machine_word_t v, u8 *p) -{ - STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64); - if (WORDBITS == 32) - put_unaligned_le32(v, p); - else - put_unaligned_le64(v, p); -} - -/***** 24-bit loads *****/ - -/* - * Given a 32-bit value that was loaded with the platform's native endianness, - * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 - * bits contain the first 3 bytes, arranged in octets in a platform-dependent - * order, at the memory location from which the input 32-bit value was loaded. - */ -static forceinline u32 -loaded_u32_to_u24(u32 v) -{ - if (CPU_IS_LITTLE_ENDIAN()) - return v & 0xFFFFFF; - else - return v >> 8; -} - -/* - * Load the next 3 bytes from the memory location @p into the 24 low-order bits - * of a 32-bit value. The order in which the 3 bytes will be arranged as octets - * in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES - * bytes must be available at @p; note that this may be more than 3. - */ -static forceinline u32 -load_u24_unaligned(const u8 *p) -{ -#if UNALIGNED_ACCESS_IS_FAST -# define LOAD_U24_REQUIRED_NBYTES 4 - return loaded_u32_to_u24(load_u32_unaligned(p)); -#else -# define LOAD_U24_REQUIRED_NBYTES 3 - if (CPU_IS_LITTLE_ENDIAN()) - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); - else - return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); -#endif -} - -#endif /* LIB_UNALIGNED_H */ diff --git a/ext/libdeflate/lib/x86/adler32_impl.h b/ext/libdeflate/lib/x86/adler32_impl.h deleted file mode 100644 index 4627a41c..00000000 --- a/ext/libdeflate/lib/x86/adler32_impl.h +++ /dev/null @@ -1,332 +0,0 @@ -/* - * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "cpu_features.h" - -/* - * The following macros horizontally sum the s1 counters and add them to the - * real s1, and likewise for s2. They do this via a series of reductions, each - * of which halves the vector length, until just one counter remains. - * - * The s1 reductions don't depend on the s2 reductions and vice versa, so for - * efficiency they are interleaved. Also, every other s1 counter is 0 due to - * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than - * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits. - */ - -#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \ -{ \ - __v4si s1_last = (v_s1), s2_last = (v_s2); \ - \ - /* 128 => 32 bits */ \ - s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \ - s1_last += (__v4si)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \ - s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \ - \ - *(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \ - *(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \ -{ \ - __v4si s1_128bit, s2_128bit; \ - \ - /* 256 => 128 bits */ \ - s1_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \ - (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 1); \ - s2_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \ - (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 1); \ - \ - ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \ -} - -#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \ -{ \ - __v8si s1_256bit, s2_256bit; \ - \ - /* 512 => 256 bits */ \ - s1_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \ - (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \ - s2_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \ - (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \ - \ - ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \ -} - -/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */ -#undef DISPATCH_AVX512BW -#if !defined(DEFAULT_IMPL) && \ - /* - * clang before v3.9 is missing some AVX-512BW intrinsics including - * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using - * AVX-512BW, even when __AVX512BW__ is defined, conditional on - * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin. 
- */ \
- COMPILER_SUPPORTS_AVX512BW_TARGET && \
- (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \
- COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
-# define FUNCNAME adler32_avx512bw
-# define FUNCNAME_CHUNK adler32_avx512bw_chunk
-# define IMPL_ALIGNMENT 64
-# define IMPL_SEGMENT_SIZE 64
-# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
-# ifdef __AVX512BW__
-# define ATTRIBUTES
-# define DEFAULT_IMPL adler32_avx512bw
-# else
-# define ATTRIBUTES __attribute__((target("avx512bw")))
-# define DISPATCH 1
-# define DISPATCH_AVX512BW 1
-# endif
-# include <immintrin.h>
-static forceinline ATTRIBUTES void
-adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
- u32 *s1, u32 *s2)
-{
- const __m512i zeroes = _mm512_setzero_si512();
- const __v64qi multipliers = (__v64qi){
- 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
- 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
- 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
- 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
- };
- const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
- __v16si v_s1 = (__v16si)zeroes;
- __v16si v_s1_sums = (__v16si)zeroes;
- __v16si v_s2 = (__v16si)zeroes;
-
- do {
- /* Load the next 64-byte segment */
- __m512i bytes = *p++;
-
- /* Multiply the bytes by 64...1 (the number of times they need
- * to be added to s2) and add adjacent products */
- __v32hi sums = (__v32hi)_mm512_maddubs_epi16(
- bytes, (__m512i)multipliers);
-
- /* Keep sum of all previous s1 counters, for adding to s2 later.
- * This allows delaying the multiplication by 64 to the end. */
- v_s1_sums += v_s1;
-
- /* Add the sum of each group of 8 bytes to the corresponding s1
- * counter */
- v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
-
- /* Add the sum of each group of 4 products of the bytes by
- * 64...1 to the corresponding s2 counter */
- v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
- (__m512i)ones);
- } while (p != end);
-
- /* Finish the s2 counters by adding the sum of the s1 values at the
- * beginning of each segment, multiplied by the segment size (64) */
- v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
-
- /* Add the counters to the real s1 and s2 */
- ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
-}
-# include "../adler32_vec_template.h"
-#endif /* AVX-512BW implementation */
-
-/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
-#undef DISPATCH_AVX2
-#if !defined(DEFAULT_IMPL) && \
- (defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
- COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
-# define FUNCNAME adler32_avx2
-# define FUNCNAME_CHUNK adler32_avx2_chunk
-# define IMPL_ALIGNMENT 32
-# define IMPL_SEGMENT_SIZE 32
-# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
-# ifdef __AVX2__
-# define ATTRIBUTES
-# define DEFAULT_IMPL adler32_avx2
-# else
-# define ATTRIBUTES __attribute__((target("avx2")))
-# define DISPATCH 1
-# define DISPATCH_AVX2 1
-# endif
-# include <immintrin.h>
-static forceinline ATTRIBUTES void
-adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
-{
- const __m256i zeroes = _mm256_setzero_si256();
- const __v32qi multipliers = (__v32qi){
- 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
- 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
- };
- const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
- __v8si v_s1 = (__v8si)zeroes;
- __v8si v_s1_sums = (__v8si)zeroes;
- __v8si v_s2 = (__v8si)zeroes;
-
- do {
- /* Load the next 32-byte segment */
- __m256i bytes = *p++;
-
- /* Multiply the bytes by 32...1 (the number of times they need
- * to be added to s2) and add adjacent products */
- __v16hi sums = (__v16hi)_mm256_maddubs_epi16(
- bytes, (__m256i)multipliers);
-
- /* Keep sum of all previous s1 counters, for adding to s2 later.
- * This allows delaying the multiplication by 32 to the end. */
- v_s1_sums += v_s1;
-
- /* Add the sum of each group of 8 bytes to the corresponding s1
- * counter */
- v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
-
- /* Add the sum of each group of 4 products of the bytes by
- * 32...1 to the corresponding s2 counter */
- v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
- } while (p != end);
-
- /* Finish the s2 counters by adding the sum of the s1 values at the
- * beginning of each segment, multiplied by the segment size (32) */
- v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
-
- /* Add the counters to the real s1 and s2 */
- ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
-}
-# include "../adler32_vec_template.h"
-#endif /* AVX2 implementation */
-
-/* SSE2 implementation */
-#undef DISPATCH_SSE2
-#if !defined(DEFAULT_IMPL) && \
- (defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \
- COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
-# define FUNCNAME adler32_sse2
-# define FUNCNAME_CHUNK adler32_sse2_chunk
-# define IMPL_ALIGNMENT 16
-# define IMPL_SEGMENT_SIZE 32
-/*
- * The 16-bit precision byte counters must not be allowed to undergo *signed*
- * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
- * would behave incorrectly.
- */
-# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF))
-# ifdef __SSE2__
-# define ATTRIBUTES
-# define DEFAULT_IMPL adler32_sse2
-# else
-# define ATTRIBUTES __attribute__((target("sse2")))
-# define DISPATCH 1
-# define DISPATCH_SSE2 1
-# endif
-# include <emmintrin.h>
-static forceinline ATTRIBUTES void
-adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
-{
- const __m128i zeroes = _mm_setzero_si128();
-
- /* s1 counters: 32-bit, sum of bytes */
- __v4si v_s1 = (__v4si)zeroes;
-
- /* s2 counters: 32-bit, sum of s1 values */
- __v4si v_s2 = (__v4si)zeroes;
-
- /*
- * Thirty-two 16-bit counters for byte sums. Each accumulates the bytes
- * that eventually need to be multiplied by a number 32...1 for addition
- * into s2.
- */
- __v8hi v_byte_sums_a = (__v8hi)zeroes;
- __v8hi v_byte_sums_b = (__v8hi)zeroes;
- __v8hi v_byte_sums_c = (__v8hi)zeroes;
- __v8hi v_byte_sums_d = (__v8hi)zeroes;
-
- do {
- /* Load the next 32 bytes */
- const __m128i bytes1 = *p++;
- const __m128i bytes2 = *p++;
-
- /*
- * Accumulate the previous s1 counters into the s2 counters.
- * Logically, this really should be v_s2 += v_s1 * 32, but we
- * can do the multiplication (or left shift) later.
- */
- v_s2 += v_s1;
-
- /*
- * s1 update: use "Packed Sum of Absolute Differences" to add
- * the bytes horizontally with 8 bytes per sum. Then add the
- * sums to the s1 counters.
- */
- v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
- v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
-
- /*
- * Also accumulate the bytes into 32 separate counters that have
- * 16-bit precision.
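- *
- * (Added note, not in the original source: each counter grows by at
- * most 0xFF per 32-byte segment, so capping chunks at
- * IMPL_MAX_CHUNK_SIZE = 32 * (0x7FFF / 0xFF) bytes keeps every counter
- * at or below 0x7FFF and prevents the signed overflow described above.)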
- */ - v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes); - v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes); - v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes); - v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes); - - } while (p != end); - - /* Finish calculating the s2 counters */ - v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5); - v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a, - (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 }); - v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b, - (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 }); - v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c, - (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 }); - v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d, - (__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 }); - - /* Add the counters to the real s1 and s2 */ - ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2); -} -# include "../adler32_vec_template.h" -#endif /* SSE2 implementation */ - -#ifdef DISPATCH -static inline adler32_func_t -arch_select_adler32_func(void) -{ - u32 features = get_cpu_features(); - -#ifdef DISPATCH_AVX512BW - if (features & X86_CPU_FEATURE_AVX512BW) - return adler32_avx512bw; -#endif -#ifdef DISPATCH_AVX2 - if (features & X86_CPU_FEATURE_AVX2) - return adler32_avx2; -#endif -#ifdef DISPATCH_SSE2 - if (features & X86_CPU_FEATURE_SSE2) - return adler32_sse2; -#endif - return NULL; -} -#endif /* DISPATCH */ diff --git a/ext/libdeflate/lib/x86/cpu_features.c b/ext/libdeflate/lib/x86/cpu_features.c deleted file mode 100644 index 78a7af28..00000000 --- a/ext/libdeflate/lib/x86/cpu_features.c +++ /dev/null @@ -1,139 +0,0 @@ -/* - * x86/cpu_features.c - feature detection for x86 processors - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "cpu_features.h" - -#if X86_CPU_FEATURES_ENABLED - -volatile u32 _cpu_features = 0; - -/* With old GCC versions we have to manually save and restore the x86_32 PIC - * register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */ -#if defined(__i386__) && defined(__PIC__) -# define EBX_CONSTRAINT "=r" -#else -# define EBX_CONSTRAINT "=b" -#endif - -/* Execute the CPUID instruction. 
*/ -static inline void -cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) -{ - __asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" - "cpuid \n" - ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" - : "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d) - : "a" (leaf), "c" (subleaf)); -} - -/* Read an extended control register. */ -static inline u64 -read_xcr(u32 index) -{ - u32 edx, eax; - - /* Execute the "xgetbv" instruction. Old versions of binutils do not - * recognize this instruction, so list the raw bytes instead. */ - __asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index)); - - return ((u64)edx << 32) | eax; -} - -#undef BIT -#define BIT(nr) (1UL << (nr)) - -#define XCR0_BIT_SSE BIT(1) -#define XCR0_BIT_AVX BIT(2) -#define XCR0_BIT_OPMASK BIT(5) -#define XCR0_BIT_ZMM_HI256 BIT(6) -#define XCR0_BIT_HI16_ZMM BIT(7) - -#define IS_SET(reg, nr) ((reg) & BIT(nr)) -#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask)) - -/* Initialize _cpu_features with bits for interesting processor features. */ -void setup_cpu_features(void) -{ - u32 features = 0; - u32 dummy1, dummy2, dummy3, dummy4; - u32 max_function; - u32 features_1, features_2, features_3, features_4; - bool os_avx_support = false; - bool os_avx512_support = false; - - /* Get maximum supported function */ - cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4); - if (max_function < 1) - goto out; - - /* Standard feature flags */ - cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1); - - if (IS_SET(features_1, 26)) - features |= X86_CPU_FEATURE_SSE2; - - if (IS_SET(features_2, 1)) - features |= X86_CPU_FEATURE_PCLMULQDQ; - - if (IS_SET(features_2, 27)) { /* OSXSAVE set? */ - u64 xcr0 = read_xcr(0); - - os_avx_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX); - - os_avx512_support = IS_ALL_SET(xcr0, - XCR0_BIT_SSE | - XCR0_BIT_AVX | - XCR0_BIT_OPMASK | - XCR0_BIT_ZMM_HI256 | - XCR0_BIT_HI16_ZMM); - } - - if (os_avx_support && IS_SET(features_2, 28)) - features |= X86_CPU_FEATURE_AVX; - - if (max_function < 7) - goto out; - - /* Extended feature flags */ - cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4); - - if (os_avx_support && IS_SET(features_3, 5)) - features |= X86_CPU_FEATURE_AVX2; - - if (IS_SET(features_3, 8)) - features |= X86_CPU_FEATURE_BMI2; - - if (os_avx512_support && IS_SET(features_3, 30)) - features |= X86_CPU_FEATURE_AVX512BW; - -out: - _cpu_features = features | X86_CPU_FEATURES_KNOWN; -} - -#endif /* X86_CPU_FEATURES_ENABLED */ diff --git a/ext/libdeflate/lib/x86/cpu_features.h b/ext/libdeflate/lib/x86/cpu_features.h deleted file mode 100644 index b2241818..00000000 --- a/ext/libdeflate/lib/x86/cpu_features.h +++ /dev/null @@ -1,41 +0,0 @@ -/* - * x86/cpu_features.h - feature detection for x86 processors - */ - -#ifndef LIB_X86_CPU_FEATURES_H -#define LIB_X86_CPU_FEATURES_H - -#include "../lib_common.h" - -#if (defined(__i386__) || defined(__x86_64__)) && \ - COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE -# define X86_CPU_FEATURES_ENABLED 1 -#else -# define X86_CPU_FEATURES_ENABLED 0 -#endif - -#if X86_CPU_FEATURES_ENABLED - -#define X86_CPU_FEATURE_SSE2 0x00000001 -#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002 -#define X86_CPU_FEATURE_AVX 0x00000004 -#define X86_CPU_FEATURE_AVX2 0x00000008 -#define X86_CPU_FEATURE_BMI2 0x00000010 -#define X86_CPU_FEATURE_AVX512BW 0x00000020 - -#define X86_CPU_FEATURES_KNOWN 0x80000000 - -extern volatile u32 _cpu_features; - -extern void setup_cpu_features(void); - -static inline u32 get_cpu_features(void) -{ - if (_cpu_features == 0) 
- setup_cpu_features(); - return _cpu_features; -} - -#endif /* X86_CPU_FEATURES_ENABLED */ - -#endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/ext/libdeflate/lib/x86/crc32_impl.h b/ext/libdeflate/lib/x86/crc32_impl.h deleted file mode 100644 index ff896268..00000000 --- a/ext/libdeflate/lib/x86/crc32_impl.h +++ /dev/null @@ -1,87 +0,0 @@ -/* - * x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "cpu_features.h" - -/* - * Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32 - * function doesn't use any AVX intrinsics specifically, it can benefit a lot - * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100 - * MB/s. I expect this is related to the PCLMULQDQ instructions being assembled - * in the newer three-operand form rather than the older two-operand form. - * - * Note: this is only needed if __AVX__ is *not* defined, since otherwise the - * "regular" PCLMUL implementation would already be AVX enabled. 
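- *
- * (Added note, not in the original source: the "three-operand form" refers
- * to the VEX encoding of PCLMULQDQ, which can write to a destination
- * register distinct from its sources and so avoids the extra MOVDQA
- * register-copy instructions the legacy two-operand encoding needs.)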
- */
-#undef DISPATCH_PCLMUL_AVX
-#if !defined(DEFAULT_IMPL) && !defined(__AVX__) && \
- X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET && \
- (defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
-# define FUNCNAME crc32_pclmul_avx
-# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
-# define ATTRIBUTES __attribute__((target("pclmul,avx")))
-# define DISPATCH 1
-# define DISPATCH_PCLMUL_AVX 1
-# include "crc32_pclmul_template.h"
-#endif
-
-/* PCLMUL implementation */
-#undef DISPATCH_PCLMUL
-#if !defined(DEFAULT_IMPL) && \
- (defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED && \
- COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
-# define FUNCNAME crc32_pclmul
-# define FUNCNAME_ALIGNED crc32_pclmul_aligned
-# ifdef __PCLMUL__
-# define ATTRIBUTES
-# define DEFAULT_IMPL crc32_pclmul
-# else
-# define ATTRIBUTES __attribute__((target("pclmul")))
-# define DISPATCH 1
-# define DISPATCH_PCLMUL 1
-# endif
-# include "crc32_pclmul_template.h"
-#endif
-
-#ifdef DISPATCH
-static inline crc32_func_t
-arch_select_crc32_func(void)
-{
- u32 features = get_cpu_features();
-
-#ifdef DISPATCH_PCLMUL_AVX
- if ((features & X86_CPU_FEATURE_PCLMULQDQ) &&
- (features & X86_CPU_FEATURE_AVX))
- return crc32_pclmul_avx;
-#endif
-#ifdef DISPATCH_PCLMUL
- if (features & X86_CPU_FEATURE_PCLMULQDQ)
- return crc32_pclmul;
-#endif
- return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/x86/crc32_pclmul_template.h b/ext/libdeflate/lib/x86/crc32_pclmul_template.h
deleted file mode 100644
index eb4c4ba8..00000000
--- a/ext/libdeflate/lib/x86/crc32_pclmul_template.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * x86/crc32_pclmul_template.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <immintrin.h>
-
-/*
- * CRC-32 folding with PCLMULQDQ.
- *
- * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
- * producing an abbreviated message which is congruent to the original message
- * modulo the generator polynomial G(x).
- *
- * Folding each 512 bits is implemented as eight 64-bit folds, each of which
- * uses one carryless multiplication instruction. It's expected that CPUs may
- * be able to execute some of these multiplications in parallel.
- *
- * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
- * be 95 bits from a constant distance D later in the message. The relevant
- * portion of the message can be written as:
- *
- * M(x) = A(x)*x^D + B(x)
- *
- * ... where + and * represent addition and multiplication, respectively, of
- * polynomials over GF(2). Note that when implemented on a computer, these
- * operations are equivalent to XOR and carryless multiplication, respectively.
- *
- * For the purpose of CRC calculation, only the remainder modulo the generator
- * polynomial G(x) matters:
- *
- * M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
- *
- * Since the modulo operation can be applied anywhere in a sequence of additions
- * and multiplications without affecting the result, this is equivalent to:
- *
- * M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
- *
- * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
- * a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
- * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
- * product. Then, adding (XOR-ing) the product to B(x) produces a polynomial
- * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
- * B(x)'. This is the basic fold operation with 64 bits.
- *
- * Note that the carryless multiplication instruction PCLMULQDQ actually takes
- * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
- * 128-bit XMM register. This works fine, but care must be taken to account for
- * "bit endianness". With the CRC version implemented here, bits are always
- * ordered such that the lowest-order bit represents the coefficient of the
- * highest power of x and the highest-order bit represents the coefficient of
- * the lowest power of x. This is backwards from the more intuitive order.
- * Still, carryless multiplication works essentially the same either way. It
- * just must be accounted for that when we XOR the 95-bit product in the
- * low-order 95 bits of a 128-bit XMM register into 128 bits of later data held
- * in another XMM register, we'll really be XOR-ing the product into the
- * mathematically higher degree end of those later bits, not the lower degree
- * end as may be expected.
- *
- * So given that caveat and the fact that we process 512 bits per iteration, the
- * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
- *
- * D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
- * i.e. the lower order bits in the XMM register
- *
- * D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
- * i.e. the higher order bits in the XMM register
- *
- * The required 'x^D mod G(x)' values were precomputed.
- *
- * When <= 512 bits remain in the message, we finish up by folding across
- * smaller distances. This works similarly; the distance D is just different,
- * so different constant multipliers must be used. Finally, once the remaining
- * message is just 64 bits, it is reduced to the CRC-32 using Barrett reduction
- * (explained later).
- *
- * For more information see the original paper from Intel:
- * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- * December 2009
- * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- */
-static u32 ATTRIBUTES
-FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
-{
- /* Constants precomputed by gen_crc32_multipliers.c. Do not edit!
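- * (Added note, not in the original source: multipliers_4/2/1 hold the
- * 'x^D mod G(x)' constants for the folding distances described above, and
- * barrett_reduction_constants holds floor(x^64 / G(x)) and G(x) for the
- * final reduction.)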
*/ - const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 }; - const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 }; - const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E }; - const __v2di final_multiplier = (__v2di){ 0xB8BC6765 }; - const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF }; - const __v2di barrett_reduction_constants = - (__v2di){ 0x00000001F7011641, 0x00000001DB710641 }; - - const __m128i * const end = p + nr_segs; - const __m128i * const end512 = p + (nr_segs & ~3); - __m128i x0, x1, x2, x3; - - /* - * Account for the current 'remainder', i.e. the CRC of the part of the - * message already processed. Explanation: rewrite the message - * polynomial M(x) in terms of the first part A(x), the second part - * B(x), and the length of the second part in bits |B(x)| >= 32: - * - * M(x) = A(x)*x^|B(x)| + B(x) - * - * Then the CRC of M(x) is: - * - * CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x)) - * = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x)) - * = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x)) - * - * Note: all arithmetic is modulo G(x), the generator polynomial; that's - * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x). - * - * So the CRC of the full message is the CRC of the second part of the - * message where the first 32 bits of the second part of the message - * have been XOR'ed with the CRC of the first part of the message. - */ - x0 = *p++; - x0 ^= (__m128i)(__v4si){ remainder }; - - if (p > end512) /* only 128, 256, or 384 bits of input? */ - goto _128_bits_at_a_time; - x1 = *p++; - x2 = *p++; - x3 = *p++; - - /* Fold 512 bits at a time */ - for (; p != end512; p += 4) { - __m128i y0, y1, y2, y3; - - y0 = p[0]; - y1 = p[1]; - y2 = p[2]; - y3 = p[3]; - - /* - * Note: the immediate constant for PCLMULQDQ specifies which - * 64-bit halves of the 128-bit vectors to multiply: - * - * 0x00 means low halves (higher degree polynomial terms for us) - * 0x11 means high halves (lower degree polynomial terms for us) - */ - y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00); - y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00); - y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00); - y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00); - y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11); - y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11); - y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11); - y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11); - - x0 = y0; - x1 = y1; - x2 = y2; - x3 = y3; - } - - /* Fold 512 bits => 128 bits */ - x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00); - x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00); - x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11); - x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11); - x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00); - x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11); - x0 = x3; - -_128_bits_at_a_time: - while (p != end) { - /* Fold 128 bits into next 128 bits */ - x1 = *p++; - x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00); - x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11); - x0 = x1; - } - - /* Now there are just 128 bits left, stored in 'x0'. */ - - /* - * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, - * which is equivalent to multiplying by x^32. This is needed because - * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). 
- */ - x0 = _mm_srli_si128(x0, 8) ^ - _mm_clmulepi64_si128(x0, multipliers_1, 0x10); - - /* Fold 96 => 64 bits */ - x0 = _mm_srli_si128(x0, 4) ^ - _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00); - - /* - * Finally, reduce 64 => 32 bits using Barrett reduction. - * - * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to - * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): - * - * R(x) = (A(x)*x^32 + B(x)) mod G(x) - * = (A(x)*x^32) mod G(x) + B(x) - * - * Then, by the Division Algorithm there exists a unique q(x) such that: - * - * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) - * - * Since the left-hand side is of maximum degree 31, the right-hand side - * must be too. This implies that we can apply 'mod x^32' to the - * right-hand side without changing its value: - * - * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 - * - * Note that '+' is equivalent to '-' in polynomials over GF(2). - * - * We also know that: - * - * / A(x)*x^32 \ - * q(x) = floor ( --------- ) - * \ G(x) / - * - * To compute this efficiently, we can multiply the top and bottom by - * x^32 and move the division by G(x) to the top: - * - * / A(x) * floor(x^64 / G(x)) \ - * q(x) = floor ( ------------------------- ) - * \ x^32 / - * - * Note that floor(x^64 / G(x)) is a constant. - * - * So finally we have: - * - * / A(x) * floor(x^64 / G(x)) \ - * R(x) = B(x) + G(x)*floor ( ------------------------- ) - * \ x^32 / - */ - x1 = x0; - x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00); - x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10); - return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4)); -} - -#define IMPL_ALIGNMENT 16 -#define IMPL_SEGMENT_SIZE 16 -#include "../crc32_vec_template.h" diff --git a/ext/libdeflate/lib/x86/decompress_impl.h b/ext/libdeflate/lib/x86/decompress_impl.h deleted file mode 100644 index b3d322a1..00000000 --- a/ext/libdeflate/lib/x86/decompress_impl.h +++ /dev/null @@ -1,26 +0,0 @@ -#include "cpu_features.h" - -/* Include the BMI2-optimized version? 
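- * Added note, not in the original source: BMI2 supplies flagless
- * variable-count shift instructions (SHRX/SHLX) and the BZHI bit-zeroing
- * instruction, which compilers can use to speed up the bit-buffer reads
- * in the decompression loop.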
*/ -#undef DISPATCH_BMI2 -#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \ - COMPILER_SUPPORTS_BMI2_TARGET -# define FUNCNAME deflate_decompress_bmi2 -# define ATTRIBUTES __attribute__((target("bmi2"))) -# define DISPATCH 1 -# define DISPATCH_BMI2 1 -# include "../decompress_template.h" -#endif - -#ifdef DISPATCH -static inline decompress_func_t -arch_select_decompress_func(void) -{ - u32 features = get_cpu_features(); - -#ifdef DISPATCH_BMI2 - if (features & X86_CPU_FEATURE_BMI2) - return deflate_decompress_bmi2; -#endif - return NULL; -} -#endif /* DISPATCH */ diff --git a/ext/libdeflate/lib/x86/matchfinder_impl.h b/ext/libdeflate/lib/x86/matchfinder_impl.h deleted file mode 100644 index 735bb483..00000000 --- a/ext/libdeflate/lib/x86/matchfinder_impl.h +++ /dev/null @@ -1,164 +0,0 @@ -/* - * x86/matchfinder_impl.h - x86 implementations of matchfinder functions - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */
-
-#ifdef __AVX2__
-# if MATCHFINDER_ALIGNMENT < 32
-# undef MATCHFINDER_ALIGNMENT
-# define MATCHFINDER_ALIGNMENT 32
-# endif
-# include <immintrin.h>
-static forceinline bool
-matchfinder_init_avx2(mf_pos_t *data, size_t size)
-{
- __m256i v, *p;
- size_t n;
-
- if (size % (sizeof(__m256i) * 4) != 0)
- return false;
-
- STATIC_ASSERT(sizeof(mf_pos_t) == 2);
- v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
- p = (__m256i *)data;
- n = size / (sizeof(__m256i) * 4);
- do {
- p[0] = v;
- p[1] = v;
- p[2] = v;
- p[3] = v;
- p += 4;
- } while (--n);
- return true;
-}
-
-static forceinline bool
-matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
-{
- __m256i v, *p;
- size_t n;
-
- if (size % (sizeof(__m256i) * 4) != 0)
- return false;
-
- STATIC_ASSERT(sizeof(mf_pos_t) == 2);
- v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
- p = (__m256i *)data;
- n = size / (sizeof(__m256i) * 4);
- do {
- /* PADDSW: Add Packed Signed Integers With Signed Saturation */
- p[0] = _mm256_adds_epi16(p[0], v);
- p[1] = _mm256_adds_epi16(p[1], v);
- p[2] = _mm256_adds_epi16(p[2], v);
- p[3] = _mm256_adds_epi16(p[3], v);
- p += 4;
- } while (--n);
- return true;
-}
-#endif /* __AVX2__ */
-
-#ifdef __SSE2__
-# if MATCHFINDER_ALIGNMENT < 16
-# undef MATCHFINDER_ALIGNMENT
-# define MATCHFINDER_ALIGNMENT 16
-# endif
-# include <emmintrin.h>
-static forceinline bool
-matchfinder_init_sse2(mf_pos_t *data, size_t size)
-{
- __m128i v, *p;
- size_t n;
-
- if (size % (sizeof(__m128i) * 4) != 0)
- return false;
-
- STATIC_ASSERT(sizeof(mf_pos_t) == 2);
- v = _mm_set1_epi16(MATCHFINDER_INITVAL);
- p = (__m128i *)data;
- n = size / (sizeof(__m128i) * 4);
- do {
- p[0] = v;
- p[1] = v;
- p[2] = v;
- p[3] = v;
- p += 4;
- } while (--n);
- return true;
-}
-
-static forceinline bool
-matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
-{
- __m128i v, *p;
- size_t n;
-
- if (size % (sizeof(__m128i) * 4) != 0)
- return false;
-
- STATIC_ASSERT(sizeof(mf_pos_t) == 2);
- v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
- p = (__m128i *)data;
- n = size / (sizeof(__m128i) * 4);
- do {
- /* PADDSW: Add Packed Signed Integers With Signed Saturation */
- p[0] = _mm_adds_epi16(p[0], v);
- p[1] = _mm_adds_epi16(p[1], v);
- p[2] = _mm_adds_epi16(p[2], v);
- p[3] = _mm_adds_epi16(p[3], v);
- p += 4;
- } while (--n);
- return true;
-}
-#endif /* __SSE2__ */
-
-#undef arch_matchfinder_init
-static forceinline bool
-arch_matchfinder_init(mf_pos_t *data, size_t size)
-{
-#ifdef __AVX2__
- if (matchfinder_init_avx2(data, size))
- return true;
-#endif
-#ifdef __SSE2__
- if (matchfinder_init_sse2(data, size))
- return true;
-#endif
- return false;
-}
-
-#undef arch_matchfinder_rebase
-static forceinline bool
-arch_matchfinder_rebase(mf_pos_t *data, size_t size)
-{
-#ifdef __AVX2__
- if (matchfinder_rebase_avx2(data, size))
- return true;
-#endif
-#ifdef __SSE2__
- if (matchfinder_rebase_sse2(data, size))
- return true;
-#endif
- return false;
-}
diff --git a/ext/libdeflate/lib/zlib_compress.c b/ext/libdeflate/lib/zlib_compress.c
deleted file mode 100644
index b4cebaf8..00000000
--- a/ext/libdeflate/lib/zlib_compress.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * zlib_compress.c - compress with a zlib wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include "deflate_compress.h" -#include "unaligned.h" -#include "zlib_constants.h" - -#include "libdeflate.h" - -LIBDEFLATEAPI size_t -libdeflate_zlib_compress(struct libdeflate_compressor *c, - const void *in, size_t in_size, - void *out, size_t out_nbytes_avail) -{ - u8 *out_next = out; - u16 hdr; - unsigned compression_level; - unsigned level_hint; - size_t deflate_size; - - if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) - return 0; - - /* 2 byte header: CMF and FLG */ - hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); - compression_level = deflate_get_compression_level(c); - if (compression_level < 2) - level_hint = ZLIB_FASTEST_COMPRESSION; - else if (compression_level < 6) - level_hint = ZLIB_FAST_COMPRESSION; - else if (compression_level < 8) - level_hint = ZLIB_DEFAULT_COMPRESSION; - else - level_hint = ZLIB_SLOWEST_COMPRESSION; - hdr |= level_hint << 6; - hdr |= 31 - (hdr % 31); - - put_unaligned_be16(hdr, out_next); - out_next += 2; - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next, - out_nbytes_avail - ZLIB_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* ADLER32 */ - put_unaligned_be32(libdeflate_adler32(1, in, in_size), out_next); - out_next += 4; - - return out_next - (u8 *)out; -} - -LIBDEFLATEAPI size_t -libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) -{ - return ZLIB_MIN_OVERHEAD + - libdeflate_deflate_compress_bound(c, in_nbytes); -} diff --git a/ext/libdeflate/lib/zlib_constants.h b/ext/libdeflate/lib/zlib_constants.h deleted file mode 100644 index f304310c..00000000 --- a/ext/libdeflate/lib/zlib_constants.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * zlib_constants.h - constants for the zlib wrapper format - */ - -#ifndef LIB_ZLIB_CONSTANTS_H -#define LIB_ZLIB_CONSTANTS_H - -#define ZLIB_MIN_HEADER_SIZE 2 -#define ZLIB_FOOTER_SIZE 4 -#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) - -#define ZLIB_CM_DEFLATE 8 - -#define ZLIB_CINFO_32K_WINDOW 7 - -#define ZLIB_FASTEST_COMPRESSION 0 -#define ZLIB_FAST_COMPRESSION 1 -#define ZLIB_DEFAULT_COMPRESSION 2 -#define ZLIB_SLOWEST_COMPRESSION 3 - -#endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/ext/libdeflate/lib/zlib_decompress.c b/ext/libdeflate/lib/zlib_decompress.c deleted file mode 100644 index c5a15cab..00000000 --- a/ext/libdeflate/lib/zlib_decompress.c +++ 
/dev/null
@@ -1,91 +0,0 @@
-/*
- * zlib_decompress.c - decompress with a zlib wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "unaligned.h"
-#include "zlib_constants.h"
-
-#include "libdeflate.h"
-
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
- const void *in, size_t in_nbytes,
- void *out, size_t out_nbytes_avail,
- size_t *actual_out_nbytes_ret)
-{
- const u8 *in_next = in;
- const u8 * const in_end = in_next + in_nbytes;
- u16 hdr;
- size_t actual_out_nbytes;
- enum libdeflate_result result;
-
- if (in_nbytes < ZLIB_MIN_OVERHEAD)
- return LIBDEFLATE_BAD_DATA;
-
- /* 2 byte header: CMF and FLG */
- hdr = get_unaligned_be16(in_next);
- in_next += 2;
-
- /* FCHECK */
- if ((hdr % 31) != 0)
- return LIBDEFLATE_BAD_DATA;
-
- /* CM */
- if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE)
- return LIBDEFLATE_BAD_DATA;
-
- /* CINFO */
- if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW)
- return LIBDEFLATE_BAD_DATA;
-
- /* FDICT */
- if ((hdr >> 5) & 1)
- return LIBDEFLATE_BAD_DATA;
-
- /* Compressed data */
- result = libdeflate_deflate_decompress(d, in_next,
- in_end - ZLIB_FOOTER_SIZE - in_next,
- out, out_nbytes_avail,
- actual_out_nbytes_ret);
- if (result != LIBDEFLATE_SUCCESS)
- return result;
-
- if (actual_out_nbytes_ret)
- actual_out_nbytes = *actual_out_nbytes_ret;
- else
- actual_out_nbytes = out_nbytes_avail;
-
- in_next = in_end - ZLIB_FOOTER_SIZE;
-
- /* ADLER32 */
- if (libdeflate_adler32(1, out, actual_out_nbytes) !=
- get_unaligned_be32(in_next))
- return LIBDEFLATE_BAD_DATA;
-
- return LIBDEFLATE_SUCCESS;
-}
diff --git a/ext/libdeflate/libdeflate.h b/ext/libdeflate/libdeflate.h
deleted file mode 100644
index c5600fc0..00000000
--- a/ext/libdeflate/libdeflate.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * libdeflate.h - public header for libdeflate
- */
-
-#ifndef LIBDEFLATE_H
-#define LIBDEFLATE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define LIBDEFLATE_VERSION_MAJOR 1
-#define LIBDEFLATE_VERSION_MINOR 2
-#define LIBDEFLATE_VERSION_STRING "1.2"
-
-#include <stddef.h>
-#include <stdint.h>
-
-/*
- * On Windows, if you want to link to the DLL version of libdeflate, then
- * #define LIBDEFLATE_DLL. Note that the calling convention is cdecl.
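- *
- * (Added note, not in the original source: when LIBDEFLATE_DLL is left
- * undefined, the fallback below makes LIBDEFLATEAPI_SYM_VISIBILITY expand
- * to nothing, which is what static linking uses.)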
- */ -#ifdef LIBDEFLATE_DLL -# ifdef BUILDING_LIBDEFLATE -# define LIBDEFLATEAPI_SYM_VISIBILITY LIBEXPORT -# elif defined(_WIN32) || defined(__CYGWIN__) -# define LIBDEFLATEAPI_SYM_VISIBILITY __declspec(dllimport) -# endif -#endif -#ifndef LIBDEFLATEAPI_SYM_VISIBILITY -# define LIBDEFLATEAPI_SYM_VISIBILITY -#endif - -#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \ - defined(_WIN32) && defined(__i386__) - /* - * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4. - * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX - * code when called from an MSVC-compiled application. - */ -# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer)) -#endif -#ifndef LIBDEFLATEAPI_STACKALIGN -# define LIBDEFLATEAPI_STACKALIGN -#endif - -#define LIBDEFLATEAPI LIBDEFLATEAPI_SYM_VISIBILITY LIBDEFLATEAPI_STACKALIGN - -/* ========================================================================== */ -/* Compression */ -/* ========================================================================== */ - -struct libdeflate_compressor; - -/* - * libdeflate_alloc_compressor() allocates a new compressor that supports - * DEFLATE, zlib, and gzip compression. 'compression_level' is the compression - * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 = - * medium/default, 9 = slow, 12 = slowest). The return value is a pointer to - * the new compressor, or NULL if out of memory. - * - * Note: for compression, the sliding window size is defined at compilation time - * to 32768, the largest size permissible in the DEFLATE format. It cannot be - * changed at runtime. - * - * A single compressor is not safe to use by multiple threads concurrently. - * However, different threads may use different compressors concurrently. - */ -LIBDEFLATEAPI struct libdeflate_compressor * -libdeflate_alloc_compressor(int compression_level); - -/* - * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of - * data. The function attempts to compress 'in_nbytes' bytes of data located at - * 'in' and write the results to 'out', which has space for 'out_nbytes_avail' - * bytes. The return value is the compressed size in bytes, or 0 if the data - * could not be compressed to 'out_nbytes_avail' bytes or fewer. - */ -LIBDEFLATEAPI size_t -libdeflate_deflate_compress(struct libdeflate_compressor *compressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail); - -/* - * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the - * number of bytes of compressed data that may be produced by compressing any - * buffer of length less than or equal to 'in_nbytes' using - * libdeflate_deflate_compress() with the specified compressor. Mathematically, - * this bound will necessarily be a number greater than or equal to 'in_nbytes'. - * It may be an overestimate of the true upper bound. The return value is - * guaranteed to be the same for all invocations with the same compressor and - * same 'in_nbytes'. - * - * As a special case, 'compressor' may be NULL. This causes the bound to be - * taken across *any* libdeflate_compressor that could ever be allocated with - * this build of the library, with any options. - * - * Note that this function is not necessary in many applications. With - * block-based compression, it is usually preferable to separately store the - * uncompressed size of each block and to store any blocks that did not compress - * to less than their original size uncompressed. 
In that scenario, there is no
- * need to know the worst-case compressed size, since the maximum number of
- * bytes of compressed data that may be used would always be one less than the
- * input length. You can just pass a buffer of that size to
- * libdeflate_deflate_compress() and store the data uncompressed if
- * libdeflate_deflate_compress() returns 0, indicating that the compressed data
- * did not fit into the provided output buffer.
- */
-LIBDEFLATEAPI size_t
-libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
- size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
- * format.
- */
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
- const void *in, size_t in_nbytes,
- void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_zlib_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
- size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
- * format.
- */
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
- const void *in, size_t in_nbytes,
- void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_gzip_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
- size_t in_nbytes);
-
-/*
- * libdeflate_free_compressor() frees a compressor that was allocated with
- * libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is
- * taken.
- */
-LIBDEFLATEAPI void
-libdeflate_free_compressor(struct libdeflate_compressor *compressor);
-
-/* ========================================================================== */
-/* Decompression */
-/* ========================================================================== */
-
-struct libdeflate_decompressor;
-
-/*
- * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
- * for DEFLATE, zlib, and gzip decompression. The return value is a pointer to
- * the new decompressor, or NULL if out of memory.
- *
- * This function takes no parameters, and the returned decompressor is valid for
- * decompressing data that was compressed at any compression level and with any
- * sliding window size.
- *
- * A single decompressor is not safe to use by multiple threads concurrently.
- * However, different threads may use different decompressors concurrently.
- */
-LIBDEFLATEAPI struct libdeflate_decompressor *
-libdeflate_alloc_decompressor(void);
-
-/*
- * Result of a call to libdeflate_deflate_decompress(),
- * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
- */
-enum libdeflate_result {
- /* Decompression was successful. */
- LIBDEFLATE_SUCCESS = 0,
-
- /* Decompression failed because the compressed data was invalid, corrupt,
- * or otherwise unsupported. */
- LIBDEFLATE_BAD_DATA = 1,
-
- /* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
- * decompressed to fewer than 'out_nbytes_avail' bytes. */
- LIBDEFLATE_SHORT_OUTPUT = 2,
-
- /* The data would have decompressed to more than 'out_nbytes_avail'
- * bytes.
*/ - LIBDEFLATE_INSUFFICIENT_SPACE = 3, -}; - -/* - * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream - * from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The - * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail' - * bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned. - * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If - * a nonzero result code is returned, then the contents of the output buffer are - * undefined. - * - * Decompression stops at the end of the DEFLATE stream (as indicated by the - * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes. - * - * libdeflate_deflate_decompress() can be used in cases where the actual - * uncompressed size is known (recommended) or unknown (not recommended): - * - * - If the actual uncompressed size is known, then pass the actual - * uncompressed size as 'out_nbytes_avail' and pass NULL for - * 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail - * with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the - * specified number of bytes. - * - * - If the actual uncompressed size is unknown, then provide a non-NULL - * 'actual_out_nbytes_ret' and provide a buffer with some size - * 'out_nbytes_avail' that you think is large enough to hold all the - * uncompressed data. In this case, if the data decompresses to less than - * or equal to 'out_nbytes_avail' bytes, then - * libdeflate_deflate_decompress() will write the actual uncompressed size - * to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise, - * it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was - * not large enough but no other problems were encountered, or another - * nonzero result code if decompression failed for another reason. - */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL, - * then the actual compressed size of the DEFLATE stream (aligned to the next - * byte boundary) is written to *actual_in_nbytes_ret. - */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format - * instead of raw DEFLATE. - */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format - * instead of raw DEFLATE. - * - * If multiple gzip-compressed members are concatenated, then only the first - * will be decompressed. Use libdeflate_gzip_decompress_ex() if you need - * multi-member support. 
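- *
- * (Added note, not in the original source: multi-member gzip streams
- * commonly arise from simple concatenation of .gz files, which the gzip
- * format explicitly permits.)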
- */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret); - -/* - * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret' - * argument. If 'actual_in_nbytes_ret' is not NULL and the decompression - * succeeds (indicating that the first gzip-compressed member in the input - * buffer was decompressed), then the actual number of input bytes consumed is - * written to *actual_in_nbytes_ret. - */ -LIBDEFLATEAPI enum libdeflate_result -libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret); - -/* - * libdeflate_free_decompressor() frees a decompressor that was allocated with - * libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action - * is taken. - */ -LIBDEFLATEAPI void -libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor); - -/* ========================================================================== */ -/* Checksums */ -/* ========================================================================== */ - -/* - * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of - * data and returns the updated checksum. When starting a new checksum, the - * required initial value for 'adler' is 1. This value is also returned when - * 'buffer' is specified as NULL. - */ -LIBDEFLATEAPI uint32_t -libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len); - - -/* - * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data - * and returns the updated checksum. When starting a new checksum, the required - * initial value for 'crc' is 0. This value is also returned when 'buffer' is - * specified as NULL. - */ -LIBDEFLATEAPI uint32_t -libdeflate_crc32(uint32_t crc, const void *buffer, size_t len); - -#ifdef __cplusplus -} -#endif - -#endif /* LIBDEFLATE_H */
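
For reference, the decompression API documented above can be driven as in the
following minimal sketch. It assumes the caller knows the exact uncompressed
size in advance; the helper name inflate_known_size() is illustrative and not
part of libdeflate:

	#include <stddef.h>
	#include "libdeflate.h"

	/* Decompress a raw DEFLATE stream whose uncompressed size is known.
	 * Returns 0 on success, -1 on failure. */
	static int
	inflate_known_size(const void *in, size_t in_nbytes,
	                   void *out, size_t out_nbytes)
	{
		struct libdeflate_decompressor *d =
			libdeflate_alloc_decompressor();
		enum libdeflate_result res;

		if (d == NULL)
			return -1;

		/* Passing NULL for 'actual_out_nbytes_ret' makes the call
		 * fail with LIBDEFLATE_SHORT_OUTPUT unless exactly
		 * 'out_nbytes' bytes are produced, as documented above. */
		res = libdeflate_deflate_decompress(d, in, in_nbytes,
						    out, out_nbytes, NULL);
		libdeflate_free_decompressor(d);
		return res == LIBDEFLATE_SUCCESS ? 0 : -1;
	}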