libdeflate: delete libdeflate

Ziemas 2020-12-29 15:23:11 +01:00
parent 542843e429
commit c70605e672
44 changed files with 0 additions and 10120 deletions

@@ -1,67 +0,0 @@
win32:TARGET = libdeflate
else:TARGET = deflate
TEMPLATE = lib
CONFIG += staticlib c99
win32-msvc: QMAKE_CFLAGS += /MD /O2
else {
QMAKE_CFLAGS += -O2 \
-fomit-frame-pointer \
-Wall -Wundef \
-Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla \
-fvisibility=hidden -D_ANSI_SOURCE
mingw: QMAKE_CFLAGS += -Wno-pedantic-ms-format
}
INCLUDEPATH += \
../../ext/libdeflate \
../../ext/libdeflate/common
HEADERS += \
# common headers
../../ext/libdeflate/libdeflate.h \
../../ext/libdeflate/common/common_defs.h \
../../ext/libdeflate/common/compiler_gcc.h \
../../ext/libdeflate/common/compiler_msc.h \
# library headers
../../ext/libdeflate/lib/adler32_vec_template.h \
../../ext/libdeflate/lib/aligned_malloc.h \
../../ext/libdeflate/lib/bt_matchfinder.h \
../../ext/libdeflate/lib/crc32_table.h \
../../ext/libdeflate/lib/crc32_vec_template.h \
../../ext/libdeflate/lib/decompress_template.h \
../../ext/libdeflate/lib/deflate_compress.h \
../../ext/libdeflate/lib/deflate_constants.h \
../../ext/libdeflate/lib/gzip_constants.h \
../../ext/libdeflate/lib/hc_matchfinder.h \
../../ext/libdeflate/lib/lib_common.h \
../../ext/libdeflate/lib/matchfinder_common.h \
../../ext/libdeflate/lib/unaligned.h \
../../ext/libdeflate/lib/zlib_constants.h \
../../ext/libdeflate/lib/arm/adler32_impl.h \
../../ext/libdeflate/lib/arm/cpu_features.h \
../../ext/libdeflate/lib/arm/crc32_impl.h \
../../ext/libdeflate/lib/arm/matchfinder_impl.h \
../../ext/libdeflate/lib/x86/adler32_impl.h \
../../ext/libdeflate/lib/x86/cpu_features.h \
../../ext/libdeflate/lib/x86/crc32_impl.h \
../../ext/libdeflate/lib/x86/crc32_pclmul_template.h \
../../ext/libdeflate/lib/x86/decompress_impl.h \
../../ext/libdeflate/lib/x86/matchfinder_impl.h
SOURCES += \
../../ext/libdeflate/lib/aligned_malloc.c \
../../ext/libdeflate/lib/deflate_decompress.c \
# uncomment for compression support
#../../ext/libdeflate/lib/deflate_compress.c \
# uncomment for zlib format support
#../../ext/libdeflate/lib/adler32.c \
#../../ext/libdeflate/lib/zlib_decompress.c \
#../../ext/libdeflate/lib/zlib_compress.c \
# uncomment for gzip support
#../../ext/libdeflate/lib/gzip_decompress.c \
#../../ext/libdeflate/lib/gzip_compress.c \
../../ext/libdeflate/lib/arm/cpu_features.c \
../../ext/libdeflate/lib/x86/cpu_features.c

@@ -1,94 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- configurations -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Release Optimized|x64">
<Configuration>Release Optimized</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Devel|x64">
<Configuration>Devel</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
<!-- Latest Target Version property -->
<LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
<WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
<TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
</PropertyGroup>
<!-- globals -->
<PropertyGroup Label="Globals">
<ProjectGuid>{A77564F4-56BB-3D08-8126-3FD5FC44F217}</ProjectGuid>
<!-- <WindowsTargetPlatformVersion>10.0.18362.0</WindowsTargetPlatformVersion> -->
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<!-- configuration defaults -->
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<CharacterSet>Unicode</CharacterSet>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<!-- prop includes -->
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" />
<Import Project="..\common.props" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<TargetName>$(ProjectName)$(Postfix)</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<Lib>
<OutputFile>$(BinDir)\$(ProjectName)$(Postfix).lib</OutputFile>
</Lib>
</ItemDefinitionGroup>
<!-- c files -->
<ItemGroup>
<ClCompile Include="$(ExtDir)\libdeflate\lib\aligned_malloc.c">
<DisableSpecificWarnings>4127;%(DisableSpecificWarnings)</DisableSpecificWarnings>
</ClCompile>
<ClCompile Include="$(ExtDir)\libdeflate\lib\x86\cpu_features.c">
<DisableSpecificWarnings>4127;%(DisableSpecificWarnings)</DisableSpecificWarnings>
</ClCompile>
<ClCompile Include="$(ExtDir)\libdeflate\lib\deflate_decompress.c">
<DisableSpecificWarnings>4127;4245;4100;4018;%(DisableSpecificWarnings)</DisableSpecificWarnings>
</ClCompile>
<!-- headers-->
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\adler32_impl.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\adler32_vec_template.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\aligned_malloc.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\bt_matchfinder.h" />
<ClInclude Include="$(ExtDir)\libdeflate\common\common_defs.h" />
<ClInclude Include="$(ExtDir)\libdeflate\common\compiler_gcc.h" />
<ClInclude Include="$(ExtDir)\libdeflate\common\compiler_msc.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\cpu_features.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\crc32_impl.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\crc32_pclmul_template.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\crc32_table.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\crc32_vec_template.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\decompress_impl.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\decompress_template.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\deflate_compress.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\deflate_constants.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\gzip_constants.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\hc_matchfinder.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\lib_common.h" />
<ClInclude Include="$(ExtDir)\libdeflate\libdeflate.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\matchfinder_common.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\x86\matchfinder_impl.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\unaligned.h" />
<ClInclude Include="$(ExtDir)\libdeflate\lib\zlib_constants.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
</Project>

@@ -1,126 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Generated Files">
<UniqueIdentifier>{71ED8ED8-ACB9-4CE9-BBE1-E00B30144E11}</UniqueIdentifier>
<Extensions>cpp;c;cxx;moc;h;def;odl;idl;res;</Extensions>
</Filter>
<Filter Include="Generated Files">
<UniqueIdentifier>{71ED8ED8-ACB9-4CE9-BBE1-E00B30144E11}</UniqueIdentifier>
<Extensions>cpp;c;cxx;moc;h;def;odl;idl;res;</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\ext\libdeflate\lib\aligned_malloc.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\ext\libdeflate\lib\x86\cpu_features.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\ext\libdeflate\lib\deflate_decompress.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\ext\libdeflate\lib\arm\adler32_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\adler32_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\adler32_vec_template.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\aligned_malloc.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\bt_matchfinder.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\common\common_defs.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\common\compiler_gcc.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\common\compiler_msc.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\arm\cpu_features.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\cpu_features.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\arm\crc32_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\crc32_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\crc32_pclmul_template.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\crc32_table.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\crc32_vec_template.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\decompress_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\decompress_template.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\deflate_compress.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\deflate_constants.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\gzip_constants.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\hc_matchfinder.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\lib_common.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\libdeflate.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\matchfinder_common.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\arm\matchfinder_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\x86\matchfinder_impl.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\unaligned.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\ext\libdeflate\lib\zlib_constants.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

@@ -1,84 +0,0 @@
project(libdeflate C)
set(TARGET libdeflate)
set(CMAKE_C_STANDARD 99)
if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR
${CMAKE_C_COMPILER_ID} STREQUAL "Clang" OR
${CMAKE_C_COMPILER_ID} STREQUAL "AppleClang")
set(FLAGS ${FLAGS} -fomit-frame-pointer)
set(FLAGS ${FLAGS} -Wall -Wundef)
set(FLAGS ${FLAGS} -Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla)
set(FLAGS ${FLAGS} -fvisibility=hidden -D_ANSI_SOURCE)
if (MINGW)
set(FLAGS ${FLAGS} -Wno-pedantic-ms-format)
endif()
endif()
set(COMMON_HEADERS
libdeflate.h
common/common_defs.h
common/compiler_gcc.h
common/compiler_msc.h)
set(LIB_HEADERS
lib/adler32_vec_template.h
lib/aligned_malloc.h
lib/bt_matchfinder.h
lib/crc32_table.h
lib/crc32_vec_template.h
lib/decompress_template.h
lib/deflate_compress.h
lib/deflate_constants.h
lib/gzip_constants.h
lib/hc_matchfinder.h
lib/lib_common.h
lib/matchfinder_common.h
lib/unaligned.h
lib/zlib_constants.h
lib/arm/adler32_impl.h
lib/arm/cpu_features.h
lib/arm/crc32_impl.h
lib/arm/matchfinder_impl.h
lib/x86/adler32_impl.h
lib/x86/cpu_features.h
lib/x86/crc32_impl.h
lib/x86/crc32_pclmul_template.h
lib/x86/decompress_impl.h
lib/x86/matchfinder_impl.h)
set(LIB_SRC
lib/aligned_malloc.c
lib/deflate_decompress.c
# uncomment for compression support
#lib/deflate_compress.c
# uncomment for zlib format support
#lib/adler32.c
#lib/zlib_decompress.c
#lib/zlib_compress.c
# uncomment for gzip support
#lib/gzip_decompress.c
#lib/gzip_compress.c
lib/arm/cpu_features.c
lib/x86/cpu_features.c)
add_library(${TARGET} STATIC ${LIB_SRC} ${LIB_HEADERS} ${COMMON_HEADERS})
add_library(Ext::libdeflate ALIAS ${TARGET})
set_target_properties(${TARGET} PROPERTIES PREFIX "")
set_property(TARGET ${TARGET} PROPERTY FOLDER External)
target_include_directories(${TARGET}
PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}
PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/common)
if (FLAGS)
target_compile_options(${TARGET} PRIVATE ${FLAGS})
endif()

@@ -1,21 +0,0 @@
Copyright 2016 Eric Biggers
Permission is hereby granted, free of charge, to any person
obtaining a copy of this software and associated documentation files
(the "Software"), to deal in the Software without restriction,
including without limitation the rights to use, copy, modify, merge,
publish, distribute, sublicense, and/or sell copies of the Software,
and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be
included in all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@@ -1,366 +0,0 @@
/*
* common_defs.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifndef COMMON_COMMON_DEFS_H
#define COMMON_COMMON_DEFS_H
#ifdef __GNUC__
# include "compiler_gcc.h"
#elif defined(_MSC_VER)
# include "compiler_msc.h"
#else
# pragma message("Unrecognized compiler. Please add a header file for your compiler. Compilation will proceed, but performance may suffer!")
#endif
/* ========================================================================== */
/* Type definitions */
/* ========================================================================== */
#include <stddef.h> /* size_t */
#ifndef __bool_true_false_are_defined
# include <stdbool.h> /* bool */
#endif
/* Fixed-width integer types */
#ifndef PRIu32
# include <inttypes.h>
#endif
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int8_t s8;
typedef int16_t s16;
typedef int32_t s32;
typedef int64_t s64;
/*
* Word type of the target architecture. Use 'size_t' instead of 'unsigned
* long' to account for platforms such as Windows that use 32-bit 'unsigned
* long' on 64-bit architectures.
*/
typedef size_t machine_word_t;
/* Number of bytes in a word */
#define WORDBYTES ((int)sizeof(machine_word_t))
/* Number of bits in a word */
#define WORDBITS (8 * WORDBYTES)
/* ========================================================================== */
/* Optional compiler features */
/* ========================================================================== */
/* LIBEXPORT - export a function from a shared library */
#ifndef LIBEXPORT
# define LIBEXPORT
#endif
/* inline - suggest that a function be inlined */
#ifndef inline
# define inline
#endif
/* forceinline - force a function to be inlined, if possible */
#ifndef forceinline
# define forceinline inline
#endif
/* restrict - annotate a non-aliased pointer */
#ifndef restrict
# define restrict
#endif
/* likely(expr) - hint that an expression is usually true */
#ifndef likely
# define likely(expr) (expr)
#endif
/* unlikely(expr) - hint that an expression is usually false */
#ifndef unlikely
# define unlikely(expr) (expr)
#endif
/* prefetchr(addr) - prefetch into L1 cache for read */
#ifndef prefetchr
# define prefetchr(addr)
#endif
/* prefetchw(addr) - prefetch into L1 cache for write */
#ifndef prefetchw
# define prefetchw(addr)
#endif
/* Does the compiler support the 'target' function attribute? */
#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
#endif
/* Which targets are supported with the 'target' function attribute? */
#ifndef COMPILER_SUPPORTS_BMI2_TARGET
# define COMPILER_SUPPORTS_BMI2_TARGET 0
#endif
#ifndef COMPILER_SUPPORTS_AVX_TARGET
# define COMPILER_SUPPORTS_AVX_TARGET 0
#endif
#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET
# define COMPILER_SUPPORTS_AVX512BW_TARGET 0
#endif
/*
* Which targets are supported with the 'target' function attribute and have
* intrinsics that work within 'target'-ed functions?
*/
#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0
#endif
#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS
# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0
#endif
/* _aligned_attribute(n) - declare that the annotated variable, or variables of
* the annotated type, are to be aligned on n-byte boundaries */
#ifndef _aligned_attribute
#endif
/* ========================================================================== */
/* Miscellaneous macros */
/* ========================================================================== */
#define ARRAY_LEN(A) (sizeof(A) / sizeof((A)[0]))
#define MIN(a, b) ((a) <= (b) ? (a) : (b))
#define MAX(a, b) ((a) >= (b) ? (a) : (b))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
#define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)]))
#define ALIGN(n, a) (((n) + (a) - 1) & ~((a) - 1))
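/*
* Illustrative values (added for clarity, not in the original source):
* ALIGN(13, 8) == 16 and ALIGN(16, 8) == 16 (the alignment 'a' must be a power
* of two), DIV_ROUND_UP(10, 4) == 3, and STATIC_ASSERT(expr) compiles to
* nothing when 'expr' is true but forces a negative-size array error when it
* is false.
*/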
/* ========================================================================== */
/* Endianness handling */
/* ========================================================================== */
/*
* CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little
* endian or 0 if it is big endian. The macro should be defined in a way such
* that the compiler can evaluate it at compilation time. If not defined, a
* fallback is used.
*/
#ifndef CPU_IS_LITTLE_ENDIAN
static forceinline int CPU_IS_LITTLE_ENDIAN(void)
{
union {
unsigned int v;
unsigned char b;
} u;
u.v = 1;
return u.b;
}
#endif
/* bswap16(n) - swap the bytes of a 16-bit integer */
#ifndef bswap16
static forceinline u16 bswap16(u16 n)
{
return (n << 8) | (n >> 8);
}
#endif
/* bswap32(n) - swap the bytes of a 32-bit integer */
#ifndef bswap32
static forceinline u32 bswap32(u32 n)
{
return ((n & 0x000000FF) << 24) |
((n & 0x0000FF00) << 8) |
((n & 0x00FF0000) >> 8) |
((n & 0xFF000000) >> 24);
}
#endif
/* bswap64(n) - swap the bytes of a 64-bit integer */
#ifndef bswap64
static forceinline u64 bswap64(u64 n)
{
return ((n & 0x00000000000000FF) << 56) |
((n & 0x000000000000FF00) << 40) |
((n & 0x0000000000FF0000) << 24) |
((n & 0x00000000FF000000) << 8) |
((n & 0x000000FF00000000) >> 8) |
((n & 0x0000FF0000000000) >> 24) |
((n & 0x00FF000000000000) >> 40) |
((n & 0xFF00000000000000) >> 56);
}
#endif
#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n))
#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n))
#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n))
#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n))
#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n))
#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n))
/* ========================================================================== */
/* Unaligned memory accesses */
/* ========================================================================== */
/*
* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
* can be performed efficiently on the target platform.
*/
#ifndef UNALIGNED_ACCESS_IS_FAST
# define UNALIGNED_ACCESS_IS_FAST 0
#endif
/*
* DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type',
* defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions
* which load and store variables of type 'type' from/to unaligned memory
* addresses. If not defined, a fallback is used.
*/
#ifndef DEFINE_UNALIGNED_TYPE
/*
* Although memcpy() may seem inefficient, it *usually* gets optimized
* appropriately by modern compilers. It's portable and may be the best we can
* do for a fallback...
*/
#include <string.h>
#define DEFINE_UNALIGNED_TYPE(type) \
\
static forceinline type \
load_##type##_unaligned(const void *p) \
{ \
type v; \
memcpy(&v, p, sizeof(v)); \
return v; \
} \
\
static forceinline void \
store_##type##_unaligned(type v, void *p) \
{ \
memcpy(p, &v, sizeof(v)); \
}
#endif /* !DEFINE_UNALIGNED_TYPE */
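/*
* Usage sketch (added for illustration; the real call sites are in
* lib/unaligned.h, and read_first_word() below is a hypothetical caller):
* instantiating the template once per integer type produces a matching pair of
* helpers that are safe on unaligned addresses:
*
*	DEFINE_UNALIGNED_TYPE(u32)
*
*	static u32 read_first_word(const u8 *buf)
*	{
*		return load_u32_unaligned(buf);
*	}
*/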
/* ========================================================================== */
/* Bit scan functions */
/* ========================================================================== */
/*
* Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
* significant end) of the *most* significant 1 bit in the input value. The
* input value must be nonzero!
*/
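/* Example (added for clarity): bsr32(0x13) == 4, since 0x13 is binary 10011
* and its most significant 1 bit is bit 4. */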
#ifndef bsr32
static forceinline unsigned
bsr32(u32 n)
{
unsigned i = 0;
while ((n >>= 1) != 0)
i++;
return i;
}
#endif
#ifndef bsr64
static forceinline unsigned
bsr64(u64 n)
{
unsigned i = 0;
while ((n >>= 1) != 0)
i++;
return i;
}
#endif
static forceinline unsigned
bsrw(machine_word_t n)
{
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
if (WORDBITS == 32)
return bsr32(n);
else
return bsr64(n);
}
/*
* Bit Scan Forward (BSF) - find the 0-based index (relative to the least
* significant end) of the *least* significant 1 bit in the input value. The
* input value must be nonzero!
*/
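/* Example (added for clarity): bsf32(0x18) == 3, since 0x18 is binary 11000
* and its least significant 1 bit is bit 3. */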
#ifndef bsf32
static forceinline unsigned
bsf32(u32 n)
{
unsigned i = 0;
while ((n & 1) == 0) {
i++;
n >>= 1;
}
return i;
}
#endif
#ifndef bsf64
static forceinline unsigned
bsf64(u64 n)
{
unsigned i = 0;
while ((n & 1) == 0) {
i++;
n >>= 1;
}
return i;
}
#endif
static forceinline unsigned
bsfw(machine_word_t n)
{
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
if (WORDBITS == 32)
return bsf32(n);
else
return bsf64(n);
}
#endif /* COMMON_COMMON_DEFS_H */

@@ -1,159 +0,0 @@
/*
* compiler_gcc.h - definitions for the GNU C Compiler. This also handles clang
* and the Intel C Compiler (icc).
*
* TODO: icc is not well tested, so some things are currently disabled even
* though they could perhaps be enabled on some icc versions.
*/
#if !defined(__clang__) && !defined(__INTEL_COMPILER)
# define GCC_PREREQ(major, minor) \
(__GNUC__ > (major) || \
(__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
#else
# define GCC_PREREQ(major, minor) 0
#endif
/* Note: only check the clang version when absolutely necessary!
* "Vendors" such as Apple can use different version numbers. */
#ifdef __clang__
# ifdef __apple_build_version__
# define CLANG_PREREQ(major, minor, apple_version) \
(__apple_build_version__ >= (apple_version))
# else
# define CLANG_PREREQ(major, minor, apple_version) \
(__clang_major__ > (major) || \
(__clang_major__ == (major) && __clang_minor__ >= (minor)))
# endif
#else
# define CLANG_PREREQ(major, minor, apple_version) 0
#endif
#ifndef __has_attribute
# define __has_attribute(attribute) 0
#endif
#ifndef __has_feature
# define __has_feature(feature) 0
#endif
#ifndef __has_builtin
# define __has_builtin(builtin) 0
#endif
#ifdef _WIN32
# define LIBEXPORT __declspec(dllexport)
#else
# define LIBEXPORT __attribute__((visibility("default")))
#endif
#define inline inline
#define forceinline inline __attribute__((always_inline))
#define restrict __restrict__
#define likely(expr) __builtin_expect(!!(expr), 1)
#define unlikely(expr) __builtin_expect(!!(expr), 0)
#define prefetchr(addr) __builtin_prefetch((addr), 0)
#define prefetchw(addr) __builtin_prefetch((addr), 1)
#define _aligned_attribute(n) __attribute__((aligned(n)))
#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE \
(GCC_PREREQ(4, 4) || __has_attribute(target))
#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# if defined(__i386__) || defined(__x86_64__)
# define COMPILER_SUPPORTS_PCLMUL_TARGET \
(GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
# define COMPILER_SUPPORTS_AVX_TARGET \
(GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
# define COMPILER_SUPPORTS_BMI2_TARGET \
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
# define COMPILER_SUPPORTS_AVX2_TARGET \
(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))
# define COMPILER_SUPPORTS_AVX512BW_TARGET \
(GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512))
/*
* Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics
* not available in the main target could not be used in 'target'
* attribute functions. Unfortunately clang has no feature test macro
* for this so we have to check its version.
*/
# if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
# define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 1
# define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS \
COMPILER_SUPPORTS_PCLMUL_TARGET
# define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS \
COMPILER_SUPPORTS_AVX2_TARGET
# define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS \
COMPILER_SUPPORTS_AVX512BW_TARGET
# endif
# elif (defined(__arm__) && defined(__ARM_FP)) || defined(__aarch64__)
/* arm: including arm_neon.h requires hardware fp support */
/*
* Prior to gcc 6.1 (r230411 for arm, r226563 for aarch64), NEON
* and crypto intrinsics not available in the main target could not be
* used in 'target' attribute functions.
*
* clang as of 5.0.1 still doesn't allow it. But, it does seem to allow
* the pmull intrinsics if only __ARM_NEON is enabled.
*/
# define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS GCC_PREREQ(6, 1)
# ifdef __ARM_NEON
# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS \
(GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64))
# else
# define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS \
(GCC_PREREQ(6, 1))
# endif
# endif
#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */
/* Newer gcc supports __BYTE_ORDER__. Older gcc doesn't. */
#ifdef __BYTE_ORDER__
# define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
#endif
#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
# define bswap16 __builtin_bswap16
#endif
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
# define bswap32 __builtin_bswap32
#endif
#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
# define bswap64 __builtin_bswap64
#endif
#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__)
# define UNALIGNED_ACCESS_IS_FAST 1
#endif
/* With gcc, we can access unaligned memory through 'packed' structures. */
#define DEFINE_UNALIGNED_TYPE(type) \
\
struct type##unaligned { \
type v; \
} __attribute__((packed)); \
\
static forceinline type \
load_##type##_unaligned(const void *p) \
{ \
return ((const struct type##unaligned *)p)->v; \
} \
\
static forceinline void \
store_##type##_unaligned(type v, void *p) \
{ \
((struct type##unaligned *)p)->v = v; \
}
#define bsr32(n) (31 - __builtin_clz(n))
#define bsr64(n) (63 - __builtin_clzll(n))
#define bsf32(n) __builtin_ctz(n)
#define bsf64(n) __builtin_ctzll(n)

@@ -1,96 +0,0 @@
/*
* compiler_msc.h - definitions for the Microsoft C Compiler
*/
#define LIBEXPORT __declspec(dllexport)
/*
* Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
* Beware: the below replacement isn't fully standard, since normally any value
* != 0 should be implicitly cast to a bool with value 1... but that doesn't
* happen if bool is really just an 'int'.
*/
typedef int bool;
#define true 1
#define false 0
#define __bool_true_false_are_defined 1
/* Define ssize_t */
#ifdef _WIN64
typedef long long ssize_t;
#else
typedef int ssize_t;
#endif
/*
* Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header
* inttypes.h. Work around this by defining the PRI* macros ourselves.
*/
#include <stdint.h>
#define PRIu8 "hhu"
#define PRIu16 "hu"
#define PRIu32 "u"
#define PRIu64 "llu"
#define PRIi8 "hhi"
#define PRIi16 "hi"
#define PRIi32 "i"
#define PRIi64 "lli"
#define PRIx8 "hhx"
#define PRIx16 "hx"
#define PRIx32 "x"
#define PRIx64 "llx"
/* Assume a little endian architecture with fast unaligned access */
#define CPU_IS_LITTLE_ENDIAN() 1
#define UNALIGNED_ACCESS_IS_FAST 1
/* __restrict has nonstandard behavior; don't use it */
#define restrict
/* ... but we can use __inline and __forceinline */
#define inline __inline
#define forceinline __forceinline
/* Byte swap functions */
#include <stdlib.h>
#define bswap16 _byteswap_ushort
#define bswap32 _byteswap_ulong
#define bswap64 _byteswap_uint64
/* Bit scan functions (32-bit) */
static forceinline unsigned
bsr32(uint32_t n)
{
_BitScanReverse(&n, n);
return n;
}
#define bsr32 bsr32
static forceinline unsigned
bsf32(uint32_t n)
{
_BitScanForward(&n, n);
return n;
}
#define bsf32 bsf32
#ifdef _M_X64 /* Bit scan functions (64-bit) */
static forceinline unsigned
bsr64(uint64_t n)
{
_BitScanReverse64(&n, n);
return n;
}
#define bsr64 bsr64
static forceinline unsigned
bsf64(uint64_t n)
{
_BitScanForward64(&n, n);
return n;
}
#define bsf64 bsf64
#endif /* _M_X64 */

@@ -1,130 +0,0 @@
/*
* adler32.c - Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "lib_common.h"
#include "libdeflate.h"
/* The Adler-32 divisor, or "base", value. */
#define DIVISOR 65521
/*
* MAX_CHUNK_SIZE is the most bytes that can be processed without the
* possibility of s2 overflowing when it is represented as an unsigned 32-bit
* integer. This value was computed using the following Python script:
*
* divisor = 65521
* count = 0
* s1 = divisor - 1
* s2 = divisor - 1
* while True:
* s1 += 0xFF
* s2 += s1
* if s2 > 0xFFFFFFFF:
* break
* count += 1
* print(count)
*
* Note that to get the correct worst-case value, we must assume that every byte
* has value 0xFF and that s1 and s2 started with the highest possible values
* modulo the divisor.
*/
#define MAX_CHUNK_SIZE 5552
typedef u32 (*adler32_func_t)(u32, const u8 *, size_t);
/* Include architecture-specific implementations if available */
#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__arm__) || defined(__aarch64__)
# include "arm/adler32_impl.h"
#elif defined(__i386__) || defined(__x86_64__)
# include "x86/adler32_impl.h"
#endif
/* Define a generic implementation if needed */
#ifndef DEFAULT_IMPL
#define DEFAULT_IMPL adler32_generic
static u32 adler32_generic(u32 adler, const u8 *p, size_t size)
{
u32 s1 = adler & 0xFFFF;
u32 s2 = adler >> 16;
const u8 * const end = p + size;
while (p != end) {
size_t chunk_size = MIN(end - p, MAX_CHUNK_SIZE);
const u8 *chunk_end = p + chunk_size;
size_t num_unrolled_iterations = chunk_size / 4;
while (num_unrolled_iterations--) {
s1 += *p++;
s2 += s1;
s1 += *p++;
s2 += s1;
s1 += *p++;
s2 += s1;
s1 += *p++;
s2 += s1;
}
while (p != chunk_end) {
s1 += *p++;
s2 += s1;
}
s1 %= DIVISOR;
s2 %= DIVISOR;
}
return (s2 << 16) | s1;
}
#endif /* !DEFAULT_IMPL */
#ifdef DISPATCH
static u32 dispatch(u32, const u8 *, size_t);
static volatile adler32_func_t adler32_impl = dispatch;
/* Choose the fastest implementation at runtime */
static u32 dispatch(u32 adler, const u8 *buffer, size_t size)
{
adler32_func_t f = arch_select_adler32_func();
if (f == NULL)
f = DEFAULT_IMPL;
adler32_impl = f;
return adler32_impl(adler, buffer, size);
}
#else
# define adler32_impl DEFAULT_IMPL /* only one implementation, use it */
#endif
LIBDEFLATEAPI u32
libdeflate_adler32(u32 adler, const void *buffer, size_t size)
{
if (buffer == NULL) /* return initial value */
return 1;
return adler32_impl(adler, buffer, size);
}
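/*
* Caller-side sketch (added for illustration, not part of this file): a
* checksum starts from the initial value 1 (also returned for a NULL buffer)
* and can be continued across chunks by feeding the previous result back in:
*
*	u32 adler = libdeflate_adler32(1, chunk1, chunk1_len);
*	adler = libdeflate_adler32(adler, chunk2, chunk2_len);
*/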

@@ -1,124 +0,0 @@
/*
* adler32_vec_template.h - template for vectorized Adler-32 implementations
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This file contains a template for vectorized Adler-32 implementations.
*
* The inner loop between reductions modulo 65521 of an unvectorized Adler-32
* implementation looks something like this:
*
* do {
* s1 += *p;
* s2 += s1;
* } while (++p != chunk_end);
*
* For vectorized calculation of s1, we only need to sum the input bytes. They
* can be accumulated into multiple counters which are eventually summed
* together.
*
* For vectorized calculation of s2, the basic idea is that for each iteration
* that processes N bytes, we can perform the following vectorizable
* calculation:
*
* s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
*
* Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
* separate counters, then do the multiplications by N...1 just once at the end
* rather than once per iteration.
*
* Also, we must account for how previous bytes will affect s2 by doing the
* following at the beginning of each iteration:
*
* s2 += s1 * N
*
* Furthermore, like s1, "s2" can actually be multiple counters which are
* eventually summed together.
*/
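/*
* Worked example (added for clarity, not in the original comment): for one
* iteration with N = 4 and input bytes b1..b4, the scalar loop would add
*
*	(s1 + b1) + (s1 + b1 + b2) + (s1 + b1 + b2 + b3) + (s1 + b1 + b2 + b3 + b4)
*	    = 4*s1 + 4*b1 + 3*b2 + 2*b3 + 1*b4
*
* to s2, which is exactly the vectorized form: add s1 * N up front, accumulate
* the bytes into per-position counters, and apply the weights N..1 once at the
* end.
*/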
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const u8 *p, size_t size)
{
u32 s1 = adler & 0xFFFF;
u32 s2 = adler >> 16;
const u8 * const end = p + size;
const u8 *vend;
const size_t max_chunk_size =
MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
IMPL_SEGMENT_SIZE);
/* Process a byte at a time until the needed alignment is reached */
if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
do {
s1 += *p++;
s2 += s1;
} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
s1 %= DIVISOR;
s2 %= DIVISOR;
}
/*
* Process "chunks" of bytes using vector instructions. Chunk sizes are
* limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
* overflow before being reduced modulo DIVISOR. For vector processing,
* chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
* may be further limited to IMPL_MAX_CHUNK_SIZE.
*/
STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
while (p != vend) {
size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
s2 += s1 * chunk_size;
FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
&s1, &s2);
p += chunk_size;
s1 %= DIVISOR;
s2 %= DIVISOR;
}
/* Process any remaining bytes */
if (p != end) {
do {
s1 += *p++;
s2 += s1;
} while (p != end);
s1 %= DIVISOR;
s2 %= DIVISOR;
}
return (s2 << 16) | s1;
}
#undef FUNCNAME
#undef FUNCNAME_CHUNK
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE
#undef IMPL_MAX_CHUNK_SIZE

@@ -1,57 +0,0 @@
/*
* aligned_malloc.c - aligned memory allocation
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This file provides portable aligned memory allocation functions that only
* use malloc() and free(). This avoids portability problems with
* posix_memalign(), aligned_alloc(), etc.
*/
#include <stdlib.h>
#include "aligned_malloc.h"
void *
aligned_malloc(size_t alignment, size_t size)
{
void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
if (ptr) {
void *orig_ptr = ptr;
ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
((void **)ptr)[-1] = orig_ptr;
}
return ptr;
}
void
aligned_free(void *ptr)
{
if (ptr)
free(((void **)ptr)[-1]);
}
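/*
* Usage sketch (added for illustration): the alignment must be a power of two
* (as assumed by the ALIGN() macro), and the buffer must be released with
* aligned_free() rather than plain free(), because the returned pointer is
* offset from the block malloc() actually handed out:
*
*	void *buf = aligned_malloc(64, 4096);
*	if (buf) {
*		... use 4096 bytes at a 64-byte-aligned address ...
*		aligned_free(buf);
*	}
*/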

@@ -1,13 +0,0 @@
/*
* aligned_malloc.h - aligned memory allocation
*/
#ifndef LIB_ALIGNED_MALLOC_H
#define LIB_ALIGNED_MALLOC_H
#include "lib_common.h"
extern void *aligned_malloc(size_t alignment, size_t size);
extern void aligned_free(void *ptr);
#endif /* LIB_ALIGNED_MALLOC_H */

@@ -1,120 +0,0 @@
/*
* arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cpu_features.h"
/* NEON implementation */
#undef DISPATCH_NEON
#if !defined(DEFAULT_IMPL) && \
(defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS))
# define FUNCNAME adler32_neon
# define FUNCNAME_CHUNK adler32_neon_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_SIZE 32
/* Prevent unsigned overflow of the 16-bit precision byte counters */
# define IMPL_MAX_CHUNK_SIZE (32 * (0xFFFF / 0xFF))
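/* That works out to 32 * 257 = 8224 bytes per chunk: each 16-bit lane below
* accumulates one byte per 32-byte iteration, so at most 257 * 0xFF = 0xFFFF,
* which just fits. (Comment added for clarity; not in the original source.) */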
# ifdef __ARM_NEON
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_neon
# else
# ifdef __arm__
# define ATTRIBUTES __attribute__((target("fpu=neon")))
# else
# define ATTRIBUTES __attribute__((target("+simd")))
# endif
# define DISPATCH 1
# define DISPATCH_NEON 1
# endif
# include <arm_neon.h>
static forceinline ATTRIBUTES void
adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
u32 *s1, u32 *s2)
{
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
do {
const uint8x16_t bytes1 = *p++;
const uint8x16_t bytes2 = *p++;
uint16x8_t tmp;
v_s2 += v_s1;
/* Vector Pairwise Add Long (u8 => u16) */
tmp = vpaddlq_u8(bytes1);
/* Vector Pairwise Add and Accumulate Long (u8 => u16) */
tmp = vpadalq_u8(tmp, bytes2);
/* Vector Pairwise Add and Accumulate Long (u16 => u32) */
v_s1 = vpadalq_u16(v_s1, tmp);
/* Vector Add Wide (u8 => u16) */
v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
} while (p != end);
/* Vector Shift Left (u32) */
v_s2 = vqshlq_n_u32(v_s2, 5);
/* Vector Multiply Accumulate Long (u16 => u32) */
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), (uint16x4_t) { 32, 31, 30, 29 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), (uint16x4_t) { 24, 23, 22, 21 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), (uint16x4_t) { 16, 15, 14, 13 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10, 9 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) { 8, 7, 6, 5 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) { 4, 3, 2, 1 });
*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
}
# include "../adler32_vec_template.h"
#endif /* NEON implementation */
#ifdef DISPATCH
static inline adler32_func_t
arch_select_adler32_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_NEON
if (features & ARM_CPU_FEATURE_NEON)
return adler32_neon;
#endif
return NULL;
}
#endif /* DISPATCH */

@@ -1,119 +0,0 @@
/*
* arm/cpu_features.c - feature detection for ARM processors
*
* Copyright 2018 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* ARM processors don't have a standard way for unprivileged programs to detect
* processor features. But, on Linux we can read the AT_HWCAP and AT_HWCAP2
* values from /proc/self/auxv.
*
* Ideally we'd use the C library function getauxval(), but it's not guaranteed
* to be available: it was only added to glibc in 2.16, and in Android it was
* added to API level 18 for ARM and level 21 for AArch64.
*/
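/*
* For reference only (added here for illustration; it is not what this file
* does): where getauxval() is known to be available, the same values can be
* read directly:
*
*	#include <sys/auxv.h>
*
*	unsigned long hwcap = getauxval(AT_HWCAP);
*	unsigned long hwcap2 = getauxval(AT_HWCAP2);
*
* The scan_auxv() fallback below avoids that dependency by parsing
* /proc/self/auxv by hand.
*/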
#include "cpu_features.h"
#if ARM_CPU_FEATURES_ENABLED
#include <errno.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#define AT_HWCAP 16
#define AT_HWCAP2 26
volatile u32 _cpu_features = 0;
static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
{
int fd;
unsigned long auxbuf[32];
int filled = 0;
int i;
fd = open("/proc/self/auxv", O_RDONLY);
if (fd < 0)
return;
for (;;) {
do {
int ret = read(fd, &((char *)auxbuf)[filled],
sizeof(auxbuf) - filled);
if (ret <= 0) {
if (ret < 0 && errno == EINTR)
continue;
goto out;
}
filled += ret;
} while (filled < 2 * sizeof(long));
i = 0;
do {
unsigned long type = auxbuf[i];
unsigned long value = auxbuf[i + 1];
if (type == AT_HWCAP)
*hwcap = value;
else if (type == AT_HWCAP2)
*hwcap2 = value;
i += 2;
filled -= 2 * sizeof(long);
} while (filled >= 2 * sizeof(long));
memmove(auxbuf, &auxbuf[i], filled);
}
out:
close(fd);
}
void setup_cpu_features(void)
{
u32 features = 0;
unsigned long hwcap = 0;
unsigned long hwcap2 = 0;
scan_auxv(&hwcap, &hwcap2);
#ifdef __arm__
STATIC_ASSERT(sizeof(long) == 4);
if (hwcap & (1 << 12)) /* HWCAP_NEON */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
#else
STATIC_ASSERT(sizeof(long) == 8);
if (hwcap & (1 << 1)) /* HWCAP_ASIMD */
features |= ARM_CPU_FEATURE_NEON;
if (hwcap & (1 << 4)) /* HWCAP_PMULL */
features |= ARM_CPU_FEATURE_PMULL;
#endif
_cpu_features = features | ARM_CPU_FEATURES_KNOWN;
}
#endif /* ARM_CPU_FEATURES_ENABLED */

@@ -1,37 +0,0 @@
/*
* arm/cpu_features.h - feature detection for ARM processors
*/
#ifndef LIB_ARM_CPU_FEATURES_H
#define LIB_ARM_CPU_FEATURES_H
#include "../lib_common.h"
#if (defined(__arm__) || defined(__aarch64__)) && \
defined(__linux__) && COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# define ARM_CPU_FEATURES_ENABLED 1
#else
# define ARM_CPU_FEATURES_ENABLED 0
#endif
#if ARM_CPU_FEATURES_ENABLED
#define ARM_CPU_FEATURE_NEON 0x00000001
#define ARM_CPU_FEATURE_PMULL 0x00000002
#define ARM_CPU_FEATURES_KNOWN 0x80000000
extern volatile u32 _cpu_features;
extern void setup_cpu_features(void);
static inline u32 get_cpu_features(void)
{
if (_cpu_features == 0)
setup_cpu_features();
return _cpu_features;
}
#endif /* ARM_CPU_FEATURES_ENABLED */
#endif /* LIB_ARM_CPU_FEATURES_H */

@@ -1,166 +0,0 @@
/*
* arm/crc32_impl.h
*
* Copyright 2017 Jun He <jun.he@linaro.org>
* Copyright 2018 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cpu_features.h"
/*
* CRC-32 folding with ARM Crypto extension-PMULL
*
* This works the same way as the x86 PCLMUL version.
* See x86/crc32_pclmul_template.h for an explanation.
*/
#undef DISPATCH_PMULL
#if (defined(__ARM_FEATURE_CRYPTO) || \
(ARM_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \
/* not yet tested on big endian, probably needs changes to work there */ \
(defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && \
/* clang as of v5.0.1 doesn't allow pmull intrinsics in 32-bit mode, even
* when compiling with -mfpu=crypto-neon-fp-armv8 */ \
!(defined(__clang__) && defined(__arm__))
# define FUNCNAME crc32_pmull
# define FUNCNAME_ALIGNED crc32_pmull_aligned
# ifdef __ARM_FEATURE_CRYPTO
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_pmull
# else
# ifdef __arm__
# define ATTRIBUTES __attribute__((target("fpu=crypto-neon-fp-armv8")))
# else
# ifdef __clang__
# define ATTRIBUTES __attribute__((target("crypto")))
# else
# define ATTRIBUTES __attribute__((target("+crypto")))
# endif
# endif
# define DISPATCH 1
# define DISPATCH_PMULL 1
# endif
#include <arm_neon.h>
static forceinline ATTRIBUTES uint8x16_t
clmul_00(uint8x16_t a, uint8x16_t b)
{
return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
(poly64_t)vget_low_u8(b));
}
static forceinline ATTRIBUTES uint8x16_t
clmul_10(uint8x16_t a, uint8x16_t b)
{
return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
(poly64_t)vget_high_u8(b));
}
static forceinline ATTRIBUTES uint8x16_t
clmul_11(uint8x16_t a, uint8x16_t b)
{
return (uint8x16_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b);
}
static forceinline ATTRIBUTES uint8x16_t
fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers)
{
return dst ^ clmul_00(src, multipliers) ^ clmul_11(src, multipliers);
}
static forceinline ATTRIBUTES u32
crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs)
{
/* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
const uint8x16_t multipliers_4 =
(uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 };
const uint8x16_t multipliers_1 =
(uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E };
const uint8x16_t final_multiplier =
(uint8x16_t)(uint64x2_t){ 0xB8BC6765 };
const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF };
const uint8x16_t barrett_reduction_constants =
(uint8x16_t)(uint64x2_t){ 0x00000001F7011641,
0x00000001DB710641 };
const uint8x16_t zeroes = (uint8x16_t){ 0 };
const uint8x16_t * const end = p + nr_segs;
const uint8x16_t * const end512 = p + (nr_segs & ~3);
uint8x16_t x0, x1, x2, x3;
x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder };
if (nr_segs >= 4) {
x1 = *p++;
x2 = *p++;
x3 = *p++;
/* Fold 512 bits at a time */
while (p != end512) {
x0 = fold_128b(*p++, x0, multipliers_4);
x1 = fold_128b(*p++, x1, multipliers_4);
x2 = fold_128b(*p++, x2, multipliers_4);
x3 = fold_128b(*p++, x3, multipliers_4);
}
/* Fold 512 bits => 128 bits */
x1 = fold_128b(x1, x0, multipliers_1);
x2 = fold_128b(x2, x1, multipliers_1);
x0 = fold_128b(x3, x2, multipliers_1);
}
/* Fold 128 bits at a time */
while (p != end)
x0 = fold_128b(*p++, x0, multipliers_1);
/* Fold 128 => 96 bits, implicitly appending 32 zeroes */
x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1);
/* Fold 96 => 64 bits */
x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier);
/* Reduce 64 => 32 bits using Barrett reduction */
x1 = x0;
x0 = clmul_00(x0 & mask32, barrett_reduction_constants);
x0 = clmul_10(x0 & mask32, barrett_reduction_constants);
return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1);
}
#define IMPL_ALIGNMENT 16
#define IMPL_SEGMENT_SIZE 16
#include "../crc32_vec_template.h"
#endif /* PMULL implementation */
#ifdef DISPATCH
static inline crc32_func_t
arch_select_crc32_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_PMULL
if (features & ARM_CPU_FEATURE_PMULL)
return crc32_pmull;
#endif
return NULL;
}
#endif /* DISPATCH */

@@ -1,93 +0,0 @@
/*
* arm/matchfinder_impl.h - ARM implementations of matchfinder functions
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifdef __ARM_NEON
# if MATCHFINDER_ALIGNMENT < 16
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 16
# endif
# include <arm_neon.h>
static forceinline bool
matchfinder_init_neon(mf_pos_t *data, size_t size)
{
int16x8_t v, *p;
size_t n;
if (size % (sizeof(int16x8_t) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = (int16x8_t) {
MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
};
p = (int16x8_t *)data;
n = size / (sizeof(int16x8_t) * 4);
do {
p[0] = v;
p[1] = v;
p[2] = v;
p[3] = v;
p += 4;
} while (--n);
return true;
}
#undef arch_matchfinder_init
#define arch_matchfinder_init matchfinder_init_neon
static forceinline bool
matchfinder_rebase_neon(mf_pos_t *data, size_t size)
{
int16x8_t v, *p;
size_t n;
if (size % (sizeof(int16x8_t) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = (int16x8_t) {
(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
};
p = (int16x8_t *)data;
n = size / (sizeof(int16x8_t) * 4);
do {
p[0] = vqaddq_s16(p[0], v);
p[1] = vqaddq_s16(p[1], v);
p[2] = vqaddq_s16(p[2], v);
p[3] = vqaddq_s16(p[3], v);
p += 4;
} while (--n);
return true;
}
#undef arch_matchfinder_rebase
#define arch_matchfinder_rebase matchfinder_rebase_neon
#endif /* __ARM_NEON */

View file

@@ -1,355 +0,0 @@
/*
* bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* ----------------------------------------------------------------------------
*
* This is a Binary Trees (bt) based matchfinder.
*
* The main data structure is a hash table where each hash bucket contains a
* binary tree of sequences whose first 4 bytes share the same hash code. Each
* sequence is identified by its starting position in the input buffer. Each
* binary tree is always sorted such that each left child represents a sequence
* lexicographically lesser than its parent and each right child represents a
* sequence lexicographically greater than its parent.
*
* The algorithm processes the input buffer sequentially. At each byte
* position, the hash code of the first 4 bytes of the sequence beginning at
* that position (the sequence being matched against) is computed. This
* identifies the hash bucket to use for that position. Then, a new binary tree
* node is created to represent the current sequence. Then, in a single tree
* traversal, the hash bucket's binary tree is searched for matches and is
* re-rooted at the new node.
*
* Compared to the simpler algorithm that uses linked lists instead of binary
* trees (see hc_matchfinder.h), the binary tree version gains more information
* at each node visitation. Ideally, the binary tree version will examine only
* 'log(n)' nodes to find the same matches that the linked list version will
* find by examining 'n' nodes. In addition, the binary tree version can
* examine fewer bytes at each node by taking advantage of the common prefixes
* that result from the sort order, whereas the linked list version may have to
* examine up to the full length of the match at each node.
*
* However, it is not always best to use the binary tree version. It requires
* nearly twice as much memory as the linked list version, and it takes time to
* keep the binary trees sorted, even at positions where the compressor does not
* need matches. Generally, when doing fast compression on small buffers,
* binary trees are the wrong approach. They are best suited for thorough
* compression and/or large buffers.
*
* ----------------------------------------------------------------------------
*/
#include "matchfinder_common.h"
#define BT_MATCHFINDER_HASH3_ORDER 16
#define BT_MATCHFINDER_HASH3_WAYS 2
#define BT_MATCHFINDER_HASH4_ORDER 16
#define BT_MATCHFINDER_TOTAL_HASH_LENGTH \
((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
(1UL << BT_MATCHFINDER_HASH4_ORDER))
/* Representation of a match found by the bt_matchfinder */
struct lz_match {
/* The number of bytes matched. */
u16 length;
/* The offset back from the current position that was matched. */
u16 offset;
};
struct bt_matchfinder {
/* The hash table for finding length 3 matches */
mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
/* The hash table which contains the roots of the binary trees for
* finding length 4+ matches */
mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
/* The child node references for the binary trees. The left and right
* children of the node for the sequence with position 'pos' are
* 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */
mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
}
#ifdef _aligned_attribute
_aligned_attribute(MATCHFINDER_ALIGNMENT)
#endif
;
/* Prepare the matchfinder for a new input buffer. */
static forceinline void
bt_matchfinder_init(struct bt_matchfinder *mf)
{
matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
}
static forceinline void
bt_matchfinder_slide_window(struct bt_matchfinder *mf)
{
matchfinder_rebase((mf_pos_t *)mf,
sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
}
static forceinline mf_pos_t *
bt_left_child(struct bt_matchfinder *mf, s32 node)
{
return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
}
static forceinline mf_pos_t *
bt_right_child(struct bt_matchfinder *mf, s32 node)
{
return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
}
/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
* and bt_matchfinder_skip_position(). There must be sufficiently many bytes
* remaining to load a 32-bit integer from the *next* position. */
#define BT_MATCHFINDER_REQUIRED_NBYTES 5
/* Advance the binary tree matchfinder by one byte, optionally recording
* matches. @record_matches should be a compile-time constant. */
static forceinline struct lz_match *
bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
const u8 * const restrict in_base,
const ptrdiff_t cur_pos,
const u32 max_len,
const u32 nice_len,
const u32 max_search_depth,
u32 * const restrict next_hashes,
u32 * const restrict best_len_ret,
struct lz_match * restrict lz_matchptr,
const bool record_matches)
{
const u8 *in_next = in_base + cur_pos;
u32 depth_remaining = max_search_depth;
const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
u32 next_hashseq;
u32 hash3;
u32 hash4;
s32 cur_node;
#if BT_MATCHFINDER_HASH3_WAYS >= 2
s32 cur_node_2;
#endif
const u8 *matchptr;
mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
u32 best_lt_len, best_gt_len;
u32 len;
u32 best_len = 3;
STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
BT_MATCHFINDER_HASH3_WAYS <= 2);
next_hashseq = get_unaligned_le32(in_next + 1);
hash3 = next_hashes[0];
hash4 = next_hashes[1];
next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
prefetchw(&mf->hash3_tab[next_hashes[0]]);
prefetchw(&mf->hash4_tab[next_hashes[1]]);
cur_node = mf->hash3_tab[hash3][0];
mf->hash3_tab[hash3][0] = cur_pos;
#if BT_MATCHFINDER_HASH3_WAYS >= 2
cur_node_2 = mf->hash3_tab[hash3][1];
mf->hash3_tab[hash3][1] = cur_node;
#endif
if (record_matches && cur_node > cutoff) {
u32 seq3 = load_u24_unaligned(in_next);
if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
lz_matchptr->length = 3;
lz_matchptr->offset = in_next - &in_base[cur_node];
lz_matchptr++;
}
#if BT_MATCHFINDER_HASH3_WAYS >= 2
else if (cur_node_2 > cutoff &&
seq3 == load_u24_unaligned(&in_base[cur_node_2]))
{
lz_matchptr->length = 3;
lz_matchptr->offset = in_next - &in_base[cur_node_2];
lz_matchptr++;
}
#endif
}
cur_node = mf->hash4_tab[hash4];
mf->hash4_tab[hash4] = cur_pos;
pending_lt_ptr = bt_left_child(mf, cur_pos);
pending_gt_ptr = bt_right_child(mf, cur_pos);
if (cur_node <= cutoff) {
*pending_lt_ptr = MATCHFINDER_INITVAL;
*pending_gt_ptr = MATCHFINDER_INITVAL;
*best_len_ret = best_len;
return lz_matchptr;
}
best_lt_len = 0;
best_gt_len = 0;
len = 0;
for (;;) {
matchptr = &in_base[cur_node];
if (matchptr[len] == in_next[len]) {
len = lz_extend(in_next, matchptr, len + 1, max_len);
if (!record_matches || len > best_len) {
if (record_matches) {
best_len = len;
lz_matchptr->length = len;
lz_matchptr->offset = in_next - matchptr;
lz_matchptr++;
}
if (len >= nice_len) {
*pending_lt_ptr = *bt_left_child(mf, cur_node);
*pending_gt_ptr = *bt_right_child(mf, cur_node);
*best_len_ret = best_len;
return lz_matchptr;
}
}
}
if (matchptr[len] < in_next[len]) {
*pending_lt_ptr = cur_node;
pending_lt_ptr = bt_right_child(mf, cur_node);
cur_node = *pending_lt_ptr;
best_lt_len = len;
if (best_gt_len < len)
len = best_gt_len;
} else {
*pending_gt_ptr = cur_node;
pending_gt_ptr = bt_left_child(mf, cur_node);
cur_node = *pending_gt_ptr;
best_gt_len = len;
if (best_lt_len < len)
len = best_lt_len;
}
if (cur_node <= cutoff || !--depth_remaining) {
*pending_lt_ptr = MATCHFINDER_INITVAL;
*pending_gt_ptr = MATCHFINDER_INITVAL;
*best_len_ret = best_len;
return lz_matchptr;
}
}
}
/*
* Retrieve a list of matches for the current position.
*
* @mf
* The matchfinder structure.
* @in_base
* Pointer to the next byte in the input buffer to process _at the last
* time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
* @cur_pos
* The current position in the input buffer relative to @in_base (the
* position of the sequence being matched against).
* @max_len
* The maximum permissible match length at this position. Must be >=
* BT_MATCHFINDER_REQUIRED_NBYTES.
* @nice_len
* Stop searching if a match of at least this length is found.
* Must be <= @max_len.
* @max_search_depth
* Limit on the number of potential matches to consider. Must be >= 1.
* @next_hashes
* The precomputed hash codes for the sequence beginning at @in_next.
* These will be used and then updated with the precomputed hashcodes for
* the sequence beginning at @in_next + 1.
* @best_len_ret
* If a match of length >= 4 was found, then the length of the longest such
* match is written here; otherwise 3 is written here. (Note: this is
* redundant with the 'struct lz_match' array, but this is easier for the
* compiler to optimize when inlined and the caller immediately does a
* check against 'best_len'.)
* @lz_matchptr
* An array in which this function will record the matches. The recorded
* matches will be sorted by strictly increasing length and (non-strictly)
* increasing offset. The maximum number of matches that may be found is
* 'nice_len - 2'.
*
* The return value is a pointer to the next available slot in the @lz_matchptr
* array. (If no matches were found, this will be the same as @lz_matchptr.)
*/
static forceinline struct lz_match *
bt_matchfinder_get_matches(struct bt_matchfinder *mf,
const u8 *in_base,
ptrdiff_t cur_pos,
u32 max_len,
u32 nice_len,
u32 max_search_depth,
u32 next_hashes[2],
u32 *best_len_ret,
struct lz_match *lz_matchptr)
{
return bt_matchfinder_advance_one_byte(mf,
in_base,
cur_pos,
max_len,
nice_len,
max_search_depth,
next_hashes,
best_len_ret,
lz_matchptr,
true);
}
/*
* Advance the matchfinder, but don't record any matches.
*
* This is very similar to bt_matchfinder_get_matches() because both functions
* must do hashing and tree re-rooting.
*/
static forceinline void
bt_matchfinder_skip_position(struct bt_matchfinder *mf,
const u8 *in_base,
ptrdiff_t cur_pos,
u32 nice_len,
u32 max_search_depth,
u32 next_hashes[2])
{
u32 best_len;
bt_matchfinder_advance_one_byte(mf,
in_base,
cur_pos,
nice_len,
nice_len,
max_search_depth,
next_hashes,
&best_len,
NULL,
false);
}
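/*
 * A minimal usage sketch, not part of the original header: one way a caller
 * could drive this matchfinder over a buffer no larger than
 * MATCHFINDER_WINDOW_SIZE (window sliding is ignored here).  The function
 * name, the parameters, and the nice_len/max_search_depth values are
 * illustrative assumptions, not libdeflate's actual compressor code.
 */
static forceinline void
bt_matchfinder_usage_sketch(struct bt_matchfinder *mf,
			    const u8 *buffer, size_t buffer_size)
{
	enum { NICE_LEN = 32, MAX_DEPTH = 16 };
	struct lz_match matches[NICE_LEN - 2]; /* at most nice_len - 2 matches */
	u32 next_hashes[2] = { 0, 0 };
	u32 best_len;
	ptrdiff_t pos;

	bt_matchfinder_init(mf);
	for (pos = 0; pos + NICE_LEN <= (ptrdiff_t)buffer_size; pos++) {
		struct lz_match *end =
			bt_matchfinder_get_matches(mf, buffer, pos,
						   NICE_LEN /* max_len  */,
						   NICE_LEN /* nice_len */,
						   MAX_DEPTH, next_hashes,
						   &best_len, matches);

		/* matches[0 .. (end - matches) - 1] are now sorted by
		 * increasing length.  A real compressor would pick one (or
		 * emit a literal) and call bt_matchfinder_skip_position()
		 * for the remaining bytes covered by a chosen match. */
		(void)end;
	}
}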

View file

@@ -1,313 +0,0 @@
/*
* crc32.c - CRC-32 checksum algorithm for the gzip format
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* High-level description of CRC
* =============================
*
* Consider a bit sequence 'bits[1...len]'. Interpret 'bits' as the "message"
* polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
* where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute:
*
* R(x) = M(x)*x^n mod G(x)
*
* where G(x) is a selected "generator" polynomial of degree 'n'. The remainder
* R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x)
* interpreted as a bitstring of length 'n'.
*
* CRC used in gzip
* ================
*
* In the gzip format (RFC 1952):
*
* - The bitstring to checksum is formed from the bytes of the uncompressed
* data by concatenating the bits from the bytes in order, proceeding
* from the low-order bit to the high-order bit within each byte.
*
* - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
* x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
* Consequently, the CRC length is 32 bits ("CRC-32").
*
* - The highest order 32 coefficients of M(x)*x^n are inverted.
*
* - All 32 coefficients of R(x) are inverted.
*
* The two inversions cause added leading and trailing zero bits to affect the
* resulting CRC, whereas with a regular CRC such bits would have no effect on
* the CRC.
*
* Computation and optimizations
* =============================
*
* We can compute R(x) through "long division", maintaining only 32 bits of
* state at any given time. Multiplication by 'x' can be implemented as
* right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
* highest order bit represents the coefficient of x^0), and both addition and
* subtraction can be implemented as bitwise exclusive OR (since we are working
* in GF(2)). Here is an unoptimized implementation:
*
* static u32 crc32_gzip(const u8 *buffer, size_t size)
* {
* u32 remainder = 0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < size * 8 + 32; i++) {
* int bit;
* u32 multiple;
*
* if (i < size * 8)
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* else
* bit = 0; // one of the 32 appended 0 bits
*
* if (i < 32) // the first 32 bits are inverted
* bit ^= 1;
*
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
*
* remainder >>= 1;
* remainder |= (u32)bit << 31;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* In this implementation, the 32-bit integer 'remainder' maintains the
* remainder of the currently processed portion of the message (with 32 zero
* bits appended) when divided by the generator polynomial. 'remainder' is the
* representation of R(x), and 'divisor' is the representation of G(x) excluding
* the x^32 coefficient. For each bit to process, we multiply R(x) by 'x^1',
* then add 'x^0' if the new bit is a 1. If this causes R(x) to gain a nonzero
* x^32 term, then we subtract G(x) from R(x).
*
* We can speed this up by taking advantage of the fact that XOR is commutative
* and associative, so the order in which we combine the inputs into 'remainder'
* is unimportant. And since each message bit we add doesn't affect the choice
* of 'multiple' until 32 bits later, we need not actually add each message bit
* until that point:
*
* static u32 crc32_gzip(const u8 *buffer, size_t size)
* {
* u32 remainder = ~0;
* const u32 divisor = 0xEDB88320;
*
* for (size_t i = 0; i < size * 8; i++) {
* int bit;
* u32 multiple;
*
* bit = (buffer[i / 8] >> (i % 8)) & 1;
* remainder ^= bit;
* if (remainder & 1)
* multiple = divisor;
* else
* multiple = 0;
* remainder >>= 1;
* remainder ^= multiple;
* }
*
* return ~remainder;
* }
*
* With the above implementation we get the effect of 32 appended 0 bits for
* free; they never affect the choice of a divisor, nor would they change the
* value of 'remainder' if they were to be actually XOR'ed in. And by starting
* with a remainder of all 1 bits, we get the effect of complementing the first
* 32 message bits.
*
* The next optimization is to process the input in multi-bit units. Suppose
* that we insert the next 'n' message bits into the remainder. Then we get an
* intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
* bits is the amount by which the low 32 bits of the remainder will change as a
* result of cancelling out those 'n' bits. Taking n=8 (one byte) and
* precomputing a table containing the CRC of each possible byte, we get
* crc32_slice1() defined below.
*
* As a further optimization, we could increase the multi-bit unit size to 16.
* However, that is inefficient because the table size explodes from 256 entries
* (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
* fit in L1 cache on typical processors.
*
* However, we can actually process 4 bytes at a time using 4 different tables
* with 256 entries each. Logically, we form a 64-bit intermediate remainder
* and cancel out the high 32 bits in 8-bit chunks. Bits 32-39 are cancelled
* out by the CRC of those bits, whereas bits 40-47 are cancelled out by the
* CRC of those bits with 8 zero bits appended, and so on. This method is
* implemented in crc32_slice4(), defined below.
*
* In crc32_slice8(), this method is extended to 8 bytes at a time. The
* intermediate remainder (which we never actually store explicitly) is 96 bits.
*
* On CPUs that support fast carryless multiplication, CRCs can be computed even
* more quickly via "folding". See e.g. the x86 PCLMUL implementation.
*/
#include "lib_common.h"
#include "libdeflate.h"
typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
/* Include architecture-specific implementations if available */
#undef CRC32_SLICE1
#undef CRC32_SLICE4
#undef CRC32_SLICE8
#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__arm__) || defined(__aarch64__)
# include "arm/crc32_impl.h"
#elif defined(__i386__) || defined(__x86_64__)
# include "x86/crc32_impl.h"
#endif
/*
* Define a generic implementation (crc32_slice8()) if needed. crc32_slice1()
* may also be needed as a fallback for architecture-specific implementations.
*/
#ifndef DEFAULT_IMPL
# define CRC32_SLICE8 1
# define DEFAULT_IMPL crc32_slice8
#endif
#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
#include "crc32_table.h"
static forceinline u32
crc32_update_byte(u32 remainder, u8 next_byte)
{
return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
}
#endif
#ifdef CRC32_SLICE1
static u32
crc32_slice1(u32 remainder, const u8 *buffer, size_t size)
{
size_t i;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
for (i = 0; i < size; i++)
remainder = crc32_update_byte(remainder, buffer[i]);
return remainder;
}
#endif /* CRC32_SLICE1 */
#ifdef CRC32_SLICE4
static u32
crc32_slice4(u32 remainder, const u8 *buffer, size_t size)
{
const u8 *p = buffer;
const u8 *end = buffer + size;
const u8 *end32;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
for (; ((uintptr_t)p & 3) && p != end; p++)
remainder = crc32_update_byte(remainder, *p);
end32 = p + ((end - p) & ~3);
for (; p != end32; p += 4) {
u32 v = le32_bswap(*(const u32 *)p);
remainder =
crc32_table[0x300 + (u8)((remainder ^ v) >> 0)] ^
crc32_table[0x200 + (u8)((remainder ^ v) >> 8)] ^
crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
}
for (; p != end; p++)
remainder = crc32_update_byte(remainder, *p);
return remainder;
}
#endif /* CRC32_SLICE4 */
#ifdef CRC32_SLICE8
static u32
crc32_slice8(u32 remainder, const u8 *buffer, size_t size)
{
const u8 *p = buffer;
const u8 *end = buffer + size;
const u8 *end64;
STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
for (; ((uintptr_t)p & 7) && p != end; p++)
remainder = crc32_update_byte(remainder, *p);
end64 = p + ((end - p) & ~7);
for (; p != end64; p += 8) {
u32 v1 = le32_bswap(*(const u32 *)(p + 0));
u32 v2 = le32_bswap(*(const u32 *)(p + 4));
remainder =
crc32_table[0x700 + (u8)((remainder ^ v1) >> 0)] ^
crc32_table[0x600 + (u8)((remainder ^ v1) >> 8)] ^
crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
crc32_table[0x300 + (u8)(v2 >> 0)] ^
crc32_table[0x200 + (u8)(v2 >> 8)] ^
crc32_table[0x100 + (u8)(v2 >> 16)] ^
crc32_table[0x000 + (u8)(v2 >> 24)];
}
for (; p != end; p++)
remainder = crc32_update_byte(remainder, *p);
return remainder;
}
#endif /* CRC32_SLICE8 */
#ifdef DISPATCH
static u32 dispatch(u32, const u8 *, size_t);
static volatile crc32_func_t crc32_impl = dispatch;
/* Choose the fastest implementation at runtime */
static u32 dispatch(u32 remainder, const u8 *buffer, size_t size)
{
crc32_func_t f = arch_select_crc32_func();
if (f == NULL)
f = DEFAULT_IMPL;
crc32_impl = f;
return crc32_impl(remainder, buffer, size);
}
#else
# define crc32_impl DEFAULT_IMPL /* only one implementation, use it */
#endif
LIBDEFLATEAPI u32
libdeflate_crc32(u32 remainder, const void *buffer, size_t size)
{
if (buffer == NULL) /* return initial value */
return 0;
return ~crc32_impl(~remainder, buffer, size);
}
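/*
 * A minimal usage sketch, assuming only the libdeflate_crc32() API above:
 * the CRC may be computed in one call or incrementally over chunks, since
 * feeding the running value back in as 'remainder' continues the same CRC.
 * Passing a NULL buffer returns the initial value (0).  The function and
 * variable names below are illustrative only.
 */
static u32
crc32_usage_sketch(const u8 *data, size_t size)
{
	size_t half = size / 2;
	u32 crc = libdeflate_crc32(0, NULL, 0);	/* initial value: 0 */

	crc = libdeflate_crc32(crc, data, half);		/* first chunk  */
	crc = libdeflate_crc32(crc, data + half, size - half);	/* second chunk */

	/* 'crc' now equals libdeflate_crc32(0, data, size). */
	return crc;
}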

View file

@@ -1,526 +0,0 @@
/*
* crc32_table.h - data table to accelerate CRC-32 computation
*
* THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c. DO NOT EDIT.
*/
#include <stdint.h>
static const uint32_t crc32_table[] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1,
#endif /* CRC32_SLICE4 || CRC32_SLICE8 */
#if defined(CRC32_SLICE8)
0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0,
0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10,
0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111,
0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1,
0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92,
0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693,
0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053,
0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4,
0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15,
0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5,
0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256,
0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496,
0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57,
0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299,
0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459,
0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958,
0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b,
0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db,
0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda,
0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a,
0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d,
0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c,
0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c,
0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f,
0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de,
0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e,
0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42,
0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82,
0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743,
0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0,
0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00,
0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601,
0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546,
0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386,
0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87,
0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847,
0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404,
0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905,
0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5,
0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b,
0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca,
0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a,
0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589,
0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349,
0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888,
0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f,
0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf,
0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce,
0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d,
0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d,
0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c,
0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c,
0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae,
0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8,
0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3,
0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5,
0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223,
0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258,
0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e,
0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798,
0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5,
0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3,
0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503,
0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715,
0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578,
0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2,
0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4,
0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf,
0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59,
0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f,
0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834,
0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22,
0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2,
0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99,
0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f,
0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f,
0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02,
0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14,
0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676,
0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460,
0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d,
0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed,
0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb,
0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680,
0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340,
0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156,
0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d,
0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b,
0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd,
0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6,
0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0,
0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a,
0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77,
0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61,
0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81,
0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97,
0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa,
0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c,
0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a,
0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41,
0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7,
0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1,
0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da,
0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc,
0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d,
0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e,
0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa,
0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9,
0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240,
0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834,
0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27,
0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301,
0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66,
0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975,
0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf,
0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc,
0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb,
0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4,
0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7,
0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183,
0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a,
0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739,
0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d,
0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e,
0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b,
0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f,
0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c,
0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6,
0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1,
0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2,
0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f,
0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c,
0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b,
0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1,
0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2,
0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6,
0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3,
0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0,
0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794,
0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387,
0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e,
0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a,
0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49,
0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516,
0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71,
0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62,
0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8,
0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb,
0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac,
0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a,
0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899,
0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed,
0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044,
0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457,
0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23,
0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30,
0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3,
0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919,
0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56,
0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac,
0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832,
0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d,
0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387,
0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5,
0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00,
0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa,
0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e,
0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64,
0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1,
0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e,
0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4,
0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb,
0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425,
0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf,
0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90,
0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a,
0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2,
0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced,
0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217,
0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673,
0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6,
0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c,
0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239,
0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3,
0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776,
0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312,
0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8,
0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7,
0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f,
0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95,
0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda,
0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520,
0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe,
0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1,
0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b,
0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4,
0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61,
0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b,
0x061d761c, 0xcab77682, 0x44387161, 0x889271ff,
0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05,
0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0,
0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282,
0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78,
0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937,
0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9,
0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53,
0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c,
0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6,
#endif /* CRC32_SLICE8 */
};
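/*
 * Illustrative sketch, not the actual gen_crc32_table.c: the first 256
 * entries above are the CRC-32 remainder of each possible byte value
 * (no inversions applied), computed bit-at-a-time with the reflected
 * generator polynomial 0xEDB88320.  The later slices hold the remainder of
 * a byte followed by one or more zero bytes, as described in crc32.c.
 */
static void
crc32_gen_slice1_sketch(uint32_t table_out[256])
{
	uint32_t n, crc;
	int k;

	for (n = 0; n < 256; n++) {
		crc = n;
		for (k = 0; k < 8; k++)
			crc = (crc >> 1) ^ ((crc & 1) ? 0xEDB88320 : 0);
		table_out[n] = crc;
	}
}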

View file

@@ -1,61 +0,0 @@
/*
* crc32_vec_template.h - template for vectorized CRC-32 implementations
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#define CRC32_SLICE1 1
static u32 crc32_slice1(u32, const u8 *, size_t);
/*
* Template for vectorized CRC-32 implementations.
*
* Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
* of crc32_slice8() because only a few bytes need to be processed, so a smaller
* table is preferable.
*/
static u32 ATTRIBUTES
FUNCNAME(u32 remainder, const u8 *p, size_t size)
{
if ((uintptr_t)p % IMPL_ALIGNMENT) {
size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT);
remainder = crc32_slice1(remainder, p, n);
p += n;
size -= n;
}
if (size >= IMPL_SEGMENT_SIZE) {
remainder = FUNCNAME_ALIGNED(remainder, (const void *)p,
size / IMPL_SEGMENT_SIZE);
p += size - (size % IMPL_SEGMENT_SIZE);
size %= IMPL_SEGMENT_SIZE;
}
return crc32_slice1(remainder, p, size);
}
#undef FUNCNAME
#undef FUNCNAME_ALIGNED
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE

View file

@@ -1,421 +0,0 @@
/*
* decompress_template.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
/*
* This is the actual DEFLATE decompression routine, lifted out of
* deflate_decompress.c so that it can be compiled multiple times with different
* target instruction sets.
*/
static enum libdeflate_result ATTRIBUTES
FUNCNAME(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
u8 *out_next = out;
u8 * const out_end = out_next + out_nbytes_avail;
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
bitbuf_t bitbuf = 0;
unsigned bitsleft = 0;
size_t overrun_count = 0;
unsigned i;
unsigned is_final_block;
unsigned block_type;
u16 len;
u16 nlen;
unsigned num_litlen_syms;
unsigned num_offset_syms;
u16 tmp16;
u32 tmp32;
next_block:
/* Starting to read the next block. */
;
STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
ENSURE_BITS(1 + 2 + 5 + 5 + 4);
/* BFINAL: 1 bit */
is_final_block = POP_BITS(1);
/* BTYPE: 2 bits */
block_type = POP_BITS(2);
if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
/* Dynamic Huffman block. */
/* The order in which precode lengths are stored. */
static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};
unsigned num_explicit_precode_lens;
/* Read the codeword length counts. */
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
num_litlen_syms = POP_BITS(5) + 257;
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
num_offset_syms = POP_BITS(5) + 1;
STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
num_explicit_precode_lens = POP_BITS(4) + 4;
d->static_codes_loaded = false;
/* Read the precode codeword lengths. */
STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
for (i = 0; i < num_explicit_precode_lens; i++) {
ENSURE_BITS(3);
d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
}
for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
/* Build the decode table for the precode. */
SAFETY_CHECK(build_precode_decode_table(d));
/* Expand the literal/length and offset codeword lengths. */
for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
u32 entry;
unsigned presym;
u8 rep_val;
unsigned rep_count;
ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
/* (The code below assumes that the precode decode table
* does not have any subtables.) */
STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
/* Read the next precode symbol. */
entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
presym = entry >> HUFFDEC_RESULT_SHIFT;
if (presym < 16) {
/* Explicit codeword length */
d->u.l.lens[i++] = presym;
continue;
}
/* Run-length encoded codeword lengths */
* Note: we don't need to verify that the repeat count
* doesn't overflow the number of elements, since we
* have enough extra entries to allow for the worst-case
* overrun (138 zeroes when only 1 length was
* remaining).
*
* In the case of the small repeat counts (presyms 16
* and 17), it is fastest to always write the maximum
* number of entries. That gets rid of branches that
* would otherwise be required.
*
* It is not just because of the numerical order that
* our checks go in the order 'presym < 16', 'presym ==
* 16', and 'presym == 17'. For typical data, this order
* also runs from the most frequent to the least frequent case.
*/
STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
if (presym == 16) {
/* Repeat the previous length 3 - 6 times */
SAFETY_CHECK(i != 0);
rep_val = d->u.l.lens[i - 1];
STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
rep_count = 3 + POP_BITS(2);
d->u.l.lens[i + 0] = rep_val;
d->u.l.lens[i + 1] = rep_val;
d->u.l.lens[i + 2] = rep_val;
d->u.l.lens[i + 3] = rep_val;
d->u.l.lens[i + 4] = rep_val;
d->u.l.lens[i + 5] = rep_val;
i += rep_count;
} else if (presym == 17) {
/* Repeat zero 3 - 10 times */
STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
rep_count = 3 + POP_BITS(3);
d->u.l.lens[i + 0] = 0;
d->u.l.lens[i + 1] = 0;
d->u.l.lens[i + 2] = 0;
d->u.l.lens[i + 3] = 0;
d->u.l.lens[i + 4] = 0;
d->u.l.lens[i + 5] = 0;
d->u.l.lens[i + 6] = 0;
d->u.l.lens[i + 7] = 0;
d->u.l.lens[i + 8] = 0;
d->u.l.lens[i + 9] = 0;
i += rep_count;
} else {
/* Repeat zero 11 - 138 times */
STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
rep_count = 11 + POP_BITS(7);
memset(&d->u.l.lens[i], 0,
rep_count * sizeof(d->u.l.lens[i]));
i += rep_count;
}
}
} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
/* Uncompressed block: copy 'len' bytes literally from the input
* buffer to the output buffer. */
ALIGN_INPUT();
SAFETY_CHECK(in_end - in_next >= 4);
len = READ_U16();
nlen = READ_U16();
SAFETY_CHECK(len == (u16)~nlen);
if (unlikely(len > out_end - out_next))
return LIBDEFLATE_INSUFFICIENT_SPACE;
SAFETY_CHECK(len <= in_end - in_next);
memcpy(out_next, in_next, len);
in_next += len;
out_next += len;
goto block_done;
} else {
SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
/*
* Static Huffman block: build the decode tables for the static
* codes. Skip doing so if the tables are already set up from
* an earlier static block; this speeds up decompression of
* degenerate input of many empty or very short static blocks.
*
* Afterwards, the remainder is the same as decompressing a
* dynamic Huffman block.
*/
if (d->static_codes_loaded)
goto have_decode_tables;
d->static_codes_loaded = true;
STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
for (i = 0; i < 144; i++)
d->u.l.lens[i] = 8;
for (; i < 256; i++)
d->u.l.lens[i] = 9;
for (; i < 280; i++)
d->u.l.lens[i] = 7;
for (; i < 288; i++)
d->u.l.lens[i] = 8;
for (; i < 288 + 32; i++)
d->u.l.lens[i] = 5;
num_litlen_syms = 288;
num_offset_syms = 32;
}
/* Decompressing a Huffman block (either dynamic or static) */
SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
have_decode_tables:
/* The main DEFLATE decode loop */
for (;;) {
u32 entry;
u32 length;
u32 offset;
const u8 *src;
u8 *dst;
/* Decode a litlen symbol. */
ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Litlen subtable required (uncommon case) */
REMOVE_BITS(LITLEN_TABLEBITS);
entry = d->u.litlen_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
if (entry & HUFFDEC_LITERAL) {
/* Literal */
if (unlikely(out_next == out_end))
return LIBDEFLATE_INSUFFICIENT_SPACE;
*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
continue;
}
/* Match or end-of-block */
entry >>= HUFFDEC_RESULT_SHIFT;
ENSURE_BITS(MAX_ENSURE);
/* Pop the extra length bits and add them to the length base to
* produce the full length. */
length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
/* The match destination must not end after the end of the
* output buffer. For efficiency, combine this check with the
* end-of-block check. We're using 0 for the special
* end-of-block length, so subtract 1 to turn it into
* SIZE_MAX. */
STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
if (unlikely((size_t)length - 1 >= out_end - out_next)) {
if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
return LIBDEFLATE_INSUFFICIENT_SPACE;
goto block_done;
}
/* Decode the match offset. */
entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
if (entry & HUFFDEC_SUBTABLE_POINTER) {
/* Offset subtable required (uncommon case) */
REMOVE_BITS(OFFSET_TABLEBITS);
entry = d->offset_decode_table[
((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
BITS(entry & HUFFDEC_LENGTH_MASK)];
}
REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
entry >>= HUFFDEC_RESULT_SHIFT;
STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
DEFLATE_MAX_OFFSET_CODEWORD_LEN +
DEFLATE_MAX_EXTRA_OFFSET_BITS))
ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
/* Pop the extra offset bits and add them to the offset base to
* produce the full offset. */
offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
/* The match source must not begin before the beginning of the
* output buffer. */
SAFETY_CHECK(offset <= out_next - (const u8 *)out);
/*
* Copy the match: 'length' bytes at 'out_next - offset' to
* 'out_next', possibly overlapping. If the match doesn't end
* too close to the end of the buffer and either offset >= WORDBYTES
* or offset == 1, take a fast path which copies a word at a time
* -- potentially more than the length of the match, but that's
* fine as long as we check for enough extra space.
*
* The remaining cases are not performance-critical so are
* handled by a simple byte-by-byte copy.
*/
src = out_next - offset;
dst = out_next;
out_next += length;
if (UNALIGNED_ACCESS_IS_FAST &&
/* max overrun is writing 3 words for a min length match */
likely(out_end - out_next >=
3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
if (offset >= WORDBYTES) { /* words don't overlap? */
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
do {
copy_word_unaligned(src, dst);
src += WORDBYTES;
dst += WORDBYTES;
} while (dst < out_next);
} else if (offset == 1) {
/* RLE encoding of previous byte, common if the
* data contains many repeated bytes */
machine_word_t v = repeat_byte(*src);
store_word_unaligned(v, dst);
dst += WORDBYTES;
store_word_unaligned(v, dst);
dst += WORDBYTES;
do {
store_word_unaligned(v, dst);
dst += WORDBYTES;
} while (dst < out_next);
} else {
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
} else {
STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
*dst++ = *src++;
*dst++ = *src++;
do {
*dst++ = *src++;
} while (dst < out_next);
}
}
block_done:
/* Finished decoding a block. */
if (!is_final_block)
goto next_block;
/* That was the last block. */
/* Discard any readahead bits and check for excessive overread */
ALIGN_INPUT();
/* Optionally return the actual number of bytes read */
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
/* Optionally return the actual number of bytes written */
if (actual_out_nbytes_ret) {
*actual_out_nbytes_ret = out_next - (u8 *)out;
} else {
if (out_next != out_end)
return LIBDEFLATE_SHORT_OUTPUT;
}
return LIBDEFLATE_SUCCESS;
}
#undef FUNCNAME
#undef ATTRIBUTES

File diff suppressed because it is too large


@ -1,14 +0,0 @@
#ifndef LIB_DEFLATE_COMPRESS_H
#define LIB_DEFLATE_COMPRESS_H
#include "lib_common.h"
/* DEFLATE compression is private to deflate_compress.c, but we do need to be
* able to query the compression level for zlib and gzip header generation. */
struct libdeflate_compressor;
extern unsigned int
deflate_get_compression_level(struct libdeflate_compressor *c);
#endif /* LIB_DEFLATE_COMPRESS_H */


@ -1,66 +0,0 @@
/*
* deflate_constants.h - constants for the DEFLATE compression format
*/
#ifndef LIB_DEFLATE_CONSTANTS_H
#define LIB_DEFLATE_CONSTANTS_H
/* Valid block types */
#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0
#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1
#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2
/* Minimum and maximum supported match lengths (in bytes) */
#define DEFLATE_MIN_MATCH_LEN 3
#define DEFLATE_MAX_MATCH_LEN 258
/* Minimum and maximum supported match offsets (in bytes) */
#define DEFLATE_MIN_MATCH_OFFSET 1
#define DEFLATE_MAX_MATCH_OFFSET 32768
#define DEFLATE_MAX_WINDOW_SIZE 32768
/* Number of symbols in each Huffman code. Note: for the literal/length
* and offset codes, these are actually the maximum values; a given block
* might use fewer symbols. */
#define DEFLATE_NUM_PRECODE_SYMS 19
#define DEFLATE_NUM_LITLEN_SYMS 288
#define DEFLATE_NUM_OFFSET_SYMS 32
/* The maximum number of symbols across all codes */
#define DEFLATE_MAX_NUM_SYMS 288
/* Division of symbols in the literal/length code */
#define DEFLATE_NUM_LITERALS 256
#define DEFLATE_END_OF_BLOCK 256
#define DEFLATE_NUM_LEN_SYMS 31
/* Maximum codeword length, in bits, within each Huffman code */
#define DEFLATE_MAX_PRE_CODEWORD_LEN 7
#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15
#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15
/* The maximum codeword length across all codes */
#define DEFLATE_MAX_CODEWORD_LEN 15
/* Maximum possible overrun when decoding codeword lengths */
#define DEFLATE_MAX_LENS_OVERRUN 137
/*
* Maximum number of extra bits that may be required to represent a match
* length or offset.
*
* TODO: are we going to have full DEFLATE64 support? If so, up to 16
* length bits must be supported.
*/
#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5
#define DEFLATE_MAX_EXTRA_OFFSET_BITS 14
/* The maximum number of bits in which a match can be represented. This
* is the absolute worst case, which assumes the longest possible Huffman
* codewords and the maximum numbers of extra bits. */
#define DEFLATE_MAX_MATCH_BITS \
(DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \
DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS)
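/* With the values above, this works out to 15 + 5 + 15 + 14 = 49 bits. */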
#endif /* LIB_DEFLATE_CONSTANTS_H */


@ -1,997 +0,0 @@
/*
* deflate_decompress.c - a decompressor for DEFLATE
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* ---------------------------------------------------------------------------
*
* This is a highly optimized DEFLATE decompressor. When compiled with gcc on
* x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
* instructions are available). On other architectures it should still be
* significantly faster than zlib, but the difference may be smaller.
*
* Why this is faster than zlib's implementation:
*
* - Word accesses rather than byte accesses when reading input
* - Word accesses rather than byte accesses when copying matches
* - Faster Huffman decoding combined with various DEFLATE-specific tricks
* - Larger bitbuffer variable that doesn't need to be filled as often
* - Other optimizations to remove unnecessary branches
* - Only full-buffer decompression is supported, so the code doesn't need to
* support stopping and resuming decompression.
* - On x86_64, compile a version of the decompression routine using BMI2
* instructions and use it automatically at runtime when supported.
*/
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include "deflate_constants.h"
#include "unaligned.h"
#include "libdeflate.h"
/*
* If the expression passed to SAFETY_CHECK() evaluates to false, then the
* decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
* compressed data is invalid.
*
* Theoretically, these checks could be disabled for specialized applications
* where all input to the decompressor will be trusted.
*/
#if 0
# pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
# define SAFETY_CHECK(expr) (void)(expr)
#else
# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
#endif
/*
* Each TABLEBITS number is the base-2 logarithm of the number of entries in the
* main portion of the corresponding decode table. Each number should be large
* enough to ensure that for typical data, the vast majority of symbols can be
* decoded by a direct lookup of the next TABLEBITS bits of compressed data.
* However, this must be balanced against the fact that a larger table requires
* more memory and requires more time to fill.
*
* Note: you cannot change a TABLEBITS number without also changing the
* corresponding ENOUGH number!
*/
#define PRECODE_TABLEBITS 7
#define LITLEN_TABLEBITS 10
#define OFFSET_TABLEBITS 8
/*
* Each ENOUGH number is the maximum number of decode table entries that may be
* required for the corresponding Huffman code, including the main table and all
* subtables. Each number depends on three parameters:
*
* (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
* (2) the number of main table bits (the TABLEBITS numbers defined above)
* (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
*
* The ENOUGH numbers were computed using the utility program 'enough' from
* zlib. This program enumerates all possible relevant Huffman codes to find
* the worst-case usage of decode table entries.
*/
#define PRECODE_ENOUGH 128 /* enough 19 7 7 */
#define LITLEN_ENOUGH 1334 /* enough 288 10 15 */
#define OFFSET_ENOUGH 402 /* enough 32 8 15 */
/*
* Type for codeword lengths.
*/
typedef u8 len_t;
/*
* The main DEFLATE decompressor structure. Since this implementation only
* supports full buffer decompression, this structure does not store the entire
* decompression state, but rather only some arrays that are too large to
* comfortably allocate on the stack.
*/
struct libdeflate_decompressor {
/*
* The arrays aren't all needed at the same time. 'precode_lens' and
* 'precode_decode_table' are unneeded after 'lens' has been filled.
* Furthermore, 'lens' need not be retained after building the litlen
* and offset decode tables. In fact, 'lens' can be in union with
* 'litlen_decode_table' provided that 'offset_decode_table' is separate
* and is built first.
*/
union {
len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS];
struct {
len_t lens[DEFLATE_NUM_LITLEN_SYMS +
DEFLATE_NUM_OFFSET_SYMS +
DEFLATE_MAX_LENS_OVERRUN];
u32 precode_decode_table[PRECODE_ENOUGH];
} l;
u32 litlen_decode_table[LITLEN_ENOUGH];
} u;
u32 offset_decode_table[OFFSET_ENOUGH];
/* used only during build_decode_table() */
u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
bool static_codes_loaded;
};
/*****************************************************************************
* Input bitstream *
*****************************************************************************/
/*
* The state of the "input bitstream" consists of the following variables:
*
* - in_next: pointer to the next unread byte in the input buffer
*
* - in_end: pointer just past the end of the input buffer
*
* - bitbuf: a word-sized variable containing bits that have been read from
* the input buffer. The buffered bits are right-aligned
* (they're the low-order bits).
*
* - bitsleft: number of bits in 'bitbuf' that are valid.
*
* To make it easier for the compiler to optimize the code by keeping variables
* in registers, these are declared as normal variables and manipulated using
* macros.
*/
/*
* The type for the bitbuffer variable ('bitbuf' described above). For best
* performance, this should have size equal to a machine word.
*
* 64-bit platforms have a significant advantage: they get a bigger bitbuffer
* which they have to fill less often.
*/
typedef machine_word_t bitbuf_t;
/*
* Number of bits the bitbuffer variable can hold.
*
* This is one less than the obvious value because of the optimized arithmetic
* in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
* [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
*/
#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1)
/*
* The maximum number of bits that can be ensured in the bitbuffer variable,
* i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n). The decoder
* only reads whole bytes from memory, so this is the lowest value of 'bitsleft'
* at which another byte cannot be read without first consuming some bits.
*/
#define MAX_ENSURE (BITBUF_NBITS - 7)
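/*
 * For example, with 64-bit machine words BITBUF_NBITS is 63 and MAX_ENSURE
 * is 56; with 32-bit words they are 31 and 24 respectively.
 */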
/*
* Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
* 'n' is too large to be passed to ENSURE_BITS(n). Note: if 'n' is a compile
* time constant, then this expression will be a compile-time constant.
* Therefore, CAN_ENSURE() can be used to choose between alternative
* implementations at compile time.
*/
#define CAN_ENSURE(n) ((n) <= MAX_ENSURE)
/*
* Fill the bitbuffer variable, reading one byte at a time.
*
* If we would overread the input buffer, we just don't read anything, leaving
* the bits zeroed but marking them filled. This simplifies the decompressor
* because it removes the need to distinguish between real overreads and
* overreads that occur only because of the decompressor's own lookahead.
*
* The disadvantage is that real overreads are not detected immediately.
* However, this is safe because the decompressor is still guaranteed to make
* forward progress when presented never-ending 0 bits. Within an existing
* block, output keeps being generated, whereas any new block can only be an
* uncompressed block (since the type code for uncompressed blocks is 0), a
* case for which we check for a previous overread. But even if we didn't
* check, uncompressed blocks would
* fail to validate because LEN would not equal ~NLEN. So the decompressor will
* eventually either detect that the output buffer is full, or detect invalid
* input, or finish the final block.
*/
#define FILL_BITS_BYTEWISE() \
do { \
if (likely(in_next != in_end)) \
bitbuf |= (bitbuf_t)*in_next++ << bitsleft; \
else \
overrun_count++; \
bitsleft += 8; \
} while (bitsleft <= BITBUF_NBITS - 8)
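/*
 * After the loop above, 'bitsleft' lies in [BITBUF_NBITS - 7, BITBUF_NBITS],
 * i.e. [WORDBITS - 8, WORDBITS - 1], the same range that
 * FILL_BITS_WORDWISE() below leaves it in.
 */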
/*
* Fill the bitbuffer variable by reading the next word from the input buffer
* and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
* were filled. This can be significantly faster than FILL_BITS_BYTEWISE().
* However, for this to work correctly, the word must be interpreted in
* little-endian format. In addition, the memory access may be unaligned.
* Therefore, this method is most efficient on little-endian architectures that
* support fast unaligned access, such as x86 and x86_64.
*
* For faster updating of 'bitsleft', we consider the bitbuffer size in bits to
* be 1 less than the word size, so that it is all 1 bits. Then the number of
* bits filled is the value of the 0 bits in position >= 3 when changed to 1.
* E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000
* = 40 bits = 5 bytes. This uses only 4 operations to update 'in_next' and
* 'bitsleft': one each of +, ^, >>, and |. (Not counting operations the
* compiler optimizes out.) In contrast, the alternative of:
*
* in_next += (BITBUF_NBITS - bitsleft) >> 3;
* bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
*
* (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
* average refill an extra bit, but uses 5 operations: two +, and one each of
* -, >>, and &. Also the - and & must be completed before 'bitsleft' can be
* updated, while the current solution updates 'bitsleft' with no dependencies.
*/
#define FILL_BITS_WORDWISE() \
do { \
/* BITBUF_NBITS must be all 1's in binary, see above */ \
STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
\
bitbuf |= get_unaligned_leword(in_next) << bitsleft; \
in_next += (bitsleft ^ BITBUF_NBITS) >> 3; \
bitsleft |= BITBUF_NBITS & ~7; \
} while (0)
/*
* Does the bitbuffer variable currently contain at least 'n' bits?
*/
#define HAVE_BITS(n) (bitsleft >= (n))
/*
* Load more bits from the input buffer until the specified number of bits is
* present in the bitbuffer variable. 'n' cannot be too large; see MAX_ENSURE
* and CAN_ENSURE().
*/
#define ENSURE_BITS(n) \
if (!HAVE_BITS(n)) { \
if (CPU_IS_LITTLE_ENDIAN() && \
UNALIGNED_ACCESS_IS_FAST && \
likely(in_end - in_next >= sizeof(bitbuf_t))) \
FILL_BITS_WORDWISE(); \
else \
FILL_BITS_BYTEWISE(); \
}
/*
* Return the next 'n' bits from the bitbuffer variable without removing them.
*/
#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
/*
* Remove the next 'n' bits from the bitbuffer variable.
*/
#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
/*
* Remove and return the next 'n' bits from the bitbuffer variable.
*/
#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
/*
* Verify that the input buffer hasn't been overread, then align the input to
* the next byte boundary, discarding any remaining bits in the current byte.
*
* Note that if the bitbuffer variable currently contains more than 7 bits, then
* we must rewind 'in_next', effectively putting those bits back. Only the bits
* in what would be the "current" byte if we were reading one byte at a time can
* be actually discarded.
*/
#define ALIGN_INPUT() \
do { \
SAFETY_CHECK(overrun_count <= (bitsleft >> 3)); \
in_next -= (bitsleft >> 3) - overrun_count; \
overrun_count = 0; \
bitbuf = 0; \
bitsleft = 0; \
} while(0)
/*
* Read a 16-bit value from the input. This must have been preceded by a call
* to ALIGN_INPUT(), and the caller must have already checked for overrun.
*/
#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
/*****************************************************************************
* Huffman decoding *
*****************************************************************************/
/*
* A decode table for order TABLEBITS consists of a main table of (1 <<
* TABLEBITS) entries followed by a variable number of subtables.
*
* The decoding algorithm takes the next TABLEBITS bits of compressed data and
* uses them as an index into the decode table. The resulting entry is either a
* "direct entry", meaning that it contains the value desired, or a "subtable
* pointer", meaning that the entry references a subtable that must be indexed
* using more bits of the compressed data to decode the symbol.
*
* Each decode table (a main table along with its subtables, if any) is
* associated with a Huffman code. Logically, the result of a decode table
* lookup is a symbol from the alphabet from which the corresponding Huffman
* code was constructed. A symbol with codeword length n <= TABLEBITS is
* associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
* symbol with codeword length n > TABLEBITS is associated with one or more
* subtable entries.
*
* On top of this basic design, we implement several optimizations:
*
* - We store the length of each codeword directly in each of its decode table
* entries. This allows the codeword length to be produced without indexing
* an additional table.
*
* - When beneficial, we don't store the Huffman symbol itself, but instead data
* generated from it. For example, when decoding an offset symbol in DEFLATE,
* it's more efficient if we can decode the offset base and number of extra
* offset bits directly rather than decoding the offset symbol and then
* looking up both of those values in an additional table or tables.
*
* The size of each decode table entry is 32 bits, which provides slightly
* better performance than 16-bit entries on 32-bit and 64-bit processors, provided
* that the table doesn't get so large that it takes up too much memory and
* starts generating cache misses. The bits of each decode table entry are
* defined as follows:
*
* - Bits 30 -- 31: flags (see below)
* - Bits 8 -- 29: decode result: a Huffman symbol or related data
* - Bits 0 -- 7: codeword length
*/
/*
* This flag is set in all main decode table entries that represent subtable
* pointers.
*/
#define HUFFDEC_SUBTABLE_POINTER 0x80000000
/*
* This flag is set in all entries in the litlen decode table that represent
* literals.
*/
#define HUFFDEC_LITERAL 0x40000000
/* Mask for extracting the codeword length from a decode table entry. */
#define HUFFDEC_LENGTH_MASK 0xFF
/* Shift to extract the decode result from a decode table entry. */
#define HUFFDEC_RESULT_SHIFT 8
/* Shift a decode result into its position in the decode table entry. */
#define HUFFDEC_RESULT_ENTRY(result) ((u32)(result) << HUFFDEC_RESULT_SHIFT)
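/*
 * Example of the resulting layout (an illustrative value, not taken from a
 * real table): a litlen entry for the literal byte 0x41 with a 7-bit codeword
 * would be HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(0x41) | 7 == 0x40004107,
 * with the flag in bits 30-31, the result in bits 8-29, and the codeword
 * length in bits 0-7.
 */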
/* The decode result for each precode symbol. There is no special optimization
* for the precode; the decode result is simply the symbol value. */
static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
#define ENTRY(presym) HUFFDEC_RESULT_ENTRY(presym)
ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
ENTRY(16) , ENTRY(17) , ENTRY(18) ,
#undef ENTRY
};
/* The decode result for each litlen symbol. For literals, this is the literal
* value itself and the HUFFDEC_LITERAL flag. For lengths, this is the length
* base and the number of extra length bits. */
static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
/* Literals */
#define ENTRY(literal) (HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) ,
ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) ,
ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) ,
ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) ,
ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) ,
ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) ,
ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) ,
ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) ,
ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) ,
ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) ,
ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) ,
ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) ,
ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) ,
ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) ,
ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) ,
ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) ,
ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) ,
ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) ,
ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) ,
ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) ,
ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) ,
ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) ,
ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) ,
ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) ,
ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) ,
ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
#undef ENTRY
#define HUFFDEC_EXTRA_LENGTH_BITS_MASK 0xFF
#define HUFFDEC_LENGTH_BASE_SHIFT 8
#define HUFFDEC_END_OF_BLOCK_LENGTH 0
#define ENTRY(length_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \
((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
/* End of block */
ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
/* Lengths */
ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0),
ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0),
ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
#undef ENTRY
};
/* The decode result for each offset symbol. This is the offset base and the
* number of extra offset bits. */
static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
#define ENTRY(offset_base, num_extra_bits) HUFFDEC_RESULT_ENTRY( \
((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \
(offset_base))
ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) ,
ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) ,
ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) ,
ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) ,
ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) ,
ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) ,
ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) ,
ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
#undef ENTRY
};
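/* As a worked example: offset symbol 20 has offset base 1025 and 9 extra
 * bits, so the decoder computes offset = 1025 + POP_BITS(9), covering
 * offsets 1025 through 1536. */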
/*
* Build a table for fast decoding of symbols from a Huffman code. As input,
* this function takes the codeword length of each symbol which may be used in
* the code. As output, it produces a decode table for the canonical Huffman
* code described by the codeword lengths. The decode table is built with the
* assumption that it will be indexed with "bit-reversed" codewords, where the
* low-order bit is the first bit of the codeword. This format is used for all
* Huffman codes in DEFLATE.
*
* @decode_table
* The array in which the decode table will be generated. This array must
* have sufficient length; see the definition of the ENOUGH numbers.
* @lens
* An array which provides, for each symbol, the length of the
* corresponding codeword in bits, or 0 if the symbol is unused. This may
* alias @decode_table, since nothing is written to @decode_table until all
* @lens have been consumed. All codeword lengths are assumed to be <=
* @max_codeword_len but are otherwise considered untrusted. If they do
* not form a valid Huffman code, then the decode table is not built and
* %false is returned.
* @num_syms
* The number of symbols in the code, including all unused symbols.
* @decode_results
* An array which provides, for each symbol, the actual value to store into
* the decode table. This value will be directly produced as the result of
* decoding that symbol, thereby moving the indirection out of the decode
* loop and into the table initialization.
* @table_bits
* The log base-2 of the number of main table entries to use.
* @max_codeword_len
* The maximum allowed codeword length for this Huffman code.
* Must be <= DEFLATE_MAX_CODEWORD_LEN.
* @sorted_syms
* A temporary array of length @num_syms.
*
* Returns %true if successful; %false if the codeword lengths do not form a
* valid Huffman code.
*/
static bool
build_decode_table(u32 decode_table[],
const len_t lens[],
const unsigned num_syms,
const u32 decode_results[],
const unsigned table_bits,
const unsigned max_codeword_len,
u16 *sorted_syms)
{
unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
unsigned sym; /* current symbol */
unsigned codeword; /* current codeword, bit-reversed */
unsigned len; /* current codeword length in bits */
unsigned count; /* num codewords remaining with this length */
u32 codespace_used; /* codespace used out of '2^max_codeword_len' */
unsigned cur_table_end; /* end index of current table */
unsigned subtable_prefix; /* codeword prefix of current subtable */
unsigned subtable_start; /* start index of current subtable */
unsigned subtable_bits; /* log2 of current subtable length */
/* Count how many codewords have each length, including 0. */
for (len = 0; len <= max_codeword_len; len++)
len_counts[len] = 0;
for (sym = 0; sym < num_syms; sym++)
len_counts[lens[sym]]++;
/*
* Sort the symbols primarily by increasing codeword length and
* secondarily by increasing symbol value; or equivalently by their
* codewords in lexicographic order, since a canonical code is assumed.
*
* For efficiency, also compute 'codespace_used' in the same pass over
* 'len_counts[]' used to build 'offsets[]' for sorting.
*/
/* Ensure that 'codespace_used' cannot overflow. */
STATIC_ASSERT(sizeof(codespace_used) == 4);
STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
DEFLATE_MAX_NUM_SYMS);
offsets[0] = 0;
offsets[1] = len_counts[0];
codespace_used = 0;
for (len = 1; len < max_codeword_len; len++) {
offsets[len + 1] = offsets[len] + len_counts[len];
codespace_used = (codespace_used << 1) + len_counts[len];
}
codespace_used = (codespace_used << 1) + len_counts[len];
for (sym = 0; sym < num_syms; sym++)
sorted_syms[offsets[lens[sym]]++] = sym;
sorted_syms += offsets[0]; /* Skip unused symbols */
/* lens[] is done being used, so we can write to decode_table[] now. */
/*
* Check whether the lengths form a complete code (exactly fills the
* codespace), an incomplete code (doesn't fill the codespace), or an
* overfull code (overflows the codespace). A codeword of length 'n'
* uses proportion '1/(2^n)' of the codespace. An overfull code is
* nonsensical, so is considered invalid. An incomplete code is
* considered valid only in two specific cases; see below.
*/
/* overfull code? */
if (unlikely(codespace_used > (1U << max_codeword_len)))
return false;
/* incomplete code? */
if (unlikely(codespace_used < (1U << max_codeword_len))) {
u32 entry;
unsigned i;
if (codespace_used == 0) {
/*
* An empty code is allowed. This can happen for the
* offset code in DEFLATE, since a dynamic Huffman block
* need not contain any matches.
*/
/* sym=0, len=1 (arbitrary) */
entry = decode_results[0] | 1;
} else {
/*
* Allow codes with a single used symbol, with codeword
* length 1. The DEFLATE RFC is unclear regarding this
* case. What zlib's decompressor does is permit this
* for the litlen and offset codes and assume the
* codeword is '0' rather than '1'. We do the same
* except we allow this for precodes too, since there's
* no convincing reason to treat the codes differently.
* We also assign both codewords '0' and '1' to the
* symbol to avoid having to handle '1' specially.
*/
if (codespace_used != (1U << (max_codeword_len - 1)) ||
len_counts[1] != 1)
return false;
entry = decode_results[*sorted_syms] | 1;
}
/*
* Note: the decode table still must be fully initialized, in
* case the stream is malformed and contains bits from the part
* of the codespace the incomplete code doesn't use.
*/
for (i = 0; i < (1U << table_bits); i++)
decode_table[i] = entry;
return true;
}
/*
* The lengths form a complete code. Now, enumerate the codewords in
* lexicographic order and fill the decode table entries for each one.
*
* First, process all codewords with len <= table_bits. Each one gets
* '2^(table_bits-len)' direct entries in the table.
*
* Since DEFLATE uses bit-reversed codewords, these entries aren't
* consecutive but rather are spaced '2^len' entries apart. This makes
* filling them naively somewhat awkward and inefficient, since strided
* stores are less cache-friendly and preclude the use of word or
* vector-at-a-time stores to fill multiple entries per instruction.
*
* To optimize this, we incrementally double the table size. When
* processing codewords with length 'len', the table is treated as
* having only '2^len' entries, so each codeword uses just one entry.
* Then, each time 'len' is incremented, the table size is doubled and
* the first half is copied to the second half. This significantly
* improves performance over naively doing strided stores.
*
* Note that some entries copied for each table doubling may not have
* been initialized yet, but it doesn't matter since they're guaranteed
* to be initialized later (because the Huffman code is complete).
*/
codeword = 0;
len = 1;
while ((count = len_counts[len]) == 0)
len++;
cur_table_end = 1U << len;
while (len <= table_bits) {
/* Process all 'count' codewords with length 'len' bits. */
do {
unsigned bit;
/* Fill the first entry for the current codeword. */
decode_table[codeword] =
decode_results[*sorted_syms++] | len;
if (codeword == cur_table_end - 1) {
/* Last codeword (all 1's) */
for (; len < table_bits; len++) {
memcpy(&decode_table[cur_table_end],
decode_table,
cur_table_end *
sizeof(decode_table[0]));
cur_table_end <<= 1;
}
return true;
}
/*
* To advance to the lexicographically next codeword in
* the canonical code, the codeword must be incremented,
* then 0's must be appended to the codeword as needed
* to match the next codeword's length.
*
* Since the codeword is bit-reversed, appending 0's is
* a no-op. However, incrementing it is nontrivial. To
* do so efficiently, use the 'bsr' instruction to find
* the last (highest order) 0 bit in the codeword, set
* it, and clear any later (higher order) 1 bits. But
* 'bsr' actually finds the highest order 1 bit, so to
* use it first flip all bits in the codeword by XOR'ing
* it with (1U << len) - 1 == cur_table_end - 1.
*/
bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
codeword &= bit - 1;
codeword |= bit;
} while (--count);
/* Advance to the next codeword length. */
do {
if (++len <= table_bits) {
memcpy(&decode_table[cur_table_end],
decode_table,
cur_table_end * sizeof(decode_table[0]));
cur_table_end <<= 1;
}
} while ((count = len_counts[len]) == 0);
}
/* Process codewords with len > table_bits. These require subtables. */
cur_table_end = 1U << table_bits;
subtable_prefix = -1;
subtable_start = 0;
for (;;) {
u32 entry;
unsigned i;
unsigned stride;
unsigned bit;
/*
* Start a new subtable if the first 'table_bits' bits of the
* codeword don't match the prefix of the current subtable.
*/
if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
subtable_prefix = (codeword & ((1U << table_bits) - 1));
subtable_start = cur_table_end;
/*
* Calculate the subtable length. If the codeword has
* length 'table_bits + n', then the subtable needs
* '2^n' entries. But it may need more; if fewer than
* '2^n' codewords of length 'table_bits + n' remain,
* then the length will need to be incremented to bring
* in longer codewords until the subtable can be
* completely filled. Note that because the Huffman
* code is complete, it will always be possible to fill
* the subtable eventually.
*/
subtable_bits = len - table_bits;
codespace_used = count;
while (codespace_used < (1U << subtable_bits)) {
subtable_bits++;
codespace_used = (codespace_used << 1) +
len_counts[table_bits + subtable_bits];
}
cur_table_end = subtable_start + (1U << subtable_bits);
/*
* Create the entry that points from the main table to
* the subtable. This entry contains the index of the
* start of the subtable and the number of bits with
* which the subtable is indexed (the log base 2 of the
* number of entries it contains).
*/
decode_table[subtable_prefix] =
HUFFDEC_SUBTABLE_POINTER |
HUFFDEC_RESULT_ENTRY(subtable_start) |
subtable_bits;
}
/* Fill the subtable entries for the current codeword. */
entry = decode_results[*sorted_syms++] | (len - table_bits);
i = subtable_start + (codeword >> table_bits);
stride = 1U << (len - table_bits);
do {
decode_table[i] = entry;
i += stride;
} while (i < cur_table_end);
/* Advance to the next codeword. */
if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
return true;
bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
codeword &= bit - 1;
codeword |= bit;
count--;
while (count == 0)
count = len_counts[++len];
}
}
/* Build the decode table for the precode. */
static bool
build_precode_decode_table(struct libdeflate_decompressor *d)
{
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
return build_decode_table(d->u.l.precode_decode_table,
d->u.precode_lens,
DEFLATE_NUM_PRECODE_SYMS,
precode_decode_results,
PRECODE_TABLEBITS,
DEFLATE_MAX_PRE_CODEWORD_LEN,
d->sorted_syms);
}
/* Build the decode table for the literal/length code. */
static bool
build_litlen_decode_table(struct libdeflate_decompressor *d,
unsigned num_litlen_syms, unsigned num_offset_syms)
{
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
return build_decode_table(d->u.litlen_decode_table,
d->u.l.lens,
num_litlen_syms,
litlen_decode_results,
LITLEN_TABLEBITS,
DEFLATE_MAX_LITLEN_CODEWORD_LEN,
d->sorted_syms);
}
/* Build the decode table for the offset code. */
static bool
build_offset_decode_table(struct libdeflate_decompressor *d,
unsigned num_litlen_syms, unsigned num_offset_syms)
{
/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
return build_decode_table(d->offset_decode_table,
d->u.l.lens + num_litlen_syms,
num_offset_syms,
offset_decode_results,
OFFSET_TABLEBITS,
DEFLATE_MAX_OFFSET_CODEWORD_LEN,
d->sorted_syms);
}
static forceinline machine_word_t
repeat_byte(u8 b)
{
machine_word_t v;
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
v = b;
v |= v << 8;
v |= v << 16;
v |= v << ((WORDBITS == 64) ? 32 : 0);
return v;
}
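/* For example, repeat_byte(0xAB) yields 0xABABABAB on a 32-bit machine word
 * and 0xABABABABABABABAB on a 64-bit one. */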
static forceinline void
copy_word_unaligned(const void *src, void *dst)
{
store_word_unaligned(load_word_unaligned(src), dst);
}
/*****************************************************************************
* Main decompression routine
*****************************************************************************/
typedef enum libdeflate_result (*decompress_func_t)
(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
#undef DEFAULT_IMPL
#undef DISPATCH
#if defined(__i386__) || defined(__x86_64__)
# include "x86/decompress_impl.h"
#endif
#ifndef DEFAULT_IMPL
# define FUNCNAME deflate_decompress_default
# define ATTRIBUTES
# include "decompress_template.h"
# define DEFAULT_IMPL deflate_decompress_default
#endif
#ifdef DISPATCH
static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
static volatile decompress_func_t decompress_impl = dispatch;
/* Choose the fastest implementation at runtime */
static enum libdeflate_result
dispatch(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
{
decompress_func_t f = arch_select_decompress_func();
if (f == NULL)
f = DEFAULT_IMPL;
decompress_impl = f;
return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
actual_in_nbytes_ret, actual_out_nbytes_ret);
}
#else
# define decompress_impl DEFAULT_IMPL /* only one implementation, use it */
#endif
/*
* This is the main DEFLATE decompression routine. See libdeflate.h for the
* documentation.
*
* Note that the real code is in decompress_template.h. The part here just
* handles calling the appropriate implementation depending on the CPU features
* at runtime.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret)
{
return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
actual_in_nbytes_ret, actual_out_nbytes_ret);
}
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
const void * restrict in, size_t in_nbytes,
void * restrict out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
{
return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
out, out_nbytes_avail,
NULL, actual_out_nbytes_ret);
}
LIBDEFLATEAPI struct libdeflate_decompressor *
libdeflate_alloc_decompressor(void)
{
/*
* Note that only certain parts of the decompressor actually must be
* initialized here:
*
* - 'static_codes_loaded' must be initialized to false.
*
* - The first half of the main portion of each decode table must be
* initialized to any value, to avoid reading from uninitialized
* memory during table expansion in build_decode_table(). (Although,
* this is really just to avoid warnings with dynamic tools like
* valgrind, since build_decode_table() is guaranteed to initialize
* all entries eventually anyway.)
*
* But for simplicity, we currently just zero the whole decompressor.
*/
return calloc(1, sizeof(struct libdeflate_decompressor));
}
LIBDEFLATEAPI void
libdeflate_free_decompressor(struct libdeflate_decompressor *d)
{
free(d);
}
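/*
 * Minimal usage sketch (illustrative only, not part of the library): a
 * whole-buffer decompression using just the public functions defined above.
 * The caller-supplied 'in', 'in_size', 'out', and 'out_capacity' are assumed
 * to hold the compressed input and a sufficiently large output buffer.
 *
 *	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
 *	size_t out_size;
 *
 *	if (d != NULL &&
 *	    libdeflate_deflate_decompress(d, in, in_size, out, out_capacity,
 *					  &out_size) == LIBDEFLATE_SUCCESS) {
 *		// 'out_size' bytes of decompressed data are now in 'out'
 *	}
 *	libdeflate_free_decompressor(d);
 */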


@ -1,95 +0,0 @@
/*
* gzip_compress.c - compress with a gzip wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "deflate_compress.h"
#include "gzip_constants.h"
#include "unaligned.h"
#include "libdeflate.h"
LIBDEFLATEAPI size_t
libdeflate_gzip_compress(struct libdeflate_compressor *c,
const void *in, size_t in_size,
void *out, size_t out_nbytes_avail)
{
u8 *out_next = out;
unsigned compression_level;
u8 xfl;
size_t deflate_size;
if (out_nbytes_avail <= GZIP_MIN_OVERHEAD)
return 0;
/* ID1 */
*out_next++ = GZIP_ID1;
/* ID2 */
*out_next++ = GZIP_ID2;
/* CM */
*out_next++ = GZIP_CM_DEFLATE;
/* FLG */
*out_next++ = 0;
/* MTIME */
put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next);
out_next += 4;
/* XFL */
xfl = 0;
compression_level = deflate_get_compression_level(c);
if (compression_level < 2)
xfl |= GZIP_XFL_FASTEST_COMRESSION;
else if (compression_level >= 8)
xfl |= GZIP_XFL_SLOWEST_COMRESSION;
*out_next++ = xfl;
/* OS */
*out_next++ = GZIP_OS_UNKNOWN;
/* Compressed data */
deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next,
out_nbytes_avail - GZIP_MIN_OVERHEAD);
if (deflate_size == 0)
return 0;
out_next += deflate_size;
/* CRC32 */
put_unaligned_le32(libdeflate_crc32(0, in, in_size), out_next);
out_next += 4;
/* ISIZE */
put_unaligned_le32((u32)in_size, out_next);
out_next += 4;
return out_next - (u8 *)out;
}
LIBDEFLATEAPI size_t
libdeflate_gzip_compress_bound(struct libdeflate_compressor *c,
size_t in_nbytes)
{
return GZIP_MIN_OVERHEAD +
libdeflate_deflate_compress_bound(c, in_nbytes);
}


@ -1,45 +0,0 @@
/*
* gzip_constants.h - constants for the gzip wrapper format
*/
#ifndef LIB_GZIP_CONSTANTS_H
#define LIB_GZIP_CONSTANTS_H
#define GZIP_MIN_HEADER_SIZE 10
#define GZIP_FOOTER_SIZE 8
#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
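/* As written by libdeflate_gzip_compress() in this library, the minimal
 * header is ID1 ID2 CM FLG MTIME[4] XFL OS (10 bytes) and the footer is
 * CRC32 followed by ISIZE, both little-endian 32-bit values (8 bytes). */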
#define GZIP_ID1 0x1F
#define GZIP_ID2 0x8B
#define GZIP_CM_DEFLATE 8
#define GZIP_FTEXT 0x01
#define GZIP_FHCRC 0x02
#define GZIP_FEXTRA 0x04
#define GZIP_FNAME 0x08
#define GZIP_FCOMMENT 0x10
#define GZIP_FRESERVED 0xE0
#define GZIP_MTIME_UNAVAILABLE 0
#define GZIP_XFL_SLOWEST_COMRESSION 0x02
#define GZIP_XFL_FASTEST_COMRESSION 0x04
#define GZIP_OS_FAT 0
#define GZIP_OS_AMIGA 1
#define GZIP_OS_VMS 2
#define GZIP_OS_UNIX 3
#define GZIP_OS_VM_CMS 4
#define GZIP_OS_ATARI_TOS 5
#define GZIP_OS_HPFS 6
#define GZIP_OS_MACINTOSH 7
#define GZIP_OS_Z_SYSTEM 8
#define GZIP_OS_CP_M 9
#define GZIP_OS_TOPS_20 10
#define GZIP_OS_NTFS 11
#define GZIP_OS_QDOS 12
#define GZIP_OS_RISCOS 13
#define GZIP_OS_UNKNOWN 255
#endif /* LIB_GZIP_CONSTANTS_H */


@ -1,148 +0,0 @@
/*
* gzip_decompress.c - decompress with a gzip wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "gzip_constants.h"
#include "unaligned.h"
#include "libdeflate.h"
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret)
{
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
u8 flg;
size_t actual_in_nbytes;
size_t actual_out_nbytes;
enum libdeflate_result result;
if (in_nbytes < GZIP_MIN_OVERHEAD)
return LIBDEFLATE_BAD_DATA;
/* ID1 */
if (*in_next++ != GZIP_ID1)
return LIBDEFLATE_BAD_DATA;
/* ID2 */
if (*in_next++ != GZIP_ID2)
return LIBDEFLATE_BAD_DATA;
/* CM */
if (*in_next++ != GZIP_CM_DEFLATE)
return LIBDEFLATE_BAD_DATA;
flg = *in_next++;
/* MTIME */
in_next += 4;
/* XFL */
in_next += 1;
/* OS */
in_next += 1;
if (flg & GZIP_FRESERVED)
return LIBDEFLATE_BAD_DATA;
/* Extra field */
if (flg & GZIP_FEXTRA) {
u16 xlen = get_unaligned_le16(in_next);
in_next += 2;
if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE)
return LIBDEFLATE_BAD_DATA;
in_next += xlen;
}
/* Original file name (zero terminated) */
if (flg & GZIP_FNAME) {
while (*in_next++ != 0 && in_next != in_end)
;
if (in_end - in_next < GZIP_FOOTER_SIZE)
return LIBDEFLATE_BAD_DATA;
}
/* File comment (zero terminated) */
if (flg & GZIP_FCOMMENT) {
while (*in_next++ != 0 && in_next != in_end)
;
if (in_end - in_next < GZIP_FOOTER_SIZE)
return LIBDEFLATE_BAD_DATA;
}
/* CRC16 for gzip header */
if (flg & GZIP_FHCRC) {
in_next += 2;
if (in_end - in_next < GZIP_FOOTER_SIZE)
return LIBDEFLATE_BAD_DATA;
}
/* Compressed data */
result = libdeflate_deflate_decompress_ex(d, in_next,
in_end - GZIP_FOOTER_SIZE - in_next,
out, out_nbytes_avail,
&actual_in_nbytes,
actual_out_nbytes_ret);
if (result != LIBDEFLATE_SUCCESS)
return result;
if (actual_out_nbytes_ret)
actual_out_nbytes = *actual_out_nbytes_ret;
else
actual_out_nbytes = out_nbytes_avail;
in_next += actual_in_nbytes;
/* CRC32 */
if (libdeflate_crc32(0, out, actual_out_nbytes) !=
get_unaligned_le32(in_next))
return LIBDEFLATE_BAD_DATA;
in_next += 4;
/* ISIZE */
if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
return LIBDEFLATE_BAD_DATA;
in_next += 4;
if (actual_in_nbytes_ret)
*actual_in_nbytes_ret = in_next - (u8 *)in;
return LIBDEFLATE_SUCCESS;
}
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
{
return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
out, out_nbytes_avail,
NULL, actual_out_nbytes_ret);
}


@ -1,403 +0,0 @@
/*
* hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*
* ---------------------------------------------------------------------------
*
* Algorithm
*
* This is a Hash Chains (hc) based matchfinder.
*
* The main data structure is a hash table where each hash bucket contains a
* linked list (or "chain") of sequences whose first 4 bytes share the same hash
* code. Each sequence is identified by its starting position in the input
* buffer.
*
* The algorithm processes the input buffer sequentially. At each byte
* position, the hash code of the first 4 bytes of the sequence beginning at
* that position (the sequence being matched against) is computed. This
* identifies the hash bucket to use for that position. Then, this hash
* bucket's linked list is searched for matches. Then, a new linked list node
* is created to represent the current sequence and is prepended to the list.
*
* This algorithm has several useful properties:
*
* - It only finds true Lempel-Ziv matches; i.e., those where the matching
* sequence occurs prior to the sequence being matched against.
*
* - The sequences in each linked list are always sorted by decreasing starting
* position. Therefore, the closest (smallest offset) matches are found
* first, which in many compression formats tend to be the cheapest to encode.
*
* - Although fast running time is not guaranteed due to the possibility of the
* lists getting very long, the worst degenerate behavior can be easily
* prevented by capping the number of nodes searched at each position.
*
* - If the compressor decides not to search for matches at a certain position,
* then that position can be quickly inserted without searching the list.
*
* - The algorithm is adaptable to sliding windows: just store the positions
* relative to a "base" value that is updated from time to time, and stop
* searching each list when the sequences get too far away.
*
* ----------------------------------------------------------------------------
*
* Optimizations
*
* The main hash table and chains handle length 4+ matches. Length 3 matches
* are handled by a separate hash table with no chains. This works well for
* typical "greedy" or "lazy"-style compressors, where length 3 matches are
* often only helpful if they have small offsets. Instead of searching a full
* chain for length 3+ matches, the algorithm just checks for one close length 3
* match, then focuses on finding length 4+ matches.
*
* The longest_match() and skip_positions() functions are inlined into the
* compressors that use them. This isn't just about saving the overhead of a
* function call. These functions are intended to be called from the inner
* loops of compressors, where giving the compiler more control over register
* allocation is very helpful. There is also significant benefit to be gained
* from allowing the CPU to predict branches independently at each call site.
* For example, "lazy"-style compressors can be written with two calls to
* longest_match(), each of which starts with a different 'best_len' and
* therefore has significantly different performance characteristics.
*
* Although any hash function can be used, a multiplicative hash is fast and
* works well.
*
* On some processors, it is significantly faster to extend matches by whole
* words (32 or 64 bits) instead of by individual bytes. For this to be the
* case, the processor must implement unaligned memory accesses efficiently and
* must have either a fast "find first set bit" instruction or a fast "find last
* set bit" instruction, depending on the processor's endianness.
*
* The code uses one loop for finding the first match and one loop for finding a
* longer match. Each of these loops is tuned for its respective task, and
* together they are faster than a single generalized loop that handles both
* tasks.
*
* The code also uses a tight inner loop that only compares the last and first
* bytes of a potential match. It is only when these bytes match that a full
* match extension is attempted.
*
* ----------------------------------------------------------------------------
*/
#include "matchfinder_common.h"
#define HC_MATCHFINDER_HASH3_ORDER 15
#define HC_MATCHFINDER_HASH4_ORDER 16
#define HC_MATCHFINDER_TOTAL_HASH_LENGTH \
((1UL << HC_MATCHFINDER_HASH3_ORDER) + \
(1UL << HC_MATCHFINDER_HASH4_ORDER))
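/* With the orders above, that is (1 << 15) + (1 << 16) = 98304 hash table
 * entries in total, in addition to the MATCHFINDER_WINDOW_SIZE entries of
 * next_tab[] in the struct below. */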
struct hc_matchfinder {
/* The hash table for finding length 3 matches */
mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
/* The hash table which contains the first nodes of the linked lists for
* finding length 4+ matches */
mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
/* The "next node" references for the linked lists. The "next node" of
* the node for the sequence with position 'pos' is 'next_tab[pos]'. */
mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];
}
#ifdef _aligned_attribute
_aligned_attribute(MATCHFINDER_ALIGNMENT)
#endif
;
/* Prepare the matchfinder for a new input buffer. */
static forceinline void
hc_matchfinder_init(struct hc_matchfinder *mf)
{
matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_LENGTH);
}
static forceinline void
hc_matchfinder_slide_window(struct hc_matchfinder *mf)
{
matchfinder_rebase((mf_pos_t *)mf,
sizeof(struct hc_matchfinder) / sizeof(mf_pos_t));
}
/*
* Find the longest match longer than 'best_len' bytes.
*
* @mf
* The matchfinder structure.
* @in_base_p
* Location of a pointer which points to the place in the input data the
* matchfinder currently stores positions relative to. This may be updated
* by this function.
* @in_next
* Pointer to the next byte in the input buffer to process. This is the
* start of the sequence being matched against.
* @best_len
* Require a match longer than this length.
* @max_len
* The maximum permissible match length at this position.
* @nice_len
* Stop searching if a match of at least this length is found.
* Must be <= @max_len.
* @max_search_depth
* Limit on the number of potential matches to consider. Must be >= 1.
* @next_hashes
* The precomputed hash codes for the sequence beginning at @in_next.
* These will be used and then updated with the precomputed hashcodes for
* the sequence beginning at @in_next + 1.
* @offset_ret
* If a match is found, its offset is returned in this location.
*
* Return the length of the match found, or 'best_len' if no match longer than
* 'best_len' was found.
*/
static forceinline u32
hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
const u8 ** const restrict in_base_p,
const u8 * const restrict in_next,
u32 best_len,
const u32 max_len,
const u32 nice_len,
const u32 max_search_depth,
u32 * const restrict next_hashes,
u32 * const restrict offset_ret)
{
u32 depth_remaining = max_search_depth;
const u8 *best_matchptr = in_next;
mf_pos_t cur_node3, cur_node4;
u32 hash3, hash4;
u32 next_hashseq;
u32 seq4;
const u8 *matchptr;
u32 len;
u32 cur_pos = in_next - *in_base_p;
const u8 *in_base;
mf_pos_t cutoff;
if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
hc_matchfinder_slide_window(mf);
*in_base_p += MATCHFINDER_WINDOW_SIZE;
cur_pos = 0;
}
in_base = *in_base_p;
cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
goto out;
/* Get the precomputed hash codes. */
hash3 = next_hashes[0];
hash4 = next_hashes[1];
/* From the hash buckets, get the first node of each linked list. */
cur_node3 = mf->hash3_tab[hash3];
cur_node4 = mf->hash4_tab[hash4];
/* Update for length 3 matches. This replaces the singleton node in the
* 'hash3' bucket with the node for the current sequence. */
mf->hash3_tab[hash3] = cur_pos;
/* Update for length 4 matches. This prepends the node for the current
* sequence to the linked list in the 'hash4' bucket. */
mf->hash4_tab[hash4] = cur_pos;
mf->next_tab[cur_pos] = cur_node4;
/* Compute the next hash codes. */
next_hashseq = get_unaligned_le32(in_next + 1);
next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
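/* (hash3 is computed from only the first 3 bytes of the sequence, hence the
 * & 0xFFFFFF mask on the little-endian load; hash4 uses all 4 bytes.) */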
prefetchw(&mf->hash3_tab[next_hashes[0]]);
prefetchw(&mf->hash4_tab[next_hashes[1]]);
if (best_len < 4) { /* No match of length >= 4 found yet? */
/* Check for a length 3 match if needed. */
if (cur_node3 <= cutoff)
goto out;
seq4 = load_u32_unaligned(in_next);
if (best_len < 3) {
matchptr = &in_base[cur_node3];
if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
best_len = 3;
best_matchptr = matchptr;
}
}
/* Check for a length 4 match. */
if (cur_node4 <= cutoff)
goto out;
for (;;) {
/* No length 4 match found yet. Check the first 4 bytes. */
matchptr = &in_base[cur_node4];
if (load_u32_unaligned(matchptr) == seq4)
break;
/* The first 4 bytes did not match. Keep trying. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
/* Found a match of length >= 4. Extend it to its full length. */
best_matchptr = matchptr;
best_len = lz_extend(in_next, best_matchptr, 4, max_len);
if (best_len >= nice_len)
goto out;
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
} else {
if (cur_node4 <= cutoff || best_len >= nice_len)
goto out;
}
/* Check for matches of length >= 5. */
for (;;) {
for (;;) {
matchptr = &in_base[cur_node4];
/* Already found a length 4 match. Try for a longer
* match; start by checking either the last 4 bytes and
* the first 4 bytes, or the last byte. (The last byte,
* the one which would extend the match length by 1, is
* the most important.) */
#if UNALIGNED_ACCESS_IS_FAST
if ((load_u32_unaligned(matchptr + best_len - 3) ==
load_u32_unaligned(in_next + best_len - 3)) &&
(load_u32_unaligned(matchptr) ==
load_u32_unaligned(in_next)))
#else
if (matchptr[best_len] == in_next[best_len])
#endif
break;
/* Continue to the next node in the list. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
#if UNALIGNED_ACCESS_IS_FAST
len = 4;
#else
len = 0;
#endif
len = lz_extend(in_next, matchptr, len, max_len);
if (len > best_len) {
/* This is the new longest match. */
best_len = len;
best_matchptr = matchptr;
if (best_len >= nice_len)
goto out;
}
/* Continue to the next node in the list. */
cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
if (cur_node4 <= cutoff || !--depth_remaining)
goto out;
}
out:
*offset_ret = in_next - best_matchptr;
return best_len;
}
/*
* Advance the matchfinder, but don't search for matches.
*
* @mf
* The matchfinder structure.
* @in_base_p
* Location of a pointer which points to the place in the input data the
* matchfinder currently stores positions relative to. This may be updated
* by this function.
* @in_next
* Pointer to the next byte in the input buffer to process.
* @in_end
* Pointer to the end of the input buffer.
* @next_hashes
* The precomputed hash codes for the sequence beginning at @in_next.
* These will be used and then updated with the precomputed hashcodes for
* the sequence beginning at @in_next + @count.
* @count
* The number of bytes to advance. Must be > 0.
*
* Returns @in_next + @count.
*/
static forceinline const u8 *
hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
const u8 ** const restrict in_base_p,
const u8 *in_next,
const u8 * const in_end,
const u32 count,
u32 * const restrict next_hashes)
{
u32 cur_pos;
u32 hash3, hash4;
u32 next_hashseq;
u32 remaining = count;
if (unlikely(count + 5 > in_end - in_next))
return &in_next[count];
cur_pos = in_next - *in_base_p;
hash3 = next_hashes[0];
hash4 = next_hashes[1];
do {
if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
hc_matchfinder_slide_window(mf);
*in_base_p += MATCHFINDER_WINDOW_SIZE;
cur_pos = 0;
}
mf->hash3_tab[hash3] = cur_pos;
mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
mf->hash4_tab[hash4] = cur_pos;
next_hashseq = get_unaligned_le32(++in_next);
hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
cur_pos++;
} while (--remaining);
prefetchw(&mf->hash3_tab[hash3]);
prefetchw(&mf->hash4_tab[hash4]);
next_hashes[0] = hash3;
next_hashes[1] = hash4;
return in_next;
}

View file

@ -1,35 +0,0 @@
/*
* lib_common.h - internal header included by all library code
*/
#ifndef LIB_LIB_COMMON_H
#define LIB_LIB_COMMON_H
#ifdef LIBDEFLATE_H
# error "lib_common.h must always be included before libdeflate.h"
/* because BUILDING_LIBDEFLATE must be set first */
#endif
#define BUILDING_LIBDEFLATE
#include "common_defs.h"
/*
* Prefix with "_libdeflate_" all global symbols which are not part of the API.
* This avoids exposing overly generic names when libdeflate is built as a
* static library.
*
* Note that the chosen prefix is not really important and can be changed
* without breaking library users. It was just chosen so that the resulting
* symbol names are unlikely to conflict with those from any other software.
* Also note that this fixup has no useful effect when libdeflate is built as a
* shared library, since these symbols are not exported.
*/
#define SYM_FIXUP(sym) _libdeflate_##sym
#define aligned_malloc SYM_FIXUP(aligned_malloc)
#define aligned_free SYM_FIXUP(aligned_free)
#define deflate_get_compression_level SYM_FIXUP(deflate_get_compression_level)
#define _cpu_features SYM_FIXUP(_cpu_features)
#define setup_cpu_features SYM_FIXUP(setup_cpu_features)
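/* (So, for example, a reference to aligned_malloc inside the library actually
 * resolves to the symbol _libdeflate_aligned_malloc.) */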
#endif /* LIB_LIB_COMMON_H */

View file

@ -1,168 +0,0 @@
/*
* matchfinder_common.h - common code for Lempel-Ziv matchfinding
*/
#ifndef LIB_MATCHFINDER_COMMON_H
#define LIB_MATCHFINDER_COMMON_H
#include "lib_common.h"
#include "unaligned.h"
#ifndef MATCHFINDER_WINDOW_ORDER
# error "MATCHFINDER_WINDOW_ORDER must be defined!"
#endif
#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
typedef s16 mf_pos_t;
#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
#define MATCHFINDER_ALIGNMENT 8
#define arch_matchfinder_init(data, size) false
#define arch_matchfinder_rebase(data, size) false
#ifdef _aligned_attribute
# if defined(__arm__) || defined(__aarch64__)
# include "arm/matchfinder_impl.h"
# elif defined(__i386__) || defined(__x86_64__)
# include "x86/matchfinder_impl.h"
# endif
#endif
/*
* Initialize the hash table portion of the matchfinder.
*
* Essentially, this is an optimized memset().
*
* 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
*/
static forceinline void
matchfinder_init(mf_pos_t *data, size_t num_entries)
{
size_t i;
if (arch_matchfinder_init(data, num_entries * sizeof(data[0])))
return;
for (i = 0; i < num_entries; i++)
data[i] = MATCHFINDER_INITVAL;
}
/*
* Slide the matchfinder by WINDOW_SIZE bytes.
*
* This must be called just after each WINDOW_SIZE bytes have been run through
* the matchfinder.
*
* This will subtract WINDOW_SIZE bytes from each entry in the array specified.
* The effect is that all entries are updated to be relative to the current
* position, rather than the position WINDOW_SIZE bytes prior.
*
* Underflow is detected and replaced with signed saturation. This ensures that
* once the sliding window has passed over a position, that position forever
* remains out of bounds.
*
* The array passed in must contain all matchfinder data that is
* position-relative. Concretely, this will include the hash table as well as
* the table of positions that is used to link together the sequences in each
* hash bucket. Note that in the latter table, the links are 1-ary in the case
* of "hash chains", and 2-ary in the case of "binary trees". In either case,
* the links need to be rebased in the same way.
*/
static forceinline void
matchfinder_rebase(mf_pos_t *data, size_t num_entries)
{
size_t i;
if (arch_matchfinder_rebase(data, num_entries * sizeof(data[0])))
return;
if (MATCHFINDER_WINDOW_SIZE == 32768) {
/* Branchless version for 32768 byte windows. If the value was
* already negative, clear all bits except the sign bit; this
* changes the value to -32768. Otherwise, set the sign bit;
* this is equivalent to subtracting 32768. */
for (i = 0; i < num_entries; i++) {
u16 v = data[i];
u16 sign_bit = v & 0x8000;
v &= sign_bit - ((sign_bit >> 15) ^ 1);
v |= 0x8000;
data[i] = v;
}
return;
}
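/* Worked example of the branchless update above (illustrative): v = 100
 * (0x0064) has sign_bit = 0, so the AND with 0xFFFF leaves 100 and the OR
 * with 0x8000 yields 0x8064 = -32668 = 100 - 32768. v = -5 (0xFFFB) has
 * sign_bit = 0x8000, so the AND leaves 0x8000 and the OR keeps it at
 * 0x8000 = -32768, the saturated "out of window" value. */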
for (i = 0; i < num_entries; i++) {
if (data[i] >= 0)
data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
else
data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
}
}
/*
* The hash function: given a sequence prefix held in the low-order bits of a
* 32-bit value, multiply by a carefully-chosen large constant. Discard any
* bits of the product that don't fit in a 32-bit value, but take the
* next-highest @num_bits bits of the product as the hash value, as those have
* the most randomness.
*/
static forceinline u32
lz_hash(u32 seq, unsigned num_bits)
{
return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
}
/*
* Return the number of bytes at @matchptr that match the bytes at @strptr, up
* to a maximum of @max_len. Initially, @start_len bytes are matched.
*/
static forceinline unsigned
lz_extend(const u8 * const strptr, const u8 * const matchptr,
const unsigned start_len, const unsigned max_len)
{
unsigned len = start_len;
machine_word_t v_word;
if (UNALIGNED_ACCESS_IS_FAST) {
if (likely(max_len - len >= 4 * WORDBYTES)) {
#define COMPARE_WORD_STEP \
v_word = load_word_unaligned(&matchptr[len]) ^ \
load_word_unaligned(&strptr[len]); \
if (v_word != 0) \
goto word_differs; \
len += WORDBYTES; \
COMPARE_WORD_STEP
COMPARE_WORD_STEP
COMPARE_WORD_STEP
COMPARE_WORD_STEP
#undef COMPARE_WORD_STEP
}
while (len + WORDBYTES <= max_len) {
v_word = load_word_unaligned(&matchptr[len]) ^
load_word_unaligned(&strptr[len]);
if (v_word != 0)
goto word_differs;
len += WORDBYTES;
}
}
while (len < max_len && matchptr[len] == strptr[len])
len++;
return len;
word_differs:
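/* (E.g. on a little-endian machine, if the first mismatching byte is byte 2
 * of the word, bytes 0 and 1 cancel in the XOR, the lowest set bit of v_word
 * falls in bits 16..23, and bsfw(v_word) >> 3 adds the 2 extra matching
 * bytes.) */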
if (CPU_IS_LITTLE_ENDIAN())
len += (bsfw(v_word) >> 3);
else
len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
return len;
}
#endif /* LIB_MATCHFINDER_COMMON_H */

View file

@ -1,202 +0,0 @@
/*
* unaligned.h - inline functions for unaligned memory accesses
*/
#ifndef LIB_UNALIGNED_H
#define LIB_UNALIGNED_H
#include "lib_common.h"
/*
* Naming note:
*
* {load,store}_*_unaligned() deal with raw bytes without endianness conversion.
* {get,put}_unaligned_*() deal with a specific endianness.
*/
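/* (For example, given the bytes 01 02 03 04, get_unaligned_le32() returns
 * 0x04030201 on any host, whereas load_u32_unaligned() returns whatever
 * those bytes mean in the host's native byte order.) */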
DEFINE_UNALIGNED_TYPE(u16)
DEFINE_UNALIGNED_TYPE(u32)
DEFINE_UNALIGNED_TYPE(u64)
DEFINE_UNALIGNED_TYPE(machine_word_t)
#define load_word_unaligned load_machine_word_t_unaligned
#define store_word_unaligned store_machine_word_t_unaligned
/***** Unaligned loads *****/
static forceinline u16
get_unaligned_le16(const u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST)
return le16_bswap(load_u16_unaligned(p));
else
return ((u16)p[1] << 8) | p[0];
}
static forceinline u16
get_unaligned_be16(const u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST)
return be16_bswap(load_u16_unaligned(p));
else
return ((u16)p[0] << 8) | p[1];
}
static forceinline u32
get_unaligned_le32(const u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST)
return le32_bswap(load_u32_unaligned(p));
else
return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
((u32)p[1] << 8) | p[0];
}
static forceinline u32
get_unaligned_be32(const u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST)
return be32_bswap(load_u32_unaligned(p));
else
return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
((u32)p[2] << 8) | p[3];
}
static forceinline u64
get_unaligned_le64(const u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST)
return le64_bswap(load_u64_unaligned(p));
else
return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
((u64)p[5] << 40) | ((u64)p[4] << 32) |
((u64)p[3] << 24) | ((u64)p[2] << 16) |
((u64)p[1] << 8) | p[0];
}
static forceinline machine_word_t
get_unaligned_leword(const u8 *p)
{
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
if (WORDBITS == 32)
return get_unaligned_le32(p);
else
return get_unaligned_le64(p);
}
/***** Unaligned stores *****/
static forceinline void
put_unaligned_le16(u16 v, u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST) {
store_u16_unaligned(le16_bswap(v), p);
} else {
p[0] = (u8)(v >> 0);
p[1] = (u8)(v >> 8);
}
}
static forceinline void
put_unaligned_be16(u16 v, u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST) {
store_u16_unaligned(be16_bswap(v), p);
} else {
p[0] = (u8)(v >> 8);
p[1] = (u8)(v >> 0);
}
}
static forceinline void
put_unaligned_le32(u32 v, u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST) {
store_u32_unaligned(le32_bswap(v), p);
} else {
p[0] = (u8)(v >> 0);
p[1] = (u8)(v >> 8);
p[2] = (u8)(v >> 16);
p[3] = (u8)(v >> 24);
}
}
static forceinline void
put_unaligned_be32(u32 v, u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST) {
store_u32_unaligned(be32_bswap(v), p);
} else {
p[0] = (u8)(v >> 24);
p[1] = (u8)(v >> 16);
p[2] = (u8)(v >> 8);
p[3] = (u8)(v >> 0);
}
}
static forceinline void
put_unaligned_le64(u64 v, u8 *p)
{
if (UNALIGNED_ACCESS_IS_FAST) {
store_u64_unaligned(le64_bswap(v), p);
} else {
p[0] = (u8)(v >> 0);
p[1] = (u8)(v >> 8);
p[2] = (u8)(v >> 16);
p[3] = (u8)(v >> 24);
p[4] = (u8)(v >> 32);
p[5] = (u8)(v >> 40);
p[6] = (u8)(v >> 48);
p[7] = (u8)(v >> 56);
}
}
static forceinline void
put_unaligned_leword(machine_word_t v, u8 *p)
{
STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
if (WORDBITS == 32)
put_unaligned_le32(v, p);
else
put_unaligned_le64(v, p);
}
/***** 24-bit loads *****/
/*
* Given a 32-bit value that was loaded with the platform's native endianness,
* return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
* bits contain the first 3 bytes, arranged in octets in a platform-dependent
* order, at the memory location from which the input 32-bit value was loaded.
*/
static forceinline u32
loaded_u32_to_u24(u32 v)
{
if (CPU_IS_LITTLE_ENDIAN())
return v & 0xFFFFFF;
else
return v >> 8;
}
/*
* Load the next 3 bytes from the memory location @p into the 24 low-order bits
* of a 32-bit value. The order in which the 3 bytes will be arranged as octets
* in the 24 bits is platform-dependent. At least LOAD_U24_REQUIRED_NBYTES
* bytes must be available at @p; note that this may be more than 3.
*/
static forceinline u32
load_u24_unaligned(const u8 *p)
{
#if UNALIGNED_ACCESS_IS_FAST
# define LOAD_U24_REQUIRED_NBYTES 4
return loaded_u32_to_u24(load_u32_unaligned(p));
#else
# define LOAD_U24_REQUIRED_NBYTES 3
if (CPU_IS_LITTLE_ENDIAN())
return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
else
return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
#endif
}
#endif /* LIB_UNALIGNED_H */

View file

@ -1,332 +0,0 @@
/*
* x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cpu_features.h"
/*
* The following macros horizontally sum the s1 counters and add them to the
* real s1, and likewise for s2. They do this via a series of reductions, each
* of which halves the vector length, until just one counter remains.
*
* The s1 reductions don't depend on the s2 reductions and vice versa, so for
* efficiency they are interleaved. Also, every other s1 counter is 0 due to
* the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
* 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
*/
#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2) \
{ \
__v4si s1_last = (v_s1), s2_last = (v_s2); \
\
/* 128 => 32 bits */ \
s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x31); \
s1_last += (__v4si)_mm_shuffle_epi32((__m128i)s1_last, 0x02); \
s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x02); \
\
*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last); \
*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last); \
}
#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2) \
{ \
__v4si s1_128bit, s2_128bit; \
\
/* 256 => 128 bits */ \
s1_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 0) + \
(__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 1); \
s2_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 0) + \
(__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 1); \
\
ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit); \
}
#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2) \
{ \
__v8si s1_256bit, s2_256bit; \
\
/* 512 => 256 bits */ \
s1_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
(__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1); \
s2_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
(__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1); \
\
ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit); \
}
/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
#undef DISPATCH_AVX512BW
#if !defined(DEFAULT_IMPL) && \
/*
* clang before v3.9 is missing some AVX-512BW intrinsics including
* _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512. So just make using
* AVX-512BW, even when __AVX512BW__ is defined, conditional on
* COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
*/ \
COMPILER_SUPPORTS_AVX512BW_TARGET && \
(defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
# define FUNCNAME adler32_avx512bw
# define FUNCNAME_CHUNK adler32_avx512bw_chunk
# define IMPL_ALIGNMENT 64
# define IMPL_SEGMENT_SIZE 64
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
# ifdef __AVX512BW__
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_avx512bw
# else
# define ATTRIBUTES __attribute__((target("avx512bw")))
# define DISPATCH 1
# define DISPATCH_AVX512BW 1
# endif
# include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
u32 *s1, u32 *s2)
{
const __m512i zeroes = _mm512_setzero_si512();
const __v64qi multipliers = (__v64qi){
64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
__v16si v_s1 = (__v16si)zeroes;
__v16si v_s1_sums = (__v16si)zeroes;
__v16si v_s2 = (__v16si)zeroes;
do {
/* Load the next 64-byte segment */
__m512i bytes = *p++;
/* Multiply the bytes by 64...1 (the number of times they need
* to be added to s2) and add adjacent products */
__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
bytes, (__m512i)multipliers);
/* Keep sum of all previous s1 counters, for adding to s2 later.
* This allows delaying the multiplication by 64 to the end. */
v_s1_sums += v_s1;
/* Add the sum of each group of 8 bytes to the corresponding s1
* counter */
v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
/* Add the sum of each group of 4 products of the bytes by
* 64...1 to the corresponding s2 counter */
v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
(__m512i)ones);
} while (p != end);
/* Finish the s2 counters by adding the sum of the s1 values at the
* beginning of each segment, multiplied by the segment size (64) */
v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* AVX-512BW implementation */
/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
#undef DISPATCH_AVX2
#if !defined(DEFAULT_IMPL) && \
(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
# define FUNCNAME adler32_avx2
# define FUNCNAME_CHUNK adler32_avx2_chunk
# define IMPL_ALIGNMENT 32
# define IMPL_SEGMENT_SIZE 32
# define IMPL_MAX_CHUNK_SIZE MAX_CHUNK_SIZE
# ifdef __AVX2__
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_avx2
# else
# define ATTRIBUTES __attribute__((target("avx2")))
# define DISPATCH 1
# define DISPATCH_AVX2 1
# endif
# include <immintrin.h>
static forceinline ATTRIBUTES void
adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
{
const __m256i zeroes = _mm256_setzero_si256();
const __v32qi multipliers = (__v32qi){
32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
};
const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
__v8si v_s1 = (__v8si)zeroes;
__v8si v_s1_sums = (__v8si)zeroes;
__v8si v_s2 = (__v8si)zeroes;
do {
/* Load the next 32-byte segment */
__m256i bytes = *p++;
/* Multiply the bytes by 32...1 (the number of times they need
* to be added to s2) and add adjacent products */
__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
bytes, (__m256i)multipliers);
/* Keep sum of all previous s1 counters, for adding to s2 later.
* This allows delaying the multiplication by 32 to the end. */
v_s1_sums += v_s1;
/* Add the sum of each group of 8 bytes to the corresponding s1
* counter */
v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
/* Add the sum of each group of 4 products of the bytes by
* 32...1 to the corresponding s2 counter */
v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
} while (p != end);
/* Finish the s2 counters by adding the sum of the s1 values at the
* beginning of each segment, multiplied by the segment size (32) */
v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* AVX2 implementation */
/* SSE2 implementation */
#undef DISPATCH_SSE2
#if !defined(DEFAULT_IMPL) && \
(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
# define FUNCNAME adler32_sse2
# define FUNCNAME_CHUNK adler32_sse2_chunk
# define IMPL_ALIGNMENT 16
# define IMPL_SEGMENT_SIZE 32
/*
* The 16-bit precision byte counters must not be allowed to undergo *signed*
* overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
* would behave incorrectly.
*/
# define IMPL_MAX_CHUNK_SIZE (32 * (0x7FFF / 0xFF))
# ifdef __SSE2__
# define ATTRIBUTES
# define DEFAULT_IMPL adler32_sse2
# else
# define ATTRIBUTES __attribute__((target("sse2")))
# define DISPATCH 1
# define DISPATCH_SSE2 1
# endif
# include <emmintrin.h>
static forceinline ATTRIBUTES void
adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
{
const __m128i zeroes = _mm_setzero_si128();
/* s1 counters: 32-bit, sum of bytes */
__v4si v_s1 = (__v4si)zeroes;
/* s2 counters: 32-bit, sum of s1 values */
__v4si v_s2 = (__v4si)zeroes;
/*
* Thirty-two 16-bit counters for byte sums. Each accumulates the bytes
* that eventually need to be multiplied by a number 32...1 for addition
* into s2.
*/
__v8hi v_byte_sums_a = (__v8hi)zeroes;
__v8hi v_byte_sums_b = (__v8hi)zeroes;
__v8hi v_byte_sums_c = (__v8hi)zeroes;
__v8hi v_byte_sums_d = (__v8hi)zeroes;
do {
/* Load the next 32 bytes */
const __m128i bytes1 = *p++;
const __m128i bytes2 = *p++;
/*
* Accumulate the previous s1 counters into the s2 counters.
* Logically, this really should be v_s2 += v_s1 * 32, but we
* can do the multiplication (or left shift) later.
*/
v_s2 += v_s1;
/*
* s1 update: use "Packed Sum of Absolute Differences" to add
* the bytes horizontally with 8 bytes per sum. Then add the
* sums to the s1 counters.
*/
v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
/*
* Also accumulate the bytes into 32 separate counters that have
* 16-bit precision.
*/
v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
} while (p != end);
/* Finish calculating the s2 counters */
v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
(__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
(__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
(__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
(__m128i)(__v8hi){ 8, 7, 6, 5, 4, 3, 2, 1 });
/* Add the counters to the real s1 and s2 */
ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
}
# include "../adler32_vec_template.h"
#endif /* SSE2 implementation */
#ifdef DISPATCH
static inline adler32_func_t
arch_select_adler32_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_AVX512BW
if (features & X86_CPU_FEATURE_AVX512BW)
return adler32_avx512bw;
#endif
#ifdef DISPATCH_AVX2
if (features & X86_CPU_FEATURE_AVX2)
return adler32_avx2;
#endif
#ifdef DISPATCH_SSE2
if (features & X86_CPU_FEATURE_SSE2)
return adler32_sse2;
#endif
return NULL;
}
#endif /* DISPATCH */

View file

@ -1,139 +0,0 @@
/*
* x86/cpu_features.c - feature detection for x86 processors
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cpu_features.h"
#if X86_CPU_FEATURES_ENABLED
volatile u32 _cpu_features = 0;
/* With old GCC versions we have to manually save and restore the x86_32 PIC
* register (ebx). See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602 */
#if defined(__i386__) && defined(__PIC__)
# define EBX_CONSTRAINT "=r"
#else
# define EBX_CONSTRAINT "=b"
#endif
/* Execute the CPUID instruction. */
static inline void
cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
{
__asm__(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n"
"cpuid \n"
".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
: "a" (leaf), "c" (subleaf));
}
/* Read an extended control register. */
static inline u64
read_xcr(u32 index)
{
u32 edx, eax;
/* Execute the "xgetbv" instruction. Old versions of binutils do not
* recognize this instruction, so list the raw bytes instead. */
__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
return ((u64)edx << 32) | eax;
}
#undef BIT
#define BIT(nr) (1UL << (nr))
#define XCR0_BIT_SSE BIT(1)
#define XCR0_BIT_AVX BIT(2)
#define XCR0_BIT_OPMASK BIT(5)
#define XCR0_BIT_ZMM_HI256 BIT(6)
#define XCR0_BIT_HI16_ZMM BIT(7)
#define IS_SET(reg, nr) ((reg) & BIT(nr))
#define IS_ALL_SET(reg, mask) (((reg) & (mask)) == (mask))
/* Initialize _cpu_features with bits for interesting processor features. */
void setup_cpu_features(void)
{
u32 features = 0;
u32 dummy1, dummy2, dummy3, dummy4;
u32 max_function;
u32 features_1, features_2, features_3, features_4;
bool os_avx_support = false;
bool os_avx512_support = false;
/* Get maximum supported function */
cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
if (max_function < 1)
goto out;
/* Standard feature flags */
cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
if (IS_SET(features_1, 26))
features |= X86_CPU_FEATURE_SSE2;
if (IS_SET(features_2, 1))
features |= X86_CPU_FEATURE_PCLMULQDQ;
if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
u64 xcr0 = read_xcr(0);
os_avx_support = IS_ALL_SET(xcr0,
XCR0_BIT_SSE |
XCR0_BIT_AVX);
os_avx512_support = IS_ALL_SET(xcr0,
XCR0_BIT_SSE |
XCR0_BIT_AVX |
XCR0_BIT_OPMASK |
XCR0_BIT_ZMM_HI256 |
XCR0_BIT_HI16_ZMM);
}
if (os_avx_support && IS_SET(features_2, 28))
features |= X86_CPU_FEATURE_AVX;
if (max_function < 7)
goto out;
/* Extended feature flags */
cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
if (os_avx_support && IS_SET(features_3, 5))
features |= X86_CPU_FEATURE_AVX2;
if (IS_SET(features_3, 8))
features |= X86_CPU_FEATURE_BMI2;
if (os_avx512_support && IS_SET(features_3, 30))
features |= X86_CPU_FEATURE_AVX512BW;
out:
_cpu_features = features | X86_CPU_FEATURES_KNOWN;
}
#endif /* X86_CPU_FEATURES_ENABLED */

View file

@ -1,41 +0,0 @@
/*
* x86/cpu_features.h - feature detection for x86 processors
*/
#ifndef LIB_X86_CPU_FEATURES_H
#define LIB_X86_CPU_FEATURES_H
#include "../lib_common.h"
#if (defined(__i386__) || defined(__x86_64__)) && \
COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
# define X86_CPU_FEATURES_ENABLED 1
#else
# define X86_CPU_FEATURES_ENABLED 0
#endif
#if X86_CPU_FEATURES_ENABLED
#define X86_CPU_FEATURE_SSE2 0x00000001
#define X86_CPU_FEATURE_PCLMULQDQ 0x00000002
#define X86_CPU_FEATURE_AVX 0x00000004
#define X86_CPU_FEATURE_AVX2 0x00000008
#define X86_CPU_FEATURE_BMI2 0x00000010
#define X86_CPU_FEATURE_AVX512BW 0x00000020
#define X86_CPU_FEATURES_KNOWN 0x80000000
extern volatile u32 _cpu_features;
extern void setup_cpu_features(void);
static inline u32 get_cpu_features(void)
{
if (_cpu_features == 0)
setup_cpu_features();
return _cpu_features;
}
#endif /* X86_CPU_FEATURES_ENABLED */
#endif /* LIB_X86_CPU_FEATURES_H */

View file

@ -1,87 +0,0 @@
/*
* x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "cpu_features.h"
/*
* Include the PCLMUL/AVX implementation? Although our PCLMUL-optimized CRC-32
* function doesn't use any AVX intrinsics specifically, it can benefit a lot
* from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
* MB/s. I expect this is related to the PCLMULQDQ instructions being assembled
* in the newer three-operand form rather than the older two-operand form.
*
* Note: this is only needed if __AVX__ is *not* defined, since otherwise the
* "regular" PCLMUL implementation would already be AVX enabled.
*/
#undef DISPATCH_PCLMUL_AVX
#if !defined(DEFAULT_IMPL) && !defined(__AVX__) && \
X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET && \
(defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
# define FUNCNAME crc32_pclmul_avx
# define FUNCNAME_ALIGNED crc32_pclmul_avx_aligned
# define ATTRIBUTES __attribute__((target("pclmul,avx")))
# define DISPATCH 1
# define DISPATCH_PCLMUL_AVX 1
# include "crc32_pclmul_template.h"
#endif
/* PCLMUL implementation */
#undef DISPATCH_PCLMUL
#if !defined(DEFAULT_IMPL) && \
(defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
# define FUNCNAME crc32_pclmul
# define FUNCNAME_ALIGNED crc32_pclmul_aligned
# ifdef __PCLMUL__
# define ATTRIBUTES
# define DEFAULT_IMPL crc32_pclmul
# else
# define ATTRIBUTES __attribute__((target("pclmul")))
# define DISPATCH 1
# define DISPATCH_PCLMUL 1
# endif
# include "crc32_pclmul_template.h"
#endif
#ifdef DISPATCH
static inline crc32_func_t
arch_select_crc32_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_PCLMUL_AVX
if ((features & X86_CPU_FEATURE_PCLMULQDQ) &&
(features & X86_CPU_FEATURE_AVX))
return crc32_pclmul_avx;
#endif
#ifdef DISPATCH_PCLMUL
if (features & X86_CPU_FEATURE_PCLMULQDQ)
return crc32_pclmul;
#endif
return NULL;
}
#endif /* DISPATCH */

View file

@ -1,262 +0,0 @@
/*
* x86/crc32_pclmul_template.h
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include <wmmintrin.h>
/*
* CRC-32 folding with PCLMULQDQ.
*
* The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
* producing an abbreviated message which is congruent the original message
* modulo the generator polynomial G(x).
*
* Folding each 512 bits is implemented as eight 64-bit folds, each of which
* uses one carryless multiplication instruction. It's expected that CPUs may
* be able to execute some of these multiplications in parallel.
*
* Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
* be 95 bits from a constant distance D later in the message. The relevant
* portion of the message can be written as:
*
* M(x) = A(x)*x^D + B(x)
*
* ... where + and * represent addition and multiplication, respectively, of
* polynomials over GF(2). Note that when implemented on a computer, these
* operations are equivalent to XOR and carryless multiplication, respectively.
*
* For the purpose of CRC calculation, only the remainder modulo the generator
* polynomial G(x) matters:
*
* M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
*
* Since the modulo operation can be applied anywhere in a sequence of additions
* and multiplications without affecting the result, this is equivalent to:
*
* M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
*
* For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
* a 32-bit quantity. So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
* multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
* product. Then, adding (XOR-ing) the product to B(x) produces a polynomial
* with the same length as B(x) but with the same remainder as 'A(x)*x^D +
* B(x)'. This is the basic fold operation with 64 bits.
*
* Note that the carryless multiplication instruction PCLMULQDQ actually takes
* two 64-bit inputs and produces a 127-bit product in the low-order bits of a
* 128-bit XMM register. This works fine, but care must be taken to account for
* "bit endianness". With the CRC version implemented here, bits are always
* ordered such that the lowest-order bit represents the coefficient of highest
* power of x and the highest-order bit represents the coefficient of the lowest
* power of x. This is backwards from the more intuitive order. Still,
* carryless multiplication works essentially the same either way. It just must
* be accounted for that when we XOR the 95-bit product in the low-order 95 bits
* of a 128-bit XMM register into 128-bits of later data held in another XMM
* register, we'll really be XOR-ing the product into the mathematically higher
* degree end of those later bits, not the lower degree end as may be expected.
*
* So given that caveat and the fact that we process 512 bits per iteration, the
* 'D' values we need for the two 64-bit halves of each 128 bits of data are:
*
* D = (512 + 95) - 64 for the higher-degree half of each 128 bits,
* i.e. the lower order bits in the XMM register
*
* D = (512 + 95) - 128 for the lower-degree half of each 128 bits,
* i.e. the higher order bits in the XMM register
*
* The required 'x^D mod G(x)' values were precomputed.
*
* When <= 512 bits remain in the message, we finish up by folding across
* smaller distances. This works similarly; the distance D is just different,
* so different constant multipliers must be used. Finally, once the remaining
* message is just 64 bits, it is reduced to the CRC-32 using Barrett
* reduction (explained later).
*
* For more information see the original paper from Intel:
* "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
* December 2009
* http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
*/
static u32 ATTRIBUTES
FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
{
/* Constants precomputed by gen_crc32_multipliers.c. Do not edit! */
const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
const __v2di barrett_reduction_constants =
(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
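/* (Illustrative note: multipliers_4 is used by the 512-bits-at-a-time fold
 * loop below, so per the derivation above its two halves are x^543 mod G(x)
 * and x^479 mod G(x), i.e. D = (512+95)-64 and (512+95)-128; multipliers_2,
 * multipliers_1 and final_multiplier are the analogous constants for the
 * shorter folding distances.) */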
const __m128i * const end = p + nr_segs;
const __m128i * const end512 = p + (nr_segs & ~3);
__m128i x0, x1, x2, x3;
/*
* Account for the current 'remainder', i.e. the CRC of the part of the
* message already processed. Explanation: rewrite the message
* polynomial M(x) in terms of the first part A(x), the second part
* B(x), and the length of the second part in bits |B(x)| >= 32:
*
* M(x) = A(x)*x^|B(x)| + B(x)
*
* Then the CRC of M(x) is:
*
* CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
* = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
* = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
*
* Note: all arithmetic is modulo G(x), the generator polynomial; that's
* why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
*
* So the CRC of the full message is the CRC of the second part of the
* message where the first 32 bits of the second part of the message
* have been XOR'ed with the CRC of the first part of the message.
*/
x0 = *p++;
x0 ^= (__m128i)(__v4si){ remainder };
if (p > end512) /* only 128, 256, or 384 bits of input? */
goto _128_bits_at_a_time;
x1 = *p++;
x2 = *p++;
x3 = *p++;
/* Fold 512 bits at a time */
for (; p != end512; p += 4) {
__m128i y0, y1, y2, y3;
y0 = p[0];
y1 = p[1];
y2 = p[2];
y3 = p[3];
/*
* Note: the immediate constant for PCLMULQDQ specifies which
* 64-bit halves of the 128-bit vectors to multiply:
*
* 0x00 means low halves (higher degree polynomial terms for us)
* 0x11 means high halves (lower degree polynomial terms for us)
*/
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
x0 = y0;
x1 = y1;
x2 = y2;
x3 = y3;
}
/* Fold 512 bits => 128 bits */
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
x0 = x3;
_128_bits_at_a_time:
while (p != end) {
/* Fold 128 bits into next 128 bits */
x1 = *p++;
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
x0 = x1;
}
/* Now there are just 128 bits left, stored in 'x0'. */
/*
* Fold 128 => 96 bits. This also implicitly appends 32 zero bits,
* which is equivalent to multiplying by x^32. This is needed because
* the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
*/
x0 = _mm_srli_si128(x0, 8) ^
_mm_clmulepi64_si128(x0, multipliers_1, 0x10);
/* Fold 96 => 64 bits */
x0 = _mm_srli_si128(x0, 4) ^
_mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
/*
* Finally, reduce 64 => 32 bits using Barrett reduction.
*
* Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to
* compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)):
*
* R(x) = (A(x)*x^32 + B(x)) mod G(x)
* = (A(x)*x^32) mod G(x) + B(x)
*
* Then, by the Division Algorithm there exists a unique q(x) such that:
*
* A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
*
* Since the left-hand side is of maximum degree 31, the right-hand side
* must be too. This implies that we can apply 'mod x^32' to the
* right-hand side without changing its value:
*
* (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
*
* Note that '+' is equivalent to '-' in polynomials over GF(2).
*
* We also know that:
*
* q(x) = floor( A(x)*x^32 / G(x) )
*
* To compute this efficiently, we can multiply the top and bottom by
* x^32 and move the division by G(x) to the top:
*
* q(x) = floor( (A(x) * floor(x^64 / G(x))) / x^32 )
*
* Note that floor(x^64 / G(x)) is a constant.
*
* So finally we have:
*
* R(x) = B(x) + G(x) * floor( (A(x) * floor(x^64 / G(x))) / x^32 )
*/
x1 = x0;
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
}
#define IMPL_ALIGNMENT 16
#define IMPL_SEGMENT_SIZE 16
#include "../crc32_vec_template.h"

View file

@ -1,26 +0,0 @@
#include "cpu_features.h"
/* Include the BMI2-optimized version? */
#undef DISPATCH_BMI2
#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
COMPILER_SUPPORTS_BMI2_TARGET
# define FUNCNAME deflate_decompress_bmi2
# define ATTRIBUTES __attribute__((target("bmi2")))
# define DISPATCH 1
# define DISPATCH_BMI2 1
# include "../decompress_template.h"
#endif
#ifdef DISPATCH
static inline decompress_func_t
arch_select_decompress_func(void)
{
u32 features = get_cpu_features();
#ifdef DISPATCH_BMI2
if (features & X86_CPU_FEATURE_BMI2)
return deflate_decompress_bmi2;
#endif
return NULL;
}
#endif /* DISPATCH */

View file

@ -1,164 +0,0 @@
/*
* x86/matchfinder_impl.h - x86 implementations of matchfinder functions
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#ifdef __AVX2__
# if MATCHFINDER_ALIGNMENT < 32
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 32
# endif
# include <immintrin.h>
static forceinline bool
matchfinder_init_avx2(mf_pos_t *data, size_t size)
{
__m256i v, *p;
size_t n;
if (size % (sizeof(__m256i) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
p = (__m256i *)data;
n = size / (sizeof(__m256i) * 4);
do {
p[0] = v;
p[1] = v;
p[2] = v;
p[3] = v;
p += 4;
} while (--n);
return true;
}
static forceinline bool
matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
{
__m256i v, *p;
size_t n;
if (size % (sizeof(__m256i) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
p = (__m256i *)data;
n = size / (sizeof(__m256i) * 4);
do {
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
p[0] = _mm256_adds_epi16(p[0], v);
p[1] = _mm256_adds_epi16(p[1], v);
p[2] = _mm256_adds_epi16(p[2], v);
p[3] = _mm256_adds_epi16(p[3], v);
p += 4;
} while (--n);
return true;
}
#endif /* __AVX2__ */
#ifdef __SSE2__
# if MATCHFINDER_ALIGNMENT < 16
# undef MATCHFINDER_ALIGNMENT
# define MATCHFINDER_ALIGNMENT 16
# endif
# include <emmintrin.h>
static forceinline bool
matchfinder_init_sse2(mf_pos_t *data, size_t size)
{
__m128i v, *p;
size_t n;
if (size % (sizeof(__m128i) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = _mm_set1_epi16(MATCHFINDER_INITVAL);
p = (__m128i *)data;
n = size / (sizeof(__m128i) * 4);
do {
p[0] = v;
p[1] = v;
p[2] = v;
p[3] = v;
p += 4;
} while (--n);
return true;
}
static forceinline bool
matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
{
__m128i v, *p;
size_t n;
if (size % (sizeof(__m128i) * 4) != 0)
return false;
STATIC_ASSERT(sizeof(mf_pos_t) == 2);
v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
p = (__m128i *)data;
n = size / (sizeof(__m128i) * 4);
do {
/* PADDSW: Add Packed Signed Integers With Signed Saturation */
p[0] = _mm_adds_epi16(p[0], v);
p[1] = _mm_adds_epi16(p[1], v);
p[2] = _mm_adds_epi16(p[2], v);
p[3] = _mm_adds_epi16(p[3], v);
p += 4;
} while (--n);
return true;
}
#endif /* __SSE2__ */
#undef arch_matchfinder_init
static forceinline bool
arch_matchfinder_init(mf_pos_t *data, size_t size)
{
#ifdef __AVX2__
if (matchfinder_init_avx2(data, size))
return true;
#endif
#ifdef __SSE2__
if (matchfinder_init_sse2(data, size))
return true;
#endif
return false;
}
#undef arch_matchfinder_rebase
static forceinline bool
arch_matchfinder_rebase(mf_pos_t *data, size_t size)
{
#ifdef __AVX2__
if (matchfinder_rebase_avx2(data, size))
return true;
#endif
#ifdef __SSE2__
if (matchfinder_rebase_sse2(data, size))
return true;
#endif
return false;
}

View file

@ -1,87 +0,0 @@
/*
* zlib_compress.c - compress with a zlib wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "deflate_compress.h"
#include "unaligned.h"
#include "zlib_constants.h"
#include "libdeflate.h"
LIBDEFLATEAPI size_t
libdeflate_zlib_compress(struct libdeflate_compressor *c,
const void *in, size_t in_size,
void *out, size_t out_nbytes_avail)
{
u8 *out_next = out;
u16 hdr;
unsigned compression_level;
unsigned level_hint;
size_t deflate_size;
if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD)
return 0;
/* 2 byte header: CMF and FLG */
hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12);
compression_level = deflate_get_compression_level(c);
if (compression_level < 2)
level_hint = ZLIB_FASTEST_COMPRESSION;
else if (compression_level < 6)
level_hint = ZLIB_FAST_COMPRESSION;
else if (compression_level < 8)
level_hint = ZLIB_DEFAULT_COMPRESSION;
else
level_hint = ZLIB_SLOWEST_COMPRESSION;
hdr |= level_hint << 6;
hdr |= 31 - (hdr % 31);
put_unaligned_be16(hdr, out_next);
out_next += 2;
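/* (Worked example, illustrative: at the default level the hint is
 * ZLIB_DEFAULT_COMPRESSION, so hdr = 0x7800 | (2 << 6) = 0x7880, and ORing in
 * 31 - (0x7880 % 31) = 28 gives 0x789C -- the familiar "78 9C" zlib header
 * bytes.) */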
/* Compressed data */
deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next,
out_nbytes_avail - ZLIB_MIN_OVERHEAD);
if (deflate_size == 0)
return 0;
out_next += deflate_size;
/* ADLER32 */
put_unaligned_be32(libdeflate_adler32(1, in, in_size), out_next);
out_next += 4;
return out_next - (u8 *)out;
}
LIBDEFLATEAPI size_t
libdeflate_zlib_compress_bound(struct libdeflate_compressor *c,
size_t in_nbytes)
{
return ZLIB_MIN_OVERHEAD +
libdeflate_deflate_compress_bound(c, in_nbytes);
}

View file

@ -1,21 +0,0 @@
/*
* zlib_constants.h - constants for the zlib wrapper format
*/
#ifndef LIB_ZLIB_CONSTANTS_H
#define LIB_ZLIB_CONSTANTS_H
#define ZLIB_MIN_HEADER_SIZE 2
#define ZLIB_FOOTER_SIZE 4
#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE)
#define ZLIB_CM_DEFLATE 8
#define ZLIB_CINFO_32K_WINDOW 7
#define ZLIB_FASTEST_COMPRESSION 0
#define ZLIB_FAST_COMPRESSION 1
#define ZLIB_DEFAULT_COMPRESSION 2
#define ZLIB_SLOWEST_COMPRESSION 3
#endif /* LIB_ZLIB_CONSTANTS_H */

View file

@ -1,91 +0,0 @@
/*
* zlib_decompress.c - decompress with a zlib wrapper
*
* Originally public domain; changes after 2016-09-07 are copyrighted.
*
* Copyright 2016 Eric Biggers
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "unaligned.h"
#include "zlib_constants.h"
#include "libdeflate.h"
LIBDEFLATEAPI enum libdeflate_result
libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret)
{
const u8 *in_next = in;
const u8 * const in_end = in_next + in_nbytes;
u16 hdr;
size_t actual_out_nbytes;
enum libdeflate_result result;
if (in_nbytes < ZLIB_MIN_OVERHEAD)
return LIBDEFLATE_BAD_DATA;
/* 2 byte header: CMF and FLG */
hdr = get_unaligned_be16(in_next);
in_next += 2;
/* FCHECK */
if ((hdr % 31) != 0)
return LIBDEFLATE_BAD_DATA;
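/* (E.g. the common header bytes 78 9C pass this check: 0x789C = 31 * 996.) */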
/* CM */
if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE)
return LIBDEFLATE_BAD_DATA;
/* CINFO */
if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW)
return LIBDEFLATE_BAD_DATA;
/* FDICT */
if ((hdr >> 5) & 1)
return LIBDEFLATE_BAD_DATA;
/* Compressed data */
result = libdeflate_deflate_decompress(d, in_next,
in_end - ZLIB_FOOTER_SIZE - in_next,
out, out_nbytes_avail,
actual_out_nbytes_ret);
if (result != LIBDEFLATE_SUCCESS)
return result;
if (actual_out_nbytes_ret)
actual_out_nbytes = *actual_out_nbytes_ret;
else
actual_out_nbytes = out_nbytes_avail;
in_next = in_end - ZLIB_FOOTER_SIZE;
/* ADLER32 */
if (libdeflate_adler32(1, out, actual_out_nbytes) !=
get_unaligned_be32(in_next))
return LIBDEFLATE_BAD_DATA;
return LIBDEFLATE_SUCCESS;
}

View file

@ -1,323 +0,0 @@
/*
* libdeflate.h - public header for libdeflate
*/
#ifndef LIBDEFLATE_H
#define LIBDEFLATE_H
#ifdef __cplusplus
extern "C" {
#endif
#define LIBDEFLATE_VERSION_MAJOR 1
#define LIBDEFLATE_VERSION_MINOR 2
#define LIBDEFLATE_VERSION_STRING "1.2"
#include <stddef.h>
#include <stdint.h>
/*
* On Windows, if you want to link to the DLL version of libdeflate, then
* #define LIBDEFLATE_DLL. Note that the calling convention is cdecl.
*/
#ifdef LIBDEFLATE_DLL
# ifdef BUILDING_LIBDEFLATE
# define LIBDEFLATEAPI_SYM_VISIBILITY LIBEXPORT
# elif defined(_WIN32) || defined(__CYGWIN__)
# define LIBDEFLATEAPI_SYM_VISIBILITY __declspec(dllimport)
# endif
#endif
#ifndef LIBDEFLATEAPI_SYM_VISIBILITY
# define LIBDEFLATEAPI_SYM_VISIBILITY
#endif
#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \
defined(_WIN32) && defined(__i386__)
/*
 * On 32-bit Windows, gcc assumes 16-byte stack alignment, but MSVC guarantees
 * only 4-byte alignment.  Realign the stack when entering libdeflate to avoid
 * crashing in SSE/AVX code when called from an MSVC-compiled application.
*/
# define LIBDEFLATEAPI_STACKALIGN __attribute__((force_align_arg_pointer))
#endif
#ifndef LIBDEFLATEAPI_STACKALIGN
# define LIBDEFLATEAPI_STACKALIGN
#endif
#define LIBDEFLATEAPI LIBDEFLATEAPI_SYM_VISIBILITY LIBDEFLATEAPI_STACKALIGN
/* ========================================================================== */
/* Compression */
/* ========================================================================== */
struct libdeflate_compressor;
/*
* libdeflate_alloc_compressor() allocates a new compressor that supports
* DEFLATE, zlib, and gzip compression. 'compression_level' is the compression
* level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
* medium/default, 9 = slow, 12 = slowest). The return value is a pointer to
* the new compressor, or NULL if out of memory.
*
 * Note: for compression, the sliding window size is fixed at compile time to
 * 32768 bytes, the largest size permissible in the DEFLATE format; it cannot
 * be changed at runtime.
*
* A single compressor is not safe to use by multiple threads concurrently.
* However, different threads may use different compressors concurrently.
*/
LIBDEFLATEAPI struct libdeflate_compressor *
libdeflate_alloc_compressor(int compression_level);
/*
* libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
* data. The function attempts to compress 'in_nbytes' bytes of data located at
* 'in' and write the results to 'out', which has space for 'out_nbytes_avail'
* bytes. The return value is the compressed size in bytes, or 0 if the data
* could not be compressed to 'out_nbytes_avail' bytes or fewer.
*/
LIBDEFLATEAPI size_t
libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
* number of bytes of compressed data that may be produced by compressing any
* buffer of length less than or equal to 'in_nbytes' using
* libdeflate_deflate_compress() with the specified compressor. Mathematically,
* this bound will necessarily be a number greater than or equal to 'in_nbytes'.
* It may be an overestimate of the true upper bound. The return value is
* guaranteed to be the same for all invocations with the same compressor and
* same 'in_nbytes'.
*
* As a special case, 'compressor' may be NULL. This causes the bound to be
* taken across *any* libdeflate_compressor that could ever be allocated with
* this build of the library, with any options.
*
* Note that this function is not necessary in many applications. With
* block-based compression, it is usually preferable to separately store the
* uncompressed size of each block and to store any blocks that did not compress
* to less than their original size uncompressed. In that scenario, there is no
* need to know the worst-case compressed size, since the maximum number of
* bytes of compressed data that may be used would always be one less than the
* input length. You can just pass a buffer of that size to
* libdeflate_deflate_compress() and store the data uncompressed if
* libdeflate_deflate_compress() returns 0, indicating that the compressed data
* did not fit into the provided output buffer.
*/
LIBDEFLATEAPI size_t
libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
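/*
 * Sketch of the block-based strategy described above (hypothetical helper,
 * not part of the libdeflate API; assumes <string.h> for memcpy()): give the
 * compressor one byte less than the input size and store the block
 * uncompressed whenever it returns 0.
 */
static size_t
pack_block(struct libdeflate_compressor *c,
	   const void *in, size_t in_nbytes,
	   void *out /* at least 'in_nbytes' bytes */, int *stored_raw)
{
	size_t n = libdeflate_deflate_compress(c, in, in_nbytes,
					       out, in_nbytes - 1);
	if (n == 0) {
		/* Did not shrink; fall back to storing the block verbatim. */
		memcpy(out, in, in_nbytes);
		*stored_raw = 1;
		return in_nbytes;
	}
	*stored_raw = 0;
	return n;
}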
/*
* Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
* format.
*/
LIBDEFLATEAPI size_t
libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* Like libdeflate_deflate_compress_bound(), but assumes the data will be
* compressed with libdeflate_zlib_compress() rather than with
* libdeflate_deflate_compress().
*/
LIBDEFLATEAPI size_t
libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
/*
* Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
* format.
*/
LIBDEFLATEAPI size_t
libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail);
/*
* Like libdeflate_deflate_compress_bound(), but assumes the data will be
* compressed with libdeflate_gzip_compress() rather than with
* libdeflate_deflate_compress().
*/
LIBDEFLATEAPI size_t
libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
size_t in_nbytes);
/*
* libdeflate_free_compressor() frees a compressor that was allocated with
* libdeflate_alloc_compressor(). If a NULL pointer is passed in, no action is
* taken.
*/
LIBDEFLATEAPI void
libdeflate_free_compressor(struct libdeflate_compressor *compressor);
/* ========================================================================== */
/* Decompression */
/* ========================================================================== */
struct libdeflate_decompressor;
/*
* libdeflate_alloc_decompressor() allocates a new decompressor that can be used
* for DEFLATE, zlib, and gzip decompression. The return value is a pointer to
* the new decompressor, or NULL if out of memory.
*
* This function takes no parameters, and the returned decompressor is valid for
* decompressing data that was compressed at any compression level and with any
* sliding window size.
*
* A single decompressor is not safe to use by multiple threads concurrently.
* However, different threads may use different decompressors concurrently.
*/
LIBDEFLATEAPI struct libdeflate_decompressor *
libdeflate_alloc_decompressor(void);
/*
* Result of a call to libdeflate_deflate_decompress(),
* libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
*/
enum libdeflate_result {
/* Decompression was successful. */
LIBDEFLATE_SUCCESS = 0,
	/* Decompression failed because the compressed data was invalid, corrupt,
* or otherwise unsupported. */
LIBDEFLATE_BAD_DATA = 1,
/* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
* decompressed to fewer than 'out_nbytes_avail' bytes. */
LIBDEFLATE_SHORT_OUTPUT = 2,
/* The data would have decompressed to more than 'out_nbytes_avail'
* bytes. */
LIBDEFLATE_INSUFFICIENT_SPACE = 3,
};
/*
* libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
* from the buffer 'in' with compressed size up to 'in_nbytes' bytes. The
* uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
* bytes. If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
* Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned. If
* a nonzero result code is returned, then the contents of the output buffer are
* undefined.
*
* Decompression stops at the end of the DEFLATE stream (as indicated by the
* BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
*
* libdeflate_deflate_decompress() can be used in cases where the actual
* uncompressed size is known (recommended) or unknown (not recommended):
*
* - If the actual uncompressed size is known, then pass the actual
* uncompressed size as 'out_nbytes_avail' and pass NULL for
* 'actual_out_nbytes_ret'. This makes libdeflate_deflate_decompress() fail
* with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
* specified number of bytes.
*
* - If the actual uncompressed size is unknown, then provide a non-NULL
* 'actual_out_nbytes_ret' and provide a buffer with some size
* 'out_nbytes_avail' that you think is large enough to hold all the
* uncompressed data. In this case, if the data decompresses to less than
* or equal to 'out_nbytes_avail' bytes, then
* libdeflate_deflate_decompress() will write the actual uncompressed size
* to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS). Otherwise,
* it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
* not large enough but no other problems were encountered, or another
* nonzero result code if decompression failed for another reason.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
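/*
 * Sketch of the "unknown uncompressed size" case described above
 * (hypothetical helper): returns 1 when the caller should retry with a
 * larger output buffer, 0 on success, -1 on bad data.
 */
static int
inflate_unknown_size(struct libdeflate_decompressor *d,
		     const void *in, size_t in_nbytes,
		     void *out, size_t out_capacity, size_t *out_nbytes_ret)
{
	enum libdeflate_result res =
		libdeflate_deflate_decompress(d, in, in_nbytes,
					      out, out_capacity,
					      out_nbytes_ret);
	if (res == LIBDEFLATE_INSUFFICIENT_SPACE)
		return 1;
	return res == LIBDEFLATE_SUCCESS ? 0 : -1;
}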
/*
* Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
* then the actual compressed size of the DEFLATE stream (aligned to the next
* byte boundary) is written to *actual_in_nbytes_ret.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
* instead of raw DEFLATE.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
* instead of raw DEFLATE.
*
* If multiple gzip-compressed members are concatenated, then only the first
* will be decompressed. Use libdeflate_gzip_decompress_ex() if you need
* multi-member support.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_out_nbytes_ret);
/*
* Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
* argument. If 'actual_in_nbytes_ret' is not NULL and the decompression
* succeeds (indicating that the first gzip-compressed member in the input
* buffer was decompressed), then the actual number of input bytes consumed is
* written to *actual_in_nbytes_ret.
*/
LIBDEFLATEAPI enum libdeflate_result
libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
const void *in, size_t in_nbytes,
void *out, size_t out_nbytes_avail,
size_t *actual_in_nbytes_ret,
size_t *actual_out_nbytes_ret);
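/*
 * Sketch of multi-member decompression (hypothetical helper): loop on
 * libdeflate_gzip_decompress_ex(), advancing the input by the number of
 * bytes actually consumed for each member.
 */
static enum libdeflate_result
inflate_gzip_members(struct libdeflate_decompressor *d,
		     const unsigned char *in, size_t in_nbytes,
		     unsigned char *out, size_t out_avail,
		     size_t *total_out_ret)
{
	size_t total_out = 0;

	while (in_nbytes != 0) {
		size_t in_used, out_used;
		enum libdeflate_result res =
			libdeflate_gzip_decompress_ex(d, in, in_nbytes,
						      out + total_out,
						      out_avail - total_out,
						      &in_used, &out_used);
		if (res != LIBDEFLATE_SUCCESS)
			return res;
		in += in_used;
		in_nbytes -= in_used;
		total_out += out_used;
	}
	*total_out_ret = total_out;
	return LIBDEFLATE_SUCCESS;
}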
/*
* libdeflate_free_decompressor() frees a decompressor that was allocated with
* libdeflate_alloc_decompressor(). If a NULL pointer is passed in, no action
* is taken.
*/
LIBDEFLATEAPI void
libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
/* ========================================================================== */
/* Checksums */
/* ========================================================================== */
/*
* libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
* data and returns the updated checksum. When starting a new checksum, the
* required initial value for 'adler' is 1. This value is also returned when
* 'buffer' is specified as NULL.
*/
LIBDEFLATEAPI uint32_t
libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len);
/*
* libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
* and returns the updated checksum. When starting a new checksum, the required
* initial value for 'crc' is 0. This value is also returned when 'buffer' is
* specified as NULL.
*/
LIBDEFLATEAPI uint32_t
libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
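/*
 * Sketch (hypothetical helper): both checksums are running values, so large
 * inputs can be fed in chunks.  The initial value is 1 for Adler-32 and 0
 * for CRC-32.
 */
static uint32_t
crc32_of_two_chunks(const void *a, size_t a_len, const void *b, size_t b_len)
{
	uint32_t crc = libdeflate_crc32(0, a, a_len);

	return libdeflate_crc32(crc, b, b_len);
}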
#ifdef __cplusplus
}
#endif
#endif /* LIBDEFLATE_H */
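/*
 * End-to-end sketch (hypothetical, for illustration only; not part of the
 * deleted sources): compress a buffer at level 6 and decompress it again
 * using the known-uncompressed-size pattern recommended above.
 */
#include <stdlib.h>
#include <string.h>
#include "libdeflate.h"

static int
roundtrip(const void *data, size_t size)
{
	struct libdeflate_compressor *c = libdeflate_alloc_compressor(6);
	struct libdeflate_decompressor *d = libdeflate_alloc_decompressor();
	void *cbuf = NULL, *dbuf = NULL;
	size_t bound, csize;
	int ok = 0;

	if (c == NULL || d == NULL)
		goto out;
	bound = libdeflate_deflate_compress_bound(c, size);
	cbuf = malloc(bound);
	dbuf = malloc(size);
	if (cbuf == NULL || dbuf == NULL)
		goto out;
	csize = libdeflate_deflate_compress(c, data, size, cbuf, bound);
	if (csize == 0)
		goto out;
	ok = libdeflate_deflate_decompress(d, cbuf, csize, dbuf, size,
					   NULL) == LIBDEFLATE_SUCCESS &&
	     memcmp(dbuf, data, size) == 0;
out:
	free(cbuf);
	free(dbuf);
	libdeflate_free_compressor(c);
	libdeflate_free_decompressor(d);
	return ok;
}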