diff --git a/DobieStation/libdeflate/libdeflate.pro b/DobieStation/libdeflate/libdeflate.pro
deleted file mode 100644
index 63f79854..00000000
--- a/DobieStation/libdeflate/libdeflate.pro
+++ /dev/null
@@ -1,67 +0,0 @@
-win32:TARGET = libdeflate
-else:TARGET = deflate
-
-TEMPLATE = lib
-CONFIG += staticlib c99
-
-win32-msvc: QMAKE_CFLAGS += /MD /O2
-else {
-	QMAKE_CFLAGS += -O2 \
-		-fomit-frame-pointer \
-		-Wall -Wundef \
-		-Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla \
-		-fvisibility=hidden -D_ANSI_SOURCE
-
-	mingw: QMAKE_CFLAGS += -Wno-pedantic-ms-format
-}
-
-INCLUDEPATH += \
-	../../ext/libdeflate \
-	../../ext/libdeflate/common
-
-HEADERS += \
-# common headers
-	../../ext/libdeflate/libdeflate.h \
-	../../ext/libdeflate/common/common_defs.h \
-	../../ext/libdeflate/common/compiler_gcc.h \
-	../../ext/libdeflate/common/compiler_msc.h \
-# library headers
-	../../ext/libdeflate/lib/adler32_vec_template.h \
-	../../ext/libdeflate/lib/aligned_malloc.h \
-	../../ext/libdeflate/lib/bt_matchfinder.h \
-	../../ext/libdeflate/lib/crc32_table.h \
-	../../ext/libdeflate/lib/crc32_vec_template.h \
-	../../ext/libdeflate/lib/decompress_template.h \
-	../../ext/libdeflate/lib/deflate_compress.h \
-	../../ext/libdeflate/lib/deflate_constants.h \
-	../../ext/libdeflate/lib/gzip_constants.h \
-	../../ext/libdeflate/lib/hc_matchfinder.h \
-	../../ext/libdeflate/lib/lib_common.h \
-	../../ext/libdeflate/lib/matchfinder_common.h \
-	../../ext/libdeflate/lib/unaligned.h \
-	../../ext/libdeflate/lib/zlib_constants.h \
-	../../ext/libdeflate/lib/arm/adler32_impl.h \
-	../../ext/libdeflate/lib/arm/cpu_features.h \
-	../../ext/libdeflate/lib/arm/crc32_impl.h \
-	../../ext/libdeflate/lib/arm/matchfinder_impl.h \
-	../../ext/libdeflate/lib/x86/adler32_impl.h \
-	../../ext/libdeflate/lib/x86/cpu_features.h \
-	../../ext/libdeflate/lib/x86/crc32_impl.h \
-	../../ext/libdeflate/lib/x86/crc32_pclmul_template.h \
-	../../ext/libdeflate/lib/x86/decompress_impl.h \
-	../../ext/libdeflate/lib/x86/matchfinder_impl.h
-
-SOURCES += \
-	../../ext/libdeflate/lib/aligned_malloc.c \
-	../../ext/libdeflate/lib/deflate_decompress.c \
-# uncomment for compression support
-	#../../ext/libdeflate/lib/deflate_compress.c \
-# uncomment for zlib format support
-	#../../ext/libdeflate/lib/adler32.c \
-	#../../ext/libdeflate/lib/zlib_decompress.c \
-	#../../ext/libdeflate/lib/zlib_compress.c \
-# uncomment for gzip support
-	#../../ext/libdeflate/lib/gzip_decompress.c \
-	#../../ext/libdeflate/lib/gzip_compress.c \
-	../../ext/libdeflate/lib/arm/cpu_features.c \
-	../../ext/libdeflate/lib/x86/cpu_features.c
diff --git a/DobieStation/libdeflate/libdeflate.vcxproj b/DobieStation/libdeflate/libdeflate.vcxproj
deleted file mode 100644
index 2fbaac94..00000000
--- a/DobieStation/libdeflate/libdeflate.vcxproj
+++ /dev/null
@@ -1,94 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <!-- configurations -->
-  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Release Optimized|x64">
-      <Configuration>Release Optimized</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|x64">
-      <Configuration>Release</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Devel|x64">
-      <Configuration>Devel</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-    <ProjectConfiguration Include="Debug|x64">
-      <Configuration>Debug</Configuration>
-      <Platform>x64</Platform>
-    </ProjectConfiguration>
-  </ItemGroup>
-  <PropertyGroup Condition="'$(WindowsTargetPlatformVersion)'==''">
-    <!-- Latest Target Version property -->
-    <LatestTargetPlatformVersion>$([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0'))</LatestTargetPlatformVersion>
-    <WindowsTargetPlatformVersion Condition="'$(WindowsTargetPlatformVersion)' == ''">$(LatestTargetPlatformVersion)</WindowsTargetPlatformVersion>
-    <TargetPlatformVersion>$(WindowsTargetPlatformVersion)</TargetPlatformVersion>
-  </PropertyGroup>
-  <!-- globals -->
-  <PropertyGroup Label="Globals">
-    <ProjectGuid>{A77564F4-56BB-3D08-8126-3FD5FC44F217}</ProjectGuid>
-    <!-- <WindowsTargetPlatformVersion>10.0.18362.0</WindowsTargetPlatformVersion> -->
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <!-- configuration defaults -->
-  <PropertyGroup Label="Configuration">
-    <ConfigurationType>StaticLibrary</ConfigurationType>
-    <CharacterSet>Unicode</CharacterSet>
-    <PlatformToolset>v141</PlatformToolset>
-  </PropertyGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
-  <ImportGroup Label="ExtensionSettings" />
-  <!-- prop includes -->
-  <ImportGroup Label="PropertySheets">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" />
-    <Import Project="..\common.props" />
-  </ImportGroup>
-  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup>
-    <TargetName>$(ProjectName)$(Postfix)</TargetName>
-  </PropertyGroup>
-  <ItemDefinitionGroup>
-    <Lib>
-      <OutputFile>$(BinDir)\$(ProjectName)$(Postfix).lib</OutputFile>
-    </Lib>
-  </ItemDefinitionGroup>
-  <!-- c files -->
-  <ItemGroup>
-    <ClCompile Include="$(ExtDir)\libdeflate\lib\aligned_malloc.c">
-      <DisableSpecificWarnings>4127;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-    </ClCompile>
-    <ClCompile Include="$(ExtDir)\libdeflate\lib\x86\cpu_features.c">
-      <DisableSpecificWarnings>4127;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-    </ClCompile>
-    <ClCompile Include="$(ExtDir)\libdeflate\lib\deflate_decompress.c">
-      <DisableSpecificWarnings>4127;4245;4100;4018;%(DisableSpecificWarnings)</DisableSpecificWarnings>
-    </ClCompile>
-    <!-- headers-->
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\adler32_impl.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\adler32_vec_template.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\aligned_malloc.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\bt_matchfinder.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\common\common_defs.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\common\compiler_gcc.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\common\compiler_msc.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\cpu_features.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\crc32_impl.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\crc32_pclmul_template.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\crc32_table.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\crc32_vec_template.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\decompress_impl.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\decompress_template.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\deflate_compress.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\deflate_constants.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\gzip_constants.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\hc_matchfinder.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\lib_common.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\libdeflate.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\matchfinder_common.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\x86\matchfinder_impl.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\unaligned.h" />
-    <ClInclude Include="$(ExtDir)\libdeflate\lib\zlib_constants.h" />
-  </ItemGroup>
-  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-</Project>
\ No newline at end of file
diff --git a/DobieStation/libdeflate/libdeflate.vcxproj.filters b/DobieStation/libdeflate/libdeflate.vcxproj.filters
deleted file mode 100644
index 1a760763..00000000
--- a/DobieStation/libdeflate/libdeflate.vcxproj.filters
+++ /dev/null
@@ -1,126 +0,0 @@
-﻿<?xml version="1.0" encoding="utf-8"?>
-<Project xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
-  <ItemGroup>
-    <!-- Solution Explorer filters. Each filter is declared exactly once; the
-         previous revision listed every filter twice with the same GUID. -->
-    <Filter Include="Generated Files">
-      <UniqueIdentifier>{71ED8ED8-ACB9-4CE9-BBE1-E00B30144E11}</UniqueIdentifier>
-      <Extensions>cpp;c;cxx;moc;h;def;odl;idl;res;</Extensions>
-    </Filter>
-    <Filter Include="Header Files">
-      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
-      <Extensions>h;hpp;hxx;hm;inl;inc;xsd</Extensions>
-    </Filter>
-    <Filter Include="Source Files">
-      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
-      <Extensions>cpp;c;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
-    </Filter>
-  </ItemGroup>
-  <ItemGroup>
-    <ClCompile Include="..\..\ext\libdeflate\lib\aligned_malloc.c">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\ext\libdeflate\lib\x86\cpu_features.c">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-    <ClCompile Include="..\..\ext\libdeflate\lib\deflate_decompress.c">
-      <Filter>Source Files</Filter>
-    </ClCompile>
-  </ItemGroup>
-  <ItemGroup>
-    <ClInclude Include="..\..\ext\libdeflate\lib\arm\adler32_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\adler32_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\adler32_vec_template.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\aligned_malloc.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\bt_matchfinder.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\common\common_defs.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\common\compiler_gcc.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\common\compiler_msc.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\arm\cpu_features.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\cpu_features.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\arm\crc32_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\crc32_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\crc32_pclmul_template.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\crc32_table.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\crc32_vec_template.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\decompress_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\decompress_template.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\deflate_compress.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\deflate_constants.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\gzip_constants.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\hc_matchfinder.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\lib_common.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\libdeflate.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\matchfinder_common.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\arm\matchfinder_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\x86\matchfinder_impl.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\unaligned.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-    <ClInclude Include="..\..\ext\libdeflate\lib\zlib_constants.h">
-      <Filter>Header Files</Filter>
-    </ClInclude>
-  </ItemGroup>
-</Project>
\ No newline at end of file
diff --git a/ext/libdeflate/CMakeLists.txt b/ext/libdeflate/CMakeLists.txt
deleted file mode 100644
index d1c08983..00000000
--- a/ext/libdeflate/CMakeLists.txt
+++ /dev/null
@@ -1,84 +0,0 @@
-project(libdeflate C)
-set(TARGET libdeflate)
-set(CMAKE_C_STANDARD 99)
-
-# Extra warning / code-generation flags for GCC-compatible compilers only.
-# The compiler ID is referenced by variable name (if() dereferences it itself)
-# instead of being expanded with ${...}: an unquoted expansion of an empty
-# value leaves a malformed if() expression, and an expanded value can be
-# dereferenced a second time if a variable of that name happens to exist.
-if (CMAKE_C_COMPILER_ID MATCHES "^(GNU|Clang|AppleClang)$")
-
-    list(APPEND FLAGS -fomit-frame-pointer)
-    list(APPEND FLAGS -Wall -Wundef)
-    list(APPEND FLAGS -Wpedantic -Wdeclaration-after-statement -Wmissing-prototypes -Wstrict-prototypes -Wvla)
-    list(APPEND FLAGS -fvisibility=hidden -D_ANSI_SOURCE)
-
-    # MinGW only: silence pedantic complaints about MS printf format strings.
-    if (MINGW)
-        list(APPEND FLAGS -Wno-pedantic-ms-format)
-    endif()
-endif()
-
-set(COMMON_HEADERS
-    libdeflate.h
-
-    common/common_defs.h
-    common/compiler_gcc.h
-    common/compiler_msc.h)
-
-set(LIB_HEADERS
-    lib/adler32_vec_template.h
-    lib/aligned_malloc.h
-    lib/bt_matchfinder.h
-    lib/crc32_table.h
-    lib/crc32_vec_template.h
-    lib/decompress_template.h
-    lib/deflate_compress.h
-    lib/deflate_constants.h
-    lib/gzip_constants.h
-    lib/hc_matchfinder.h
-    lib/lib_common.h
-    lib/matchfinder_common.h
-    lib/unaligned.h
-    lib/zlib_constants.h
-
-    lib/arm/adler32_impl.h
-    lib/arm/cpu_features.h
-    lib/arm/crc32_impl.h
-    lib/arm/matchfinder_impl.h
-
-    lib/x86/adler32_impl.h
-    lib/x86/cpu_features.h
-    lib/x86/crc32_impl.h
-    lib/x86/crc32_pclmul_template.h
-    lib/x86/decompress_impl.h
-    lib/x86/matchfinder_impl.h)
-
-set(LIB_SRC
-    lib/aligned_malloc.c
-    lib/deflate_decompress.c
-
-    # uncomment for compression support
-    #lib/deflate_compress.c
-
-    # uncomment for zlib format support
-    #lib/adler32.c
-    #lib/zlib_decompress.c
-    #lib/zlib_compress.c
-
-    # uncomment for gzip support
-    #lib/gzip_decompress.c
-    #lib/gzip_compress.c
-
-    lib/arm/cpu_features.c
-    lib/x86/cpu_features.c)
-
-# Static library target; headers are listed so they appear in IDE projects.
-add_library(${TARGET} STATIC
-    ${LIB_SRC}
-    ${LIB_HEADERS}
-    ${COMMON_HEADERS})
-
-# Namespaced alias for consumers.
-add_library(Ext::libdeflate ALIAS ${TARGET})
-
-# Empty PREFIX: the target name already starts with "lib".
-# FOLDER groups the target under "External" in IDE generators.
-set_target_properties(${TARGET} PROPERTIES
-    PREFIX ""
-    FOLDER External)
-
-# The top-level directory is exported to consumers; common/ stays internal.
-target_include_directories(${TARGET}
-    PUBLIC
-        ${CMAKE_CURRENT_SOURCE_DIR}
-    PRIVATE
-        ${CMAKE_CURRENT_SOURCE_DIR}/common)
-
-# Apply the compiler-specific flags collected earlier, if any.
-if (FLAGS)
-    target_compile_options(${TARGET} PRIVATE ${FLAGS})
-endif()
diff --git a/ext/libdeflate/COPYING b/ext/libdeflate/COPYING
deleted file mode 100644
index 1f1b81cd..00000000
--- a/ext/libdeflate/COPYING
+++ /dev/null
@@ -1,21 +0,0 @@
-Copyright 2016 Eric Biggers
-
-Permission is hereby granted, free of charge, to any person
-obtaining a copy of this software and associated documentation files
-(the "Software"), to deal in the Software without restriction,
-including without limitation the rights to use, copy, modify, merge,
-publish, distribute, sublicense, and/or sell copies of the Software,
-and to permit persons to whom the Software is furnished to do so,
-subject to the following conditions:
-
-The above copyright notice and this permission notice shall be
-included in all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
-BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
-ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
-CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
diff --git a/ext/libdeflate/common/common_defs.h b/ext/libdeflate/common/common_defs.h
deleted file mode 100644
index 80623085..00000000
--- a/ext/libdeflate/common/common_defs.h
+++ /dev/null
@@ -1,366 +0,0 @@
-/*
- * common_defs.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifndef COMMON_COMMON_DEFS_H
-#define COMMON_COMMON_DEFS_H
-
-#ifdef __GNUC__
-#  include "compiler_gcc.h"
-#elif defined(_MSC_VER)
-#  include "compiler_msc.h"
-#else
-#  pragma message("Unrecognized compiler.  Please add a header file for your compiler.  Compilation will proceed, but performance may suffer!")
-#endif
-
-/* ========================================================================== */
-/*                              Type definitions                              */
-/* ========================================================================== */
-
-#include <stddef.h> /* size_t */
-
-#ifndef __bool_true_false_are_defined
-#  include <stdbool.h> /* bool */
-#endif
-
-/* Fixed-width integer types */
-#ifndef PRIu32
-#  include <inttypes.h>
-#endif
-typedef uint8_t u8;
-typedef uint16_t u16;
-typedef uint32_t u32;
-typedef uint64_t u64;
-typedef int8_t s8;
-typedef int16_t s16;
-typedef int32_t s32;
-typedef int64_t s64;
-
-/*
- * Word type of the target architecture.  Use 'size_t' instead of 'unsigned
- * long' to account for platforms such as Windows that use 32-bit 'unsigned
- * long' on 64-bit architectures.
- */
-typedef size_t machine_word_t;
-
-/* Number of bytes in a word */
-#define WORDBYTES	((int)sizeof(machine_word_t))
-
-/* Number of bits in a word */
-#define WORDBITS	(8 * WORDBYTES)
-
-/* ========================================================================== */
-/*                         Optional compiler features                         */
-/* ========================================================================== */
-
-/* LIBEXPORT - export a function from a shared library */
-#ifndef LIBEXPORT
-#  define LIBEXPORT
-#endif
-
-/* inline - suggest that a function be inlined */
-#ifndef inline
-#  define inline
-#endif
-
-/* forceinline - force a function to be inlined, if possible */
-#ifndef forceinline
-#  define forceinline inline
-#endif
-
-/* restrict - annotate a non-aliased pointer */
-#ifndef restrict
-#  define restrict
-#endif
-
-/* likely(expr) - hint that an expression is usually true */
-#ifndef likely
-#  define likely(expr)		(expr)
-#endif
-
-/* unlikely(expr) - hint that an expression is usually false */
-#ifndef unlikely
-#  define unlikely(expr)	(expr)
-#endif
-
-/* prefetchr(addr) - prefetch into L1 cache for read */
-#ifndef prefetchr
-#  define prefetchr(addr)
-#endif
-
-/* prefetchw(addr) - prefetch into L1 cache for write */
-#ifndef prefetchw
-#  define prefetchw(addr)
-#endif
-
-/* Does the compiler support the 'target' function attribute? */
-#ifndef COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-#  define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE 0
-#endif
-
-/* Which targets are supported with the 'target' function attribute? */
-#ifndef COMPILER_SUPPORTS_BMI2_TARGET
-#  define COMPILER_SUPPORTS_BMI2_TARGET 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX_TARGET
-#  define COMPILER_SUPPORTS_AVX_TARGET 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET
-#  define COMPILER_SUPPORTS_AVX512BW_TARGET 0
-#endif
-
-/*
- * Which targets are supported with the 'target' function attribute and have
- * intrinsics that work within 'target'-ed functions?
- */
-#ifndef COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS 0
-#endif
-#ifndef COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS 0
-#endif
-#ifndef COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS 0
-#endif
-#ifndef COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS 0
-#endif
-#ifndef COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS
-#  define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS 0
-#endif
-
-/* _aligned_attribute(n) - declare that the annotated variable, or variables of
- * the annotated type, are to be aligned on n-byte boundaries */
-#ifndef _aligned_attribute
-#endif
-
-/* ========================================================================== */
-/*                          Miscellaneous macros                              */
-/* ========================================================================== */
-
-#define ARRAY_LEN(A)		(sizeof(A) / sizeof((A)[0]))
-#define MIN(a, b)		((a) <= (b) ? (a) : (b))
-#define MAX(a, b)		((a) >= (b) ? (a) : (b))
-#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
-#define STATIC_ASSERT(expr)	((void)sizeof(char[1 - 2 * !(expr)]))
-#define ALIGN(n, a)		(((n) + (a) - 1) & ~((a) - 1))
-
-/* ========================================================================== */
-/*                           Endianness handling                              */
-/* ========================================================================== */
-
-/*
- * CPU_IS_LITTLE_ENDIAN() - a macro which evaluates to 1 if the CPU is little
- * endian or 0 if it is big endian.  The macro should be defined in a way such
- * that the compiler can evaluate it at compilation time.  If not defined, a
- * fallback is used.
- */
-#ifndef CPU_IS_LITTLE_ENDIAN
-/*
- * Fallback endianness probe: store 1 in an unsigned int and read back the
- * byte that overlays its lowest address.  Nonzero means little endian.
- */
-static forceinline int CPU_IS_LITTLE_ENDIAN(void)
-{
-	union {
-		unsigned int word;
-		unsigned char first_byte;
-	} probe;
-
-	probe.word = 1;
-	return probe.first_byte;
-}
-#endif
-
-/* bswap16(n) - exchange the two bytes of a 16-bit integer (portable fallback) */
-#ifndef bswap16
-static forceinline u16 bswap16(u16 n)
-{
-	/* The operands promote to int; the cast narrows back to 16 bits. */
-	return (u16)((n >> 8) | (n << 8));
-}
-#endif
-
-/* bswap32(n) - reverse the byte order of a 32-bit integer (portable fallback) */
-#ifndef bswap32
-static forceinline u32 bswap32(u32 n)
-{
-	/* Isolate each byte in place, then move it to its mirrored position. */
-	const u32 byte0 = n & 0x000000FF;
-	const u32 byte1 = n & 0x0000FF00;
-	const u32 byte2 = n & 0x00FF0000;
-	const u32 byte3 = n & 0xFF000000;
-
-	return (byte0 << 24) | (byte1 << 8) | (byte2 >> 8) | (byte3 >> 24);
-}
-#endif
-
-/* bswap64(n) - reverse the byte order of a 64-bit integer (portable fallback) */
-#ifndef bswap64
-static forceinline u64 bswap64(u64 n)
-{
-	u64 reversed = 0;
-	int i;
-
-	/*
-	 * Peel bytes off the low end of 'n' and push them onto the low end of
-	 * 'reversed'; after eight rounds the first byte taken is the highest.
-	 */
-	for (i = 0; i < 8; i++) {
-		reversed = (reversed << 8) | (n & 0xFF);
-		n >>= 8;
-	}
-	return reversed;
-}
-#endif
-
-#define le16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap16(n))
-#define le32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap32(n))
-#define le64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? (n) : bswap64(n))
-#define be16_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap16(n) : (n))
-#define be32_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap32(n) : (n))
-#define be64_bswap(n) (CPU_IS_LITTLE_ENDIAN() ? bswap64(n) : (n))
-
-/* ========================================================================== */
-/*                          Unaligned memory accesses                         */
-/* ========================================================================== */
-
-/*
- * UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses
- * can be performed efficiently on the target platform.
- */
-#ifndef UNALIGNED_ACCESS_IS_FAST
-#  define UNALIGNED_ACCESS_IS_FAST 0
-#endif
-
-/*
- * DEFINE_UNALIGNED_TYPE(type) - a macro that, given an integer type 'type',
- * defines load_type_unaligned(addr) and store_type_unaligned(v, addr) functions
- * which load and store variables of type 'type' from/to unaligned memory
- * addresses.  If not defined, a fallback is used.
- */
-#ifndef DEFINE_UNALIGNED_TYPE
-
-/*
- * Although memcpy() may seem inefficient, it *usually* gets optimized
- * appropriately by modern compilers.  It's portable and may be the best we can
- * do for a fallback...
- */
-#include <string.h>
-
-/*
- * Portable fallback: generates load_<type>_unaligned() and
- * store_<type>_unaligned(), both of which move the value with memcpy().
- */
-#define DEFINE_UNALIGNED_TYPE(type)				\
-								\
-static forceinline type						\
-load_##type##_unaligned(const void *p)				\
-{								\
-	type value;						\
-								\
-	memcpy(&value, p, sizeof(type));			\
-	return value;						\
-}								\
-								\
-static forceinline void						\
-store_##type##_unaligned(type v, void *p)			\
-{								\
-	memcpy(p, &v, sizeof(type));				\
-}
-
-#endif /* !DEFINE_UNALIGNED_TYPE */
-
-/* ========================================================================== */
-/*                             Bit scan functions                             */
-/* ========================================================================== */
-
-/*
- * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
- * significant end) of the *most* significant 1 bit in the input value.  The
- * input value must be nonzero!
- */
-
-#ifndef bsr32
-/* Portable fallback: count how many right shifts it takes to empty 'n'. */
-static forceinline unsigned
-bsr32(u32 n)
-{
-	unsigned index = 0;
-
-	for (n >>= 1; n != 0; n >>= 1)
-		index++;
-	return index;
-}
-#endif
-
-#ifndef bsr64
-/* Portable fallback: count how many right shifts it takes to empty 'n'. */
-static forceinline unsigned
-bsr64(u64 n)
-{
-	unsigned index = 0;
-
-	for (n >>= 1; n != 0; n >>= 1)
-		index++;
-	return index;
-}
-#endif
-
-/* Bit scan reverse on a machine word: picks bsr32() or bsr64() by word width. */
-static forceinline unsigned
-bsrw(machine_word_t n)
-{
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-	return (WORDBITS == 64) ? bsr64(n) : bsr32(n);
-}
-
-/*
- * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
- * significant end) of the *least* significant 1 bit in the input value.  The
- * input value must be nonzero!
- */
-
-#ifndef bsf32
-/* Portable fallback: shift right until bit 0 is set, counting the shifts. */
-static forceinline unsigned
-bsf32(u32 n)
-{
-	unsigned index;
-
-	for (index = 0; (n & 1) == 0; n >>= 1)
-		index++;
-	return index;
-}
-#endif
-
-#ifndef bsf64
-/* Portable fallback: shift right until bit 0 is set, counting the shifts. */
-static forceinline unsigned
-bsf64(u64 n)
-{
-	unsigned index;
-
-	for (index = 0; (n & 1) == 0; n >>= 1)
-		index++;
-	return index;
-}
-#endif
-
-/* Bit scan forward on a machine word: picks bsf32() or bsf64() by word width. */
-static forceinline unsigned
-bsfw(machine_word_t n)
-{
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-	return (WORDBITS == 64) ? bsf64(n) : bsf32(n);
-}
-
-#endif /* COMMON_COMMON_DEFS_H */
diff --git a/ext/libdeflate/common/compiler_gcc.h b/ext/libdeflate/common/compiler_gcc.h
deleted file mode 100644
index 17ca18cd..00000000
--- a/ext/libdeflate/common/compiler_gcc.h
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * compiler_gcc.h - definitions for the GNU C Compiler.  This also handles clang
- * and the Intel C Compiler (icc).
- *
- * TODO: icc is not well tested, so some features are currently disabled even
- * though they could probably be enabled on some icc versions.
- */
-
-#if !defined(__clang__) && !defined(__INTEL_COMPILER)
-#  define GCC_PREREQ(major, minor)		\
-	(__GNUC__ > (major) ||			\
-	 (__GNUC__ == (major) && __GNUC_MINOR__ >= (minor)))
-#else
-#  define GCC_PREREQ(major, minor)	0
-#endif
-
-/* Note: only check the clang version when absolutely necessary!
- * "Vendors" such as Apple can use different version numbers. */
-#ifdef __clang__
-#  ifdef __apple_build_version__
-#    define CLANG_PREREQ(major, minor, apple_version)	\
-	(__apple_build_version__ >= (apple_version))
-#  else
-#    define CLANG_PREREQ(major, minor, apple_version)	\
-	(__clang_major__ > (major) ||			\
-	 (__clang_major__ == (major) && __clang_minor__ >= (minor)))
-#  endif
-#else
-#  define CLANG_PREREQ(major, minor, apple_version)	0
-#endif
-
-#ifndef __has_attribute
-#  define __has_attribute(attribute)	0
-#endif
-#ifndef __has_feature
-#  define __has_feature(feature)	0
-#endif
-#ifndef __has_builtin
-#  define __has_builtin(builtin)	0
-#endif
-
-#ifdef _WIN32
-#  define LIBEXPORT __declspec(dllexport)
-#else
-#  define LIBEXPORT __attribute__((visibility("default")))
-#endif
-
-#define inline			inline
-#define forceinline		inline __attribute__((always_inline))
-#define restrict		__restrict__
-#define likely(expr)		__builtin_expect(!!(expr), 1)
-#define unlikely(expr)		__builtin_expect(!!(expr), 0)
-#define prefetchr(addr)		__builtin_prefetch((addr), 0)
-#define prefetchw(addr)		__builtin_prefetch((addr), 1)
-#define _aligned_attribute(n)	__attribute__((aligned(n)))
-
-#define COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE	\
-	(GCC_PREREQ(4, 4) || __has_attribute(target))
-
-#if COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-
-#  if defined(__i386__) || defined(__x86_64__)
-
-#    define COMPILER_SUPPORTS_PCLMUL_TARGET	\
-	(GCC_PREREQ(4, 4) || __has_builtin(__builtin_ia32_pclmulqdq128))
-
-#    define COMPILER_SUPPORTS_AVX_TARGET	\
-	(GCC_PREREQ(4, 6) || __has_builtin(__builtin_ia32_maxps256))
-
-#    define COMPILER_SUPPORTS_BMI2_TARGET	\
-	(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_pdep_di))
-
-#    define COMPILER_SUPPORTS_AVX2_TARGET	\
-	(GCC_PREREQ(4, 7) || __has_builtin(__builtin_ia32_psadbw256))
-
-#    define COMPILER_SUPPORTS_AVX512BW_TARGET	\
-	(GCC_PREREQ(5, 1) || __has_builtin(__builtin_ia32_psadbw512))
-
-	/*
-	 * Prior to gcc 4.9 (r200349) and clang 3.8 (r239883), x86 intrinsics
-	 * not available in the main target could not be used in 'target'
-	 * attribute functions.  Unfortunately clang has no feature test macro
-	 * for this so we have to check its version.
-	 */
-#    if GCC_PREREQ(4, 9) || CLANG_PREREQ(3, 8, 7030000)
-#      define COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS	1
-#      define COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS	\
-		COMPILER_SUPPORTS_PCLMUL_TARGET
-#      define COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS	\
-		COMPILER_SUPPORTS_AVX2_TARGET
-#      define COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS	\
-		COMPILER_SUPPORTS_AVX512BW_TARGET
-#    endif
-#  elif (defined(__arm__) && defined(__ARM_FP)) || defined(__aarch64__)
-	/* arm: including arm_neon.h requires hardware fp support */
-
-	/*
-	 * Prior to gcc 6.1 (r230411 for arm, r226563 for aarch64), NEON
-	 * and crypto intrinsics not available in the main target could not be
-	 * used in 'target' attribute functions.
-	 *
-	 * clang as of 5.0.1 still doesn't allow it.  But, it does seem to allow
-	 * the pmull intrinsics if only __ARM_NEON is enabled.
-	 */
-#    define COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS	GCC_PREREQ(6, 1)
-#    ifdef __ARM_NEON
-#      define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS	\
-		(GCC_PREREQ(6, 1) || __has_builtin(__builtin_neon_vmull_p64))
-#    else
-#      define COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS	\
-		(GCC_PREREQ(6, 1))
-#    endif
-#  endif
-#endif /* COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE */
-
-/* Newer gcc supports __BYTE_ORDER__.  Older gcc doesn't. */
-#ifdef __BYTE_ORDER__
-#  define CPU_IS_LITTLE_ENDIAN() (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
-#endif
-
-#if GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16)
-#  define bswap16	__builtin_bswap16
-#endif
-
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32)
-#  define bswap32	__builtin_bswap32
-#endif
-
-#if GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64)
-#  define bswap64	__builtin_bswap64
-#endif
-
-#if defined(__x86_64__) || defined(__i386__) || defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__)
-#  define UNALIGNED_ACCESS_IS_FAST 1
-#endif
-
-/* With gcc, we can access unaligned memory through 'packed' structures. */
-/*
- * DEFINE_UNALIGNED_TYPE(type) - emit 'struct <type>unaligned', a packed
- * one-member wrapper, plus load_<type>_unaligned() and
- * store_<type>_unaligned(), which read and write 'type' through that wrapper.
- */
-#define DEFINE_UNALIGNED_TYPE(type)					\
-									\
-struct type##unaligned {						\
-	type v;								\
-} __attribute__((packed));						\
-									\
-static forceinline type							\
-load_##type##_unaligned(const void *p)					\
-{									\
-	const struct type##unaligned *src =				\
-		(const struct type##unaligned *)p;			\
-									\
-	return src->v;							\
-}									\
-									\
-static forceinline void							\
-store_##type##_unaligned(type v, void *p)				\
-{									\
-	struct type##unaligned *dst = (struct type##unaligned *)p;	\
-									\
-	dst->v = v;							\
-}
-
-#define bsr32(n)	(31 - __builtin_clz(n))
-#define bsr64(n)	(63 - __builtin_clzll(n))
-#define bsf32(n)	__builtin_ctz(n)
-#define bsf64(n)	__builtin_ctzll(n)
diff --git a/ext/libdeflate/common/compiler_msc.h b/ext/libdeflate/common/compiler_msc.h
deleted file mode 100644
index 0b40d3fc..00000000
--- a/ext/libdeflate/common/compiler_msc.h
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * compiler_msc.h - definitions for the Microsoft C Compiler
- */
-
-#define LIBEXPORT	__declspec(dllexport)
-
-/*
- * Old versions (e.g. VS2010) of MSC don't have the C99 header stdbool.h.
- * Beware: the below replacement isn't fully standard, since normally any value
- * != 0 should be implicitly cast to a bool with value 1... but that doesn't
- * happen if bool is really just an 'int'.
- */
-typedef int bool;
-#define true 1
-#define false 0
-#define __bool_true_false_are_defined 1
-
-/* Define ssize_t */
-#ifdef _WIN64
-typedef long long ssize_t;
-#else
-typedef int ssize_t;
-#endif
-
-/*
- * Old versions (e.g. VS2010) of MSC have stdint.h but not the C99 header
- * inttypes.h.  Work around this by defining the PRI* macros ourselves.
- */
-#include <stdint.h>
-#define PRIu8  "hhu"
-#define PRIu16 "hu"
-#define PRIu32 "u"
-#define PRIu64 "llu"
-#define PRIi8  "hhi"
-#define PRIi16 "hi"
-#define PRIi32 "i"
-#define PRIi64 "lli"
-#define PRIx8  "hhx"
-#define PRIx16 "hx"
-#define PRIx32 "x"
-#define PRIx64 "llx"
-
-/* Assume a little endian architecture with fast unaligned access */
-#define CPU_IS_LITTLE_ENDIAN()		1
-#define UNALIGNED_ACCESS_IS_FAST	1
-
-/* __restrict has nonstandard behavior; don't use it */
-#define restrict
-
-/* ... but we can use __inline and __forceinline */
-#define inline		__inline
-#define forceinline	__forceinline
-
-/* Byte swap functions */
-#include <stdlib.h>
-#define bswap16	_byteswap_ushort
-#define bswap32	_byteswap_ulong
-#define bswap64	_byteswap_uint64
-
-/* Bit scan functions (32-bit) */
-
-static forceinline unsigned
-bsr32(uint32_t n)
-{
-	_BitScanReverse(&n, n);
-	return n;
-}
-#define bsr32 bsr32
-
-static forceinline unsigned
-bsf32(uint32_t n)
-{
-	_BitScanForward(&n, n);
-	return n;
-}
-#define bsf32 bsf32
-
-#ifdef _M_X64 /* Bit scan functions (64-bit) */
-
-static forceinline unsigned
-bsr64(uint64_t n)
-{
-	_BitScanReverse64(&n, n);
-	return n;
-}
-#define bsr64 bsr64
-
-static forceinline unsigned
-bsf64(uint64_t n)
-{
-	_BitScanForward64(&n, n);
-	return n;
-}
-#define bsf64 bsf64
-
-#endif /* _M_X64 */
diff --git a/ext/libdeflate/lib/adler32.c b/ext/libdeflate/lib/adler32.c
deleted file mode 100644
index 185575a2..00000000
--- a/ext/libdeflate/lib/adler32.c
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * adler32.c - Adler-32 checksum algorithm
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "lib_common.h"
-#include "libdeflate.h"
-
-/* The Adler-32 divisor, or "base", value. */
-#define DIVISOR 65521
-
-/*
- * MAX_CHUNK_SIZE is the most bytes that can be processed without the
- * possibility of s2 overflowing when it is represented as an unsigned 32-bit
- * integer.  This value was computed using the following Python script:
- *
- *	divisor = 65521
- *	count = 0
- *	s1 = divisor - 1
- *	s2 = divisor - 1
- *	while True:
- *		s1 += 0xFF
- *		s2 += s1
- *		if s2 > 0xFFFFFFFF:
- *			break
- *		count += 1
- *	print(count)
- *
- * Note that to get the correct worst-case value, we must assume that every byte
- * has value 0xFF and that s1 and s2 started with the highest possible values
- * modulo the divisor.
- */
-#define MAX_CHUNK_SIZE	5552
-
-typedef u32 (*adler32_func_t)(u32, const u8 *, size_t);
-
-/* Include architecture-specific implementations if available */
-#undef DEFAULT_IMPL
-#undef DISPATCH
-#if defined(__arm__) || defined(__aarch64__)
-#  include "arm/adler32_impl.h"
-#elif defined(__i386__) || defined(__x86_64__)
-#  include "x86/adler32_impl.h"
-#endif
-
-/* Define a generic implementation if needed */
-#ifndef DEFAULT_IMPL
-#define DEFAULT_IMPL adler32_generic
-static u32 adler32_generic(u32 adler, const u8 *p, size_t size)
-{
-	u32 s1 = adler & 0xFFFF;
-	u32 s2 = adler >> 16;
-	const u8 * const end = p + size;
-
-	while (p != end) {
-		size_t chunk_size = MIN(end - p, MAX_CHUNK_SIZE);
-		const u8 *chunk_end = p + chunk_size;
-		size_t num_unrolled_iterations = chunk_size / 4;
-
-		while (num_unrolled_iterations--) {
-			s1 += *p++;
-			s2 += s1;
-			s1 += *p++;
-			s2 += s1;
-			s1 += *p++;
-			s2 += s1;
-			s1 += *p++;
-			s2 += s1;
-		}
-		while (p != chunk_end) {
-			s1 += *p++;
-			s2 += s1;
-		}
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	return (s2 << 16) | s1;
-}
-#endif /* !DEFAULT_IMPL */
-
-#ifdef DISPATCH
-static u32 dispatch(u32, const u8 *, size_t);
-
-static volatile adler32_func_t adler32_impl = dispatch;
-
-/* Choose the fastest implementation at runtime */
-static u32 dispatch(u32 adler, const u8 *buffer, size_t size)
-{
-	adler32_func_t f = arch_select_adler32_func();
-
-	if (f == NULL)
-		f = DEFAULT_IMPL;
-
-	adler32_impl = f;
-	return adler32_impl(adler, buffer, size);
-}
-#else
-#  define adler32_impl DEFAULT_IMPL /* only one implementation, use it */
-#endif
-
-LIBDEFLATEAPI u32
-libdeflate_adler32(u32 adler, const void *buffer, size_t size)
-{
-	if (buffer == NULL) /* return initial value */
-		return 1;
-	return adler32_impl(adler, buffer, size);
-}
diff --git a/ext/libdeflate/lib/adler32_vec_template.h b/ext/libdeflate/lib/adler32_vec_template.h
deleted file mode 100644
index 4eb8c2a8..00000000
--- a/ext/libdeflate/lib/adler32_vec_template.h
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * adler32_vec_template.h - template for vectorized Adler-32 implementations
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This file contains a template for vectorized Adler-32 implementations.
- *
- * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
- * implementation looks something like this:
- *
- *	do {
- *		s1 += *p;
- *		s2 += s1;
- *	} while (++p != chunk_end);
- *
- * For vectorized calculation of s1, we only need to sum the input bytes.  They
- * can be accumulated into multiple counters which are eventually summed
- * together.
- *
- * For vectorized calculation of s2, the basic idea is that for each iteration
- * that processes N bytes, we can perform the following vectorizable
- * calculation:
- *
- *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
- *
- * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
- * separate counters, then do the multiplications by N...1 just once at the end
- * rather than once per iteration.
- *
- * Also, we must account for how previous bytes will affect s2 by doing the
- * following at beginning of each iteration:
- *
- *	s2 += s1 * N
- *
- * Furthermore, like s1, "s2" can actually be multiple counters which are
- * eventually summed together.
- */
-
-static u32 ATTRIBUTES
-FUNCNAME(u32 adler, const u8 *p, size_t size)
-{
-	u32 s1 = adler & 0xFFFF;
-	u32 s2 = adler >> 16;
-	const u8 * const end = p + size;
-	const u8 *vend;
-	const size_t max_chunk_size =
-		MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
-		(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
-		 IMPL_SEGMENT_SIZE);
-
-	/* Process a byte at a time until the needed alignment is reached */
-	if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	/*
-	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
-	 * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
-	 * overflow before being reduced modulo DIVISOR.  For vector processing,
-	 * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
-	 * may be further limited to IMPL_MAX_CHUNK_SIZE.
-	 */
-	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
-	vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
-	while (p != vend) {
-		size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
-
-		s2 += s1 * chunk_size;
-
-		FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
-			       &s1, &s2);
-
-		p += chunk_size;
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	/* Process any remaining bytes */
-	if (p != end) {
-		do {
-			s1 += *p++;
-			s2 += s1;
-		} while (p != end);
-		s1 %= DIVISOR;
-		s2 %= DIVISOR;
-	}
-
-	return (s2 << 16) | s1;
-}
-
-#undef FUNCNAME
-#undef FUNCNAME_CHUNK
-#undef ATTRIBUTES
-#undef IMPL_ALIGNMENT
-#undef IMPL_SEGMENT_SIZE
-#undef IMPL_MAX_CHUNK_SIZE
diff --git a/ext/libdeflate/lib/aligned_malloc.c b/ext/libdeflate/lib/aligned_malloc.c
deleted file mode 100644
index e714dc79..00000000
--- a/ext/libdeflate/lib/aligned_malloc.c
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * aligned_malloc.c - aligned memory allocation
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This file provides portable aligned memory allocation functions that only
- * use malloc() and free().  This avoids portability problems with
- * posix_memalign(), aligned_alloc(), etc.
- */
-
-#include <stdlib.h>
-
-#include "aligned_malloc.h"
-
-void *
-aligned_malloc(size_t alignment, size_t size)
-{
-	void *ptr = malloc(sizeof(void *) + alignment - 1 + size);
-	if (ptr) {
-		void *orig_ptr = ptr;
-		ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment);
-		((void **)ptr)[-1] = orig_ptr;
-	}
-	return ptr;
-}
-
-void
-aligned_free(void *ptr)
-{
-	if (ptr)
-		free(((void **)ptr)[-1]);
-}
diff --git a/ext/libdeflate/lib/aligned_malloc.h b/ext/libdeflate/lib/aligned_malloc.h
deleted file mode 100644
index ee6e7b89..00000000
--- a/ext/libdeflate/lib/aligned_malloc.h
+++ /dev/null
@@ -1,13 +0,0 @@
-/*
- * aligned_malloc.c - aligned memory allocation
- */
-
-#ifndef LIB_ALIGNED_MALLOC_H
-#define LIB_ALIGNED_MALLOC_H
-
-#include "lib_common.h"
-
-extern void *aligned_malloc(size_t alignment, size_t size);
-extern void aligned_free(void *ptr);
-
-#endif /* LIB_ALIGNED_MALLOC_H */
diff --git a/ext/libdeflate/lib/arm/adler32_impl.h b/ext/libdeflate/lib/arm/adler32_impl.h
deleted file mode 100644
index de52d81c..00000000
--- a/ext/libdeflate/lib/arm/adler32_impl.h
+++ /dev/null
@@ -1,120 +0,0 @@
-/*
- * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "cpu_features.h"
-
-/* NEON implementation */
-#undef DISPATCH_NEON
-#if !defined(DEFAULT_IMPL) &&	\
-	(defined(__ARM_NEON) || (ARM_CPU_FEATURES_ENABLED &&	\
-				 COMPILER_SUPPORTS_NEON_TARGET_INTRINSICS))
-#  define FUNCNAME		adler32_neon
-#  define FUNCNAME_CHUNK	adler32_neon_chunk
-#  define IMPL_ALIGNMENT	16
-#  define IMPL_SEGMENT_SIZE	32
-/* Prevent unsigned overflow of the 16-bit precision byte counters */
-#  define IMPL_MAX_CHUNK_SIZE	(32 * (0xFFFF / 0xFF))
-#  ifdef __ARM_NEON
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	adler32_neon
-#  else
-#    ifdef __arm__
-#      define ATTRIBUTES	__attribute__((target("fpu=neon")))
-#    else
-#      define ATTRIBUTES	__attribute__((target("+simd")))
-#    endif
-#    define DISPATCH		1
-#    define DISPATCH_NEON	1
-#  endif
-#  include <arm_neon.h>
-static forceinline ATTRIBUTES void
-adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end,
-		   u32 *s1, u32 *s2)
-{
-	uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };
-	uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, 0 };
-	uint16x8_t v_byte_sums_a = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-	uint16x8_t v_byte_sums_b = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-	uint16x8_t v_byte_sums_c = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-	uint16x8_t v_byte_sums_d = (uint16x8_t) { 0, 0, 0, 0, 0, 0, 0, 0 };
-
-	do {
-		const uint8x16_t bytes1 = *p++;
-		const uint8x16_t bytes2 = *p++;
-		uint16x8_t tmp;
-
-		v_s2 += v_s1;
-
-		/* Vector Pairwise Add Long (u8 => u16) */
-		tmp = vpaddlq_u8(bytes1);
-
-		/* Vector Pairwise Add and Accumulate Long (u8 => u16) */
-		tmp = vpadalq_u8(tmp, bytes2);
-
-		/* Vector Pairwise Add and Accumulate Long (u16 => u32) */
-		v_s1 = vpadalq_u16(v_s1, tmp);
-
-		/* Vector Add Wide (u8 => u16) */
-		v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1));
-		v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1));
-		v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2));
-		v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2));
-
-	} while (p != end);
-
-	/* Vector Shift Left (u32) */
-	v_s2 = vqshlq_n_u32(v_s2, 5);
-
-	/* Vector Multiply Accumulate Long (u16 => u32) */
-	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a),  (uint16x4_t) { 32, 31, 30, 29 });
-	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_a), (uint16x4_t) { 28, 27, 26, 25 });
-	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b),  (uint16x4_t) { 24, 23, 22, 21 });
-	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_b), (uint16x4_t) { 20, 19, 18, 17 });
-	v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c),  (uint16x4_t) { 16, 15, 14, 13 });
-	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_c), (uint16x4_t) { 12, 11, 10,  9 });
-	v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_byte_sums_d), (uint16x4_t) {  8,  7,  6,  5 });
-	v_s2 = vmlal_u16(v_s2, vget_high_u16(v_byte_sums_d), (uint16x4_t) {  4,  3,  2,  1 });
-
-	*s1 += v_s1[0] + v_s1[1] + v_s1[2] + v_s1[3];
-	*s2 += v_s2[0] + v_s2[1] + v_s2[2] + v_s2[3];
-}
-#  include "../adler32_vec_template.h"
-#endif /* NEON implementation */
-
-#ifdef DISPATCH
-static inline adler32_func_t
-arch_select_adler32_func(void)
-{
-	u32 features = get_cpu_features();
-
-#ifdef DISPATCH_NEON
-	if (features & ARM_CPU_FEATURE_NEON)
-		return adler32_neon;
-#endif
-	return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/arm/cpu_features.c b/ext/libdeflate/lib/arm/cpu_features.c
deleted file mode 100644
index 8d1facc1..00000000
--- a/ext/libdeflate/lib/arm/cpu_features.c
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * arm/cpu_features.c - feature detection for ARM processors
- *
- * Copyright 2018 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * ARM processors don't have a standard way for unprivileged programs to detect
- * processor features.  But, on Linux we can read the AT_HWCAP and AT_HWCAP2
- * values from /proc/self/auxv.
- *
- * Ideally we'd use the C library function getauxval(), but it's not guaranteed
- * to be available: it was only added to glibc in 2.16, and in Android it was
- * added to API level 18 for ARM and level 21 for AArch64.
- */
-
-#include "cpu_features.h"
-
-#if ARM_CPU_FEATURES_ENABLED
-
-#include <errno.h>
-#include <fcntl.h>
-#include <string.h>
-#include <unistd.h>
-
-#define AT_HWCAP	16
-#define AT_HWCAP2	26
-
-volatile u32 _cpu_features = 0;
-
-static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2)
-{
-	int fd;
-	unsigned long auxbuf[32];
-	int filled = 0;
-	int i;
-
-	fd = open("/proc/self/auxv", O_RDONLY);
-	if (fd < 0)
-		return;
-
-	for (;;) {
-		do {
-			int ret = read(fd, &((char *)auxbuf)[filled],
-				       sizeof(auxbuf) - filled);
-			if (ret <= 0) {
-				if (ret < 0 && errno == EINTR)
-					continue;
-				goto out;
-			}
-			filled += ret;
-		} while (filled < 2 * sizeof(long));
-
-		i = 0;
-		do {
-			unsigned long type = auxbuf[i];
-			unsigned long value = auxbuf[i + 1];
-
-			if (type == AT_HWCAP)
-				*hwcap = value;
-			else if (type == AT_HWCAP2)
-				*hwcap2 = value;
-			i += 2;
-			filled -= 2 * sizeof(long);
-		} while (filled >= 2 * sizeof(long));
-
-		memmove(auxbuf, &auxbuf[i], filled);
-	}
-out:
-	close(fd);
-}
-
-void setup_cpu_features(void)
-{
-	u32 features = 0;
-	unsigned long hwcap = 0;
-	unsigned long hwcap2 = 0;
-
-	scan_auxv(&hwcap, &hwcap2);
-
-#ifdef __arm__
-	STATIC_ASSERT(sizeof(long) == 4);
-	if (hwcap & (1 << 12))	/* HWCAP_NEON */
-		features |= ARM_CPU_FEATURE_NEON;
-	if (hwcap2 & (1 << 1))	/* HWCAP2_PMULL */
-		features |= ARM_CPU_FEATURE_PMULL;
-#else
-	STATIC_ASSERT(sizeof(long) == 8);
-	if (hwcap & (1 << 1))	/* HWCAP_ASIMD */
-		features |= ARM_CPU_FEATURE_NEON;
-	if (hwcap & (1 << 4))	/* HWCAP_PMULL */
-		features |= ARM_CPU_FEATURE_PMULL;
-#endif
-
-	_cpu_features = features | ARM_CPU_FEATURES_KNOWN;
-}
-
-#endif /* ARM_CPU_FEATURES_ENABLED */
diff --git a/ext/libdeflate/lib/arm/cpu_features.h b/ext/libdeflate/lib/arm/cpu_features.h
deleted file mode 100644
index 390d96c5..00000000
--- a/ext/libdeflate/lib/arm/cpu_features.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * arm/cpu_features.h - feature detection for ARM processors
- */
-
-#ifndef LIB_ARM_CPU_FEATURES_H
-#define LIB_ARM_CPU_FEATURES_H
-
-#include "../lib_common.h"
-
-#if (defined(__arm__) || defined(__aarch64__)) && \
-	defined(__linux__) && COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-#  define ARM_CPU_FEATURES_ENABLED 1
-#else
-#  define ARM_CPU_FEATURES_ENABLED 0
-#endif
-
-#if ARM_CPU_FEATURES_ENABLED
-
-#define ARM_CPU_FEATURE_NEON		0x00000001
-#define ARM_CPU_FEATURE_PMULL		0x00000002
-
-#define ARM_CPU_FEATURES_KNOWN		0x80000000
-
-extern volatile u32 _cpu_features;
-
-extern void setup_cpu_features(void);
-
-static inline u32 get_cpu_features(void)
-{
-	if (_cpu_features == 0)
-		setup_cpu_features();
-	return _cpu_features;
-}
-
-#endif /* ARM_CPU_FEATURES_ENABLED */
-
-#endif /* LIB_ARM_CPU_FEATURES_H */
diff --git a/ext/libdeflate/lib/arm/crc32_impl.h b/ext/libdeflate/lib/arm/crc32_impl.h
deleted file mode 100644
index e64616ff..00000000
--- a/ext/libdeflate/lib/arm/crc32_impl.h
+++ /dev/null
@@ -1,166 +0,0 @@
-/*
- * arm/crc32_impl.h
- *
- * Copyright 2017 Jun He <jun.he@linaro.org>
- * Copyright 2018 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "cpu_features.h"
-
-/*
- * CRC-32 folding with ARM Crypto extension-PMULL
- *
- * This works the same way as the x86 PCLMUL version.
- * See x86/crc32_pclmul_template.h for an explanation.
- */
-#undef DISPATCH_PMULL
-#if (defined(__ARM_FEATURE_CRYPTO) ||	\
-     (ARM_CPU_FEATURES_ENABLED &&	\
-      COMPILER_SUPPORTS_PMULL_TARGET_INTRINSICS)) && \
-      /* not yet tested on big endian, probably needs changes to work there */ \
-    (defined(__BYTE_ORDER__) && __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) && \
-      /* clang as of v5.0.1 doesn't allow pmull intrinsics in 32-bit mode, even
-       * when compiling with -mfpu=crypto-neon-fp-armv8 */ \
-    !(defined(__clang__) && defined(__arm__))
-#  define FUNCNAME		crc32_pmull
-#  define FUNCNAME_ALIGNED	crc32_pmull_aligned
-#  ifdef __ARM_FEATURE_CRYPTO
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	crc32_pmull
-#  else
-#    ifdef __arm__
-#      define ATTRIBUTES	__attribute__((target("fpu=crypto-neon-fp-armv8")))
-#    else
-#      ifdef __clang__
-#        define ATTRIBUTES	__attribute__((target("crypto")))
-#      else
-#        define ATTRIBUTES	__attribute__((target("+crypto")))
-#      endif
-#    endif
-#    define DISPATCH		1
-#    define DISPATCH_PMULL	1
-#  endif
-
-#include <arm_neon.h>
-
-static forceinline ATTRIBUTES uint8x16_t
-clmul_00(uint8x16_t a, uint8x16_t b)
-{
-	return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
-				     (poly64_t)vget_low_u8(b));
-}
-
-static forceinline ATTRIBUTES uint8x16_t
-clmul_10(uint8x16_t a, uint8x16_t b)
-{
-	return (uint8x16_t)vmull_p64((poly64_t)vget_low_u8(a),
-				     (poly64_t)vget_high_u8(b));
-}
-
-static forceinline ATTRIBUTES uint8x16_t
-clmul_11(uint8x16_t a, uint8x16_t b)
-{
-	return (uint8x16_t)vmull_high_p64((poly64x2_t)a, (poly64x2_t)b);
-}
-
-static forceinline ATTRIBUTES uint8x16_t
-fold_128b(uint8x16_t dst, uint8x16_t src, uint8x16_t multipliers)
-{
-	return dst ^ clmul_00(src, multipliers) ^ clmul_11(src, multipliers);
-}
-
-static forceinline ATTRIBUTES u32
-crc32_pmull_aligned(u32 remainder, const uint8x16_t *p, size_t nr_segs)
-{
-	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
-	const uint8x16_t multipliers_4 =
-		(uint8x16_t)(uint64x2_t){ 0x8F352D95, 0x1D9513D7 };
-	const uint8x16_t multipliers_1 =
-		(uint8x16_t)(uint64x2_t){ 0xAE689191, 0xCCAA009E };
-	const uint8x16_t final_multiplier =
-		(uint8x16_t)(uint64x2_t){ 0xB8BC6765 };
-	const uint8x16_t mask32 = (uint8x16_t)(uint32x4_t){ 0xFFFFFFFF };
-	const uint8x16_t barrett_reduction_constants =
-			(uint8x16_t)(uint64x2_t){ 0x00000001F7011641,
-						  0x00000001DB710641 };
-	const uint8x16_t zeroes = (uint8x16_t){ 0 };
-
-	const uint8x16_t * const end = p + nr_segs;
-	const uint8x16_t * const end512 = p + (nr_segs & ~3);
-	uint8x16_t x0, x1, x2, x3;
-
-	x0 = *p++ ^ (uint8x16_t)(uint32x4_t){ remainder };
-	if (nr_segs >= 4) {
-		x1 = *p++;
-		x2 = *p++;
-		x3 = *p++;
-
-		/* Fold 512 bits at a time */
-		while (p != end512) {
-			x0 = fold_128b(*p++, x0, multipliers_4);
-			x1 = fold_128b(*p++, x1, multipliers_4);
-			x2 = fold_128b(*p++, x2, multipliers_4);
-			x3 = fold_128b(*p++, x3, multipliers_4);
-		}
-
-		/* Fold 512 bits => 128 bits */
-		x1 = fold_128b(x1, x0, multipliers_1);
-		x2 = fold_128b(x2, x1, multipliers_1);
-		x0 = fold_128b(x3, x2, multipliers_1);
-	}
-
-	/* Fold 128 bits at a time */
-	while (p != end)
-		x0 = fold_128b(*p++, x0, multipliers_1);
-
-	/* Fold 128 => 96 bits, implicitly appending 32 zeroes */
-	x0 = vextq_u8(x0, zeroes, 8) ^ clmul_10(x0, multipliers_1);
-
-	/* Fold 96 => 64 bits */
-	x0 = vextq_u8(x0, zeroes, 4) ^ clmul_00(x0 & mask32, final_multiplier);
-
-	/* Reduce 64 => 32 bits using Barrett reduction */
-	x1 = x0;
-	x0 = clmul_00(x0 & mask32, barrett_reduction_constants);
-	x0 = clmul_10(x0 & mask32, barrett_reduction_constants);
-	return vgetq_lane_u32((uint32x4_t)(x0 ^ x1), 1);
-}
-#define IMPL_ALIGNMENT		16
-#define IMPL_SEGMENT_SIZE	16
-#include "../crc32_vec_template.h"
-#endif /* PMULL implementation */
-
-#ifdef DISPATCH
-static inline crc32_func_t
-arch_select_crc32_func(void)
-{
-	u32 features = get_cpu_features();
-
-#ifdef DISPATCH_PMULL
-	if (features & ARM_CPU_FEATURE_PMULL)
-		return crc32_pmull;
-#endif
-	return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/arm/matchfinder_impl.h b/ext/libdeflate/lib/arm/matchfinder_impl.h
deleted file mode 100644
index aa1a0c72..00000000
--- a/ext/libdeflate/lib/arm/matchfinder_impl.h
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * arm/matchfinder_impl.h - ARM implementations of matchfinder functions
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef __ARM_NEON
-#  if MATCHFINDER_ALIGNMENT < 16
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 16
-#  endif
-#  include <arm_neon.h>
-static forceinline bool
-matchfinder_init_neon(mf_pos_t *data, size_t size)
-{
-	int16x8_t v, *p;
-	size_t n;
-
-	if (size % (sizeof(int16x8_t) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = (int16x8_t) {
-		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-		MATCHFINDER_INITVAL, MATCHFINDER_INITVAL,
-	};
-	p = (int16x8_t *)data;
-	n = size / (sizeof(int16x8_t) * 4);
-	do {
-		p[0] = v;
-		p[1] = v;
-		p[2] = v;
-		p[3] = v;
-		p += 4;
-	} while (--n);
-	return true;
-}
-#undef arch_matchfinder_init
-#define arch_matchfinder_init matchfinder_init_neon
-
-static forceinline bool
-matchfinder_rebase_neon(mf_pos_t *data, size_t size)
-{
-	int16x8_t v, *p;
-	size_t n;
-
-	if (size % (sizeof(int16x8_t) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = (int16x8_t) {
-		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-		(u16)-MATCHFINDER_WINDOW_SIZE, (u16)-MATCHFINDER_WINDOW_SIZE,
-	};
-	p = (int16x8_t *)data;
-	n = size / (sizeof(int16x8_t) * 4);
-	do {
-		p[0] = vqaddq_s16(p[0], v);
-		p[1] = vqaddq_s16(p[1], v);
-		p[2] = vqaddq_s16(p[2], v);
-		p[3] = vqaddq_s16(p[3], v);
-		p += 4;
-	} while (--n);
-	return true;
-}
-#undef arch_matchfinder_rebase
-#define arch_matchfinder_rebase matchfinder_rebase_neon
-
-#endif /* __ARM_NEON */
diff --git a/ext/libdeflate/lib/bt_matchfinder.h b/ext/libdeflate/lib/bt_matchfinder.h
deleted file mode 100644
index 49fc0bf4..00000000
--- a/ext/libdeflate/lib/bt_matchfinder.h
+++ /dev/null
@@ -1,355 +0,0 @@
-/*
- * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ----------------------------------------------------------------------------
- *
- * This is a Binary Trees (bt) based matchfinder.
- *
- * The main data structure is a hash table where each hash bucket contains a
- * binary tree of sequences whose first 4 bytes share the same hash code.  Each
- * sequence is identified by its starting position in the input buffer.  Each
- * binary tree is always sorted such that each left child represents a sequence
- * lexicographically lesser than its parent and each right child represents a
- * sequence lexicographically greater than its parent.
- *
- * The algorithm processes the input buffer sequentially.  At each byte
- * position, the hash code of the first 4 bytes of the sequence beginning at
- * that position (the sequence being matched against) is computed.  This
- * identifies the hash bucket to use for that position.  Then, a new binary tree
- * node is created to represent the current sequence.  Then, in a single tree
- * traversal, the hash bucket's binary tree is searched for matches and is
- * re-rooted at the new node.
- *
- * Compared to the simpler algorithm that uses linked lists instead of binary
- * trees (see hc_matchfinder.h), the binary tree version gains more information
- * at each node visitation.  Ideally, the binary tree version will examine only
- * 'log(n)' nodes to find the same matches that the linked list version will
- * find by examining 'n' nodes.  In addition, the binary tree version can
- * examine fewer bytes at each node by taking advantage of the common prefixes
- * that result from the sort order, whereas the linked list version may have to
- * examine up to the full length of the match at each node.
- *
- * However, it is not always best to use the binary tree version.  It requires
- * nearly twice as much memory as the linked list version, and it takes time to
- * keep the binary trees sorted, even at positions where the compressor does not
- * need matches.  Generally, when doing fast compression on small buffers,
- * binary trees are the wrong approach.  They are best suited for thorough
- * compression and/or large buffers.
- *
- * ----------------------------------------------------------------------------
- */
-
-
-#include "matchfinder_common.h"
-
-#define BT_MATCHFINDER_HASH3_ORDER 16
-#define BT_MATCHFINDER_HASH3_WAYS  2
-#define BT_MATCHFINDER_HASH4_ORDER 16
-
-#define BT_MATCHFINDER_TOTAL_HASH_LENGTH		\
-	((1UL << BT_MATCHFINDER_HASH3_ORDER) * BT_MATCHFINDER_HASH3_WAYS + \
-	 (1UL << BT_MATCHFINDER_HASH4_ORDER))
-
-/* Representation of a match found by the bt_matchfinder  */
-struct lz_match {
-
-	/* The number of bytes matched.  */
-	u16 length;
-
-	/* The offset back from the current position that was matched.  */
-	u16 offset;
-};
-
-struct bt_matchfinder {
-
-	/* The hash table for finding length 3 matches  */
-	mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS];
-
-	/* The hash table which contains the roots of the binary trees for
-	 * finding length 4+ matches  */
-	mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER];
-
-	/* The child node references for the binary trees.  The left and right
-	 * children of the node for the sequence with position 'pos' are
-	 * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively.  */
-	mf_pos_t child_tab[2UL * MATCHFINDER_WINDOW_SIZE];
-
-}
-#ifdef _aligned_attribute
-_aligned_attribute(MATCHFINDER_ALIGNMENT)
-#endif
-;
-
-/* Prepare the matchfinder for a new input buffer.  */
-static forceinline void
-bt_matchfinder_init(struct bt_matchfinder *mf)
-{
-	matchfinder_init((mf_pos_t *)mf, BT_MATCHFINDER_TOTAL_HASH_LENGTH);
-}
-
-static forceinline void
-bt_matchfinder_slide_window(struct bt_matchfinder *mf)
-{
-	matchfinder_rebase((mf_pos_t *)mf,
-			   sizeof(struct bt_matchfinder) / sizeof(mf_pos_t));
-}
-
-static forceinline mf_pos_t *
-bt_left_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 0];
-}
-
-static forceinline mf_pos_t *
-bt_right_child(struct bt_matchfinder *mf, s32 node)
-{
-	return &mf->child_tab[2 * (node & (MATCHFINDER_WINDOW_SIZE - 1)) + 1];
-}
-
-/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches()
- * and bt_matchfinder_skip_position().  There must be sufficiently many bytes
- * remaining to load a 32-bit integer from the *next* position.  */
-#define BT_MATCHFINDER_REQUIRED_NBYTES	5
-
-/* Advance the binary tree matchfinder by one byte, optionally recording
- * matches.  @record_matches should be a compile-time constant.  */
-static forceinline struct lz_match *
-bt_matchfinder_advance_one_byte(struct bt_matchfinder * const restrict mf,
-				const u8 * const restrict in_base,
-				const ptrdiff_t cur_pos,
-				const u32 max_len,
-				const u32 nice_len,
-				const u32 max_search_depth,
-				u32 * const restrict next_hashes,
-				u32 * const restrict best_len_ret,
-				struct lz_match * restrict lz_matchptr,
-				const bool record_matches)
-{
-	const u8 *in_next = in_base + cur_pos;
-	u32 depth_remaining = max_search_depth;
-	const s32 cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
-	u32 next_hashseq;
-	u32 hash3;
-	u32 hash4;
-	s32 cur_node;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	s32 cur_node_2;
-#endif
-	const u8 *matchptr;
-	mf_pos_t *pending_lt_ptr, *pending_gt_ptr;
-	u32 best_lt_len, best_gt_len;
-	u32 len;
-	u32 best_len = 3;
-
-	STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 &&
-		      BT_MATCHFINDER_HASH3_WAYS <= 2);
-
-	next_hashseq = get_unaligned_le32(in_next + 1);
-
-	hash3 = next_hashes[0];
-	hash4 = next_hashes[1];
-
-	next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER);
-	prefetchw(&mf->hash3_tab[next_hashes[0]]);
-	prefetchw(&mf->hash4_tab[next_hashes[1]]);
-
-	cur_node = mf->hash3_tab[hash3][0];
-	mf->hash3_tab[hash3][0] = cur_pos;
-#if BT_MATCHFINDER_HASH3_WAYS >= 2
-	cur_node_2 = mf->hash3_tab[hash3][1];
-	mf->hash3_tab[hash3][1] = cur_node;
-#endif
-	if (record_matches && cur_node > cutoff) {
-		u32 seq3 = load_u24_unaligned(in_next);
-		if (seq3 == load_u24_unaligned(&in_base[cur_node])) {
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node];
-			lz_matchptr++;
-		}
-	#if BT_MATCHFINDER_HASH3_WAYS >= 2
-		else if (cur_node_2 > cutoff &&
-			seq3 == load_u24_unaligned(&in_base[cur_node_2]))
-		{
-			lz_matchptr->length = 3;
-			lz_matchptr->offset = in_next - &in_base[cur_node_2];
-			lz_matchptr++;
-		}
-	#endif
-	}
-
-	cur_node = mf->hash4_tab[hash4];
-	mf->hash4_tab[hash4] = cur_pos;
-
-	pending_lt_ptr = bt_left_child(mf, cur_pos);
-	pending_gt_ptr = bt_right_child(mf, cur_pos);
-
-	if (cur_node <= cutoff) {
-		*pending_lt_ptr = MATCHFINDER_INITVAL;
-		*pending_gt_ptr = MATCHFINDER_INITVAL;
-		*best_len_ret = best_len;
-		return lz_matchptr;
-	}
-
-	best_lt_len = 0;
-	best_gt_len = 0;
-	len = 0;
-
-	for (;;) {
-		matchptr = &in_base[cur_node];
-
-		if (matchptr[len] == in_next[len]) {
-			len = lz_extend(in_next, matchptr, len + 1, max_len);
-			if (!record_matches || len > best_len) {
-				if (record_matches) {
-					best_len = len;
-					lz_matchptr->length = len;
-					lz_matchptr->offset = in_next - matchptr;
-					lz_matchptr++;
-				}
-				if (len >= nice_len) {
-					*pending_lt_ptr = *bt_left_child(mf, cur_node);
-					*pending_gt_ptr = *bt_right_child(mf, cur_node);
-					*best_len_ret = best_len;
-					return lz_matchptr;
-				}
-			}
-		}
-
-		if (matchptr[len] < in_next[len]) {
-			*pending_lt_ptr = cur_node;
-			pending_lt_ptr = bt_right_child(mf, cur_node);
-			cur_node = *pending_lt_ptr;
-			best_lt_len = len;
-			if (best_gt_len < len)
-				len = best_gt_len;
-		} else {
-			*pending_gt_ptr = cur_node;
-			pending_gt_ptr = bt_left_child(mf, cur_node);
-			cur_node = *pending_gt_ptr;
-			best_gt_len = len;
-			if (best_lt_len < len)
-				len = best_lt_len;
-		}
-
-		if (cur_node <= cutoff || !--depth_remaining) {
-			*pending_lt_ptr = MATCHFINDER_INITVAL;
-			*pending_gt_ptr = MATCHFINDER_INITVAL;
-			*best_len_ret = best_len;
-			return lz_matchptr;
-		}
-	}
-}
-
-/*
- * Retrieve a list of matches with the current position.
- *
- * @mf
- *	The matchfinder structure.
- * @in_base
- *	Pointer to the next byte in the input buffer to process _at the last
- *	time bt_matchfinder_init() or bt_matchfinder_slide_window() was called_.
- * @cur_pos
- *	The current position in the input buffer relative to @in_base (the
- *	position of the sequence being matched against).
- * @max_len
- *	The maximum permissible match length at this position.  Must be >=
- *	BT_MATCHFINDER_REQUIRED_NBYTES.
- * @nice_len
- *	Stop searching if a match of at least this length is found.
- *	Must be <= @max_len.
- * @max_search_depth
- *	Limit on the number of potential matches to consider.  Must be >= 1.
- * @next_hashes
- *	The precomputed hash codes for the sequence beginning at @in_next.
- *	These will be used and then updated with the precomputed hashcodes for
- *	the sequence beginning at @in_next + 1.
- * @best_len_ret
- *	If a match of length >= 4 was found, then the length of the longest such
- *	match is written here; otherwise 3 is written here.  (Note: this is
- *	redundant with the 'struct lz_match' array, but this is easier for the
- *	compiler to optimize when inlined and the caller immediately does a
- *	check against 'best_len'.)
- * @lz_matchptr
- *	An array in which this function will record the matches.  The recorded
- *	matches will be sorted by strictly increasing length and (non-strictly)
- *	increasing offset.  The maximum number of matches that may be found is
- *	'nice_len - 2'.
- *
- * The return value is a pointer to the next available slot in the @lz_matchptr
- * array.  (If no matches were found, this will be the same as @lz_matchptr.)
- */
-static forceinline struct lz_match *
-bt_matchfinder_get_matches(struct bt_matchfinder *mf,
-			   const u8 *in_base,
-			   ptrdiff_t cur_pos,
-			   u32 max_len,
-			   u32 nice_len,
-			   u32 max_search_depth,
-			   u32 next_hashes[2],
-			   u32 *best_len_ret,
-			   struct lz_match *lz_matchptr)
-{
-	return bt_matchfinder_advance_one_byte(mf,
-					       in_base,
-					       cur_pos,
-					       max_len,
-					       nice_len,
-					       max_search_depth,
-					       next_hashes,
-					       best_len_ret,
-					       lz_matchptr,
-					       true);
-}
-
-/*
- * Advance the matchfinder, but don't record any matches.
- *
- * This is very similar to bt_matchfinder_get_matches() because both functions
- * must do hashing and tree re-rooting.
- */
-static forceinline void
-bt_matchfinder_skip_position(struct bt_matchfinder *mf,
-			     const u8 *in_base,
-			     ptrdiff_t cur_pos,
-			     u32 nice_len,
-			     u32 max_search_depth,
-			     u32 next_hashes[2])
-{
-	u32 best_len;
-	bt_matchfinder_advance_one_byte(mf,
-					in_base,
-					cur_pos,
-					nice_len,
-					nice_len,
-					max_search_depth,
-					next_hashes,
-					&best_len,
-					NULL,
-					false);
-}
diff --git a/ext/libdeflate/lib/crc32.c b/ext/libdeflate/lib/crc32.c
deleted file mode 100644
index 129149a1..00000000
--- a/ext/libdeflate/lib/crc32.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*
- * crc32.c - CRC-32 checksum algorithm for the gzip format
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * High-level description of CRC
- * =============================
- *
- * Consider a bit sequence 'bits[1...len]'.  Interpret 'bits' as the "message"
- * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2),
- * where the coefficient of 'x^i' is 'bits[len - i]'.  Then, compute:
- *
- *			R(x) = M(x)*x^n mod G(x)
- *
- * where G(x) is a selected "generator" polynomial of degree 'n'.  The remainder
- * R(x) is a polynomial of max degree 'n - 1'.  The CRC of 'bits' is R(x)
- * interpreted as a bitstring of length 'n'.
- *
- * CRC used in gzip
- * ================
- *
- * In the gzip format (RFC 1952):
- *
- *	- The bitstring to checksum is formed from the bytes of the uncompressed
- *	  data by concatenating the bits from the bytes in order, proceeding
- *	  from the low-order bit to the high-order bit within each byte.
- *
- *	- The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 +
- *	  x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1.
- *	  Consequently, the CRC length is 32 bits ("CRC-32").
- *
- *	- The highest order 32 coefficients of M(x)*x^n are inverted.
- *
- *	- All 32 coefficients of R(x) are inverted.
- *
- * The two inversions cause added leading and trailing zero bits to affect the
- * resulting CRC, whereas with a regular CRC such bits would have no effect on
- * the CRC.
- *
- * Computation and optimizations
- * =============================
- *
- * We can compute R(x) through "long division", maintaining only 32 bits of
- * state at any given time.  Multiplication by 'x' can be implemented as
- * right-shifting by 1 (assuming the polynomial<=>bitstring mapping where the
- * highest order bit represents the coefficient of x^0), and both addition and
- * subtraction can be implemented as bitwise exclusive OR (since we are working
- * in GF(2)).  Here is an unoptimized implementation:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t size)
- *	{
- *		u32 remainder = 0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < size * 8 + 32; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			if (i < size * 8)
- *				bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			else
- *				bit = 0; // one of the 32 appended 0 bits
- *
- *			if (i < 32) // the first 32 bits are inverted
- *				bit ^= 1;
- *
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *
- *			remainder >>= 1;
- *			remainder |= (u32)bit << 31;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * In this implementation, the 32-bit integer 'remainder' maintains the
- * remainder of the currently processed portion of the message (with 32 zero
- * bits appended) when divided by the generator polynomial.  'remainder' is the
- * representation of R(x), and 'divisor' is the representation of G(x) excluding
- * the x^32 coefficient.  For each bit to process, we multiply R(x) by 'x^1',
- * then add 'x^0' if the new bit is a 1.  If this causes R(x) to gain a nonzero
- * x^32 term, then we subtract G(x) from R(x).
- *
- * We can speed this up by taking advantage of the fact that XOR is commutative
- * and associative, so the order in which we combine the inputs into 'remainder'
- * is unimportant.  And since each message bit we add doesn't affect the choice
- * of 'multiple' until 32 bits later, we need not actually add each message bit
- * until that point:
- *
- *	static u32 crc32_gzip(const u8 *buffer, size_t size)
- *	{
- *		u32 remainder = ~0;
- *		const u32 divisor = 0xEDB88320;
- *
- *		for (size_t i = 0; i < size * 8; i++) {
- *			int bit;
- *			u32 multiple;
- *
- *			bit = (buffer[i / 8] >> (i % 8)) & 1;
- *			remainder ^= bit;
- *			if (remainder & 1)
- *				multiple = divisor;
- *			else
- *				multiple = 0;
- *			remainder >>= 1;
- *			remainder ^= multiple;
- *		}
- *
- *		return ~remainder;
- *	}
- *
- * With the above implementation we get the effect of 32 appended 0 bits for
- * free; they never affect the choice of a divisor, nor would they change the
- * value of 'remainder' if they were to be actually XOR'ed in.  And by starting
- * with a remainder of all 1 bits, we get the effect of complementing the first
- * 32 message bits.
- *
- * The next optimization is to process the input in multi-bit units.  Suppose
- * that we insert the next 'n' message bits into the remainder.  Then we get an
- * intermediate remainder of length '32 + n' bits, and the CRC of the extra 'n'
- * bits is the amount by which the low 32 bits of the remainder will change as a
- * result of cancelling out those 'n' bits.  Taking n=8 (one byte) and
- * precomputing a table containing the CRC of each possible byte, we get
- * crc32_slice1() defined below.
- *
- * As a further optimization, we could increase the multi-bit unit size to 16.
- * However, that is inefficient because the table size explodes from 256 entries
- * (1024 bytes) to 65536 entries (262144 bytes), which wastes memory and won't
- * fit in L1 cache on typical processors.
- *
- * However, we can actually process 4 bytes at a time using 4 different tables
- * with 256 entries each.  Logically, we form a 64-bit intermediate remainder
- * and cancel out the high 32 bits in 8-bit chunks.  Bits 32-39 are cancelled
- * out by the CRC of those bits, whereas bits 40-47 are be cancelled out by the
- * CRC of those bits with 8 zero bits appended, and so on.  This method is
- * implemented in crc32_slice4(), defined below.
- *
- * In crc32_slice8(), this method is extended to 8 bytes at a time.  The
- * intermediate remainder (which we never actually store explicitly) is 96 bits.
- *
- * On CPUs that support fast carryless multiplication, CRCs can be computed even
- * more quickly via "folding".  See e.g. the x86 PCLMUL implementation.
- */
-
-#include "lib_common.h"
-#include "libdeflate.h"
-
-typedef u32 (*crc32_func_t)(u32, const u8 *, size_t);
-
-/* Include architecture-specific implementations if available */
-#undef CRC32_SLICE1
-#undef CRC32_SLICE4
-#undef CRC32_SLICE8
-#undef DEFAULT_IMPL
-#undef DISPATCH
-#if defined(__arm__) || defined(__aarch64__)
-#  include "arm/crc32_impl.h"
-#elif defined(__i386__) || defined(__x86_64__)
-#  include "x86/crc32_impl.h"
-#endif
-
-/*
- * Define a generic implementation (crc32_slice8()) if needed.  crc32_slice1()
- * may also be needed as a fallback for architecture-specific implementations.
- */
-
-#ifndef DEFAULT_IMPL
-#  define CRC32_SLICE8	1
-#  define DEFAULT_IMPL	crc32_slice8
-#endif
-
-#if defined(CRC32_SLICE1) || defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
-#include "crc32_table.h"
-static forceinline u32
-crc32_update_byte(u32 remainder, u8 next_byte)
-{
-	return (remainder >> 8) ^ crc32_table[(u8)remainder ^ next_byte];
-}
-#endif
-
-#ifdef CRC32_SLICE1
-static u32
-crc32_slice1(u32 remainder, const u8 *buffer, size_t size)
-{
-	size_t i;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x100);
-
-	for (i = 0; i < size; i++)
-		remainder = crc32_update_byte(remainder, buffer[i]);
-	return remainder;
-}
-#endif /* CRC32_SLICE1 */
-
-#ifdef CRC32_SLICE4
-static u32
-crc32_slice4(u32 remainder, const u8 *buffer, size_t size)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + size;
-	const u8 *end32;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x400);
-
-	for (; ((uintptr_t)p & 3) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	end32 = p + ((end - p) & ~3);
-	for (; p != end32; p += 4) {
-		u32 v = le32_bswap(*(const u32 *)p);
-		remainder =
-		    crc32_table[0x300 + (u8)((remainder ^ v) >>  0)] ^
-		    crc32_table[0x200 + (u8)((remainder ^ v) >>  8)] ^
-		    crc32_table[0x100 + (u8)((remainder ^ v) >> 16)] ^
-		    crc32_table[0x000 + (u8)((remainder ^ v) >> 24)];
-	}
-
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	return remainder;
-}
-#endif /* CRC32_SLICE4 */
-
-#ifdef CRC32_SLICE8
-static u32
-crc32_slice8(u32 remainder, const u8 *buffer, size_t size)
-{
-	const u8 *p = buffer;
-	const u8 *end = buffer + size;
-	const u8 *end64;
-
-	STATIC_ASSERT(ARRAY_LEN(crc32_table) >= 0x800);
-
-	for (; ((uintptr_t)p & 7) && p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	end64 = p + ((end - p) & ~7);
-	for (; p != end64; p += 8) {
-		u32 v1 = le32_bswap(*(const u32 *)(p + 0));
-		u32 v2 = le32_bswap(*(const u32 *)(p + 4));
-		remainder =
-		    crc32_table[0x700 + (u8)((remainder ^ v1) >>  0)] ^
-		    crc32_table[0x600 + (u8)((remainder ^ v1) >>  8)] ^
-		    crc32_table[0x500 + (u8)((remainder ^ v1) >> 16)] ^
-		    crc32_table[0x400 + (u8)((remainder ^ v1) >> 24)] ^
-		    crc32_table[0x300 + (u8)(v2 >>  0)] ^
-		    crc32_table[0x200 + (u8)(v2 >>  8)] ^
-		    crc32_table[0x100 + (u8)(v2 >> 16)] ^
-		    crc32_table[0x000 + (u8)(v2 >> 24)];
-	}
-
-	for (; p != end; p++)
-		remainder = crc32_update_byte(remainder, *p);
-
-	return remainder;
-}
-#endif /* CRC32_SLICE8 */
-
-#ifdef DISPATCH
-static u32 dispatch(u32, const u8 *, size_t);
-
-static volatile crc32_func_t crc32_impl = dispatch;
-
-/* Choose the fastest implementation at runtime */
-static u32 dispatch(u32 remainder, const u8 *buffer, size_t size)
-{
-	crc32_func_t f = arch_select_crc32_func();
-
-	if (f == NULL)
-		f = DEFAULT_IMPL;
-
-	crc32_impl = f;
-	return crc32_impl(remainder, buffer, size);
-}
-#else
-#  define crc32_impl DEFAULT_IMPL /* only one implementation, use it */
-#endif
-
-LIBDEFLATEAPI u32
-libdeflate_crc32(u32 remainder, const void *buffer, size_t size)
-{
-	if (buffer == NULL) /* return initial value */
-		return 0;
-	return ~crc32_impl(~remainder, buffer, size);
-}
diff --git a/ext/libdeflate/lib/crc32_table.h b/ext/libdeflate/lib/crc32_table.h
deleted file mode 100644
index 05421b98..00000000
--- a/ext/libdeflate/lib/crc32_table.h
+++ /dev/null
@@ -1,526 +0,0 @@
-/*
- * crc32_table.h - data table to accelerate CRC-32 computation
- *
- * THIS FILE WAS AUTOMATICALLY GENERATED BY gen_crc32_table.c.  DO NOT EDIT.
- */
-
-#include <stdint.h>
-
-static const uint32_t crc32_table[] = {
-	0x00000000, 0x77073096, 0xee0e612c, 0x990951ba,
-	0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3,
-	0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
-	0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91,
-	0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de,
-	0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
-	0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec,
-	0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5,
-	0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
-	0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b,
-	0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940,
-	0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
-	0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116,
-	0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f,
-	0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
-	0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d,
-	0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a,
-	0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
-	0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818,
-	0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01,
-	0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
-	0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457,
-	0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c,
-	0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
-	0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2,
-	0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb,
-	0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
-	0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9,
-	0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086,
-	0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
-	0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4,
-	0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad,
-	0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
-	0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683,
-	0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8,
-	0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
-	0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe,
-	0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7,
-	0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
-	0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5,
-	0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252,
-	0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
-	0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60,
-	0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79,
-	0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
-	0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f,
-	0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04,
-	0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
-	0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a,
-	0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713,
-	0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
-	0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21,
-	0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e,
-	0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
-	0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c,
-	0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45,
-	0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
-	0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db,
-	0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0,
-	0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
-	0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6,
-	0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf,
-	0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
-	0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d,
-#if defined(CRC32_SLICE4) || defined(CRC32_SLICE8)
-	0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3,
-	0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7,
-	0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb,
-	0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf,
-	0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192,
-	0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496,
-	0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a,
-	0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e,
-	0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761,
-	0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265,
-	0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69,
-	0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d,
-	0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530,
-	0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034,
-	0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38,
-	0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c,
-	0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6,
-	0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2,
-	0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce,
-	0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca,
-	0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97,
-	0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93,
-	0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f,
-	0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b,
-	0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864,
-	0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60,
-	0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c,
-	0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768,
-	0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35,
-	0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31,
-	0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d,
-	0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539,
-	0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88,
-	0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c,
-	0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180,
-	0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484,
-	0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9,
-	0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd,
-	0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1,
-	0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5,
-	0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a,
-	0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e,
-	0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522,
-	0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026,
-	0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b,
-	0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f,
-	0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773,
-	0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277,
-	0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d,
-	0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189,
-	0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85,
-	0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81,
-	0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc,
-	0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8,
-	0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4,
-	0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0,
-	0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f,
-	0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b,
-	0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27,
-	0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23,
-	0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e,
-	0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a,
-	0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876,
-	0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72,
-	0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59,
-	0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685,
-	0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1,
-	0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d,
-	0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29,
-	0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5,
-	0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91,
-	0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d,
-	0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9,
-	0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065,
-	0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901,
-	0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd,
-	0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9,
-	0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315,
-	0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71,
-	0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad,
-	0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399,
-	0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45,
-	0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221,
-	0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd,
-	0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9,
-	0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835,
-	0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151,
-	0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d,
-	0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579,
-	0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5,
-	0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1,
-	0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d,
-	0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609,
-	0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5,
-	0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1,
-	0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d,
-	0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9,
-	0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05,
-	0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461,
-	0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd,
-	0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9,
-	0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75,
-	0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711,
-	0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd,
-	0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339,
-	0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5,
-	0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281,
-	0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d,
-	0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049,
-	0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895,
-	0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1,
-	0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d,
-	0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819,
-	0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5,
-	0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1,
-	0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d,
-	0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69,
-	0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5,
-	0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1,
-	0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d,
-	0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9,
-	0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625,
-	0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41,
-	0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d,
-	0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89,
-	0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555,
-	0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31,
-	0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed,
-	0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee,
-	0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9,
-	0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701,
-	0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056,
-	0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871,
-	0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26,
-	0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e,
-	0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9,
-	0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0,
-	0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787,
-	0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f,
-	0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68,
-	0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f,
-	0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018,
-	0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0,
-	0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7,
-	0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3,
-	0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084,
-	0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c,
-	0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b,
-	0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c,
-	0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b,
-	0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3,
-	0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4,
-	0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed,
-	0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba,
-	0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002,
-	0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755,
-	0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72,
-	0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825,
-	0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d,
-	0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca,
-	0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5,
-	0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82,
-	0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a,
-	0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d,
-	0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a,
-	0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d,
-	0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5,
-	0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2,
-	0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb,
-	0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc,
-	0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04,
-	0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953,
-	0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174,
-	0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623,
-	0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b,
-	0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc,
-	0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8,
-	0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf,
-	0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907,
-	0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50,
-	0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677,
-	0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120,
-	0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98,
-	0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf,
-	0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6,
-	0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981,
-	0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639,
-	0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e,
-	0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949,
-	0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e,
-	0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6,
-	0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1,
-#endif /* CRC32_SLICE4 || CRC32_SLICE8 */
-#if defined(CRC32_SLICE8)
-	0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0,
-	0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10,
-	0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111,
-	0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1,
-	0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52,
-	0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92,
-	0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693,
-	0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053,
-	0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4,
-	0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314,
-	0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15,
-	0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5,
-	0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256,
-	0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496,
-	0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997,
-	0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57,
-	0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299,
-	0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459,
-	0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958,
-	0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98,
-	0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b,
-	0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db,
-	0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda,
-	0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a,
-	0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d,
-	0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d,
-	0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c,
-	0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c,
-	0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f,
-	0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf,
-	0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de,
-	0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e,
-	0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42,
-	0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82,
-	0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183,
-	0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743,
-	0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0,
-	0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00,
-	0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601,
-	0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1,
-	0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546,
-	0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386,
-	0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87,
-	0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847,
-	0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4,
-	0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404,
-	0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905,
-	0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5,
-	0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b,
-	0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb,
-	0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca,
-	0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a,
-	0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589,
-	0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349,
-	0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48,
-	0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888,
-	0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f,
-	0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf,
-	0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce,
-	0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e,
-	0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d,
-	0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d,
-	0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c,
-	0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c,
-	0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae,
-	0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8,
-	0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3,
-	0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5,
-	0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035,
-	0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223,
-	0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258,
-	0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e,
-	0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798,
-	0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e,
-	0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5,
-	0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3,
-	0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503,
-	0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715,
-	0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e,
-	0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578,
-	0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2,
-	0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4,
-	0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf,
-	0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9,
-	0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59,
-	0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f,
-	0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834,
-	0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22,
-	0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4,
-	0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2,
-	0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99,
-	0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f,
-	0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f,
-	0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79,
-	0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02,
-	0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14,
-	0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676,
-	0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460,
-	0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b,
-	0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d,
-	0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed,
-	0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb,
-	0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680,
-	0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496,
-	0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340,
-	0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156,
-	0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d,
-	0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b,
-	0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db,
-	0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd,
-	0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6,
-	0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0,
-	0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a,
-	0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c,
-	0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77,
-	0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61,
-	0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81,
-	0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97,
-	0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec,
-	0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa,
-	0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c,
-	0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a,
-	0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41,
-	0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957,
-	0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7,
-	0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1,
-	0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da,
-	0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc,
-	0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d,
-	0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e,
-	0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa,
-	0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9,
-	0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653,
-	0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240,
-	0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834,
-	0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27,
-	0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301,
-	0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712,
-	0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66,
-	0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975,
-	0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf,
-	0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc,
-	0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8,
-	0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb,
-	0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4,
-	0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7,
-	0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183,
-	0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590,
-	0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a,
-	0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739,
-	0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d,
-	0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e,
-	0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678,
-	0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b,
-	0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f,
-	0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c,
-	0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6,
-	0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5,
-	0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1,
-	0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2,
-	0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f,
-	0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c,
-	0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08,
-	0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b,
-	0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1,
-	0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2,
-	0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6,
-	0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5,
-	0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3,
-	0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0,
-	0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794,
-	0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387,
-	0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d,
-	0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e,
-	0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a,
-	0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49,
-	0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516,
-	0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105,
-	0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71,
-	0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62,
-	0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8,
-	0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb,
-	0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf,
-	0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac,
-	0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a,
-	0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899,
-	0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed,
-	0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe,
-	0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044,
-	0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457,
-	0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23,
-	0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30,
-	0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3,
-	0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919,
-	0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56,
-	0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac,
-	0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8,
-	0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832,
-	0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d,
-	0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387,
-	0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5,
-	0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f,
-	0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00,
-	0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa,
-	0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e,
-	0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64,
-	0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b,
-	0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1,
-	0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e,
-	0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4,
-	0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb,
-	0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041,
-	0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425,
-	0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf,
-	0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90,
-	0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a,
-	0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758,
-	0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2,
-	0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced,
-	0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217,
-	0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673,
-	0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889,
-	0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6,
-	0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c,
-	0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239,
-	0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3,
-	0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c,
-	0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776,
-	0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312,
-	0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8,
-	0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7,
-	0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d,
-	0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f,
-	0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95,
-	0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda,
-	0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520,
-	0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144,
-	0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe,
-	0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1,
-	0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b,
-	0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4,
-	0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e,
-	0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61,
-	0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b,
-	0x061d761c, 0xcab77682, 0x44387161, 0x889271ff,
-	0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05,
-	0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a,
-	0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0,
-	0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282,
-	0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78,
-	0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937,
-	0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd,
-	0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9,
-	0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53,
-	0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c,
-	0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6,
-#endif /* CRC32_SLICE8 */
-};
diff --git a/ext/libdeflate/lib/crc32_vec_template.h b/ext/libdeflate/lib/crc32_vec_template.h
deleted file mode 100644
index 9a2ad5bd..00000000
--- a/ext/libdeflate/lib/crc32_vec_template.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * crc32_vec_template.h - template for vectorized CRC-32 implementations
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#define CRC32_SLICE1	1
-static u32 crc32_slice1(u32, const u8 *, size_t);
-
-/*
- * Template for vectorized CRC-32 implementations.
- *
- * Note: on unaligned ends of the buffer, we fall back to crc32_slice1() instead
- * of crc32_slice8() because only a few bytes need to be processed, so a smaller
- * table is preferable.
- */
-static u32 ATTRIBUTES
-FUNCNAME(u32 remainder, const u8 *p, size_t size)
-{
-	if ((uintptr_t)p % IMPL_ALIGNMENT) {
-		size_t n = MIN(size, -(uintptr_t)p % IMPL_ALIGNMENT);
-
-		remainder = crc32_slice1(remainder, p, n);
-		p += n;
-		size -= n;
-	}
-	if (size >= IMPL_SEGMENT_SIZE) {
-		remainder = FUNCNAME_ALIGNED(remainder, (const void *)p,
-					     size / IMPL_SEGMENT_SIZE);
-		p += size - (size % IMPL_SEGMENT_SIZE);
-		size %= IMPL_SEGMENT_SIZE;
-	}
-	return crc32_slice1(remainder, p, size);
-}
-
-#undef FUNCNAME
-#undef FUNCNAME_ALIGNED
-#undef ATTRIBUTES
-#undef IMPL_ALIGNMENT
-#undef IMPL_SEGMENT_SIZE
diff --git a/ext/libdeflate/lib/decompress_template.h b/ext/libdeflate/lib/decompress_template.h
deleted file mode 100644
index c6bcf9f5..00000000
--- a/ext/libdeflate/lib/decompress_template.h
+++ /dev/null
@@ -1,421 +0,0 @@
-/*
- * decompress_template.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/*
- * This is the actual DEFLATE decompression routine, lifted out of
- * deflate_decompress.c so that it can be compiled multiple times with different
- * target instruction sets.
- */
-
-static enum libdeflate_result ATTRIBUTES
-FUNCNAME(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
-{
-	u8 *out_next = out;	/* next output byte to write */
-	u8 * const out_end = out_next + out_nbytes_avail;
-	const u8 *in_next = in;	/* next input byte to read */
-	const u8 * const in_end = in_next + in_nbytes;
-	bitbuf_t bitbuf = 0;
-	unsigned bitsleft = 0;
-	size_t overrun_count = 0;
-	unsigned i;
-	unsigned is_final_block;
-	unsigned block_type;
-	u16 len;
-	u16 nlen;
-	unsigned num_litlen_syms;
-	unsigned num_offset_syms;
-	u16 tmp16;	/* tmp16/tmp32: not used directly in this body; presumably */
-	u32 tmp32;	/* scratch for the input-reading macros -- TODO confirm */
-
-next_block:
-	/* Starting to read the next block.  */
-	;
-
-	STATIC_ASSERT(CAN_ENSURE(1 + 2 + 5 + 5 + 4));
-	ENSURE_BITS(1 + 2 + 5 + 5 + 4);
-
-	/* BFINAL: 1 bit  */
-	is_final_block = POP_BITS(1);
-
-	/* BTYPE: 2 bits  */
-	block_type = POP_BITS(2);
-
-	if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) {
-
-		/* Dynamic Huffman block.  */
-
-		/* The order in which precode lengths are stored.  */
-		static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
-			16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
-		};
-
-		unsigned num_explicit_precode_lens;
-
-		/* Read the codeword length counts.  */
-
-		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == ((1 << 5) - 1) + 257);
-		num_litlen_syms = POP_BITS(5) + 257;
-
-		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == ((1 << 5) - 1) + 1);
-		num_offset_syms = POP_BITS(5) + 1;
-
-		STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == ((1 << 4) - 1) + 4);
-		num_explicit_precode_lens = POP_BITS(4) + 4;
-
-		d->static_codes_loaded = false;
-
-		/* Read the precode codeword lengths.  */
-		STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1);
-		for (i = 0; i < num_explicit_precode_lens; i++) {
-			ENSURE_BITS(3);
-			d->u.precode_lens[deflate_precode_lens_permutation[i]] = POP_BITS(3);
-		}
-
-		for (; i < DEFLATE_NUM_PRECODE_SYMS; i++)
-			d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0;
-
-		/* Build the decode table for the precode.  */
-		SAFETY_CHECK(build_precode_decode_table(d));
-
-		/* Expand the literal/length and offset codeword lengths.  */
-		for (i = 0; i < num_litlen_syms + num_offset_syms; ) {
-			u32 entry;
-			unsigned presym;
-			u8 rep_val;
-			unsigned rep_count;
-
-			ENSURE_BITS(DEFLATE_MAX_PRE_CODEWORD_LEN + 7);
-
-			/* (The code below assumes that the precode decode table
-			 * does not have any subtables.)  */
-			STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN);
-
-			/* Read the next precode symbol.  */
-			entry = d->u.l.precode_decode_table[BITS(DEFLATE_MAX_PRE_CODEWORD_LEN)];
-			REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
-			presym = entry >> HUFFDEC_RESULT_SHIFT;
-
-			if (presym < 16) {
-				/* Explicit codeword length  */
-				d->u.l.lens[i++] = presym;
-				continue;
-			}
-
-			/* Run-length encoded codeword lengths  */
-
-			/* Note: we don't need to verify that the repeat count
-			 * doesn't overflow the number of elements, since we
-			 * have enough extra spaces to allow for the worst-case
-			 * overflow (138 zeroes when only 1 length was
-			 * remaining).
-			 *
-			 * In the case of the small repeat counts (presyms 16
-			 * and 17), it is fastest to always write the maximum
-			 * number of entries.  That gets rid of branches that
-			 * would otherwise be required.
-			 *
-			 * It is not just because of the numerical order that
-			 * our checks go in the order 'presym < 16', 'presym ==
-			 * 16', and 'presym == 17'.  For typical data this is
-			 * ordered from most frequent to least frequent case.
-			 */
-			STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1);
-
-			if (presym == 16) {
-				/* Repeat the previous length 3 - 6 times  */
-				SAFETY_CHECK(i != 0);
-				rep_val = d->u.l.lens[i - 1];
-				STATIC_ASSERT(3 + ((1 << 2) - 1) == 6);
-				rep_count = 3 + POP_BITS(2);
-				d->u.l.lens[i + 0] = rep_val;
-				d->u.l.lens[i + 1] = rep_val;
-				d->u.l.lens[i + 2] = rep_val;
-				d->u.l.lens[i + 3] = rep_val;
-				d->u.l.lens[i + 4] = rep_val;
-				d->u.l.lens[i + 5] = rep_val;
-				i += rep_count;
-			} else if (presym == 17) {
-				/* Repeat zero 3 - 10 times  */
-				STATIC_ASSERT(3 + ((1 << 3) - 1) == 10);
-				rep_count = 3 + POP_BITS(3);
-				d->u.l.lens[i + 0] = 0;
-				d->u.l.lens[i + 1] = 0;
-				d->u.l.lens[i + 2] = 0;
-				d->u.l.lens[i + 3] = 0;
-				d->u.l.lens[i + 4] = 0;
-				d->u.l.lens[i + 5] = 0;
-				d->u.l.lens[i + 6] = 0;
-				d->u.l.lens[i + 7] = 0;
-				d->u.l.lens[i + 8] = 0;
-				d->u.l.lens[i + 9] = 0;
-				i += rep_count;
-			} else {
-				/* Repeat zero 11 - 138 times  */
-				STATIC_ASSERT(11 + ((1 << 7) - 1) == 138);
-				rep_count = 11 + POP_BITS(7);
-				memset(&d->u.l.lens[i], 0,
-				       rep_count * sizeof(d->u.l.lens[i]));
-				i += rep_count;
-			}
-		}
-	} else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
-
-		/* Uncompressed block: copy 'len' bytes literally from the input
-		 * buffer to the output buffer.  */
-
-		ALIGN_INPUT();
-
-		SAFETY_CHECK(in_end - in_next >= 4);
-
-		len = READ_U16();
-		nlen = READ_U16();
-
-		SAFETY_CHECK(len == (u16)~nlen);
-		if (unlikely(len > out_end - out_next))
-			return LIBDEFLATE_INSUFFICIENT_SPACE;
-		SAFETY_CHECK(len <= in_end - in_next);
-
-		memcpy(out_next, in_next, len);
-		in_next += len;
-		out_next += len;
-
-		goto block_done;
-
-	} else {
-		SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN);
-
-		/*
-		 * Static Huffman block: build the decode tables for the static
-		 * codes.  Skip doing so if the tables are already set up from
-		 * an earlier static block; this speeds up decompression of
-		 * degenerate input of many empty or very short static blocks.
-		 *
-		 * Afterwards, the remainder is the same as decompressing a
-		 * dynamic Huffman block.
-		 */
-
-		if (d->static_codes_loaded)
-			goto have_decode_tables;
-
-		d->static_codes_loaded = true;
-
-		STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288);
-		STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32);
-
-		for (i = 0; i < 144; i++)
-			d->u.l.lens[i] = 8;
-		for (; i < 256; i++)
-			d->u.l.lens[i] = 9;
-		for (; i < 280; i++)
-			d->u.l.lens[i] = 7;
-		for (; i < 288; i++)
-			d->u.l.lens[i] = 8;
-
-		for (; i < 288 + 32; i++)
-			d->u.l.lens[i] = 5;
-
-		num_litlen_syms = 288;
-		num_offset_syms = 32;
-	}
-
-	/* Decompressing a Huffman block (either dynamic or static)  */
-
-	SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms));
-	SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms));
-have_decode_tables:
-
-	/* The main DEFLATE decode loop  */
-	for (;;) {
-		u32 entry;
-		u32 length;
-		u32 offset;
-		const u8 *src;
-		u8 *dst;
-
-		/* Decode a litlen symbol.  */
-		ENSURE_BITS(DEFLATE_MAX_LITLEN_CODEWORD_LEN);
-		entry = d->u.litlen_decode_table[BITS(LITLEN_TABLEBITS)];
-		if (entry & HUFFDEC_SUBTABLE_POINTER) {
-			/* Litlen subtable required (uncommon case)  */
-			REMOVE_BITS(LITLEN_TABLEBITS);
-			entry = d->u.litlen_decode_table[
-				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
-				BITS(entry & HUFFDEC_LENGTH_MASK)];
-		}
-		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
-		if (entry & HUFFDEC_LITERAL) {
-			/* Literal  */
-			if (unlikely(out_next == out_end))
-				return LIBDEFLATE_INSUFFICIENT_SPACE;
-			*out_next++ = (u8)(entry >> HUFFDEC_RESULT_SHIFT);
-			continue;
-		}
-
-		/* Match or end-of-block  */
-
-		entry >>= HUFFDEC_RESULT_SHIFT;
-		ENSURE_BITS(MAX_ENSURE);
-
-		/* Pop the extra length bits and add them to the length base to
-		 * produce the full length.  */
-		length = (entry >> HUFFDEC_LENGTH_BASE_SHIFT) +
-			 POP_BITS(entry & HUFFDEC_EXTRA_LENGTH_BITS_MASK);
-
-		/* The match destination must not end after the end of the
-		 * output buffer.  For efficiency, combine this check with the
-		 * end-of-block check.  We're using 0 for the special
-		 * end-of-block length, so subtract 1 to turn it into
-		 * SIZE_MAX.  */
-		STATIC_ASSERT(HUFFDEC_END_OF_BLOCK_LENGTH == 0);
-		if (unlikely((size_t)length - 1 >= out_end - out_next)) {
-			if (unlikely(length != HUFFDEC_END_OF_BLOCK_LENGTH))
-				return LIBDEFLATE_INSUFFICIENT_SPACE;
-			goto block_done;
-		}
-
-		/* Decode the match offset.  */
-
-		entry = d->offset_decode_table[BITS(OFFSET_TABLEBITS)];
-		if (entry & HUFFDEC_SUBTABLE_POINTER) {
-			/* Offset subtable required (uncommon case)  */
-			REMOVE_BITS(OFFSET_TABLEBITS);
-			entry = d->offset_decode_table[
-				((entry >> HUFFDEC_RESULT_SHIFT) & 0xFFFF) +
-				BITS(entry & HUFFDEC_LENGTH_MASK)];
-		}
-		REMOVE_BITS(entry & HUFFDEC_LENGTH_MASK);
-		entry >>= HUFFDEC_RESULT_SHIFT;
-
-		STATIC_ASSERT(CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
-					 DEFLATE_MAX_OFFSET_CODEWORD_LEN) &&
-			      CAN_ENSURE(DEFLATE_MAX_EXTRA_OFFSET_BITS));
-		if (!CAN_ENSURE(DEFLATE_MAX_EXTRA_LENGTH_BITS +
-				DEFLATE_MAX_OFFSET_CODEWORD_LEN +
-				DEFLATE_MAX_EXTRA_OFFSET_BITS))
-			ENSURE_BITS(DEFLATE_MAX_EXTRA_OFFSET_BITS);
-
-		/* Pop the extra offset bits and add them to the offset base to
-		 * produce the full offset.  */
-		offset = (entry & HUFFDEC_OFFSET_BASE_MASK) +
-			 POP_BITS(entry >> HUFFDEC_EXTRA_OFFSET_BITS_SHIFT);
-
-		/* The match source must not begin before the beginning of the
-		 * output buffer.  */
-		SAFETY_CHECK(offset <= out_next - (const u8 *)out);
-
-		/*
-		 * Copy the match: 'length' bytes at 'out_next - offset' to
-		 * 'out_next', possibly overlapping.  If the match doesn't end
-		 * too close to the end of the buffer and offset >= WORDBYTES ||
-		 * offset == 1, take a fast path which copies a word at a time
-		 * -- potentially more than the length of the match, but that's
-		 * fine as long as we check for enough extra space.
-		 *
-		 * The remaining cases are not performance-critical so are
-		 * handled by a simple byte-by-byte copy.
-		 */
-
-		src = out_next - offset;
-		dst = out_next;
-		out_next += length;
-
-		if (UNALIGNED_ACCESS_IS_FAST &&
-		    /* max overrun is writing 3 words for a min length match */
-		    likely(out_end - out_next >=
-			   3 * WORDBYTES - DEFLATE_MIN_MATCH_LEN)) {
-			if (offset >= WORDBYTES) { /* words don't overlap? */
-				copy_word_unaligned(src, dst);
-				src += WORDBYTES;
-				dst += WORDBYTES;
-				copy_word_unaligned(src, dst);
-				src += WORDBYTES;
-				dst += WORDBYTES;
-				do {
-					copy_word_unaligned(src, dst);
-					src += WORDBYTES;
-					dst += WORDBYTES;
-				} while (dst < out_next);
-			} else if (offset == 1) {
-				/* RLE encoding of previous byte, common if the
-				 * data contains many repeated bytes */
-				machine_word_t v = repeat_byte(*src);
-
-				store_word_unaligned(v, dst);
-				dst += WORDBYTES;
-				store_word_unaligned(v, dst);
-				dst += WORDBYTES;
-				do {
-					store_word_unaligned(v, dst);
-					dst += WORDBYTES;
-				} while (dst < out_next);
-			} else {
-				*dst++ = *src++;
-				*dst++ = *src++;
-				do {
-					*dst++ = *src++;
-				} while (dst < out_next);
-			}
-		} else {
-			STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3);
-			*dst++ = *src++;
-			*dst++ = *src++;
-			do {
-				*dst++ = *src++;
-			} while (dst < out_next);
-		}
-	}
-
-block_done:
-	/* Finished decoding a block.  */
-
-	if (!is_final_block)
-		goto next_block;
-
-	/* That was the last block.  */
-
-	/* Discard any readahead bits and check for excessive overread */
-	ALIGN_INPUT();
-
-	/* Optionally return the actual number of bytes read */
-	if (actual_in_nbytes_ret)
-		*actual_in_nbytes_ret = in_next - (u8 *)in;
-
-	/* Optionally return the actual number of bytes written */
-	if (actual_out_nbytes_ret) {
-		*actual_out_nbytes_ret = out_next - (u8 *)out;
-	} else { /* no count requested: output must fill 'out' exactly */
-		if (out_next != out_end)
-			return LIBDEFLATE_SHORT_OUTPUT;
-	}
-	return LIBDEFLATE_SUCCESS;
-}
-
-#undef FUNCNAME
-#undef ATTRIBUTES
diff --git a/ext/libdeflate/lib/deflate_compress.c b/ext/libdeflate/lib/deflate_compress.c
deleted file mode 100644
index 5049b13e..00000000
--- a/ext/libdeflate/lib/deflate_compress.c
+++ /dev/null
@@ -1,2826 +0,0 @@
-/*
- * deflate_compress.c - a compressor for DEFLATE
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <stdlib.h>
-#include <string.h>
-
-#include "aligned_malloc.h"
-#include "deflate_compress.h"
-#include "deflate_constants.h"
-#include "unaligned.h"
-
-#include "libdeflate.h"
-
-/*
- * By default, the near-optimal parsing algorithm is enabled at compression
- * level 8 and above.  The near-optimal parsing algorithm produces a compression
- * ratio significantly better than the greedy and lazy algorithms implemented
- * here, and also the algorithm used by zlib at level 9.  However, it is slow.
- */
-#define SUPPORT_NEAR_OPTIMAL_PARSING 1
-
-/*
- * Define to 1 to maintain the full map from match offsets to offset slots.
- * This slightly speeds up translations of match offsets to offset slots, but it
- * uses 32769 bytes of memory rather than the 512 bytes used by the condensed
- * map.  The speedup provided by the larger map is most helpful when the
- * near-optimal parsing algorithm is being used.
- */
-#define USE_FULL_OFFSET_SLOT_FAST	SUPPORT_NEAR_OPTIMAL_PARSING
-
-/*
- * DEFLATE uses a 32768 byte sliding window; set the matchfinder parameters
- * appropriately.
- */
-#define MATCHFINDER_WINDOW_ORDER	15
-
-#include "hc_matchfinder.h"
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-#  include "bt_matchfinder.h"
-#endif
-
-/*
- * The compressor always chooses a block of at least MIN_BLOCK_LENGTH bytes,
- * except if the last block has to be shorter.
- */
-#define MIN_BLOCK_LENGTH	10000
-
-/*
- * The compressor attempts to end blocks after SOFT_MAX_BLOCK_LENGTH bytes, but
- * the final length might be slightly longer due to matches extending beyond
- * this limit.
- */
-#define SOFT_MAX_BLOCK_LENGTH	300000
-
-/*
- * The number of observed matches or literals that represents sufficient data to
- * decide whether the current block should be terminated or not.
- */
-#define NUM_OBSERVATIONS_PER_BLOCK_CHECK       512
-
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-/* Constants specific to the near-optimal parsing algorithm */
-
-/*
- * The maximum number of matches the matchfinder can find at a single position.
- * Since the matchfinder never finds more than one match for the same length,
- * presuming one of each possible length is sufficient for an upper bound.
- * (This says nothing about whether it is worthwhile to consider so many
- * matches; this is just defining the worst case.)
- */
-#  define MAX_MATCHES_PER_POS	(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1)
-
-/*
- * The number of lz_match structures in the match cache, excluding the extra
- * "overflow" entries.  This value should be high enough so that nearly all the
- * time, all matches found in a given block can fit in the match cache.
- * However, fallback behavior (immediately terminating the block) on cache
- * overflow is still required.
- */
-#  define CACHE_LENGTH      (SOFT_MAX_BLOCK_LENGTH * 5)
-
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-/*
- * These are the compressor-side limits on the codeword lengths for each Huffman
- * code.  To make outputting bits slightly faster, some of these limits are
- * lower than the limits defined by the DEFLATE format.  This does not
- * significantly affect the compression ratio, at least for the block lengths we
- * use.
- */
-#define MAX_LITLEN_CODEWORD_LEN		14
-#define MAX_OFFSET_CODEWORD_LEN		DEFLATE_MAX_OFFSET_CODEWORD_LEN
-#define MAX_PRE_CODEWORD_LEN		DEFLATE_MAX_PRE_CODEWORD_LEN
-
-/* Table: length slot => length slot base value (29 slots, bases 3..258)  */
-static const unsigned deflate_length_slot_base[] = {
-	3   , 4   , 5   , 6   , 7   , 8   , 9   , 10  ,
-	11  , 13  , 15  , 17  , 19  , 23  , 27  , 31  ,
-	35  , 43  , 51  , 59  , 67  , 83  , 99  , 115 ,
-	131 , 163 , 195 , 227 , 258 ,
-};
-
-/* Table: length slot => number of extra length bits (0..5; last slot has 0)  */
-static const u8 deflate_extra_length_bits[] = {
-	0   , 0   , 0   , 0   , 0   , 0   , 0   , 0 ,
-	1   , 1   , 1   , 1   , 2   , 2   , 2   , 2 ,
-	3   , 3   , 3   , 3   , 4   , 4   , 4   , 4 ,
-	5   , 5   , 5   , 5   , 0   ,
-};
-
-/* Table: offset slot => offset slot base value (30 slots, bases 1..24577)  */
-static const unsigned deflate_offset_slot_base[] = {
-	1    , 2    , 3    , 4     , 5     , 7     , 9     , 13    ,
-	17   , 25   , 33   , 49    , 65    , 97    , 129   , 193   ,
-	257  , 385  , 513  , 769   , 1025  , 1537  , 2049  , 3073  ,
-	4097 , 6145 , 8193 , 12289 , 16385 , 24577 ,
-};
-
-/* Table: offset slot => number of extra offset bits (0..13, 30 slots)  */
-static const u8 deflate_extra_offset_bits[] = {
-	0    , 0    , 0    , 0     , 1     , 1     , 2     , 2     ,
-	3    , 3    , 4    , 4     , 5     , 5     , 6     , 6     ,
-	7    , 7    , 8    , 8     , 9     , 9     , 10    , 10    ,
-	11   , 11   , 12   , 12    , 13    , 13    ,
-};
-
-/* Table: length => length slot (indexed directly by the match length)  */
-static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = {
-	0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12,
-	12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16,
-	16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18,
-	18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20,
-	20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
-	21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
-	22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
-	23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
-	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25,
-	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
-	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26,
-	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
-	26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
-	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
-	27, 27, 28,
-};
-
-/* The order in which precode codeword lengths are stored (19 precode symbols) */
-static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = {
-	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
-};
-
-/* Codewords for the DEFLATE Huffman codes, one entry per symbol.  */
-struct deflate_codewords {
-	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];	/* literal/length alphabet */
-	u32 offset[DEFLATE_NUM_OFFSET_SYMS];	/* offset alphabet */
-};
-
-/* Codeword lengths (in bits) for the DEFLATE Huffman codes, one per symbol.
- * A zero length means the corresponding symbol had zero frequency.  */
-struct deflate_lens {
-	u8 litlen[DEFLATE_NUM_LITLEN_SYMS];	/* literal/length alphabet */
-	u8 offset[DEFLATE_NUM_OFFSET_SYMS];	/* offset alphabet */
-};
-
-/* Codewords and lengths for the DEFLATE Huffman codes, bundled together.  */
-struct deflate_codes {
-	struct deflate_codewords codewords;	/* the codeword of each symbol */
-	struct deflate_lens lens;		/* the bit length of each codeword */
-};
-
-/* Symbol frequency counters for the DEFLATE Huffman codes, one per symbol.  */
-struct deflate_freqs {
-	u32 litlen[DEFLATE_NUM_LITLEN_SYMS];	/* literal/length alphabet */
-	u32 offset[DEFLATE_NUM_OFFSET_SYMS];	/* offset alphabet */
-};
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-
-/* Costs for near-optimal parsing (presumably COST_SHIFT-scaled -- confirm).  */
-struct deflate_costs {
-
-	/* The cost to output each possible literal.  */
-	u32 literal[DEFLATE_NUM_LITERALS];
-
-	/* The cost to output each possible match length (indexed by length).  */
-	u32 length[DEFLATE_MAX_MATCH_LEN + 1];
-
-	/* The cost to output a match offset of each possible offset slot.  */
-	u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS];
-};
-
-/*
- * COST_SHIFT is a scaling factor that makes it possible to consider fractional
- * bit costs.  A token requiring 'n' bits to represent has cost n << COST_SHIFT.
- *
- * Note: this is only useful as a statistical trick for when the true costs are
- * unknown.  In reality, each token in DEFLATE requires a whole number of bits
- * to output.
- */
-#define COST_SHIFT	3
-
-/*
- * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to
- * be needed to output a symbol that was unused in the previous optimization
- * pass.  Assigning a default cost allows the symbol to be used in the next
- * optimization pass.  However, the cost should be relatively high because the
- * symbol probably won't be used very many times (if at all).
- */
-#define LITERAL_NOSTAT_BITS	13
-#define LENGTH_NOSTAT_BITS	13
-#define OFFSET_NOSTAT_BITS	10
-
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-/*
- * Represents a run of literals followed by a match or end-of-block.  This
- * struct is needed to temporarily store items chosen by the parser, since items
- * cannot be written until all items for the block have been chosen and the
- * block's Huffman codes have been computed.
- */
-struct deflate_sequence {
-
-	/* Bits 0..22: the number of literals in this run.  This may be 0 and
-	 * can be at most about SOFT_MAX_BLOCK_LENGTH.  The literals are not
-	 * stored explicitly in this structure; instead, they are read directly
-	 * from the uncompressed data.
-	 *
-	 * Bits 23..31: the length of the match which follows the literals, or 0
-	 * if this literal run was the last in the block, so there is no match
-	 * which follows it.  */
-	u32 litrunlen_and_length;	/* i.e. (length << 23) | litrunlen */
-
-	/* If 'length' doesn't indicate end-of-block, then this is the offset of
-	 * the match which follows the literals.  */
-	u16 offset;
-
-	/* If 'length' doesn't indicate end-of-block, then this is the offset
-	 * symbol of the match which follows the literals.  */
-	u8 offset_symbol;
-
-	/* If 'length' doesn't indicate end-of-block, then this is the length
-	 * slot of the match which follows the literals.  */
-	u8 length_slot;
-};
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-
-/*
- * This structure represents a byte position in the input data and a node in the
- * graph of possible match/literal choices for the current block.
- *
- * Logically, each incoming edge to this node is labeled with a literal or a
- * match that can be taken to reach this position from an earlier position; and
- * each outgoing edge from this node is labeled with a literal or a match that
- * can be taken to advance from this position to a later position.
- *
- * But these "edges" are actually stored elsewhere (in 'match_cache').  Here we
- * associate with each node just two pieces of information:
- *
- *	'cost_to_end' is the minimum cost to reach the end of the block from
- *	this position.
- *
- *	'item' represents the literal or match that must be chosen from here to
- *	reach the end of the block with the minimum cost.  Equivalently, this
- *	can be interpreted as the label of the outgoing edge on the minimum-cost
- *	path to the "end of block" node from this node.
- */
-struct deflate_optimum_node {
-
-	u32 cost_to_end;	/* minimum cost from this position to block end */
-
-	/*
-	 * Notes on the match/literal representation used here:
-	 *
-	 *	The low bits of 'item' are the length: 1 if this is a literal,
-	 *	or the match length if this is a match.
-	 *
-	 *	The high bits of 'item' are the actual literal byte if this is a
-	 *	literal, or the match offset if this is a match.
-	 */
-#define OPTIMUM_OFFSET_SHIFT 9
-#define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1)
-	u32 item;	/* low OPTIMUM_OFFSET_SHIFT bits: length; rest: see above */
-
-};
-
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-/* Block split statistics.  See "Block splitting algorithm" below. */
-#define NUM_LITERAL_OBSERVATION_TYPES 8
-#define NUM_MATCH_OBSERVATION_TYPES 2
-#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + NUM_MATCH_OBSERVATION_TYPES)
-struct block_split_stats {
-	u32 new_observations[NUM_OBSERVATION_TYPES]; /* presumably per-type counts not yet merged -- confirm */
-	u32 observations[NUM_OBSERVATION_TYPES]; /* presumably per-type counts already merged -- confirm */
-	u32 num_new_observations; /* presumably the total of new_observations[] -- confirm */
-	u32 num_observations; /* presumably the total of observations[] -- confirm */
-};
-
-/* The main DEFLATE compressor structure  */
-struct libdeflate_compressor {
-
-	/* Pointer to the compress() implementation chosen at allocation time */
-	size_t (*impl)(struct libdeflate_compressor *,
-		       const u8 *, size_t, u8 *, size_t);
-
-	/* Frequency counters for the current block  */
-	struct deflate_freqs freqs;
-
-	/* Dynamic Huffman codes for the current block  */
-	struct deflate_codes codes;
-
-	/* Static Huffman codes */
-	struct deflate_codes static_codes;
-
-	/* Block split statistics for the currently pending block */
-	struct block_split_stats split_stats;
-
-	/* A table for fast lookups of offset slot by match offset.
-	 *
-	 * If the full table is being used, it is a direct mapping from offset
-	 * to offset slot.
-	 *
-	 * If the condensed table is being used, the first 256 entries map
-	 * directly to the offset slots of offsets 1 through 256.  The next 256
-	 * entries map to the offset slots for the remaining offsets, stepping
-	 * through the offsets with a stride of 128.  This relies on the fact
-	 * that each of the remaining offset slots contains at least 128 offsets
-	 * and has an offset base that is a multiple of 128.  */
-#if USE_FULL_OFFSET_SLOT_FAST
-	u8 offset_slot_fast[DEFLATE_MAX_MATCH_OFFSET + 1];
-#else
-	u8 offset_slot_fast[512];	/* 256 direct + 256 strided entries */
-#endif
-
-	/* The "nice" match length: if a match of this length is found, choose
-	 * it immediately without further consideration.  */
-	unsigned nice_match_length;
-
-	/* The maximum search depth: consider at most this many potential
-	 * matches at each position.  */
-	unsigned max_search_depth;
-
-	/* The compression level with which this compressor was created.  */
-	unsigned compression_level;
-
-	/* Temporary space for Huffman code output  */
-	u32 precode_freqs[DEFLATE_NUM_PRECODE_SYMS];
-	u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS];
-	u32 precode_codewords[DEFLATE_NUM_PRECODE_SYMS];
-	unsigned precode_items[DEFLATE_NUM_LITLEN_SYMS + DEFLATE_NUM_OFFSET_SYMS];
-	unsigned num_litlen_syms;
-	unsigned num_offset_syms;
-	unsigned num_explicit_lens;
-	unsigned num_precode_items;
-
-	union {
-		/* Data for greedy or lazy parsing  */
-		struct {
-			/* Hash chain matchfinder  */
-			struct hc_matchfinder hc_mf;
-
-			/* The matches and literals that the parser has chosen
-			 * for the current block.  The required length of this
-			 * array is limited by the maximum number of matches
-			 * that can ever be chosen for a single block, plus one
-			 * for the special entry at the end.  */
-			struct deflate_sequence sequences[
-				DIV_ROUND_UP(SOFT_MAX_BLOCK_LENGTH,
-					     DEFLATE_MIN_MATCH_LEN) + 1];
-		} g; /* (g)reedy */
-
-	#if SUPPORT_NEAR_OPTIMAL_PARSING
-		/* Data for near-optimal parsing  */
-		struct {
-
-			/* Binary tree matchfinder  */
-			struct bt_matchfinder bt_mf;
-
-			/*
-			 * Cached matches for the current block.  This array
-			 * contains the matches that were found at each position
-			 * in the block.  Specifically, for each position, there
-			 * is a list of matches found at that position, if any,
-			 * sorted by strictly increasing length.  In addition,
-			 * following the matches for each position, there is a
-			 * special 'struct lz_match' whose 'length' member
-			 * contains the number of matches found at that
-			 * position, and whose 'offset' member contains the
-			 * literal at that position.
-			 *
-			 * Note: in rare cases, there will be a very high number
-			 * of matches in the block and this array will overflow.
-			 * If this happens, we force the end of the current
-			 * block.  CACHE_LENGTH is the length at which we
-			 * actually check for overflow.  The extra slots beyond
-			 * this are enough to absorb the worst case overflow,
-			 * which occurs if starting at &match_cache[CACHE_LENGTH
-			 * - 1], we write MAX_MATCHES_PER_POS matches and a
-			 * match count header, then skip searching for matches
-			 * at 'DEFLATE_MAX_MATCH_LEN - 1' positions and write
-			 * the match count header for each.
-			 */
-			struct lz_match match_cache[CACHE_LENGTH +
-						    MAX_MATCHES_PER_POS +
-						    DEFLATE_MAX_MATCH_LEN - 1];
-
-			/*
-			 * Array of nodes, one per position, for running the
-			 * minimum-cost path algorithm.
-			 *
-			 * This array must be large enough to accommodate the
-			 * worst-case number of nodes, which occurs if we find a
-			 * match of length DEFLATE_MAX_MATCH_LEN at position
-			 * SOFT_MAX_BLOCK_LENGTH - 1, producing a block of
-			 * length SOFT_MAX_BLOCK_LENGTH - 1 +
-			 * DEFLATE_MAX_MATCH_LEN.  Add one for the end-of-block
-			 * node.
-			 */
-			struct deflate_optimum_node optimum_nodes[SOFT_MAX_BLOCK_LENGTH - 1 +
-								  DEFLATE_MAX_MATCH_LEN + 1];
-
-			/* The current cost model being used.  */
-			struct deflate_costs costs;
-
-			unsigned num_optim_passes; /* presumably passes per block -- confirm */
-		} n; /* (n)ear-optimal */
-	#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-	} p; /* (p)arser */
-};
-
-/*
- * The type for the bitbuffer variable, which temporarily holds bits that are
- * being packed into bytes and written to the output buffer.  For best
- * performance, this should have size equal to a machine word.
- */
-typedef machine_word_t bitbuf_t;
-#define BITBUF_NBITS	(8 * sizeof(bitbuf_t))
-
-/* Can the specified number of bits always be added to 'bitbuf' after any
- * pending bytes have been flushed?  */
-#define CAN_BUFFER(n)	((n) <= BITBUF_NBITS - 7)
-
-/*
- * Structure to keep track of the current state of sending bits to the
- * compressed output buffer.
- */
-struct deflate_output_bitstream {
-
-	/* Bits that haven't yet been written to the output buffer.  */
-	bitbuf_t bitbuf;
-
-	/* Number of bits currently held in @bitbuf.  */
-	unsigned bitcount;
-
-	/* Pointer to the beginning of the output buffer.  */
-	u8 *begin;
-
-	/* Pointer to the position in the output buffer at which the next byte
-	 * should be written.  */
-	u8 *next;
-
-	/* Buffer end minus OUTPUT_END_PADDING; @next is never moved past it.  */
-	u8 *end;
-};
-
-/*
- * OUTPUT_END_PADDING is the size, in bytes, of the extra space that must be
- * present following os->end, in order to not overrun the buffer when generating
- * output.  When UNALIGNED_ACCESS_IS_FAST, we need at least sizeof(bitbuf_t)
- * bytes for put_unaligned_leword().  Otherwise we need only 1 byte.  However,
- * to make the compression algorithm produce the same result on all CPU
- * architectures (which is sometimes desirable), we have to unconditionally use
- * the maximum for any CPU, which is sizeof(bitbuf_t) == 8.
- */
-#define OUTPUT_END_PADDING	8
-
-/* Initialize the output bitstream.  'size' is assumed to be at least
- * OUTPUT_END_PADDING.  */
-static void
-deflate_init_output(struct deflate_output_bitstream *os,
-		    void *buffer, size_t size)
-{
-	os->bitbuf = 0;
-	os->bitcount = 0;
-	os->begin = buffer;
-	os->next = os->begin;
-	os->end = os->begin + size - OUTPUT_END_PADDING;
-}
-
-/* Add some bits to the bitbuffer variable of the output bitstream.  The caller
- * must make sure there is enough room.  */
-static forceinline void
-deflate_add_bits(struct deflate_output_bitstream *os,
-		 const bitbuf_t bits, const unsigned num_bits)
-{
-	os->bitbuf |= bits << os->bitcount;
-	os->bitcount += num_bits;
-}
-
-/* Flush bits from the bitbuffer variable to the output buffer.  */
-static forceinline void
-deflate_flush_bits(struct deflate_output_bitstream *os)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		/* Flush a whole word (branchlessly).  */
-		put_unaligned_leword(os->bitbuf, os->next);
-		os->bitbuf >>= os->bitcount & ~7;
-		os->next += MIN(os->end - os->next, os->bitcount >> 3);
-		os->bitcount &= 7;
-	} else {
-		/* Flush a byte at a time.  */
-		while (os->bitcount >= 8) {
-			*os->next = os->bitbuf;
-			if (os->next != os->end)
-				os->next++;
-			os->bitcount -= 8;
-			os->bitbuf >>= 8;
-		}
-	}
-}
-
-/* Align the bitstream on a byte boundary. */
-static forceinline void
-deflate_align_bitstream(struct deflate_output_bitstream *os)
-{
-	os->bitcount += -os->bitcount & 7;
-	deflate_flush_bits(os);
-}
-
-/*
- * Flush any remaining bits to the output buffer if needed.  Return the total
- * number of bytes written to the output buffer, or 0 if an overflow occurred.
- */
-static u32
-deflate_flush_output(struct deflate_output_bitstream *os)
-{
-	if (os->next == os->end) /* overflow?  */
-		return 0;
-
-	while ((int)os->bitcount > 0) {
-		*os->next++ = os->bitbuf;
-		os->bitcount -= 8;
-		os->bitbuf >>= 8;
-	}
-
-	return os->next - os->begin;
-}
-
-/* Given the binary tree node A[subtree_idx] whose children already
- * satisfy the maxheap property, swap the node with its greater child
- * until it is greater than both its children, so that the maxheap
- * property is satisfied in the subtree rooted at A[subtree_idx].  */
-static void
-heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx)
-{
-	unsigned parent_idx;
-	unsigned child_idx;
-	u32 v;
-
-	v = A[subtree_idx];
-	parent_idx = subtree_idx;
-	while ((child_idx = parent_idx * 2) <= length) {
-		if (child_idx < length && A[child_idx + 1] > A[child_idx])
-			child_idx++;
-		if (v >= A[child_idx])
-			break;
-		A[parent_idx] = A[child_idx];
-		parent_idx = child_idx;
-	}
-	A[parent_idx] = v;
-}
-
-/* Rearrange the array 'A' so that it satisfies the maxheap property.
- * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1].
- */
-static void
-heapify_array(u32 A[], unsigned length)
-{
-	unsigned subtree_idx;
-
-	for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--)
-		heapify_subtree(A, length, subtree_idx);
-}
-
-/*
- * Sort the array 'A', which contains 'length' unsigned 32-bit integers.
- *
- * Note: name this function heap_sort() instead of heapsort() to avoid colliding
- * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't
- * necessary when compiling with -D_ANSI_SOURCE, which is the better solution.
- */
-static void
-heap_sort(u32 A[], unsigned length)
-{
-	A--; /* Use 1-based indices  */
-
-	heapify_array(A, length);
-
-	while (length >= 2) {
-		u32 tmp = A[length];
-		A[length] = A[1];
-		A[1] = tmp;
-		length--;
-		heapify_subtree(A, length, 1);
-	}
-}
-
-#define NUM_SYMBOL_BITS 10
-#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1)
-
-#define GET_NUM_COUNTERS(num_syms)	((((num_syms) + 3 / 4) + 3) & ~3)
-/*
- * Sort the symbols primarily by frequency and secondarily by symbol
- * value.  Discard symbols with zero frequency and fill in an array with
- * the remaining symbols, along with their frequencies.  The low
- * NUM_SYMBOL_BITS bits of each array entry will contain the symbol
- * value, and the remaining bits will contain the frequency.
- *
- * @num_syms
- *	Number of symbols in the alphabet.
- *	Can't be greater than (1 << NUM_SYMBOL_BITS).
- *
- * @freqs[num_syms]
- *	The frequency of each symbol.
- *
- * @lens[num_syms]
- *	An array that eventually will hold the length of each codeword.
- *	This function only fills in the codeword lengths for symbols that
- *	have zero frequency, which are not well defined per se but will
- *	be set to 0.
- *
- * @symout[num_syms]
- *	The output array, described above.
- *
- * Returns the number of entries in 'symout' that were filled.  This is
- * the number of symbols that have nonzero frequency.
- */
-static unsigned
-sort_symbols(unsigned num_syms, const u32 freqs[restrict],
-	     u8 lens[restrict], u32 symout[restrict])
-{
-	unsigned sym;
-	unsigned i;
-	unsigned num_used_syms;
-	unsigned num_counters;
-	unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)];
-
-	/* We rely on heapsort, but with an added optimization.  Since
-	 * it's common for most symbol frequencies to be low, we first do
-	 * a count sort using a limited number of counters.  High
-	 * frequencies will be counted in the last counter, and only they
-	 * will be sorted with heapsort.
-	 *
-	 * Note: with more symbols, it is generally beneficial to have more
-	 * counters.  About 1 counter per 4 symbols seems fast.
-	 *
-	 * Note: I also tested radix sort, but even for large symbol
-	 * counts (> 255) and frequencies bounded at 16 bits (enabling
-	 * radix sort by just two base-256 digits), it didn't seem any
-	 * faster than the method implemented here.
-	 *
-	 * Note: I tested the optimized quicksort implementation from
-	 * glibc (with indirection overhead removed), but it was only
-	 * marginally faster than the simple heapsort implemented here.
-	 *
-	 * Tests were done with building the codes for LZX.  Results may
-	 * vary for different compression algorithms...!  */
-
-	num_counters = GET_NUM_COUNTERS(num_syms);
-
-	memset(counters, 0, num_counters * sizeof(counters[0]));
-
-	/* Count the frequencies.  */
-	for (sym = 0; sym < num_syms; sym++)
-		counters[MIN(freqs[sym], num_counters - 1)]++;
-
-	/* Make the counters cumulative, ignoring the zero-th, which
-	 * counted symbols with zero frequency.  As a side effect, this
-	 * calculates the number of symbols with nonzero frequency.  */
-	num_used_syms = 0;
-	for (i = 1; i < num_counters; i++) {
-		unsigned count = counters[i];
-		counters[i] = num_used_syms;
-		num_used_syms += count;
-	}
-
-	/* Sort nonzero-frequency symbols using the counters.  At the
-	 * same time, set the codeword lengths of zero-frequency symbols
-	 * to 0.  */
-	for (sym = 0; sym < num_syms; sym++) {
-		u32 freq = freqs[sym];
-		if (freq != 0) {
-			symout[counters[MIN(freq, num_counters - 1)]++] =
-				sym | (freq << NUM_SYMBOL_BITS);
-		} else {
-			lens[sym] = 0;
-		}
-	}
-
-	/* Sort the symbols counted in the last counter.  */
-	heap_sort(symout + counters[num_counters - 2],
-		  counters[num_counters - 1] - counters[num_counters - 2]);
-
-	return num_used_syms;
-}
-
-/*
- * Build the Huffman tree.
- *
- * This is an optimized implementation that
- *	(a) takes advantage of the frequencies being already sorted;
- *	(b) only generates non-leaf nodes, since the non-leaf nodes of a
- *	    Huffman tree are sufficient to generate a canonical code;
- *	(c) Only stores parent pointers, not child pointers;
- *	(d) Produces the nodes in the same memory used for input
- *	    frequency information.
- *
- * Array 'A', which contains 'sym_count' entries, is used for both input
- * and output.  For this function, 'sym_count' must be at least 2.
- *
- * For input, the array must contain the frequencies of the symbols,
- * sorted in increasing order.  Specifically, each entry must contain a
- * frequency left shifted by NUM_SYMBOL_BITS bits.  Any data in the low
- * NUM_SYMBOL_BITS bits of the entries will be ignored by this function.
- * Although these bits will, in fact, contain the symbols that correspond
- * to the frequencies, this function is concerned with frequencies only
- * and keeps the symbols as-is.
- *
- * For output, this function will produce the non-leaf nodes of the
- * Huffman tree.  These nodes will be stored in the first (sym_count - 1)
- * entries of the array.  Entry A[sym_count - 2] will represent the root
- * node.  Each other node will contain the zero-based index of its parent
- * node in 'A', left shifted by NUM_SYMBOL_BITS bits.  The low
- * NUM_SYMBOL_BITS bits of each entry in A will be kept as-is.  Again,
- * note that although these low bits will, in fact, contain a symbol
- * value, this symbol will have *no relationship* with the Huffman tree
- * node that happens to occupy the same slot.  This is because this
- * implementation only generates the non-leaf nodes of the tree.
- */
-static void
-build_tree(u32 A[], unsigned sym_count)
-{
-	/* Index, in 'A', of next lowest frequency symbol that has not
-	 * yet been processed.  */
-	unsigned i = 0;
-
-	/* Index, in 'A', of next lowest frequency parentless non-leaf
-	 * node; or, if equal to 'e', then no such node exists yet.  */
-	unsigned b = 0;
-
-	/* Index, in 'A', of next node to allocate as a non-leaf.  */
-	unsigned e = 0;
-
-	do {
-		unsigned m, n;
-		u32 freq_shifted;
-
-		/* Choose the two next lowest frequency entries.  */
-
-		if (i != sym_count &&
-		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-			m = i++;
-		else
-			m = b++;
-
-		if (i != sym_count &&
-		    (b == e || (A[i] >> NUM_SYMBOL_BITS) <= (A[b] >> NUM_SYMBOL_BITS)))
-			n = i++;
-		else
-			n = b++;
-
-		/* Allocate a non-leaf node and link the entries to it.
-		 *
-		 * If we link an entry that we're visiting for the first
-		 * time (via index 'i'), then we're actually linking a
-		 * leaf node and it will have no effect, since the leaf
-		 * will be overwritten with a non-leaf when index 'e'
-		 * catches up to it.  But it's not any slower to
-		 * unconditionally set the parent index.
-		 *
-		 * We also compute the frequency of the non-leaf node as
-		 * the sum of its two children's frequencies.  */
-
-		freq_shifted = (A[m] & ~SYMBOL_MASK) + (A[n] & ~SYMBOL_MASK);
-
-		A[m] = (A[m] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-		A[n] = (A[n] & SYMBOL_MASK) | (e << NUM_SYMBOL_BITS);
-		A[e] = (A[e] & SYMBOL_MASK) | freq_shifted;
-		e++;
-	} while (sym_count - e > 1);
-		/* When just one entry remains, it is a "leaf" that was
-		 * linked to some other node.  We ignore it, since the
-		 * rest of the array contains the non-leaves which we
-		 * need.  (Note that we're assuming the cases with 0 or 1
-		 * symbols were handled separately.) */
-}
-
-/*
- * Given the stripped-down Huffman tree constructed by build_tree(),
- * determine the number of codewords that should be assigned each
- * possible length, taking into account the length-limited constraint.
- *
- * @A
- *	The array produced by build_tree(), containing parent index
- *	information for the non-leaf nodes of the Huffman tree.  Each
- *	entry in this array is a node; a node's parent always has a
- *	greater index than that node itself.  This function will
- *	overwrite the parent index information in this array, so
- *	essentially it will destroy the tree.  However, the data in the
- *	low NUM_SYMBOL_BITS of each entry will be preserved.
- *
- * @root_idx
- *	The 0-based index of the root node in 'A', and consequently one
- *	less than the number of tree node entries in 'A'.  (Or, really 2
- *	less than the actual length of 'A'.)
- *
- * @len_counts
- *	An array of length ('max_codeword_len' + 1) in which the number of
- *	codewords having each length <= max_codeword_len will be
- *	returned.
- *
- * @max_codeword_len
- *	The maximum permissible codeword length.
- */
-static void
-compute_length_counts(u32 A[restrict], unsigned root_idx,
-		      unsigned len_counts[restrict], unsigned max_codeword_len)
-{
-	unsigned len;
-	int node;
-
-	/* The key observations are:
-	 *
-	 * (1) We can traverse the non-leaf nodes of the tree, always
-	 * visiting a parent before its children, by simply iterating
-	 * through the array in reverse order.  Consequently, we can
-	 * compute the depth of each node in one pass, overwriting the
-	 * parent indices with depths.
-	 *
-	 * (2) We can initially assume that in the real Huffman tree,
-	 * both children of the root are leaves.  This corresponds to two
-	 * codewords of length 1.  Then, whenever we visit a (non-leaf)
-	 * node during the traversal, we modify this assumption to
-	 * account for the current node *not* being a leaf, but rather
-	 * its two children being leaves.  This causes the loss of one
-	 * codeword for the current depth and the addition of two
-	 * codewords for the current depth plus one.
-	 *
-	 * (3) We can handle the length-limited constraint fairly easily
-	 * by simply using the largest length available when a depth
-	 * exceeds max_codeword_len.
-	 */
-
-	for (len = 0; len <= max_codeword_len; len++)
-		len_counts[len] = 0;
-	len_counts[1] = 2;
-
-	/* Set the root node's depth to 0.  */
-	A[root_idx] &= SYMBOL_MASK;
-
-	for (node = root_idx - 1; node >= 0; node--) {
-
-		/* Calculate the depth of this node.  */
-
-		unsigned parent = A[node] >> NUM_SYMBOL_BITS;
-		unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS;
-		unsigned depth = parent_depth + 1;
-		unsigned len = depth;
-
-		/* Set the depth of this node so that it is available
-		 * when its children (if any) are processed.  */
-
-		A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS);
-
-		/* If needed, decrease the length to meet the
-		 * length-limited constraint.  This is not the optimal
-		 * method for generating length-limited Huffman codes!
-		 * But it should be good enough.  */
-		if (len >= max_codeword_len) {
-			len = max_codeword_len;
-			do {
-				len--;
-			} while (len_counts[len] == 0);
-		}
-
-		/* Account for the fact that we have a non-leaf node at
-		 * the current depth.  */
-		len_counts[len]--;
-		len_counts[len + 1] += 2;
-	}
-}
-
-/*
- * Generate the codewords for a canonical Huffman code.
- *
- * @A
- *	The output array for codewords.  In addition, initially this
- *	array must contain the symbols, sorted primarily by frequency and
- *	secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of
- *	each entry.
- *
- * @len
- *	Output array for codeword lengths.
- *
- * @len_counts
- *	An array that provides the number of codewords that will have
- *	each possible length <= max_codeword_len.
- *
- * @max_codeword_len
- *	Maximum length, in bits, of each codeword.
- *
- * @num_syms
- *	Number of symbols in the alphabet, including symbols with zero
- *	frequency.  This is the length of the 'A' and 'len' arrays.
- */
-static void
-gen_codewords(u32 A[restrict], u8 lens[restrict],
-	      const unsigned len_counts[restrict],
-	      unsigned max_codeword_len, unsigned num_syms)
-{
-	u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1];
-	unsigned i;
-	unsigned len;
-	unsigned sym;
-
-	/* Given the number of codewords that will have each length,
-	 * assign codeword lengths to symbols.  We do this by assigning
-	 * the lengths in decreasing order to the symbols sorted
-	 * primarily by increasing frequency and secondarily by
-	 * increasing symbol value.  */
-	for (i = 0, len = max_codeword_len; len >= 1; len--) {
-		unsigned count = len_counts[len];
-		while (count--)
-			lens[A[i++] & SYMBOL_MASK] = len;
-	}
-
-	/* Generate the codewords themselves.  We initialize the
-	 * 'next_codewords' array to provide the lexicographically first
-	 * codeword of each length, then assign codewords in symbol
-	 * order.  This produces a canonical code.  */
-	next_codewords[0] = 0;
-	next_codewords[1] = 0;
-	for (len = 2; len <= max_codeword_len; len++)
-		next_codewords[len] =
-			(next_codewords[len - 1] + len_counts[len - 1]) << 1;
-
-	for (sym = 0; sym < num_syms; sym++)
-		A[sym] = next_codewords[lens[sym]]++;
-}
-
-/*
- * ---------------------------------------------------------------------
- *			make_canonical_huffman_code()
- * ---------------------------------------------------------------------
- *
- * Given an alphabet and the frequency of each symbol in it, construct a
- * length-limited canonical Huffman code.
- *
- * @num_syms
- *	The number of symbols in the alphabet.  The symbols are the
- *	integers in the range [0, num_syms - 1].  This parameter must be
- *	at least 2 and can't be greater than (1 << NUM_SYMBOL_BITS).
- *
- * @max_codeword_len
- *	The maximum permissible codeword length.
- *
- * @freqs
- *	An array of @num_syms entries, each of which specifies the
- *	frequency of the corresponding symbol.  It is valid for some,
- *	none, or all of the frequencies to be 0.
- *
- * @lens
- *	An array of @num_syms entries in which this function will return
- *	the length, in bits, of the codeword assigned to each symbol.
- *	Symbols with 0 frequency will not have codewords per se, but
- *	their entries in this array will be set to 0.  No lengths greater
- *	than @max_codeword_len will be assigned.
- *
- * @codewords
- *	An array of @num_syms entries in which this function will return
- *	the codeword for each symbol, right-justified and padded on the
- *	left with zeroes.  Codewords for symbols with 0 frequency will be
- *	undefined.
- *
- * ---------------------------------------------------------------------
- *
- * This function builds a length-limited canonical Huffman code.
- *
- * A length-limited Huffman code contains no codewords longer than some
- * specified length, and has exactly (with some algorithms) or
- * approximately (with the algorithm used here) the minimum weighted path
- * length from the root, given this constraint.
- *
- * A canonical Huffman code satisfies the properties that a longer
- * codeword never lexicographically precedes a shorter codeword, and the
- * lexicographic ordering of codewords of the same length is the same as
- * the lexicographic ordering of the corresponding symbols.  A canonical
- * Huffman code, or more generally a canonical prefix code, can be
- * reconstructed from only a list containing the codeword length of each
- * symbol.
- *
- * The classic algorithm to generate a Huffman code creates a node for
- * each symbol, then inserts these nodes into a min-heap keyed by symbol
- * frequency.  Then, repeatedly, the two lowest-frequency nodes are
- * removed from the min-heap and added as the children of a new node
- * having frequency equal to the sum of its two children, which is then
- * inserted into the min-heap.  When only a single node remains in the
- * min-heap, it is the root of the Huffman tree.  The codeword for each
- * symbol is determined by the path needed to reach the corresponding
- * node from the root.  Descending to the left child appends a 0 bit,
- * whereas descending to the right child appends a 1 bit.
- *
- * The classic algorithm is relatively easy to understand, but it is
- * subject to a number of inefficiencies.  In practice, it is fastest to
- * first sort the symbols by frequency.  (This itself can be subject to
- * an optimization based on the fact that most frequencies tend to be
- * low.)  At the same time, we sort secondarily by symbol value, which
- * aids the process of generating a canonical code.  Then, during tree
- * construction, no heap is necessary because both the leaf nodes and the
- * unparented non-leaf nodes can be easily maintained in sorted order.
- * Consequently, there can never be more than two possibilities for the
- * next-lowest-frequency node.
- *
- * In addition, because we're generating a canonical code, we actually
- * don't need the leaf nodes of the tree at all, only the non-leaf nodes.
- * This is because for canonical code generation we don't need to know
- * where the symbols are in the tree.  Rather, we only need to know how
- * many leaf nodes have each depth (codeword length).  And this
- * information can, in fact, be quickly generated from the tree of
- * non-leaves only.
- *
- * Furthermore, we can build this stripped-down Huffman tree directly in
- * the array in which the codewords are to be generated, provided that
- * these array slots are large enough to hold a symbol and frequency
- * value.
- *
- * Still furthermore, we don't even need to maintain explicit child
- * pointers.  We only need the parent pointers, and even those can be
- * overwritten in-place with depth information as part of the process of
- * extracting codeword lengths from the tree.  So in summary, we do NOT
- * need a big structure like:
- *
- *	struct huffman_tree_node {
- *		unsigned int symbol;
- *		unsigned int frequency;
- *		unsigned int depth;
- *		struct huffman_tree_node *left_child;
- *		struct huffman_tree_node *right_child;
- *	};
- *
- *
- *   ... which often gets used in "naive" implementations of Huffman code
- *   generation.
- *
- * Many of these optimizations are based on the implementation in 7-Zip
- * (source file: C/HuffEnc.c), which has been placed in the public domain
- * by Igor Pavlov.
- */
-static void
-make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len,
-			    const u32 freqs[restrict],
-			    u8 lens[restrict], u32 codewords[restrict])
-{
-	u32 *A = codewords;
-	unsigned num_used_syms;
-
-	STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS);
-
-	/* We begin by sorting the symbols primarily by frequency and
-	 * secondarily by symbol value.  As an optimization, the array
-	 * used for this purpose ('A') shares storage with the space in
-	 * which we will eventually return the codewords.  */
-
-	num_used_syms = sort_symbols(num_syms, freqs, lens, A);
-
-	/* 'num_used_syms' is the number of symbols with nonzero
-	 * frequency.  This may be less than @num_syms.  'num_used_syms'
-	 * is also the number of entries in 'A' that are valid.  Each
-	 * entry consists of a distinct symbol and a nonzero frequency
-	 * packed into a 32-bit integer.  */
-
-	/* Handle special cases where only 0 or 1 symbols were used (had
-	 * nonzero frequency).  */
-
-	if (unlikely(num_used_syms == 0)) {
-		/* Code is empty.  sort_symbols() already set all lengths
-		 * to 0, so there is nothing more to do.  */
-		return;
-	}
-
-	if (unlikely(num_used_syms == 1)) {
-		/* Only one symbol was used, so we only need one
-		 * codeword.  But two codewords are needed to form the
-		 * smallest complete Huffman code, which uses codewords 0
-		 * and 1.  Therefore, we choose another symbol to which
-		 * to assign a codeword.  We use 0 (if the used symbol is
-		 * not 0) or 1 (if the used symbol is 0).  In either
-		 * case, the lesser-valued symbol must be assigned
-		 * codeword 0 so that the resulting code is canonical.  */
-
-		unsigned sym = A[0] & SYMBOL_MASK;
-		unsigned nonzero_idx = sym ? sym : 1;
-
-		codewords[0] = 0;
-		lens[0] = 1;
-		codewords[nonzero_idx] = 1;
-		lens[nonzero_idx] = 1;
-		return;
-	}
-
-	/* Build a stripped-down version of the Huffman tree, sharing the
-	 * array 'A' with the symbol values.  Then extract length counts
-	 * from the tree and use them to generate the final codewords.  */
-
-	build_tree(A, num_used_syms);
-
-	{
-		unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
-
-		compute_length_counts(A, num_used_syms - 2,
-				      len_counts, max_codeword_len);
-
-		gen_codewords(A, lens, len_counts, max_codeword_len, num_syms);
-	}
-}
-
-/*
- * Clear the Huffman symbol frequency counters.
- * This must be called when starting a new DEFLATE block.
- */
-static void
-deflate_reset_symbol_frequencies(struct libdeflate_compressor *c)
-{
-	memset(&c->freqs, 0, sizeof(c->freqs));
-}
-
-/* Reverse the Huffman codeword 'codeword', which is 'len' bits in length.  */
-static u32
-deflate_reverse_codeword(u32 codeword, u8 len)
-{
-	/* The following branchless algorithm is faster than going bit by bit.
-	 * Note: since no codewords are longer than 16 bits, we only need to
-	 * reverse the low 16 bits of the 'u32'.  */
-	STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16);
-
-	/* Flip adjacent 1-bit fields  */
-	codeword = ((codeword & 0x5555) << 1) | ((codeword & 0xAAAA) >> 1);
-
-	/* Flip adjacent 2-bit fields  */
-	codeword = ((codeword & 0x3333) << 2) | ((codeword & 0xCCCC) >> 2);
-
-	/* Flip adjacent 4-bit fields  */
-	codeword = ((codeword & 0x0F0F) << 4) | ((codeword & 0xF0F0) >> 4);
-
-	/* Flip adjacent 8-bit fields  */
-	codeword = ((codeword & 0x00FF) << 8) | ((codeword & 0xFF00) >> 8);
-
-	/* Return the high 'len' bits of the bit-reversed 16 bit value.  */
-	return codeword >> (16 - len);
-}
-
-/* Make a canonical Huffman code with bit-reversed codewords.  */
-static void
-deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len,
-			  const u32 freqs[], u8 lens[], u32 codewords[])
-{
-	unsigned sym;
-
-	make_canonical_huffman_code(num_syms, max_codeword_len,
-				    freqs, lens, codewords);
-
-	for (sym = 0; sym < num_syms; sym++)
-		codewords[sym] = deflate_reverse_codeword(codewords[sym], lens[sym]);
-}
-
-/*
- * Build the literal/length and offset Huffman codes for a DEFLATE block.
- *
- * This takes as input the frequency tables for each code and produces as output
- * a set of tables that map symbols to codewords and codeword lengths.
- */
-static void
-deflate_make_huffman_codes(const struct deflate_freqs *freqs,
-			   struct deflate_codes *codes)
-{
-	STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN);
-	STATIC_ASSERT(MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN);
-
-	deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS,
-				  MAX_LITLEN_CODEWORD_LEN,
-				  freqs->litlen,
-				  codes->lens.litlen,
-				  codes->codewords.litlen);
-
-	deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS,
-				  MAX_OFFSET_CODEWORD_LEN,
-				  freqs->offset,
-				  codes->lens.offset,
-				  codes->codewords.offset);
-}
-
-/* Initialize c->static_codes.  */
-static void
-deflate_init_static_codes(struct libdeflate_compressor *c)
-{
-	unsigned i;
-
-	for (i = 0; i < 144; i++)
-		c->freqs.litlen[i] = 1 << (9 - 8);
-	for (; i < 256; i++)
-		c->freqs.litlen[i] = 1 << (9 - 9);
-	for (; i < 280; i++)
-		c->freqs.litlen[i] = 1 << (9 - 7);
-	for (; i < 288; i++)
-		c->freqs.litlen[i] = 1 << (9 - 8);
-
-	for (i = 0; i < 32; i++)
-		c->freqs.offset[i] = 1 << (5 - 5);
-
-	deflate_make_huffman_codes(&c->freqs, &c->static_codes);
-}
-
-/* Return the offset slot for the specified match offset.  */
-static forceinline unsigned
-deflate_get_offset_slot(struct libdeflate_compressor *c, unsigned offset)
-{
-#if USE_FULL_OFFSET_SLOT_FAST
-	return c->offset_slot_fast[offset];
-#else
-	if (offset <= 256)
-		return c->offset_slot_fast[offset - 1];
-	else
-		return c->offset_slot_fast[256 + ((offset - 1) >> 7)];
-#endif
-}
-
-/* Write the header fields common to all DEFLATE block types.  */
-static void
-deflate_write_block_header(struct deflate_output_bitstream *os,
-			   bool is_final_block, unsigned block_type)
-{
-	deflate_add_bits(os, is_final_block, 1);
-	deflate_add_bits(os, block_type, 2);
-	deflate_flush_bits(os);
-}
-
-static unsigned
-deflate_compute_precode_items(const u8 lens[restrict],
-			      const unsigned num_lens,
-			      u32 precode_freqs[restrict],
-			      unsigned precode_items[restrict])
-{
-	unsigned *itemptr;
-	unsigned run_start;
-	unsigned run_end;
-	unsigned extra_bits;
-	u8 len;
-
-	memset(precode_freqs, 0,
-	       DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0]));
-
-	itemptr = precode_items;
-	run_start = 0;
-	do {
-		/* Find the next run of codeword lengths.  */
-
-		/* len = the length being repeated  */
-		len = lens[run_start];
-
-		/* Extend the run.  */
-		run_end = run_start;
-		do {
-			run_end++;
-		} while (run_end != num_lens && len == lens[run_end]);
-
-		if (len == 0) {
-			/* Run of zeroes.  */
-
-			/* Symbol 18: RLE 11 to 138 zeroes at a time.  */
-			while ((run_end - run_start) >= 11) {
-				extra_bits = MIN((run_end - run_start) - 11, 0x7F);
-				precode_freqs[18]++;
-				*itemptr++ = 18 | (extra_bits << 5);
-				run_start += 11 + extra_bits;
-			}
-
-			/* Symbol 17: RLE 3 to 10 zeroes at a time.  */
-			if ((run_end - run_start) >= 3) {
-				extra_bits = MIN((run_end - run_start) - 3, 0x7);
-				precode_freqs[17]++;
-				*itemptr++ = 17 | (extra_bits << 5);
-				run_start += 3 + extra_bits;
-			}
-		} else {
-
-			/* A run of nonzero lengths. */
-
-			/* Symbol 16: RLE 3 to 6 of the previous length.  */
-			if ((run_end - run_start) >= 4) {
-				precode_freqs[len]++;
-				*itemptr++ = len;
-				run_start++;
-				do {
-					extra_bits = MIN((run_end - run_start) - 3, 0x3);
-					precode_freqs[16]++;
-					*itemptr++ = 16 | (extra_bits << 5);
-					run_start += 3 + extra_bits;
-				} while ((run_end - run_start) >= 3);
-			}
-		}
-
-		/* Output any remaining lengths without RLE.  */
-		while (run_start != run_end) {
-			precode_freqs[len]++;
-			*itemptr++ = len;
-			run_start++;
-		}
-	} while (run_start != num_lens);
-
-	return itemptr - precode_items;
-}
-
-/*
- * Huffman codeword lengths for dynamic Huffman blocks are compressed using a
- * separate Huffman code, the "precode", which contains a symbol for each
- * possible codeword length in the larger code as well as several special
- * symbols to represent repeated codeword lengths (a form of run-length
- * encoding).  The precode is itself constructed in canonical form, and its
- * codeword lengths are represented literally in 19 3-bit fields that
- * immediately precede the compressed codeword lengths of the larger code.
- */
-
-/* Precompute the information needed to output Huffman codes. */
-static void
-deflate_precompute_huffman_header(struct libdeflate_compressor *c)
-{
-	/* Compute how many litlen and offset symbols are needed. */
-
-	for (c->num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS;
-	     c->num_litlen_syms > 257;
-	     c->num_litlen_syms--)
-		if (c->codes.lens.litlen[c->num_litlen_syms - 1] != 0)
-			break;
-
-	for (c->num_offset_syms = DEFLATE_NUM_OFFSET_SYMS;
-	     c->num_offset_syms > 1;
-	     c->num_offset_syms--)
-		if (c->codes.lens.offset[c->num_offset_syms - 1] != 0)
-			break;
-
-	/* If we're not using the full set of literal/length codeword lengths,
-	 * then temporarily move the offset codeword lengths over so that the
-	 * literal/length and offset codeword lengths are contiguous. */
-
-	STATIC_ASSERT(offsetof(struct deflate_lens, offset) ==
-		      DEFLATE_NUM_LITLEN_SYMS);
-
-	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
-		memmove((u8 *)&c->codes.lens + c->num_litlen_syms,
-			(u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
-			c->num_offset_syms);
-	}
-
-	/* Compute the "items" (RLE / literal tokens and extra bits) with which
-	 * the codeword lengths in the larger code will be output. */
-	c->num_precode_items =
-		deflate_compute_precode_items((u8 *)&c->codes.lens,
-					      c->num_litlen_syms +
-							c->num_offset_syms,
-					      c->precode_freqs,
-					      c->precode_items);
-
-	/* Build the precode. */
-	STATIC_ASSERT(MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN);
-	deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS,
-				  MAX_PRE_CODEWORD_LEN,
-				  c->precode_freqs, c->precode_lens,
-				  c->precode_codewords);
-
-	/* Count how many precode lengths we actually need to output. */
-	for (c->num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS;
-	     c->num_explicit_lens > 4;
-	     c->num_explicit_lens--)
-		if (c->precode_lens[deflate_precode_lens_permutation[
-						c->num_explicit_lens - 1]] != 0)
-			break;
-
-	/* Restore the offset codeword lengths if needed. */
-	if (c->num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) {
-		memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS,
-			(u8 *)&c->codes.lens + c->num_litlen_syms,
-			c->num_offset_syms);
-	}
-}
-
-/* Output the Huffman codes. */
-static void
-deflate_write_huffman_header(struct libdeflate_compressor *c,
-			     struct deflate_output_bitstream *os)
-{
-	unsigned i;
-
-	deflate_add_bits(os, c->num_litlen_syms - 257, 5);
-	deflate_add_bits(os, c->num_offset_syms - 1, 5);
-	deflate_add_bits(os, c->num_explicit_lens - 4, 4);
-	deflate_flush_bits(os);
-
-	/* Output the lengths of the codewords in the precode.  */
-	for (i = 0; i < c->num_explicit_lens; i++) {
-		deflate_add_bits(os, c->precode_lens[
-				       deflate_precode_lens_permutation[i]], 3);
-		deflate_flush_bits(os);
-	}
-
-	/* Output the encoded lengths of the codewords in the larger code.  */
-	for (i = 0; i < c->num_precode_items; i++) {
-		unsigned precode_item = c->precode_items[i];
-		unsigned precode_sym = precode_item & 0x1F;
-		deflate_add_bits(os, c->precode_codewords[precode_sym],
-				 c->precode_lens[precode_sym]);
-		if (precode_sym >= 16) {
-			if (precode_sym == 16)
-				deflate_add_bits(os, precode_item >> 5, 2);
-			else if (precode_sym == 17)
-				deflate_add_bits(os, precode_item >> 5, 3);
-			else
-				deflate_add_bits(os, precode_item >> 5, 7);
-		}
-		STATIC_ASSERT(CAN_BUFFER(DEFLATE_MAX_PRE_CODEWORD_LEN + 7));
-		deflate_flush_bits(os);
-	}
-}
-
-static void
-deflate_write_sequences(struct deflate_output_bitstream * restrict os,
-			const struct deflate_codes * restrict codes,
-			const struct deflate_sequence sequences[restrict],
-			const u8 * restrict in_next)
-{
-	const struct deflate_sequence *seq = sequences;
-
-	for (;;) {
-		u32 litrunlen = seq->litrunlen_and_length & 0x7FFFFF;
-		unsigned length = seq->litrunlen_and_length >> 23;
-		unsigned length_slot;
-		unsigned litlen_symbol;
-		unsigned offset_symbol;
-
-		if (litrunlen) {
-		#if 1
-			while (litrunlen >= 4) {
-				unsigned lit0 = in_next[0];
-				unsigned lit1 = in_next[1];
-				unsigned lit2 = in_next[2];
-				unsigned lit3 = in_next[3];
-
-				deflate_add_bits(os, codes->codewords.litlen[lit0],
-						 codes->lens.litlen[lit0]);
-				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
-					deflate_flush_bits(os);
-
-				deflate_add_bits(os, codes->codewords.litlen[lit1],
-						 codes->lens.litlen[lit1]);
-				if (!CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN))
-					deflate_flush_bits(os);
-
-				deflate_add_bits(os, codes->codewords.litlen[lit2],
-						 codes->lens.litlen[lit2]);
-				if (!CAN_BUFFER(2 * MAX_LITLEN_CODEWORD_LEN))
-					deflate_flush_bits(os);
-
-				deflate_add_bits(os, codes->codewords.litlen[lit3],
-						 codes->lens.litlen[lit3]);
-				deflate_flush_bits(os);
-				in_next += 4;
-				litrunlen -= 4;
-			}
-			if (litrunlen-- != 0) {
-				deflate_add_bits(os, codes->codewords.litlen[*in_next],
-						 codes->lens.litlen[*in_next]);
-				if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
-					deflate_flush_bits(os);
-				in_next++;
-				if (litrunlen-- != 0) {
-					deflate_add_bits(os, codes->codewords.litlen[*in_next],
-							 codes->lens.litlen[*in_next]);
-					if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
-						deflate_flush_bits(os);
-					in_next++;
-					if (litrunlen-- != 0) {
-						deflate_add_bits(os, codes->codewords.litlen[*in_next],
-								 codes->lens.litlen[*in_next]);
-						if (!CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
-							deflate_flush_bits(os);
-						in_next++;
-					}
-				}
-				if (CAN_BUFFER(3 * MAX_LITLEN_CODEWORD_LEN))
-					deflate_flush_bits(os);
-			}
-		#else
-			do {
-				unsigned lit = *in_next++;
-				deflate_add_bits(os, codes->codewords.litlen[lit],
-						 codes->lens.litlen[lit]);
-				deflate_flush_bits(os);
-			} while (--litrunlen);
-		#endif
-		}
-
-		if (length == 0)
-			return;
-
-		in_next += length;
-
-		length_slot = seq->length_slot;
-		litlen_symbol = 257 + length_slot;
-
-		/* Litlen symbol  */
-		deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
-				 codes->lens.litlen[litlen_symbol]);
-
-		/* Extra length bits  */
-		STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
-					 DEFLATE_MAX_EXTRA_LENGTH_BITS));
-		deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
-				 deflate_extra_length_bits[length_slot]);
-
-		if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
-				DEFLATE_MAX_EXTRA_LENGTH_BITS +
-				MAX_OFFSET_CODEWORD_LEN +
-				DEFLATE_MAX_EXTRA_OFFSET_BITS))
-			deflate_flush_bits(os);
-
-		/* Offset symbol  */
-		offset_symbol = seq->offset_symbol;
-		deflate_add_bits(os, codes->codewords.offset[offset_symbol],
-				 codes->lens.offset[offset_symbol]);
-
-		if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
-				DEFLATE_MAX_EXTRA_OFFSET_BITS))
-			deflate_flush_bits(os);
-
-		/* Extra offset bits  */
-		deflate_add_bits(os, seq->offset - deflate_offset_slot_base[offset_symbol],
-				 deflate_extra_offset_bits[offset_symbol]);
-
-		deflate_flush_bits(os);
-
-		seq++;
-	}
-}
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-/*
- * Follow the minimum-cost path in the graph of possible match/literal choices
- * for the current block and write out the matches/literals using the specified
- * Huffman codes.
- *
- * Note: this is slightly duplicated with deflate_write_sequences(), the reason
- * being that we don't want to waste time translating between intermediate
- * match/literal representations.
- */
-static void
-deflate_write_item_list(struct deflate_output_bitstream *os,
-			const struct deflate_codes *codes,
-			struct libdeflate_compressor *c,
-			u32 block_length)
-{
-	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
-	struct deflate_optimum_node * const end_node = &c->p.n.optimum_nodes[block_length];
-	do {
-		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
-		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
-		unsigned litlen_symbol;
-		unsigned length_slot;
-		unsigned offset_slot;
-
-		if (length == 1) {
-			/* Literal  */
-			litlen_symbol = offset;
-			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
-					 codes->lens.litlen[litlen_symbol]);
-			deflate_flush_bits(os);
-		} else {
-			/* Match length  */
-			length_slot = deflate_length_slot[length];
-			litlen_symbol = 257 + length_slot;
-			deflate_add_bits(os, codes->codewords.litlen[litlen_symbol],
-					 codes->lens.litlen[litlen_symbol]);
-
-			deflate_add_bits(os, length - deflate_length_slot_base[length_slot],
-					 deflate_extra_length_bits[length_slot]);
-
-			if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN +
-					DEFLATE_MAX_EXTRA_LENGTH_BITS +
-					MAX_OFFSET_CODEWORD_LEN +
-					DEFLATE_MAX_EXTRA_OFFSET_BITS))
-				deflate_flush_bits(os);
-
-
-			/* Match offset  */
-			offset_slot = deflate_get_offset_slot(c, offset);
-			deflate_add_bits(os, codes->codewords.offset[offset_slot],
-					 codes->lens.offset[offset_slot]);
-
-			if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN +
-					DEFLATE_MAX_EXTRA_OFFSET_BITS))
-				deflate_flush_bits(os);
-
-			deflate_add_bits(os, offset - deflate_offset_slot_base[offset_slot],
-					 deflate_extra_offset_bits[offset_slot]);
-
-			deflate_flush_bits(os);
-		}
-		cur_node += length;
-	} while (cur_node != end_node);
-}
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-/* Output the end-of-block symbol.  */
-static void
-deflate_write_end_of_block(struct deflate_output_bitstream *os,
-			   const struct deflate_codes *codes)
-{
-	deflate_add_bits(os, codes->codewords.litlen[DEFLATE_END_OF_BLOCK],
-			 codes->lens.litlen[DEFLATE_END_OF_BLOCK]);
-	deflate_flush_bits(os);
-}
-
-static void
-deflate_write_uncompressed_block(struct deflate_output_bitstream *os,
-				 const u8 *data, u16 len,
-				 bool is_final_block)
-{
-	deflate_write_block_header(os, is_final_block,
-				   DEFLATE_BLOCKTYPE_UNCOMPRESSED);
-	deflate_align_bitstream(os);
-
-	if (4 + (u32)len >= os->end - os->next) {
-		os->next = os->end;
-		return;
-	}
-
-	put_unaligned_le16(len, os->next);
-	os->next += 2;
-	put_unaligned_le16(~len, os->next);
-	os->next += 2;
-	memcpy(os->next, data, len);
-	os->next += len;
-}
-
-static void
-deflate_write_uncompressed_blocks(struct deflate_output_bitstream *os,
-				  const u8 *data, u32 data_length,
-				  bool is_final_block)
-{
-	do {
-		u16 len = MIN(data_length, UINT16_MAX);
-
-		deflate_write_uncompressed_block(os, data, len,
-					is_final_block && len == data_length);
-		data += len;
-		data_length -= len;
-	} while (data_length != 0);
-}
-
-/*
- * Choose the best type of block to use (dynamic Huffman, static Huffman, or
- * uncompressed), then output it.
- */
-static void
-deflate_flush_block(struct libdeflate_compressor * restrict c,
-		    struct deflate_output_bitstream * restrict os,
-		    const u8 * restrict block_begin, u32 block_length,
-		    bool is_final_block, bool use_item_list)
-{
-	static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = {
-		0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7,
-	};
-
-	/* Costs are measured in bits */
-	u32 dynamic_cost = 0;
-	u32 static_cost = 0;
-	u32 uncompressed_cost = 0;
-	struct deflate_codes *codes;
-	int block_type;
-	unsigned sym;
-
-	/* Tally the end-of-block symbol. */
-	c->freqs.litlen[DEFLATE_END_OF_BLOCK]++;
-
-	/* Build dynamic Huffman codes. */
-	deflate_make_huffman_codes(&c->freqs, &c->codes);
-
-	/* Account for the cost of sending dynamic Huffman codes. */
-	deflate_precompute_huffman_header(c);
-	dynamic_cost += 5 + 5 + 4 + (3 * c->num_explicit_lens);
-	for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) {
-		u32 extra = deflate_extra_precode_bits[sym];
-		dynamic_cost += c->precode_freqs[sym] *
-				(extra + c->precode_lens[sym]);
-	}
-
-	/* Account for the cost of encoding literals. */
-	for (sym = 0; sym < 256; sym++) {
-		dynamic_cost += c->freqs.litlen[sym] *
-				c->codes.lens.litlen[sym];
-	}
-	for (sym = 0; sym < 144; sym++)
-		static_cost += c->freqs.litlen[sym] * 8;
-	for (; sym < 256; sym++)
-		static_cost += c->freqs.litlen[sym] * 9;
-
-	/* Account for the cost of encoding the end-of-block symbol. */
-	dynamic_cost += c->codes.lens.litlen[256];
-	static_cost += 7;
-
-	/* Account for the cost of encoding lengths. */
-	for (sym = 257; sym < 257 + ARRAY_LEN(deflate_extra_length_bits); sym++) {
-		u32 extra = deflate_extra_length_bits[sym - 257];
-		dynamic_cost += c->freqs.litlen[sym] *
-				(extra + c->codes.lens.litlen[sym]);
-		static_cost += c->freqs.litlen[sym] *
-				(extra + c->static_codes.lens.litlen[sym]);
-	}
-
-	/* Account for the cost of encoding offsets. */
-	for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) {
-		u32 extra = deflate_extra_offset_bits[sym];
-		dynamic_cost += c->freqs.offset[sym] *
-				(extra + c->codes.lens.offset[sym]);
-		static_cost += c->freqs.offset[sym] * (extra + 5);
-	}
-
-	/* Compute the cost of using uncompressed blocks. */
-	uncompressed_cost += (-(os->bitcount + 3) & 7) + 32 +
-			     (40 * (DIV_ROUND_UP(block_length,
-						 UINT16_MAX) - 1)) +
-			     (8 * block_length);
-
-	/* Choose the cheapest block type. */
-	if (dynamic_cost < MIN(static_cost, uncompressed_cost)) {
-		block_type = DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN;
-		codes = &c->codes;
-	} else if (static_cost < uncompressed_cost) {
-		block_type = DEFLATE_BLOCKTYPE_STATIC_HUFFMAN;
-		codes = &c->static_codes;
-	} else {
-		block_type = DEFLATE_BLOCKTYPE_UNCOMPRESSED;
-	}
-
-	/* Now actually output the block. */
-
-	if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) {
-		/* Note: the length being flushed may exceed the maximum length
-		 * of an uncompressed block (65535 bytes).  Therefore, more than
-		 * one uncompressed block might be needed. */
-		deflate_write_uncompressed_blocks(os, block_begin, block_length,
-						  is_final_block);
-	} else {
-		/* Output the block header. */
-		deflate_write_block_header(os, is_final_block, block_type);
-
-		/* Output the Huffman codes (dynamic Huffman blocks only). */
-		if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN)
-			deflate_write_huffman_header(c, os);
-
-		/* Output the literals, matches, and end-of-block symbol. */
-	#if SUPPORT_NEAR_OPTIMAL_PARSING
-		if (use_item_list)
-			deflate_write_item_list(os, codes, c, block_length);
-		else
-	#endif
-			deflate_write_sequences(os, codes, c->p.g.sequences,
-						block_begin);
-		deflate_write_end_of_block(os, codes);
-	}
-}
-
-static forceinline void
-deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal,
-		       u32 *litrunlen_p)
-{
-	c->freqs.litlen[literal]++;
-	++*litrunlen_p;
-}
-
-static forceinline void
-deflate_choose_match(struct libdeflate_compressor *c,
-		     unsigned length, unsigned offset,
-		     u32 *litrunlen_p, struct deflate_sequence **next_seq_p)
-{
-	struct deflate_sequence *seq = *next_seq_p;
-	unsigned length_slot = deflate_length_slot[length];
-	unsigned offset_slot = deflate_get_offset_slot(c, offset);
-
-	c->freqs.litlen[257 + length_slot]++;
-	c->freqs.offset[offset_slot]++;
-
-	seq->litrunlen_and_length = ((u32)length << 23) | *litrunlen_p;
-	seq->offset = offset;
-	seq->length_slot = length_slot;
-	seq->offset_symbol = offset_slot;
-
-	*litrunlen_p = 0;
-	*next_seq_p = seq + 1;
-}
-
-static forceinline void
-deflate_finish_sequence(struct deflate_sequence *seq, u32 litrunlen)
-{
-	seq->litrunlen_and_length = litrunlen; /* length = 0 */
-}
-
-/******************************************************************************/
-
-/*
- * Block splitting algorithm.  The problem is to decide when it is worthwhile to
- * start a new block with new Huffman codes.  There is a theoretically optimal
- * solution: recursively consider every possible block split, considering the
- * exact cost of each block, and choose the minimum cost approach.  But this is
- * far too slow.  Instead, as an approximation, we can count symbols and after
- * every N symbols, compare the expected distribution of symbols based on the
- * previous data with the actual distribution.  If they differ "by enough", then
- * start a new block.
- *
- * As an optimization and heuristic, we don't distinguish between every symbol
- * but rather we combine many symbols into a single "observation type".  For
- * literals we only look at the high bits and low bits, and for matches we only
- * look at whether the match is long or not.  The assumption is that for typical
- * "real" data, places that are good block boundaries will tend to be noticable
- * based only on changes in these aggregate frequencies, without looking for
- * subtle differences in individual symbols.  For example, a change from ASCII
- * bytes to non-ASCII bytes, or from few matches (generally less compressible)
- * to many matches (generally more compressible), would be easily noticed based
- * on the aggregates.
- *
- * For determining whether the frequency distributions are "different enough" to
- * start a new block, the simply heuristic of splitting when the sum of absolute
- * differences exceeds a constant seems to be good enough.  We also add a number
- * proportional to the block length so that the algorithm is more likely to end
- * long blocks than short blocks.  This reflects the general expectation that it
- * will become increasingly beneficial to start a new block as the current
- * block grows longer.
- *
- * Finally, for an approximation, it is not strictly necessary that the exact
- * symbols being used are considered.  With "near-optimal parsing", for example,
- * the actual symbols that will be used are unknown until after the block
- * boundary is chosen and the block has been optimized.  Since the final choices
- * cannot be used, we can use preliminary "greedy" choices instead.
- */
-
-/* Initialize the block split statistics when starting a new block. */
-static void
-init_block_split_stats(struct block_split_stats *stats)
-{
-	int i;
-
-	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-		stats->new_observations[i] = 0;
-		stats->observations[i] = 0;
-	}
-	stats->num_new_observations = 0;
-	stats->num_observations = 0;
-}
-
-/* Literal observation.  Heuristic: use the top 2 bits and low 1 bits of the
- * literal, for 8 possible literal observation types.  */
-static forceinline void
-observe_literal(struct block_split_stats *stats, u8 lit)
-{
-	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
-	stats->num_new_observations++;
-}
-
-/* Match observation.  Heuristic: use one observation type for "short match" and
- * one observation type for "long match".  */
-static forceinline void
-observe_match(struct block_split_stats *stats, unsigned length)
-{
-	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 9)]++;
-	stats->num_new_observations++;
-}
-
-static bool
-do_end_block_check(struct block_split_stats *stats, u32 block_length)
-{
-	int i;
-
-	if (stats->num_observations > 0) {
-
-		/* Note: to avoid slow divisions, we do not divide by
-		 * 'num_observations', but rather do all math with the numbers
-		 * multiplied by 'num_observations'.  */
-		u32 total_delta = 0;
-		for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-			u32 expected = stats->observations[i] * stats->num_new_observations;
-			u32 actual = stats->new_observations[i] * stats->num_observations;
-			u32 delta = (actual > expected) ? actual - expected :
-							  expected - actual;
-			total_delta += delta;
-		}
-
-		/* Ready to end the block? */
-		if (total_delta + (block_length / 4096) * stats->num_observations >=
-		    NUM_OBSERVATIONS_PER_BLOCK_CHECK * 200 / 512 * stats->num_observations)
-			return true;
-	}
-
-	for (i = 0; i < NUM_OBSERVATION_TYPES; i++) {
-		stats->num_observations += stats->new_observations[i];
-		stats->observations[i] += stats->new_observations[i];
-		stats->new_observations[i] = 0;
-	}
-	stats->num_new_observations = 0;
-	return false;
-}
-
-static forceinline bool
-should_end_block(struct block_split_stats *stats,
-		 const u8 *in_block_begin, const u8 *in_next, const u8 *in_end)
-{
-	/* Ready to check block split statistics? */
-	if (stats->num_new_observations < NUM_OBSERVATIONS_PER_BLOCK_CHECK ||
-	    in_next - in_block_begin < MIN_BLOCK_LENGTH ||
-	    in_end - in_next < MIN_BLOCK_LENGTH)
-		return false;
-
-	return do_end_block_check(stats, in_next - in_block_begin);
-}
-
-/******************************************************************************/
-
-/*
- * This is the "greedy" DEFLATE compressor. It always chooses the longest match.
- */
-static size_t
-deflate_compress_greedy(struct libdeflate_compressor * restrict c,
-			const u8 * restrict in, size_t in_nbytes,
-			u8 * restrict out, size_t out_nbytes_avail)
-{
-	const u8 *in_next = in;
-	const u8 *in_end = in_next + in_nbytes;
-	struct deflate_output_bitstream os;
-	const u8 *in_cur_base = in_next;
-	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
-	unsigned nice_len = MIN(c->nice_match_length, max_len);
-	u32 next_hashes[2] = {0, 0};
-
-	deflate_init_output(&os, out, out_nbytes_avail);
-	hc_matchfinder_init(&c->p.g.hc_mf);
-
-	do {
-		/* Starting a new DEFLATE block.  */
-
-		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
-		u32 litrunlen = 0;
-		struct deflate_sequence *next_seq = c->p.g.sequences;
-
-		init_block_split_stats(&c->split_stats);
-		deflate_reset_symbol_frequencies(c);
-
-		do {
-			u32 length;
-			u32 offset;
-
-			/* Decrease the maximum and nice match lengths if we're
-			 * approaching the end of the input buffer.  */
-			if (unlikely(max_len > in_end - in_next)) {
-				max_len = in_end - in_next;
-				nice_len = MIN(nice_len, max_len);
-			}
-
-			length = hc_matchfinder_longest_match(&c->p.g.hc_mf,
-							      &in_cur_base,
-							      in_next,
-							      DEFLATE_MIN_MATCH_LEN - 1,
-							      max_len,
-							      nice_len,
-							      c->max_search_depth,
-							      next_hashes,
-							      &offset);
-
-			if (length >= DEFLATE_MIN_MATCH_LEN) {
-				/* Match found.  */
-				deflate_choose_match(c, length, offset,
-						     &litrunlen, &next_seq);
-				observe_match(&c->split_stats, length);
-				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
-									&in_cur_base,
-									in_next + 1,
-									in_end,
-									length - 1,
-									next_hashes);
-			} else {
-				/* No match found.  */
-				deflate_choose_literal(c, *in_next, &litrunlen);
-				observe_literal(&c->split_stats, *in_next);
-				in_next++;
-			}
-
-			/* Check if it's time to output another block.  */
-		} while (in_next < in_max_block_end &&
-			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
-
-		deflate_finish_sequence(next_seq, litrunlen);
-		deflate_flush_block(c, &os, in_block_begin,
-				    in_next - in_block_begin,
-				    in_next == in_end, false);
-	} while (in_next != in_end);
-
-	return deflate_flush_output(&os);
-}
-
-/*
- * This is the "lazy" DEFLATE compressor.  Before choosing a match, it checks to
- * see if there's a longer match at the next position.  If yes, it outputs a
- * literal and continues to the next position.  If no, it outputs the match.
- */
-static size_t
-deflate_compress_lazy(struct libdeflate_compressor * restrict c,
-		      const u8 * restrict in, size_t in_nbytes,
-		      u8 * restrict out, size_t out_nbytes_avail)
-{
-	const u8 *in_next = in;
-	const u8 *in_end = in_next + in_nbytes;
-	struct deflate_output_bitstream os;
-	const u8 *in_cur_base = in_next;
-	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
-	unsigned nice_len = MIN(c->nice_match_length, max_len);
-	u32 next_hashes[2] = {0, 0};
-
-	deflate_init_output(&os, out, out_nbytes_avail);
-	hc_matchfinder_init(&c->p.g.hc_mf);
-
-	do {
-		/* Starting a new DEFLATE block.  */
-
-		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
-		u32 litrunlen = 0;
-		struct deflate_sequence *next_seq = c->p.g.sequences;
-
-		init_block_split_stats(&c->split_stats);
-		deflate_reset_symbol_frequencies(c);
-
-		do {
-			unsigned cur_len;
-			unsigned cur_offset;
-			unsigned next_len;
-			unsigned next_offset;
-
-			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
-				max_len = in_end - in_next;
-				nice_len = MIN(nice_len, max_len);
-			}
-
-			/* Find the longest match at the current position.  */
-			cur_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
-							       &in_cur_base,
-							       in_next,
-							       DEFLATE_MIN_MATCH_LEN - 1,
-							       max_len,
-							       nice_len,
-							       c->max_search_depth,
-							       next_hashes,
-							       &cur_offset);
-			in_next += 1;
-
-			if (cur_len < DEFLATE_MIN_MATCH_LEN) {
-				/* No match found.  Choose a literal.  */
-				deflate_choose_literal(c, *(in_next - 1), &litrunlen);
-				observe_literal(&c->split_stats, *(in_next - 1));
-				continue;
-			}
-
-		have_cur_match:
-			observe_match(&c->split_stats, cur_len);
-
-			/* We have a match at the current position.  */
-
-			/* If the current match is very long, choose it
-			 * immediately.  */
-			if (cur_len >= nice_len) {
-				deflate_choose_match(c, cur_len, cur_offset,
-						     &litrunlen, &next_seq);
-				in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
-									&in_cur_base,
-									in_next,
-									in_end,
-									cur_len - 1,
-									next_hashes);
-				continue;
-			}
-
-			/*
-			 * Try to find a match at the next position.
-			 *
-			 * Note: since we already have a match at the *current*
-			 * position, we use only half the 'max_search_depth'
-			 * when checking the *next* position.  This is a useful
-			 * trade-off because it's more worthwhile to use a
-			 * greater search depth on the initial match.
-			 *
-			 * Note: it's possible to structure the code such that
-			 * there's only one call to longest_match(), which
-			 * handles both the "find the initial match" and "try to
-			 * find a longer match" cases.  However, it is faster to
-			 * have two call sites, with longest_match() inlined at
-			 * each.
-			 */
-			if (unlikely(in_end - in_next < DEFLATE_MAX_MATCH_LEN)) {
-				max_len = in_end - in_next;
-				nice_len = MIN(nice_len, max_len);
-			}
-			next_len = hc_matchfinder_longest_match(&c->p.g.hc_mf,
-								&in_cur_base,
-								in_next,
-								cur_len,
-								max_len,
-								nice_len,
-								c->max_search_depth / 2,
-								next_hashes,
-								&next_offset);
-			in_next += 1;
-
-			if (next_len > cur_len) {
-				/* Found a longer match at the next position.
-				 * Output a literal.  Then the next match
-				 * becomes the current match.  */
-				deflate_choose_literal(c, *(in_next - 2), &litrunlen);
-				cur_len = next_len;
-				cur_offset = next_offset;
-				goto have_cur_match;
-			}
-
-			/* No longer match at the next position.
-			 * Output the current match.  */
-			deflate_choose_match(c, cur_len, cur_offset,
-					     &litrunlen, &next_seq);
-			in_next = hc_matchfinder_skip_positions(&c->p.g.hc_mf,
-								&in_cur_base,
-								in_next,
-								in_end,
-								cur_len - 2,
-								next_hashes);
-
-			/* Check if it's time to output another block.  */
-		} while (in_next < in_max_block_end &&
-			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
-
-		deflate_finish_sequence(next_seq, litrunlen);
-		deflate_flush_block(c, &os, in_block_begin,
-				    in_next - in_block_begin,
-				    in_next == in_end, false);
-	} while (in_next != in_end);
-
-	return deflate_flush_output(&os);
-}
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-
-/*
- * Follow the minimum-cost path in the graph of possible match/literal choices
- * for the current block and compute the frequencies of the Huffman symbols that
- * would be needed to output those matches and literals.
- */
-static void
-deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length)
-{
-	struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0];
-	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
-	do {
-		unsigned length = cur_node->item & OPTIMUM_LEN_MASK;
-		unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT;
-
-		if (length == 1) {
-			/* Literal  */
-			c->freqs.litlen[offset]++;
-		} else {
-			/* Match  */
-			c->freqs.litlen[257 + deflate_length_slot[length]]++;
-			c->freqs.offset[deflate_get_offset_slot(c, offset)]++;
-		}
-		cur_node += length;
-	} while (cur_node != end_node);
-}
-
-/* Set the current cost model from the codeword lengths specified in @lens.  */
-static void
-deflate_set_costs_from_codes(struct libdeflate_compressor *c,
-			     const struct deflate_lens *lens)
-{
-	unsigned i;
-
-	/* Literals  */
-	for (i = 0; i < DEFLATE_NUM_LITERALS; i++) {
-		u32 bits = (lens->litlen[i] ? lens->litlen[i] : LITERAL_NOSTAT_BITS);
-		c->p.n.costs.literal[i] = bits << COST_SHIFT;
-	}
-
-	/* Lengths  */
-	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) {
-		unsigned length_slot = deflate_length_slot[i];
-		unsigned litlen_sym = 257 + length_slot;
-		u32 bits = (lens->litlen[litlen_sym] ? lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS);
-		bits += deflate_extra_length_bits[length_slot];
-		c->p.n.costs.length[i] = bits << COST_SHIFT;
-	}
-
-	/* Offset slots  */
-	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) {
-		u32 bits = (lens->offset[i] ? lens->offset[i] : OFFSET_NOSTAT_BITS);
-		bits += deflate_extra_offset_bits[i];
-		c->p.n.costs.offset_slot[i] = bits << COST_SHIFT;
-	}
-}
-
-static forceinline u32
-deflate_default_literal_cost(unsigned literal)
-{
-	STATIC_ASSERT(COST_SHIFT == 3);
-	/* 66 is 8.25 bits/symbol  */
-	return 66;
-}
-
-static forceinline u32
-deflate_default_length_slot_cost(unsigned length_slot)
-{
-	STATIC_ASSERT(COST_SHIFT == 3);
-	/* 60 is 7.5 bits/symbol  */
-	return 60 + ((u32)deflate_extra_length_bits[length_slot] << COST_SHIFT);
-}
-
-static forceinline u32
-deflate_default_offset_slot_cost(unsigned offset_slot)
-{
-	STATIC_ASSERT(COST_SHIFT == 3);
-	/* 39 is 4.875 bits/symbol  */
-	return 39 + ((u32)deflate_extra_offset_bits[offset_slot] << COST_SHIFT);
-}
-
-/*
- * Set default symbol costs for the first block's first optimization pass.
- *
- * It works well to assume that each symbol is equally probable.  This results
- * in each symbol being assigned a cost of (-log2(1.0/num_syms) * (1 <<
- * COST_SHIFT)) where 'num_syms' is the number of symbols in the corresponding
- * alphabet.  However, we intentionally bias the parse towards matches rather
- * than literals by using a slightly lower default cost for length symbols than
- * for literals.  This often improves the compression ratio slightly.
- */
-static void
-deflate_set_default_costs(struct libdeflate_compressor *c)
-{
-	unsigned i;
-
-	/* Literals  */
-	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
-		c->p.n.costs.literal[i] = deflate_default_literal_cost(i);
-
-	/* Lengths  */
-	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
-		c->p.n.costs.length[i] = deflate_default_length_slot_cost(
-						deflate_length_slot[i]);
-
-	/* Offset slots  */
-	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
-		c->p.n.costs.offset_slot[i] = deflate_default_offset_slot_cost(i);
-}
-
-static forceinline void
-deflate_adjust_cost(u32 *cost_p, u32 default_cost)
-{
-	*cost_p += ((s32)default_cost - (s32)*cost_p) >> 1;
-}
-
-/*
- * Adjust the costs when beginning a new block.
- *
- * Since the current costs have been optimized for the data, it's undesirable to
- * throw them away and start over with the default costs.  At the same time, we
- * don't want to bias the parse by assuming that the next block will be similar
- * to the current block.  As a compromise, make the costs closer to the
- * defaults, but don't simply set them to the defaults.
- */
-static void
-deflate_adjust_costs(struct libdeflate_compressor *c)
-{
-	unsigned i;
-
-	/* Literals  */
-	for (i = 0; i < DEFLATE_NUM_LITERALS; i++)
-		deflate_adjust_cost(&c->p.n.costs.literal[i],
-				    deflate_default_literal_cost(i));
-
-	/* Lengths  */
-	for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++)
-		deflate_adjust_cost(&c->p.n.costs.length[i],
-				    deflate_default_length_slot_cost(
-						deflate_length_slot[i]));
-
-	/* Offset slots  */
-	for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++)
-		deflate_adjust_cost(&c->p.n.costs.offset_slot[i],
-				    deflate_default_offset_slot_cost(i));
-}
-
-/*
- * Find the minimum-cost path through the graph of possible match/literal
- * choices for this block.
- *
- * We find the minimum cost path from 'c->p.n.optimum_nodes[0]', which
- * represents the node at the beginning of the block, to
- * 'c->p.n.optimum_nodes[block_length]', which represents the node at the end of
- * the block.  Edge costs are evaluated using the cost model 'c->p.n.costs'.
- *
- * The algorithm works backwards, starting at the end node and proceeding
- * backwards one node at a time.  At each node, the minimum cost to reach the
- * end node is computed and the match/literal choice that begins that path is
- * saved.
- */
-static void
-deflate_find_min_cost_path(struct libdeflate_compressor *c,
-			   const u32 block_length,
-			   const struct lz_match *cache_ptr)
-{
-	struct deflate_optimum_node *end_node = &c->p.n.optimum_nodes[block_length];
-	struct deflate_optimum_node *cur_node = end_node;
-
-	cur_node->cost_to_end = 0;
-	do {
-		unsigned num_matches;
-		unsigned literal;
-		u32 best_cost_to_end;
-
-		cur_node--;
-		cache_ptr--;
-
-		num_matches = cache_ptr->length;
-		literal = cache_ptr->offset;
-
-		/* It's always possible to choose a literal.  */
-		best_cost_to_end = c->p.n.costs.literal[literal] +
-				   (cur_node + 1)->cost_to_end;
-		cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1;
-
-		/* Also consider matches if there are any.  */
-		if (num_matches) {
-			const struct lz_match *match;
-			unsigned len;
-			unsigned offset;
-			unsigned offset_slot;
-			u32 offset_cost;
-			u32 cost_to_end;
-
-			/*
-			 * Consider each length from the minimum
-			 * (DEFLATE_MIN_MATCH_LEN) to the length of the longest
-			 * match found at this position.  For each length, we
-			 * consider only the smallest offset for which that
-			 * length is available.  Although this is not guaranteed
-			 * to be optimal due to the possibility of a larger
-			 * offset costing less than a smaller offset to code,
-			 * this is a very useful heuristic.
-			 */
-			match = cache_ptr - num_matches;
-			len = DEFLATE_MIN_MATCH_LEN;
-			do {
-				offset = match->offset;
-				offset_slot = deflate_get_offset_slot(c, offset);
-				offset_cost = c->p.n.costs.offset_slot[offset_slot];
-				do {
-					cost_to_end = offset_cost +
-						      c->p.n.costs.length[len] +
-						      (cur_node + len)->cost_to_end;
-					if (cost_to_end < best_cost_to_end) {
-						best_cost_to_end = cost_to_end;
-						cur_node->item = ((u32)offset << OPTIMUM_OFFSET_SHIFT) | len;
-					}
-				} while (++len <= match->length);
-			} while (++match != cache_ptr);
-			cache_ptr -= num_matches;
-		}
-		cur_node->cost_to_end = best_cost_to_end;
-	} while (cur_node != &c->p.n.optimum_nodes[0]);
-}
-
-/*
- * Choose the literal/match sequence to use for the current block.  The basic
- * algorithm finds a minimum-cost path through the block's graph of
- * literal/match choices, given a cost model.  However, the cost of each symbol
- * is unknown until the Huffman codes have been built, but at the same time the
- * Huffman codes depend on the frequencies of chosen symbols.  Consequently,
- * multiple passes must be used to try to approximate an optimal solution.  The
- * first pass uses default costs, mixed with the costs from the previous block
- * if any.  Later passes use the Huffman codeword lengths from the previous pass
- * as the costs.
- */
-static void
-deflate_optimize_block(struct libdeflate_compressor *c, u32 block_length,
-		       const struct lz_match *cache_ptr, bool is_first_block)
-{
-	unsigned num_passes_remaining = c->p.n.num_optim_passes;
-	u32 i;
-
-	/* Force the block to really end at the desired length, even if some
-	 * matches extend beyond it. */
-	for (i = block_length; i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN,
-					ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++)
-		c->p.n.optimum_nodes[i].cost_to_end = 0x80000000;
-
-	/* Set the initial costs. */
-	if (is_first_block)
-		deflate_set_default_costs(c);
-	else
-		deflate_adjust_costs(c);
-
-	for (;;) {
-		/* Find the minimum cost path for this pass. */
-		deflate_find_min_cost_path(c, block_length, cache_ptr);
-
-		/* Compute frequencies of the chosen symbols. */
-		deflate_reset_symbol_frequencies(c);
-		deflate_tally_item_list(c, block_length);
-
-		if (--num_passes_remaining == 0)
-			break;
-
-		/* At least one optimization pass remains; update the costs. */
-		deflate_make_huffman_codes(&c->freqs, &c->codes);
-		deflate_set_costs_from_codes(c, &c->codes.lens);
-	}
-}
-
-/*
- * This is the "near-optimal" DEFLATE compressor.  It computes the optimal
- * representation of each DEFLATE block using a minimum-cost path search over
- * the graph of possible match/literal choices for that block, assuming a
- * certain cost for each Huffman symbol.
- *
- * For several reasons, the end result is not guaranteed to be optimal:
- *
- * - Nonoptimal choice of blocks
- * - Heuristic limitations on which matches are actually considered
- * - Symbol costs are unknown until the symbols have already been chosen
- *   (so iterative optimization must be used)
- */
-static size_t
-deflate_compress_near_optimal(struct libdeflate_compressor * restrict c,
-			      const u8 * restrict in, size_t in_nbytes,
-			      u8 * restrict out, size_t out_nbytes_avail)
-{
-	const u8 *in_next = in;
-	const u8 *in_end = in_next + in_nbytes;
-	struct deflate_output_bitstream os;
-	const u8 *in_cur_base = in_next;
-	const u8 *in_next_slide = in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE);
-	unsigned max_len = DEFLATE_MAX_MATCH_LEN;
-	unsigned nice_len = MIN(c->nice_match_length, max_len);
-	u32 next_hashes[2] = {0, 0};
-
-	deflate_init_output(&os, out, out_nbytes_avail);
-	bt_matchfinder_init(&c->p.n.bt_mf);
-
-	do {
-		/* Starting a new DEFLATE block.  */
-
-		struct lz_match *cache_ptr = c->p.n.match_cache;
-		const u8 * const in_block_begin = in_next;
-		const u8 * const in_max_block_end =
-			in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
-		const u8 *next_observation = in_next;
-
-		init_block_split_stats(&c->split_stats);
-
-		/*
-		 * Find matches until we decide to end the block.  We end the
-		 * block if any of the following is true:
-		 *
-		 * (1) Maximum block length has been reached
-		 * (2) Match catch may overflow.
-		 * (3) Block split heuristic says to split now.
-		 */
-		do {
-			struct lz_match *matches;
-			unsigned best_len;
-
-			/* Slide the window forward if needed.  */
-			if (in_next == in_next_slide) {
-				bt_matchfinder_slide_window(&c->p.n.bt_mf);
-				in_cur_base = in_next;
-				in_next_slide = in_next + MIN(in_end - in_next,
-							      MATCHFINDER_WINDOW_SIZE);
-			}
-
-			/* Decrease the maximum and nice match lengths if we're
-			 * approaching the end of the input buffer.  */
-			if (unlikely(max_len > in_end - in_next)) {
-				max_len = in_end - in_next;
-				nice_len = MIN(nice_len, max_len);
-			}
-
-			/*
-			 * Find matches with the current position using the
-			 * binary tree matchfinder and save them in
-			 * 'match_cache'.
-			 *
-			 * Note: the binary tree matchfinder is more suited for
-			 * optimal parsing than the hash chain matchfinder.  The
-			 * reasons for this include:
-			 *
-			 * - The binary tree matchfinder can find more matches
-			 *   in the same number of steps.
-			 * - One of the major advantages of hash chains is that
-			 *   skipping positions (not searching for matches at
-			 *   them) is faster; however, with optimal parsing we
-			 *   search for matches at almost all positions, so this
-			 *   advantage of hash chains is negated.
-			 */
-			matches = cache_ptr;
-			best_len = 0;
-			if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) {
-				cache_ptr = bt_matchfinder_get_matches(&c->p.n.bt_mf,
-								       in_cur_base,
-								       in_next - in_cur_base,
-								       max_len,
-								       nice_len,
-								       c->max_search_depth,
-								       next_hashes,
-								       &best_len,
-								       matches);
-			}
-
-			if (in_next >= next_observation) {
-				if (best_len >= 4) {
-					observe_match(&c->split_stats, best_len);
-					next_observation = in_next + best_len;
-				} else {
-					observe_literal(&c->split_stats, *in_next);
-					next_observation = in_next + 1;
-				}
-			}
-
-			cache_ptr->length = cache_ptr - matches;
-			cache_ptr->offset = *in_next;
-			in_next++;
-			cache_ptr++;
-
-			/*
-			 * If there was a very long match found, don't cache any
-			 * matches for the bytes covered by that match.  This
-			 * avoids degenerate behavior when compressing highly
-			 * redundant data, where the number of matches can be
-			 * very large.
-			 *
-			 * This heuristic doesn't actually hurt the compression
-			 * ratio very much.  If there's a long match, then the
-			 * data must be highly compressible, so it doesn't
-			 * matter much what we do.
-			 */
-			if (best_len >= DEFLATE_MIN_MATCH_LEN && best_len >= nice_len) {
-				--best_len;
-				do {
-					if (in_next == in_next_slide) {
-						bt_matchfinder_slide_window(&c->p.n.bt_mf);
-						in_cur_base = in_next;
-						in_next_slide = in_next + MIN(in_end - in_next,
-									      MATCHFINDER_WINDOW_SIZE);
-					}
-					if (unlikely(max_len > in_end - in_next)) {
-						max_len = in_end - in_next;
-						nice_len = MIN(nice_len, max_len);
-					}
-					if (max_len >= BT_MATCHFINDER_REQUIRED_NBYTES) {
-						bt_matchfinder_skip_position(&c->p.n.bt_mf,
-									     in_cur_base,
-									     in_next - in_cur_base,
-									     nice_len,
-									     c->max_search_depth,
-									     next_hashes);
-					}
-					cache_ptr->length = 0;
-					cache_ptr->offset = *in_next;
-					in_next++;
-					cache_ptr++;
-				} while (--best_len);
-			}
-		} while (in_next < in_max_block_end &&
-			 cache_ptr < &c->p.n.match_cache[CACHE_LENGTH] &&
-			 !should_end_block(&c->split_stats, in_block_begin, in_next, in_end));
-
-		/* All the matches for this block have been cached.  Now choose
-		 * the sequence of items to output and flush the block.  */
-		deflate_optimize_block(c, in_next - in_block_begin, cache_ptr,
-				       in_block_begin == in);
-		deflate_flush_block(c, &os, in_block_begin, in_next - in_block_begin,
-				    in_next == in_end, true);
-	} while (in_next != in_end);
-
-	return deflate_flush_output(&os);
-}
-
-#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */
-
-/* Initialize c->offset_slot_fast.  */
-static void
-deflate_init_offset_slot_fast(struct libdeflate_compressor *c)
-{
-	unsigned offset_slot;
-	unsigned offset;
-	unsigned offset_end;
-
-	for (offset_slot = 0;
-	     offset_slot < ARRAY_LEN(deflate_offset_slot_base);
-	     offset_slot++)
-	{
-		offset = deflate_offset_slot_base[offset_slot];
-	#if USE_FULL_OFFSET_SLOT_FAST
-		offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
-		do {
-			c->offset_slot_fast[offset] = offset_slot;
-		} while (++offset != offset_end);
-	#else
-		if (offset <= 256) {
-			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
-			do {
-				c->offset_slot_fast[offset - 1] = offset_slot;
-			} while (++offset != offset_end);
-		} else {
-			offset_end = offset + (1 << deflate_extra_offset_bits[offset_slot]);
-			do {
-				c->offset_slot_fast[256 + ((offset - 1) >> 7)] = offset_slot;
-			} while ((offset += (1 << 7)) != offset_end);
-		}
-	#endif
-	}
-}
-
-LIBDEFLATEAPI struct libdeflate_compressor *
-libdeflate_alloc_compressor(int compression_level)
-{
-	struct libdeflate_compressor *c;
-	size_t size;
-
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-	if (compression_level >= 8)
-		size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.n);
-	else
-#endif
-		size = offsetof(struct libdeflate_compressor, p) + sizeof(c->p.g);
-
-	c = aligned_malloc(MATCHFINDER_ALIGNMENT, size);
-	if (!c)
-		return NULL;
-
-	switch (compression_level) {
-	case 1:
-		c->impl = deflate_compress_greedy;
-		c->max_search_depth = 2;
-		c->nice_match_length = 8;
-		break;
-	case 2:
-		c->impl = deflate_compress_greedy;
-		c->max_search_depth = 6;
-		c->nice_match_length = 10;
-		break;
-	case 3:
-		c->impl = deflate_compress_greedy;
-		c->max_search_depth = 12;
-		c->nice_match_length = 14;
-		break;
-	case 4:
-		c->impl = deflate_compress_greedy;
-		c->max_search_depth = 24;
-		c->nice_match_length = 24;
-		break;
-	case 5:
-		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 20;
-		c->nice_match_length = 30;
-		break;
-	case 6:
-		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 40;
-		c->nice_match_length = 65;
-		break;
-	case 7:
-		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 100;
-		c->nice_match_length = 130;
-		break;
-#if SUPPORT_NEAR_OPTIMAL_PARSING
-	case 8:
-		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 12;
-		c->nice_match_length = 20;
-		c->p.n.num_optim_passes = 1;
-		break;
-	case 9:
-		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 16;
-		c->nice_match_length = 26;
-		c->p.n.num_optim_passes = 2;
-		break;
-	case 10:
-		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 30;
-		c->nice_match_length = 50;
-		c->p.n.num_optim_passes = 2;
-		break;
-	case 11:
-		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 60;
-		c->nice_match_length = 80;
-		c->p.n.num_optim_passes = 3;
-		break;
-	case 12:
-		c->impl = deflate_compress_near_optimal;
-		c->max_search_depth = 100;
-		c->nice_match_length = 133;
-		c->p.n.num_optim_passes = 4;
-		break;
-#else
-	case 8:
-		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 150;
-		c->nice_match_length = 200;
-		break;
-	case 9:
-		c->impl = deflate_compress_lazy;
-		c->max_search_depth = 200;
-		c->nice_match_length = DEFLATE_MAX_MATCH_LEN;
-		break;
-#endif
-	default:
-		aligned_free(c);
-		return NULL;
-	}
-
-	c->compression_level = compression_level;
-
-	deflate_init_offset_slot_fast(c);
-	deflate_init_static_codes(c);
-
-	return c;
-}
-
-LIBDEFLATEAPI size_t
-libdeflate_deflate_compress(struct libdeflate_compressor *c,
-			    const void *in, size_t in_nbytes,
-			    void *out, size_t out_nbytes_avail)
-{
-	if (unlikely(out_nbytes_avail < OUTPUT_END_PADDING))
-		return 0;
-
-	/* For extremely small inputs just use a single uncompressed block. */
-	if (unlikely(in_nbytes < 16)) {
-		struct deflate_output_bitstream os;
-		deflate_init_output(&os, out, out_nbytes_avail);
-		if (in_nbytes == 0)
-			in = &os; /* Avoid passing NULL to memcpy() */
-		deflate_write_uncompressed_block(&os, in, in_nbytes, true);
-		return deflate_flush_output(&os);
-	}
-
-	return (*c->impl)(c, in, in_nbytes, out, out_nbytes_avail);
-}
-
-LIBDEFLATEAPI void
-libdeflate_free_compressor(struct libdeflate_compressor *c)
-{
-	aligned_free(c);
-}
-
-unsigned int
-deflate_get_compression_level(struct libdeflate_compressor *c)
-{
-	return c->compression_level;
-}
-
-LIBDEFLATEAPI size_t
-libdeflate_deflate_compress_bound(struct libdeflate_compressor *c,
-				  size_t in_nbytes)
-{
-	/*
-	 * The worst case is all uncompressed blocks where one block has length
-	 * <= MIN_BLOCK_LENGTH and the others have length MIN_BLOCK_LENGTH.
-	 * Each uncompressed block has 5 bytes of overhead: 1 for BFINAL, BTYPE,
-	 * and alignment to a byte boundary; 2 for LEN; and 2 for NLEN.
-	 */
-	size_t max_num_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1);
-	return (5 * max_num_blocks) + in_nbytes + 1 + OUTPUT_END_PADDING;
-}
diff --git a/ext/libdeflate/lib/deflate_compress.h b/ext/libdeflate/lib/deflate_compress.h
deleted file mode 100644
index f4bb23b6..00000000
--- a/ext/libdeflate/lib/deflate_compress.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef LIB_DEFLATE_COMPRESS_H
-#define LIB_DEFLATE_COMPRESS_H
-
-#include "lib_common.h"
-
-/* DEFLATE compression is private to deflate_compress.c, but we do need to be
- * able to query the compression level for zlib and gzip header generation.  */
-
-struct libdeflate_compressor;
-
-extern unsigned int
-deflate_get_compression_level(struct libdeflate_compressor *c);
-
-#endif /* LIB_DEFLATE_COMPRESS_H */
diff --git a/ext/libdeflate/lib/deflate_constants.h b/ext/libdeflate/lib/deflate_constants.h
deleted file mode 100644
index a10b57de..00000000
--- a/ext/libdeflate/lib/deflate_constants.h
+++ /dev/null
@@ -1,66 +0,0 @@
-/*
- * deflate_constants.h - constants for the DEFLATE compression format
- */
-
-#ifndef LIB_DEFLATE_CONSTANTS_H
-#define LIB_DEFLATE_CONSTANTS_H
-
-/* Valid block types  */
-#define DEFLATE_BLOCKTYPE_UNCOMPRESSED		0
-#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN	1
-#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN	2
-
-/* Minimum and maximum supported match lengths (in bytes)  */
-#define DEFLATE_MIN_MATCH_LEN			3
-#define DEFLATE_MAX_MATCH_LEN			258
-
-/* Minimum and maximum supported match offsets (in bytes)  */
-#define DEFLATE_MIN_MATCH_OFFSET		1
-#define DEFLATE_MAX_MATCH_OFFSET		32768
-
-#define DEFLATE_MAX_WINDOW_SIZE			32768
-
-/* Number of symbols in each Huffman code.  Note: for the literal/length
- * and offset codes, these are actually the maximum values; a given block
- * might use fewer symbols.  */
-#define DEFLATE_NUM_PRECODE_SYMS		19
-#define DEFLATE_NUM_LITLEN_SYMS			288
-#define DEFLATE_NUM_OFFSET_SYMS			32
-
-/* The maximum number of symbols across all codes  */
-#define DEFLATE_MAX_NUM_SYMS			288
-
-/* Division of symbols in the literal/length code  */
-#define DEFLATE_NUM_LITERALS			256
-#define DEFLATE_END_OF_BLOCK			256
-#define DEFLATE_NUM_LEN_SYMS			31
-
-/* Maximum codeword length, in bits, within each Huffman code  */
-#define DEFLATE_MAX_PRE_CODEWORD_LEN		7
-#define DEFLATE_MAX_LITLEN_CODEWORD_LEN		15
-#define DEFLATE_MAX_OFFSET_CODEWORD_LEN		15
-
-/* The maximum codeword length across all codes  */
-#define DEFLATE_MAX_CODEWORD_LEN		15
-
-/* Maximum possible overrun when decoding codeword lengths  */
-#define DEFLATE_MAX_LENS_OVERRUN		137
-
-/*
- * Maximum number of extra bits that may be required to represent a match
- * length or offset.
- *
- * TODO: are we going to have full DEFLATE64 support?  If so, up to 16
- * length bits must be supported.
- */
-#define DEFLATE_MAX_EXTRA_LENGTH_BITS		5
-#define DEFLATE_MAX_EXTRA_OFFSET_BITS		14
-
-/* The maximum number of bits in which a match can be represented.  This
- * is the absolute worst case, which assumes the longest possible Huffman
- * codewords and the maximum numbers of extra bits.  */
-#define DEFLATE_MAX_MATCH_BITS	\
-	(DEFLATE_MAX_LITLEN_CODEWORD_LEN + DEFLATE_MAX_EXTRA_LENGTH_BITS + \
-	DEFLATE_MAX_OFFSET_CODEWORD_LEN + DEFLATE_MAX_EXTRA_OFFSET_BITS)
-
-#endif /* LIB_DEFLATE_CONSTANTS_H */
diff --git a/ext/libdeflate/lib/deflate_decompress.c b/ext/libdeflate/lib/deflate_decompress.c
deleted file mode 100644
index 19ccdb21..00000000
--- a/ext/libdeflate/lib/deflate_decompress.c
+++ /dev/null
@@ -1,997 +0,0 @@
-/*
- * deflate_decompress.c - a decompressor for DEFLATE
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ---------------------------------------------------------------------------
- *
- * This is a highly optimized DEFLATE decompressor.  When compiled with gcc on
- * x86_64, it decompresses data in about 52% of the time of zlib (48% if BMI2
- * instructions are available).  On other architectures it should still be
- * significantly faster than zlib, but the difference may be smaller.
- *
- * Why this is faster than zlib's implementation:
- *
- * - Word accesses rather than byte accesses when reading input
- * - Word accesses rather than byte accesses when copying matches
- * - Faster Huffman decoding combined with various DEFLATE-specific tricks
- * - Larger bitbuffer variable that doesn't need to be filled as often
- * - Other optimizations to remove unnecessary branches
- * - Only full-buffer decompression is supported, so the code doesn't need to
- *   support stopping and resuming decompression.
- * - On x86_64, compile a version of the decompression routine using BMI2
- *   instructions and use it automatically at runtime when supported.
- */
-
-#include <limits.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "deflate_constants.h"
-#include "unaligned.h"
-
-#include "libdeflate.h"
-
-/*
- * If the expression passed to SAFETY_CHECK() evaluates to false, then the
- * decompression routine immediately returns LIBDEFLATE_BAD_DATA, indicating the
- * compressed data is invalid.
- *
- * Theoretically, these checks could be disabled for specialized applications
- * where all input to the decompressor will be trusted.
- */
-#if 0
-#  pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!")
-#  define SAFETY_CHECK(expr)	(void)(expr)
-#else
-#  define SAFETY_CHECK(expr)	if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA
-#endif
-
-/*
- * Each TABLEBITS number is the base-2 logarithm of the number of entries in the
- * main portion of the corresponding decode table.  Each number should be large
- * enough to ensure that for typical data, the vast majority of symbols can be
- * decoded by a direct lookup of the next TABLEBITS bits of compressed data.
- * However, this must be balanced against the fact that a larger table requires
- * more memory and requires more time to fill.
- *
- * Note: you cannot change a TABLEBITS number without also changing the
- * corresponding ENOUGH number!
- */
-#define PRECODE_TABLEBITS	7
-#define LITLEN_TABLEBITS	10
-#define OFFSET_TABLEBITS	8
-
-/*
- * Each ENOUGH number is the maximum number of decode table entries that may be
- * required for the corresponding Huffman code, including the main table and all
- * subtables.  Each number depends on three parameters:
- *
- *	(1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS)
- *	(2) the number of main table bits (the TABLEBITS numbers defined above)
- *	(3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN)
- *
- * The ENOUGH numbers were computed using the utility program 'enough' from
- * zlib.  This program enumerates all possible relevant Huffman codes to find
- * the worst-case usage of decode table entries.
- */
-#define PRECODE_ENOUGH		128	/* enough 19 7 7	*/
-#define LITLEN_ENOUGH		1334	/* enough 288 10 15	*/
-#define OFFSET_ENOUGH		402	/* enough 32 8 15	*/
-
-/*
- * Type for codeword lengths.
- */
-typedef u8 len_t;
-
-/*
- * The main DEFLATE decompressor structure.  Since this implementation only
- * supports full buffer decompression, this structure does not store the entire
- * decompression state, but rather only some arrays that are too large to
- * comfortably allocate on the stack.
- */
-struct libdeflate_decompressor {
-
-	/*
-	 * The arrays aren't all needed at the same time.  'precode_lens' and
-	 * 'precode_decode_table' are unneeded after 'lens' has been filled.
-	 * Furthermore, 'lens' need not be retained after building the litlen
-	 * and offset decode tables.  In fact, 'lens' can be in union with
-	 * 'litlen_decode_table' provided that 'offset_decode_table' is separate
-	 * and is built first.
-	 */
-
-	union {
-		len_t precode_lens[DEFLATE_NUM_PRECODE_SYMS];
-
-		struct {
-			len_t lens[DEFLATE_NUM_LITLEN_SYMS +
-				   DEFLATE_NUM_OFFSET_SYMS +
-				   DEFLATE_MAX_LENS_OVERRUN];
-
-			u32 precode_decode_table[PRECODE_ENOUGH];
-		} l;
-
-		u32 litlen_decode_table[LITLEN_ENOUGH];
-	} u;
-
-	u32 offset_decode_table[OFFSET_ENOUGH];
-
-	/* used only during build_decode_table() */
-	u16 sorted_syms[DEFLATE_MAX_NUM_SYMS];
-
-	bool static_codes_loaded;
-};
-
-/*****************************************************************************
- *				Input bitstream                              *
- *****************************************************************************/
-
-/*
- * The state of the "input bitstream" consists of the following variables:
- *
- *	- in_next: pointer to the next unread byte in the input buffer
- *
- *	- in_end: pointer just past the end of the input buffer
- *
- *	- bitbuf: a word-sized variable containing bits that have been read from
- *		  the input buffer.  The buffered bits are right-aligned
- *		  (they're the low-order bits).
- *
- *	- bitsleft: number of bits in 'bitbuf' that are valid.
- *
- * To make it easier for the compiler to optimize the code by keeping variables
- * in registers, these are declared as normal variables and manipulated using
- * macros.
- */
-
-/*
- * The type for the bitbuffer variable ('bitbuf' described above).  For best
- * performance, this should have size equal to a machine word.
- *
- * 64-bit platforms have a significant advantage: they get a bigger bitbuffer
- * which they have to fill less often.
- */
-typedef machine_word_t bitbuf_t;
-
-/*
- * Number of bits the bitbuffer variable can hold.
- *
- * This is one less than the obvious value because of the optimized arithmetic
- * in FILL_BITS_WORDWISE() that leaves 'bitsleft' in the range
- * [WORDBITS - 8, WORDBITS - 1] rather than [WORDBITS - 7, WORDBITS].
- */
-#define BITBUF_NBITS	(8 * sizeof(bitbuf_t) - 1)
-
-/*
- * The maximum number of bits that can be ensured in the bitbuffer variable,
- * i.e. the maximum value of 'n' that can be passed ENSURE_BITS(n).  The decoder
- * only reads whole bytes from memory, so this is the lowest value of 'bitsleft'
- * at which another byte cannot be read without first consuming some bits.
- */
-#define MAX_ENSURE	(BITBUF_NBITS - 7)
-
-/*
- * Evaluates to true if 'n' is a valid argument to ENSURE_BITS(n), or false if
- * 'n' is too large to be passed to ENSURE_BITS(n).  Note: if 'n' is a compile
- * time constant, then this expression will be a compile-type constant.
- * Therefore, CAN_ENSURE() can be used choose between alternative
- * implementations at compile time.
- */
-#define CAN_ENSURE(n)	((n) <= MAX_ENSURE)
-
-/*
- * Fill the bitbuffer variable, reading one byte at a time.
- *
- * If we would overread the input buffer, we just don't read anything, leaving
- * the bits zeroed but marking them filled.  This simplifies the decompressor
- * because it removes the need to distinguish between real overreads and
- * overreads that occur only because of the decompressor's own lookahead.
- *
- * The disadvantage is that real overreads are not detected immediately.
- * However, this is safe because the decompressor is still guaranteed to make
- * forward progress when presented never-ending 0 bits.  In an existing block
- * output will be getting generated, whereas new blocks can only be uncompressed
- * (since the type code for uncompressed blocks is 0), for which we check for
- * previous overread.  But even if we didn't check, uncompressed blocks would
- * fail to validate because LEN would not equal ~NLEN.  So the decompressor will
- * eventually either detect that the output buffer is full, or detect invalid
- * input, or finish the final block.
- */
-#define FILL_BITS_BYTEWISE()					\
-do {								\
-	if (likely(in_next != in_end))				\
-		bitbuf |= (bitbuf_t)*in_next++ << bitsleft;	\
-	else							\
-		overrun_count++;				\
-	bitsleft += 8;						\
-} while (bitsleft <= BITBUF_NBITS - 8)
-
-/*
- * Fill the bitbuffer variable by reading the next word from the input buffer
- * and branchlessly updating 'in_next' and 'bitsleft' based on how many bits
- * were filled.  This can be significantly faster than FILL_BITS_BYTEWISE().
- * However, for this to work correctly, the word must be interpreted in
- * little-endian format.  In addition, the memory access may be unaligned.
- * Therefore, this method is most efficient on little-endian architectures that
- * support fast unaligned access, such as x86 and x86_64.
- *
- * For faster updating of 'bitsleft', we consider the bitbuffer size in bits to
- * be 1 less than the word size and therefore be all 1 bits.  Then the number of
- * bits filled is the value of the 0 bits in position >= 3 when changed to 1.
- * E.g. if words are 64 bits and bitsleft = 16 = b010000 then we refill b101000
- * = 40 bits = 5 bytes.  This uses only 4 operations to update 'in_next' and
- * 'bitsleft': one each of +, ^, >>, and |.  (Not counting operations the
- * compiler optimizes out.)  In contrast, the alternative of:
- *
- *	in_next += (BITBUF_NBITS - bitsleft) >> 3;
- *	bitsleft += (BITBUF_NBITS - bitsleft) & ~7;
- *
- * (where BITBUF_NBITS would be WORDBITS rather than WORDBITS - 1) would on
- * average refill an extra bit, but uses 5 operations: two +, and one each of
- * -, >>, and &.  Also the - and & must be completed before 'bitsleft' can be
- * updated, while the current solution updates 'bitsleft' with no dependencies.
- */
-#define FILL_BITS_WORDWISE()					\
-do {								\
-	/* BITBUF_NBITS must be all 1's in binary, see above */	\
-	STATIC_ASSERT((BITBUF_NBITS & (BITBUF_NBITS + 1)) == 0);\
-								\
-	bitbuf |= get_unaligned_leword(in_next) << bitsleft;	\
-	in_next += (bitsleft ^ BITBUF_NBITS) >> 3;		\
-	bitsleft |= BITBUF_NBITS & ~7;				\
-} while (0)
-
-/*
- * Does the bitbuffer variable currently contain at least 'n' bits?
- */
-#define HAVE_BITS(n) (bitsleft >= (n))
-
-/*
- * Load more bits from the input buffer until the specified number of bits is
- * present in the bitbuffer variable.  'n' cannot be too large; see MAX_ENSURE
- * and CAN_ENSURE().
- */
-#define ENSURE_BITS(n)						\
-if (!HAVE_BITS(n)) {						\
-	if (CPU_IS_LITTLE_ENDIAN() &&				\
-	    UNALIGNED_ACCESS_IS_FAST &&				\
-	    likely(in_end - in_next >= sizeof(bitbuf_t)))	\
-		FILL_BITS_WORDWISE();				\
-	else							\
-		FILL_BITS_BYTEWISE();				\
-}
-
-/*
- * Return the next 'n' bits from the bitbuffer variable without removing them.
- */
-#define BITS(n) ((u32)bitbuf & (((u32)1 << (n)) - 1))
-
-/*
- * Remove the next 'n' bits from the bitbuffer variable.
- */
-#define REMOVE_BITS(n) (bitbuf >>= (n), bitsleft -= (n))
-
-/*
- * Remove and return the next 'n' bits from the bitbuffer variable.
- */
-#define POP_BITS(n) (tmp32 = BITS(n), REMOVE_BITS(n), tmp32)
-
-/*
- * Verify that the input buffer hasn't been overread, then align the input to
- * the next byte boundary, discarding any remaining bits in the current byte.
- *
- * Note that if the bitbuffer variable currently contains more than 7 bits, then
- * we must rewind 'in_next', effectively putting those bits back.  Only the bits
- * in what would be the "current" byte if we were reading one byte at a time can
- * be actually discarded.
- */
-#define ALIGN_INPUT()							\
-do {									\
-	SAFETY_CHECK(overrun_count <= (bitsleft >> 3));			\
-	in_next -= (bitsleft >> 3) - overrun_count;			\
-	overrun_count = 0;						\
-	bitbuf = 0;							\
-	bitsleft = 0;							\
-} while(0)
-
-/*
- * Read a 16-bit value from the input.  This must have been preceded by a call
- * to ALIGN_INPUT(), and the caller must have already checked for overrun.
- */
-#define READ_U16() (tmp16 = get_unaligned_le16(in_next), in_next += 2, tmp16)
-
-/*****************************************************************************
- *                              Huffman decoding                             *
- *****************************************************************************/
-
-/*
- * A decode table for order TABLEBITS consists of a main table of (1 <<
- * TABLEBITS) entries followed by a variable number of subtables.
- *
- * The decoding algorithm takes the next TABLEBITS bits of compressed data and
- * uses them as an index into the decode table.  The resulting entry is either a
- * "direct entry", meaning that it contains the value desired, or a "subtable
- * pointer", meaning that the entry references a subtable that must be indexed
- * using more bits of the compressed data to decode the symbol.
- *
- * Each decode table (a main table along with with its subtables, if any) is
- * associated with a Huffman code.  Logically, the result of a decode table
- * lookup is a symbol from the alphabet from which the corresponding Huffman
- * code was constructed.  A symbol with codeword length n <= TABLEBITS is
- * associated with 2**(TABLEBITS - n) direct entries in the table, whereas a
- * symbol with codeword length n > TABLEBITS is associated with one or more
- * subtable entries.
- *
- * On top of this basic design, we implement several optimizations:
- *
- * - We store the length of each codeword directly in each of its decode table
- *   entries.  This allows the codeword length to be produced without indexing
- *   an additional table.
- *
- * - When beneficial, we don't store the Huffman symbol itself, but instead data
- *   generated from it.  For example, when decoding an offset symbol in DEFLATE,
- *   it's more efficient if we can decode the offset base and number of extra
- *   offset bits directly rather than decoding the offset symbol and then
- *   looking up both of those values in an additional table or tables.
- *
- * The size of each decode table entry is 32 bits, which provides slightly
- * better performance than 16-bit entries on 32 and 64 bit processers, provided
- * that the table doesn't get so large that it takes up too much memory and
- * starts generating cache misses.  The bits of each decode table entry are
- * defined as follows:
- *
- * - Bits 30 -- 31: flags (see below)
- * - Bits 8 -- 29: decode result: a Huffman symbol or related data
- * - Bits 0 -- 7: codeword length
- */
-
-/*
- * This flag is set in all main decode table entries that represent subtable
- * pointers.
- */
-#define HUFFDEC_SUBTABLE_POINTER	0x80000000
-
-/*
- * This flag is set in all entries in the litlen decode table that represent
- * literals.
- */
-#define HUFFDEC_LITERAL			0x40000000
-
-/* Mask for extracting the codeword length from a decode table entry.  */
-#define HUFFDEC_LENGTH_MASK		0xFF
-
-/* Shift to extract the decode result from a decode table entry.  */
-#define HUFFDEC_RESULT_SHIFT		8
-
-/* Shift a decode result into its position in the decode table entry.  */
-#define HUFFDEC_RESULT_ENTRY(result)	((u32)(result) << HUFFDEC_RESULT_SHIFT)
-
-/* The decode result for each precode symbol.  There is no special optimization
- * for the precode; the decode result is simply the symbol value.  */
-static const u32 precode_decode_results[DEFLATE_NUM_PRECODE_SYMS] = {
-#define ENTRY(presym)	HUFFDEC_RESULT_ENTRY(presym)
-	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
-	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
-	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
-	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
-	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  ,
-#undef ENTRY
-};
-
-/* The decode result for each litlen symbol.  For literals, this is the literal
- * value itself and the HUFFDEC_LITERAL flag.  For lengths, this is the length
- * base and the number of extra length bits.  */
-static const u32 litlen_decode_results[DEFLATE_NUM_LITLEN_SYMS] = {
-
-	/* Literals  */
-#define ENTRY(literal)	(HUFFDEC_LITERAL | HUFFDEC_RESULT_ENTRY(literal))
-	ENTRY(0)   , ENTRY(1)   , ENTRY(2)   , ENTRY(3)   ,
-	ENTRY(4)   , ENTRY(5)   , ENTRY(6)   , ENTRY(7)   ,
-	ENTRY(8)   , ENTRY(9)   , ENTRY(10)  , ENTRY(11)  ,
-	ENTRY(12)  , ENTRY(13)  , ENTRY(14)  , ENTRY(15)  ,
-	ENTRY(16)  , ENTRY(17)  , ENTRY(18)  , ENTRY(19)  ,
-	ENTRY(20)  , ENTRY(21)  , ENTRY(22)  , ENTRY(23)  ,
-	ENTRY(24)  , ENTRY(25)  , ENTRY(26)  , ENTRY(27)  ,
-	ENTRY(28)  , ENTRY(29)  , ENTRY(30)  , ENTRY(31)  ,
-	ENTRY(32)  , ENTRY(33)  , ENTRY(34)  , ENTRY(35)  ,
-	ENTRY(36)  , ENTRY(37)  , ENTRY(38)  , ENTRY(39)  ,
-	ENTRY(40)  , ENTRY(41)  , ENTRY(42)  , ENTRY(43)  ,
-	ENTRY(44)  , ENTRY(45)  , ENTRY(46)  , ENTRY(47)  ,
-	ENTRY(48)  , ENTRY(49)  , ENTRY(50)  , ENTRY(51)  ,
-	ENTRY(52)  , ENTRY(53)  , ENTRY(54)  , ENTRY(55)  ,
-	ENTRY(56)  , ENTRY(57)  , ENTRY(58)  , ENTRY(59)  ,
-	ENTRY(60)  , ENTRY(61)  , ENTRY(62)  , ENTRY(63)  ,
-	ENTRY(64)  , ENTRY(65)  , ENTRY(66)  , ENTRY(67)  ,
-	ENTRY(68)  , ENTRY(69)  , ENTRY(70)  , ENTRY(71)  ,
-	ENTRY(72)  , ENTRY(73)  , ENTRY(74)  , ENTRY(75)  ,
-	ENTRY(76)  , ENTRY(77)  , ENTRY(78)  , ENTRY(79)  ,
-	ENTRY(80)  , ENTRY(81)  , ENTRY(82)  , ENTRY(83)  ,
-	ENTRY(84)  , ENTRY(85)  , ENTRY(86)  , ENTRY(87)  ,
-	ENTRY(88)  , ENTRY(89)  , ENTRY(90)  , ENTRY(91)  ,
-	ENTRY(92)  , ENTRY(93)  , ENTRY(94)  , ENTRY(95)  ,
-	ENTRY(96)  , ENTRY(97)  , ENTRY(98)  , ENTRY(99)  ,
-	ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) ,
-	ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) ,
-	ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) ,
-	ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) ,
-	ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) ,
-	ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) ,
-	ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) ,
-	ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) ,
-	ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) ,
-	ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) ,
-	ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) ,
-	ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) ,
-	ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) ,
-	ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) ,
-	ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) ,
-	ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) ,
-	ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) ,
-	ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) ,
-	ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) ,
-	ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) ,
-	ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) ,
-	ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) ,
-	ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) ,
-	ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) ,
-	ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) ,
-	ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) ,
-	ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) ,
-	ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) ,
-	ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) ,
-	ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) ,
-	ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) ,
-	ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) ,
-	ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) ,
-	ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) ,
-	ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) ,
-	ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) ,
-	ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) ,
-	ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) ,
-	ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) ,
-#undef ENTRY
-
-#define HUFFDEC_EXTRA_LENGTH_BITS_MASK	0xFF
-#define HUFFDEC_LENGTH_BASE_SHIFT	8
-#define HUFFDEC_END_OF_BLOCK_LENGTH	0
-
-#define ENTRY(length_base, num_extra_bits)	HUFFDEC_RESULT_ENTRY(	\
-	((u32)(length_base) << HUFFDEC_LENGTH_BASE_SHIFT) | (num_extra_bits))
-
-	/* End of block  */
-	ENTRY(HUFFDEC_END_OF_BLOCK_LENGTH, 0),
-
-	/* Lengths  */
-	ENTRY(3  , 0) , ENTRY(4  , 0) , ENTRY(5  , 0) , ENTRY(6  , 0),
-	ENTRY(7  , 0) , ENTRY(8  , 0) , ENTRY(9  , 0) , ENTRY(10 , 0),
-	ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1),
-	ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2),
-	ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3),
-	ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4),
-	ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5),
-	ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) ,
-#undef ENTRY
-};
-
-/* The decode result for each offset symbol.  This is the offset base and the
- * number of extra offset bits.  */
-static const u32 offset_decode_results[DEFLATE_NUM_OFFSET_SYMS] = {
-
-#define HUFFDEC_EXTRA_OFFSET_BITS_SHIFT 16
-#define HUFFDEC_OFFSET_BASE_MASK (((u32)1 << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) - 1)
-
-#define ENTRY(offset_base, num_extra_bits)	HUFFDEC_RESULT_ENTRY(	\
-		((u32)(num_extra_bits) << HUFFDEC_EXTRA_OFFSET_BITS_SHIFT) | \
-		(offset_base))
-	ENTRY(1     , 0)  , ENTRY(2     , 0)  , ENTRY(3     , 0)  , ENTRY(4     , 0)  ,
-	ENTRY(5     , 1)  , ENTRY(7     , 1)  , ENTRY(9     , 2)  , ENTRY(13    , 2) ,
-	ENTRY(17    , 3)  , ENTRY(25    , 3)  , ENTRY(33    , 4)  , ENTRY(49    , 4)  ,
-	ENTRY(65    , 5)  , ENTRY(97    , 5)  , ENTRY(129   , 6)  , ENTRY(193   , 6)  ,
-	ENTRY(257   , 7)  , ENTRY(385   , 7)  , ENTRY(513   , 8)  , ENTRY(769   , 8)  ,
-	ENTRY(1025  , 9)  , ENTRY(1537  , 9)  , ENTRY(2049  , 10) , ENTRY(3073  , 10) ,
-	ENTRY(4097  , 11) , ENTRY(6145  , 11) , ENTRY(8193  , 12) , ENTRY(12289 , 12) ,
-	ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(32769 , 14) , ENTRY(49153 , 14) ,
-#undef ENTRY
-};
-
-/*
- * Build a table for fast decoding of symbols from a Huffman code.  As input,
- * this function takes the codeword length of each symbol which may be used in
- * the code.  As output, it produces a decode table for the canonical Huffman
- * code described by the codeword lengths.  The decode table is built with the
- * assumption that it will be indexed with "bit-reversed" codewords, where the
- * low-order bit is the first bit of the codeword.  This format is used for all
- * Huffman codes in DEFLATE.
- *
- * @decode_table
- *	The array in which the decode table will be generated.  This array must
- *	have sufficient length; see the definition of the ENOUGH numbers.
- * @lens
- *	An array which provides, for each symbol, the length of the
- *	corresponding codeword in bits, or 0 if the symbol is unused.  This may
- *	alias @decode_table, since nothing is written to @decode_table until all
- *	@lens have been consumed.  All codeword lengths are assumed to be <=
- *	@max_codeword_len but are otherwise considered untrusted.  If they do
- *	not form a valid Huffman code, then the decode table is not built and
- *	%false is returned.
- * @num_syms
- *	The number of symbols in the code, including all unused symbols.
- * @decode_results
- *	An array which provides, for each symbol, the actual value to store into
- *	the decode table.  This value will be directly produced as the result of
- *	decoding that symbol, thereby moving the indirection out of the decode
- *	loop and into the table initialization.
- * @table_bits
- *	The log base-2 of the number of main table entries to use.
- * @max_codeword_len
- *	The maximum allowed codeword length for this Huffman code.
- *	Must be <= DEFLATE_MAX_CODEWORD_LEN.
- * @sorted_syms
- *	A temporary array of length @num_syms.
- *
- * Returns %true if successful; %false if the codeword lengths do not form a
- * valid Huffman code.
- */
-static bool
-build_decode_table(u32 decode_table[],
-		   const len_t lens[],
-		   const unsigned num_syms,
-		   const u32 decode_results[],
-		   const unsigned table_bits,
-		   const unsigned max_codeword_len,
-		   u16 *sorted_syms)
-{
-	unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1];
-	unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1];
-	unsigned sym;		/* current symbol */
-	unsigned codeword;	/* current codeword, bit-reversed */
-	unsigned len;		/* current codeword length in bits */
-	unsigned count;		/* num codewords remaining with this length */
-	u32 codespace_used;	/* codespace used out of '2^max_codeword_len' */
-	unsigned cur_table_end; /* end index of current table */
-	unsigned subtable_prefix; /* codeword prefix of current subtable */
-	unsigned subtable_start;  /* start index of current subtable */
-	unsigned subtable_bits;   /* log2 of current subtable length */
-
-	/* Count how many codewords have each length, including 0. */
-	for (len = 0; len <= max_codeword_len; len++)
-		len_counts[len] = 0;
-	for (sym = 0; sym < num_syms; sym++)
-		len_counts[lens[sym]]++;
-
-	/*
-	 * Sort the symbols primarily by increasing codeword length and
-	 * secondarily by increasing symbol value; or equivalently by their
-	 * codewords in lexicographic order, since a canonical code is assumed.
-	 *
-	 * For efficiency, also compute 'codespace_used' in the same pass over
-	 * 'len_counts[]' used to build 'offsets[]' for sorting.
-	 */
-
-	/* Ensure that 'codespace_used' cannot overflow. */
-	STATIC_ASSERT(sizeof(codespace_used) == 4);
-	STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >=
-		      DEFLATE_MAX_NUM_SYMS);
-
-	offsets[0] = 0;
-	offsets[1] = len_counts[0];
-	codespace_used = 0;
-	for (len = 1; len < max_codeword_len; len++) {
-		offsets[len + 1] = offsets[len] + len_counts[len];
-		codespace_used = (codespace_used << 1) + len_counts[len];
-	}
-	codespace_used = (codespace_used << 1) + len_counts[len];
-
-	for (sym = 0; sym < num_syms; sym++)
-		sorted_syms[offsets[lens[sym]]++] = sym;
-
-	sorted_syms += offsets[0]; /* Skip unused symbols */
-
-	/* lens[] is done being used, so we can write to decode_table[] now. */
-
-	/*
-	 * Check whether the lengths form a complete code (exactly fills the
-	 * codespace), an incomplete code (doesn't fill the codespace), or an
-	 * overfull code (overflows the codespace).  A codeword of length 'n'
-	 * uses proportion '1/(2^n)' of the codespace.  An overfull code is
-	 * nonsensical, so is considered invalid.  An incomplete code is
-	 * considered valid only in two specific cases; see below.
-	 */
-
-	/* overfull code? */
-	if (unlikely(codespace_used > (1U << max_codeword_len)))
-		return false;
-
-	/* incomplete code? */
-	if (unlikely(codespace_used < (1U << max_codeword_len))) {
-		u32 entry;
-		unsigned i;
-
-		if (codespace_used == 0) {
-			/*
-			 * An empty code is allowed.  This can happen for the
-			 * offset code in DEFLATE, since a dynamic Huffman block
-			 * need not contain any matches.
-			 */
-
-			/* sym=0, len=1 (arbitrary) */
-			entry = decode_results[0] | 1;
-		} else {
-			/*
-			 * Allow codes with a single used symbol, with codeword
-			 * length 1.  The DEFLATE RFC is unclear regarding this
-			 * case.  What zlib's decompressor does is permit this
-			 * for the litlen and offset codes and assume the
-			 * codeword is '0' rather than '1'.  We do the same
-			 * except we allow this for precodes too, since there's
-			 * no convincing reason to treat the codes differently.
-			 * We also assign both codewords '0' and '1' to the
-			 * symbol to avoid having to handle '1' specially.
-			 */
-			if (codespace_used != (1U << (max_codeword_len - 1)) ||
-			    len_counts[1] != 1)
-				return false;
-			entry = decode_results[*sorted_syms] | 1;
-		}
-		/*
-		 * Note: the decode table still must be fully initialized, in
-		 * case the stream is malformed and contains bits from the part
-		 * of the codespace the incomplete code doesn't use.
-		 */
-		for (i = 0; i < (1U << table_bits); i++)
-			decode_table[i] = entry;
-		return true;
-	}
-
-	/*
-	 * The lengths form a complete code.  Now, enumerate the codewords in
-	 * lexicographic order and fill the decode table entries for each one.
-	 *
-	 * First, process all codewords with len <= table_bits.  Each one gets
-	 * '2^(table_bits-len)' direct entries in the table.
-	 *
-	 * Since DEFLATE uses bit-reversed codewords, these entries aren't
-	 * consecutive but rather are spaced '2^len' entries apart.  This makes
-	 * filling them naively somewhat awkward and inefficient, since strided
-	 * stores are less cache-friendly and preclude the use of word or
-	 * vector-at-a-time stores to fill multiple entries per instruction.
-	 *
-	 * To optimize this, we incrementally double the table size.  When
-	 * processing codewords with length 'len', the table is treated as
-	 * having only '2^len' entries, so each codeword uses just one entry.
-	 * Then, each time 'len' is incremented, the table size is doubled and
-	 * the first half is copied to the second half.  This significantly
-	 * improves performance over naively doing strided stores.
-	 *
-	 * Note that some entries copied for each table doubling may not have
-	 * been initialized yet, but it doesn't matter since they're guaranteed
-	 * to be initialized later (because the Huffman code is complete).
-	 */
-	codeword = 0;
-	len = 1;
-	while ((count = len_counts[len]) == 0)
-		len++;
-	cur_table_end = 1U << len;
-	while (len <= table_bits) {
-		/* Process all 'count' codewords with length 'len' bits. */
-		do {
-			unsigned bit;
-
-			/* Fill the first entry for the current codeword. */
-			decode_table[codeword] =
-				decode_results[*sorted_syms++] | len;
-
-			if (codeword == cur_table_end - 1) {
-				/* Last codeword (all 1's) */
-				for (; len < table_bits; len++) {
-					memcpy(&decode_table[cur_table_end],
-					       decode_table,
-					       cur_table_end *
-						sizeof(decode_table[0]));
-					cur_table_end <<= 1;
-				}
-				return true;
-			}
-			/*
-			 * To advance to the lexicographically next codeword in
-			 * the canonical code, the codeword must be incremented,
-			 * then 0's must be appended to the codeword as needed
-			 * to match the next codeword's length.
-			 *
-			 * Since the codeword is bit-reversed, appending 0's is
-			 * a no-op.  However, incrementing it is nontrivial.  To
-			 * do so efficiently, use the 'bsr' instruction to find
-			 * the last (highest order) 0 bit in the codeword, set
-			 * it, and clear any later (higher order) 1 bits.  But
-			 * 'bsr' actually finds the highest order 1 bit, so to
-			 * use it first flip all bits in the codeword by XOR'ing
-			 * it with (1U << len) - 1 == cur_table_end - 1.
-			 */
-			bit = 1U << bsr32(codeword ^ (cur_table_end - 1));
-			codeword &= bit - 1;
-			codeword |= bit;
-		} while (--count);
-
-		/* Advance to the next codeword length. */
-		do {
-			if (++len <= table_bits) {
-				memcpy(&decode_table[cur_table_end],
-				       decode_table,
-				       cur_table_end * sizeof(decode_table[0]));
-				cur_table_end <<= 1;
-			}
-		} while ((count = len_counts[len]) == 0);
-	}
-
-	/* Process codewords with len > table_bits.  These require subtables. */
-	cur_table_end = 1U << table_bits;
-	subtable_prefix = -1;
-	subtable_start = 0;
-	for (;;) {
-		u32 entry;
-		unsigned i;
-		unsigned stride;
-		unsigned bit;
-
-		/*
-		 * Start a new subtable if the first 'table_bits' bits of the
-		 * codeword don't match the prefix of the current subtable.
-		 */
-		if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) {
-			subtable_prefix = (codeword & ((1U << table_bits) - 1));
-			subtable_start = cur_table_end;
-			/*
-			 * Calculate the subtable length.  If the codeword has
-			 * length 'table_bits + n', then the subtable needs
-			 * '2^n' entries.  But it may need more; if fewer than
-			 * '2^n' codewords of length 'table_bits + n' remain,
-			 * then the length will need to be incremented to bring
-			 * in longer codewords until the subtable can be
-			 * completely filled.  Note that because the Huffman
-			 * code is complete, it will always be possible to fill
-			 * the subtable eventually.
-			 */
-			subtable_bits = len - table_bits;
-			codespace_used = count;
-			while (codespace_used < (1U << subtable_bits)) {
-				subtable_bits++;
-				codespace_used = (codespace_used << 1) +
-					len_counts[table_bits + subtable_bits];
-			}
-			cur_table_end = subtable_start + (1U << subtable_bits);
-
-			/*
-			 * Create the entry that points from the main table to
-			 * the subtable.  This entry contains the index of the
-			 * start of the subtable and the number of bits with
-			 * which the subtable is indexed (the log base 2 of the
-			 * number of entries it contains).
-			 */
-			decode_table[subtable_prefix] =
-				HUFFDEC_SUBTABLE_POINTER |
-				HUFFDEC_RESULT_ENTRY(subtable_start) |
-				subtable_bits;
-		}
-
-		/* Fill the subtable entries for the current codeword. */
-		entry = decode_results[*sorted_syms++] | (len - table_bits);
-		i = subtable_start + (codeword >> table_bits);
-		stride = 1U << (len - table_bits);
-		do {
-			decode_table[i] = entry;
-			i += stride;
-		} while (i < cur_table_end);
-
-		/* Advance to the next codeword. */
-		if (codeword == (1U << len) - 1) /* last codeword (all 1's)? */
-			return true;
-		bit = 1U << bsr32(codeword ^ ((1U << len) - 1));
-		codeword &= bit - 1;
-		codeword |= bit;
-		count--;
-		while (count == 0)
-			count = len_counts[++len];
-	}
-}
-
-/* Build the decode table for the precode.  */
-static bool
-build_precode_decode_table(struct libdeflate_decompressor *d)
-{
-	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
-	STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128);
-
-	return build_decode_table(d->u.l.precode_decode_table,
-				  d->u.precode_lens,
-				  DEFLATE_NUM_PRECODE_SYMS,
-				  precode_decode_results,
-				  PRECODE_TABLEBITS,
-				  DEFLATE_MAX_PRE_CODEWORD_LEN,
-				  d->sorted_syms);
-}
-
-/* Build the decode table for the literal/length code.  */
-static bool
-build_litlen_decode_table(struct libdeflate_decompressor *d,
-			  unsigned num_litlen_syms, unsigned num_offset_syms)
-{
-	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
-	STATIC_ASSERT(LITLEN_TABLEBITS == 10 && LITLEN_ENOUGH == 1334);
-
-	return build_decode_table(d->u.litlen_decode_table,
-				  d->u.l.lens,
-				  num_litlen_syms,
-				  litlen_decode_results,
-				  LITLEN_TABLEBITS,
-				  DEFLATE_MAX_LITLEN_CODEWORD_LEN,
-				  d->sorted_syms);
-}
-
-/* Build the decode table for the offset code.  */
-static bool
-build_offset_decode_table(struct libdeflate_decompressor *d,
-			  unsigned num_litlen_syms, unsigned num_offset_syms)
-{
-	/* When you change TABLEBITS, you must change ENOUGH, and vice versa! */
-	STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402);
-
-	return build_decode_table(d->offset_decode_table,
-				  d->u.l.lens + num_litlen_syms,
-				  num_offset_syms,
-				  offset_decode_results,
-				  OFFSET_TABLEBITS,
-				  DEFLATE_MAX_OFFSET_CODEWORD_LEN,
-				  d->sorted_syms);
-}
-
-static forceinline machine_word_t
-repeat_byte(u8 b)
-{
-	machine_word_t v;
-
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-
-	v = b;
-	v |= v << 8;
-	v |= v << 16;
-	v |= v << ((WORDBITS == 64) ? 32 : 0);
-	return v;
-}
-
-static forceinline void
-copy_word_unaligned(const void *src, void *dst)
-{
-	store_word_unaligned(load_word_unaligned(src), dst);
-}
-
-/*****************************************************************************
- *                         Main decompression routine
- *****************************************************************************/
-
-typedef enum libdeflate_result (*decompress_func_t)
-	(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
-
-#undef DEFAULT_IMPL
-#undef DISPATCH
-#if defined(__i386__) || defined(__x86_64__)
-#  include "x86/decompress_impl.h"
-#endif
-
-#ifndef DEFAULT_IMPL
-#  define FUNCNAME deflate_decompress_default
-#  define ATTRIBUTES
-#  include "decompress_template.h"
-#  define DEFAULT_IMPL deflate_decompress_default
-#endif
-
-#ifdef DISPATCH
-static enum libdeflate_result
-dispatch(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret);
-
-static volatile decompress_func_t decompress_impl = dispatch;
-
-/* Choose the fastest implementation at runtime */
-static enum libdeflate_result
-dispatch(struct libdeflate_decompressor * restrict d,
-	 const void * restrict in, size_t in_nbytes,
-	 void * restrict out, size_t out_nbytes_avail,
-	 size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret)
-{
-	decompress_func_t f = arch_select_decompress_func();
-
-	if (f == NULL)
-		f = DEFAULT_IMPL;
-
-	decompress_impl = f;
-	return (*f)(d, in, in_nbytes, out, out_nbytes_avail,
-		    actual_in_nbytes_ret, actual_out_nbytes_ret);
-}
-#else
-#  define decompress_impl DEFAULT_IMPL /* only one implementation, use it */
-#endif
-
-
-/*
- * This is the main DEFLATE decompression routine.  See libdeflate.h for the
- * documentation.
- *
- * Note that the real code is in decompress_template.h.  The part here just
- * handles calling the appropriate implementation depending on the CPU features
- * at runtime.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_deflate_decompress_ex(struct libdeflate_decompressor * restrict d,
-				 const void * restrict in, size_t in_nbytes,
-				 void * restrict out, size_t out_nbytes_avail,
-				 size_t *actual_in_nbytes_ret,
-				 size_t *actual_out_nbytes_ret)
-{
-	return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail,
-			       actual_in_nbytes_ret, actual_out_nbytes_ret);
-}
-
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_deflate_decompress(struct libdeflate_decompressor * restrict d,
-			      const void * restrict in, size_t in_nbytes,
-			      void * restrict out, size_t out_nbytes_avail,
-			      size_t *actual_out_nbytes_ret)
-{
-	return libdeflate_deflate_decompress_ex(d, in, in_nbytes,
-						out, out_nbytes_avail,
-						NULL, actual_out_nbytes_ret);
-}
-
-LIBDEFLATEAPI struct libdeflate_decompressor *
-libdeflate_alloc_decompressor(void)
-{
-	/*
-	 * Note that only certain parts of the decompressor actually must be
-	 * initialized here:
-	 *
-	 * - 'static_codes_loaded' must be initialized to false.
-	 *
-	 * - The first half of the main portion of each decode table must be
-	 *   initialized to any value, to avoid reading from uninitialized
-	 *   memory during table expansion in build_decode_table().  (Although,
-	 *   this is really just to avoid warnings with dynamic tools like
-	 *   valgrind, since build_decode_table() is guaranteed to initialize
-	 *   all entries eventually anyway.)
-	 *
-	 * But for simplicity, we currently just zero the whole decompressor.
-	 */
-	return calloc(1, sizeof(struct libdeflate_decompressor));
-}
-
-LIBDEFLATEAPI void
-libdeflate_free_decompressor(struct libdeflate_decompressor *d)
-{
-	free(d);
-}
diff --git a/ext/libdeflate/lib/gzip_compress.c b/ext/libdeflate/lib/gzip_compress.c
deleted file mode 100644
index bfc75e2b..00000000
--- a/ext/libdeflate/lib/gzip_compress.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * gzip_compress.c - compress with a gzip wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "deflate_compress.h"
-#include "gzip_constants.h"
-#include "unaligned.h"
-
-#include "libdeflate.h"
-
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress(struct libdeflate_compressor *c,
-			 const void *in, size_t in_size,
-			 void *out, size_t out_nbytes_avail)
-{
-	u8 *out_next = out;
-	unsigned compression_level;
-	u8 xfl;
-	size_t deflate_size;
-
-	if (out_nbytes_avail <= GZIP_MIN_OVERHEAD)
-		return 0;
-
-	/* ID1 */
-	*out_next++ = GZIP_ID1;
-	/* ID2 */
-	*out_next++ = GZIP_ID2;
-	/* CM */
-	*out_next++ = GZIP_CM_DEFLATE;
-	/* FLG */
-	*out_next++ = 0;
-	/* MTIME */
-	put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next);
-	out_next += 4;
-	/* XFL */
-	xfl = 0;
-	compression_level = deflate_get_compression_level(c);
-	if (compression_level < 2)
-		xfl |= GZIP_XFL_FASTEST_COMRESSION;
-	else if (compression_level >= 8)
-		xfl |= GZIP_XFL_SLOWEST_COMRESSION;
-	*out_next++ = xfl;
-	/* OS */
-	*out_next++ = GZIP_OS_UNKNOWN;	/* OS  */
-
-	/* Compressed data  */
-	deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next,
-					out_nbytes_avail - GZIP_MIN_OVERHEAD);
-	if (deflate_size == 0)
-		return 0;
-	out_next += deflate_size;
-
-	/* CRC32 */
-	put_unaligned_le32(libdeflate_crc32(0, in, in_size), out_next);
-	out_next += 4;
-
-	/* ISIZE */
-	put_unaligned_le32((u32)in_size, out_next);
-	out_next += 4;
-
-	return out_next - (u8 *)out;
-}
-
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress_bound(struct libdeflate_compressor *c,
-			       size_t in_nbytes)
-{
-	return GZIP_MIN_OVERHEAD +
-	       libdeflate_deflate_compress_bound(c, in_nbytes);
-}
diff --git a/ext/libdeflate/lib/gzip_constants.h b/ext/libdeflate/lib/gzip_constants.h
deleted file mode 100644
index 40dd4358..00000000
--- a/ext/libdeflate/lib/gzip_constants.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * gzip_constants.h - constants for the gzip wrapper format
- */
-
-#ifndef LIB_GZIP_CONSTANTS_H
-#define LIB_GZIP_CONSTANTS_H
-
-#define GZIP_MIN_HEADER_SIZE	10
-#define GZIP_FOOTER_SIZE	8
-#define GZIP_MIN_OVERHEAD	(GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE)
-
-#define GZIP_ID1		0x1F
-#define GZIP_ID2		0x8B
-
-#define GZIP_CM_DEFLATE		8
-
-#define GZIP_FTEXT		0x01
-#define GZIP_FHCRC		0x02
-#define GZIP_FEXTRA		0x04
-#define GZIP_FNAME		0x08
-#define GZIP_FCOMMENT		0x10
-#define GZIP_FRESERVED		0xE0
-
-#define GZIP_MTIME_UNAVAILABLE	0
-
-#define GZIP_XFL_SLOWEST_COMRESSION	0x02
-#define GZIP_XFL_FASTEST_COMRESSION	0x04
-
-#define GZIP_OS_FAT		0
-#define GZIP_OS_AMIGA		1
-#define GZIP_OS_VMS		2
-#define GZIP_OS_UNIX		3
-#define GZIP_OS_VM_CMS		4
-#define GZIP_OS_ATARI_TOS	5
-#define GZIP_OS_HPFS		6
-#define GZIP_OS_MACINTOSH	7
-#define GZIP_OS_Z_SYSTEM	8
-#define GZIP_OS_CP_M		9
-#define GZIP_OS_TOPS_20		10
-#define GZIP_OS_NTFS		11
-#define GZIP_OS_QDOS		12
-#define GZIP_OS_RISCOS		13
-#define GZIP_OS_UNKNOWN		255
-
-#endif /* LIB_GZIP_CONSTANTS_H */
diff --git a/ext/libdeflate/lib/gzip_decompress.c b/ext/libdeflate/lib/gzip_decompress.c
deleted file mode 100644
index 5703093e..00000000
--- a/ext/libdeflate/lib/gzip_decompress.c
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * gzip_decompress.c - decompress with a gzip wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "gzip_constants.h"
-#include "unaligned.h"
-
-#include "libdeflate.h"
-
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_in_nbytes_ret,
-			      size_t *actual_out_nbytes_ret)
-{
-	const u8 *in_next = in;
-	const u8 * const in_end = in_next + in_nbytes;
-	u8 flg;
-	size_t actual_in_nbytes;
-	size_t actual_out_nbytes;
-	enum libdeflate_result result;
-
-	if (in_nbytes < GZIP_MIN_OVERHEAD)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* ID1 */
-	if (*in_next++ != GZIP_ID1)
-		return LIBDEFLATE_BAD_DATA;
-	/* ID2 */
-	if (*in_next++ != GZIP_ID2)
-		return LIBDEFLATE_BAD_DATA;
-	/* CM */
-	if (*in_next++ != GZIP_CM_DEFLATE)
-		return LIBDEFLATE_BAD_DATA;
-	flg = *in_next++;
-	/* MTIME */
-	in_next += 4;
-	/* XFL */
-	in_next += 1;
-	/* OS */
-	in_next += 1;
-
-	if (flg & GZIP_FRESERVED)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* Extra field */
-	if (flg & GZIP_FEXTRA) {
-		u16 xlen = get_unaligned_le16(in_next);
-		in_next += 2;
-
-		if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE)
-			return LIBDEFLATE_BAD_DATA;
-
-		in_next += xlen;
-	}
-
-	/* Original file name (zero terminated) */
-	if (flg & GZIP_FNAME) {
-		while (*in_next++ != 0 && in_next != in_end)
-			;
-		if (in_end - in_next < GZIP_FOOTER_SIZE)
-			return LIBDEFLATE_BAD_DATA;
-	}
-
-	/* File comment (zero terminated) */
-	if (flg & GZIP_FCOMMENT) {
-		while (*in_next++ != 0 && in_next != in_end)
-			;
-		if (in_end - in_next < GZIP_FOOTER_SIZE)
-			return LIBDEFLATE_BAD_DATA;
-	}
-
-	/* CRC16 for gzip header */
-	if (flg & GZIP_FHCRC) {
-		in_next += 2;
-		if (in_end - in_next < GZIP_FOOTER_SIZE)
-			return LIBDEFLATE_BAD_DATA;
-	}
-
-	/* Compressed data  */
-	result = libdeflate_deflate_decompress_ex(d, in_next,
-					in_end - GZIP_FOOTER_SIZE - in_next,
-					out, out_nbytes_avail,
-					&actual_in_nbytes,
-					actual_out_nbytes_ret);
-	if (result != LIBDEFLATE_SUCCESS)
-		return result;
-
-	if (actual_out_nbytes_ret)
-		actual_out_nbytes = *actual_out_nbytes_ret;
-	else
-		actual_out_nbytes = out_nbytes_avail;
-
-	in_next += actual_in_nbytes;
-
-	/* CRC32 */
-	if (libdeflate_crc32(0, out, actual_out_nbytes) !=
-	    get_unaligned_le32(in_next))
-		return LIBDEFLATE_BAD_DATA;
-	in_next += 4;
-
-	/* ISIZE */
-	if ((u32)actual_out_nbytes != get_unaligned_le32(in_next))
-		return LIBDEFLATE_BAD_DATA;
-	in_next += 4;
-
-	if (actual_in_nbytes_ret)
-		*actual_in_nbytes_ret = in_next - (u8 *)in;
-
-	return LIBDEFLATE_SUCCESS;
-}
-
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_gzip_decompress(struct libdeflate_decompressor *d,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret)
-{
-	return libdeflate_gzip_decompress_ex(d, in, in_nbytes,
-					     out, out_nbytes_avail,
-					     NULL, actual_out_nbytes_ret);
-}
diff --git a/ext/libdeflate/lib/hc_matchfinder.h b/ext/libdeflate/lib/hc_matchfinder.h
deleted file mode 100644
index 8412a6fa..00000000
--- a/ext/libdeflate/lib/hc_matchfinder.h
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- *
- * ---------------------------------------------------------------------------
- *
- *				   Algorithm
- *
- * This is a Hash Chains (hc) based matchfinder.
- *
- * The main data structure is a hash table where each hash bucket contains a
- * linked list (or "chain") of sequences whose first 4 bytes share the same hash
- * code.  Each sequence is identified by its starting position in the input
- * buffer.
- *
- * The algorithm processes the input buffer sequentially.  At each byte
- * position, the hash code of the first 4 bytes of the sequence beginning at
- * that position (the sequence being matched against) is computed.  This
- * identifies the hash bucket to use for that position.  Then, this hash
- * bucket's linked list is searched for matches.  Then, a new linked list node
- * is created to represent the current sequence and is prepended to the list.
- *
- * This algorithm has several useful properties:
- *
- * - It only finds true Lempel-Ziv matches; i.e., those where the matching
- *   sequence occurs prior to the sequence being matched against.
- *
- * - The sequences in each linked list are always sorted by decreasing starting
- *   position.  Therefore, the closest (smallest offset) matches are found
- *   first, which in many compression formats tend to be the cheapest to encode.
- *
- * - Although fast running time is not guaranteed due to the possibility of the
- *   lists getting very long, the worst degenerate behavior can be easily
- *   prevented by capping the number of nodes searched at each position.
- *
- * - If the compressor decides not to search for matches at a certain position,
- *   then that position can be quickly inserted without searching the list.
- *
- * - The algorithm is adaptable to sliding windows: just store the positions
- *   relative to a "base" value that is updated from time to time, and stop
- *   searching each list when the sequences get too far away.
- *
- * ----------------------------------------------------------------------------
- *
- *				 Optimizations
- *
- * The main hash table and chains handle length 4+ matches.  Length 3 matches
- * are handled by a separate hash table with no chains.  This works well for
- * typical "greedy" or "lazy"-style compressors, where length 3 matches are
- * often only helpful if they have small offsets.  Instead of searching a full
- * chain for length 3+ matches, the algorithm just checks for one close length 3
- * match, then focuses on finding length 4+ matches.
- *
- * The longest_match() and skip_positions() functions are inlined into the
- * compressors that use them.  This isn't just about saving the overhead of a
- * function call.  These functions are intended to be called from the inner
- * loops of compressors, where giving the compiler more control over register
- * allocation is very helpful.  There is also significant benefit to be gained
- * from allowing the CPU to predict branches independently at each call site.
- * For example, "lazy"-style compressors can be written with two calls to
- * longest_match(), each of which starts with a different 'best_len' and
- * therefore has significantly different performance characteristics.
- *
- * Although any hash function can be used, a multiplicative hash is fast and
- * works well.
- *
- * On some processors, it is significantly faster to extend matches by whole
- * words (32 or 64 bits) instead of by individual bytes.  For this to be the
- * case, the processor must implement unaligned memory accesses efficiently and
- * must have either a fast "find first set bit" instruction or a fast "find last
- * set bit" instruction, depending on the processor's endianness.
- *
- * The code uses one loop for finding the first match and one loop for finding a
- * longer match.  Each of these loops is tuned for its respective task and in
- * combination are faster than a single generalized loop that handles both
- * tasks.
- *
- * The code also uses a tight inner loop that only compares the last and first
- * bytes of a potential match.  It is only when these bytes match that a full
- * match extension is attempted.
- *
- * ----------------------------------------------------------------------------
- */
-
-#include "matchfinder_common.h"
-
-#define HC_MATCHFINDER_HASH3_ORDER	15
-#define HC_MATCHFINDER_HASH4_ORDER	16
-
-#define HC_MATCHFINDER_TOTAL_HASH_LENGTH		\
-	((1UL << HC_MATCHFINDER_HASH3_ORDER) +		\
-	 (1UL << HC_MATCHFINDER_HASH4_ORDER))
-
-struct hc_matchfinder {
-
-	/* The hash table for finding length 3 matches  */
-	mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER];
-
-	/* The hash table which contains the first nodes of the linked lists for
-	 * finding length 4+ matches  */
-	mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER];
-
-	/* The "next node" references for the linked lists.  The "next node" of
-	 * the node for the sequence with position 'pos' is 'next_tab[pos]'.  */
-	mf_pos_t next_tab[MATCHFINDER_WINDOW_SIZE];
-
-}
-#ifdef _aligned_attribute
-  _aligned_attribute(MATCHFINDER_ALIGNMENT)
-#endif
-;
-
-/* Prepare the matchfinder for a new input buffer.  */
-static forceinline void
-hc_matchfinder_init(struct hc_matchfinder *mf)
-{
-	matchfinder_init((mf_pos_t *)mf, HC_MATCHFINDER_TOTAL_HASH_LENGTH);
-}
-
-static forceinline void
-hc_matchfinder_slide_window(struct hc_matchfinder *mf)
-{
-	matchfinder_rebase((mf_pos_t *)mf,
-			   sizeof(struct hc_matchfinder) / sizeof(mf_pos_t));
-}
-
-/*
- * Find the longest match longer than 'best_len' bytes.
- *
- * @mf
- *	The matchfinder structure.
- * @in_base_p
- *	Location of a pointer which points to the place in the input data the
- *	matchfinder currently stores positions relative to.  This may be updated
- *	by this function.
- * @cur_pos
- *	The current position in the input buffer relative to @in_base (the
- *	position of the sequence being matched against).
- * @best_len
- *	Require a match longer than this length.
- * @max_len
- *	The maximum permissible match length at this position.
- * @nice_len
- *	Stop searching if a match of at least this length is found.
- *	Must be <= @max_len.
- * @max_search_depth
- *	Limit on the number of potential matches to consider.  Must be >= 1.
- * @next_hashes
- *	The precomputed hash codes for the sequence beginning at @in_next.
- *	These will be used and then updated with the precomputed hashcodes for
- *	the sequence beginning at @in_next + 1.
- * @offset_ret
- *	If a match is found, its offset is returned in this location.
- *
- * Return the length of the match found, or 'best_len' if no match longer than
- * 'best_len' was found.
- */
-static forceinline u32
-hc_matchfinder_longest_match(struct hc_matchfinder * const restrict mf,
-			     const u8 ** const restrict in_base_p,
-			     const u8 * const restrict in_next,
-			     u32 best_len,
-			     const u32 max_len,
-			     const u32 nice_len,
-			     const u32 max_search_depth,
-			     u32 * const restrict next_hashes,
-			     u32 * const restrict offset_ret)
-{
-	u32 depth_remaining = max_search_depth;
-	const u8 *best_matchptr = in_next;
-	mf_pos_t cur_node3, cur_node4;
-	u32 hash3, hash4;
-	u32 next_hashseq;
-	u32 seq4;
-	const u8 *matchptr;
-	u32 len;
-	u32 cur_pos = in_next - *in_base_p;
-	const u8 *in_base;
-	mf_pos_t cutoff;
-
-	if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
-		hc_matchfinder_slide_window(mf);
-		*in_base_p += MATCHFINDER_WINDOW_SIZE;
-		cur_pos = 0;
-	}
-
-	in_base = *in_base_p;
-	cutoff = cur_pos - MATCHFINDER_WINDOW_SIZE;
-
-	if (unlikely(max_len < 5)) /* can we read 4 bytes from 'in_next + 1'? */
-		goto out;
-
-	/* Get the precomputed hash codes.  */
-	hash3 = next_hashes[0];
-	hash4 = next_hashes[1];
-
-	/* From the hash buckets, get the first node of each linked list.  */
-	cur_node3 = mf->hash3_tab[hash3];
-	cur_node4 = mf->hash4_tab[hash4];
-
-	/* Update for length 3 matches.  This replaces the singleton node in the
-	 * 'hash3' bucket with the node for the current sequence.  */
-	mf->hash3_tab[hash3] = cur_pos;
-
-	/* Update for length 4 matches.  This prepends the node for the current
-	 * sequence to the linked list in the 'hash4' bucket.  */
-	mf->hash4_tab[hash4] = cur_pos;
-	mf->next_tab[cur_pos] = cur_node4;
-
-	/* Compute the next hash codes.  */
-	next_hashseq = get_unaligned_le32(in_next + 1);
-	next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
-	next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
-	prefetchw(&mf->hash3_tab[next_hashes[0]]);
-	prefetchw(&mf->hash4_tab[next_hashes[1]]);
-
-	if (best_len < 4) {  /* No match of length >= 4 found yet?  */
-
-		/* Check for a length 3 match if needed.  */
-
-		if (cur_node3 <= cutoff)
-			goto out;
-
-		seq4 = load_u32_unaligned(in_next);
-
-		if (best_len < 3) {
-			matchptr = &in_base[cur_node3];
-			if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) {
-				best_len = 3;
-				best_matchptr = matchptr;
-			}
-		}
-
-		/* Check for a length 4 match.  */
-
-		if (cur_node4 <= cutoff)
-			goto out;
-
-		for (;;) {
-			/* No length 4 match found yet.  Check the first 4 bytes.  */
-			matchptr = &in_base[cur_node4];
-
-			if (load_u32_unaligned(matchptr) == seq4)
-				break;
-
-			/* The first 4 bytes did not match.  Keep trying.  */
-			cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
-			if (cur_node4 <= cutoff || !--depth_remaining)
-				goto out;
-		}
-
-		/* Found a match of length >= 4.  Extend it to its full length.  */
-		best_matchptr = matchptr;
-		best_len = lz_extend(in_next, best_matchptr, 4, max_len);
-		if (best_len >= nice_len)
-			goto out;
-		cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
-		if (cur_node4 <= cutoff || !--depth_remaining)
-			goto out;
-	} else {
-		if (cur_node4 <= cutoff || best_len >= nice_len)
-			goto out;
-	}
-
-	/* Check for matches of length >= 5.  */
-
-	for (;;) {
-		for (;;) {
-			matchptr = &in_base[cur_node4];
-
-			/* Already found a length 4 match.  Try for a longer
-			 * match; start by checking either the last 4 bytes and
-			 * the first 4 bytes, or the last byte.  (The last byte,
-			 * the one which would extend the match length by 1, is
-			 * the most important.)  */
-		#if UNALIGNED_ACCESS_IS_FAST
-			if ((load_u32_unaligned(matchptr + best_len - 3) ==
-			     load_u32_unaligned(in_next + best_len - 3)) &&
-			    (load_u32_unaligned(matchptr) ==
-			     load_u32_unaligned(in_next)))
-		#else
-			if (matchptr[best_len] == in_next[best_len])
-		#endif
-				break;
-
-			/* Continue to the next node in the list.  */
-			cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
-			if (cur_node4 <= cutoff || !--depth_remaining)
-				goto out;
-		}
-
-	#if UNALIGNED_ACCESS_IS_FAST
-		len = 4;
-	#else
-		len = 0;
-	#endif
-		len = lz_extend(in_next, matchptr, len, max_len);
-		if (len > best_len) {
-			/* This is the new longest match.  */
-			best_len = len;
-			best_matchptr = matchptr;
-			if (best_len >= nice_len)
-				goto out;
-		}
-
-		/* Continue to the next node in the list.  */
-		cur_node4 = mf->next_tab[cur_node4 & (MATCHFINDER_WINDOW_SIZE - 1)];
-		if (cur_node4 <= cutoff || !--depth_remaining)
-			goto out;
-	}
-out:
-	*offset_ret = in_next - best_matchptr;
-	return best_len;
-}
-
-/*
- * Advance the matchfinder, but don't search for matches.
- *
- * @mf
- *	The matchfinder structure.
- * @in_base_p
- *	Location of a pointer which points to the place in the input data the
- *	matchfinder currently stores positions relative to.  This may be updated
- *	by this function.
- * @cur_pos
- *	The current position in the input buffer relative to @in_base.
- * @end_pos
- *	The end position of the input buffer, relative to @in_base.
- * @next_hashes
- *	The precomputed hash codes for the sequence beginning at @in_next.
- *	These will be used and then updated with the precomputed hashcodes for
- *	the sequence beginning at @in_next + @count.
- * @count
- *	The number of bytes to advance.  Must be > 0.
- *
- * Returns @in_next + @count.
- */
-static forceinline const u8 *
-hc_matchfinder_skip_positions(struct hc_matchfinder * const restrict mf,
-			      const u8 ** const restrict in_base_p,
-			      const u8 *in_next,
-			      const u8 * const in_end,
-			      const u32 count,
-			      u32 * const restrict next_hashes)
-{
-	u32 cur_pos;
-	u32 hash3, hash4;
-	u32 next_hashseq;
-	u32 remaining = count;
-
-	if (unlikely(count + 5 > in_end - in_next))
-		return &in_next[count];
-
-	cur_pos = in_next - *in_base_p;
-	hash3 = next_hashes[0];
-	hash4 = next_hashes[1];
-	do {
-		if (cur_pos == MATCHFINDER_WINDOW_SIZE) {
-			hc_matchfinder_slide_window(mf);
-			*in_base_p += MATCHFINDER_WINDOW_SIZE;
-			cur_pos = 0;
-		}
-		mf->hash3_tab[hash3] = cur_pos;
-		mf->next_tab[cur_pos] = mf->hash4_tab[hash4];
-		mf->hash4_tab[hash4] = cur_pos;
-
-		next_hashseq = get_unaligned_le32(++in_next);
-		hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER);
-		hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER);
-		cur_pos++;
-	} while (--remaining);
-
-	prefetchw(&mf->hash3_tab[hash3]);
-	prefetchw(&mf->hash4_tab[hash4]);
-	next_hashes[0] = hash3;
-	next_hashes[1] = hash4;
-
-	return in_next;
-}
diff --git a/ext/libdeflate/lib/lib_common.h b/ext/libdeflate/lib/lib_common.h
deleted file mode 100644
index e3f33ef5..00000000
--- a/ext/libdeflate/lib/lib_common.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * lib_common.h - internal header included by all library code
- */
-
-#ifndef LIB_LIB_COMMON_H
-#define LIB_LIB_COMMON_H
-
-#ifdef LIBDEFLATE_H
-#  error "lib_common.h must always be included before libdeflate.h"
-   /* because BUILDING_LIBDEFLATE must be set first */
-#endif
-
-#define BUILDING_LIBDEFLATE
-
-#include "common_defs.h"
-
-/*
- * Prefix with "_libdeflate_" all global symbols which are not part of the API.
- * This avoids exposing overly generic names when libdeflate is built as a
- * static library.
- *
- * Note that the chosen prefix is not really important and can be changed
- * without breaking library users.  It was just chosen so that the resulting
- * symbol names are unlikely to conflict with those from any other software.
- * Also note that this fixup has no useful effect when libdeflate is built as a
- * shared library, since these symbols are not exported.
- */
-#define SYM_FIXUP(sym)			_libdeflate_##sym
-#define aligned_malloc			SYM_FIXUP(aligned_malloc)
-#define aligned_free			SYM_FIXUP(aligned_free)
-#define deflate_get_compression_level	SYM_FIXUP(deflate_get_compression_level)
-#define _cpu_features			SYM_FIXUP(_cpu_features)
-#define setup_cpu_features		SYM_FIXUP(setup_cpu_features)
-
-#endif /* LIB_LIB_COMMON_H */
diff --git a/ext/libdeflate/lib/matchfinder_common.h b/ext/libdeflate/lib/matchfinder_common.h
deleted file mode 100644
index edd9fb70..00000000
--- a/ext/libdeflate/lib/matchfinder_common.h
+++ /dev/null
@@ -1,168 +0,0 @@
-/*
- * matchfinder_common.h - common code for Lempel-Ziv matchfinding
- */
-
-#ifndef LIB_MATCHFINDER_COMMON_H
-#define LIB_MATCHFINDER_COMMON_H
-
-#include "lib_common.h"
-#include "unaligned.h"
-
-#ifndef MATCHFINDER_WINDOW_ORDER
-#  error "MATCHFINDER_WINDOW_ORDER must be defined!"
-#endif
-
-#define MATCHFINDER_WINDOW_SIZE (1UL << MATCHFINDER_WINDOW_ORDER)
-
-typedef s16 mf_pos_t;
-
-#define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE)
-
-#define MATCHFINDER_ALIGNMENT 8
-
-#define arch_matchfinder_init(data, size)	false
-#define arch_matchfinder_rebase(data, size)	false
-
-#ifdef _aligned_attribute
-#  if defined(__arm__) || defined(__aarch64__)
-#    include "arm/matchfinder_impl.h"
-#  elif defined(__i386__) || defined(__x86_64__)
-#    include "x86/matchfinder_impl.h"
-#  endif
-#endif
-
-/*
- * Initialize the hash table portion of the matchfinder.
- *
- * Essentially, this is an optimized memset().
- *
- * 'data' must be aligned to a MATCHFINDER_ALIGNMENT boundary.
- */
-static forceinline void
-matchfinder_init(mf_pos_t *data, size_t num_entries)
-{
-	size_t i;
-
-	if (arch_matchfinder_init(data, num_entries * sizeof(data[0])))
-		return;
-
-	for (i = 0; i < num_entries; i++)
-		data[i] = MATCHFINDER_INITVAL;
-}
-
-/*
- * Slide the matchfinder by WINDOW_SIZE bytes.
- *
- * This must be called just after each WINDOW_SIZE bytes have been run through
- * the matchfinder.
- *
- * This will subtract WINDOW_SIZE bytes from each entry in the array specified.
- * The effect is that all entries are updated to be relative to the current
- * position, rather than the position WINDOW_SIZE bytes prior.
- *
- * Underflow is detected and replaced with signed saturation.  This ensures that
- * once the sliding window has passed over a position, that position forever
- * remains out of bounds.
- *
- * The array passed in must contain all matchfinder data that is
- * position-relative.  Concretely, this will include the hash table as well as
- * the table of positions that is used to link together the sequences in each
- * hash bucket.  Note that in the latter table, the links are 1-ary in the case
- * of "hash chains", and 2-ary in the case of "binary trees".  In either case,
- * the links need to be rebased in the same way.
- */
-static forceinline void
-matchfinder_rebase(mf_pos_t *data, size_t num_entries)
-{
-	size_t i;
-
-	if (arch_matchfinder_rebase(data, num_entries * sizeof(data[0])))
-		return;
-
-	if (MATCHFINDER_WINDOW_SIZE == 32768) {
-		/* Branchless version for 32768 byte windows.  If the value was
-		 * already negative, clear all bits except the sign bit; this
-		 * changes the value to -32768.  Otherwise, set the sign bit;
-		 * this is equivalent to subtracting 32768.  */
-		for (i = 0; i < num_entries; i++) {
-			u16 v = data[i];
-			u16 sign_bit = v & 0x8000;
-			v &= sign_bit - ((sign_bit >> 15) ^ 1);
-			v |= 0x8000;
-			data[i] = v;
-		}
-		return;
-	}
-
-	for (i = 0; i < num_entries; i++) {
-		if (data[i] >= 0)
-			data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
-		else
-			data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE;
-	}
-}
-
-/*
- * The hash function: given a sequence prefix held in the low-order bits of a
- * 32-bit value, multiply by a carefully-chosen large constant.  Discard any
- * bits of the product that don't fit in a 32-bit value, but take the
- * next-highest @num_bits bits of the product as the hash value, as those have
- * the most randomness.
- */
-static forceinline u32
-lz_hash(u32 seq, unsigned num_bits)
-{
-	return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits);
-}
-
-/*
- * Return the number of bytes at @matchptr that match the bytes at @strptr, up
- * to a maximum of @max_len.  Initially, @start_len bytes are matched.
- */
-static forceinline unsigned
-lz_extend(const u8 * const strptr, const u8 * const matchptr,
-	  const unsigned start_len, const unsigned max_len)
-{
-	unsigned len = start_len;
-	machine_word_t v_word;
-
-	if (UNALIGNED_ACCESS_IS_FAST) {
-
-		if (likely(max_len - len >= 4 * WORDBYTES)) {
-
-		#define COMPARE_WORD_STEP				\
-			v_word = load_word_unaligned(&matchptr[len]) ^	\
-				 load_word_unaligned(&strptr[len]);	\
-			if (v_word != 0)				\
-				goto word_differs;			\
-			len += WORDBYTES;				\
-
-			COMPARE_WORD_STEP
-			COMPARE_WORD_STEP
-			COMPARE_WORD_STEP
-			COMPARE_WORD_STEP
-		#undef COMPARE_WORD_STEP
-		}
-
-		while (len + WORDBYTES <= max_len) {
-			v_word = load_word_unaligned(&matchptr[len]) ^
-				 load_word_unaligned(&strptr[len]);
-			if (v_word != 0)
-				goto word_differs;
-			len += WORDBYTES;
-		}
-	}
-
-	while (len < max_len && matchptr[len] == strptr[len])
-		len++;
-	return len;
-
-word_differs:
-	if (CPU_IS_LITTLE_ENDIAN())
-		len += (bsfw(v_word) >> 3);
-	else
-		len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
-	return len;
-}
-
-#endif /* LIB_MATCHFINDER_COMMON_H */
diff --git a/ext/libdeflate/lib/unaligned.h b/ext/libdeflate/lib/unaligned.h
deleted file mode 100644
index 7aeaf0c5..00000000
--- a/ext/libdeflate/lib/unaligned.h
+++ /dev/null
@@ -1,202 +0,0 @@
-/*
- * unaligned.h - inline functions for unaligned memory accesses
- */
-
-#ifndef LIB_UNALIGNED_H
-#define LIB_UNALIGNED_H
-
-#include "lib_common.h"
-
-/*
- * Naming note:
- *
- * {load,store}_*_unaligned() deal with raw bytes without endianness conversion.
- * {get,put}_unaligned_*() deal with a specific endianness.
- */
-
-DEFINE_UNALIGNED_TYPE(u16)
-DEFINE_UNALIGNED_TYPE(u32)
-DEFINE_UNALIGNED_TYPE(u64)
-DEFINE_UNALIGNED_TYPE(machine_word_t)
-
-#define load_word_unaligned	load_machine_word_t_unaligned
-#define store_word_unaligned	store_machine_word_t_unaligned
-
-/***** Unaligned loads  *****/
-
-static forceinline u16
-get_unaligned_le16(const u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST)
-		return le16_bswap(load_u16_unaligned(p));
-	else
-		return ((u16)p[1] << 8) | p[0];
-}
-
-static forceinline u16
-get_unaligned_be16(const u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST)
-		return be16_bswap(load_u16_unaligned(p));
-	else
-		return ((u16)p[0] << 8) | p[1];
-}
-
-static forceinline u32
-get_unaligned_le32(const u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST)
-		return le32_bswap(load_u32_unaligned(p));
-	else
-		return ((u32)p[3] << 24) | ((u32)p[2] << 16) |
-			((u32)p[1] << 8) | p[0];
-}
-
-static forceinline u32
-get_unaligned_be32(const u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST)
-		return be32_bswap(load_u32_unaligned(p));
-	else
-		return ((u32)p[0] << 24) | ((u32)p[1] << 16) |
-			((u32)p[2] << 8) | p[3];
-}
-
-static forceinline u64
-get_unaligned_le64(const u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST)
-		return le64_bswap(load_u64_unaligned(p));
-	else
-		return ((u64)p[7] << 56) | ((u64)p[6] << 48) |
-			((u64)p[5] << 40) | ((u64)p[4] << 32) |
-			((u64)p[3] << 24) | ((u64)p[2] << 16) |
-			((u64)p[1] << 8) | p[0];
-}
-
-static forceinline machine_word_t
-get_unaligned_leword(const u8 *p)
-{
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-	if (WORDBITS == 32)
-		return get_unaligned_le32(p);
-	else
-		return get_unaligned_le64(p);
-}
-
-/***** Unaligned stores  *****/
-
-static forceinline void
-put_unaligned_le16(u16 v, u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		store_u16_unaligned(le16_bswap(v), p);
-	} else {
-		p[0] = (u8)(v >> 0);
-		p[1] = (u8)(v >> 8);
-	}
-}
-
-static forceinline void
-put_unaligned_be16(u16 v, u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		store_u16_unaligned(be16_bswap(v), p);
-	} else {
-		p[0] = (u8)(v >> 8);
-		p[1] = (u8)(v >> 0);
-	}
-}
-
-static forceinline void
-put_unaligned_le32(u32 v, u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		store_u32_unaligned(le32_bswap(v), p);
-	} else {
-		p[0] = (u8)(v >> 0);
-		p[1] = (u8)(v >> 8);
-		p[2] = (u8)(v >> 16);
-		p[3] = (u8)(v >> 24);
-	}
-}
-
-static forceinline void
-put_unaligned_be32(u32 v, u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		store_u32_unaligned(be32_bswap(v), p);
-	} else {
-		p[0] = (u8)(v >> 24);
-		p[1] = (u8)(v >> 16);
-		p[2] = (u8)(v >> 8);
-		p[3] = (u8)(v >> 0);
-	}
-}
-
-static forceinline void
-put_unaligned_le64(u64 v, u8 *p)
-{
-	if (UNALIGNED_ACCESS_IS_FAST) {
-		store_u64_unaligned(le64_bswap(v), p);
-	} else {
-		p[0] = (u8)(v >> 0);
-		p[1] = (u8)(v >> 8);
-		p[2] = (u8)(v >> 16);
-		p[3] = (u8)(v >> 24);
-		p[4] = (u8)(v >> 32);
-		p[5] = (u8)(v >> 40);
-		p[6] = (u8)(v >> 48);
-		p[7] = (u8)(v >> 56);
-	}
-}
-
-static forceinline void
-put_unaligned_leword(machine_word_t v, u8 *p)
-{
-	STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
-	if (WORDBITS == 32)
-		put_unaligned_le32(v, p);
-	else
-		put_unaligned_le64(v, p);
-}
-
-/***** 24-bit loads *****/
-
-/*
- * Given a 32-bit value that was loaded with the platform's native endianness,
- * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24
- * bits contain the first 3 bytes, arranged in octets in a platform-dependent
- * order, at the memory location from which the input 32-bit value was loaded.
- */
-static forceinline u32
-loaded_u32_to_u24(u32 v)
-{
-	if (CPU_IS_LITTLE_ENDIAN())
-		return v & 0xFFFFFF;
-	else
-		return v >> 8;
-}
-
-/*
- * Load the next 3 bytes from the memory location @p into the 24 low-order bits
- * of a 32-bit value.  The order in which the 3 bytes will be arranged as octets
- * in the 24 bits is platform-dependent.  At least LOAD_U24_REQUIRED_NBYTES
- * bytes must be available at @p; note that this may be more than 3.
- */
-static forceinline u32
-load_u24_unaligned(const u8 *p)
-{
-#if UNALIGNED_ACCESS_IS_FAST
-#  define LOAD_U24_REQUIRED_NBYTES 4
-	return loaded_u32_to_u24(load_u32_unaligned(p));
-#else
-#  define LOAD_U24_REQUIRED_NBYTES 3
-	if (CPU_IS_LITTLE_ENDIAN())
-		return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16);
-	else
-		return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16);
-#endif
-}
-
-#endif /* LIB_UNALIGNED_H */
diff --git a/ext/libdeflate/lib/x86/adler32_impl.h b/ext/libdeflate/lib/x86/adler32_impl.h
deleted file mode 100644
index 4627a41c..00000000
--- a/ext/libdeflate/lib/x86/adler32_impl.h
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "cpu_features.h"
-
-/*
- * The following macros horizontally sum the s1 counters and add them to the
- * real s1, and likewise for s2.  They do this via a series of reductions, each
- * of which halves the vector length, until just one counter remains.
- *
- * The s1 reductions don't depend on the s2 reductions and vice versa, so for
- * efficiency they are interleaved.  Also, every other s1 counter is 0 due to
- * the 'psadbw' instruction (_mm_sad_epu8) summing groups of 8 bytes rather than
- * 4; hence, one of the s1 reductions is skipped when going from 128 => 32 bits.
- */
-
-#define ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2)		    \
-{									    \
-	__v4si s1_last = (v_s1), s2_last = (v_s2);			    \
-									    \
-	/* 128 => 32 bits */						    \
-	s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x31);	    \
-	s1_last += (__v4si)_mm_shuffle_epi32((__m128i)s1_last, 0x02);	    \
-	s2_last += (__v4si)_mm_shuffle_epi32((__m128i)s2_last, 0x02);	    \
-									    \
-	*(s1) += (u32)_mm_cvtsi128_si32((__m128i)s1_last);		    \
-	*(s2) += (u32)_mm_cvtsi128_si32((__m128i)s2_last);		    \
-}
-
-#define ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2)		    \
-{									    \
-	__v4si s1_128bit, s2_128bit;					    \
-									    \
-	/* 256 => 128 bits */						    \
-	s1_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 0) +  \
-		    (__v4si)_mm256_extracti128_si256((__m256i)(v_s1), 1);   \
-	s2_128bit = (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 0) +  \
-		    (__v4si)_mm256_extracti128_si256((__m256i)(v_s2), 1);   \
-									    \
-	ADLER32_FINISH_VEC_CHUNK_128((s1), (s2), s1_128bit, s2_128bit);	    \
-}
-
-#define ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2)		    \
-{									    \
-	__v8si s1_256bit, s2_256bit;					    \
-									    \
-	/* 512 => 256 bits */						    \
-	s1_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 0) + \
-		    (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s1), 1);  \
-	s2_256bit = (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 0) + \
-		    (__v8si)_mm512_extracti64x4_epi64((__m512i)(v_s2), 1);  \
-									    \
-	ADLER32_FINISH_VEC_CHUNK_256((s1), (s2), s1_256bit, s2_256bit);	    \
-}
-
-/* AVX-512BW implementation: like the AVX2 one, but does 64 bytes at a time */
-#undef DISPATCH_AVX512BW
-#if !defined(DEFAULT_IMPL) &&						\
-    /*
-     * clang before v3.9 is missing some AVX-512BW intrinsics including
-     * _mm512_sad_epu8(), a.k.a. __builtin_ia32_psadbw512.  So just make using
-     * AVX-512BW, even when __AVX512BW__ is defined, conditional on
-     * COMPILER_SUPPORTS_AVX512BW_TARGET where we check for that builtin.
-     */									\
-    COMPILER_SUPPORTS_AVX512BW_TARGET &&				\
-    (defined(__AVX512BW__) || (X86_CPU_FEATURES_ENABLED &&		\
-			       COMPILER_SUPPORTS_AVX512BW_TARGET_INTRINSICS))
-#  define FUNCNAME		adler32_avx512bw
-#  define FUNCNAME_CHUNK	adler32_avx512bw_chunk
-#  define IMPL_ALIGNMENT	64
-#  define IMPL_SEGMENT_SIZE	64
-#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
-#  ifdef __AVX512BW__
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	adler32_avx512bw
-#  else
-#    define ATTRIBUTES		__attribute__((target("avx512bw")))
-#    define DISPATCH		1
-#    define DISPATCH_AVX512BW	1
-#  endif
-#  include <immintrin.h>
-static forceinline ATTRIBUTES void
-adler32_avx512bw_chunk(const __m512i *p, const __m512i *const end,
-		       u32 *s1, u32 *s2)
-{
-	const __m512i zeroes = _mm512_setzero_si512();
-	const __v64qi multipliers = (__v64qi){
-		64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49,
-		48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33,
-		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
-		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
-	};
-	const __v32hi ones = (__v32hi)_mm512_set1_epi16(1);
-	__v16si v_s1 = (__v16si)zeroes;
-	__v16si v_s1_sums = (__v16si)zeroes;
-	__v16si v_s2 = (__v16si)zeroes;
-
-	do {
-		/* Load the next 64-byte segment */
-		__m512i bytes = *p++;
-
-		/* Multiply the bytes by 64...1 (the number of times they need
-		 * to be added to s2) and add adjacent products */
-		__v32hi sums = (__v32hi)_mm512_maddubs_epi16(
-						bytes, (__m512i)multipliers);
-
-		/* Keep sum of all previous s1 counters, for adding to s2 later.
-		 * This allows delaying the multiplication by 64 to the end. */
-		v_s1_sums += v_s1;
-
-		/* Add the sum of each group of 8 bytes to the corresponding s1
-		 * counter */
-		v_s1 += (__v16si)_mm512_sad_epu8(bytes, zeroes);
-
-		/* Add the sum of each group of 4 products of the bytes by
-		 * 64...1 to the corresponding s2 counter */
-		v_s2 += (__v16si)_mm512_madd_epi16((__m512i)sums,
-						   (__m512i)ones);
-	} while (p != end);
-
-	/* Finish the s2 counters by adding the sum of the s1 values at the
-	 * beginning of each segment, multiplied by the segment size (64) */
-	v_s2 += (__v16si)_mm512_slli_epi32((__m512i)v_s1_sums, 6);
-
-	/* Add the counters to the real s1 and s2 */
-	ADLER32_FINISH_VEC_CHUNK_512(s1, s2, v_s1, v_s2);
-}
-#  include "../adler32_vec_template.h"
-#endif /* AVX-512BW implementation */
-
-/* AVX2 implementation: like the AVX-512BW one, but does 32 bytes at a time */
-#undef DISPATCH_AVX2
-#if !defined(DEFAULT_IMPL) &&	\
-	(defined(__AVX2__) || (X86_CPU_FEATURES_ENABLED &&	\
-			       COMPILER_SUPPORTS_AVX2_TARGET_INTRINSICS))
-#  define FUNCNAME		adler32_avx2
-#  define FUNCNAME_CHUNK	adler32_avx2_chunk
-#  define IMPL_ALIGNMENT	32
-#  define IMPL_SEGMENT_SIZE	32
-#  define IMPL_MAX_CHUNK_SIZE	MAX_CHUNK_SIZE
-#  ifdef __AVX2__
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	adler32_avx2
-#  else
-#    define ATTRIBUTES		__attribute__((target("avx2")))
-#    define DISPATCH		1
-#    define DISPATCH_AVX2	1
-#  endif
-#  include <immintrin.h>
-static forceinline ATTRIBUTES void
-adler32_avx2_chunk(const __m256i *p, const __m256i *const end, u32 *s1, u32 *s2)
-{
-	const __m256i zeroes = _mm256_setzero_si256();
-	const __v32qi multipliers = (__v32qi){
-		32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17,
-		16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5,  4,  3,  2,  1,
-	};
-	const __v16hi ones = (__v16hi)_mm256_set1_epi16(1);
-	__v8si v_s1 = (__v8si)zeroes;
-	__v8si v_s1_sums = (__v8si)zeroes;
-	__v8si v_s2 = (__v8si)zeroes;
-
-	do {
-		/* Load the next 32-byte segment */
-		__m256i bytes = *p++;
-
-		/* Multiply the bytes by 32...1 (the number of times they need
-		 * to be added to s2) and add adjacent products */
-		__v16hi sums = (__v16hi)_mm256_maddubs_epi16(
-						bytes, (__m256i)multipliers);
-
-		/* Keep sum of all previous s1 counters, for adding to s2 later.
-		 * This allows delaying the multiplication by 32 to the end. */
-		v_s1_sums += v_s1;
-
-		/* Add the sum of each group of 8 bytes to the corresponding s1
-		 * counter */
-		v_s1 += (__v8si)_mm256_sad_epu8(bytes, zeroes);
-
-		/* Add the sum of each group of 4 products of the bytes by
-		 * 32...1 to the corresponding s2 counter */
-		v_s2 += (__v8si)_mm256_madd_epi16((__m256i)sums, (__m256i)ones);
-	} while (p != end);
-
-	/* Finish the s2 counters by adding the sum of the s1 values at the
-	 * beginning of each segment, multiplied by the segment size (32) */
-	v_s2 += (__v8si)_mm256_slli_epi32((__m256i)v_s1_sums, 5);
-
-	/* Add the counters to the real s1 and s2 */
-	ADLER32_FINISH_VEC_CHUNK_256(s1, s2, v_s1, v_s2);
-}
-#  include "../adler32_vec_template.h"
-#endif /* AVX2 implementation */
-
-/* SSE2 implementation */
-#undef DISPATCH_SSE2
-#if !defined(DEFAULT_IMPL) &&	\
-	(defined(__SSE2__) || (X86_CPU_FEATURES_ENABLED &&	\
-			       COMPILER_SUPPORTS_SSE2_TARGET_INTRINSICS))
-#  define FUNCNAME		adler32_sse2
-#  define FUNCNAME_CHUNK	adler32_sse2_chunk
-#  define IMPL_ALIGNMENT	16
-#  define IMPL_SEGMENT_SIZE	32
-/*
- * The 16-bit precision byte counters must not be allowed to undergo *signed*
- * overflow, otherwise the signed multiplications at the end (_mm_madd_epi16)
- * would behave incorrectly.
- */
-#  define IMPL_MAX_CHUNK_SIZE	(32 * (0x7FFF / 0xFF))
-#  ifdef __SSE2__
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	adler32_sse2
-#  else
-#    define ATTRIBUTES		__attribute__((target("sse2")))
-#    define DISPATCH		1
-#    define DISPATCH_SSE2	1
-#  endif
-#  include <emmintrin.h>
-static forceinline ATTRIBUTES void
-adler32_sse2_chunk(const __m128i *p, const __m128i *const end, u32 *s1, u32 *s2)
-{
-	const __m128i zeroes = _mm_setzero_si128();
-
-	/* s1 counters: 32-bit, sum of bytes */
-	__v4si v_s1 = (__v4si)zeroes;
-
-	/* s2 counters: 32-bit, sum of s1 values */
-	__v4si v_s2 = (__v4si)zeroes;
-
-	/*
-	 * Thirty-two 16-bit counters for byte sums.  Each accumulates the bytes
-	 * that eventually need to be multiplied by a number 32...1 for addition
-	 * into s2.
-	 */
-	__v8hi v_byte_sums_a = (__v8hi)zeroes;
-	__v8hi v_byte_sums_b = (__v8hi)zeroes;
-	__v8hi v_byte_sums_c = (__v8hi)zeroes;
-	__v8hi v_byte_sums_d = (__v8hi)zeroes;
-
-	do {
-		/* Load the next 32 bytes */
-		const __m128i bytes1 = *p++;
-		const __m128i bytes2 = *p++;
-
-		/*
-		 * Accumulate the previous s1 counters into the s2 counters.
-		 * Logically, this really should be v_s2 += v_s1 * 32, but we
-		 * can do the multiplication (or left shift) later.
-		 */
-		v_s2 += v_s1;
-
-		/*
-		 * s1 update: use "Packed Sum of Absolute Differences" to add
-		 * the bytes horizontally with 8 bytes per sum.  Then add the
-		 * sums to the s1 counters.
-		 */
-		v_s1 += (__v4si)_mm_sad_epu8(bytes1, zeroes);
-		v_s1 += (__v4si)_mm_sad_epu8(bytes2, zeroes);
-
-		/*
-		 * Also accumulate the bytes into 32 separate counters that have
-		 * 16-bit precision.
-		 */
-		v_byte_sums_a += (__v8hi)_mm_unpacklo_epi8(bytes1, zeroes);
-		v_byte_sums_b += (__v8hi)_mm_unpackhi_epi8(bytes1, zeroes);
-		v_byte_sums_c += (__v8hi)_mm_unpacklo_epi8(bytes2, zeroes);
-		v_byte_sums_d += (__v8hi)_mm_unpackhi_epi8(bytes2, zeroes);
-
-	} while (p != end);
-
-	/* Finish calculating the s2 counters */
-	v_s2 = (__v4si)_mm_slli_epi32((__m128i)v_s2, 5);
-	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_a,
-				       (__m128i)(__v8hi){ 32, 31, 30, 29, 28, 27, 26, 25 });
-	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_b,
-				       (__m128i)(__v8hi){ 24, 23, 22, 21, 20, 19, 18, 17 });
-	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_c,
-				       (__m128i)(__v8hi){ 16, 15, 14, 13, 12, 11, 10, 9 });
-	v_s2 += (__v4si)_mm_madd_epi16((__m128i)v_byte_sums_d,
-				       (__m128i)(__v8hi){ 8,  7,  6,  5,  4,  3,  2,  1 });
-
-	/* Add the counters to the real s1 and s2 */
-	ADLER32_FINISH_VEC_CHUNK_128(s1, s2, v_s1, v_s2);
-}
-#  include "../adler32_vec_template.h"
-#endif /* SSE2 implementation */
-
-#ifdef DISPATCH
-static inline adler32_func_t
-arch_select_adler32_func(void)
-{
-	u32 features = get_cpu_features();
-
-#ifdef DISPATCH_AVX512BW
-	if (features & X86_CPU_FEATURE_AVX512BW)
-		return adler32_avx512bw;
-#endif
-#ifdef DISPATCH_AVX2
-	if (features & X86_CPU_FEATURE_AVX2)
-		return adler32_avx2;
-#endif
-#ifdef DISPATCH_SSE2
-	if (features & X86_CPU_FEATURE_SSE2)
-		return adler32_sse2;
-#endif
-	return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/x86/cpu_features.c b/ext/libdeflate/lib/x86/cpu_features.c
deleted file mode 100644
index 78a7af28..00000000
--- a/ext/libdeflate/lib/x86/cpu_features.c
+++ /dev/null
@@ -1,139 +0,0 @@
-/*
- * x86/cpu_features.c - feature detection for x86 processors
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "cpu_features.h"
-
-#if X86_CPU_FEATURES_ENABLED
-
-volatile u32 _cpu_features = 0;
-
-/* With old GCC versions we have to manually save and restore the x86_32 PIC
- * register (ebx).  See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602  */
-#if defined(__i386__) && defined(__PIC__)
-#  define EBX_CONSTRAINT "=r"
-#else
-#  define EBX_CONSTRAINT "=b"
-#endif
-
-/* Execute the CPUID instruction.  */
-static inline void
-cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d)
-{
-	__asm__(".ifnc %%ebx, %1; mov  %%ebx, %1; .endif\n"
-		"cpuid                                  \n"
-		".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n"
-		: "=a" (*a), EBX_CONSTRAINT (*b), "=c" (*c), "=d" (*d)
-		: "a" (leaf), "c" (subleaf));
-}
-
-/* Read an extended control register.  */
-static inline u64
-read_xcr(u32 index)
-{
-	u32 edx, eax;
-
-	/* Execute the "xgetbv" instruction.  Old versions of binutils do not
-	 * recognize this instruction, so list the raw bytes instead.  */
-	__asm__ (".byte 0x0f, 0x01, 0xd0" : "=d" (edx), "=a" (eax) : "c" (index));
-
-	return ((u64)edx << 32) | eax;
-}
-
-#undef BIT
-#define BIT(nr)			(1UL << (nr))
-
-#define XCR0_BIT_SSE		BIT(1)
-#define XCR0_BIT_AVX		BIT(2)
-#define XCR0_BIT_OPMASK		BIT(5)
-#define XCR0_BIT_ZMM_HI256	BIT(6)
-#define XCR0_BIT_HI16_ZMM	BIT(7)
-
-#define IS_SET(reg, nr)		((reg) & BIT(nr))
-#define IS_ALL_SET(reg, mask)	(((reg) & (mask)) == (mask))
-
-/* Initialize _cpu_features with bits for interesting processor features. */
-void setup_cpu_features(void)
-{
-	u32 features = 0;
-	u32 dummy1, dummy2, dummy3, dummy4;
-	u32 max_function;
-	u32 features_1, features_2, features_3, features_4;
-	bool os_avx_support = false;
-	bool os_avx512_support = false;
-
-	/* Get maximum supported function  */
-	cpuid(0, 0, &max_function, &dummy2, &dummy3, &dummy4);
-	if (max_function < 1)
-		goto out;
-
-	/* Standard feature flags  */
-	cpuid(1, 0, &dummy1, &dummy2, &features_2, &features_1);
-
-	if (IS_SET(features_1, 26))
-		features |= X86_CPU_FEATURE_SSE2;
-
-	if (IS_SET(features_2, 1))
-		features |= X86_CPU_FEATURE_PCLMULQDQ;
-
-	if (IS_SET(features_2, 27)) { /* OSXSAVE set? */
-		u64 xcr0 = read_xcr(0);
-
-		os_avx_support = IS_ALL_SET(xcr0,
-					    XCR0_BIT_SSE |
-					    XCR0_BIT_AVX);
-
-		os_avx512_support = IS_ALL_SET(xcr0,
-					       XCR0_BIT_SSE |
-					       XCR0_BIT_AVX |
-					       XCR0_BIT_OPMASK |
-					       XCR0_BIT_ZMM_HI256 |
-					       XCR0_BIT_HI16_ZMM);
-	}
-
-	if (os_avx_support && IS_SET(features_2, 28))
-		features |= X86_CPU_FEATURE_AVX;
-
-	if (max_function < 7)
-		goto out;
-
-	/* Extended feature flags  */
-	cpuid(7, 0, &dummy1, &features_3, &features_4, &dummy4);
-
-	if (os_avx_support && IS_SET(features_3, 5))
-		features |= X86_CPU_FEATURE_AVX2;
-
-	if (IS_SET(features_3, 8))
-		features |= X86_CPU_FEATURE_BMI2;
-
-	if (os_avx512_support && IS_SET(features_3, 30))
-		features |= X86_CPU_FEATURE_AVX512BW;
-
-out:
-	_cpu_features = features | X86_CPU_FEATURES_KNOWN;
-}
-
-#endif /* X86_CPU_FEATURES_ENABLED */
diff --git a/ext/libdeflate/lib/x86/cpu_features.h b/ext/libdeflate/lib/x86/cpu_features.h
deleted file mode 100644
index b2241818..00000000
--- a/ext/libdeflate/lib/x86/cpu_features.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * x86/cpu_features.h - feature detection for x86 processors
- */
-
-#ifndef LIB_X86_CPU_FEATURES_H
-#define LIB_X86_CPU_FEATURES_H
-
-#include "../lib_common.h"
-
-#if (defined(__i386__) || defined(__x86_64__)) && \
-	COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE
-#  define X86_CPU_FEATURES_ENABLED 1
-#else
-#  define X86_CPU_FEATURES_ENABLED 0
-#endif
-
-#if X86_CPU_FEATURES_ENABLED
-
-#define X86_CPU_FEATURE_SSE2		0x00000001
-#define X86_CPU_FEATURE_PCLMULQDQ	0x00000002
-#define X86_CPU_FEATURE_AVX		0x00000004
-#define X86_CPU_FEATURE_AVX2		0x00000008
-#define X86_CPU_FEATURE_BMI2		0x00000010
-#define X86_CPU_FEATURE_AVX512BW	0x00000020
-
-#define X86_CPU_FEATURES_KNOWN		0x80000000
-
-extern volatile u32 _cpu_features;
-
-extern void setup_cpu_features(void);
-
-static inline u32 get_cpu_features(void)
-{
-	if (_cpu_features == 0)
-		setup_cpu_features();
-	return _cpu_features;
-}
-
-#endif /* X86_CPU_FEATURES_ENABLED */
-
-#endif /* LIB_X86_CPU_FEATURES_H */
diff --git a/ext/libdeflate/lib/x86/crc32_impl.h b/ext/libdeflate/lib/x86/crc32_impl.h
deleted file mode 100644
index ff896268..00000000
--- a/ext/libdeflate/lib/x86/crc32_impl.h
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * x86/crc32_impl.h - x86 implementations of CRC-32 checksum algorithm
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "cpu_features.h"
-
-/*
- * Include the PCLMUL/AVX implementation?  Although our PCLMUL-optimized CRC-32
- * function doesn't use any AVX intrinsics specifically, it can benefit a lot
- * from being compiled for an AVX target: on Skylake, ~16700 MB/s vs. ~10100
- * MB/s.  I expect this is related to the PCLMULQDQ instructions being assembled
- * in the newer three-operand form rather than the older two-operand form.
- *
- * Note: this is only needed if __AVX__ is *not* defined, since otherwise the
- * "regular" PCLMUL implementation would already be AVX enabled.
- */
-#undef DISPATCH_PCLMUL_AVX
-#if !defined(DEFAULT_IMPL) && !defined(__AVX__) &&	\
-	X86_CPU_FEATURES_ENABLED && COMPILER_SUPPORTS_AVX_TARGET &&	\
-	(defined(__PCLMUL__) || COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS)
-#  define FUNCNAME		crc32_pclmul_avx
-#  define FUNCNAME_ALIGNED	crc32_pclmul_avx_aligned
-#  define ATTRIBUTES		__attribute__((target("pclmul,avx")))
-#  define DISPATCH		1
-#  define DISPATCH_PCLMUL_AVX	1
-#  include "crc32_pclmul_template.h"
-#endif
-
-/* PCLMUL implementation */
-#undef DISPATCH_PCLMUL
-#if !defined(DEFAULT_IMPL) &&	\
-	(defined(__PCLMUL__) || (X86_CPU_FEATURES_ENABLED &&	\
-				 COMPILER_SUPPORTS_PCLMUL_TARGET_INTRINSICS))
-#  define FUNCNAME		crc32_pclmul
-#  define FUNCNAME_ALIGNED	crc32_pclmul_aligned
-#  ifdef __PCLMUL__
-#    define ATTRIBUTES
-#    define DEFAULT_IMPL	crc32_pclmul
-#  else
-#    define ATTRIBUTES		__attribute__((target("pclmul")))
-#    define DISPATCH		1
-#    define DISPATCH_PCLMUL	1
-#  endif
-#  include "crc32_pclmul_template.h"
-#endif
-
-#ifdef DISPATCH
-static inline crc32_func_t
-arch_select_crc32_func(void)
-{
-	u32 features = get_cpu_features();
-
-#ifdef DISPATCH_PCLMUL_AVX
-	if ((features & X86_CPU_FEATURE_PCLMULQDQ) &&
-	    (features & X86_CPU_FEATURE_AVX))
-		return crc32_pclmul_avx;
-#endif
-#ifdef DISPATCH_PCLMUL
-	if (features & X86_CPU_FEATURE_PCLMULQDQ)
-		return crc32_pclmul;
-#endif
-	return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/x86/crc32_pclmul_template.h b/ext/libdeflate/lib/x86/crc32_pclmul_template.h
deleted file mode 100644
index eb4c4ba8..00000000
--- a/ext/libdeflate/lib/x86/crc32_pclmul_template.h
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * x86/crc32_pclmul_template.h
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include <wmmintrin.h>
-
-/*
- * CRC-32 folding with PCLMULQDQ.
- *
- * The basic idea is to repeatedly "fold" each 512 bits into the next 512 bits,
- * producing an abbreviated message which is congruent the original message
- * modulo the generator polynomial G(x).
- *
- * Folding each 512 bits is implemented as eight 64-bit folds, each of which
- * uses one carryless multiplication instruction.  It's expected that CPUs may
- * be able to execute some of these multiplications in parallel.
- *
- * Explanation of "folding": let A(x) be 64 bits from the message, and let B(x)
- * be 95 bits from a constant distance D later in the message.  The relevant
- * portion of the message can be written as:
- *
- *	M(x) = A(x)*x^D + B(x)
- *
- * ... where + and * represent addition and multiplication, respectively, of
- * polynomials over GF(2).  Note that when implemented on a computer, these
- * operations are equivalent to XOR and carryless multiplication, respectively.
- *
- * For the purpose of CRC calculation, only the remainder modulo the generator
- * polynomial G(x) matters:
- *
- *	M(x) mod G(x) = (A(x)*x^D + B(x)) mod G(x)
- *
- * Since the modulo operation can be applied anywhere in a sequence of additions
- * and multiplications without affecting the result, this is equivalent to:
- *
- *	M(x) mod G(x) = (A(x)*(x^D mod G(x)) + B(x)) mod G(x)
- *
- * For any D, 'x^D mod G(x)' will be a polynomial with maximum degree 31, i.e.
- * a 32-bit quantity.  So 'A(x) * (x^D mod G(x))' is equivalent to a carryless
- * multiplication of a 64-bit quantity by a 32-bit quantity, producing a 95-bit
- * product.  Then, adding (XOR-ing) the product to B(x) produces a polynomial
- * with the same length as B(x) but with the same remainder as 'A(x)*x^D +
- * B(x)'.  This is the basic fold operation with 64 bits.
- *
- * Note that the carryless multiplication instruction PCLMULQDQ actually takes
- * two 64-bit inputs and produces a 127-bit product in the low-order bits of a
- * 128-bit XMM register.  This works fine, but care must be taken to account for
- * "bit endianness".  With the CRC version implemented here, bits are always
- * ordered such that the lowest-order bit represents the coefficient of highest
- * power of x and the highest-order bit represents the coefficient of the lowest
- * power of x.  This is backwards from the more intuitive order.  Still,
- * carryless multiplication works essentially the same either way.  It just must
- * be accounted for that when we XOR the 95-bit product in the low-order 95 bits
- * of a 128-bit XMM register into 128-bits of later data held in another XMM
- * register, we'll really be XOR-ing the product into the mathematically higher
- * degree end of those later bits, not the lower degree end as may be expected.
- *
- * So given that caveat and the fact that we process 512 bits per iteration, the
- * 'D' values we need for the two 64-bit halves of each 128 bits of data are:
- *
- *	D = (512 + 95) - 64	 for the higher-degree half of each 128 bits,
- *				 i.e. the lower order bits in the XMM register
- *
- *	D = (512 + 95) - 128	 for the lower-degree half of each 128 bits,
- *				 i.e. the higher order bits in the XMM register
- *
- * The required 'x^D mod G(x)' values were precomputed.
- *
- * When <= 512 bits remain in the message, we finish up by folding across
- * smaller distances.  This works similarly; the distance D is just different,
- * so different constant multipliers must be used.  Finally, once the remaining
- * message is just 64 bits, it is is reduced to the CRC-32 using Barrett
- * reduction (explained later).
- *
- * For more information see the original paper from Intel:
- *	"Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
- *	December 2009
- *	http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
- */
-static u32 ATTRIBUTES
-FUNCNAME_ALIGNED(u32 remainder, const __m128i *p, size_t nr_segs)
-{
-	/* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
-	const __v2di multipliers_4 = (__v2di){ 0x8F352D95, 0x1D9513D7 };
-	const __v2di multipliers_2 = (__v2di){ 0xF1DA05AA, 0x81256527 };
-	const __v2di multipliers_1 = (__v2di){ 0xAE689191, 0xCCAA009E };
-	const __v2di final_multiplier = (__v2di){ 0xB8BC6765 };
-	const __m128i mask32 = (__m128i)(__v4si){ 0xFFFFFFFF };
-	const __v2di barrett_reduction_constants =
-			(__v2di){ 0x00000001F7011641, 0x00000001DB710641 };
-
-	const __m128i * const end = p + nr_segs;
-	const __m128i * const end512 = p + (nr_segs & ~3);
-	__m128i x0, x1, x2, x3;
-
-	/*
-	 * Account for the current 'remainder', i.e. the CRC of the part of the
-	 * message already processed.  Explanation: rewrite the message
-	 * polynomial M(x) in terms of the first part A(x), the second part
-	 * B(x), and the length of the second part in bits |B(x)| >= 32:
-	 *
-	 *	M(x) = A(x)*x^|B(x)| + B(x)
-	 *
-	 * Then the CRC of M(x) is:
-	 *
-	 *	CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
-	 *	          = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
-	 *	          = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
-	 *
-	 * Note: all arithmetic is modulo G(x), the generator polynomial; that's
-	 * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
-	 *
-	 * So the CRC of the full message is the CRC of the second part of the
-	 * message where the first 32 bits of the second part of the message
-	 * have been XOR'ed with the CRC of the first part of the message.
-	 */
-	x0 = *p++;
-	x0 ^= (__m128i)(__v4si){ remainder };
-
-	if (p > end512) /* only 128, 256, or 384 bits of input? */
-		goto _128_bits_at_a_time;
-	x1 = *p++;
-	x2 = *p++;
-	x3 = *p++;
-
-	/* Fold 512 bits at a time */
-	for (; p != end512; p += 4) {
-		__m128i y0, y1, y2, y3;
-
-		y0 = p[0];
-		y1 = p[1];
-		y2 = p[2];
-		y3 = p[3];
-
-		/*
-		 * Note: the immediate constant for PCLMULQDQ specifies which
-		 * 64-bit halves of the 128-bit vectors to multiply:
-		 *
-		 * 0x00 means low halves (higher degree polynomial terms for us)
-		 * 0x11 means high halves (lower degree polynomial terms for us)
-		 */
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x00);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x00);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x00);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x00);
-		y0 ^= _mm_clmulepi64_si128(x0, multipliers_4, 0x11);
-		y1 ^= _mm_clmulepi64_si128(x1, multipliers_4, 0x11);
-		y2 ^= _mm_clmulepi64_si128(x2, multipliers_4, 0x11);
-		y3 ^= _mm_clmulepi64_si128(x3, multipliers_4, 0x11);
-
-		x0 = y0;
-		x1 = y1;
-		x2 = y2;
-		x3 = y3;
-	}
-
-	/* Fold 512 bits => 128 bits */
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x00);
-	x2 ^= _mm_clmulepi64_si128(x0, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x1, multipliers_2, 0x11);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x00);
-	x3 ^= _mm_clmulepi64_si128(x2, multipliers_1, 0x11);
-	x0 = x3;
-
-_128_bits_at_a_time:
-	while (p != end) {
-		/* Fold 128 bits into next 128 bits */
-		x1 = *p++;
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x00);
-		x1 ^= _mm_clmulepi64_si128(x0, multipliers_1, 0x11);
-		x0 = x1;
-	}
-
-	/* Now there are just 128 bits left, stored in 'x0'. */
-
-	/*
-	 * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
-	 * which is equivalent to multiplying by x^32.  This is needed because
-	 * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
-	 */
-	x0 = _mm_srli_si128(x0, 8) ^
-	     _mm_clmulepi64_si128(x0, multipliers_1, 0x10);
-
-	/* Fold 96 => 64 bits */
-	x0 = _mm_srli_si128(x0, 4) ^
-	     _mm_clmulepi64_si128(x0 & mask32, final_multiplier, 0x00);
-
-        /*
-	 * Finally, reduce 64 => 32 bits using Barrett reduction.
-	 *
-	 * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
-	 * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
-	 *
-	 *	R(x) = (A(x)*x^32 + B(x)) mod G(x)
-	 *	     = (A(x)*x^32) mod G(x) + B(x)
-	 *
-	 * Then, by the Division Algorithm there exists a unique q(x) such that:
-	 *
-	 *	A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
-	 *
-	 * Since the left-hand side is of maximum degree 31, the right-hand side
-	 * must be too.  This implies that we can apply 'mod x^32' to the
-	 * right-hand side without changing its value:
-	 *
-	 *	(A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
-	 *
-	 * Note that '+' is equivalent to '-' in polynomials over GF(2).
-	 *
-	 * We also know that:
-	 *
-	 *	              / A(x)*x^32 \
-	 *	q(x) = floor (  ---------  )
-	 *	              \    G(x)   /
-	 *
-	 * To compute this efficiently, we can multiply the top and bottom by
-	 * x^32 and move the division by G(x) to the top:
-	 *
-	 *	              / A(x) * floor(x^64 / G(x)) \
-	 *	q(x) = floor (  -------------------------  )
-	 *	              \           x^32            /
-	 *
-	 * Note that floor(x^64 / G(x)) is a constant.
-	 *
-	 * So finally we have:
-	 *
-	 *	                          / A(x) * floor(x^64 / G(x)) \
-	 *	R(x) = B(x) + G(x)*floor (  -------------------------  )
-	 *	                          \           x^32            /
-	 */
-	x1 = x0;
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x00);
-	x0 = _mm_clmulepi64_si128(x0 & mask32, barrett_reduction_constants, 0x10);
-	return _mm_cvtsi128_si32(_mm_srli_si128(x0 ^ x1, 4));
-}
-
-#define IMPL_ALIGNMENT		16
-#define IMPL_SEGMENT_SIZE	16
-#include "../crc32_vec_template.h"
diff --git a/ext/libdeflate/lib/x86/decompress_impl.h b/ext/libdeflate/lib/x86/decompress_impl.h
deleted file mode 100644
index b3d322a1..00000000
--- a/ext/libdeflate/lib/x86/decompress_impl.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#include "cpu_features.h"
-
-/* Include the BMI2-optimized version? */
-#undef DISPATCH_BMI2
-#if !defined(__BMI2__) && X86_CPU_FEATURES_ENABLED && \
-	COMPILER_SUPPORTS_BMI2_TARGET
-#  define FUNCNAME	deflate_decompress_bmi2
-#  define ATTRIBUTES	__attribute__((target("bmi2")))
-#  define DISPATCH	1
-#  define DISPATCH_BMI2	1
-#  include "../decompress_template.h"
-#endif
-
-#ifdef DISPATCH
-static inline decompress_func_t
-arch_select_decompress_func(void)
-{
-	u32 features = get_cpu_features();
-
-#ifdef DISPATCH_BMI2
-	if (features & X86_CPU_FEATURE_BMI2)
-		return deflate_decompress_bmi2;
-#endif
-	return NULL;
-}
-#endif /* DISPATCH */
diff --git a/ext/libdeflate/lib/x86/matchfinder_impl.h b/ext/libdeflate/lib/x86/matchfinder_impl.h
deleted file mode 100644
index 735bb483..00000000
--- a/ext/libdeflate/lib/x86/matchfinder_impl.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * x86/matchfinder_impl.h - x86 implementations of matchfinder functions
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#ifdef __AVX2__
-#  if MATCHFINDER_ALIGNMENT < 32
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 32
-#  endif
-#  include <immintrin.h>
-static forceinline bool
-matchfinder_init_avx2(mf_pos_t *data, size_t size)
-{
-	__m256i v, *p;
-	size_t n;
-
-	if (size % (sizeof(__m256i) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = _mm256_set1_epi16(MATCHFINDER_INITVAL);
-	p = (__m256i *)data;
-	n = size / (sizeof(__m256i) * 4);
-	do {
-		p[0] = v;
-		p[1] = v;
-		p[2] = v;
-		p[3] = v;
-		p += 4;
-	} while (--n);
-	return true;
-}
-
-static forceinline bool
-matchfinder_rebase_avx2(mf_pos_t *data, size_t size)
-{
-	__m256i v, *p;
-	size_t n;
-
-	if (size % (sizeof(__m256i) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
-	p = (__m256i *)data;
-	n = size / (sizeof(__m256i) * 4);
-	do {
-		/* PADDSW: Add Packed Signed Integers With Signed Saturation  */
-		p[0] = _mm256_adds_epi16(p[0], v);
-		p[1] = _mm256_adds_epi16(p[1], v);
-		p[2] = _mm256_adds_epi16(p[2], v);
-		p[3] = _mm256_adds_epi16(p[3], v);
-		p += 4;
-	} while (--n);
-	return true;
-}
-#endif /* __AVX2__ */
-
-#ifdef __SSE2__
-#  if MATCHFINDER_ALIGNMENT < 16
-#    undef MATCHFINDER_ALIGNMENT
-#    define MATCHFINDER_ALIGNMENT 16
-#  endif
-#  include <emmintrin.h>
-static forceinline bool
-matchfinder_init_sse2(mf_pos_t *data, size_t size)
-{
-	__m128i v, *p;
-	size_t n;
-
-	if (size % (sizeof(__m128i) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = _mm_set1_epi16(MATCHFINDER_INITVAL);
-	p = (__m128i *)data;
-	n = size / (sizeof(__m128i) * 4);
-	do {
-		p[0] = v;
-		p[1] = v;
-		p[2] = v;
-		p[3] = v;
-		p += 4;
-	} while (--n);
-	return true;
-}
-
-static forceinline bool
-matchfinder_rebase_sse2(mf_pos_t *data, size_t size)
-{
-	__m128i v, *p;
-	size_t n;
-
-	if (size % (sizeof(__m128i) * 4) != 0)
-		return false;
-
-	STATIC_ASSERT(sizeof(mf_pos_t) == 2);
-	v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE);
-	p = (__m128i *)data;
-	n = size / (sizeof(__m128i) * 4);
-	do {
-		/* PADDSW: Add Packed Signed Integers With Signed Saturation  */
-		p[0] = _mm_adds_epi16(p[0], v);
-		p[1] = _mm_adds_epi16(p[1], v);
-		p[2] = _mm_adds_epi16(p[2], v);
-		p[3] = _mm_adds_epi16(p[3], v);
-		p += 4;
-	} while (--n);
-	return true;
-}
-#endif /* __SSE2__ */
-
-#undef arch_matchfinder_init
-static forceinline bool
-arch_matchfinder_init(mf_pos_t *data, size_t size)
-{
-#ifdef __AVX2__
-	if (matchfinder_init_avx2(data, size))
-		return true;
-#endif
-#ifdef __SSE2__
-	if (matchfinder_init_sse2(data, size))
-		return true;
-#endif
-	return false;
-}
-
-#undef arch_matchfinder_rebase
-static forceinline bool
-arch_matchfinder_rebase(mf_pos_t *data, size_t size)
-{
-#ifdef __AVX2__
-	if (matchfinder_rebase_avx2(data, size))
-		return true;
-#endif
-#ifdef __SSE2__
-	if (matchfinder_rebase_sse2(data, size))
-		return true;
-#endif
-	return false;
-}
diff --git a/ext/libdeflate/lib/zlib_compress.c b/ext/libdeflate/lib/zlib_compress.c
deleted file mode 100644
index b4cebaf8..00000000
--- a/ext/libdeflate/lib/zlib_compress.c
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * zlib_compress.c - compress with a zlib wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "deflate_compress.h"
-#include "unaligned.h"
-#include "zlib_constants.h"
-
-#include "libdeflate.h"
-
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress(struct libdeflate_compressor *c,
-			 const void *in, size_t in_size,
-			 void *out, size_t out_nbytes_avail)
-{
-	u8 *out_next = out;
-	u16 hdr;
-	unsigned compression_level;
-	unsigned level_hint;
-	size_t deflate_size;
-
-	if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD)
-		return 0;
-
-	/* 2 byte header: CMF and FLG  */
-	hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12);
-	compression_level = deflate_get_compression_level(c);
-	if (compression_level < 2)
-		level_hint = ZLIB_FASTEST_COMPRESSION;
-	else if (compression_level < 6)
-		level_hint = ZLIB_FAST_COMPRESSION;
-	else if (compression_level < 8)
-		level_hint = ZLIB_DEFAULT_COMPRESSION;
-	else
-		level_hint = ZLIB_SLOWEST_COMPRESSION;
-	hdr |= level_hint << 6;
-	hdr |= 31 - (hdr % 31);
-
-	put_unaligned_be16(hdr, out_next);
-	out_next += 2;
-
-	/* Compressed data  */
-	deflate_size = libdeflate_deflate_compress(c, in, in_size, out_next,
-					out_nbytes_avail - ZLIB_MIN_OVERHEAD);
-	if (deflate_size == 0)
-		return 0;
-	out_next += deflate_size;
-
-	/* ADLER32  */
-	put_unaligned_be32(libdeflate_adler32(1, in, in_size), out_next);
-	out_next += 4;
-
-	return out_next - (u8 *)out;
-}
-
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress_bound(struct libdeflate_compressor *c,
-			       size_t in_nbytes)
-{
-	return ZLIB_MIN_OVERHEAD +
-	       libdeflate_deflate_compress_bound(c, in_nbytes);
-}
diff --git a/ext/libdeflate/lib/zlib_constants.h b/ext/libdeflate/lib/zlib_constants.h
deleted file mode 100644
index f304310c..00000000
--- a/ext/libdeflate/lib/zlib_constants.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * zlib_constants.h - constants for the zlib wrapper format
- */
-
-#ifndef LIB_ZLIB_CONSTANTS_H
-#define LIB_ZLIB_CONSTANTS_H
-
-#define ZLIB_MIN_HEADER_SIZE	2
-#define ZLIB_FOOTER_SIZE	4
-#define ZLIB_MIN_OVERHEAD	(ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE)
-
-#define ZLIB_CM_DEFLATE		8
-
-#define ZLIB_CINFO_32K_WINDOW	7
-
-#define ZLIB_FASTEST_COMPRESSION	0
-#define ZLIB_FAST_COMPRESSION		1
-#define ZLIB_DEFAULT_COMPRESSION	2
-#define ZLIB_SLOWEST_COMPRESSION	3
-
-#endif /* LIB_ZLIB_CONSTANTS_H */
diff --git a/ext/libdeflate/lib/zlib_decompress.c b/ext/libdeflate/lib/zlib_decompress.c
deleted file mode 100644
index c5a15cab..00000000
--- a/ext/libdeflate/lib/zlib_decompress.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * zlib_decompress.c - decompress with a zlib wrapper
- *
- * Originally public domain; changes after 2016-09-07 are copyrighted.
- *
- * Copyright 2016 Eric Biggers
- *
- * Permission is hereby granted, free of charge, to any person
- * obtaining a copy of this software and associated documentation
- * files (the "Software"), to deal in the Software without
- * restriction, including without limitation the rights to use,
- * copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following
- * conditions:
- *
- * The above copyright notice and this permission notice shall be
- * included in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
- * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
- * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
- * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
- * OTHER DEALINGS IN THE SOFTWARE.
- */
-
-#include "unaligned.h"
-#include "zlib_constants.h"
-
-#include "libdeflate.h"
-
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_zlib_decompress(struct libdeflate_decompressor *d,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret)
-{
-	const u8 *in_next = in;
-	const u8 * const in_end = in_next + in_nbytes;
-	u16 hdr;
-	size_t actual_out_nbytes;
-	enum libdeflate_result result;
-
-	if (in_nbytes < ZLIB_MIN_OVERHEAD)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* 2 byte header: CMF and FLG  */
-	hdr = get_unaligned_be16(in_next);
-	in_next += 2;
-
-	/* FCHECK */
-	if ((hdr % 31) != 0)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* CM */
-	if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* CINFO */
-	if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* FDICT */
-	if ((hdr >> 5) & 1)
-		return LIBDEFLATE_BAD_DATA;
-
-	/* Compressed data  */
-	result = libdeflate_deflate_decompress(d, in_next,
-					in_end - ZLIB_FOOTER_SIZE - in_next,
-					out, out_nbytes_avail,
-					actual_out_nbytes_ret);
-	if (result != LIBDEFLATE_SUCCESS)
-		return result;
-
-	if (actual_out_nbytes_ret)
-		actual_out_nbytes = *actual_out_nbytes_ret;
-	else
-		actual_out_nbytes = out_nbytes_avail;
-
-	in_next = in_end - ZLIB_FOOTER_SIZE;
-
-	/* ADLER32  */
-	if (libdeflate_adler32(1, out, actual_out_nbytes) !=
-	    get_unaligned_be32(in_next))
-		return LIBDEFLATE_BAD_DATA;
-
-	return LIBDEFLATE_SUCCESS;
-}
diff --git a/ext/libdeflate/libdeflate.h b/ext/libdeflate/libdeflate.h
deleted file mode 100644
index c5600fc0..00000000
--- a/ext/libdeflate/libdeflate.h
+++ /dev/null
@@ -1,323 +0,0 @@
-/*
- * libdeflate.h - public header for libdeflate
- */
-
-#ifndef LIBDEFLATE_H
-#define LIBDEFLATE_H
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define LIBDEFLATE_VERSION_MAJOR	1
-#define LIBDEFLATE_VERSION_MINOR	2
-#define LIBDEFLATE_VERSION_STRING	"1.2"
-
-#include <stddef.h>
-#include <stdint.h>
-
-/*
- * On Windows, if you want to link to the DLL version of libdeflate, then
- * #define LIBDEFLATE_DLL.  Note that the calling convention is cdecl.
- */
-#ifdef LIBDEFLATE_DLL
-#  ifdef BUILDING_LIBDEFLATE
-#    define LIBDEFLATEAPI_SYM_VISIBILITY	LIBEXPORT
-#  elif defined(_WIN32) || defined(__CYGWIN__)
-#    define LIBDEFLATEAPI_SYM_VISIBILITY	__declspec(dllimport)
-#  endif
-#endif
-#ifndef LIBDEFLATEAPI_SYM_VISIBILITY
-#  define LIBDEFLATEAPI_SYM_VISIBILITY
-#endif
-
-#if defined(BUILDING_LIBDEFLATE) && defined(__GNUC__) && \
-	defined(_WIN32) && defined(__i386__)
-    /*
-     * On 32-bit Windows, gcc assumes 16-byte stack alignment but MSVC only 4.
-     * Realign the stack when entering libdeflate to avoid crashing in SSE/AVX
-     * code when called from an MSVC-compiled application.
-     */
-#  define LIBDEFLATEAPI_STACKALIGN	__attribute__((force_align_arg_pointer))
-#endif
-#ifndef LIBDEFLATEAPI_STACKALIGN
-#  define LIBDEFLATEAPI_STACKALIGN
-#endif
-
-#define LIBDEFLATEAPI	LIBDEFLATEAPI_SYM_VISIBILITY LIBDEFLATEAPI_STACKALIGN
-
-/* ========================================================================== */
-/*                             Compression                                    */
-/* ========================================================================== */
-
-struct libdeflate_compressor;
-
-/*
- * libdeflate_alloc_compressor() allocates a new compressor that supports
- * DEFLATE, zlib, and gzip compression.  'compression_level' is the compression
- * level on a zlib-like scale but with a higher maximum value (1 = fastest, 6 =
- * medium/default, 9 = slow, 12 = slowest).  The return value is a pointer to
- * the new compressor, or NULL if out of memory.
- *
- * Note: for compression, the sliding window size is defined at compilation time
- * to 32768, the largest size permissible in the DEFLATE format.  It cannot be
- * changed at runtime.
- *
- * A single compressor is not safe to use by multiple threads concurrently.
- * However, different threads may use different compressors concurrently.
- */
-LIBDEFLATEAPI struct libdeflate_compressor *
-libdeflate_alloc_compressor(int compression_level);
-
-/*
- * libdeflate_deflate_compress() performs raw DEFLATE compression on a buffer of
- * data.  The function attempts to compress 'in_nbytes' bytes of data located at
- * 'in' and write the results to 'out', which has space for 'out_nbytes_avail'
- * bytes.  The return value is the compressed size in bytes, or 0 if the data
- * could not be compressed to 'out_nbytes_avail' bytes or fewer.
- */
-LIBDEFLATEAPI size_t
-libdeflate_deflate_compress(struct libdeflate_compressor *compressor,
-			    const void *in, size_t in_nbytes,
-			    void *out, size_t out_nbytes_avail);
-
-/*
- * libdeflate_deflate_compress_bound() returns a worst-case upper bound on the
- * number of bytes of compressed data that may be produced by compressing any
- * buffer of length less than or equal to 'in_nbytes' using
- * libdeflate_deflate_compress() with the specified compressor.  Mathematically,
- * this bound will necessarily be a number greater than or equal to 'in_nbytes'.
- * It may be an overestimate of the true upper bound.  The return value is
- * guaranteed to be the same for all invocations with the same compressor and
- * same 'in_nbytes'.
- *
- * As a special case, 'compressor' may be NULL.  This causes the bound to be
- * taken across *any* libdeflate_compressor that could ever be allocated with
- * this build of the library, with any options.
- *
- * Note that this function is not necessary in many applications.  With
- * block-based compression, it is usually preferable to separately store the
- * uncompressed size of each block and to store any blocks that did not compress
- * to less than their original size uncompressed.  In that scenario, there is no
- * need to know the worst-case compressed size, since the maximum number of
- * bytes of compressed data that may be used would always be one less than the
- * input length.  You can just pass a buffer of that size to
- * libdeflate_deflate_compress() and store the data uncompressed if
- * libdeflate_deflate_compress() returns 0, indicating that the compressed data
- * did not fit into the provided output buffer.
- */
-LIBDEFLATEAPI size_t
-libdeflate_deflate_compress_bound(struct libdeflate_compressor *compressor,
-				  size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the zlib wrapper
- * format.
- */
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress(struct libdeflate_compressor *compressor,
-			 const void *in, size_t in_nbytes,
-			 void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_zlib_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEAPI size_t
-libdeflate_zlib_compress_bound(struct libdeflate_compressor *compressor,
-			       size_t in_nbytes);
-
-/*
- * Like libdeflate_deflate_compress(), but stores the data in the gzip wrapper
- * format.
- */
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress(struct libdeflate_compressor *compressor,
-			 const void *in, size_t in_nbytes,
-			 void *out, size_t out_nbytes_avail);
-
-/*
- * Like libdeflate_deflate_compress_bound(), but assumes the data will be
- * compressed with libdeflate_gzip_compress() rather than with
- * libdeflate_deflate_compress().
- */
-LIBDEFLATEAPI size_t
-libdeflate_gzip_compress_bound(struct libdeflate_compressor *compressor,
-			       size_t in_nbytes);
-
-/*
- * libdeflate_free_compressor() frees a compressor that was allocated with
- * libdeflate_alloc_compressor().  If a NULL pointer is passed in, no action is
- * taken.
- */
-LIBDEFLATEAPI void
-libdeflate_free_compressor(struct libdeflate_compressor *compressor);
-
-/* ========================================================================== */
-/*                             Decompression                                  */
-/* ========================================================================== */
-
-struct libdeflate_decompressor;
-
-/*
- * libdeflate_alloc_decompressor() allocates a new decompressor that can be used
- * for DEFLATE, zlib, and gzip decompression.  The return value is a pointer to
- * the new decompressor, or NULL if out of memory.
- *
- * This function takes no parameters, and the returned decompressor is valid for
- * decompressing data that was compressed at any compression level and with any
- * sliding window size.
- *
- * A single decompressor is not safe to use by multiple threads concurrently.
- * However, different threads may use different decompressors concurrently.
- */
-LIBDEFLATEAPI struct libdeflate_decompressor *
-libdeflate_alloc_decompressor(void);
-
-/*
- * Result of a call to libdeflate_deflate_decompress(),
- * libdeflate_zlib_decompress(), or libdeflate_gzip_decompress().
- */
-enum libdeflate_result {
-	/* Decompression was successful.  */
-	LIBDEFLATE_SUCCESS = 0,
-
-	/* Decompressed failed because the compressed data was invalid, corrupt,
-	 * or otherwise unsupported.  */
-	LIBDEFLATE_BAD_DATA = 1,
-
-	/* A NULL 'actual_out_nbytes_ret' was provided, but the data would have
-	 * decompressed to fewer than 'out_nbytes_avail' bytes.  */
-	LIBDEFLATE_SHORT_OUTPUT = 2,
-
-	/* The data would have decompressed to more than 'out_nbytes_avail'
-	 * bytes.  */
-	LIBDEFLATE_INSUFFICIENT_SPACE = 3,
-};
-
-/*
- * libdeflate_deflate_decompress() decompresses the DEFLATE-compressed stream
- * from the buffer 'in' with compressed size up to 'in_nbytes' bytes.  The
- * uncompressed data is written to 'out', a buffer with size 'out_nbytes_avail'
- * bytes.  If decompression succeeds, then 0 (LIBDEFLATE_SUCCESS) is returned.
- * Otherwise, a nonzero result code such as LIBDEFLATE_BAD_DATA is returned.  If
- * a nonzero result code is returned, then the contents of the output buffer are
- * undefined.
- *
- * Decompression stops at the end of the DEFLATE stream (as indicated by the
- * BFINAL flag), even if it is actually shorter than 'in_nbytes' bytes.
- *
- * libdeflate_deflate_decompress() can be used in cases where the actual
- * uncompressed size is known (recommended) or unknown (not recommended):
- *
- *   - If the actual uncompressed size is known, then pass the actual
- *     uncompressed size as 'out_nbytes_avail' and pass NULL for
- *     'actual_out_nbytes_ret'.  This makes libdeflate_deflate_decompress() fail
- *     with LIBDEFLATE_SHORT_OUTPUT if the data decompressed to fewer than the
- *     specified number of bytes.
- *
- *   - If the actual uncompressed size is unknown, then provide a non-NULL
- *     'actual_out_nbytes_ret' and provide a buffer with some size
- *     'out_nbytes_avail' that you think is large enough to hold all the
- *     uncompressed data.  In this case, if the data decompresses to less than
- *     or equal to 'out_nbytes_avail' bytes, then
- *     libdeflate_deflate_decompress() will write the actual uncompressed size
- *     to *actual_out_nbytes_ret and return 0 (LIBDEFLATE_SUCCESS).  Otherwise,
- *     it will return LIBDEFLATE_INSUFFICIENT_SPACE if the provided buffer was
- *     not large enough but no other problems were encountered, or another
- *     nonzero result code if decompression failed for another reason.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_deflate_decompress(struct libdeflate_decompressor *decompressor,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but adds the 'actual_in_nbytes_ret'
- * argument.  If decompression succeeds and 'actual_in_nbytes_ret' is not NULL,
- * then the actual compressed size of the DEFLATE stream (aligned to the next
- * byte boundary) is written to *actual_in_nbytes_ret.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *decompressor,
-				 const void *in, size_t in_nbytes,
-				 void *out, size_t out_nbytes_avail,
-				 size_t *actual_in_nbytes_ret,
-				 size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but assumes the zlib wrapper format
- * instead of raw DEFLATE.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_zlib_decompress(struct libdeflate_decompressor *decompressor,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_deflate_decompress(), but assumes the gzip wrapper format
- * instead of raw DEFLATE.
- *
- * If multiple gzip-compressed members are concatenated, then only the first
- * will be decompressed.  Use libdeflate_gzip_decompress_ex() if you need
- * multi-member support.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_gzip_decompress(struct libdeflate_decompressor *decompressor,
-			   const void *in, size_t in_nbytes,
-			   void *out, size_t out_nbytes_avail,
-			   size_t *actual_out_nbytes_ret);
-
-/*
- * Like libdeflate_gzip_decompress(), but adds the 'actual_in_nbytes_ret'
- * argument.  If 'actual_in_nbytes_ret' is not NULL and the decompression
- * succeeds (indicating that the first gzip-compressed member in the input
- * buffer was decompressed), then the actual number of input bytes consumed is
- * written to *actual_in_nbytes_ret.
- */
-LIBDEFLATEAPI enum libdeflate_result
-libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *decompressor,
-			      const void *in, size_t in_nbytes,
-			      void *out, size_t out_nbytes_avail,
-			      size_t *actual_in_nbytes_ret,
-			      size_t *actual_out_nbytes_ret);
-
-/*
- * libdeflate_free_decompressor() frees a decompressor that was allocated with
- * libdeflate_alloc_decompressor().  If a NULL pointer is passed in, no action
- * is taken.
- */
-LIBDEFLATEAPI void
-libdeflate_free_decompressor(struct libdeflate_decompressor *decompressor);
-
-/* ========================================================================== */
-/*                                Checksums                                   */
-/* ========================================================================== */
-
-/*
- * libdeflate_adler32() updates a running Adler-32 checksum with 'len' bytes of
- * data and returns the updated checksum.  When starting a new checksum, the
- * required initial value for 'adler' is 1.  This value is also returned when
- * 'buffer' is specified as NULL.
- */
-LIBDEFLATEAPI uint32_t
-libdeflate_adler32(uint32_t adler32, const void *buffer, size_t len);
-
-
-/*
- * libdeflate_crc32() updates a running CRC-32 checksum with 'len' bytes of data
- * and returns the updated checksum.  When starting a new checksum, the required
- * initial value for 'crc' is 0.  This value is also returned when 'buffer' is
- * specified as NULL.
- */
-LIBDEFLATEAPI uint32_t
-libdeflate_crc32(uint32_t crc, const void *buffer, size_t len);
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* LIBDEFLATE_H */