DobieStation/ext/libdeflate/lib/adler32_vec_template.h

/*
 * adler32_vec_template.h - template for vectorized Adler-32 implementations
 *
 * Copyright 2016 Eric Biggers
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*
 * This file contains a template for vectorized Adler-32 implementations.
 *
 * The inner loop between reductions modulo 65521 of an unvectorized Adler-32
 * implementation looks something like this:
 *
 *	do {
 *		s1 += *p;
 *		s2 += s1;
 *	} while (++p != chunk_end);
 *
 * For vectorized calculation of s1, we only need to sum the input bytes.  They
 * can be accumulated into multiple counters which are eventually summed
 * together.
 *
 * For vectorized calculation of s2, the basic idea is that for each iteration
 * that processes N bytes, we can perform the following vectorizable
 * calculation:
 *
 *	s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
 *
 * Or, equivalently, we can sum byte_1...byte_N for each iteration into N
 * separate counters, then do the multiplications by N...1 just once at the end
 * rather than once per iteration.
 *
 * Also, we must account for how previous bytes will affect s2 by doing the
 * following at the beginning of each iteration:
 *
 *	s2 += s1 * N
 *
 * Furthermore, like s1, "s2" can actually be multiple counters which are
 * eventually summed together.
 */
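/*
 * For concreteness, here is a simplified scalar model of what a FUNCNAME_CHUNK
 * implementation has to compute, using N = 4.  This is an illustrative sketch
 * only: example_chunk_n4 is a hypothetical name, not a function in this
 * repository, and real instantiations do the same bookkeeping with vector
 * registers.  Note that the template below adds s1 * chunk_size to s2 *before*
 * calling FUNCNAME_CHUNK, so the chunk function only adds the byte-dependent
 * terms.
 */
#if 0	/* illustration only; never compiled */
static void
example_chunk_n4(const u8 *p, const u8 *end, u32 *s1, u32 *s2)
{
	/* Assumes end - p is a nonzero multiple of 4 */
	u32 c1 = 0, c2 = 0, c3 = 0, c4 = 0;	/* per-position byte sums */
	u32 s2_acc = 0;		/* sum over iterations of "bytes seen so far" */
	u32 byte_sum = 0;	/* all bytes seen so far in this chunk */

	do {
		/*
		 * Bytes from earlier iterations are counted once more by
		 * every later iteration; accumulating the running byte sum
		 * here is what defers the multiplications to the end.
		 */
		s2_acc += byte_sum;
		c1 += p[0];
		c2 += p[1];
		c3 += p[2];
		c4 += p[3];
		byte_sum += p[0] + p[1] + p[2] + p[3];
	} while ((p += 4) != end);

	/* Do the multiplications by N...1 just once, at the end */
	*s2 += 4 * s2_acc + 4*c1 + 3*c2 + 2*c3 + 1*c4;
	*s1 += byte_sum;
}
#endif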
static u32 ATTRIBUTES
FUNCNAME(u32 adler, const u8 *p, size_t size)
{
	u32 s1 = adler & 0xFFFF;
	u32 s2 = adler >> 16;
	const u8 * const end = p + size;
	const u8 *vend;
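	/*
	 * The largest chunk size actually used: MIN(MAX_CHUNK_SIZE,
	 * IMPL_MAX_CHUNK_SIZE), rounded down to a whole number of
	 * IMPL_SEGMENT_SIZE-byte segments.
	 */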
	const size_t max_chunk_size =
		MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
		(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
		 IMPL_SEGMENT_SIZE);

	/* Process a byte at a time until the needed alignment is reached */
	if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/*
	 * Process "chunks" of bytes using vector instructions.  Chunk sizes are
	 * limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
	 * overflow before being reduced modulo DIVISOR.  For vector processing,
	 * chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
	 * may be further limited to IMPL_MAX_CHUNK_SIZE.
	 */
	STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
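	/* vend: [p, vend) is a whole number of IMPL_SEGMENT_SIZE-byte segments */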
	vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
	while (p != vend) {
		size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
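		/*
		 * Account for the bytes already summed into s1: each of the
		 * chunk_size bytes that follow counts the current s1 once
		 * more (the "s2 += s1 * N" step described above).
		 */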
		s2 += s1 * chunk_size;

		FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
			       &s1, &s2);

		p += chunk_size;
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	/* Process any remaining bytes */
	if (p != end) {
		do {
			s1 += *p++;
			s2 += s1;
		} while (p != end);
		s1 %= DIVISOR;
		s2 %= DIVISOR;
	}

	return (s2 << 16) | s1;
}
#undef FUNCNAME
#undef FUNCNAME_CHUNK
#undef ATTRIBUTES
#undef IMPL_ALIGNMENT
#undef IMPL_SEGMENT_SIZE
#undef IMPL_MAX_CHUNK_SIZE
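
/*
 * An including file is expected to have defined FUNCNAME, FUNCNAME_CHUNK,
 * ATTRIBUTES, IMPL_ALIGNMENT, IMPL_SEGMENT_SIZE, and IMPL_MAX_CHUNK_SIZE
 * before including this template (hence the #undefs above, which allow the
 * template to be instantiated more than once).  A hypothetical instantiation
 * might look like the following sketch; the names and values here are
 * illustrative, not taken from this repository:
 *
 *	#define FUNCNAME		adler32_example_simd
 *	#define FUNCNAME_CHUNK		adler32_example_simd_chunk
 *	#define ATTRIBUTES
 *	#define IMPL_ALIGNMENT		16
 *	#define IMPL_SEGMENT_SIZE	32
 *	#define IMPL_MAX_CHUNK_SIZE	4096
 *	#include "adler32_vec_template.h"
 */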