mirror of
https://github.com/PSI-Rockin/DobieStation.git
synced 2024-05-20 13:08:11 -04:00
a491dce51a
This changes up the build system quite a bit.
125 lines
3.8 KiB
C
125 lines
3.8 KiB
C
/*
|
|
* adler32_vec_template.h - template for vectorized Adler-32 implementations
|
|
*
|
|
* Copyright 2016 Eric Biggers
|
|
*
|
|
* Permission is hereby granted, free of charge, to any person
|
|
* obtaining a copy of this software and associated documentation
|
|
* files (the "Software"), to deal in the Software without
|
|
* restriction, including without limitation the rights to use,
|
|
* copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
* copies of the Software, and to permit persons to whom the
|
|
* Software is furnished to do so, subject to the following
|
|
* conditions:
|
|
*
|
|
* The above copyright notice and this permission notice shall be
|
|
* included in all copies or substantial portions of the Software.
|
|
*
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
|
|
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
|
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
|
|
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
|
|
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
|
|
* OTHER DEALINGS IN THE SOFTWARE.
|
|
*/
|
|
|
|
/*
|
|
* This file contains a template for vectorized Adler-32 implementations.
|
|
*
|
|
* The inner loop between reductions modulo 65521 of an unvectorized Adler-32
|
|
* implementation looks something like this:
|
|
*
|
|
* do {
|
|
* s1 += *p;
|
|
* s2 += s1;
|
|
* } while (++p != chunk_end);
|
|
*
|
|
* For vectorized calculation of s1, we only need to sum the input bytes. They
|
|
* can be accumulated into multiple counters which are eventually summed
|
|
* together.
|
|
*
|
|
* For vectorized calculation of s2, the basic idea is that for each iteration
|
|
* that processes N bytes, we can perform the following vectorizable
|
|
* calculation:
|
|
*
|
|
* s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N
|
|
*
|
|
* Or, equivalently, we can sum the byte_1...byte_N for each iteration into N
|
|
* separate counters, then do the multiplications by N...1 just once at the end
|
|
* rather than once per iteration.
|
|
*
|
|
* Also, we must account for how previous bytes will affect s2 by doing the
|
|
* following at beginning of each iteration:
|
|
*
|
|
* s2 += s1 * N
|
|
*
|
|
* Furthermore, like s1, "s2" can actually be multiple counters which are
|
|
* eventually summed together.
|
|
*/
|
|
|
|
static u32 ATTRIBUTES
|
|
FUNCNAME(u32 adler, const u8 *p, size_t size)
|
|
{
|
|
u32 s1 = adler & 0xFFFF;
|
|
u32 s2 = adler >> 16;
|
|
const u8 * const end = p + size;
|
|
const u8 *vend;
|
|
const size_t max_chunk_size =
|
|
MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) -
|
|
(MIN(MAX_CHUNK_SIZE, IMPL_MAX_CHUNK_SIZE) %
|
|
IMPL_SEGMENT_SIZE);
|
|
|
|
/* Process a byte at a time until the needed alignment is reached */
|
|
if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) {
|
|
do {
|
|
s1 += *p++;
|
|
s2 += s1;
|
|
} while (p != end && (uintptr_t)p % IMPL_ALIGNMENT);
|
|
s1 %= DIVISOR;
|
|
s2 %= DIVISOR;
|
|
}
|
|
|
|
/*
|
|
* Process "chunks" of bytes using vector instructions. Chunk sizes are
|
|
* limited to MAX_CHUNK_SIZE, which guarantees that s1 and s2 never
|
|
* overflow before being reduced modulo DIVISOR. For vector processing,
|
|
* chunk sizes are also made evenly divisible by IMPL_SEGMENT_SIZE and
|
|
* may be further limited to IMPL_MAX_CHUNK_SIZE.
|
|
*/
|
|
STATIC_ASSERT(IMPL_SEGMENT_SIZE % IMPL_ALIGNMENT == 0);
|
|
vend = end - ((size_t)(end - p) % IMPL_SEGMENT_SIZE);
|
|
while (p != vend) {
|
|
size_t chunk_size = MIN((size_t)(vend - p), max_chunk_size);
|
|
|
|
s2 += s1 * chunk_size;
|
|
|
|
FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_size),
|
|
&s1, &s2);
|
|
|
|
p += chunk_size;
|
|
s1 %= DIVISOR;
|
|
s2 %= DIVISOR;
|
|
}
|
|
|
|
/* Process any remaining bytes */
|
|
if (p != end) {
|
|
do {
|
|
s1 += *p++;
|
|
s2 += s1;
|
|
} while (p != end);
|
|
s1 %= DIVISOR;
|
|
s2 %= DIVISOR;
|
|
}
|
|
|
|
return (s2 << 16) | s1;
|
|
}
|
|
|
|
#undef FUNCNAME
|
|
#undef FUNCNAME_CHUNK
|
|
#undef ATTRIBUTES
|
|
#undef IMPL_ALIGNMENT
|
|
#undef IMPL_SEGMENT_SIZE
|
|
#undef IMPL_MAX_CHUNK_SIZE
|