Update 7z to 23.01

This commit is contained in:
Megamouse 2023-12-13 20:36:41 +01:00
parent 98b7642232
commit 00a80adfae
100 changed files with 11232 additions and 3906 deletions

View file

@ -1,5 +1,5 @@
/* LzFind.c -- Match finder for LZ algorithms
2021-11-29 : Igor Pavlov : Public domain */
2023-03-14 : Igor Pavlov : Public domain */
#include "Precomp.h"
@ -17,7 +17,7 @@
#define kEmptyHashValue 0
#define kMaxValForNormalize ((UInt32)0)
// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xFFF) // for debug
// #define kMaxValForNormalize ((UInt32)(1 << 20) + 0xfff) // for debug
// #define kNormalizeAlign (1 << 7) // alignment for speculated accesses
@ -67,10 +67,10 @@
static void LzInWindow_Free(CMatchFinder *p, ISzAllocPtr alloc)
{
if (!p->directInput)
// if (!p->directInput)
{
ISzAlloc_Free(alloc, p->bufferBase);
p->bufferBase = NULL;
ISzAlloc_Free(alloc, p->bufBase);
p->bufBase = NULL;
}
}
@ -79,7 +79,7 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all
{
if (blockSize == 0)
return 0;
if (!p->bufferBase || p->blockSize != blockSize)
if (!p->bufBase || p->blockSize != blockSize)
{
// size_t blockSizeT;
LzInWindow_Free(p, alloc);
@ -101,11 +101,11 @@ static int LzInWindow_Create2(CMatchFinder *p, UInt32 blockSize, ISzAllocPtr all
#endif
*/
p->bufferBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
// printf("\nbufferBase = %p\n", p->bufferBase);
p->bufBase = (Byte *)ISzAlloc_Alloc(alloc, blockSize);
// printf("\nbufferBase = %p\n", p->bufBase);
// return 0; // for debug
}
return (p->bufferBase != NULL);
return (p->bufBase != NULL);
}
static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return p->buffer; }
@ -113,7 +113,7 @@ static const Byte *MatchFinder_GetPointerToCurrentPos(CMatchFinder *p) { return
static UInt32 MatchFinder_GetNumAvailableBytes(CMatchFinder *p) { return GET_AVAIL_BYTES(p); }
MY_NO_INLINE
Z7_NO_INLINE
static void MatchFinder_ReadBlock(CMatchFinder *p)
{
if (p->streamEndWasReached || p->result != SZ_OK)
@ -127,8 +127,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p)
UInt32 curSize = 0xFFFFFFFF - GET_AVAIL_BYTES(p);
if (curSize > p->directInputRem)
curSize = (UInt32)p->directInputRem;
p->directInputRem -= curSize;
p->streamPos += curSize;
p->directInputRem -= curSize;
if (p->directInputRem == 0)
p->streamEndWasReached = 1;
return;
@ -136,8 +136,8 @@ static void MatchFinder_ReadBlock(CMatchFinder *p)
for (;;)
{
Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
size_t size = (size_t)(p->bufferBase + p->blockSize - dest);
const Byte *dest = p->buffer + GET_AVAIL_BYTES(p);
size_t size = (size_t)(p->bufBase + p->blockSize - dest);
if (size == 0)
{
/* we call ReadBlock() after NeedMove() and MoveBlock().
@ -153,7 +153,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p)
// #define kRead 3
// if (size > kRead) size = kRead; // for debug
p->result = ISeqInStream_Read(p->stream, dest, &size);
/*
// we need cast (Byte *)dest.
#ifdef __clang__
#pragma GCC diagnostic ignored "-Wcast-qual"
#endif
*/
p->result = ISeqInStream_Read(p->stream,
p->bufBase + (dest - p->bufBase), &size);
if (p->result != SZ_OK)
return;
if (size == 0)
@ -173,14 +180,14 @@ static void MatchFinder_ReadBlock(CMatchFinder *p)
MY_NO_INLINE
Z7_NO_INLINE
void MatchFinder_MoveBlock(CMatchFinder *p)
{
const size_t offset = (size_t)(p->buffer - p->bufferBase) - p->keepSizeBefore;
const size_t offset = (size_t)(p->buffer - p->bufBase) - p->keepSizeBefore;
const size_t keepBefore = (offset & (kBlockMoveAlign - 1)) + p->keepSizeBefore;
p->buffer = p->bufferBase + keepBefore;
memmove(p->bufferBase,
p->bufferBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
p->buffer = p->bufBase + keepBefore;
memmove(p->bufBase,
p->bufBase + (offset & ~((size_t)kBlockMoveAlign - 1)),
keepBefore + (size_t)GET_AVAIL_BYTES(p));
}
@ -198,7 +205,7 @@ int MatchFinder_NeedMove(CMatchFinder *p)
return 0;
if (p->streamEndWasReached || p->result != SZ_OK)
return 0;
return ((size_t)(p->bufferBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
return ((size_t)(p->bufBase + p->blockSize - p->buffer) <= p->keepSizeAfter);
}
void MatchFinder_ReadIfRequired(CMatchFinder *p)
@ -214,6 +221,8 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
p->cutValue = 32;
p->btMode = 1;
p->numHashBytes = 4;
p->numHashBytes_Min = 2;
p->numHashOutBits = 0;
p->bigHash = 0;
}
@ -222,8 +231,10 @@ static void MatchFinder_SetDefaultSettings(CMatchFinder *p)
void MatchFinder_Construct(CMatchFinder *p)
{
unsigned i;
p->bufferBase = NULL;
p->buffer = NULL;
p->bufBase = NULL;
p->directInput = 0;
p->stream = NULL;
p->hash = NULL;
p->expectedDataSize = (UInt64)(Int64)-1;
MatchFinder_SetDefaultSettings(p);
@ -238,6 +249,8 @@ void MatchFinder_Construct(CMatchFinder *p)
}
}
#undef kCrcPoly
static void MatchFinder_FreeThisClassMemory(CMatchFinder *p, ISzAllocPtr alloc)
{
ISzAlloc_Free(alloc, p->hash);
@ -252,7 +265,7 @@ void MatchFinder_Free(CMatchFinder *p, ISzAllocPtr alloc)
static CLzRef* AllocRefs(size_t num, ISzAllocPtr alloc)
{
size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
const size_t sizeInBytes = (size_t)num * sizeof(CLzRef);
if (sizeInBytes / sizeof(CLzRef) != num)
return NULL;
return (CLzRef *)ISzAlloc_Alloc(alloc, sizeInBytes);
@ -298,6 +311,62 @@ static UInt32 GetBlockSize(CMatchFinder *p, UInt32 historySize)
}
// input is historySize
static UInt32 MatchFinder_GetHashMask2(CMatchFinder *p, UInt32 hs)
{
if (p->numHashBytes == 2)
return (1 << 16) - 1;
if (hs != 0)
hs--;
hs |= (hs >> 1);
hs |= (hs >> 2);
hs |= (hs >> 4);
hs |= (hs >> 8);
// we propagated 16 bits in (hs). Low 16 bits must be set later
if (hs >= (1 << 24))
{
if (p->numHashBytes == 3)
hs = (1 << 24) - 1;
/* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
}
// (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
hs |= (1 << 16) - 1; /* don't change it! */
// bt5: we adjust the size with recommended minimum size
if (p->numHashBytes >= 5)
hs |= (256 << kLzHash_CrcShift_2) - 1;
return hs;
}
// input is historySize
static UInt32 MatchFinder_GetHashMask(CMatchFinder *p, UInt32 hs)
{
if (p->numHashBytes == 2)
return (1 << 16) - 1;
if (hs != 0)
hs--;
hs |= (hs >> 1);
hs |= (hs >> 2);
hs |= (hs >> 4);
hs |= (hs >> 8);
// we propagated 16 bits in (hs). Low 16 bits must be set later
hs >>= 1;
if (hs >= (1 << 24))
{
if (p->numHashBytes == 3)
hs = (1 << 24) - 1;
else
hs >>= 1;
/* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
}
// (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
hs |= (1 << 16) - 1; /* don't change it! */
// bt5: we adjust the size with recommended minimum size
if (p->numHashBytes >= 5)
hs |= (256 << kLzHash_CrcShift_2) - 1;
return hs;
}
int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
UInt32 keepAddBufferBefore, UInt32 matchMaxLen, UInt32 keepAddBufferAfter,
ISzAllocPtr alloc)
@ -318,78 +387,91 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
p->blockSize = 0;
if (p->directInput || LzInWindow_Create2(p, GetBlockSize(p, historySize), alloc))
{
const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
UInt32 hs;
p->matchMaxLen = matchMaxLen;
size_t hashSizeSum;
{
// UInt32 hs4;
p->fixedHashSize = 0;
hs = (1 << 16) - 1;
if (p->numHashBytes != 2)
UInt32 hs;
UInt32 hsCur;
if (p->numHashOutBits != 0)
{
hs = historySize;
if (hs > p->expectedDataSize)
hs = (UInt32)p->expectedDataSize;
if (hs != 0)
hs--;
hs |= (hs >> 1);
hs |= (hs >> 2);
hs |= (hs >> 4);
hs |= (hs >> 8);
// we propagated 16 bits in (hs). Low 16 bits must be set later
hs >>= 1;
if (hs >= (1 << 24))
{
if (p->numHashBytes == 3)
hs = (1 << 24) - 1;
else
hs >>= 1;
/* if (bigHash) mode, GetHeads4b() in LzFindMt.c needs (hs >= ((1 << 24) - 1))) */
}
// hs = ((UInt32)1 << 25) - 1; // for test
unsigned numBits = p->numHashOutBits;
const unsigned nbMax =
(p->numHashBytes == 2 ? 16 :
(p->numHashBytes == 3 ? 24 : 32));
if (numBits > nbMax)
numBits = nbMax;
if (numBits >= 32)
hs = (UInt32)0 - 1;
else
hs = ((UInt32)1 << numBits) - 1;
// (hash_size >= (1 << 16)) : Required for (numHashBytes > 2)
hs |= (1 << 16) - 1; /* don't change it! */
// bt5: we adjust the size with recommended minimum size
if (p->numHashBytes >= 5)
hs |= (256 << kLzHash_CrcShift_2) - 1;
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, historySize);
if (hs > hs2)
hs = hs2;
}
hsCur = hs;
if (p->expectedDataSize < historySize)
{
const UInt32 hs2 = MatchFinder_GetHashMask2(p, (UInt32)p->expectedDataSize);
if (hsCur > hs2)
hsCur = hs2;
}
}
else
{
hs = MatchFinder_GetHashMask(p, historySize);
hsCur = hs;
if (p->expectedDataSize < historySize)
{
hsCur = MatchFinder_GetHashMask(p, (UInt32)p->expectedDataSize);
if (hsCur > hs) // is it possible?
hsCur = hs;
}
}
p->hashMask = hs;
hs++;
/*
hs4 = (1 << 20);
if (hs4 > hs)
hs4 = hs;
// hs4 = (1 << 16); // for test
p->hash4Mask = hs4 - 1;
*/
p->hashMask = hsCur;
if (p->numHashBytes > 2) p->fixedHashSize += kHash2Size;
if (p->numHashBytes > 3) p->fixedHashSize += kHash3Size;
// if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
hs += p->fixedHashSize;
hashSizeSum = hs;
hashSizeSum++;
if (hashSizeSum < hs)
return 0;
{
UInt32 fixedHashSize = 0;
if (p->numHashBytes > 2 && p->numHashBytes_Min <= 2) fixedHashSize += kHash2Size;
if (p->numHashBytes > 3 && p->numHashBytes_Min <= 3) fixedHashSize += kHash3Size;
// if (p->numHashBytes > 4) p->fixedHashSize += hs4; // kHash4Size;
hashSizeSum += fixedHashSize;
p->fixedHashSize = fixedHashSize;
}
}
p->matchMaxLen = matchMaxLen;
{
size_t newSize;
size_t numSons;
const UInt32 newCyclicBufferSize = historySize + 1; // do not change it
p->historySize = historySize;
p->hashSizeSum = hs;
p->cyclicBufferSize = newCyclicBufferSize; // it must be = (historySize + 1)
numSons = newCyclicBufferSize;
if (p->btMode)
numSons <<= 1;
newSize = hs + numSons;
newSize = hashSizeSum + numSons;
if (numSons < newCyclicBufferSize || newSize < numSons)
return 0;
// aligned size is not required here, but it can be better for some loops
#define NUM_REFS_ALIGN_MASK 0xF
newSize = (newSize + NUM_REFS_ALIGN_MASK) & ~(size_t)NUM_REFS_ALIGN_MASK;
if (p->hash && p->numRefs == newSize)
// 22.02: we don't reallocate buffer, if old size is enough
if (p->hash && p->numRefs >= newSize)
return 1;
MatchFinder_FreeThisClassMemory(p, alloc);
@ -398,7 +480,7 @@ int MatchFinder_Create(CMatchFinder *p, UInt32 historySize,
if (p->hash)
{
p->son = p->hash + p->hashSizeSum;
p->son = p->hash + hashSizeSum;
return 1;
}
}
@ -470,7 +552,8 @@ void MatchFinder_Init_HighHash(CMatchFinder *p)
void MatchFinder_Init_4(CMatchFinder *p)
{
p->buffer = p->bufferBase;
if (!p->directInput)
p->buffer = p->bufBase;
{
/* kEmptyHashValue = 0 (Zero) is used in hash tables as NO-VALUE marker.
the code in CMatchFinderMt expects (pos = 1) */
@ -507,20 +590,20 @@ void MatchFinder_Init(CMatchFinder *p)
#ifdef MY_CPU_X86_OR_AMD64
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 8) \
|| defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
#define USE_SATUR_SUB_128
#define USE_AVX2
#define ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
#define ATTRIB_AVX2 __attribute__((__target__("avx2")))
#if defined(__clang__) && (__clang_major__ >= 4) \
|| defined(Z7_GCC_VERSION) && (Z7_GCC_VERSION >= 40701)
// || defined(__INTEL_COMPILER) && (__INTEL_COMPILER >= 1900)
#define USE_LZFIND_SATUR_SUB_128
#define USE_LZFIND_SATUR_SUB_256
#define LZFIND_ATTRIB_SSE41 __attribute__((__target__("sse4.1")))
#define LZFIND_ATTRIB_AVX2 __attribute__((__target__("avx2")))
#elif defined(_MSC_VER)
#if (_MSC_VER >= 1600)
#define USE_SATUR_SUB_128
#if (_MSC_VER >= 1900)
#define USE_AVX2
#include <immintrin.h> // avx
#endif
#define USE_LZFIND_SATUR_SUB_128
#endif
#if (_MSC_VER >= 1900)
#define USE_LZFIND_SATUR_SUB_256
#endif
#endif
@ -529,16 +612,16 @@ void MatchFinder_Init(CMatchFinder *p)
#if defined(__clang__) && (__clang_major__ >= 8) \
|| defined(__GNUC__) && (__GNUC__ >= 8)
#define USE_SATUR_SUB_128
#define USE_LZFIND_SATUR_SUB_128
#ifdef MY_CPU_ARM64
// #define ATTRIB_SSE41 __attribute__((__target__("")))
// #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("")))
#else
// #define ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
// #define LZFIND_ATTRIB_SSE41 __attribute__((__target__("fpu=crypto-neon-fp-armv8")))
#endif
#elif defined(_MSC_VER)
#if (_MSC_VER >= 1910)
#define USE_SATUR_SUB_128
#define USE_LZFIND_SATUR_SUB_128
#endif
#endif
@ -550,121 +633,130 @@ void MatchFinder_Init(CMatchFinder *p)
#endif
/*
#ifndef ATTRIB_SSE41
#define ATTRIB_SSE41
#endif
#ifndef ATTRIB_AVX2
#define ATTRIB_AVX2
#endif
*/
#ifdef USE_SATUR_SUB_128
#ifdef USE_LZFIND_SATUR_SUB_128
// #define _SHOW_HW_STATUS
// #define Z7_SHOW_HW_STATUS
#ifdef _SHOW_HW_STATUS
#ifdef Z7_SHOW_HW_STATUS
#include <stdio.h>
#define _PRF(x) x
_PRF(;)
#define PRF(x) x
PRF(;)
#else
#define _PRF(x)
#define PRF(x)
#endif
#ifdef MY_CPU_ARM_OR_ARM64
#ifdef MY_CPU_ARM64
// #define FORCE_SATUR_SUB_128
// #define FORCE_LZFIND_SATUR_SUB_128
#endif
typedef uint32x4_t LzFind_v128;
#define SASUB_128_V(v, s) \
vsubq_u32(vmaxq_u32(v, s), s)
typedef uint32x4_t v128;
#define SASUB_128(i) \
*(v128 *)(void *)(items + (i) * 4) = \
vsubq_u32(vmaxq_u32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2);
#else
#else // MY_CPU_ARM_OR_ARM64
#include <smmintrin.h> // sse4.1
typedef __m128i v128;
typedef __m128i LzFind_v128;
// SSE 4.1
#define SASUB_128_V(v, s) \
_mm_sub_epi32(_mm_max_epu32(v, s), s)
#endif // MY_CPU_ARM_OR_ARM64
#define SASUB_128(i) \
*(v128 *)(void *)(items + (i) * 4) = \
_mm_sub_epi32(_mm_max_epu32(*(const v128 *)(const void *)(items + (i) * 4), sub2), sub2); // SSE 4.1
#endif
*( LzFind_v128 *)( void *)(items + (i) * 4) = SASUB_128_V( \
*(const LzFind_v128 *)(const void *)(items + (i) * 4), sub2);
MY_NO_INLINE
Z7_NO_INLINE
static
#ifdef ATTRIB_SSE41
ATTRIB_SSE41
#ifdef LZFIND_ATTRIB_SSE41
LZFIND_ATTRIB_SSE41
#endif
void
MY_FAST_CALL
Z7_FASTCALL
LzFind_SaturSub_128(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
v128 sub2 =
const LzFind_v128 sub2 =
#ifdef MY_CPU_ARM_OR_ARM64
vdupq_n_u32(subValue);
#else
_mm_set_epi32((Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
#endif
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
do
{
SASUB_128(0)
SASUB_128(1)
SASUB_128(2)
SASUB_128(3)
items += 4 * 4;
SASUB_128(0) SASUB_128(1) items += 2 * 4;
SASUB_128(0) SASUB_128(1) items += 2 * 4;
}
while (items != lim);
}
#ifdef USE_AVX2
#ifdef USE_LZFIND_SATUR_SUB_256
#include <immintrin.h> // avx
/*
clang :immintrin.h uses
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX2__)
#include <avx2intrin.h>
#endif
so we need <avxintrin.h> for clang-cl */
#define SASUB_256(i) *(__m256i *)(void *)(items + (i) * 8) = _mm256_sub_epi32(_mm256_max_epu32(*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2); // AVX2
#if defined(__clang__)
#include <avxintrin.h>
#include <avx2intrin.h>
#endif
MY_NO_INLINE
// AVX2:
#define SASUB_256(i) \
*( __m256i *)( void *)(items + (i) * 8) = \
_mm256_sub_epi32(_mm256_max_epu32( \
*(const __m256i *)(const void *)(items + (i) * 8), sub2), sub2);
Z7_NO_INLINE
static
#ifdef ATTRIB_AVX2
ATTRIB_AVX2
#ifdef LZFIND_ATTRIB_AVX2
LZFIND_ATTRIB_AVX2
#endif
void
MY_FAST_CALL
Z7_FASTCALL
LzFind_SaturSub_256(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
__m256i sub2 = _mm256_set_epi32(
const __m256i sub2 = _mm256_set_epi32(
(Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue,
(Int32)subValue, (Int32)subValue, (Int32)subValue, (Int32)subValue);
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
do
{
SASUB_256(0)
SASUB_256(1)
items += 2 * 8;
SASUB_256(0) SASUB_256(1) items += 2 * 8;
SASUB_256(0) SASUB_256(1) items += 2 * 8;
}
while (items != lim);
}
#endif // USE_AVX2
#endif // USE_LZFIND_SATUR_SUB_256
#ifndef FORCE_SATUR_SUB_128
typedef void (MY_FAST_CALL *LZFIND_SATUR_SUB_CODE_FUNC)(
#ifndef FORCE_LZFIND_SATUR_SUB_128
typedef void (Z7_FASTCALL *LZFIND_SATUR_SUB_CODE_FUNC)(
UInt32 subValue, CLzRef *items, const CLzRef *lim);
static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
#endif // FORCE_SATUR_SUB_128
#endif // FORCE_LZFIND_SATUR_SUB_128
#endif // USE_SATUR_SUB_128
#endif // USE_LZFIND_SATUR_SUB_128
// kEmptyHashValue must be zero
// #define SASUB_32(i) v = items[i]; m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m;
#define SASUB_32(i) v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue;
// #define SASUB_32(i) { UInt32 v = items[i]; UInt32 m = v - subValue; if (v < subValue) m = kEmptyHashValue; items[i] = m; }
#define SASUB_32(i) { UInt32 v = items[i]; if (v < subValue) v = subValue; items[i] = v - subValue; }
#ifdef FORCE_SATUR_SUB_128
#ifdef FORCE_LZFIND_SATUR_SUB_128
#define DEFAULT_SaturSub LzFind_SaturSub_128
@ -672,24 +764,19 @@ static LZFIND_SATUR_SUB_CODE_FUNC g_LzFind_SaturSub;
#define DEFAULT_SaturSub LzFind_SaturSub_32
MY_NO_INLINE
Z7_NO_INLINE
static
void
MY_FAST_CALL
Z7_FASTCALL
LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
{
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
do
{
UInt32 v;
SASUB_32(0)
SASUB_32(1)
SASUB_32(2)
SASUB_32(3)
SASUB_32(4)
SASUB_32(5)
SASUB_32(6)
SASUB_32(7)
items += 8;
SASUB_32(0) SASUB_32(1) items += 2;
SASUB_32(0) SASUB_32(1) items += 2;
SASUB_32(0) SASUB_32(1) items += 2;
SASUB_32(0) SASUB_32(1) items += 2;
}
while (items != lim);
}
@ -697,27 +784,23 @@ LzFind_SaturSub_32(UInt32 subValue, CLzRef *items, const CLzRef *lim)
#endif
MY_NO_INLINE
Z7_NO_INLINE
void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
{
#define K_NORM_ALIGN_BLOCK_SIZE (1 << 6)
CLzRef *lim;
for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (K_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
#define LZFIND_NORM_ALIGN_BLOCK_SIZE (1 << 7)
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
for (; numItems != 0 && ((unsigned)(ptrdiff_t)items & (LZFIND_NORM_ALIGN_BLOCK_SIZE - 1)) != 0; numItems--)
{
UInt32 v;
SASUB_32(0);
SASUB_32(0)
items++;
}
{
#define K_NORM_ALIGN_MASK (K_NORM_ALIGN_BLOCK_SIZE / 4 - 1)
lim = items + (numItems & ~(size_t)K_NORM_ALIGN_MASK);
numItems &= K_NORM_ALIGN_MASK;
const size_t k_Align_Mask = (LZFIND_NORM_ALIGN_BLOCK_SIZE / 4 - 1);
CLzRef *lim = items + (numItems & ~(size_t)k_Align_Mask);
numItems &= k_Align_Mask;
if (items != lim)
{
#if defined(USE_SATUR_SUB_128) && !defined(FORCE_SATUR_SUB_128)
#if defined(USE_LZFIND_SATUR_SUB_128) && !defined(FORCE_LZFIND_SATUR_SUB_128)
if (g_LzFind_SaturSub)
g_LzFind_SaturSub(subValue, items, lim);
else
@ -726,12 +809,10 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
}
items = lim;
}
Z7_PRAGMA_OPT_DISABLE_LOOP_UNROLL_VECTORIZE
for (; numItems != 0; numItems--)
{
UInt32 v;
SASUB_32(0);
SASUB_32(0)
items++;
}
}
@ -740,7 +821,7 @@ void MatchFinder_Normalize3(UInt32 subValue, CLzRef *items, size_t numItems)
// call MatchFinder_CheckLimits() only after (p->pos++) update
MY_NO_INLINE
Z7_NO_INLINE
static void MatchFinder_CheckLimits(CMatchFinder *p)
{
if (// !p->streamEndWasReached && p->result == SZ_OK &&
@ -768,11 +849,14 @@ static void MatchFinder_CheckLimits(CMatchFinder *p)
const UInt32 subValue = (p->pos - p->historySize - 1) /* & ~(UInt32)(kNormalizeAlign - 1) */;
// const UInt32 subValue = (1 << 15); // for debug
// printf("\nMatchFinder_Normalize() subValue == 0x%x\n", subValue);
size_t numSonRefs = p->cyclicBufferSize;
if (p->btMode)
numSonRefs <<= 1;
Inline_MatchFinder_ReduceOffsets(p, subValue);
MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashSizeSum + numSonRefs);
MatchFinder_REDUCE_OFFSETS(p, subValue)
MatchFinder_Normalize3(subValue, p->hash, (size_t)p->hashMask + 1 + p->fixedHashSize);
{
size_t numSonRefs = p->cyclicBufferSize;
if (p->btMode)
numSonRefs <<= 1;
MatchFinder_Normalize3(subValue, p->son, numSonRefs);
}
}
if (p->cyclicBufferPos == p->cyclicBufferSize)
@ -785,7 +869,7 @@ static void MatchFinder_CheckLimits(CMatchFinder *p)
/*
(lenLimit > maxLen)
*/
MY_FORCE_INLINE
Z7_FORCE_INLINE
static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
UInt32 *d, unsigned maxLen)
@ -867,7 +951,7 @@ static UInt32 * Hc_GetMatchesSpec(size_t lenLimit, UInt32 curMatch, UInt32 pos,
}
MY_FORCE_INLINE
Z7_FORCE_INLINE
UInt32 * GetMatchesSpec1(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const Byte *cur, CLzRef *son,
size_t _cyclicBufferPos, UInt32 _cyclicBufferSize, UInt32 cutValue,
UInt32 *d, UInt32 maxLen)
@ -1004,7 +1088,7 @@ static void SkipMatchesSpec(UInt32 lenLimit, UInt32 curMatch, UInt32 pos, const
#define MOVE_POS_RET MOVE_POS return distances;
MY_NO_INLINE
Z7_NO_INLINE
static void MatchFinder_MovePos(CMatchFinder *p)
{
/* we go here at the end of stream data, when (avail < num_hash_bytes)
@ -1015,11 +1099,11 @@ static void MatchFinder_MovePos(CMatchFinder *p)
if (p->btMode)
p->sons[(p->cyclicBufferPos << p->btMode) + 1] = 0; // kEmptyHashValue
*/
MOVE_POS;
MOVE_POS
}
#define GET_MATCHES_HEADER2(minLen, ret_op) \
unsigned lenLimit; UInt32 hv; Byte *cur; UInt32 curMatch; \
unsigned lenLimit; UInt32 hv; const Byte *cur; UInt32 curMatch; \
lenLimit = (unsigned)p->lenLimit; { if (lenLimit < minLen) { MatchFinder_MovePos(p); ret_op; }} \
cur = p->buffer;
@ -1028,11 +1112,11 @@ static void MatchFinder_MovePos(CMatchFinder *p)
#define MF_PARAMS(p) lenLimit, curMatch, p->pos, p->buffer, p->son, p->cyclicBufferPos, p->cyclicBufferSize, p->cutValue
#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS; } while (--num);
#define SKIP_FOOTER SkipMatchesSpec(MF_PARAMS(p)); MOVE_POS } while (--num);
#define GET_MATCHES_FOOTER_BASE(_maxLen_, func) \
distances = func(MF_PARAMS(p), \
distances, (UInt32)_maxLen_); MOVE_POS_RET;
distances, (UInt32)_maxLen_); MOVE_POS_RET
#define GET_MATCHES_FOOTER_BT(_maxLen_) \
GET_MATCHES_FOOTER_BASE(_maxLen_, GetMatchesSpec1)
@ -1052,7 +1136,7 @@ static void MatchFinder_MovePos(CMatchFinder *p)
static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
GET_MATCHES_HEADER(2)
HASH2_CALC;
HASH2_CALC
curMatch = p->hash[hv];
p->hash[hv] = p->pos;
GET_MATCHES_FOOTER_BT(1)
@ -1061,7 +1145,7 @@ static UInt32* Bt2_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32* Bt3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
GET_MATCHES_HEADER(3)
HASH_ZIP_CALC;
HASH_ZIP_CALC
curMatch = p->hash[hv];
p->hash[hv] = p->pos;
GET_MATCHES_FOOTER_BT(2)
@ -1082,7 +1166,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32 *hash;
GET_MATCHES_HEADER(3)
HASH3_CALC;
HASH3_CALC
hash = p->hash;
pos = p->pos;
@ -1107,7 +1191,7 @@ static UInt32* Bt3_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
if (maxLen == lenLimit)
{
SkipMatchesSpec(MF_PARAMS(p));
MOVE_POS_RET;
MOVE_POS_RET
}
}
@ -1123,7 +1207,7 @@ static UInt32* Bt4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32 *hash;
GET_MATCHES_HEADER(4)
HASH4_CALC;
HASH4_CALC
hash = p->hash;
pos = p->pos;
@ -1190,7 +1274,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32 *hash;
GET_MATCHES_HEADER(5)
HASH5_CALC;
HASH5_CALC
hash = p->hash;
pos = p->pos;
@ -1246,7 +1330,7 @@ static UInt32* Bt5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
if (maxLen == lenLimit)
{
SkipMatchesSpec(MF_PARAMS(p));
MOVE_POS_RET;
MOVE_POS_RET
}
break;
}
@ -1263,7 +1347,7 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32 *hash;
GET_MATCHES_HEADER(4)
HASH4_CALC;
HASH4_CALC
hash = p->hash;
pos = p->pos;
@ -1314,12 +1398,12 @@ static UInt32* Hc4_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
if (maxLen == lenLimit)
{
p->son[p->cyclicBufferPos] = curMatch;
MOVE_POS_RET;
MOVE_POS_RET
}
break;
}
GET_MATCHES_FOOTER_HC(maxLen);
GET_MATCHES_FOOTER_HC(maxLen)
}
@ -1330,7 +1414,7 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
UInt32 *hash;
GET_MATCHES_HEADER(5)
HASH5_CALC;
HASH5_CALC
hash = p->hash;
pos = p->pos;
@ -1386,19 +1470,19 @@ static UInt32 * Hc5_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
if (maxLen == lenLimit)
{
p->son[p->cyclicBufferPos] = curMatch;
MOVE_POS_RET;
MOVE_POS_RET
}
break;
}
GET_MATCHES_FOOTER_HC(maxLen);
GET_MATCHES_FOOTER_HC(maxLen)
}
UInt32* Hc3Zip_MatchFinder_GetMatches(CMatchFinder *p, UInt32 *distances)
{
GET_MATCHES_HEADER(3)
HASH_ZIP_CALC;
HASH_ZIP_CALC
curMatch = p->hash[hv];
p->hash[hv] = p->pos;
GET_MATCHES_FOOTER_HC(2)
@ -1409,7 +1493,7 @@ static void Bt2_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
SKIP_HEADER(2)
{
HASH2_CALC;
HASH2_CALC
curMatch = p->hash[hv];
p->hash[hv] = p->pos;
}
@ -1420,7 +1504,7 @@ void Bt3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
SKIP_HEADER(3)
{
HASH_ZIP_CALC;
HASH_ZIP_CALC
curMatch = p->hash[hv];
p->hash[hv] = p->pos;
}
@ -1433,7 +1517,7 @@ static void Bt3_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
UInt32 h2;
UInt32 *hash;
HASH3_CALC;
HASH3_CALC
hash = p->hash;
curMatch = (hash + kFix3HashSize)[hv];
hash[h2] =
@ -1448,7 +1532,7 @@ static void Bt4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
UInt32 h2, h3;
UInt32 *hash;
HASH4_CALC;
HASH4_CALC
hash = p->hash;
curMatch = (hash + kFix4HashSize)[hv];
hash [h2] =
@ -1464,7 +1548,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
UInt32 h2, h3;
UInt32 *hash;
HASH5_CALC;
HASH5_CALC
hash = p->hash;
curMatch = (hash + kFix5HashSize)[hv];
hash [h2] =
@ -1478,7 +1562,7 @@ static void Bt5_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
#define HC_SKIP_HEADER(minLen) \
do { if (p->lenLimit < minLen) { MatchFinder_MovePos(p); num--; continue; } { \
Byte *cur; \
const Byte *cur; \
UInt32 *hash; \
UInt32 *son; \
UInt32 pos = p->pos; \
@ -1510,7 +1594,7 @@ static void Hc4_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
HC_SKIP_HEADER(4)
UInt32 h2, h3;
HASH4_CALC;
HASH4_CALC
curMatch = (hash + kFix4HashSize)[hv];
hash [h2] =
(hash + kFix3HashSize)[h3] =
@ -1540,7 +1624,7 @@ void Hc3Zip_MatchFinder_Skip(CMatchFinder *p, UInt32 num)
{
HC_SKIP_HEADER(3)
HASH_ZIP_CALC;
HASH_ZIP_CALC
curMatch = hash[hv];
hash[hv] = pos;
@ -1590,17 +1674,17 @@ void MatchFinder_CreateVTable(CMatchFinder *p, IMatchFinder2 *vTable)
void LzFindPrepare()
void LzFindPrepare(void)
{
#ifndef FORCE_SATUR_SUB_128
#ifdef USE_SATUR_SUB_128
#ifndef FORCE_LZFIND_SATUR_SUB_128
#ifdef USE_LZFIND_SATUR_SUB_128
LZFIND_SATUR_SUB_CODE_FUNC f = NULL;
#ifdef MY_CPU_ARM_OR_ARM64
{
if (CPU_IsSupported_NEON())
{
// #pragma message ("=== LzFind NEON")
_PRF(printf("\n=== LzFind NEON\n"));
PRF(printf("\n=== LzFind NEON\n"));
f = LzFind_SaturSub_128;
}
// f = 0; // for debug
@ -1609,20 +1693,25 @@ void LzFindPrepare()
if (CPU_IsSupported_SSE41())
{
// #pragma message ("=== LzFind SSE41")
_PRF(printf("\n=== LzFind SSE41\n"));
PRF(printf("\n=== LzFind SSE41\n"));
f = LzFind_SaturSub_128;
#ifdef USE_AVX2
#ifdef USE_LZFIND_SATUR_SUB_256
if (CPU_IsSupported_AVX2())
{
// #pragma message ("=== LzFind AVX2")
_PRF(printf("\n=== LzFind AVX2\n"));
PRF(printf("\n=== LzFind AVX2\n"));
f = LzFind_SaturSub_256;
}
#endif
}
#endif // MY_CPU_ARM_OR_ARM64
g_LzFind_SaturSub = f;
#endif // USE_SATUR_SUB_128
#endif // FORCE_SATUR_SUB_128
#endif // USE_LZFIND_SATUR_SUB_128
#endif // FORCE_LZFIND_SATUR_SUB_128
}
#undef MOVE_POS
#undef MOVE_POS_RET
#undef PRF