mirror of
https://github.com/RPCS3/rpcs3.git
synced 2025-07-03 13:31:27 +12:00
Update 7z to 23.01
This commit is contained in:
parent
98b7642232
commit
00a80adfae
100 changed files with 11232 additions and 3906 deletions
348
3rdparty/7z/src/AesOpt.c
vendored
348
3rdparty/7z/src/AesOpt.c
vendored
|
@ -1,39 +1,33 @@
|
|||
/* AesOpt.c -- AES optimized code for x86 AES hardware instructions
|
||||
2021-04-01 : Igor Pavlov : Public domain */
|
||||
2023-04-02 : Igor Pavlov : Public domain */
|
||||
|
||||
#include "Precomp.h"
|
||||
|
||||
#include "Aes.h"
|
||||
#include "CpuArch.h"
|
||||
|
||||
#ifdef MY_CPU_X86_OR_AMD64
|
||||
|
||||
#if defined(__clang__)
|
||||
#if __clang_major__ > 3 || (__clang_major__ == 3 && __clang_minor__ >= 8)
|
||||
#define USE_INTEL_AES
|
||||
#define ATTRIB_AES __attribute__((__target__("aes")))
|
||||
#if (__clang_major__ >= 8)
|
||||
#define USE_INTEL_VAES
|
||||
#define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(__GNUC__)
|
||||
#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4)
|
||||
#define USE_INTEL_AES
|
||||
#ifndef __AES__
|
||||
#define ATTRIB_AES __attribute__((__target__("aes")))
|
||||
#endif
|
||||
#if (__GNUC__ >= 8)
|
||||
#define USE_INTEL_VAES
|
||||
#define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx2")))
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(__INTEL_COMPILER)
|
||||
#if defined(__INTEL_COMPILER)
|
||||
#if (__INTEL_COMPILER >= 1110)
|
||||
#define USE_INTEL_AES
|
||||
#if (__INTEL_COMPILER >= 1900)
|
||||
#define USE_INTEL_VAES
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(__clang__) && (__clang_major__ > 3 || __clang_major__ == 3 && __clang_minor__ >= 8) \
|
||||
|| defined(__GNUC__) && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 4)
|
||||
#define USE_INTEL_AES
|
||||
#if !defined(__AES__)
|
||||
#define ATTRIB_AES __attribute__((__target__("aes")))
|
||||
#endif
|
||||
#if defined(__clang__) && (__clang_major__ >= 8) \
|
||||
|| defined(__GNUC__) && (__GNUC__ >= 8)
|
||||
#define USE_INTEL_VAES
|
||||
#if !defined(__AES__) || !defined(__VAES__) || !defined(__AVX__) || !defined(__AVX2__)
|
||||
#define ATTRIB_VAES __attribute__((__target__("aes,vaes,avx,avx2")))
|
||||
#endif
|
||||
#endif
|
||||
#elif defined(_MSC_VER)
|
||||
#if (_MSC_VER > 1500) || (_MSC_FULL_VER >= 150030729)
|
||||
#define USE_INTEL_AES
|
||||
|
@ -56,12 +50,15 @@
|
|||
#include <wmmintrin.h>
|
||||
|
||||
#ifndef USE_INTEL_VAES
|
||||
#define AES_TYPE_keys __m128i
|
||||
#define AES_TYPE_data __m128i
|
||||
#define AES_TYPE_keys UInt32
|
||||
#define AES_TYPE_data Byte
|
||||
// #define AES_TYPE_keys __m128i
|
||||
// #define AES_TYPE_data __m128i
|
||||
#endif
|
||||
|
||||
#define AES_FUNC_START(name) \
|
||||
void MY_FAST_CALL name(__m128i *p, __m128i *data, size_t numBlocks)
|
||||
void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
|
||||
// void Z7_FASTCALL name(__m128i *p, __m128i *data, size_t numBlocks)
|
||||
|
||||
#define AES_FUNC_START2(name) \
|
||||
AES_FUNC_START (name); \
|
||||
|
@ -69,14 +66,16 @@ ATTRIB_AES \
|
|||
AES_FUNC_START (name)
|
||||
|
||||
#define MM_OP(op, dest, src) dest = op(dest, src);
|
||||
#define MM_OP_m(op, src) MM_OP(op, m, src);
|
||||
#define MM_OP_m(op, src) MM_OP(op, m, src)
|
||||
|
||||
#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src);
|
||||
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src);
|
||||
#define MM_XOR( dest, src) MM_OP(_mm_xor_si128, dest, src)
|
||||
#define AVX_XOR(dest, src) MM_OP(_mm256_xor_si256, dest, src)
|
||||
|
||||
|
||||
AES_FUNC_START2 (AesCbc_Encode_HW)
|
||||
{
|
||||
__m128i *p = (__m128i *)(void *)ivAes;
|
||||
__m128i *data = (__m128i *)(void *)data8;
|
||||
__m128i m = *p;
|
||||
const __m128i k0 = p[2];
|
||||
const __m128i k1 = p[3];
|
||||
|
@ -86,17 +85,17 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
UInt32 r = numRounds2;
|
||||
const __m128i *w = p + 4;
|
||||
__m128i temp = *data;
|
||||
MM_XOR (temp, k0);
|
||||
MM_XOR (m, temp);
|
||||
MM_OP_m (_mm_aesenc_si128, k1);
|
||||
MM_XOR (temp, k0)
|
||||
MM_XOR (m, temp)
|
||||
MM_OP_m (_mm_aesenc_si128, k1)
|
||||
do
|
||||
{
|
||||
MM_OP_m (_mm_aesenc_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[1]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0])
|
||||
MM_OP_m (_mm_aesenc_si128, w[1])
|
||||
w += 2;
|
||||
}
|
||||
while (--r);
|
||||
MM_OP_m (_mm_aesenclast_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenclast_si128, w[0])
|
||||
*data = m;
|
||||
}
|
||||
*p = m;
|
||||
|
@ -104,14 +103,14 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
|
||||
|
||||
#define WOP_1(op)
|
||||
#define WOP_2(op) WOP_1 (op) op (m1, 1);
|
||||
#define WOP_3(op) WOP_2 (op) op (m2, 2);
|
||||
#define WOP_4(op) WOP_3 (op) op (m3, 3);
|
||||
#define WOP_2(op) WOP_1 (op) op (m1, 1)
|
||||
#define WOP_3(op) WOP_2 (op) op (m2, 2)
|
||||
#define WOP_4(op) WOP_3 (op) op (m3, 3)
|
||||
#ifdef MY_CPU_AMD64
|
||||
#define WOP_5(op) WOP_4 (op) op (m4, 4);
|
||||
#define WOP_6(op) WOP_5 (op) op (m5, 5);
|
||||
#define WOP_7(op) WOP_6 (op) op (m6, 6);
|
||||
#define WOP_8(op) WOP_7 (op) op (m7, 7);
|
||||
#define WOP_5(op) WOP_4 (op) op (m4, 4)
|
||||
#define WOP_6(op) WOP_5 (op) op (m5, 5)
|
||||
#define WOP_7(op) WOP_6 (op) op (m6, 6)
|
||||
#define WOP_8(op) WOP_7 (op) op (m7, 7)
|
||||
#endif
|
||||
/*
|
||||
#define WOP_9(op) WOP_8 (op) op (m8, 8);
|
||||
|
@ -130,20 +129,20 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
#define WOP_M1 WOP_4
|
||||
#endif
|
||||
|
||||
#define WOP(op) op (m0, 0); WOP_M1(op)
|
||||
#define WOP(op) op (m0, 0) WOP_M1(op)
|
||||
|
||||
|
||||
#define DECLARE_VAR(reg, ii) __m128i reg
|
||||
#define DECLARE_VAR(reg, ii) __m128i reg;
|
||||
#define LOAD_data( reg, ii) reg = data[ii];
|
||||
#define STORE_data( reg, ii) data[ii] = reg;
|
||||
#if (NUM_WAYS > 1)
|
||||
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
|
||||
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
|
||||
#endif
|
||||
|
||||
#define AVX__DECLARE_VAR(reg, ii) __m256i reg
|
||||
#define AVX__LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
|
||||
#define AVX__STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
|
||||
#define AVX__XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]));
|
||||
#define AVX_DECLARE_VAR(reg, ii) __m256i reg;
|
||||
#define AVX_LOAD_data( reg, ii) reg = ((const __m256i *)(const void *)data)[ii];
|
||||
#define AVX_STORE_data( reg, ii) ((__m256i *)(void *)data)[ii] = reg;
|
||||
#define AVX_XOR_data_M1(reg, ii) AVX_XOR (reg, (((const __m256i *)(const void *)(data - 1))[ii]))
|
||||
|
||||
#define MM_OP_key(op, reg) MM_OP(op, reg, key);
|
||||
|
||||
|
@ -154,23 +153,23 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
#define AES_XOR( reg, ii) MM_OP_key (_mm_xor_si128, reg)
|
||||
|
||||
|
||||
#define AVX__AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
|
||||
#define AVX__AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
|
||||
#define AVX__AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
|
||||
#define AVX__AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
|
||||
#define AVX__AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
|
||||
#define AVX_AES_DEC( reg, ii) MM_OP_key (_mm256_aesdec_epi128, reg)
|
||||
#define AVX_AES_DEC_LAST( reg, ii) MM_OP_key (_mm256_aesdeclast_epi128, reg)
|
||||
#define AVX_AES_ENC( reg, ii) MM_OP_key (_mm256_aesenc_epi128, reg)
|
||||
#define AVX_AES_ENC_LAST( reg, ii) MM_OP_key (_mm256_aesenclast_epi128, reg)
|
||||
#define AVX_AES_XOR( reg, ii) MM_OP_key (_mm256_xor_si256, reg)
|
||||
|
||||
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one); reg = ctr;
|
||||
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);
|
||||
#define CTR_START(reg, ii) MM_OP (_mm_add_epi64, ctr, one) reg = ctr;
|
||||
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
|
||||
|
||||
#define AVX__CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two); reg = _mm256_xor_si256(ctr2, key);
|
||||
#define AVX__CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg);
|
||||
#define AVX_CTR_START(reg, ii) MM_OP (_mm256_add_epi64, ctr2, two) reg = _mm256_xor_si256(ctr2, key);
|
||||
#define AVX_CTR_END( reg, ii) AVX_XOR (((__m256i *)(void *)data)[ii], reg)
|
||||
|
||||
#define WOP_KEY(op, n) { \
|
||||
const __m128i key = w[n]; \
|
||||
WOP(op); }
|
||||
|
||||
#define AVX__WOP_KEY(op, n) { \
|
||||
#define AVX_WOP_KEY(op, n) { \
|
||||
const __m256i key = w[n]; \
|
||||
WOP(op); }
|
||||
|
||||
|
@ -218,6 +217,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
|
||||
AES_FUNC_START2 (AesCbc_Decode_HW)
|
||||
{
|
||||
__m128i *p = (__m128i *)(void *)ivAes;
|
||||
__m128i *data = (__m128i *)(void *)data8;
|
||||
__m128i iv = *p;
|
||||
const __m128i *wStart = p + *(const UInt32 *)(p + 1) * 2 + 2 - 1;
|
||||
const __m128i *dataEnd;
|
||||
|
@ -228,7 +229,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
const __m128i *w = wStart;
|
||||
|
||||
WOP (DECLARE_VAR)
|
||||
WOP (LOAD_data);
|
||||
WOP (LOAD_data)
|
||||
WOP_KEY (AES_XOR, 1)
|
||||
|
||||
do
|
||||
|
@ -239,10 +240,10 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
while (w != p);
|
||||
WOP_KEY (AES_DEC_LAST, 0)
|
||||
|
||||
MM_XOR (m0, iv);
|
||||
MM_XOR (m0, iv)
|
||||
WOP_M1 (XOR_data_M1)
|
||||
iv = data[NUM_WAYS - 1];
|
||||
WOP (STORE_data);
|
||||
WOP (STORE_data)
|
||||
}
|
||||
WIDE_LOOP_END
|
||||
|
||||
|
@ -252,15 +253,15 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
__m128i m = _mm_xor_si128 (w[2], *data);
|
||||
do
|
||||
{
|
||||
MM_OP_m (_mm_aesdec_si128, w[1]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[0]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1])
|
||||
MM_OP_m (_mm_aesdec_si128, w[0])
|
||||
w -= 2;
|
||||
}
|
||||
while (w != p);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1]);
|
||||
MM_OP_m (_mm_aesdeclast_si128, w[0]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1])
|
||||
MM_OP_m (_mm_aesdeclast_si128, w[0])
|
||||
|
||||
MM_XOR (m, iv);
|
||||
MM_XOR (m, iv)
|
||||
iv = *data;
|
||||
*data = m;
|
||||
}
|
||||
|
@ -271,6 +272,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
|
||||
AES_FUNC_START2 (AesCtr_Code_HW)
|
||||
{
|
||||
__m128i *p = (__m128i *)(void *)ivAes;
|
||||
__m128i *data = (__m128i *)(void *)data8;
|
||||
__m128i ctr = *p;
|
||||
UInt32 numRoundsMinus2 = *(const UInt32 *)(p + 1) * 2 - 1;
|
||||
const __m128i *dataEnd;
|
||||
|
@ -283,7 +286,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
const __m128i *w = p;
|
||||
UInt32 r = numRoundsMinus2;
|
||||
WOP (DECLARE_VAR)
|
||||
WOP (CTR_START);
|
||||
WOP (CTR_START)
|
||||
WOP_KEY (AES_XOR, 0)
|
||||
w += 1;
|
||||
do
|
||||
|
@ -294,7 +297,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
while (--r);
|
||||
WOP_KEY (AES_ENC_LAST, 0)
|
||||
|
||||
WOP (CTR_END);
|
||||
WOP (CTR_END)
|
||||
}
|
||||
WIDE_LOOP_END
|
||||
|
||||
|
@ -303,19 +306,19 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
|
||||
const __m128i *w = p;
|
||||
__m128i m;
|
||||
MM_OP (_mm_add_epi64, ctr, one);
|
||||
MM_OP (_mm_add_epi64, ctr, one)
|
||||
m = _mm_xor_si128 (ctr, p[0]);
|
||||
w += 1;
|
||||
do
|
||||
{
|
||||
MM_OP_m (_mm_aesenc_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[1]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0])
|
||||
MM_OP_m (_mm_aesenc_si128, w[1])
|
||||
w += 2;
|
||||
}
|
||||
while (--numRounds2);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenclast_si128, w[1]);
|
||||
MM_XOR (*data, m);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0])
|
||||
MM_OP_m (_mm_aesenclast_si128, w[1])
|
||||
MM_XOR (*data, m)
|
||||
}
|
||||
|
||||
p[-2] = ctr;
|
||||
|
@ -325,17 +328,58 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
|
||||
#ifdef USE_INTEL_VAES
|
||||
|
||||
/*
|
||||
GCC before 2013-Jun:
|
||||
<immintrin.h>:
|
||||
#ifdef __AVX__
|
||||
#include <avxintrin.h>
|
||||
#endif
|
||||
GCC after 2013-Jun:
|
||||
<immintrin.h>:
|
||||
#include <avxintrin.h>
|
||||
CLANG 3.8+:
|
||||
{
|
||||
<immintrin.h>:
|
||||
#if !defined(_MSC_VER) || defined(__AVX__)
|
||||
#include <avxintrin.h>
|
||||
#endif
|
||||
|
||||
if (the compiler is clang for Windows and if global arch is not set for __AVX__)
|
||||
[ if (defined(_MSC_VER) && !defined(__AVX__)) ]
|
||||
{
|
||||
<immintrin.h> doesn't include <avxintrin.h>
|
||||
and we have 2 ways to fix it:
|
||||
1) we can define required __AVX__ before <immintrin.h>
|
||||
or
|
||||
2) we can include <avxintrin.h> after <immintrin.h>
|
||||
}
|
||||
}
|
||||
|
||||
If we include <avxintrin.h> manually for GCC/CLANG, it's
|
||||
required that <immintrin.h> must be included before <avxintrin.h>.
|
||||
*/
|
||||
|
||||
/*
|
||||
#if defined(__clang__) && defined(_MSC_VER)
|
||||
#define __SSE4_2__
|
||||
#define __AES__
|
||||
#define __AVX__
|
||||
#define __AVX2__
|
||||
#define __VAES__
|
||||
#define __AVX512F__
|
||||
#define __AVX512VL__
|
||||
#endif
|
||||
*/
|
||||
|
||||
#include <immintrin.h>
|
||||
#if defined(__clang__) && defined(_MSC_VER)
|
||||
#if !defined(__AVX__)
|
||||
#include <avxintrin.h>
|
||||
#endif
|
||||
#if !defined(__AVX2__)
|
||||
#include <avx2intrin.h>
|
||||
#endif
|
||||
#if !defined(__VAES__)
|
||||
#include <vaesintrin.h>
|
||||
#endif
|
||||
#endif // __clang__ && _MSC_VER
|
||||
|
||||
|
||||
#define VAES_FUNC_START2(name) \
|
||||
AES_FUNC_START (name); \
|
||||
|
@ -344,6 +388,8 @@ AES_FUNC_START (name)
|
|||
|
||||
VAES_FUNC_START2 (AesCbc_Decode_HW_256)
|
||||
{
|
||||
__m128i *p = (__m128i *)(void *)ivAes;
|
||||
__m128i *data = (__m128i *)(void *)data8;
|
||||
__m128i iv = *p;
|
||||
const __m128i *dataEnd;
|
||||
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
|
||||
|
@ -353,22 +399,22 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
|
|||
{
|
||||
const __m256i *w = keys + numRounds - 2;
|
||||
|
||||
WOP (AVX__DECLARE_VAR)
|
||||
WOP (AVX__LOAD_data);
|
||||
AVX__WOP_KEY (AVX__AES_XOR, 1)
|
||||
WOP (AVX_DECLARE_VAR)
|
||||
WOP (AVX_LOAD_data)
|
||||
AVX_WOP_KEY (AVX_AES_XOR, 1)
|
||||
|
||||
do
|
||||
{
|
||||
AVX__WOP_KEY (AVX__AES_DEC, 0)
|
||||
AVX_WOP_KEY (AVX_AES_DEC, 0)
|
||||
w--;
|
||||
}
|
||||
while (w != keys);
|
||||
AVX__WOP_KEY (AVX__AES_DEC_LAST, 0)
|
||||
AVX_WOP_KEY (AVX_AES_DEC_LAST, 0)
|
||||
|
||||
AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]));
|
||||
WOP_M1 (AVX__XOR_data_M1)
|
||||
AVX_XOR (m0, _mm256_setr_m128i(iv, data[0]))
|
||||
WOP_M1 (AVX_XOR_data_M1)
|
||||
iv = data[NUM_WAYS * 2 - 1];
|
||||
WOP (AVX__STORE_data);
|
||||
WOP (AVX_STORE_data)
|
||||
}
|
||||
WIDE_LOOP_END_AVX(;)
|
||||
|
||||
|
@ -378,15 +424,15 @@ VAES_FUNC_START2 (AesCbc_Decode_HW_256)
|
|||
__m128i m = _mm_xor_si128 (w[2], *data);
|
||||
do
|
||||
{
|
||||
MM_OP_m (_mm_aesdec_si128, w[1]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[0]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1])
|
||||
MM_OP_m (_mm_aesdec_si128, w[0])
|
||||
w -= 2;
|
||||
}
|
||||
while (w != p);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1]);
|
||||
MM_OP_m (_mm_aesdeclast_si128, w[0]);
|
||||
MM_OP_m (_mm_aesdec_si128, w[1])
|
||||
MM_OP_m (_mm_aesdeclast_si128, w[0])
|
||||
|
||||
MM_XOR (m, iv);
|
||||
MM_XOR (m, iv)
|
||||
iv = *data;
|
||||
*data = m;
|
||||
}
|
||||
|
@ -403,18 +449,20 @@ AVX2: _mm256_add_epi64 : vpaddq ymm, ymm, ymm
|
|||
_mm256_broadcastsi128_si256 : vbroadcasti128
|
||||
*/
|
||||
|
||||
#define AVX__CTR_LOOP_START \
|
||||
#define AVX_CTR_LOOP_START \
|
||||
ctr2 = _mm256_setr_m128i(_mm_sub_epi64(ctr, one), ctr); \
|
||||
two = _mm256_setr_m128i(one, one); \
|
||||
two = _mm256_add_epi64(two, two); \
|
||||
|
||||
// two = _mm256_setr_epi64x(2, 0, 2, 0);
|
||||
|
||||
#define AVX__CTR_LOOP_ENC \
|
||||
#define AVX_CTR_LOOP_ENC \
|
||||
ctr = _mm256_extracti128_si256 (ctr2, 1); \
|
||||
|
||||
VAES_FUNC_START2 (AesCtr_Code_HW_256)
|
||||
{
|
||||
__m128i *p = (__m128i *)(void *)ivAes;
|
||||
__m128i *data = (__m128i *)(void *)data8;
|
||||
__m128i ctr = *p;
|
||||
UInt32 numRounds = *(const UInt32 *)(p + 1) * 2 + 1;
|
||||
const __m128i *dataEnd;
|
||||
|
@ -422,44 +470,44 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
|
|||
__m256i ctr2, two;
|
||||
p += 2;
|
||||
|
||||
WIDE_LOOP_START_AVX (AVX__CTR_LOOP_START)
|
||||
WIDE_LOOP_START_AVX (AVX_CTR_LOOP_START)
|
||||
{
|
||||
const __m256i *w = keys;
|
||||
UInt32 r = numRounds - 2;
|
||||
WOP (AVX__DECLARE_VAR)
|
||||
AVX__WOP_KEY (AVX__CTR_START, 0);
|
||||
WOP (AVX_DECLARE_VAR)
|
||||
AVX_WOP_KEY (AVX_CTR_START, 0)
|
||||
|
||||
w += 1;
|
||||
do
|
||||
{
|
||||
AVX__WOP_KEY (AVX__AES_ENC, 0)
|
||||
AVX_WOP_KEY (AVX_AES_ENC, 0)
|
||||
w += 1;
|
||||
}
|
||||
while (--r);
|
||||
AVX__WOP_KEY (AVX__AES_ENC_LAST, 0)
|
||||
AVX_WOP_KEY (AVX_AES_ENC_LAST, 0)
|
||||
|
||||
WOP (AVX__CTR_END);
|
||||
WOP (AVX_CTR_END)
|
||||
}
|
||||
WIDE_LOOP_END_AVX (AVX__CTR_LOOP_ENC)
|
||||
WIDE_LOOP_END_AVX (AVX_CTR_LOOP_ENC)
|
||||
|
||||
SINGLE_LOOP
|
||||
{
|
||||
UInt32 numRounds2 = *(const UInt32 *)(p - 2 + 1) - 1;
|
||||
const __m128i *w = p;
|
||||
__m128i m;
|
||||
MM_OP (_mm_add_epi64, ctr, one);
|
||||
MM_OP (_mm_add_epi64, ctr, one)
|
||||
m = _mm_xor_si128 (ctr, p[0]);
|
||||
w += 1;
|
||||
do
|
||||
{
|
||||
MM_OP_m (_mm_aesenc_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[1]);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0])
|
||||
MM_OP_m (_mm_aesenc_si128, w[1])
|
||||
w += 2;
|
||||
}
|
||||
while (--numRounds2);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0]);
|
||||
MM_OP_m (_mm_aesenclast_si128, w[1]);
|
||||
MM_XOR (*data, m);
|
||||
MM_OP_m (_mm_aesenc_si128, w[0])
|
||||
MM_OP_m (_mm_aesenclast_si128, w[1])
|
||||
MM_XOR (*data, m)
|
||||
}
|
||||
|
||||
p[-2] = ctr;
|
||||
|
@ -477,7 +525,7 @@ VAES_FUNC_START2 (AesCtr_Code_HW_256)
|
|||
#define AES_TYPE_data Byte
|
||||
|
||||
#define AES_FUNC_START(name) \
|
||||
void MY_FAST_CALL name(UInt32 *p, Byte *data, size_t numBlocks) \
|
||||
void Z7_FASTCALL name(UInt32 *p, Byte *data, size_t numBlocks) \
|
||||
|
||||
#define AES_COMPAT_STUB(name) \
|
||||
AES_FUNC_START(name); \
|
||||
|
@ -496,8 +544,8 @@ AES_COMPAT_STUB (AesCtr_Code)
|
|||
#pragma message("VAES HW_SW stub was used")
|
||||
|
||||
#define VAES_COMPAT_STUB(name) \
|
||||
void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
|
||||
void MY_FAST_CALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
|
||||
void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks); \
|
||||
void Z7_FASTCALL name ## _256(UInt32 *p, Byte *data, size_t numBlocks) \
|
||||
{ name((AES_TYPE_keys *)(void *)p, (AES_TYPE_data *)(void *)data, numBlocks); }
|
||||
|
||||
VAES_COMPAT_STUB (AesCbc_Decode_HW)
|
||||
|
@ -551,7 +599,8 @@ VAES_COMPAT_STUB (AesCtr_Code_HW)
|
|||
typedef uint8x16_t v128;
|
||||
|
||||
#define AES_FUNC_START(name) \
|
||||
void MY_FAST_CALL name(v128 *p, v128 *data, size_t numBlocks)
|
||||
void Z7_FASTCALL name(UInt32 *ivAes, Byte *data8, size_t numBlocks)
|
||||
// void Z7_FASTCALL name(v128 *p, v128 *data, size_t numBlocks)
|
||||
|
||||
#define AES_FUNC_START2(name) \
|
||||
AES_FUNC_START (name); \
|
||||
|
@ -559,18 +608,20 @@ ATTRIB_AES \
|
|||
AES_FUNC_START (name)
|
||||
|
||||
#define MM_OP(op, dest, src) dest = op(dest, src);
|
||||
#define MM_OP_m(op, src) MM_OP(op, m, src);
|
||||
#define MM_OP_m(op, src) MM_OP(op, m, src)
|
||||
#define MM_OP1_m(op) m = op(m);
|
||||
|
||||
#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src);
|
||||
#define MM_XOR_m( src) MM_XOR(m, src);
|
||||
#define MM_XOR( dest, src) MM_OP(veorq_u8, dest, src)
|
||||
#define MM_XOR_m( src) MM_XOR(m, src)
|
||||
|
||||
#define AES_E_m(k) MM_OP_m (vaeseq_u8, k);
|
||||
#define AES_E_MC_m(k) AES_E_m (k); MM_OP1_m(vaesmcq_u8);
|
||||
#define AES_E_m(k) MM_OP_m (vaeseq_u8, k)
|
||||
#define AES_E_MC_m(k) AES_E_m (k) MM_OP1_m(vaesmcq_u8)
|
||||
|
||||
|
||||
AES_FUNC_START2 (AesCbc_Encode_HW)
|
||||
{
|
||||
v128 *p = (v128*)(void*)ivAes;
|
||||
v128 *data = (v128*)(void*)data8;
|
||||
v128 m = *p;
|
||||
const v128 k0 = p[2];
|
||||
const v128 k1 = p[3];
|
||||
|
@ -608,7 +659,7 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
AES_E_MC_m (p[14])
|
||||
}
|
||||
}
|
||||
AES_E_m (k_z1);
|
||||
AES_E_m (k_z1)
|
||||
MM_XOR_m (k_z0);
|
||||
*data = m;
|
||||
}
|
||||
|
@ -617,44 +668,44 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
|
||||
|
||||
#define WOP_1(op)
|
||||
#define WOP_2(op) WOP_1 (op) op (m1, 1);
|
||||
#define WOP_3(op) WOP_2 (op) op (m2, 2);
|
||||
#define WOP_4(op) WOP_3 (op) op (m3, 3);
|
||||
#define WOP_5(op) WOP_4 (op) op (m4, 4);
|
||||
#define WOP_6(op) WOP_5 (op) op (m5, 5);
|
||||
#define WOP_7(op) WOP_6 (op) op (m6, 6);
|
||||
#define WOP_8(op) WOP_7 (op) op (m7, 7);
|
||||
#define WOP_2(op) WOP_1 (op) op (m1, 1)
|
||||
#define WOP_3(op) WOP_2 (op) op (m2, 2)
|
||||
#define WOP_4(op) WOP_3 (op) op (m3, 3)
|
||||
#define WOP_5(op) WOP_4 (op) op (m4, 4)
|
||||
#define WOP_6(op) WOP_5 (op) op (m5, 5)
|
||||
#define WOP_7(op) WOP_6 (op) op (m6, 6)
|
||||
#define WOP_8(op) WOP_7 (op) op (m7, 7)
|
||||
|
||||
#define NUM_WAYS 8
|
||||
#define WOP_M1 WOP_8
|
||||
|
||||
#define WOP(op) op (m0, 0); WOP_M1(op)
|
||||
#define WOP(op) op (m0, 0) WOP_M1(op)
|
||||
|
||||
#define DECLARE_VAR(reg, ii) v128 reg
|
||||
#define DECLARE_VAR(reg, ii) v128 reg;
|
||||
#define LOAD_data( reg, ii) reg = data[ii];
|
||||
#define STORE_data( reg, ii) data[ii] = reg;
|
||||
#if (NUM_WAYS > 1)
|
||||
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1]);
|
||||
#define XOR_data_M1(reg, ii) MM_XOR (reg, data[ii- 1])
|
||||
#endif
|
||||
|
||||
#define MM_OP_key(op, reg) MM_OP (op, reg, key);
|
||||
#define MM_OP_key(op, reg) MM_OP (op, reg, key)
|
||||
|
||||
#define AES_D_m(k) MM_OP_m (vaesdq_u8, k);
|
||||
#define AES_D_IMC_m(k) AES_D_m (k); MM_OP1_m (vaesimcq_u8);
|
||||
#define AES_D_m(k) MM_OP_m (vaesdq_u8, k)
|
||||
#define AES_D_IMC_m(k) AES_D_m (k) MM_OP1_m (vaesimcq_u8)
|
||||
|
||||
#define AES_XOR( reg, ii) MM_OP_key (veorq_u8, reg)
|
||||
#define AES_D( reg, ii) MM_OP_key (vaesdq_u8, reg)
|
||||
#define AES_E( reg, ii) MM_OP_key (vaeseq_u8, reg)
|
||||
|
||||
#define AES_D_IMC( reg, ii) AES_D (reg, ii); reg = vaesimcq_u8(reg)
|
||||
#define AES_E_MC( reg, ii) AES_E (reg, ii); reg = vaesmcq_u8(reg)
|
||||
#define AES_D_IMC( reg, ii) AES_D (reg, ii) reg = vaesimcq_u8(reg);
|
||||
#define AES_E_MC( reg, ii) AES_E (reg, ii) reg = vaesmcq_u8(reg);
|
||||
|
||||
#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one); reg = vreinterpretq_u8_u64(ctr);
|
||||
#define CTR_END( reg, ii) MM_XOR (data[ii], reg);
|
||||
#define CTR_START(reg, ii) MM_OP (vaddq_u64, ctr, one) reg = vreinterpretq_u8_u64(ctr);
|
||||
#define CTR_END( reg, ii) MM_XOR (data[ii], reg)
|
||||
|
||||
#define WOP_KEY(op, n) { \
|
||||
const v128 key = w[n]; \
|
||||
WOP(op); }
|
||||
WOP(op) }
|
||||
|
||||
#define WIDE_LOOP_START \
|
||||
dataEnd = data + numBlocks; \
|
||||
|
@ -672,6 +723,8 @@ AES_FUNC_START2 (AesCbc_Encode_HW)
|
|||
|
||||
AES_FUNC_START2 (AesCbc_Decode_HW)
|
||||
{
|
||||
v128 *p = (v128*)(void*)ivAes;
|
||||
v128 *data = (v128*)(void*)data8;
|
||||
v128 iv = *p;
|
||||
const v128 *wStart = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
|
||||
const v128 *dataEnd;
|
||||
|
@ -681,7 +734,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
{
|
||||
const v128 *w = wStart;
|
||||
WOP (DECLARE_VAR)
|
||||
WOP (LOAD_data);
|
||||
WOP (LOAD_data)
|
||||
WOP_KEY (AES_D_IMC, 2)
|
||||
do
|
||||
{
|
||||
|
@ -695,7 +748,7 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
MM_XOR (m0, iv);
|
||||
WOP_M1 (XOR_data_M1)
|
||||
iv = data[NUM_WAYS - 1];
|
||||
WOP (STORE_data);
|
||||
WOP (STORE_data)
|
||||
}
|
||||
WIDE_LOOP_END
|
||||
|
||||
|
@ -724,6 +777,8 @@ AES_FUNC_START2 (AesCbc_Decode_HW)
|
|||
|
||||
AES_FUNC_START2 (AesCtr_Code_HW)
|
||||
{
|
||||
v128 *p = (v128*)(void*)ivAes;
|
||||
v128 *data = (v128*)(void*)data8;
|
||||
uint64x2_t ctr = vreinterpretq_u64_u8(*p);
|
||||
const v128 *wEnd = p + ((size_t)*(const UInt32 *)(p + 1)) * 2;
|
||||
const v128 *dataEnd;
|
||||
|
@ -735,7 +790,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
{
|
||||
const v128 *w = p;
|
||||
WOP (DECLARE_VAR)
|
||||
WOP (CTR_START);
|
||||
WOP (CTR_START)
|
||||
do
|
||||
{
|
||||
WOP_KEY (AES_E_MC, 0)
|
||||
|
@ -746,7 +801,7 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
WOP_KEY (AES_E_MC, 0)
|
||||
WOP_KEY (AES_E, 1)
|
||||
WOP_KEY (AES_XOR, 2)
|
||||
WOP (CTR_END);
|
||||
WOP (CTR_END)
|
||||
}
|
||||
WIDE_LOOP_END
|
||||
|
||||
|
@ -762,10 +817,10 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
w += 2;
|
||||
}
|
||||
while (w != wEnd);
|
||||
AES_E_MC_m (w[0]);
|
||||
AES_E_m (w[1]);
|
||||
MM_XOR_m (w[2]);
|
||||
CTR_END (m, 0);
|
||||
AES_E_MC_m (w[0])
|
||||
AES_E_m (w[1])
|
||||
MM_XOR_m (w[2])
|
||||
CTR_END (m, 0)
|
||||
}
|
||||
|
||||
p[-2] = vreinterpretq_u8_u64(ctr);
|
||||
|
@ -774,3 +829,12 @@ AES_FUNC_START2 (AesCtr_Code_HW)
|
|||
#endif // USE_HW_AES
|
||||
|
||||
#endif // MY_CPU_ARM_OR_ARM64
|
||||
|
||||
#undef NUM_WAYS
|
||||
#undef WOP_M1
|
||||
#undef WOP
|
||||
#undef DECLARE_VAR
|
||||
#undef LOAD_data
|
||||
#undef STORE_data
|
||||
#undef USE_INTEL_AES
|
||||
#undef USE_HW_AES
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue