mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-07-06 06:51:18 +12:00
Latte: decode indices with NEON on aarch64
This commit is contained in:
parent
e6e65aff9a
commit
10bed1abc8
1 changed files with 120 additions and 6 deletions
|
@ -5,6 +5,8 @@
|
|||
|
||||
#if defined(ARCH_X86_64) && defined(__GNUC__)
|
||||
#include <immintrin.h>
|
||||
#elif defined(__aarch64__)
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
struct
|
||||
|
@ -480,6 +482,114 @@ void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDat
|
|||
indexMax = std::max(indexMax, _maxIndex);
|
||||
indexMin = std::min(indexMin, _minIndex);
|
||||
}
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
void LatteIndices_fastConvertU16_NEON(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
|
||||
{
|
||||
const uint16* indicesU16BE = (const uint16*)indexDataInput;
|
||||
uint16* indexOutput = (uint16*)indexDataOutput;
|
||||
sint32 count8 = count >> 3;
|
||||
sint32 countRemaining = count & 7;
|
||||
|
||||
if (count8)
|
||||
{
|
||||
uint16x8_t mMin = vdupq_n_u16(0xFFFF);
|
||||
uint16x8_t mMax = vdupq_n_u16(0x0000);
|
||||
uint16x8_t mTemp;
|
||||
uint16x8_t* mRawIndices = (uint16x8_t*) indicesU16BE;
|
||||
indicesU16BE += count8 * 8;
|
||||
uint16x8_t* mOutputIndices = (uint16x8_t*) indexOutput;
|
||||
indexOutput += count8 * 8;
|
||||
|
||||
while (count8--)
|
||||
{
|
||||
mTemp = vld1q_u16((uint16*)mRawIndices);
|
||||
mRawIndices++;
|
||||
mTemp = vrev16q_u8(mTemp);
|
||||
mMin = vminq_u16(mMin, mTemp);
|
||||
mMax = vmaxq_u16(mMax, mTemp);
|
||||
vst1q_u16((uint16*)mOutputIndices, mTemp);
|
||||
mOutputIndices++;
|
||||
}
|
||||
|
||||
uint16* mMaxU16 = (uint16*)&mMax;
|
||||
uint16* mMinU16 = (uint16*)&mMin;
|
||||
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
indexMax = std::max(indexMax, (uint32)mMaxU16[i]);
|
||||
indexMin = std::min(indexMin, (uint32)mMinU16[i]);
|
||||
}
|
||||
}
|
||||
// process remaining indices
|
||||
uint32 _minIndex = 0xFFFFFFFF;
|
||||
uint32 _maxIndex = 0;
|
||||
for (sint32 i = countRemaining; (--i) >= 0;)
|
||||
{
|
||||
uint16 idx = _swapEndianU16(*indicesU16BE);
|
||||
*indexOutput = idx;
|
||||
indexOutput++;
|
||||
indicesU16BE++;
|
||||
_maxIndex = std::max(_maxIndex, (uint32)idx);
|
||||
_minIndex = std::min(_minIndex, (uint32)idx);
|
||||
}
|
||||
// update min/max
|
||||
indexMax = std::max(indexMax, _maxIndex);
|
||||
indexMin = std::min(indexMin, _minIndex);
|
||||
}
|
||||
|
||||
void LatteIndices_fastConvertU32_NEON(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
|
||||
{
|
||||
const uint32* indicesU32BE = (const uint32*)indexDataInput;
|
||||
uint32* indexOutput = (uint32*)indexDataOutput;
|
||||
sint32 count8 = count >> 2;
|
||||
sint32 countRemaining = count & 3;
|
||||
|
||||
if (count8)
|
||||
{
|
||||
uint32x4_t mMin = vdupq_n_u32(0xFFFFFFFF);
|
||||
uint32x4_t mMax = vdupq_n_u32(0x00000000);
|
||||
uint32x4_t mTemp;
|
||||
uint32x4_t* mRawIndices = (uint32x4_t*) indicesU32BE;
|
||||
indicesU32BE += count8 * 4;
|
||||
uint32x4_t* mOutputIndices = (uint32x4_t*) indexOutput;
|
||||
indexOutput += count8 * 4;
|
||||
|
||||
while (count8--)
|
||||
{
|
||||
mTemp = vld1q_u32((uint32*)mRawIndices);
|
||||
mRawIndices++;
|
||||
mTemp = vrev32q_u8(mTemp);
|
||||
mMin = vminq_u32(mMin, mTemp);
|
||||
mMax = vmaxq_u32(mMax, mTemp);
|
||||
vst1q_u32((uint32*)mOutputIndices, mTemp);
|
||||
mOutputIndices++;
|
||||
}
|
||||
|
||||
uint32* mMaxU32 = (uint32*)&mMax;
|
||||
uint32* mMinU32 = (uint32*)&mMin;
|
||||
|
||||
for (int i = 0; i < 4; ++i) {
|
||||
indexMax = std::max(indexMax, mMaxU32[i]);
|
||||
indexMin = std::min(indexMin, mMinU32[i]);
|
||||
}
|
||||
}
|
||||
// process remaining indices
|
||||
uint32 _minIndex = 0xFFFFFFFF;
|
||||
uint32 _maxIndex = 0;
|
||||
for (sint32 i = countRemaining; (--i) >= 0;)
|
||||
{
|
||||
uint32 idx = _swapEndianU32(*indicesU32BE);
|
||||
*indexOutput = idx;
|
||||
indexOutput++;
|
||||
indicesU32BE++;
|
||||
_maxIndex = std::max(_maxIndex, idx);
|
||||
_minIndex = std::min(_minIndex, idx);
|
||||
}
|
||||
// update min/max
|
||||
indexMax = std::max(indexMax, _maxIndex);
|
||||
indexMin = std::min(indexMin, _minIndex);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
template<typename T>
|
||||
|
@ -663,27 +773,31 @@ void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32
|
|||
{
|
||||
if (indexType == LatteIndexType::U16_BE)
|
||||
{
|
||||
#if defined(ARCH_X86_64)
|
||||
#if defined(ARCH_X86_64)
|
||||
if (g_CPUFeatures.x86.avx2)
|
||||
LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3)
|
||||
LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
else
|
||||
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#else
|
||||
#elif defined(__aarch64__)
|
||||
LatteIndices_fastConvertU16_NEON(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#else
|
||||
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
else if (indexType == LatteIndexType::U32_BE)
|
||||
{
|
||||
#if defined(ARCH_X86_64)
|
||||
#if defined(ARCH_X86_64)
|
||||
if (g_CPUFeatures.x86.avx2)
|
||||
LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
else
|
||||
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#else
|
||||
#elif defined(__aarch64__)
|
||||
LatteIndices_fastConvertU32_NEON(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#else
|
||||
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
else if (indexType == LatteIndexType::U16_LE)
|
||||
{
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue