Cemu/src/Cafe/HW/Latte/Core/LatteIndices.cpp
2024-08-30 11:02:09 +02:00

780 lines
26 KiB
C++

#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "Cafe/HW/Latte/Renderer/Renderer.h"
#include "Cafe/HW/Latte/ISA/RegDefines.h"
#include "Common/cpu_features.h"
#if defined(ARCH_X86_64) && defined(__GNUC__)
#include <immintrin.h>
#endif
struct
{
const void* lastPtr;
uint32 lastCount;
LattePrimitiveMode lastPrimitiveMode;
LatteIndexType lastIndexType;
// output
uint32 indexMin;
uint32 indexMax;
Renderer::INDEX_TYPE renderIndexType;
uint32 outputCount;
uint32 indexBufferOffset;
uint32 indexBufferIndex;
}LatteIndexCache{};
void LatteIndices_invalidate(const void* memPtr, uint32 size)
{
if (LatteIndexCache.lastPtr >= memPtr && (LatteIndexCache.lastPtr < ((uint8*)memPtr + size)) )
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
}
}
void LatteIndices_invalidateAll()
{
LatteIndexCache.lastPtr = nullptr;
LatteIndexCache.lastCount = 0;
}
uint32 LatteIndices_calculateIndexOutputSize(LattePrimitiveMode primitiveMode, LatteIndexType indexType, uint32 count)
{
if (primitiveMode == LattePrimitiveMode::QUADS)
{
sint32 numQuads = count / 4;
if (indexType == LatteIndexType::AUTO)
{
if(count <= 0xFFFF)
return numQuads * 6 * sizeof(uint16);
return numQuads * 6 * sizeof(uint32);
}
if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE)
return numQuads * 6 * sizeof(uint16);
if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE)
return numQuads * 6 * sizeof(uint32);
cemu_assert_suspicious();
return 0;
}
else if (primitiveMode == LattePrimitiveMode::QUAD_STRIP)
{
if (count <= 3)
{
return 0;
}
sint32 numQuads = (count-2) / 2;
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
return numQuads * 6 * sizeof(uint16);
return numQuads * 6 * sizeof(uint32);
}
if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE)
return numQuads * 6 * sizeof(uint16);
if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE)
return numQuads * 6 * sizeof(uint32);
cemu_assert_suspicious();
return 0;
}
else if (primitiveMode == LattePrimitiveMode::LINE_LOOP)
{
count++; // one extra vertex to reconnect the LINE_STRIP to the beginning
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
return count * sizeof(uint16);
return count * sizeof(uint32);
}
if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE)
return count * sizeof(uint16);
if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE)
return count * sizeof(uint32);
cemu_assert_suspicious();
return 0;
}
else if(indexType == LatteIndexType::AUTO)
return 0;
else if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE)
return count * sizeof(uint16);
else if (indexType == LatteIndexType::U32_BE || indexType == LatteIndexType::U32_LE)
return count * sizeof(uint32);
return 0;
}
template<typename T>
void LatteIndices_convertBE(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (uint32 i = 0; i < count; i++)
{
T v = *src;
*dst = v;
indexMin = std::min(indexMin, (uint32)v);
indexMax = std::max(indexMax, (uint32)v);
dst++;
src++;
}
}
template<typename T>
void LatteIndices_convertLE(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
const T* src = (T*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (uint32 i = 0; i < count; i++)
{
T v = *src;
*dst = v;
indexMin = std::min(indexMin, (uint32)v);
indexMax = std::max(indexMax, (uint32)v);
dst++;
src++;
}
}
template<typename T>
void LatteIndices_unpackQuadsAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
sint32 numQuads = count / 4;
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < numQuads; i++)
{
T idx0 = src[0];
T idx1 = src[1];
T idx2 = src[2];
T idx3 = src[3];
indexMin = std::min(indexMin, (uint32)idx0);
indexMax = std::max(indexMax, (uint32)idx0);
indexMin = std::min(indexMin, (uint32)idx1);
indexMax = std::max(indexMax, (uint32)idx1);
indexMin = std::min(indexMin, (uint32)idx2);
indexMax = std::max(indexMax, (uint32)idx2);
indexMin = std::min(indexMin, (uint32)idx3);
indexMax = std::max(indexMax, (uint32)idx3);
dst[0] = idx0;
dst[1] = idx1;
dst[2] = idx2;
dst[3] = idx0;
dst[4] = idx2;
dst[5] = idx3;
src += 4;
dst += 6;
}
}
template<typename T>
void LatteIndices_generateAutoQuadIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
sint32 numQuads = count / 4;
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < numQuads; i++)
{
T idx0 = i * 4 + 0;
T idx1 = i * 4 + 1;
T idx2 = i * 4 + 2;
T idx3 = i * 4 + 3;
dst[0] = idx0;
dst[1] = idx1;
dst[2] = idx2;
dst[3] = idx0;
dst[4] = idx2;
dst[5] = idx3;
src += 4;
dst += 6;
}
indexMin = 0;
indexMax = std::max(count, 1u) - 1;
}
template<typename T>
void LatteIndices_unpackQuadStripAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
if (count <= 3)
return;
sint32 numQuads = (count - 2) / 2;
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < numQuads; i++)
{
T idx0 = src[0];
T idx1 = src[1];
T idx2 = src[2];
T idx3 = src[3];
indexMin = std::min(indexMin, (uint32)idx0);
indexMax = std::max(indexMax, (uint32)idx0);
indexMin = std::min(indexMin, (uint32)idx1);
indexMax = std::max(indexMax, (uint32)idx1);
indexMin = std::min(indexMin, (uint32)idx2);
indexMax = std::max(indexMax, (uint32)idx2);
indexMin = std::min(indexMin, (uint32)idx3);
indexMax = std::max(indexMax, (uint32)idx3);
dst[0] = idx0;
dst[1] = idx1;
dst[2] = idx2;
dst[3] = idx2;
dst[4] = idx1;
dst[5] = idx3;
src += 2;
dst += 6;
}
}
template<typename T>
void LatteIndices_unpackLineLoopAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
if (count <= 0)
return;
const betype<T>* src = (betype<T>*)indexDataInput;
T firstIndex = *src;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < (sint32)count; i++)
{
T idx = *src;
indexMin = std::min(indexMin, (uint32)idx);
indexMax = std::max(indexMax, (uint32)idx);
*dst = idx;
src++;
dst++;
}
*dst = firstIndex;
}
template<typename T>
void LatteIndices_generateAutoQuadStripIndices(void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
if (count <= 3)
return;
sint32 numQuads = (count - 2) / 2;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < numQuads; i++)
{
T idx0 = i * 2 + 0;
T idx1 = i * 2 + 1;
T idx2 = i * 2 + 2;
T idx3 = i * 2 + 3;
dst[0] = idx0;
dst[1] = idx1;
dst[2] = idx2;
dst[3] = idx2;
dst[4] = idx1;
dst[5] = idx3;
dst += 6;
}
indexMin = 0;
indexMax = std::max(count, 1u) - 1;
}
template<typename T>
void LatteIndices_generateAutoLineLoopIndices(void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
if (count == 0)
return;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < (sint32)count; i++)
{
*dst = (T)i;
dst++;
}
*dst = 0;
dst++;
indexMin = 0;
indexMax = std::max(count, 1u) - 1;
}
template<typename T>
void LatteIndices_unpackTriangleFanAndConvert(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
// TODO: check this
for (sint32 i = 0; i < count; i++)
{
uint32 i0;
if (i % 2 == 0)
i0 = i / 2;
else
i0 = count - 1 - i / 2;
T idx = src[i0];
indexMin = std::min(indexMin, (uint32)idx);
indexMax = std::max(indexMax, (uint32)idx);
dst[i] = idx;
}
}
template<typename T>
void LatteIndices_generateAutoTriangleFanIndices(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
const betype<T>* src = (betype<T>*)indexDataInput;
T* dst = (T*)indexDataOutput;
for (sint32 i = 0; i < count; i++)
{
T idx = i;
if (idx % 2 == 0)
idx = idx / 2;
else
idx = count - 1 - idx / 2;
dst[i] = idx;
}
indexMin = 0;
indexMax = std::max(count, 1u) - 1;
}
#if defined(ARCH_X86_64)
ATTRIBUTE_AVX2
void LatteIndices_fastConvertU16_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
// using AVX + AVX2 we can process 16 indices at a time
const uint16* indicesU16BE = (const uint16*)indexDataInput;
uint16* indexOutput = (uint16*)indexDataOutput;
sint32 count16 = count >> 4;
sint32 countRemaining = count & 15;
if (count16)
{
__m256i mMin = _mm256_set_epi16((sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF,
(sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF, (sint16)0xFFFF);
__m256i mMax = _mm256_set_epi16(0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000);
__m256i mShuffle16Swap = _mm256_set_epi8(30, 31, 28, 29, 26, 27, 24, 25, 22, 23, 20, 21, 18, 19, 16, 17, 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
do
{
__m256i mIndexData = _mm256_loadu_si256((const __m256i*)indicesU16BE);
indicesU16BE += 16;
_mm_prefetch((const char*)indicesU16BE, _MM_HINT_T0);
// endian swap
mIndexData = _mm256_shuffle_epi8(mIndexData, mShuffle16Swap);
_mm256_store_si256((__m256i*)indexOutput, mIndexData);
mMin = _mm256_min_epu16(mIndexData, mMin);
mMax = _mm256_max_epu16(mIndexData, mMax);
indexOutput += 16;
} while (--count16);
// fold 32 to 16 byte
mMin = _mm256_min_epu16(mMin, _mm256_permute2x128_si256(mMin, mMin, 1));
mMax = _mm256_max_epu16(mMax, _mm256_permute2x128_si256(mMax, mMax, 1));
// fold 16 to 8 byte
mMin = _mm256_min_epu16(mMin, _mm256_shuffle_epi32(mMin, (2 << 0) | (3 << 2) | (2 << 4) | (3 << 6)));
mMax = _mm256_max_epu16(mMax, _mm256_shuffle_epi32(mMax, (2 << 0) | (3 << 2) | (2 << 4) | (3 << 6)));
uint16* mMinU16 = (uint16*)&mMin;
uint16* mMaxU16 = (uint16*)&mMax;
indexMin = std::min(indexMin, (uint32)mMinU16[0]);
indexMin = std::min(indexMin, (uint32)mMinU16[1]);
indexMin = std::min(indexMin, (uint32)mMinU16[2]);
indexMin = std::min(indexMin, (uint32)mMinU16[3]);
indexMax = std::max(indexMax, (uint32)mMaxU16[0]);
indexMax = std::max(indexMax, (uint32)mMaxU16[1]);
indexMax = std::max(indexMax, (uint32)mMaxU16[2]);
indexMax = std::max(indexMax, (uint32)mMaxU16[3]);
}
// process remaining indices
uint32 _minIndex = 0xFFFFFFFF;
uint32 _maxIndex = 0;
for (sint32 i = countRemaining; (--i) >= 0;)
{
uint16 idx = _swapEndianU16(*indicesU16BE);
*indexOutput = idx;
indexOutput++;
indicesU16BE++;
_maxIndex = std::max(_maxIndex, (uint32)idx);
_minIndex = std::min(_minIndex, (uint32)idx);
}
// update min/max
indexMax = std::max(indexMax, _maxIndex);
indexMin = std::min(indexMin, _minIndex);
}
ATTRIBUTE_SSE41
void LatteIndices_fastConvertU16_SSE41(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
// SSSE3 & SSE4.1 optimized decoding
const uint16* indicesU16BE = (const uint16*)indexDataInput;
uint16* indexOutput = (uint16*)indexDataOutput;
sint32 count8 = count >> 3;
sint32 countRemaining = count & 7;
if (count8)
{
__m128i mMin = _mm_set_epi16((short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF, (short)0xFFFF);
__m128i mMax = _mm_set_epi16(0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000);
__m128i mTemp;
__m128i* mRawIndices = (__m128i*)indicesU16BE;
indicesU16BE += count8 * 8;
__m128i* mOutputIndices = (__m128i*)indexOutput;
indexOutput += count8 * 8;
__m128i shufmask = _mm_set_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1);
while (count8--)
{
mTemp = _mm_loadu_si128(mRawIndices);
mRawIndices++;
mTemp = _mm_shuffle_epi8(mTemp, shufmask);
mMin = _mm_min_epu16(mMin, mTemp);
mMax = _mm_max_epu16(mMax, mTemp);
_mm_store_si128(mOutputIndices, mTemp);
mOutputIndices++;
}
uint16* mMaxU16 = (uint16*)&mMax;
uint16* mMinU16 = (uint16*)&mMin;
indexMax = std::max(indexMax, (uint32)mMaxU16[0]);
indexMax = std::max(indexMax, (uint32)mMaxU16[1]);
indexMax = std::max(indexMax, (uint32)mMaxU16[2]);
indexMax = std::max(indexMax, (uint32)mMaxU16[3]);
indexMax = std::max(indexMax, (uint32)mMaxU16[4]);
indexMax = std::max(indexMax, (uint32)mMaxU16[5]);
indexMax = std::max(indexMax, (uint32)mMaxU16[6]);
indexMax = std::max(indexMax, (uint32)mMaxU16[7]);
indexMin = std::min(indexMin, (uint32)mMinU16[0]);
indexMin = std::min(indexMin, (uint32)mMinU16[1]);
indexMin = std::min(indexMin, (uint32)mMinU16[2]);
indexMin = std::min(indexMin, (uint32)mMinU16[3]);
indexMin = std::min(indexMin, (uint32)mMinU16[4]);
indexMin = std::min(indexMin, (uint32)mMinU16[5]);
indexMin = std::min(indexMin, (uint32)mMinU16[6]);
indexMin = std::min(indexMin, (uint32)mMinU16[7]);
}
uint32 _minIndex = 0xFFFFFFFF;
uint32 _maxIndex = 0;
for (sint32 i = countRemaining; (--i) >= 0;)
{
uint16 idx = _swapEndianU16(*indicesU16BE);
*indexOutput = idx;
indexOutput++;
indicesU16BE++;
_maxIndex = std::max(_maxIndex, (uint32)idx);
_minIndex = std::min(_minIndex, (uint32)idx);
}
indexMax = std::max(indexMax, _maxIndex);
indexMin = std::min(indexMin, _minIndex);
}
ATTRIBUTE_AVX2
void LatteIndices_fastConvertU32_AVX2(const void* indexDataInput, void* indexDataOutput, uint32 count, uint32& indexMin, uint32& indexMax)
{
// using AVX + AVX2 we can process 8 indices at a time
const uint32* indicesU32BE = (const uint32*)indexDataInput;
uint32* indexOutput = (uint32*)indexDataOutput;
sint32 count8 = count >> 3;
sint32 countRemaining = count & 7;
if (count8)
{
__m256i mMin = _mm256_set_epi32((sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF, (sint32)0xFFFFFFFF);
__m256i mMax = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, 0);
__m256i mShuffle32Swap = _mm256_set_epi8(28,29,30,31,
24,25,26,27,
20,21,22,23,
16,17,18,19,
12,13,14,15,
8,9,10,11,
4,5,6,7,
0,1,2,3);
// unaligned
do
{
__m256i mIndexData = _mm256_loadu_si256((const __m256i*)indicesU32BE);
indicesU32BE += 8;
_mm_prefetch((const char*)indicesU32BE, _MM_HINT_T0);
// endian swap
mIndexData = _mm256_shuffle_epi8(mIndexData, mShuffle32Swap);
_mm256_store_si256((__m256i*)indexOutput, mIndexData);
mMin = _mm256_min_epu32(mIndexData, mMin);
mMax = _mm256_max_epu32(mIndexData, mMax);
indexOutput += 8;
} while (--count8);
// fold 32 to 16 byte
mMin = _mm256_min_epu32(mMin, _mm256_permute2x128_si256(mMin, mMin, 1));
mMax = _mm256_max_epu32(mMax, _mm256_permute2x128_si256(mMax, mMax, 1));
// fold 16 to 8 byte
mMin = _mm256_min_epu32(mMin, _mm256_shuffle_epi32(mMin, (2 << 0) | (3 << 2) | (2 << 4) | (3 << 6)));
mMax = _mm256_max_epu32(mMax, _mm256_shuffle_epi32(mMax, (2 << 0) | (3 << 2) | (2 << 4) | (3 << 6)));
uint32* mMinU32 = (uint32*)&mMin;
uint32* mMaxU32 = (uint32*)&mMax;
indexMin = std::min(indexMin, (uint32)mMinU32[0]);
indexMin = std::min(indexMin, (uint32)mMinU32[1]);
indexMax = std::max(indexMax, (uint32)mMaxU32[0]);
indexMax = std::max(indexMax, (uint32)mMaxU32[1]);
}
// process remaining indices
uint32 _minIndex = 0xFFFFFFFF;
uint32 _maxIndex = 0;
for (sint32 i = countRemaining; (--i) >= 0;)
{
uint32 idx = _swapEndianU32(*indicesU32BE);
*indexOutput = idx;
indexOutput++;
indicesU32BE++;
_maxIndex = std::max(_maxIndex, (uint32)idx);
_minIndex = std::min(_minIndex, (uint32)idx);
}
// update min/max
indexMax = std::max(indexMax, _maxIndex);
indexMin = std::min(indexMin, _minIndex);
}
#endif
template<typename T>
void _LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, uint32 count, uint32 primitiveRestartIndex, uint32& indexMin, uint32& indexMax)
{
cemu_assert_debug(count != 0);
const betype<T>* idxPtrT = (betype<T>*)indexData;
T _indexMin = *idxPtrT;
T _indexMax = *idxPtrT;
cemu_assert_debug(primitiveRestartIndex <= std::numeric_limits<T>::max());
T restartIndexT = (T)primitiveRestartIndex;
while (count)
{
T idx = *idxPtrT;
if (idx != restartIndexT)
{
_indexMin = std::min(_indexMin, idx);
_indexMax = std::max(_indexMax, idx);
}
idxPtrT++;
count--;
}
indexMin = _indexMin;
indexMax = _indexMax;
}
// calculate min and max index while taking primitive restart into account
// fallback implementation in case the fast path gives us invalid results
void LatteIndices_alternativeCalculateIndexMinMax(const void* indexData, LatteIndexType indexType, uint32 count, uint32& indexMin, uint32& indexMax)
{
if (count == 0)
{
indexMin = 0;
indexMax = 0;
return;
}
uint32 primitiveRestartIndex = LatteGPUState.contextNew.VGT_MULTI_PRIM_IB_RESET_INDX.get_RESTART_INDEX();
if (indexType == LatteIndexType::U16_BE)
{
_LatteIndices_alternativeCalculateIndexMinMax<uint16>(indexData, count, primitiveRestartIndex, indexMin, indexMax);
}
else if (indexType == LatteIndexType::U32_BE)
{
_LatteIndices_alternativeCalculateIndexMinMax<uint32>(indexData, count, primitiveRestartIndex, indexMin, indexMax);
}
else
{
cemu_assert_debug(false);
}
}
void LatteIndices_decode(const void* indexData, LatteIndexType indexType, uint32 count, LattePrimitiveMode primitiveMode, uint32& indexMin, uint32& indexMax, Renderer::INDEX_TYPE& renderIndexType, uint32& outputCount, uint32& indexBufferOffset, uint32& indexBufferIndex)
{
// what this should do:
// [x] use fast SIMD-based index decoding
// [x] unpack QUAD indices to triangle indices
// [x] calculate min and max index, be careful about primitive restart index
// [x] decode data directly into coherent memory buffer?
// [ ] better cache implementation, allow to cache across frames
// reuse from cache if data didn't change
if (LatteIndexCache.lastPtr == indexData &&
LatteIndexCache.lastCount == count &&
LatteIndexCache.lastPrimitiveMode == primitiveMode &&
LatteIndexCache.lastIndexType == indexType)
{
indexMin = LatteIndexCache.indexMin;
indexMax = LatteIndexCache.indexMax;
renderIndexType = LatteIndexCache.renderIndexType;
outputCount = LatteIndexCache.outputCount;
indexBufferOffset = LatteIndexCache.indexBufferOffset;
indexBufferIndex = LatteIndexCache.indexBufferIndex;
return;
}
outputCount = 0;
if (indexType == LatteIndexType::AUTO)
renderIndexType = Renderer::INDEX_TYPE::NONE;
else if (indexType == LatteIndexType::U16_BE || indexType == LatteIndexType::U16_LE)
renderIndexType = Renderer::INDEX_TYPE::U16;
else if (indexType == LatteIndexType::U32_BE)
renderIndexType = Renderer::INDEX_TYPE::U32;
else
cemu_assert_debug(false);
uint32 primitiveRestartIndex = LatteGPUState.contextNew.VGT_MULTI_PRIM_IB_RESET_INDX.get_RESTART_INDEX();
// calculate index output size
uint32 indexOutputSize = LatteIndices_calculateIndexOutputSize(primitiveMode, indexType, count);
if (indexOutputSize == 0)
{
outputCount = count;
indexMin = 0;
indexMax = std::max(count, 1u)-1;
renderIndexType = Renderer::INDEX_TYPE::NONE;
return; // no indices
}
// query index buffer from renderer
void* indexOutputPtr = g_renderer->indexData_reserveIndexMemory(indexOutputSize, indexBufferOffset, indexBufferIndex);
// decode indices
indexMin = std::numeric_limits<uint32>::max();
indexMax = std::numeric_limits<uint32>::min();
if (primitiveMode == LattePrimitiveMode::QUADS)
{
// unpack quads into triangles
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
{
LatteIndices_generateAutoQuadIndices<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U16;
}
else
{
LatteIndices_generateAutoQuadIndices<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U32;
}
}
else if (indexType == LatteIndexType::U16_BE)
LatteIndices_unpackQuadsAndConvert<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (indexType == LatteIndexType::U32_BE)
LatteIndices_unpackQuadsAndConvert<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
else
cemu_assert_debug(false);
outputCount = count / 4 * 6;
}
else if (primitiveMode == LattePrimitiveMode::QUAD_STRIP)
{
// unpack quad strip into triangles
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
{
LatteIndices_generateAutoQuadStripIndices<uint16>(indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U16;
}
else
{
LatteIndices_generateAutoQuadStripIndices<uint32>(indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U32;
}
}
else if (indexType == LatteIndexType::U16_BE)
LatteIndices_unpackQuadStripAndConvert<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (indexType == LatteIndexType::U32_BE)
LatteIndices_unpackQuadStripAndConvert<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
else
cemu_assert_debug(false);
if (count >= 2)
outputCount = (count - 2) / 2 * 6;
else
outputCount = 0;
}
else if (primitiveMode == LattePrimitiveMode::LINE_LOOP)
{
// unpack line loop into line strip with extra reconnecting vertex
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
{
LatteIndices_generateAutoLineLoopIndices<uint16>(indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U16;
}
else
{
LatteIndices_generateAutoLineLoopIndices<uint32>(indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U32;
}
}
else if (indexType == LatteIndexType::U16_BE)
LatteIndices_unpackLineLoopAndConvert<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (indexType == LatteIndexType::U32_BE)
LatteIndices_unpackLineLoopAndConvert<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
else
cemu_assert_debug(false);
outputCount = count + 1;
}
else if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN && g_renderer->GetType() == RendererAPI::Metal)
{
if (indexType == LatteIndexType::AUTO)
{
if (count <= 0xFFFF)
{
LatteIndices_generateAutoTriangleFanIndices<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U16;
}
else
{
LatteIndices_generateAutoTriangleFanIndices<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
renderIndexType = Renderer::INDEX_TYPE::U32;
}
}
else if (indexType == LatteIndexType::U16_BE)
LatteIndices_unpackTriangleFanAndConvert<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (indexType == LatteIndexType::U32_BE)
LatteIndices_unpackTriangleFanAndConvert<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
else
cemu_assert_debug(false);
outputCount = count;
}
else
{
if (indexType == LatteIndexType::U16_BE)
{
#if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU16_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else if (g_CPUFeatures.x86.sse4_1 && g_CPUFeatures.x86.ssse3)
LatteIndices_fastConvertU16_SSE41(indexData, indexOutputPtr, count, indexMin, indexMax);
else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
}
else if (indexType == LatteIndexType::U32_BE)
{
#if defined(ARCH_X86_64)
if (g_CPUFeatures.x86.avx2)
LatteIndices_fastConvertU32_AVX2(indexData, indexOutputPtr, count, indexMin, indexMax);
else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#else
LatteIndices_convertBE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
#endif
}
else if (indexType == LatteIndexType::U16_LE)
{
LatteIndices_convertLE<uint16>(indexData, indexOutputPtr, count, indexMin, indexMax);
}
else if (indexType == LatteIndexType::U32_LE)
{
LatteIndices_convertLE<uint32>(indexData, indexOutputPtr, count, indexMin, indexMax);
}
else
cemu_assert_debug(false);
outputCount = count;
}
// the above algorithms use a simplistic approach to get indexMin/indexMax
// here we make sure primitive restart indices dont influence the index range
if (primitiveRestartIndex == indexMin || primitiveRestartIndex == indexMax)
{
// recalculate index range but filter out primitive restart index
LatteIndices_alternativeCalculateIndexMinMax(indexData, indexType, count, indexMin, indexMax);
}
g_renderer->indexData_uploadIndexMemory(indexBufferIndex, indexBufferOffset, indexOutputSize);
// update cache
LatteIndexCache.lastPtr = indexData;
LatteIndexCache.lastCount = count;
LatteIndexCache.lastPrimitiveMode = primitiveMode;
LatteIndexCache.lastIndexType = indexType;
LatteIndexCache.indexMin = indexMin;
LatteIndexCache.indexMax = indexMax;
LatteIndexCache.renderIndexType = renderIndexType;
LatteIndexCache.outputCount = outputCount;
LatteIndexCache.indexBufferOffset = indexBufferOffset;
LatteIndexCache.indexBufferIndex = indexBufferIndex;
}