cellPngDecDecodeData handles the CELL_PNGDEC_ARGB case much faster.

Profiling was done with two samples on Solar v2.1, from rpcs3 init to the first frame.
Before the optimization, the profiler found rpcs3 in cellPngDecDecodeData 15.3% of the time.
After the optimization, the profiler found rpcs3 in cellPngDecDecodeData 0.33% of the time, a ~50x improvement.
This commit is contained in:
Michael Yu 2014-05-19 02:14:07 -07:00
parent 9bdb12e3da
commit 3aeb0b0f95

View file

@ -201,7 +201,7 @@ int cellPngDecDecodeData(u32 mainHandle, u32 subHandle, mem8_ptr_t data, const m
case CELL_PNGDEC_ARGB: case CELL_PNGDEC_ARGB:
{ {
const char nComponents = 4; const int nComponents = 4;
image_size *= nComponents; image_size *= nComponents;
if (bytesPerLine > width * nComponents || flip) //check if we need padding if (bytesPerLine > width * nComponents || flip) //check if we need padding
{ {
@ -225,13 +225,19 @@ int cellPngDecDecodeData(u32 mainHandle, u32 subHandle, mem8_ptr_t data, const m
} }
else else
{ {
for (uint i = 0; i < image_size; i += nComponents) uint* dest = (uint*)new char[image_size];
uint* source_current = (uint*)&(image.get()[0]);
uint* dest_current = dest;
for (uint i = 0; i < image_size / nComponents; i++)
{ {
data += image.get()[i + 3]; uint val = *source_current;
data += image.get()[i + 0]; *dest_current = (val >> 24) | (val << 8); // set alpha (A8) as leftmost byte
data += image.get()[i + 1]; source_current++;
data += image.get()[i + 2]; dest_current++;
} }
// NOTE: AppendRawBytes has diff side-effect vs Memory.CopyFromReal
data.AppendRawBytes((u8*)dest, image_size);
delete[] dest;
} }
} }
break; break;