#include "Cafe/HW/Latte/Renderer/Metal/MetalRenderer.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalMemoryManager.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureViewMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/RendererShaderMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/CachedFBOMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalOutputShaderCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalPipelineCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalDepthStencilCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalSamplerCache.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteTextureReadbackMtl.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalVoidVertexPipeline.h"
#include "Cafe/HW/Latte/Renderer/Metal/MetalQuery.h"
#include "Cafe/HW/Latte/Renderer/Metal/LatteToMtl.h"

#include "Cafe/HW/Latte/Renderer/Metal/UtilityShaderSource.h"

#include "Cafe/HW/Latte/Core/LatteShader.h"
#include "Cafe/HW/Latte/Core/LatteIndices.h"
#include "Cafe/HW/Latte/Core/LatteBufferCache.h"
#include "CafeSystem.h"
#include "Cemu/Logging/CemuLogging.h"
#include "Cafe/HW/Latte/Core/FetchShader.h"
#include "Cafe/HW/Latte/Core/LatteConst.h"
#include "HW/Latte/Renderer/Metal/MetalBufferAllocator.h"
#include "HW/Latte/Renderer/Metal/MetalCommon.h"
#include "config/CemuConfig.h"
#include "gui/guiWrapper.h"

#define IMGUI_IMPL_METAL_CPP
#include "imgui/imgui_extension.h"
#include "imgui/imgui_impl_metal.h"

#define EVENT_VALUE_WRAP 4096

extern bool hasValidFramebufferAttached;

float supportBufferData[512 * 4];

// Defined in the OpenGL renderer
void LatteDraw_handleSpecialState8_clearAsDepth();

// Enumerates all Metal devices and returns one entry per device, built from the device name and its registry ID.
// NOTE(review): the element type is assumed to be MetalRenderer::DeviceInfo (constructible from name + registry ID) - confirm against MetalRenderer.h
std::vector<MetalRenderer::DeviceInfo> MetalRenderer::GetDevices()
{
	auto devices = MTL::CopyAllDevices();
	std::vector<MetalRenderer::DeviceInfo> result;
	result.reserve(devices->count());
	for (uint32 i = 0; i < devices->count(); i++)
	{
		MTL::Device* device = static_cast<MTL::Device*>(devices->object(i));
		result.emplace_back(std::string(device->name()->utf8String()), device->registryID());
	}
	devices->release();

	return result;
}

// Resolves the position invariance option, picks the Metal device, queries feature support and
// creates the command queue, samplers, null textures, caches, occlusion query buffer and utility pipelines.
MetalRenderer::MetalRenderer()
{
	// Options

	// Position invariance
	switch (g_current_game_profile->GetPositionInvariance())
	{
	case PositionInvariance::Auto:
		switch (CafeSystem::GetForegroundTitleId())
		{
		// Mario Kart 8
		case 0x000500001010ED00: // EUR
		case 0x000500001010EC00: // USA
		case 0x000500001010EB00: // JPN
		case 0x0005000010183A00: // JPN (TODO: check)

		// Minecraft: Story Mode
		case 0x000500001020A300: // EUR
		case 0x00050000101E0100: // USA
		//case 0x000500001020a200: // USA

		// Resident Evil: Revelations
		case 0x000500001012B400: // EUR
		case 0x000500001012CF00: // USA

		// The Legend of Zelda: Breath of the Wild
		case 0x00050000101C9500: // EUR
		case 0x00050000101C9400: // USA
		case 0x00050000101C9300: // JPN

		// Ninja Gaiden 3: Razor's Edge
		case 0x0005000010110B00: // EUR
		case 0x0005000010139B00: // EUR (TODO: check)
		case 0x0005000010110A00: // USA
		case 0x0005000010110900: // JPN

		// Bayonetta 2
		case 0x0005000010172700: // EUR
		case 0x0005000010172600: // USA

		// LEGO STAR WARS: The Force Awakens
		case 0x00050000101DAA00: // EUR
		case 0x00050000101DAB00: // USA

		// Bayonetta
		case 0x0005000010157F00: // EUR
		case 0x0005000010157E00: // USA
		case 0x000500001014DB00: // JPN

		// Disney Planes
		case 0x0005000010136900: // EUR
		case 0x0005000010136A00: // EUR
		case 0x0005000010136B00: // EUR
		case 0x000500001011C500: // USA (TODO: check)

		// Wonderful 101
		case 0x0005000010135300: // EUR
		case 0x000500001012DC00: // USA
		case 0x0005000010116300: // JPN
		case 0x0005000010185600: // JPN
			m_positionInvariance = true;
			break;
		default:
			m_positionInvariance = false;
			break;
		}
		// Leave the outer switch here; falling through into the False case would discard the per-title decision
		break;
	case PositionInvariance::False:
		m_positionInvariance = false;
		break;
	case PositionInvariance::True:
		m_positionInvariance = true;
		break;
	}

	// Pick a device
	auto& config = GetConfig();
	const bool hasDeviceSet = config.mtl_graphic_device_uuid != 0;

	// If a device is set, try to find it
	if (hasDeviceSet)
	{
		auto devices = MTL::CopyAllDevices();
		for (uint32 i = 0; i < devices->count(); i++)
		{
			MTL::Device* device = static_cast<MTL::Device*>(devices->object(i));
			if (device->registryID() == config.mtl_graphic_device_uuid)
			{
				// Take our own reference: the array is released right below and the destructor releases m_device
				m_device = device->retain();
				break;
			}
		}
		devices->release();
	}

	if (!m_device)
	{
		if (hasDeviceSet)
		{
			cemuLog_log(LogType::Force, "The selected GPU ({}) could not be found. Using the system default device.", config.mtl_graphic_device_uuid);
			config.mtl_graphic_device_uuid = 0;
		}

		// Use the system default device
		m_device = MTL::CreateSystemDefaultDevice();
	}

	// Feature support
	m_isAppleGPU = m_device->supportsFamily(MTL::GPUFamilyApple1);
	m_supportsFramebufferFetch = GetConfig().framebuffer_fetch.GetValue() ? m_device->supportsFamily(MTL::GPUFamilyApple2) : false;
	m_hasUnifiedMemory = m_device->hasUnifiedMemory();
	m_supportsMetal3 = m_device->supportsFamily(MTL::GPUFamilyMetal3);
	m_recommendedMaxVRAMUsage = m_device->recommendedMaxWorkingSetSize();
	m_pixelFormatSupport = MetalPixelFormatSupport(m_device);

	CheckForPixelFormatSupport(m_pixelFormatSupport);

	// Command queue
	m_commandQueue = m_device->newCommandQueue();

	// Synchronization resources
	m_event = m_device->newEvent();

	// Resources
	MTL::SamplerDescriptor* samplerDescriptor = MTL::SamplerDescriptor::alloc()->init();
#ifdef CEMU_DEBUG_ASSERT
	samplerDescriptor->setLabel(GetLabel("Nearest sampler state", samplerDescriptor));
#endif
	m_nearestSampler = m_device->newSamplerState(samplerDescriptor);

	// The same descriptor is reused for the linear sampler, only the filters change
	samplerDescriptor->setMinFilter(MTL::SamplerMinMagFilterLinear);
	samplerDescriptor->setMagFilter(MTL::SamplerMinMagFilterLinear);
#ifdef CEMU_DEBUG_ASSERT
	samplerDescriptor->setLabel(GetLabel("Linear sampler state", samplerDescriptor));
#endif
	m_linearSampler = m_device->newSamplerState(samplerDescriptor);
	samplerDescriptor->release();

	// Null resources
	MTL::TextureDescriptor* textureDescriptor = MTL::TextureDescriptor::alloc()->init();
	textureDescriptor->setTextureType(MTL::TextureType1D);
	textureDescriptor->setWidth(1);
	textureDescriptor->setUsage(MTL::TextureUsageShaderRead);
	m_nullTexture1D = m_device->newTexture(textureDescriptor);
#ifdef CEMU_DEBUG_ASSERT
	m_nullTexture1D->setLabel(GetLabel("Null texture 1D", m_nullTexture1D));
#endif

	textureDescriptor->setTextureType(MTL::TextureType2D);
	textureDescriptor->setHeight(1);
	textureDescriptor->setUsage(MTL::TextureUsageShaderRead | MTL::TextureUsageRenderTarget);
	m_nullTexture2D = m_device->newTexture(textureDescriptor);
#ifdef CEMU_DEBUG_ASSERT
	m_nullTexture2D->setLabel(GetLabel("Null texture 2D", m_nullTexture2D));
#endif
	textureDescriptor->release();

	m_memoryManager = new MetalMemoryManager(this);
	m_outputShaderCache = new MetalOutputShaderCache(this);
	m_pipelineCache = new MetalPipelineCache(this);
	m_depthStencilCache = new MetalDepthStencilCache(this);
	m_samplerCache = new MetalSamplerCache(this);

	// Lower the commit threshold when buffer cache needs reduced latency
	if (m_memoryManager->NeedsReducedLatency())
		m_defaultCommitTreshlod = 64;
	else
		m_defaultCommitTreshlod = 196;

	// Occlusion queries
	m_occlusionQuery.m_resultBuffer = m_device->newBuffer(OCCLUSION_QUERY_POOL_SIZE * sizeof(uint64), MTL::ResourceStorageModeShared);
#ifdef CEMU_DEBUG_ASSERT
	m_occlusionQuery.m_resultBuffer->setLabel(GetLabel("Occlusion query result buffer", m_occlusionQuery.m_resultBuffer));
#endif
	m_occlusionQuery.m_resultsPtr = (uint64*)m_occlusionQuery.m_resultBuffer->contents();

	// Reset vertex and uniform buffers
	for (uint32 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++)
		m_state.m_vertexBufferOffsets[i] = INVALID_OFFSET;

	for (uint32 i = 0; i < METAL_SHADER_TYPE_TOTAL; i++)
	{
		for (uint32 j = 0; j < MAX_MTL_BUFFERS; j++)
			m_state.m_uniformBufferOffsets[i][j] = INVALID_OFFSET;
	}

	// Utility shader library

	// Create the library
	NS::Error* error = nullptr;
	MTL::Library* utilityLibrary = m_device->newLibrary(ToNSString(utilityShaderSource), nullptr, &error);
	if (error)
	{
		cemuLog_log(LogType::Force, "failed to create utility library (error: {})", error->localizedDescription()->utf8String());
	}

	// Pipelines
	MTL::Function* vertexFullscreenFunction = utilityLibrary->newFunction(ToNSString("vertexFullscreen"));
	MTL::Function* fragmentCopyDepthToColorFunction = utilityLibrary->newFunction(ToNSString("fragmentCopyDepthToColor"));

	m_copyDepthToColorDesc = MTL::RenderPipelineDescriptor::alloc()->init();
	m_copyDepthToColorDesc->setVertexFunction(vertexFullscreenFunction);
	m_copyDepthToColorDesc->setFragmentFunction(fragmentCopyDepthToColorFunction);
	vertexFullscreenFunction->release();
	fragmentCopyDepthToColorFunction->release();

	// Void vertex pipelines
	if (m_isAppleGPU)
		m_copyBufferToBufferPipeline = new MetalVoidVertexPipeline(this, utilityLibrary, "vertexCopyBufferToBuffer");

	utilityLibrary->release();

	// HACK: for some reason, this variable ends up being initialized to some garbage data, even though its declared as bool m_captureFrame = false;
	m_occlusionQuery.m_lastCommandBuffer = nullptr;
	m_captureFrame = false;
}

// Releases every object created by the constructor plus the lazily created pipelines and buffers.
MetalRenderer::~MetalRenderer()
{
	// The pipeline is only created on Apple GPUs (see constructor)
	if (m_isAppleGPU)
		delete m_copyBufferToBufferPipeline;
	//delete m_copyTextureToTexturePipeline;
	//delete m_restrideBufferPipeline;

	m_copyDepthToColorDesc->release();
	for (const auto [pixelFormat, pipeline] : m_copyDepthToColorPipelines)
		pipeline->release();

	delete m_outputShaderCache;
	delete m_pipelineCache;
	delete m_depthStencilCache;
	delete m_samplerCache;
	delete m_memoryManager;

	m_nullTexture1D->release();
	m_nullTexture2D->release();

	m_nearestSampler->release();
	m_linearSampler->release();

	if (m_readbackBuffer)
		m_readbackBuffer->release();

	if (m_xfbRingBuffer)
		m_xfbRingBuffer->release();

	m_occlusionQuery.m_resultBuffer->release();

	m_event->release();

	m_commandQueue->release();
	m_device->release();
}

// Creates the layer handle for the main or pad window and sets its pixel format to BGRA8Unorm.
void MetalRenderer::InitializeLayer(const Vector2i& size, bool mainWindow)
{
	auto& layer = GetLayer(mainWindow);
	layer = MetalLayerHandle(m_device, size, mainWindow);
	layer.GetLayer()->setPixelFormat(MTL::PixelFormatBGRA8Unorm);
}

// Replaces the selected layer handle with a default-constructed one.
void MetalRenderer::ShutdownLayer(bool mainWindow)
{
	GetLayer(mainWindow) = MetalLayerHandle();
}

// Forwards the new size to the selected layer.
void MetalRenderer::ResizeLayer(const Vector2i& size, bool mainWindow)
{
	GetLayer(mainWindow).Resize(size);
}

// Initializes the base renderer and the Metal shader subsystem.
void MetalRenderer::Initialize()
{
	Renderer::Initialize();
	RendererShaderMtl::Initialize();
}

// Shuts down the ImGui backend, commits the current command buffer, then shuts down the base renderer and shader subsystem.
void MetalRenderer::Shutdown()
{
	// TODO: should shutdown both layers
	ImGui_ImplMetal_Shutdown();
	CommitCommandBuffer();
	Renderer::Shutdown();
	RendererShaderMtl::Shutdown();
}

// Returns true when the pad (non-main) layer handle holds a layer.
bool MetalRenderer::IsPadWindowActive()
{
	return (GetLayer(false).GetLayer() != nullptr);
}

// Reports used and recommended maximum VRAM in megabytes; always returns true.
bool MetalRenderer::GetVRAMInfo(int& usageInMB, int& totalInMB) const
{
	// Subtract host memory from total VRAM, since it's shared with the CPU
	usageInMB = (m_device->currentAllocatedSize() - m_memoryManager->GetHostAllocationSize()) / 1024 / 1024;
	totalInMB = m_recommendedMaxVRAMUsage / 1024 / 1024;

	return true;
}

// Clears the drawable of the TV (padView == false) or pad (padView == true) layer to all zeros.
void MetalRenderer::ClearColorbuffer(bool padView)
{
	if (!AcquireDrawable(!padView))
		return;

	ClearColorTextureInternal(GetLayer(!padView).GetDrawable()->texture(), 0, 0, 0.0f, 0.0f, 0.0f, 0.0f);
}

// Begins a frame on the selected window and immediately swaps it without drawing.
void MetalRenderer::DrawEmptyFrame(bool mainWindow)
{
	if (!BeginFrame(mainWindow))
		return;
	SwapBuffers(mainWindow, !mainWindow);
}

// Swaps the requested layers, commits the command buffer, resets per-frame stats and advances GPU capture state.
void MetalRenderer::SwapBuffers(bool swapTV, bool swapDRC)
{
	if (swapTV)
		SwapBuffer(true);
	if (swapDRC)
		SwapBuffer(false);

	// Reset the command buffers (they are released by TemporaryBufferAllocator)
	CommitCommandBuffer();

	// Debug
	m_performanceMonitor.ResetPerFrameData();

	// GPU capture
	if (m_capturing)
	{
		EndCapture();
	}
	else if (m_captureFrame)
	{
		StartCapture();
		m_captureFrame = false;
	}
}

// Handles a pending screenshot request: copies the view's base texture into a CPU-visible buffer,
// converts it to tightly packed RGB and hands it to SaveScreenshot. Only RGBA8Unorm and RGBA8Unorm_sRGB are supported.
void MetalRenderer::HandleScreenshotRequest(LatteTextureView* texView, bool padView)
{
	const bool hasScreenshotRequest = gui_hasScreenshotRequest();
	if (!hasScreenshotRequest && m_screenshot_state == ScreenshotState::None)
		return;

	if (m_mainLayer.GetDrawable())
	{
		// we already took a pad view screenshot and want a main window screenshot
		if (m_screenshot_state == ScreenshotState::Main && padView)
			return;

		if (m_screenshot_state == ScreenshotState::Pad && !padView)
			return;

		// remember which screenshot is left to take
		if (m_screenshot_state == ScreenshotState::None)
			m_screenshot_state = padView ? ScreenshotState::Main : ScreenshotState::Pad;
		else
			m_screenshot_state = ScreenshotState::None;
	}
	else
		m_screenshot_state = ScreenshotState::None;

	auto texMtl = static_cast<LatteTextureMtl*>(texView->baseTexture);

	int width, height;
	texMtl->GetEffectiveSize(width, height, 0);

	uint32 bytesPerRow = GetMtlTextureBytesPerRow(texMtl->format, texMtl->isDepth, width);
	uint32 size = GetMtlTextureBytesPerImage(texMtl->format, texMtl->isDepth, height, bytesPerRow);

	// TODO: get a buffer from the memory manager
	MTL::Buffer* buffer = m_device->newBuffer(size, MTL::ResourceStorageModeShared);

	auto blitCommandEncoder = GetBlitCommandEncoder();
	blitCommandEncoder->copyFromTexture(texMtl->GetTexture(), 0, 0, MTL::Origin(0, 0, 0), MTL::Size(width, height, 1), buffer, 0, bytesPerRow, 0);

	// The blit is only encoded at this point; commit it and wait for the GPU so the buffer actually holds the texture data
	Flush(true);

	uint8* bufferPtr = (uint8*)buffer->contents();

	bool formatValid = true;
	std::vector<uint8> rgb_data;
	rgb_data.reserve(3 * width * height);

	auto pixelFormat = texMtl->GetTexture()->pixelFormat();
	// TODO: implement more formats
	switch (pixelFormat)
	{
	case MTL::PixelFormatRGBA8Unorm:
		// Drop the alpha channel
		for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4)
		{
			rgb_data.emplace_back(*ptr);
			rgb_data.emplace_back(*(ptr + 1));
			rgb_data.emplace_back(*(ptr + 2));
		}
		break;
	case MTL::PixelFormatRGBA8Unorm_sRGB:
		// Drop the alpha channel and convert each component with SRGBComponentToRGB
		for (auto ptr = bufferPtr; ptr < bufferPtr + size; ptr += 4)
		{
			rgb_data.emplace_back(SRGBComponentToRGB(*ptr));
			rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 1)));
			rgb_data.emplace_back(SRGBComponentToRGB(*(ptr + 2)));
		}
		break;
	default:
		cemuLog_log(LogType::Force, "Unsupported screenshot texture pixel format {}", pixelFormat);
		formatValid = false;
		break;
	}

	buffer->release();

	if (formatValid)
		SaveScreenshot(rgb_data, width, height, !padView);
}

// Draws texView onto the drawable of the TV or pad layer with the given output shader, filter and destination rectangle.
void MetalRenderer::DrawBackbufferQuad(LatteTextureView* texView, RendererOutputShader* shader, bool useLinearTexFilter, sint32 imageX, sint32 imageY, sint32 imageWidth, sint32 imageHeight, bool padView, bool clearBackground)
{
	if (!AcquireDrawable(!padView))
		return;

	MTL::Texture* presentTexture = static_cast<LatteTextureViewMtl*>(texView)->GetRGBAView();

	// Create render pass
	auto& layer = GetLayer(!padView);

	MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
	auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
	colorAttachment->setTexture(layer.GetDrawable()->texture());
	colorAttachment->setLoadAction(clearBackground ? MTL::LoadActionClear : MTL::LoadActionLoad);
	colorAttachment->setStoreAction(MTL::StoreActionStore);

	auto renderCommandEncoder = GetTemporaryRenderCommandEncoder(renderPassDescriptor);
	renderPassDescriptor->release();

	// Get a render pipeline

	// Find out which shader we are using
	uint8 shaderIndex = 255;
	if (shader == RendererOutputShader::s_copy_shader) shaderIndex = 0;
	else if (shader == RendererOutputShader::s_bicubic_shader) shaderIndex = 1;
	else if (shader == RendererOutputShader::s_hermit_shader) shaderIndex = 2;
	else if (shader == RendererOutputShader::s_copy_shader_ud) shaderIndex = 3;
	else if (shader == RendererOutputShader::s_bicubic_shader_ud) shaderIndex = 4;
	else if (shader == RendererOutputShader::s_hermit_shader_ud) shaderIndex = 5;

	// Indices 3-5 are the _ud variants of 0-2, so modulo 3 yields copy/bicubic/hermit
	uint8 shaderType = shaderIndex % 3;

	// Get the render pipeline state
	auto renderPipelineState = m_outputShaderCache->GetPipeline(shader, shaderIndex, m_state.m_usesSRGB);

	// Draw to Metal layer
	renderCommandEncoder->setRenderPipelineState(renderPipelineState);
	renderCommandEncoder->setFragmentTexture(presentTexture, 0);
	renderCommandEncoder->setFragmentSamplerState((useLinearTexFilter ? m_linearSampler : m_nearestSampler), 0);

	// Set uniforms
	float outputSize[2] = {(float)imageWidth, (float)imageHeight};
	switch (shaderType)
	{
	case 2:
		// Only shader type 2 (hermit) receives the output size
		renderCommandEncoder->setFragmentBytes(outputSize, sizeof(outputSize), 0);
		break;
	default:
		break;
	}

	renderCommandEncoder->setViewport(MTL::Viewport{(double)imageX, (double)imageY, (double)imageWidth, (double)imageHeight, 0.0, 1.0});
	renderCommandEncoder->setScissorRect(MTL::ScissorRect{(uint32)imageX, (uint32)imageY, (uint32)imageWidth, (uint32)imageHeight});

	renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3));

	EndEncoding();
}

// A frame can begin whenever a drawable can be acquired for the selected window.
bool MetalRenderer::BeginFrame(bool mainWindow)
{
	return AcquireDrawable(mainWindow);
}

// Commits the command buffer if drawcalls were recorded or waitIdle is set; with waitIdle also blocks on the last executing command buffer.
void MetalRenderer::Flush(bool waitIdle)
{
	if (m_recordedDrawcalls > 0 || waitIdle)
		CommitCommandBuffer();

	if (waitIdle && m_executingCommandBuffers.size() != 0)
		m_executingCommandBuffers.back()->waitUntilCompleted();
}

// Currently a no-op; the commit-on-idle path is disabled.
void MetalRenderer::NotifyLatteCommandProcessorIdle()
{
	//if (m_commitOnIdle)
	//	CommitCommandBuffer();
}

// Starts an ImGui frame targeting the selected window's drawable. Returns false when the base class declines or no drawable is available.
bool MetalRenderer::ImguiBegin(bool mainWindow)
{
	if (!Renderer::ImguiBegin(mainWindow))
		return false;

	if (!AcquireDrawable(mainWindow))
		return false;

	EnsureImGuiBackend();

	// Check if the font texture needs to be built
	ImGuiIO& io = ImGui::GetIO();
	if (!io.Fonts->IsBuilt())
		ImGui_ImplMetal_CreateFontsTexture(m_device);

	auto& layer = GetLayer(mainWindow);

	// Render pass descriptor
	MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init();
	auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0);
	colorAttachment->setTexture(layer.GetDrawable()->texture());
	colorAttachment->setLoadAction(MTL::LoadActionLoad);
	colorAttachment->setStoreAction(MTL::StoreActionStore);

	// New frame
	ImGui_ImplMetal_NewFrame(renderPassDescriptor);
	ImGui_UpdateWindowInformation(mainWindow);
	ImGui::NewFrame();

	// Open a render encoder for the overlay unless one is already active
	if (m_encoderType != MetalEncoderType::Render)
		GetTemporaryRenderCommandEncoder(renderPassDescriptor);
	renderPassDescriptor->release();

	return true;
}

// Renders the ImGui draw data into the active render command encoder and ends encoding.
void MetalRenderer::ImguiEnd()
{
	EnsureImGuiBackend();

	if (m_encoderType != MetalEncoderType::Render)
	{
		cemuLog_logOnce(LogType::Force, "no render command encoder, cannot draw ImGui");
		return;
	}

	ImGui::Render();
	ImGui_ImplMetal_RenderDrawData(ImGui::GetDrawData(), GetCurrentCommandBuffer(), (MTL::RenderCommandEncoder*)m_commandEncoder);
	//ImGui::EndFrame();

	EndEncoding();
}

// Creates an RGBA8 texture of the given size from tightly packed RGB data (alpha is set to 0xFF).
// Returns nullptr if a std::exception is thrown while building the texture.
ImTextureID MetalRenderer::GenerateTexture(const std::vector<uint8>& data, const Vector2i& size)
{
	try
	{
		std::vector<uint8> tmp(size.x * size.y * 4);

		// Expand RGB to RGBA; never write past tmp even if data holds more pixels than size describes
		const size_t texturePixelCount = tmp.size() / 4;
		const size_t dataPixelCount = data.size() / 3;
		const size_t pixelCount = (dataPixelCount < texturePixelCount) ? dataPixelCount : texturePixelCount;
		for (size_t i = 0; i < pixelCount; ++i)
		{
			tmp[(i * 4) + 0] = data[(i * 3) + 0];
			tmp[(i * 4) + 1] = data[(i * 3) + 1];
			tmp[(i * 4) + 2] = data[(i * 3) + 2];
			tmp[(i * 4) + 3] = 0xFF;
		}

		MTL::TextureDescriptor* desc = MTL::TextureDescriptor::alloc()->init();
		desc->setTextureType(MTL::TextureType2D);
		desc->setPixelFormat(MTL::PixelFormatRGBA8Unorm);
		desc->setWidth(size.x);
		desc->setHeight(size.y);
		desc->setStorageMode(m_isAppleGPU ? MTL::StorageModeShared : MTL::StorageModeManaged);
		desc->setUsage(MTL::TextureUsageShaderRead);

		MTL::Texture* texture = m_device->newTexture(desc);
		desc->release();

		// TODO: do a GPU copy?
		texture->replaceRegion(MTL::Region(0, 0, size.x, size.y), 0, 0, tmp.data(), size.x * 4, 0);

		return (ImTextureID)texture;
	}
	catch (const std::exception& ex)
	{
		cemuLog_log(LogType::Force, "can't generate imgui texture: {}", ex.what());
		return nullptr;
	}
}

// Releases a texture previously returned by GenerateTexture.
void MetalRenderer::DeleteTexture(ImTextureID id)
{
	EnsureImGuiBackend();

	((MTL::Texture*)id)->release();
}

// Destroys the ImGui font texture owned by the Metal backend.
void MetalRenderer::DeleteFontTextures()
{
	EnsureImGuiBackend();

	ImGui_ImplMetal_DestroyFontsTexture();
}

// Appends GPU, Metal and cache statistics to the debug overlay.
void MetalRenderer::AppendOverlayDebugInfo()
{
	ImGui::Text("--- GPU info ---");
	ImGui::Text("GPU %s", m_device->name()->utf8String());
	ImGui::Text("Is Apple GPU %s", (m_isAppleGPU ? "yes" : "no"));
	ImGui::Text("Supports framebuffer fetch %s", (m_supportsFramebufferFetch ? "yes" : "no"));
	ImGui::Text("Has unified memory %s", (m_hasUnifiedMemory ? "yes" : "no"));
	ImGui::Text("Supports Metal3 %s", (m_supportsMetal3 ? "yes" : "no"));

	ImGui::Text("--- Metal info ---");
	ImGui::Text("Render pipeline states %zu", m_pipelineCache->GetPipelineCacheSize());

	ImGui::Text("--- Metal info (per frame) ---");
	ImGui::Text("Command buffers %u", m_performanceMonitor.m_commandBuffers);
	ImGui::Text("Render passes %u", m_performanceMonitor.m_renderPasses);
	ImGui::Text("Clears %u", m_performanceMonitor.m_clears);
	ImGui::Text("Manual vertex fetch draws %u (mesh draws: %u)", m_performanceMonitor.m_manualVertexFetchDraws, m_performanceMonitor.m_meshDraws);
	ImGui::Text("Triangle fans %u", m_performanceMonitor.m_triangleFans);

	ImGui::Text("--- Cache debug info ---");

	uint32 bufferCacheHeapSize = 0;
	uint32 bufferCacheAllocationSize = 0;
	uint32 bufferCacheNumAllocations = 0;

	LatteBufferCache_getStats(bufferCacheHeapSize, bufferCacheAllocationSize, bufferCacheNumAllocations);

	// Sizes are shown in KB, rounded up
	ImGui::Text("Buffer");
	ImGui::SameLine(60.0f);
	ImGui::Text("%06uKB / %06uKB Allocs: %u", (uint32)(bufferCacheAllocationSize + 1023) / 1024, ((uint32)bufferCacheHeapSize + 1023) / 1024, (uint32)bufferCacheNumAllocations);

	uint32 numBuffers;
	size_t totalSize, freeSize;

	m_memoryManager->GetStagingAllocator().GetStats(numBuffers, totalSize, freeSize);
	ImGui::Text("Staging");
	ImGui::SameLine(60.0f);
	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);

	m_memoryManager->GetIndexAllocator().GetStats(numBuffers, totalSize, freeSize);
	ImGui::Text("Index");
	ImGui::SameLine(60.0f);
	ImGui::Text("%06uKB / %06uKB Buffers: %u", ((uint32)(totalSize - freeSize) + 1023) / 1024, ((uint32)totalSize + 1023) / 1024, (uint32)numBuffers);
}

// Stores the viewport in the renderer state.
void MetalRenderer::renderTarget_setViewport(float x, float y, float width, float height, float nearZ, float farZ, bool halfZ)
{
	// halfZ is handled in the shader
	m_state.m_viewport = MTL::Viewport{x, y, width, height, nearZ, farZ};
}

// Stores the scissor rectangle in the renderer state.
void MetalRenderer::renderTarget_setScissor(sint32 scissorX, sint32 scissorY, sint32 scissorWidth, sint32 scissorHeight)
{
	m_state.m_scissor = MTL::ScissorRect{(uint32)scissorX, (uint32)scissorY, (uint32)scissorWidth, (uint32)scissorHeight};
}

// Allocates a Metal cached FBO for the given key.
LatteCachedFBO* MetalRenderer::rendertarget_createCachedFBO(uint64 key)
{
	return new CachedFBOMtl(this, key);
}

// Clears the active FBO state if it refers to the FBO being deleted; the object itself is not freed here.
void MetalRenderer::rendertarget_deleteCachedFBO(LatteCachedFBO* cfbo)
{
	if (cfbo == (LatteCachedFBO*)m_state.m_activeFBO.m_fbo)
		m_state.m_activeFBO = {nullptr};
}

// Makes cfbo the active FBO and captures its attachment info.
void MetalRenderer::rendertarget_bindFramebufferObject(LatteCachedFBO* cfbo)
{
	m_state.m_activeFBO = {(CachedFBOMtl*)cfbo, MetalAttachmentsInfo((CachedFBOMtl*)cfbo)};
}

// Forwards to the memory manager's texture upload buffer.
void* MetalRenderer::texture_acquireTextureUploadBuffer(uint32 size)
{
	return m_memoryManager->AcquireTextureUploadBuffer(size);
}

// Returns a texture upload buffer to the memory manager.
void MetalRenderer::texture_releaseTextureUploadBuffer(uint8* mem)
{
	m_memoryManager->ReleaseTextureUploadBuffer(mem);
}

// Looks up the texture decoder registered for the format; dim, width and height are unused.
TextureDecoder* MetalRenderer::texture_chooseDecodedFormat(Latte::E_GX2SURFFMT format, bool isDepth, Latte::E_DIM dim, uint32 width, uint32 height)
{
	return GetMtlPixelFormatInfo(format, isDepth).textureDecoder;
}

// Clears one slice/mip to zero, dispatching to the depth or color clear depending on the texture.
void MetalRenderer::texture_clearSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex)
{
	if (hostTexture->isDepth)
	{
		texture_clearDepthSlice(hostTexture, sliceIndex, mipIndex, true, hostTexture->hasStencil, 0.0f, 0);
	}
	else
	{
		texture_clearColorSlice(hostTexture, sliceIndex, mipIndex, 0.0f, 0.0f, 0.0f, 0.0f);
	}
}

// TODO: do a cpu copy on Apple Silicon?
void MetalRenderer::texture_loadSlice(LatteTexture* hostTexture, sint32 width, sint32 height, sint32 depth, void* pixelData, sint32 sliceIndex, sint32 mipIndex, uint32 compressedImageSize) { auto textureMtl = (LatteTextureMtl*)hostTexture; uint32 offsetZ = 0; if (textureMtl->Is3DTexture()) { offsetZ = sliceIndex; sliceIndex = 0; } size_t bytesPerRow = GetMtlTextureBytesPerRow(textureMtl->format, textureMtl->isDepth, width); // No need to set bytesPerImage for 3D textures, since we always load just one slice //size_t bytesPerImage = GetMtlTextureBytesPerImage(textureMtl->GetFormat(), textureMtl->isDepth, height, bytesPerRow); //if (m_isAppleGPU) //{ // textureMtl->GetTexture()->replaceRegion(MTL::Region(0, 0, offsetZ, width, height, 1), mipIndex, sliceIndex, pixelData, bytesPerRow, 0); //} //else //{ auto blitCommandEncoder = GetBlitCommandEncoder(); // Allocate a temporary buffer auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); auto allocation = bufferAllocator.AllocateBufferMemory(compressedImageSize, 1); bufferAllocator.FlushReservation(allocation); // Copy the data to the temporary buffer memcpy(allocation.memPtr, pixelData, compressedImageSize); //buffer->didModifyRange(NS::Range(allocation.offset, allocation.size)); // TODO: specify blit options when copying to a depth stencil texture? 
// Copy the data from the temporary buffer to the texture blitCommandEncoder->copyFromBuffer(allocation.mtlBuffer, allocation.bufferOffset, bytesPerRow, 0, MTL::Size(width, height, 1), textureMtl->GetTexture(), sliceIndex, mipIndex, MTL::Origin(0, 0, offsetZ)); //} } void MetalRenderer::texture_clearColorSlice(LatteTexture* hostTexture, sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { if (!FormatIsRenderable(hostTexture->format)) { cemuLog_logOnce(LogType::Force, "cannot clear color texture with format {}, because it's not renderable", hostTexture->format); return; } auto mtlTexture = static_cast(hostTexture)->GetTexture(); ClearColorTextureInternal(mtlTexture, sliceIndex, mipIndex, r, g, b, a); } void MetalRenderer::texture_clearDepthSlice(LatteTexture* hostTexture, uint32 sliceIndex, sint32 mipIndex, bool clearDepth, bool clearStencil, float depthValue, uint32 stencilValue) { clearStencil = (clearStencil && GetMtlPixelFormatInfo(hostTexture->format, true).hasStencil); if (!clearDepth && !clearStencil) { cemuLog_logOnce(LogType::Force, "skipping depth/stencil clear"); return; } auto mtlTexture = static_cast(hostTexture)->GetTexture(); MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); if (clearDepth) { auto depthAttachment = renderPassDescriptor->depthAttachment(); depthAttachment->setTexture(mtlTexture); depthAttachment->setClearDepth(depthValue); depthAttachment->setLoadAction(MTL::LoadActionClear); depthAttachment->setStoreAction(MTL::StoreActionStore); depthAttachment->setSlice(sliceIndex); depthAttachment->setLevel(mipIndex); } if (clearStencil) { auto stencilAttachment = renderPassDescriptor->stencilAttachment(); stencilAttachment->setTexture(mtlTexture); stencilAttachment->setClearStencil(stencilValue); stencilAttachment->setLoadAction(MTL::LoadActionClear); stencilAttachment->setStoreAction(MTL::StoreActionStore); stencilAttachment->setSlice(sliceIndex); 
stencilAttachment->setLevel(mipIndex); } GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); EndEncoding(); // Debug m_performanceMonitor.m_clears++; } LatteTexture* MetalRenderer::texture_createTextureEx(Latte::E_DIM dim, MPTR physAddress, MPTR physMipAddress, Latte::E_GX2SURFFMT format, uint32 width, uint32 height, uint32 depth, uint32 pitch, uint32 mipLevels, uint32 swizzle, Latte::E_HWTILEMODE tileMode, bool isDepth) { return new LatteTextureMtl(this, dim, physAddress, physMipAddress, format, width, height, depth, pitch, mipLevels, swizzle, tileMode, isDepth); } void MetalRenderer::texture_setLatteTexture(LatteTextureView* textureView, uint32 textureUnit) { m_state.m_textures[textureUnit] = static_cast(textureView); } void MetalRenderer::texture_copyImageSubData(LatteTexture* src, sint32 srcMip, sint32 effectiveSrcX, sint32 effectiveSrcY, sint32 srcSlice, LatteTexture* dst, sint32 dstMip, sint32 effectiveDstX, sint32 effectiveDstY, sint32 dstSlice, sint32 effectiveCopyWidth, sint32 effectiveCopyHeight, sint32 srcDepth_) { // Source size seems to apply to the destination texture as well, therefore we need to adjust it when block size doesn't match Uvec2 srcBlockTexelSize = GetMtlPixelFormatInfo(src->format, src->isDepth).blockTexelSize; Uvec2 dstBlockTexelSize = GetMtlPixelFormatInfo(dst->format, dst->isDepth).blockTexelSize; if (srcBlockTexelSize.x != dstBlockTexelSize.x || srcBlockTexelSize.y != dstBlockTexelSize.y) { uint32 multX = (srcBlockTexelSize.x > dstBlockTexelSize.x ? srcBlockTexelSize.x / dstBlockTexelSize.x : dstBlockTexelSize.x / srcBlockTexelSize.x); effectiveCopyWidth *= multX; uint32 multY = (srcBlockTexelSize.y > dstBlockTexelSize.y ? 
srcBlockTexelSize.y / dstBlockTexelSize.y : dstBlockTexelSize.y / srcBlockTexelSize.y); effectiveCopyHeight *= multY; } auto blitCommandEncoder = GetBlitCommandEncoder(); auto mtlSrc = static_cast(src)->GetTexture(); auto mtlDst = static_cast(dst)->GetTexture(); uint32 srcBaseLayer = 0; uint32 dstBaseLayer = 0; uint32 srcOffsetZ = 0; uint32 dstOffsetZ = 0; uint32 srcLayerCount = 1; uint32 dstLayerCount = 1; uint32 srcDepth = 1; uint32 dstDepth = 1; if (src->Is3DTexture()) { srcOffsetZ = srcSlice; srcDepth = srcDepth_; } else { srcBaseLayer = srcSlice; srcLayerCount = srcDepth_; } if (dst->Is3DTexture()) { dstOffsetZ = dstSlice; dstDepth = srcDepth_; } else { dstBaseLayer = dstSlice; dstLayerCount = srcDepth_; } // If copying whole textures, we can do a more efficient copy if (effectiveSrcX == 0 && effectiveSrcY == 0 && effectiveDstX == 0 && effectiveDstY == 0 && srcOffsetZ == 0 && dstOffsetZ == 0 && effectiveCopyWidth == src->GetMipWidth(srcMip) && effectiveCopyHeight == src->GetMipHeight(srcMip) && srcDepth == src->GetMipDepth(srcMip) && effectiveCopyWidth == dst->GetMipWidth(dstMip) && effectiveCopyHeight == dst->GetMipHeight(dstMip) && dstDepth == dst->GetMipDepth(dstMip) && srcLayerCount == dstLayerCount) { blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, mtlDst, dstBaseLayer, dstMip, srcLayerCount, 1); } else { if (srcLayerCount == dstLayerCount) { for (uint32 i = 0; i < srcLayerCount; i++) { blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer + i, srcMip, MTL::Origin(effectiveSrcX, effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, srcDepth), mtlDst, dstBaseLayer + i, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ)); } } else { for (uint32 i = 0; i < std::max(srcLayerCount, dstLayerCount); i++) { if (srcLayerCount == 1) srcOffsetZ++; else srcSlice++; if (dstLayerCount == 1) dstOffsetZ++; else dstSlice++; blitCommandEncoder->copyFromTexture(mtlSrc, srcBaseLayer, srcMip, MTL::Origin(effectiveSrcX, 
effectiveSrcY, srcOffsetZ), MTL::Size(effectiveCopyWidth, effectiveCopyHeight, 1), mtlDst, dstBaseLayer, dstMip, MTL::Origin(effectiveDstX, effectiveDstY, dstOffsetZ));
            }
        }
    }
}

// Creates a readback info object for textureView, placed at the current write offset of the readback buffer.
// The space reserved equals the allocated size of the base texture's Metal texture. If the reservation would
// extend past TEXTURE_READBACK_SIZE, the write offset is reset to 0 before the object is created.
// The returned object is allocated with new.
// NOTE(review): nothing in this function checks whether a wrapped region is still in use by an earlier
// readback, or whether uploadSize alone exceeds TEXTURE_READBACK_SIZE - confirm this is handled elsewhere.
LatteTextureReadbackInfo* MetalRenderer::texture_createReadback(LatteTextureView* textureView)
{
    size_t uploadSize = static_cast(textureView->baseTexture)->GetTexture()->allocatedSize();

    // Wrap around when the texture does not fit behind the current write offset
    if ((m_readbackBufferWriteOffset + uploadSize) > TEXTURE_READBACK_SIZE)
    {
        m_readbackBufferWriteOffset = 0;
    }

    auto* result = new LatteTextureReadbackInfoMtl(this, textureView, m_readbackBufferWriteOffset);
    m_readbackBufferWriteOffset += uploadSize;

    return result;
}

// Copies a width x height region from (srcMip, srcSlice) of sourceTexture to (dstMip, dstSlice) of
// destinationTexture. The size is first passed through LatteTexture_scaleToEffectiveSize and the copy itself is
// delegated to texture_copyImageSubData with origin (0, 0) on both sides and a depth of 1.
// NOTE(review): the scale call is given 0 as its last argument rather than srcMip - confirm this is intended.
void MetalRenderer::surfaceCopy_copySurfaceWithFormatConversion(LatteTexture* sourceTexture, sint32 srcMip, sint32 srcSlice, LatteTexture* destinationTexture, sint32 dstMip, sint32 dstSlice, sint32 width, sint32 height)
{
    // scale copy size to effective size
    sint32 effectiveCopyWidth = width;
    sint32 effectiveCopyHeight = height;
    LatteTexture_scaleToEffectiveSize(sourceTexture, &effectiveCopyWidth, &effectiveCopyHeight, 0);
    //sint32 sourceEffectiveWidth, sourceEffectiveHeight;
    //sourceTexture->GetEffectiveSize(sourceEffectiveWidth, sourceEffectiveHeight, srcMip);

    texture_copyImageSubData(sourceTexture, srcMip, 0, 0, srcSlice, destinationTexture, dstMip, 0, 0, dstSlice, effectiveCopyWidth, effectiveCopyHeight, 1);
}

// Forwards buffer cache initialization to the memory manager.
void MetalRenderer::bufferCache_init(const sint32 bufferSize)
{
    m_memoryManager->InitBufferCache(bufferSize);
}

// Forwards an upload of `size` bytes from `buffer` to the buffer cache at bufferOffset.
// Note that the memory manager call takes (buffer, offset, size), i.e. offset before size.
void MetalRenderer::bufferCache_upload(uint8* buffer, sint32 size, uint32 bufferOffset)
{
    m_memoryManager->UploadToBufferCache(buffer, bufferOffset, size);
}

// Forwards a copy of `size` bytes within the buffer cache from srcOffset to dstOffset.
void MetalRenderer::bufferCache_copy(uint32 srcOffset, uint32 dstOffset, uint32 size)
{
    m_memoryManager->CopyBufferCache(srcOffset, dstOffset, size);
}

// Copies `size` bytes from the buffer returned by GetXfbRingBuffer() (at srcOffset) into the buffer cache
// (at dstOffset). When host memory is used for the cache, dstOffset is rebased by subtracting the imported
// memory base address first.
void MetalRenderer::bufferCache_copyStreamoutToMainBuffer(uint32 srcOffset, uint32 dstOffset, uint32 size)
{
    if (m_memoryManager->UseHostMemoryForCache())
        dstOffset -= m_memoryManager->GetImportedMemBaseAddress();

    CopyBufferToBuffer(GetXfbRingBuffer(), srcOffset, m_memoryManager->GetBufferCache(), dstOffset, size, MTL::RenderStageVertex | MTL::RenderStageMesh, ALL_MTL_RENDER_STAGES);
}

// Records the buffer cache offset for vertex buffer slot bufferIndex. The actual bind happens later in
// draw_execute. `size` is not used. Asserts (debug) that the host-memory cache path is not active.
void MetalRenderer::buffer_bindVertexBuffer(uint32 bufferIndex, uint32 offset, uint32 size)
{
    cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());
    cemu_assert_debug(bufferIndex < LATTE_MAX_VERTEX_BUFFERS);

    m_state.m_vertexBufferOffsets[bufferIndex] = offset;
}

// Records the buffer cache offset for uniform buffer slot bufferIndex of the given shader stage.
// `size` is not used. Asserts (debug) that the host-memory cache path is not active.
void MetalRenderer::buffer_bindUniformBuffer(LatteConst::ShaderType shaderType, uint32 bufferIndex, uint32 offset, uint32 size)
{
    cemu_assert_debug(!m_memoryManager->UseHostMemoryForCache());

    m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shaderType)][bufferIndex] = offset;
}

// Creates a Metal renderer shader object from source. The returned object is allocated with new.
RendererShader* MetalRenderer::shader_create(RendererShader::ShaderType type, uint64 baseHash, uint64 auxHash, const std::string& source, bool isGameShader, bool isGfxPackShader)
{
    return new RendererShaderMtl(this, type, baseHash, auxHash, isGameShader, isGfxPackShader, source);
}

// Marks streamout buffer bufferIndex as enabled and stores its ring buffer offset.
// rangeAddr and rangeSize are not used here.
void MetalRenderer::streamout_setupXfbBuffer(uint32 bufferIndex, sint32 ringBufferOffset, uint32 rangeAddr, uint32 rangeSize)
{
    m_state.m_streamoutState.buffers[bufferIndex].enabled = true;
    m_state.m_streamoutState.buffers[bufferIndex].ringBufferOffset = ringBufferOffset;
}

// Intentionally empty for this renderer.
void MetalRenderer::streamout_begin()
{
    // Do nothing
}

// Intentionally empty for this renderer.
void MetalRenderer::streamout_rendererFinishDrawcall()
{
    // Do nothing
}

// Prepares state for a sequence of draw calls: updates the active shaders, the render target and texture
// state, viewport and scissor, and decides whether the draws of this sequence are skipped
// (m_state.m_skipDrawSequence).
void MetalRenderer::draw_beginSequence()
{
    m_state.m_skipDrawSequence = false;

    bool streamoutEnable = LatteGPUState.contextRegister[mmVGT_STRMOUT_EN] != 0;

    // update shader state
    LatteSHRC_UpdateActiveShaders();
    if (LatteGPUState.activeShaderHasError)
    {
        cemuLog_logOnce(LogType::Force, "Skipping drawcalls due to shader error\n");
        m_state.m_skipDrawSequence = true;
        cemu_assert_debug(false);
        return;
    }

    // update render target and texture state
    LatteGPUState.requiresTextureBarrier = false;
    // Repeats until a pass completes without repeatTextureInitialization being raised again
    while (true)
    {
        LatteGPUState.repeatTextureInitialization = false;
        if
(!LatteMRT::UpdateCurrentFBO()) { cemuLog_logOnce(LogType::Force, "Rendertarget invalid\n"); m_state.m_skipDrawSequence = true; return; // no render target } if (!hasValidFramebufferAttached && !streamoutEnable) { cemuLog_logOnce(LogType::Force, "Drawcall with no color buffer or depth buffer attached\n"); m_state.m_skipDrawSequence = true; return; // no render target } LatteTexture_updateTextures(); if (!LatteGPUState.repeatTextureInitialization) break; } // apply render target LatteMRT::ApplyCurrentState(); // viewport and scissor box LatteRenderTarget_updateViewport(); LatteRenderTarget_updateScissorBox(); // check for conditions which would turn the drawcalls into no-ops bool rasterizerEnable = !LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_DX_RASTERIZATION_KILL(); // GX2SetSpecialState(0, true) enables DX_RASTERIZATION_KILL, but still expects depth writes to happen? -> Research which stages are disabled by DX_RASTERIZATION_KILL exactly // for now we use a workaround: if (!LatteGPUState.contextNew.PA_CL_VTE_CNTL.get_VPORT_X_OFFSET_ENA()) rasterizerEnable = true; if (!rasterizerEnable && !streamoutEnable) m_state.m_skipDrawSequence = true; } void MetalRenderer::draw_execute(uint32 baseVertex, uint32 baseInstance, uint32 instanceCount, uint32 count, MPTR indexDataMPTR, Latte::LATTE_VGT_DMA_INDEX_TYPE::E_INDEX_TYPE indexType, bool isFirst) { if (m_state.m_skipDrawSequence) { LatteGPUState.drawCallCounter++; return; } // fast clear color as depth if (LatteGPUState.contextNew.GetSpecialStateValues()[8] != 0) { LatteDraw_handleSpecialState8_clearAsDepth(); LatteGPUState.drawCallCounter++; return; } else if (LatteGPUState.contextNew.GetSpecialStateValues()[5] != 0) { draw_handleSpecialState5(); LatteGPUState.drawCallCounter++; return; } auto& encoderState = m_state.m_encoderState; // Shaders LatteDecompilerShader* vertexShader = LatteSHRC_GetActiveVertexShader(); LatteDecompilerShader* geometryShader = LatteSHRC_GetActiveGeometryShader(); LatteDecompilerShader* 
pixelShader = LatteSHRC_GetActivePixelShader(); const auto fetchShader = LatteSHRC_GetActiveFetchShader(); /* bool neverSkipAccurateBarrier = false; // "Accurate barriers" is usually enabled globally but since the CPU cost is substantial we allow users to disable it (debug -> 'Accurate barriers' option) // We always force accurate barriers for known problematic shaders if (pixelShader) { if (pixelShader->baseHash == 0x6f6f6e7b9aae57af && pixelShader->auxHash == 0x00078787f9249249) // BotW lava neverSkipAccurateBarrier = true; if (pixelShader->baseHash == 0x4c0bd596e3aef4a6 && pixelShader->auxHash == 0x003c3c3fc9269249) // BotW foam layer for water on the bottom of waterfalls neverSkipAccurateBarrier = true; } // Check if we need to end the render pass if (!m_state.m_isFirstDrawInRenderPass && (GetConfig().vk_accurate_barriers || neverSkipAccurateBarrier)) { // Fragment shader is most likely to require a render pass flush, so check for it first bool endRenderPass = CheckIfRenderPassNeedsFlush(pixelShader); if (!endRenderPass) endRenderPass = CheckIfRenderPassNeedsFlush(vertexShader); if (!endRenderPass && geometryShader) endRenderPass = CheckIfRenderPassNeedsFlush(geometryShader); if (endRenderPass) { EndEncoding(); // TODO: only log in debug? 
cemuLog_logOnce(LogType::Force, "Ending render pass due to render target self-dependency\n"); } } */ // Primitive type const LattePrimitiveMode primitiveMode = static_cast(LatteGPUState.contextRegister[mmVGT_PRIMITIVE_TYPE]); auto mtlPrimitiveType = GetMtlPrimitiveType(primitiveMode); bool isPrimitiveRect = (primitiveMode == Latte::LATTE_VGT_PRIMITIVE_TYPE::E_PRIMITIVE_TYPE::RECTS); bool usesGeometryShader = (geometryShader != nullptr || isPrimitiveRect); bool fetchVertexManually = (usesGeometryShader || fetchShader->mtlFetchVertexManually); // Index buffer Renderer::INDEX_TYPE hostIndexType; uint32 hostIndexCount; uint32 indexMin = 0; uint32 indexMax = 0; Renderer::IndexAllocation indexAllocation; LatteIndices_decode(memory_getPointerFromVirtualOffset(indexDataMPTR), indexType, count, primitiveMode, indexMin, indexMax, hostIndexType, hostIndexCount, indexAllocation); auto indexAllocationMtl = static_cast(indexAllocation.rendererInternal); // Buffer cache if (m_memoryManager->UseHostMemoryForCache()) { // direct memory access (Wii U memory space imported as a buffer), update buffer bindings draw_updateVertexBuffersDirectAccess(); if (vertexShader) draw_updateUniformBuffersDirectAccess(vertexShader, mmSQ_VTX_UNIFORM_BLOCK_START); if (geometryShader) draw_updateUniformBuffersDirectAccess(geometryShader, mmSQ_GS_UNIFORM_BLOCK_START); if (pixelShader) draw_updateUniformBuffersDirectAccess(pixelShader, mmSQ_PS_UNIFORM_BLOCK_START); } else { // synchronize vertex and uniform cache and update buffer bindings // We need to call this before getting the render command encoder, since it can cause buffer copies LatteBufferCache_Sync(indexMin + baseVertex, indexMax + baseVertex, baseInstance, instanceCount); } // Render pass auto renderCommandEncoder = GetRenderCommandEncoder(); // Render pipeline state PipelineObject* pipelineObj = m_pipelineCache->GetRenderPipelineState(fetchShader, vertexShader, geometryShader, pixelShader, m_state.m_lastUsedFBO.m_attachmentsInfo, 
m_state.m_activeFBO.m_attachmentsInfo, m_state.m_activeFBO.m_fbo->m_size, count, LatteGPUState.contextNew); if (!pipelineObj->m_pipeline) return; if (pipelineObj->m_pipeline != encoderState.m_renderPipelineState) { renderCommandEncoder->setRenderPipelineState(pipelineObj->m_pipeline); encoderState.m_renderPipelineState = pipelineObj->m_pipeline; } // Depth stencil state // Disable depth write when there is no depth attachment auto& depthControl = LatteGPUState.contextNew.DB_DEPTH_CONTROL; bool depthWriteEnable = depthControl.get_Z_WRITE_ENABLE(); if (!m_state.m_activeFBO.m_fbo->depthBuffer.texture) depthControl.set_Z_WRITE_ENABLE(false); MTL::DepthStencilState* depthStencilState = m_depthStencilCache->GetDepthStencilState(LatteGPUState.contextNew); if (depthStencilState != encoderState.m_depthStencilState) { renderCommandEncoder->setDepthStencilState(depthStencilState); encoderState.m_depthStencilState = depthStencilState; } // Restore the original depth write state depthControl.set_Z_WRITE_ENABLE(depthWriteEnable); // Stencil reference bool stencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_STENCIL_ENABLE(); if (stencilEnable) { bool backStencilEnable = LatteGPUState.contextNew.DB_DEPTH_CONTROL.get_BACK_STENCIL_ENABLE(); uint32 stencilRefFront = LatteGPUState.contextNew.DB_STENCILREFMASK.get_STENCILREF_F(); uint32 stencilRefBack; if (backStencilEnable) stencilRefBack = LatteGPUState.contextNew.DB_STENCILREFMASK_BF.get_STENCILREF_B(); else stencilRefBack = stencilRefFront; if (stencilRefFront != encoderState.m_stencilRefFront || stencilRefBack != encoderState.m_stencilRefBack) { renderCommandEncoder->setStencilReferenceValues(stencilRefFront, stencilRefBack); encoderState.m_stencilRefFront = stencilRefFront; encoderState.m_stencilRefBack = stencilRefBack; } } // Blend color uint32* blendColorConstantU32 = LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; if (blendColorConstantU32[0] != encoderState.m_blendColor[0] || blendColorConstantU32[1] 
!= encoderState.m_blendColor[1] || blendColorConstantU32[2] != encoderState.m_blendColor[2] || blendColorConstantU32[3] != encoderState.m_blendColor[3]) { float* blendColorConstant = (float*)LatteGPUState.contextRegister + Latte::REGADDR::CB_BLEND_RED; renderCommandEncoder->setBlendColor(blendColorConstant[0], blendColorConstant[1], blendColorConstant[2], blendColorConstant[3]); encoderState.m_blendColor[0] = blendColorConstantU32[0]; encoderState.m_blendColor[1] = blendColorConstantU32[1]; encoderState.m_blendColor[2] = blendColorConstantU32[2]; encoderState.m_blendColor[3] = blendColorConstantU32[3]; } // polygon control const auto& polygonControlReg = LatteGPUState.contextNew.PA_SU_SC_MODE_CNTL; const auto frontFace = polygonControlReg.get_FRONT_FACE(); uint32 cullFront = polygonControlReg.get_CULL_FRONT(); uint32 cullBack = polygonControlReg.get_CULL_BACK(); uint32 polyOffsetFrontEnable = polygonControlReg.get_OFFSET_FRONT_ENABLED(); if (polyOffsetFrontEnable) { uint32 frontScaleU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.getRawValue(); uint32 frontOffsetU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.getRawValue(); uint32 offsetClampU32 = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.getRawValue(); if (frontOffsetU32 != encoderState.m_depthBias || frontScaleU32 != encoderState.m_depthSlope || offsetClampU32 != encoderState.m_depthClamp) { float frontScale = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_SCALE.get_SCALE(); float frontOffset = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_FRONT_OFFSET.get_OFFSET(); float offsetClamp = LatteGPUState.contextNew.PA_SU_POLY_OFFSET_CLAMP.get_CLAMP(); frontScale /= 16.0f; renderCommandEncoder->setDepthBias(frontOffset, frontScale, offsetClamp); encoderState.m_depthBias = frontOffsetU32; encoderState.m_depthSlope = frontScaleU32; encoderState.m_depthClamp = offsetClampU32; } } else { if (0 != encoderState.m_depthBias || 0 != encoderState.m_depthSlope || 0 != encoderState.m_depthClamp) { 
renderCommandEncoder->setDepthBias(0.0f, 0.0f, 0.0f); encoderState.m_depthBias = 0; encoderState.m_depthSlope = 0; encoderState.m_depthClamp = 0; } } // Depth clip mode cemu_assert_debug(LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_NEAR_DISABLE() == LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE()); // near or far clipping can be disabled individually bool zClipEnable = LatteGPUState.contextNew.PA_CL_CLIP_CNTL.get_ZCLIP_FAR_DISABLE() == false; if (zClipEnable != encoderState.m_depthClipEnable) { renderCommandEncoder->setDepthClipMode(zClipEnable ? MTL::DepthClipModeClip : MTL::DepthClipModeClamp); encoderState.m_depthClipEnable = zClipEnable; } // Visibility result mode if (m_occlusionQuery.m_active) { auto mode = (m_occlusionQuery.m_currentIndex == INVALID_UINT32 ? MTL::VisibilityResultModeDisabled : MTL::VisibilityResultModeCounting); renderCommandEncoder->setVisibilityResultMode(mode, m_occlusionQuery.m_currentIndex * sizeof(uint64)); } // todo - how does culling behave with rects? 
// right now we just assume that their winding is always CW if (isPrimitiveRect) { if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CW) cullFront = cullBack; else cullBack = cullFront; } // Cull mode // Cull front and back is handled by disabling rasterization if (!(cullFront && cullBack)) { MTL::CullMode cullMode; if (cullFront) cullMode = MTL::CullModeFront; else if (cullBack) cullMode = MTL::CullModeBack; else cullMode = MTL::CullModeNone; if (cullMode != encoderState.m_cullMode) { renderCommandEncoder->setCullMode(cullMode); encoderState.m_cullMode = cullMode; } } // Front face MTL::Winding frontFaceWinding; if (frontFace == Latte::LATTE_PA_SU_SC_MODE_CNTL::E_FRONTFACE::CCW) frontFaceWinding = MTL::WindingCounterClockwise; else frontFaceWinding = MTL::WindingClockwise; if (frontFaceWinding != encoderState.m_frontFaceWinding) { renderCommandEncoder->setFrontFacingWinding(frontFaceWinding); encoderState.m_frontFaceWinding = frontFaceWinding; } // Viewport if (m_state.m_viewport.originX != encoderState.m_viewport.originX || m_state.m_viewport.originY != encoderState.m_viewport.originY || m_state.m_viewport.width != encoderState.m_viewport.width || m_state.m_viewport.height != encoderState.m_viewport.height || m_state.m_viewport.znear != encoderState.m_viewport.znear || m_state.m_viewport.zfar != encoderState.m_viewport.zfar) { renderCommandEncoder->setViewport(m_state.m_viewport); encoderState.m_viewport = m_state.m_viewport; } // Scissor if (m_state.m_scissor.x != encoderState.m_scissor.x || m_state.m_scissor.y != encoderState.m_scissor.y || m_state.m_scissor.width != encoderState.m_scissor.width || m_state.m_scissor.height != encoderState.m_scissor.height) { encoderState.m_scissor = m_state.m_scissor; // TODO: clamp scissor to render target dimensions? 
//scissor.width = ; //scissor.height = ; renderCommandEncoder->setScissorRect(encoderState.m_scissor); } // Resources // Vertex buffers for (uint8 i = 0; i < MAX_MTL_VERTEX_BUFFERS; i++) { size_t offset = m_state.m_vertexBufferOffsets[i]; if (offset != INVALID_OFFSET) { // Bind SetBuffer(renderCommandEncoder, GetMtlShaderType(vertexShader->shaderType, usesGeometryShader), m_memoryManager->GetBufferCache(), offset, GET_MTL_VERTEX_BUFFER_INDEX(i)); } } // Prepare streamout m_state.m_streamoutState.verticesPerInstance = count; LatteStreamout_PrepareDrawcall(count, instanceCount); // Uniform buffers, textures and samplers BindStageResources(renderCommandEncoder, vertexShader, usesGeometryShader); if (geometryShader) BindStageResources(renderCommandEncoder, geometryShader, usesGeometryShader); BindStageResources(renderCommandEncoder, pixelShader, usesGeometryShader); // Draw if (usesGeometryShader) { if (hostIndexType != INDEX_TYPE::NONE) SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_OBJECT, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, vertexShader->resourceMapping.indexBufferBinding); uint8 hostIndexTypeU8 = (uint8)hostIndexType; renderCommandEncoder->setObjectBytes(&hostIndexTypeU8, sizeof(hostIndexTypeU8), vertexShader->resourceMapping.indexTypeBinding); encoderState.m_buffers[METAL_SHADER_TYPE_OBJECT][vertexShader->resourceMapping.indexTypeBinding] = {nullptr}; uint32 verticesPerPrimitive = 0; switch (primitiveMode) { case LattePrimitiveMode::POINTS: verticesPerPrimitive = 1; break; case LattePrimitiveMode::LINES: verticesPerPrimitive = 2; break; case LattePrimitiveMode::TRIANGLES: case LattePrimitiveMode::RECTS: verticesPerPrimitive = 3; break; default: cemuLog_log(LogType::Force, "unimplemented geometry shader primitive mode {}", (uint32)primitiveMode); break; } renderCommandEncoder->drawMeshThreadgroups(MTL::Size(count * instanceCount / verticesPerPrimitive, 1, 1), MTL::Size(verticesPerPrimitive, 1, 1), MTL::Size(1, 1, 1)); } else { if 
(hostIndexType != INDEX_TYPE::NONE) { auto mtlIndexType = GetMtlIndexType(hostIndexType); renderCommandEncoder->drawIndexedPrimitives(mtlPrimitiveType, hostIndexCount, mtlIndexType, indexAllocationMtl->mtlBuffer, indexAllocationMtl->bufferOffset, instanceCount, baseVertex, baseInstance); } else { renderCommandEncoder->drawPrimitives(mtlPrimitiveType, baseVertex, count, instanceCount, baseInstance); } } m_state.m_isFirstDrawInRenderPass = false; // Occlusion queries if (m_occlusionQuery.m_active) m_occlusionQuery.m_currentIndex = (m_occlusionQuery.m_currentIndex + 1) % OCCLUSION_QUERY_POOL_SIZE; // Streamout LatteStreamout_FinishDrawcall(m_memoryManager->UseHostMemoryForCache()); // Debug if (fetchVertexManually) m_performanceMonitor.m_manualVertexFetchDraws++; if (usesGeometryShader) m_performanceMonitor.m_meshDraws++; if (primitiveMode == LattePrimitiveMode::TRIANGLE_FAN) m_performanceMonitor.m_triangleFans++; LatteGPUState.drawCallCounter++; } void MetalRenderer::draw_endSequence() { LatteDecompilerShader* pixelShader = LatteSHRC_GetActivePixelShader(); // post-drawcall logic if (pixelShader) LatteRenderTarget_trackUpdates(); bool hasReadback = LatteTextureReadback_Update(); m_recordedDrawcalls++; // The number of draw calls needs to twice as big, since we are interrupting the render pass // TODO: ucomment? if (m_recordedDrawcalls >= m_commitTreshold * 2/* || hasReadback*/) { CommitCommandBuffer(); // TODO: where should this be called? 
LatteTextureReadback_UpdateFinishedTransfers(false); } } void MetalRenderer::draw_updateVertexBuffersDirectAccess() { LatteFetchShader* parsedFetchShader = LatteSHRC_GetActiveFetchShader(); if (!parsedFetchShader) return; for (auto& bufferGroup : parsedFetchShader->bufferGroups) { uint32 bufferIndex = bufferGroup.attributeBufferIndex; uint32 bufferBaseRegisterIndex = mmSQ_VTX_ATTRIBUTE_BLOCK_START + bufferIndex * 7; MPTR bufferAddress = LatteGPUState.contextRegister[bufferBaseRegisterIndex + 0]; if (bufferAddress == MPTR_NULL) [[unlikely]] bufferAddress = m_memoryManager->GetImportedMemBaseAddress(); m_state.m_vertexBufferOffsets[bufferIndex] = bufferAddress - m_memoryManager->GetImportedMemBaseAddress(); } } void MetalRenderer::draw_updateUniformBuffersDirectAccess(LatteDecompilerShader* shader, const uint32 uniformBufferRegOffset) { if (shader->uniformMode == LATTE_DECOMPILER_UNIFORM_MODE_FULL_CBANK) { for (const auto& buf : shader->list_quickBufferList) { sint32 i = buf.index; MPTR physicalAddr = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 0]; uint32 uniformSize = LatteGPUState.contextRegister[uniformBufferRegOffset + i * 7 + 1] + 1; if (physicalAddr == MPTR_NULL) [[unlikely]] { cemu_assert_unimplemented(); continue; } uniformSize = std::min(uniformSize, buf.size); cemu_assert_debug(physicalAddr < 0x50000000); uint32 bufferIndex = i; cemu_assert_debug(bufferIndex < 16); m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][bufferIndex] = physicalAddr - m_memoryManager->GetImportedMemBaseAddress(); } } } void MetalRenderer::draw_handleSpecialState5() { LatteMRT::UpdateCurrentFBO(); LatteRenderTarget_updateViewport(); LatteTextureView* colorBuffer = LatteMRT::GetColorAttachment(0); LatteTextureView* depthBuffer = LatteMRT::GetDepthAttachment(); auto colorTextureMtl = static_cast(colorBuffer); auto depthTextureMtl = static_cast(depthBuffer); sint32 vpWidth, vpHeight; LatteMRT::GetVirtualViewportDimensions(vpWidth, vpHeight); 
// Get the pipeline MTL::PixelFormat colorPixelFormat = colorTextureMtl->GetRGBAView()->pixelFormat(); auto& pipeline = m_copyDepthToColorPipelines[colorPixelFormat]; if (!pipeline) { m_copyDepthToColorDesc->colorAttachments()->object(0)->setPixelFormat(colorPixelFormat); NS::Error* error = nullptr; pipeline = m_device->newRenderPipelineState(m_copyDepthToColorDesc, &error); if (error) { cemuLog_log(LogType::Force, "failed to create copy depth to color pipeline (error: {})", error->localizedDescription()->utf8String()); } } // Sadly, we need to end encoding to ensure that the depth data is up-to-date EndEncoding(); // Copy depth to color auto renderCommandEncoder = GetRenderCommandEncoder(); auto& encoderState = m_state.m_encoderState; renderCommandEncoder->setRenderPipelineState(pipeline); // TODO: make a helper function for this encoderState.m_renderPipelineState = pipeline; SetTexture(renderCommandEncoder, METAL_SHADER_TYPE_FRAGMENT, depthTextureMtl->GetRGBAView(), GET_HELPER_TEXTURE_BINDING(0)); // TODO: make a helper function for this renderCommandEncoder->setFragmentBytes(&vpWidth, sizeof(sint32), GET_HELPER_BUFFER_BINDING(0)); encoderState.m_buffers[METAL_SHADER_TYPE_FRAGMENT][GET_HELPER_BUFFER_BINDING(0)] = {nullptr}; renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypeTriangle, NS::UInteger(0), NS::UInteger(3)); } Renderer::IndexAllocation MetalRenderer::indexData_reserveIndexMemory(uint32 size) { auto allocation = m_memoryManager->GetIndexAllocator().AllocateBufferMemory(size, 128); return {allocation->memPtr, allocation}; } void MetalRenderer::indexData_releaseIndexMemory(IndexAllocation& allocation) { m_memoryManager->GetIndexAllocator().FreeReservation(static_cast(allocation.rendererInternal)); } void MetalRenderer::indexData_uploadIndexMemory(IndexAllocation& allocation) { m_memoryManager->GetIndexAllocator().FlushReservation(static_cast(allocation.rendererInternal)); } LatteQueryObject* MetalRenderer::occlusionQuery_create() { return new 
LatteQueryObjectMtl(this);
}

// Destroys a query object created by occlusionQuery_create.
void MetalRenderer::occlusionQuery_destroy(LatteQueryObject* queryObj)
{
    auto queryObjMtl = static_cast(queryObj);
    delete queryObjMtl;
}

// Blocks until the last command buffer recorded for occlusion queries has completed, if there is one.
void MetalRenderer::occlusionQuery_flush()
{
    if (m_occlusionQuery.m_lastCommandBuffer)
        m_occlusionQuery.m_lastCommandBuffer->waitUntilCompleted();
}

// Processes command buffers that have finished executing.
void MetalRenderer::occlusionQuery_updateState()
{
    ProcessFinishedCommandBuffers();
}

// Binds `buffer` at `offset` to slot `index` of the given shader stage, using the cached encoder state to
// avoid redundant encoder calls:
//  - same buffer and same offset: nothing is encoded
//  - same buffer, different offset: only the offset is updated
//  - different buffer: a full bind is encoded
void MetalRenderer::SetBuffer(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Buffer* buffer, size_t offset, uint32 index)
{
    auto& boundBuffer = m_state.m_encoderState.m_buffers[shaderType][index];
    if (buffer == boundBuffer.m_buffer && offset == boundBuffer.m_offset)
        return;

    if (buffer == boundBuffer.m_buffer)
    {
        // Update just the offset
        boundBuffer.m_offset = offset;

        switch (shaderType)
        {
        case METAL_SHADER_TYPE_VERTEX:
            renderCommandEncoder->setVertexBufferOffset(offset, index);
            break;
        case METAL_SHADER_TYPE_OBJECT:
            renderCommandEncoder->setObjectBufferOffset(offset, index);
            break;
        case METAL_SHADER_TYPE_MESH:
            renderCommandEncoder->setMeshBufferOffset(offset, index);
            break;
        case METAL_SHADER_TYPE_FRAGMENT:
            renderCommandEncoder->setFragmentBufferOffset(offset, index);
            break;
        }

        return;
    }

    boundBuffer = {buffer, offset};

    switch (shaderType)
    {
    case METAL_SHADER_TYPE_VERTEX:
        renderCommandEncoder->setVertexBuffer(buffer, offset, index);
        break;
    case METAL_SHADER_TYPE_OBJECT:
        renderCommandEncoder->setObjectBuffer(buffer, offset, index);
        break;
    case METAL_SHADER_TYPE_MESH:
        renderCommandEncoder->setMeshBuffer(buffer, offset, index);
        break;
    case METAL_SHADER_TYPE_FRAGMENT:
        renderCommandEncoder->setFragmentBuffer(buffer, offset, index);
        break;
    }
}

// Binds `texture` to slot `index` of the given shader stage unless the cached encoder state already holds
// the same texture for that slot.
void MetalRenderer::SetTexture(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::Texture* texture, uint32 index)
{
    auto& boundTexture = m_state.m_encoderState.m_textures[shaderType][index];
    if (texture == boundTexture)
        return;

    boundTexture = texture;

    switch (shaderType)
    {
    case METAL_SHADER_TYPE_VERTEX:
        renderCommandEncoder->setVertexTexture(texture, index);
        break;
    case METAL_SHADER_TYPE_OBJECT:
        renderCommandEncoder->setObjectTexture(texture, index);
        break;
    case METAL_SHADER_TYPE_MESH:
        renderCommandEncoder->setMeshTexture(texture, index);
        break;
    case METAL_SHADER_TYPE_FRAGMENT:
        renderCommandEncoder->setFragmentTexture(texture, index);
        break;
    }
}

// Binds `samplerState` to slot `index` of the given shader stage unless the cached encoder state already
// holds the same sampler for that slot.
void MetalRenderer::SetSamplerState(MTL::RenderCommandEncoder* renderCommandEncoder, MetalShaderType shaderType, MTL::SamplerState* samplerState, uint32 index)
{
    auto& boundSamplerState = m_state.m_encoderState.m_samplers[shaderType][index];
    if (samplerState == boundSamplerState)
        return;

    boundSamplerState = samplerState;

    switch (shaderType)
    {
    case METAL_SHADER_TYPE_VERTEX:
        renderCommandEncoder->setVertexSamplerState(samplerState, index);
        break;
    case METAL_SHADER_TYPE_OBJECT:
        renderCommandEncoder->setObjectSamplerState(samplerState, index);
        break;
    case METAL_SHADER_TYPE_MESH:
        renderCommandEncoder->setMeshSamplerState(samplerState, index);
        break;
    case METAL_SHADER_TYPE_FRAGMENT:
        renderCommandEncoder->setFragmentSamplerState(samplerState, index);
        break;
    }
}

// Returns the command buffer currently being recorded. A new one is created from the command queue when
// there is none yet or the current one has already been committed. A new command buffer first waits on
// m_event for the last signaled value (unless no value has been signaled, i.e. m_eventValue == -1), and the
// draw call counter and commit threshold are reset.
MTL::CommandBuffer* MetalRenderer::GetCommandBuffer()
{
    bool needsNewCommandBuffer = (!m_currentCommandBuffer.m_commandBuffer || m_currentCommandBuffer.m_commited);
    if (needsNewCommandBuffer)
    {
        // Debug
        //m_commandQueue->insertDebugCaptureBoundary();

        MTL::CommandBuffer* mtlCommandBuffer = m_commandQueue->commandBuffer();
        m_currentCommandBuffer = {mtlCommandBuffer};

        // Wait for the previous command buffer
        if (m_eventValue != -1)
            mtlCommandBuffer->encodeWait(m_event, m_eventValue);

        m_recordedDrawcalls = 0;
        m_commitTreshold = m_defaultCommitTreshlod;

        // Debug
        m_performanceMonitor.m_commandBuffers++;

        return mtlCommandBuffer;
    }
    else
    {
        return m_currentCommandBuffer.m_commandBuffer;
    }
}

// Ends the current encoder and starts a render command encoder for the given render pass descriptor.
MTL::RenderCommandEncoder* MetalRenderer::GetTemporaryRenderCommandEncoder(MTL::RenderPassDescriptor* renderPassDescriptor)
{
    EndEncoding();

    auto commandBuffer =
GetCommandBuffer();

    auto renderCommandEncoder = commandBuffer->renderCommandEncoder(renderPassDescriptor);
#ifdef CEMU_DEBUG_ASSERT
    renderCommandEncoder->setLabel(GetLabel("Temporary render command encoder", renderCommandEncoder));
#endif
    m_commandEncoder = renderCommandEncoder;
    m_encoderType = MetalEncoderType::Render;

    // Debug
    m_performanceMonitor.m_renderPasses++;

    return renderCommandEncoder;
}

// Returns a render command encoder for the active FBO.
// The current encoder is reused when it is a render encoder, forceRecreate is false, and every color and
// depth attachment set on the active FBO matches the FBO the encoder was started with (a depth attachment
// that newly requires stencil also counts as a mismatch). Otherwise the current encoder is ended and a new
// render pass is started from the active FBO's render pass descriptor, which also resets the cached encoder
// state.
// Some render passes clear the attachments, forceRecreate is supposed to be used in those cases
MTL::RenderCommandEncoder* MetalRenderer::GetRenderCommandEncoder(bool forceRecreate)
{
    // Check if we need to begin a new render pass
    if (m_commandEncoder)
    {
        if (!forceRecreate)
        {
            if (m_encoderType == MetalEncoderType::Render)
            {
                bool needsNewRenderPass = (m_state.m_lastUsedFBO.m_fbo == nullptr);
                if (!needsNewRenderPass)
                {
                    // Only attachments that are set on the active FBO are compared
                    for (uint8 i = 0; i < 8; i++)
                    {
                        if (m_state.m_activeFBO.m_fbo->colorBuffer[i].texture && m_state.m_activeFBO.m_fbo->colorBuffer[i].texture != m_state.m_lastUsedFBO.m_fbo->colorBuffer[i].texture)
                        {
                            needsNewRenderPass = true;
                            break;
                        }
                    }
                }

                if (!needsNewRenderPass)
                {
                    if (m_state.m_activeFBO.m_fbo->depthBuffer.texture && (m_state.m_activeFBO.m_fbo->depthBuffer.texture != m_state.m_lastUsedFBO.m_fbo->depthBuffer.texture || ( m_state.m_activeFBO.m_fbo->depthBuffer.hasStencil && !m_state.m_lastUsedFBO.m_fbo->depthBuffer.hasStencil)))
                    {
                        needsNewRenderPass = true;
                    }
                }

                if (!needsNewRenderPass)
                {
                    return (MTL::RenderCommandEncoder*)m_commandEncoder;
                }
            }
        }

        EndEncoding();
    }

    auto commandBuffer = GetCommandBuffer();

    auto renderCommandEncoder = commandBuffer->renderCommandEncoder(m_state.m_activeFBO.m_fbo->GetRenderPassDescriptor());
#ifdef CEMU_DEBUG_ASSERT
    renderCommandEncoder->setLabel(GetLabel("Render command encoder", renderCommandEncoder));
#endif
    m_commandEncoder = renderCommandEncoder;
    m_encoderType = MetalEncoderType::Render;

    // Update state
    m_state.m_lastUsedFBO = m_state.m_activeFBO;
    m_state.m_isFirstDrawInRenderPass = true;

    ResetEncoderState();

    // Debug
    m_performanceMonitor.m_renderPasses++;

    return renderCommandEncoder;
}

// Returns the current compute command encoder, or ends the current encoder of another type and starts a new
// compute encoder (resetting the cached encoder state).
MTL::ComputeCommandEncoder* MetalRenderer::GetComputeCommandEncoder()
{
    if (m_commandEncoder)
    {
        if (m_encoderType == MetalEncoderType::Compute)
        {
            return (MTL::ComputeCommandEncoder*)m_commandEncoder;
        }

        EndEncoding();
    }

    auto commandBuffer = GetCommandBuffer();

    auto computeCommandEncoder = commandBuffer->computeCommandEncoder();
    m_commandEncoder = computeCommandEncoder;
    m_encoderType = MetalEncoderType::Compute;

    ResetEncoderState();

    return computeCommandEncoder;
}

// Returns the current blit command encoder, or ends the current encoder of another type and starts a new
// blit encoder (resetting the cached encoder state).
MTL::BlitCommandEncoder* MetalRenderer::GetBlitCommandEncoder()
{
    if (m_commandEncoder)
    {
        if (m_encoderType == MetalEncoderType::Blit)
        {
            return (MTL::BlitCommandEncoder*)m_commandEncoder;
        }

        EndEncoding();
    }

    auto commandBuffer = GetCommandBuffer();

    auto blitCommandEncoder = commandBuffer->blitCommandEncoder();
    m_commandEncoder = blitCommandEncoder;
    m_encoderType = MetalEncoderType::Blit;

    ResetEncoderState();

    return blitCommandEncoder;
}

// Ends and releases the current command encoder, if any. Afterwards the command buffer is committed when the
// number of recorded draw calls has reached the commit threshold.
void MetalRenderer::EndEncoding()
{
    if (m_commandEncoder)
    {
        m_commandEncoder->endEncoding();
        m_commandEncoder->release();
        // Cleared before the commit below: CommitCommandBuffer calls EndEncoding again, which is then a no-op
        m_commandEncoder = nullptr;
        m_encoderType = MetalEncoderType::None;

        // Commit the command buffer if enough draw calls have been recorded
        if (m_recordedDrawcalls >= m_commitTreshold)
            CommitCommandBuffer();
    }
}

// Ends the current encoder, processes finished command buffers and commits the current command buffer if it
// has not been committed yet. Before committing, a signal of m_event with the next event value (wrapping at
// EVENT_VALUE_WRAP) is encoded; the committed buffer is appended to m_executingCommandBuffers.
// Does nothing when there is no current command buffer.
void MetalRenderer::CommitCommandBuffer()
{
    if (!m_currentCommandBuffer.m_commandBuffer)
        return;

    EndEncoding();

    ProcessFinishedCommandBuffers();

    // Commit the command buffer
    if (!m_currentCommandBuffer.m_commited)
    {
        // Handled differently, since it seems like Metal doesn't always call the completion handler
        //commandBuffer.m_commandBuffer->addCompletedHandler(^(MTL::CommandBuffer*) {
        //    m_memoryManager->GetTemporaryBufferAllocator().CommandBufferFinished(commandBuffer.m_commandBuffer);
        //});

        // Signal event
        m_eventValue = (m_eventValue + 1) % EVENT_VALUE_WRAP;
        auto mtlCommandBuffer = m_currentCommandBuffer.m_commandBuffer;
        mtlCommandBuffer->encodeSignalEvent(m_event, m_eventValue);
mtlCommandBuffer->commit();
        m_currentCommandBuffer.m_commited = true;

        m_executingCommandBuffers.push_back(mtlCommandBuffer);

        // Debug
        //m_commandQueue->insertDebugCaptureBoundary();
    }
}

// Walks the list of executing command buffers. Each one that has completed gets its buffers cleaned up by
// the memory manager, is released and is removed from the list.
void MetalRenderer::ProcessFinishedCommandBuffers()
{
    // Check for finished command buffers
    for (auto it = m_executingCommandBuffers.begin(); it != m_executingCommandBuffers.end();)
    {
        auto commandBuffer = *it;
        if (CommandBufferCompleted(commandBuffer))
        {
            m_memoryManager->CleanupBuffers(commandBuffer);
            commandBuffer->release();

            // erase returns the iterator to the next element
            it = m_executingCommandBuffers.erase(it);
        }
        else
        {
            ++it;
        }
    }
}

// Acquires the next drawable of the main window or pad layer. Before that, the layer's pixel format is
// switched between BGRA8Unorm and BGRA8Unorm_sRGB when the sRGB setting of the corresponding Latte output
// buffer differs from m_state.m_usesSRGB.
// Returns false when the layer does not exist, otherwise the result of the layer's AcquireDrawable.
// NOTE(review): m_state.m_usesSRGB is a single flag compared for both layers. If the main window and the pad
// use the same sRGB setting, only the layer that is acquired first after a change gets its pixel format
// updated - confirm whether the flag should be tracked per layer.
bool MetalRenderer::AcquireDrawable(bool mainWindow)
{
    auto& layer = GetLayer(mainWindow);
    if (!layer.GetLayer())
        return false;

    const bool latteBufferUsesSRGB = mainWindow ? LatteGPUState.tvBufferUsesSRGB : LatteGPUState.drcBufferUsesSRGB;
    if (latteBufferUsesSRGB != m_state.m_usesSRGB)
    {
        layer.GetLayer()->setPixelFormat(latteBufferUsesSRGB ? MTL::PixelFormatBGRA8Unorm_sRGB : MTL::PixelFormatBGRA8Unorm);
        m_state.m_usesSRGB = latteBufferUsesSRGB;
    }

    return layer.AcquireDrawable();
}

/*
bool MetalRenderer::CheckIfRenderPassNeedsFlush(LatteDecompilerShader* shader)
{
    sint32 textureCount = shader->resourceMapping.getTextureCount();
    for (int i = 0; i < textureCount; ++i)
    {
        const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i);
        auto hostTextureUnit = relative_textureUnit;
        auto textureDim = shader->textureUnitDim[relative_textureUnit];

        // Texture is accessed as a framebuffer fetch, therefore there is no need to flush it
        if (shader->textureRenderTargetIndex[relative_textureUnit] != 255)
            continue;

        auto texUnitRegIndex = hostTextureUnit * 7;
        switch (shader->shaderType)
        {
        case LatteConst::ShaderType::Vertex:
            hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE;
            texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS;
            break;
        case LatteConst::ShaderType::Pixel:
            hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE;
            texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS;
break; case LatteConst::ShaderType::Geometry: hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; break; default: UNREACHABLE; } auto textureView = m_state.m_textures[hostTextureUnit]; if (!textureView) continue; LatteTexture* baseTexture = textureView->baseTexture; // If the texture is also used in the current render pass, we need to end the render pass to "flush" the texture for (uint8 i = 0; i < LATTE_NUM_COLOR_TARGET; i++) { auto colorTarget = m_state.m_activeFBO.m_fbo->colorBuffer[i].texture; if (colorTarget && colorTarget->baseTexture == baseTexture) return true; } } return false; } */ void MetalRenderer::BindStageResources(MTL::RenderCommandEncoder* renderCommandEncoder, LatteDecompilerShader* shader, bool usesGeometryShader) { auto mtlShaderType = GetMtlShaderType(shader->shaderType, usesGeometryShader); sint32 textureCount = shader->resourceMapping.getTextureCount(); for (int i = 0; i < textureCount; ++i) { const auto relative_textureUnit = shader->resourceMapping.getTextureUnitFromBindingPoint(i); auto hostTextureUnit = relative_textureUnit; // Don't bind textures that are accessed with a framebuffer fetch if (m_supportsFramebufferFetch && shader->textureRenderTargetIndex[relative_textureUnit] != 255) continue; auto textureDim = shader->textureUnitDim[relative_textureUnit]; auto texUnitRegIndex = hostTextureUnit * 7; switch (shader->shaderType) { case LatteConst::ShaderType::Vertex: hostTextureUnit += LATTE_CEMU_VS_TEX_UNIT_BASE; texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_VS; break; case LatteConst::ShaderType::Pixel: hostTextureUnit += LATTE_CEMU_PS_TEX_UNIT_BASE; texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_PS; break; case LatteConst::ShaderType::Geometry: hostTextureUnit += LATTE_CEMU_GS_TEX_UNIT_BASE; texUnitRegIndex += Latte::REGADDR::SQ_TEX_RESOURCE_WORD0_N_GS; break; default: UNREACHABLE; } // TODO: correct? 
uint32 binding = shader->resourceMapping.getTextureBaseBindingPoint() + i; if (binding >= MAX_MTL_TEXTURES) { cemuLog_logOnce(LogType::Force, "invalid texture binding {}", binding); continue; } auto textureView = m_state.m_textures[hostTextureUnit]; if (!textureView) { if (textureDim == Latte::E_DIM::DIM_1D) SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); else SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); SetSamplerState(renderCommandEncoder, mtlShaderType, m_nearestSampler, binding); continue; } if (textureDim == Latte::E_DIM::DIM_1D && (textureView->dim != Latte::E_DIM::DIM_1D)) { SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture1D, binding); continue; } else if (textureDim == Latte::E_DIM::DIM_2D && (textureView->dim != Latte::E_DIM::DIM_2D && textureView->dim != Latte::E_DIM::DIM_2D_MSAA)) { SetTexture(renderCommandEncoder, mtlShaderType, m_nullTexture2D, binding); continue; } LatteTexture* baseTexture = textureView->baseTexture; uint32 stageSamplerIndex = shader->textureUnitSamplerAssignment[relative_textureUnit]; MTL::SamplerState* sampler; if (stageSamplerIndex != LATTE_DECOMPILER_SAMPLER_NONE) { uint32 samplerIndex = stageSamplerIndex + LatteDecompiler_getTextureSamplerBaseIndex(shader->shaderType); sampler = m_samplerCache->GetSamplerState(LatteGPUState.contextNew, samplerIndex); } else { sampler = m_nearestSampler; } SetSamplerState(renderCommandEncoder, mtlShaderType, sampler, binding); // get texture register word 0 uint32 word4 = LatteGPUState.contextRegister[texUnitRegIndex + 4]; auto& boundTexture = m_state.m_encoderState.m_textures[mtlShaderType][binding]; MTL::Texture* mtlTexture = textureView->GetSwizzledView(word4); SetTexture(renderCommandEncoder, mtlShaderType, mtlTexture, binding); } // Support buffer auto GET_UNIFORM_DATA_PTR = [&](size_t index) { return supportBufferData + (index / 4); }; sint32 shaderAluConst; sint32 shaderUniformRegisterOffset; switch (shader->shaderType) 
{ case LatteConst::ShaderType::Vertex: shaderAluConst = 0x400; shaderUniformRegisterOffset = mmSQ_VTX_UNIFORM_BLOCK_START; break; case LatteConst::ShaderType::Pixel: shaderAluConst = 0; shaderUniformRegisterOffset = mmSQ_PS_UNIFORM_BLOCK_START; break; case LatteConst::ShaderType::Geometry: shaderAluConst = 0; // geometry shader has no ALU const shaderUniformRegisterOffset = mmSQ_GS_UNIFORM_BLOCK_START; break; default: UNREACHABLE; } if (shader->resourceMapping.uniformVarsBufferBindingPoint >= 0) { if (shader->uniform.list_ufTexRescale.empty() == false) { for (auto& entry : shader->uniform.list_ufTexRescale) { float* xyScale = LatteTexture_getEffectiveTextureScale(shader->shaderType, entry.texUnit); memcpy(entry.currentValue, xyScale, sizeof(float) * 2); memcpy(GET_UNIFORM_DATA_PTR(entry.uniformLocation), xyScale, sizeof(float) * 2); } } if (shader->uniform.loc_alphaTestRef >= 0) { *GET_UNIFORM_DATA_PTR(shader->uniform.loc_alphaTestRef) = LatteGPUState.contextNew.SX_ALPHA_REF.get_ALPHA_TEST_REF(); } if (shader->uniform.loc_pointSize >= 0) { const auto& pointSizeReg = LatteGPUState.contextNew.PA_SU_POINT_SIZE; float pointWidth = (float)pointSizeReg.get_WIDTH() / 8.0f; if (pointWidth == 0.0f) pointWidth = 1.0f / 8.0f; // minimum size *GET_UNIFORM_DATA_PTR(shader->uniform.loc_pointSize) = pointWidth; } if (shader->uniform.loc_remapped >= 0) { LatteBufferCache_LoadRemappedUniforms(shader, GET_UNIFORM_DATA_PTR(shader->uniform.loc_remapped)); } if (shader->uniform.loc_uniformRegister >= 0) { uint32* uniformRegData = (uint32*)(LatteGPUState.contextRegister + mmSQ_ALU_CONSTANT0_0 + shaderAluConst); memcpy(GET_UNIFORM_DATA_PTR(shader->uniform.loc_uniformRegister), uniformRegData, shader->uniform.count_uniformRegister * 16); } if (shader->uniform.loc_windowSpaceToClipSpaceTransform >= 0) { sint32 viewportWidth; sint32 viewportHeight; LatteRenderTarget_GetCurrentVirtualViewportSize(&viewportWidth, &viewportHeight); // always call after _updateViewport() float* v = 
GET_UNIFORM_DATA_PTR(shader->uniform.loc_windowSpaceToClipSpaceTransform); v[0] = 2.0f / (float)viewportWidth; v[1] = 2.0f / (float)viewportHeight; } if (shader->uniform.loc_fragCoordScale >= 0) { LatteMRT::GetCurrentFragCoordScale(GET_UNIFORM_DATA_PTR(shader->uniform.loc_fragCoordScale)); } if (shader->uniform.loc_verticesPerInstance >= 0) { *(int*)(supportBufferData + ((size_t)shader->uniform.loc_verticesPerInstance / 4)) = m_state.m_streamoutState.verticesPerInstance; for (sint32 b = 0; b < LATTE_NUM_STREAMOUT_BUFFER; b++) { if (shader->uniform.loc_streamoutBufferBase[b] >= 0) { *(uint32*)GET_UNIFORM_DATA_PTR(shader->uniform.loc_streamoutBufferBase[b]) = m_state.m_streamoutState.buffers[b].ringBufferOffset; } } } size_t size = shader->uniform.uniformRangeSize; auto& bufferAllocator = m_memoryManager->GetStagingAllocator(); auto allocation = bufferAllocator.AllocateBufferMemory(size, 1); memcpy(allocation.memPtr, supportBufferData, size); bufferAllocator.FlushReservation(allocation); SetBuffer(renderCommandEncoder, mtlShaderType, allocation.mtlBuffer, allocation.bufferOffset, shader->resourceMapping.uniformVarsBufferBindingPoint); } // Uniform buffers for (sint32 i = 0; i < LATTE_NUM_MAX_UNIFORM_BUFFERS; i++) { if (shader->resourceMapping.uniformBuffersBindingPoint[i] >= 0) { uint32 binding = shader->resourceMapping.uniformBuffersBindingPoint[i]; if (binding >= MAX_MTL_BUFFERS) { cemuLog_logOnce(LogType::Force, "invalid buffer binding {}", binding); continue; } size_t offset = m_state.m_uniformBufferOffsets[GetMtlGeneralShaderType(shader->shaderType)][i]; if (offset == INVALID_OFFSET) continue; SetBuffer(renderCommandEncoder, mtlShaderType, m_memoryManager->GetBufferCache(), offset, binding); } } // Storage buffer if (shader->resourceMapping.tfStorageBindingPoint >= 0) { SetBuffer(renderCommandEncoder, mtlShaderType, m_xfbRingBuffer, 0, shader->resourceMapping.tfStorageBindingPoint); } } void MetalRenderer::ClearColorTextureInternal(MTL::Texture* mtlTexture, 
sint32 sliceIndex, sint32 mipIndex, float r, float g, float b, float a) { MTL::RenderPassDescriptor* renderPassDescriptor = MTL::RenderPassDescriptor::alloc()->init(); auto colorAttachment = renderPassDescriptor->colorAttachments()->object(0); colorAttachment->setTexture(mtlTexture); colorAttachment->setClearColor(MTL::ClearColor(r, g, b, a)); colorAttachment->setLoadAction(MTL::LoadActionClear); colorAttachment->setStoreAction(MTL::StoreActionStore); colorAttachment->setSlice(sliceIndex); colorAttachment->setLevel(mipIndex); GetTemporaryRenderCommandEncoder(renderPassDescriptor); renderPassDescriptor->release(); EndEncoding(); // Debug m_performanceMonitor.m_clears++; } void MetalRenderer::CopyBufferToBuffer(MTL::Buffer* src, uint32 srcOffset, MTL::Buffer* dst, uint32 dstOffset, uint32 size, MTL::RenderStages after, MTL::RenderStages before) { // TODO: uncomment and fix performance issues // Do the copy in a vertex shader on Apple GPUs /* if (m_isAppleGPU && m_encoderType == MetalEncoderType::Render) { auto renderCommandEncoder = static_cast(m_commandEncoder); MTL::Resource* barrierBuffers[] = {src}; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, after, after | MTL::RenderStageVertex); renderCommandEncoder->setRenderPipelineState(m_copyBufferToBufferPipeline->GetRenderPipelineState()); m_state.m_encoderState.m_renderPipelineState = m_copyBufferToBufferPipeline->GetRenderPipelineState(); SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, src, srcOffset, GET_HELPER_BUFFER_BINDING(0)); SetBuffer(renderCommandEncoder, METAL_SHADER_TYPE_VERTEX, dst, dstOffset, GET_HELPER_BUFFER_BINDING(1)); renderCommandEncoder->drawPrimitives(MTL::PrimitiveTypePoint, NS::UInteger(0), NS::UInteger(size)); barrierBuffers[0] = dst; renderCommandEncoder->memoryBarrier(barrierBuffers, 1, before | MTL::RenderStageVertex, before); } else { */ auto blitCommandEncoder = GetBlitCommandEncoder(); blitCommandEncoder->copyFromBuffer(src, srcOffset, dst, dstOffset, size); //} } void 
MetalRenderer::SwapBuffer(bool mainWindow) { if (!AcquireDrawable(mainWindow)) return; auto commandBuffer = GetCommandBuffer(); GetLayer(mainWindow).PresentDrawable(commandBuffer); } void MetalRenderer::EnsureImGuiBackend() { if (!ImGui::GetIO().BackendRendererUserData) { ImGui_ImplMetal_Init(m_device); //ImGui_ImplMetal_CreateFontsTexture(m_device); } } void MetalRenderer::StartCapture() { auto captureManager = MTL::CaptureManager::sharedCaptureManager(); auto desc = MTL::CaptureDescriptor::alloc()->init(); desc->setCaptureObject(m_device); // Check if a debugger with support for GPU capture is attached if (captureManager->supportsDestination(MTL::CaptureDestinationDeveloperTools)) { desc->setDestination(MTL::CaptureDestinationDeveloperTools); } else { if (GetConfig().gpu_capture_dir.GetValue().empty()) { cemuLog_log(LogType::Force, "No GPU capture directory specified, cannot do a GPU capture"); return; } // Check if the GPU trace document destination is available if (!captureManager->supportsDestination(MTL::CaptureDestinationGPUTraceDocument)) { cemuLog_log(LogType::Force, "GPU trace document destination is not available, cannot do a GPU capture"); return; } // Get current date and time as a string auto now = std::chrono::system_clock::now(); std::time_t now_time = std::chrono::system_clock::to_time_t(now); std::ostringstream oss; oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d_%H-%M-%S"); std::string now_str = oss.str(); std::string capturePath = fmt::format("{}/cemu_{}.gputrace", GetConfig().gpu_capture_dir.GetValue(), now_str); desc->setDestination(MTL::CaptureDestinationGPUTraceDocument); desc->setOutputURL(ToNSURL(capturePath)); } NS::Error* error = nullptr; captureManager->startCapture(desc, &error); if (error) { cemuLog_log(LogType::Force, "Failed to start GPU capture: {}", error->localizedDescription()->utf8String()); } m_capturing = true; } void MetalRenderer::EndCapture() { auto captureManager = MTL::CaptureManager::sharedCaptureManager(); 
captureManager->stopCapture(); m_capturing = false; }