From 62e2d8d9a7b8aaa110673862b80733b4e93b42c7 Mon Sep 17 00:00:00 2001
From: S Gopal Rajagopal
Date: Thu, 29 Jan 2015 20:20:34 +0530
Subject: [PATCH] SPURS: Update kernel to use lock line reservations

---
 rpcs3/Emu/Cell/MFC.h                        |   8 +
 rpcs3/Emu/Cell/SPUThread.cpp                |   9 +-
 rpcs3/Emu/Cell/SPUThread.h                  |  31 +-
 rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp    |  19 +-
 rpcs3/Emu/SysCalls/Modules/cellSpurs.h      |  19 +-
 rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp | 623 +++++++++++---------
 rpcs3/stdafx.h                              |   2 +
 7 files changed, 414 insertions(+), 297 deletions(-)

diff --git a/rpcs3/Emu/Cell/MFC.h b/rpcs3/Emu/Cell/MFC.h
index a6c731d3da..0b669deb97 100644
--- a/rpcs3/Emu/Cell/MFC.h
+++ b/rpcs3/Emu/Cell/MFC.h
@@ -35,6 +35,14 @@ enum
     MFC_GETLLAR_SUCCESS = 4,
 };
 
+// MFC Write Tag Status Update Request Channel (ch23) operations
+enum
+{
+    MFC_TAG_UPDATE_IMMEDIATE = 0,
+    MFC_TAG_UPDATE_ANY = 1,
+    MFC_TAG_UPDATE_ALL = 2,
+};
+
 enum
 {
     MFC_SPU_TO_PPU_MAILBOX_STATUS_MASK = 0x000000FF,
diff --git a/rpcs3/Emu/Cell/SPUThread.cpp b/rpcs3/Emu/Cell/SPUThread.cpp
index 95f678bf02..fcb9b012e9 100644
--- a/rpcs3/Emu/Cell/SPUThread.cpp
+++ b/rpcs3/Emu/Cell/SPUThread.cpp
@@ -1060,7 +1060,14 @@ void SPUThread::StopAndSignal(u32 code)
 
     case 0x003:
     {
-        m_code3_func(*this);
+        auto iter = m_addr_to_hle_function_map.find(PC);
+        assert(iter != m_addr_to_hle_function_map.end());
+
+        auto return_to_caller = iter->second(*this);
+        if (return_to_caller)
+        {
+            SetBranch(GPR[0]._u32[3] & 0x3fffc);
+        }
         break;
     }
diff --git a/rpcs3/Emu/Cell/SPUThread.h b/rpcs3/Emu/Cell/SPUThread.h
index f880e5ca6e..d6ecbe64b0 100644
--- a/rpcs3/Emu/Cell/SPUThread.h
+++ b/rpcs3/Emu/Cell/SPUThread.h
@@ -290,6 +290,8 @@ public:
     u32 m_event_mask;
     u32 m_events;
 
+    std::unordered_map<u32, std::function<bool(SPUThread&)>> m_addr_to_hle_function_map;
+
     struct IntrTag
     {
         u32 enabled; // 1 == true
@@ -509,8 +511,35 @@ public:
     void WriteLS64 (const u32 lsa, const u64& data) const { vm::write64 (lsa + m_offset, data); }
     void WriteLS128(const u32 lsa, const u128& data) const { vm::write128(lsa + m_offset, data); }
 
+    void RegisterHleFunction(u32 addr, std::function<bool(SPUThread&)> function)
+    {
+        m_addr_to_hle_function_map[addr] = function;
+        WriteLS32(addr, 0x00000003); // STOP 3
+    }
+
+    void UnregisterHleFunction(u32 addr)
+    {
+        WriteLS32(addr, 0x00200000); // NOP
+        m_addr_to_hle_function_map.erase(addr);
+    }
+
+    void UnregisterHleFunctions(u32 start_addr, u32 end_addr)
+    {
+        for (auto iter = m_addr_to_hle_function_map.begin(); iter != m_addr_to_hle_function_map.end();)
+        {
+            if (iter->first >= start_addr && iter->first <= end_addr)
+            {
+                WriteLS32(iter->first, 0x00200000); // NOP
+                m_addr_to_hle_function_map.erase(iter++);
+            }
+            else
+            {
+                iter++;
+            }
+        }
+    }
+
     std::function<void(SPUThread&)> m_custom_task;
-    std::function<void(SPUThread&)> m_code3_func;
 
 public:
     SPUThread(CPUThreadType type = CPU_THREAD_SPU);
diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp
index c9795203f8..62349f276d 100644
--- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp
+++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.cpp
@@ -26,7 +26,7 @@ extern u32 libsre;
 extern u32 libsre_rtoc;
 #endif
 
-void spursKernelMain(SPUThread & spu);
+bool spursKernelMain(SPUThread & spu);
 s64 cellSpursLookUpTasksetAddress(vm::ptr<CellSpurs> spurs, vm::ptr<CellSpursTaskset> taskset, u32 id);
 s64 _cellSpursSendSignal(vm::ptr<CellSpursTaskset> taskset, u32 taskID);
@@ -155,7 +155,8 @@ s64 spursInit(
         assert(!"spu_image_import() failed");
     }
 #else
-    spurs->m.spuImg.addr = (u32)Memory.Alloc(0x40000, 4096);
+    spurs->m.spuImg.addr = (u32)Memory.Alloc(0x40000, 4096);
+    spurs->m.spuImg.entry_point = isSecond ? CELL_SPURS_KERNEL2_ENTRY_ADDR : CELL_SPURS_KERNEL1_ENTRY_ADDR;
 #endif
 
     s32 tgt = SYS_SPU_THREAD_GROUP_TYPE_NORMAL;
@@ -179,17 +180,11 @@ s64 spursInit(
     name += "CellSpursKernel0";
     for (s32 num = 0; num < nSpus; num++, name[name.size() - 1]++)
     {
-        spurs->m.spus[num] = spu_thread_initialize(tg, num, spurs->m.spuImg, name, SYS_SPU_THREAD_OPTION_DEC_SYNC_TB_ENABLE, 0, 0, 0, 0, [spurs, num](SPUThread& SPU)
-        {
-            SPU.GPR[3]._u32[3] = num;
-            SPU.GPR[4]._u64[1] = spurs.addr();
-
-#ifdef PRX_DEBUG_XXX
-            return SPU.FastCall(SPU.PC);
+        auto spu = spu_thread_initialize(tg, num, spurs->m.spuImg, name, SYS_SPU_THREAD_OPTION_DEC_SYNC_TB_ENABLE, num, spurs.addr(), 0, 0);
+#ifndef PRX_DEBUG_XXX
+        spu->RegisterHleFunction(spurs->m.spuImg.entry_point, spursKernelMain);
 #endif
-
-            spursKernelMain(SPU);
-        })->GetId();
+        spurs->m.spus[num] = spu->GetId();
     }
 
     if (flags & SAF_SPU_PRINTF_ENABLED)
diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpurs.h b/rpcs3/Emu/SysCalls/Modules/cellSpurs.h
index 4ccf224ec7..4d77a06402 100644
--- a/rpcs3/Emu/SysCalls/Modules/cellSpurs.h
+++ b/rpcs3/Emu/SysCalls/Modules/cellSpurs.h
@@ -102,6 +102,12 @@ enum SPURSKernelInterfaces
     CELL_SPURS_INTERRUPT_VECTOR = 0x0,
     CELL_SPURS_LOCK_LINE = 0x80,
     CELL_SPURS_KERNEL_DMA_TAG_ID = 31,
+    CELL_SPURS_KERNEL1_ENTRY_ADDR = 0x818,
+    CELL_SPURS_KERNEL2_ENTRY_ADDR = 0x848,
+    CELL_SPURS_KERNEL1_YIELD_ADDR = 0x808,
+    CELL_SPURS_KERNEL2_YIELD_ADDR = 0x838,
+    CELL_SPURS_KERNEL1_SELECT_WORKLOAD_ADDR = 0x290,
+    CELL_SPURS_KERNEL2_SELECT_WORKLOAD_ADDR = 0x290,
 };
 
 enum RangeofEventQueuePortNumbers
@@ -885,14 +891,23 @@ struct SpursKernelMgmtData
     u8 spuIdling;                 // 0x1EB
     be_t<u16> wklRunnable1;       // 0x1EC
     be_t<u16> wklRunnable2;       // 0x1EE
-    u8 x1F0[0x210 - 0x1F0];       // 0x1F0
+    be_t<u32> x1F0;               // 0x1F0
+    be_t<u32> x1F4;               // 0x1F4
+    be_t<u32> x1F8;               // 0x1F8
+    be_t<u32> x1FC;               // 0x1FC
+    be_t<u32> x200;               // 0x200
+    be_t<u32> x204;               // 0x204
+    be_t<u32> x208;               // 0x208
+    be_t<u32> x20C;               // 0x20C
     be_t<u64> traceBuffer;        // 0x210
    be_t<u32> traceMsgCount;      // 0x218
     be_t<u32> traceMaxCount;      // 0x21C
     u8 wklUniqueId[0x10];         // 0x220
+    u8 x230[0x280 - 0x230];       // 0x230
+    be_t<u32> guid[4];            // 0x280
 };
 
-static_assert(sizeof(SpursKernelMgmtData) == 0x130, "Incorrect size for SpursKernelMgmtData");
+static_assert(sizeof(SpursKernelMgmtData) == 0x190, "Incorrect size for SpursKernelMgmtData");
 
 // The SPURS taskset policy module data store. This resides at 0x2700 of the LS.
 struct SpursTasksetPmMgmtData
diff --git a/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp b/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
index e4ebca84a5..898638894c 100644
--- a/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
+++ b/rpcs3/Emu/SysCalls/Modules/cellSpursSpu.cpp
@@ -14,11 +14,15 @@
 void cellSpursModulePutTrace(CellSpursTracePacket * packet, unsigned tag);
 u32 cellSpursModulePollStatus(SPUThread & spu, u32 * status);
 
+bool spursDma(SPUThread & spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag);
+u32 spursDmaGetCompletionStatus(SPUThread & spu, u32 tagMask);
+u32 spursDmaWaitForCompletion(SPUThread & spu, u32 tagMask, bool waitForAll = true);
+
 //
 // SPURS Kernel functions
 //
-void spursKernelSelectWorkload(SPUThread & spu);
-void spursKernelSelectWorkload2(SPUThread & spu);
+bool spursKernel1SelectWorkload(SPUThread & spu);
+bool spursKernel2SelectWorkload(SPUThread & spu);
 
 //
 // SPURS system service workload functions
@@ -31,7 +35,7 @@ void spursSysServiceUpdateWorkload(SPUThread & spu, SpursKernelMgmtData * mgmt);
 void spursSysServiceProcessMessages(SPUThread & spu, SpursKernelMgmtData * mgmt);
 void spursSysServiceWaitOrExit(SPUThread & spu, SpursKernelMgmtData * mgmt);
 void spursSysServiceWorkloadMain(SPUThread & spu, u32 pollStatus);
-void spursSysServiceWorkloadEntry(SPUThread & spu);
+bool spursSysServiceWorkloadEntry(SPUThread & spu);
 
 //
 // SPURS taskset policy module functions
@@ -54,9 +58,9 @@ u32 cellSpursModulePollStatus(SPUThread & spu, u32 * status) {
 
     spu.GPR[3]._u32[3] = 1;
     if (mgmt->spurs->m.flags1 & SF1_32_WORKLOADS) {
-        spursKernelSelectWorkload2(spu);
+        spursKernel2SelectWorkload(spu);
     } else {
-        spursKernelSelectWorkload(spu);
+        spursKernel1SelectWorkload(spu);
     }
 
     auto result = spu.GPR[3]._u64[1];
@@ -68,14 +72,51 @@ u32 cellSpursModulePollStatus(SPUThread & spu, u32 * status) {
     return wklId == mgmt->wklCurrentId ? 0 : 1;
 }
 
+/// Execute a DMA operation
+bool spursDma(SPUThread & spu, u32 cmd, u64 ea, u32 lsa, u32 size, u32 tag) {
+    spu.WriteChannel(MFC_LSA, u128::from32r(lsa));
+    spu.WriteChannel(MFC_EAH, u128::from32r((u32)(ea >> 32)));
+    spu.WriteChannel(MFC_EAL, u128::from32r((u32)ea));
+    spu.WriteChannel(MFC_Size, u128::from32r(size));
+    spu.WriteChannel(MFC_TagID, u128::from32r(tag));
+    spu.WriteChannel(MFC_Cmd, u128::from32r(cmd));
+
+    if (cmd == MFC_GETLLAR_CMD || cmd == MFC_PUTLLC_CMD || cmd == MFC_PUTLLUC_CMD) {
+        u128 rv;
+
+        spu.ReadChannel(rv, MFC_RdAtomicStat);
+        return rv._u32[3] ? true : false;
+    }
+
+    return true;
+}
+
+/// Get the status of DMA operations
+u32 spursDmaGetCompletionStatus(SPUThread & spu, u32 tagMask) {
+    u128 rv;
+
+    spu.WriteChannel(MFC_WrTagMask, u128::from32r(tagMask));
+    spu.WriteChannel(MFC_WrTagUpdate, u128::from32r(MFC_TAG_UPDATE_IMMEDIATE));
+    spu.ReadChannel(rv, MFC_RdTagStat);
+    return rv._u32[3];
+}
+
+/// Wait for DMA operations to complete
+u32 spursDmaWaitForCompletion(SPUThread & spu, u32 tagMask, bool waitForAll) {
+    u128 rv;
+
+    spu.WriteChannel(MFC_WrTagMask, u128::from32r(tagMask));
+    spu.WriteChannel(MFC_WrTagUpdate, u128::from32r(waitForAll ? MFC_TAG_UPDATE_ALL : MFC_TAG_UPDATE_ANY));
+    spu.ReadChannel(rv, MFC_RdTagStat);
+    return rv._u32[3];
+}
+
 //////////////////////////////////////////////////////////////////////////////
 // SPURS kernel functions
 //////////////////////////////////////////////////////////////////////////////
 
 /// Select a workload to run
-void spursKernelSelectWorkload(SPUThread & spu) {
-    LV2_LOCK(0); // TODO: lock-free implementation if possible
-
+bool spursKernel1SelectWorkload(SPUThread & spu) {
     auto mgmt = vm::get_ptr<SpursKernelMgmtData>(spu.ls_offset + 0x100);
 
     // The first and only argument to this function is a boolean that is set to false if the function
@@ -83,140 +124,148 @@ void spursKernelSelectWorkload(SPUThread & spu) {
     // If the first argument is true then the shared data is not updated with the result.
     const auto isPoll = spu.GPR[3]._u32[3];
 
-    // Calculate the contention (number of SPUs used) for each workload
-    u8 contention[CELL_SPURS_MAX_WORKLOAD];
-    u8 pendingContention[CELL_SPURS_MAX_WORKLOAD];
-    for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
-        contention[i] = mgmt->spurs->m.wklCurrentContention[i] - mgmt->wklLocContention[i];
+    u32 wklSelectedId;
+    u32 pollStatus;
 
-        // If this is a poll request then the number of SPUs pending to context switch is also added to the contention presumably
-        // to prevent unnecessary jumps to the kernel
-        if (isPoll) {
-            pendingContention[i] = mgmt->spurs->m.wklPendingContention[i] - mgmt->wklLocPendingContention[i];
-            if (i != mgmt->wklCurrentId) {
-                contention[i] += pendingContention[i];
-            }
-        }
-    }
+    do {
+        // DMA and lock the first 0x80 bytes of spurs
+        spursDma(spu, MFC_GETLLAR_CMD, mgmt->spurs.addr(), 0x100/*LSA*/, 0x80/*size*/, 0/*tag*/);
+        CellSpurs * spurs = (CellSpurs *)mgmt->tempArea;
 
-    u32 wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
-    u32 pollStatus = 0;
-
-    // The system service workload has the highest priority. Select the system service workload if
-    // the system service message bit for this SPU is set.
-    if (mgmt->spurs->m.sysSrvMessage.read_relaxed() & (1 << mgmt->spuNum)) {
-        mgmt->spuIdling = 0;
-        if (!isPoll || mgmt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            // Clear the message bit
-            mgmt->spurs->m.sysSrvMessage.write_relaxed(mgmt->spurs->m.sysSrvMessage.read_relaxed() & ~(1 << mgmt->spuNum));
-        }
-    } else {
-        // Caclulate the scheduling weight for each workload
-        u16 maxWeight = 0;
-        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
-            u16 runnable = mgmt->wklRunnable1 & (0x8000 >> i);
-            u16 wklSignal = mgmt->spurs->m.wklSignal1.read_relaxed() & (0x8000 >> i);
-            u8 wklFlag = mgmt->spurs->m.wklFlag.flag.read_relaxed() == 0 ? mgmt->spurs->m.wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
-            u8 readyCount = mgmt->spurs->m.wklReadyCount1[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : mgmt->spurs->m.wklReadyCount1[i].read_relaxed();
-            u8 idleSpuCount = mgmt->spurs->m.wklIdleSpuCountOrReadyCount2[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : mgmt->spurs->m.wklIdleSpuCountOrReadyCount2[i].read_relaxed();
-            u8 requestCount = readyCount + idleSpuCount;
+        // Calculate the contention (number of SPUs used) for each workload
+        u8 contention[CELL_SPURS_MAX_WORKLOAD];
+        u8 pendingContention[CELL_SPURS_MAX_WORKLOAD];
+        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
+            contention[i] = spurs->m.wklCurrentContention[i] - mgmt->wklLocContention[i];
 
-            // For a workload to be considered for scheduling:
-            // 1. Its priority must not be 0
-            // 2. The number of SPUs used by it must be less than the max contention for that workload
-            // 3. The workload should be in runnable state
-            // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
-            //    OR the workload must be signalled
-            //    OR the workload flag is 0 and the workload is configured as the wokload flag receiver
-            if (runnable && mgmt->priority[i] != 0 && mgmt->spurs->m.wklMaxContention[i].read_relaxed() > contention[i]) {
-                if (wklFlag || wklSignal || (readyCount != 0 && requestCount > contention[i])) {
-                    // The scheduling weight of the workload is formed from the following parameters in decreasing order of priority:
-                    // 1. Wokload signal set or workload flag or ready count > contention
-                    // 2. Priority of the workload on the SPU
-                    // 3. Is the workload the last selected workload
-                    // 4. Minimum contention of the workload
-                    // 5. Number of SPUs that are being used by the workload (lesser the number, more the weight)
-                    // 6. Is the workload executable same as the currently loaded executable
-                    // 7. The workload id (lesser the number, more the weight)
-                    u16 weight = (wklFlag || wklSignal || (readyCount > contention[i])) ? 0x8000 : 0;
-                    weight |= (u16)(mgmt->priority[i] & 0x7F) << 16;
-                    weight |= i == mgmt->wklCurrentId ? 0x80 : 0x00;
-                    weight |= (contention[i] > 0 && mgmt->spurs->m.wklMinContention[i] > contention[i]) ? 0x40 : 0x00;
-                    weight |= ((CELL_SPURS_MAX_SPU - contention[i]) & 0x0F) << 2;
-                    weight |= mgmt->wklUniqueId[i] == mgmt->wklCurrentId ? 0x02 : 0x00;
-                    weight |= 0x01;
-
-                    // In case of a tie the lower numbered workload is chosen
-                    if (weight > maxWeight) {
-                        wklSelectedId = i;
-                        maxWeight = weight;
-                        pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
-                        pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
-                        pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
-                    }
+            // If this is a poll request then the number of SPUs pending to context switch is also added to the contention presumably
+            // to prevent unnecessary jumps to the kernel
+            if (isPoll) {
+                pendingContention[i] = spurs->m.wklPendingContention[i] - mgmt->wklLocPendingContention[i];
+                if (i != mgmt->wklCurrentId) {
+                    contention[i] += pendingContention[i];
                 }
             }
         }
 
-        // Not sure what this does. Possibly mark the SPU as idle/in use.
-        mgmt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
+        wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
+        pollStatus = 0;
 
-        if (!isPoll || wklSelectedId == mgmt->wklCurrentId) {
-            // Clear workload signal for the selected workload
-            mgmt->spurs->m.wklSignal1.write_relaxed(be_t<u16>::make(mgmt->spurs->m.wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId)));
-            mgmt->spurs->m.wklSignal2.write_relaxed(be_t<u32>::make(mgmt->spurs->m.wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId)));
-
-            // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
-            if (wklSelectedId == mgmt->spurs->m.wklFlagReceiver.read_relaxed()) {
-                mgmt->spurs->m.wklFlag.flag.write_relaxed(be_t<u32>::make(0xFFFFFFFF));
-            }
-        }
-    }
+        // The system service workload has the highest priority. Select the system service workload if
+        // the system service message bit for this SPU is set.
+        if (spurs->m.sysSrvMessage.read_relaxed() & (1 << mgmt->spuNum)) {
+            mgmt->spuIdling = 0;
+            if (!isPoll || mgmt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                // Clear the message bit
+                spurs->m.sysSrvMessage.write_relaxed(spurs->m.sysSrvMessage.read_relaxed() & ~(1 << mgmt->spuNum));
+            }
+        } else {
+            // Calculate the scheduling weight for each workload
+            u16 maxWeight = 0;
+            for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
+                u16 runnable = mgmt->wklRunnable1 & (0x8000 >> i);
+                u16 wklSignal = spurs->m.wklSignal1.read_relaxed() & (0x8000 >> i);
+                u8 wklFlag = spurs->m.wklFlag.flag.read_relaxed() == 0 ? spurs->m.wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
+                u8 readyCount = spurs->m.wklReadyCount1[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->m.wklReadyCount1[i].read_relaxed();
+                u8 idleSpuCount = spurs->m.wklIdleSpuCountOrReadyCount2[i].read_relaxed() > CELL_SPURS_MAX_SPU ? CELL_SPURS_MAX_SPU : spurs->m.wklIdleSpuCountOrReadyCount2[i].read_relaxed();
+                u8 requestCount = readyCount + idleSpuCount;
+
+                // For a workload to be considered for scheduling:
+                // 1. Its priority must not be 0
+                // 2. The number of SPUs used by it must be less than the max contention for that workload
+                // 3. The workload should be in runnable state
+                // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
+                //    OR the workload must be signalled
+                //    OR the workload flag is 0 and the workload is configured as the workload flag receiver
+                if (runnable && mgmt->priority[i] != 0 && spurs->m.wklMaxContention[i].read_relaxed() > contention[i]) {
+                    if (wklFlag || wklSignal || (readyCount != 0 && requestCount > contention[i])) {
+                        // The scheduling weight of the workload is formed from the following parameters in decreasing order of priority:
+                        // 1. Workload signal set or workload flag or ready count > contention
+                        // 2. Priority of the workload on the SPU
+                        // 3. Is the workload the last selected workload
+                        // 4. Minimum contention of the workload
+                        // 5. Number of SPUs that are being used by the workload (lesser the number, more the weight)
+                        // 6. Is the workload executable same as the currently loaded executable
+                        // 7. The workload id (lesser the number, more the weight)
+                        u16 weight = (wklFlag || wklSignal || (readyCount > contention[i])) ? 0x8000 : 0;
+                        weight |= (u16)(mgmt->priority[i] & 0x7F) << 16;
+                        weight |= i == mgmt->wklCurrentId ? 0x80 : 0x00;
+                        weight |= (contention[i] > 0 && spurs->m.wklMinContention[i] > contention[i]) ? 0x40 : 0x00;
+                        weight |= ((CELL_SPURS_MAX_SPU - contention[i]) & 0x0F) << 2;
+                        weight |= mgmt->wklUniqueId[i] == mgmt->wklCurrentId ? 0x02 : 0x00;
+                        weight |= 0x01;
+
+                        // In case of a tie the lower numbered workload is chosen
+                        if (weight > maxWeight) {
+                            wklSelectedId = i;
+                            maxWeight = weight;
+                            pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
+                            pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
+                            pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
+                        }
+                    }
+                }
+            }
+
+            // Not sure what this does. Possibly mark the SPU as idle/in use.
+            mgmt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
+
+            if (!isPoll || wklSelectedId == mgmt->wklCurrentId) {
+                // Clear workload signal for the selected workload
+                spurs->m.wklSignal1.write_relaxed(be_t<u16>::make(spurs->m.wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId)));
+                spurs->m.wklSignal2.write_relaxed(be_t<u32>::make(spurs->m.wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId)));
+
+                // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
+                if (wklSelectedId == spurs->m.wklFlagReceiver.read_relaxed()) {
+                    spurs->m.wklFlag.flag.write_relaxed(be_t<u32>::make(0xFFFFFFFF));
+                }
+            }
+        }
 
-    if (!isPoll) {
-        // Called by kernel
-        // Increment the contention for the selected workload
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            contention[wklSelectedId]++;
-        }
-
-        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
-            mgmt->spurs->m.wklCurrentContention[i] = contention[i];
-            mgmt->wklLocContention[i] = 0;
-            mgmt->wklLocPendingContention[i] = 0;
-        }
-
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            mgmt->wklLocContention[wklSelectedId] = 1;
-        }
-
-        mgmt->wklCurrentId = wklSelectedId;
-    } else if (wklSelectedId != mgmt->wklCurrentId) {
-        // Not called by kernel but a context switch is required
-        // Increment the pending contention for the selected workload
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            pendingContention[wklSelectedId]++;
-        }
-
-        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
-            mgmt->spurs->m.wklPendingContention[i] = pendingContention[i];
-            mgmt->wklLocPendingContention[i] = 0;
-        }
-
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            mgmt->wklLocPendingContention[wklSelectedId] = 1;
+        if (!isPoll) {
+            // Called by kernel
+            // Increment the contention for the selected workload
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                contention[wklSelectedId]++;
+            }
+
+            for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
+                spurs->m.wklCurrentContention[i] = contention[i];
+                mgmt->wklLocContention[i] = 0;
+                mgmt->wklLocPendingContention[i] = 0;
+            }
+
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                mgmt->wklLocContention[wklSelectedId] = 1;
+            }
+
+            mgmt->wklCurrentId = wklSelectedId;
+        } else if (wklSelectedId != mgmt->wklCurrentId) {
+            // Not called by kernel but a context switch is required
+            // Increment the pending contention for the selected workload
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                pendingContention[wklSelectedId]++;
+            }
+
+            for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD; i++) {
+                spurs->m.wklPendingContention[i] = pendingContention[i];
+                mgmt->wklLocPendingContention[i] = 0;
+            }
+
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                mgmt->wklLocPendingContention[wklSelectedId] = 1;
+            }
         }
-    }
+    } while (spursDma(spu, MFC_PUTLLC_CMD, mgmt->spurs.addr(), 0x100/*LSA*/, 0x80/*size*/, 0/*tag*/) == false);
 
     u64 result = (u64)wklSelectedId << 32;
     result |= pollStatus;
     spu.GPR[3]._u64[1] = result;
+    return true;
 }
 
 /// Select a workload to run
-void spursKernelSelectWorkload2(SPUThread & spu) {
-    LV2_LOCK(0); // TODO: lock-free implementation if possible
-
+bool spursKernel2SelectWorkload(SPUThread & spu) {
     auto mgmt = vm::get_ptr<SpursKernelMgmtData>(spu.ls_offset + 0x100);
 
     // The first and only argument to this function is a boolean that is set to false if the function
@@ -224,202 +273,214 @@ void spursKernelSelectWorkload2(SPUThread & spu) {
     // If the first argument is true then the shared data is not updated with the result.
     const auto isPoll = spu.GPR[3]._u32[3];
 
-    // Calculate the contention (number of SPUs used) for each workload
-    u8 contention[CELL_SPURS_MAX_WORKLOAD2];
-    u8 pendingContention[CELL_SPURS_MAX_WORKLOAD2];
-    for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) {
-        contention[i] = mgmt->spurs->m.wklCurrentContention[i & 0x0F] - mgmt->wklLocContention[i & 0x0F];
-        contention[i] = i < CELL_SPURS_MAX_WORKLOAD ? contention[i] & 0x0F : contention[i] >> 4;
+    u32 wklSelectedId;
+    u32 pollStatus;
 
-        // If this is a poll request then the number of SPUs pending to context switch is also added to the contention presumably
-        // to prevent unnecessary jumps to the kernel
-        if (isPoll) {
-            pendingContention[i] = mgmt->spurs->m.wklPendingContention[i & 0x0F] - mgmt->wklLocPendingContention[i & 0x0F];
-            pendingContention[i] = i < CELL_SPURS_MAX_WORKLOAD ? pendingContention[i] & 0x0F : pendingContention[i] >> 4;
-            if (i != mgmt->wklCurrentId) {
-                contention[i] += pendingContention[i];
-            }
-        }
-    }
+    do {
+        // DMA and lock the first 0x80 bytes of spurs
+        spursDma(spu, MFC_GETLLAR_CMD, mgmt->spurs.addr(), 0x100/*LSA*/, 0x80/*size*/, 0/*tag*/);
+        CellSpurs * spurs = (CellSpurs *)mgmt->tempArea;
 
-    u32 wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
-    u32 pollStatus = 0;
-
-    // The system service workload has the highest priority. Select the system service workload if
-    // the system service message bit for this SPU is set.
-    if (mgmt->spurs->m.sysSrvMessage.read_relaxed() & (1 << mgmt->spuNum)) {
-        // Not sure what this does. Possibly Mark the SPU as in use.
-        mgmt->spuIdling = 0;
-        if (!isPoll || mgmt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            // Clear the message bit
-            mgmt->spurs->m.sysSrvMessage.write_relaxed(mgmt->spurs->m.sysSrvMessage.read_relaxed() & ~(1 << mgmt->spuNum));
-        }
-    } else {
-        // Caclulate the scheduling weight for each workload
-        u8 maxWeight = 0;
-        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) {
-            auto j = i & 0x0F;
-            u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->wklRunnable1 & (0x8000 >> j) : mgmt->wklRunnable2 & (0x8000 >> j);
-            u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->priority[j] & 0x0F : mgmt->priority[j] >> 4;
-            u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklMaxContention[j].read_relaxed() & 0x0F : mgmt->spurs->m.wklMaxContention[j].read_relaxed() >> 4;
-            u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklSignal1.read_relaxed() & (0x8000 >> j) : mgmt->spurs->m.wklSignal2.read_relaxed() & (0x8000 >> j);
-            u8 wklFlag = mgmt->spurs->m.wklFlag.flag.read_relaxed() == 0 ? mgmt->spurs->m.wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
-            u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklReadyCount1[j].read_relaxed() : mgmt->spurs->m.wklIdleSpuCountOrReadyCount2[j].read_relaxed();
+        // Calculate the contention (number of SPUs used) for each workload
+        u8 contention[CELL_SPURS_MAX_WORKLOAD2];
+        u8 pendingContention[CELL_SPURS_MAX_WORKLOAD2];
+        for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) {
+            contention[i] = spurs->m.wklCurrentContention[i & 0x0F] - mgmt->wklLocContention[i & 0x0F];
+            contention[i] = i < CELL_SPURS_MAX_WORKLOAD ? contention[i] & 0x0F : contention[i] >> 4;
 
-            // For a workload to be considered for scheduling:
-            // 1. Its priority must be greater than 0
-            // 2. The number of SPUs used by it must be less than the max contention for that workload
-            // 3. The workload should be in runnable state
-            // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
-            //    OR the workload must be signalled
-            //    OR the workload flag is 0 and the workload is configured as the wokload receiver
-            if (runnable && priority > 0 && maxContention > contention[i]) {
-                if (wklFlag || wklSignal || readyCount > contention[i]) {
-                    // The scheduling weight of the workload is equal to the priority of the workload for the SPU.
-                    // The current workload is given a sligtly higher weight presumably to reduce the number of context switches.
-                    // In case of a tie the lower numbered workload is chosen.
-                    u8 weight = priority << 4;
-                    if (mgmt->wklCurrentId == i) {
-                        weight |= 0x04;
-                    }
-
-                    if (weight > maxWeight) {
-                        wklSelectedId = i;
-                        maxWeight = weight;
-                        pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
-                        pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
-                        pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
-                    }
+            // If this is a poll request then the number of SPUs pending to context switch is also added to the contention presumably
+            // to prevent unnecessary jumps to the kernel
+            if (isPoll) {
+                pendingContention[i] = spurs->m.wklPendingContention[i & 0x0F] - mgmt->wklLocPendingContention[i & 0x0F];
+                pendingContention[i] = i < CELL_SPURS_MAX_WORKLOAD ? pendingContention[i] & 0x0F : pendingContention[i] >> 4;
+                if (i != mgmt->wklCurrentId) {
+                    contention[i] += pendingContention[i];
                 }
             }
         }
 
-        // Not sure what this does. Possibly mark the SPU as idle/in use.
-        mgmt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
+        wklSelectedId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
+        pollStatus = 0;
 
-        if (!isPoll || wklSelectedId == mgmt->wklCurrentId) {
-            // Clear workload signal for the selected workload
-            mgmt->spurs->m.wklSignal1.write_relaxed(be_t<u16>::make(mgmt->spurs->m.wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId)));
-            mgmt->spurs->m.wklSignal2.write_relaxed(be_t<u32>::make(mgmt->spurs->m.wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId)));
+        // The system service workload has the highest priority. Select the system service workload if
+        // the system service message bit for this SPU is set.
+        if (spurs->m.sysSrvMessage.read_relaxed() & (1 << mgmt->spuNum)) {
+            // Not sure what this does. Possibly mark the SPU as in use.
+            mgmt->spuIdling = 0;
+            if (!isPoll || mgmt->wklCurrentId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                // Clear the message bit
+                spurs->m.sysSrvMessage.write_relaxed(spurs->m.sysSrvMessage.read_relaxed() & ~(1 << mgmt->spuNum));
+            }
+        } else {
+            // Calculate the scheduling weight for each workload
+            u8 maxWeight = 0;
+            for (auto i = 0; i < CELL_SPURS_MAX_WORKLOAD2; i++) {
+                auto j = i & 0x0F;
+                u16 runnable = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->wklRunnable1 & (0x8000 >> j) : mgmt->wklRunnable2 & (0x8000 >> j);
+                u8 priority = i < CELL_SPURS_MAX_WORKLOAD ? mgmt->priority[j] & 0x0F : mgmt->priority[j] >> 4;
+                u8 maxContention = i < CELL_SPURS_MAX_WORKLOAD ? spurs->m.wklMaxContention[j].read_relaxed() & 0x0F : spurs->m.wklMaxContention[j].read_relaxed() >> 4;
+                u16 wklSignal = i < CELL_SPURS_MAX_WORKLOAD ? spurs->m.wklSignal1.read_relaxed() & (0x8000 >> j) : spurs->m.wklSignal2.read_relaxed() & (0x8000 >> j);
+                u8 wklFlag = spurs->m.wklFlag.flag.read_relaxed() == 0 ? spurs->m.wklFlagReceiver.read_relaxed() == i ? 1 : 0 : 0;
+                u8 readyCount = i < CELL_SPURS_MAX_WORKLOAD ? spurs->m.wklReadyCount1[j].read_relaxed() : spurs->m.wklIdleSpuCountOrReadyCount2[j].read_relaxed();
 
-            // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
-            if (wklSelectedId == mgmt->spurs->m.wklFlagReceiver.read_relaxed()) {
-                mgmt->spurs->m.wklFlag.flag.write_relaxed(be_t<u32>::make(0xFFFFFFFF));
+                // For a workload to be considered for scheduling:
+                // 1. Its priority must be greater than 0
+                // 2. The number of SPUs used by it must be less than the max contention for that workload
+                // 3. The workload should be in runnable state
+                // 4. The number of SPUs allocated to it must be less than the number of SPUs requested (i.e. readyCount)
+                //    OR the workload must be signalled
+                //    OR the workload flag is 0 and the workload is configured as the workload receiver
+                if (runnable && priority > 0 && maxContention > contention[i]) {
+                    if (wklFlag || wklSignal || readyCount > contention[i]) {
+                        // The scheduling weight of the workload is equal to the priority of the workload for the SPU.
+                        // The current workload is given a slightly higher weight presumably to reduce the number of context switches.
+                        // In case of a tie the lower numbered workload is chosen.
+                        u8 weight = priority << 4;
+                        if (mgmt->wklCurrentId == i) {
+                            weight |= 0x04;
+                        }
+
+                        if (weight > maxWeight) {
+                            wklSelectedId = i;
+                            maxWeight = weight;
+                            pollStatus = readyCount > contention[i] ? CELL_SPURS_MODULE_POLL_STATUS_READYCOUNT : 0;
+                            pollStatus |= wklSignal ? CELL_SPURS_MODULE_POLL_STATUS_SIGNAL : 0;
+                            pollStatus |= wklFlag ? CELL_SPURS_MODULE_POLL_STATUS_FLAG : 0;
+                        }
+                    }
+                }
+            }
+
+            // Not sure what this does. Possibly mark the SPU as idle/in use.
+            mgmt->spuIdling = wklSelectedId == CELL_SPURS_SYS_SERVICE_WORKLOAD_ID ? 1 : 0;
+
+            if (!isPoll || wklSelectedId == mgmt->wklCurrentId) {
+                // Clear workload signal for the selected workload
+                spurs->m.wklSignal1.write_relaxed(be_t<u16>::make(spurs->m.wklSignal1.read_relaxed() & ~(0x8000 >> wklSelectedId)));
+                spurs->m.wklSignal2.write_relaxed(be_t<u32>::make(spurs->m.wklSignal1.read_relaxed() & ~(0x80000000u >> wklSelectedId)));
+
+                // If the selected workload is the wklFlag workload then pull the wklFlag to all 1s
+                if (wklSelectedId == spurs->m.wklFlagReceiver.read_relaxed()) {
+                    spurs->m.wklFlag.flag.write_relaxed(be_t<u32>::make(0xFFFFFFFF));
+                }
             }
         }
-    }
 
-    if (!isPoll) {
-        // Called by kernel
-        // Increment the contention for the selected workload
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            contention[wklSelectedId]++;
-        }
-
-        for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) {
-            mgmt->spurs->m.wklCurrentContention[i] = contention[i] | (contention[i + 0x10] << 4);
-            mgmt->wklLocContention[i] = 0;
-            mgmt->wklLocPendingContention[i] = 0;
-        }
-
-        mgmt->wklLocContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
-        mgmt->wklCurrentId = wklSelectedId;
-    } else if (wklSelectedId != mgmt->wklCurrentId) {
-        // Not called by kernel but a context switch is required
-        // Increment the pending contention for the selected workload
-        if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
-            pendingContention[wklSelectedId]++;
-        }
-
-        for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) {
-            mgmt->spurs->m.wklPendingContention[i] = pendingContention[i] | (pendingContention[i + 0x10] << 4);
-            mgmt->wklLocPendingContention[i] = 0;
-        }
-
-        mgmt->wklLocPendingContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
-    }
+        if (!isPoll) {
+            // Called by kernel
+            // Increment the contention for the selected workload
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                contention[wklSelectedId]++;
+            }
+
+            for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) {
+                spurs->m.wklCurrentContention[i] = contention[i] | (contention[i + 0x10] << 4);
+                mgmt->wklLocContention[i] = 0;
+                mgmt->wklLocPendingContention[i] = 0;
+            }
+
+            mgmt->wklLocContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
+            mgmt->wklCurrentId = wklSelectedId;
+        } else if (wklSelectedId != mgmt->wklCurrentId) {
+            // Not called by kernel but a context switch is required
+            // Increment the pending contention for the selected workload
+            if (wklSelectedId != CELL_SPURS_SYS_SERVICE_WORKLOAD_ID) {
+                pendingContention[wklSelectedId]++;
+            }
+
+            for (auto i = 0; i < (CELL_SPURS_MAX_WORKLOAD2 >> 1); i++) {
+                spurs->m.wklPendingContention[i] = pendingContention[i] | (pendingContention[i + 0x10] << 4);
+                mgmt->wklLocPendingContention[i] = 0;
+            }
+
+            mgmt->wklLocPendingContention[wklSelectedId & 0x0F] = wklSelectedId < CELL_SPURS_MAX_WORKLOAD ? 0x01 : wklSelectedId < CELL_SPURS_MAX_WORKLOAD2 ? 0x10 : 0;
+        }
+    } while (spursDma(spu, MFC_PUTLLC_CMD, mgmt->spurs.addr(), 0x100/*LSA*/, 0x80/*size*/, 0/*tag*/) == false);
 
     u64 result = (u64)wklSelectedId << 32;
     result |= pollStatus;
     spu.GPR[3]._u64[1] = result;
+    return true;
 }
 
-/// Entry point of the SPURS kernel
-void spursKernelMain(SPUThread & spu) {
+/// SPURS kernel main
+bool spursKernelMain(SPUThread & spu) {
     SpursKernelMgmtData * mgmt = vm::get_ptr<SpursKernelMgmtData>(spu.ls_offset + 0x100);
 
-    mgmt->spuNum = spu.GPR[3]._u32[3];
-    mgmt->dmaTagId = 0x1F;
-    mgmt->spurs.set(spu.GPR[4]._u64[1]);
-    mgmt->wklCurrentId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
-    mgmt->wklCurrentUniqueId = 0x20;
-    bool isSecond = mgmt->spurs->m.flags1 & SF1_32_WORKLOADS ? true : false;
-    mgmt->yieldToKernelAddr = isSecond ? 0x838 : 0x808;
-    mgmt->selectWorkloadAddr = 0x290;
-    spu.WriteLS32(mgmt->yieldToKernelAddr, 2);               // hack for cellSpursModuleExit
-    spu.WriteLS32(mgmt->selectWorkloadAddr, 3);              // hack for cellSpursModulePollStatus
-    spu.WriteLS32(mgmt->selectWorkloadAddr + 4, 0x35000000); // bi $0
-    spu.m_code3_func = isSecond ? spursKernelSelectWorkload2 : spursKernelSelectWorkload;
+    bool isKernel2;
+    u32 pollStatus;
+    const CellSpurs::WorkloadInfo * wklInfo;
+    if (spu.PC == CELL_SPURS_KERNEL1_ENTRY_ADDR || spu.PC == CELL_SPURS_KERNEL2_ENTRY_ADDR) {
+        // Entry point of SPURS kernel
+        // Save arguments
+        mgmt->spuNum = spu.GPR[3]._u32[3];
+        mgmt->spurs.set(spu.GPR[4]._u64[1]);
 
-    u32 wid = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
-    u32 pollStatus = 0;
-    while (true) {
-        if (Emu.IsStopped()) {
-            cellSpurs->Warning("Spurs Kernel aborted");
-            return;
-        }
+        isKernel2 = mgmt->spurs->m.flags1 & SF1_32_WORKLOADS ? true : false;
 
-        // Get current workload info
-        auto & wkl = wid < CELL_SPURS_MAX_WORKLOAD ? mgmt->spurs->m.wklInfo1[wid] : (wid < CELL_SPURS_MAX_WORKLOAD2 && isSecond ? mgmt->spurs->m.wklInfo2[wid & 0xf] : mgmt->spurs->m.wklInfoSysSrv);
+        memset(mgmt, 0, sizeof(SpursKernelMgmtData));
 
-        if (mgmt->wklCurrentAddr != wkl.addr) {
-            if (wkl.addr.addr() != SPURS_IMG_ADDR_SYS_SRV_WORKLOAD) {
-                // Load executable code
-                memcpy(vm::get_ptr<void>(spu.ls_offset + 0xA00), wkl.addr.get_ptr(), wkl.size);
-            }
-            mgmt->wklCurrentAddr = wkl.addr;
-            mgmt->wklCurrentUniqueId = wkl.uniqueId.read_relaxed();
-        }
+        // Initialise the SPURS management area to its initial values
+        mgmt->dmaTagId = CELL_SPURS_KERNEL_DMA_TAG_ID;
+        mgmt->wklCurrentUniqueId = 0x20;
+        mgmt->wklCurrentId = CELL_SPURS_SYS_SERVICE_WORKLOAD_ID;
+        mgmt->yieldToKernelAddr = isKernel2 ? CELL_SPURS_KERNEL2_YIELD_ADDR : CELL_SPURS_KERNEL1_YIELD_ADDR;
+        mgmt->selectWorkloadAddr = isKernel2 ? CELL_SPURS_KERNEL2_SELECT_WORKLOAD_ADDR : CELL_SPURS_KERNEL1_SELECT_WORKLOAD_ADDR;
+        if (!isKernel2) {
+            mgmt->x1F0 = 0xF0020000;
+            mgmt->x200 = 0x20000;
+            mgmt->guid[0] = 0x423A3A02;
+            mgmt->guid[1] = 0x43F43A82;
+            mgmt->guid[2] = 0x43F26502;
+            mgmt->guid[3] = 0x420EB382;
+        } else {
+            mgmt->guid[0] = 0x43A08402;
+            mgmt->guid[1] = 0x43FB0A82;
+            mgmt->guid[2] = 0x435E9302;
+            mgmt->guid[3] = 0x43A3C982;
+        }
 
-        if (!isSecond) {
-            mgmt->moduleId[0] = 0;
-            mgmt->moduleId[1] = 0;
-        }
+        spu.UnregisterHleFunctions(0, 0x40000); // TODO: use a symbolic constant
+        spu.RegisterHleFunction(isKernel2 ? CELL_SPURS_KERNEL2_ENTRY_ADDR : CELL_SPURS_KERNEL1_ENTRY_ADDR, spursKernelMain);
+        spu.RegisterHleFunction(mgmt->yieldToKernelAddr, spursKernelMain);
+        spu.RegisterHleFunction(mgmt->selectWorkloadAddr, isKernel2 ? spursKernel2SelectWorkload : spursKernel1SelectWorkload);
+
+        // Start the system service workload
+        spu.RegisterHleFunction(0xA00, spursSysServiceWorkloadEntry);
+        wklInfo = &mgmt->spurs->m.wklInfoSysSrv;
+        pollStatus = 0;
+    } else if (spu.PC == mgmt->yieldToKernelAddr) {
+        isKernel2 = mgmt->spurs->m.flags1 & SF1_32_WORKLOADS ? true : false;
 
-        // Run workload
-        spu.GPR[1]._u32[3] = 0x3FFB0;
-        spu.GPR[3]._u32[3] = 0x100;
-        spu.GPR[4]._u64[1] = wkl.arg;
-        spu.GPR[5]._u32[3] = pollStatus;
-        spu.SetPc(0xA00);
-        switch (mgmt->wklCurrentAddr.addr()) {
-        case SPURS_IMG_ADDR_SYS_SRV_WORKLOAD:
-            spursSysServiceWorkloadEntry(spu);
-            break;
-        default:
-            spu.FastCall(0xA00);
-            break;
-        }
-
-        // Check status
-        auto status = spu.SPU.Status.GetValue();
-        if (status == SPU_STATUS_STOPPED_BY_STOP) {
-            return;
-        } else {
-            assert(status == SPU_STATUS_RUNNING);
-        }
-
+        // Select next workload to run
         spu.GPR[3].clear();
-        if (isSecond) {
-            spursKernelSelectWorkload2(spu);
+        if (isKernel2) {
+            spursKernel2SelectWorkload(spu);
         } else {
-            spursKernelSelectWorkload(spu);
+            spursKernel1SelectWorkload(spu);
         }
-        u64 res = spu.GPR[3]._u64[1];
-        pollStatus = (u32)(res);
-        wid = (u32)(res >> 32);
+
+        pollStatus = (u32)(spu.GPR[3]._u64[1]);
+        auto wid = (u32)(spu.GPR[3]._u64[1] >> 32);
+        wklInfo = wid < CELL_SPURS_MAX_WORKLOAD ? &mgmt->spurs->m.wklInfo1[wid] : (wid < CELL_SPURS_MAX_WORKLOAD2 && isKernel2 ? &mgmt->spurs->m.wklInfo2[wid & 0xf] : &mgmt->spurs->m.wklInfoSysSrv);
+    } else {
+        assert(0);
     }
+
+    if (!isKernel2) {
+        mgmt->moduleId[0] = 0;
+        mgmt->moduleId[1] = 0;
+    }
+
+    // Run workload
+    spu.GPR[0]._u32[3] = mgmt->yieldToKernelAddr;
+    spu.GPR[1]._u32[3] = 0x3FFB0;
+    spu.GPR[3]._u32[3] = 0x100;
+    spu.GPR[4]._u64[1] = wklInfo->arg;
+    spu.GPR[5]._u32[3] = pollStatus;
+    spu.SetBranch(0xA00);
+    return false;
 }
 
 //////////////////////////////////////////////////////////////////////////////
@@ -783,7 +844,7 @@ poll:
 }
 
 /// Entry point of the system service workload
-void spursSysServiceWorkloadEntry(SPUThread & spu) {
+bool spursSysServiceWorkloadEntry(SPUThread & spu) {
     auto mgmt = vm::get_ptr<SpursKernelMgmtData>(spu.ls_offset + spu.GPR[3]._u32[3]);
     auto arg = spu.GPR[4]._u64[1];
     auto pollStatus = spu.GPR[5]._u32[3];
@@ -800,7 +861,7 @@ void spursSysServiceWorkloadEntry(SPUThread & spu) {
     }
 
     // TODO: Ensure that this function always returns to the SPURS kernel
-    return;
+    return false;
 }
 
 //////////////////////////////////////////////////////////////////////////////
diff --git a/rpcs3/stdafx.h b/rpcs3/stdafx.h
index 825c1c4007..4581c27650 100644
--- a/rpcs3/stdafx.h
+++ b/rpcs3/stdafx.h
@@ -34,6 +34,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 
 #include "Utilities/GNU.h"
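
Note on the locking scheme: the do/while loops added above implement optimistic concurrency over the 128-byte SPURS lock line. GETLLAR snapshots the line into local store and acquires a reservation, the scheduler mutates its local copy, and PUTLLC stores the copy back only if no other SPU or the PPU wrote the line in the meantime, retrying otherwise. A minimal portable sketch of the same pattern, using std::atomic compare-exchange in place of the MFC channels (all names below are illustrative, not part of this patch):

    #include <atomic>
    #include <cstdint>
    #include <cstdio>

    // Stand-in for one 32-bit word of the 0x80-byte SPURS lock line.
    static std::atomic<uint32_t> lock_line_word{0};

    // Mirrors the do { GETLLAR; mutate local copy; } while (!PUTLLC) loop in
    // spursKernel1SelectWorkload: reload on conflict, retry until the
    // conditional store succeeds.
    uint32_t atomically_increment_contention(uint32_t delta)
    {
        uint32_t expected = lock_line_word.load(); // GETLLAR: snapshot + reservation
        uint32_t desired;
        do {
            desired = expected + delta;            // mutate the local copy
        } while (!lock_line_word.compare_exchange_weak(expected, desired)); // PUTLLC
        return desired;
    }

    int main()
    {
        std::printf("%u\n", atomically_increment_contention(1));
    }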
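Note on the HLE hook mechanism: RegisterHleFunction patches a STOP 3 opcode at a local-store address and records a handler keyed by that address; SPUThread::StopAndSignal then dispatches on the current PC, and a handler that returns true resumes the SPU at the caller's return address taken from GPR[0], while a handler that returns false (like spursKernelMain) is expected to set its own branch target. A self-contained model of that convention (Context and the handler here are illustrative stand-ins, not emulator types):

    #include <cstdint>
    #include <functional>
    #include <unordered_map>

    struct Context { uint32_t pc; uint32_t lr; };

    // addr -> handler; returning true means "return to caller via LR",
    // returning false means the handler has already chosen the next PC.
    std::unordered_map<uint32_t, std::function<bool(Context&)>> hooks;

    void dispatch_stop3(Context& ctx)
    {
        auto it = hooks.find(ctx.pc);
        if (it == hooks.end()) return;     // no hook registered at this PC
        if (it->second(ctx)) {
            ctx.pc = ctx.lr & 0x3fffc;     // SetBranch(GPR[0] & 0x3fffc)
        }
    }

    int main()
    {
        hooks[0x818] = [](Context&) { return true; }; // kernel entry stub
        Context ctx{0x818, 0xA00};
        dispatch_stop3(ctx);                          // ctx.pc becomes 0xA00
    }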
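Note on the new ch23 constants: spursDmaGetCompletionStatus writes MFC_TAG_UPDATE_IMMEDIATE so that MFC_RdTagStat reports the current tag completion mask without stalling, while spursDmaWaitForCompletion writes MFC_TAG_UPDATE_ALL or MFC_TAG_UPDATE_ANY to block until every masked tag group, or at least one of them, has completed. The three modes reduce to simple bitmask tests, sketched below (tag_status_ready is a made-up helper for illustration, not emulator code):

    #include <cstdint>

    enum { TAG_UPDATE_IMMEDIATE = 0, TAG_UPDATE_ANY = 1, TAG_UPDATE_ALL = 2 };

    // completed: bitmask of finished DMA tag groups; mask: tags we care about.
    // Returns true once the chosen condition would unblock MFC_RdTagStat.
    bool tag_status_ready(uint32_t completed, uint32_t mask, int mode)
    {
        switch (mode) {
        case TAG_UPDATE_IMMEDIATE: return true;                       // never blocks
        case TAG_UPDATE_ANY:       return (completed & mask) != 0;    // any tag done
        case TAG_UPDATE_ALL:       return (completed & mask) == mask; // all tags done
        }
        return false;
    }

    int main()
    {
        return tag_status_ready(0x4, 0x6, TAG_UPDATE_ANY) ? 0 : 1;
    }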