diff --git a/crt0/switch_crt0.s b/crt0/switch_crt0.s
index 826ccf45..e5353896 100644
--- a/crt0/switch_crt0.s
+++ b/crt0/switch_crt0.s
@@ -8,6 +8,9 @@ _start:
 
 .org _start+0x80
 startup:
+    // save main thread handle (arrives in x1) in x27; it is passed to __libnx_init below
+    mov  x27, x1
+
     // get aslr base
     sub  x28, x30, #4
 
@@ -32,7 +35,7 @@ bss_loop:
     bl   __nx_dynamic
 
     // initialize system
-    mov  x0, x28
+    mov  x0, x27
     bl   __libnx_init
 
     // call entrypoint
diff --git a/nx/source/internal.h b/nx/source/internal.h
index 209eb9cd..c4b66b6c 100644
--- a/nx/source/internal.h
+++ b/nx/source/internal.h
@@ -9,6 +9,9 @@ typedef struct {
     // Magic value used to check if the struct is initialized
     u32 magic;
 
+    // Handle of this thread; mutexes use it as the owner tag
+    Handle handle;
+
     // Pointer to the current thread (if exists)
     Thread* thread_ptr;
 
diff --git a/nx/source/kernel/mutex.c b/nx/source/kernel/mutex.c
index 103108c6..4f71ff97 100644
--- a/nx/source/kernel/mutex.c
+++ b/nx/source/kernel/mutex.c
@@ -1,19 +1,19 @@
 // Copyright 2017 plutoo
 #include <switch.h>
+#include "../internal.h"
 
 #define HAS_LISTENERS 0x40000000
 
 static u32 _GetTag() {
-    // todo: Needs filling in at thread creation.
-    // todo: Must always be assigned non-zero.
-    return ((u32*)armGetTls()) [0x1FC/4];
+    return getThreadVars()->handle;
 }
 
 void mutexLock(Mutex* m) {
     u32 self = _GetTag();
-    u32 cur = __sync_val_compare_and_swap(m, 0, self);
 
     while (1) {
+        u32 cur = __sync_val_compare_and_swap((u32*)m, 0, self);
+
         if (cur == 0) {
             // We won the race!
             return;
@@ -30,21 +30,18 @@ void mutexLock(Mutex* m) {
         }
         else {
             // The flag is not set, we need to set it.
-            u32 old = __sync_val_compare_and_swap(m, cur, cur | HAS_LISTENERS);
+            u32 old = __sync_val_compare_and_swap((u32*)m, cur, cur | HAS_LISTENERS);
 
             if (old == cur) {
                 // Flag was set successfully.
                 svcArbitrateLock(cur &~ HAS_LISTENERS, (u32*)m, self);
             }
         }
-
-        cur = __sync_val_compare_and_swap(m, 0, self);
     }
 }
 
 void mutexUnlock(Mutex* m) {
-    u32 self = _GetTag();
-    u32 old = __sync_val_compare_and_swap(m, self, 0);
+    u32 old = __sync_val_compare_and_swap((u32*)m, _GetTag(), 0); // full-barrier CAS: clears the word only when no waiter bit is set, otherwise leaves it intact for svcArbitrateUnlock
 
     if (old & HAS_LISTENERS) {
         svcArbitrateUnlock((u32*)m);
diff --git a/nx/source/kernel/svc.s b/nx/source/kernel/svc.s
index 94aff55a..bddaab7b 100644
--- a/nx/source/kernel/svc.s
+++ b/nx/source/kernel/svc.s
@@ -104,10 +104,12 @@ SVC_END
 
 SVC_BEGIN svcArbitrateLock
     svc 0x1a
+    ret
 SVC_END
 
 SVC_BEGIN svcArbitrateUnlock
     svc 0x1b
+    ret
 SVC_END
 
 SVC_BEGIN svcConnectToNamedPort
diff --git a/nx/source/kernel/thread.c b/nx/source/kernel/thread.c
index d99bc366..fcc6284e 100644
--- a/nx/source/kernel/thread.c
+++ b/nx/source/kernel/thread.c
@@ -25,6 +25,7 @@ static void _EntryWrap(ThreadEntryArgs* args) {
     tv->thread_ptr = args->t;
     tv->reent      = args->reent;
     tv->tls_tp     = (u8*)args->tls-2*sizeof(void*); // subtract size of Thread Control Block (TCB)
+    tv->handle     = args->t->handle;
 
     // Launch thread entrypoint
     args->entry(args->arg);
diff --git a/nx/source/system/init.c b/nx/source/system/init.c
index 7a832310..717b7420 100644
--- a/nx/source/system/init.c
+++ b/nx/source/system/init.c
@@ -3,7 +3,7 @@
 void __nx_exit(int rc);
 
 void virtmemSetup();
-void newlibSetup();
+void newlibSetup(Handle main_thread);
 
 #define INNER_HEAP_SIZE 0x20000
 __attribute__((weak)) size_t __nx_inner_heap_size = INNER_HEAP_SIZE;
@@ -11,12 +11,12 @@ __attribute__((weak)) char __nx_inner_heap[INNER_HEAP_SIZE];
 __attribute__((weak)) size_t __nx_outer_heap_size = 0x2000000*4;//Must be a multiple of 0x2000000.
 
 static void _SetupHeap() {
-    char* addr;
+    u64 addr;
     Result rc   = svcSetHeapSize((void**)&addr, __nx_outer_heap_size);
     size_t size = __nx_outer_heap_size;
 
     if (R_FAILED(rc)) {
-        addr = &__nx_inner_heap[0];
+        addr = (u64) &__nx_inner_heap[0];
         size = __nx_inner_heap_size;
     }
 
@@ -24,8 +24,8 @@ static void _SetupHeap() {
     extern char* fake_heap_start;
     extern char* fake_heap_end;
 
-    fake_heap_start = addr;
-    fake_heap_end   = addr + size;
+    fake_heap_start = (char*)addr;
+    fake_heap_end   = (char*)addr + size;
 }
 
 void __attribute__((weak)) __appInit(void)
@@ -46,12 +46,12 @@ void __attribute__((weak)) __appExit(void)
     smExit();
 }
 
-void __attribute__((weak)) __libnx_init(void)
+void __attribute__((weak)) __libnx_init(Handle main_thread)
 {
     // Called by crt0.
 
     // Libnx initialization goes here.
-    newlibSetup();
+    newlibSetup(main_thread);
     virtmemSetup();
     _SetupHeap();
 
diff --git a/nx/source/system/newlib.c b/nx/source/system/newlib.c
index 4404a984..111a56bb 100644
--- a/nx/source/system/newlib.c
+++ b/nx/source/system/newlib.c
@@ -18,7 +18,7 @@ static struct _reent* __libnx_get_reent() {
     return tv->reent;
 }
 
-void newlibSetup() {
+void newlibSetup(Handle main_thread) {
     // Register newlib syscalls
     __syscalls.exit     = __libnx_exit;
     __syscalls.getreent = __libnx_get_reent;
@@ -37,6 +37,7 @@ void newlibSetup() {
     tv->thread_ptr = NULL;
     tv->reent      = _impure_ptr;
     tv->tls_tp     = __tls_start-2*sizeof(void*); // subtract size of Thread Control Block (TCB)
+    tv->handle     = main_thread;
 
     u32 tls_size = __tdata_lma_end - __tdata_lma;
     if (tls_size)