diff --git a/nx/include/switch/nvidia/fence.h b/nx/include/switch/nvidia/fence.h
index 5748084d..3f9693ed 100644
--- a/nx/include/switch/nvidia/fence.h
+++ b/nx/include/switch/nvidia/fence.h
@@ -8,11 +8,14 @@ typedef struct {
     NvFence fences[4];
 } NvMultiFence;
 
-Result nvFenceWait(NvFence* f, s32 timeout_ms);
+Result nvFenceInit(void);
+void nvFenceExit(void);
+
+Result nvFenceWait(NvFence* f, s32 timeout_us);
 
 static inline void nvMultiFenceCreate(NvMultiFence* mf, const NvFence* fence) {
     mf->num_fences = 1;
     mf->fences[0] = *fence;
 }
 
-Result nvMultiFenceWait(NvMultiFence* mf, s32 timeout_ms);
+Result nvMultiFenceWait(NvMultiFence* mf, s32 timeout_us);
diff --git a/nx/source/nvidia/fence.c b/nx/source/nvidia/fence.c
index e69de29b..16ac1a5d 100644
--- a/nx/source/nvidia/fence.c
+++ b/nx/source/nvidia/fence.c
@@ -0,0 +1,128 @@
+#include "types.h"
+#include "result.h"
+#include "arm/atomics.h"
+#include "kernel/svc.h"
+#include "kernel/event.h"
+#include "services/nv.h"
+#include "nvidia/fence.h"
+
+static u32 g_ctrl_fd = -1;
+static u64 g_refCnt;
+
+static u64 g_NvEventUsedMask;
+static Event g_NvEvents[64];
+
+static int _nvGetEventSlot(void)
+{
+    int slot;
+    u64 new_mask;
+    u64 cur_mask = __atomic_load_n(&g_NvEventUsedMask, __ATOMIC_SEQ_CST);
+    do {
+        slot = __builtin_ffs(~cur_mask)-1;
+        if (slot < 0) break;
+        new_mask = cur_mask | ((u64)1 << slot);
+    } while (!__atomic_compare_exchange_n(&g_NvEventUsedMask, &cur_mask, new_mask, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
+
+    return slot;
+}
+
+static void _nvFreeEventSlot(int slot)
+{
+    u64 new_mask;
+    u64 cur_mask = __atomic_load_n(&g_NvEventUsedMask, __ATOMIC_SEQ_CST);
+    do
+        new_mask = cur_mask &~ ((u64)1 << slot);
+    while (!__atomic_compare_exchange_n(&g_NvEventUsedMask, &cur_mask, new_mask, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST));
+}
+
+static Event* _nvGetEvent(int event_id)
+{
+    Result rc;
+    Event* event = &g_NvEvents[event_id];
+    if (event->revent != INVALID_HANDLE)
+        return event;
+
+    rc = nvioctlNvhostCtrl_EventRegister(g_ctrl_fd, event_id);
+    if (R_FAILED(rc))
+        return NULL;
+
+    rc = nvQueryEvent(g_ctrl_fd, 0x10000000 | event_id, event);
+    if (R_FAILED(rc)) {
+        nvioctlNvhostCtrl_EventUnregister(g_ctrl_fd, event_id);
+        return NULL;
+    }
+
+    return event;
+}
+
+static void _nvFreeEvent(int event_id)
+{
+    Event* event = &g_NvEvents[event_id];
+    if (event->revent == INVALID_HANDLE)
+        return;
+
+    eventClose(event);
+    nvioctlNvhostCtrl_EventUnregister(g_ctrl_fd, event_id);
+}
+
+Result nvFenceInit(void)
+{
+    Result rc;
+
+    if (atomicIncrement64(&g_refCnt) > 0)
+        return 0;
+
+    rc = nvOpen(&g_ctrl_fd, "/dev/nvhost-ctrl");
+
+    if (R_FAILED(rc))
+        g_ctrl_fd = -1;
+
+    return rc;
+}
+
+void nvFenceExit(void)
+{
+    if (atomicDecrement64(&g_refCnt) == 0) {
+        for (int i = 0; i < 64; i ++)
+            _nvFreeEvent(i);
+        if (g_ctrl_fd != -1)
+            nvClose(g_ctrl_fd);
+        g_ctrl_fd = -1;
+    }
+}
+
+Result nvFenceWait(NvFence* f, s32 timeout_us)
+{
+    Result rc = MAKERESULT(Module_LibnxNvidia, LibnxNvidiaError_InsufficientMemory);
+    int event_id = _nvGetEventSlot();
+    if (event_id >= 0) {
+        Event* event = _nvGetEvent(event_id);
+        if (event) {
+            rc = nvioctlNvhostCtrl_EventWaitAsync(g_ctrl_fd, f->id, f->value, timeout_us, event_id);
+            if (rc == MAKERESULT(Module_LibnxNvidia, LibnxNvidiaError_Timeout)) {
+                u64 timeout_ns = U64_MAX;
+                if (timeout_us >= 0)
+                    timeout_ns = (u64)1000*timeout_us;
+                rc = eventWait(event, timeout_ns);
+                if ((rc & 0x3FFFFF) == 0xEA01) { // timeout
+                    nvioctlNvhostCtrl_EventSignal(g_ctrl_fd, 0x10000000 | event_id);
+                    rc = MAKERESULT(Module_LibnxNvidia, LibnxNvidiaError_Timeout);
+                }
+            }
+        }
+        _nvFreeEventSlot(event_id);
+    }
+    return rc;
+}
+
+Result nvMultiFenceWait(NvMultiFence* mf, s32 timeout_us)
+{
+    // TODO: properly respect timeout
+    Result rc = 0;
+    for (u32 i = 0; i < mf->num_fences; i ++) {
+        rc = nvFenceWait(&mf->fences[i], timeout_us);
+        if (R_FAILED(rc))
+            break;
+    }
+    return rc;
+}
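
Usage note (not part of the patch): the timeout parameter changes from milliseconds to microseconds, and waiting now requires the new ref-counted init/exit pair around it. Below is a minimal sketch of driving the new API, assuming the NvFence was produced by a prior GPU submission; the helper name wait_for_fence is illustrative and error handling is abbreviated.

#include <switch.h>

// Minimal sketch: block until the GPU signals the given fence,
// or until the timeout elapses.
Result wait_for_fence(NvFence* fence)
{
    // Ref-counted init; opens /dev/nvhost-ctrl on the first call.
    Result rc = nvFenceInit();
    if (R_FAILED(rc))
        return rc;

    // Timeout is in microseconds; a negative value waits indefinitely
    // (the implementation maps it to an infinite eventWait).
    rc = nvFenceWait(fence, 1000000); // wait up to 1 second

    // Drops the reference; the fd and cached events are freed when
    // the count reaches zero.
    nvFenceExit();
    return rc;
}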