diff --git a/nx/include/switch/nvidia/gpu/gpfifo.h b/nx/include/switch/nvidia/gpu/gpfifo.h
index 661071ec..1869737a 100644
--- a/nx/include/switch/nvidia/gpu/gpfifo.h
+++ b/nx/include/switch/nvidia/gpu/gpfifo.h
@@ -1,13 +1,19 @@
 #pragma once
 
+#define GPFIFO_QUEUE_SIZE        0x800 ///< Capacity of NvGpfifo::entries; nvGpfifoCreate also passes it to the GPFIFO allocation ioctl.
+#define GPFIFO_ENTRY_NOT_MAIN    BIT(9) ///< Bit 9. NOTE(review): presumably a value for the flags argument of nvGpfifoAppendEntry -- confirm.
+#define GPFIFO_ENTRY_NO_PREFETCH BIT(31) ///< Bit 31. NOTE(review): presumably a value for the flags argument of nvGpfifoAppendEntry -- confirm.
+
 typedef struct {
     NvChannel* parent;
+    u32 syncpt_id; ///< Set by nvGpfifoCreate from the id of the fence returned by the GPFIFO allocation ioctl.
+    u32 num_entries; ///< Number of queued elements at the start of entries[]; reset to 0 by a successful nvGpfifoFlush.
+    nvioctl_gpfifo_entry entries[GPFIFO_QUEUE_SIZE]; ///< Entries queued by nvGpfifoAppendEntry until the next nvGpfifoFlush.
 } NvGpfifo;
 
 Result nvGpfifoCreate(NvGpfifo* f, NvChannel* parent);
 void   nvGpfifoClose(NvGpfifo* f);
 
-#define NV_MAKE_GPFIFO_ENTRY(iova, size) \
-    ((iova) | (((u64)(size)) << 42))
-
-Result nvGpfifoSubmitCmdList(NvGpfifo* f, NvCmdList* cmd_list, u32 fence_incr, NvFence* fence_out);
+Result nvGpfifoAppendEntry(NvGpfifo* f, iova_t start, size_t num_cmds, u32 flags); // Queues one entry (GPU address, command count, flags); fails with LibnxError_OutOfMemory when the queue is full.
+Result nvGpfifoAppendCmdList(NvGpfifo* f, NvCmdList* cmd_list, u32 flags); // Queues cmd_list's pending commands as one entry and, on success, advances the list past them.
+Result nvGpfifoFlush(NvGpfifo* f, u32 fence_incr, NvFence* fence_out); // Submits all queued entries; fails with LibnxError_NotFound when nothing is queued.
diff --git a/nx/source/nvidia/gpu/gpfifo.c b/nx/source/nvidia/gpu/gpfifo.c
index e69bc516..86324449 100644
--- a/nx/source/nvidia/gpu/gpfifo.c
+++ b/nx/source/nvidia/gpu/gpfifo.c
@@ -4,6 +4,7 @@
 #include "arm/atomics.h"
 #include "kernel/svc.h"
 #include "kernel/event.h"
+#include "kernel/detect.h"
 #include "services/nv.h"
 #include "nvidia/ioctl.h"
 #include "nvidia/buffer.h"
@@ -18,17 +19,23 @@
 #include "nvidia/gpu/error_notifier.h"
 #include "nvidia/gpu/gpu.h"
 
-#define DEFAULT_FIFO_ENTRIES 0x800
 
+// Binds f to its parent channel, clears the software entry queue and issues
+// the AllocGpfifoEx2 ioctl, passing GPFIFO_QUEUE_SIZE as the queue size.
+// Returns the ioctl's result; f->syncpt_id is set from the returned fence
+// only when the ioctl succeeds.
 Result nvGpfifoCreate(NvGpfifo* f, NvChannel* parent)
 {
     f->parent = parent;
+    // The queue must start out empty: nvGpfifoAppendEntry indexes entries[]
+    // with num_entries, so it cannot be left uninitialized.
+    f->num_entries = 0;
 
     NvFence fence;
-    Result res = nvioctlChannel_AllocGpfifoEx2(parent->fd, DEFAULT_FIFO_ENTRIES, 1, 0, 0, 0, 0, &fence);
-    //__builtin_printf("nvGpfifoCreate initial fence: %d %u\n", (int)fence.id, fence.value);
-    //if (R_SUCCEEDED(res) && (s32)fence.id >= 0)
-    //    nvFenceWait(&fence, -1);
+    Result res = nvioctlChannel_AllocGpfifoEx2(parent->fd, GPFIFO_QUEUE_SIZE, 1, 0, 0, 0, 0, &fence);
+    // fence is an uninitialized local: only read it once the ioctl succeeded.
+    if (R_SUCCEEDED(res))
+        f->syncpt_id = fence.id;
     return res;
 }
 
@@ -36,14 +34,59 @@ void nvGpfifoClose(NvGpfifo* f) {
     /**/
 }
 
-Result nvGpfifoSubmitCmdList(NvGpfifo* f, NvCmdList* cmd_list, u32 fence_incr, NvFence* fence_out)
+// Queues a single GPFIFO entry describing num_cmds commands at GPU address
+// start; nothing is sent to the channel until nvGpfifoFlush.
+// flags is OR'd into the upper 32 bits of the entry.
+// Fails with LibnxError_OutOfMemory when the queue is full and with
+// LibnxError_BadInput when num_cmds does not fit the entry's count field.
+Result nvGpfifoAppendEntry(NvGpfifo* f, iova_t start, size_t num_cmds, u32 flags)
+{
+    if (f->num_entries >= GPFIFO_QUEUE_SIZE)
+        return MAKERESULT(Module_Libnx, LibnxError_OutOfMemory);
+
+    // The count is stored from bit 10 of the upper word and bit 31 is a flag
+    // (GPFIFO_ENTRY_NO_PREFETCH): anything wider would be truncated or set it.
+    if (num_cmds >= BIT(21))
+        return MAKERESULT(Module_Libnx, LibnxError_BadInput);
+
+    nvioctl_gpfifo_entry* entry = &f->entries[f->num_entries++];
+    entry->desc = start;
+    // NOTE(review): desc32[1] presumably overlays the upper half of desc, hence OR.
+    entry->desc32[1] |= flags | ((u32)num_cmds << 10);
+    return 0;
+}
+
+// Queues the pending commands of cmd_list (num_cmds commands starting at
+// offset cmd_list->offset) as one entry. On success the list's offset is
+// advanced past them and num_cmds is reset to 0; on failure cmd_list is
+// left untouched.
+Result nvGpfifoAppendCmdList(NvGpfifo* f, NvCmdList* cmd_list, u32 flags)
+{
+    Result rc = nvGpfifoAppendEntry(f,
+        nvCmdListGetGpuAddr(cmd_list) + 4*cmd_list->offset,
+        cmd_list->num_cmds,
+        flags);
+
+    if (R_SUCCEEDED(rc)) {
+        cmd_list->offset += cmd_list->num_cmds;
+        cmd_list->num_cmds = 0;
+    }
+
+    return rc;
+}
+
+// Submits every queued entry to the parent channel in one ioctl.
+// Returns LibnxError_NotFound when the queue is empty. fence_incr is passed
+// to the driver as the fence value and, when non-zero, also sets flag bit 8.
+// On success the queue is emptied and *fence_out (if non-NULL) receives the
+// fence handed back by the ioctl.
+Result nvGpfifoFlush(NvGpfifo* f, u32 fence_incr, NvFence* fence_out)
 {
     Result rc;
-    nvioctl_gpfifo_entry ent;
     NvFence fence;
 
-    ent.desc = nvCmdListGetGpuAddr(cmd_list) + 4*cmd_list->offset;
-    ent.desc32[1] |= (2 << 8) | (nvCmdListGetListSize(cmd_list) << 10);
+    if (!f->num_entries)
+        return MAKERESULT(Module_Libnx, LibnxError_NotFound);
 
     fence.id = 0;
     fence.value = fence_incr;
@@ -52,14 +75,16 @@ Result nvGpfifoSubmitCmdList(NvGpfifo* f, NvCmdList* cmd_list, u32 fence_incr, N
     if (fence_incr)
         flags |= BIT(8);
 
-    rc = nvioctlChannel_SubmitGpfifo(
-        f->parent->fd, &ent, 1, flags, &fence);
+    if (kernelAbove400()) // NOTE(review): presumably true on system version 4.0.0+, where KickoffPb replaces SubmitGpfifo -- confirm
+        rc = nvioctlChannel_KickoffPb(f->parent->fd, f->entries, f->num_entries, flags, &fence);
+    else
+        rc = nvioctlChannel_SubmitGpfifo(f->parent->fd, f->entries, f->num_entries, flags, &fence);
 
-    if (R_SUCCEEDED(rc) && fence_out)
-        *fence_out = fence;
-
-    cmd_list->offset += cmd_list->num_cmds;
-    cmd_list->num_cmds = 0;
+    if (R_SUCCEEDED(rc)) {
+        f->num_entries = 0; // the queue is emptied only after a successful submit; on failure the entries stay queued
+        if (fence_out) // fence_out may be NULL when the caller does not need the resulting fence
+            *fence_out = fence;
+    }
 
     return rc;
 }