arm: Implement native code execution backend

2023-11-17 23:44:53 +07:00 · 2023-11-17 23:44:53 +07:00 · 9f91ba1f73
parent 4838837620
commit 9f91ba1f73
31 changed files with 1803 additions and 51 deletions
--- a/externals/CMakeLists.txt
+++ b/externals/CMakeLists.txt
@ -20,16 +20,16 @@ if ((ARCHITECTURE_x86 OR ARCHITECTURE_x86_64) AND NOT TARGET xbyak::xbyak)
 endif()

 # Dynarmic
+if (ARCHITECTURE_arm64 AND NOT TARGET merry::oaknut)
+    add_subdirectory(oaknut)
+endif()
+
 if ((ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64) AND NOT TARGET dynarmic::dynarmic)
    set(DYNARMIC_IGNORE_ASSERTS ON)
    add_subdirectory(dynarmic)
    add_library(dynarmic::dynarmic ALIAS dynarmic)
 endif()

-if (ARCHITECTURE_arm64 AND NOT TARGET merry::oaknut)
-    add_subdirectory(oaknut)
-endif()
-
 # getopt
 if (MSVC)
    add_subdirectory(getopt)
--- a/src/common/host_memory.cpp
+++ b/src/common/host_memory.cpp
@ -189,6 +189,11 @@ public:
        }
    }

+    void EnableDirectMappedAddress() {
+        // TODO
+        UNREACHABLE();
+    }
+
    const size_t backing_size; ///< Size of the backing memory in bytes
    const size_t virtual_size; ///< Size of the virtual address placeholder in bytes

@ -340,11 +345,6 @@ private:
        return false;
    }

-    void EnableDirectMappedAddress() {
-        // TODO
-        UNREACHABLE();
-    }
-
    HANDLE process{};        ///< Current process handle
    HANDLE backing_handle{}; ///< File based backing memory

--- a/src/common/settings.cpp
+++ b/src/common/settings.cpp
@ -158,8 +158,8 @@ bool IsFastmemEnabled() {

 static bool is_nce_enabled = false;

-void SetNceEnabled(bool is_64bit) {
-    is_nce_enabled = values.cpu_backend.GetValue() == CpuBackend::Nce && is_64bit;
+void SetNceEnabled(bool is_39bit) {
+    is_nce_enabled = values.cpu_backend.GetValue() == CpuBackend::Nce && is_39bit;
 }

 bool IsNceEnabled() {
--- a/src/common/settings.h
+++ b/src/common/settings.h
@ -181,7 +181,7 @@ struct Values {

    // Cpu
    SwitchableSetting<CpuBackend, true> cpu_backend{
-        linkage,         CpuBackend::Dynarmic, CpuBackend::Dynarmic,
+        linkage,         CpuBackend::Nce, CpuBackend::Dynarmic,
 #ifdef ARCHITECTURE_arm64
        CpuBackend::Nce,
 #else
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@ -926,6 +926,22 @@ if (ENABLE_WEB_SERVICE)
    target_link_libraries(core PRIVATE web_service)
 endif()

+if (ARCHITECTURE_arm64)
+    enable_language(C ASM)
+    set(CMAKE_ASM_FLAGS "${CFLAGS} -x assembler-with-cpp") # NOTE(review): overwrites CMAKE_ASM_FLAGS; ${CFLAGS} is a CMake variable, not $ENV{CFLAGS} - confirm
+
+    target_sources(core PRIVATE
+        arm/nce/arm_nce.cpp
+        arm/nce/arm_nce.h
+        arm/nce/arm_nce.s # run through the C preprocessor: it #includes arm_nce_asm_definitions.h
+        arm/nce/guest_context.h
+        arm/nce/patch.cpp
+        arm/nce/patch.h
+        arm/nce/instructions.h
+    )
+    target_link_libraries(core PRIVATE merry::oaknut)
+endif()
+
 if (ARCHITECTURE_x86_64 OR ARCHITECTURE_arm64)
    target_sources(core PRIVATE
        arm/dynarmic/arm_dynarmic.h
--- a/src/core/arm/arm_interface.h
+++ b/src/core/arm/arm_interface.h
@ -81,6 +81,9 @@ public:
    // thread context to be 800 bytes in size.
    static_assert(sizeof(ThreadContext64) == 0x320);

+    /// Perform any backend-specific initialization.
+    virtual void Initialize() {}
+
    /// Runs the CPU until an event happens
    void Run();

--- a/src/core/arm/nce/arm_nce.cpp
+++ b/src/core/arm/nce/arm_nce.cpp
@ -0,0 +1,395 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <cinttypes>
+#include <memory>
+
+#include "common/scope_exit.h"
+#include "common/signal_chain.h"
+#include "core/arm/nce/arm_nce.h"
+#include "core/arm/nce/patch.h"
+#include "core/core.h"
+#include "core/memory.h"
+
+#include "core/hle/kernel/k_process.h"
+
+#include <signal.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+namespace Core {
+
+namespace {
+
+struct sigaction g_orig_action;
+
+// Verify assembly offsets.
+using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;
+static_assert(offsetof(NativeExecutionParameters, native_context) == TpidrEl0NativeContext);
+static_assert(offsetof(NativeExecutionParameters, lock) == TpidrEl0Lock);
+static_assert(offsetof(NativeExecutionParameters, magic) == TpidrEl0TlsMagic);
+
+/// Locates the FPSIMD record inside the signal frame's reserved area.
+///
+/// The reserved area holds a list of records, each starting with an `_aarch64_ctx` header
+/// (magic + size). The walk stops at a zero-size record or at the end of the reserved area
+/// instead of running on indefinitely.
+///
+/// @param host_ctx Machine context taken from the signal handler's ucontext.
+/// @return Pointer to the FPSIMD record, or nullptr if none is found inside `__reserved`.
+///         NOTE(review): callers dereference the result unconditionally, so a missing record
+///         now faults immediately rather than looping or reading out of bounds.
+fpsimd_context* GetFloatingPointState(mcontext_t& host_ctx) {
+    auto* const base = reinterpret_cast<char*>(&host_ctx.__reserved);
+    const std::size_t reserved_size = sizeof(host_ctx.__reserved);
+
+    std::size_t offset = 0;
+    while (offset + sizeof(_aarch64_ctx) <= reserved_size) {
+        auto* const header = reinterpret_cast<_aarch64_ctx*>(base + offset);
+        if (header->magic == FPSIMD_MAGIC) {
+            return reinterpret_cast<fpsimd_context*>(header);
+        }
+        if (header->size == 0) {
+            // A zero-size record would never advance the cursor: treat it as end of list.
+            break;
+        }
+        offset += header->size;
+    }
+    return nullptr;
+}
+
+} // namespace
+
+void* ARM_NCE::RestoreGuestContext(void* raw_context) {
+    // Retrieve the host context from the signal frame; it is edited in place below.
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+
+    // The thread parameters pointer was placed in x9 before the signal was raised.
+    auto* tpidr = reinterpret_cast<NativeExecutionParameters*>(host_ctx.regs[9]);
+    auto* guest_ctx = static_cast<GuestContext*>(tpidr->native_context);
+
+    // Retrieve the host floating point state.
+    auto* fpctx = GetFloatingPointState(host_ctx);
+
+    // Save host callee-saved registers (vector registers from v8 up, GPRs from x19 up).
+    std::memcpy(guest_ctx->host_ctx.host_saved_vregs.data(), &fpctx->vregs[8],
+                sizeof(guest_ctx->host_ctx.host_saved_vregs));
+    std::memcpy(guest_ctx->host_ctx.host_saved_regs.data(), &host_ctx.regs[19],
+                sizeof(guest_ctx->host_ctx.host_saved_regs));
+
+    // Save stack pointer.
+    guest_ctx->host_ctx.host_sp = host_ctx.sp;
+
+    // Restore all guest state except tpidr_el0.
+    host_ctx.sp = guest_ctx->sp;
+    host_ctx.pc = guest_ctx->pc;
+    host_ctx.pstate = guest_ctx->pstate;
+    fpctx->fpcr = guest_ctx->fpcr;
+    fpctx->fpsr = guest_ctx->fpsr;
+    std::memcpy(host_ctx.regs, guest_ctx->cpu_registers.data(), sizeof(host_ctx.regs));
+    std::memcpy(fpctx->vregs, guest_ctx->vector_registers.data(), sizeof(fpctx->vregs));
+
+    // Return the thread parameters pointer; the assembly caller installs it as tpidr_el0.
+    return tpidr;
+}
+
+void ARM_NCE::SaveGuestContext(GuestContext* guest_ctx, void* raw_context) {
+    // Retrieve the host context from the signal frame; it is edited in place below.
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+
+    // Retrieve the host floating point state.
+    auto* fpctx = GetFloatingPointState(host_ctx);
+
+    // Save all guest registers except tpidr_el0.
+    std::memcpy(guest_ctx->cpu_registers.data(), host_ctx.regs, sizeof(host_ctx.regs));
+    std::memcpy(guest_ctx->vector_registers.data(), fpctx->vregs, sizeof(fpctx->vregs));
+    guest_ctx->fpsr = fpctx->fpsr;
+    guest_ctx->fpcr = fpctx->fpcr;
+    guest_ctx->pstate = static_cast<u32>(host_ctx.pstate);
+    guest_ctx->pc = host_ctx.pc;
+    guest_ctx->sp = host_ctx.sp;
+
+    // Restore stack pointer.
+    host_ctx.sp = guest_ctx->host_ctx.host_sp;
+
+    // Restore host callee-saved registers (the mirror image of RestoreGuestContext).
+    std::memcpy(&host_ctx.regs[19], guest_ctx->host_ctx.host_saved_regs.data(),
+                sizeof(guest_ctx->host_ctx.host_saved_regs));
+    std::memcpy(&fpctx->vregs[8], guest_ctx->host_ctx.host_saved_vregs.data(),
+                sizeof(guest_ctx->host_ctx.host_saved_vregs));
+
+    // Resume the host at its saved x30 (index 11 of the x19..x30 save area).
+    host_ctx.pc = guest_ctx->host_ctx.host_saved_regs[11];
+
+    // Atomically take the pending esr_el1 bits and hand them back to the host in x0.
+    host_ctx.regs[0] = guest_ctx->esr_el1.exchange(0);
+}
+
+bool ARM_NCE::HandleGuestFault(GuestContext* guest_ctx, void* raw_info, void* raw_context) {
+    auto& host_ctx = static_cast<ucontext_t*>(raw_context)->uc_mcontext;
+    auto* info = static_cast<siginfo_t*>(raw_info);
+
+    // Try to handle an invalid access by invalidating the page containing the fault address.
+    // TODO: handle accesses which split a page?
+    const Common::ProcessAddress addr =
+        (reinterpret_cast<u64>(info->si_addr) & ~Memory::YUZU_PAGEMASK);
+    if (guest_ctx->system->ApplicationMemory().InvalidateNCE(addr, Memory::YUZU_PAGESIZE)) {
+        // We handled the access successfully and are returning to guest code.
+        return true;
+    }
+
+    // We can't handle the access, so trigger an exception.
+    const bool is_prefetch_abort = host_ctx.pc == reinterpret_cast<u64>(info->si_addr);
+    guest_ctx->esr_el1.fetch_or(
+        static_cast<u64>(is_prefetch_abort ? HaltReason::PrefetchAbort : HaltReason::DataAbort));
+
+    // Forcibly mark the context as locked. We are still running.
+    // We may race with SignalInterrupt here:
+    // - If we lose the race, then SignalInterrupt will send us a signal which we are masking,
+    //   and it will do nothing when it is unmasked, as we have already left guest code.
+    // - If we win the race, then SignalInterrupt will wait for us to unlock first.
+    auto& thread_params = guest_ctx->parent->running_thread->GetNativeExecutionParameters();
+    thread_params.lock.store(SpinLockLocked);
+
+    // Return to host: the signal frame is rewritten so the handler returns into host code.
+    SaveGuestContext(guest_ctx, raw_context);
+    return false;
+}
+
+/// Forwards a fault signal that was not raised by guest code to the handler that was
+/// registered before ours (captured in g_orig_action when the signal handlers were installed).
+// NOTE(review): if the previous disposition was SIG_DFL or SIG_IGN, sa_sigaction is not a
+// callable function address - confirm whether that case can occur in practice.
+void ARM_NCE::HandleHostFault(int sig, void* raw_info, void* raw_context) {
+    auto* const info = static_cast<siginfo_t*>(raw_info);
+    g_orig_action.sa_sigaction(sig, info, raw_context);
+}
+
+HaltReason ARM_NCE::RunJit() {
+    // Get the thread parameters.
+    // TODO: pass the current thread down from ::Run
+    auto* thread = Kernel::GetCurrentThreadPointer(system.Kernel());
+    auto* thread_params = &thread->GetNativeExecutionParameters();
+
+    {
+        // Lock our core context.
+        std::scoped_lock lk{lock};
+
+        // We should not be running.
+        ASSERT(running_thread == nullptr);
+
+        // Check if we need to run. If we have already been halted, we are done.
+        // The pending bits are consumed here, so they are reported exactly once.
+        u64 halt = guest_ctx.esr_el1.exchange(0);
+        if (halt != 0) {
+            return static_cast<HaltReason>(halt);
+        }
+
+        // Mark that we are running.
+        running_thread = thread;
+
+        // Acquire the lock on the thread parameters.
+        // This allows us to force synchronization with SignalInterrupt.
+        LockThreadParameters(thread_params);
+    }
+
+    // Publish the guest context pointer and TLS register values into the thread parameters.
+    guest_ctx.parent = this;
+    thread_params->native_context = &guest_ctx;
+    thread_params->tpidr_el0 = guest_ctx.tpidr_el0;
+    thread_params->tpidrro_el0 = guest_ctx.tpidrro_el0;
+    thread_params->is_running = true;
+
+    HaltReason halt{};
+
+    // TODO: finding and creating the post handler needs to be locked
+    // to deal with dynamic loading of NROs.
+    // Enter via the trampoline registered for this pc if there is one, else via a signal.
+    const auto& post_handlers = system.ApplicationProcess()->GetPostHandlers();
+    if (auto it = post_handlers.find(guest_ctx.pc); it != post_handlers.end()) {
+        halt = ReturnToRunCodeByTrampoline(thread_params, &guest_ctx, it->second);
+    } else {
+        halt = ReturnToRunCodeByExceptionLevelChange(thread_id, thread_params);
+    }
+
+    // Unload members: copy tpidr_el0 back from the parameters and detach the guest context.
+    // The thread does not change, so we can persist the old reference.
+    guest_ctx.tpidr_el0 = thread_params->tpidr_el0;
+    thread_params->native_context = nullptr;
+    thread_params->is_running = false;
+
+    // Unlock the thread parameters.
+    UnlockThreadParameters(thread_params);
+
+    {
+        // Lock the core context.
+        std::scoped_lock lk{lock};
+
+        // On exit, we no longer have an active thread.
+        running_thread = nullptr;
+    }
+
+    // Return the halt reason.
+    return halt;
+}
+
+HaltReason ARM_NCE::StepJit() { // Executes nothing; only reports the step halt reason.
+    return HaltReason::StepThread;
+}
+
+u32 ARM_NCE::GetSvcNumber() const { // Reads the svc_swi field of the guest context.
+    return guest_ctx.svc_swi;
+}
+
+ARM_NCE::ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_)
+    : ARM_Interface{system_, uses_wall_clock_}, core_index{core_index_} {
+    guest_ctx.system = &system_; // Read back by HandleGuestFault to reach application memory.
+}
+
+ARM_NCE::~ARM_NCE() = default;
+
+void ARM_NCE::Initialize() {
+    thread_id = gettid(); // Thread id later targeted by tkill (SignalInterrupt, signal entry).
+
+    // Install the process-wide signal handlers exactly once, however many cores initialize.
+    static std::once_flag flag;
+    std::call_once(flag, [] {
+        using HandlerType = decltype(sigaction::sa_sigaction);
+
+        sigset_t signal_mask; // Signals blocked while any of our three handlers runs.
+        sigemptyset(&signal_mask);
+        sigaddset(&signal_mask, ReturnToRunCodeByExceptionLevelChangeSignal);
+        sigaddset(&signal_mask, BreakFromRunCodeSignal);
+        sigaddset(&signal_mask, GuestFaultSignal);
+
+        struct sigaction return_to_run_code_action {};
+        return_to_run_code_action.sa_flags = SA_SIGINFO | SA_ONSTACK;
+        return_to_run_code_action.sa_sigaction = reinterpret_cast<HandlerType>(
+            &ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler);
+        return_to_run_code_action.sa_mask = signal_mask;
+        Common::SigAction(ReturnToRunCodeByExceptionLevelChangeSignal, &return_to_run_code_action,
+                          nullptr);
+
+        struct sigaction break_from_run_code_action {};
+        break_from_run_code_action.sa_flags = SA_SIGINFO | SA_ONSTACK;
+        break_from_run_code_action.sa_sigaction =
+            reinterpret_cast<HandlerType>(&ARM_NCE::BreakFromRunCodeSignalHandler);
+        break_from_run_code_action.sa_mask = signal_mask;
+        Common::SigAction(BreakFromRunCodeSignal, &break_from_run_code_action, nullptr);
+
+        struct sigaction fault_action {};
+        fault_action.sa_flags = SA_SIGINFO | SA_ONSTACK | SA_RESTART;
+        fault_action.sa_sigaction =
+            reinterpret_cast<HandlerType>(&ARM_NCE::GuestFaultSignalHandler);
+        fault_action.sa_mask = signal_mask;
+        Common::SigAction(GuestFaultSignal, &fault_action, &g_orig_action); // Keep the old one.
+
+        // Simplify call for g_orig_action, so HandleHostFault can always use sa_sigaction.
+        // These fields occupy the same space in memory, so this should be a no-op in practice.
+        if (!(g_orig_action.sa_flags & SA_SIGINFO)) {
+            g_orig_action.sa_sigaction =
+                reinterpret_cast<decltype(g_orig_action.sa_sigaction)>(g_orig_action.sa_handler);
+        }
+    });
+}
+
+void ARM_NCE::SetPC(u64 pc) {
+    guest_ctx.pc = pc;
+}
+
+u64 ARM_NCE::GetPC() const {
+    return guest_ctx.pc;
+}
+
+u64 ARM_NCE::GetSP() const {
+    return guest_ctx.sp;
+}
+
+u64 ARM_NCE::GetReg(int index) const { // Unchecked: cpu_registers holds 31 entries.
+    return guest_ctx.cpu_registers[index];
+}
+
+void ARM_NCE::SetReg(int index, u64 value) { // Unchecked: cpu_registers holds 31 entries.
+    guest_ctx.cpu_registers[index] = value;
+}
+
+u128 ARM_NCE::GetVectorReg(int index) const { // Unchecked: vector_registers holds 32 entries.
+    return guest_ctx.vector_registers[index];
+}
+
+void ARM_NCE::SetVectorReg(int index, u128 value) { // Unchecked: 32 entries.
+    guest_ctx.vector_registers[index] = value;
+}
+
+u32 ARM_NCE::GetPSTATE() const {
+    return guest_ctx.pstate;
+}
+
+void ARM_NCE::SetPSTATE(u32 pstate) {
+    guest_ctx.pstate = pstate;
+}
+
+u64 ARM_NCE::GetTlsAddress() const { // The TLS address is kept in tpidrro_el0.
+    return guest_ctx.tpidrro_el0;
+}
+
+void ARM_NCE::SetTlsAddress(u64 address) { // The TLS address is kept in tpidrro_el0.
+    guest_ctx.tpidrro_el0 = address;
+}
+
+u64 ARM_NCE::GetTPIDR_EL0() const {
+    return guest_ctx.tpidr_el0;
+}
+
+void ARM_NCE::SetTPIDR_EL0(u64 value) {
+    guest_ctx.tpidr_el0 = value;
+}
+
+void ARM_NCE::SaveContext(ThreadContext64& ctx) const { // Copies guest state out to ctx.
+    ctx.cpu_registers = guest_ctx.cpu_registers;
+    ctx.sp = guest_ctx.sp;
+    ctx.pc = guest_ctx.pc;
+    ctx.pstate = guest_ctx.pstate;
+    ctx.vector_registers = guest_ctx.vector_registers;
+    ctx.fpcr = guest_ctx.fpcr;
+    ctx.fpsr = guest_ctx.fpsr;
+    ctx.tpidr = guest_ctx.tpidr_el0; // tpidrro_el0 is not copied by this function.
+}
+
+void ARM_NCE::LoadContext(const ThreadContext64& ctx) { // Copies ctx into the guest state.
+    guest_ctx.cpu_registers = ctx.cpu_registers;
+    guest_ctx.sp = ctx.sp;
+    guest_ctx.pc = ctx.pc;
+    guest_ctx.pstate = ctx.pstate;
+    guest_ctx.vector_registers = ctx.vector_registers;
+    guest_ctx.fpcr = ctx.fpcr;
+    guest_ctx.fpsr = ctx.fpsr;
+    guest_ctx.tpidr_el0 = ctx.tpidr; // tpidrro_el0 is not written by this function.
+}
+
+void ARM_NCE::SignalInterrupt() {
+    // Lock core context.
+    std::scoped_lock lk{lock};
+
+    // Record the break request first, so a core that is not running yet sees it in RunJit.
+    guest_ctx.esr_el1.fetch_or(static_cast<u64>(HaltReason::BreakLoop));
+
+    // If there is no thread running, we are done.
+    if (running_thread == nullptr) {
+        return;
+    }
+
+    // Lock the thread context.
+    auto* params = &running_thread->GetNativeExecutionParameters();
+    LockThreadParameters(params);
+
+    if (params->is_running) {
+        // We should signal to the running thread.
+        // The running thread will unlock the thread context.
+        syscall(SYS_tkill, thread_id, BreakFromRunCodeSignal);
+    } else {
+        // If the thread is no longer running, we have nothing to do.
+        UnlockThreadParameters(params);
+    }
+}
+
+void ARM_NCE::ClearInterrupt() {
+    guest_ctx.esr_el1 = {}; // Clears every pending halt bit, not only BreakLoop.
+}
+
+void ARM_NCE::ClearInstructionCache() {
+    // TODO: This is not possible to implement correctly on Linux because
+    // we do not have any access to ic iallu.
+
+    // Only a sequentially consistent fence is issued; no cache maintenance happens here.
+    std::atomic_thread_fence(std::memory_order_seq_cst);
+}
+
+void ARM_NCE::InvalidateCacheRange(u64 addr, std::size_t size) {
+    // addr is used directly as a host pointer; the range is [addr, addr + size).
+    auto* ptr = reinterpret_cast<char*>(addr);
+    __builtin___clear_cache(ptr, ptr + size);
+}
+
+void ARM_NCE::ClearExclusiveState() {
+    // Intentionally a no-op for this backend.
+}
+
+void ARM_NCE::PageTableChanged(Common::PageTable& page_table,
+                               std::size_t new_address_space_size_in_bits) {
+    // No-op. Page table is never used; both parameters are ignored.
+}
+
+} // namespace Core
--- a/src/core/arm/nce/arm_nce.h
+++ b/src/core/arm/nce/arm_nce.h
@ -0,0 +1,108 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <atomic>
+#include <memory>
+#include <mutex>
+#include <span>
+#include <unordered_map>
+#include <vector>
+
+#include "core/arm/arm_interface.h"
+#include "core/arm/nce/guest_context.h"
+
+namespace Core::Memory {
+class Memory;
+}
+
+namespace Core {
+
+class System;
+
+class ARM_NCE final : public ARM_Interface {
+public:
+    ARM_NCE(System& system_, bool uses_wall_clock_, std::size_t core_index_);
+
+    ~ARM_NCE() override;
+
+    void Initialize() override; // Records the calling thread's id and installs signal handlers.
+    void SetPC(u64 pc) override;
+    u64 GetPC() const override;
+    u64 GetSP() const override;
+    u64 GetReg(int index) const override;
+    void SetReg(int index, u64 value) override;
+    u128 GetVectorReg(int index) const override;
+    void SetVectorReg(int index, u128 value) override;
+
+    u32 GetPSTATE() const override;
+    void SetPSTATE(u32 pstate) override;
+    u64 GetTlsAddress() const override;
+    void SetTlsAddress(u64 address) override;
+    void SetTPIDR_EL0(u64 value) override;
+    u64 GetTPIDR_EL0() const override;
+
+    Architecture GetArchitecture() const override {
+        return Architecture::Aarch64;
+    }
+
+    void SaveContext(ThreadContext32& ctx) const override {} // 32-bit contexts are ignored.
+    void SaveContext(ThreadContext64& ctx) const override;
+    void LoadContext(const ThreadContext32& ctx) override {} // 32-bit contexts are ignored.
+    void LoadContext(const ThreadContext64& ctx) override;
+
+    void SignalInterrupt() override;
+    void ClearInterrupt() override;
+    void ClearExclusiveState() override;
+    void ClearInstructionCache() override;
+    void InvalidateCacheRange(u64 addr, std::size_t size) override;
+    void PageTableChanged(Common::PageTable& new_page_table,
+                          std::size_t new_address_space_size_in_bits) override;
+
+protected:
+    HaltReason RunJit() override;
+    HaltReason StepJit() override;
+
+    u32 GetSvcNumber() const override;
+
+    const Kernel::DebugWatchpoint* HaltedWatchpoint() const override {
+        return nullptr; // This backend never reports a halted watchpoint.
+    }
+
+    void RewindBreakpointInstruction() override {}
+
+private:
+    // Assembly definitions (implemented in arm_nce.s under their mangled names).
+    static HaltReason ReturnToRunCodeByTrampoline(void* tpidr, GuestContext* ctx,
+                                                  u64 trampoline_addr);
+    static HaltReason ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr);
+
+    static void ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info,
+                                                                   void* raw_context);
+    static void BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context);
+    static void GuestFaultSignalHandler(int sig, void* info, void* raw_context);
+
+    static void LockThreadParameters(void* tpidr);
+    static void UnlockThreadParameters(void* tpidr);
+
+private:
+    // C++ implementation functions for assembly definitions (called from arm_nce.s).
+    static void* RestoreGuestContext(void* raw_context);
+    static void SaveGuestContext(GuestContext* ctx, void* raw_context);
+    static bool HandleGuestFault(GuestContext* ctx, void* info, void* raw_context);
+    static void HandleHostFault(int sig, void* info, void* raw_context);
+
+public:
+    // Members set on initialization.
+    std::size_t core_index{};
+    pid_t thread_id{-1}; // Stays -1 until Initialize() stores gettid().
+
+    // Core context.
+    GuestContext guest_ctx;
+
+    // Thread and invalidation info.
+    std::mutex lock; // Held by RunJit and SignalInterrupt around running_thread accesses.
+    Kernel::KThread* running_thread{};
+};
+
+} // namespace Core
--- a/src/core/arm/nce/arm_nce.s
+++ b/src/core/arm/nce/arm_nce.s
@ -0,0 +1,222 @@
+/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#include "core/arm/nce/arm_nce_asm_definitions.h"
+
+#define LOAD_IMMEDIATE_32(reg, val)                     \
+    mov     reg, #(((val) >> 0x00) & 0xFFFF);           \
+    movk    reg, #(((val) >> 0x10) & 0xFFFF), lsl #16
+
+
+/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByTrampoline(void* tpidr, Core::GuestContext* ctx, u64 trampoline_addr) */
+.section    .text._ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, "ax", %progbits
+.global     _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm
+.type       _ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm, %function
+_ZN4Core7ARM_NCE27ReturnToRunCodeByTrampolineEPvPNS_12GuestContextEm:
+    /* Back up host sp to x3. */
+    /* Back up host tpidr_el0 to x4. */
+    mov     x3, sp
+    mrs     x4, tpidr_el0
+
+    /* Load guest sp. x5 is used as a scratch register. */
+    ldr     x5, [x1, #(GuestContextSp)]
+    mov     sp, x5
+
+    /* Offset GuestContext pointer to the host member. */
+    add     x5, x1, #(GuestContextHostContext)
+
+    /* Save original host sp and tpidr_el0 (x3, x4) to host context. */
+    stp     x3, x4, [x5, #(HostContextSpTpidrEl0)]
+
+    /* Save all callee-saved host GPRs (x19-x30; the saved x30 is the host return address). */
+    stp     x19, x20, [x5, #(HostContextRegs+0x0)]
+    stp     x21, x22, [x5, #(HostContextRegs+0x10)]
+    stp     x23, x24, [x5, #(HostContextRegs+0x20)]
+    stp     x25, x26, [x5, #(HostContextRegs+0x30)]
+    stp     x27, x28, [x5, #(HostContextRegs+0x40)]
+    stp     x29, x30, [x5, #(HostContextRegs+0x50)]
+
+    /* Save all callee-saved host FPRs (q8-q15). */
+    stp     q8, q9,   [x5, #(HostContextVregs+0x0)]
+    stp     q10, q11, [x5, #(HostContextVregs+0x20)]
+    stp     q12, q13, [x5, #(HostContextVregs+0x40)]
+    stp     q14, q15, [x5, #(HostContextVregs+0x60)]
+
+    /* Install the thread parameters pointer (first argument, x0) as tpidr_el0. */
+    msr     tpidr_el0, x0
+
+    /* Tail call the trampoline (third argument, x2) to restore guest state. */
+    br      x2
+
+
+/* static HaltReason Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChange(int tid, void* tpidr) */
+.section    .text._ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv
+.type       _ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv, %function
+_ZN4Core7ARM_NCE37ReturnToRunCodeByExceptionLevelChangeEiPv:
+    /* This jumps to the signal handler, which will restore the entire context. */
+    /* On entry, x0 = thread id, which is already in the right place. */
+
+    /* Move tpidr to x9 so it is not trampled; RestoreGuestContext reads it from x9. */
+    mov     x9, x1
+
+    /* Set up arguments for tkill(tid, signal): syscall number in x8, signal in x1. */
+    mov     x8, #(__NR_tkill)
+    mov     x1, #(ReturnToRunCodeByExceptionLevelChangeSignal)
+
+    /* Tail call the signal handler by issuing the syscall. */
+    svc     #0
+
+    /* Block execution from flowing here. */
+    brk     #1000
+
+
+/* static void Core::ARM_NCE::ReturnToRunCodeByExceptionLevelChangeSignalHandler(int sig, void* info, void* raw_context) */
+.section    .text._ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE50ReturnToRunCodeByExceptionLevelChangeSignalHandlerEiPvS1_:
+    stp     x29, x30, [sp, #-0x10]!
+    mov     x29, sp
+
+    /* Call the context restorer with the raw context; it returns the new tpidr in x0. */
+    mov     x0, x2
+    bl      _ZN4Core7ARM_NCE19RestoreGuestContextEPv
+
+    /* Save the host value of tpidr_el0 into the guest context's host area. */
+    mrs     x8, tpidr_el0
+    ldr     x9, [x0, #(TpidrEl0NativeContext)]
+    str     x8, [x9, #(GuestContextHostContext + HostContextTpidrEl0)]
+
+    /* Set our new tpidr_el0. */
+    msr     tpidr_el0, x0
+
+    /* Unlock the context (x0 still holds the thread parameters pointer). */
+    bl      _ZN4Core7ARM_NCE22UnlockThreadParametersEPv
+
+    /* Returning from here will enter the guest. */
+    ldp     x29, x30, [sp], #0x10
+    ret
+
+
+/* static void Core::ARM_NCE::BreakFromRunCodeSignalHandler(int sig, void* info, void* raw_context) */
+.section    .text._ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE29BreakFromRunCodeSignalHandlerEiPvS1_:
+    /* Check to see if we have the correct TLS magic. */
+    mrs     x8, tpidr_el0
+    ldr     w9, [x8, #(TpidrEl0TlsMagic)]
+
+    LOAD_IMMEDIATE_32(w10, TlsMagic)
+
+    cmp     w9, w10
+    b.ne    1f
+
+    /* Correct TLS magic, so this is a guest interrupt. */
+    /* Restore host tpidr_el0; x0 is left holding the GuestContext pointer. */
+    ldr     x0, [x8, #(TpidrEl0NativeContext)]
+    ldr     x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)]
+    msr     tpidr_el0, x3
+
+    /* Tail call SaveGuestContext(ctx = x0, raw_context = x1). */
+    mov     x1, x2
+    b       _ZN4Core7ARM_NCE16SaveGuestContextEPNS_12GuestContextEPv
+
+    /* Returning from here will enter host code. */
+
+1:
+    /* Incorrect TLS magic, so this is a spurious signal. */
+    ret
+
+
+/* static void Core::ARM_NCE::GuestFaultSignalHandler(int sig, void* info, void* raw_context) */
+/* Dispatches a fault signal: with the guest TLS magic in place it is handled as a guest fault, */
+/* otherwise it is forwarded unchanged to the host fault handler. */
+.section    .text._ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, "ax", %progbits
+.global     _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_
+.type       _ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_, %function
+_ZN4Core7ARM_NCE23GuestFaultSignalHandlerEiPvS1_:
+    /* Check to see if we have the correct TLS magic. */
+    mrs     x8, tpidr_el0
+    ldr     w9, [x8, #(TpidrEl0TlsMagic)]
+
+    LOAD_IMMEDIATE_32(w10, TlsMagic)
+
+    cmp     w9, w10
+    b.eq    1f
+
+    /* Incorrect TLS magic, so this is a host fault. */
+    /* Tail call the handler; x0-x2 still hold (sig, info, raw_context). */
+    b       _ZN4Core7ARM_NCE15HandleHostFaultEiPvS1_
+
+1:
+    /* Correct TLS magic, so this is a guest fault. */
+    stp     x29, x30, [sp, #-0x20]!
+    str     x19, [sp, #0x10]
+    mov     x29, sp
+
+    /* Save the old tpidr_el0 in a callee-saved register so it survives the call. */
+    mov     x19, x8
+
+    /* Restore host tpidr_el0; x0 becomes the GuestContext pointer (first argument). */
+    ldr     x0, [x8, #(TpidrEl0NativeContext)]
+    ldr     x3, [x0, #(GuestContextHostContext + HostContextTpidrEl0)]
+    msr     tpidr_el0, x3
+
+    /* Call the handler with (ctx = x0, info = x1, raw_context = x2). */
+    bl      _ZN4Core7ARM_NCE16HandleGuestFaultEPNS_12GuestContextEPvS3_
+
+    /* The handler returns a C++ bool: only the low byte of x0 is defined, so mask it. */
+    and     w0, w0, #0xff
+
+    /* If the handler returned false, we want to preserve the host tpidr_el0. */
+    cbz     w0, 2f
+
+    /* Otherwise, restore guest tpidr_el0. */
+    msr     tpidr_el0, x19
+
+2:
+    ldr     x19, [sp, #0x10]
+    ldp     x29, x30, [sp], #0x20
+    ret
+
+
+/* static void Core::ARM_NCE::LockThreadParameters(void* tpidr) */
+.section    .text._ZN4Core7ARM_NCE20LockThreadParametersEPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE20LockThreadParametersEPv
+.type       _ZN4Core7ARM_NCE20LockThreadParametersEPv, %function
+_ZN4Core7ARM_NCE20LockThreadParametersEPv:
+    /* Offset to lock member; x0 now addresses the 32-bit lock word. */
+    add     x0, x0, #(TpidrEl0Lock)
+
+1:
+    /* Clear the monitor. */
+    clrex
+
+2:
+    /* Load-linked with acquire ordering. */
+    ldaxr   w1, [x0]
+
+    /* If the value was SpinLockLocked (zero), clear monitor and retry: this spins. */
+    cbz     w1, 1b
+
+    /* Store-conditional SpinLockLocked (wzr) with relaxed ordering. */
+    stxr    w1, wzr, [x0]
+
+    /* If we failed to store (nonzero status), retry from the load. */
+    cbnz    w1, 2b
+
+    ret
+
+
+/* static void Core::ARM_NCE::UnlockThreadParameters(void* tpidr) */
+.section    .text._ZN4Core7ARM_NCE22UnlockThreadParametersEPv, "ax", %progbits
+.global     _ZN4Core7ARM_NCE22UnlockThreadParametersEPv
+.type       _ZN4Core7ARM_NCE22UnlockThreadParametersEPv, %function
+_ZN4Core7ARM_NCE22UnlockThreadParametersEPv:
+    /* Offset to lock member; x0 now addresses the 32-bit lock word. */
+    add     x0, x0, #(TpidrEl0Lock)
+
+    /* Load SpinLockUnlocked. */
+    mov     w1, #(SpinLockUnlocked)
+
+    /* Store value with release ordering; no check is made that the lock was held. */
+    stlr    w1, [x0]
+
+    ret
--- a/src/core/arm/nce/arm_nce_asm_definitions.h
+++ b/src/core/arm/nce/arm_nce_asm_definitions.h
@ -0,0 +1,29 @@
+/* SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project */
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+
+#pragma once
+
+#define __ASSEMBLY__
+
+#include <asm-generic/signal.h>
+#include <asm-generic/unistd.h>
+
+#define ReturnToRunCodeByExceptionLevelChangeSignal SIGUSR2
+#define BreakFromRunCodeSignal SIGURG
+#define GuestFaultSignal SIGSEGV
+
+#define GuestContextSp 0xF8
+#define GuestContextHostContext 0x320
+
+#define HostContextSpTpidrEl0 0xE0
+#define HostContextTpidrEl0 0xE8
+#define HostContextRegs 0x0
+#define HostContextVregs 0x60
+
+#define TpidrEl0NativeContext 0x10
+#define TpidrEl0Lock 0x18
+#define TpidrEl0TlsMagic 0x20
+#define TlsMagic 0x555a5559
+
+#define SpinLockLocked 0
+#define SpinLockUnlocked 1
--- a/src/core/arm/nce/guest_context.h
+++ b/src/core/arm/nce/guest_context.h
@ -0,0 +1,50 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include "common/common_funcs.h"
+#include "common/common_types.h"
+#include "core/arm/arm_interface.h"
+#include "core/arm/nce/arm_nce_asm_definitions.h"
+
+namespace Core {
+
+class ARM_NCE;
+class System;
+
+struct HostContext {
+    alignas(16) std::array<u64, 12> host_saved_regs{};  // Host x19..x30, in order.
+    alignas(16) std::array<u128, 8> host_saved_vregs{}; // Host q8..q15, in order.
+    u64 host_sp{};          // Must directly precede host_tpidr_el0 (stored as a pair).
+    void* host_tpidr_el0{}; // Host thread pointer while guest code owns tpidr_el0.
+};
+
+struct GuestContext {
+    std::array<u64, 31> cpu_registers{}; // Guest x0..x30.
+    u64 sp{};
+    u64 pc{};
+    u32 fpcr{};
+    u32 fpsr{};
+    std::array<u128, 32> vector_registers{};
+    u32 pstate{};
+    alignas(16) HostContext host_ctx{}; // Host state parked here while guest code runs.
+    u64 tpidrro_el0{};
+    u64 tpidr_el0{};
+    std::atomic<u64> esr_el1{}; // Pending HaltReason bits; nonzero makes RunJit return early.
+    u32 nzcv{};    // NOTE(review): not referenced in the code visible here - confirm its use.
+    u32 svc_swi{}; // Value returned by ARM_NCE::GetSvcNumber().
+    System* system{};  // Set by the ARM_NCE constructor.
+    ARM_NCE* parent{}; // Set by ARM_NCE::RunJit before entering guest code.
+};
+
+// Verify assembly offsets.
+static_assert(offsetof(GuestContext, sp) == GuestContextSp);
+static_assert(offsetof(GuestContext, host_ctx) == GuestContextHostContext);
+static_assert(offsetof(HostContext, host_sp) == HostContextSpTpidrEl0);
+static_assert(offsetof(HostContext, host_tpidr_el0) - 8 == HostContextSpTpidrEl0);
+static_assert(offsetof(HostContext, host_tpidr_el0) == HostContextTpidrEl0);
+static_assert(offsetof(HostContext, host_saved_regs) == HostContextRegs);
+static_assert(offsetof(HostContext, host_saved_vregs) == HostContextVregs);
+
+} // namespace Core
--- a/src/core/arm/nce/instructions.h
+++ b/src/core/arm/nce/instructions.h
@ -0,0 +1,147 @@
+// SPDX-FileCopyrightText: Copyright © 2020 Skyline Team and Contributors
+// SPDX-License-Identifier: MPL-2.0
+
+#include "common/bit_field.h"
+#include "common/common_types.h"
+
+namespace Core::NCE {
+
+// System register identifiers in the form they appear in an MRS/MSR (register) instruction word:
+// the 15-bit field at bits [19:5], which is what MRS::GetSystemReg() and MSR::GetSystemReg()
+// extract. Kept as an unscoped enum so the values compare directly against those u32 results.
+enum SystemRegister : u32 {
+    TpidrEl0 = 0x5E82,
+    TpidrroEl0 = 0x5E83,
+    CntfrqEl0 = 0x5F00,
+    CntpctEl0 = 0x5F01,
+};
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/SVC--Supervisor-Call-
+// Decoder for an A64 SVC instruction word.
+// Bits [4:0] and [31:21] are fixed signature bits; bits [20:5] carry the 16-bit immediate.
+// All accessors are const so that const instances can be inspected too.
+union SVC {
+    constexpr explicit SVC(u32 raw_) : raw{raw_} {}
+
+    // Returns true when both signature fields match the SVC encoding.
+    constexpr bool Verify() const {
+        return (this->GetSig0() == 0x1 && this->GetSig1() == 0x6A0);
+    }
+
+    // Low signature field, bits [4:0].
+    constexpr u32 GetSig0() const {
+        return decltype(sig0)::ExtractValue(raw);
+    }
+
+    // The 16-bit SVC immediate, i.e. the supervisor call number.
+    constexpr u32 GetValue() const {
+        return decltype(value)::ExtractValue(raw);
+    }
+
+    // High signature field, bits [31:21].
+    constexpr u32 GetSig1() const {
+        return decltype(sig1)::ExtractValue(raw);
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> sig0;   // 0x1
+    BitField<5, 16, u32> value; // 16-bit immediate
+    BitField<21, 11, u32> sig1; // 0x6A0
+};
+static_assert(sizeof(SVC) == sizeof(u32));
+static_assert(SVC(0xD40000C1).Verify());
+static_assert(SVC(0xD40000C1).GetValue() == 0x6);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MRS--Move-System-Register-
+// Decoder for an A64 MRS (read system register) instruction word.
+// Bits [31:20] are the fixed signature, bits [19:5] select the system register (see
+// SystemRegister) and bits [4:0] name the destination general-purpose register.
+// All accessors are const so that const instances can be inspected too.
+union MRS {
+    constexpr explicit MRS(u32 raw_) : raw{raw_} {}
+
+    // Returns true when the signature field matches the MRS encoding.
+    constexpr bool Verify() const {
+        return (this->GetSig() == 0xD53);
+    }
+
+    // Index of the destination register Xt.
+    constexpr u32 GetRt() const {
+        return decltype(rt)::ExtractValue(raw);
+    }
+
+    // Encoded source system register; comparable against SystemRegister values.
+    constexpr u32 GetSystemReg() const {
+        return decltype(system_reg)::ExtractValue(raw);
+    }
+
+    // Signature field, bits [31:20].
+    constexpr u32 GetSig() const {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;          // destination register
+    BitField<5, 15, u32> system_reg; // source system register
+    BitField<20, 12, u32> sig;       // 0xD53
+};
+static_assert(sizeof(MRS) == sizeof(u32));
+static_assert(MRS(0xD53BE020).Verify());
+static_assert(MRS(0xD53BE020).GetSystemReg() == CntpctEl0);
+static_assert(MRS(0xD53BE020).GetRt() == 0x0);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/MSR--register---Move-general-purpose-register-to-System-Register-
+// Decoder for an A64 MSR (register) instruction word, which writes a system register.
+// Bits [31:20] are the fixed signature, bits [19:5] select the system register (see
+// SystemRegister) and bits [4:0] name the source general-purpose register.
+// All accessors are const so that const instances can be inspected too.
+union MSR {
+    constexpr explicit MSR(u32 raw_) : raw{raw_} {}
+
+    // Returns true when the signature field matches the MSR (register) encoding.
+    constexpr bool Verify() const {
+        return this->GetSig() == 0xD51;
+    }
+
+    // Index of the source register Xt.
+    constexpr u32 GetRt() const {
+        return decltype(rt)::ExtractValue(raw);
+    }
+
+    // Encoded destination system register; comparable against SystemRegister values.
+    constexpr u32 GetSystemReg() const {
+        return decltype(system_reg)::ExtractValue(raw);
+    }
+
+    // Signature field, bits [31:20].
+    constexpr u32 GetSig() const {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;          // source register
+    BitField<5, 15, u32> system_reg; // destination system register
+    BitField<20, 12, u32> sig;       // 0xD51
+};
+static_assert(sizeof(MSR) == sizeof(u32));
+static_assert(MSR(0xD51BD040).Verify());
+static_assert(MSR(0xD51BD040).GetSystemReg() == TpidrEl0);
+static_assert(MSR(0xD51BD040).GetRt() == 0x0);
+
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXR--Load-Exclusive-Register-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/LDXP--Load-Exclusive-Pair-of-Registers-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXR--Store-Exclusive-Register-
+// https://developer.arm.com/documentation/ddi0596/2021-12/Base-Instructions/STXP--Store-Exclusive-Pair-of-registers-
+// Decoder for the A64 load/store exclusive family (LDXR, LDXP, STXR, STXP and their
+// acquire/release forms). Only the signature and the o0 ordering bit are consumed; the remaining
+// fields are declared to document the encoding.
+// All accessors are const so that const instances can be inspected too.
+union Exclusive {
+    constexpr explicit Exclusive(u32 raw_) : raw{raw_} {}
+
+    // Returns true when the signature field matches a load/store exclusive encoding.
+    constexpr bool Verify() const {
+        return this->GetSig() == 0x10;
+    }
+
+    // Signature field, bits [29:23].
+    constexpr u32 GetSig() const {
+        return decltype(sig)::ExtractValue(raw);
+    }
+
+    // Returns the instruction word with the o0 (ordered) bit set. Words that already have the
+    // bit set are returned unchanged.
+    constexpr u32 AsOrdered() const {
+        return raw | decltype(o0)::FormatValue(1);
+    }
+
+    u32 raw;
+
+private:
+    BitField<0, 5, u32> rt;    // transfer register (Rt)
+    BitField<5, 5, u32> rn;    // base address register (Rn), i.e. the memory operand
+    BitField<10, 5, u32> rt2;  // second transfer register (Rt2), used by the pair forms
+    BitField<15, 1, u32> o0;   // ordered
+    BitField<16, 5, u32> rs;   // status register
+    BitField<21, 2, u32> l;    // operation type
+    BitField<23, 7, u32> sig;  // 0x10
+    BitField<30, 2, u32> size; // size
+};
+static_assert(sizeof(Exclusive) == sizeof(u32));
+static_assert(Exclusive(0xC85FFC00).Verify());
+static_assert(Exclusive(0xC85FFC00).AsOrdered() == 0xC85FFC00);
+static_assert(Exclusive(0xC85F7C00).AsOrdered() == 0xC85FFC00);
+static_assert(Exclusive(0xC8200440).AsOrdered() == 0xC8208440);
+
+} // namespace Core::NCE
--- a/src/core/arm/nce/patch.cpp
+++ b/src/core/arm/nce/patch.cpp
@ -0,0 +1,472 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include "common/arm64/native_clock.h"
+#include "common/bit_cast.h"
+#include "common/literals.h"
+#include "core/arm/nce/arm_nce.h"
+#include "core/arm/nce/guest_context.h"
+#include "core/arm/nce/instructions.h"
+#include "core/arm/nce/patch.h"
+#include "core/core.h"
+#include "core/core_timing.h"
+#include "core/hle/kernel/svc.h"
+
+namespace Core::NCE {
+
+using namespace Common::Literals;
+using namespace oaknut::util;
+
+using NativeExecutionParameters = Kernel::KThread::NativeExecutionParameters;
+
+constexpr size_t MaxRelativeBranch = 128_MiB;
+
+// Binds the code generator to m_patch_instructions, so everything emitted through `c` is
+// appended to that vector.
+Patcher::Patcher() : c(m_patch_instructions) {}
+
+Patcher::~Patcher() = default;
+
+// Scans the text segment of program_image for instructions that need emulation and emits a
+// replacement handler for each one into the patch buffer.
+//
+// The image itself is not modified here (it is taken by const reference): for every patched
+// instruction only a module-to-patch relocation is recorded, and the branches are written later
+// by RelocateAndCopy(). Handled instructions are SVC, MRS from TPIDR_EL0 / TPIDRRO_EL0 /
+// CNTPCT_EL0, and MSR to TPIDR_EL0. An MRS from CNTFRQ_EL0 hits UNREACHABLE().
+//
+// On return, `mode` holds the placement of the patch section chosen for this image.
+void Patcher::PatchText(const Kernel::PhysicalMemory& program_image,
+                        const Kernel::CodeSet::Segment& code) {
+
+    // Write save context helper function.
+    c.l(m_save_context);
+    WriteSaveContext();
+
+    // Write load context helper function.
+    c.l(m_load_context);
+    WriteLoadContext();
+
+    // Retrieve text segment data.
+    const auto text = std::span{program_image}.subspan(code.offset, code.size);
+    const auto text_words =
+        std::span<const u32>{reinterpret_cast<const u32*>(text.data()), text.size() / sizeof(u32)};
+
+    // Loop through instructions, patching as needed.
+    for (u32 i = 0; i < static_cast<u32>(text_words.size()); i++) {
+        const u32 inst = text_words[i];
+
+        // Records a branch from the current instruction to the patch code that is about to be
+        // emitted (the current generator offset) and returns the text offset of the following
+        // instruction, which the handler uses as its return destination.
+        const auto AddRelocations = [&] {
+            const uintptr_t this_offset = i * sizeof(u32);
+            const uintptr_t next_offset = this_offset + sizeof(u32);
+
+            // Relocate from here to patch.
+            this->BranchToPatch(this_offset);
+
+            // Relocate from patch to next instruction.
+            return next_offset;
+        };
+
+        // SVC
+        if (auto svc = SVC{inst}; svc.Verify()) {
+            WriteSvcTrampoline(AddRelocations(), svc.GetValue());
+            continue;
+        }
+
+        // MRS Xn, TPIDR_EL0
+        // MRS Xn, TPIDRRO_EL0
+        if (auto mrs = MRS{inst};
+            mrs.Verify() && (mrs.GetSystemReg() == TpidrroEl0 || mrs.GetSystemReg() == TpidrEl0)) {
+            const auto src_reg = mrs.GetSystemReg() == TpidrroEl0 ? oaknut::SystemReg::TPIDRRO_EL0
+                                                                  : oaknut::SystemReg::TPIDR_EL0;
+            const auto dest_reg = oaknut::XReg{static_cast<int>(mrs.GetRt())};
+            WriteMrsHandler(AddRelocations(), dest_reg, src_reg);
+            continue;
+        }
+
+        // MRS Xn, CNTPCT_EL0
+        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntpctEl0) {
+            WriteCntpctHandler(AddRelocations(), oaknut::XReg{static_cast<int>(mrs.GetRt())});
+            continue;
+        }
+
+        // MRS Xn, CNTFRQ_EL0
+        // Not supported: no handler is emitted for reads of the counter frequency.
+        if (auto mrs = MRS{inst}; mrs.Verify() && mrs.GetSystemReg() == CntfrqEl0) {
+            UNREACHABLE();
+        }
+
+        // MSR TPIDR_EL0, Xn
+        if (auto msr = MSR{inst}; msr.Verify() && msr.GetSystemReg() == TpidrEl0) {
+            WriteMsrHandler(AddRelocations(), oaknut::XReg{static_cast<int>(msr.GetRt())});
+            continue;
+        }
+    }
+
+    // Determine patching mode for the final relocation step
+    // Images larger than MaxRelativeBranch get the patch section in front of the text,
+    // all others get it appended after the image.
+    const size_t image_size = program_image.size();
+    this->mode = image_size > MaxRelativeBranch ? PatchMode::PreText : PatchMode::PostData;
+}
+
+// Resolves every relocation recorded by PatchText() for the layout selected in `mode` and
+// copies the finished patch code into program_image.
+//
+// load_base       - guest address the image is loaded at; used to form absolute addresses.
+// code            - text segment descriptor (offset and size within program_image).
+// program_image   - image to patch in place. In PostData mode it grows by SectionSize() bytes;
+//                   in PreText mode the patch code is copied to the start of the buffer.
+//                   NOTE(review): PreText presumably relies on the caller having reserved that
+//                   space in front of the text - confirm against the loader.
+// out_trampolines - receives one entry per patched SVC, mapping the guest address following the
+//                   SVC to the address of its post-SVC trampoline. Must not be null.
+void Patcher::RelocateAndCopy(Common::ProcessAddress load_base,
+                              const Kernel::CodeSet::Segment& code,
+                              Kernel::PhysicalMemory& program_image,
+                              EntryTrampolines* out_trampolines) {
+    const size_t patch_size = SectionSize();
+    const size_t image_size = program_image.size();
+
+    // Retrieve text segment data.
+    const auto text = std::span{program_image}.subspan(code.offset, code.size);
+    const auto text_words =
+        std::span<u32>{reinterpret_cast<u32*>(text.data()), text.size() / sizeof(u32)};
+
+    // Writes a relative branch at `target` (inside the text) to the handler in the patch section.
+    // NOTE(review): the distances below treat text offsets as image offsets, i.e. they look
+    // like they assume the text segment starts at offset 0 of the image - confirm.
+    const auto ApplyBranchToPatchRelocation = [&](u32* target, const Relocation& rel) {
+        oaknut::CodeGenerator rc{target};
+        if (mode == PatchMode::PreText) {
+            rc.B(rel.patch_offset - patch_size - rel.module_offset);
+        } else {
+            rc.B(image_size - rel.module_offset + rel.patch_offset);
+        }
+    };
+
+    // Writes a relative branch at `target` (inside the patch code) back into the text.
+    const auto ApplyBranchToModuleRelocation = [&](u32* target, const Relocation& rel) {
+        oaknut::CodeGenerator rc{target};
+        if (mode == PatchMode::PreText) {
+            rc.B(patch_size - rel.patch_offset + rel.module_offset);
+        } else {
+            rc.B(rel.module_offset - image_size - rel.patch_offset);
+        }
+    };
+
+    // Converts an offset inside the patch section into an absolute guest address.
+    const auto RebasePatch = [&](ptrdiff_t patch_offset) {
+        if (mode == PatchMode::PreText) {
+            return GetInteger(load_base) + patch_offset;
+        } else {
+            return GetInteger(load_base) + image_size + patch_offset;
+        }
+    };
+
+    // Converts an offset inside the text into an absolute guest address.
+    const auto RebasePc = [&](uintptr_t module_offset) {
+        if (mode == PatchMode::PreText) {
+            return GetInteger(load_base) + patch_size + module_offset;
+        } else {
+            return GetInteger(load_base) + module_offset;
+        }
+    };
+
+    // We are now ready to relocate!
+    for (const Relocation& rel : m_branch_to_patch_relocations) {
+        ApplyBranchToPatchRelocation(text_words.data() + rel.module_offset / sizeof(u32), rel);
+    }
+    for (const Relocation& rel : m_branch_to_module_relocations) {
+        ApplyBranchToModuleRelocation(m_patch_instructions.data() + rel.patch_offset / sizeof(u32),
+                                      rel);
+    }
+
+    // Rewrite PC constants and record post trampolines
+    for (const Relocation& rel : m_write_module_pc_relocations) {
+        oaknut::CodeGenerator rc{m_patch_instructions.data() + rel.patch_offset / sizeof(u32)};
+        rc.dx(RebasePc(rel.module_offset));
+    }
+    for (const Trampoline& rel : m_trampolines) {
+        out_trampolines->insert({RebasePc(rel.module_offset), RebasePatch(rel.patch_offset)});
+    }
+
+    // Cortex-A57 seems to treat all exclusives as ordered, but newer processors do not.
+    // Convert to ordered to preserve this assumption
+    // This pass runs over the whole text, after the branches to the patch code were written.
+    for (u32 i = 0; i < static_cast<u32>(text_words.size()); i++) {
+        const u32 inst = text_words[i];
+        if (auto exclusive = Exclusive{inst}; exclusive.Verify()) {
+            text_words[i] = exclusive.AsOrdered();
+        }
+    }
+
+    // Copy to program image
+    if (this->mode == PatchMode::PreText) {
+        std::memcpy(program_image.data(), m_patch_instructions.data(),
+                    m_patch_instructions.size() * sizeof(u32));
+    } else {
+        program_image.resize(image_size + patch_size);
+        std::memcpy(program_image.data() + image_size, m_patch_instructions.data(),
+                    m_patch_instructions.size() * sizeof(u32));
+    }
+}
+
+// Returns the size in bytes of the emitted patch code, rounded up to a multiple of
+// Core::Memory::YUZU_PAGESIZE.
+size_t Patcher::SectionSize() const noexcept {
+    const size_t emitted_bytes = m_patch_instructions.size() * sizeof(u32);
+    return Common::AlignUp(emitted_bytes, Core::Memory::YUZU_PAGESIZE);
+}
+
+// Emits the shared helper that reloads guest state from the GuestContext: FPSR, FPCR, NZCV,
+// Q0-Q31 and X0-X29. The guest X30, SP and PC are not restored here.
+//
+// Calling convention of the emitted helper: the caller has pushed 16 bytes on the stack and
+// reaches the helper with BL. The slot at SP + 8 is used to keep the return address.
+void Patcher::WriteLoadContext() {
+    // This function was called, which modifies X30, so use that as a scratch register.
+    // SP contains the guest X30, so save our return X30 to SP + 8, since we have allocated 16 bytes
+    // of stack.
+    c.STR(X30, SP, 8);
+    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));
+
+    // Load system registers.
+    // X0 is free to use as a temporary here because it is overwritten by the register load below.
+    c.LDR(W0, X30, offsetof(GuestContext, fpsr));
+    c.MSR(oaknut::SystemReg::FPSR, X0);
+    c.LDR(W0, X30, offsetof(GuestContext, fpcr));
+    c.MSR(oaknut::SystemReg::FPCR, X0);
+    c.LDR(W0, X30, offsetof(GuestContext, nzcv));
+    c.MSR(oaknut::SystemReg::NZCV, X0);
+
+    // Load all vector registers.
+    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
+    for (int i = 0; i <= 30; i += 2) {
+        c.LDP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
+    }
+
+    // Load all general-purpose registers except X30.
+    for (int i = 0; i <= 28; i += 2) {
+        c.LDP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
+    }
+
+    // Reload our return X30 from the stack and return.
+    // The patch code will reload the guest X30 for us.
+    c.LDR(X30, SP, 8);
+    c.RET();
+}
+
+// Emits the shared helper that stores guest state into the GuestContext: X0-X30, SP, Q0-Q31,
+// FPSR, FPCR and NZCV. PC is not stored here.
+//
+// Calling convention of the emitted helper: the caller has pushed the guest X30 with a 16-byte
+// pre-indexed store and reaches the helper with BL. The slot at SP + 8 keeps the return address.
+void Patcher::WriteSaveContext() {
+    // This function was called, which modifies X30, so use that as a scratch register.
+    // SP contains the guest X30, so save our X30 to SP + 8, since we have allocated 16 bytes of
+    // stack.
+    c.STR(X30, SP, 8);
+    c.MRS(X30, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X30, X30, offsetof(NativeExecutionParameters, native_context));
+
+    // Store all general-purpose registers except X30.
+    for (int i = 0; i <= 28; i += 2) {
+        c.STP(oaknut::XReg{i}, oaknut::XReg{i + 1}, X30, 8 * i);
+    }
+
+    // Store all vector registers.
+    static constexpr size_t VEC_OFF = offsetof(GuestContext, vector_registers);
+    for (int i = 0; i <= 30; i += 2) {
+        c.STP(oaknut::QReg{i}, oaknut::QReg{i + 1}, X30, VEC_OFF + 16 * i);
+    }
+
+    // Store guest system registers, X30 and SP, using X0 as a scratch register.
+    // After this extra push the guest X30 pushed by the caller sits at SP + 16, and the guest
+    // SP is SP + 32 (16 bytes pushed by the caller plus 16 bytes pushed here).
+    c.STR(X0, SP, PRE_INDEXED, -16);
+    c.LDR(X0, SP, 16);
+    c.STR(X0, X30, 8 * 30);
+    c.ADD(X0, SP, 32);
+    c.STR(X0, X30, offsetof(GuestContext, sp));
+    c.MRS(X0, oaknut::SystemReg::FPSR);
+    c.STR(W0, X30, offsetof(GuestContext, fpsr));
+    c.MRS(X0, oaknut::SystemReg::FPCR);
+    c.STR(W0, X30, offsetof(GuestContext, fpcr));
+    c.MRS(X0, oaknut::SystemReg::NZCV);
+    c.STR(W0, X30, offsetof(GuestContext, nzcv));
+    c.LDR(X0, SP, POST_INDEXED, 16);
+
+    // Reload our return X30 from the stack, and return.
+    c.LDR(X30, SP, 8);
+    c.RET();
+}
+
+// Emits the handler for one patched SVC instruction. The emitted code has two halves:
+//
+//  1. Exit path (reached by the branch that replaces the SVC): locks the context, saves the
+//     full guest state, records the resume PC and the SVC number, swaps esr_el1 with zero,
+//     restores the host SP, TPIDR_EL0 and callee-saved registers, and returns to the host with
+//     the old esr_el1 value combined with HaltReason::SupervisorCall in X0.
+//  2. Re-entry path (recorded in m_trampolines, called by the host): reloads the guest state,
+//     unlocks the context and branches to the instruction following the SVC.
+//
+// module_dest - text offset of the instruction following the SVC.
+// svc_id      - immediate of the patched SVC.
+void Patcher::WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id) {
+    // Patching an SVC is a normal event for every loaded module, so it is logged at debug
+    // level rather than as an error.
+    LOG_DEBUG(Core_ARM, "Patching SVC {:#x} at {:#x}", svc_id, module_dest - 4);
+    // We are about to start saving state, so we need to lock the context.
+    this->LockContext();
+
+    // Store guest X30 to the stack. Then, save the context and restore the stack.
+    // This will save all registers except PC, but we know PC at patch time.
+    c.STR(X30, SP, PRE_INDEXED, -16);
+    c.BL(m_save_context);
+    c.LDR(X30, SP, POST_INDEXED, 16);
+
+    // Now that we've saved all registers, we can use any registers as scratch.
+    // Store PC + 4 to arm interface, since we know the instruction offset from the entry point.
+    // The value is read from a literal emitted at the end of this handler; the literal is
+    // filled in with the absolute address once the load address is known.
+    oaknut::Label pc_after_svc;
+    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
+    c.LDR(X2, pc_after_svc);
+    c.STR(X2, X1, offsetof(GuestContext, pc));
+
+    // Store SVC number to execute when we return
+    c.MOV(X2, svc_id);
+    c.STR(W2, X1, offsetof(GuestContext, svc_swi));
+
+    // We are calling a SVC. Clear esr_el1 and return it.
+    // This is an atomic exchange with zero: X0 receives the previous value.
+    static_assert(std::is_same_v<std::underlying_type_t<HaltReason>, u64>);
+    oaknut::Label retry;
+    c.ADD(X2, X1, offsetof(GuestContext, esr_el1));
+    c.l(retry);
+    c.LDAXR(X0, X2);
+    c.STLXR(W3, XZR, X2);
+    c.CBNZ(W3, retry);
+
+    // Add "calling SVC" flag. Since this is X0, this is now our return value.
+    c.ORR(X0, X0, static_cast<u64>(HaltReason::SupervisorCall));
+
+    // Offset the GuestContext pointer to the HostContext member.
+    // STP has limited range of [-512, 504] which we can't reach otherwise
+    // NB: Due to this all offsets below are from the start of HostContext.
+    c.ADD(X1, X1, offsetof(GuestContext, host_ctx));
+
+    // Reload host TPIDR_EL0 and SP.
+    static_assert(offsetof(HostContext, host_sp) + 8 == offsetof(HostContext, host_tpidr_el0));
+    c.LDP(X2, X3, X1, offsetof(HostContext, host_sp));
+    c.MOV(SP, X2);
+    c.MSR(oaknut::SystemReg::TPIDR_EL0, X3);
+
+    // Load callee-saved host registers and return to host.
+    static constexpr size_t HOST_REGS_OFF = offsetof(HostContext, host_saved_regs);
+    static constexpr size_t HOST_VREGS_OFF = offsetof(HostContext, host_saved_vregs);
+    c.LDP(X19, X20, X1, HOST_REGS_OFF);
+    c.LDP(X21, X22, X1, HOST_REGS_OFF + 2 * sizeof(u64));
+    c.LDP(X23, X24, X1, HOST_REGS_OFF + 4 * sizeof(u64));
+    c.LDP(X25, X26, X1, HOST_REGS_OFF + 6 * sizeof(u64));
+    c.LDP(X27, X28, X1, HOST_REGS_OFF + 8 * sizeof(u64));
+    c.LDP(X29, X30, X1, HOST_REGS_OFF + 10 * sizeof(u64));
+    c.LDP(Q8, Q9, X1, HOST_VREGS_OFF);
+    c.LDP(Q10, Q11, X1, HOST_VREGS_OFF + 2 * sizeof(u128));
+    c.LDP(Q12, Q13, X1, HOST_VREGS_OFF + 4 * sizeof(u128));
+    c.LDP(Q14, Q15, X1, HOST_VREGS_OFF + 6 * sizeof(u128));
+    c.RET();
+
+    // Write the post-SVC trampoline address, which will jump back to the guest after restoring its
+    // state.
+    m_trampolines.push_back({c.offset(), module_dest});
+
+    // Host called this location. Save the return address so we can
+    // unwind the stack properly when jumping back.
+    // Slot 11 of host_saved_regs is the one the exit path above reloads into X30.
+    c.MRS(X2, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X2, X2, offsetof(NativeExecutionParameters, native_context));
+    c.ADD(X0, X2, offsetof(GuestContext, host_ctx));
+    c.STR(X30, X0, offsetof(HostContext, host_saved_regs) + 11 * sizeof(u64));
+
+    // Reload all guest registers except X30 and PC.
+    // The function also expects 16 bytes of stack already allocated.
+    c.STR(X30, SP, PRE_INDEXED, -16);
+    c.BL(m_load_context);
+    c.LDR(X30, SP, POST_INDEXED, 16);
+
+    // Use X1 as a scratch register to restore X30.
+    c.STR(X1, SP, PRE_INDEXED, -16);
+    c.MRS(X1, oaknut::SystemReg::TPIDR_EL0);
+    c.LDR(X1, X1, offsetof(NativeExecutionParameters, native_context));
+    c.LDR(X30, X1, offsetof(GuestContext, cpu_registers) + sizeof(u64) * 30);
+    c.LDR(X1, SP, POST_INDEXED, 16);
+
+    // Unlock the context.
+    this->UnlockContext();
+
+    // Jump back to the instruction after the emulated SVC.
+    this->BranchToModule(module_dest);
+
+    // Store PC after call.
+    c.l(pc_after_svc);
+    this->WriteModulePc(module_dest);
+}
+
+// Emits the handler for a patched `MRS Xn, TPIDR_EL0` / `MRS Xn, TPIDRRO_EL0`.
+//
+// module_dest - text offset of the instruction following the MRS.
+// dest_reg    - guest destination register; it also serves as the address temporary, so no
+//               other register is touched.
+// src_reg     - which of the two thread registers the guest read; TPIDRRO_EL0 selects the
+//               tpidrro_el0 field, any other value selects tpidr_el0.
+void Patcher::WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
+                              oaknut::SystemReg src_reg) {
+    // Retrieve emulated TLS register from the NativeExecutionParameters block that the host
+    // TPIDR_EL0 points to.
+    c.MRS(dest_reg, oaknut::SystemReg::TPIDR_EL0);
+    if (src_reg == oaknut::SystemReg::TPIDRRO_EL0) {
+        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidrro_el0));
+    } else {
+        c.LDR(dest_reg, dest_reg, offsetof(NativeExecutionParameters, tpidr_el0));
+    }
+
+    // Jump back to the instruction after the emulated MRS.
+    this->BranchToModule(module_dest);
+}
+
+// Emits the handler for a patched `MSR TPIDR_EL0, Xn`.
+//
+// module_dest - text offset of the instruction following the MSR.
+// src_reg     - guest register holding the value to write.
+void Patcher::WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg) {
+    // Pick a scratch register that differs from the source register and spill it to the stack.
+    const auto scratch_reg = src_reg.index() == 0 ? X1 : X0;
+    c.STR(scratch_reg, SP, PRE_INDEXED, -16);
+
+    // Save guest value to NativeExecutionParameters::tpidr_el0.
+    c.MRS(scratch_reg, oaknut::SystemReg::TPIDR_EL0);
+    c.STR(src_reg, scratch_reg, offsetof(NativeExecutionParameters, tpidr_el0));
+
+    // Restore scratch register.
+    c.LDR(scratch_reg, SP, POST_INDEXED, 16);
+
+    // Jump back to the instruction after the emulated MSR.
+    this->BranchToModule(module_dest);
+}
+
+// Emits the handler for a patched `MRS Xn, CNTPCT_EL0`. The emitted code reads CNTVCT_EL0 and
+// scales it by the factor from NativeClock::GetGuestCNTFRQFactor(), which is embedded as two
+// 64-bit literals after the handler. The result is
+//     dest = cnt * factor_hi + umulh(cnt, factor_lo)
+//
+// module_dest - text offset of the instruction following the MRS.
+// dest_reg    - guest destination register.
+void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg) {
+    static Common::Arm64::NativeClock clock{};
+    const auto factor = clock.GetGuestCNTFRQFactor();
+    const auto raw_factor = Common::BitCast<std::array<u64, 2>>(factor);
+
+    // The scratch pair must not overlap the destination register.
+    const auto use_x2_x3 = dest_reg.index() == 0 || dest_reg.index() == 1;
+    oaknut::XReg scratch0 = use_x2_x3 ? X2 : X0;
+    oaknut::XReg scratch1 = use_x2_x3 ? X3 : X1;
+
+    oaknut::Label factorlo;
+    oaknut::Label factorhi;
+
+    // Save scratches.
+    c.STP(scratch0, scratch1, SP, PRE_INDEXED, -16);
+
+    // Load counter value.
+    c.MRS(dest_reg, oaknut::SystemReg::CNTVCT_EL0);
+
+    // Load scaling factor.
+    c.LDR(scratch0, factorlo);
+    c.LDR(scratch1, factorhi);
+
+    // Multiply low bits and get result.
+    c.UMULH(scratch0, dest_reg, scratch0);
+
+    // Multiply high bits and add low bit result.
+    c.MADD(dest_reg, dest_reg, scratch1, scratch0);
+
+    // Reload scratches.
+    c.LDP(scratch0, scratch1, SP, POST_INDEXED, 16);
+
+    // Jump back to the instruction after the emulated MRS.
+    this->BranchToModule(module_dest);
+
+    // Scaling factor constant values.
+    c.l(factorlo);
+    c.dx(raw_factor[0]);
+    c.l(factorhi);
+    c.dx(raw_factor[1]);
+}
+
+// Emits an inline spin-lock acquire on NativeExecutionParameters::lock. The emitted code loops
+// until it observes a non-zero lock word and then manages to store SpinLockLocked (0) to it.
+// X0 and X1 are preserved across the sequence by spilling them to the stack.
+void Patcher::LockContext() {
+    oaknut::Label retry;
+
+    // Save scratches.
+    c.STP(X0, X1, SP, PRE_INDEXED, -16);
+
+    // Reload lock pointer.
+    // Every retry starts by clearing the exclusive monitor left over from the failed attempt.
+    c.l(retry);
+    c.CLREX();
+    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
+    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
+
+    // The CBZ below treats a zero lock word as "locked".
+    static_assert(SpinLockLocked == 0);
+
+    // Load-linked with acquire ordering.
+    c.LDAXR(W1, X0);
+
+    // If the value was SpinLockLocked, clear monitor and retry.
+    c.CBZ(W1, retry);
+
+    // Store-conditional SpinLockLocked with relaxed ordering.
+    c.STXR(W1, WZR, X0);
+
+    // If we failed to store, retry.
+    c.CBNZ(W1, retry);
+
+    // We succeeded! Reload scratches.
+    c.LDP(X0, X1, SP, POST_INDEXED, 16);
+}
+
+// Emits an inline spin-lock release: stores SpinLockUnlocked to
+// NativeExecutionParameters::lock with release ordering.
+// X0 and X1 are preserved across the sequence by spilling them to the stack.
+void Patcher::UnlockContext() {
+    // Save scratches.
+    c.STP(X0, X1, SP, PRE_INDEXED, -16);
+
+    // Load lock pointer.
+    c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
+    c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
+
+    // Load SpinLockUnlocked.
+    c.MOV(W1, SpinLockUnlocked);
+
+    // Store value with release ordering.
+    c.STLR(W1, X0);
+
+    // Load scratches.
+    c.LDP(X0, X1, SP, POST_INDEXED, 16);
+}
+
+} // namespace Core::NCE
--- a/src/core/arm/nce/patch.h
+++ b/src/core/arm/nce/patch.h
@ -0,0 +1,107 @@
+// SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#pragma once
+
+#include <span>
+#include <unordered_map>
+#include <vector>
+
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wshorten-64-to-32"
+#include <oaknut/code_block.hpp>
+#include <oaknut/oaknut.hpp>
+#pragma clang diagnostic pop
+
+#include "common/common_types.h"
+#include "core/hle/kernel/code_set.h"
+#include "core/hle/kernel/k_typed_address.h"
+#include "core/hle/kernel/physical_memory.h"
+
+#include <signal.h>
+
+namespace Core {
+struct GuestContext;
+}
+
+namespace Core::NCE {
+
+/// Placement of the generated patch section relative to the module image.
+/// A Patcher starts out in None and selects one of the other modes in PatchText().
+enum class PatchMode : u32 {
+    None,
+    PreText,  ///< Patch section is inserted before .text
+    PostData, ///< Patch section is inserted after .data
+};
+
+using ModuleTextAddress = u64;
+using PatchTextAddress = u64;
+using EntryTrampolines = std::unordered_map<ModuleTextAddress, PatchTextAddress>;
+
+/// Generates replacement code for the instructions of a module that need emulation, in two
+/// steps:
+///  1. PatchText() scans the text segment, emits the handlers into an internal buffer and
+///     records relocations.
+///  2. RelocateAndCopy() resolves those relocations for the final layout and copies the patch
+///     code into the program image.
+class Patcher {
+public:
+    explicit Patcher();
+    ~Patcher();
+
+    void PatchText(const Kernel::PhysicalMemory& program_image,
+                   const Kernel::CodeSet::Segment& code);
+    void RelocateAndCopy(Common::ProcessAddress load_base, const Kernel::CodeSet::Segment& code,
+                         Kernel::PhysicalMemory& program_image, EntryTrampolines* out_trampolines);
+    size_t SectionSize() const noexcept;
+
+    /// Placement chosen by PatchText(); PatchMode::None before that.
+    [[nodiscard]] PatchMode Mode() const noexcept {
+        return mode;
+    }
+
+private:
+    /// Byte offset into the text segment that a handler branches back to.
+    using ModuleDestLabel = uintptr_t;
+
+    /// Entry point of a post-SVC trampoline, paired with the text offset it resumes at.
+    struct Trampoline {
+        ptrdiff_t patch_offset;
+        uintptr_t module_offset;
+    };
+
+    void WriteLoadContext();
+    void WriteSaveContext();
+    void LockContext();
+    void UnlockContext();
+    void WriteSvcTrampoline(ModuleDestLabel module_dest, u32 svc_id);
+    void WriteMrsHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg,
+                         oaknut::SystemReg src_reg);
+    void WriteMsrHandler(ModuleDestLabel module_dest, oaknut::XReg src_reg);
+    void WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_reg);
+
+private:
+    // Records that the text instruction at module_dest must become a branch to the current
+    // position in the patch buffer. Emits nothing.
+    void BranchToPatch(uintptr_t module_dest) {
+        m_branch_to_patch_relocations.push_back({c.offset(), module_dest});
+    }
+
+    // Reserves one instruction word in the patch buffer that will become a branch back to
+    // module_dest in the text.
+    void BranchToModule(uintptr_t module_dest) {
+        m_branch_to_module_relocations.push_back({c.offset(), module_dest});
+        c.dw(0);
+    }
+
+    // Reserves a 64-bit literal in the patch buffer that will receive the absolute address of
+    // module_dest.
+    void WriteModulePc(uintptr_t module_dest) {
+        m_write_module_pc_relocations.push_back({c.offset(), module_dest});
+        c.dx(0);
+    }
+
+private:
+    // List of patch instructions we have generated.
+    // Declared before `c`, which is constructed from it.
+    std::vector<u32> m_patch_instructions{};
+
+    // Relocation type for relative branch from module to patch.
+    struct Relocation {
+        ptrdiff_t patch_offset;  ///< Offset in bytes from the start of the patch section.
+        uintptr_t module_offset; ///< Offset in bytes from the start of the text section.
+    };
+
+    oaknut::VectorCodeGenerator c;
+    std::vector<Trampoline> m_trampolines;
+    std::vector<Relocation> m_branch_to_patch_relocations{};
+    std::vector<Relocation> m_branch_to_module_relocations{};
+    std::vector<Relocation> m_write_module_pc_relocations{};
+    oaknut::Label m_save_context{};
+    oaknut::Label m_load_context{};
+    PatchMode mode{PatchMode::None};
+};
+
+} // namespace Core::NCE
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@ -136,9 +136,7 @@ struct System::Impl {
    }

    void Initialize(System& system) {
-        const bool direct_mapped_address = Settings::IsNceEnabled();
-        device_memory = std::make_unique<Core::DeviceMemory>(direct_mapped_address);
-
+        device_memory = std::make_unique<Core::DeviceMemory>();
        is_multicore = Settings::values.use_multi_core.GetValue();
        extended_memory_layout =
            Settings::values.memory_layout_mode.GetValue() != Settings::MemoryLayout::Memory_4Gb;
--- a/src/core/cpu_manager.cpp
+++ b/src/core/cpu_manager.cpp
@ -211,6 +211,8 @@ void CpuManager::RunThread(std::stop_token token, std::size_t core) {
        system.GPU().ObtainContext();
    }

+    system.ArmInterface(core).Initialize();
+
    auto& kernel = system.Kernel();
    auto& scheduler = *kernel.CurrentScheduler();
    auto* thread = scheduler.GetSchedulerCurrentThread();
--- a/src/core/device_memory.cpp
+++ b/src/core/device_memory.cpp
@ -12,13 +12,9 @@ constexpr size_t VirtualReserveSize = 1ULL << 38;
 constexpr size_t VirtualReserveSize = 1ULL << 39;
 #endif

-DeviceMemory::DeviceMemory(bool direct_mapped_address)
+DeviceMemory::DeviceMemory()
    : buffer{Kernel::Board::Nintendo::Nx::KSystemControl::Init::GetIntendedMemorySize(),
-             VirtualReserveSize} {
-    if (direct_mapped_address) {
-        buffer.EnableDirectMappedAddress();
-    }
-}
+             VirtualReserveSize} {}

 DeviceMemory::~DeviceMemory() = default;

--- a/src/core/device_memory.h
+++ b/src/core/device_memory.h
@ -18,7 +18,7 @@ enum : u64 {

 class DeviceMemory {
 public:
-    explicit DeviceMemory(bool direct_mapped_address);
+    explicit DeviceMemory();
    ~DeviceMemory();

    DeviceMemory& operator=(const DeviceMemory&) = delete;
--- a/src/core/hle/kernel/code_set.h
+++ b/src/core/hle/kernel/code_set.h
@ -75,11 +75,20 @@ struct CodeSet final {
        return segments[2];
    }

+    Segment& PatchSegment() {
+        return patch_segment;
+    }
+
+    const Segment& PatchSegment() const {
+        return patch_segment;
+    }
+
    /// The overall data that backs this code set.
    Kernel::PhysicalMemory memory;

    /// The segments that comprise this code set.
    std::array<Segment, 3> segments;
+    Segment patch_segment;

    /// The entry point address for this code set.
    KProcessAddress entrypoint = 0;
--- a/src/core/hle/kernel/k_address_space_info.cpp
+++ b/src/core/hle/kernel/k_address_space_info.cpp
@ -25,8 +25,8 @@ constexpr std::array<KAddressSpaceInfo, 13> AddressSpaceInfos{{
   { .bit_width = 36, .address = 2_GiB       , .size = 64_GiB  - 2_GiB  , .type = KAddressSpaceInfo::Type::MapLarge, },
   { .bit_width = 36, .address = Size_Invalid, .size = 8_GiB            , .type = KAddressSpaceInfo::Type::Heap,     },
   { .bit_width = 36, .address = Size_Invalid, .size = 6_GiB            , .type = KAddressSpaceInfo::Type::Alias,    },
-#ifdef ANDROID
-   // With Android, we use a 38-bit address space due to memory limitations. This should (safely) truncate ASLR region.
+#ifdef ARCHITECTURE_arm64
+   // With NCE, we use a 38-bit address space due to memory limitations. This should (safely) truncate ASLR region.
   { .bit_width = 39, .address = 128_MiB     , .size = 256_GiB - 128_MiB, .type = KAddressSpaceInfo::Type::Map39Bit, },
 #else
   { .bit_width = 39, .address = 128_MiB     , .size = 512_GiB - 128_MiB, .type = KAddressSpaceInfo::Type::Map39Bit, },
--- a/src/core/hle/kernel/k_process.cpp
+++ b/src/core/hle/kernel/k_process.cpp
@ -1214,6 +1214,17 @@ void KProcess::LoadModule(CodeSet code_set, KProcessAddress base_addr) {
    ReprotectSegment(code_set.CodeSegment(), Svc::MemoryPermission::ReadExecute);
    ReprotectSegment(code_set.RODataSegment(), Svc::MemoryPermission::Read);
    ReprotectSegment(code_set.DataSegment(), Svc::MemoryPermission::ReadWrite);
+
+#ifdef ARCHITECTURE_arm64
+    if (Settings::IsNceEnabled()) {
+        auto& buffer = m_kernel.System().DeviceMemory().buffer;
+        const auto& code = code_set.CodeSegment();
+        const auto& patch = code_set.PatchSegment();
+        buffer.Protect(GetInteger(base_addr + code.addr), code.size, true, true, true);
+        buffer.Protect(GetInteger(base_addr + patch.addr), patch.size, true, true, true);
+        ReprotectSegment(code_set.PatchSegment(), Svc::MemoryPermission::None);
+    }
+#endif
 }

 bool KProcess::InsertWatchpoint(KProcessAddress addr, u64 size, DebugWatchpointType type) {
--- a/src/core/hle/kernel/k_process.h
+++ b/src/core/hle/kernel/k_process.h
@ -112,6 +112,7 @@ private:
    std::array<KThread*, Core::Hardware::NUM_CPU_CORES> m_pinned_threads{};
    std::array<DebugWatchpoint, Core::Hardware::NUM_WATCHPOINTS> m_watchpoints{};
    std::map<KProcessAddress, u64> m_debug_page_refcounts{};
+    std::unordered_map<u64, u64> m_post_handlers{};
    std::atomic<s64> m_cpu_time{};
    std::atomic<s64> m_num_process_switches{};
    std::atomic<s64> m_num_thread_switches{};
@ -467,6 +468,14 @@ public:

    static void Switch(KProcess* cur_process, KProcess* next_process);

+    std::unordered_map<u64, u64>& GetPostHandlers() noexcept {
+        return m_post_handlers;
+    }
+
+    KernelCore& GetKernel() noexcept {
+        return m_kernel;
+    }
+
 public:
    // Attempts to insert a watchpoint into a free slot. Returns false if none are available.
    bool InsertWatchpoint(KProcessAddress addr, u64 size, DebugWatchpointType type);
--- a/src/core/hle/kernel/k_thread.h
+++ b/src/core/hle/kernel/k_thread.h
@ -655,6 +655,21 @@ public:
        return m_stack_top;
    }

+public:
+    // TODO: This shouldn't be defined in kernel namespace
+    // Per-thread block used by the NCE patch code, which reaches it through the host
+    // TPIDR_EL0 and addresses the fields with offsetof(); keep the layout stable.
+    // NOTE(review): the TpidrEl0NativeContext / TpidrEl0Lock / TpidrEl0TlsMagic macros in
+    // arm_nce_asm_definitions.h appear to mirror these offsets - confirm before reordering.
+    struct NativeExecutionParameters {
+        u64 tpidr_el0{};
+        u64 tpidrro_el0{};
+        // Read by the patch code as a GuestContext pointer.
+        void* native_context{};
+        // Spin lock word used by the patch code; 0 means locked, and the initial value 1
+        // equals SpinLockUnlocked.
+        std::atomic<u32> lock{1};
+        bool is_running{};
+        u32 magic{Common::MakeMagic('Y', 'U', 'Z', 'U')};
+    };
+
+    NativeExecutionParameters& GetNativeExecutionParameters() {
+        return m_native_execution_parameters;
+    }
+
 private:
    KThread* RemoveWaiterByKey(bool* out_has_waiters, KProcessAddress key,
                               bool is_kernel_address_key);
@ -914,6 +929,7 @@ private:
    ThreadWaitReasonForDebugging m_wait_reason_for_debugging{};
    uintptr_t m_argument{};
    KProcessAddress m_stack_top{};
+    NativeExecutionParameters m_native_execution_parameters{};

 public:
    using ConditionVariableThreadTreeType = ConditionVariableThreadTree;
--- a/src/core/hle/kernel/physical_core.cpp
+++ b/src/core/hle/kernel/physical_core.cpp
@ -1,8 +1,12 @@
 // SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

+#include "common/settings.h"
 #include "core/arm/dynarmic/arm_dynarmic_32.h"
 #include "core/arm/dynarmic/arm_dynarmic_64.h"
+#ifdef ARCHITECTURE_arm64
+#include "core/arm/nce/arm_nce.h"
+#endif
 #include "core/core.h"
 #include "core/hle/kernel/k_scheduler.h"
 #include "core/hle/kernel/kernel.h"
@ -14,7 +18,8 @@ PhysicalCore::PhysicalCore(std::size_t core_index, Core::System& system, KSchedu
    : m_core_index{core_index}, m_system{system}, m_scheduler{scheduler} {
 #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
    // TODO(bunnei): Initialization relies on a core being available. We may later replace this with
-    // a 32-bit instance of Dynarmic. This should be abstracted out to a CPU manager.
+    // an NCE interface or a 32-bit instance of Dynarmic. This should be abstracted out to a CPU
+    // manager.
    auto& kernel = system.Kernel();
    m_arm_interface = std::make_unique<Core::ARM_Dynarmic_64>(
        system, kernel.IsMulticore(),
@ -28,6 +33,13 @@ PhysicalCore::PhysicalCore(std::size_t core_index, Core::System& system, KSchedu
 PhysicalCore::~PhysicalCore() = default;

 void PhysicalCore::Initialize(bool is_64_bit) {
+#if defined(ARCHITECTURE_arm64)
+    if (Settings::IsNceEnabled()) {
+        m_arm_interface = std::make_unique<Core::ARM_NCE>(m_system, m_system.Kernel().IsMulticore(),
+                                                          m_core_index);
+        return;
+    }
+#endif
 #if defined(ARCHITECTURE_x86_64) || defined(ARCHITECTURE_arm64)
    auto& kernel = m_system.Kernel();
    if (!is_64_bit) {
--- a/src/core/loader/deconstructed_rom_directory.cpp
+++ b/src/core/loader/deconstructed_rom_directory.cpp
@ -3,6 +3,7 @@

 #include <cstring>
 #include "common/logging/log.h"
+#include "common/settings.h"
 #include "core/core.h"
 #include "core/file_sys/content_archive.h"
 #include "core/file_sys/control_metadata.h"
@ -14,6 +15,10 @@
 #include "core/loader/deconstructed_rom_directory.h"
 #include "core/loader/nso.h"

+#ifdef ARCHITECTURE_arm64
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {

 AppLoader_DeconstructedRomDirectory::AppLoader_DeconstructedRomDirectory(FileSys::VirtualFile file_,
@ -124,21 +129,41 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    }
    metadata.Print();

-    const auto static_modules = {"rtld",    "main",    "subsdk0", "subsdk1", "subsdk2",
-                                 "subsdk3", "subsdk4", "subsdk5", "subsdk6", "subsdk7",
-                                 "subsdk8", "subsdk9", "sdk"};
+    // Enable NCE only for 64-bit programs.
+    Settings::SetNceEnabled(metadata.Is64BitProgram());
+
+    const std::array static_modules = {"rtld",    "main",    "subsdk0", "subsdk1", "subsdk2",
+                                       "subsdk3", "subsdk4", "subsdk5", "subsdk6", "subsdk7",
+                                       "subsdk8", "subsdk9", "sdk"};
+
+    std::size_t code_size{};
+
+    // Define an nce patch context for each potential module.
+#ifdef ARCHITECTURE_arm64
+    std::array<Core::NCE::Patcher, 13> module_patchers;
+#endif
+
+    const auto GetPatcher = [&](size_t i) -> Core::NCE::Patcher* { // Null unless NCE is on
+#ifdef ARCHITECTURE_arm64
+        if (Settings::IsNceEnabled()) {
+            return &module_patchers[i]; // Same slot is handed to both LoadModule passes
+        }
+#endif
+        return nullptr; // Non-arm64 build or NCE disabled: LoadModule gets no patcher
+    };

    // Use the NSO module loader to figure out the code layout
-    std::size_t code_size{};
-    for (const auto& module : static_modules) {
+    for (size_t i = 0; i < static_modules.size(); i++) {
+        const auto& module = static_modules[i];
        const FileSys::VirtualFile module_file{dir->GetFile(module)};
        if (!module_file) {
            continue;
        }

        const bool should_pass_arguments = std::strcmp(module, "rtld") == 0;
-        const auto tentative_next_load_addr = AppLoader_NSO::LoadModule(
-            process, system, *module_file, code_size, should_pass_arguments, false);
+        const auto tentative_next_load_addr =
+            AppLoader_NSO::LoadModule(process, system, *module_file, code_size,
+                                      should_pass_arguments, false, {}, GetPatcher(i));
        if (!tentative_next_load_addr) {
            return {ResultStatus::ErrorLoadingNSO, {}};
        }
@ -146,8 +171,18 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
        code_size = *tentative_next_load_addr;
    }

+    // Enable direct memory mapping in case of NCE.
+    const u64 fastmem_base = [&]() -> u64 { // Value is forwarded to LoadFromMetadata below
+        if (Settings::IsNceEnabled()) {
+            auto& buffer = system.DeviceMemory().buffer;
+            buffer.EnableDirectMappedAddress(); // Must run before the base pointer is read
+            return reinterpret_cast<u64>(buffer.VirtualBasePointer());
+        }
+        return 0; // NCE off: no fastmem base
+    }();
+
    // Setup the process code layout
-    if (process.LoadFromMetadata(metadata, code_size, 0, is_hbl).IsError()) {
+    if (process.LoadFromMetadata(metadata, code_size, fastmem_base, is_hbl).IsError()) {
        return {ResultStatus::ErrorUnableToParseKernelMetadata, {}};
    }

@ -157,7 +192,8 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect
    VAddr next_load_addr{base_address};
    const FileSys::PatchManager pm{metadata.GetTitleID(), system.GetFileSystemController(),
                                   system.GetContentProvider()};
-    for (const auto& module : static_modules) {
+    for (size_t i = 0; i < static_modules.size(); i++) {
+        const auto& module = static_modules[i];
        const FileSys::VirtualFile module_file{dir->GetFile(module)};
        if (!module_file) {
            continue;
@ -165,15 +201,16 @@ AppLoader_DeconstructedRomDirectory::LoadResult AppLoader_DeconstructedRomDirect

        const VAddr load_addr{next_load_addr};
        const bool should_pass_arguments = std::strcmp(module, "rtld") == 0;
-        const auto tentative_next_load_addr = AppLoader_NSO::LoadModule(
-            process, system, *module_file, load_addr, should_pass_arguments, true, pm);
+        const auto tentative_next_load_addr =
+            AppLoader_NSO::LoadModule(process, system, *module_file, load_addr,
+                                      should_pass_arguments, true, pm, GetPatcher(i));
        if (!tentative_next_load_addr) {
            return {ResultStatus::ErrorLoadingNSO, {}};
        }

        next_load_addr = *tentative_next_load_addr;
        modules.insert_or_assign(load_addr, module);
-        LOG_DEBUG(Loader, "loaded module {} @ 0x{:X}", module, load_addr);
+        LOG_DEBUG(Loader, "loaded module {} @ {:#X}", module, load_addr);
    }

    // Find the RomFS by searching for a ".romfs" file in this directory
--- a/src/core/loader/nro.cpp
+++ b/src/core/loader/nro.cpp
@ -22,6 +22,10 @@
 #include "core/loader/nso.h"
 #include "core/memory.h"

+#ifdef ARCHITECTURE_arm64
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {

 struct NroSegmentHeader {
@ -139,7 +143,8 @@ static constexpr u32 PageAlignSize(u32 size) {
    return static_cast<u32>((size + Core::Memory::YUZU_PAGEMASK) & ~Core::Memory::YUZU_PAGEMASK);
 }

-static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data) {
+static bool LoadNroImpl(Core::System& system, Kernel::KProcess& process,
+                        const std::vector<u8>& data) {
    if (data.size() < sizeof(NroHeader)) {
        return {};
    }
@ -195,14 +200,60 @@ static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data)
    codeset.DataSegment().size += bss_size;
    program_image.resize(static_cast<u32>(program_image.size()) + bss_size);

+#ifdef ARCHITECTURE_arm64
+    const auto& code = codeset.CodeSegment();
+
+    // NROs are always 64-bit programs.
+    Settings::SetNceEnabled(true);
+
+    // Create NCE patcher
+    Core::NCE::Patcher patch{};
+    size_t image_size = program_image.size();
+
+    if (Settings::IsNceEnabled()) {
+        // Patch SVCs and MRS calls in the guest code
+        patch.PatchText(program_image, code);
+
+        // We only support PostData patching for NROs.
+        ASSERT(patch.Mode() == Core::NCE::PatchMode::PostData);
+
+        // Update patch section.
+        auto& patch_segment = codeset.PatchSegment();
+        patch_segment.addr = image_size;
+        patch_segment.size = static_cast<u32>(patch.SectionSize());
+
+        // Add patch section size to the module size.
+        image_size += patch_segment.size;
+    }
+#endif
+
+    // Enable direct memory mapping in case of NCE.
+    const u64 fastmem_base = [&]() -> u64 { // Value is forwarded to LoadFromMetadata below
+        if (Settings::IsNceEnabled()) {
+            auto& buffer = system.DeviceMemory().buffer;
+            buffer.EnableDirectMappedAddress(); // Must run before the base pointer is read
+            return reinterpret_cast<u64>(buffer.VirtualBasePointer());
+        }
+        return 0; // NCE off: no fastmem base
+    }();
+
    // Setup the process code layout
    if (process
-            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), program_image.size(), 0,
+            .LoadFromMetadata(FileSys::ProgramMetadata::GetDefault(), image_size, fastmem_base,
                              false)
            .IsError()) {
        return false;
    }

+    // Relocate code patch and copy to the program_image if running under NCE.
+    // This needs to be after LoadFromMetadata so we can use the process entry point.
+#ifdef ARCHITECTURE_arm64
+    if (Settings::IsNceEnabled()) {
+        patch.RelocateAndCopy(process.GetEntryPoint(), code, program_image,
+                              &process.GetPostHandlers());
+    }
+#endif
+
    // Load codeset for current process
    codeset.memory = std::move(program_image);
    process.LoadModule(std::move(codeset), process.GetEntryPoint());
@ -210,8 +261,9 @@ static bool LoadNroImpl(Kernel::KProcess& process, const std::vector<u8>& data)
    return true;
 }

-bool AppLoader_NRO::LoadNro(Kernel::KProcess& process, const FileSys::VfsFile& nro_file) {
-    return LoadNroImpl(process, nro_file.ReadAllBytes());
+bool AppLoader_NRO::LoadNro(Core::System& system, Kernel::KProcess& process,
+                            const FileSys::VfsFile& nro_file) {
+    return LoadNroImpl(system, process, nro_file.ReadAllBytes()); // Forwards to LoadNroImpl
 }

 AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::KProcess& process, Core::System& system) {
@ -219,7 +271,7 @@ AppLoader_NRO::LoadResult AppLoader_NRO::Load(Kernel::KProcess& process, Core::S
        return {ResultStatus::ErrorAlreadyLoaded, {}};
    }

-    if (!LoadNro(process, *file)) {
+    if (!LoadNro(system, process, *file)) {
        return {ResultStatus::ErrorLoadingNRO, {}};
    }

--- a/src/core/loader/nro.h
+++ b/src/core/loader/nro.h
@ -54,7 +54,7 @@ public:
    bool IsRomFSUpdatable() const override;

 private:
-    bool LoadNro(Kernel::KProcess& process, const FileSys::VfsFile& nro_file);
+    bool LoadNro(Core::System& system, Kernel::KProcess& process, const FileSys::VfsFile& nro_file);

    std::vector<u8> icon_data;
    std::unique_ptr<FileSys::NACP> nacp;
--- a/src/core/loader/nso.cpp
+++ b/src/core/loader/nso.cpp
@ -20,6 +20,10 @@
 #include "core/loader/nso.h"
 #include "core/memory.h"

+#ifdef ARCHITECTURE_arm64
+#include "core/arm/nce/patch.h"
+#endif
+
 namespace Loader {
 namespace {
 struct MODHeader {
@ -72,7 +76,8 @@ FileType AppLoader_NSO::IdentifyType(const FileSys::VirtualFile& in_file) {
 std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::System& system,
                                               const FileSys::VfsFile& nso_file, VAddr load_base,
                                               bool should_pass_arguments, bool load_into_process,
-                                               std::optional<FileSys::PatchManager> pm) {
+                                               std::optional<FileSys::PatchManager> pm,
+                                               Core::NCE::Patcher* patch) {
    if (nso_file.GetSize() < sizeof(NSOHeader)) {
        return std::nullopt;
    }
@ -86,6 +91,16 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
        return std::nullopt;
    }

+    // Allocate some space at the beginning if we are patching in PreText mode.
+    const size_t module_start = [&]() -> size_t { // Offset added to each segment location below
+#ifdef ARCHITECTURE_arm64
+        if (patch && patch->Mode() == Core::NCE::PatchMode::PreText) {
+            return patch->SectionSize(); // Patch section sits in front of the module
+        }
+#endif
+        return 0; // No patcher, other patch mode, or non-arm64 build: nothing is reserved
+    }();
+
    // Build program image
    Kernel::CodeSet codeset;
    Kernel::PhysicalMemory program_image;
@ -95,11 +110,12 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
        if (nso_header.IsSegmentCompressed(i)) {
            data = DecompressSegment(data, nso_header.segments[i]);
        }
-        program_image.resize(nso_header.segments[i].location + static_cast<u32>(data.size()));
-        std::memcpy(program_image.data() + nso_header.segments[i].location, data.data(),
-                    data.size());
-        codeset.segments[i].addr = nso_header.segments[i].location;
-        codeset.segments[i].offset = nso_header.segments[i].location;
+        program_image.resize(module_start + nso_header.segments[i].location +
+                             static_cast<u32>(data.size()));
+        std::memcpy(program_image.data() + module_start + nso_header.segments[i].location,
+                    data.data(), data.size());
+        codeset.segments[i].addr = module_start + nso_header.segments[i].location;
+        codeset.segments[i].offset = module_start + nso_header.segments[i].location;
        codeset.segments[i].size = nso_header.segments[i].size;
    }

@ -118,7 +134,7 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
    }

    codeset.DataSegment().size += nso_header.segments[2].bss_size;
-    const u32 image_size{
+    u32 image_size{
        PageAlignSize(static_cast<u32>(program_image.size()) + nso_header.segments[2].bss_size)};
    program_image.resize(image_size);

@ -139,6 +155,32 @@ std::optional<VAddr> AppLoader_NSO::LoadModule(Kernel::KProcess& process, Core::
        std::copy(pi_header.begin() + sizeof(NSOHeader), pi_header.end(), program_image.data());
    }

+#ifdef ARCHITECTURE_arm64
+    // If we are computing the process code layout and using nce backend, patch.
+    const auto& code = codeset.CodeSegment();
+    if (patch && patch->Mode() == Core::NCE::PatchMode::None) {
+        // Patch SVCs and MRS calls in the guest code
+        patch->PatchText(program_image, code);
+
+        // Add patch section size to the module size.
+        image_size += patch->SectionSize();
+    } else if (patch) {
+        // Relocate code patch and copy to the program_image.
+        patch->RelocateAndCopy(load_base, code, program_image, &process.GetPostHandlers());
+
+        // Update patch section.
+        auto& patch_segment = codeset.PatchSegment();
+        patch_segment.addr = patch->Mode() == Core::NCE::PatchMode::PreText ? 0 : image_size;
+        patch_segment.size = static_cast<u32>(patch->SectionSize());
+
+        // Add patch section size to the module size. In PreText mode image_size
+        // already contains the patch segment as part of module_start.
+        if (patch->Mode() == Core::NCE::PatchMode::PostData) {
+            image_size += patch_segment.size;
+        }
+    }
+#endif
+
    // If we aren't actually loading (i.e. just computing the process code layout), we are done
    if (!load_into_process) {
        return load_base + image_size;
--- a/src/core/loader/nso.h
+++ b/src/core/loader/nso.h
@ -15,6 +15,10 @@ namespace Core {
 class System;
 }

+namespace Core::NCE {
+class Patcher;
+}
+
 namespace Kernel {
 class KProcess;
 }
@ -88,7 +92,8 @@ public:
    static std::optional<VAddr> LoadModule(Kernel::KProcess& process, Core::System& system,
                                           const FileSys::VfsFile& nso_file, VAddr load_base,
                                           bool should_pass_arguments, bool load_into_process,
-                                           std::optional<FileSys::PatchManager> pm = {});
+                                           std::optional<FileSys::PatchManager> pm = {},
+                                           Core::NCE::Patcher* patch = nullptr);

    LoadResult Load(Kernel::KProcess& process, Core::System& system) override;

--- a/src/core/memory.cpp
+++ b/src/core/memory.cpp
@ -1001,4 +1001,17 @@ void Memory::FlushRegion(Common::ProcessAddress dest_addr, size_t size) {
    impl->FlushRegion(dest_addr, size);
 }

+bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) { // True iff vaddr resolves
+    bool mapped = true; // Cleared by the first callback below
+    u8* const ptr = impl->GetPointerImpl(
+        GetInteger(vaddr), // Only the start address is looked up; size is used by the callbacks
+        [&] { // First callback: logs the address as unmapped and records the failure
+            LOG_ERROR(HW_Memory, "Unmapped InvalidateNCE for {} bytes @ {:#x}", size,
+                      GetInteger(vaddr));
+            mapped = false;
+        }, // Second callback (next line) passes vaddr and size to GPU().InvalidateRegion
+        [&] { impl->system.GPU().InvalidateRegion(GetInteger(vaddr), size); });
+    return mapped && ptr != nullptr; // False if flagged unmapped or the lookup returned null
+}
+
 } // namespace Core::Memory
--- a/src/core/memory.h
+++ b/src/core/memory.h
@ -474,6 +474,7 @@ public:

    void SetGPUDirtyManagers(std::span<Core::GPUDirtyMemoryManager> managers);
    void InvalidateRegion(Common::ProcessAddress dest_addr, size_t size);
+    bool InvalidateNCE(Common::ProcessAddress vaddr, size_t size);
    void FlushRegion(Common::ProcessAddress dest_addr, size_t size);

 private: