diff --git a/CMakeLists.txt b/CMakeLists.txt
index d628ecc50..8f2898973 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -152,12 +152,15 @@ if (ENABLE_SDL2)
             download_bundled_external("sdl2/" ${SDL2_VER} SDL2_PREFIX)
         endif()
 
+        set(SDL2_FOUND YES)
         set(SDL2_INCLUDE_DIR "${SDL2_PREFIX}/include" CACHE PATH "Path to SDL2 headers")
         set(SDL2_LIBRARY "${SDL2_PREFIX}/lib/x64/SDL2.lib" CACHE PATH "Path to SDL2 library")
         set(SDL2_DLL_DIR "${SDL2_PREFIX}/lib/x64/" CACHE PATH "Path to SDL2.dll")
     else()
         find_package(SDL2 REQUIRED)
     endif()
+else()
+    set(SDL2_FOUND NO)
 endif()
 
 IF (APPLE)
diff --git a/src/audio_core/CMakeLists.txt b/src/audio_core/CMakeLists.txt
index 5a2747e78..13b5e400e 100644
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SRCS
             hle/dsp.cpp
             hle/filter.cpp
             hle/pipe.cpp
+            hle/source.cpp
             interpolate.cpp
             sink_details.cpp
             )
@@ -15,6 +16,7 @@ set(HEADERS
             hle/dsp.h
             hle/filter.h
             hle/pipe.h
+            hle/source.h
             interpolate.h
             null_sink.h
             sink.h
@@ -23,7 +25,18 @@ set(HEADERS
 
 include_directories(../../externals/soundtouch/include)
 
+if(SDL2_FOUND)  # The SDL2 sink is only compiled in when SDL2 is available.
+    include_directories(${SDL2_INCLUDE_DIR})
+    list(APPEND SRCS sdl2_sink.cpp)
+    list(APPEND HEADERS sdl2_sink.h)
+endif()
+
 create_directory_groups(${SRCS} ${HEADERS})
 
 add_library(audio_core STATIC ${SRCS} ${HEADERS})
 target_link_libraries(audio_core SoundTouch)
+
+if(SDL2_FOUND)
+    set_property(TARGET audio_core APPEND PROPERTY COMPILE_DEFINITIONS HAVE_SDL2)  # Tested by sink_details.cpp
+    target_link_libraries(audio_core ${SDL2_LIBRARY})
+endif()
diff --git a/src/audio_core/hle/common.h b/src/audio_core/hle/common.h
index 7910f42ae..596b67eaf 100644
--- a/src/audio_core/hle/common.h
+++ b/src/audio_core/hle/common.h
@@ -27,7 +27,7 @@ using QuadFrame32   = std::array<std::array<s32, 4>, samples_per_frame>;
  */
 template<typename FrameT, typename FilterT>
 void FilterFrame(FrameT& frame, FilterT& filter) {
-    std::transform(frame.begin(), frame.end(), frame.begin(), [&filter](const typename FrameT::value_type& sample) {
+    std::transform(frame.begin(), frame.end(), frame.begin(), [&filter](const auto& sample) {
         return filter.ProcessSample(sample);
     });
 }
diff --git a/src/audio_core/hle/dsp.cpp b/src/audio_core/hle/dsp.cpp
index 4d44bd2d9..0cdbdb06a 100644
--- a/src/audio_core/hle/dsp.cpp
+++ b/src/audio_core/hle/dsp.cpp
@@ -2,10 +2,12 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.
 
+#include <array>
 #include <memory>
 
 #include "audio_core/hle/dsp.h"
 #include "audio_core/hle/pipe.h"
+#include "audio_core/hle/source.h"
 #include "audio_core/sink.h"
 
 namespace DSP {
@@ -38,16 +40,38 @@ static SharedMemory& WriteRegion() {
     return g_regions[1 - CurrentRegionIndex()];
 }
 
+static std::array<Source, num_sources> sources = {
+    // One Source object per source slot; each is constructed with its own index as source_id.
+    Source(0),  Source(1),  Source(2),  Source(3),  Source(4),  Source(5),  Source(6),  Source(7),
+    Source(8),  Source(9),  Source(10), Source(11), Source(12), Source(13), Source(14), Source(15),
+    Source(16), Source(17), Source(18), Source(19), Source(20), Source(21), Source(22), Source(23)
+};
+
 static std::unique_ptr<AudioCore::Sink> sink;
 
 void Init() {
     DSP::HLE::ResetPipes();
+    for (auto& source : sources) {
+        source.Reset();
+    }
 }
 
 void Shutdown() {
 }
 
 bool Tick() {
+    SharedMemory& read = ReadRegion();
+    SharedMemory& write = WriteRegion();
+
+    std::array<QuadFrame32, 3> intermediate_mixes = {};
+
+    for (size_t i = 0; i < num_sources; i++) {
+        write.source_statuses.status[i] = sources[i].Tick(read.source_configurations.config[i], read.adpcm_coefficients.coeff[i]);
+        for (size_t mix = 0; mix < 3; mix++) {
+            sources[i].MixInto(intermediate_mixes[mix], mix);
+        }
+    }
+
     return true;
 }
 
diff --git a/src/audio_core/hle/dsp.h b/src/audio_core/hle/dsp.h
index 4f2410c27..f6e53f68f 100644
--- a/src/audio_core/hle/dsp.h
+++ b/src/audio_core/hle/dsp.h
@@ -33,13 +33,9 @@ namespace HLE {
 // double-buffer. The frame counter is located as the very last u16 of each region and is incremented
 // each audio tick.
 
-struct SharedMemory;
-
 constexpr VAddr region0_base = 0x1FF50000;
 constexpr VAddr region1_base = 0x1FF70000;
 
-extern std::array<SharedMemory, 2> g_regions;
-
 /**
  * The DSP is native 16-bit. The DSP also appears to be big-endian. When reading 32-bit numbers from
  * its memory regions, the higher and lower 16-bit halves are swapped compared to the little-endian
@@ -169,9 +165,9 @@ struct SourceConfiguration {
         float_le rate_multiplier;
 
         enum class InterpolationMode : u8 {
-            None = 0,
+            Polyphase = 0,
             Linear = 1,
-            Polyphase = 2
+            None = 2
         };
 
         InterpolationMode interpolation_mode;
@@ -318,10 +314,10 @@ ASSERT_DSP_STRUCT(SourceConfiguration::Configuration::Buffer, 20);
 struct SourceStatus {
     struct Status {
         u8 is_enabled;               ///< Is this channel enabled? (Doesn't have to be playing anything.)
-        u8 previous_buffer_id_dirty; ///< Non-zero when previous_buffer_id changes
+        u8 current_buffer_id_dirty;  ///< Non-zero when current_buffer_id changes
         u16_le sync;                 ///< Is set by the DSP to the value of SourceConfiguration::sync
         u32_dsp buffer_position;     ///< Number of samples into the current buffer
-        u16_le previous_buffer_id;   ///< Updated when a buffer finishes playing
+        u16_le current_buffer_id;    ///< Id of the buffer currently being played (0 once the source has run out of buffers)
         INSERT_PADDING_DSPWORDS(1);
     };
 
@@ -507,6 +503,8 @@ struct SharedMemory {
 };
 ASSERT_DSP_STRUCT(SharedMemory, 0x8000);
 
+extern std::array<SharedMemory, 2> g_regions;
+
 // Structures must have an offset that is a multiple of two.
 static_assert(offsetof(SharedMemory, frame_counter) % 2 == 0, "Structures in DSP::HLE::SharedMemory must be 2-byte aligned");
 static_assert(offsetof(SharedMemory, source_configurations) % 2 == 0, "Structures in DSP::HLE::SharedMemory must be 2-byte aligned");
diff --git a/src/audio_core/hle/filter.h b/src/audio_core/hle/filter.h
index 75738f600..43d2035cd 100644
--- a/src/audio_core/hle/filter.h
+++ b/src/audio_core/hle/filter.h
@@ -16,6 +16,7 @@ namespace HLE {
 
 /// Preprocessing filters. There is an independent set of filters for each Source.
 class SourceFilters final {
+public:
     SourceFilters() { Reset(); }
 
     /// Reset internal state.
diff --git a/src/audio_core/hle/pipe.cpp b/src/audio_core/hle/pipe.cpp
index 03280780f..44dff1345 100644
--- a/src/audio_core/hle/pipe.cpp
+++ b/src/audio_core/hle/pipe.cpp
@@ -36,12 +36,17 @@ std::vector<u8> PipeRead(DspPipe pipe_number, u32 length) {
         return {};
     }
 
+    if (length > UINT16_MAX) { // Can only read at most UINT16_MAX from the pipe
+        LOG_ERROR(Audio_DSP, "length of %u greater than max of %u", length, UINT16_MAX);
+        return {};
+    }
+
     std::vector<u8>& data = pipe_data[pipe_index];
 
     if (length > data.size()) {
         LOG_WARNING(Audio_DSP, "pipe_number = %zu is out of data, application requested read of %u but %zu remain",
                     pipe_index, length, data.size());
-        length = data.size();
+        length = static_cast<u32>(data.size());
     }
 
     if (length == 0)
@@ -94,7 +99,7 @@ static void AudioPipeWriteStructAddresses() {
     };
 
     // Begin with a u16 denoting the number of structs.
-    WriteU16(DspPipe::Audio, struct_addresses.size());
+    WriteU16(DspPipe::Audio, static_cast<u16>(struct_addresses.size()));
     // Then write the struct addresses.
     for (u16 addr : struct_addresses) {
         WriteU16(DspPipe::Audio, addr);
diff --git a/src/audio_core/hle/pipe.h b/src/audio_core/hle/pipe.h
index 64d97f8ba..b714c0496 100644
--- a/src/audio_core/hle/pipe.h
+++ b/src/audio_core/hle/pipe.h
@@ -24,10 +24,14 @@ enum class DspPipe {
 constexpr size_t NUM_DSP_PIPE = 8;
 
 /**
- * Read a DSP pipe.
- * @param pipe_number The Pipe ID
- * @param length How much data to request.
- * @return The data read from the pipe. The size of this vector can be less than the length requested.
+ * Reads `length` bytes from the DSP pipe identified with `pipe_number`.
+ * @note Can read up to the maximum value of a u16 in bytes (65,535).
+ * @note IF an error is encountered with either an invalid `pipe_number` or `length` value, an empty vector will be returned.
+ * @note IF `length` is set to 0, an empty vector will be returned.
+ * @note IF `length` is greater than the amount of data available, this function will only read the available amount.
+ * @param pipe_number a `DspPipe`
+ * @param length the number of bytes to read. The max is 65,535 (max of u16).
+ * @returns a vector of bytes from the specified pipe. On error, will be empty.
  */
 std::vector<u8> PipeRead(DspPipe pipe_number, u32 length);
 
diff --git a/src/audio_core/hle/source.cpp b/src/audio_core/hle/source.cpp
new file mode 100644
index 000000000..30552fe26
--- /dev/null
+++ b/src/audio_core/hle/source.cpp
@@ -0,0 +1,320 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+
+#include "audio_core/codec.h"
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/source.h"
+#include "audio_core/interpolate.h"
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+
+#include "core/memory.h"
+
+namespace DSP {
+namespace HLE {
+
+SourceStatus::Status Source::Tick(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]) {
+    // Apply any pending configuration first; it may enable or disable this source.
+    ParseConfig(config, adpcm_coeffs);
+
+    if (state.enabled)
+        GenerateFrame();
+
+    return GetCurrentStatus();
+}
+
+void Source::MixInto(QuadFrame32& dest, size_t intermediate_mix_id) const {
+    if (!state.enabled)
+        return;
+
+    // Stereo -> quad upmix: input channel 0 feeds outputs 0 and 2, input channel 1 feeds outputs 1 and 3.
+    const std::array<float, 4>& mix_gains = state.gain.at(intermediate_mix_id);
+    for (size_t i = 0; i < samples_per_frame; i++) {
+        const auto& stereo = current_frame[i];
+        for (size_t ch = 0; ch < mix_gains.size(); ch++) {
+            dest[i][ch] += static_cast<s32>(mix_gains[ch] * stereo[ch % 2]);
+        }
+    }
+}
+
+void Source::Reset() {
+    state = {}; // Back to the default member initialisers declared in source.h.
+    current_frame.fill({});
+}
+
+void Source::ParseConfig(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]) {
+    if (!config.dirty_raw) { // No dirty bits set: nothing to apply this frame.
+        return;
+    }
+    // Each section below clears its dirty flag and copies the matching field(s) into state.
+    if (config.reset_flag) {
+        config.reset_flag.Assign(0);
+        Reset();
+        LOG_TRACE(Audio_DSP, "source_id=%zu reset", source_id);
+    }
+    // A partial reset only discards the buffers that are still queued.
+    if (config.partial_reset_flag) {
+        config.partial_reset_flag.Assign(0);
+        state.input_queue = std::priority_queue<Buffer, std::vector<Buffer>, BufferOrder>{};
+        LOG_TRACE(Audio_DSP, "source_id=%zu partial_reset", source_id);
+    }
+
+    if (config.enable_dirty) {
+        config.enable_dirty.Assign(0);
+        state.enabled = config.enable != 0;
+        LOG_TRACE(Audio_DSP, "source_id=%zu enable=%d", source_id, state.enabled);
+    }
+
+    if (config.sync_dirty) {
+        config.sync_dirty.Assign(0);
+        state.sync = config.sync;
+        LOG_TRACE(Audio_DSP, "source_id=%zu sync=%u", source_id, state.sync);
+    }
+
+    if (config.rate_multiplier_dirty) {
+        config.rate_multiplier_dirty.Assign(0);
+        state.rate_multiplier = config.rate_multiplier;
+        LOG_TRACE(Audio_DSP, "source_id=%zu rate=%f", source_id, state.rate_multiplier);
+
+        if (state.rate_multiplier <= 0) {
+            LOG_ERROR(Audio_DSP, "Was given an invalid rate multiplier: source_id=%zu rate=%f", source_id, state.rate_multiplier);
+            state.rate_multiplier = 1.0f;
+            // Note: Actual firmware starts producing garbage if this occurs.
+        }
+    }
+
+    if (config.adpcm_coefficients_dirty) {
+        config.adpcm_coefficients_dirty.Assign(0);
+        std::transform(adpcm_coeffs, adpcm_coeffs + state.adpcm_coeffs.size(), state.adpcm_coeffs.begin(),
+            [](const auto& coeff) { return static_cast<s16>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu adpcm update", source_id);
+    }
+    // Gains: one dirty flag and one row of four values per intermediate mix (0..2).
+    if (config.gain_0_dirty) {
+        config.gain_0_dirty.Assign(0);
+        std::transform(config.gain[0], config.gain[0] + state.gain[0].size(), state.gain[0].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 0 update", source_id);
+    }
+
+    if (config.gain_1_dirty) {
+        config.gain_1_dirty.Assign(0);
+        std::transform(config.gain[1], config.gain[1] + state.gain[1].size(), state.gain[1].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 1 update", source_id);
+    }
+
+    if (config.gain_2_dirty) {
+        config.gain_2_dirty.Assign(0);
+        std::transform(config.gain[2], config.gain[2] + state.gain[2].size(), state.gain[2].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 2 update", source_id);
+    }
+
+    if (config.filters_enabled_dirty) {
+        config.filters_enabled_dirty.Assign(0);
+        state.filters.Enable(config.simple_filter_enabled.ToBool(), config.biquad_filter_enabled.ToBool());
+        LOG_TRACE(Audio_DSP, "source_id=%zu enable_simple=%hu enable_biquad=%hu",
+                  source_id, config.simple_filter_enabled.Value(), config.biquad_filter_enabled.Value());
+    }
+
+    if (config.simple_filter_dirty) {
+        config.simple_filter_dirty.Assign(0);
+        state.filters.Configure(config.simple_filter);
+        LOG_TRACE(Audio_DSP, "source_id=%zu simple filter update", source_id);
+    }
+
+    if (config.biquad_filter_dirty) {
+        config.biquad_filter_dirty.Assign(0);
+        state.filters.Configure(config.biquad_filter);
+        LOG_TRACE(Audio_DSP, "source_id=%zu biquad filter update", source_id);
+    }
+
+    if (config.interpolation_dirty) {
+        config.interpolation_dirty.Assign(0);
+        state.interpolation_mode = config.interpolation_mode;
+        LOG_TRACE(Audio_DSP, "source_id=%zu interpolation_mode=%zu", source_id, static_cast<size_t>(state.interpolation_mode));
+    }
+    // Format and channel layout are also re-read whenever a new embedded buffer is supplied.
+    if (config.format_dirty || config.embedded_buffer_dirty) {
+        config.format_dirty.Assign(0);
+        state.format = config.format;
+        LOG_TRACE(Audio_DSP, "source_id=%zu format=%zu", source_id, static_cast<size_t>(state.format));
+    }
+
+    if (config.mono_or_stereo_dirty || config.embedded_buffer_dirty) {
+        config.mono_or_stereo_dirty.Assign(0);
+        state.mono_or_stereo = config.mono_or_stereo;
+        LOG_TRACE(Audio_DSP, "source_id=%zu mono_or_stereo=%zu", source_id, static_cast<size_t>(state.mono_or_stereo));
+    }
+    // The embedded buffer is enqueued with from_queue = false (last Buffer field).
+    if (config.embedded_buffer_dirty) {
+        config.embedded_buffer_dirty.Assign(0);
+        state.input_queue.emplace(Buffer{
+            config.physical_address,
+            config.length,
+            static_cast<u8>(config.adpcm_ps),
+            { config.adpcm_yn[0], config.adpcm_yn[1] },
+            config.adpcm_dirty.ToBool(),
+            config.is_looping.ToBool(),
+            config.buffer_id,
+            state.mono_or_stereo,
+            state.format,
+            false
+        });
+        LOG_TRACE(Audio_DSP, "enqueuing embedded addr=0x%08x len=%u id=%hu", config.physical_address, config.length, config.buffer_id);
+    }
+    // Queue slots whose bit is set in buffers_dirty are enqueued with from_queue = true.
+    if (config.buffer_queue_dirty) {
+        config.buffer_queue_dirty.Assign(0);
+        for (size_t i = 0; i < 4; i++) {
+            if (config.buffers_dirty & (1 << i)) {
+                const auto& b = config.buffers[i];
+                state.input_queue.emplace(Buffer{
+                    b.physical_address,
+                    b.length,
+                    static_cast<u8>(b.adpcm_ps),
+                    { b.adpcm_yn[0], b.adpcm_yn[1] },
+                    b.adpcm_dirty != 0,
+                    b.is_looping != 0,
+                    b.buffer_id,
+                    state.mono_or_stereo,
+                    state.format,
+                    true
+                });
+                LOG_TRACE(Audio_DSP, "enqueuing queued %zu addr=0x%08x len=%u id=%hu", i, b.physical_address, b.length, b.buffer_id);
+            }
+        }
+        config.buffers_dirty = 0;
+    }
+    // Report any dirty bits that no section above handled, then acknowledge everything.
+    if (config.dirty_raw) {
+        LOG_DEBUG(Audio_DSP, "source_id=%zu remaining_dirty=%x", source_id, config.dirty_raw);
+    }
+
+    config.dirty_raw = 0;
+}
+
+void Source::GenerateFrame() {
+    current_frame.fill({});
+
+    if (state.current_buffer.empty() && !DequeueBuffer()) {
+        // Nothing left to play: disable the source and flag the buffer-id change.
+        state.current_buffer_id = 0;
+        state.buffer_update = true;
+        state.enabled = false;
+        return;
+    }
+
+    // Snapshot the position this frame starts at (DequeueBuffer resets it when a new buffer begins).
+    state.current_sample_number = state.next_sample_number;
+    for (size_t filled = 0; filled < current_frame.size();) {
+        if (state.current_buffer.empty() && !DequeueBuffer()) {
+            break;
+        }
+        auto& pending = state.current_buffer;
+        const size_t count = std::min(pending.size(), current_frame.size() - filled);
+        const auto chunk_end = pending.begin() + count;
+
+        std::copy(pending.begin(), chunk_end, current_frame.begin() + filled);
+        pending.erase(pending.begin(), chunk_end);
+        state.next_sample_number += static_cast<u32>(count);
+        filled += count;
+    }
+
+    state.filters.ProcessFrame(current_frame);
+}
+
+
+bool Source::DequeueBuffer() {
+    ASSERT_MSG(state.current_buffer.empty(), "Shouldn't dequeue; we still have data in current_buffer");
+
+    if (state.input_queue.empty())
+        return false;
+    // Take the top of the priority queue, i.e. the buffer with the lowest buffer_id.
+    const Buffer buf = state.input_queue.top();
+    state.input_queue.pop();
+    // When flagged, the buffer's adpcm_yn values overwrite the ADPCM decoder state.
+    if (buf.adpcm_dirty) {
+        state.adpcm_state.yn1 = buf.adpcm_yn[0];
+        state.adpcm_state.yn2 = buf.adpcm_yn[1];
+    }
+
+    if (buf.is_looping) {
+        LOG_ERROR(Audio_DSP, "Looped buffers are unimplemented at the moment");
+    }
+    // Decode the buffer's memory into current_buffer according to the buffer's own format.
+    const u8* const memory = Memory::GetPhysicalPointer(buf.physical_address);
+    if (memory) {
+        const unsigned num_channels = buf.mono_or_stereo == MonoOrStereo::Stereo ? 2 : 1;
+        switch (buf.format) {
+        case Format::PCM8:
+            state.current_buffer = Codec::DecodePCM8(num_channels, memory, buf.length);
+            break;
+        case Format::PCM16:
+            state.current_buffer = Codec::DecodePCM16(num_channels, memory, buf.length);
+            break;
+        case Format::ADPCM:
+            DEBUG_ASSERT(num_channels == 1);
+            state.current_buffer = Codec::DecodeADPCM(memory, buf.length, state.adpcm_coeffs, state.adpcm_state);
+            break;
+        default:
+            UNIMPLEMENTED();
+            break;
+        }
+    } else {
+        LOG_WARNING(Audio_DSP, "source_id=%zu buffer_id=%hu length=%u: Invalid physical address 0x%08X",
+                               source_id, buf.buffer_id, buf.length, buf.physical_address);
+        state.current_buffer.clear();
+        return true; // The buffer was consumed, but position and buffer-id state are left untouched.
+    }
+    // Pass the decoded samples through the selected interpolator with the current rate_multiplier.
+    switch (state.interpolation_mode) {
+    case InterpolationMode::None:
+        state.current_buffer = AudioInterp::None(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    case InterpolationMode::Linear:
+        state.current_buffer = AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    case InterpolationMode::Polyphase:
+        // TODO(merry): Implement polyphase interpolation
+        state.current_buffer = AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    default:
+        UNIMPLEMENTED();
+        break;
+    }
+    // A new buffer restarts the position counters; only queue-sourced buffers raise buffer_update.
+    state.current_sample_number = 0;
+    state.next_sample_number = 0;
+    state.current_buffer_id = buf.buffer_id;
+    state.buffer_update = buf.from_queue;
+
+    LOG_TRACE(Audio_DSP, "source_id=%zu buffer_id=%hu from_queue=%s current_buffer.size()=%zu",
+                         source_id, buf.buffer_id, buf.from_queue ? "true" : "false", state.current_buffer.size());
+    return true;
+}
+
+SourceStatus::Status Source::GetCurrentStatus() {
+    // Value-initialise so the padding word is zero, not indeterminate, when the caller copies this into shared memory.
+    SourceStatus::Status ret = {};
+    // Applications depend on the correct emulation of
+    // current_buffer_id_dirty and current_buffer_id to synchronise
+    // audio with video.
+    ret.is_enabled = state.enabled;
+    ret.current_buffer_id_dirty = state.buffer_update ? 1 : 0;
+    state.buffer_update = false; // The dirty flag is reported exactly once per change.
+    ret.current_buffer_id = static_cast<u16>(state.current_buffer_id); // Status field is 16 bits wide.
+    ret.buffer_position = state.current_sample_number;
+    ret.sync = state.sync;
+
+    return ret;
+}
+
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/hle/source.h b/src/audio_core/hle/source.h
new file mode 100644
index 000000000..7ee08d424
--- /dev/null
+++ b/src/audio_core/hle/source.h
@@ -0,0 +1,144 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <queue>
+#include <vector>
+
+#include "audio_core/codec.h"
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/dsp.h"
+#include "audio_core/hle/filter.h"
+#include "audio_core/interpolate.h"
+
+#include "common/common_types.h"
+
+namespace DSP {
+namespace HLE {
+
+/**
+ * This module performs:
+ * - Buffer management
+ * - Decoding of buffers
+ * - Buffer resampling and interpolation
+ * - Per-source filtering (SimpleFilter, BiquadFilter)
+ * - Per-source gain
+ * - Other per-source processing
+ */
+class Source final {
+public:
+    explicit Source(size_t source_id_) : source_id(source_id_) {
+        Reset();
+    }
+
+    /// Resets internal state.
+    void Reset();
+
+    /**
+     * This is called once every audio frame. This performs per-source processing every frame.
+     * @param config The new configuration we've got for this Source from the application.
+     * @param adpcm_coeffs ADPCM coefficients to use if config tells us to use them (may contain invalid values otherwise).
+     * @return The current status of this Source. This is given back to the emulated application via SharedMemory.
+     */
+    SourceStatus::Status Tick(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]);
+
+    /**
+     * Mix this source's output into dest, using the gains for the `intermediate_mix_id`-th intermediate mixer.
+     * @param dest The QuadFrame32 to mix into.
+     * @param intermediate_mix_id The id of the intermediate mix whose gains we are using.
+     */
+    void MixInto(QuadFrame32& dest, size_t intermediate_mix_id) const;
+
+private:
+    const size_t source_id;      ///< Index of this source; used to tag log messages.
+    StereoFrame16 current_frame; ///< Frame produced by the most recent GenerateFrame() call.
+
+    using Format = SourceConfiguration::Configuration::Format;
+    using InterpolationMode = SourceConfiguration::Configuration::InterpolationMode;
+    using MonoOrStereo = SourceConfiguration::Configuration::MonoOrStereo;
+
+    /// Internal representation of a buffer for our buffer queue
+    struct Buffer {
+        PAddr physical_address;
+        u32 length;
+        u8 adpcm_ps;
+        std::array<u16, 2> adpcm_yn;
+        bool adpcm_dirty;
+        bool is_looping;
+        u16 buffer_id;
+        // Channel layout and sample format that were in force when the buffer was enqueued.
+        MonoOrStereo mono_or_stereo;
+        Format format;
+        // True if this came from the buffer queue, false if it was the embedded buffer.
+        bool from_queue;
+    };
+
+    struct BufferOrder {
+        bool operator() (const Buffer& a, const Buffer& b) const {
+            // Lower buffer_id comes first.
+            return a.buffer_id > b.buffer_id;
+        }
+    };
+
+    struct {
+        // The default member initialisers below are the values restored by Reset().
+        // State variables
+
+        bool enabled = false;
+        u16 sync = 0;
+
+        // Mixing
+
+        std::array<std::array<float, 4>, 3> gain = {}; ///< Indexed as gain[intermediate mix][output channel].
+
+        // Buffer queue
+
+        std::priority_queue<Buffer, std::vector<Buffer>, BufferOrder> input_queue;
+        MonoOrStereo mono_or_stereo = MonoOrStereo::Mono;
+        Format format = Format::ADPCM;
+
+        // Current buffer
+
+        u32 current_sample_number = 0;
+        u32 next_sample_number = 0;
+        std::vector<std::array<s16, 2>> current_buffer;
+
+        // buffer_id state
+
+        bool buffer_update = false; ///< Reported once as current_buffer_id_dirty, then cleared.
+        u16 current_buffer_id = 0;  ///< Same width as Buffer::buffer_id and the status field it is reported in.
+
+        // Decoding state
+
+        std::array<s16, 16> adpcm_coeffs = {};
+        Codec::ADPCMState adpcm_state = {};
+
+        // Resampling state
+
+        float rate_multiplier = 1.0f;
+        InterpolationMode interpolation_mode = InterpolationMode::Polyphase;
+        AudioInterp::State interp_state = {};
+
+        // Filter state
+
+        SourceFilters filters;
+
+    } state;
+
+    // Internal functions
+
+    /// INTERNAL: Update our internal state based on the current config.
+    void ParseConfig(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]);
+    /// INTERNAL: Generate the current audio output for this frame based on our internal state.
+    void GenerateFrame();
+    /// INTERNAL: Dequeues a buffer and does preprocessing on it (decoding, resampling). Puts it into current_buffer.
+    bool DequeueBuffer();
+    /// INTERNAL: Generates a SourceStatus::Status based on our internal state.
+    SourceStatus::Status GetCurrentStatus();
+};
+
+} // namespace HLE
+} // namespace DSP
diff --git a/src/audio_core/sdl2_sink.cpp b/src/audio_core/sdl2_sink.cpp
new file mode 100644
index 000000000..dc75c04ee
--- /dev/null
+++ b/src/audio_core/sdl2_sink.cpp
@@ -0,0 +1,126 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <list>
+#include <vector>
+
+#include <SDL.h>
+
+#include "audio_core/audio_core.h"
+#include "audio_core/sdl2_sink.h"
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include <numeric>
+
+namespace AudioCore {
+
+struct SDL2Sink::Impl {
+    unsigned int sample_rate = 0; ///< Set from obtained_audiospec.freq once a device has been opened.
+    /// Id returned by SDL_OpenAudioDevice; 0 means no device is open.
+    SDL_AudioDeviceID audio_device_id = 0;
+    /// Pending chunks of interleaved stereo s16 data, oldest first; EnqueueSamples/SamplesInQueue access it under the device lock.
+    std::list<std::vector<s16>> queue;
+    /// Audio callback registered with SDL; impl_ is the Impl* that was passed as userdata.
+    static void Callback(void* impl_, u8* buffer, int buffer_size_in_bytes);
+};
+
+SDL2Sink::SDL2Sink() : impl(std::make_unique<Impl>()) {
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        LOG_CRITICAL(Audio_Sink, "SDL_Init(SDL_INIT_AUDIO) failed with: %s", SDL_GetError());
+        impl->audio_device_id = 0;
+        return;
+    }
+    // Ask for interleaved stereo s16 at native_sample_rate, pulled by SDL through Impl::Callback.
+    SDL_AudioSpec desired_audiospec;
+    SDL_zero(desired_audiospec);
+    desired_audiospec.format = AUDIO_S16;
+    desired_audiospec.channels = 2;
+    desired_audiospec.freq = native_sample_rate;
+    desired_audiospec.samples = 1024;
+    desired_audiospec.userdata = impl.get();
+    desired_audiospec.callback = &Impl::Callback;
+
+    SDL_AudioSpec obtained_audiospec;
+    SDL_zero(obtained_audiospec);
+    // Arguments: default device (nullptr), playback not capture (0), no allowed format changes (0).
+    impl->audio_device_id = SDL_OpenAudioDevice(nullptr, 0, &desired_audiospec, &obtained_audiospec, 0);
+    if (impl->audio_device_id == 0) { // The id is unsigned; 0 is the failure value.
+        LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed with: %s", SDL_GetError());
+        return;
+    }
+
+    impl->sample_rate = obtained_audiospec.freq;
+
+    // SDL2 audio devices start out paused, unpause it:
+    SDL_PauseAudioDevice(impl->audio_device_id, 0);
+}
+
+SDL2Sink::~SDL2Sink() {
+    // An id of 0 means the constructor never managed to open a device.
+    if (impl->audio_device_id > 0) {
+        SDL_CloseAudioDevice(impl->audio_device_id);
+    }
+}
+
+unsigned int SDL2Sink::GetNativeSampleRate() const {
+    // Without an open device there is no obtained rate, so fall back to native_sample_rate.
+    const bool has_device = impl->audio_device_id > 0;
+
+    return has_device ? impl->sample_rate : native_sample_rate;
+}
+
+void SDL2Sink::EnqueueSamples(const std::vector<s16>& samples) {
+    const SDL_AudioDeviceID device = impl->audio_device_id;
+    if (device == 0) // No device was opened; the samples are dropped.
+        return;
+
+    ASSERT_MSG(samples.size() % 2 == 0, "Samples must be in interleaved stereo PCM16 format (size must be a multiple of two)");
+    SDL_LockAudioDevice(device); // Hold the device lock while the queue is modified.
+    impl->queue.push_back(samples);
+    SDL_UnlockAudioDevice(device);
+}
+
+size_t SDL2Sink::SamplesInQueue() const {
+    const SDL_AudioDeviceID device = impl->audio_device_id;
+    if (device == 0)
+        return 0;
+
+    size_t stereo_samples = 0;
+    // Hold the device lock while walking the queue.
+    SDL_LockAudioDevice(device);
+    for (const std::vector<s16>& chunk : impl->queue) {
+        // Every stereo sample occupies two s16 entries.
+        stereo_samples += chunk.size() / 2;
+    }
+    SDL_UnlockAudioDevice(device);
+
+    return stereo_samples;
+}
+
+void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) {
+    Impl* impl = static_cast<Impl*>(impl_);
+    size_t samples_left = static_cast<size_t>(buffer_size_in_bytes) / sizeof(s16); // Counted in s16 units, not bytes.
+
+    while (samples_left > 0 && !impl->queue.empty()) {
+        std::vector<s16>& front = impl->queue.front();
+        const bool consume_all = front.size() <= samples_left;
+        const size_t count = consume_all ? front.size() : samples_left;
+        memcpy(buffer, front.data(), count * sizeof(s16));
+        buffer += count * sizeof(s16);
+        samples_left -= count;
+        if (consume_all) {
+            impl->queue.pop_front();
+        } else {
+            front.erase(front.begin(), front.begin() + count);
+        }
+    }
+
+    // Zero-fill whatever the queue could not supply.
+    if (samples_left > 0) {
+        memset(buffer, 0, samples_left * sizeof(s16));
+    }
+}
+
+} // namespace AudioCore
diff --git a/src/audio_core/sdl2_sink.h b/src/audio_core/sdl2_sink.h
new file mode 100644
index 000000000..0f296b673
--- /dev/null
+++ b/src/audio_core/sdl2_sink.h
@@ -0,0 +1,30 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+
+#include "audio_core/sink.h"
+
+namespace AudioCore {
+
+class SDL2Sink final : public Sink {
+public:
+    SDL2Sink();
+    ~SDL2Sink() override;
+    /// Rate of the opened SDL device, or native_sample_rate if no device could be opened.
+    unsigned int GetNativeSampleRate() const override;
+    /// Appends interleaved stereo s16 samples to the playback queue; does nothing without an open device.
+    void EnqueueSamples(const std::vector<s16>& samples) override;
+    /// Number of stereo samples still queued (each is two s16 values); 0 without an open device.
+    size_t SamplesInQueue() const override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> impl;
+};
+
+} // namespace AudioCore
diff --git a/src/audio_core/sink.h b/src/audio_core/sink.h
index cad21a85e..1c881c3d2 100644
--- a/src/audio_core/sink.h
+++ b/src/audio_core/sink.h
@@ -19,7 +19,7 @@ public:
     virtual ~Sink() = default;
 
     /// The native rate of this sink. The sink expects to be fed samples that respect this. (Units: samples/sec)
-    virtual unsigned GetNativeSampleRate() const = 0;
+    virtual unsigned int GetNativeSampleRate() const = 0;
 
     /**
      * Feed stereo samples to sink.
diff --git a/src/audio_core/sink_details.cpp b/src/audio_core/sink_details.cpp
index d2cc74103..ba5e83d17 100644
--- a/src/audio_core/sink_details.cpp
+++ b/src/audio_core/sink_details.cpp
@@ -8,10 +8,17 @@
 #include "audio_core/null_sink.h"
 #include "audio_core/sink_details.h"
 
+#ifdef HAVE_SDL2
+#include "audio_core/sdl2_sink.h"
+#endif
+
 namespace AudioCore {
 
 // g_sink_details is ordered in terms of desirability, with the best choice at the top.
 const std::vector<SinkDetails> g_sink_details = {
+#ifdef HAVE_SDL2 // Defined by src/audio_core/CMakeLists.txt when SDL2 was found.
+    { "sdl2", []() { return std::make_unique<SDL2Sink>(); } },
+#endif // HAVE_SDL2
     { "null", []() { return std::make_unique<NullSink>(); } },
 };
 
diff --git a/src/citra/config.cpp b/src/citra/config.cpp
index 1b14c0b1c..684eba338 100644
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@@ -88,7 +88,7 @@ void Config::ReadValues() {
 
     // Debugging
     Settings::values.use_gdbstub = sdl2_config->GetBoolean("Debugging", "use_gdbstub", false);
-    Settings::values.gdbstub_port = sdl2_config->GetInteger("Debugging", "gdbstub_port", 24689);
+    Settings::values.gdbstub_port = static_cast<u16>(sdl2_config->GetInteger("Debugging", "gdbstub_port", 24689));
 }
 
 void Config::Reload() {
diff --git a/src/citra/default_ini.h b/src/citra/default_ini.h
index e6647a277..859185317 100644
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@@ -58,7 +58,7 @@ bg_green =
 
 [Audio]
 # Which audio output engine to use.
-# auto (default): Auto-select, null: No audio output
+# auto (default): Auto-select, null: No audio output, sdl2: SDL2 (if available)
 output_engine =
 
 [Data Storage]
diff --git a/src/citra/emu_window/emu_window_sdl2.cpp b/src/citra/emu_window/emu_window_sdl2.cpp
index 924189f4c..12cdd9d95 100644
--- a/src/citra/emu_window/emu_window_sdl2.cpp
+++ b/src/citra/emu_window/emu_window_sdl2.cpp
@@ -9,6 +9,8 @@
 #define SDL_MAIN_HANDLED
 #include <SDL.h>
 
+#include <glad/glad.h>
+
 #include "common/key_map.h"
 #include "common/logging/log.h"
 #include "common/scm_rev.h"
@@ -98,6 +100,11 @@ EmuWindow_SDL2::EmuWindow_SDL2() {
         exit(1);
     }
 
+    if (!gladLoadGLLoader(static_cast<GLADloadproc>(SDL_GL_GetProcAddress))) {
+        LOG_CRITICAL(Frontend, "Failed to initialize GL functions! Exiting...");
+        exit(1);
+    }
+
     OnResize();
     OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size);
     SDL_PumpEvents();
diff --git a/src/citra_qt/CMakeLists.txt b/src/citra_qt/CMakeLists.txt
index cc9e0c624..3f0099200 100644
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@@ -55,6 +55,7 @@ set(HEADERS
             configure_dialog.h
             configure_general.h
             game_list.h
+            game_list_p.h
             hotkeys.h
             main.h
             ui_settings.h
diff --git a/src/citra_qt/debugger/graphics_breakpoints.cpp b/src/citra_qt/debugger/graphics_breakpoints.cpp
index c8510128a..fe66918a8 100644
--- a/src/citra_qt/debugger/graphics_breakpoints.cpp
+++ b/src/citra_qt/debugger/graphics_breakpoints.cpp
@@ -44,7 +44,7 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const
                 { Pica::DebugContext::Event::PicaCommandProcessed, tr("Pica command processed") },
                 { Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") },
                 { Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") },
-                { Pica::DebugContext::Event::VertexLoaded, tr("Vertex loaded") },
+                { Pica::DebugContext::Event::VertexShaderInvocation, tr("Vertex shader invocation") },
                 { Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") },
                 { Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") },
                 { Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") }
diff --git a/src/citra_qt/debugger/graphics_vertex_shader.cpp b/src/citra_qt/debugger/graphics_vertex_shader.cpp
index d648d4640..391666d35 100644
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@@ -365,7 +365,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De
         input_data[i]->setValidator(new QDoubleValidator(input_data[i]));
     }
 
-    breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)"));
+    breakpoint_warning = new QLabel(tr("(data only available at vertex shader invocation breakpoints)"));
 
     // TODO: Add some button for jumping to the shader entry point
 
@@ -454,7 +454,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De
 
 void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
     auto input = static_cast<Pica::Shader::InputVertex*>(data);
-    if (event == Pica::DebugContext::Event::VertexLoaded) {
+    if (event == Pica::DebugContext::Event::VertexShaderInvocation) {
         Reload(true, data);
     } else {
         // No vertex data is retrievable => invalidate currently stored vertex data
@@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
     info.labels.insert({ entry_point, "main" });
 
     // Generate debug information
-    debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
+    debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
 
     // Reload widget state
     for (int attr = 0; attr < num_attributes; ++attr) {
@@ -515,7 +515,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
     }
 
     // Initialize debug info text for current cycle count
-    cycle_index->setMaximum(debug_data.records.size() - 1);
+    cycle_index->setMaximum(static_cast<int>(debug_data.records.size() - 1));
     OnCycleIndexChanged(cycle_index->value());
 
     model->endResetModel();
diff --git a/src/citra_qt/game_list.cpp b/src/citra_qt/game_list.cpp
index d14532102..d4ac9c96e 100644
--- a/src/citra_qt/game_list.cpp
+++ b/src/citra_qt/game_list.cpp
@@ -34,8 +34,8 @@ GameList::GameList(QWidget* parent)
     tree_view->setUniformRowHeights(true);
 
     item_model->insertColumns(0, COLUMN_COUNT);
-    item_model->setHeaderData(COLUMN_FILE_TYPE, Qt::Horizontal, "File type");
     item_model->setHeaderData(COLUMN_NAME, Qt::Horizontal, "Name");
+    item_model->setHeaderData(COLUMN_FILE_TYPE, Qt::Horizontal, "File type");
     item_model->setHeaderData(COLUMN_SIZE, Qt::Horizontal, "Size");
 
     connect(tree_view, SIGNAL(activated(const QModelIndex&)), this, SLOT(ValidateEntry(const QModelIndex&)));
@@ -109,7 +109,11 @@ void GameList::SaveInterfaceLayout()
 void GameList::LoadInterfaceLayout()
 {
     auto header = tree_view->header();
-    header->restoreState(UISettings::values.gamelist_header_state);
+    if (!header->restoreState(UISettings::values.gamelist_header_state)) {
+        // We are using the name column to display icons and titles
+        // so make it as large as possible as default.
+        header->resizeSection(COLUMN_NAME, header->width());
+    }
 
     item_model->sort(header->sortIndicatorSection(), header->sortIndicatorOrder());
 }
@@ -143,9 +147,15 @@ void GameListWorker::AddFstEntriesToGameList(const std::string& dir_path, bool d
                 LOG_WARNING(Frontend, "Filetype and extension of file %s do not match.", physical_name.c_str());
             }
 
+            std::vector<u8> smdh;
+            std::unique_ptr<Loader::AppLoader> loader = Loader::GetLoader(FileUtil::IOFile(physical_name, "rb"), filetype, filename_filename, physical_name);
+
+            if (loader)
+                loader->ReadIcon(smdh);
+
             emit EntryReady({
+                new GameListItemPath(QString::fromStdString(physical_name), smdh),
                 new GameListItem(QString::fromStdString(Loader::GetFileTypeString(filetype))),
-                new GameListItemPath(QString::fromStdString(physical_name)),
                 new GameListItemSize(FileUtil::GetSize(physical_name)),
             });
         }
diff --git a/src/citra_qt/game_list.h b/src/citra_qt/game_list.h
index 48febdc60..198674f04 100644
--- a/src/citra_qt/game_list.h
+++ b/src/citra_qt/game_list.h
@@ -20,8 +20,8 @@ class GameList : public QWidget {
 
 public:
     enum {
-        COLUMN_FILE_TYPE,
         COLUMN_NAME,
+        COLUMN_FILE_TYPE,
         COLUMN_SIZE,
         COLUMN_COUNT, // Number of columns
     };
diff --git a/src/citra_qt/game_list_p.h b/src/citra_qt/game_list_p.h
index 820012bce..284f5da81 100644
--- a/src/citra_qt/game_list_p.h
+++ b/src/citra_qt/game_list_p.h
@@ -6,13 +6,85 @@
 
 #include <atomic>
 
+#include <QImage>
 #include <QRunnable>
 #include <QStandardItem>
 #include <QString>
 
 #include "citra_qt/util/util.h"
 #include "common/string_util.h"
+#include "common/color.h"
 
+#include "core/loader/loader.h"
+
+#include "video_core/utils.h"
+
+/**
+ * Tests if data is a valid SMDH by its length and magic number.
+ * @param smdh_data data buffer to test
+ * @return bool test result
+ */
+static bool IsValidSMDH(const std::vector<u8>& smdh_data) {
+    if (smdh_data.size() < sizeof(Loader::SMDH))
+        return false;
+
+    u32 magic;
+    memcpy(&magic, smdh_data.data(), 4);
+
+    return Loader::MakeMagic('S', 'M', 'D', 'H') == magic;
+}
+
+/**
+ * Gets game icon from SMDH
+ * @param smdh SMDH data
+ * @param large If true, returns large icon (48x48), otherwise returns small icon (24x24)
+ * @return QPixmap game icon
+ */
+static QPixmap GetIconFromSMDH(const Loader::SMDH& smdh, bool large) {
+    u32 size;
+    const u8* icon_data;
+
+    if (large) {
+        size = 48;
+        icon_data = smdh.large_icon.data();
+    } else {
+        size = 24;
+        icon_data = smdh.small_icon.data();
+    }
+
+    QImage icon(size, size, QImage::Format::Format_RGB888);
+    for (u32 x = 0; x < size; ++x) {
+        for (u32 y = 0; y < size; ++y) {
+            u32 coarse_y = y & ~7;
+            auto v = Color::DecodeRGB565(
+                icon_data + VideoCore::GetMortonOffset(x, y, 2) + coarse_y * size * 2);
+            icon.setPixel(x, y, qRgb(v.r(), v.g(), v.b()));
+        }
+    }
+    return QPixmap::fromImage(icon);
+}
+
+/**
+ * Gets the default icon (for games without valid SMDH)
+ * @param large If true, returns large icon (48x48), otherwise returns small icon (24x24)
+ * @return QPixmap default icon
+ */
+static QPixmap GetDefaultIcon(bool large) {
+    int size = large ? 48 : 24;
+    QPixmap icon(size, size);
+    icon.fill(Qt::transparent);
+    return icon;
+}
+
+/**
+ * Gets the short game title from SMDH
+ * @param smdh SMDH data
+ * @param language title language
+ * @return QString short title
+ */
+static QString GetShortTitleFromSMDH(const Loader::SMDH& smdh, Loader::SMDH::TitleLanguage language) {
+    return QString::fromUtf16(smdh.titles[static_cast<int>(language)].short_title.data());
+}
 
 class GameListItem : public QStandardItem {
 
@@ -27,29 +99,43 @@ public:
  * A specialization of GameListItem for path values.
  * This class ensures that for every full path value it holds, a correct string representation
  * of just the filename (with no extension) will be displayed to the user.
+ * If this class receives valid SMDH data, it will also display game icons and titles.
  */
 class GameListItemPath : public GameListItem {
 
 public:
     static const int FullPathRole = Qt::UserRole + 1;
+    static const int TitleRole = Qt::UserRole + 2;
 
     GameListItemPath(): GameListItem() {}
-    GameListItemPath(const QString& game_path): GameListItem()
+    GameListItemPath(const QString& game_path, const std::vector<u8>& smdh_data): GameListItem()
     {
         setData(game_path, FullPathRole);
+
+        if (!IsValidSMDH(smdh_data)) {
+            // SMDH is not valid, set a default icon
+            setData(GetDefaultIcon(true), Qt::DecorationRole);
+            return;
+        }
+
+        Loader::SMDH smdh;
+        memcpy(&smdh, smdh_data.data(), sizeof(Loader::SMDH));
+
+        // Get icon from SMDH
+        setData(GetIconFromSMDH(smdh, true), Qt::DecorationRole);
+
+        // Get title from SMDH
+        setData(GetShortTitleFromSMDH(smdh, Loader::SMDH::TitleLanguage::English), TitleRole);
     }
 
-    void setData(const QVariant& value, int role) override
-    {
-        // By specializing setData for FullPathRole, we can ensure that the two string
-        // representations of the data are always accurate and in the correct format.
-        if (role == FullPathRole) {
+    QVariant data(int role) const override {
+        if (role == Qt::DisplayRole) {
             std::string filename;
-            Common::SplitPath(value.toString().toStdString(), nullptr, &filename, nullptr);
-            GameListItem::setData(QString::fromStdString(filename), Qt::DisplayRole);
-            GameListItem::setData(value, FullPathRole);
+            Common::SplitPath(data(FullPathRole).toString().toStdString(), nullptr, &filename, nullptr);
+            QString title = data(TitleRole).toString();
+            return QString::fromStdString(filename) + (title.isEmpty() ? "" : "\n    " + title);
         } else {
-            GameListItem::setData(value, role);
+            return GameListItem::data(role);
         }
     }
 };
diff --git a/src/citra_qt/main.cpp b/src/citra_qt/main.cpp
index f1ab29755..a85c94a4b 100644
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@@ -6,6 +6,9 @@
 #include <memory>
 #include <thread>
 
+#include <glad/glad.h>
+
+#define QT_NO_OPENGL
 #include <QDesktopWidget>
 #include <QtGui>
 #include <QFileDialog>
@@ -240,6 +243,14 @@ bool GMainWindow::InitializeSystem() {
     if (emu_thread != nullptr)
         ShutdownGame();
 
+    render_window->MakeCurrent();
+    if (!gladLoadGL()) {
+        QMessageBox::critical(this, tr("Error while starting Citra!"),
+                              tr("Failed to initialize the video core!\n\n"
+                                 "Please ensure that your GPU supports OpenGL 3.3 and that you have the latest graphics driver."));
+        return false;
+    }
+
     // Initialize the core emulation
     System::Result system_result = System::Init(render_window);
     if (System::Result::Success != system_result) {
diff --git a/src/citra_qt/util/util.cpp b/src/citra_qt/util/util.cpp
index 8734a8efd..2f9beb5cc 100644
--- a/src/citra_qt/util/util.cpp
+++ b/src/citra_qt/util/util.cpp
@@ -19,7 +19,9 @@ QString ReadableByteSize(qulonglong size) {
     static const std::array<const char*, 6> units = { "B", "KiB", "MiB", "GiB", "TiB", "PiB" };
     if (size == 0)
         return "0";
-    int digit_groups = std::min<int>((int)(std::log10(size) / std::log10(1024)), units.size());
+    // Clamp to the last valid index of units (size() - 1): for sizes of 1024^6 bytes or more the
+    // logarithm ratio reaches units.size(), which would index past the end of the array below.
+    int digit_groups = std::min<int>(static_cast<int>(std::log10(size) / std::log10(1024)), static_cast<int>(units.size()) - 1);
     return QString("%L1 %2").arg(size / std::pow(1024, digit_groups), 0, 'f', 1)
                             .arg(units[digit_groups]);
 }
diff --git a/src/common/logging/backend.cpp b/src/common/logging/backend.cpp
index 3d39f94d5..d7008fc66 100644
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@@ -65,6 +65,7 @@ namespace Log {
         SUB(Render, OpenGL) \
         CLS(Audio) \
         SUB(Audio, DSP) \
+        SUB(Audio, Sink) \
         CLS(Loader)
 
 // GetClassName is a macro defined by Windows.h, grrr...
diff --git a/src/common/logging/log.h b/src/common/logging/log.h
index 521362317..c6910b1c7 100644
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@@ -78,8 +78,9 @@ enum class Class : ClassType {
     Render,                     ///< Emulator video output and hardware acceleration
     Render_Software,            ///< Software renderer backend
     Render_OpenGL,              ///< OpenGL backend
-    Audio,                      ///< Emulator audio output
+    Audio,                      ///< Audio emulation
     Audio_DSP,                  ///< The HLE implementation of the DSP
+    Audio_Sink,                 ///< Emulator audio output backend
     Loader,                     ///< ROM loader
 
     Count ///< Total number of logging classes
diff --git a/src/common/swap.h b/src/common/swap.h
index a7c37bc44..1749bd7a4 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -25,6 +25,8 @@
     #include <sys/endian.h>
 #endif
 
+#include <cstring>
+
 #include "common/common_types.h"
 
 // GCC 4.6+
@@ -58,9 +60,6 @@
 
 namespace Common {
 
-inline u8 swap8(u8 _data) {return _data;}
-inline u32 swap24(const u8* _data) {return (_data[0] << 16) | (_data[1] << 8) | _data[2];}
-
 #ifdef _MSC_VER
 inline u16 swap16(u16 _data) {return _byteswap_ushort(_data);}
 inline u32 swap32(u32 _data) {return _byteswap_ulong (_data);}
@@ -92,52 +91,29 @@ inline u64 swap64(u64 data) {return ((u64)swap32(data) << 32) | swap32(data >> 3
 #endif
 
 inline float swapf(float f) {
-    union {
-        float f;
-        unsigned int u32;
-    } dat1, dat2;
+    static_assert(sizeof(u32) == sizeof(float),
+                  "float must be the same size as uint32_t.");
 
-    dat1.f = f;
-    dat2.u32 = swap32(dat1.u32);
+    u32 value;
+    std::memcpy(&value, &f, sizeof(u32));
 
-    return dat2.f;
+    value = swap32(value);
+    std::memcpy(&f, &value, sizeof(u32));
+
+    return f;
 }
 
 inline double swapd(double f) {
-    union  {
-        double f;
-        unsigned long long u64;
-    } dat1, dat2;
+    static_assert(sizeof(u64) == sizeof(double),
+                  "double must be the same size as uint64_t.");
 
-    dat1.f = f;
-    dat2.u64 = swap64(dat1.u64);
+    u64 value;
+    std::memcpy(&value, &f, sizeof(u64));
 
-    return dat2.f;
-}
+    value = swap64(value);
+    std::memcpy(&f, &value, sizeof(u64));
 
-inline u16 swap16(const u8* _pData) {return swap16(*(const u16*)_pData);}
-inline u32 swap32(const u8* _pData) {return swap32(*(const u32*)_pData);}
-inline u64 swap64(const u8* _pData) {return swap64(*(const u64*)_pData);}
-
-template <int count>
-void swap(u8*);
-
-template <>
-inline void swap<1>(u8* data) { }
-
-template <>
-inline void swap<2>(u8* data) {
-    *reinterpret_cast<u16*>(data) = swap16(data);
-}
-
-template <>
-inline void swap<4>(u8* data) {
-    *reinterpret_cast<u32*>(data) = swap32(data);
-}
-
-template <>
-inline void swap<8>(u8* data) {
-    *reinterpret_cast<u64*>(data) = swap64(data);
+    return f;
 }
 
 }  // Namespace Common
@@ -534,35 +510,35 @@ bool operator==(const S &p, const swap_struct_t<T, F> v) {
 template <typename T>
 struct swap_64_t {
     static T swap(T x) {
-        return (T)Common::swap64(*(u64 *)&x);
+        return static_cast<T>(Common::swap64(x));
     }
 };
 
 template <typename T>
 struct swap_32_t {
     static T swap(T x) {
-        return (T)Common::swap32(*(u32 *)&x);
+        return static_cast<T>(Common::swap32(x));
     }
 };
 
 template <typename T>
 struct swap_16_t {
     static T swap(T x) {
-        return (T)Common::swap16(*(u16 *)&x);
+        return static_cast<T>(Common::swap16(x));
     }
 };
 
 template <typename T>
 struct swap_float_t {
     static T swap(T x) {
-        return (T)Common::swapf(*(float *)&x);
+        return static_cast<T>(Common::swapf(x));
     }
 };
 
 template <typename T>
 struct swap_double_t {
     static T swap(T x) {
-        return (T)Common::swapd(*(double *)&x);
+        return static_cast<T>(Common::swapd(x));
     }
 };
 
diff --git a/src/core/arm/dyncom/arm_dyncom.cpp b/src/core/arm/dyncom/arm_dyncom.cpp
index a3581132c..13492a08b 100644
--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@@ -93,7 +93,7 @@ void ARM_DynCom::ResetContext(Core::ThreadContext& context, u32 stack_top, u32 e
     context.cpu_registers[0] = arg;
     context.pc = entry_point;
     context.sp = stack_top;
-    context.cpsr = 0x1F | ((entry_point & 1) << 5); // Usermode and THUMB mode
+    context.cpsr = USER32MODE | ((entry_point & 1) << 5); // Usermode and THUMB mode
 }
 
 void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) {
diff --git a/src/core/core.cpp b/src/core/core.cpp
index 3bb843aab..cabab744a 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -51,7 +51,7 @@ void RunLoop(int tight_loop) {
     }
 
     HW::Update();
-    if (HLE::g_reschedule) {
+    if (HLE::IsReschedulePending()) {
         Kernel::Reschedule();
     }
 }
diff --git a/src/core/gdbstub/gdbstub.cpp b/src/core/gdbstub/gdbstub.cpp
index ae0c116ef..820b19e1a 100644
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@@ -374,7 +374,7 @@ static void SendReply(const char* reply) {
 
     memset(command_buffer, 0, sizeof(command_buffer));
 
-    command_length = strlen(reply);
+    command_length = static_cast<u32>(strlen(reply));
     if (command_length + 4 > sizeof(command_buffer)) {
         LOG_ERROR(Debug_GDBStub, "command_buffer overflow in SendReply");
         return;
@@ -437,7 +437,7 @@ static void HandleSetThread() {
  *
  * @param signal Signal to be sent to client.
  */
-void SendSignal(u32 signal) {
+static void SendSignal(u32 signal) {
     if (gdbserver_socket == -1) {
         return;
     }
@@ -515,7 +515,7 @@ static bool IsDataAvailable() {
         return false;
     }
 
-    return FD_ISSET(gdbserver_socket, &fd_socket);
+    return FD_ISSET(gdbserver_socket, &fd_socket) != 0;
 }
 
 /// Send requested register to gdb client.
@@ -633,10 +633,10 @@ static void ReadMemory() {
 
     auto start_offset = command_buffer+1;
     auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));
 
     start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));
 
     LOG_DEBUG(Debug_GDBStub, "gdb: addr: %08x len: %08x\n", addr, len);
 
@@ -658,11 +658,11 @@ static void ReadMemory() {
 static void WriteMemory() {
     auto start_offset = command_buffer+1;
     auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));
 
     start_offset = addr_pos+1;
     auto len_pos = std::find(start_offset, command_buffer+command_length, ':');
-    u32 len = HexToInt(start_offset, len_pos - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>(len_pos - start_offset));
 
     u8* dst = Memory::GetPointer(addr);
     if (!dst) {
@@ -713,7 +713,7 @@ static void Continue() {
  * @param addr Address of breakpoint.
  * @param len Length of breakpoint.
  */
-bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
+static bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
     std::map<u32, Breakpoint>& p = GetBreakpointList(type);
 
     Breakpoint breakpoint;
@@ -752,10 +752,10 @@ static void AddBreakpoint() {
 
     auto start_offset = command_buffer+3;
     auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));
 
     start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));
 
     if (type == BreakpointType::Access) {
         // Access is made up of Read and Write types, so add both breakpoints
@@ -800,10 +800,10 @@ static void RemoveBreakpoint() {
 
     auto start_offset = command_buffer+3;
     auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));
 
     start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));
 
     if (type == BreakpointType::Access) {
         // Access is made up of Read and Write types, so add both breakpoints
@@ -907,7 +907,7 @@ void ToggleServer(bool status) {
     }
 }
 
-void Init(u16 port) {
+static void Init(u16 port) {
     if (!g_server_enabled) {
         // Set the halt loop to false in case the user enabled the gdbstub mid-execution.
         // This way the CPU can still execute normally.
diff --git a/src/core/hle/applets/mii_selector.cpp b/src/core/hle/applets/mii_selector.cpp
index 708d2f630..b4456ca90 100644
--- a/src/core/hle/applets/mii_selector.cpp
+++ b/src/core/hle/applets/mii_selector.cpp
@@ -21,13 +21,6 @@
 namespace HLE {
 namespace Applets {
 
-MiiSelector::MiiSelector(Service::APT::AppletId id) : Applet(id), started(false) {
-    // Create the SharedMemory that will hold the framebuffer data
-    // TODO(Subv): What size should we use here?
-    using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(0x1000, MemoryPermission::ReadWrite, MemoryPermission::ReadWrite, "MiiSelector Memory");
-}
-
 ResultCode MiiSelector::ReceiveParameter(const Service::APT::MessageParameter& parameter) {
     if (parameter.signal != static_cast<u32>(Service::APT::SignalType::LibAppJustStarted)) {
         LOG_ERROR(Service_APT, "unsupported signal %u", parameter.signal);
@@ -36,8 +29,18 @@ ResultCode MiiSelector::ReceiveParameter(const Service::APT::MessageParameter& p
         return ResultCode(-1);
     }
 
+    // The LibAppJustStarted message contains a buffer with the size of the framebuffer shared memory.
+    // Create the SharedMemory that will hold the framebuffer data
+    Service::APT::CaptureBufferInfo capture_info;
+    ASSERT(sizeof(capture_info) == parameter.buffer_size);
+
+    memcpy(&capture_info, parameter.data, sizeof(capture_info));
+    using Kernel::MemoryPermission;
+    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
+                                                      MemoryPermission::ReadWrite, "MiiSelector Memory");
+
+    // Send the response message with the newly created SharedMemory
     Service::APT::MessageParameter result;
-    // The buffer passed in parameter contains the data returned by GSPGPU::ImportDisplayCaptureInfo
     result.signal = static_cast<u32>(Service::APT::SignalType::LibAppFinished);
     result.data = nullptr;
     result.buffer_size = 0;
@@ -55,6 +58,11 @@ ResultCode MiiSelector::StartImpl(const Service::APT::AppletStartupParameter& pa
     // TODO(Subv): Set the expected fields in the response buffer before resending it to the application.
     // TODO(Subv): Reverse the parameter format for the Mii Selector
 
+    if(parameter.buffer_size >= sizeof(u32)) {
+        // TODO: defaults return no error, but garbage in other unknown fields
+        memset(parameter.data, 0, sizeof(u32));
+    }
+
     // Let the application know that we're closing
     Service::APT::MessageParameter message;
     message.buffer_size = parameter.buffer_size;
diff --git a/src/core/hle/applets/mii_selector.h b/src/core/hle/applets/mii_selector.h
index 6a3e7c8eb..be6b04642 100644
--- a/src/core/hle/applets/mii_selector.h
+++ b/src/core/hle/applets/mii_selector.h
@@ -16,17 +16,61 @@
 namespace HLE {
 namespace Applets {
 
+struct MiiConfig {
+    u8  unk_000;
+    u8  unk_001;
+    u8  unk_002;
+    u8  unk_003;
+    u8  unk_004;
+    INSERT_PADDING_BYTES(3);
+    u16 unk_008;
+    INSERT_PADDING_BYTES(0x8C - 0xA);
+    u8  unk_08C;
+    INSERT_PADDING_BYTES(3);
+    u16 unk_090;
+    INSERT_PADDING_BYTES(2);
+    u32 unk_094;
+    u16 unk_098;
+    u8  unk_09A[0x64];
+    u8  unk_0FE;
+    u8  unk_0FF;
+    u32 unk_100;
+};
+
+static_assert(sizeof(MiiConfig) == 0x104, "MiiConfig structure has incorrect size");
+#define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(MiiConfig, field_name) == position, "Field "#field_name" has invalid position")
+ASSERT_REG_POSITION(unk_008, 0x08);
+ASSERT_REG_POSITION(unk_08C, 0x8C);
+ASSERT_REG_POSITION(unk_090, 0x90);
+ASSERT_REG_POSITION(unk_094, 0x94);
+ASSERT_REG_POSITION(unk_0FE, 0xFE);
+#undef ASSERT_REG_POSITION
+
+struct MiiResult {
+    u32 result_code;
+    u8 unk_04;
+    INSERT_PADDING_BYTES(7);
+    u8 unk_0C[0x60];
+    u8 unk_6C[0x16];
+    INSERT_PADDING_BYTES(2);
+};
+static_assert(sizeof(MiiResult) == 0x84, "MiiResult structure has incorrect size");
+#define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(MiiResult, field_name) == position, "Field "#field_name" has invalid position")
+ASSERT_REG_POSITION(unk_0C, 0x0C);
+ASSERT_REG_POSITION(unk_6C, 0x6C);
+#undef ASSERT_REG_POSITION
+
 class MiiSelector final : public Applet {
 public:
-    MiiSelector(Service::APT::AppletId id);
+    MiiSelector(Service::APT::AppletId id) : Applet(id), started(false) { }
 
     ResultCode ReceiveParameter(const Service::APT::MessageParameter& parameter) override;
     ResultCode StartImpl(const Service::APT::AppletStartupParameter& parameter) override;
     void Update() override;
     bool IsRunning() const override { return started; }
 
-    /// TODO(Subv): Find out what this is actually used for.
-    /// It is believed that the application stores the current screen image here.
+    /// This SharedMemory will be created when we receive the LibAppJustStarted message.
+    /// It holds the framebuffer info retrieved by the application with GSPGPU::ImportDisplayCaptureInfo
     Kernel::SharedPtr<Kernel::SharedMemory> framebuffer_memory;
 
     /// Whether this applet is currently running instead of the host application or not.
diff --git a/src/core/hle/applets/swkbd.cpp b/src/core/hle/applets/swkbd.cpp
index 1db6b5a17..87238aa1c 100644
--- a/src/core/hle/applets/swkbd.cpp
+++ b/src/core/hle/applets/swkbd.cpp
@@ -24,13 +24,6 @@
 namespace HLE {
 namespace Applets {
 
-SoftwareKeyboard::SoftwareKeyboard(Service::APT::AppletId id) : Applet(id), started(false) {
-    // Create the SharedMemory that will hold the framebuffer data
-    // TODO(Subv): What size should we use here?
-    using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(0x1000, MemoryPermission::ReadWrite, MemoryPermission::ReadWrite, "SoftwareKeyboard Memory");
-}
-
 ResultCode SoftwareKeyboard::ReceiveParameter(Service::APT::MessageParameter const& parameter) {
     if (parameter.signal != static_cast<u32>(Service::APT::SignalType::LibAppJustStarted)) {
         LOG_ERROR(Service_APT, "unsupported signal %u", parameter.signal);
@@ -39,8 +32,19 @@ ResultCode SoftwareKeyboard::ReceiveParameter(Service::APT::MessageParameter con
         return ResultCode(-1);
     }
 
+    // The LibAppJustStarted message contains a buffer with the size of the framebuffer shared memory.
+    // Create the SharedMemory that will hold the framebuffer data
+    Service::APT::CaptureBufferInfo capture_info;
+    ASSERT(sizeof(capture_info) == parameter.buffer_size);
+
+    memcpy(&capture_info, parameter.data, sizeof(capture_info));
+
+    using Kernel::MemoryPermission;
+    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
+                                                      MemoryPermission::ReadWrite, "SoftwareKeyboard Memory");
+
+    // Send the response message with the newly created SharedMemory
     Service::APT::MessageParameter result;
-    // The buffer passed in parameter contains the data returned by GSPGPU::ImportDisplayCaptureInfo
     result.signal = static_cast<u32>(Service::APT::SignalType::LibAppFinished);
     result.data = nullptr;
     result.buffer_size = 0;
diff --git a/src/core/hle/applets/swkbd.h b/src/core/hle/applets/swkbd.h
index cb95b8d90..cf26a8fb7 100644
--- a/src/core/hle/applets/swkbd.h
+++ b/src/core/hle/applets/swkbd.h
@@ -53,8 +53,7 @@ static_assert(sizeof(SoftwareKeyboardConfig) == 0x400, "Software Keyboard Config
 
 class SoftwareKeyboard final : public Applet {
 public:
-    SoftwareKeyboard(Service::APT::AppletId id);
-    ~SoftwareKeyboard() {}
+    SoftwareKeyboard(Service::APT::AppletId id) : Applet(id), started(false) { }
 
     ResultCode ReceiveParameter(const Service::APT::MessageParameter& parameter) override;
     ResultCode StartImpl(const Service::APT::AppletStartupParameter& parameter) override;
@@ -72,8 +71,8 @@ public:
      */
     void Finalize();
 
-    /// TODO(Subv): Find out what this is actually used for.
-    /// It is believed that the application stores the current screen image here.
+    /// This SharedMemory will be created when we receive the LibAppJustStarted message.
+    /// It holds the framebuffer info retrieved by the application with GSPGPU::ImportDisplayCaptureInfo
     Kernel::SharedPtr<Kernel::SharedMemory> framebuffer_memory;
 
     /// SharedMemory where the output text will be stored
diff --git a/src/core/hle/hle.cpp b/src/core/hle/hle.cpp
index e545de3b5..5c5373517 100644
--- a/src/core/hle/hle.cpp
+++ b/src/core/hle/hle.cpp
@@ -12,9 +12,13 @@
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-namespace HLE {
+namespace {
 
-bool g_reschedule; ///< If true, immediately reschedules the CPU to a new thread
+bool reschedule; ///< If true, immediately reschedules the CPU to a new thread
+
+}
+
+namespace HLE {
 
 void Reschedule(const char *reason) {
     DEBUG_ASSERT_MSG(reason != nullptr && strlen(reason) < 256, "Reschedule: Invalid or too long reason.");
@@ -27,13 +31,21 @@ void Reschedule(const char *reason) {
 
     Core::g_app_core->PrepareReschedule();
 
-    g_reschedule = true;
+    reschedule = true;
+}
+
+bool IsReschedulePending() { // True once Reschedule() has set the flag and neither DoneRescheduling() nor Init() has cleared it since
+    return reschedule;
+}
+
+void DoneRescheduling() { // Clears the pending-reschedule flag; the Reschedule() in kernel/thread.cpp calls this after popping the next ready thread
+    reschedule = false;
 }
 
 void Init() {
     Service::Init();
 
-    g_reschedule = false;
+    reschedule = false;
 
     LOG_DEBUG(Kernel, "initialized OK");
 }
diff --git a/src/core/hle/hle.h b/src/core/hle/hle.h
index e0b97797c..69ac0ade6 100644
--- a/src/core/hle/hle.h
+++ b/src/core/hle/hle.h
@@ -13,9 +13,9 @@ const Handle INVALID_HANDLE = 0;
 
 namespace HLE {
 
-extern bool g_reschedule;   ///< If true, immediately reschedules the CPU to a new thread
-
 void Reschedule(const char *reason);
+bool IsReschedulePending();
+void DoneRescheduling();
 
 void Init();
 void Shutdown();
diff --git a/src/core/hle/kernel/process.h b/src/core/hle/kernel/process.h
index 6d2ca96a2..a06afef2b 100644
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@@ -107,6 +107,8 @@ public:
     ProcessFlags flags;
     /// Kernel compatibility version for this process
     u16 kernel_version = 0;
+    /// The default CPU for this process; threads are scheduled on this CPU by default.
+    u8 ideal_processor = 0;
 
     /// The id of this process
     u32 process_id = next_process_id++;
diff --git a/src/core/hle/kernel/thread.cpp b/src/core/hle/kernel/thread.cpp
index bf32f653d..6dc95d0f1 100644
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@@ -483,7 +483,8 @@ void Reschedule() {
 
     Thread* cur = GetCurrentThread();
     Thread* next = PopNextReadyThread();
-    HLE::g_reschedule = false;
+
+    HLE::DoneRescheduling();
 
     // Don't bother switching to the same thread
     if (next == cur)
diff --git a/src/core/hle/service/apt/apt.h b/src/core/hle/service/apt/apt.h
index 668b4a66f..1a1034fcc 100644
--- a/src/core/hle/service/apt/apt.h
+++ b/src/core/hle/service/apt/apt.h
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "common/common_types.h"
+#include "common/swap.h"
 
 #include "core/hle/kernel/kernel.h"
 
@@ -31,6 +32,20 @@ struct AppletStartupParameter {
     u8* data = nullptr;
 };
 
+/// Used by the application to pass information about the current framebuffer to applets.
+struct CaptureBufferInfo {
+    u32_le size; ///< Size passed to Kernel::SharedMemory::Create when SoftwareKeyboard allocates its framebuffer memory
+    u8 is_3d; ///< NOTE(review): presumably non-zero when stereoscopic 3D is active -- confirm
+    INSERT_PADDING_BYTES(0x3); // Padding for alignment
+    u32_le top_screen_left_offset; ///< NOTE(review): the *_offset fields are presumably offsets into the capture memory -- confirm
+    u32_le top_screen_right_offset;
+    u32_le top_screen_format;
+    u32_le bottom_screen_left_offset;
+    u32_le bottom_screen_right_offset;
+    u32_le bottom_screen_format;
+};
+static_assert(sizeof(CaptureBufferInfo) == 0x20, "CaptureBufferInfo struct has incorrect size"); // SoftwareKeyboard::ReceiveParameter asserts the incoming buffer_size equals this
+
 /// Signals used by APT functions
 enum class SignalType : u32 {
     None              = 0x0,
diff --git a/src/core/hle/service/dsp_dsp.cpp b/src/core/hle/service/dsp_dsp.cpp
index 995bee3f9..274fc751a 100644
--- a/src/core/hle/service/dsp_dsp.cpp
+++ b/src/core/hle/service/dsp_dsp.cpp
@@ -288,7 +288,7 @@ static void WriteProcessPipe(Service::Interface* self) {
     ASSERT_MSG(Memory::GetPointer(buffer) != nullptr, "Invalid Buffer: pipe=%u, size=0x%X, buffer=0x%08X", pipe_index, size, buffer);
 
     std::vector<u8> message(size);
-    for (size_t i = 0; i < size; i++) {
+    for (u32 i = 0; i < size; i++) {
         message[i] = Memory::Read8(buffer + i);
     }
 
@@ -403,7 +403,7 @@ static void GetPipeReadableSize(Service::Interface* self) {
 
     cmd_buff[0] = IPC::MakeHeader(0xF, 2, 0);
     cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-    cmd_buff[2] = DSP::HLE::GetPipeReadableSize(pipe);
+    cmd_buff[2] = static_cast<u32>(DSP::HLE::GetPipeReadableSize(pipe));
 
     LOG_DEBUG(Service_DSP, "pipe=%u, unknown=0x%08X, return cmd_buff[2]=0x%08X", pipe_index, unknown, cmd_buff[2]);
 }
diff --git a/src/core/hle/svc.cpp b/src/core/hle/svc.cpp
index fb2aecbf2..60c8747f3 100644
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@@ -496,6 +496,11 @@ static ResultCode CreateThread(Handle* out_handle, s32 priority, u32 entry_point
         break;
     }
 
+    if (processor_id == THREADPROCESSORID_1 || processor_id == THREADPROCESSORID_ALL ||
+        (processor_id == THREADPROCESSORID_DEFAULT && Kernel::g_current_process->ideal_processor == THREADPROCESSORID_1)) {
+        LOG_WARNING(Kernel_SVC, "Newly created thread is allowed to be run in the SysCore, unimplemented.");
+    }
+
     CASCADE_RESULT(SharedPtr<Thread> thread, Kernel::Thread::Create(
             name, entry_point, priority, arg, processor_id, stack_top));
     CASCADE_RESULT(*out_handle, Kernel::g_handle_table.Create(std::move(thread)));
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index 2fe856293..a4dfb7e43 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -188,10 +188,10 @@ inline void Write(u32 addr, const T data) {
                     u32 output_gap = config.texture_copy.output_gap * 16;
 
                     size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap);
-                    Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), contiguous_input_size);
+                    Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), static_cast<u32>(contiguous_input_size));
 
                     size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap);
-                    Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), contiguous_output_size);
+                    Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), static_cast<u32>(contiguous_output_size));
 
                     u32 remaining_size = config.texture_copy.size;
                     u32 remaining_input = input_width;
diff --git a/src/core/loader/3dsx.cpp b/src/core/loader/3dsx.cpp
index 5fb3b9e2b..98e7ab48f 100644
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@@ -178,11 +178,11 @@ static THREEDSX_Error Load3DSXFile(FileUtil::IOFile& file, u32 base_addr, Shared
                 for (unsigned current_inprogress = 0; current_inprogress < remaining && pos < end_pos; current_inprogress++) {
                     const auto& table = reloc_table[current_inprogress];
                     LOG_TRACE(Loader, "(t=%d,skip=%u,patch=%u)", current_segment_reloc_table,
-                              (u32)table.skip, (u32)table.patch);
+                              static_cast<u32>(table.skip), static_cast<u32>(table.patch));
                     pos += table.skip;
                     s32 num_patches = table.patch;
                     while (0 < num_patches && pos < end_pos) {
-                        u32 in_addr = (u8*)pos - program_image.data();
+                        u32 in_addr = static_cast<u32>(reinterpret_cast<u8*>(pos) - program_image.data());
                         u32 addr = TranslateAddr(*pos, &loadinfo, offsets);
                         LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)",
                                   base_addr + in_addr, addr, current_segment_reloc_table, *pos);
@@ -284,7 +284,7 @@ ResultStatus AppLoader_THREEDSX::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& ro
     // Check if the 3DSX has a RomFS...
     if (hdr.fs_offset != 0) {
         u32 romfs_offset = hdr.fs_offset;
-        u32 romfs_size = file.GetSize() - hdr.fs_offset;
+        u32 romfs_size = static_cast<u32>(file.GetSize()) - hdr.fs_offset;
 
         LOG_DEBUG(Loader, "RomFS offset:           0x%08X", romfs_offset);
         LOG_DEBUG(Loader, "RomFS size:             0x%08X", romfs_size);
@@ -303,4 +303,31 @@ ResultStatus AppLoader_THREEDSX::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& ro
     return ResultStatus::ErrorNotUsed;
 }
 
+// Copies the raw SMDH section (which holds the icon) of the 3DSX file into buffer.
+ResultStatus AppLoader_THREEDSX::ReadIcon(std::vector<u8>& buffer) {
+    if (!file.IsOpen())
+        return ResultStatus::Error;
+
+    // Reset read pointer in case this file has been read before.
+    file.Seek(0, SEEK_SET);
+
+    THREEDSX_Header hdr;
+    if (file.ReadBytes(&hdr, sizeof(THREEDSX_Header)) != sizeof(THREEDSX_Header))
+        return ResultStatus::Error;
+    if (hdr.header_size != sizeof(THREEDSX_Header))
+        return ResultStatus::Error;
+
+    // A zero offset means the 3DSX carries no SMDH, so there is no icon to return.
+    if (hdr.smdh_offset == 0)
+        return ResultStatus::ErrorNotUsed;
+
+    // Read the SMDH blob into the caller's buffer. data() is used rather than
+    // &buffer[0], which is undefined behaviour when smdh_size is 0 (empty vector).
+    file.Seek(hdr.smdh_offset, SEEK_SET);
+    buffer.resize(hdr.smdh_size);
+    if (file.ReadBytes(buffer.data(), hdr.smdh_size) != hdr.smdh_size)
+        return ResultStatus::Error;
+    return ResultStatus::Success;
+}
+
 } // namespace Loader
diff --git a/src/core/loader/3dsx.h b/src/core/loader/3dsx.h
index 365ddb7a5..3ee686703 100644
--- a/src/core/loader/3dsx.h
+++ b/src/core/loader/3dsx.h
@@ -17,7 +17,7 @@ namespace Loader {
 /// Loads an 3DSX file
 class AppLoader_THREEDSX final : public AppLoader {
 public:
-    AppLoader_THREEDSX(FileUtil::IOFile&& file, std::string filename, const std::string& filepath)
+    AppLoader_THREEDSX(FileUtil::IOFile&& file, const std::string& filename, const std::string& filepath)
         : AppLoader(std::move(file)), filename(std::move(filename)), filepath(filepath) {}
 
     /**
@@ -33,6 +33,13 @@ public:
      */
     ResultStatus Load() override;
 
+    /**
+     * Get the icon (typically icon section) of the application
+     * @param buffer Reference to buffer to store data
+     * @return ResultStatus result of function
+     */
+    ResultStatus ReadIcon(std::vector<u8>& buffer) override;
+
     /**
      * Get the RomFS of the application
      * @param romfs_file Reference to buffer to store data
diff --git a/src/core/loader/loader.cpp b/src/core/loader/loader.cpp
index 886501c41..af3f62248 100644
--- a/src/core/loader/loader.cpp
+++ b/src/core/loader/loader.cpp
@@ -90,6 +90,28 @@ const char* GetFileTypeString(FileType type) {
     return "unknown";
 }
 
+// Builds the AppLoader subclass matching `type` and moves `file` into it.
+// `filename` is the bare file name and `filepath` the full path; which of the
+// two each loader receives differs per format (see the branches below).
+std::unique_ptr<AppLoader> GetLoader(FileUtil::IOFile&& file, FileType type,
+    const std::string& filename, const std::string& filepath) {
+    // 3DSX executables take both the bare name and the full path.
+    if (type == FileType::THREEDSX)
+        return std::make_unique<AppLoader_THREEDSX>(std::move(file), filename, filepath);
+
+    // The ELF loader takes only the bare file name.
+    if (type == FileType::ELF)
+        return std::make_unique<AppLoader_ELF>(std::move(file), filename);
+
+    // CXI and CCI are both NCCH/NCSD containers and share one loader,
+    // which is constructed from the full path.
+    if (type == FileType::CXI || type == FileType::CCI)
+        return std::make_unique<AppLoader_NCCH>(std::move(file), filepath);
+
+    // Every other type has no loader here; callers receive a null pointer.
+    return nullptr;
+}
+
 ResultStatus LoadFile(const std::string& filename) {
     FileUtil::IOFile file(filename, "rb");
     if (!file.IsOpen()) {
@@ -111,37 +133,28 @@ ResultStatus LoadFile(const std::string& filename) {
 
     LOG_INFO(Loader, "Loading file %s as %s...", filename.c_str(), GetFileTypeString(type));
 
+    std::unique_ptr<AppLoader> app_loader = GetLoader(std::move(file), type, filename_filename, filename);
+
     switch (type) {
 
-    //3DSX file format...
+    // 3DSX file format...
+    // or NCCH/NCSD container formats...
     case FileType::THREEDSX:
+    case FileType::CXI:
+    case FileType::CCI:
     {
-        AppLoader_THREEDSX app_loader(std::move(file), filename_filename, filename);
         // Load application and RomFS
-        if (ResultStatus::Success == app_loader.Load()) {
-            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(app_loader), Service::FS::ArchiveIdCode::RomFS);
+        ResultStatus result = app_loader->Load();
+        if (ResultStatus::Success == result) {
+            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(*app_loader), Service::FS::ArchiveIdCode::RomFS);
             return ResultStatus::Success;
         }
-        break;
+        return result;
     }
 
     // Standard ELF file format...
     case FileType::ELF:
-        return AppLoader_ELF(std::move(file), filename_filename).Load();
-
-    // NCCH/NCSD container formats...
-    case FileType::CXI:
-    case FileType::CCI:
-    {
-        AppLoader_NCCH app_loader(std::move(file), filename);
-
-        // Load application and RomFS
-        ResultStatus result = app_loader.Load();
-        if (ResultStatus::Success == result) {
-            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(app_loader), Service::FS::ArchiveIdCode::RomFS);
-        }
-        return result;
-    }
+        return app_loader->Load();
 
     // CIA file format...
     case FileType::CIA:
diff --git a/src/core/loader/loader.h b/src/core/loader/loader.h
index 84a4ce5fc..9d3e9ed3b 100644
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@@ -10,8 +10,10 @@
 #include <string>
 #include <vector>
 
+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/file_util.h"
+#include "common/swap.h"
 
 namespace Kernel {
 struct AddressMapping;
@@ -78,6 +80,51 @@ constexpr u32 MakeMagic(char a, char b, char c, char d) {
     return a | b << 8 | c << 16 | d << 24;
 }
 
+/// SMDH data structure that contains titles, icons etc. See https://www.3dbrew.org/wiki/SMDH
+struct SMDH {
+    u32_le magic;
+    u16_le version;
+    INSERT_PADDING_BYTES(2);
+
+    struct Title {
+        std::array<u16, 0x40> short_title; ///< NOTE(review): the three title fields are presumably UTF-16 text per 3dbrew -- confirm
+        std::array<u16, 0x80> long_title;
+        std::array<u16, 0x40> publisher;
+    };
+    std::array<Title, 16> titles; ///< 16 slots; TitleLanguage below names 12 languages (values 0-11)
+
+    std::array<u8, 16> ratings;
+    u32_le region_lockout;
+    u32_le match_maker_id;
+    u64_le match_maker_bit_id;
+    u32_le flags;
+    u16_le eula_version;
+    INSERT_PADDING_BYTES(2);
+    float_le banner_animation_frame;
+    u32_le cec_id;
+    INSERT_PADDING_BYTES(8);
+
+    std::array<u8, 0x480> small_icon; ///< Raw icon bytes; NOTE(review): presumably the 24x24 icon per 3dbrew -- confirm
+    std::array<u8, 0x1200> large_icon; ///< Raw icon bytes; NOTE(review): presumably the 48x48 icon per 3dbrew -- confirm
+
+    /// Indicates the language used for each title entry (NOTE(review): presumably the index into `titles` -- confirm)
+    enum class TitleLanguage {
+        Japanese = 0,
+        English = 1,
+        French = 2,
+        German = 3,
+        Italian = 4,
+        Spanish = 5,
+        SimplifiedChinese = 6,
+        Korean= 7,
+        Dutch = 8,
+        Portuguese = 9,
+        Russian = 10,
+        TraditionalChinese = 11
+    };
+};
+static_assert(sizeof(SMDH) == 0x36C0, "SMDH structure size is wrong");
+
 /// Interface for loading an application
 class AppLoader : NonCopyable {
 public:
@@ -149,6 +196,16 @@ protected:
  */
 extern const std::initializer_list<Kernel::AddressMapping> default_address_mappings;
 
+/**
+ * Get a loader for a file with a specific type
+ * @param file The file to load
+ * @param type The type of the file
+ * @param filename the file name (without path)
+ * @param filepath the file full path (with name)
+ * @return std::unique_ptr<AppLoader> a pointer to a loader object; nullptr for unsupported type
+ */
+std::unique_ptr<AppLoader> GetLoader(FileUtil::IOFile&& file, FileType type, const std::string& filename, const std::string& filepath);
+
 /**
  * Identifies and loads a bootable file
  * @param filename String filename of bootable file
diff --git a/src/core/loader/ncch.cpp b/src/core/loader/ncch.cpp
index 066e91a9e..7391bdb26 100644
--- a/src/core/loader/ncch.cpp
+++ b/src/core/loader/ncch.cpp
@@ -156,6 +156,9 @@ ResultStatus AppLoader_NCCH::LoadExec() {
         Kernel::g_current_process->resource_limit = Kernel::ResourceLimit::GetForCategory(
             static_cast<Kernel::ResourceLimitCategory>(exheader_header.arm11_system_local_caps.resource_limit_category));
 
+        // Set the default CPU core for this process
+        Kernel::g_current_process->ideal_processor = exheader_header.arm11_system_local_caps.ideal_processor;
+
         // Copy data while converting endianess
         std::array<u32, ARRAY_SIZE(exheader_header.arm11_kernel_caps.descriptors)> kernel_caps;
         std::copy_n(exheader_header.arm11_kernel_caps.descriptors, kernel_caps.size(), begin(kernel_caps));
@@ -173,6 +176,10 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
     if (!file.IsOpen())
         return ResultStatus::Error;
 
+    ResultStatus result = LoadExeFS();
+    if (result != ResultStatus::Success)
+        return result;
+
     LOG_DEBUG(Loader, "%d sections:", kMaxSections);
     // Iterate through the ExeFs archive until we find a section with the specified name...
     for (unsigned section_number = 0; section_number < kMaxSections; section_number++) {
@@ -215,9 +222,9 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
     return ResultStatus::ErrorNotUsed;
 }
 
-ResultStatus AppLoader_NCCH::Load() {
-    if (is_loaded)
-        return ResultStatus::ErrorAlreadyLoaded;
+ResultStatus AppLoader_NCCH::LoadExeFS() {
+    if (is_exefs_loaded)
+        return ResultStatus::Success;
 
     if (!file.IsOpen())
         return ResultStatus::Error;
@@ -282,6 +289,18 @@ ResultStatus AppLoader_NCCH::Load() {
     if (file.ReadBytes(&exefs_header, sizeof(ExeFs_Header)) != sizeof(ExeFs_Header))
         return ResultStatus::Error;
 
+    is_exefs_loaded = true;
+    return ResultStatus::Success;
+}
+
+ResultStatus AppLoader_NCCH::Load() {
+    if (is_loaded)
+        return ResultStatus::ErrorAlreadyLoaded;
+
+    ResultStatus result = LoadExeFS();
+    if (result != ResultStatus::Success)
+        return result;
+
     is_loaded = true; // Set state to loaded
 
     return LoadExec(); // Load the executable into memory for booting
diff --git a/src/core/loader/ncch.h b/src/core/loader/ncch.h
index ca6772a78..fd852c3de 100644
--- a/src/core/loader/ncch.h
+++ b/src/core/loader/ncch.h
@@ -232,6 +232,13 @@ private:
      */
     ResultStatus LoadExec();
 
+    /**
+     * Ensure ExeFS is loaded and ready for reading sections
+     * @return ResultStatus result of function
+     */
+    ResultStatus LoadExeFS();
+
+    bool            is_exefs_loaded = false;
     bool            is_compressed = false;
 
     u32             entry_point = 0;
diff --git a/src/core/tracer/recorder.cpp b/src/core/tracer/recorder.cpp
index c6dc35c83..7abaacf70 100644
--- a/src/core/tracer/recorder.cpp
+++ b/src/core/tracer/recorder.cpp
@@ -26,17 +26,17 @@ void Recorder::Finish(const std::string& filename) {
     // Calculate file offsets
     auto& initial = header.initial_state_offsets;
 
-    initial.gpu_registers_size      = initial_state.gpu_registers.size();
-    initial.lcd_registers_size      = initial_state.lcd_registers.size();
-    initial.pica_registers_size     = initial_state.pica_registers.size();
-    initial.default_attributes_size = initial_state.default_attributes.size();
-    initial.vs_program_binary_size  = initial_state.vs_program_binary.size();
-    initial.vs_swizzle_data_size    = initial_state.vs_swizzle_data.size();
-    initial.vs_float_uniforms_size  = initial_state.vs_float_uniforms.size();
-    initial.gs_program_binary_size  = initial_state.gs_program_binary.size();
-    initial.gs_swizzle_data_size    = initial_state.gs_swizzle_data.size();
-    initial.gs_float_uniforms_size  = initial_state.gs_float_uniforms.size();
-    header.stream_size              = stream.size();
+    initial.gpu_registers_size      = static_cast<u32>(initial_state.gpu_registers.size());
+    initial.lcd_registers_size      = static_cast<u32>(initial_state.lcd_registers.size());
+    initial.pica_registers_size     = static_cast<u32>(initial_state.pica_registers.size());
+    initial.default_attributes_size = static_cast<u32>(initial_state.default_attributes.size());
+    initial.vs_program_binary_size  = static_cast<u32>(initial_state.vs_program_binary.size());
+    initial.vs_swizzle_data_size    = static_cast<u32>(initial_state.vs_swizzle_data.size());
+    initial.vs_float_uniforms_size  = static_cast<u32>(initial_state.vs_float_uniforms.size());
+    initial.gs_program_binary_size  = static_cast<u32>(initial_state.gs_program_binary.size());
+    initial.gs_swizzle_data_size    = static_cast<u32>(initial_state.gs_swizzle_data.size());
+    initial.gs_float_uniforms_size  = static_cast<u32>(initial_state.gs_float_uniforms.size());
+    header.stream_size              = static_cast<u32>(stream.size());
 
     initial.gpu_registers      = sizeof(header);
     initial.lcd_registers      = initial.gpu_registers      + initial.gpu_registers_size * sizeof(u32);
@@ -68,7 +68,7 @@ void Recorder::Finish(const std::string& filename) {
             DEBUG_ASSERT(stream_element.extra_data.size() == 0);
             break;
         }
-        header.stream_offset += stream_element.extra_data.size();
+        header.stream_offset += static_cast<u32>(stream_element.extra_data.size());
     }
 
     try {
diff --git a/src/video_core/clipper.cpp b/src/video_core/clipper.cpp
index 2bc747102..db99ce666 100644
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@@ -75,8 +75,6 @@ static void InitScreenCoordinates(OutputVertex& vtx)
     viewport.halfsize_y = float24::FromRaw(regs.viewport_size_y);
     viewport.offset_x   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.x));
     viewport.offset_y   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.y));
-    viewport.zscale     = float24::FromRaw(regs.viewport_depth_range);
-    viewport.offset_z   = float24::FromRaw(regs.viewport_depth_far_plane);
 
     float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
     vtx.color *= inv_w;
@@ -89,7 +87,7 @@ static void InitScreenCoordinates(OutputVertex& vtx)
 
     vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
     vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = vtx.pos.z * inv_w;
 }
 
 void ProcessTriangle(const OutputVertex &v0, const OutputVertex &v1, const OutputVertex &v2) {
diff --git a/src/video_core/command_processor.cpp b/src/video_core/command_processor.cpp
index be1a936b2..e7dc5ddac 100644
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@@ -144,13 +144,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                         immediate_attribute_id = 0;
 
                         Shader::UnitState<false> shader_unit;
-                        Shader::Setup();
-
-                        if (g_debug_context)
-                            g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input));
+                        g_state.vs.Setup();
 
                         // Send to vertex shader
-                        Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
+                        if (g_debug_context)
+                            g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input));
+                        Shader::OutputVertex output = g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
 
                         // Send to renderer
                         using Pica::Shader::OutputVertex;
@@ -238,7 +237,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
             vertex_cache_ids.fill(-1);
 
             Shader::UnitState<false> shader_unit;
-            Shader::Setup();
+            g_state.vs.Setup();
 
             for (unsigned int index = 0; index < regs.num_vertices; ++index)
             {
@@ -272,11 +271,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                     Shader::InputVertex input;
                     loader.LoadVertex(base_address, index, vertex, input, memory_accesses);
 
-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
-
                     // Send to vertex shader
-                    output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
+                    if (g_debug_context)
+                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input);
+                    output = g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes());
 
                     if (is_indexed) {
                         vertex_cache[vertex_cache_pos] = output;
diff --git a/src/video_core/debug_utils/debug_utils.cpp b/src/video_core/debug_utils/debug_utils.cpp
index fb20f81dd..2f645b441 100644
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@@ -208,11 +208,12 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
 
     // TODO: Reduce the amount of binary code written to relevant portions
     dvlp.binary_offset = write_offset - dvlp_offset;
-    dvlp.binary_size_words = setup.program_code.size();
-    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()), setup.program_code.size() * sizeof(u32));
+    dvlp.binary_size_words = static_cast<uint32_t>(setup.program_code.size());
+    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()),
+                    static_cast<u32>(setup.program_code.size()) * sizeof(u32));
 
     dvlp.swizzle_info_offset = write_offset - dvlp_offset;
-    dvlp.swizzle_info_num_entries = setup.swizzle_data.size();
+    dvlp.swizzle_info_num_entries = static_cast<uint32_t>(setup.swizzle_data.size());
     u32 dummy = 0;
     for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) {
         QueueForWriting(reinterpret_cast<const u8*>(&setup.swizzle_data[i]), sizeof(setup.swizzle_data[i]));
@@ -264,7 +265,7 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
             constant_table.emplace_back(constant);
     }
     dvle.constant_table_offset = write_offset - dvlb.dvle_offset;
-    dvle.constant_table_size = constant_table.size();
+    dvle.constant_table_size = static_cast<uint32_t>(constant_table.size());
     for (const auto& constant : constant_table) {
         QueueForWriting(reinterpret_cast<const u8*>(&constant), sizeof(constant));
     }
diff --git a/src/video_core/debug_utils/debug_utils.h b/src/video_core/debug_utils/debug_utils.h
index be2d0301a..f628292a4 100644
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@@ -40,7 +40,7 @@ public:
         PicaCommandProcessed,
         IncomingPrimitiveBatch,
         FinishedPrimitiveBatch,
-        VertexLoaded,
+        VertexShaderInvocation,
         IncomingDisplayTransfer,
         GSPCommandProcessed,
         BufferSwapped,
diff --git a/src/video_core/pica.cpp b/src/video_core/pica.cpp
index be82cf4b5..ec78f9593 100644
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@@ -500,7 +500,7 @@ void Init() {
 }
 
 void Shutdown() {
-    Shader::Shutdown();
+    Shader::ClearCache();
 }
 
 template <typename T>
diff --git a/src/video_core/pica.h b/src/video_core/pica.h
index 5891fb72a..86c0a0096 100644
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@@ -70,7 +70,7 @@ struct Regs {
     INSERT_PADDING_WORDS(0x9);
 
     BitField<0, 24, u32> viewport_depth_range; // float24
-    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+    BitField<0, 24, u32> viewport_depth_near_plane; // float24
 
     BitField<0, 3, u32> vs_output_total;
 
@@ -122,9 +122,31 @@ struct Regs {
         BitField<16, 10, s32> y;
     } viewport_corner;
 
-    INSERT_PADDING_WORDS(0x17);
+    INSERT_PADDING_WORDS(0x1);
+
+    // TODO: early depth
+    INSERT_PADDING_WORDS(0x1);
+
+    INSERT_PADDING_WORDS(0x2);
+
+    enum DepthBuffering : u32 {
+        WBuffering  = 0,
+        ZBuffering  = 1,
+    };
+    BitField< 0, 1, DepthBuffering> depthmap_enable;
+
+    INSERT_PADDING_WORDS(0x12);
 
     struct TextureConfig {
+        enum TextureType : u32 {
+            Texture2D    = 0,
+            TextureCube  = 1,
+            Shadow2D     = 2,
+            Projection2D = 3,
+            ShadowCube   = 4,
+            Disabled     = 5,
+        };
+
         enum WrapMode : u32 {
             ClampToEdge    = 0,
             ClampToBorder  = 1,
@@ -155,6 +177,7 @@ struct Regs {
             BitField< 2, 1, TextureFilter> min_filter;
             BitField< 8, 2, WrapMode> wrap_t;
             BitField<12, 2, WrapMode> wrap_s;
+            BitField<28, 2, TextureType> type; ///< @note Only valid for texture 0 according to 3DBrew.
         };
 
         INSERT_PADDING_WORDS(0x1);
@@ -1279,10 +1302,11 @@ ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
-ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
+ASSERT_REG_POSITION(viewport_depth_near_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(depthmap_enable, 0x6D);
 ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
diff --git a/src/video_core/pica_state.h b/src/video_core/pica_state.h
index bbecad850..1059c6ae4 100644
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@@ -56,7 +56,7 @@ struct State {
         // Used to buffer partial vertices for immediate-mode rendering.
         Shader::InputVertex input_vertex;
         // Index of the next attribute to be loaded into `input_vertex`.
-        int current_attribute = 0;
+        u32 current_attribute = 0;
     } immediate;
 
     // This is constructed with a dummy triangle topology
diff --git a/src/video_core/rasterizer.cpp b/src/video_core/rasterizer.cpp
index df67b9081..65168f05a 100644
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@@ -442,8 +442,33 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
 
                 DEBUG_ASSERT(0 != texture.config.address);
 
-                int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
-                int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+                float24 u = uv[i].u();
+                float24 v = uv[i].v();
+
+                // Only unit 0 respects the texturing type (according to 3DBrew)
+                // TODO: Refactor so cubemaps and shadowmaps can be handled
+                if (i == 0) {
+                    switch(texture.config.type) {
+                    case Regs::TextureConfig::Texture2D:
+                        break;
+                    case Regs::TextureConfig::Projection2D: {
+                        auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
+                        u /= tc0_w;
+                        v /= tc0_w;
+                        break;
+                    }
+                    default:
+                        // TODO: Change to LOG_ERROR when more types are handled.
+                        LOG_DEBUG(HW_GPU, "Unhandled texture type %x", (int)texture.config.type);
+                        UNIMPLEMENTED();
+                        break;
+                    }
+                }
+
+                int s = (int)(u * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
+                int t = (int)(v * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+
+
                 static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                     switch (mode) {
                         case Regs::TextureConfig::ClampToEdge:
@@ -862,10 +887,30 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                 }
             }
 
+            // interpolated_z = z / w
+            float interpolated_z_over_w = (v0.screenpos[2].ToFloat32() * w0 +
+                                           v1.screenpos[2].ToFloat32() * w1 +
+                                           v2.screenpos[2].ToFloat32() * w2) / wsum;
+
+            // Not fully accurate. About 3 bits in precision are missing.
+            // Z-Buffer (z / w * scale + offset)
+            float depth_scale = float24::FromRaw(regs.viewport_depth_range).ToFloat32();
+            float depth_offset = float24::FromRaw(regs.viewport_depth_near_plane).ToFloat32();
+            float depth = interpolated_z_over_w * depth_scale + depth_offset;
+
+            // Potentially switch to W-Buffer
+            if (regs.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+
+                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
+                depth *= interpolated_w_inverse.ToFloat32() * wsum;
+            }
+
+            // Clamp the result
+            depth = MathUtil::Clamp(depth, 0.0f, 1.0f);
+
+            // Convert float to integer
             unsigned num_bits = Regs::DepthBitsPerPixel(regs.framebuffer.depth_format);
-            u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
-                           v1.screenpos[2].ToFloat32() * w1 +
-                           v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
+            u32 z = (u32)(depth * ((1 << num_bits) - 1));
 
             if (output_merger.depth_test_enable) {
                 u32 ref_z = GetDepth(x >> 4, y >> 4);
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index 519d81aeb..ed2e2f3ae 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -76,6 +76,9 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD1);
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD2);
 
+    glVertexAttribPointer(GLShader::ATTRIBUTE_TEXCOORD0_W, 1, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, tex_coord0_w));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD0_W);
+
     glVertexAttribPointer(GLShader::ATTRIBUTE_NORMQUAT, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, normquat));
     glEnableVertexAttribArray(GLShader::ATTRIBUTE_NORMQUAT);
 
@@ -93,7 +96,7 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
     state.Apply();
 
     for (size_t i = 0; i < lighting_luts.size(); ++i) {
-        glActiveTexture(GL_TEXTURE3 + i);
+        glActiveTexture(static_cast<GLenum>(GL_TEXTURE3 + i));
         glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
         glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
         glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@@ -256,10 +259,15 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
 
     // Depth modifiers
     case PICA_REG_INDEX(viewport_depth_range):
-    case PICA_REG_INDEX(viewport_depth_far_plane):
+    case PICA_REG_INDEX(viewport_depth_near_plane):
         SyncDepthModifiers();
         break;
 
+    // Depth buffering
+    case PICA_REG_INDEX(depthmap_enable):
+        shader_dirty = true;
+        break;
+
     // Blending
     case PICA_REG_INDEX(output_merger.alphablend_enable):
         SyncBlendEnabled();
@@ -314,6 +322,11 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
         SyncLogicOp();
         break;
 
+    // Texture 0 type
+    case PICA_REG_INDEX(texture0.type):
+        shader_dirty = true;
+        break;
+
     // TEV stages
     case PICA_REG_INDEX(tev_stage0.color_source1):
     case PICA_REG_INDEX(tev_stage0.color_modifier1):
@@ -910,10 +923,10 @@ void RasterizerOpenGL::SyncCullMode() {
 }
 
 void RasterizerOpenGL::SyncDepthModifiers() {
-    float depth_scale = -Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
+    float depth_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
+    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32();
 
-    // TODO: Implement scale modifier
+    uniform_block_data.data.depth_scale = depth_scale;
     uniform_block_data.data.depth_offset = depth_offset;
     uniform_block_data.dirty = true;
 }
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h
index 63ff7716d..eed00011a 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@@ -39,139 +39,185 @@ struct ScreenInfo;
  * directly accessing Pica registers. This should reduce the risk of bugs in shader generation where
  * Pica state is not being captured in the shader cache key, thereby resulting in (what should be)
  * two separate shaders sharing the same key.
+ *
+ * We use a union because the C++ standard states that the "implicitly-defined copy/move constructor for a union X
+ * copies the object representation of X" and the "implicitly-defined copy assignment operator for a union X copies
+ * the object representation (3.9) of X", i.e. the object is copied bytewise instead of memberwise.
+ * This is important because the padding bytes are included in the hash and in the comparison between objects.
  */
-struct PicaShaderConfig {
+union PicaShaderConfig {
+
     /// Construct a PicaShaderConfig with the current Pica register configuration.
     static PicaShaderConfig CurrentConfig() {
         PicaShaderConfig res;
+
+        auto& state = res.state;
+        std::memset(&state, 0, sizeof(PicaShaderConfig::State));
+
         const auto& regs = Pica::g_state.regs;
 
-        res.alpha_test_func = regs.output_merger.alpha_test.enable ?
+        state.depthmap_enable = regs.depthmap_enable;
+
+        state.alpha_test_func = regs.output_merger.alpha_test.enable ?
             regs.output_merger.alpha_test.func.Value() : Pica::Regs::CompareFunc::Always;
 
-        // Copy tev stages
+        state.texture0_type = regs.texture0.type;
+
+        // Copy the relevant TEV stage fields.
+        // We don't sync const_color here because of its high variance; it is
+        // passed as a shader uniform instead.
         const auto& tev_stages = regs.GetTevStages();
-        DEBUG_ASSERT(res.tev_stages.size() == tev_stages.size());
+        DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size());
         for (size_t i = 0; i < tev_stages.size(); i++) {
             const auto& tev_stage = tev_stages[i];
-            res.tev_stages[i].sources_raw = tev_stage.sources_raw;
-            res.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
-            res.tev_stages[i].ops_raw = tev_stage.ops_raw;
-            res.tev_stages[i].const_color = tev_stage.const_color;
-            res.tev_stages[i].scales_raw = tev_stage.scales_raw;
+            state.tev_stages[i].sources_raw = tev_stage.sources_raw;
+            state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
+            state.tev_stages[i].ops_raw = tev_stage.ops_raw;
+            state.tev_stages[i].scales_raw = tev_stage.scales_raw;
         }
 
-        res.combiner_buffer_input =
+        state.combiner_buffer_input =
             regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
             regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;
 
         // Fragment lighting
 
-        res.lighting.enable = !regs.lighting.disable;
-        res.lighting.src_num = regs.lighting.num_lights + 1;
+        state.lighting.enable = !regs.lighting.disable;
+        state.lighting.src_num = regs.lighting.num_lights + 1;
 
-        for (unsigned light_index = 0; light_index < res.lighting.src_num; ++light_index) {
+        for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) {
             unsigned num = regs.lighting.light_enable.GetNum(light_index);
             const auto& light = regs.lighting.light[num];
-            res.lighting.light[light_index].num = num;
-            res.lighting.light[light_index].directional = light.directional != 0;
-            res.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
-            res.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
-            res.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
-            res.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
+            state.lighting.light[light_index].num = num;
+            state.lighting.light[light_index].directional = light.directional != 0;
+            state.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
+            state.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
+            state.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
+            state.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
         }
 
-        res.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
-        res.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
-        res.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
-        res.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
+        state.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
+        state.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
+        state.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
+        state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
 
-        res.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
-        res.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
-        res.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
-        res.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
+        state.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
+        state.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
+        state.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
+        state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
 
-        res.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
-        res.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
-        res.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
-        res.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
+        state.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
+        state.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
+        state.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
+        state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
 
-        res.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
-        res.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
-        res.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
-        res.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
+        state.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
+        state.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
+        state.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
+        state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
 
-        res.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
-        res.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
-        res.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
-        res.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
+        state.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
+        state.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
+        state.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
+        state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
 
-        res.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
-        res.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
-        res.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
-        res.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
+        state.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
+        state.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
+        state.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
+        state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
 
-        res.lighting.config = regs.lighting.config;
-        res.lighting.fresnel_selector = regs.lighting.fresnel_selector;
-        res.lighting.bump_mode = regs.lighting.bump_mode;
-        res.lighting.bump_selector = regs.lighting.bump_selector;
-        res.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
-        res.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
+        state.lighting.config = regs.lighting.config;
+        state.lighting.fresnel_selector = regs.lighting.fresnel_selector;
+        state.lighting.bump_mode = regs.lighting.bump_mode;
+        state.lighting.bump_selector = regs.lighting.bump_selector;
+        state.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
+        state.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
 
         return res;
     }
 
     bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
-        return (stage_index < 4) && (combiner_buffer_input & (1 << stage_index));
+        return (stage_index < 4) && (state.combiner_buffer_input & (1 << stage_index));
     }
 
     bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
-        return (stage_index < 4) && ((combiner_buffer_input >> 4) & (1 << stage_index));
+        return (stage_index < 4) && ((state.combiner_buffer_input >> 4) & (1 << stage_index));
     }
 
     bool operator ==(const PicaShaderConfig& o) const {
-        return std::memcmp(this, &o, sizeof(PicaShaderConfig)) == 0;
+        return std::memcmp(&state, &o.state, sizeof(PicaShaderConfig::State)) == 0;
     };
 
-    Pica::Regs::CompareFunc alpha_test_func = Pica::Regs::CompareFunc::Never;
-    std::array<Pica::Regs::TevStageConfig, 6> tev_stages = {};
-    u8 combiner_buffer_input = 0;
+    // NOTE: MSVC15 (Update 2) doesn't treat `delete`'d constructors and operators as trivially
+    //       copyable (TC). This makes BitField not TC when used in a union or struct, so we
+    //       have to resort to this ugly hack.
+    //       Once that bug is fixed we can use Pica::Regs::TevStageConfig here.
+    //       Doesn't include const_color because we don't sync it, see comment in CurrentConfig()
+    struct TevStageConfigRaw {
+        u32 sources_raw;
+        u32 modifiers_raw;
+        u32 ops_raw;
+        u32 scales_raw;
+        explicit operator Pica::Regs::TevStageConfig() const noexcept {
+            Pica::Regs::TevStageConfig stage;
+            stage.sources_raw = sources_raw;
+            stage.modifiers_raw = modifiers_raw;
+            stage.ops_raw = ops_raw;
+            stage.const_color = 0;
+            stage.scales_raw = scales_raw;
+            return stage;
+        }
+    };
 
-    struct {
-        struct {
-            unsigned num = 0;
-            bool directional = false;
-            bool two_sided_diffuse = false;
-            bool dist_atten_enable = false;
-            GLfloat dist_atten_scale = 0.0f;
-            GLfloat dist_atten_bias = 0.0f;
-        } light[8];
+    struct State {
 
-        bool enable = false;
-        unsigned src_num = 0;
-        Pica::Regs::LightingBumpMode bump_mode = Pica::Regs::LightingBumpMode::None;
-        unsigned bump_selector = 0;
-        bool bump_renorm = false;
-        bool clamp_highlights = false;
+        Pica::Regs::CompareFunc alpha_test_func;
+        Pica::Regs::TextureConfig::TextureType texture0_type;
+        std::array<TevStageConfigRaw, 6> tev_stages;
+        u8 combiner_buffer_input;
 
-        Pica::Regs::LightingConfig config = Pica::Regs::LightingConfig::Config0;
-        Pica::Regs::LightingFresnelSelector fresnel_selector = Pica::Regs::LightingFresnelSelector::None;
+        Pica::Regs::DepthBuffering depthmap_enable;
 
         struct {
-            bool enable = false;
-            bool abs_input = false;
-            Pica::Regs::LightingLutInput type = Pica::Regs::LightingLutInput::NH;
-            float scale = 1.0f;
-        } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
-    } lighting;
+            struct {
+                unsigned num;
+                bool directional;
+                bool two_sided_diffuse;
+                bool dist_atten_enable;
+                GLfloat dist_atten_scale;
+                GLfloat dist_atten_bias;
+            } light[8];
+
+            bool enable;
+            unsigned src_num;
+            Pica::Regs::LightingBumpMode bump_mode;
+            unsigned bump_selector;
+            bool bump_renorm;
+            bool clamp_highlights;
+
+            Pica::Regs::LightingConfig config;
+            Pica::Regs::LightingFresnelSelector fresnel_selector;
+
+            struct {
+                bool enable;
+                bool abs_input;
+                Pica::Regs::LightingLutInput type;
+                float scale;
+            } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
+        } lighting;
+
+    } state;
 };
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<PicaShaderConfig::State>::value, "PicaShaderConfig::State must be trivially copyable");
+#endif
 
 namespace std {
 
 template <>
 struct hash<PicaShaderConfig> {
     size_t operator()(const PicaShaderConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(PicaShaderConfig));
+        return Common::ComputeHash64(&k.state, sizeof(PicaShaderConfig::State));
     }
 };
 
@@ -238,6 +284,7 @@ private:
             tex_coord1[1] = v.tc1.y.ToFloat32();
             tex_coord2[0] = v.tc2.x.ToFloat32();
             tex_coord2[1] = v.tc2.y.ToFloat32();
+            tex_coord0_w = v.tc0_w.ToFloat32();
             normquat[0] = v.quat.x.ToFloat32();
             normquat[1] = v.quat.y.ToFloat32();
             normquat[2] = v.quat.z.ToFloat32();
@@ -258,6 +305,7 @@ private:
         GLfloat tex_coord0[2];
         GLfloat tex_coord1[2];
         GLfloat tex_coord2[2];
+        GLfloat tex_coord0_w;
         GLfloat normquat[4];
         GLfloat view[3];
     };
@@ -276,6 +324,7 @@ private:
         GLvec4 const_color[6];
         GLvec4 tev_combiner_buffer_color;
         GLint alphatest_ref;
+        GLfloat depth_scale;
         GLfloat depth_offset;
         alignas(16) GLvec3 lighting_global_ambient;
         LightSrc light_src[8];
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.cpp b/src/video_core/renderer_opengl/gl_shader_gen.cpp
index 9011caa39..71d60e69c 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@@ -32,8 +32,9 @@ static bool IsPassThroughTevStage(const TevStageConfig& stage) {
 }
 
 /// Writes the specified TEV stage source component(s)
-static void AppendSource(std::string& out, TevStageConfig::Source source,
+static void AppendSource(std::string& out, const PicaShaderConfig& config, TevStageConfig::Source source,
         const std::string& index_name) {
+    const auto& state = config.state;
     using Source = TevStageConfig::Source;
     switch (source) {
     case Source::PrimaryColor:
@@ -46,7 +47,20 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
         out += "secondary_fragment_color";
         break;
     case Source::Texture0:
-        out += "texture(tex[0], texcoord[0])";
+        // Only unit 0 respects the texturing type (according to 3DBrew)
+        switch(state.texture0_type) {
+        case Pica::Regs::TextureConfig::Texture2D:
+            out += "texture(tex[0], texcoord[0])";
+            break;
+        case Pica::Regs::TextureConfig::Projection2D:
+            out += "textureProj(tex[0], vec3(texcoord[0], texcoord0_w))";
+            break;
+        default:
+            out += "texture(tex[0], texcoord[0])";
+            LOG_CRITICAL(HW_GPU, "Unhandled texture type %x", static_cast<int>(state.texture0_type));
+            UNIMPLEMENTED();
+            break;
+        }
         break;
     case Source::Texture1:
         out += "texture(tex[1], texcoord[1])";
@@ -71,53 +85,53 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
 }
 
 /// Writes the color components to use for the specified TEV stage color modifier
-static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier modifier,
+static void AppendColorModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::ColorModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using ColorModifier = TevStageConfig::ColorModifier;
     switch (modifier) {
     case ColorModifier::SourceColor:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::OneMinusSourceColor:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rgb";
         break;
     case ColorModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::OneMinusSourceAlpha:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".aaa";
         break;
     case ColorModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::OneMinusSourceRed:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".rrr";
         break;
     case ColorModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::OneMinusSourceGreen:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".ggg";
         break;
     case ColorModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     case ColorModifier::OneMinusSourceBlue:
         out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".bbb";
         break;
     default:
@@ -128,44 +142,44 @@ static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier
 }
 
 /// Writes the alpha component to use for the specified TEV stage alpha modifier
-static void AppendAlphaModifier(std::string& out, TevStageConfig::AlphaModifier modifier,
+static void AppendAlphaModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::AlphaModifier modifier,
         TevStageConfig::Source source, const std::string& index_name) {
     using AlphaModifier = TevStageConfig::AlphaModifier;
     switch (modifier) {
     case AlphaModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::OneMinusSourceAlpha:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".a";
         break;
     case AlphaModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::OneMinusSourceRed:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".r";
         break;
     case AlphaModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::OneMinusSourceGreen:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".g";
         break;
     case AlphaModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     case AlphaModifier::OneMinusSourceBlue:
         out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
         out += ".b";
         break;
     default:
@@ -287,16 +301,16 @@ static void AppendAlphaTestCondition(std::string& out, Regs::CompareFunc func) {
 
 /// Writes the code to emulate the specified TEV stage
 static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsigned index) {
-    auto& stage = config.tev_stages[index];
+    const auto stage = static_cast<const Pica::Regs::TevStageConfig>(config.state.tev_stages[index]);
     if (!IsPassThroughTevStage(stage)) {
         std::string index_name = std::to_string(index);
 
         out += "vec3 color_results_" + index_name + "[3] = vec3[3](";
-        AppendColorModifier(out, stage.color_modifier1, stage.color_source1, index_name);
+        AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier2, stage.color_source2, index_name);
+        AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name);
         out += ", ";
-        AppendColorModifier(out, stage.color_modifier3, stage.color_source3, index_name);
+        AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name);
         out += ");\n";
 
         out += "vec3 color_output_" + index_name + " = ";
@@ -304,11 +318,11 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
         out += ";\n";
 
         out += "float alpha_results_" + index_name + "[3] = float[3](";
-        AppendAlphaModifier(out, stage.alpha_modifier1, stage.alpha_source1, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier2, stage.alpha_source2, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, index_name);
         out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier3, stage.alpha_source3, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, index_name);
         out += ");\n";
 
         out += "float alpha_output_" + index_name + " = ";
@@ -331,6 +345,8 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
 
 /// Writes the code to emulate fragment lighting
 static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
+    const auto& lighting = config.state.lighting;
+
     // Define lighting globals
     out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
            "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
@@ -338,17 +354,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
            "vec3 refl_value = vec3(0.0);\n";
 
     // Compute fragment normals
-    if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
+    if (lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
         // Bump mapping is enabled using a normal map, read perturbation vector from the selected texture
-        std::string bump_selector = std::to_string(config.lighting.bump_selector);
+        std::string bump_selector = std::to_string(lighting.bump_selector);
         out += "vec3 surface_normal = 2.0 * texture(tex[" + bump_selector + "], texcoord[" + bump_selector + "]).rgb - 1.0;\n";
 
         // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher precision result
-        if (config.lighting.bump_renorm) {
+        if (lighting.bump_renorm) {
             std::string val = "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))";
             out += "surface_normal.z = sqrt(max(" + val + ", 0.0));\n";
         }
-    } else if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
+    } else if (lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
         // Bump mapping is enabled using a tangent map
         LOG_CRITICAL(HW_GPU, "unimplemented bump mapping mode (tangent mapping)");
         UNIMPLEMENTED();
@@ -361,7 +377,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     out += "vec3 normal = normalize(quaternion_rotate(normquat, surface_normal));\n";
 
     // Gets the index into the specified lookup table for specular lighting
-    auto GetLutIndex = [config](unsigned light_num, Regs::LightingLutInput input, bool abs) {
+    auto GetLutIndex = [&lighting](unsigned light_num, Regs::LightingLutInput input, bool abs) {
         const std::string half_angle = "normalize(normalize(view) + light_vector)";
         std::string index;
         switch (input) {
@@ -389,7 +405,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         if (abs) {
             // LUT index is in the range of (0.0, 1.0)
-            index = config.lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
+            index = lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
             return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
         } else {
             // LUT index is in the range of (-1.0, 1.0)
@@ -407,8 +423,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
     };
 
     // Write the code to emulate each enabled light
-    for (unsigned light_index = 0; light_index < config.lighting.src_num; ++light_index) {
-        const auto& light_config = config.lighting.light[light_index];
+    for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) {
+        const auto& light_config = lighting.light[light_index];
         std::string light_src = "light_src[" + std::to_string(light_config.num) + "]";
 
         // Compute light vector (directional or positional)
@@ -432,39 +448,39 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
         }
 
         // If enabled, clamp specular component if lighting result is negative
-        std::string clamp_highlights = config.lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
+        std::string clamp_highlights = lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
 
         // Specular 0 component
         std::string d0_lut_value = "1.0";
-        if (config.lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
+        if (lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
             // Lookup specular "distribution 0" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d0.type, config.lighting.lut_d0.abs_input);
-            d0_lut_value = "(" + std::to_string(config.lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d0.type, lighting.lut_d0.abs_input);
+            d0_lut_value = "(" + std::to_string(lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
         }
         std::string specular_0 = "(" + d0_lut_value + " * " + light_src + ".specular_0)";
 
         // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (config.lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rr.type, config.lighting.lut_rr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
+        if (lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rr.type, lighting.lut_rr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
             out += "refl_value.r = " + value + ";\n";
         } else {
             out += "refl_value.r = 1.0;\n";
         }
 
         // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rg.type, config.lighting.lut_rg.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
+        if (lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rg.type, lighting.lut_rg.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
             out += "refl_value.g = " + value + ";\n";
         } else {
             out += "refl_value.g = refl_value.r;\n";
         }
 
         // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rb.type, config.lighting.lut_rb.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
+        if (lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rb.type, lighting.lut_rb.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
             out += "refl_value.b = " + value + ";\n";
         } else {
             out += "refl_value.b = refl_value.r;\n";
@@ -472,27 +488,27 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 
         // Specular 1 component
         std::string d1_lut_value = "1.0";
-        if (config.lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
+        if (lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
             // Lookup specular "distribution 1" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d1.type, config.lighting.lut_d1.abs_input);
-            d1_lut_value = "(" + std::to_string(config.lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d1.type, lighting.lut_d1.abs_input);
+            d1_lut_value = "(" + std::to_string(lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
         }
         std::string specular_1 = "(" + d1_lut_value + " * refl_value * " + light_src + ".specular_1)";
 
         // Fresnel
-        if (config.lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
+        if (lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
             // Lookup fresnel LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_fr.type, config.lighting.lut_fr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_fr.type, lighting.lut_fr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
 
             // Enabled for difffuse lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "diffuse_sum.a  *= " + value + ";\n";
 
             // Enabled for the specular lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                 out += "specular_sum.a *= " + value + ";\n";
         }
 
@@ -510,6 +526,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 }
 
 std::string GenerateFragmentShader(const PicaShaderConfig& config) {
+    const auto& state = config.state;
+
     std::string out = R"(
 #version 330 core
 #define NUM_TEV_STAGES 6
@@ -519,6 +537,7 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {
 
 in vec4 primary_color;
 in vec2 texcoord[3];
+in float texcoord0_w;
 in vec4 normquat;
 in vec3 view;
 
@@ -536,6 +555,7 @@ layout (std140) uniform shader_data {
     vec4 const_color[NUM_TEV_STAGES];
     vec4 tev_combiner_buffer_color;
     int alphatest_ref;
+    float depth_scale;
     float depth_offset;
     vec3 lighting_global_ambient;
     LightSrc light_src[NUM_LIGHTS];
@@ -555,29 +575,37 @@ vec4 secondary_fragment_color = vec4(0.0);
 )";
 
     // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
-    if (config.alpha_test_func == Regs::CompareFunc::Never) {
+    if (state.alpha_test_func == Regs::CompareFunc::Never) {
         out += "discard; }";
         return out;
     }
 
-    if (config.lighting.enable)
+    if (state.lighting.enable)
         WriteLighting(out, config);
 
     out += "vec4 combiner_buffer = vec4(0.0);\n";
     out += "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n";
     out += "vec4 last_tex_env_out = vec4(0.0);\n";
 
-    for (size_t index = 0; index < config.tev_stages.size(); ++index)
+    for (size_t index = 0; index < state.tev_stages.size(); ++index)
         WriteTevStage(out, config, (unsigned)index);
 
-    if (config.alpha_test_func != Regs::CompareFunc::Always) {
+    if (state.alpha_test_func != Regs::CompareFunc::Always) {
         out += "if (";
-        AppendAlphaTestCondition(out, config.alpha_test_func);
+        AppendAlphaTestCondition(out, state.alpha_test_func);
         out += ") discard;\n";
     }
 
     out += "color = last_tex_env_out;\n";
-    out += "gl_FragDepth = gl_FragCoord.z + depth_offset;\n}";
+
+    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    out += "float depth = z_over_w * depth_scale + depth_offset;\n";
+    if (state.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+        out += "depth /= gl_FragCoord.w;\n";
+    }
+    out += "gl_FragDepth = depth;\n";
+
+    out += "}";
 
     return out;
 }
@@ -585,17 +613,19 @@ vec4 secondary_fragment_color = vec4(0.0);
 std::string GenerateVertexShader() {
     std::string out = "#version 330 core\n";
 
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)  + ") in vec4 vert_position;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)     + ") in vec4 vert_color;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0) + ") in vec2 vert_texcoord0;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1) + ") in vec2 vert_texcoord1;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2) + ") in vec2 vert_texcoord2;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)  + ") in vec4 vert_normquat;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)      + ") in vec3 vert_view;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)    + ") in vec4 vert_position;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)       + ") in vec4 vert_color;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0)   + ") in vec2 vert_texcoord0;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1)   + ") in vec2 vert_texcoord1;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2)   + ") in vec2 vert_texcoord2;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0_W) + ") in float vert_texcoord0_w;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)    + ") in vec4 vert_normquat;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)        + ") in vec3 vert_view;\n";
 
     out += R"(
 out vec4 primary_color;
 out vec2 texcoord[3];
+out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;
 
@@ -604,6 +634,7 @@ void main() {
     texcoord[0] = vert_texcoord0;
     texcoord[1] = vert_texcoord1;
     texcoord[2] = vert_texcoord2;
+    texcoord0_w = vert_texcoord0_w;
     normquat = vert_normquat;
     view = vert_view;
     gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
diff --git a/src/video_core/renderer_opengl/gl_shader_gen.h b/src/video_core/renderer_opengl/gl_shader_gen.h
index 3eb07d57a..bef3249cf 100644
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@@ -6,7 +6,7 @@
 
 #include <string>
 
-struct PicaShaderConfig;
+union PicaShaderConfig;
 
 namespace GLShader {
 
diff --git a/src/video_core/renderer_opengl/gl_shader_util.h b/src/video_core/renderer_opengl/gl_shader_util.h
index 097242f6f..f59912f79 100644
--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@@ -14,6 +14,7 @@ enum Attributes {
     ATTRIBUTE_TEXCOORD0,
     ATTRIBUTE_TEXCOORD1,
     ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_TEXCOORD0_W,
     ATTRIBUTE_NORMQUAT,
     ATTRIBUTE_VIEW,
 };
diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp
index 0e9a0be8b..8f424a435 100644
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@@ -192,7 +192,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& fram
     // only allows rows to have a memory alignement of 4.
     ASSERT(pixel_stride % 4 == 0);
 
-    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, pixel_stride, screen_info)) {
+    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, static_cast<u32>(pixel_stride), screen_info)) {
         // Reset the screen info's display texture to its own permanent texture
         screen_info.display_texture = screen_info.texture.resource.handle;
         screen_info.display_texcoords = MathUtil::Rectangle<float>(0.f, 0.f, 1.f, 1.f);
@@ -473,12 +473,6 @@ static void DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
 bool RendererOpenGL::Init() {
     render_window->MakeCurrent();
 
-    // TODO: Make frontends initialize this, so they can use gladLoadGLLoader with their own loaders
-    if (!gladLoadGL()) {
-        LOG_CRITICAL(Render_OpenGL, "Failed to initialize GL functions! Exiting...");
-        exit(-1);
-    }
-
     if (GLAD_GL_KHR_debug) {
         glEnable(GL_DEBUG_OUTPUT);
         glDebugMessageCallback(DebugHandler, nullptr);
diff --git a/src/video_core/shader/shader.cpp b/src/video_core/shader/shader.cpp
index 65dcc9156..449fc703f 100644
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@@ -35,7 +35,13 @@ static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
 static const JitShader* jit_shader;
 #endif // ARCHITECTURE_x86_64
 
-void Setup() {
+void ClearCache() {
+#ifdef ARCHITECTURE_x86_64
+    shader_map.clear();
+#endif // ARCHITECTURE_x86_64
+}
+
+void ShaderSetup::Setup() {
 #ifdef ARCHITECTURE_x86_64
     if (VideoCore::g_shader_jit_enabled) {
         u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@@ -54,18 +60,12 @@ void Setup() {
 #endif // ARCHITECTURE_x86_64
 }
 
-void Shutdown() {
-#ifdef ARCHITECTURE_x86_64
-    shader_map.clear();
-#endif // ARCHITECTURE_x86_64
-}
+MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));
 
-MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
-
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
+OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
     auto& config = g_state.regs.vs;
 
-    MICROPROFILE_SCOPE(GPU_VertexShader);
+    MICROPROFILE_SCOPE(GPU_Shader);
 
     state.program_counter = config.main_offset;
     state.debug.max_offset = 0;
@@ -140,7 +140,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
     return ret;
 }
 
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
+DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
     UnitState<true> state;
 
     state.program_counter = config.main_offset;
diff --git a/src/video_core/shader/shader.h b/src/video_core/shader/shader.h
index 56b83bfeb..7f417675a 100644
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@@ -43,7 +43,8 @@ struct OutputVertex {
     Math::Vec4<float24> color;
     Math::Vec2<float24> tc0;
     Math::Vec2<float24> tc1;
-    INSERT_PADDING_WORDS(2);
+    float24 tc0_w;
+    INSERT_PADDING_WORDS(1);
     Math::Vec3<float24> view;
     INSERT_PADDING_WORDS(1);
     Math::Vec2<float24> tc2;
@@ -83,23 +84,6 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");
 
-/// Vertex shader memory
-struct ShaderSetup {
-    struct {
-        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
-        // therefore required to be 16-byte aligned.
-        alignas(16) Math::Vec4<float24> f[96];
-
-        std::array<bool, 16> b;
-        std::array<Math::Vec4<u8>, 4> i;
-    } uniforms;
-
-    Math::Vec4<float24> default_attributes[16];
-
-    std::array<u32, 1024> program_code;
-    std::array<u32, 1024> swizzle_data;
-};
-
 // Helper structure used to keep track of data useful for inspection of shader emulation
 template<bool full_debugging>
 struct DebugData;
@@ -342,33 +326,51 @@ struct UnitState {
     }
 };
 
-/**
- * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
- * vertex, which would happen within the `Run` function).
- */
-void Setup();
+/// Clears the shader cache
+void ClearCache();
 
-/// Performs any cleanup when the emulator is shutdown
-void Shutdown();
+struct ShaderSetup {
 
-/**
- * Runs the currently setup shader
- * @param state Shader unit state, must be setup per shader and per shader unit
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @return The output vertex, after having been processed by the vertex shader
- */
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+    struct {
+        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
+        // therefore required to be 16-byte aligned.
+        alignas(16) Math::Vec4<float24> f[96];
 
-/**
- * Produce debug information based on the given shader and input vertex
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @param config Configuration object for the shader pipeline
- * @param setup Setup object for the shader pipeline
- * @return Debug information for this shader with regards to the given vertex
- */
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+        std::array<bool, 16> b;
+        std::array<Math::Vec4<u8>, 4> i;
+    } uniforms;
+
+    Math::Vec4<float24> default_attributes[16];
+
+    std::array<u32, 1024> program_code;
+    std::array<u32, 1024> swizzle_data;
+
+    /**
+     * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
+     * vertex, which would happen within the `Run` function).
+     */
+    void Setup();
+
+    /**
+     * Runs the currently set up shader
+     * @param state Shader unit state, must be set up per shader and per shader unit
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @return The output vertex, after having been processed by the vertex shader
+     */
+    OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+    /**
+     * Produce debug information based on the given shader and input vertex
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @param config Configuration object for the shader pipeline
+     * @param setup Setup object for the shader pipeline
+     * @return Debug information for this shader with regards to the given vertex
+     */
+    DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+
+};
 
 } // namespace Shader