diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 3c3419bbc..b36cbb21e 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -23,12 +23,16 @@ set(SRCS
             thread.cpp
             timer.cpp
             utf8.cpp
+            x64_abi.cpp
+            x64_emitter.cpp
             )
 
 set(HEADERS
+            bit_set.h
             bit_field.h
             break_points.h
             chunk_file.h
+            code_block.h
             common.h
             common_funcs.h
             common_paths.h
@@ -65,6 +69,8 @@ set(HEADERS
             thunk.h
             timer.h
             utf8.h
+            x64_abi.h
+            x64_emitter.h
             )
 
 create_directory_groups(${SRCS} ${HEADERS})
diff --git a/src/common/bit_set.h b/src/common/bit_set.h
new file mode 100644
index 000000000..0091e4818
--- /dev/null
+++ b/src/common/bit_set.h
@@ -0,0 +1,167 @@
+// This file is under the public domain.
+
+#pragma once
+
+#include <cstddef>
+#include <initializer_list>
+#include <type_traits>
+#include "common_types.h"
+
+// Helper functions:
+
+#ifdef _WIN32
+template <typename T>
+static inline int CountSetBits(T v)
+{
+	// Portable popcount, from https://graphics.stanford.edu/~seander/bithacks.html
+	// GCC has this built in, but MSVC's intrinsic will only emit the actual
+	// POPCNT instruction, which we're not depending on
+	v = v - ((v >> 1) & (T)~(T)0/3);                       // per-2-bit counts (mask 0x55...)
+	v = (v & (T)~(T)0/15*3) + ((v >> 2) & (T)~(T)0/15*3);  // per-nibble counts (mask 0x33...)
+	v = (v + (v >> 4)) & (T)~(T)0/255*15;                  // per-byte counts (mask 0x0f...)
+	return (T)(v * ((T)~(T)0/255)) >> (sizeof(T) - 1) * 8; // multiply by 0x01... sums the bytes into the top byte
+}
+static inline int LeastSignificantSetBit(u32 val)
+{
+	unsigned long index;
+	_BitScanForward(&index, val);
+	return (int)index;
+}
+static inline int LeastSignificantSetBit(u64 val)
+{
+	unsigned long index;
+	_BitScanForward64(&index, val);
+	return (int)index;
+}
+#else
+static inline int CountSetBits(u32 val) { return __builtin_popcount(val); }
+static inline int CountSetBits(u64 val) { return __builtin_popcountll(val); }
+static inline int LeastSignificantSetBit(u32 val) { return __builtin_ctz(val); }
+static inline int LeastSignificantSetBit(u64 val) { return __builtin_ctzll(val); }
+#endif
+
+// namespace avoids conflict with OS X Carbon; don't use BitSet<T> directly
+namespace BS
+{
+
+// Similar to std::bitset, this is a class which encapsulates a bitset, i.e.
+// using the set bits of an integer to represent a set of integers.  Like that
+// class, it acts like an array of bools:
+//     BitSet32 bs;
+//     bs[1] = true;
+// but also like the underlying integer ([0] = least significant bit):
+//     BitSet32 bs2 = ...;
+//     bs = (bs ^ bs2) & BitSet32(0xffff);
+// The following additional functionality is provided:
+// - Construction using an initializer list.
+//     BitSet32 bs { 1, 2, 4, 8 };
+// - Efficiently iterating through the set bits:
+//     for (int i : bs)
+//         [i is the *index* of a set bit]
+//   (This uses the appropriate CPU instruction to find the next set bit in one
+//   operation.)
+// - Counting set bits using .Count() - see comment on that method.
+
+// TODO: use constexpr when MSVC gets out of the Dark Ages
+
+template <typename IntTy>
+class BitSet
+{
+	static_assert(!std::is_signed<IntTy>::value, "BitSet should not be used with signed types");
+public:
+	// A proxy for one bit of a BitSet, returned from operator[]; reads and writes go to the owning set.
+	class Ref
+	{
+	public:
+		Ref(Ref&& other) : m_bs(other.m_bs), m_mask(other.m_mask) {}
+		Ref(BitSet* bs, IntTy mask) : m_bs(bs), m_mask(mask) {}
+		operator bool() const { return (m_bs->m_val & m_mask) != 0; }
+		bool operator=(bool set)
+		{
+			m_bs->m_val = (m_bs->m_val & ~m_mask) | (set ? m_mask : 0);
+			return set;
+		}
+	private:
+		BitSet* m_bs;  // set being referenced
+		IntTy m_mask;  // mask selecting the referenced bit
+	};
+
+	// STL-like iterator (needed for range-based for): yields the index of each set bit, lowest first.
+	class Iterator
+	{
+	public:
+		Iterator(const Iterator& other) : m_val(other.m_val), m_bit(other.m_bit) {}
+		Iterator(IntTy val, int bit) : m_val(val), m_bit(bit) {}
+		Iterator& operator=(Iterator other) { new (this) Iterator(other); return *this; }
+		int operator*() { return m_bit; }
+		Iterator& operator++() // advance to the lowest set bit not yet visited, or to the end state
+		{
+			if (m_val == 0)
+			{
+				m_bit = -1;
+			}
+			else
+			{
+				int bit = LeastSignificantSetBit(m_val);
+				m_val &= ~((IntTy)1 << bit); // shift in IntTy: an int-typed 1 is undefined for bit >= 32 and left the bit set
+				m_bit = bit;
+			}
+			return *this;
+		}
+		Iterator operator++(int _)
+		{
+			Iterator other(*this);
+			++*this;
+			return other;
+		}
+		bool operator==(Iterator other) const { return m_bit == other.m_bit; } // compares the bit index only
+		bool operator!=(Iterator other) const { return m_bit != other.m_bit; }
+	private:
+		IntTy m_val;  // bits that have not been visited yet
+		int m_bit;    // index of the current set bit, or -1 once iteration has finished
+	};
+
+	BitSet() : m_val(0) {}
+	explicit BitSet(IntTy val) : m_val(val) {}
+	BitSet(std::initializer_list<int> init) // each element is the index of a bit to set
+	{
+		m_val = 0;
+		for (int bit : init)
+			m_val |= (IntTy)1 << bit;
+	}
+
+	static BitSet AllTrue(size_t count) // returns a set whose low 'count' bits are set
+	{
+		return BitSet(count == sizeof(IntTy)*8 ? ~(IntTy)0 : (((IntTy)1 << count) - 1)); // full-width shift is special-cased
+	}
+
+	Ref operator[](size_t bit) { return Ref(this, (IntTy)1 << bit); }
+	const Ref operator[](size_t bit) const { return (*const_cast<BitSet*>(this))[bit]; }
+	bool operator==(BitSet other) const { return m_val == other.m_val; }
+	bool operator!=(BitSet other) const { return m_val != other.m_val; }
+	BitSet operator|(BitSet other) const { return BitSet(m_val | other.m_val); }
+	BitSet operator&(BitSet other) const { return BitSet(m_val & other.m_val); }
+	BitSet operator^(BitSet other) const { return BitSet(m_val ^ other.m_val); }
+	BitSet operator~() const { return BitSet(~m_val); }
+	BitSet& operator|=(BitSet other) { return *this = *this | other; }
+	BitSet& operator&=(BitSet other) { return *this = *this & other; }
+	BitSet& operator^=(BitSet other) { return *this = *this ^ other; }
+	operator u32() = delete;
+	operator bool() { return m_val != 0; } // true if any bit is set
+
+	// Warning: Even though on modern CPUs this is a single fast instruction,
+	// Dolphin's official builds do not currently assume POPCNT support on x86,
+	// so slower explicit bit twiddling is generated.  Still should generally
+	// be faster than a loop.
+	unsigned int Count() const { return CountSetBits(m_val); }
+
+	Iterator begin() const { Iterator it(m_val, 0); return ++it; } // the initial ++ moves onto the first set bit (or to end)
+	Iterator end() const { return Iterator(m_val, -1); }
+
+	IntTy m_val; // the underlying integer; bit i set means i is in the set
+};
+
+}
+
+typedef BS::BitSet<u32> BitSet32;
+typedef BS::BitSet<u64> BitSet64;
diff --git a/src/common/code_block.h b/src/common/code_block.h
new file mode 100644
index 000000000..0dde07d59
--- /dev/null
+++ b/src/common/code_block.h
@@ -0,0 +1,76 @@
+// Copyright 2013 Dolphin Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/common.h"
+#include "common/memory_util.h"
+
+// Everything that needs to generate code should inherit from this.
+// You get memory management for free, plus, you can use all emitter functions without
+// having to prefix them with gen-> or something similar.
+// Example implementation:
+// class JIT : public CodeBlock<ARMXEmitter> {}
+template<class T> class CodeBlock : public T, NonCopyable
+{
+private:
+	// A privately used function to set the executable RAM space to something invalid.
+	// For debugging usefulness it should be used to set the RAM to a host specific breakpoint instruction
+	virtual void PoisonMemory() = 0;
+
+protected:
+	u8 *region;          // start of the allocated code region; nullptr when nothing is allocated
+	size_t region_size;  // size of the region in bytes; 0 when nothing is allocated
+
+public:
+	CodeBlock() : region(nullptr), region_size(0) {}
+	virtual ~CodeBlock() { if (region) FreeCodeSpace(); }
+
+	// Call this before you generate any code. Allocates 'size' bytes and points the emitter at the start.
+	void AllocCodeSpace(int size)
+	{
+		region_size = size;
+		region = (u8*)AllocateExecutableMemory(region_size);
+		T::SetCodePtr(region);
+	}
+
+	// Always clear code space with breakpoints, so that if someone accidentally executes
+	// uninitialized, it just breaks into the debugger.
+	void ClearCodeSpace()
+	{
+		PoisonMemory();
+		ResetCodePtr();
+	}
+
+	// Call this when shutting down. Don't rely on the destructor, even though it'll do the job.
+	void FreeCodeSpace()
+	{
+		FreeMemoryPages(region, region_size);
+		region = nullptr;
+		region_size = 0;
+	}
+
+	bool IsInSpace(u8 *ptr) // true if ptr lies in [region, region + region_size)
+	{
+		return (ptr >= region) && (ptr < (region + region_size));
+	}
+
+	// Cannot currently be undone. Will write protect the entire code region.
+	// Start over if you need to change the code (call FreeCodeSpace(), AllocCodeSpace()).
+	void WriteProtect()
+	{
+		WriteProtectMemory(region, region_size, true);
+	}
+
+	void ResetCodePtr() // moves the emitter's code pointer back to the start of the region
+	{
+		T::SetCodePtr(region);
+	}
+
+	size_t GetSpaceLeft() const // bytes between the emitter's current code pointer and the end of the region
+	{
+		return region_size - (T::GetCodePtr() - region);
+	}
+};
+
diff --git a/src/common/x64_abi.cpp b/src/common/x64_abi.cpp
new file mode 100644
index 000000000..2964fbf6c
--- /dev/null
+++ b/src/common/x64_abi.cpp
@@ -0,0 +1,224 @@
+// Copyright 2013 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include "common/common_types.h"
+#include "common/x64_abi.h"
+#include "common/x64_emitter.h"
+
+using namespace Gen;
+
+// Shared code between Win64 and Unix64
+
+void XEmitter::ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp)
+{
+	size_t shadow = 0;
+#if defined(_WIN32)
+	shadow = 0x20; // 32 bytes reserved on Windows only
+#endif
+	// NOTE(review): rsp_alignment looks like a running RSP misalignment; only its low 4 bits are ever used - confirm.
+	int count = (mask & ABI_ALL_GPRS).Count();
+	rsp_alignment -= count * 8; // 8 bytes accounted for per GPR selected in mask
+	size_t subtraction = 0;
+	int fpr_count = (mask & ABI_ALL_FPRS).Count();
+	if (fpr_count)
+	{
+		// If we have any XMMs to save, we must align the stack here.
+		subtraction = rsp_alignment & 0xf;
+	}
+	subtraction += 16 * fpr_count; // 16 bytes per XMM register selected in mask
+	size_t xmm_base_subtraction = subtraction;
+	subtraction += needed_frame_size;
+	subtraction += shadow;
+	// Final alignment.
+	rsp_alignment -= subtraction;
+	subtraction += rsp_alignment & 0xf;
+
+	*shadowp = shadow;
+	*subtractionp = subtraction;                       // total amount to subtract from RSP after the GPR pushes
+	*xmm_offsetp = subtraction - xmm_base_subtraction; // offset of the XMM save area from the adjusted RSP
+}
+
+size_t XEmitter::ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
+{
+	size_t shadow, subtraction, xmm_offset;
+	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
+	// Bits 0-15 of mask select GPRs (pushed in ascending register order); bits 16-31 select XMM0-XMM15.
+	for (int r : mask & ABI_ALL_GPRS)
+		PUSH((X64Reg) r);
+
+	if (subtraction)
+		SUB(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction)); // Imm8 only below 0x80
+
+	for (int x : mask & ABI_ALL_FPRS)
+	{
+		MOVAPD(MDisp(RSP, (int)xmm_offset), (X64Reg) (x - 16)); // mask bit x corresponds to XMM register x - 16
+		xmm_offset += 16;
+	}
+
+	return shadow; // the shadow size computed by ABI_CalculateFrameSize
+}
+
+void XEmitter::ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size)
+{
+	size_t shadow, subtraction, xmm_offset;
+	ABI_CalculateFrameSize(mask, rsp_alignment, needed_frame_size, &shadow, &subtraction, &xmm_offset);
+	// Mirror of ABI_PushRegistersAndAdjustStack; must be given the same arguments to compute the same layout.
+	for (int x : mask & ABI_ALL_FPRS)
+	{
+		MOVAPD((X64Reg) (x - 16), MDisp(RSP, (int)xmm_offset)); // mask bit x corresponds to XMM register x - 16
+		xmm_offset += 16;
+	}
+
+	if (subtraction)
+		ADD(64, R(RSP), subtraction >= 0x80 ? Imm32((u32)subtraction) : Imm8((u8)subtraction));
+
+	for (int r = 15; r >= 0; r--) // descending order: the reverse of the ascending push order
+	{
+		if (mask[r])
+			POP((X64Reg) r);
+	}
+}
+
+// Common functions
+void XEmitter::ABI_CallFunction(const void *func)
+{
+	u64 distance = u64(func) - (u64(code) + 5); // displacement measured from just past a 5-byte call emitted here
+	if (distance >= 0x0000000080000000ULL &&
+	    distance <  0xFFFFFFFF80000000ULL)
+	{
+		// Far call: the target does not fit a signed 32-bit displacement, so call through RAX (overwrites RAX).
+		MOV(64, R(RAX), Imm64((u64)func));
+		CALLptr(R(RAX));
+	}
+	else
+	{
+		CALL(func);
+	}
+}
+
+void XEmitter::ABI_CallFunctionC16(const void *func, u16 param1)
+{
+	MOV(32, R(ABI_PARAM1), Imm32((u32)param1));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(32, R(ABI_PARAM2), Imm32((u32)param2));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionC(const void *func, u32 param1)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCC(const void *func, u32 param1, u32 param2)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCP(const void *func, u32 param1, void *param2)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(64, R(ABI_PARAM2), Imm64((u64)param2));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	MOV(32, R(ABI_PARAM3), Imm32(param3));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	MOV(64, R(ABI_PARAM3), Imm64((u64)param3));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2, u32 param3, void *param4)
+{
+	MOV(32, R(ABI_PARAM1), Imm32(param1));
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	MOV(32, R(ABI_PARAM3), Imm32(param3));
+	MOV(64, R(ABI_PARAM4), Imm64((u64)param4));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionPC(const void *func, void *param1, u32 param2)
+{
+	MOV(64, R(ABI_PARAM1), Imm64((u64)param1));
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3)
+{
+	MOV(64, R(ABI_PARAM1), Imm64((u64)param1));
+	MOV(64, R(ABI_PARAM2), Imm64((u64)param2));
+	MOV(32, R(ABI_PARAM3), Imm32(param3));
+	ABI_CallFunction(func);
+}
+
+// Pass a register as a parameter.
+void XEmitter::ABI_CallFunctionR(const void *func, X64Reg reg1)
+{
+	if (reg1 != ABI_PARAM1)
+		MOV(32, R(ABI_PARAM1), R(reg1));
+	ABI_CallFunction(func);
+}
+
+// Pass two registers as parameters.
+void XEmitter::ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2)
+{
+	MOVTwo(64, ABI_PARAM1, reg1, ABI_PARAM2, reg2);
+	ABI_CallFunction(func);
+}
+
+void XEmitter::MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, Gen::X64Reg dst2, Gen::X64Reg src2)
+{
+	if (dst1 == src2 && dst2 == src1) // the two moves are a swap
+	{
+		XCHG(bits, R(src1), R(src2));
+	}
+	else if (src2 != dst1) // writing dst1 first cannot overwrite src2
+	{
+		if (dst1 != src1)
+			MOV(bits, R(dst1), R(src1));
+		if (dst2 != src2)
+			MOV(bits, R(dst2), R(src2));
+	}
+	else // dst1 == src2: copy src2 out to dst2 before dst1 is overwritten
+	{
+		if (dst2 != src2)
+			MOV(bits, R(dst2), R(src2));
+		if (dst1 != src1)
+			MOV(bits, R(dst1), R(src1));
+	}
+}
+
+void XEmitter::ABI_CallFunctionAC(int bits, const void *func, const Gen::OpArg &arg1, u32 param2)
+{
+	if (!arg1.IsSimpleReg(ABI_PARAM1))
+		MOV(bits, R(ABI_PARAM1), arg1);
+	MOV(32, R(ABI_PARAM2), Imm32(param2));
+	ABI_CallFunction(func);
+}
+
+void XEmitter::ABI_CallFunctionA(int bits, const void *func, const Gen::OpArg &arg1)
+{
+	if (!arg1.IsSimpleReg(ABI_PARAM1))
+		MOV(bits, R(ABI_PARAM1), arg1);
+	ABI_CallFunction(func);
+}
+
diff --git a/src/common/x64_abi.h b/src/common/x64_abi.h
new file mode 100644
index 000000000..fad6930be
--- /dev/null
+++ b/src/common/x64_abi.h
@@ -0,0 +1,60 @@
+// Copyright 2013 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include "common/bit_set.h"
+#include "common/x64_emitter.h"
+
+// x64 ABIs, and helpers to help follow them when JIT-ing code.
+// All conventions return values in EAX (+ possibly EDX).
+
+// Windows 64-bit
+// * 4-reg "fastcall" variant, very new-skool stack handling
+// * Callee moves stack pointer, to make room for shadow regs for the biggest function _it itself calls_
+// * Parameters passed in RCX, RDX, ... further parameters are MOVed into the allocated stack space.
+// Scratch:      RAX RCX RDX R8 R9 R10 R11
+// Callee-save:  RBX RSI RDI RBP R12 R13 R14 R15
+// Parameters:   RCX RDX R8 R9, further MOV-ed
+
+// Linux 64-bit
+// * 6-reg "fastcall" variant, old skool stack handling (parameters are pushed)
+// Scratch:      RAX RCX RDX RSI RDI R8 R9 R10 R11
+// Callee-save:  RBX RBP R12 R13 R14 R15
+// Parameters:   RDI RSI RDX RCX R8 R9
+
+#define ABI_ALL_FPRS BitSet32(0xffff0000)
+#define ABI_ALL_GPRS BitSet32(0x0000ffff)
+
+#ifdef _WIN32 // 64-bit Windows - the really exotic calling convention
+
+#define ABI_PARAM1 RCX
+#define ABI_PARAM2 RDX
+#define ABI_PARAM3 R8
+#define ABI_PARAM4 R9
+
+// xmm0-xmm15 use the upper 16 bits in the functions that push/pop registers.
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, R8, R9, R10, R11, \
+	            XMM0+16, XMM1+16, XMM2+16, XMM3+16, XMM4+16, XMM5+16 })
+#else  //64-bit Unix / OS X
+
+#define ABI_PARAM1 RDI
+#define ABI_PARAM2 RSI
+#define ABI_PARAM3 RDX
+#define ABI_PARAM4 RCX
+#define ABI_PARAM5 R8
+#define ABI_PARAM6 R9
+
+// FIXME: avoid pushing all 16 XMM registers when possible? most functions we call probably
+// don't actually clobber them.
+#define ABI_ALL_CALLER_SAVED \
+	(BitSet32 { RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11 } | \
+	 ABI_ALL_FPRS)
+#endif // WIN32
+
+#define ABI_ALL_CALLEE_SAVED (~ABI_ALL_CALLER_SAVED)
+
+#define ABI_RETURN RAX
+
diff --git a/src/common/x64_emitter.cpp b/src/common/x64_emitter.cpp
new file mode 100644
index 000000000..0016fac02
--- /dev/null
+++ b/src/common/x64_emitter.cpp
@@ -0,0 +1,2039 @@
+// Copyright 2013 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+#include <cinttypes>
+
+#include "common/common_types.h"
+#include "common/cpu_detect.h"
+#include "common/x64_emitter.h"
+#include "common/log.h"
+
+namespace Gen
+{
+
+// TODO(ector): Add EAX special casing, for ever so slightly smaller code.
+struct NormalOpDef
+{
+	u8 toRm8, toRm32, fromRm8, fromRm32, imm8, imm32, simm8, eaximm8, eaximm32, ext;
+};
+
+// 0xCC is code for invalid combination of immediates
+static const NormalOpDef normalops[11] =
+{
+	{0x00, 0x01, 0x02, 0x03, 0x80, 0x81, 0x83, 0x04, 0x05, 0}, //ADD
+	{0x10, 0x11, 0x12, 0x13, 0x80, 0x81, 0x83, 0x14, 0x15, 2}, //ADC
+
+	{0x28, 0x29, 0x2A, 0x2B, 0x80, 0x81, 0x83, 0x2C, 0x2D, 5}, //SUB
+	{0x18, 0x19, 0x1A, 0x1B, 0x80, 0x81, 0x83, 0x1C, 0x1D, 3}, //SBB
+
+	{0x20, 0x21, 0x22, 0x23, 0x80, 0x81, 0x83, 0x24, 0x25, 4}, //AND
+	{0x08, 0x09, 0x0A, 0x0B, 0x80, 0x81, 0x83, 0x0C, 0x0D, 1}, //OR
+
+	{0x30, 0x31, 0x32, 0x33, 0x80, 0x81, 0x83, 0x34, 0x35, 6}, //XOR
+	{0x88, 0x89, 0x8A, 0x8B, 0xC6, 0xC7, 0xCC, 0xCC, 0xCC, 0}, //MOV
+
+	{0x84, 0x85, 0x84, 0x85, 0xF6, 0xF7, 0xCC, 0xA8, 0xA9, 0}, //TEST (to == from)
+	{0x38, 0x39, 0x3A, 0x3B, 0x80, 0x81, 0x83, 0x3C, 0x3D, 7}, //CMP
+
+	{0x86, 0x87, 0x86, 0x87, 0xCC, 0xCC, 0xCC, 0xCC, 0xCC, 7}, //XCHG
+};
+
+enum NormalSSEOps
+{
+	sseCMP         = 0xC2,
+	sseADD         = 0x58, //ADD
+	sseSUB         = 0x5C, //SUB
+	sseAND         = 0x54, //AND
+	sseANDN        = 0x55, //ANDN
+	sseOR          = 0x56,
+	sseXOR         = 0x57,
+	sseMUL         = 0x59, //MUL
+	sseDIV         = 0x5E, //DIV
+	sseMIN         = 0x5D, //MIN
+	sseMAX         = 0x5F, //MAX
+	sseCOMIS       = 0x2F, //COMIS
+	sseUCOMIS      = 0x2E, //UCOMIS
+	sseSQRT        = 0x51, //SQRT
+	sseRSQRT       = 0x52, //RSQRT (NO DOUBLE PRECISION!!!)
+	sseMOVAPfromRM = 0x28, //MOVAP from RM
+	sseMOVAPtoRM   = 0x29, //MOVAP to RM
+	sseMOVUPfromRM = 0x10, //MOVUP from RM
+	sseMOVUPtoRM   = 0x11, //MOVUP to RM
+	sseMOVLPDfromRM= 0x12,
+	sseMOVLPDtoRM  = 0x13,
+	sseMOVHPDfromRM= 0x16,
+	sseMOVHPDtoRM  = 0x17,
+	sseMOVHLPS     = 0x12,
+	sseMOVLHPS     = 0x16,
+	sseMOVDQfromRM = 0x6F,
+	sseMOVDQtoRM   = 0x7F,
+	sseMASKMOVDQU  = 0xF7,
+	sseLDDQU       = 0xF0,
+	sseSHUF        = 0xC6,
+	sseMOVNTDQ     = 0xE7,
+	sseMOVNTP      = 0x2B,
+};
+
+
+void XEmitter::SetCodePtr(u8 *ptr)
+{
+	code = ptr;
+}
+
+const u8 *XEmitter::GetCodePtr() const
+{
+	return code;
+}
+
+u8 *XEmitter::GetWritableCodePtr()
+{
+	return code;
+}
+
+void XEmitter::ReserveCodeSpace(int bytes)
+{
+	for (int i = 0; i < bytes; i++)
+		*code++ = 0xCC;
+}
+
+const u8 *XEmitter::AlignCode4()
+{
+	int c = int((u64)code & 3);
+	if (c)
+		ReserveCodeSpace(4-c);
+	return code;
+}
+
+const u8 *XEmitter::AlignCode16()
+{
+	int c = int((u64)code & 15);
+	if (c)
+		ReserveCodeSpace(16-c);
+	return code;
+}
+
+const u8 *XEmitter::AlignCodePage()
+{
+	int c = int((u64)code & 4095);
+	if (c)
+		ReserveCodeSpace(4096-c);
+	return code;
+}
+
+// This operation modifies flags; check to see the flags are locked.
+// If the flags are locked, we should immediately and loudly fail before
+// causing a subtle JIT bug.
+void XEmitter::CheckFlags()
+{
+	_assert_msg_(DYNA_REC, !flags_locked, "Attempt to modify flags while flags locked!");
+}
+
+void XEmitter::WriteModRM(int mod, int reg, int rm)
+{
+	Write8((u8)((mod << 6) | ((reg & 7) << 3) | (rm & 7)));
+}
+
+void XEmitter::WriteSIB(int scale, int index, int base)
+{
+	Write8((u8)((scale << 6) | ((index & 7) << 3) | (base & 7)));
+}
+
+void OpArg::WriteRex(XEmitter *emit, int opBits, int bits, int customOp) const
+{
+	if (customOp == -1)       customOp = operandReg; // -1 means "use this operand's own operandReg"
+	u8 op = 0x40;
+	// REX.W (whether operation is a 64-bit operation)
+	if (opBits == 64)         op |= 8;
+	// REX.R (whether ModR/M reg field refers to R8-R15)
+	if (customOp & 8)         op |= 4;
+	// REX.X (whether ModR/M SIB index field refers to R8-R15)
+	if (indexReg & 8)         op |= 2;
+	// REX.B (whether ModR/M rm or SIB base or opcode reg field refers to R8-R15)
+	if (offsetOrBaseReg & 8)  op |= 1;
+	// Write REX if we have REX bits to write, or if the operation accesses
+	// SIL, DIL, BPL, or SPL ((reg & 0x10c) == 4 matches register numbers 4-7).
+	if (op != 0x40 ||
+	    (scale == SCALE_NONE && bits == 8 && (offsetOrBaseReg & 0x10c) == 4) ||
+	    (opBits == 8 && (customOp & 0x10c) == 4))
+	{
+		emit->Write8(op);
+		// Check the operation doesn't access AH, BH, CH, or DH.
+		_dbg_assert_(DYNA_REC, (offsetOrBaseReg & 0x100) == 0);
+		_dbg_assert_(DYNA_REC, (customOp & 0x100) == 0);
+	}
+}
+
+void OpArg::WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W) const
+{
+	int R = !(regOp1 & 8);          // R, X and B are written inverted: 1 means "no R8-R15 extension"
+	int X = !(indexReg & 8);
+	int B = !(offsetOrBaseReg & 8);
+	// vvvv is the inverted number of the extra register operand; 0xf is written when there is none.
+	int vvvv = (regOp2 == X64Reg::INVALID_REG) ? 0xf : (regOp2 ^ 0xf);
+
+	// do we need any VEX fields that only appear in the three-byte form?
+	if (X == 1 && B == 1 && W == 0 && mmmmm == 1)
+	{
+		u8 RvvvvLpp = (R << 7) | (vvvv << 3) | (L << 2) | pp; // L is bit 2; pp occupies bits 1:0
+		emit->Write8(0xC5);
+		emit->Write8(RvvvvLpp);
+	}
+	else
+	{
+		u8 RXBmmmmm = (R << 7) | (X << 6) | (B << 5) | mmmmm;
+		u8 WvvvvLpp = (W << 7) | (vvvv << 3) | (L << 2) | pp; // L is bit 2; pp occupies bits 1:0
+		emit->Write8(0xC4);
+		emit->Write8(RXBmmmmm);
+		emit->Write8(WvvvvLpp);
+	}
+}
+
+void OpArg::WriteRest(XEmitter *emit, int extraBytes, X64Reg _operandReg,
+	bool warn_64bit_offset) const
+{
+	if (_operandReg == INVALID_REG)
+		_operandReg = (X64Reg)this->operandReg;
+	int mod = 0;
+	int ireg = indexReg;
+	bool SIB = false;
+	int _offsetOrBaseReg = this->offsetOrBaseReg;
+
+	if (scale == SCALE_RIP) //Also, on 32-bit, just an immediate address
+	{
+		// Oh, RIP addressing.
+		_offsetOrBaseReg = 5;
+		emit->WriteModRM(0, _operandReg, _offsetOrBaseReg);
+		//TODO : add some checks
+		u64 ripAddr = (u64)emit->GetCodePtr() + 4 + extraBytes;
+		s64 distance = (s64)offset - (s64)ripAddr;
+		_assert_msg_(DYNA_REC,
+		             (distance < 0x80000000LL &&
+		              distance >=  -0x80000000LL) ||
+		             !warn_64bit_offset,
+		             "WriteRest: op out of range (0x%" PRIx64 " uses 0x%" PRIx64 ")",
+		             ripAddr, offset);
+		s32 offs = (s32)distance;
+		emit->Write32((u32)offs);
+		return;
+	}
+
+	if (scale == 0)
+	{
+		// Oh, no memory, Just a reg.
+		mod = 3; //11
+	}
+	else if (scale >= 1)
+	{
+		//Ah good, no scaling.
+		if (scale == SCALE_ATREG && !((_offsetOrBaseReg & 7) == 4 || (_offsetOrBaseReg & 7) == 5))
+		{
+			//Okay, we're good. No SIB necessary.
+			int ioff = (int)offset;
+			if (ioff == 0)
+			{
+				mod = 0;
+			}
+			else if (ioff<-128 || ioff>127)
+			{
+				mod = 2; //32-bit displacement
+			}
+			else
+			{
+				mod = 1; //8-bit displacement
+			}
+		}
+		else if (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)
+		{
+			SIB = true;
+			mod = 0;
+			_offsetOrBaseReg = 5;
+		}
+		else //if (scale != SCALE_ATREG)
+		{
+			if ((_offsetOrBaseReg & 7) == 4) //this would occupy the SIB encoding :(
+			{
+				//So we have to fake it with SIB encoding :(
+				SIB = true;
+			}
+
+			if (scale >= SCALE_1 && scale < SCALE_ATREG)
+			{
+				SIB = true;
+			}
+
+			if (scale == SCALE_ATREG && ((_offsetOrBaseReg & 7) == 4))
+			{
+				SIB = true;
+				ireg = _offsetOrBaseReg;
+			}
+
+			//Okay, we're fine. Just disp encoding.
+			//We need displacement. Which size?
+			int ioff = (int)(s64)offset;
+			if (ioff < -128 || ioff > 127)
+			{
+				mod = 2; //32-bit displacement
+			}
+			else
+			{
+				mod = 1; //8-bit displacement
+			}
+		}
+	}
+
+	// Okay. Time to do the actual writing
+	// ModRM byte:
+	int oreg = _offsetOrBaseReg;
+	if (SIB)
+		oreg = 4;
+
+	// TODO(ector): WTF is this if about? I don't remember writing it :-)
+	//if (RIP)
+	//    oreg = 5;
+
+	emit->WriteModRM(mod, _operandReg&7, oreg&7);
+
+	if (SIB)
+	{
+		//SIB byte
+		int ss;
+		switch (scale)
+		{
+		case SCALE_NONE: _offsetOrBaseReg = 4; ss = 0; break; //RSP
+		case SCALE_1: ss = 0; break;
+		case SCALE_2: ss = 1; break;
+		case SCALE_4: ss = 2; break;
+		case SCALE_8: ss = 3; break;
+		case SCALE_NOBASE_2: ss = 1; break;
+		case SCALE_NOBASE_4: ss = 2; break;
+		case SCALE_NOBASE_8: ss = 3; break;
+		case SCALE_ATREG: ss = 0; break;
+		default: _assert_msg_(DYNA_REC, 0, "Invalid scale for SIB byte"); ss = 0; break;
+		}
+		emit->Write8((u8)((ss << 6) | ((ireg&7)<<3) | (_offsetOrBaseReg&7)));
+	}
+
+	if (mod == 1) //8-bit disp
+	{
+		emit->Write8((u8)(s8)(s32)offset);
+	}
+	else if (mod == 2 || (scale >= SCALE_NOBASE_2 && scale <= SCALE_NOBASE_8)) //32-bit disp
+	{
+		emit->Write32((u32)offset);
+	}
+}
+
+// W = operand extended width (1 if 64-bit)
+// R = register# upper bit
+// X = SIB index register# upper bit
+// B = base register# upper bit
+void XEmitter::Rex(int w, int r, int x, int b)
+{
+	w = w ? 1 : 0;
+	r = r ? 1 : 0;
+	x = x ? 1 : 0;
+	b = b ? 1 : 0;
+	u8 rx = (u8)(0x40 | (w << 3) | (r << 2) | (x << 1) | (b));
+	if (rx != 0x40)
+		Write8(rx);
+}
+
+void XEmitter::JMP(const u8 *addr, bool force5Bytes)
+{
+	u64 fn = (u64)addr;
+	if (!force5Bytes)
+	{
+		s64 distance = (s64)(fn - ((u64)code + 2)); // relative to the end of the 2-byte short jump
+		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80,
+			     "Jump target too far away, needs force5Bytes = true");
+		//8 bits will do
+		Write8(0xEB);
+		Write8((u8)(s8)distance);
+	}
+	else
+	{
+		s64 distance = (s64)(fn - ((u64)code + 5)); // relative to the end of the 5-byte near jump
+
+		_assert_msg_(DYNA_REC,
+		             distance >= -0x80000000LL && distance < 0x80000000LL,
+		             "Jump target too far away, needs indirect register");
+		Write8(0xE9);
+		Write32((u32)(s32)distance);
+	}
+}
+
+void XEmitter::JMPptr(const OpArg &arg2)
+{
+	OpArg arg = arg2;
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "JMPptr - Imm argument");
+	arg.operandReg = 4;
+	arg.WriteRex(this, 0, 0);
+	Write8(0xFF);
+	arg.WriteRest(this);
+}
+
+//Can be used to trap other processors, before overwriting their code
+// not used in dolphin
+void XEmitter::JMPself()
+{
+	Write8(0xEB);
+	Write8(0xFE);
+}
+
+void XEmitter::CALLptr(OpArg arg)
+{
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "CALLptr - Imm argument");
+	arg.operandReg = 2;
+	arg.WriteRex(this, 0, 0);
+	Write8(0xFF);
+	arg.WriteRest(this);
+}
+
+void XEmitter::CALL(const void *fnptr)
+{
+	u64 distance = u64(fnptr) - (u64(code) + 5); // relative to the end of the 5-byte call (opcode + 32-bit displacement)
+	_assert_msg_(DYNA_REC,
+	             distance < 0x0000000080000000ULL ||
+	             distance >=  0xFFFFFFFF80000000ULL,
+	             "CALL out of range (%p calls %p)", code, fnptr);
+	Write8(0xE8);
+	Write32(u32(distance)); // low 32 bits; the assert above checks they represent the full distance
+}
+
+FixupBranch XEmitter::J(bool force5bytes)
+{
+	FixupBranch branch;
+	branch.type = force5bytes ? 1 : 0;         // 0 = 8-bit displacement, 1 = 32-bit displacement
+	branch.ptr = code + (force5bytes ? 5 : 2); // address just past the jump that is about to be written
+	if (!force5bytes)
+	{
+		//8 bits will do
+		Write8(0xEB);
+		Write8(0); // placeholder displacement, filled in later by SetJumpTarget
+	}
+	else
+	{
+		Write8(0xE9);
+		Write32(0); // placeholder displacement, filled in later by SetJumpTarget
+	}
+	return branch;
+}
+
+FixupBranch XEmitter::J_CC(CCFlags conditionCode, bool force5bytes)
+{
+	FixupBranch branch;
+	branch.type = force5bytes ? 1 : 0;         // 0 = 8-bit displacement, 1 = 32-bit displacement
+	branch.ptr = code + (force5bytes ? 6 : 2); // the long form written below is 6 bytes, despite the parameter name
+	if (!force5bytes)
+	{
+		//8 bits will do
+		Write8(0x70 + conditionCode);
+		Write8(0); // placeholder displacement, filled in later by SetJumpTarget
+	}
+	else
+	{
+		Write8(0x0F);
+		Write8(0x80 + conditionCode);
+		Write32(0); // placeholder displacement, filled in later by SetJumpTarget
+	}
+	return branch;
+}
+
+void XEmitter::J_CC(CCFlags conditionCode, const u8* addr)
+{
+	u64 fn = (u64)addr;
+	s64 distance = (s64)(fn - ((u64)code + 2)); // try the 2-byte short form first
+	if (distance < -0x80 || distance >= 0x80)
+	{
+		distance = (s64)(fn - ((u64)code + 6)); // recompute relative to the end of the 6-byte long form
+		_assert_msg_(DYNA_REC,
+		             distance >= -0x80000000LL && distance < 0x80000000LL,
+		             "Jump target too far away, needs indirect register");
+		Write8(0x0F);
+		Write8(0x80 + conditionCode);
+		Write32((u32)(s32)distance);
+	}
+	else
+	{
+		Write8(0x70 + conditionCode);
+		Write8((u8)(s8)distance);
+	}
+}
+
+void XEmitter::SetJumpTarget(const FixupBranch &branch)
+{
+	if (branch.type == 0) // 8-bit displacement: patch the byte just before branch.ptr
+	{
+		s64 distance = (s64)(code - branch.ptr); // from branch.ptr to the current code pointer
+		_assert_msg_(DYNA_REC, distance >= -0x80 && distance < 0x80, "Jump target too far away, needs force5Bytes = true");
+		branch.ptr[-1] = (u8)(s8)distance;
+	}
+	else if (branch.type == 1) // 32-bit displacement: patch the four bytes just before branch.ptr
+	{
+		s64 distance = (s64)(code - branch.ptr);
+		_assert_msg_(DYNA_REC, distance >= -0x80000000LL && distance < 0x80000000LL, "Jump target too far away, needs indirect register");
+		((s32*)branch.ptr)[-1] = (s32)distance;
+	}
+}
+
+// INC/DEC considered harmful on newer CPUs due to partial flag set.
+// Use ADD, SUB instead.
+
+/*
+void XEmitter::INC(int bits, OpArg arg)
+{
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "INC - Imm argument");
+	arg.operandReg = 0;
+	if (bits == 16) {Write8(0x66);}
+	arg.WriteRex(this, bits, bits);
+	Write8(bits == 8 ? 0xFE : 0xFF);
+	arg.WriteRest(this);
+}
+void XEmitter::DEC(int bits, OpArg arg)
+{
+	if (arg.IsImm()) _assert_msg_(DYNA_REC, 0, "DEC - Imm argument");
+	arg.operandReg = 1;
+	if (bits == 16) {Write8(0x66);}
+	arg.WriteRex(this, bits, bits);
+	Write8(bits == 8 ? 0xFE : 0xFF);
+	arg.WriteRest(this);
+}
+*/
+
+//Single byte opcodes
+//There is no PUSHAD/POPAD in 64-bit mode.
+void XEmitter::INT3() {Write8(0xCC);}
+void XEmitter::RET()  {Write8(0xC3);}
+void XEmitter::RET_FAST()  {Write8(0xF3); Write8(0xC3);} //two-byte return (rep ret) - recommended by AMD optimization manual for the case of jumping to a ret
+
+// Emits exactly `size` bytes of padding built from multi-byte NOP encodings.
+// Requests of 11 bytes or more are served by repeating the 11-byte form and
+// then finishing with one shorter NOP for whatever remains (if anything).
+// The first sign of decadence: optimized NOPs.
+void XEmitter::NOP(size_t size)
+{
+	_dbg_assert_(DYNA_REC, (int)size > 0);
+
+	// Longest single NOP emitted here. Even though x86 instructions are
+	// allowed to be up to 15 bytes long, AMD advises against using NOPs
+	// longer than 11 bytes because they carry a performance penalty on CPUs
+	// older than AMD family 16h.
+	static const size_t kLongestNop = 11;
+
+	// Row (n - 1) holds the n-byte encoding; only the first n entries of a
+	// row are ever read.
+	static const u8 kNopBytes[kLongestNop][kLongestNop] =
+	{
+		// 1 byte
+		{0x90},
+		// 2 bytes
+		{0x66, 0x90},
+		// 3 bytes
+		{0x0F, 0x1F, 0x00},
+		// 4 bytes
+		{0x0F, 0x1F, 0x40, 0x00},
+		// 5 bytes
+		{0x0F, 0x1F, 0x44, 0x00, 0x00},
+		// 6 bytes
+		{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00},
+		// 7 bytes
+		{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00},
+		// 8 bytes
+		{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+		// 9 bytes
+		{0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+		// 10 bytes
+		{0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+		// 11 bytes
+		{0x66, 0x66, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
+	};
+
+	// Whole 11-byte chunks first.
+	const u8 *longest = kNopBytes[kLongestNop - 1];
+	while (size >= kLongestNop)
+	{
+		for (size_t i = 0; i < kLongestNop; ++i)
+			Write8(longest[i]);
+		size -= kLongestNop;
+	}
+
+	// Nothing left over: done.
+	if (size == 0)
+		return;
+
+	// Otherwise one NOP of 1..10 bytes covers the tail.
+	const u8 *tail = kNopBytes[size - 1];
+	for (size_t i = 0; i < size; ++i)
+		Write8(tail[i]);
+}
+
+void XEmitter::PAUSE() { Write8(0xF3); NOP(); } // 0xF3 followed by NOP(); use in tight spinloops for energy saving on some CPUs
+void XEmitter::CLC() { CheckFlags(); Write8(0xF8); } // clear carry
+void XEmitter::CMC() { CheckFlags(); Write8(0xF5); } // flip carry
+void XEmitter::STC() { CheckFlags(); Write8(0xF9); } // set carry
+
+// Exchanges AH and AL (TODO carried over from the original author: double-check "xchg ah, al").
+void XEmitter::XCHG_AHAL()
+{
+	// Written as 86 E0; the alternative encoding noted by the author is 86 C4.
+	Write8(0x86);
+	Write8(0xE0);
+}
+
+// LAHF and SAHF cannot be executed on early Intel 64-bit CPUs, only on AMD ones.
+void XEmitter::LAHF() { Write8(0x9F); }
+void XEmitter::SAHF() { CheckFlags(); Write8(0x9E); }
+
+void XEmitter::PUSHF() { Write8(0x9C); } // single opcode byte 0x9C
+void XEmitter::POPF() { CheckFlags(); Write8(0x9D); } // CheckFlags() first, then opcode byte 0x9D
+
+void XEmitter::LFENCE() { Write8(0x0F); Write8(0xAE); Write8(0xE8); } // 0F AE E8
+void XEmitter::MFENCE() { Write8(0x0F); Write8(0xAE); Write8(0xF0); } // 0F AE F0
+void XEmitter::SFENCE() { Write8(0x0F); Write8(0xAE); Write8(0xF8); } // 0F AE F8
+
+// Writes a one-byte opcode with the register number folded into its low 3 bits.
+void XEmitter::WriteSimple1Byte(int bits, u8 byte, X64Reg reg)
+{
+	if (bits == 16) Write8(0x66); // 16-bit forms carry a 0x66 prefix
+	Rex(bits == 64, 0, 0, static_cast<int>(reg) >> 3); // bit 3 of the register number is handed to Rex()
+	Write8(byte + (static_cast<int>(reg) & 7));
+}
+
+// Two-byte variant: byte1 is written as-is, the register's low 3 bits are added to byte2.
+void XEmitter::WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg)
+{
+	if (bits == 16) Write8(0x66); // 16-bit forms carry a 0x66 prefix
+	Rex(bits == 64, 0, 0, static_cast<int>(reg) >> 3); // bit 3 of the register number is handed to Rex()
+	Write8(byte1);
+	Write8(byte2 + (static_cast<int>(reg) & 7));
+}
+
+// Opcode 0x99, with a 0x66 prefix when bits == 16 and the first Rex() argument set when bits == 64.
+void XEmitter::CWD(int bits)
+{
+	if (bits == 16) Write8(0x66);
+	Rex(bits == 64, 0, 0, 0);
+	Write8(0x99);
+}
+
+// Opcode 0x98, with a 0x66 prefix when bits == 8 and the first Rex() argument set when bits == 32.
+void XEmitter::CBW(int bits)
+{
+	if (bits == 8) Write8(0x66); // NOTE(review): `bits` looks like the source width here (8/16/32) - confirm against callers
+	Rex(bits == 32, 0, 0, 0);
+	Write8(0x98);
+}
+
+//Simple opcodes
+
+
+// Register push/pop do not need the wide flag to be 64-bit, hence the 32 passed to WriteSimple1Byte.
+void XEmitter::PUSH(X64Reg reg) { WriteSimple1Byte(32, 0x50, reg); }
+void XEmitter::POP(X64Reg reg) { WriteSimple1Byte(32, 0x58, reg); }
+
+// Pushes a plain register, an 8/16/32-bit immediate, or otherwise an r/m operand.
+// `bits` is only consulted for the r/m form.
+void XEmitter::PUSH(int bits, const OpArg &reg)
+{
+	if (reg.IsSimpleReg())
+	{
+		PUSH(reg.GetSimpleReg());
+		return;
+	}
+	if (!reg.IsImm())
+	{
+		// r/m form: opcode 0xFF with 6 in the ModRM reg field.
+		if (bits == 16) Write8(0x66);
+		reg.WriteRex(this, bits, bits);
+		Write8(0xFF);
+		reg.WriteRest(this, 0, (X64Reg)6);
+		return;
+	}
+	// Immediate forms, selected by the width of the immediate itself.
+	switch (reg.GetImmBits())
+	{
+	case 8:
+		Write8(0x6A); Write8((u8)(s8)reg.offset);
+		break;
+	case 16:
+		Write8(0x66); Write8(0x68); Write16((u16)(s16)(s32)reg.offset);
+		break;
+	case 32:
+		Write8(0x68); Write32((u32)reg.offset);
+		break;
+	default:
+		_assert_msg_(DYNA_REC, 0, "PUSH - Bad imm bits");
+		break;
+	}
+}
+
+// Only plain-register operands are supported; any other operand just trips the assert.
+void XEmitter::POP(int /*bits*/, const OpArg &reg)
+{
+	const bool isPlainReg = reg.IsSimpleReg();
+	_assert_msg_(DYNA_REC, isPlainReg, "POP - Unsupported encoding");
+	if (isPlainReg) POP(reg.GetSimpleReg());
+}
+
+// Byte-swaps `reg` at the requested width.
+void XEmitter::BSWAP(int bits, X64Reg reg)
+{
+	switch (bits)
+	{
+	case 8:
+		// Do nothing - can't bswap a single byte...
+		break;
+	case 16:
+		// Two bytes are swapped with a 16-bit rotate by 8.
+		ROL(16, R(reg), Imm8(8));
+		break;
+	default:
+		if (bits >= 32)
+			WriteSimple2Byte(bits, 0x0F, 0xC8, reg);
+		else
+			_assert_msg_(DYNA_REC, 0, "BSWAP - Wrong number of bits");
+	}
+}
+
+// Undefined opcode - reserved
+// If we ever need a way to always cause a non-breakpoint hard exception...
+void XEmitter::UD2()
+{
+	Write8(0x0F);
+	Write8(0x0B);
+}
+
+// 0F 18 with the prefetch level stored as the operand's reg-field value.
+void XEmitter::PREFETCH(PrefetchLevel level, OpArg arg)
+{
+	_assert_msg_(DYNA_REC, !arg.IsImm(), "PREFETCH - Imm argument");
+	arg.operandReg = static_cast<u8>(level);
+	arg.WriteRex(this, 0, 0);
+	Write8(0x0F); Write8(0x18);
+	arg.WriteRest(this);
+}
+
+// 0F (90 + condition code) with 0 as the operand's reg-field value; REX is computed for an 8-bit operand.
+void XEmitter::SETcc(CCFlags flag, OpArg dest)
+{
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "SETcc - Imm argument");
+	dest.operandReg = 0;
+	dest.WriteRex(this, 0, 8);
+	Write8(0x0F); Write8(0x90 + static_cast<u8>(flag));
+	dest.WriteRest(this);
+}
+
+// 0F (40 + condition code): conditional move from `src` into register `dest`; 8-bit is rejected.
+void XEmitter::CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "CMOVcc - Imm argument");
+	_assert_msg_(DYNA_REC, bits != 8, "CMOVcc - 8 bits unsupported");
+	if (bits == 16) Write8(0x66); // 16-bit forms carry a 0x66 prefix
+	src.operandReg = dest;
+	src.WriteRex(this, bits, bits);
+	Write8(0x0F);
+	Write8(0x40 + static_cast<u8>(flag));
+	src.WriteRest(this);
+}
+
+// Shared encoder behind MUL/DIV/IMUL/IDIV/NEG/NOT with a single operand.
+// `ext` selects the operation and is stored as the operand's reg-field
+// value; the opcode byte is F6 for 8-bit operations and F7 for the rest.
+void XEmitter::WriteMulDivType(int bits, OpArg src, int ext)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "WriteMulDivType - Imm argument");
+	CheckFlags();
+	src.operandReg = ext;
+
+	if (bits == 16)
+		Write8(0x66); // 16-bit forms carry a 0x66 prefix
+	src.WriteRex(this, bits, bits, 0);
+
+	// 0xF6 is the byte-sized form; every other width uses 0xF7.
+	const u8 opcode = (bits == 8) ? 0xF6 : 0xF7;
+	Write8(opcode);
+	src.WriteRest(this);
+}
+
+void XEmitter::MUL(int bits, OpArg src) { WriteMulDivType(bits, src, 4); } // extension 4
+void XEmitter::DIV(int bits, OpArg src) { WriteMulDivType(bits, src, 6); } // extension 6
+void XEmitter::IMUL(int bits, OpArg src) { WriteMulDivType(bits, src, 5); } // extension 5
+void XEmitter::IDIV(int bits, OpArg src) { WriteMulDivType(bits, src, 7); } // extension 7
+void XEmitter::NEG(int bits, OpArg src) { WriteMulDivType(bits, src, 3); } // extension 3
+void XEmitter::NOT(int bits, OpArg src) { WriteMulDivType(bits, src, 2); } // extension 2
+
+// Writes [66] [F3] [REX] 0F byte2 followed by the operand bytes, with `dest`
+// stored as the operand's reg-field value. `rep` adds the F3 prefix.
+void XEmitter::WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "WriteBitSearchType - Imm argument");
+	CheckFlags();
+	src.operandReg = static_cast<u8>(dest);
+	if (bits == 16) Write8(0x66);
+	if (rep) Write8(0xF3);
+	src.WriteRex(this, bits, bits);
+	Write8(0x0F);
+	Write8(byte2);
+	src.WriteRest(this);
+}
+
+// 0F C3, emitted through the bit-search encoder with the register passed as its `dest` and the r/m operand as its `src`.
+void XEmitter::MOVNTI(int bits, OpArg dest, X64Reg src)
+{
+	_assert_msg_(DYNA_REC, bits > 16, "MOVNTI - bits<=16");
+	WriteBitSearchType(bits, src, dest, 0xC3);
+}
+
+void XEmitter::BSF(int bits, X64Reg dest, OpArg src) { WriteBitSearchType(bits, dest, src, 0xBC); } // bottom bit to top bit
+void XEmitter::BSR(int bits, X64Reg dest, OpArg src) { WriteBitSearchType(bits, dest, src, 0xBD); } // top bit to bottom bit
+
+// F3-prefixed 0F BC; raises a PanicAlert first when cpu_info reports no BMI1.
+void XEmitter::TZCNT(int bits, X64Reg dest, OpArg src)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI1) PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+	WriteBitSearchType(bits, dest, src, 0xBC, true);
+}
+// F3-prefixed 0F BD; raises a PanicAlert first when cpu_info reports no LZCNT.
+void XEmitter::LZCNT(int bits, X64Reg dest, OpArg src)
+{
+	CheckFlags();
+	if (!cpu_info.bLZCNT) PanicAlert("Trying to use LZCNT on a system that doesn't support it. Bad programmer.");
+	WriteBitSearchType(bits, dest, src, 0xBD, true);
+}
+
+// Sign-extending move of an sbits-wide source into a dbits-wide register.
+// Equal widths degrade to a plain MOV. Supported sources: 8-bit (0F BE),
+// 16-bit (0F BF) and 32-bit into a 64-bit destination (63).
+void XEmitter::MOVSX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "MOVSX - Imm argument");
+	if (dbits == sbits)
+	{
+		MOV(dbits, R(dest), src);
+		return;
+	}
+
+	src.operandReg = static_cast<u8>(dest);
+	if (dbits == 16) Write8(0x66);
+	src.WriteRex(this, dbits, sbits);
+
+	if (sbits == 8 || sbits == 16)
+	{
+		// Two-byte opcode; the second byte tells byte and word sources apart.
+		Write8(0x0F);
+		Write8(sbits == 8 ? 0xBE : 0xBF);
+	}
+	else if (sbits == 32 && dbits == 64)
+	{
+		Write8(0x63);
+	}
+	else
+	{
+		Crash(); // no encoding is provided for this width combination
+	}
+	src.WriteRest(this);
+}
+
+// Zero-extending move of an sbits-wide source into a dbits-wide register.
+// Equal widths degrade to a plain MOV. Supported sources: 8-bit (0F B6),
+// 16-bit (0F B7) and 32-bit into a 64-bit destination (plain 8B move).
+void XEmitter::MOVZX(int dbits, int sbits, X64Reg dest, OpArg src)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "MOVZX - Imm argument");
+	if (dbits == sbits)
+	{
+		MOV(dbits, R(dest), src);
+		return;
+	}
+
+	src.operandReg = static_cast<u8>(dest);
+	if (dbits == 16) Write8(0x66);
+	// A 32-bit result is automatically zero extended to 64 bits, so a
+	// 64-bit destination is encoded as a 32-bit one.
+	const int rexBits = (dbits == 64) ? 32 : dbits;
+	src.WriteRex(this, rexBits, sbits);
+	if (sbits == 8 || sbits == 16)
+	{
+		Write8(0x0F);
+		Write8(sbits == 8 ? 0xB6 : 0xB7);
+	}
+	else if (sbits == 32 && dbits == 64)
+	{
+		Write8(0x8B);
+	}
+	else
+	{
+		_assert_msg_(DYNA_REC, 0, "MOVZX - Invalid size");
+	}
+	src.WriteRest(this);
+}
+
+// MOVBE load (0F 38 F0, register destination) or store (0F 38 F1, register source).
+// An 8-bit request is forwarded to MOV instead.
+void XEmitter::MOVBE(int bits, const OpArg& dest, const OpArg& src)
+{
+	_assert_msg_(DYNA_REC, cpu_info.bMOVBE, "Generating MOVBE on a system that does not support it.");
+	if (bits == 8)
+	{
+		MOV(bits, dest, src);
+		return;
+	}
+
+	if (bits == 16) Write8(0x66); // 16-bit forms carry a 0x66 prefix
+
+	const bool isLoad = dest.IsSimpleReg();
+	if (!isLoad && !src.IsSimpleReg())
+	{
+		_assert_msg_(DYNA_REC, 0, "MOVBE: Not loading or storing to mem");
+		return;
+	}
+
+	// Exactly one side is the register; the other one is expected to be
+	// memory and is the operand that gets encoded.
+	const OpArg& mem = isLoad ? src : dest;
+	const X64Reg gpr = isLoad ? dest.GetSimpleReg() : src.GetSimpleReg();
+	_assert_msg_(DYNA_REC, !isLoad || (!src.IsSimpleReg() && !src.IsImm()), "MOVBE: Loading from !mem");
+	_assert_msg_(DYNA_REC, isLoad || (!dest.IsSimpleReg() && !dest.IsImm()), "MOVBE: Storing to !mem");
+
+	mem.WriteRex(this, bits, bits, gpr);
+	Write8(0x0F); Write8(0x38); Write8(isLoad ? 0xF0 : 0xF1);
+	mem.WriteRest(this, 0, gpr);
+}
+
+
+// Opcode 8D with `dest` stored as the operand's reg-field value; WriteRest's last argument is set for 64-bit.
+void XEmitter::LEA(int bits, X64Reg dest, OpArg src)
+{
+	_assert_msg_(DYNA_REC, !src.IsImm(), "LEA - Imm argument");
+	src.operandReg = static_cast<u8>(dest);
+	if (bits == 16) Write8(0x66); //TODO: performance warning
+	src.WriteRex(this, bits, bits);
+	Write8(0x8D);
+	src.WriteRest(this, 0, INVALID_REG, bits == 64);
+}
+
+// Shared encoder for the rotate/shift group; `ext` selects the operation and is
+// stored as the operand's reg-field value. The count is CL or an 8-bit immediate:
+//   immediate equal to 1 -> D0 (8-bit) / D1
+//   any other immediate  -> C0 (8-bit) / C1, followed by the immediate byte
+//   CL                   -> D2 (8-bit) / D3
+void XEmitter::WriteShift(int bits, OpArg dest, OpArg &shift, int ext)
+{
+	CheckFlags();
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "WriteShift - can't shift imms");
+
+	const bool badReg = shift.IsSimpleReg() && shift.GetSimpleReg() != ECX;
+	const bool badImm = shift.IsImm() && shift.GetImmBits() != 8;
+	_assert_msg_(DYNA_REC, !(badReg || badImm), "WriteShift - illegal argument");
+
+	dest.operandReg = ext;
+	if (bits == 16)
+		Write8(0x66);
+	dest.WriteRex(this, bits, bits, 0);
+
+	const bool byteOp = (bits == 8);
+	const bool immCount = (shift.GetImmBits() == 8);
+	// A count of exactly 1 has its own opcode and is not followed by an
+	// immediate byte.
+	const bool writeImm = immCount && static_cast<u8>(shift.offset) != 1;
+
+	u8 opcode;
+	if (!immCount)
+		opcode = byteOp ? 0xD2 : 0xD3;
+	else if (writeImm)
+		opcode = byteOp ? 0xC0 : 0xC1;
+	else
+		opcode = byteOp ? 0xD0 : 0xD1;
+	Write8(opcode);
+
+	// WriteRest is told how many immediate bytes follow the operand.
+	dest.WriteRest(this, writeImm ? 1 : 0);
+	if (writeImm)
+		Write8(static_cast<u8>(shift.offset));
+}
+
+// Large rotates and shifts are slower on Intel than on AMD;
+// Intel likes to rotate by 1, and that op is smaller too.
+void XEmitter::ROL(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 0); }
+void XEmitter::ROR(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 1); }
+void XEmitter::RCL(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 2); }
+void XEmitter::RCR(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 3); }
+void XEmitter::SHL(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 4); }
+void XEmitter::SHR(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 5); }
+void XEmitter::SAR(int bits, OpArg dest, OpArg shift) { WriteShift(bits, dest, shift, 7); }
+
+// Shared encoder for BT/BTS/BTR/BTC; `ext` (4..7) selects the operation.
+// The bit index is either an imm8 (0F BA /ext ib) or a register (0F 83+8*ext /r).
+// Don't use a memory destination because it's slow.
+void XEmitter::WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext)
+{
+	CheckFlags();
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "WriteBitTest - can't test imms");
+	_assert_msg_(DYNA_REC, !(index.IsImm() && index.GetImmBits() != 8), "WriteBitTest - illegal argument");
+	if (bits == 16)
+		Write8(0x66);
+	if (index.IsImm())
+	{
+		dest.WriteRex(this, bits, bits);
+		Write8(0x0F); Write8(0xBA);
+		// One immediate byte follows the operand.
+		dest.WriteRest(this, 1, (X64Reg)ext);
+		Write8((u8)index.offset);
+	}
+	else
+	{
+		X64Reg operand = index.GetSimpleReg();
+		dest.WriteRex(this, bits, bits, operand);
+		Write8(0x0F); Write8(0x83 + 8*ext);
+		// Nothing follows the operand in the register form, so no extra
+		// bytes are reported to WriteRest (the CL forms of SHRD/SHLD do
+		// the same).
+		dest.WriteRest(this, 0, operand);
+	}
+}
+
+void XEmitter::BT(int bits, OpArg dest, OpArg index) { WriteBitTest(bits, dest, index, 4); } // extension 4
+void XEmitter::BTS(int bits, OpArg dest, OpArg index) { WriteBitTest(bits, dest, index, 5); } // extension 5
+void XEmitter::BTR(int bits, OpArg dest, OpArg index) { WriteBitTest(bits, dest, index, 6); } // extension 6
+void XEmitter::BTC(int bits, OpArg dest, OpArg index) { WriteBitTest(bits, dest, index, 7); } // extension 7
+
+// SHRD: encodes `dest` as the r/m operand and register `src` in the reg
+// field. The count is CL or an 8-bit immediate:
+//   immediate -> 0F AC, operand bytes, immediate byte
+//   CL        -> 0F AD, operand bytes
+void XEmitter::SHRD(int bits, OpArg dest, OpArg src, OpArg shift)
+{
+	CheckFlags();
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "SHRD - can't use imms as destination");
+	_assert_msg_(DYNA_REC, src.IsSimpleReg(), "SHRD - must use simple register as source");
+
+	// Only CL or an 8-bit immediate may supply the count.
+	const bool badReg = shift.IsSimpleReg() && shift.GetSimpleReg() != ECX;
+	const bool badImm = shift.IsImm() && shift.GetImmBits() != 8;
+	_assert_msg_(DYNA_REC, !(badReg || badImm), "SHRD - illegal shift");
+
+	if (bits == 16)
+		Write8(0x66);
+	const X64Reg operand = src.GetSimpleReg();
+	dest.WriteRex(this, bits, bits, operand);
+
+	// The second opcode byte depends on where the count comes from:
+	// AC for an immediate, AD for CL.
+	const bool immCount = (shift.GetImmBits() == 8);
+	Write8(0x0F);
+	Write8(immCount ? 0xAC : 0xAD);
+
+	// WriteRest is told how many immediate bytes follow the operand
+	// (one for the immediate form, none for the CL form).
+	dest.WriteRest(this, immCount ? 1 : 0, operand);
+	if (immCount)
+		Write8(static_cast<u8>(shift.offset));
+}
+
+// SHLD: encodes `dest` as the r/m operand and register `src` in the reg
+// field. The count is CL or an 8-bit immediate:
+//   immediate -> 0F A4, operand bytes, immediate byte
+//   CL        -> 0F A5, operand bytes
+void XEmitter::SHLD(int bits, OpArg dest, OpArg src, OpArg shift)
+{
+	CheckFlags();
+	_assert_msg_(DYNA_REC, !dest.IsImm(), "SHLD - can't use imms as destination");
+	_assert_msg_(DYNA_REC, src.IsSimpleReg(), "SHLD - must use simple register as source");
+
+	// Only CL or an 8-bit immediate may supply the count.
+	const bool badReg = shift.IsSimpleReg() && shift.GetSimpleReg() != ECX;
+	const bool badImm = shift.IsImm() && shift.GetImmBits() != 8;
+	_assert_msg_(DYNA_REC, !(badReg || badImm), "SHLD - illegal shift");
+
+	if (bits == 16)
+		Write8(0x66);
+	const X64Reg operand = src.GetSimpleReg();
+	dest.WriteRex(this, bits, bits, operand);
+
+	// Second opcode byte: A4 for an immediate count, A5 for CL.
+	const bool immCount = (shift.GetImmBits() == 8);
+	Write8(0x0F);
+	Write8(immCount ? 0xA4 : 0xA5);
+
+	// WriteRest is told how many immediate bytes follow the operand
+	// (one for the immediate form, none for the CL form).
+	dest.WriteRest(this, immCount ? 1 : 0, operand);
+	if (immCount)
+		Write8(static_cast<u8>(shift.offset));
+}
+
+// Writes [66] [REX] op followed by this operand's bytes, with _operandReg
+// as the reg-field value. Note that this overwrites this->operandReg.
+void OpArg::WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg _operandReg, int bits)
+{
+	if (bits == 16) emit->Write8(0x66);
+	operandReg = static_cast<u8>(_operandReg);
+	WriteRex(emit, bits, bits);
+	emit->Write8(op);
+	WriteRest(emit);
+}
+
+// Encodes a two-operand "normal" op with *this as the r/m side; `operand` can either be immediate or register, and toRM picks the direction in the register case.
+void OpArg::WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const
+{
+	X64Reg _operandReg; // value for the ModRM reg field; assigned in both branches below
+	if (IsImm())
+	{
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Imm argument, wrong order");
+	}
+	// 16-bit operations carry the 0x66 prefix.
+	if (bits == 16)
+		emit->Write8(0x66);
+	// Width in bits of the immediate that still has to be written after WriteRest (0 = none).
+	int immToWrite = 0;
+	// Immediate operand: the opcode is picked from the normalops table by immediate width and `bits`.
+	if (operand.IsImm())
+	{
+		WriteRex(emit, bits, bits);
+		// With an immediate operand only the "to r/m" direction is accepted.
+		if (!toRM)
+		{
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Writing to Imm (!toRM)");
+		}
+		// A table entry of 0xCC is treated below as "this form is not available for op".
+		if (operand.scale == SCALE_IMM8 && bits == 8)
+		{
+			// op al, imm8 (`!scale` presumably identifies a plain register operand - confirm)
+			if (!scale && offsetOrBaseReg == AL && normalops[op].eaximm8 != 0xCC)
+			{
+				emit->Write8(normalops[op].eaximm8);
+				emit->Write8((u8)operand.offset);
+				return; // short form: nothing else is written
+			}
+			// mov reg, imm8 (opcode B0 + low 3 bits of the register)
+			if (!scale && op == nrmMOV)
+			{
+				emit->Write8(0xB0 + (offsetOrBaseReg & 7));
+				emit->Write8((u8)operand.offset);
+				return; // short form: nothing else is written
+			}
+			// op r/m8, imm8
+			emit->Write8(normalops[op].imm8);
+			immToWrite = 8;
+		}
+		else if ((operand.scale == SCALE_IMM16 && bits == 16) ||
+				 (operand.scale == SCALE_IMM32 && bits == 32) ||
+				 (operand.scale == SCALE_IMM32 && bits == 64))
+		{
+			// Try to save immediate size if we can, but first check to see
+			// if the instruction supports simm8 (the value must survive sign-extension from 8 bits).
+			// op r/m, imm8
+			if (normalops[op].simm8 != 0xCC &&
+			    ((operand.scale == SCALE_IMM16 && (s16)operand.offset == (s8)operand.offset) ||
+			     (operand.scale == SCALE_IMM32 && (s32)operand.offset == (s8)operand.offset)))
+			{
+				emit->Write8(normalops[op].simm8);
+				immToWrite = 8;
+			}
+			else
+			{
+				// mov reg, imm (opcode B8 + low 3 bits of the register; not used for 64-bit)
+				if (!scale && op == nrmMOV && bits != 64)
+				{
+					emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+					if (bits == 16)
+						emit->Write16((u16)operand.offset);
+					else
+						emit->Write32((u32)operand.offset);
+					return; // short form: nothing else is written
+				}
+				// op eax, imm
+				if (!scale && offsetOrBaseReg == EAX && normalops[op].eaximm32 != 0xCC)
+				{
+					emit->Write8(normalops[op].eaximm32);
+					if (bits == 16)
+						emit->Write16((u16)operand.offset);
+					else
+						emit->Write32((u32)operand.offset);
+					return; // short form: nothing else is written
+				}
+				// op r/m, imm (16-bit immediate for 16-bit ops, 32-bit immediate otherwise)
+				emit->Write8(normalops[op].imm32);
+				immToWrite = bits == 16 ? 16 : 32;
+			}
+		}
+		else if ((operand.scale == SCALE_IMM8 && bits == 16) ||
+				 (operand.scale == SCALE_IMM8 && bits == 32) ||
+				 (operand.scale == SCALE_IMM8 && bits == 64))
+		{
+			// op r/m, imm8 - NOTE(review): simm8 is used here without the 0xCC availability check made in the branch above; confirm callers never get here with such an op
+			emit->Write8(normalops[op].simm8);
+			immToWrite = 8;
+		}
+		else if (operand.scale == SCALE_IMM64 && bits == 64)
+		{
+			if (scale)
+			{
+				_assert_msg_(DYNA_REC, 0, "WriteNormalOp - MOV with 64-bit imm requres register destination");
+			}
+			// mov reg64, imm64
+			else if (op == nrmMOV)
+			{
+				emit->Write8(0xB8 + (offsetOrBaseReg & 7));
+				emit->Write64((u64)operand.offset);
+				return;
+			}
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Only MOV can take 64-bit imm"); // reached for every non-MOV op, and also right after the assert above when `scale` is set
+		}
+		else
+		{
+			_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case %d %d", operand.scale, bits);
+		}
+		_operandReg = (X64Reg)normalops[op].ext; // pass the opcode extension in the REG field of ModRM
+	}
+	else
+	{
+		_operandReg = (X64Reg)operand.offsetOrBaseReg; // register operand goes into the reg field
+		WriteRex(emit, bits, bits, _operandReg);
+		// op r/m, reg
+		if (toRM)
+		{
+			emit->Write8(bits == 8 ? normalops[op].toRm8 : normalops[op].toRm32);
+		}
+		// op reg, r/m
+		else
+		{
+			emit->Write8(bits == 8 ? normalops[op].fromRm8 : normalops[op].fromRm32);
+		}
+	}
+	WriteRest(emit, immToWrite >> 3, _operandReg); // immToWrite >> 3 = number of immediate bytes that follow
+	switch (immToWrite) // finally write the pending immediate, if any
+	{
+	case 0:
+		break;
+	case 8:
+		emit->Write8((u8)operand.offset);
+		break;
+	case 16:
+		emit->Write16((u16)operand.offset);
+		break;
+	case 32:
+		emit->Write32((u32)operand.offset);
+		break;
+	default:
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - Unhandled case");
+	}
+}
+
+// Decides which operand does the encoding for a two-operand instruction:
+//   a2 is an immediate              -> a1 encodes "op a1, imm"   (toRM)
+//   a1 is a register, a2 is not imm -> a2 encodes "op a1, a2"    (!toRM)
+//   otherwise                       -> a1 encodes "op a1, a2"    (toRM)
+void XEmitter::WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2)
+{
+	if (a1.IsImm())
+	{
+		//Booh! Can't write to an imm
+		_assert_msg_(DYNA_REC, 0, "WriteNormalOp - a1 cannot be imm");
+		return;
+	}
+
+	if (!a2.IsImm() && a1.IsSimpleReg())
+	{
+		a2.WriteNormalOp(emit, false, op, a1, bits);
+		return;
+	}
+
+	if (!a2.IsImm())
+	{
+		_assert_msg_(DYNA_REC, a2.IsSimpleReg() || a2.IsImm(), "WriteNormalOp - a1 and a2 cannot both be memory");
+	}
+	a1.WriteNormalOp(emit, true, op, a2, bits);
+}
+
+void XEmitter::ADD(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmADD, a1, a2); }
+void XEmitter::ADC(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmADC, a1, a2); }
+void XEmitter::SUB(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmSUB, a1, a2); }
+void XEmitter::SBB(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmSBB, a1, a2); }
+void XEmitter::AND(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmAND, a1, a2); }
+void XEmitter::OR(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmOR, a1, a2); }
+void XEmitter::XOR(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmXOR, a1, a2); }
+void XEmitter::MOV(int bits, const OpArg &a1, const OpArg &a2) // unlike the ALU ops, does not call CheckFlags()
+{
+	const bool sameReg = a1.IsSimpleReg() && a2.IsSimpleReg() && a1.GetSimpleReg() == a2.GetSimpleReg(); // register moved onto itself
+	if (sameReg) ERROR_LOG(DYNA_REC, "Redundant MOV @ %p - bug in JIT?", code); // logged only; the MOV is still emitted
+	WriteNormalOp(this, bits, nrmMOV, a1, a2);
+}
+void XEmitter::TEST(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmTEST, a1, a2); }
+void XEmitter::CMP(int bits, const OpArg &a1, const OpArg &a2) { CheckFlags(); WriteNormalOp(this, bits, nrmCMP, a1, a2); }
+void XEmitter::XCHG(int bits, const OpArg &a1, const OpArg &a2) { WriteNormalOp(this, bits, nrmXCHG, a1, a2); } // no CheckFlags() call here
+
+// Three-operand IMUL with register regOp, r/m operand a1 and immediate a2.
+// The short form 6B (one immediate byte) is used whenever the immediate
+// survives sign-extension from 8 bits; otherwise 69 is used with a 16-bit
+// immediate for 16-bit operations or a 32-bit immediate for 32/64-bit ones.
+void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a1, OpArg a2)
+{
+	CheckFlags();
+	if (bits == 8)
+	{
+		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
+		return;
+	}
+	if (a1.IsImm())
+	{
+		_assert_msg_(DYNA_REC, 0, "IMUL - second arg cannot be imm!");
+		return;
+	}
+	if (!a2.IsImm())
+	{
+		_assert_msg_(DYNA_REC, 0, "IMUL - third arg must be imm!");
+		return;
+	}
+	if (bits == 16) Write8(0x66);
+	a1.WriteRex(this, bits, bits, regOp);
+	const int immBits = a2.GetImmBits();
+	const bool fitsByte =
+		immBits == 8 ||
+		(immBits == 16 && (s8)a2.offset == (s16)a2.offset) ||
+		(immBits == 32 && (s8)a2.offset == (s32)a2.offset);
+	if (fitsByte)
+	{
+		Write8(0x6B);
+		a1.WriteRest(this, 1, regOp);
+		Write8((u8)a2.offset);
+		return;
+	}
+	Write8(0x69);
+	if (immBits == 16 && bits == 16)
+	{
+		a1.WriteRest(this, 2, regOp);
+		Write16((u16)a2.offset);
+	}
+	else if (immBits == 32 && (bits == 32 || bits == 64))
+	{
+		a1.WriteRest(this, 4, regOp);
+		Write32((u32)a2.offset);
+	}
+	else
+	{
+		_assert_msg_(DYNA_REC, 0, "IMUL - unhandled case!");
+	}
+}
+
+// Two-operand IMUL. An immediate is routed to the three-operand form with
+// regOp as both register and r/m operand; otherwise 0F AF is emitted.
+void XEmitter::IMUL(int bits, X64Reg regOp, OpArg a)
+{
+	CheckFlags();
+	if (bits == 8)
+	{
+		_assert_msg_(DYNA_REC, 0, "IMUL - illegal bit size!");
+		return;
+	}
+	if (a.IsImm())
+	{
+		IMUL(bits, regOp, R(regOp), a);
+		return;
+	}
+
+	if (bits == 16) Write8(0x66);
+	a.WriteRex(this, bits, bits, regOp);
+	Write8(0x0F);
+	Write8(0xAF);
+	a.WriteRest(this, 0, regOp);
+}
+
+
+// Writes [prefix] [REX] 0F [high byte of op] low byte of op, then the operand bytes.
+// An opPrefix of 0 means no prefix; `extrabytes` is passed through to WriteRest.
+void XEmitter::WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (opPrefix != 0) Write8(opPrefix);
+	arg.operandReg = regOp;
+	arg.WriteRex(this, 0, 0);
+	Write8(0x0F);
+	if (op > 0xFF) Write8((op >> 8) & 0xFF); // ops above 0xFF carry an extra byte in their high half
+	Write8(op & 0xFF);
+	arg.WriteRest(this, extrabytes);
+}
+
+// Convenience overload without a second register:
+// INVALID_REG is passed to the general overload in its place.
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W, int extrabytes)
+{ WriteAVXOp(opPrefix, op, regOp, INVALID_REG, arg, W, extrabytes); }
+
+// VEX mmmmm value for `op`: 3 when its high byte is 0x3A, 2 when it is 0x38, 1 otherwise.
+// Currently, only 0x38 and 0x3A are used as secondary escape byte.
+static int GetVEXmmmmm(u16 op)
+{
+	const int escape = op >> 8;
+
+	if (escape == 0x3A) return 3;
+	if (escape == 0x38) return 2;
+	return 1;
+}
+
+// VEX pp value for a prefix byte: 0x66 -> 1, 0xF3 -> 2, 0xF2 -> 3, anything else -> 0.
+static int GetVEXpp(u8 opPrefix)
+{
+	switch (opPrefix)
+	{
+	case 0x66: return 1;
+	case 0xF3: return 2;
+	case 0xF2: return 3;
+	default:   return 0;
+	}
+}
+
+// VEX-encoded operation: VEX prefix, low byte of `op`, then the operand bytes. Raises a PanicAlert first when cpu_info reports no AVX.
+void XEmitter::WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W, int extrabytes)
+{
+	if (!cpu_info.bAVX) PanicAlert("Trying to use AVX on a system that doesn't support it. Bad programmer.");
+	const int vexMap = GetVEXmmmmm(op);
+	const int vexPrefix = GetVEXpp(opPrefix);
+	// FIXME: we currently don't support 256-bit instructions, and "size" is not the vector size here
+	arg.WriteVex(this, regOp1, regOp2, 0, vexPrefix, vexMap, W);
+	Write8(op & 0xFF);
+	arg.WriteRest(this, extrabytes, regOp1);
+}
+
+// Like WriteAVXOp, but more general; covers GPR-based VEX operations, like BMI1/2.
+// `size` is expected to be 32 or 64 and decides the last argument handed to WriteVex.
+void XEmitter::WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	if (size != 32 && size != 64) PanicAlert("VEX GPR instructions only support 32-bit and 64-bit modes!");
+	const int vexMap = GetVEXmmmmm(op);
+	const int vexPrefix = GetVEXpp(opPrefix);
+	arg.WriteVex(this, regOp1, regOp2, 0, vexPrefix, vexMap, size == 64);
+	Write8(op & 0xFF);
+	arg.WriteRest(this, extrabytes, regOp1);
+}
+
+// WriteVEXOp preceded by CheckFlags() and a PanicAlert when cpu_info reports no BMI1.
+void XEmitter::WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI1) PanicAlert("Trying to use BMI1 on a system that doesn't support it. Bad programmer.");
+	WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+// WriteVEXOp preceded by CheckFlags() and a PanicAlert when cpu_info reports no BMI2.
+void XEmitter::WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes)
+{
+	CheckFlags();
+	if (!cpu_info.bBMI2) PanicAlert("Trying to use BMI2 on a system that doesn't support it. Bad programmer.");
+	WriteVEXOp(size, opPrefix, op, regOp1, regOp2, arg, extrabytes);
+}
+
+void XEmitter::MOVD_xmm(X64Reg dest, const OpArg &arg) { WriteSSEOp(0x66, 0x6E, dest, arg, 0); } // 66 0F 6E, XMM register as destination
+void XEmitter::MOVD_xmm(const OpArg &arg, X64Reg src) { WriteSSEOp(0x66, 0x7E, src, arg, 0); } // 66 0F 7E, XMM register as source
+
+// MOVQ into XMM register `dest` from `arg`.
+void XEmitter::MOVQ_xmm(X64Reg dest, OpArg arg)
+{
+	// Alternate encoding: 0x66, then REX computed for a 64-bit operation, then 0F 6E.
+	// This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+	arg.operandReg = dest;
+	Write8(0x66);
+	arg.WriteRex(this, 64, 0);
+	Write8(0x0f); Write8(0x6E);
+	arg.WriteRest(this, 0);
+}
+
+// MOVQ from XMM register `src` into `arg`. Encodings used:
+//   66 REX.W 0F 7E - when src is register 8 or above, or arg is a register
+//   66 [REX] 0F D6 - otherwise
+void XEmitter::MOVQ_xmm(OpArg arg, X64Reg src)
+{
+	arg.operandReg = src;
+	// The 0x66 prefix has to be written before any REX byte: REX is only
+	// honoured when it sits directly in front of the opcode, so emitting
+	// it ahead of 0x66 would silently drop it (and with it R8-R15).
+	Write8(0x66);
+	if (src > 7 || arg.IsSimpleReg())
+	{
+		// Alternate encoding. This does not display correctly in MSVC's debugger, it thinks it's a MOVD
+		arg.WriteRex(this, 64, 0);
+		Write8(0x0f); Write8(0x7E);
+	}
+	else
+	{
+		arg.WriteRex(this, 0, 0);
+		Write8(0x0f); Write8(0xD6);
+	}
+	arg.WriteRest(this, 0);
+}
+
+// 0F AE with `ext` as the operand's reg-field value; immediates and plain registers are rejected.
+void XEmitter::WriteMXCSR(OpArg arg, int ext)
+{
+	const bool validOperand = !(arg.IsImm() || arg.IsSimpleReg());
+	_assert_msg_(DYNA_REC, validOperand, "MXCSR - invalid operand");
+
+	arg.operandReg = ext;
+	arg.WriteRex(this, 0, 0);
+	Write8(0x0F); Write8(0xAE);
+	arg.WriteRest(this);
+}
+
+void XEmitter::STMXCSR(OpArg memloc) { WriteMXCSR(memloc, 3); } // extension 3
+void XEmitter::LDMXCSR(OpArg memloc) { WriteMXCSR(memloc, 2); } // extension 2
+
+void XEmitter::MOVNTDQ(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVNTDQ, regOp, arg); }
+void XEmitter::MOVNTPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVNTP, regOp, arg); } // no prefix
+void XEmitter::MOVNTPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVNTP, regOp, arg); } // same opcode, 0x66 prefix
+
+void XEmitter::ADDSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseADD, regOp, arg); } // in this group the *SS forms use prefix F3 ...
+void XEmitter::ADDSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseADD, regOp, arg); } // ... and the *SD forms use prefix F2
+void XEmitter::SUBSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseSUB, regOp, arg); }
+void XEmitter::SUBSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseSUB, regOp, arg); }
+void XEmitter::CMPSS(X64Reg regOp, OpArg arg, u8 compare) { WriteSSEOp(0xF3, sseCMP, regOp, arg, 1); Write8(compare); } // one trailing byte: the comparison selector
+void XEmitter::CMPSD(X64Reg regOp, OpArg arg, u8 compare) { WriteSSEOp(0xF2, sseCMP, regOp, arg, 1); Write8(compare); }
+void XEmitter::MULSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMUL, regOp, arg); }
+void XEmitter::MULSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseMUL, regOp, arg); }
+void XEmitter::DIVSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseDIV, regOp, arg); }
+void XEmitter::DIVSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseDIV, regOp, arg); }
+void XEmitter::MINSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMIN, regOp, arg); }
+void XEmitter::MINSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseMIN, regOp, arg); }
+void XEmitter::MAXSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMAX, regOp, arg); }
+void XEmitter::MAXSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseMAX, regOp, arg); }
+void XEmitter::SQRTSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseSQRT, regOp, arg); }
+void XEmitter::SQRTSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseSQRT, regOp, arg); }
+void XEmitter::RSQRTSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseRSQRT, regOp, arg); }
+
+void XEmitter::ADDPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseADD, regOp, arg); } // in this group the *PS forms use no prefix ...
+void XEmitter::ADDPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseADD, regOp, arg); } // ... and the *PD forms use prefix 66
+void XEmitter::SUBPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseSUB, regOp, arg); }
+void XEmitter::SUBPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseSUB, regOp, arg); }
+void XEmitter::CMPPS(X64Reg regOp, OpArg arg, u8 compare) { WriteSSEOp(0x00, sseCMP, regOp, arg, 1); Write8(compare); } // one trailing byte: the comparison selector
+void XEmitter::CMPPD(X64Reg regOp, OpArg arg, u8 compare) { WriteSSEOp(0x66, sseCMP, regOp, arg, 1); Write8(compare); }
+void XEmitter::ANDPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseAND, regOp, arg); }
+void XEmitter::ANDPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseAND, regOp, arg); }
+void XEmitter::ANDNPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseANDN, regOp, arg); }
+void XEmitter::ANDNPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseANDN, regOp, arg); }
+void XEmitter::ORPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseOR, regOp, arg); }
+void XEmitter::ORPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseOR, regOp, arg); }
+void XEmitter::XORPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseXOR, regOp, arg); }
+void XEmitter::XORPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseXOR, regOp, arg); }
+void XEmitter::MULPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMUL, regOp, arg); }
+void XEmitter::MULPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMUL, regOp, arg); }
+void XEmitter::DIVPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseDIV, regOp, arg); }
+void XEmitter::DIVPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseDIV, regOp, arg); }
+void XEmitter::MINPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMIN, regOp, arg); }
+void XEmitter::MINPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMIN, regOp, arg); }
+void XEmitter::MAXPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMAX, regOp, arg); }
+void XEmitter::MAXPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMAX, regOp, arg); }
+void XEmitter::SQRTPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseSQRT, regOp, arg); }
+void XEmitter::SQRTPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseSQRT, regOp, arg); }
+void XEmitter::RSQRTPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseRSQRT, regOp, arg); }
+void XEmitter::SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle) { WriteSSEOp(0x00, sseSHUF, regOp, arg, 1); Write8(shuffle); } // one trailing byte: the shuffle selector
+void XEmitter::SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle) { WriteSSEOp(0x66, sseSHUF, regOp, arg, 1); Write8(shuffle); }
+
+void XEmitter::COMISS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseCOMIS, regOp, arg); } // weird that these should be packed
+void XEmitter::COMISD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseCOMIS, regOp, arg); } // ordered
+void XEmitter::UCOMISS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseUCOMIS, regOp, arg); } // unordered
+void XEmitter::UCOMISD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseUCOMIS, regOp, arg); } // unordered
+
+void XEmitter::MOVAPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVAPfromRM, regOp, arg); } // register first: the "fromRM" opcode
+void XEmitter::MOVAPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVAPfromRM, regOp, arg); }
+void XEmitter::MOVAPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVAPtoRM, regOp, arg); } // r/m operand first: the "toRM" opcode
+void XEmitter::MOVAPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVAPtoRM, regOp, arg); }
+
+void XEmitter::MOVUPS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, sseMOVUPfromRM, regOp, arg); }
+void XEmitter::MOVUPD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVUPfromRM, regOp, arg); }
+void XEmitter::MOVUPS(OpArg arg, X64Reg regOp) { WriteSSEOp(0x00, sseMOVUPtoRM, regOp, arg); }
+void XEmitter::MOVUPD(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVUPtoRM, regOp, arg); }
+
+void XEmitter::MOVDQA(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, sseMOVDQfromRM, regOp, arg); }
+void XEmitter::MOVDQA(OpArg arg, X64Reg regOp) { WriteSSEOp(0x66, sseMOVDQtoRM, regOp, arg); }
+void XEmitter::MOVDQU(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVDQfromRM, regOp, arg); } // same opcodes as MOVDQA, F3 prefix instead of 66
+void XEmitter::MOVDQU(OpArg arg, X64Reg regOp) { WriteSSEOp(0xF3, sseMOVDQtoRM, regOp, arg); }
+
+void XEmitter::MOVSS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, sseMOVUPfromRM, regOp, arg); } // shares the MOVUP opcodes, told apart by the F3/F2 prefix
+void XEmitter::MOVSD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, sseMOVUPfromRM, regOp, arg); }
+void XEmitter::MOVSS(OpArg arg, X64Reg regOp) { WriteSSEOp(0xF3, sseMOVUPtoRM, regOp, arg); }
+void XEmitter::MOVSD(OpArg arg, X64Reg regOp) { WriteSSEOp(0xF2, sseMOVUPtoRM, regOp, arg); }
+
+void XEmitter::MOVLPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVLPDfromRM, regOp, arg);} // MOVLPD/MOVHPD take the 66 prefix (Intel SDM); F2 selects a different instruction
+void XEmitter::MOVHPD(X64Reg regOp, OpArg arg)  {WriteSSEOp(0x66, sseMOVHPDfromRM, regOp, arg);} // load form, high-half variant
+void XEmitter::MOVLPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVLPDtoRM, regOp, arg);} // store form
+void XEmitter::MOVHPD(OpArg arg, X64Reg regOp)  {WriteSSEOp(0x66, sseMOVHPDtoRM, regOp, arg);} // store form, high-half variant
+
+void XEmitter::MOVHLPS(X64Reg regOp1, X64Reg regOp2) { WriteSSEOp(0x00, sseMOVHLPS, regOp1, R(regOp2)); } // register-to-register only
+void XEmitter::MOVLHPS(X64Reg regOp1, X64Reg regOp2) { WriteSSEOp(0x00, sseMOVLHPS, regOp1, R(regOp2)); } // register-to-register only
+
+void XEmitter::CVTPS2PD(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, 0x5A, regOp, arg); } // 0F 5A
+void XEmitter::CVTPD2PS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, 0x5A, regOp, arg); } // 66 0F 5A
+
+void XEmitter::CVTSD2SS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, 0x5A, regOp, arg); } // F2 0F 5A
+void XEmitter::CVTSS2SD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0x5A, regOp, arg); } // F3 0F 5A
+void XEmitter::CVTSD2SI(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, 0x2D, regOp, arg); } // F2 0F 2D
+void XEmitter::CVTSS2SI(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0x2D, regOp, arg); } // F3 0F 2D
+void XEmitter::CVTSI2SD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, 0x2A, regOp, arg); } // F2 0F 2A
+void XEmitter::CVTSI2SS(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0x2A, regOp, arg); } // F3 0F 2A
+
+void XEmitter::CVTDQ2PD(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0xE6, regOp, arg); } // F3 0F E6
+void XEmitter::CVTDQ2PS(X64Reg regOp, OpArg arg) { WriteSSEOp(0x00, 0x5B, regOp, arg); } // 0F 5B
+void XEmitter::CVTPD2DQ(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, 0xE6, regOp, arg); } // F2 0F E6
+void XEmitter::CVTPS2DQ(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, 0x5B, regOp, arg); } // 66 0F 5B
+
+void XEmitter::CVTTSD2SI(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF2, 0x2C, regOp, arg); } // F2 0F 2C
+void XEmitter::CVTTSS2SI(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0x2C, regOp, arg); } // F3 0F 2C
+void XEmitter::CVTTPS2DQ(X64Reg regOp, OpArg arg) { WriteSSEOp(0xF3, 0x5B, regOp, arg); } // F3 0F 5B
+void XEmitter::CVTTPD2DQ(X64Reg regOp, OpArg arg) { WriteSSEOp(0x66, 0xE6, regOp, arg); } // 66 0F E6
+
+void XEmitter::MASKMOVDQU(X64Reg dest, X64Reg src)  {WriteSSEOp(0x66, sseMASKMOVDQU, dest, R(src));}
+
+void XEmitter::MOVMSKPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x50, dest, arg);}
+void XEmitter::MOVMSKPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x50, dest, arg);}
+
+void XEmitter::LDDQU(X64Reg dest, OpArg arg)    {WriteSSEOp(0xF2, sseLDDQU, dest, arg);} // For integer data only
+
+// THESE TWO ARE UNTESTED.
+void XEmitter::UNPCKLPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x14, dest, arg);}
+void XEmitter::UNPCKHPS(X64Reg dest, OpArg arg) {WriteSSEOp(0x00, 0x15, dest, arg);}
+
+void XEmitter::UNPCKLPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x14, dest, arg);}
+void XEmitter::UNPCKHPD(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x15, dest, arg);}
+
+void XEmitter::MOVDDUP(X64Reg regOp, OpArg arg)
+{
+	// Native form (prefix F2, opcode 0x12) whenever the host reports SSE3.
+	if (cpu_info.bSSE3)
+	{
+		WriteSSEOp(0xF2, 0x12, regOp, arg); //SSE3 movddup
+		return;
+	}
+	// SSE2 fallback: bring the low double into regOp (skipped when arg is
+	// already regOp), then UNPCKLPD regOp with itself to copy low into high.
+	const bool alreadyInPlace = arg.IsSimpleReg(regOp);
+	if (!alreadyInPlace) MOVSD(regOp, arg);
+	UNPCKLPD(regOp, R(regOp));
+}
+
+// NOTE: the SSE floating-point emitters above are not a complete set.
+
+// Integer SSE2 operations (66 prefix). This set is incomplete as well.
+void XEmitter::PACKSSDW(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x6B, dest, arg);} // dwords -> words, signed saturation
+void XEmitter::PACKSSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x63, dest, arg);} // words  -> bytes, signed saturation
+void XEmitter::PACKUSWB(X64Reg dest, OpArg arg) {WriteSSEOp(0x66, 0x67, dest, arg);} // words  -> bytes, unsigned saturation
+
+void XEmitter::PUNPCKLBW(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x60, dest, arg);} // interleave low bytes
+void XEmitter::PUNPCKLWD(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x61, dest, arg);} // interleave low words
+void XEmitter::PUNPCKLDQ(X64Reg dest, const OpArg &arg) {WriteSSEOp(0x66, 0x62, dest, arg);} // interleave low dwords
+
+void XEmitter::PSRLW(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x71, (X64Reg)2, R(reg)); // the "regOp" slot carries the opcode extension (/2), not a register
+	Write8(shift); // imm8 shift count follows the ModRM byte
+}
+
+void XEmitter::PSRLD(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x72, (X64Reg)2, R(reg)); // opcode 0x72 /2: logical right shift of dwords
+	Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)2, R(reg)); // opcode 0x73 /2: logical right shift of qwords
+	Write8(shift);
+}
+
+void XEmitter::PSRLQ(X64Reg reg, OpArg arg)
+{
+	WriteSSEOp(0x66, 0xd3, reg, arg); // variable-count form: the count comes from arg, no immediate
+}
+
+void XEmitter::PSRLDQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)3, R(reg)); // opcode 0x73 /3: whole-register right shift, count in bytes
+	Write8(shift);
+}
+
+void XEmitter::PSLLW(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x71, (X64Reg)6, R(reg)); // opcode 0x71 /6: left shift of words
+	Write8(shift);
+}
+
+void XEmitter::PSLLD(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x72, (X64Reg)6, R(reg)); // opcode 0x72 /6: left shift of dwords
+	Write8(shift);
+}
+
+void XEmitter::PSLLQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)6, R(reg)); // opcode 0x73 /6: left shift of qwords
+	Write8(shift);
+}
+
+void XEmitter::PSLLDQ(X64Reg reg, int shift)
+{
+	WriteSSEOp(0x66, 0x73, (X64Reg)7, R(reg)); // opcode 0x73 /7: whole-register left shift, count in bytes
+	Write8(shift);
+}
+
+
+// PSRAW xmm, imm8 (66 0F 71 /4 ib): arithmetic right shift of each 16-bit lane.
+void XEmitter::PSRAW(X64Reg reg, int shift)
+{
+	// Emit through WriteSSEOp, exactly like PSRLW/PSLLW, instead of
+	// hand-assembling the bytes. The hand-written form had no REX prefix,
+	// so for XMM8-XMM15 it raised a PanicAlert and then still wrote a
+	// ModRM byte whose reg field was corrupted by bit 3 of the register
+	// number. (X64Reg)4 is the /4 opcode extension, i.e. the old 0xE0 | reg.
+	WriteSSEOp(0x66, 0x71, (X64Reg)4, R(reg));
+	Write8(shift);
+}
+
+// PSRAD xmm, imm8 (66 0F 72 /4 ib): arithmetic right shift of each 32-bit lane.
+void XEmitter::PSRAD(X64Reg reg, int shift)
+{
+	// Emit through WriteSSEOp, exactly like PSRLD/PSLLD, instead of
+	// hand-assembling the bytes. The hand-written form had no REX prefix,
+	// so for XMM8-XMM15 it raised a PanicAlert and then still wrote a
+	// ModRM byte whose reg field was corrupted by bit 3 of the register
+	// number. (X64Reg)4 is the /4 opcode extension, i.e. the old 0xE0 | reg.
+	WriteSSEOp(0x66, 0x72, (X64Reg)4, R(reg));
+	Write8(shift);
+}
+
+void XEmitter::WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (!cpu_info.bSSSE3) // host capability check; note there is no early return after the alert
+		PanicAlert("Trying to use SSSE3 on a system that doesn't support it. Bad programmer.");
+	WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); // otherwise identical to a plain SSE op
+}
+
+void XEmitter::WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes)
+{
+	if (!cpu_info.bSSE4_1) // host capability check; note there is no early return after the alert
+		PanicAlert("Trying to use SSE4.1 on a system that doesn't support it. Bad programmer.");
+	WriteSSEOp(opPrefix, op, regOp, arg, extrabytes); // otherwise identical to a plain SSE op
+}
+
+void XEmitter::PSHUFB(X64Reg dest, OpArg arg)   {WriteSSSE3Op(0x66, 0x3800, dest, arg);} // 16-bit op values 0x38xx: high byte 0x38, low byte = opcode
+void XEmitter::PTEST(X64Reg dest, OpArg arg)    {WriteSSE41Op(0x66, 0x3817, dest, arg);}
+void XEmitter::PACKUSDW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x382b, dest, arg);}
+
+void XEmitter::PMOVSXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3820, dest, arg);} // PMOVSX*: sign-extending widen (opcodes 0x20-0x25)
+void XEmitter::PMOVSXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3821, dest, arg);}
+void XEmitter::PMOVSXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3822, dest, arg);}
+void XEmitter::PMOVSXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3823, dest, arg);}
+void XEmitter::PMOVSXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3824, dest, arg);}
+void XEmitter::PMOVSXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3825, dest, arg);}
+void XEmitter::PMOVZXBW(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3830, dest, arg);} // PMOVZX*: zero-extending widen (opcodes 0x30-0x35)
+void XEmitter::PMOVZXBD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3831, dest, arg);}
+void XEmitter::PMOVZXBQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3832, dest, arg);}
+void XEmitter::PMOVZXWD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3833, dest, arg);}
+void XEmitter::PMOVZXWQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3834, dest, arg);}
+void XEmitter::PMOVZXDQ(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3835, dest, arg);}
+
+void XEmitter::PBLENDVB(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3810, dest, arg);} // non-VEX blends: the mask is the implicit XMM0 operand
+void XEmitter::BLENDVPS(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3814, dest, arg);}
+void XEmitter::BLENDVPD(X64Reg dest, OpArg arg) {WriteSSE41Op(0x66, 0x3815, dest, arg);}
+
+void XEmitter::PAND(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xDB, dest, arg);} // bitwise logic on the full 128-bit register
+void XEmitter::PANDN(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xDF, dest, arg);}
+void XEmitter::PXOR(X64Reg dest, OpArg arg)     {WriteSSEOp(0x66, 0xEF, dest, arg);}
+void XEmitter::POR(X64Reg dest, OpArg arg)      {WriteSSEOp(0x66, 0xEB, dest, arg);}
+
+void XEmitter::PADDB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFC, dest, arg);} // wrapping adds; suffix = lane size (B/W/D/Q)
+void XEmitter::PADDW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFD, dest, arg);}
+void XEmitter::PADDD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFE, dest, arg);}
+void XEmitter::PADDQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD4, dest, arg);}
+
+void XEmitter::PADDSB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEC, dest, arg);} // saturating adds: S = signed, US = unsigned
+void XEmitter::PADDSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xED, dest, arg);}
+void XEmitter::PADDUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDC, dest, arg);}
+void XEmitter::PADDUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xDD, dest, arg);}
+
+void XEmitter::PSUBB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF8, dest, arg);} // wrapping subtracts
+void XEmitter::PSUBW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xF9, dest, arg);}
+void XEmitter::PSUBD(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFA, dest, arg);}
+void XEmitter::PSUBQ(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xFB, dest, arg);}
+
+void XEmitter::PSUBSB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE8, dest, arg);} // saturating subtracts
+void XEmitter::PSUBSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xE9, dest, arg);}
+void XEmitter::PSUBUSB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD8, dest, arg);}
+void XEmitter::PSUBUSW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xD9, dest, arg);}
+
+void XEmitter::PAVGB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE0, dest, arg);} // unsigned rounded averages
+void XEmitter::PAVGW(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xE3, dest, arg);}
+
+void XEmitter::PCMPEQB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x74, dest, arg);} // lane-wise equality, result is an all-ones/all-zeros mask
+void XEmitter::PCMPEQW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x75, dest, arg);}
+void XEmitter::PCMPEQD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x76, dest, arg);}
+
+void XEmitter::PCMPGTB(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x64, dest, arg);} // lane-wise signed greater-than
+void XEmitter::PCMPGTW(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x65, dest, arg);}
+void XEmitter::PCMPGTD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0x66, dest, arg);}
+
+void XEmitter::PEXTRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC5, dest, arg, 1); Write8(subreg);} // extrabytes = 1 (as PSHUFD does): the trailing imm8 must be
+void XEmitter::PINSRW(X64Reg dest, OpArg arg, u8 subreg)    {WriteSSEOp(0x66, 0xC4, dest, arg, 1); Write8(subreg);} // counted when arg is RIP-relative, else the displacement is off by one
+
+void XEmitter::PMADDWD(X64Reg dest, OpArg arg)  {WriteSSEOp(0x66, 0xF5, dest, arg); } // multiply words, add adjacent products into dwords
+void XEmitter::PSADBW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xF6, dest, arg);} // sum of absolute byte differences
+
+void XEmitter::PMAXSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEE, dest, arg); } // SW = signed words, UB = unsigned bytes
+void XEmitter::PMAXUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDE, dest, arg); }
+void XEmitter::PMINSW(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xEA, dest, arg); }
+void XEmitter::PMINUB(X64Reg dest, OpArg arg)   {WriteSSEOp(0x66, 0xDA, dest, arg); }
+
+void XEmitter::PMOVMSKB(X64Reg dest, OpArg arg)    {WriteSSEOp(0x66, 0xD7, dest, arg); } // top bit of every byte -> bitmask in dest
+void XEmitter::PSHUFD(X64Reg regOp, OpArg arg, u8 shuffle)    {WriteSSEOp(0x66, 0x70, regOp, arg, 1); Write8(shuffle);} // extrabytes = 1 for the imm8 written afterwards
+void XEmitter::PSHUFLW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF2, 0x70, regOp, arg, 1); Write8(shuffle);} // F2: shuffle the low four words
+void XEmitter::PSHUFHW(X64Reg regOp, OpArg arg, u8 shuffle)   {WriteSSEOp(0xF3, 0x70, regOp, arg, 1); Write8(shuffle);} // F3: shuffle the high four words
+
+// VEX-encoded three-operand forms: regOp1 = destination, regOp2 = first source, arg = second source.
+void XEmitter::VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseADD, regOp1, regOp2, arg);} // prefix picks the data type: F2 = scalar double,
+void XEmitter::VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseSUB, regOp1, regOp2, arg);} // 66 = packed double, 00 = packed single
+void XEmitter::VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0xF2, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseADD, regOp1, regOp2, arg);}
+void XEmitter::VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseSUB, regOp1, regOp2, arg);}
+void XEmitter::VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseMUL, regOp1, regOp2, arg);}
+void XEmitter::VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseDIV, regOp1, regOp2, arg);}
+void XEmitter::VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(0xF2, sseSQRT, regOp1, regOp2, arg);}
+void XEmitter::VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle) {WriteAVXOp(0x66, sseSHUF, regOp1, regOp2, arg, 0, 1); Write8(shuffle);} // W = 0, extrabytes = 1 for the imm8
+void XEmitter::VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x14, regOp1, regOp2, arg);}
+void XEmitter::VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg){WriteAVXOp(0x66, 0x15, regOp1, regOp2, arg);}
+
+void XEmitter::VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x00, sseAND, regOp1, regOp2, arg);} // floating-point-domain bitwise logic
+void XEmitter::VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseAND, regOp1, regOp2, arg);}
+void XEmitter::VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(0x00, sseANDN, regOp1, regOp2, arg);}
+void XEmitter::VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)  {WriteAVXOp(0x66, sseANDN, regOp1, regOp2, arg);}
+void XEmitter::VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x00, sseOR, regOp1, regOp2, arg);}
+void XEmitter::VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, sseOR, regOp1, regOp2, arg);}
+void XEmitter::VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x00, sseXOR, regOp1, regOp2, arg);}
+void XEmitter::VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, sseXOR, regOp1, regOp2, arg);}
+
+void XEmitter::VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0xDB, regOp1, regOp2, arg);} // integer-domain logic; same opcodes as PAND/PANDN/POR/PXOR
+void XEmitter::VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0xDF, regOp1, regOp2, arg);}
+void XEmitter::VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)     {WriteAVXOp(0x66, 0xEB, regOp1, regOp2, arg);}
+void XEmitter::VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0xEF, regOp1, regOp2, arg);}
+
+void XEmitter::VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg);} // FMA3 family. Opcode 0x38xy: x = 9/A/B selects the 132/213/231 operand order,
+void XEmitter::VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg);} // y selects the operation (even = packed, odd = scalar).
+void XEmitter::VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg);}
+void XEmitter::VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x3898, regOp1, regOp2, arg, 1);} // the trailing 1 is the W argument: double precision shares the
+void XEmitter::VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38A8, regOp1, regOp2, arg, 1);} // opcode of the single-precision form and differs only in W
+void XEmitter::VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38B8, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg);}
+void XEmitter::VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg);}
+void XEmitter::VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg);}
+void XEmitter::VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x3899, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38A9, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38B9, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg);} // FMSUB: low nibble A (packed) / B (scalar)
+void XEmitter::VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg);}
+void XEmitter::VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg);}
+void XEmitter::VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x389A, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38AA, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38BA, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg);}
+void XEmitter::VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg);}
+void XEmitter::VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg);}
+void XEmitter::VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x389B, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38AB, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)    {WriteAVXOp(0x66, 0x38BB, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg);} // FNMADD: low nibble C (packed) / D (scalar)
+void XEmitter::VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg);}
+void XEmitter::VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg);}
+void XEmitter::VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389C, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AC, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BC, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg);}
+void XEmitter::VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg);}
+void XEmitter::VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg);}
+void XEmitter::VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389D, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AD, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BD, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg);} // FNMSUB: low nibble E (packed) / F (scalar)
+void XEmitter::VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg);}
+void XEmitter::VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg);}
+void XEmitter::VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389E, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AE, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BE, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg);}
+void XEmitter::VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg);}
+void XEmitter::VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg);}
+void XEmitter::VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x389F, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38AF, regOp1, regOp2, arg, 1);}
+void XEmitter::VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg)   {WriteAVXOp(0x66, 0x38BF, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg);} // FMADDSUB: low nibble 6, packed forms only
+void XEmitter::VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg);}
+void XEmitter::VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg);}
+void XEmitter::VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3896, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A6, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B6, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg);} // FMSUBADD: low nibble 7, packed forms only
+void XEmitter::VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg);}
+void XEmitter::VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg);}
+void XEmitter::VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x3897, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38A7, regOp1, regOp2, arg, 1);}
+void XEmitter::VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteAVXOp(0x66, 0x38B7, regOp1, regOp2, arg, 1);}
+
+void XEmitter::SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF3, 0x38F7, regOp1, regOp2, arg);} // SARX/SHLX/SHRX share opcode 0x38F7; the prefix picks the shift
+void XEmitter::SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x66, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0xF2, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate)      {WriteBMI2Op(bits, 0xF2, 0x3AF0, regOp, INVALID_REG, arg, 1); Write8(rotate);} // no second register; extrabytes = 1 for the imm8
+void XEmitter::PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF3, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI2Op(bits, 0xF2, 0x38F6, regOp2, regOp1, arg);} // note: regOp2/regOp1 are passed swapped relative to the siblings
+void XEmitter::BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2) {WriteBMI2Op(bits, 0x00, 0x38F5, regOp1, regOp2, arg);}
+void XEmitter::BLSR(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x1, regOp, arg);} // opcode 0x38F3 group: the first "register" is the
+void XEmitter::BLSMSK(int bits, X64Reg regOp, OpArg arg)               {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x2, regOp, arg);} // opcode extension (/1, /2, /3), regOp goes in the
+void XEmitter::BLSI(int bits, X64Reg regOp, OpArg arg)                 {WriteBMI1Op(bits, 0x00, 0x38F3, (X64Reg)0x3, regOp, arg);} // second register slot
+void XEmitter::BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2){WriteBMI1Op(bits, 0x00, 0x38F7, regOp1, regOp2, arg);}
+void XEmitter::ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg) {WriteBMI1Op(bits, 0x00, 0x38F2, regOp1, regOp2, arg);}
+
+// Prefixes: each call writes one raw prefix byte; the instruction it applies to must be emitted next.
+
+void XEmitter::LOCK()  { Write8(0xF0); }
+void XEmitter::REP()   { Write8(0xF3); }
+void XEmitter::REPNE() { Write8(0xF2); }
+void XEmitter::FSOverride() { Write8(0x64); } // FS segment override
+void XEmitter::GSOverride() { Write8(0x65); } // GS segment override
+
+void XEmitter::FWAIT()
+{
+	Write8(0x9B); // single-byte instruction, not a prefix
+}
+
+// TODO: make this more generic
+void XEmitter::WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg)
+{
+	_assert_msg_(DYNA_REC, bits != 80 || op_80b != floatINVALID, "WriteFloatLoadStore: 80 bits not supported for this instruction");
+	// The operand size picks the x87 opcode byte: 32 -> D9, 64 -> DD, 80 -> DB.
+	int sizeBits = 0;
+	if (bits == 64)
+		sizeBits = 4;
+	else if (bits == 80)
+		sizeBits = 2;
+	else if (bits != 32)
+		_assert_msg_(DYNA_REC, 0, "WriteFloatLoadStore: invalid bits (should be 32/64/80)");
+	Write8(0xd9 | sizeBits);
+	// x87 instructions use the reg field of the ModR/M byte as opcode; the
+	// 80-bit form has its own value, supplied by the caller in op_80b.
+	const FloatOp regField = (bits == 80) ? op_80b : op;
+	arg.WriteRest(this, 0, (X64Reg) regField);
+}
+
+void XEmitter::FLD(int bits, OpArg src) {WriteFloatLoadStore(bits, floatLD, floatLD80, src);} // bits is 32, 64 or 80
+void XEmitter::FST(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatST, floatINVALID, dest);} // floatINVALID: bits == 80 trips the helper's assert
+void XEmitter::FSTP(int bits, OpArg dest) {WriteFloatLoadStore(bits, floatSTP, floatSTP80, dest);}
+void XEmitter::FNSTSW_AX() { Write8(0xDF); Write8(0xE0); } // fixed two-byte encoding DF E0
+
+void XEmitter::RDTSC() { Write8(0x0F); Write8(0x31); } // fixed two-byte encoding 0F 31
+
+// Helper routines: load constant u32 arguments into the platform's argument registers, then CALL fnptr.
+void XEmitter::CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2)
+{
+#ifdef _WIN32 // Windows x64 convention; keyed on the target OS (not _MSC_VER) so MinGW/clang builds match too
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8),  Imm32(arg2));
+	CALL(fnptr);
+#else // System V AMD64 convention: RDI, RSI, RDX
+	MOV(32, R(RDI), Imm32(arg0));
+	MOV(32, R(RSI), Imm32(arg1));
+	MOV(32, R(RDX), Imm32(arg2));
+	CALL(fnptr);
+#endif
+}
+
+void XEmitter::CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3)
+{
+#ifdef _WIN32 // Windows x64 convention; keyed on the target OS (not _MSC_VER) so MinGW/clang builds match too
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	MOV(32, R(R9), Imm32(arg3));
+	CALL(fnptr);
+#else // System V AMD64 convention: RDI, RSI, RDX, RCX
+	MOV(32, R(RDI), Imm32(arg0));
+	MOV(32, R(RSI), Imm32(arg1));
+	MOV(32, R(RDX), Imm32(arg2));
+	MOV(32, R(RCX), Imm32(arg3));
+	CALL(fnptr);
+#endif
+}
+
+void XEmitter::CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4)
+{
+#ifdef _WIN32 // Windows x64 convention; keyed on the target OS (not _MSC_VER) so MinGW/clang builds match too
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8),  Imm32(arg2));
+	MOV(32, R(R9),  Imm32(arg3));
+	MOV(32, MDisp(RSP, 0x20), Imm32(arg4)); // fifth argument is written to [RSP+0x20]; no stack space is reserved here
+	CALL(fnptr);
+#else // System V AMD64 convention: RDI, RSI, RDX, RCX, R8
+	MOV(32, R(RDI), Imm32(arg0));
+	MOV(32, R(RSI), Imm32(arg1));
+	MOV(32, R(RDX), Imm32(arg2));
+	MOV(32, R(RCX), Imm32(arg3));
+	MOV(32, R(R8),  Imm32(arg4));
+	CALL(fnptr);
+#endif
+}
+
+void XEmitter::CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5)
+{
+#ifdef _WIN32 // Windows x64 convention; keyed on the target OS (not _MSC_VER) so MinGW/clang builds match too
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	MOV(32, R(R9), Imm32(arg3));
+	MOV(32, MDisp(RSP, 0x20), Imm32(arg4)); // fifth and sixth arguments are written to [RSP+0x20] and
+	MOV(32, MDisp(RSP, 0x28), Imm32(arg5)); // [RSP+0x28]; no stack space is reserved here
+	CALL(fnptr);
+#else // System V AMD64 convention: RDI, RSI, RDX, RCX, R8, R9
+	MOV(32, R(RDI), Imm32(arg0));
+	MOV(32, R(RSI), Imm32(arg1));
+	MOV(32, R(RDX), Imm32(arg2));
+	MOV(32, R(RCX), Imm32(arg3));
+	MOV(32, R(R8), Imm32(arg4));
+	MOV(32, R(R9), Imm32(arg5));
+	CALL(fnptr);
+#endif
+}
+
+// See header. Unlike CallCdeclFunctionN these always load RCX/RDX/R8/R9 (no #ifdef) and call via CALLptr(M(impptr)).
+void XEmitter::___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2)
+{
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	CALLptr(M(impptr)); // memory operand built from impptr - presumably an indirect call through the pointer stored there
+}
+void XEmitter::___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3)
+{
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	MOV(32, R(R9), Imm32(arg3));
+	CALLptr(M(impptr));
+}
+void XEmitter::___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4)
+{
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	MOV(32, R(R9), Imm32(arg3));
+	MOV(32, MDisp(RSP, 0x20), Imm32(arg4)); // fifth argument is written to [RSP+0x20]
+	CALLptr(M(impptr));
+}
+void XEmitter::___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5)
+{
+	MOV(32, R(RCX), Imm32(arg0));
+	MOV(32, R(RDX), Imm32(arg1));
+	MOV(32, R(R8), Imm32(arg2));
+	MOV(32, R(R9), Imm32(arg3));
+	MOV(32, MDisp(RSP, 0x20), Imm32(arg4)); // fifth and sixth arguments are written to [RSP+0x20]
+	MOV(32, MDisp(RSP, 0x28), Imm32(arg5)); // and [RSP+0x28]
+	CALLptr(M(impptr));
+}
+
+}
diff --git a/src/common/x64_emitter.h b/src/common/x64_emitter.h
new file mode 100644
index 000000000..3a8419c99
--- /dev/null
+++ b/src/common/x64_emitter.h
@@ -0,0 +1,956 @@
+// Copyright 2013 Dolphin Emulator Project
+// Licensed under GPLv2
+// Refer to the license.txt file included.
+
+// WARNING - THIS LIBRARY IS NOT THREAD SAFE!!!
+
+#pragma once
+
+#include <cstddef>
+#include <cstring>
+#include <functional>
+
+#include "Common/bit_set.h"
+#include "Common/code_block.h"
+#include "Common/common_types.h"
+
+namespace Gen
+{
+
+enum X64Reg
+{
+	EAX = 0, EBX = 3, ECX = 1, EDX = 2,  // enumerator values are the hardware register numbers;
+	ESI = 6, EDI = 7, EBP = 5, ESP = 4,  // names for the different operand widths alias the same number
+
+	RAX = 0, RBX = 3, RCX = 1, RDX = 2,
+	RSI = 6, RDI = 7, RBP = 5, RSP = 4,
+	R8  = 8, R9  = 9, R10 = 10,R11 = 11,
+	R12 = 12,R13 = 13,R14 = 14,R15 = 15,
+
+	AL = 0, BL = 3, CL = 1, DL = 2,
+	SIL = 6, DIL = 7, BPL = 5, SPL = 4,
+	AH = 0x104, BH = 0x107, CH = 0x105, DH = 0x106,  // 0x100 + 4..7, which keeps them distinct from SPL..DIL (same low bits)
+
+	AX = 0, BX = 3, CX = 1, DX = 2,
+	SI = 6, DI = 7, BP = 5, SP = 4,
+
+	XMM0=0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,  // vector registers share the 0..15 number space
+	XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15,
+
+	YMM0=0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
+	YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15,
+
+	INVALID_REG = 0xFFFFFFFF  // sentinel, e.g. returned by OpArg::GetSimpleReg() for a non-register operand
+};
+
+enum CCFlags  // x86 condition codes (4-bit values); alternative mnemonics for one condition share a value
+{
+	CC_O   = 0,
+	CC_NO  = 1,
+	CC_B   = 2, CC_C   = 2, CC_NAE = 2,
+	CC_NB  = 3, CC_NC  = 3, CC_AE  = 3,
+	CC_Z   = 4, CC_E   = 4,
+	CC_NZ  = 5, CC_NE  = 5,
+	CC_BE  = 6, CC_NA  = 6,
+	CC_NBE = 7, CC_A   = 7,
+	CC_S   = 8,
+	CC_NS  = 9,
+	CC_P   = 0xA, CC_PE  = 0xA,
+	CC_NP  = 0xB, CC_PO  = 0xB,
+	CC_L   = 0xC, CC_NGE = 0xC,
+	CC_NL  = 0xD, CC_GE  = 0xD,
+	CC_LE  = 0xE, CC_NG  = 0xE,
+	CC_NLE = 0xF, CC_G   = 0xF  // each odd value is the negation of the even value before it
+};
+
+enum  // register-file sizes
+{
+	NUMGPRs = 16,  // matches RAX..R15 in X64Reg
+	NUMXMMs = 16,  // matches XMM0..XMM15 in X64Reg
+};
+
+enum  // values for OpArg's scale field: operand kind and SIB scale factor
+{
+	SCALE_NONE = 0,  // plain register operand (see OpArg::IsSimpleReg)
+	SCALE_1 = 1,
+	SCALE_2 = 2,
+	SCALE_4 = 4,
+	SCALE_8 = 8,
+	SCALE_ATREG = 16,  // memory at a register, as built by MatR()/MDisp()
+	//SCALE_NOBASE_1 is not supported and can be replaced with SCALE_ATREG
+	SCALE_NOBASE_2 = 34,  // SCALE_n | 0x20, as produced by MScaled()
+	SCALE_NOBASE_4 = 36,
+	SCALE_NOBASE_8 = 40,
+	SCALE_RIP = 0xFF,  // address operand built by M()
+	SCALE_IMM8  = 0xF0,  // immediates of the given width (see OpArg::IsImm / GetImmBits)
+	SCALE_IMM16 = 0xF1,
+	SCALE_IMM32 = 0xF2,
+	SCALE_IMM64 = 0xF3,
+};
+
+enum NormalOp {  // operation selector passed to OpArg::WriteNormalOp
+	nrmADD,
+	nrmADC,
+	nrmSUB,
+	nrmSBB,
+	nrmAND,
+	nrmOR ,
+	nrmXOR,
+	nrmMOV,
+	nrmTEST,
+	nrmCMP,
+	nrmXCHG,  // enumerators are implicitly numbered 0..10 in this order
+};
+
+enum {  // compare predicates; same numeric values as enum SSECompare further down
+	CMP_EQ = 0,
+	CMP_LT = 1,
+	CMP_LE = 2,
+	CMP_UNORD = 3,
+	CMP_NEQ = 4,
+	CMP_NLT = 5,
+	CMP_NLE = 6,
+	CMP_ORD = 7,
+};
+
+enum FloatOp {  // x87 load/store selectors; passed around as an X64Reg-typed opcode extension
+	floatLD = 0,
+	floatST = 2,
+	floatSTP = 3,
+	floatLD80 = 5,  // 80-bit variants have their own values
+	floatSTP80 = 7,
+
+	floatINVALID = -1,  // "no such form", e.g. there is no 80-bit FST
+};
+
+class XEmitter;
+
+// RIP addressing does not benefit from micro op fusion on Core arch
+struct OpArg
+{
+	OpArg() : offset(0), operandReg(0), scale(0), offsetOrBaseReg(0), indexReg(0) {}  // dummy op arg, used for storage; zeroed so it is never read uninitialized
+	OpArg(u64 _offset, int _scale, X64Reg rmReg = RAX, X64Reg scaledReg = RAX)
+	{
+		operandReg = 0;
+		scale = (u8)_scale;  // one of the SCALE_* values
+		offsetOrBaseReg = (u16)rmReg;
+		indexReg = (u16)scaledReg;
+		//if scale == 0 never mind offsetting
+		offset = _offset;
+	}
+	bool operator==(const OpArg &b) const  // member-wise equality; const-correct so const operands can be compared
+	{
+		return operandReg == b.operandReg && scale == b.scale && offsetOrBaseReg == b.offsetOrBaseReg &&
+		       indexReg == b.indexReg && offset == b.offset;
+	}
+	void WriteRex(XEmitter *emit, int opBits, int bits, int customOp = -1) const;
+	void WriteVex(XEmitter* emit, X64Reg regOp1, X64Reg regOp2, int L, int pp, int mmmmm, int W = 0) const;
+	void WriteRest(XEmitter *emit, int extraBytes=0, X64Reg operandReg=INVALID_REG, bool warn_64bit_offset = true) const;
+	void WriteFloatModRM(XEmitter *emit, FloatOp op);
+	void WriteSingleByteOp(XEmitter *emit, u8 op, X64Reg operandReg, int bits);
+	// These two are public because the emitter writes to them directly
+	u64 offset;  // use RIP-relative as much as possible - 64-bit immediates are not available.
+	u16 operandReg;
+
+	void WriteNormalOp(XEmitter *emit, bool toRM, NormalOp op, const OpArg &operand, int bits) const;
+	bool IsImm() const {return scale == SCALE_IMM8 || scale == SCALE_IMM16 || scale == SCALE_IMM32 || scale == SCALE_IMM64;}
+	bool IsSimpleReg() const {return scale == SCALE_NONE;}
+	bool IsSimpleReg(X64Reg reg) const
+	{
+		if (!IsSimpleReg())
+			return false;
+		return GetSimpleReg() == reg;
+	}
+
+	bool CanDoOpWith(const OpArg &other) const
+	{
+		// Allowed when this operand is a plain register, or when the other
+		// one is a plain register or an immediate.
+		return IsSimpleReg() || other.IsSimpleReg() || other.IsImm();
+	}
+
+	int GetImmBits() const  // width of the immediate in bits, or -1 when this is not an immediate
+	{
+		switch (scale)
+		{
+		case SCALE_IMM8: return 8;
+		case SCALE_IMM16: return 16;
+		case SCALE_IMM32: return 32;
+		case SCALE_IMM64: return 64;
+		default: return -1;
+		}
+	}
+
+	X64Reg GetSimpleReg() const  // the register of a plain-register operand, otherwise INVALID_REG
+	{
+		if (scale == SCALE_NONE)
+			return (X64Reg)offsetOrBaseReg;
+		else
+			return INVALID_REG;
+	}
+private:
+	u8 scale;  // SCALE_* value
+	u16 offsetOrBaseReg;
+	u16 indexReg;
+};
+
+template <typename T>
+inline OpArg M(const T *ptr)    {return OpArg((u64)(const void *)ptr, (int)SCALE_RIP);} // operand addressing a fixed host pointer (SCALE_RIP)
+inline OpArg R(X64Reg value)    {return OpArg(0, SCALE_NONE, value);} // plain register operand
+inline OpArg MatR(X64Reg value) {return OpArg(0, SCALE_ATREG, value);} // memory at [value], offset 0
+
+inline OpArg MDisp(X64Reg value, int offset)  // memory at [value + offset]
+{
+	return OpArg((u32)offset, SCALE_ATREG, value); // note the (u32) cast: a negative offset is stored zero-extended in the u64 field
+}
+
+inline OpArg MComplex(X64Reg base, X64Reg scaled, int scale, int offset)  // base + scaled*scale + offset; scale is a SCALE_* value
+{
+	return OpArg(offset, scale, base, scaled);
+}
+
+inline OpArg MScaled(X64Reg scaled, int scale, int offset)  // scaled*scale + offset, without a base register
+{
+	if (scale == SCALE_1)
+		return OpArg(offset, SCALE_ATREG, scaled); // a scale of 1 is just "at register"
+	else
+		return OpArg(offset, scale | 0x20, RAX, scaled); // | 0x20 turns SCALE_n into SCALE_NOBASE_n
+}
+
+inline OpArg MRegSum(X64Reg base, X64Reg offset)  // base + offset, both registers
+{
+	return MComplex(base, offset, 1, 0);
+}
+
+inline OpArg Imm8 (u8 imm)  {return OpArg(imm, SCALE_IMM8);} // immediates: the value goes in offset, the width in scale
+inline OpArg Imm16(u16 imm) {return OpArg(imm, SCALE_IMM16);} //rarely used
+inline OpArg Imm32(u32 imm) {return OpArg(imm, SCALE_IMM32);}
+inline OpArg Imm64(u64 imm) {return OpArg(imm, SCALE_IMM64);}
+#ifdef _ARCH_64
+inline OpArg ImmPtr(const void* imm) {return Imm64((u64)imm);} // pointer-sized immediate
+#else
+inline OpArg ImmPtr(const void* imm) {return Imm32((u32)imm);}
+#endif
+
+inline u32 PtrOffset(const void* ptr, const void* base)
+{
+#ifdef _ARCH_64
+	// The distance has to fit a signed 32-bit value; otherwise assert and return 0.
+	const s64 delta = (s64)ptr - (s64)base;
+	const bool fits = delta >= -0x80000000LL && delta < 0x80000000LL;
+	if (!fits)
+	{
+		_assert_msg_(DYNA_REC, 0, "pointer offset out of range");
+		return 0;
+	}
+	return (u32)delta;
+#else
+	return (u32)ptr-(u32)base;
+#endif
+}
+
+//usage: int a[]; ARRAY_OFFSET(a,10) -- byte distance from &a[0] to &a[10], truncated to u32
+#define ARRAY_OFFSET(array,index) ((u32)((u64)&(array)[index]-(u64)&(array)[0]))
+//usage: struct {int e;} s; STRUCT_OFFSET(s,e) -- byte distance from &s to &s.e, truncated to u32
+#define STRUCT_OFFSET(str,elem) ((u32)((u64)&(str).elem-(u64)&(str)))
+
+struct FixupBranch  // handle returned by XEmitter::J()/J_CC() and later passed to SetJumpTarget()
+{
+	u8 *ptr;  // position in the emitted code belonging to this branch (NOTE(review): exact meaning is set by J()/SetJumpTarget(), not visible here)
+	int type; // 0 = 8-bit, 1 = 32-bit (presumably the width of the branch displacement)
+};
+
+enum SSECompare  // SSE compare predicates, numbered 0..7 in declaration order
+{
+	EQ = 0,  // equal
+	LT,      // less than
+	LE,      // less than or equal
+	UNORD,   // unordered
+	NEQ,     // not equal
+	NLT,     // not less than
+	NLE,     // not less than or equal
+	ORD,     // ordered
+};
+
+typedef const u8* JumpTarget;  // pointer to read-only code bytes (a branch destination, judging by the name)
+
+class XEmitter  // Writes machine-code bytes at the `code` cursor; the methods below are named after x86/x64 instructions and ABI helpers.
+{
+	friend struct OpArg;  // OpArg uses the Write8/16/32/64 helpers below
+private:
+	u8 *code;           // write cursor: the next emitted byte is stored here
+	bool flags_locked;  // set by LockFlags(), cleared by UnlockFlags()
+
+	void CheckFlags();
+
+	void Rex(int w, int r, int x, int b);
+	void WriteSimple1Byte(int bits, u8 byte, X64Reg reg);
+	void WriteSimple2Byte(int bits, u8 byte1, u8 byte2, X64Reg reg);
+	void WriteMulDivType(int bits, OpArg src, int ext);
+	void WriteBitSearchType(int bits, X64Reg dest, OpArg src, u8 byte2, bool rep = false);
+	void WriteShift(int bits, OpArg dest, OpArg &shift, int ext);
+	void WriteBitTest(int bits, OpArg &dest, OpArg &index, int ext);
+	void WriteMXCSR(OpArg arg, int ext);
+	void WriteSSEOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteSSSE3Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteSSE41Op(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int extrabytes = 0);
+	void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp, OpArg arg, int W = 0, int extrabytes = 0);
+	void WriteAVXOp(u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int W = 0, int extrabytes = 0);
+	void WriteVEXOp(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteBMI1Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteBMI2Op(int size, u8 opPrefix, u16 op, X64Reg regOp1, X64Reg regOp2, OpArg arg, int extrabytes = 0);
+	void WriteFloatLoadStore(int bits, FloatOp op, FloatOp op_80b, OpArg arg);
+	void WriteNormalOp(XEmitter *emit, int bits, NormalOp op, const OpArg &a1, const OpArg &a2);
+
+	void ABI_CalculateFrameSize(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size, size_t* shadowp, size_t* subtractionp, size_t* xmm_offsetp);
+
+protected:  // Raw writers: store the value at `code`, then advance the cursor past it.
+	inline void Write8(u8 value)   {*code++ = value;}
+	inline void Write16(u16 value) {memcpy(code, &value, sizeof(value)); code += sizeof(value);}  // memcpy: `code` is a byte cursor and may be unaligned
+	inline void Write32(u32 value) {memcpy(code, &value, sizeof(value)); code += sizeof(value);}
+	inline void Write64(u64 value) {memcpy(code, &value, sizeof(value)); code += sizeof(value);}
+
+public:
+	XEmitter() : code(nullptr), flags_locked(false) {}               // no buffer yet (SetCodePtr presumably supplies one)
+	XEmitter(u8 *code_ptr) : code(code_ptr), flags_locked(false) {}  // start writing at code_ptr
+	virtual ~XEmitter() {}
+
+	void WriteModRM(int mod, int rm, int reg);
+	void WriteSIB(int scale, int index, int base);
+
+	void SetCodePtr(u8 *ptr);
+	void ReserveCodeSpace(int bytes);
+	const u8 *AlignCode4();
+	const u8 *AlignCode16();
+	const u8 *AlignCodePage();
+	const u8 *GetCodePtr() const;
+	u8 *GetWritableCodePtr();
+
+	void LockFlags() { flags_locked = true; }
+	void UnlockFlags() { flags_locked = false; }
+
+	// Looking for one of these? It's BANNED!! Some instructions are slow on modern CPUs:
+	// INC, DEC, LOOP, LOOPNE, LOOPE, ENTER, LEAVE, XCHG, XLAT, REP MOVSB/MOVSD, REP SCASD + other string instr.
+	// INC and DEC are slow on Intel Core, but not on AMD. They create a
+	// false flag dependency because they only update a subset of the flags.
+	// XCHG is SLOW and should be avoided (it is still declared further down, next to XCHG_AHAL).
+
+	// Debug breakpoint
+	void INT3();
+
+	// Do nothing
+	void NOP(size_t count = 1);
+
+	// Save energy in wait-loops on P4 only. Probably not too useful.
+	void PAUSE();
+
+	// Flag control
+	void STC();
+	void CLC();
+	void CMC();
+
+	// These two can not be executed in 64-bit mode on early Intel 64-bit CPUs, only on Core2 and AMD!
+	void LAHF(); // 3 cycle vector path
+	void SAHF(); // direct path fast
+
+
+	// Stack control
+	void PUSH(X64Reg reg);
+	void POP(X64Reg reg);
+	void PUSH(int bits, const OpArg &reg);
+	void POP(int bits, const OpArg &reg);
+	void PUSHF();
+	void POPF();
+
+	// Flow control
+	void RET();
+	void RET_FAST();
+	void UD2();
+	FixupBranch J(bool force5bytes = false);
+
+	void JMP(const u8 * addr, bool force5Bytes = false);
+	void JMPptr(const OpArg &arg);
+	void JMPself(); //infinite loop!
+#ifdef CALL  // drop any CALL macro so the method below can use the name
+#undef CALL
+#endif
+	void CALL(const void *fnptr);
+	void CALLptr(OpArg arg);
+
+	FixupBranch J_CC(CCFlags conditionCode, bool force5bytes = false);
+	//void J_CC(CCFlags conditionCode, JumpTarget target);
+	void J_CC(CCFlags conditionCode, const u8* addr);
+
+	void SetJumpTarget(const FixupBranch &branch);
+
+	void SETcc(CCFlags flag, OpArg dest);
+	// Note: CMOV brings small if any benefit on current CPUs.
+	void CMOVcc(int bits, X64Reg dest, OpArg src, CCFlags flag);
+
+	// Fences
+	void LFENCE();
+	void MFENCE();
+	void SFENCE();
+
+	// Bit scan
+	void BSF(int bits, X64Reg dest, OpArg src); //bottom bit to top bit
+	void BSR(int bits, X64Reg dest, OpArg src); //top bit to bottom bit
+
+	// Cache control
+	enum PrefetchLevel
+	{
+		PF_NTA, //Non-temporal (data used once and only once)
+		PF_T0,  //All cache levels
+		PF_T1,  //Levels 2+ (aliased to T0 on AMD)
+		PF_T2,  //Levels 3+ (aliased to T0 on AMD)
+	};
+	void PREFETCH(PrefetchLevel level, OpArg arg);
+	void MOVNTI(int bits, OpArg dest, X64Reg src);
+	void MOVNTDQ(OpArg arg, X64Reg regOp);
+	void MOVNTPS(OpArg arg, X64Reg regOp);
+	void MOVNTPD(OpArg arg, X64Reg regOp);
+
+	// Multiplication / division
+	void MUL(int bits, OpArg src); //UNSIGNED
+	void IMUL(int bits, OpArg src); //SIGNED
+	void IMUL(int bits, X64Reg regOp, OpArg src);
+	void IMUL(int bits, X64Reg regOp, OpArg src, OpArg imm);
+	void DIV(int bits, OpArg src);
+	void IDIV(int bits, OpArg src);
+
+	// Shift
+	void ROL(int bits, OpArg dest, OpArg shift);
+	void ROR(int bits, OpArg dest, OpArg shift);
+	void RCL(int bits, OpArg dest, OpArg shift);
+	void RCR(int bits, OpArg dest, OpArg shift);
+	void SHL(int bits, OpArg dest, OpArg shift);
+	void SHR(int bits, OpArg dest, OpArg shift);
+	void SAR(int bits, OpArg dest, OpArg shift);
+
+	// Bit Test
+	void BT(int bits, OpArg dest, OpArg index);
+	void BTS(int bits, OpArg dest, OpArg index);
+	void BTR(int bits, OpArg dest, OpArg index);
+	void BTC(int bits, OpArg dest, OpArg index);
+
+	// Double-Precision Shift
+	void SHRD(int bits, OpArg dest, OpArg src, OpArg shift);
+	void SHLD(int bits, OpArg dest, OpArg src, OpArg shift);
+
+	// Extend EAX into EDX in various ways
+	void CWD(int bits = 16);
+	inline void CDQ() {CWD(32);}
+	inline void CQO() {CWD(64);}
+	void CBW(int bits = 8);
+	inline void CWDE() {CBW(16);}
+	inline void CDQE() {CBW(32);}
+
+	// Load effective address
+	void LEA(int bits, X64Reg dest, OpArg src);
+
+	// Integer arithmetic
+	void NEG (int bits, OpArg src);
+	void ADD (int bits, const OpArg &a1, const OpArg &a2);
+	void ADC (int bits, const OpArg &a1, const OpArg &a2);
+	void SUB (int bits, const OpArg &a1, const OpArg &a2);
+	void SBB (int bits, const OpArg &a1, const OpArg &a2);
+	void AND (int bits, const OpArg &a1, const OpArg &a2);
+	void CMP (int bits, const OpArg &a1, const OpArg &a2);
+
+	// Bit operations
+	void NOT (int bits, OpArg src);
+	void OR  (int bits, const OpArg &a1, const OpArg &a2);
+	void XOR (int bits, const OpArg &a1, const OpArg &a2);
+	void MOV (int bits, const OpArg &a1, const OpArg &a2);
+	void TEST(int bits, const OpArg &a1, const OpArg &a2);
+
+	// Are these useful at all? Consider removing.
+	void XCHG(int bits, const OpArg &a1, const OpArg &a2);
+	void XCHG_AHAL();
+
+	// Byte swapping (32 and 64-bit only).
+	void BSWAP(int bits, X64Reg reg);
+
+	// Sign/zero extension
+	void MOVSX(int dbits, int sbits, X64Reg dest, OpArg src); //automatically uses MOVSXD if necessary
+	void MOVZX(int dbits, int sbits, X64Reg dest, OpArg src);
+
+	// Available only on Atom or >= Haswell so far. Test with cpu_info.bMOVBE.
+	void MOVBE(int dbits, const OpArg& dest, const OpArg& src);
+
+	// Available only on AMD >= Phenom or Intel >= Haswell
+	void LZCNT(int bits, X64Reg dest, OpArg src);
+	// Note: this one is actually part of BMI1
+	void TZCNT(int bits, X64Reg dest, OpArg src);
+
+	// WARNING - These two take 11-13 cycles and are VectorPath! (AMD64)
+	void STMXCSR(OpArg memloc);
+	void LDMXCSR(OpArg memloc);
+
+	// Prefixes
+	void LOCK();
+	void REP();
+	void REPNE();
+	void FSOverride();
+	void GSOverride();
+
+	// x87
+	enum x87StatusWordBits {
+		x87_InvalidOperation = 0x1,
+		x87_DenormalizedOperand = 0x2,
+		x87_DivisionByZero = 0x4,
+		x87_Overflow = 0x8,
+		x87_Underflow = 0x10,
+		x87_Precision = 0x20,
+		x87_StackFault = 0x40,
+		x87_ErrorSummary = 0x80,
+		x87_C0 = 0x100,
+		x87_C1 = 0x200,
+		x87_C2 = 0x400,
+		x87_TopOfStack = 0x2000 | 0x1000 | 0x800,
+		x87_C3 = 0x4000,
+		x87_FPUBusy = 0x8000,
+	};
+
+	void FLD(int bits, OpArg src);
+	void FST(int bits, OpArg dest);
+	void FSTP(int bits, OpArg dest);
+	void FNSTSW_AX();
+	void FWAIT();
+
+	// SSE/SSE2: Floating point arithmetic
+	void ADDSS(X64Reg regOp, OpArg arg);
+	void ADDSD(X64Reg regOp, OpArg arg);
+	void SUBSS(X64Reg regOp, OpArg arg);
+	void SUBSD(X64Reg regOp, OpArg arg);
+	void MULSS(X64Reg regOp, OpArg arg);
+	void MULSD(X64Reg regOp, OpArg arg);
+	void DIVSS(X64Reg regOp, OpArg arg);
+	void DIVSD(X64Reg regOp, OpArg arg);
+	void MINSS(X64Reg regOp, OpArg arg);
+	void MINSD(X64Reg regOp, OpArg arg);
+	void MAXSS(X64Reg regOp, OpArg arg);
+	void MAXSD(X64Reg regOp, OpArg arg);
+	void SQRTSS(X64Reg regOp, OpArg arg);
+	void SQRTSD(X64Reg regOp, OpArg arg);
+	void RSQRTSS(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Floating point scalar compares; `compare` is the predicate immediate (CMP_EQ, CMP_LT, ...)
+	void CMPSS(X64Reg regOp, OpArg arg, u8 compare);
+	void CMPSD(X64Reg regOp, OpArg arg, u8 compare);
+
+	inline void CMPEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_EQ); }
+	inline void CMPLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LT); }
+	inline void CMPLESS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_LE); }
+	inline void CMPUNORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_UNORD); }
+	inline void CMPNEQSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NEQ); }
+	inline void CMPNLTSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_NLT); }
+	inline void CMPORDSS(X64Reg regOp, OpArg arg) { CMPSS(regOp, arg, CMP_ORD); }
+
+	// SSE/SSE2: Floating point packed arithmetic (x4 for float, x2 for double)
+	void ADDPS(X64Reg regOp, OpArg arg);
+	void ADDPD(X64Reg regOp, OpArg arg);
+	void SUBPS(X64Reg regOp, OpArg arg);
+	void SUBPD(X64Reg regOp, OpArg arg);
+	void CMPPS(X64Reg regOp, OpArg arg, u8 compare);
+	void CMPPD(X64Reg regOp, OpArg arg, u8 compare);
+	void MULPS(X64Reg regOp, OpArg arg);
+	void MULPD(X64Reg regOp, OpArg arg);
+	void DIVPS(X64Reg regOp, OpArg arg);
+	void DIVPD(X64Reg regOp, OpArg arg);
+	void MINPS(X64Reg regOp, OpArg arg);
+	void MINPD(X64Reg regOp, OpArg arg);
+	void MAXPS(X64Reg regOp, OpArg arg);
+	void MAXPD(X64Reg regOp, OpArg arg);
+	void SQRTPS(X64Reg regOp, OpArg arg);
+	void SQRTPD(X64Reg regOp, OpArg arg);
+	void RSQRTPS(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Floating point packed bitwise (x4 for float, x2 for double)
+	void ANDPS(X64Reg regOp, OpArg arg);
+	void ANDPD(X64Reg regOp, OpArg arg);
+	void ANDNPS(X64Reg regOp, OpArg arg);
+	void ANDNPD(X64Reg regOp, OpArg arg);
+	void ORPS(X64Reg regOp, OpArg arg);
+	void ORPD(X64Reg regOp, OpArg arg);
+	void XORPS(X64Reg regOp, OpArg arg);
+	void XORPD(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Shuffle components. These are tricky - see Intel documentation.
+	void SHUFPS(X64Reg regOp, OpArg arg, u8 shuffle);
+	void SHUFPD(X64Reg regOp, OpArg arg, u8 shuffle);
+
+	// SSE/SSE2: Useful alternative to shuffle in some cases.
+	void MOVDDUP(X64Reg regOp, OpArg arg);
+
+	void UNPCKLPS(X64Reg dest, OpArg src);
+	void UNPCKHPS(X64Reg dest, OpArg src);
+	void UNPCKLPD(X64Reg dest, OpArg src);
+	void UNPCKHPD(X64Reg dest, OpArg src);
+
+	// SSE/SSE2: Compares.
+	void COMISS(X64Reg regOp, OpArg arg);
+	void COMISD(X64Reg regOp, OpArg arg);
+	void UCOMISS(X64Reg regOp, OpArg arg);
+	void UCOMISD(X64Reg regOp, OpArg arg);
+
+	// SSE/SSE2: Moves. Use the right data type for your data, in most cases.
+	void MOVAPS(X64Reg regOp, OpArg arg);
+	void MOVAPD(X64Reg regOp, OpArg arg);
+	void MOVAPS(OpArg arg, X64Reg regOp);
+	void MOVAPD(OpArg arg, X64Reg regOp);
+
+	void MOVUPS(X64Reg regOp, OpArg arg);
+	void MOVUPD(X64Reg regOp, OpArg arg);
+	void MOVUPS(OpArg arg, X64Reg regOp);
+	void MOVUPD(OpArg arg, X64Reg regOp);
+
+	void MOVDQA(X64Reg regOp, OpArg arg);
+	void MOVDQA(OpArg arg, X64Reg regOp);
+	void MOVDQU(X64Reg regOp, OpArg arg);
+	void MOVDQU(OpArg arg, X64Reg regOp);
+
+	void MOVSS(X64Reg regOp, OpArg arg);
+	void MOVSD(X64Reg regOp, OpArg arg);
+	void MOVSS(OpArg arg, X64Reg regOp);
+	void MOVSD(OpArg arg, X64Reg regOp);
+
+	void MOVLPD(X64Reg regOp, OpArg arg);
+	void MOVHPD(X64Reg regOp, OpArg arg);
+	void MOVLPD(OpArg arg, X64Reg regOp);
+	void MOVHPD(OpArg arg, X64Reg regOp);
+
+	void MOVHLPS(X64Reg regOp1, X64Reg regOp2);
+	void MOVLHPS(X64Reg regOp1, X64Reg regOp2);
+
+	void MOVD_xmm(X64Reg dest, const OpArg &arg);
+	void MOVQ_xmm(X64Reg dest, OpArg arg);
+	void MOVD_xmm(const OpArg &arg, X64Reg src);
+	void MOVQ_xmm(OpArg arg, X64Reg src);
+
+	// SSE/SSE2: Generates a mask from the high bits of the components of the packed register in question.
+	void MOVMSKPS(X64Reg dest, OpArg arg);
+	void MOVMSKPD(X64Reg dest, OpArg arg);
+
+	// SSE2: Selective byte store, mask in src register. EDI/RDI specifies store address. This is a weird one.
+	void MASKMOVDQU(X64Reg dest, X64Reg src);
+	void LDDQU(X64Reg dest, OpArg src);
+
+	// SSE/SSE2: Data type conversions.
+	void CVTPS2PD(X64Reg dest, OpArg src);
+	void CVTPD2PS(X64Reg dest, OpArg src);
+	void CVTSS2SD(X64Reg dest, OpArg src);
+	void CVTSI2SS(X64Reg dest, OpArg src);
+	void CVTSD2SS(X64Reg dest, OpArg src);
+	void CVTSI2SD(X64Reg dest, OpArg src);
+	void CVTDQ2PD(X64Reg regOp, OpArg arg);
+	void CVTPD2DQ(X64Reg regOp, OpArg arg);
+	void CVTDQ2PS(X64Reg regOp, OpArg arg);
+	void CVTPS2DQ(X64Reg regOp, OpArg arg);
+
+	void CVTTPS2DQ(X64Reg regOp, OpArg arg);
+	void CVTTPD2DQ(X64Reg regOp, OpArg arg);
+
+	// Destinations are X64 regs (rax, rbx, ...) for these instructions.
+	void CVTSS2SI(X64Reg xregdest, OpArg src);
+	void CVTSD2SI(X64Reg xregdest, OpArg src);
+	void CVTTSS2SI(X64Reg xregdest, OpArg arg);
+	void CVTTSD2SI(X64Reg xregdest, OpArg arg);
+
+	// SSE2: Packed integer instructions
+	void PACKSSDW(X64Reg dest, OpArg arg);
+	void PACKSSWB(X64Reg dest, OpArg arg);
+	void PACKUSDW(X64Reg dest, OpArg arg);
+	void PACKUSWB(X64Reg dest, OpArg arg);
+
+	void PUNPCKLBW(X64Reg dest, const OpArg &arg);
+	void PUNPCKLWD(X64Reg dest, const OpArg &arg);
+	void PUNPCKLDQ(X64Reg dest, const OpArg &arg);
+
+	void PTEST(X64Reg dest, OpArg arg);
+	void PAND(X64Reg dest, OpArg arg);
+	void PANDN(X64Reg dest, OpArg arg);
+	void PXOR(X64Reg dest, OpArg arg);
+	void POR(X64Reg dest, OpArg arg);
+
+	void PADDB(X64Reg dest, OpArg arg);
+	void PADDW(X64Reg dest, OpArg arg);
+	void PADDD(X64Reg dest, OpArg arg);
+	void PADDQ(X64Reg dest, OpArg arg);
+
+	void PADDSB(X64Reg dest, OpArg arg);
+	void PADDSW(X64Reg dest, OpArg arg);
+	void PADDUSB(X64Reg dest, OpArg arg);
+	void PADDUSW(X64Reg dest, OpArg arg);
+
+	void PSUBB(X64Reg dest, OpArg arg);
+	void PSUBW(X64Reg dest, OpArg arg);
+	void PSUBD(X64Reg dest, OpArg arg);
+	void PSUBQ(X64Reg dest, OpArg arg);
+
+	void PSUBSB(X64Reg dest, OpArg arg);
+	void PSUBSW(X64Reg dest, OpArg arg);
+	void PSUBUSB(X64Reg dest, OpArg arg);
+	void PSUBUSW(X64Reg dest, OpArg arg);
+
+	void PAVGB(X64Reg dest, OpArg arg);
+	void PAVGW(X64Reg dest, OpArg arg);
+
+	void PCMPEQB(X64Reg dest, OpArg arg);
+	void PCMPEQW(X64Reg dest, OpArg arg);
+	void PCMPEQD(X64Reg dest, OpArg arg);
+
+	void PCMPGTB(X64Reg dest, OpArg arg);
+	void PCMPGTW(X64Reg dest, OpArg arg);
+	void PCMPGTD(X64Reg dest, OpArg arg);
+
+	void PEXTRW(X64Reg dest, OpArg arg, u8 subreg);
+	void PINSRW(X64Reg dest, OpArg arg, u8 subreg);
+
+	void PMADDWD(X64Reg dest, OpArg arg);
+	void PSADBW(X64Reg dest, OpArg arg);
+
+	void PMAXSW(X64Reg dest, OpArg arg);
+	void PMAXUB(X64Reg dest, OpArg arg);
+	void PMINSW(X64Reg dest, OpArg arg);
+	void PMINUB(X64Reg dest, OpArg arg);
+
+	void PMOVMSKB(X64Reg dest, OpArg arg);
+	void PSHUFD(X64Reg dest, OpArg arg, u8 shuffle);
+	void PSHUFB(X64Reg dest, OpArg arg);
+
+	void PSHUFLW(X64Reg dest, OpArg arg, u8 shuffle);
+	void PSHUFHW(X64Reg dest, OpArg arg, u8 shuffle);
+
+	void PSRLW(X64Reg reg, int shift);
+	void PSRLD(X64Reg reg, int shift);
+	void PSRLQ(X64Reg reg, int shift);
+	void PSRLQ(X64Reg reg, OpArg arg);
+	void PSRLDQ(X64Reg reg, int shift);
+
+	void PSLLW(X64Reg reg, int shift);
+	void PSLLD(X64Reg reg, int shift);
+	void PSLLQ(X64Reg reg, int shift);
+	void PSLLDQ(X64Reg reg, int shift);
+
+	void PSRAW(X64Reg reg, int shift);
+	void PSRAD(X64Reg reg, int shift);
+
+	// SSE4: data type conversions
+	void PMOVSXBW(X64Reg dest, OpArg arg);
+	void PMOVSXBD(X64Reg dest, OpArg arg);
+	void PMOVSXBQ(X64Reg dest, OpArg arg);
+	void PMOVSXWD(X64Reg dest, OpArg arg);
+	void PMOVSXWQ(X64Reg dest, OpArg arg);
+	void PMOVSXDQ(X64Reg dest, OpArg arg);
+	void PMOVZXBW(X64Reg dest, OpArg arg);
+	void PMOVZXBD(X64Reg dest, OpArg arg);
+	void PMOVZXBQ(X64Reg dest, OpArg arg);
+	void PMOVZXWD(X64Reg dest, OpArg arg);
+	void PMOVZXWQ(X64Reg dest, OpArg arg);
+	void PMOVZXDQ(X64Reg dest, OpArg arg);
+
+	// SSE4: variable blend instructions (xmm0 implicit argument)
+	void PBLENDVB(X64Reg dest, OpArg arg);
+	void BLENDVPS(X64Reg dest, OpArg arg);
+	void BLENDVPD(X64Reg dest, OpArg arg);
+
+	// AVX
+	void VADDSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSUBSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VMULSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VDIVSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VADDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSUBPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VMULPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VDIVPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSQRTSD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VSHUFPD(X64Reg regOp1, X64Reg regOp2, OpArg arg, u8 shuffle);
+	void VUNPCKLPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VUNPCKHPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	void VANDPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VANDPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VANDNPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VANDNPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VXORPS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VXORPD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	void VPAND(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPANDN(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VPXOR(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	// FMA3
+	void VFMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMADD231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB132SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB213SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB231SS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB132SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB213SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFNMSUB231SD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMADDSUB231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD132PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD213PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD231PS(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD132PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD213PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void VFMSUBADD231PD(X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	// VEX GPR instructions
+	void SARX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void SHLX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void SHRX(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void RORX(int bits, X64Reg regOp, OpArg arg, u8 rotate);
+	void PEXT(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void PDEP(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void MULX(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+	void BZHI(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void BLSR(int bits, X64Reg regOp, OpArg arg);
+	void BLSMSK(int bits, X64Reg regOp, OpArg arg);
+	void BLSI(int bits, X64Reg regOp, OpArg arg);
+	void BEXTR(int bits, X64Reg regOp1, OpArg arg, X64Reg regOp2);
+	void ANDN(int bits, X64Reg regOp1, X64Reg regOp2, OpArg arg);
+
+	void RDTSC();
+
+	// Utility functions
+	// The difference between this and CALL is that this aligns the stack
+	// where appropriate.
+	void ABI_CallFunction(const void *func);
+
+	void ABI_CallFunctionC16(const void *func, u16 param1);
+	void ABI_CallFunctionCC16(const void *func, u32 param1, u16 param2);
+
+	// These only support u32 parameters, but that's enough for a lot of uses.
+	// These will destroy the 1 or 2 first "parameter regs".
+	void ABI_CallFunctionC(const void *func, u32 param1);
+	void ABI_CallFunctionCC(const void *func, u32 param1, u32 param2);
+	void ABI_CallFunctionCP(const void *func, u32 param1, void *param2);
+	void ABI_CallFunctionCCC(const void *func, u32 param1, u32 param2, u32 param3);
+	void ABI_CallFunctionCCP(const void *func, u32 param1, u32 param2, void *param3);
+	void ABI_CallFunctionCCCP(const void *func, u32 param1, u32 param2,u32 param3, void *param4);
+	void ABI_CallFunctionPC(const void *func, void *param1, u32 param2);
+	void ABI_CallFunctionPPC(const void *func, void *param1, void *param2, u32 param3);
+	void ABI_CallFunctionAC(int bits, const void *func, const OpArg &arg1, u32 param2);
+	void ABI_CallFunctionA(int bits, const void *func, const OpArg &arg1);
+
+	// Pass a register as a parameter.
+	void ABI_CallFunctionR(const void *func, X64Reg reg1);
+	void ABI_CallFunctionRR(const void *func, X64Reg reg1, X64Reg reg2);
+
+	// Helper method for the above, or can be used separately.
+	void MOVTwo(int bits, Gen::X64Reg dst1, Gen::X64Reg src1, Gen::X64Reg dst2, Gen::X64Reg src2);
+
+	// Saves/restores the registers and adjusts the stack to be aligned as
+	// required by the ABI, where the previous alignment was as specified.
+	// Push returns the size of the shadow space, i.e. the offset of the frame.
+	size_t ABI_PushRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+	void ABI_PopRegistersAndAdjustStack(BitSet32 mask, size_t rsp_alignment, size_t needed_frame_size = 0);
+
+	inline int ABI_GetNumXMMRegs() { return 16; }
+
+	// Strange call wrappers.
+	void CallCdeclFunction3(void* fnptr, u32 arg0, u32 arg1, u32 arg2);
+	void CallCdeclFunction4(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
+	void CallCdeclFunction5(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+	void CallCdeclFunction6(void* fnptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
+
+	// Comments from VertexLoader.cpp about these horrors:
+
+	// This is a horrible hack that is necessary in 64-bit mode because Opengl32.dll is based way, way above the 32-bit
+	// address space that is within reach of a CALL, and just doing &fn gives us these high uncallable addresses. So we
+	// want to grab the function pointers from the import table instead.
+
+	void ___CallCdeclImport3(void* impptr, u32 arg0, u32 arg1, u32 arg2);
+	void ___CallCdeclImport4(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3);
+	void ___CallCdeclImport5(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
+	void ___CallCdeclImport6(void* impptr, u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 arg5);
+
+	#define CallCdeclFunction3_I(a,b,c,d) ___CallCdeclImport3(&__imp_##a,b,c,d)
+	#define CallCdeclFunction4_I(a,b,c,d,e) ___CallCdeclImport4(&__imp_##a,b,c,d,e)
+	#define CallCdeclFunction5_I(a,b,c,d,e,f) ___CallCdeclImport5(&__imp_##a,b,c,d,e,f)
+	#define CallCdeclFunction6_I(a,b,c,d,e,f,g) ___CallCdeclImport6(&__imp_##a,b,c,d,e,f,g)
+
+	#define DECLARE_IMPORT(x) extern "C" void *__imp_##x
+
+	// Utility to generate a call to a std::function object.
+	//
+	// Unfortunately, calling operator() directly is undefined behavior in C++
+	// (this method might be a thunk in the case of multi-inheritance) so we
+	// have to go through a trampoline function.
+	template <typename T, typename... Args>
+	static void CallLambdaTrampoline(const std::function<T(Args...)>* f,
+	                                 Args... args)
+	{
+		(*f)(args...);  // the callee's result, if any, is discarded
+	}
+
+	template <typename T, typename... Args>
+	void ABI_CallLambdaC(const std::function<T(Args...)>* f, u32 p1)
+	{
+		// Double casting is required by VC++ for some reason.
+		auto trampoline = (void(*)())&XEmitter::CallLambdaTrampoline<T, Args...>;
+		ABI_CallFunctionPC((void*)trampoline, const_cast<void*>((const void*)f), p1);  // f is passed as the pointer argument, p1 as the u32
+	}
+};  // class XEmitter
+
+class X64CodeBlock : public CodeBlock<XEmitter>  // CodeBlock instantiated with the x86/x64 emitter
+{
+private:
+	void PoisonMemory() override
+	{
+		// x86/64: 0xCC = breakpoint (INT3); fill all region_size bytes of `region` with it
+		memset(region, 0xCC, region_size);
+	}
+};
+
+}  // namespace