Initial commit

2026-02-02 04:50:13 +01:00
commit 5b11698731
22592 changed files with 7677434 additions and 0 deletions
--- a/graphics/blit/blit-alpha.cpp
+++ b/graphics/blit/blit-alpha.cpp
@@ -0,0 +1,541 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/system.h"
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+
+namespace Graphics {
+
+namespace {
+
+// Reads one Size-byte pixel at src; sizes without a matching integer type are assembled with memcpy().
+template<typename Color, int Size>
+static inline uint32 READ_PIXEL(const byte *src) {
+	if (Size == sizeof(Color))
+		return *(const Color *)src;
+	// Zeroed so the byte memcpy() does not overwrite is defined (matters for colour-key compares)
+	uint32 color = 0;
+	uint8 *col = (uint8 *)&color;
+#ifdef SCUMM_BIG_ENDIAN
+	if (Size == 3)
+		col++;	// skip the most significant byte
+#endif
+	memcpy(col, src, Size);
+	return color;
+}
+
+// Stores a Size-byte pixel taken from color at dst; the write-side mirror of READ_PIXEL.
+template<typename Color, int Size>
+static inline void WRITE_PIXEL(byte *dst, const uint32 color) {
+	if (Size == sizeof(Color)) {
+		*(Color *)dst = color;
+		return;
+	}
+	const uint8 *bytes = (const uint8 *)&color;
+#ifdef SCUMM_BIG_ENDIAN
+	bytes += (Size == 3) ? 1 : 0;
+#endif
+	memcpy(dst, bytes, Size);
+}
+
+// Shared per-pixel loop of all alpha blits. Opacity comes from the mask byte (hasMask), the colour key
+// (hasKey) or the source alpha bits; opaque pixels are copied when aMod is 0xff, the rest is blended.
+template<typename SrcColor, int SrcSize, typename DstColor, int DstSize, bool hasKey, bool hasMask, bool hasMap>
+static inline void alphaBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+						const PixelFormat &srcFmt, const PixelFormat &dstFmt, const uint32 *map,
+						const int srcDelta, const int dstDelta, const int maskDelta,
+						const int srcInc, const int dstInc, const int maskInc,
+						const uint32 key, const byte flip, const byte aMod) {
+	const uint32 alphaMask = srcFmt.ARGBToColor(255, 0, 0, 0);
+	// Mapped colours are used as dstFmt values; otherwise convert when sizes or formats differ (test was inverted)
+	const bool convert = hasMap ? false : ((SrcSize != DstSize) ? true : srcFmt != dstFmt);
+
+	for (uint y = 0; y < h; ++y) {
+		for (uint x = 0; x < w; ++x) {
+			const uint32 srcColor = hasMap ? map[*src]
+				: READ_PIXEL<SrcColor, SrcSize>(src);
+
+			const bool isOpaque = hasMask ? (*mask == 0xff)
+				: (hasKey ? (READ_PIXEL<SrcColor, SrcSize>(src) != key)
+				: !alphaMask || ((srcColor & alphaMask) == alphaMask));
+			const bool isTransparent = hasMask ? (*mask == 0x00)
+				: (hasKey ? (READ_PIXEL<SrcColor, SrcSize>(src) == key)
+				: alphaMask && ((srcColor & alphaMask) == 0));
+
+			if (isOpaque && aMod == 0xff) {
+				if (convert) {
+					byte sR, sG, sB;
+					srcFmt.colorToRGB(srcColor, sR, sG, sB);
+					WRITE_PIXEL<DstColor, DstSize>(dst, dstFmt.RGBToColor(sR, sG, sB));
+				} else {
+					WRITE_PIXEL<DstColor, DstSize>(dst, srcColor);
+				}
+			} else if (!isTransparent) {
+				// TODO: Optimise for matching formats?
+				const uint32 dstColor = READ_PIXEL<DstColor, DstSize>(dst);
+				byte sA, sR, sG, sB;
+				srcFmt.colorToARGB(srcColor, sA, sR, sG, sB);
+				byte dR, dG, dB;
+				dstFmt.colorToRGB(dstColor, dR, dG, dB);
+				// Effective alpha: keyed pixels use aMod alone, otherwise mask/source alpha scaled by aMod
+				if (hasKey)
+					sA = aMod;
+				else if (hasMask)
+					sA = ((*mask * aMod) >> 8);
+				else
+					sA = ((sA * aMod) >> 8);
+
+				dR = (dR * (255-sA) + sR * sA) >> 8;
+				dG = (dG * (255-sA) + sG * sA) >> 8;
+				dB = (dB * (255-sA) + sB * sA) >> 8;
+				const uint32 outColor = dstFmt.RGBToColor(dR, dG, dB);
+				WRITE_PIXEL<DstColor, DstSize>(dst, outColor);
+			}
+
+			src += srcInc;
+			dst += dstInc;
+			if (hasMask)
+				mask += maskInc;
+		}
+
+		src += srcDelta;
+		dst += dstDelta;
+		if (hasMask)
+			mask += maskDelta;
+	}
+}
+
+// Picks the alphaBlitLogic instantiation matching the source/destination pixel sizes.
+// Returns true at once when aMod is 0 and false when dstFmt is neither 2 nor 4 bytes wide.
+template<bool hasKey, bool hasMask>
+static inline bool alphaBlitHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+                         const PixelFormat &srcFmt, const PixelFormat &dstFmt,
+                         const uint srcPitch, const uint dstPitch, const uint maskPitch,
+                         const uint32 key, const byte flip, const byte aMod) {
+	if (aMod == 0)
+		return true;
+
+	const bool hasMap = false;
+	const bool mirrorX = (flip & FLIP_H) != 0;
+	const bool mirrorY = (flip & FLIP_V) != 0;
+	// Per-pixel steps; a horizontal flip walks each destination row backwards
+	const int srcInc = srcFmt.bytesPerPixel;
+	const int maskInc = 1;
+	const int dstInc = mirrorX ? -dstFmt.bytesPerPixel : dstFmt.bytesPerPixel;
+
+	const int srcDelta = (srcPitch - w * srcFmt.bytesPerPixel);
+	const int maskDelta = hasMask ? (maskPitch - w) : 0;
+	      int dstDelta = (dstPitch - w * dstFmt.bytesPerPixel);
+
+	// Flipped blits start at the last column and/or row of the destination
+	if (mirrorX)
+		dst += (w - 1) * dstFmt.bytesPerPixel;
+	if (mirrorY)
+		dst += (h - 1) * dstPitch;
+	if (mirrorX && !mirrorY)
+		dstDelta =   (dstPitch * 2) - dstDelta;
+	else if (mirrorY && !mirrorX)
+		dstDelta = -((dstPitch * 2) - dstDelta);
+	else if (mirrorX && mirrorY)
+		dstDelta = -dstDelta;
+
+	// TODO: optimized cases for dstDelta of 0
+	switch (dstFmt.bytesPerPixel) {
+	case 2:
+		if (srcFmt.bytesPerPixel == 2)
+			alphaBlitLogic<uint16, 2, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		else if (srcFmt.bytesPerPixel == 3)
+			alphaBlitLogic<uint8,  3, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		else
+			alphaBlitLogic<uint32, 4, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		return true;
+	case 4:
+		if (srcFmt.bytesPerPixel == 2)
+			alphaBlitLogic<uint16, 2, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		else if (srcFmt.bytesPerPixel == 3)
+			alphaBlitLogic<uint8,  3, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		else
+			alphaBlitLogic<uint32, 4, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+		return true;
+	default:
+		return false;
+	}
+}
+
+// Palette-mapped counterpart of alphaBlitHelper: src holds 1-byte indices looked up in map, whose entries
+// are used as dstFmt colours. Returns true at once when aMod is 0, false when dstFmt is not 2 or 4 bytes wide.
+template<bool hasKey, bool hasMask>
+static inline bool alphaBlitMapHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+                         const PixelFormat &dstFmt, const uint32 *map,
+                         const uint srcPitch, const uint dstPitch, const uint maskPitch,
+                         const uint32 key, const byte flip, const byte aMod) {
+	const Graphics::PixelFormat &srcFmt = dstFmt;
+	const bool hasMap = true;
+	const bool flipx = flip & FLIP_H;
+	const bool flipy = flip & FLIP_V;
+	// As in alphaBlitHelper: blending with aMod 0 is not a no-op (the >> 8 darkens dst), so skip it
+	if (aMod == 0)
+		return true;
+
+	      int dstDelta = (dstPitch - w * dstFmt.bytesPerPixel);
+	const int srcDelta = (srcPitch - w);
+	const int maskDelta = hasMask ? (maskPitch - w) : 0;
+	const int dstInc = flipx ? -dstFmt.bytesPerPixel : dstFmt.bytesPerPixel;
+	const int srcInc = 1;
+	const int maskInc = 1;
+
+	if (flipx)
+		dst += (w - 1) * dstFmt.bytesPerPixel;
+	if (flipy)
+		dst += (h - 1) * dstPitch;
+	if (flipy && flipx)
+		dstDelta = -dstDelta;
+	else if (flipy)
+		dstDelta = -((dstPitch * 2) - dstDelta);
+	else if (flipx)
+		dstDelta =   (dstPitch * 2) - dstDelta;
+	// TODO: optimized cases for dstDelta of 0
+	if (dstFmt.bytesPerPixel == 2) {
+		alphaBlitLogic<uint8,  1, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, map, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+	} else if (dstFmt.bytesPerPixel == 4) {
+		alphaBlitLogic<uint8,  1, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, map, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
+	} else {
+		return false;
+	}
+	return true;
+}
+
+} // End of anonymous namespace
+
+// Blits src onto dst, blending by the source alpha channel scaled with aMod
+bool alphaBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
+				const byte flip, const byte aMod) {
+	// Formats of 0 or 1 bytes per pixel cannot be handled here
+	if (srcFmt.bytesPerPixel <= 1 || dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitHelper<false, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, 0, flip, aMod);
+}
+
+// Blits src onto dst, skipping source pixels equal to key and applying aMod as alpha to the others
+bool alphaKeyBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
+				const uint32 key, const byte flip, const byte aMod) {
+	// Formats of 0 or 1 bytes per pixel cannot be handled here
+	if (srcFmt.bytesPerPixel <= 1 || dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitHelper<true, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, key, flip, aMod);
+}
+
+// Blits src onto dst, taking each pixel's alpha from the matching byte of mask, scaled with aMod
+bool alphaMaskBlit(byte *dst, const byte *src, const byte *mask,
+				const uint dstPitch, const uint srcPitch, const uint maskPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
+				const byte flip, const byte aMod) {
+	// Formats of 0 or 1 bytes per pixel cannot be handled here
+	if (srcFmt.bytesPerPixel <= 1 || dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitHelper<false, true>(dst, src, mask, w, h, srcFmt, dstFmt, srcPitch, dstPitch, maskPitch, 0, flip, aMod);
+}
+
+// Palette variant of alphaBlit: src holds 1-byte indices into map. Fails for a 0- or 1-byte dstFmt.
+bool alphaBlitMap(byte *dst, const byte *src,
+			        const uint dstPitch, const uint srcPitch,
+			        const uint w, const uint h,
+			        const Graphics::PixelFormat &dstFmt, const uint32 *map,
+			        const byte flip, const byte aMod) {
+	if (dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitMapHelper<false, false>(dst, src, nullptr, w, h, dstFmt, map, srcPitch, dstPitch, 0, 0, flip, aMod);
+}
+
+// Palette variant of alphaKeyBlit: src holds 1-byte indices into map. Fails for a 0- or 1-byte dstFmt.
+bool alphaKeyBlitMap(byte *dst, const byte *src,
+			        const uint dstPitch, const uint srcPitch,
+			        const uint w, const uint h,
+			        const Graphics::PixelFormat &dstFmt, const uint32 *map,
+			        const uint32 key, const byte flip, const byte aMod) {
+	if (dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitMapHelper<true, false>(dst, src, nullptr, w, h, dstFmt, map, srcPitch, dstPitch, 0, key, flip, aMod);
+}
+
+// Palette variant of alphaMaskBlit: src holds 1-byte indices into map. Fails for a 0- or 1-byte dstFmt.
+bool alphaMaskBlitMap(byte *dst, const byte *src, const byte *mask,
+			        const uint dstPitch, const uint srcPitch, const uint maskPitch,
+			        const uint w, const uint h,
+			        const Graphics::PixelFormat &dstFmt, const uint32 *map,
+			        const byte flip, const byte aMod) {
+	if (dstFmt.bytesPerPixel <= 1)
+		return false;
+
+	return alphaBlitMapHelper<false, true>(dst, src, mask, w, h, dstFmt, map, srcPitch, dstPitch, maskPitch, 0, flip, aMod);
+}
+
+namespace {
+
+// Core of applyColorKey() for one pixel size. Pixels whose colour bits equal the key are
+// written to dst as the new colour with alpha 0; with overwriteAlpha every other pixel is
+// written with all alpha bits set. Returns whether at least one pixel matched the key.
+template<typename Size, bool overwriteAlpha>
+inline bool applyColorKeyLogic(byte *dst, const byte *src, const uint w, const uint h,
+							   const uint srcDelta, const uint dstDelta,
+							   const Graphics::PixelFormat &format,
+							   const uint8 rKey, const uint8 gKey, const uint8 bKey,
+							   const uint8 rNew, const uint8 gNew, const uint8 bNew) {
+
+	const uint32 alphaBits = format.ARGBToColor(255, 0,    0,    0);
+	const uint32 colorBits = format.ARGBToColor(0,   255,  255,  255);
+	const uint32 matchPix  = format.ARGBToColor(0,   rKey, gKey, bKey);
+	const uint32 replPix   = format.ARGBToColor(0,   rNew, gNew, bNew);
+	bool anyMatch = false;
+
+	for (uint row = 0; row < h; ++row, src += srcDelta, dst += dstDelta) {
+		for (uint col = 0; col < w; ++col, src += sizeof(Size), dst += sizeof(Size)) {
+			const uint32 cur = *(const Size *)src;
+			const bool isKey = ((cur & colorBits) == matchPix);
+
+			// NOTE(review): without overwriteAlpha, non-key pixels are never stored to dst,
+			// so dst only ends up complete when it aliases src or was filled beforehand
+			if (isKey)
+				*(Size *)dst = replPix;
+			else if (overwriteAlpha)
+				*(Size *)dst = cur | alphaBits;
+
+			anyMatch = anyMatch || isKey;
+		}
+	}
+
+	return anyMatch;
+}
+
+// Core of setAlpha() for one pixel size: copies src to dst with the alpha bits replaced by
+// alpha. With skipTransparent, pixels whose alpha bits are all zero are copied unchanged.
+template<typename Size, bool skipTransparent>
+inline void setAlphaLogic(byte *dst, const byte *src, const uint w, const uint h,
+						  const uint srcDelta, const uint dstDelta,
+						  const Graphics::PixelFormat &format, const uint8 alpha) {
+
+	const uint32 alphaBits   = format.ARGBToColor(255,   0,   0,   0);
+	const uint32 colorBits   = format.ARGBToColor(0,     255, 255, 255);
+	const uint32 wantedAlpha = format.ARGBToColor(alpha, 0,   0,   0);
+
+	for (uint row = 0; row < h; ++row) {
+		for (uint col = 0; col < w; ++col) {
+			const uint32 cur = *(const Size *)src;
+			const bool keep = skipTransparent && !(cur & alphaBits);
+
+			*(Size *)dst = keep ? cur : ((cur & colorBits) | wantedAlpha);
+			src += sizeof(Size);
+			dst += sizeof(Size);
+		}
+
+		// Apply the per-row deltas handed in by the caller
+		src += srcDelta;
+		dst += dstDelta;
+	}
+}
+
+} // End of anonymous namespace
+
+// Function to merge a transparent color key with the alpha channel.
+// Pixels matching (rKey, gKey, bKey) are replaced by (rNew, gNew, bNew) with alpha 0, see
+// applyColorKeyLogic. Returns true if at least one pixel matched; false otherwise, and also
+// when the format has no alpha bits or is not 1, 2 or 4 bytes per pixel.
+bool applyColorKey(byte *dst, const byte *src,
+				   const uint dstPitch, const uint srcPitch,
+				   const uint w, const uint h,
+				   const Graphics::PixelFormat &format, const bool overwriteAlpha,
+				   const uint8 rKey, const uint8 gKey, const uint8 bKey,
+				   const uint8 rNew, const uint8 gNew, const uint8 bNew) {
+
+	if (format.aBits() == 0)
+		return false;
+
+	// Bytes between the end of one row of w pixels and the start of the next
+	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
+	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
+
+	switch (format.bytesPerPixel) {
+	case 1:
+		if (overwriteAlpha)
+			return applyColorKeyLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+		else
+			return applyColorKeyLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+	case 2:
+		if (overwriteAlpha)
+			return applyColorKeyLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+		else
+			return applyColorKeyLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+	case 4:
+		if (overwriteAlpha)
+			return applyColorKeyLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+		else
+			return applyColorKeyLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
+	default:
+		return false;
+	}
+}
+
+// Function to set the alpha channel for all pixels to the specified value.
+// With skipTransparent, pixels whose alpha bits are all zero keep them (see setAlphaLogic).
+// Returns false when the format has no alpha bits or is not 1, 2 or 4 bytes per pixel.
+bool setAlpha(byte *dst, const byte *src,
+			  const uint dstPitch, const uint srcPitch,
+			  const uint w, const uint h,
+			  const Graphics::PixelFormat &format,
+			  const bool skipTransparent, const uint8 alpha) {
+
+	if (format.aBits() == 0)
+		return false;
+
+	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
+	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);
+	switch (format.bytesPerPixel) {
+	case 1:
+		if (skipTransparent)
+			setAlphaLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		else
+			setAlphaLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		break;
+	case 2:
+		if (skipTransparent)
+			setAlphaLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		else
+			setAlphaLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		break;
+	case 4:
+		if (skipTransparent)
+			setAlphaLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		else
+			setAlphaLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+// Captures the blit parameters and derives the colour-modulation flags, the source
+// stepping (negated on a flipped axis) and the first source/destination addresses.
+BlendBlit::Args::Args(byte *dst, const byte *src,
+	const uint _dstPitch, const uint _srcPitch,
+	const int posX, const int posY,
+	const uint _width, const uint _height,
+	const int _scaleX, const int _scaleY,
+	const int scaleXsrcOff, const int scaleYsrcOff,
+	const uint32 colorMod, const uint _flipping) :
+		xp(0), yp(0), dstPitch(_dstPitch),
+		width(_width), height(_height), color(colorMod),
+		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
+		scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
+	const bool scaled = (scaleX != SCALE_THRESHOLD) || (scaleY != SCALE_THRESHOLD);
+	alphamod = ((colorMod & kAModMask)   != kAModMask);
+	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);
+	// Pixels are 4 bytes wide; a flipped axis is read from its far end backwards
+	inStep = 4;
+	if (flipping & FLIP_H) {
+		inStep = -inStep;
+		xp = width - 1;
+		if (scaled) xp = xp * scaleX / SCALE_THRESHOLD;
+	}
+	inoStep = _srcPitch;
+	if (flipping & FLIP_V) {
+		inoStep = -inoStep;
+		yp = height - 1;
+		if (scaled) yp = yp * scaleY / SCALE_THRESHOLD;
+	}
+	ino = src + yp * _srcPitch + xp * 4;
+	outo = dst + posY * _dstPitch + posX * 4;
+}
+
+// No implementation is selected at start-up; blit() and fill() install one on their first call
+BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
+BlendBlit::FillFunc BlendBlit::fillFunc = nullptr;
+
+// Only blits to and from 32bpp images.
+// This entry point merely forwards to BlendBlit::blitFunc, which is chosen on the
+// first call according to the SIMD features the CPU reports at runtime.
+void BlendBlit::blit(byte *dst, const byte *src,
+					 const uint dstPitch, const uint srcPitch,
+					 const int posX, const int posY,
+					 const uint width, const uint height,
+					 const int scaleX, const int scaleY,
+					 const int scaleXsrcOff, const int scaleYsrcOff,
+					 const uint32 colorMod, const uint flipping,
+					 const TSpriteBlendMode blendMode,
+					 const AlphaType alphaType) {
+	if (!width || !height)
+		return;
+
+	if (blitFunc == nullptr) {
+		// Later checks win, so the last variant reported as available is the one kept
+		BlitFunc chosen = blitGeneric;
+#ifdef SCUMMVM_NEON
+		if (g_system->hasFeature(OSystem::kFeatureCpuNEON)) chosen = blitNEON;
+#endif
+#ifdef SCUMMVM_SSE2
+		if (g_system->hasFeature(OSystem::kFeatureCpuSSE2)) chosen = blitSSE2;
+#endif
+#ifdef SCUMMVM_AVX2
+		if (g_system->hasFeature(OSystem::kFeatureCpuAVX2)) chosen = blitAVX2;
+#endif
+		blitFunc = chosen;
+	}
+
+	Args blitArgs(dst, src, dstPitch, srcPitch, posX, posY, width, height, scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
+	blitFunc(blitArgs, blendMode, alphaType);
+}
+
+// Only fills 32bpp images.
+// This entry point merely forwards to BlendBlit::fillFunc, which is installed on
+// the first call; the generic implementation is the only candidate so far.
+// Nothing happens when width or height is 0.
+void BlendBlit::fill(byte *dst, const uint dstPitch,
+					 const uint width, const uint height,
+					 const uint32 colorMod,
+					 const TSpriteBlendMode blendMode) {
+	if (!width || !height)
+		return;
+
+	// TODO: Add SIMD variants
+	if (fillFunc == nullptr)
+		fillFunc = fillGeneric;
+
+	// A fill passes no source image and zeroes for the source pitch,
+	// position, scale, scale offsets and flipping
+	Args fillArgs(dst, nullptr, dstPitch, 0, 0, 0, width, height, 0, 0, 0, 0, colorMod, 0);
+	fillFunc(fillArgs, blendMode);
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-alpha.h
+++ b/graphics/blit/blit-alpha.h
@@ -0,0 +1,501 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit.h"
+
+namespace Graphics {
+
+class BlendBlitImpl_Base {
+	friend class BlendBlit;
+protected:
+
+// Splits the colour modifier into channels; a channel whose modulation flag is off reads as 255.
+template<bool rgbmod, bool alphamod>
+struct BaseBlend {
+public:
+	constexpr BaseBlend(const uint32 color) :
+		ca(alphamod ? byte(color >> BlendBlit::kAModShift) : byte(255)),
+		cr(rgbmod   ? byte(color >> BlendBlit::kRModShift) : byte(255)),
+		cg(rgbmod   ? byte(color >> BlendBlit::kGModShift) : byte(255)),
+		cb(rgbmod   ? byte(color >> BlendBlit::kBModShift) : byte(255)) {}
+protected:
+	const byte ca, cr, cg, cb;
+};
+
+// Alpha blend of the source over the destination; every pixel it writes gets alpha 255.
+// normal() blends one source pixel; fill() blends as if each source channel were 255.
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina;
+
+		if (alphamod) {
+			ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		} else {
+			ina = in[BlendBlit::kAIndex];
+		}
+		// ina == 255 overwrites out, ina == 0 leaves it untouched, anything else blends
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = (in[BlendBlit::kBIndex] * this->cb >> 8);
+				out[BlendBlit::kGIndex] = (in[BlendBlit::kGIndex] * this->cg >> 8);
+				out[BlendBlit::kRIndex] = (in[BlendBlit::kRIndex] * this->cr >> 8);
+			} else {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = in[BlendBlit::kBIndex];
+				out[BlendBlit::kGIndex] = in[BlendBlit::kGIndex];
+				out[BlendBlit::kRIndex] = in[BlendBlit::kRIndex];
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+				const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+				const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * this->cb >> 16);
+				out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * this->cg >> 16);
+				out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * this->cr >> 16);
+			} else {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + in[BlendBlit::kBIndex] * ina) >> 8;
+				out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * (255 - ina) + in[BlendBlit::kGIndex] * ina) >> 8;
+				out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * (255 - ina) + in[BlendBlit::kRIndex] * ina) >> 8;
+			}
+		}
+	}
+
+	inline void fill(byte *out) const {
+		uint32 ina = this->ca;
+		// The opaque/transparent shortcuts are commented out, so every alpha value takes the blend path
+		/* if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = this->cb;
+				out[BlendBlit::kGIndex] = this->cg;
+				out[BlendBlit::kRIndex] = this->cr;
+			} else {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = 255;
+				out[BlendBlit::kGIndex] = 255;
+				out[BlendBlit::kRIndex] = 255;
+			}
+		} else if (ina != 0) */ {
+			if (rgbmod) {
+				const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
+				const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
+				const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
+
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = outb + (255 * ina * this->cb >> 16);
+				out[BlendBlit::kGIndex] = outg + (255 * ina * this->cg >> 16);
+				out[BlendBlit::kRIndex] = outr + (255 * ina * this->cr >> 16);
+			} else {
+				out[BlendBlit::kAIndex] = 255;
+				out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + 255 * ina) >> 8;
+				out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * (255 - ina) + 255 * ina) >> 8;
+				out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * (255 - ina) + 255 * ina) >> 8;
+			}
+		}
+	}
+};
+
+// Multiply blend: scales each destination channel by the (modulated, alpha-weighted) source channel; alpha is not written.
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina;
+		if (alphamod) {
+			ina = in[BlendBlit::kAIndex] * this->ca >> 8;
+		} else {
+			ina = in[BlendBlit::kAIndex];
+		}
+		// ina == 0 leaves out untouched; partial alpha additionally scales the source channel by ina
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb) >> 8) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg) >> 8) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr) >> 8) >> 8;
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * in[BlendBlit::kBIndex] >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * in[BlendBlit::kGIndex] >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * in[BlendBlit::kRIndex] >> 8;
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16) >> 8;
+			} else {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * ina) >> 8) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * ina) >> 8) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * ina) >> 8) >> 8;
+			}
+		}
+	}
+
+	inline void fill(byte *out) const {
+		uint32 ina = this->ca;
+		// With ina == 255 and no rgb modulation, or with ina == 0, out is left unchanged
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * this->cb) >> 8;
+				out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * this->cg) >> 8;
+				out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * this->cr) >> 8;
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((this->cb * ina) >> 8) >> 8;
+				out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((this->cg * ina) >> 8) >> 8;
+				out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((this->cr * ina) >> 8) >> 8;
+			} else {
+				out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * ina) >> 8;
+				out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * ina) >> 8;
+				out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * ina) >> 8;
+			}
+		}
+	}
+};
+
+// Copies the source pixel unconditionally with the bits of BlendBlit::kAModMask forced on.
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+	inline void normal(const byte *in, byte *out) const {
+		*reinterpret_cast<uint32 *>(out) = *reinterpret_cast<const uint32 *>(in) | BlendBlit::kAModMask;
+	}
+};
+
+// One-bit transparency: a source pixel is either copied as fully opaque or skipped.
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		const uint32 srcPix = *reinterpret_cast<const uint32 *>(in);
+		// Any non-zero alpha counts as fully opaque; zero alpha leaves out untouched
+		if ((srcPix & BlendBlit::kAModMask) == 0)
+			return;
+		*reinterpret_cast<uint32 *>(out) = srcPix | BlendBlit::kAModMask;
+	}
+};
+
+// Additive blend: adds the (modulated, alpha-weighted) source colour to the destination.
+// Each channel sum is clamped to 255; the sums used to be stored unclamped and wrapped
+// around in the 8-bit channel, turning bright results dark. Destination alpha is not written.
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	// Clamps a channel sum to the range of a byte
+	static inline byte saturate(const uint sum) { return (sum > 255) ? 255 : sum; }
+
+	inline void normal(const byte *in, byte *out) const {
+		const uint32 ina = alphamod ? (in[BlendBlit::kAIndex] * this->ca >> 8) : in[BlendBlit::kAIndex];
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb) >> 8));
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg) >> 8));
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr) >> 8));
+			} else {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + in[BlendBlit::kBIndex]);
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + in[BlendBlit::kGIndex]);
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + in[BlendBlit::kRIndex]);
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16));
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16));
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16));
+			} else {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * ina) >> 8));
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * ina) >> 8));
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * ina) >> 8));
+			}
+		}
+	}
+
+	inline void fill(byte *out) const {
+		const uint32 ina = this->ca;
+
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + this->cb);
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + this->cg);
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + this->cr);
+			} else {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + 255);
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + 255);
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + 255);
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + ((this->cb * ina) >> 8));
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + ((this->cg * ina) >> 8));
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + ((this->cr * ina) >> 8));
+			} else {
+				out[BlendBlit::kBIndex] = saturate(out[BlendBlit::kBIndex] + ina);
+				out[BlendBlit::kGIndex] = saturate(out[BlendBlit::kGIndex] + ina);
+				out[BlendBlit::kRIndex] = saturate(out[BlendBlit::kRIndex] + ina);
+			}
+		}
+	}
+};
+
+// Subtractive blend: reduces each destination channel by its own value scaled by the source channel; alpha is forced to 255.
+// NOTE(review): normal() uses the raw source alpha and fill() no alpha at all; the alpha modifier ca is never read here.
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BaseBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
+
+	inline void normal(const byte *in, byte *out) const {
+		uint32 ina = in[BlendBlit::kAIndex];
+		out[BlendBlit::kAIndex] = 255;
+		if (ina == 255) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex])) >> 16), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex])) >> 16), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex])) >> 16), 0);
+			} else {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex])) >> 8), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex])) >> 8), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex])) >> 8), 0);
+			}
+		} else if (ina != 0) {
+			if (rgbmod) {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex]) * ina) >> 24), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex]) * ina) >> 24), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex]) * ina) >> 24), 0);
+			} else {
+				out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex]) * ina) >> 16), 0);
+				out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex]) * ina) >> 16), 0);
+				out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex]) * ina) >> 16), 0);
+			}
+		}
+	}
+
+	inline void fill(byte *out) const {
+		out[BlendBlit::kAIndex] = 255;
+		if (rgbmod) {
+			out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((this->cb * out[BlendBlit::kBIndex]) >> 8), 0);
+			out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((this->cg * out[BlendBlit::kGIndex]) >> 8), 0);
+			out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((this->cr * out[BlendBlit::kRIndex]) >> 8), 0);
+		} else {
+			out[BlendBlit::kBIndex] = 0;
+			out[BlendBlit::kGIndex] = 0;
+			out[BlendBlit::kRIndex] = 0;
+		}
+	}
+};
+
+}; // End of class BlendBlitImpl_Base
+
+// Selects and runs the blitInnerLoop<> instantiation of implementation T that
+// matches the requested blend mode, alpha type, scaling and colour modulation.
+//
+// rgbmod / alphamod are set when the corresponding bits of args.color are not
+// all ones. A blit whose two scale factors both equal SCALE_THRESHOLD uses the
+// doscale=false instantiations, any other blit the doscale=true ones.
+// Subtractive blending is only ever instantiated with alphamod=false. A blend
+// mode other than additive, subtractive or multiply must be BLEND_NORMAL
+// (asserted).
+template<class T>
+void BlendBlit::blitT(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+	const bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+	const bool alphamod = ((args.color & kAModMask)   != kAModMask);
+	const bool unscaled = (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD);
+	// Unmodulated normal blending qualifies for the opaque/binary fast paths.
+	const bool plainNormal = (args.color == 0xffffffff && blendMode == BLEND_NORMAL);
+
+	if (unscaled) {
+		if (plainNormal && alphaType == ALPHA_OPAQUE) {
+			T::template blitInnerLoop<T::template OpaqueBlend, false, false, false>(args);
+		} else if (plainNormal && alphaType == ALPHA_BINARY) {
+			T::template blitInnerLoop<T::template BinaryBlend, false, false, false>(args);
+		} else if (blendMode == BLEND_ADDITIVE) {
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template AdditiveBlend, false, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template AdditiveBlend, false, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template AdditiveBlend, false, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template AdditiveBlend, false, false, false>(args);
+		} else if (blendMode == BLEND_SUBTRACTIVE) {
+			if (rgbmod)
+				T::template blitInnerLoop<T::template SubtractiveBlend, false, true, false>(args);
+			else
+				T::template blitInnerLoop<T::template SubtractiveBlend, false, false, false>(args);
+		} else if (blendMode == BLEND_MULTIPLY) {
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template MultiplyBlend, false, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template MultiplyBlend, false, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template MultiplyBlend, false, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template MultiplyBlend, false, false, false>(args);
+		} else {
+			assert(blendMode == BLEND_NORMAL);
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template AlphaBlend, false, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template AlphaBlend, false, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template AlphaBlend, false, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template AlphaBlend, false, false, false>(args);
+		}
+	} else {
+		if (plainNormal && alphaType == ALPHA_OPAQUE) {
+			T::template blitInnerLoop<T::template OpaqueBlend, true, false, false>(args);
+		} else if (plainNormal && alphaType == ALPHA_BINARY) {
+			T::template blitInnerLoop<T::template BinaryBlend, true, false, false>(args);
+		} else if (blendMode == BLEND_ADDITIVE) {
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template AdditiveBlend, true, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template AdditiveBlend, true, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template AdditiveBlend, true, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template AdditiveBlend, true, false, false>(args);
+		} else if (blendMode == BLEND_SUBTRACTIVE) {
+			if (rgbmod)
+				T::template blitInnerLoop<T::template SubtractiveBlend, true, true, false>(args);
+			else
+				T::template blitInnerLoop<T::template SubtractiveBlend, true, false, false>(args);
+		} else if (blendMode == BLEND_MULTIPLY) {
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template MultiplyBlend, true, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template MultiplyBlend, true, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template MultiplyBlend, true, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template MultiplyBlend, true, false, false>(args);
+		} else {
+			assert(blendMode == BLEND_NORMAL);
+			if (rgbmod && alphamod)
+				T::template blitInnerLoop<T::template AlphaBlend, true, true, true>(args);
+			else if (rgbmod)
+				T::template blitInnerLoop<T::template AlphaBlend, true, true, false>(args);
+			else if (alphamod)
+				T::template blitInnerLoop<T::template AlphaBlend, true, false, true>(args);
+			else
+				T::template blitInnerLoop<T::template AlphaBlend, true, false, false>(args);
+		}
+	}
+}
+
+// Selects and runs the fillInnerLoop<> instantiation of implementation T that
+// matches the requested blend mode and the colour modulation in args.color.
+//
+// rgbmod / alphamod are set when the corresponding bits of args.color are not
+// all ones. Subtractive blending is only ever instantiated with
+// alphamod=false. A blend mode other than additive, subtractive or multiply
+// must be BLEND_NORMAL (asserted).
+template<class T>
+void BlendBlit::fillT(Args &args, const TSpriteBlendMode &blendMode) {
+	const bool rgbmod   = ((args.color & kRGBModMask) != kRGBModMask);
+	const bool alphamod = ((args.color & kAModMask)   != kAModMask);
+
+	if (blendMode == BLEND_ADDITIVE) {
+		if (rgbmod && alphamod)
+			T::template fillInnerLoop<T::template AdditiveBlend, true, true>(args);
+		else if (rgbmod)
+			T::template fillInnerLoop<T::template AdditiveBlend, true, false>(args);
+		else if (alphamod)
+			T::template fillInnerLoop<T::template AdditiveBlend, false, true>(args);
+		else
+			T::template fillInnerLoop<T::template AdditiveBlend, false, false>(args);
+	} else if (blendMode == BLEND_SUBTRACTIVE) {
+		if (rgbmod)
+			T::template fillInnerLoop<T::template SubtractiveBlend, true, false>(args);
+		else
+			T::template fillInnerLoop<T::template SubtractiveBlend, false, false>(args);
+	} else if (blendMode == BLEND_MULTIPLY) {
+		if (rgbmod && alphamod)
+			T::template fillInnerLoop<T::template MultiplyBlend, true, true>(args);
+		else if (rgbmod)
+			T::template fillInnerLoop<T::template MultiplyBlend, true, false>(args);
+		else if (alphamod)
+			T::template fillInnerLoop<T::template MultiplyBlend, false, true>(args);
+		else
+			T::template fillInnerLoop<T::template MultiplyBlend, false, false>(args);
+	} else {
+		assert(blendMode == BLEND_NORMAL);
+		if (rgbmod && alphamod)
+			T::template fillInnerLoop<T::template AlphaBlend, true, true>(args);
+		else if (rgbmod)
+			T::template fillInnerLoop<T::template AlphaBlend, true, false>(args);
+		else if (alphamod)
+			T::template fillInnerLoop<T::template AlphaBlend, false, true>(args);
+		else
+			T::template fillInnerLoop<T::template AlphaBlend, false, false>(args);
+	}
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-atari.cpp
+++ b/graphics/blit/blit-atari.cpp
@@ -0,0 +1,283 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit.h"
+
+#include <mint/cookie.h>
+
+#include "backends/graphics/atari/atari-supervidel.h"
+#include "backends/platform/atari/dlmalloc.h"	// MALLOC_ALIGNMENT
+
+static_assert(MALLOC_ALIGNMENT == 16, "MALLOC_ALIGNMENT must be == 16");
+
+#ifdef USE_MOVE16
+// Returns true when the C__CPU cookie exists and its value is at least 40
+// (presumably meaning a 68040 or newer, i.e. a CPU with move16 - confirm).
+// The cookie is queried on the first call only; the answer is cached.
+static inline bool hasMove16() {
+	long cpuCookie;
+	static const bool s_available = (Getcookie(C__CPU, &cpuCookie) == C_FOUND) && (cpuCookie >= 40);
+	return s_available;
+}
+
+// True when 'val', reinterpreted as an integer, is a multiple of
+// MALLOC_ALIGNMENT (i.e. none of its low alignment bits are set).
+template<typename T>
+constexpr bool isAligned(T val) {
+	return !(reinterpret_cast<uintptr>(val) & (MALLOC_ALIGNMENT - 1));
+}
+#endif
+
+namespace Graphics {
+
+// Blits a w x h rectangle of 8-bit pixels from 'src' to 'dst', skipping every
+// source pixel equal to the transparent colour 'key'.
+//
+// 'srcDelta' / 'dstDelta' are the number of bytes to skip at the end of each
+// row (after the w pixels) to reach the start of the next row.
+void keyBlitLogicAtari(byte *dst, const byte *src, const uint w, const uint h,
+					   const uint srcDelta, const uint dstDelta, const uint32 key) {
+#ifdef USE_SV_BLITTER
+	// Hardware path: taken only for key 0 when both buffers lie at or above
+	// 0xA0000000 (presumably SuperVidel video RAM - confirm).
+	if (key == 0 && (uintptr)src >= 0xA0000000 && (uintptr)dst >= 0xA0000000) {
+		if (g_superVidelFwVersion >= 9) {
+			// Firmware 9 and later: push the register values through the FIFO.
+			*SV_BLITTER_FIFO = (long)src;				// SV_BLITTER_SRC1
+			*SV_BLITTER_FIFO = (long)(g_blitMask ? g_blitMask : src);	// SV_BLITTER_SRC2
+			*SV_BLITTER_FIFO = (long)dst;				// SV_BLITTER_DST
+			*SV_BLITTER_FIFO = w - 1;					// SV_BLITTER_COUNT
+			*SV_BLITTER_FIFO = srcDelta + w;			// SV_BLITTER_SRC1_OFFSET
+			*SV_BLITTER_FIFO = srcDelta + w;			// SV_BLITTER_SRC2_OFFSET
+			*SV_BLITTER_FIFO = dstDelta + w;			// SV_BLITTER_DST_OFFSET
+			*SV_BLITTER_FIFO = h;						// SV_BLITTER_MASK_AND_LINES
+			*SV_BLITTER_FIFO = 0x03;					// SV_BLITTER_CONTROL
+		} else {
+			// Older firmware: wait until the blitter is idle, then write
+			// the registers directly.
+			while (*SV_BLITTER_CONTROL & 1);
+
+			*SV_BLITTER_SRC1           = (long)src;
+			*SV_BLITTER_SRC2           = (long)(g_blitMask ? g_blitMask : src);
+			*SV_BLITTER_DST            = (long)dst;
+			*SV_BLITTER_COUNT          = w - 1;
+			*SV_BLITTER_SRC1_OFFSET    = srcDelta + w;
+			*SV_BLITTER_SRC2_OFFSET    = srcDelta + w;
+			*SV_BLITTER_DST_OFFSET     = dstDelta + w;
+			*SV_BLITTER_MASK_AND_LINES = h;
+			*SV_BLITTER_CONTROL        = 0x03;
+		}
+
+		SyncSuperBlitter();
+	} else
+#endif
+	{
+		// Software path: copy pixel by pixel, leaving the destination
+		// untouched wherever the source holds the key colour.
+		for (uint row = 0; row < h; ++row) {
+			for (uint col = 0; col < w; ++col, ++src, ++dst) {
+				const uint32 pixel = *src;
+				if (pixel != key)
+					*dst = pixel;
+			}
+
+			src += srcDelta;
+			dst += dstDelta;
+		}
+	}
+}
+
+// Function to blit a rect (version optimized for Atari Falcon with SuperVidel's SuperBlitter)
+//
+// Copies 'h' rows of 'w' pixels ('bytesPerPixel' bytes each) from 'src' to
+// 'dst'. 'srcPitch' / 'dstPitch' are the byte distances between the starts of
+// consecutive rows. Depending on the build flags and the buffers this uses
+// the SuperVidel blitter, 68040+ move16 transfers or plain memcpy.
+void copyBlit(byte *dst, const byte *src,
+			   const uint dstPitch, const uint srcPitch,
+			   const uint w, const uint h,
+			   const uint bytesPerPixel) {
+	// Nothing to do for a self-copy or an empty rectangle. The empty check
+	// also protects the strided move16 loop below: its 'dbra' row counter is
+	// seeded with h - 1, which would wrap around for h == 0.
+	if (dst == src || w == 0 || h == 0 || bytesPerPixel == 0)
+		return;
+
+#ifdef USE_SV_BLITTER
+	// Hardware path: taken when both buffers lie at or above 0xA0000000
+	// (presumably SuperVidel video RAM - confirm).
+	if ((uintptr)src >= 0xA0000000 && (uintptr)dst >= 0xA0000000) {
+		if (g_superVidelFwVersion >= 9) {
+			// Firmware 9 and later: push the register values through the FIFO.
+			*SV_BLITTER_FIFO = (long)src;				// SV_BLITTER_SRC1
+			*SV_BLITTER_FIFO = 0x00000000;				// SV_BLITTER_SRC2
+			*SV_BLITTER_FIFO = (long)dst;				// SV_BLITTER_DST
+			*SV_BLITTER_FIFO = w * bytesPerPixel - 1;	// SV_BLITTER_COUNT
+			*SV_BLITTER_FIFO = srcPitch;				// SV_BLITTER_SRC1_OFFSET
+			*SV_BLITTER_FIFO = 0x00000000;				// SV_BLITTER_SRC2_OFFSET
+			*SV_BLITTER_FIFO = dstPitch;				// SV_BLITTER_DST_OFFSET
+			*SV_BLITTER_FIFO = h;						// SV_BLITTER_MASK_AND_LINES
+			*SV_BLITTER_FIFO = 0x01;					// SV_BLITTER_CONTROL
+		} else {
+			// make sure the blitter is idle
+			while (*SV_BLITTER_CONTROL & 1);
+
+			*SV_BLITTER_SRC1           = (long)src;
+			*SV_BLITTER_SRC2           = 0x00000000;
+			*SV_BLITTER_DST            = (long)dst;
+			*SV_BLITTER_COUNT          = w * bytesPerPixel - 1;
+			*SV_BLITTER_SRC1_OFFSET    = srcPitch;
+			*SV_BLITTER_SRC2_OFFSET    = 0x00000000;
+			*SV_BLITTER_DST_OFFSET     = dstPitch;
+			*SV_BLITTER_MASK_AND_LINES = h;
+			*SV_BLITTER_CONTROL        = 0x01;
+		}
+
+		SyncSuperBlitter();
+	} else
+#endif
+	if (dstPitch == srcPitch && dstPitch == (w * bytesPerPixel)) {
+		// Rows are contiguous in both buffers: copy everything in one go.
+#ifdef USE_MOVE16
+		if (hasMove16() && isAligned(src) && isAligned(dst)) {
+			// d0 = number of 16-byte blocks. The computed jump enters the
+			// 16x unrolled move16 run part-way to handle (blocks % 16),
+			// then 'dbra' repeats the full run (blocks / 16) times.
+			// %0 / %1 are post-incremented, hence declared as in/out operands.
+			__asm__ volatile(
+			"	move.l	%2,%%d0\n"
+			"	lsr.l	#4,%%d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,%%d1\n"
+			"	and.l	%%d0,%%d1\n"
+			"	neg.l	%%d1\n"
+			"	lsr.l	#4,%%d0\n"
+			"	jmp		(2f,%%pc,%%d1.l*4)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	%%d0,1b\n"
+			// handle also the case when 'dstPitch' is not
+			// divisible by 16 but 'src' and 'dst' are
+			"3:\n"
+			"	moveq	#0x0f,%%d0\n"
+			"	and.l	%2,%%d0\n"
+			"	neg.l	%%d0\n"
+			"	jmp		(4f,%%pc,%%d0.l*2)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+				: "+a"(src), "+a"(dst) // in/out: advanced by the asm
+				: "g"(dstPitch * h) // inputs
+				: "d0", "d1", "cc" AND_MEMORY
+			);
+		} else {
+#else
+		{
+#endif
+			memcpy(dst, src, dstPitch * h);
+		}
+	} else {
+		// Rows are separated by padding: copy row by row.
+#ifdef USE_MOVE16
+		if (hasMove16() && isAligned(src) && isAligned(dst) && isAligned(srcPitch) && isAligned(dstPitch)) {
+			// Row counter for the outer 'dbra', which loops until it
+			// reaches -1. h != 0 is guaranteed by the early return above.
+			uint rows = h - 1;
+
+			// a1 = entry point into the byte tail for (rowBytes % 16),
+			// a0 = entry point into the move16 run for (blocks % 16),
+			// d1 = number of full 16x move16 runs per row (reloaded into
+			// d0 at label 0 for every row). If a row holds no 16-byte
+			// block at all, a0 keeps the byte-tail entry point.
+			// %0 / %1 / %2 are modified, hence declared as in/out operands.
+			__asm__ volatile(
+			"	move.l	%3,%%d0\n"
+
+			"	moveq	#0x0f,%%d1\n"
+			"	and.l	%%d0,%%d1\n"
+			"	neg.l	%%d1\n"
+			"	lea		(4f,%%pc,%%d1.l*2),%%a0\n"
+			"	move.l	%%a0,%%a1\n"
+
+			"	lsr.l	#4,%%d0\n"
+			"	beq.b	3f\n"
+
+			"	moveq	#0x0f,%%d1\n"
+			"	and.l	%%d0,%%d1\n"
+			"	neg.l	%%d1\n"
+			"	lea		(2f,%%pc,%%d1.l*4),%%a0\n"
+			"	lsr.l	#4,%%d0\n"
+			"	move.l	%%d0,%%d1\n"
+			"0:\n"
+			"	move.l	%%d1,%%d0\n"
+			"	jmp		(%%a0)\n"
+			"1:\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"	move16	(%0)+,(%1)+\n"
+			"2:\n"
+			"	dbra	%%d0,1b\n"
+			// handle (w * bytesPerPixel) % 16
+			"3:\n"
+			"	jmp		(%%a1)\n"
+			// only 15x move.b as 16 would be handled above
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"	move.b	(%0)+,(%1)+\n"
+			"4:\n"
+			// skip the row padding and continue with the next row
+			"	add.l	%4,%1\n"
+			"	add.l	%5,%0\n"
+			"	dbra	%2,0b\n"
+				: "+a"(src), "+a"(dst), "+d"(rows) // in/out: modified by the asm
+				: "g"(w * bytesPerPixel),
+				  "g"(dstPitch - w * bytesPerPixel), "g"(srcPitch - w * bytesPerPixel) // inputs
+				: "d0", "d1", "a0", "a1", "cc" AND_MEMORY
+			);
+		} else {
+#else
+		{
+#endif
+			for (uint i = 0; i < h; ++i) {
+				memcpy(dst, src, w * bytesPerPixel);
+				dst += dstPitch;
+				src += srcPitch;
+			}
+		}
+	}
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-avx2.cpp
+++ b/graphics/blit/blit-avx2.cpp
@@ -0,0 +1,330 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/scummsys.h"
+
+#include "graphics/blit/blit-alpha.h"
+#include "graphics/pixelformat.h"
+
+#include <immintrin.h>
+
+#if defined(__clang__)
+#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target("avx2")
+#endif
+
+namespace Graphics {
+
+class BlendBlitImpl_AVX2 : public BlendBlitImpl_Base {
+	friend class BlendBlit;
+
+// AVX2 variant of AlphaBlend. simd() blends the eight 32-bit pixels held in
+// 'src' over the eight pixels held in 'dst' and returns the result.
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		// ina: per-pixel source alpha, additionally scaled by ca / 256 when
+		// alpha modulation is enabled.
+		__m256i ina;
+		if (alphamod)
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
+		else
+			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+		// All-ones in every lane whose effective alpha is zero; those lanes
+		// keep the destination pixel unchanged (see the select at the end).
+		__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+	
+		if (rgbmod) {
+			// Extract each colour channel of dst and src into its own vector.
+			__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			// dst part: dst * (255 - ina), shifted towards the channel position.
+			dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
+			dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+			dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
+			// src part: ((src * ina) >> 8) * modulation colour, added to the dst part.
+			srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
+			srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(this->cb)));
+			// Recombine the channels; the alpha bits of the result are all set.
+			src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
+			src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
+		} else {
+			// No colour modulation: red and blue are processed together in
+			// one vector, green in another.
+			__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+			// result = ((dst * (255 - ina)) >> 8) + ((src * ina) >> 8)
+			dstRB = _mm256_srli_epi32(_mm256_mullo_epi32(dstRB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+			dstG = _mm256_srli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
+			srcRB = _mm256_slli_epi32(_mm256_add_epi32(dstRB, _mm256_srli_epi32(_mm256_mullo_epi32(srcRB, ina), 8)), BlendBlit::kBModShift);
+			srcG = _mm256_slli_epi32(_mm256_add_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+			// Recombine the channels; the alpha bits of the result are all set.
+			src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(_mm256_and_si256(srcRB, _mm256_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+		}
+
+		// Per-lane select: untouched dst where alpha was zero, blended
+		// pixel everywhere else.
+		dst = _mm256_and_si256(alphaMask, dst);
+		src = _mm256_andnot_si256(alphaMask, src);
+		return _mm256_or_si256(dst, src);
+	}
+};
+
+// AVX2 variant of MultiplyBlend. simd() multiplies the eight 32-bit pixels
+// held in 'dst' by the (alpha- and colour-weighted) pixels held in 'src' and
+// returns the result.
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		// ina: per-pixel source alpha (scaled by ca / 256 when alphamod).
+		// alphaMask selects what is taken from dst in the final merge:
+		//  - alphamod: whole pixels whose effective alpha is zero,
+		//  - otherwise: only the alpha bits of every pixel.
+		__m256i ina, alphaMask;
+		if (alphamod) {
+			ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
+			alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
+		} else {
+			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+			alphaMask = _mm256_set1_epi32(BlendBlit::kAModMask);
+		}
+
+		if (rgbmod) {
+			// Extract each colour channel of src and dst into its own vector.
+			__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			// Per channel: dst * ((src * colour * ina) >> 16), then shifted
+			// and masked into the channel's position.
+			srcB = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstB, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcB, _mm256_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcG, _mm256_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcR = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstR, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcR, _mm256_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+			// Keep the source alpha bits and merge in the new colour channels.
+			src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcB, _mm256_or_si256(srcG, srcR)));
+		} else {
+			// No colour modulation: red and blue are processed together in
+			// one vector, green in another.
+			constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
+			__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+			// Per channel: dst * ((src * ina) >> 8), masked into position.
+			srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), 8), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcRB = _mm256_and_si256(_mm256_mullo_epi16(dstRB, _mm256_srli_epi32(_mm256_and_si256(_mm256_mullo_epi32(srcRB, ina), _mm256_set1_epi32(rbMask)), 8)), _mm256_set1_epi32(rbMask));
+			
+			// Keep the source alpha bits and merge in the new colour channels.
+			src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcRB, srcG));
+		}
+
+		// Final merge: bits selected by alphaMask come from dst, the rest
+		// from the blended pixel.
+		dst = _mm256_and_si256(alphaMask, dst);
+		src = _mm256_andnot_si256(alphaMask, src);
+		return _mm256_or_si256(dst, src);
+	}
+};
+
+// AVX2 variant of OpaqueBlend: the source pixels replace the destination
+// outright, with all alpha bits set.
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	// 'dst' is not consulted; the result is 'src' with its alpha bits forced on.
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		const __m256i alphaBits = _mm256_set1_epi32(BlendBlit::kAModMask);
+		return _mm256_or_si256(src, alphaBits);
+	}
+};
+
+// AVX2 variant of BinaryBlend: a source pixel is either skipped (alpha bits
+// all zero) or copied with all alpha bits set.
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		const __m256i alphaBits = _mm256_set1_epi32(BlendBlit::kAModMask);
+
+		// All-ones in every lane whose source alpha is zero.
+		const __m256i transparent = _mm256_cmpeq_epi32(_mm256_and_si256(src, alphaBits), _mm256_setzero_si256());
+
+		// Transparent lanes keep dst, all others take src made opaque.
+		const __m256i keptDst = _mm256_and_si256(dst, transparent);
+		const __m256i opaqueSrc = _mm256_andnot_si256(transparent, _mm256_or_si256(src, alphaBits));
+		return _mm256_or_si256(opaqueSrc, keptDst);
+	}
+};
+
+// AVX2 variant of AdditiveBlend. simd() adds the (alpha- and colour-weighted)
+// eight 32-bit pixels held in 'src' to the eight pixels held in 'dst' and
+// returns the result. The alpha bits of the result are taken from 'dst'.
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		// ina: per-pixel source alpha, additionally scaled by ca / 256 when
+		// alpha modulation is enabled.
+		__m256i ina;
+		if (alphamod)
+			ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
+		else
+			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+		// All-ones in every lane whose effective alpha is zero; those lanes
+		// keep the destination pixel unchanged (see the select at the end).
+		__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));
+
+		if (rgbmod) {
+			// NOTE(review): whether the dst terms and the masks below line up
+			// per channel depends on the BlendBlit::k*ModShift / k*ModMask
+			// values, which are defined outside this file - verify this
+			// branch against the scalar AdditiveBlend::normal().
+			__m256i srcb = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask));
+			__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m256i dstb = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask));
+			__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			// Per channel: dst + src * colour * ina, masked to the channel.
+			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(this->cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(this->cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+			// Keep the destination alpha bits and merge in all three colour
+			// channels. Red has to be merged here as well: OR-ing blue in
+			// twice instead would drop the red lane from the result.
+			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+		} else if (alphamod) {
+			// Alpha modulation only: red and blue are processed together in
+			// one vector, green in another, each weighted by ina.
+			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+		} else {
+			// No modulation at all: plain per-channel dst + src.
+			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
+			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
+			src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
+		}
+
+		// Per-lane select: untouched dst where alpha was zero, blended
+		// pixel everywhere else.
+		dst = _mm256_and_si256(alphaMask, dst);
+		src = _mm256_andnot_si256(alphaMask, src);
+		return _mm256_or_si256(dst, src);
+	}
+};
+
+// AVX2 variant of SubtractiveBlend. simd() darkens the eight 32-bit pixels
+// held in 'dst' according to the pixels held in 'src' and returns the result
+// with all alpha bits set. The same formula is used for every template
+// instantiation; this->ca is not consulted.
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m256i simd(__m256i src, __m256i dst) const {
+		// Extract the source alpha and every colour channel of src and dst
+		// into separate vectors.
+		__m256i ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
+		__m256i srcb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+		__m256i dstb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+		// Per channel: max(dst - ((src * colour * dst * ina) >> 24), 0),
+		// shifted and masked back into the channel's position. The clamp is
+		// done with a 16-bit max, which presumably relies on the channel
+		// values being small (8-bit) - confirm against the mask constants.
+		srcb = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcb, _mm256_set1_epi32(this->cb)), _mm256_mullo_epi32(dstb, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kBModShift), _mm256_set1_epi32(BlendBlit::kBModMask));
+		srcg = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcg, _mm256_set1_epi32(this->cg)), _mm256_mullo_epi32(dstg, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kGModShift), _mm256_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(srcr, _mm256_set1_epi32(this->cr)), _mm256_mullo_epi32(dstr, ina)), 24)), _mm256_set1_epi32(0)), BlendBlit::kRModShift), _mm256_set1_epi32(BlendBlit::kRModMask));
+
+		// Recombine the channels; the alpha bits of the result are all set.
+		return _mm256_or_si256(_mm256_set1_epi32(BlendBlit::kAModMask), _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
+	}
+};
+
+public:
+/**
+ * AVX2 blit loop: blends args.width x args.height 32-bit pixels from args.ino
+ * onto args.outo using PixelFunc. Eight pixels are handled per SIMD iteration;
+ * the remaining pixels of each row go through PixelFunc::normal().
+ *
+ * args.ino (when not scaling) and args.outo are advanced as rows are consumed,
+ * so args is modified by this call.
+ *
+ * @tparam PixelFunc  blend functor providing simd() and normal()
+ * @tparam doscale    when true, source pixels are picked through the scale counters
+ * @tparam rgbmod     forwarded to PixelFunc
+ * @tparam alphamod   forwarded to PixelFunc
+ */
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
+static void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
+	const byte *in;
+	byte *out;
+
+	const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
+
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
+	const byte *inBase;
+
+	// For horizontally flipped, unscaled blits move the row start back by seven
+	// pixels so that an 8-pixel load ends on the original start pixel.
+	// NOTE(review): presumably inStep is negative in this case - confirm with the caller.
+	if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			// Select the source row for this destination row.
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = args.scaleXoff;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+
+		uint32 j = 0;
+		// Vector part: eight pixels at a time.
+		for (; j + 8 <= args.width; j += 8) {
+			__m256i dstPixels, srcPixels;
+			if (loaddst) dstPixels = _mm256_loadu_si256((const __m256i *)out);
+			if (!doscale) {
+				srcPixels = _mm256_loadu_si256((const __m256i *)in);
+			} else {
+				// Gather the eight source pixels one by one through the X scale counter.
+				srcPixels = _mm256_setr_epi32(
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 4) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 5) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 6) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 7) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+				);
+				scaleXCtr += args.scaleX * 8;
+			}
+			if (!doscale && (args.flipping & FLIP_H)) {
+				// Reverse the eight pixels: first within each 128-bit half,
+				// then swap the two halves.
+				srcPixels = _mm256_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
+				srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
+			}
+			{
+				const __m256i res = pixelFunc.simd(srcPixels, dstPixels);
+				_mm256_storeu_si256((__m256i *)out, res);
+			}
+			if (!doscale) in += (ptrdiff_t)args.inStep * 8;
+			out += 4ULL * 8;
+		}
+		// Undo the seven pixel offset before falling back to single pixels.
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 7;
+		// Scalar tail: whatever is left of the row.
+		for (; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+			}
+
+			pixelFunc.normal(in, out);
+
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+		// Advance to the next source and destination rows.
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
+}
+
+}; // end of class BlendBlitImpl_AVX2
+
+/**
+ * AVX2 entry point for blending blits: forwards to blitT() instantiated with
+ * the AVX2 implementation class.
+ */
+void BlendBlit::blitAVX2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+	blitT<BlendBlitImpl_AVX2>(args, blendMode, alphaType);
+}
+
+} // End of namespace Graphics
+
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
--- a/graphics/blit/blit-fast.cpp
+++ b/graphics/blit/blit-fast.cpp
@@ -0,0 +1,138 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+#include "common/endian.h"
+#include "common/system.h"
+
+namespace Graphics {
+
+namespace {
+
+/**
+ * Copies a w x h area of 32-bit pixels from src to dst, transforming every
+ * pixel value on the way.
+ *
+ * A dedicated instantiation per transformation is faster, but larger, than
+ * deciding at run time.
+ *
+ * @tparam bswap   when true, the bytes of each pixel are swapped first
+ * @tparam rotate  when non-zero, each pixel is then rotated right by this many bits
+ * @param dst       destination pixels
+ * @param src       source pixels
+ * @param dstPitch  bytes per destination row
+ * @param srcPitch  bytes per source row
+ * @param w         width in pixels
+ * @param h         height in pixels
+ */
+template<bool bswap, int rotate>
+static void swapBlit(byte *dst, const byte *src,
+                     const uint dstPitch, const uint srcPitch,
+                     const uint w, const uint h) {
+	// Bytes to skip after the last pixel of a row to reach the next row.
+	const uint srcSkip = (srcPitch - w * sizeof(uint32));
+	const uint dstSkip = (dstPitch - w * sizeof(uint32));
+
+	for (uint row = 0; row < h; ++row) {
+		for (uint col = 0; col < w; ++col) {
+			uint32 pixel = *(const uint32 *)src;
+			src += sizeof(uint32);
+
+			if (bswap)
+				pixel = SWAP_BYTES_32(pixel);
+			if (rotate != 0)
+				pixel = ROTATE_RIGHT_32(pixel, rotate);
+
+			*(uint32 *)dst = pixel;
+			dst += sizeof(uint32);
+		}
+		src += srcSkip;
+		dst += dstSkip;
+	}
+}
+
+} // End of anonymous namespace
+
+// TODO: Add fast 24<->32bpp conversion
+// TODO: Add fast 16<->16bpp conversion
+/** One entry of a fast blit lookup table. */
+struct FastBlitLookup {
+	FastBlitFunc func; ///< Conversion routine for this pair of formats
+	Graphics::PixelFormat srcFmt, dstFmt; ///< Formats the routine converts from / to
+};
+
+/**
+ * Conversions between 32-bit formats that only need a byte swap and/or a
+ * rotation of the pixel value. Each row lists the swapBlit instantiation,
+ * the source format and the destination format; the trailing comment names
+ * the conversion. A left rotation by 8 is expressed as a right rotation by 24.
+ * The table is scanned linearly by getFastBlitFunc().
+ */
+static const FastBlitLookup fastBlitFuncs_4to4[] = {
+	// 32-bit byteswap
+	{ swapBlit<true,   0>, Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0) }, // ABGR8888 -> RGBA8888
+	{ swapBlit<true,   0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0), Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24) }, // RGBA8888 -> ABGR8888
+	{ swapBlit<true,   0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0) }, // ARGB8888 -> BGRA8888
+	{ swapBlit<true,   0>, Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0), Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24) }, // BGRA8888 -> ARGB8888
+
+	// 32-bit rotate right
+	{ swapBlit<false,  8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0), Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24) }, // RGBA8888 -> ARGB8888
+	{ swapBlit<false,  8>, Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0), Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24) }, // BGRA8888 -> ABGR8888
+
+	// 32-bit rotate left
+	{ swapBlit<false, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0) }, // ABGR8888 -> BGRA8888
+	{ swapBlit<false, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0) }, // ARGB8888 -> RGBA8888
+
+	// 32-bit byteswap and rotate right
+	{ swapBlit<true,   8>, Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24) }, // ABGR8888 -> ARGB8888
+	{ swapBlit<true,   8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16,  8,  0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8,  0,  8, 16, 24) }, // ARGB8888 -> ABGR8888
+
+	// 32-bit byteswap and rotate left
+	{ swapBlit<true,  24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0), Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0) }, // RGBA8888 -> BGRA8888
+	{ swapBlit<true,  24>, Graphics::PixelFormat(4, 8, 8, 8, 8,  8, 16, 24,  0), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16,  8,  0) }  // BGRA8888 -> RGBA8888
+
+};
+
+#ifdef SCUMMVM_NEON
+/**
+ * Conversions implemented with NEON. getFastBlitFunc() only consults this
+ * table when the system reports the NEON CPU feature.
+ */
+static const FastBlitLookup fastBlitFuncs_NEON[] = {
+	// 16-bit with NEON
+	{ fastBlitNEON_XRGB1555_RGB565, Graphics::PixelFormat(2, 5, 5, 5, 0, 10, 5, 0, 0), Graphics::PixelFormat(2, 5, 6, 5, 0, 11, 5, 0, 0) }, // XRGB1555 -> RGB565
+};
+#endif
+
+/**
+ * Scans a lookup table for the entry whose source and destination formats
+ * both match.
+ *
+ * @return the matching conversion routine, or nullptr when there is none
+ */
+static FastBlitFunc findFastBlitFunc(const FastBlitLookup *table, const size_t length,
+                                     const PixelFormat &dstFmt, const PixelFormat &srcFmt) {
+	for (size_t i = 0; i < length; i++) {
+		if (srcFmt != table[i].srcFmt || dstFmt != table[i].dstFmt)
+			continue;
+
+		return table[i].func;
+	}
+
+	return nullptr;
+}
+
+/**
+ * Looks up an optimized routine converting pixels from srcFmt to dstFmt.
+ *
+ * @param dstFmt  format of the destination pixels
+ * @param srcFmt  format of the source pixels
+ * @return a conversion routine, or nullptr when no fast path exists
+ */
+FastBlitFunc getFastBlitFunc(const PixelFormat &dstFmt, const PixelFormat &srcFmt) {
+	const uint dstBpp = dstFmt.bytesPerPixel;
+	const uint srcBpp = srcFmt.bytesPerPixel;
+
+	if (srcBpp == 4 && dstBpp == 4) {
+		const FastBlitFunc func = findFastBlitFunc(fastBlitFuncs_4to4, ARRAYSIZE(fastBlitFuncs_4to4), dstFmt, srcFmt);
+		if (func)
+			return func;
+	}
+
+#ifdef SCUMMVM_NEON
+	// The CPU feature is only queried for 16-bit to 16-bit conversions.
+	if (srcBpp == 2 && dstBpp == 2 && g_system->hasFeature(OSystem::kFeatureCpuNEON)) {
+		const FastBlitFunc func = findFastBlitFunc(fastBlitFuncs_NEON, ARRAYSIZE(fastBlitFuncs_NEON), dstFmt, srcFmt);
+		if (func)
+			return func;
+	}
+#endif
+
+	return nullptr;
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-generic.cpp
+++ b/graphics/blit/blit-generic.cpp
@@ -0,0 +1,99 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit/blit-alpha.h"
+#include "graphics/pixelformat.h"
+
+namespace Graphics {
+
+/**
+ * Portable implementation of the blending loops: every pixel is handled by
+ * the scalar functor methods, one at a time.
+ */
+class BlendBlitImpl_Default : public BlendBlitImpl_Base {
+	friend class BlendBlit;
+public:
+
+/**
+ * Blends args.width x args.height 32-bit pixels from args.ino onto args.outo
+ * through PixelFunc::normal().
+ *
+ * args.ino (when not scaling) and args.outo are advanced row by row, so args
+ * is modified by this call.
+ *
+ * @tparam PixelFunc  blend functor providing normal()
+ * @tparam doscale    when true, source pixels are picked through the scale counters
+ * @tparam rgbmod     forwarded to PixelFunc
+ * @tparam alphamod   forwarded to PixelFunc
+ */
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
+static inline void blitInnerLoop(BlendBlit::Args &args) {
+	const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
+
+	int scaleYCtr = args.scaleYoff;
+
+	for (uint32 row = 0; row < args.height; row++) {
+		byte *out = args.outo;
+
+		if (doscale) {
+			// Pick the source row, then walk it with the X scale counter.
+			const byte *rowBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+			int scaleXCtr = args.scaleXoff;
+
+			for (uint32 col = 0; col < args.width; col++) {
+				const byte *in = rowBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+
+				pixelFunc.normal(in, out);
+
+				scaleXCtr += args.scaleX;
+				out += 4;
+			}
+
+			scaleYCtr += args.scaleY;
+		} else {
+			const byte *in = args.ino;
+
+			for (uint32 col = 0; col < args.width; col++) {
+				pixelFunc.normal(in, out);
+
+				in += args.inStep;
+				out += 4;
+			}
+
+			args.ino += args.inoStep;
+		}
+
+		args.outo += args.dstPitch;
+	}
+}
+
+/**
+ * Applies PixelFunc::fill() to args.width x args.height 32-bit pixels
+ * starting at args.outo. args.outo is advanced row by row.
+ *
+ * @tparam PixelFunc  blend functor providing fill()
+ * @tparam rgbmod     forwarded to PixelFunc
+ * @tparam alphamod   forwarded to PixelFunc
+ */
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool rgbmod, bool alphamod>
+static inline void fillInnerLoop(BlendBlit::Args &args) {
+	const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
+
+	for (uint32 row = 0; row < args.height; row++) {
+		byte *out = args.outo;
+
+		for (uint32 col = 0; col < args.width; col++, out += 4)
+			pixelFunc.fill(out);
+
+		args.outo += args.dstPitch;
+	}
+}
+
+}; // end of class BlendBlitImpl_Default
+
+/**
+ * Portable entry point for blending blits: forwards to blitT() instantiated
+ * with the default (scalar) implementation class.
+ */
+void BlendBlit::blitGeneric(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+	blitT<BlendBlitImpl_Default>(args, blendMode, alphaType);
+}
+
+/**
+ * Portable entry point for blending fills: forwards to fillT() instantiated
+ * with the default (scalar) implementation class.
+ */
+void BlendBlit::fillGeneric(Args &args, const TSpriteBlendMode &blendMode) {
+	fillT<BlendBlitImpl_Default>(args, blendMode);
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-neon.cpp
+++ b/graphics/blit/blit-neon.cpp
@@ -0,0 +1,391 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/scummsys.h"
+
+#ifdef SCUMMVM_NEON
+
+#include "graphics/blit/blit-alpha.h"
+#include "graphics/pixelformat.h"
+
+#include <arm_neon.h>
+
+#if !defined(__aarch64__) && !defined(__ARM_NEON)
+
+#if defined(__clang__)
+#pragma clang attribute push (__attribute__((target("neon"))), apply_to=function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target("fpu=neon")
+#endif
+
+#endif // !defined(__aarch64__) && !defined(__ARM_NEON)
+
+namespace Graphics {
+
+class BlendBlitImpl_NEON : public BlendBlitImpl_Base {
+	friend class BlendBlit;
+
+/**
+ * NEON pixel functor for regular alpha blending, working on four 32-bit
+ * pixels per call. Scalar behaviour comes from BlendBlitImpl_Base::AlphaBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Blends four source pixels over four destination pixels.
+	 *
+	 * @param src  four packed source pixels
+	 * @param dst  four packed destination pixels
+	 * @return     the blended pixels; lanes whose effective alpha is zero keep dst unchanged
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		// Effective source alpha, scaled by the colour's alpha when alphamod is set.
+		uint32x4_t ina;
+		if (alphamod)
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
+		else
+			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+		// All ones in every lane whose effective alpha is zero.
+		uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+		if (rgbmod) {
+			// Each channel is handled on its own.
+			// NOTE(review): the literal shifts 16/8/0 presumably leave every channel
+			// eight bits above the bottom of its lane - confirm against the mask constants.
+			uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+			uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
+			uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+			uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
+			uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+			uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+
+			// Destination weighted by (255 - alpha) ...
+			dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+			dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+			dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+			// ... plus source weighted by alpha and by the colour modulation.
+			srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(this->cr)), 16));
+			srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(this->cg)), 16));
+			srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(this->cb)), 16));
+			// Reassemble the pixel with the alpha bits fully set.
+			src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
+			src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
+		} else {
+			// Without colour modulation red and blue share one multiplication.
+			uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+			uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
+			uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+			uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+
+			dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
+			dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
+			srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
+			srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
+			// Masking keeps the upper byte of each product, i.e. divides by 256.
+			src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+		}
+
+		// Keep dst where the effective alpha is zero, the blended pixel elsewhere.
+		dst = vandq_u32(alphaMask, dst);
+		src = vandq_u32(vmvnq_u32(alphaMask), src);
+		return vorrq_u32(dst, src);
+	}
+};
+
+/**
+ * NEON pixel functor for multiplicative blending, working on four 32-bit
+ * pixels per call. Scalar behaviour comes from BlendBlitImpl_Base::MultiplyBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Multiplies four destination pixels by four (alpha weighted) source pixels.
+	 *
+	 * @param src  four packed source pixels
+	 * @param dst  four packed destination pixels
+	 * @return     the blended pixels
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		uint32x4_t ina, alphaMask;
+		if (alphamod) {
+			// Effective alpha scaled by the colour's alpha; lanes where it is
+			// zero keep the whole destination pixel.
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
+			alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+		} else {
+			// Here the mask is the alpha bit mask itself, so the final select
+			// takes the alpha bits from dst and all other bits from the result.
+			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+			alphaMask = vdupq_n_u32(BlendBlit::kAModMask);
+		}
+
+		if (rgbmod) {
+			// Bring every channel down to the low bits of its lane.
+			uint32x4_t srcB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			uint32x4_t dstB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			// dst * ((src * modulation * alpha) >> 16); shifting by (shift - 8)
+			// and masking keeps the upper byte of that product in the channel's position.
+			srcB = vandq_u32(vshlq_n_u32(vmulq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, vmovq_n_u32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), vmovq_n_u32(BlendBlit::kBModMask));
+			srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, vmovq_n_u32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
+			srcR = vandq_u32(vshlq_n_u32(vmulq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, vmovq_n_u32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
+
+			// The source alpha bits are carried over into the result.
+			src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(src, vorrq_u32(srcB, vorrq_u32(srcG, srcR)));
+		} else {
+			// Without colour modulation red and blue are processed together.
+			constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
+			uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(rbMask)), BlendBlit::kBModShift);
+			uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(rbMask)), BlendBlit::kBModShift);
+
+			srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8)), 8), vmovq_n_u32(BlendBlit::kGModMask));
+			// The second multiplication works on 16-bit lanes so that the red
+			// and blue products stay separate.
+			srcRB = vandq_u32(vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(dstRB), vreinterpretq_u16_u32(vshrq_n_u32(vandq_u32(vmulq_u32(srcRB, ina), vmovq_n_u32(rbMask)), 8)))), vmovq_n_u32(rbMask));
+
+			src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(src, vorrq_u32(srcRB, srcG));
+		}
+
+		// Bits set in alphaMask come from dst, the others from the result.
+		dst = vandq_u32(alphaMask, dst);
+		src = vandq_u32(vmvnq_u32(alphaMask), src);
+		return vorrq_u32(dst, src);
+	}
+};
+
+/**
+ * NEON pixel functor for opaque copies, working on four 32-bit pixels per
+ * call. Scalar behaviour comes from BlendBlitImpl_Base::OpaqueBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Returns the source pixels with all alpha bits set; the destination
+	 * pixels are not looked at.
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		const uint32x4_t fullAlpha = vmovq_n_u32(BlendBlit::kAModMask);
+		return vorrq_u32(src, fullAlpha);
+	}
+};
+
+/**
+ * NEON pixel functor for binary (on/off) transparency, working on four
+ * 32-bit pixels per call. Scalar behaviour comes from BlendBlitImpl_Base::BinaryBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Keeps the destination pixel where the source alpha is zero and takes
+	 * the source pixel, with all alpha bits set, everywhere else.
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		const uint32x4_t fullAlpha = vmovq_n_u32(BlendBlit::kAModMask);
+
+		// All ones in every lane whose source alpha is zero.
+		const uint32x4_t transparent = vceqq_u32(vandq_u32(src, fullAlpha), vmovq_n_u32(0));
+		const uint32x4_t opaqueSrc = vorrq_u32(src, fullAlpha);
+
+		// Bitwise select: dst for transparent lanes, opaqueSrc for the rest.
+		return vbslq_u32(transparent, dst, opaqueSrc);
+	}
+};
+
+/**
+ * NEON pixel functor for additive blending, working on four 32-bit pixels
+ * per call. Scalar behaviour comes from BlendBlitImpl_Base::AdditiveBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Adds four (alpha and colour weighted) source pixels to four destination
+	 * pixels. Every channel sum is truncated to its eight bits; the alpha bits
+	 * of the result are those of the destination.
+	 *
+	 * @param src  four packed source pixels
+	 * @param dst  four packed destination pixels
+	 * @return     the blended pixels; lanes whose effective alpha is zero keep dst unchanged
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		// Effective source alpha, scaled by the colour's alpha when alphamod is set.
+		uint32x4_t ina;
+		if (alphamod)
+			ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
+		else
+			ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
+		// All ones in every lane whose effective alpha is zero.
+		uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
+
+		if (rgbmod) {
+			// Source: blue stays in place, green and red are moved to the low byte.
+			uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
+			uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			// Destination: every channel stays in place. The sums below are
+			// masked with the in-place channel masks, so a destination channel
+			// that was shifted down would simply be thrown away.
+			uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
+			uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+			uint32x4_t dstr = vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask));
+
+			// (src * modulation * alpha) >> 16 is the amount to add. Each product
+			// is moved so that this quotient lines up with the channel's position:
+			// down by 16 for blue (which was left in place), not at all for green,
+			// and up by (kRModShift - 16) for red.
+			srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(this->cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
+			srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(this->cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
+			srcr = vandq_u32(vaddq_u32(dstr, vshlq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(this->cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
+
+			src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
+		} else if (alphamod) {
+			const uint32x4_t rbMask = vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask);
+
+			uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+			// Source red and blue are moved down so that both products fit next
+			// to each other in one lane.
+			uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, rbMask), BlendBlit::kBModShift);
+			uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+			// Destination red and blue stay in place so that the masked sum keeps them.
+			uint32x4_t dstrb = vandq_u32(dst, rbMask);
+
+			srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
+			// Masking the product first keeps only (src * alpha) >> 8 per channel and
+			// clears the bits in between, so a carry out of blue cannot reach red.
+			srcrb = vandq_u32(vaddq_u32(dstrb, vandq_u32(vmulq_u32(srcrb, ina), rbMask)), rbMask);
+
+			src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+		} else {
+			// No modulation at all: plain per-channel addition.
+			uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
+			uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
+			uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
+			// Red and blue are added while shifted down, then moved back up.
+			srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+			src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
+			src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
+		}
+
+		// Keep dst where the effective alpha is zero, the blended pixel elsewhere.
+		dst = vandq_u32(alphaMask, dst);
+		src = vandq_u32(vmvnq_u32(alphaMask), src);
+		return vorrq_u32(dst, src);
+	}
+};
+
+/**
+ * NEON pixel functor for subtractive blending, working on four 32-bit
+ * pixels per call. Scalar behaviour comes from BlendBlitImpl_Base::SubtractiveBlend.
+ */
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	/**
+	 * Blends four source pixels onto four destination pixels.
+	 *
+	 * For each colour channel the result is
+	 * max(dst - ((src * modulation * dst * srcAlpha) >> 24), 0), and the alpha
+	 * bits of the result are always fully set.
+	 *
+	 * @param src  four packed source pixels
+	 * @param dst  four packed destination pixels
+	 * @return     the four blended pixels
+	 */
+	inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
+		const int32x4_t zero = vmovq_n_s32(0);
+		const uint32x4_t maskA = vmovq_n_u32(BlendBlit::kAModMask);
+		const uint32x4_t maskB = vmovq_n_u32(BlendBlit::kBModMask);
+		const uint32x4_t maskG = vmovq_n_u32(BlendBlit::kGModMask);
+		const uint32x4_t maskR = vmovq_n_u32(BlendBlit::kRModMask);
+
+		// Source alpha, used as-is (the colour's alpha modulation is not applied here).
+		const uint32x4_t ina = vandq_u32(src, maskA);
+
+		// Isolate every channel and move it down to the low bits of its lane.
+		const uint32x4_t sB = vshrq_n_u32(vandq_u32(src, maskB), BlendBlit::kBModShift);
+		const uint32x4_t sG = vshrq_n_u32(vandq_u32(src, maskG), BlendBlit::kGModShift);
+		const uint32x4_t sR = vshrq_n_u32(vandq_u32(src, maskR), BlendBlit::kRModShift);
+		const uint32x4_t dB = vshrq_n_u32(vandq_u32(dst, maskB), BlendBlit::kBModShift);
+		const uint32x4_t dG = vshrq_n_u32(vandq_u32(dst, maskG), BlendBlit::kGModShift);
+		const uint32x4_t dR = vshrq_n_u32(vandq_u32(dst, maskR), BlendBlit::kRModShift);
+
+		// Amount to take away from each destination channel.
+		const uint32x4_t subB = vshrq_n_u32(vmulq_u32(vmulq_u32(sB, vmovq_n_u32(this->cb)), vmulq_u32(dB, ina)), 24);
+		const uint32x4_t subG = vshrq_n_u32(vmulq_u32(vmulq_u32(sG, vmovq_n_u32(this->cg)), vmulq_u32(dG, ina)), 24);
+		const uint32x4_t subR = vshrq_n_u32(vmulq_u32(vmulq_u32(sR, vmovq_n_u32(this->cr)), vmulq_u32(dR, ina)), 24);
+
+		// Signed subtraction, clamped at zero.
+		const int32x4_t clampedB = vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dB), vreinterpretq_s32_u32(subB)), zero);
+		const int32x4_t clampedG = vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dG), vreinterpretq_s32_u32(subG)), zero);
+		const int32x4_t clampedR = vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dR), vreinterpretq_s32_u32(subR)), zero);
+
+		// Move the channels back into place and reassemble the pixel.
+		const uint32x4_t outB = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(clampedB), BlendBlit::kBModShift), maskB);
+		const uint32x4_t outG = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(clampedG), BlendBlit::kGModShift), maskG);
+		const uint32x4_t outR = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(clampedR), BlendBlit::kRModShift), maskR);
+
+		return vorrq_u32(maskA, vorrq_u32(outB, vorrq_u32(outG, outR)));
+	}
+};
+
+public:
+/**
+ * NEON blit loop: blends args.width x args.height 32-bit pixels from args.ino
+ * onto args.outo using PixelFunc. Four pixels are handled per SIMD iteration;
+ * the remaining pixels of each row go through PixelFunc::normal().
+ *
+ * args.ino (when not scaling) and args.outo are advanced as rows are consumed,
+ * so args is modified by this call.
+ *
+ * @tparam PixelFunc  blend functor providing simd() and normal()
+ * @tparam doscale    when true, source pixels are picked through the scale counters
+ * @tparam rgbmod     forwarded to PixelFunc
+ * @tparam alphamod   forwarded to PixelFunc
+ */
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
+static inline void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
+	const byte *in;
+	byte *out;
+
+	PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
+
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
+	const byte *inBase;
+
+	// For horizontally flipped, unscaled blits move the row start back by three
+	// pixels so that a 4-pixel load ends on the original start pixel.
+	// NOTE(review): presumably inStep is negative in this case - confirm with the caller.
+	if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			// Select the source row for this destination row.
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = args.scaleXoff;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+		uint32 j = 0;
+		// Vector part: four pixels at a time.
+		for (; j + 4 <= args.width; j += 4) {
+			uint32x4_t dstPixels;
+			if (loaddst) dstPixels = vld1q_u32((const uint32 *)out);
+			uint32x4_t srcPixels;
+			if (!doscale) {
+				srcPixels = vld1q_u32((const uint32 *)in);
+			} else {
+				// Gather the four source pixels one by one through the X scale counter.
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
+				scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 1);
+				scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 2);
+				scaleXCtr += args.scaleX;
+				srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 3);
+				scaleXCtr += args.scaleX;
+			}
+			if (!doscale && (args.flipping & FLIP_H)) {
+				// Reverse the four pixels: swap within each 64-bit half,
+				// then swap the two halves.
+				srcPixels = vrev64q_u32(srcPixels);
+				srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
+			}
+			{
+				const uint32x4_t res = pixelFunc.simd(srcPixels, dstPixels);
+				vst1q_u32((uint32 *)out, res);
+			}
+			if (!doscale) in += args.inStep * 4;
+			out += 4 * 4;
+		}
+		// Undo the three pixel offset before falling back to single pixels.
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
+		// Scalar tail: whatever is left of the row.
+		for (; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+			}
+
+			pixelFunc.normal(in, out);
+
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+		// Advance to the next source and destination rows.
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
+}
+
+}; // end of class BlendBlitImpl_NEON
+
+/**
+ * NEON entry point for blending blits: forwards to blitT() instantiated with
+ * the NEON implementation class.
+ */
+void BlendBlit::blitNEON(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+	blitT<BlendBlitImpl_NEON>(args, blendMode, alphaType);
+}
+
+/**
+ * Converts a w x h area of 16-bit pixels from XRGB1555 to RGB565.
+ *
+ * Every pixel is converted as
+ * ((p & 0x7FE0) << 1) | ((p & 0x0200) >> 4) | (p & 0x001F):
+ * red and green move up by one bit, the top green bit is copied into the
+ * new lowest green bit, and blue is kept as it is.
+ *
+ * @param dst       destination pixels
+ * @param src       source pixels
+ * @param dstPitch  bytes per destination row
+ * @param srcPitch  bytes per source row
+ * @param w         width in pixels
+ * @param h         height in pixels
+ */
+void fastBlitNEON_XRGB1555_RGB565(byte *dst, const byte *src,
+                  const uint dstPitch, const uint srcPitch,
+                  const uint w, const uint h) {
+	// Bytes to skip after the last pixel of a row to reach the next row.
+	const uint srcSkip = (srcPitch - w * 2);
+	const uint dstSkip = (dstPitch - w * 2);
+
+	const uint16x4_t redGreenMask = vmov_n_u16(0x7FE0);
+	const uint16x4_t greenTopMask = vmov_n_u16(0x0200);
+	const uint16x4_t blueMask = vmov_n_u16(0x001F);
+
+	for (uint row = 0; row < h; ++row) {
+		uint left = w;
+
+		// Four pixels at a time with NEON
+		while (left >= 4) {
+			const uint16x4_t in = vld1_u16((const uint16 *)src);
+
+			const uint16x4_t redGreen = vshl_n_u16(vand_u16(in, redGreenMask), 1);
+			const uint16x4_t greenLow = vshr_n_u16(vand_u16(in, greenTopMask), 4);
+			const uint16x4_t blue = vand_u16(in, blueMask);
+
+			vst1_u16((uint16 *)dst, vorr_u16(vorr_u16(redGreen, greenLow), blue));
+
+			src += 4 * 2;
+			dst += 4 * 2;
+			left -= 4;
+		}
+
+		// Remaining pixels are converted the classic way
+		while (left > 0) {
+			const uint16 in = *(const uint16 *)src;
+
+			*(uint16 *)dst = (((in & 0x7FE0) << 1) | ((in & 0x0200) >> 4) | (in & 0x001F));
+
+			src += 2;
+			dst += 2;
+			--left;
+		}
+
+		src += srcSkip;
+		dst += dstSkip;
+	}
+}
+
+} // end of namespace Graphics
+
+#if !defined(__aarch64__) && !defined(__ARM_NEON)
+
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
+#endif // !defined(__aarch64__) && !defined(__ARM_NEON)
+
+#endif // SCUMMVM_NEON
--- a/graphics/blit/blit-scale.cpp
+++ b/graphics/blit/blit-scale.cpp
@@ -0,0 +1,552 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * The bottom part of this file is adapted from SDL_rotozoom.c. The
+ * relevant copyright notice for those specific functions can be found at the
+ * top of that section.
+ *
+ */
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+#include "graphics/transform_struct.h"
+
+#include "common/endian.h"
+#include "common/rect.h"
+#include "math/utils.h"
+
+namespace Graphics {
+
+namespace {
+
+/**
+ * Nearest-neighbour scaling along the vertical axis only.
+ *
+ * Each destination row is a plain copy of w * bytesPerPixel bytes taken from
+ * the source row selected by a 16.16 fixed-point accumulator. When FLIP_V is
+ * set in flip, the destination rows are written from the last row upwards.
+ */
+static void scaleVertical(byte *dst, const byte *src,
+                          const uint dstPitch, const uint srcPitch,
+                          const uint w, const uint dstH, const uint srcH,
+                          const byte flip, const uint bytesPerPixel) {
+	// Source rows advanced per destination row (16.16 fixed point)
+	const uint32 stepY = (srcH << 16) / dstH;
+
+	int dstStride = static_cast<int>(dstPitch);
+	if (flip & FLIP_V) {
+		// Begin at the last destination row and walk backwards
+		dst += (dstH - 1) * dstPitch;
+		dstStride = -dstStride;
+	}
+
+	uint32 srcPos = 0;
+	for (uint32 row = 0; row < dstH; row++) {
+		memcpy(dst, src + ((srcPos >> 16) * srcPitch), w * bytesPerPixel);
+		dst += dstStride;
+		srcPos += stepY;
+	}
+}
+
+/**
+ * Nearest-neighbour scaler with optional horizontal and/or vertical mirroring.
+ *
+ * Source positions are tracked in 16.16 fixed point; the integer part selects
+ * the source row/column for every destination pixel.
+ *
+ * @tparam Color  Integer type used to copy one pixel when sizeof(Color) == Size.
+ * @tparam Size   Bytes per pixel. When it differs from sizeof(Color) the pixel
+ *                is copied with memcpy instead of a typed load/store.
+ *
+ * @param dst       Top-left byte of the destination area.
+ * @param src       Top-left byte of the source area.
+ * @param dstPitch  Destination row length in bytes.
+ * @param srcPitch  Source row length in bytes.
+ * @param dstW      Destination width in pixels.
+ * @param dstH      Destination height in pixels.
+ * @param srcW      Source width in pixels.
+ * @param srcH      Source height in pixels.
+ * @param flip      Combination of FLIP_H / FLIP_V.
+ */
+template <typename Color, int Size>
+static void scaleNN(byte *dst, const byte *src,
+			   const uint dstPitch, const uint srcPitch,
+			   const uint dstW, const uint dstH,
+			   const uint srcW, const uint srcH,
+			   const byte flip) {
+	const bool flipx = flip & FLIP_H;
+	const bool flipy = flip & FLIP_V;
+
+	// 16.16 fixed point
+	const uint32 srcIncX = (srcW << 16) / dstW;
+	const uint32 srcIncY = (srcH << 16) / dstH;
+
+	const int dstIncX = (flipx ? -1 : 1);
+	const int dstIncY = (flipy ? -static_cast<int>(dstPitch) : static_cast<int>(dstPitch));
+
+	// When mirroring, start writing at the opposite edge and walk backwards
+	if (flipx) {
+		dst += (dstW - 1) * Size;
+	}
+
+	if (flipy) {
+		dst += (dstH - 1) * dstPitch;
+	}
+
+	for (uint32 y = 0, yoff = 0; y < dstH; y++, yoff += srcIncY) {
+		const byte *srcP = src + ((yoff >> 16) * srcPitch);
+		byte *dst1 = dst;
+		for (uint32 x = 0, xoff = 0; x < dstW; x++, xoff += srcIncX) {
+			const byte *src1 = srcP + ((xoff >> 16) * Size);
+			if (Size == sizeof(Color)) {
+				*(Color *)dst1 = *(const Color *)src1;
+			} else {
+				// Copy the sampled pixel (src1), not the first pixel of the image
+				memcpy(dst1, src1, Size);
+			}
+			dst1 += dstIncX * Size;
+		}
+		dst += dstIncY;
+	}
+}
+
+} // End of anonymous namespace
+
+/**
+ * Scales (and optionally mirrors) a rectangle with nearest-neighbour sampling.
+ *
+ * When the width is unchanged and no horizontal flip is requested, whole rows
+ * are copied: via copyBlit if the height is unchanged and there is no vertical
+ * flip, via scaleVertical otherwise. All other cases go through scaleNN,
+ * selected by fmt.bytesPerPixel.
+ *
+ * @return true if the blit was performed, false if fmt.bytesPerPixel is not
+ *         1, 2, 3 or 4.
+ */
+bool scaleBlit(byte *dst, const byte *src,
+			   const uint dstPitch, const uint srcPitch,
+			   const uint dstW, const uint dstH,
+			   const uint srcW, const uint srcH,
+			   const Graphics::PixelFormat &fmt,
+			   const byte flip) {
+	// This should be OK since int16 is used in Graphics::Surface.
+	assert(srcW <= 65535);
+	assert(srcH <= 65535);
+
+	const bool mirrorX = flip & FLIP_H;
+	const bool mirrorY = flip & FLIP_V;
+
+	// Rows keep their width and direction: copy them whole
+	if (dstW == srcW && !mirrorX) {
+		if (dstH != srcH || mirrorY)
+			scaleVertical(dst, src, dstPitch, srcPitch, dstW, dstH, srcH, flip, fmt.bytesPerPixel);
+		else
+			copyBlit(dst, src, dstPitch, srcPitch, dstW, dstH, fmt.bytesPerPixel);
+		return true;
+	}
+
+	// General case: per-pixel nearest-neighbour sampling
+	switch (fmt.bytesPerPixel) {
+	case 1:
+		scaleNN<uint8,  1>(dst, src, dstPitch, srcPitch, dstW,  dstH, srcW, srcH, flip);
+		break;
+	case 2:
+		scaleNN<uint16, 2>(dst, src, dstPitch, srcPitch, dstW,  dstH, srcW, srcH, flip);
+		break;
+	case 3:
+		scaleNN<uint8,  3>(dst, src, dstPitch, srcPitch, dstW,  dstH, srcW, srcH, flip);
+		break;
+	case 4:
+		scaleNN<uint32, 4>(dst, src, dstPitch, srcPitch, dstW,  dstH, srcW, srcH, flip);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+/*
+
+The functions below are adapted from SDL_rotozoom.c,
+taken from SDL_gfx-2.0.18.
+
+Its copyright notice:
+
+=============================================================================
+SDL_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
+
+Copyright (C) 2001-2012  Andreas Schiffler
+
+This software is provided 'as-is', without any express or implied
+warranty. In no event will the authors be held liable for any damages
+arising from the use of this software.
+
+Permission is granted to anyone to use this software for any purpose,
+including commercial applications, and to alter it and redistribute it
+freely, subject to the following restrictions:
+
+1. The origin of this software must not be misrepresented; you must not
+claim that you wrote the original software. If you use this software
+in a product, an acknowledgment in the product documentation would be
+appreciated but is not required.
+
+2. Altered source versions must be plainly marked as such, and must not be
+misrepresented as being the original software.
+
+3. This notice may not be removed or altered from any source
+distribution.
+
+Andreas Schiffler -- aschiffler at ferzkopp dot net
+=============================================================================
+
+
+The functions have been adapted for different structures, coordinate
+systems and pixel formats.
+
+*/
+
+namespace {
+
+// Reads one pixel of Size bytes from sp. Pixels whose size does not match
+// sizeof(Color) are read with READ_UINT24; all others with a typed load.
+template <typename Color, int Size>
+inline uint32 getPixel(const byte *sp) {
+	if (Size != sizeof(Color))
+		return READ_UINT24(sp);
+
+	return *(const Color *)sp;
+}
+
+// Writes the pixel value pix to pc. Pixels whose size does not match
+// sizeof(Color) are written with WRITE_UINT24; all others with a typed store.
+template <typename Color, int Size>
+inline void setPixel(byte *pc, const uint32 pix) {
+	if (Size != sizeof(Color)) {
+		WRITE_UINT24(pc, pix);
+		return;
+	}
+
+	*(Color *)pc = pix;
+}
+
+// Bilinear blend of one 8-bit channel. ex and ey are the interpolation weights
+// (callers pass the low 16 bits of a 16.16 fixed-point coordinate): first c00->c01
+// and c10->c11 are blended by ex, then the two results are blended by ey.
+inline byte scaleBlitBilinearInterpolate(byte c01, byte c00, byte c11, byte c10, int ex, int ey) {
+	const int rowA = ((((c01 - c00) * ex) >> 16) + c00) & 0xff;
+	const int rowB = ((((c11 - c10) * ex) >> 16) + c10) & 0xff;
+	const int blended = (((rowB - rowA) * ey) >> 16) + rowA;
+	return blended;
+}
+
+// Bilinear blend of a whole pixel: the four neighbouring pixels are unpacked
+// into A/R/G/B, every channel is interpolated separately with the weights
+// ex / ey, and the repacked colour is stored at dp.
+template <typename ColorMask, typename Color, int Size>
+void scaleBlitBilinearInterpolate(byte *dp, const byte *c01, const byte *c00, const byte *c11, const byte *c10, int ex, int ey,
+								  const Graphics::PixelFormat &fmt) {
+	// Index order matches the argument order of the per-channel overload:
+	// 0 = c01, 1 = c00, 2 = c11, 3 = c10
+	const byte *const corners[4] = { c01, c00, c11, c10 };
+	byte a[4], r[4], g[4], b[4];
+
+	for (int i = 0; i < 4; i++) {
+		fmt.colorToARGBT<ColorMask>(getPixel<Color, Size>(corners[i]), a[i], r[i], g[i], b[i]);
+	}
+
+	const byte outA = scaleBlitBilinearInterpolate(a[0], a[1], a[2], a[3], ex, ey);
+	const byte outR = scaleBlitBilinearInterpolate(r[0], r[1], r[2], r[3], ex, ey);
+	const byte outG = scaleBlitBilinearInterpolate(g[0], g[1], g[2], g[3], ex, ey);
+	const byte outB = scaleBlitBilinearInterpolate(b[0], b[1], b[2], b[3], ex, ey);
+
+	setPixel<Color, Size>(dp, fmt.ARGBToColorT<ColorMask>(outA, outR, outG, outB));
+}
+
+/**
+ * Bilinear scaler core.
+ *
+ * sax and say hold one 16.16 fixed-point source coordinate per destination
+ * column / row, plus one extra trailing entry that is read when stepping past
+ * the last column / row. For every destination pixel the source pixel under
+ * the read pointer and its right / lower neighbours (clamped at the last source
+ * column / row) are blended using the fractional parts of the coordinates.
+ * With FLIP_H / FLIP_V the source is traversed from the opposite edge.
+ */
+template <typename ColorMask, typename Color, int Size>
+void scaleBlitBilinearLogic(byte *dst, const byte *src,
+							const uint dstPitch, const uint srcPitch,
+							const uint dstW, const uint dstH,
+							const uint srcW, const uint srcH,
+							const Graphics::PixelFormat &fmt,
+							int *sax, int *say, byte flip) {
+	const bool mirrorX = flip & FLIP_H;
+	const bool mirrorY = flip & FLIP_V;
+
+	// Index of the last source column / row
+	const int lastCol = (srcW - 1);
+	const int lastRow = (srcH - 1);
+
+	// When mirroring, reading starts at the opposite edge of the source
+	const byte *sp = src;
+	if (mirrorX)
+		sp += lastCol * Size;
+	if (mirrorY)
+		sp += srcPitch * lastRow;
+
+	for (uint y = 0; y < dstH; y++) {
+		const byte *const rowStart = sp;
+		byte *dp = dst + (dstPitch * y);
+
+		// The vertical coordinate is the same for the whole row
+		const int ey = (say[y] & 0xffff);
+		const int cy = (say[y] >> 16);
+
+		for (uint x = 0; x < dstW; x++, dp += Size) {
+			const int ex = (sax[x] & 0xffff);
+			const int cx = (sax[x] >> 16);
+
+			// Pick the 2x2 neighbourhood, clamped at the last row / column
+			const byte *c00 = sp;
+			const byte *c01 = sp;
+			const byte *c10 = sp;
+			if (cy < lastRow)
+				c10 = mirrorY ? c10 - srcPitch : c10 + srcPitch;
+
+			const byte *c11 = c10;
+			if (cx < lastCol) {
+				if (mirrorX) {
+					c01 -= Size;
+					c11 -= Size;
+				} else {
+					c01 += Size;
+					c11 += Size;
+				}
+			}
+
+			scaleBlitBilinearInterpolate<ColorMask, Color, Size>(dp, c01, c00, c11, c10, ex, ey, fmt);
+
+			// Move the read pointer by the whole source pixels covered by this step
+			const int stepX = (sax[x + 1] >> 16) - (sax[x] >> 16);
+			sp = mirrorX ? sp - stepX * Size : sp + stepX * Size;
+		}
+
+		// Restart from the beginning of this source row and move on vertically
+		const int stepY = ((say[y + 1] >> 16) - (say[y] >> 16)) * srcPitch;
+		sp = mirrorY ? rowStart - stepY : rowStart + stepY;
+	}
+}
+
+/**
+ * Rotates and zooms a source image into the destination.
+ *
+ * For every destination pixel the matching source coordinate is computed with
+ * the inverse rotation / zoom in 16.16 fixed point. Destination pixels whose
+ * source coordinate falls outside the source image are left untouched.
+ *
+ * @tparam ColorMask  Colour mask set used to unpack/pack pixels when filtering.
+ * @tparam Color      Integer type used for a typed pixel copy when
+ *                    sizeof(Color) == Size.
+ * @tparam Size       Bytes per pixel.
+ * @tparam filtering  true: bilinear blend of the 2x2 source neighbourhood;
+ *                    false: nearest source pixel.
+ *
+ * @param dst         Top-left byte of the destination area.
+ * @param src         Top-left byte of the source image.
+ * @param dstPitch    Destination row length in bytes.
+ * @param srcPitch    Source row length in bytes.
+ * @param dstW        Destination width in pixels.
+ * @param dstH        Destination height in pixels.
+ * @param srcW        Source width in pixels.
+ * @param srcH        Source height in pixels.
+ * @param fmt         Pixel format shared by source and destination.
+ * @param transform   Angle, zoom, flip and hotspot of the transformation.
+ * @param newHotspot  Hotspot position inside the destination area.
+ */
+template<typename ColorMask, typename Color, int Size, bool filtering>
+void rotoscaleBlitLogic(byte *dst, const byte *src,
+						const uint dstPitch, const uint srcPitch,
+						const uint dstW, const uint dstH,
+						const uint srcW, const uint srcH,
+						const Graphics::PixelFormat &fmt,
+						const TransformStruct &transform,
+						const Common::Point &newHotspot) {
+	const bool flipx = transform._flip & FLIP_H;
+	const bool flipy = transform._flip & FLIP_V;
+
+	assert(transform._angle != kDefaultAngle); // This would not be ideal; rotoscale() should never be called in conditional branches where angle = 0 anyway.
+
+	// A zero zoom factor would divide by zero below; nothing to draw
+	if (transform._zoom.x == 0 || transform._zoom.y == 0) {
+		return;
+	}
+
+	uint32 invAngle = 360 - (transform._angle % 360);
+	float invAngleRad = Math::deg2rad<uint32,float>(invAngle);
+	float invCos = cos(invAngleRad);
+	float invSin = sin(invAngleRad);
+
+	// Per-pixel source increments in 16.16 fixed point
+	int icosx = (int)(invCos * (65536.0f * kDefaultZoomX / transform._zoom.x));
+	int isinx = (int)(invSin * (65536.0f * kDefaultZoomX / transform._zoom.x));
+	int icosy = (int)(invCos * (65536.0f * kDefaultZoomY / transform._zoom.y));
+	int isiny = (int)(invSin * (65536.0f * kDefaultZoomY / transform._zoom.y));
+
+	int xd = transform._hotspot.x << 16;
+	int yd = transform._hotspot.y << 16;
+	int cx = newHotspot.x;
+	int cy = newHotspot.y;
+
+	int ax = -icosx * cx;
+	int ay = -isiny * cx;
+	int sw = srcW - 1;
+	int sh = srcH - 1;
+
+	for (uint y = 0; y < dstH; y++) {
+		// Honour the destination pitch: every output row starts dstPitch bytes
+		// after the previous one, which need not equal dstW * Size.
+		byte *pc = dst + (dstPitch * y);
+
+		int t = cy - y;
+		int sdx = ax + (isinx * t) + xd;
+		int sdy = ay - (icosy * t) + yd;
+		for (uint x = 0; x < dstW; x++) {
+			int dx = (sdx >> 16);
+			int dy = (sdy >> 16);
+			if (flipx) {
+				dx = sw - dx;
+			}
+			if (flipy) {
+				dy = sh - dy;
+			}
+
+			if (filtering) {
+				// dx < sw and dy < sh keep the 2x2 neighbourhood inside the source
+				if ((dx > -1) && (dy > -1) && (dx < sw) && (dy < sh)) {
+					const byte *sp = src + dy * srcPitch + dx * Size;
+					const byte *c00, *c01, *c10, *c11;
+					c00 = sp;
+					sp += Size;
+					c01 = sp;
+					sp += srcPitch;
+					c11 = sp;
+					sp -= Size;
+					c10 = sp;
+					if (flipx) {
+						SWAP(c00, c01);
+						SWAP(c10, c11);
+					}
+					if (flipy) {
+						SWAP(c00, c10);
+						SWAP(c01, c11);
+					}
+					/*
+					* Interpolate colors
+					*/
+					int ex = (sdx & 0xffff);
+					int ey = (sdy & 0xffff);
+					scaleBlitBilinearInterpolate<ColorMask, Color, Size>(pc, c01, c00, c11, c10, ex, ey, fmt);
+				}
+			} else {
+				if ((dx >= 0) && (dy >= 0) && (dx < (int)srcW) && (dy < (int)srcH)) {
+					const byte *sp = src + dy * srcPitch + dx * Size;
+					if (Size == sizeof(Color)) {
+						*(Color *)pc = *(const Color *)sp;
+					} else {
+						memcpy(pc, sp, Size);
+					}
+				}
+			}
+			sdx += icosx;
+			sdy += isiny;
+			pc += Size;
+		}
+	}
+}
+
+} // End of anonymous namespace
+
+/**
+ * Scales (and optionally mirrors) a rectangle with bilinear filtering.
+ *
+ * Builds one table of 16.16 fixed-point source coordinates per destination
+ * column and one per destination row (each with one extra trailing entry),
+ * then runs scaleBlitBilinearLogic with the colour masks matching fmt.
+ *
+ * @param dst       Top-left byte of the destination area.
+ * @param src       Top-left byte of the source area.
+ * @param dstPitch  Destination row length in bytes.
+ * @param srcPitch  Source row length in bytes.
+ * @param dstW      Destination width in pixels.
+ * @param dstH      Destination height in pixels.
+ * @param srcW      Source width in pixels.
+ * @param srcH      Source height in pixels.
+ * @param fmt       Pixel format shared by source and destination.
+ * @param flip      Combination of FLIP_H / FLIP_V.
+ *
+ * @return true if the blit was performed, false if fmt.bytesPerPixel is not
+ *         2, 3 or 4.
+ */
+bool scaleBlitBilinear(byte *dst, const byte *src,
+					   const uint dstPitch, const uint srcPitch,
+					   const uint dstW, const uint dstH,
+					   const uint srcW, const uint srcH,
+					   const Graphics::PixelFormat &fmt,
+					   const byte flip) {
+	if (fmt.bytesPerPixel != 2 && fmt.bytesPerPixel != 3 && fmt.bytesPerPixel != 4)
+		return false;
+
+	int *sax = new int[dstW + 1];
+	int *say = new int[dstH + 1];
+	assert(sax && say);
+
+	/*
+	* Precalculate row increments
+	*/
+	int spixelw = (srcW - 1);
+	int spixelh = (srcH - 1);
+	// A destination that is at most one pixel wide/high has no span to
+	// interpolate over: use a zero step instead of dividing by (dst - 1) == 0,
+	// which would convert an infinite float to int (undefined behaviour).
+	int sx = (dstW > 1) ? (int)(65536.0f * (float) spixelw / (float) (dstW - 1)) : 0;
+	int sy = (dstH > 1) ? (int)(65536.0f * (float) spixelh / (float) (dstH - 1)) : 0;
+
+	/* Maximum scaled source size */
+	int ssx = (srcW << 16) - 1;
+	int ssy = (srcH << 16) - 1;
+
+	/* Precalculate horizontal row increments */
+	int csx = 0;
+	int *csax = sax;
+	for (uint x = 0; x <= dstW; x++) {
+		*csax = csx;
+		csax++;
+		csx += sx;
+
+		/* Guard from overflows */
+		if (csx > ssx) {
+			csx = ssx;
+		}
+	}
+
+	/* Precalculate vertical row increments */
+	int csy = 0;
+	int *csay = say;
+	for (uint y = 0; y <= dstH; y++) {
+		*csay = csy;
+		csay++;
+		csy += sy;
+
+		/* Guard from overflows */
+		if (csy > ssy) {
+			csy = ssy;
+		}
+	}
+
+	// Known formats first (compile-time colour masks), then the generic
+	// per-size fallbacks that take the masks from fmt.
+	bool handled = true;
+	if (fmt == createPixelFormat<8888>()) {
+		scaleBlitBilinearLogic<ColorMasks<8888>, uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else if (fmt == createPixelFormat<888>()) {
+		scaleBlitBilinearLogic<ColorMasks<888>,  uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else if (fmt == createPixelFormat<565>()) {
+		scaleBlitBilinearLogic<ColorMasks<565>,  uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else if (fmt == createPixelFormat<555>()) {
+		scaleBlitBilinearLogic<ColorMasks<555>,  uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+
+	} else if (fmt.bytesPerPixel == 4) {
+		scaleBlitBilinearLogic<ColorMasks<0>,    uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else if (fmt.bytesPerPixel == 3) {
+		scaleBlitBilinearLogic<ColorMasks<0>,    uint8,  3>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else if (fmt.bytesPerPixel == 2) {
+		scaleBlitBilinearLogic<ColorMasks<0>,    uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
+	} else {
+		handled = false;
+	}
+
+	// Single cleanup path for both outcomes
+	delete[] sax;
+	delete[] say;
+
+	return handled;
+}
+
+/**
+ * Rotates and zooms a rectangle without filtering (nearest source pixel).
+ *
+ * Selects the rotoscaleBlitLogic instantiation matching fmt.bytesPerPixel.
+ *
+ * @return true if the blit was performed, false if fmt.bytesPerPixel is not
+ *         1, 2, 3 or 4.
+ */
+bool rotoscaleBlit(byte *dst, const byte *src,
+				   const uint dstPitch, const uint srcPitch,
+				   const uint dstW, const uint dstH,
+				   const uint srcW, const uint srcH,
+				   const Graphics::PixelFormat &fmt,
+				   const TransformStruct &transform,
+				   const Common::Point &newHotspot) {
+	switch (fmt.bytesPerPixel) {
+	case 4:
+		rotoscaleBlitLogic<ColorMasks<0>, uint32, 4, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		break;
+	case 3:
+		rotoscaleBlitLogic<ColorMasks<0>, uint8,  3, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		break;
+	case 2:
+		rotoscaleBlitLogic<ColorMasks<0>, uint16, 2, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		break;
+	case 1:
+		rotoscaleBlitLogic<ColorMasks<0>, uint8,  1, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * Rotates and zooms a rectangle with bilinear filtering.
+ *
+ * The four known formats (8888, 888, 565, 555) use instantiations with
+ * compile-time colour masks; any other 4-, 3- or 2-byte format uses the
+ * generic ColorMasks<0> instantiation of matching size.
+ *
+ * @return true if the blit was performed, false for any other pixel size.
+ */
+bool rotoscaleBlitBilinear(byte *dst, const byte *src,
+						   const uint dstPitch, const uint srcPitch,
+						   const uint dstW, const uint dstH,
+						   const uint srcW, const uint srcH,
+						   const Graphics::PixelFormat &fmt,
+						   const TransformStruct &transform,
+						   const Common::Point &newHotspot) {
+	// Formats with dedicated colour masks
+	if (fmt == createPixelFormat<8888>()) {
+		rotoscaleBlitLogic<ColorMasks<8888>, uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	}
+	if (fmt == createPixelFormat<888>()) {
+		rotoscaleBlitLogic<ColorMasks<888>,  uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	}
+	if (fmt == createPixelFormat<565>()) {
+		rotoscaleBlitLogic<ColorMasks<565>,  uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	}
+	if (fmt == createPixelFormat<555>()) {
+		rotoscaleBlitLogic<ColorMasks<555>,  uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	}
+
+	// Generic fallbacks, chosen by pixel size only
+	switch (fmt.bytesPerPixel) {
+	case 4:
+		rotoscaleBlitLogic<ColorMasks<0>,    uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	case 3:
+		rotoscaleBlitLogic<ColorMasks<0>,    uint8,  3, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	case 2:
+		rotoscaleBlitLogic<ColorMasks<0>,    uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
+		return true;
+	default:
+		return false;
+	}
+}
+
+} // End of namespace Graphics
--- a/graphics/blit/blit-sse2.cpp
+++ b/graphics/blit/blit-sse2.cpp
@@ -0,0 +1,339 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "common/scummsys.h"
+
+#include "graphics/blit/blit-alpha.h"
+#include "graphics/pixelformat.h"
+
+#include <emmintrin.h>
+
+#if !defined(__x86_64__)
+
+#if defined(__clang__)
+#pragma clang attribute push (__attribute__((target("sse2"))), apply_to=function)
+#elif defined(__GNUC__)
+#pragma GCC push_options
+#pragma GCC target("sse2")
+#endif
+
+#endif // !defined(__x86_64__)
+
+namespace Graphics {
+
+// Lane-wise multiply of four unsigned 32-bit values, keeping the low 32 bits
+// of every product. Built from _mm_mul_epu32, which only multiplies lanes 0
+// and 2 of its operands.
+static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) {
+	// 64-bit products of lanes 0 and 2
+	const __m128i prod02 = _mm_mul_epu32(a, b);
+	// Shift both operands down by one lane to multiply lanes 1 and 3 the same way
+	const __m128i prod13 = _mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4));
+
+	// Move the low 32 bits of both products into the two lowest lanes
+	const __m128i low02 = _mm_shuffle_epi32(prod02, _MM_SHUFFLE(0, 0, 2, 0));
+	const __m128i low13 = _mm_shuffle_epi32(prod13, _MM_SHUFFLE(0, 0, 2, 0));
+
+	// Interleave so the results are back in lane order 0, 1, 2, 3
+	return _mm_unpacklo_epi32(low02, low13);
+}
+
+// SSE2 implementation of the blend blit pixel operations.
+//
+// Every nested functor derives from its BlendBlitImpl_Base counterpart and adds
+// a simd() method that blends four 32-bit pixels at once: src and dst each hold
+// four pixels in one __m128i and the blended four pixels are returned.
+// blitInnerLoop() drives a functor over a whole rectangle.
+class BlendBlitImpl_SSE2 : public BlendBlitImpl_Base {
+	friend class BlendBlit;
+
+// Alpha blending of src over dst.
+template<bool rgbmod, bool alphamod>
+struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
+public:
+	constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		// ina: source alpha per pixel, additionally scaled by this->ca when alphamod is set
+		__m128i ina;
+		if (alphamod)
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
+		else
+			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+		// All-ones in every lane whose effective alpha is zero; those lanes keep dst
+		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
+	
+		if (rgbmod) {
+			// Each channel is handled separately so it can be scaled by cr / cg / cb
+			__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			// dst * (255 - ina) and src * ina * colour modifier, shifted towards the channel position
+			dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
+			dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
+			dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
+			srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
+			srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
+			srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(this->cb)));
+			// Recombine the masked channels; the alpha bits of the result are all set
+			src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
+			src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
+		} else {
+			// No colour modifier: red and blue are processed together in one register, green on its own
+			__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+
+			dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
+			dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
+			srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
+			srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
+			src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
+		}
+
+		// Select per lane: dst where alphaMask is set, the blended value elsewhere
+		dst = _mm_and_si128(alphaMask, dst);
+		src = _mm_andnot_si128(alphaMask, src);
+		return _mm_or_si128(dst, src);
+	}
+};
+
+// Multiplies the dst colour channels with the alpha-weighted src channels.
+template<bool rgbmod, bool alphamod>
+struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
+public:
+	constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		__m128i ina, alphaMask;
+		if (alphamod) {
+			// Lanes whose modulated alpha is zero keep dst entirely
+			ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
+			alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
+		} else {
+			// Here alphaMask is the constant alpha bit mask, so the final select
+			// keeps the alpha bits of dst and takes all other bits from src
+			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+			alphaMask = _mm_set1_epi32(BlendBlit::kAModMask);
+		}
+
+		if (rgbmod) {
+			__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			// dst * ((src * colour modifier * ina) >> 16), moved back to the channel position and masked
+			srcB = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstB, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcB, _mm_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
+			srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcG, _mm_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcR = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcR, _mm_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
+
+			src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(src, _mm_or_si128(srcB, _mm_or_si128(srcG, srcR)));
+		} else {
+			constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
+			__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);
+			__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);
+
+			srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcRB = _mm_and_si128(_mm_mullo_epi16(dstRB, _mm_srli_epi32(_mm_and_si128(sse2_mul32(srcRB, ina), _mm_set1_epi32(rbMask)), 8)), _mm_set1_epi32(rbMask));
+			
+			src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(src, _mm_or_si128(srcRB, srcG));
+		}
+
+		// Select per bit: dst where alphaMask is set, the computed value elsewhere
+		dst = _mm_and_si128(alphaMask, dst);
+		src = _mm_andnot_si128(alphaMask, src);
+		return _mm_or_si128(dst, src);
+	}
+};
+
+// Opaque copy: the result is src with every alpha-mask bit set; dst is not used.
+template<bool rgbmod, bool alphamod>
+struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
+public:
+	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		return _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+	}
+};
+
+// Binary (on/off) transparency: pixels whose source alpha bits are all zero
+// keep dst; every other pixel becomes src with all alpha-mask bits set.
+template<bool rgbmod, bool alphamod>
+struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
+public:
+	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		// All-ones in the lanes where the source alpha is zero
+		__m128i alphaMask = _mm_cmpeq_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_setzero_si128());
+		dst = _mm_and_si128(dst, alphaMask);
+		src = _mm_andnot_si128(alphaMask, _mm_or_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)));
+		return _mm_or_si128(src, dst);
+	}
+};
+
+// Additive blending: adds the (optionally colour- and alpha-weighted) src
+// channels to the dst channels. The alpha bits of the result come from dst.
+template<bool rgbmod, bool alphamod>
+struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
+public:
+	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		// ina: source alpha per pixel, additionally scaled by this->ca when alphamod is set
+		__m128i ina;
+		if (alphamod)
+			ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
+		else
+			ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+		// All-ones in every lane whose effective alpha is zero; those lanes keep dst
+		__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));
+
+		if (rgbmod) {
+			// Per-channel path: each src channel is weighted by its colour modifier and ina
+			__m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
+			__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+			__m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask));
+			__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+			__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+			srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(this->cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(this->cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
+
+			src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+		} else if (alphamod) {
+			// Alpha modifier only: src channels are weighted by ina before the addition
+			__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+			src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+		} else {
+			// No modifiers: the channels are added directly, without weighting
+			__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+			__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
+			__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
+
+			srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
+			srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
+
+			src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
+			src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
+		}
+
+		// Select per lane: dst where alphaMask is set, the summed value elsewhere
+		dst = _mm_and_si128(alphaMask, dst);
+		src = _mm_andnot_si128(alphaMask, src);
+		return _mm_or_si128(dst, src);
+	}
+};
+
+// Subtractive blending: per channel, dst minus a product of the src channel,
+// its colour modifier, the dst channel and the source alpha (shifted right by
+// 24). The alpha bits of the result are all set.
+template<bool rgbmod, bool alphamod>
+struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
+public:
+	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
+
+	inline __m128i simd(__m128i src, __m128i dst) const {
+		__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
+		__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+		__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
+		__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
+		__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
+
+		// The signed 16-bit max against zero is applied to the difference before it is shifted back into place
+		srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(this->cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
+		srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(this->cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
+		srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(this->cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
+
+		return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
+	}
+};
+
+public:
+// Runs PixelFunc over the rectangle described by args.
+//
+// Each row is processed four pixels at a time with PixelFunc::simd(); the
+// remaining (width % 4) pixels go through PixelFunc::normal(). With doscale the
+// source pixel of every output pixel is picked through the scaleXCtr /
+// scaleYCtr counters divided by BlendBlit::SCALE_THRESHOLD; otherwise the
+// source is read sequentially. args.ino and args.outo are modified while
+// iterating.
+template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
+static inline void blitInnerLoop(BlendBlit::Args &args) {
+	const bool loaddst = true; // TODO: Only set this when necessary
+
+	const byte *in;
+	byte *out;
+
+	PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
+
+	int scaleXCtr, scaleYCtr = args.scaleYoff;
+	const byte *inBase;
+
+	// Unscaled horizontal flip: move the row start back by three pixels so a
+	// 16-byte load covers the current pixel and the three before it
+	// (presumably args.inStep is negative in this case; confirm against the caller).
+	if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
+
+	for (uint32 i = 0; i < args.height; i++) {
+		if (doscale) {
+			inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
+			scaleXCtr = args.scaleXoff;
+		} else {
+			in = args.ino;
+		}
+		out = args.outo;
+
+		// Vector part: four pixels per iteration
+		uint32 j = 0;
+		for (; j + 4 <= args.width; j += 4) {
+			__m128i dstPixels, srcPixels;
+			if (loaddst) dstPixels = _mm_loadu_si128((const __m128i *)out);
+			if (!doscale) {
+				srcPixels = _mm_loadu_si128((const __m128i *)in);
+			} else {
+				// Gather the four scaled source pixels one by one
+				srcPixels = _mm_setr_epi32(
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
+					*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
+				);
+				scaleXCtr += args.scaleX * 4;
+			}
+			if (!doscale && (args.flipping & FLIP_H)) {
+				// Reverse the order of the four loaded pixels
+				srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
+			}
+			{
+				const __m128i res = pixelFunc.simd(srcPixels, dstPixels);
+				_mm_storeu_si128((__m128i *)out, res);
+			}
+			if (!doscale) in += (ptrdiff_t)args.inStep * 4;
+			out += 4ULL * 4;
+		}
+		// Undo the three-pixel offset before the scalar tail reads single pixels
+		if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
+		// Scalar tail: the pixels that do not fill a whole vector
+		for (; j < args.width; j++) {
+			if (doscale) {
+				in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
+			}
+
+			pixelFunc.normal(in, out);
+
+			if (doscale)
+				scaleXCtr += args.scaleX;
+			else
+				in += args.inStep;
+			out += 4;
+		}
+		// Advance to the next source / destination row
+		if (doscale)
+			scaleYCtr += args.scaleY;
+		else
+			args.ino += args.inoStep;
+		args.outo += args.dstPitch;
+	}
+}
+
+}; // End of class BlendBlitImpl_SSE2
+
+void BlendBlit::blitSSE2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
+	blitT<BlendBlitImpl_SSE2>(args, blendMode, alphaType);
+}
+
+} // End of namespace Graphics
+
+#if !defined(__x86_64__)
+
+#if defined(__clang__)
+#pragma clang attribute pop
+#elif defined(__GNUC__)
+#pragma GCC pop_options
+#endif
+
+#endif // !defined(__x86_64__)
--- a/graphics/blit/blit.cpp
+++ b/graphics/blit/blit.cpp
@@ -0,0 +1,501 @@
+/* ScummVM - Graphic Adventure Engine
+ *
+ * ScummVM is the legal property of its developers, whose names
+ * are too numerous to list here. Please refer to the COPYRIGHT
+ * file distributed with this source distribution.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ */
+
+#include "graphics/blit.h"
+#include "graphics/pixelformat.h"
+#include "common/endian.h"
+
+namespace Graphics {
+
+// see graphics/blit/blit-atari.cpp
+#ifdef ATARI
+extern void keyBlitLogicAtari(byte *dst, const byte *src, const uint w, const uint h,
+							  const uint srcDelta, const uint dstDelta, const uint32 key);
+#else
+// Function to blit a rect: copies h rows of w pixels from src to dst.
+// dstPitch and srcPitch are the byte distances between consecutive rows
+// of the respective buffer.
+void copyBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel) {
+	// Source and destination are the same buffer, nothing to copy.
+	if (dst == src)
+		return;
+
+	// Number of bytes that actually get copied per row.
+	const uint rowBytes = w * bytesPerPixel;
+
+	if (dstPitch == srcPitch && rowBytes == dstPitch) {
+		// Rows are stored back to back in both buffers and are copied in
+		// full, so the whole rect is a single contiguous run of bytes.
+		memcpy(dst, src, dstPitch * h);
+		return;
+	}
+
+	// Only part of each row is transferred (or the pitches differ), so copy
+	// row by row and advance each pointer by its own pitch.
+	for (uint rowsLeft = h; rowsLeft > 0; --rowsLeft) {
+		memcpy(dst, src, rowBytes);
+		dst += dstPitch;
+		src += srcPitch;
+	}
+}
+#endif
+
+namespace {
+
+// Copies w x h pixels of Size bytes each from src to dst, leaving the
+// destination untouched wherever the source pixel equals the color key.
+// srcDelta and dstDelta are added to the pointers at the end of every row.
+// Pixels are accessed as Color values when Size equals sizeof(Color) and
+// are compared/copied bytewise otherwise.
+template<typename Color, int Size>
+inline void keyBlitLogic(byte *dst, const byte *src, const uint w, const uint h,
+						const uint srcDelta, const uint dstDelta, const uint32 key) {
+	const bool typedAccess = (Size == sizeof(Color));
+
+	// Byte view of the key, used by the bytewise comparison.
+	const uint8 *keyBytes = (const uint8 *)&key;
+#ifdef SCUMM_BIG_ENDIAN
+	// Skip the leading byte so the remaining three bytes are compared.
+	if (Size == 3)
+		keyBytes++;
+#endif
+
+	for (uint row = 0; row < h; ++row) {
+		for (uint column = 0; column < w; ++column) {
+			if (typedAccess) {
+				const uint32 pixel = *(const Color *)src;
+				if (pixel != key)
+					*(Color *)dst = pixel;
+			} else if (memcmp(src, keyBytes, Size) != 0) {
+				memcpy(dst, src, Size);
+			}
+
+			dst += Size;
+			src += Size;
+		}
+
+		dst += dstDelta;
+		src += srcDelta;
+	}
+}
+
+#ifdef ATARI
+// Full specialization for 1-byte pixels (only compiled when ATARI is
+// defined): hands the complete blit, with unchanged arguments, to
+// keyBlitLogicAtari() instead of running the generic per-pixel loop.
+template<>
+inline void keyBlitLogic<uint8, 1>(byte *dst, const byte *src, const uint w, const uint h,
+								   const uint srcDelta, const uint dstDelta, const uint32 key) {
+	keyBlitLogicAtari(dst, src, w, h, srcDelta, dstDelta, key);
+}
+#endif
+
+} // End of anonymous namespace
+
+// Function to blit a rect with a transparent color key.
+// Returns false if bytesPerPixel is not 1, 2, 3 or 4; true otherwise
+// (including the case where dst and src are the same pointer).
+bool keyBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel, const uint32 key) {
+	if (dst == src)
+		return true;
+
+	// Bytes to skip at the end of each row to reach the next one.
+	const uint rowBytes = w * bytesPerPixel;
+	const uint srcDelta = srcPitch - rowBytes;
+	const uint dstDelta = dstPitch - rowBytes;
+
+	// Faster, but larger, to provide optimized handling for each case.
+	switch (bytesPerPixel) {
+	case 1:
+		keyBlitLogic<uint8, 1>(dst, src, w, h, srcDelta, dstDelta, key);
+		break;
+	case 2:
+		keyBlitLogic<uint16, 2>(dst, src, w, h, srcDelta, dstDelta, key);
+		break;
+	case 3:
+		keyBlitLogic<uint8, 3>(dst, src, w, h, srcDelta, dstDelta, key);
+		break;
+	case 4:
+		keyBlitLogic<uint32, 4>(dst, src, w, h, srcDelta, dstDelta, key);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+namespace {
+
+// Copies w x h pixels of Size bytes each from src to dst, but only where
+// the corresponding mask byte is non-zero. The mask holds one byte per
+// pixel. srcDelta, dstDelta and maskDelta are added to the respective
+// pointer at the end of every row.
+template<typename Color, int Size>
+inline void maskBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+						const uint srcDelta, const uint dstDelta, const uint maskDelta) {
+	const bool typedAccess = (Size == sizeof(Color));
+
+	for (uint row = 0; row < h; ++row) {
+		for (uint column = 0; column < w; ++column) {
+			if (*mask != 0) {
+				if (typedAccess)
+					*(Color *)dst = *(const Color *)src;
+				else
+					memcpy(dst, src, Size);
+			}
+
+			++mask;
+			dst += Size;
+			src += Size;
+		}
+
+		mask += maskDelta;
+		dst += dstDelta;
+		src += srcDelta;
+	}
+}
+
+} // End of anonymous namespace
+
+// Function to blit a rect with a transparent color mask.
+// Returns false if bytesPerPixel is not 1, 2, 3 or 4; true otherwise
+// (including the case where dst and src are the same pointer).
+bool maskBlit(byte *dst, const byte *src, const byte *mask,
+				const uint dstPitch, const uint srcPitch, const uint maskPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel) {
+	if (dst == src)
+		return true;
+
+	// Bytes to skip at the end of each row to reach the next one.
+	const uint rowBytes  = w * bytesPerPixel;
+	const uint srcDelta  = srcPitch  - rowBytes;
+	const uint dstDelta  = dstPitch  - rowBytes;
+	const uint maskDelta = maskPitch - w;
+
+	// Faster, but larger, to provide optimized handling for each case.
+	switch (bytesPerPixel) {
+	case 1:
+		maskBlitLogic<uint8, 1>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
+		break;
+	case 2:
+		maskBlitLogic<uint16, 2>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
+		break;
+	case 3:
+		maskBlitLogic<uint8, 3>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
+		break;
+	case 4:
+		maskBlitLogic<uint32, 4>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
+		break;
+	default:
+		return false;
+	}
+
+	return true;
+}
+
+namespace {
+
+// Converts w x h pixels from srcFmt to dstFmt, one pixel at a time.
+//
+// SrcColor/SrcSize and DstColor/DstSize give the integer type and the byte
+// size of a source/destination pixel; when the size differs from the size
+// of the type, the pixel is transferred with memcpy instead of a typed access.
+// backward: step the pointers downwards instead of upwards (the caller is
+//           expected to pass pointers to the last pixel in that case).
+// hasKey:   skip source pixels whose value equals key.
+// hasMask:  skip pixels whose mask byte is zero (one mask byte per pixel).
+// srcDelta, dstDelta and maskDelta are applied at the end of every row.
+template<typename SrcColor, int SrcSize, typename DstColor, int DstSize, bool backward, bool hasKey, bool hasMask>
+inline void crossBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+							const PixelFormat &srcFmt, const PixelFormat &dstFmt,
+							const uint srcDelta, const uint dstDelta, const uint maskDelta,
+							const uint32 key) {
+	uint32 color = 0;
+	byte a, r, g, b;
+	uint8 *col = (uint8 *)&color;
+#ifdef SCUMM_BIG_ENDIAN
+	if (SrcSize == 3 || DstSize == 3)
+		col++;
+#endif
+
+	for (uint y = 0; y < h; ++y) {
+		for (uint x = 0; x < w; ++x) {
+			if (SrcSize == sizeof(SrcColor)) {
+				color = *(const SrcColor *)src;
+			} else {
+				// memcpy only overwrites SrcSize bytes of color. Clear it
+				// first so that the remaining byte does not keep bits of
+				// the previously converted destination color, which would
+				// make the comparison against the key fail.
+				color = 0;
+				memcpy(col, src, SrcSize);
+			}
+
+			if ((!hasKey || color != key) && (!hasMask || *mask != 0)) {
+				srcFmt.colorToARGB(color, a, r, g, b);
+				color = dstFmt.ARGBToColor(a, r, g, b);
+
+				if (DstSize == sizeof(DstColor))
+					*(DstColor *)dst = color;
+				else
+					memcpy(dst, col, DstSize);
+			}
+
+			if (backward) {
+				src -= SrcSize;
+				dst -= DstSize;
+				if (hasMask)
+					mask -= 1;
+			} else {
+				src += SrcSize;
+				dst += DstSize;
+				if (hasMask)
+					mask += 1;
+			}
+		}
+
+		if (backward) {
+			src -= srcDelta;
+			dst -= dstDelta;
+			if (hasMask)
+				mask -= maskDelta;
+		} else {
+			src += srcDelta;
+			dst += dstDelta;
+			if (hasMask)
+				mask += maskDelta;
+		}
+	}
+}
+
+// Selects the crossBlitLogic instantiation matching the pixel sizes of the
+// two formats and runs it. Returns false if the destination pixel size is
+// not 2, 3 or 4 bytes. A source pixel size other than 2 or 3 bytes is
+// handled as 4 bytes.
+template<bool hasKey, bool hasMask>
+inline bool crossBlitHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+							const PixelFormat &srcFmt, const PixelFormat &dstFmt,
+							const uint srcPitch, const uint dstPitch, const uint maskPitch,
+							const uint32 key) {
+	const uint srcBpp = srcFmt.bytesPerPixel;
+	const uint dstBpp = dstFmt.bytesPerPixel;
+
+	// Faster, but larger, to provide optimized handling for each case.
+	const uint srcDelta = srcPitch - w * srcBpp;
+	const uint dstDelta = dstPitch - w * dstBpp;
+	const uint maskDelta = hasMask ? (maskPitch - w) : 0;
+
+	// Offsets from the first to the last pixel of the rect.
+	//
+	// Whenever a destination pixel is larger than a source pixel we need to
+	// blit the surface from bottom right to top left. This is needed because
+	// when we convert within the same memory buffer, copying the surface
+	// from top left to bottom right would overwrite the source, since we
+	// have more bits per destination color than per source color.
+	const uint srcLast = h * srcPitch - srcDelta - srcBpp;
+	const uint dstLast = h * dstPitch - dstDelta - dstBpp;
+	const uint maskLast = h * maskPitch - maskDelta - 1;
+
+	// TODO: optimized cases for dstDelta of 0
+	switch (dstBpp) {
+	case 2:
+		switch (srcBpp) {
+		case 2:
+			crossBlitLogic<uint16, 2, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		case 3:
+			crossBlitLogic<uint8, 3, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		default:
+			crossBlitLogic<uint32, 4, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		}
+		return true;
+
+	case 3:
+		switch (srcBpp) {
+		case 2:
+			// Destination pixels are larger: convert backwards.
+			dst += dstLast;
+			src += srcLast;
+			if (hasMask)
+				mask += maskLast;
+			crossBlitLogic<uint16, 2, uint8, 3, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		case 3:
+			crossBlitLogic<uint8, 3, uint8, 3, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		default:
+			crossBlitLogic<uint32, 4, uint8, 3, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		}
+		return true;
+
+	case 4:
+		switch (srcBpp) {
+		case 2:
+			// Destination pixels are larger: convert backwards.
+			dst += dstLast;
+			src += srcLast;
+			if (hasMask)
+				mask += maskLast;
+			crossBlitLogic<uint16, 2, uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		case 3:
+			// Destination pixels are larger: convert backwards.
+			dst += dstLast;
+			src += srcLast;
+			if (hasMask)
+				mask += maskLast;
+			crossBlitLogic<uint8, 3, uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		default:
+			crossBlitLogic<uint32, 4, uint32, 4, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
+			break;
+		}
+		return true;
+
+	default:
+		return false;
+	}
+}
+
+} // End of anonymous namespace
+
+// Function to blit a rect from one color format to another.
+// Returns false if either format has a pixel size of 0 or 1 byte;
+// otherwise the rect is copied or converted with the first applicable
+// method below.
+bool crossBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt) {
+	// Error out if conversion is impossible
+	const bool srcUnusable = (srcFmt.bytesPerPixel == 0) || (srcFmt.bytesPerPixel == 1);
+	const bool dstUnusable = (dstFmt.bytesPerPixel == 0) || (dstFmt.bytesPerPixel == 1);
+	if (srcUnusable || dstUnusable)
+		return false;
+
+	// Identical formats need no conversion, a plain copy is enough.
+	if (srcFmt == dstFmt) {
+		copyBlit(dst, src, dstPitch, srcPitch, w, h, dstFmt.bytesPerPixel);
+		return true;
+	}
+
+	// Attempt to use a faster method if possible
+	if (FastBlitFunc fastBlit = getFastBlitFunc(dstFmt, srcFmt)) {
+		fastBlit(dst, src, dstPitch, srcPitch, w, h);
+		return true;
+	}
+
+	// Fall back to the generic per-pixel conversion.
+	return crossBlitHelper<false, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, 0);
+}
+
+// Function to blit a rect from one color format to another with a transparent color key.
+// Source pixels equal to key are not written to the destination.
+// Returns false if either format has a pixel size of 0 or 1 byte, or if the
+// blit routine that ends up being used rejects the pixel size.
+bool crossKeyBlit(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt, const uint32 key) {
+	// Error out if conversion is impossible
+	if ((srcFmt.bytesPerPixel == 1) || (dstFmt.bytesPerPixel == 1)
+			|| (!srcFmt.bytesPerPixel) || (!dstFmt.bytesPerPixel))
+		return false;
+
+	// Don't perform unnecessary conversion. The result of keyBlit() is
+	// passed on so that a pixel size it cannot handle is reported to the
+	// caller instead of being returned as a success.
+	if (srcFmt == dstFmt)
+		return keyBlit(dst, src, dstPitch, srcPitch, w, h, dstFmt.bytesPerPixel, key);
+
+	return crossBlitHelper<true, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, key);
+}
+
+// Function to blit a rect from one color format to another with a transparent color mask.
+// Pixels whose mask byte is zero are not written to the destination.
+// Returns false if either format has a pixel size of 0 or 1 byte, or if the
+// blit routine that ends up being used rejects the pixel size.
+bool crossMaskBlit(byte *dst, const byte *src, const byte *mask,
+				const uint dstPitch, const uint srcPitch, const uint maskPitch,
+				const uint w, const uint h,
+				const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt) {
+	// Error out if conversion is impossible
+	if ((srcFmt.bytesPerPixel == 1) || (dstFmt.bytesPerPixel == 1)
+			|| (!srcFmt.bytesPerPixel) || (!dstFmt.bytesPerPixel))
+		return false;
+
+	// Don't perform unnecessary conversion. The result of maskBlit() is
+	// passed on so that a pixel size it cannot handle is reported to the
+	// caller instead of being returned as a success.
+	if (srcFmt == dstFmt)
+		return maskBlit(dst, src, mask, dstPitch, srcPitch, maskPitch, w, h, dstFmt.bytesPerPixel);
+
+	return crossBlitHelper<false, true>(dst, src, mask, w, h, srcFmt, dstFmt, srcPitch, dstPitch, maskPitch, 0);
+}
+
+namespace {
+
+// Converts w x h 1-byte source pixels to DstSize-byte destination pixels by
+// looking each source byte up in map.
+//
+// DstColor/DstSize: integer type and byte size of a destination pixel; when
+//           they differ the value is stored with WRITE_UINT24.
+// backward: step the pointers downwards instead of upwards.
+// hasKey:   skip source pixels whose value equals key.
+// hasMask:  skip pixels whose mask byte is zero (one mask byte per pixel).
+// srcDelta, dstDelta and maskDelta are applied at the end of every row.
+template<typename DstColor, int DstSize, bool backward, bool hasKey, bool hasMask>
+inline void crossBlitMapLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+									const uint srcDelta, const uint dstDelta, const uint maskDelta, const uint32 *map, const uint32 key) {
+	// Signed per-pixel pointer steps for the selected direction.
+	const int byteStep = backward ? -1 : 1;
+	const int dstStep = backward ? -DstSize : DstSize;
+
+	for (uint row = 0; row < h; ++row) {
+		for (uint column = 0; column < w; ++column) {
+			const byte index = *src;
+
+			// Write only pixels that neither match the key nor are masked out.
+			if (!(hasKey && index == key) && !(hasMask && *mask == 0)) {
+				if (DstSize == sizeof(DstColor)) {
+					*(DstColor *)dst = map[index];
+				} else {
+					WRITE_UINT24(dst, map[index]);
+				}
+			}
+
+			src += byteStep;
+			dst += dstStep;
+			if (hasMask)
+				mask += byteStep;
+		}
+
+		// Move on to the next row to process.
+		if (backward) {
+			src -= srcDelta;
+			dst -= dstDelta;
+			if (hasMask)
+				mask -= maskDelta;
+		} else {
+			src += srcDelta;
+			dst += dstDelta;
+			if (hasMask)
+				mask += maskDelta;
+		}
+	}
+}
+
+// Selects the crossBlitMapLogic instantiation matching the destination pixel
+// size and runs it. Returns false if bytesPerPixel is not 1, 2, 3 or 4.
+template<bool hasKey, bool hasMask>
+inline bool crossBlitMapHelperLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
+							const uint bytesPerPixel, const uint32 *map,
+							const uint srcPitch, const uint dstPitch, const uint maskPitch,
+							const uint32 key) {
+	// Faster, but larger, to provide optimized handling for each case.
+	const uint srcDelta  = srcPitch - w;
+	const uint dstDelta  = dstPitch - w * bytesPerPixel;
+	const uint maskDelta = hasMask ? (maskPitch - w) : 0;
+
+	switch (bytesPerPixel) {
+	case 1:
+		// Same pixel size on both sides: a forward pass is fine.
+		crossBlitMapLogic<uint8, 1, false, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
+		return true;
+	case 2:
+	case 3:
+	case 4:
+		break;
+	default:
+		return false;
+	}
+
+	// We need to blit the surface from bottom right to top left here.
+	// This is needed because when we convert within the same memory buffer,
+	// copying the surface from top left to bottom right would overwrite the
+	// source, since we have more bits per destination color than per source
+	// color. Move all pointers to the last pixel of the rect first.
+	dst += h * dstPitch - dstDelta - bytesPerPixel;
+	src += h * srcPitch - srcDelta - 1;
+	if (hasMask)
+		mask += h * maskPitch - maskDelta - 1;
+
+	if (bytesPerPixel == 2)
+		crossBlitMapLogic<uint16, 2, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
+	else if (bytesPerPixel == 3)
+		crossBlitMapLogic<uint8, 3, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
+	else
+		crossBlitMapLogic<uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
+
+	return true;
+}
+
+} // End of anonymous namespace
+
+// Function to blit a rect from one color format to another using a map.
+// Each 1-byte source pixel is used as an index into map.
+// Returns false if bytesPerPixel is zero or not supported by the helper.
+bool crossBlitMap(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel, const uint32 *map) {
+	// Error out if conversion is impossible, otherwise run the conversion.
+	return bytesPerPixel != 0
+		&& crossBlitMapHelperLogic<false, false>(dst, src, nullptr, w, h, bytesPerPixel, map, srcPitch, dstPitch, 0, 0);
+}
+
+// Function to blit a rect from one color format to another using a map with a transparent color key.
+// Source pixels equal to key are not written to the destination.
+// Returns false if bytesPerPixel is zero or not supported by the helper.
+bool crossKeyBlitMap(byte *dst, const byte *src,
+				const uint dstPitch, const uint srcPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel, const uint32 *map, const uint32 key) {
+	// Error out if conversion is impossible, otherwise run the conversion.
+	return bytesPerPixel != 0
+		&& crossBlitMapHelperLogic<true, false>(dst, src, nullptr, w, h, bytesPerPixel, map, srcPitch, dstPitch, 0, key);
+}
+
+// Function to blit a rect from one color format to another using a map with a transparent color mask.
+// Pixels whose mask byte is zero are not written to the destination.
+// Returns false if bytesPerPixel is zero or not supported by the helper.
+bool crossMaskBlitMap(byte *dst, const byte *src, const byte *mask,
+				const uint dstPitch, const uint srcPitch, const uint maskPitch,
+				const uint w, const uint h,
+				const uint bytesPerPixel, const uint32 *map) {
+	// Error out if conversion is impossible, otherwise run the conversion.
+	return bytesPerPixel != 0
+		&& crossBlitMapHelperLogic<false, true>(dst, src, mask, w, h, bytesPerPixel, map, srcPitch, dstPitch, maskPitch, 0);
+}
+
+} // End of namespace Graphics