Initial commit

This commit is contained in:
2026-02-02 04:50:13 +01:00
commit 5b11698731
22592 changed files with 7677434 additions and 0 deletions

View File

@@ -0,0 +1,541 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "common/system.h"
#include "graphics/blit.h"
#include "graphics/pixelformat.h"
namespace Graphics {
namespace {
/**
 * Loads one pixel of Size bytes from src and returns it widened to 32 bits.
 *
 * When Size equals sizeof(Color) the pixel is read directly through a
 * Color pointer. Otherwise (the callers in this file use this for 3-byte
 * pixels with Color = uint8) the bytes are copied into a 32-bit temporary.
 *
 * @param src  Address of the first byte of the pixel.
 * @return The pixel value; bytes that are not part of the pixel are zero.
 */
template<typename Color, int Size>
static inline uint32 READ_PIXEL(const byte *src) {
	if (Size == sizeof(Color)) {
		return *(const Color *)src;
	} else {
		// Start from zero: only Size bytes are copied below, and the
		// remaining byte(s) must not be left indeterminate because callers
		// compare the returned value against a colour key.
		uint32 color = 0;
		uint8 *col = (uint8 *)&color;
#ifdef SCUMM_BIG_ENDIAN
		// Skip the most significant byte so a 3-byte pixel lands in the
		// low 24 bits of the result.
		if (Size == 3)
			col++;
#endif
		memcpy(col, src, Size);
		return color;
	}
}
/**
 * Stores the low Size bytes of a 32-bit colour value at dst.
 *
 * When Size equals sizeof(Color) the value is stored through a Color
 * pointer; otherwise the bytes are copied out of the 32-bit value.
 *
 * @param dst    Address of the first byte of the destination pixel.
 * @param color  Pixel value to store.
 */
template<typename Color, int Size>
static inline void WRITE_PIXEL(byte *dst, const uint32 color) {
	if (Size != sizeof(Color)) {
		const uint8 *bytes = (const uint8 *)&color;
#ifdef SCUMM_BIG_ENDIAN
		// Skip the most significant byte of the 32-bit value for 3-byte pixels.
		if (Size == 3)
			bytes++;
#endif
		memcpy(dst, bytes, Size);
		return;
	}

	*(Color *)dst = color;
}
/**
 * Per-pixel loop shared by all alpha blit variants in this file.
 *
 * Template flags select how transparency is decided:
 *  - hasMask: the mask byte is the alpha (0xff opaque, 0x00 transparent).
 *  - hasKey:  a raw source pixel equal to key is transparent, anything
 *             else is opaque.
 *  - neither: the alpha bits of the source colour are used; a source
 *             format without alpha bits is always opaque.
 * With hasMap the source is one byte per pixel and is translated through
 * map before use; the helper in this file passes the destination format as
 * srcFmt in that case.
 *
 * @param dst, src, mask            First pixel of each buffer (mask is only
 *                                  read when hasMask is set).
 * @param w, h                      Size of the area in pixels.
 * @param srcFmt, dstFmt            Pixel formats used to unpack/pack colours.
 * @param map                       Lookup table, only read when hasMap is set.
 * @param srcDelta, dstDelta, maskDelta  Byte offsets added after each row.
 * @param srcInc, dstInc, maskInc   Byte offsets added after each pixel.
 * @param key                       Transparent colour for hasKey.
 * @param flip                      Not used by this function.
 * @param aMod                      Global alpha modifier; 0xff means unmodified.
 */
template<typename SrcColor, int SrcSize, typename DstColor, int DstSize, bool hasKey, bool hasMask, bool hasMap>
static inline void alphaBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
                                  const PixelFormat &srcFmt, const PixelFormat &dstFmt, const uint32 *map,
                                  const int srcDelta, const int dstDelta, const int maskDelta,
                                  const int srcInc, const int dstInc, const int maskInc,
                                  const uint32 key, const byte flip, const byte aMod) {
	const uint32 alphaMask = srcFmt.ARGBToColor(255, 0, 0, 0);

	// Mapped colours are written exactly as looked up. Every other source is
	// re-encoded through dstFmt. The previous test here was inverted: it
	// converted when the formats were identical and raw-copied pixels between
	// two *different* formats of equal size. Identical formats were therefore
	// already being converted, so always converting changes nothing for them
	// and fixes the differing-format case.
	// NOTE(review): a raw copy for identical formats would be a possible
	// optimisation, but it may store a different alpha value than the
	// conversion does for keyed/masked pixels - confirm before enabling.
	const bool convert = !hasMap;

	for (uint y = 0; y < h; ++y) {
		for (uint x = 0; x < w; ++x) {
			const uint32 srcColor = hasMap ? map[*src]
			                               : READ_PIXEL<SrcColor, SrcSize>(src);

			// The key is compared against the raw source pixel, not the
			// mapped colour.
			const bool isOpaque = hasMask ? (*mask == 0xff)
			                    : (hasKey ? (READ_PIXEL<SrcColor, SrcSize>(src) != key)
			                              : !alphaMask || ((srcColor & alphaMask) == alphaMask));
			const bool isTransparent = hasMask ? (*mask == 0x00)
			                         : (hasKey ? (READ_PIXEL<SrcColor, SrcSize>(src) == key)
			                                   : alphaMask && ((srcColor & alphaMask) == 0));

			if (isOpaque && aMod == 0xff) {
				// Fully opaque and unmodulated: plain store.
				if (convert) {
					byte sR, sG, sB;
					srcFmt.colorToRGB(srcColor, sR, sG, sB);
					WRITE_PIXEL<DstColor, DstSize>(dst, dstFmt.RGBToColor(sR, sG, sB));
				} else {
					WRITE_PIXEL<DstColor, DstSize>(dst, srcColor);
				}
			} else if (!isTransparent) {
				// TODO: Optimise for matching formats?
				const uint32 dstColor = READ_PIXEL<DstColor, DstSize>(dst);

				byte sA, sR, sG, sB;
				srcFmt.colorToARGB(srcColor, sA, sR, sG, sB);

				byte dR, dG, dB;
				dstFmt.colorToRGB(dstColor, dR, dG, dB);

				// Pick the effective alpha: the modifier alone for keyed
				// pixels, otherwise the mask or source alpha scaled by it.
				if (hasKey)
					sA = aMod;
				else if (hasMask)
					sA = ((*mask * aMod) >> 8);
				else
					sA = ((sA * aMod) >> 8);

				dR = (dR * (255-sA) + sR * sA) >> 8;
				dG = (dG * (255-sA) + sG * sA) >> 8;
				dB = (dB * (255-sA) + sB * sA) >> 8;

				const uint32 outColor = dstFmt.RGBToColor(dR, dG, dB);
				WRITE_PIXEL<DstColor, DstSize>(dst, outColor);
			}

			src += srcInc;
			dst += dstInc;
			if (hasMask)
				mask += maskInc;
		}

		src += srcDelta;
		dst += dstDelta;
		if (hasMask)
			mask += maskDelta;
	}
}
/**
 * Prepares strides and the starting destination pixel for an unmapped alpha
 * blit, then dispatches to the alphaBlitLogic instantiation that matches
 * the source and destination pixel sizes.
 *
 * Flipping is applied on the destination side: the destination pointer is
 * moved to the last column and/or last row and walked backwards.
 *
 * @return true if the blit was performed, or if aMod is 0 (nothing is
 *         drawn); false if the destination is neither 2 nor 4 bytes
 *         per pixel.
 */
template<bool hasKey, bool hasMask>
static inline bool alphaBlitHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
                                   const PixelFormat &srcFmt, const PixelFormat &dstFmt,
                                   const uint srcPitch, const uint dstPitch, const uint maskPitch,
                                   const uint32 key, const byte flip, const byte aMod) {
	// A zero alpha modifier means nothing is drawn.
	if (aMod == 0)
		return true;

	const bool hasMap = false;
	const bool mirrorX = flip & FLIP_H;
	const bool mirrorY = flip & FLIP_V;

	// Per-pixel steps.
	const int srcInc = srcFmt.bytesPerPixel;
	const int maskInc = 1;
	const int dstInc = mirrorX ? -dstFmt.bytesPerPixel : dstFmt.bytesPerPixel;

	// Per-row steps (bytes between the end of one row and the start of the next).
	const int srcDelta = (srcPitch - w * srcFmt.bytesPerPixel);
	const int maskDelta = hasMask ? (maskPitch - w) : 0;
	int dstDelta = (dstPitch - w * dstFmt.bytesPerPixel);

	if (mirrorX)
		dst += (w - 1) * dstFmt.bytesPerPixel;
	if (mirrorY)
		dst += (h - 1) * dstPitch;

	// Adjust the row step for the direction(s) the destination is walked in.
	if (mirrorY) {
		if (mirrorX)
			dstDelta = -dstDelta;
		else
			dstDelta = -((dstPitch * 2) - dstDelta);
	} else if (mirrorX) {
		dstDelta = (dstPitch * 2) - dstDelta;
	}

	// TODO: optimized cases for dstDelta of 0
	// Faster, but larger, to provide optimized handling for each case.
	switch (dstFmt.bytesPerPixel) {
	case 2:
		switch (srcFmt.bytesPerPixel) {
		case 2:
			alphaBlitLogic<uint16, 2, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		case 3:
			alphaBlitLogic<uint8, 3, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		default:
			alphaBlitLogic<uint32, 4, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		}
		return true;

	case 4:
		switch (srcFmt.bytesPerPixel) {
		case 2:
			alphaBlitLogic<uint16, 2, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		case 3:
			alphaBlitLogic<uint8, 3, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		default:
			alphaBlitLogic<uint32, 4, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, nullptr, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
			break;
		}
		return true;

	default:
		return false;
	}
}
/**
 * Prepares strides and the starting destination pixel for an alpha blit
 * whose source is one byte per pixel translated through a lookup table,
 * then dispatches to the alphaBlitLogic instantiation that matches the
 * destination pixel size.
 *
 * The destination format is also passed as the source format, so the
 * entries of map are unpacked as destination-format colours.
 *
 * @return false if the destination is neither 2 nor 4 bytes per pixel;
 *         true otherwise (including the aMod == 0 case, where nothing
 *         is drawn).
 */
template<bool hasKey, bool hasMask>
static inline bool alphaBlitMapHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
                                      const PixelFormat &dstFmt, const uint32 *map,
                                      const uint srcPitch, const uint dstPitch, const uint maskPitch,
                                      const uint32 key, const byte flip, const byte aMod) {
	// Only 16 and 32 bit destinations are implemented.
	if (dstFmt.bytesPerPixel != 2 && dstFmt.bytesPerPixel != 4)
		return false;

	// A zero alpha modifier must leave the destination untouched, matching
	// alphaBlitHelper. Without this early-out the blend path would run with
	// an effective alpha of 0 and store (d * 255) >> 8 for every channel,
	// which lowers the destination values instead of preserving them.
	if (aMod == 0)
		return true;

	const Graphics::PixelFormat &srcFmt = dstFmt;
	const bool hasMap = true;
	const bool flipx = flip & FLIP_H;
	const bool flipy = flip & FLIP_V;

	// Faster, but larger, to provide optimized handling for each case.
	      int dstDelta = (dstPitch - w * dstFmt.bytesPerPixel);
	const int srcDelta = (srcPitch - w);
	const int maskDelta = hasMask ? (maskPitch - w) : 0;

	const int dstInc = flipx ? -dstFmt.bytesPerPixel : dstFmt.bytesPerPixel;
	const int srcInc = 1;
	const int maskInc = 1;

	// Flipping is done on the destination side: start at the last column
	// and/or row and walk backwards.
	if (flipx)
		dst += (w - 1) * dstFmt.bytesPerPixel;

	if (flipy)
		dst += (h - 1) * dstPitch;

	if (flipy && flipx)
		dstDelta = -dstDelta;
	else if (flipy)
		dstDelta = -((dstPitch * 2) - dstDelta);
	else if (flipx)
		dstDelta =   (dstPitch * 2) - dstDelta;

	// TODO: optimized cases for dstDelta of 0
	if (dstFmt.bytesPerPixel == 2) {
		alphaBlitLogic<uint8, 1, uint16, 2, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, map, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
	} else {
		alphaBlitLogic<uint8, 1, uint32, 4, hasKey, hasMask, hasMap>(dst, src, mask, w, h, srcFmt, dstFmt, map, srcDelta, dstDelta, maskDelta, srcInc, dstInc, maskInc, key, flip, aMod);
	}
	return true;
}
} // End of anonymous namespace
/**
 * Blits src onto dst using the alpha information of the source pixels,
 * additionally scaled by aMod.
 *
 * @return false if either format has 0 or 1 bytes per pixel; otherwise the
 *         result of alphaBlitHelper.
 */
bool alphaBlit(byte *dst, const byte *src,
               const uint dstPitch, const uint srcPitch,
               const uint w, const uint h,
               const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
               const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool srcUsable = (srcFmt.bytesPerPixel != 0) && (srcFmt.bytesPerPixel != 1);
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!srcUsable || !dstUsable)
		return false;

	// No mask and no colour key for this variant.
	const uint unusedMaskPitch = 0;
	const uint32 unusedKey = 0;
	return alphaBlitHelper<false, false>(dst, src, nullptr, w, h, srcFmt, dstFmt,
	                                     srcPitch, dstPitch, unusedMaskPitch, unusedKey, flip, aMod);
}
/**
 * Blits src onto dst, treating source pixels equal to key as transparent
 * and blending all others with the alpha given by aMod.
 *
 * @return false if either format has 0 or 1 bytes per pixel; otherwise the
 *         result of alphaBlitHelper.
 */
bool alphaKeyBlit(byte *dst, const byte *src,
                  const uint dstPitch, const uint srcPitch,
                  const uint w, const uint h,
                  const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
                  const uint32 key, const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool srcUsable = (srcFmt.bytesPerPixel != 0) && (srcFmt.bytesPerPixel != 1);
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!srcUsable || !dstUsable)
		return false;

	// No mask for this variant.
	const uint unusedMaskPitch = 0;
	return alphaBlitHelper<true, false>(dst, src, nullptr, w, h, srcFmt, dstFmt,
	                                    srcPitch, dstPitch, unusedMaskPitch, key, flip, aMod);
}
/**
 * Blits src onto dst, taking the per-pixel alpha from a separate
 * one-byte-per-pixel mask, additionally scaled by aMod.
 *
 * @return false if either format has 0 or 1 bytes per pixel; otherwise the
 *         result of alphaBlitHelper.
 */
bool alphaMaskBlit(byte *dst, const byte *src, const byte *mask,
                   const uint dstPitch, const uint srcPitch, const uint maskPitch,
                   const uint w, const uint h,
                   const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt,
                   const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool srcUsable = (srcFmt.bytesPerPixel != 0) && (srcFmt.bytesPerPixel != 1);
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!srcUsable || !dstUsable)
		return false;

	// No colour key for this variant.
	const uint32 unusedKey = 0;
	return alphaBlitHelper<false, true>(dst, src, mask, w, h, srcFmt, dstFmt,
	                                    srcPitch, dstPitch, maskPitch, unusedKey, flip, aMod);
}
/**
 * Blits a one-byte-per-pixel source onto dst, translating every source
 * byte through map and using the alpha of the looked-up colour, scaled
 * by aMod.
 *
 * @return false if the destination format has 0 or 1 bytes per pixel;
 *         otherwise the result of alphaBlitMapHelper.
 */
bool alphaBlitMap(byte *dst, const byte *src,
                  const uint dstPitch, const uint srcPitch,
                  const uint w, const uint h,
                  const Graphics::PixelFormat &dstFmt, const uint32 *map,
                  const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!dstUsable)
		return false;

	// No mask and no colour key for this variant.
	const uint unusedMaskPitch = 0;
	const uint32 unusedKey = 0;
	return alphaBlitMapHelper<false, false>(dst, src, nullptr, w, h, dstFmt, map,
	                                        srcPitch, dstPitch, unusedMaskPitch, unusedKey, flip, aMod);
}
/**
 * Blits a one-byte-per-pixel source onto dst through map, treating source
 * bytes equal to key as transparent and blending all others with the alpha
 * given by aMod.
 *
 * @return false if the destination format has 0 or 1 bytes per pixel;
 *         otherwise the result of alphaBlitMapHelper.
 */
bool alphaKeyBlitMap(byte *dst, const byte *src,
                     const uint dstPitch, const uint srcPitch,
                     const uint w, const uint h,
                     const Graphics::PixelFormat &dstFmt, const uint32 *map,
                     const uint32 key, const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!dstUsable)
		return false;

	// No mask for this variant.
	const uint unusedMaskPitch = 0;
	return alphaBlitMapHelper<true, false>(dst, src, nullptr, w, h, dstFmt, map,
	                                       srcPitch, dstPitch, unusedMaskPitch, key, flip, aMod);
}
/**
 * Blits a one-byte-per-pixel source onto dst through map, taking the
 * per-pixel alpha from a separate one-byte-per-pixel mask, scaled by aMod.
 *
 * @return false if the destination format has 0 or 1 bytes per pixel;
 *         otherwise the result of alphaBlitMapHelper.
 */
bool alphaMaskBlitMap(byte *dst, const byte *src, const byte *mask,
                      const uint dstPitch, const uint srcPitch, const uint maskPitch,
                      const uint w, const uint h,
                      const Graphics::PixelFormat &dstFmt, const uint32 *map,
                      const byte flip, const byte aMod) {
	// Error out if conversion is impossible
	const bool dstUsable = (dstFmt.bytesPerPixel != 0) && (dstFmt.bytesPerPixel != 1);
	if (!dstUsable)
		return false;

	// No colour key for this variant.
	const uint32 unusedKey = 0;
	return alphaBlitMapHelper<false, true>(dst, src, mask, w, h, dstFmt, map,
	                                       srcPitch, dstPitch, maskPitch, unusedKey, flip, aMod);
}
namespace {
/**
 * Replaces every pixel whose colour channels equal the key colour with the
 * replacement colour at alpha 0, writing the result to dst.
 *
 * Pixels that do not match the key are written with their alpha bits forced
 * on when overwriteAlpha is set, and copied unchanged otherwise, so dst is
 * fully populated even when it is a different buffer from src.
 *
 * @tparam Size            Integer type with the width of one pixel.
 * @tparam overwriteAlpha  Force the alpha bits of non-key pixels on.
 * @param srcDelta, dstDelta  Bytes skipped at the end of each row.
 * @return true if at least one pixel matched the key.
 */
template<typename Size, bool overwriteAlpha>
inline bool applyColorKeyLogic(byte *dst, const byte *src, const uint w, const uint h,
                               const uint srcDelta, const uint dstDelta,
                               const Graphics::PixelFormat &format,
                               const uint8 rKey, const uint8 gKey, const uint8 bKey,
                               const uint8 rNew, const uint8 gNew, const uint8 bNew) {
	const uint32 keyPix    = format.ARGBToColor(0,   rKey, gKey, bKey);
	const uint32 newPix    = format.ARGBToColor(0,   rNew, gNew, bNew);
	const uint32 rgbMask   = format.ARGBToColor(0,   255,  255,  255);
	const uint32 alphaMask = format.ARGBToColor(255, 0,    0,    0);
	bool applied = false;

	for (uint y = 0; y < h; ++y) {
		for (uint x = 0; x < w; ++x) {
			uint32 pix = *(const Size *)src;

			// The comparison ignores the source alpha bits.
			if ((pix & rgbMask) == keyPix) {
				*(Size *)dst = newPix;
				applied = true;
			} else if (overwriteAlpha) {
				*(Size *)dst = pix | alphaMask;
			} else {
				// Pass the pixel through. Without this, dst keeps whatever it
				// held before whenever it is not the same buffer as src;
				// setAlphaLogic copies untouched pixels the same way.
				*(Size *)dst = pix;
			}

			src += sizeof(Size);
			dst += sizeof(Size);
		}

		src += srcDelta;
		dst += dstDelta;
	}

	return applied;
}
/**
 * Writes src to dst with the alpha bits of each pixel replaced by the
 * given alpha value.
 *
 * @tparam Size             Integer type with the width of one pixel.
 * @tparam skipTransparent  When set, pixels whose alpha bits are all zero
 *                          are copied unchanged.
 * @param srcDelta, dstDelta  Bytes skipped at the end of each row.
 */
template<typename Size, bool skipTransparent>
inline void setAlphaLogic(byte *dst, const byte *src, const uint w, const uint h,
                          const uint srcDelta, const uint dstDelta,
                          const Graphics::PixelFormat &format, const uint8 alpha) {
	const uint32 alphaMask = format.ARGBToColor(255, 0, 0, 0);
	const uint32 rgbMask = format.ARGBToColor(0, 255, 255, 255);
	const uint32 newAlpha = format.ARGBToColor(alpha, 0, 0, 0);

	for (uint row = 0; row < h; ++row, src += srcDelta, dst += dstDelta) {
		for (uint col = 0; col < w; ++col, src += sizeof(Size), dst += sizeof(Size)) {
			const uint32 pix = *(const Size *)src;
			const bool keepAsIs = skipTransparent && !(pix & alphaMask);

			if (keepAsIs) {
				*(Size *)dst = pix;
			} else {
				*(Size *)dst = (pix & rgbMask) | newAlpha;
			}
		}
	}
}
} // End of anonymous namespace
/**
 * Merges a transparent colour key into the alpha channel: pixels matching
 * (rKey, gKey, bKey) are replaced by (rNew, gNew, bNew) with alpha 0.
 *
 * @return false if the format has no alpha bits or is not 1, 2 or 4 bytes
 *         per pixel; otherwise whether any pixel matched the key.
 */
bool applyColorKey(byte *dst, const byte *src,
                   const uint dstPitch, const uint srcPitch,
                   const uint w, const uint h,
                   const Graphics::PixelFormat &format, const bool overwriteAlpha,
                   const uint8 rKey, const uint8 gKey, const uint8 bKey,
                   const uint8 rNew, const uint8 gNew, const uint8 bNew) {
	// Nothing to merge into without an alpha channel.
	if (format.aBits() == 0)
		return false;

	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);

	// Faster, but larger, to provide optimized handling for each case.
	switch (format.bytesPerPixel) {
	case 1:
		if (overwriteAlpha)
			return applyColorKeyLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
		return applyColorKeyLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);

	case 2:
		if (overwriteAlpha)
			return applyColorKeyLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
		return applyColorKeyLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);

	case 4:
		if (overwriteAlpha)
			return applyColorKeyLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);
		return applyColorKeyLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, rKey, gKey, bKey, rNew, gNew, bNew);

	default:
		return false;
	}
}
/**
 * Sets the alpha channel of every pixel to the given value, optionally
 * leaving pixels whose alpha is currently zero untouched.
 *
 * @return false if the format has no alpha bits or is not 1, 2 or 4 bytes
 *         per pixel; true otherwise.
 */
bool setAlpha(byte *dst, const byte *src,
              const uint dstPitch, const uint srcPitch,
              const uint w, const uint h,
              const Graphics::PixelFormat &format,
              const bool skipTransparent, const uint8 alpha) {
	// Nothing to set without an alpha channel.
	if (format.aBits() == 0)
		return false;

	const uint srcDelta = (srcPitch - w * format.bytesPerPixel);
	const uint dstDelta = (dstPitch - w * format.bytesPerPixel);

	// Faster, but larger, to provide optimized handling for each case.
	switch (format.bytesPerPixel) {
	case 1:
		if (skipTransparent)
			setAlphaLogic<uint8, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		else
			setAlphaLogic<uint8, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		break;

	case 2:
		if (skipTransparent)
			setAlphaLogic<uint16, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		else
			setAlphaLogic<uint16, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		break;

	case 4:
		if (skipTransparent)
			setAlphaLogic<uint32, true>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		else
			setAlphaLogic<uint32, false>(dst, src, w, h, srcDelta, dstDelta, format, alpha);
		break;

	default:
		return false;
	}

	return true;
}
/**
 * Collects the parameters of one blend blit and derives the values the
 * blit loops need: the colour-modulation flags, the source step per pixel
 * and per row (negated when flipping) and the starting source and
 * destination addresses. Pixels are addressed as 4 bytes each.
 */
BlendBlit::Args::Args(byte *dst, const byte *src,
	const uint _dstPitch, const uint _srcPitch,
	const int posX, const int posY,
	const uint _width, const uint _height,
	const int _scaleX, const int _scaleY,
	const int scaleXsrcOff, const int scaleYsrcOff,
	const uint32 colorMod, const uint _flipping) :
		xp(0), yp(0), dstPitch(_dstPitch),
		width(_width), height(_height), color(colorMod),
		scaleX(_scaleX), scaleY(_scaleY), flipping(_flipping),
		scaleXoff(scaleXsrcOff), scaleYoff(scaleYsrcOff) {
	const bool scaled = scaleX != SCALE_THRESHOLD || scaleY != SCALE_THRESHOLD;

	// Modulation is only needed when the corresponding bits are not all set.
	alphamod = ((colorMod & kAModMask) != kAModMask);
	rgbmod   = ((colorMod & kRGBModMask) != kRGBModMask);

	// Horizontal: walk the source backwards from its last column when flipped.
	inStep = 4;
	if (flipping & FLIP_H) {
		inStep = -inStep;
		xp = width - 1;
		if (scaled) {
			xp = xp * scaleX / SCALE_THRESHOLD;
		}
	}

	// Vertical: walk the source upwards from its last row when flipped.
	inoStep = _srcPitch;
	if (flipping & FLIP_V) {
		inoStep = -inoStep;
		yp = height - 1;
		if (scaled) {
			yp = yp * scaleY / SCALE_THRESHOLD;
		}
	}

	outo = dst + posY * _dstPitch + posX * 4;
	ino = src + yp * _srcPitch + xp * 4;
}
// Function pointers to the selected blit/fill implementations.
// Both start out as nullptr; BlendBlit::blit and BlendBlit::fill assign
// them on their first call and reuse the choice afterwards.
BlendBlit::BlitFunc BlendBlit::blitFunc = nullptr;
BlendBlit::FillFunc BlendBlit::fillFunc = nullptr;
// Only blits to and from 32bpp images.
// This function is a trampoline to whatever implementation is stored in
// BlendBlit::blitFunc. The implementation is chosen on the first call, so
// the CPU's SIMD features can be detected at runtime. When several SIMD
// variants are compiled in and available, the last matching check below
// is the one that stays selected.
void BlendBlit::blit(byte *dst, const byte *src,
					 const uint dstPitch, const uint srcPitch,
					 const int posX, const int posY,
					 const uint width, const uint height,
					 const int scaleX, const int scaleY,
					 const int scaleXsrcOff, const int scaleYsrcOff,
					 const uint32 colorMod, const uint flipping,
					 const TSpriteBlendMode blendMode,
					 const AlphaType alphaType) {
	// Nothing to draw for an empty area.
	if (width == 0 || height == 0)
		return;

	// First call: pick the implementation, starting from the generic one.
	if (blitFunc == nullptr) {
		blitFunc = blitGeneric;
#ifdef SCUMMVM_NEON
		if (g_system->hasFeature(OSystem::kFeatureCpuNEON))
			blitFunc = blitNEON;
#endif
#ifdef SCUMMVM_SSE2
		if (g_system->hasFeature(OSystem::kFeatureCpuSSE2))
			blitFunc = blitSSE2;
#endif
#ifdef SCUMMVM_AVX2
		if (g_system->hasFeature(OSystem::kFeatureCpuAVX2))
			blitFunc = blitAVX2;
#endif
	}

	Args blitArgs(dst, src, dstPitch, srcPitch, posX, posY, width, height,
				  scaleX, scaleY, scaleXsrcOff, scaleYsrcOff, colorMod, flipping);
	blitFunc(blitArgs, blendMode, alphaType);
}
// Only fills 32bpp images.
// This function is a trampoline to whatever implementation is stored in
// BlendBlit::fillFunc, which is chosen on the first call. Only the generic
// implementation exists so far.
void BlendBlit::fill(byte *dst, const uint dstPitch,
					 const uint width, const uint height,
					 const uint32 colorMod,
					 const TSpriteBlendMode blendMode) {
	// Nothing to fill for an empty area.
	if (width == 0 || height == 0)
		return;

	// First call: pick the implementation.
	// TODO: Add SIMD variants
	if (fillFunc == nullptr)
		fillFunc = fillGeneric;

	// A fill has no source image, position, scaling or flipping.
	Args fillArgs(dst, nullptr, dstPitch, 0, 0, 0, width, height, 0, 0, 0, 0, colorMod, 0);
	fillFunc(fillArgs, blendMode);
}
} // End of namespace Graphics

501
graphics/blit/blit-alpha.h Normal file
View File

@@ -0,0 +1,501 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "graphics/blit.h"
namespace Graphics {
// Collection of per-pixel blend functors operating on 4-byte pixels.
// Each functor is a template on <rgbmod, alphamod>, which select at compile
// time whether the colour and/or alpha components of the modulation colour
// take part in the calculation. `normal(in, out)` combines one source pixel
// into one destination pixel; `fill(out)` (where present) applies the
// modulation colour alone to one destination pixel.
// All scaling uses ">> 8" (and ">> 16" / ">> 24" for products of more
// factors) rather than a division by 255.
class BlendBlitImpl_Base {
friend class BlendBlit;
protected:
// Unpacks the modulation colour into its four components. A component whose
// template flag is false is fixed to 255 instead of being read from `color`.
template<bool rgbmod, bool alphamod>
struct BaseBlend {
public:
constexpr BaseBlend(const uint32 color) :
ca(alphamod ? ((color >> BlendBlit::kAModShift) & 0xFF) : 255),
cr(rgbmod ? ((color >> BlendBlit::kRModShift) & 0xFF) : 255),
cg(rgbmod ? ((color >> BlendBlit::kGModShift) & 0xFF) : 255),
cb(rgbmod ? ((color >> BlendBlit::kBModShift) & 0xFF) : 255) {}
protected:
const byte ca, cr, cg, cb;
};
// Standard "source over destination" blending. Every pixel that is written
// gets its alpha component set to 255.
template<bool rgbmod, bool alphamod>
struct AlphaBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr AlphaBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Blends `in` over `out`. An effective alpha of 0 leaves `out` untouched;
// 255 overwrites it; anything else mixes the two.
inline void normal(const byte *in, byte *out) const {
uint32 ina;
// Effective alpha: source alpha, scaled by the modulation alpha if enabled.
if (alphamod) {
ina = in[BlendBlit::kAIndex] * this->ca >> 8;
} else {
ina = in[BlendBlit::kAIndex];
}
if (ina == 255) {
// Fully opaque: store the (optionally colour-modulated) source.
if (rgbmod) {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = (in[BlendBlit::kBIndex] * this->cb >> 8);
out[BlendBlit::kGIndex] = (in[BlendBlit::kGIndex] * this->cg >> 8);
out[BlendBlit::kRIndex] = (in[BlendBlit::kRIndex] * this->cr >> 8);
} else {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = in[BlendBlit::kBIndex];
out[BlendBlit::kGIndex] = in[BlendBlit::kGIndex];
out[BlendBlit::kRIndex] = in[BlendBlit::kRIndex];
}
} else if (ina != 0) {
// Partially transparent: weight destination by (255 - ina) and source by ina.
if (rgbmod) {
const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = outb + (in[BlendBlit::kBIndex] * ina * this->cb >> 16);
out[BlendBlit::kGIndex] = outg + (in[BlendBlit::kGIndex] * ina * this->cg >> 16);
out[BlendBlit::kRIndex] = outr + (in[BlendBlit::kRIndex] * ina * this->cr >> 16);
} else {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + in[BlendBlit::kBIndex] * ina) >> 8;
out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * (255 - ina) + in[BlendBlit::kGIndex] * ina) >> 8;
out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * (255 - ina) + in[BlendBlit::kRIndex] * ina) >> 8;
}
}
}
// Blends the modulation colour over `out`, using the modulation alpha as
// the blend weight and 255 in place of each source channel.
inline void fill(byte *out) const {
uint32 ina = this->ca;
// The shortcuts for alpha 255 and alpha 0 below are disabled, so the
// general formula in the following block runs for every alpha value.
// NOTE(review): presumably kept disabled on purpose - confirm before re-enabling.
/* if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = this->cb;
out[BlendBlit::kGIndex] = this->cg;
out[BlendBlit::kRIndex] = this->cr;
} else {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = 255;
out[BlendBlit::kGIndex] = 255;
out[BlendBlit::kRIndex] = 255;
}
} else if (ina != 0) */ {
if (rgbmod) {
const uint outb = (out[BlendBlit::kBIndex] * (255 - ina) >> 8);
const uint outg = (out[BlendBlit::kGIndex] * (255 - ina) >> 8);
const uint outr = (out[BlendBlit::kRIndex] * (255 - ina) >> 8);
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = outb + (255 * ina * this->cb >> 16);
out[BlendBlit::kGIndex] = outg + (255 * ina * this->cg >> 16);
out[BlendBlit::kRIndex] = outr + (255 * ina * this->cr >> 16);
} else {
out[BlendBlit::kAIndex] = 255;
out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * (255 - ina) + 255 * ina) >> 8;
out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * (255 - ina) + 255 * ina) >> 8;
out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * (255 - ina) + 255 * ina) >> 8;
}
}
}
};
// Multiplies the destination colour channels by the source. The destination
// alpha component is never written.
template<bool rgbmod, bool alphamod>
struct MultiplyBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr MultiplyBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Scales each channel of `out` by the matching (optionally modulated and
// alpha-weighted) channel of `in`. An effective alpha of 0 leaves `out` untouched.
inline void normal(const byte *in, byte *out) const {
uint32 ina;
// Effective alpha: source alpha, scaled by the modulation alpha if enabled.
if (alphamod) {
ina = in[BlendBlit::kAIndex] * this->ca >> 8;
} else {
ina = in[BlendBlit::kAIndex];
}
if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb) >> 8) >> 8;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg) >> 8) >> 8;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr) >> 8) >> 8;
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * in[BlendBlit::kBIndex] >> 8;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * in[BlendBlit::kGIndex] >> 8;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * in[BlendBlit::kRIndex] >> 8;
}
} else if (ina != 0) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16) >> 8;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16) >> 8;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16) >> 8;
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((in[BlendBlit::kBIndex] * ina) >> 8) >> 8;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((in[BlendBlit::kGIndex] * ina) >> 8) >> 8;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((in[BlendBlit::kRIndex] * ina) >> 8) >> 8;
}
}
}
// Scales each channel of `out` by the modulation colour. With alpha 255 and
// no colour modulation, or with alpha 0, `out` is left untouched.
inline void fill(byte *out) const {
uint32 ina = this->ca;
if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * this->cb) >> 8;
out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * this->cg) >> 8;
out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * this->cr) >> 8;
}
} else if (ina != 0) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] * ((this->cb * ina) >> 8) >> 8;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] * ((this->cg * ina) >> 8) >> 8;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] * ((this->cr * ina) >> 8) >> 8;
} else {
out[BlendBlit::kBIndex] = (out[BlendBlit::kBIndex] * ina) >> 8;
out[BlendBlit::kGIndex] = (out[BlendBlit::kGIndex] * ina) >> 8;
out[BlendBlit::kRIndex] = (out[BlendBlit::kRIndex] * ina) >> 8;
}
}
}
};
// Plain copy: every source pixel replaces the destination pixel. The
// modulation colour is not used by normal().
template<bool rgbmod, bool alphamod>
struct OpaqueBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr OpaqueBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Copies the 32-bit source pixel with all bits of kAModMask set.
inline void normal(const byte *in, byte *out) const {
*(uint32 *)out = *(const uint32 *)in | BlendBlit::kAModMask;
}
};
// On/off transparency: a source pixel is either copied or skipped. The
// modulation colour is not used by normal().
template<bool rgbmod, bool alphamod>
struct BinaryBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr BinaryBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Copies the source pixel (with all bits of kAModMask set) if any of its
// kAModMask bits are set; otherwise leaves `out` untouched.
inline void normal(const byte *in, byte *out) const {
uint32 pix = *(const uint32 *)in;
uint32 a = pix & BlendBlit::kAModMask;
if (a != 0) { // Full opacity (Any value not exactly 0 is Opaque here)
*(uint32 *)out = pix | BlendBlit::kAModMask;
}
}
};
// Adds the source to the destination colour channels. The sums are stored
// straight back into the byte-sized channels without any clamping, and the
// destination alpha component is never written.
template<bool rgbmod, bool alphamod>
struct AdditiveBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr AdditiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Adds each (optionally modulated and alpha-weighted) channel of `in` to
// `out`. An effective alpha of 0 leaves `out` untouched.
inline void normal(const byte *in, byte *out) const {
uint32 ina;
// Effective alpha: source alpha, scaled by the modulation alpha if enabled.
if (alphamod) {
ina = in[BlendBlit::kAIndex] * this->ca >> 8;
} else {
ina = in[BlendBlit::kAIndex];
}
if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb) >> 8);
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg) >> 8);
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr) >> 8);
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + in[BlendBlit::kBIndex];
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + in[BlendBlit::kGIndex];
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + in[BlendBlit::kRIndex];
}
} else if (ina != 0) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * this->cb * ina) >> 16);
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * this->cg * ina) >> 16);
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * this->cr * ina) >> 16);
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((in[BlendBlit::kBIndex] * ina) >> 8);
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((in[BlendBlit::kGIndex] * ina) >> 8);
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((in[BlendBlit::kRIndex] * ina) >> 8);
}
}
}
// Adds the modulation colour (or 255 / the alpha value when colour
// modulation is off) to each channel of `out`. Alpha 0 leaves `out` untouched.
inline void fill(byte *out) const {
uint32 ina = this->ca;
if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + this->cb;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + this->cg;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + this->cr;
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + 255;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + 255;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + 255;
}
} else if (ina != 0) {
if (rgbmod) {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ((this->cb * ina) >> 8);
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ((this->cg * ina) >> 8);
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ((this->cr * ina) >> 8);
} else {
out[BlendBlit::kBIndex] = out[BlendBlit::kBIndex] + ina;
out[BlendBlit::kGIndex] = out[BlendBlit::kGIndex] + ina;
out[BlendBlit::kRIndex] = out[BlendBlit::kRIndex] + ina;
}
}
}
};
// Subtracts from each destination channel a fraction of its own value that
// is proportional to the source channel, clamped at 0. The destination alpha
// component is always set to 255. The modulation alpha (ca) is not used here.
template<bool rgbmod, bool alphamod>
struct SubtractiveBlend : public BaseBlend<rgbmod, alphamod> {
public:
constexpr SubtractiveBlend(const uint32 color) : BaseBlend<rgbmod, alphamod>(color) {}
// Darkens `out` according to `in`. Only the source alpha is used as the
// weight. `out` alpha is set to 255 even when the source alpha is 0.
inline void normal(const byte *in, byte *out) const {
uint32 ina = in[BlendBlit::kAIndex];
out[BlendBlit::kAIndex] = 255;
if (ina == 255) {
if (rgbmod) {
out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex])) >> 16), 0);
out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex])) >> 16), 0);
out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex])) >> 16), 0);
} else {
out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex])) >> 8), 0);
out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex])) >> 8), 0);
out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex])) >> 8), 0);
}
} else if (ina != 0) {
if (rgbmod) {
out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * this->cb * (out[BlendBlit::kBIndex]) * ina) >> 24), 0);
out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * this->cg * (out[BlendBlit::kGIndex]) * ina) >> 24), 0);
out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * this->cr * (out[BlendBlit::kRIndex]) * ina) >> 24), 0);
} else {
out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((in[BlendBlit::kBIndex] * (out[BlendBlit::kBIndex]) * ina) >> 16), 0);
out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((in[BlendBlit::kGIndex] * (out[BlendBlit::kGIndex]) * ina) >> 16), 0);
out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((in[BlendBlit::kRIndex] * (out[BlendBlit::kRIndex]) * ina) >> 16), 0);
}
}
}
// Darkens `out` by the modulation colour; without colour modulation the
// colour channels are set to 0. `out` alpha is set to 255.
inline void fill(byte *out) const {
out[BlendBlit::kAIndex] = 255;
if (rgbmod) {
out[BlendBlit::kBIndex] = MAX<int32>(out[BlendBlit::kBIndex] - ((this->cb * out[BlendBlit::kBIndex]) >> 8), 0);
out[BlendBlit::kGIndex] = MAX<int32>(out[BlendBlit::kGIndex] - ((this->cg * out[BlendBlit::kGIndex]) >> 8), 0);
out[BlendBlit::kRIndex] = MAX<int32>(out[BlendBlit::kRIndex] - ((this->cr * out[BlendBlit::kRIndex]) >> 8), 0);
} else {
out[BlendBlit::kBIndex] = 0;
out[BlendBlit::kGIndex] = 0;
out[BlendBlit::kRIndex] = 0;
}
}
};
}; // End of class BlendBlitImpl_Base
/**
 * Dispatches a blit to the T::blitInnerLoop instantiation matching the request.
 *
 * The template arguments of blitInnerLoop are chosen as follows:
 *  - doscale: false only when both args.scaleX and args.scaleY equal
 *    SCALE_THRESHOLD;
 *  - blend functor: OpaqueBlend / BinaryBlend for an unmodulated
 *    (args.color == 0xffffffff) BLEND_NORMAL blit with ALPHA_OPAQUE /
 *    ALPHA_BINARY, otherwise the functor that corresponds to blendMode;
 *  - rgbmod / alphamod: set when the RGB resp. alpha bits of args.color are
 *    not all ones.
 *
 * BLEND_SUBTRACTIVE is always instantiated with alphamod == false. A blend
 * mode that is neither additive, subtractive nor multiply must be
 * BLEND_NORMAL (asserted).
 */
template<class T>
void BlendBlit::blitT(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
	const bool rgbmod = ((args.color & kRGBModMask) != kRGBModMask);
	const bool alphamod = ((args.color & kAModMask) != kAModMask);

	const bool unscaled = (args.scaleX == SCALE_THRESHOLD && args.scaleY == SCALE_THRESHOLD);
	const bool plainNormal = (args.color == 0xffffffff && blendMode == BLEND_NORMAL);

	if (unscaled) {
		// ---- 1:1 blit, doscale == false ----
		if (plainNormal && alphaType == ALPHA_OPAQUE) {
			T::template blitInnerLoop<T::template OpaqueBlend, false, false, false>(args);
		} else if (plainNormal && alphaType == ALPHA_BINARY) {
			T::template blitInnerLoop<T::template BinaryBlend, false, false, false>(args);
		} else if (blendMode == BLEND_ADDITIVE) {
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template AdditiveBlend, false, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template AdditiveBlend, false, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template AdditiveBlend, false, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template AdditiveBlend, false, false, false>(args);
			}
		} else if (blendMode == BLEND_SUBTRACTIVE) {
			// alpha modulation is not dispatched for the subtractive blend
			if (rgbmod) {
				T::template blitInnerLoop<T::template SubtractiveBlend, false, true, false>(args);
			} else {
				T::template blitInnerLoop<T::template SubtractiveBlend, false, false, false>(args);
			}
		} else if (blendMode == BLEND_MULTIPLY) {
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template MultiplyBlend, false, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template MultiplyBlend, false, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template MultiplyBlend, false, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template MultiplyBlend, false, false, false>(args);
			}
		} else {
			assert(blendMode == BLEND_NORMAL);
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template AlphaBlend, false, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template AlphaBlend, false, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template AlphaBlend, false, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template AlphaBlend, false, false, false>(args);
			}
		}
	} else {
		// ---- scaled blit, doscale == true ----
		if (plainNormal && alphaType == ALPHA_OPAQUE) {
			T::template blitInnerLoop<T::template OpaqueBlend, true, false, false>(args);
		} else if (plainNormal && alphaType == ALPHA_BINARY) {
			T::template blitInnerLoop<T::template BinaryBlend, true, false, false>(args);
		} else if (blendMode == BLEND_ADDITIVE) {
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template AdditiveBlend, true, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template AdditiveBlend, true, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template AdditiveBlend, true, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template AdditiveBlend, true, false, false>(args);
			}
		} else if (blendMode == BLEND_SUBTRACTIVE) {
			// alpha modulation is not dispatched for the subtractive blend
			if (rgbmod) {
				T::template blitInnerLoop<T::template SubtractiveBlend, true, true, false>(args);
			} else {
				T::template blitInnerLoop<T::template SubtractiveBlend, true, false, false>(args);
			}
		} else if (blendMode == BLEND_MULTIPLY) {
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template MultiplyBlend, true, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template MultiplyBlend, true, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template MultiplyBlend, true, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template MultiplyBlend, true, false, false>(args);
			}
		} else {
			assert(blendMode == BLEND_NORMAL);
			if (rgbmod && alphamod) {
				T::template blitInnerLoop<T::template AlphaBlend, true, true, true>(args);
			} else if (rgbmod) {
				T::template blitInnerLoop<T::template AlphaBlend, true, true, false>(args);
			} else if (alphamod) {
				T::template blitInnerLoop<T::template AlphaBlend, true, false, true>(args);
			} else {
				T::template blitInnerLoop<T::template AlphaBlend, true, false, false>(args);
			}
		}
	}
}
/**
 * Dispatches a fill to the T::fillInnerLoop instantiation matching the request.
 *
 * The blend functor follows blendMode; the two boolean template arguments are
 * rgbmod and alphamod, i.e. whether the RGB resp. alpha bits of args.color are
 * not all ones. BLEND_SUBTRACTIVE is always instantiated with
 * alphamod == false. A blend mode that is neither additive, subtractive nor
 * multiply must be BLEND_NORMAL (asserted).
 */
template<class T>
void BlendBlit::fillT(Args &args, const TSpriteBlendMode &blendMode) {
	const bool rgbmod = ((args.color & kRGBModMask) != kRGBModMask);
	const bool alphamod = ((args.color & kAModMask) != kAModMask);

	if (blendMode == BLEND_ADDITIVE) {
		if (rgbmod && alphamod) {
			T::template fillInnerLoop<T::template AdditiveBlend, true, true>(args);
		} else if (rgbmod) {
			T::template fillInnerLoop<T::template AdditiveBlend, true, false>(args);
		} else if (alphamod) {
			T::template fillInnerLoop<T::template AdditiveBlend, false, true>(args);
		} else {
			T::template fillInnerLoop<T::template AdditiveBlend, false, false>(args);
		}
		return;
	}

	if (blendMode == BLEND_SUBTRACTIVE) {
		// alpha modulation is not dispatched for the subtractive blend
		if (rgbmod) {
			T::template fillInnerLoop<T::template SubtractiveBlend, true, false>(args);
		} else {
			T::template fillInnerLoop<T::template SubtractiveBlend, false, false>(args);
		}
		return;
	}

	if (blendMode == BLEND_MULTIPLY) {
		if (rgbmod && alphamod) {
			T::template fillInnerLoop<T::template MultiplyBlend, true, true>(args);
		} else if (rgbmod) {
			T::template fillInnerLoop<T::template MultiplyBlend, true, false>(args);
		} else if (alphamod) {
			T::template fillInnerLoop<T::template MultiplyBlend, false, true>(args);
		} else {
			T::template fillInnerLoop<T::template MultiplyBlend, false, false>(args);
		}
		return;
	}

	assert(blendMode == BLEND_NORMAL);
	if (rgbmod && alphamod) {
		T::template fillInnerLoop<T::template AlphaBlend, true, true>(args);
	} else if (rgbmod) {
		T::template fillInnerLoop<T::template AlphaBlend, true, false>(args);
	} else if (alphamod) {
		T::template fillInnerLoop<T::template AlphaBlend, false, true>(args);
	} else {
		T::template fillInnerLoop<T::template AlphaBlend, false, false>(args);
	}
}
} // End of namespace Graphics

View File

@@ -0,0 +1,283 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "graphics/blit.h"
#include <mint/cookie.h>
#include "backends/graphics/atari/atari-supervidel.h"
#include "backends/platform/atari/dlmalloc.h" // MALLOC_ALIGNMENT
static_assert(MALLOC_ALIGNMENT == 16, "MALLOC_ALIGNMENT must be == 16");
#ifdef USE_MOVE16
// Tells whether the move16 code paths may be used: true when the _CPU cookie
// exists and its value is at least 40 (presumably a 68040 or later - confirm).
// The cookie jar is queried only once; the answer is cached in a function-local
// static.
static inline bool hasMove16() {
	long cpuCookie;
	static bool move16Available = (Getcookie(C__CPU, &cpuCookie) == C_FOUND) && (cpuCookie >= 40);
	return move16Available;
}
// Returns true when 'val', reinterpreted as an unsigned address-sized integer,
// is a multiple of MALLOC_ALIGNMENT (statically asserted to be 16 in this
// file), i.e. when none of its low alignment bits are set.
template<typename T>
constexpr bool isAligned(T val) {
	return !(reinterpret_cast<uintptr>(val) & (MALLOC_ALIGNMENT - 1));
}
#endif
namespace Graphics {
// Function to blit a rect with a transparent color key
//
// Copies a w x h rectangle of one-byte pixels from src to dst; destination
// pixels are left untouched wherever the source pixel equals 'key'.
// srcDelta / dstDelta are the number of bytes to skip after each row of w
// pixels to reach the start of the next row.
//
// With USE_SV_BLITTER, a key of 0 and both pointers at or above 0xA0000000
// (presumably SuperVidel video RAM - confirm), the work is handed to the
// SuperBlitter instead of the software loop.
void keyBlitLogicAtari(byte *dst, const byte *src, const uint w, const uint h,
					   const uint srcDelta, const uint dstDelta, const uint32 key) {
#ifdef USE_SV_BLITTER
	if (key == 0 && (uintptr)src >= 0xA0000000 && (uintptr)dst >= 0xA0000000) {
		// distance in bytes between the starts of two consecutive rows
		const uint srcStride = srcDelta + w;
		const uint dstStride = dstDelta + w;

		if (g_superVidelFwVersion >= 9) {
			// firmware 9+: all registers are fed through the FIFO, in this exact order
			*SV_BLITTER_FIFO = (long)src;                             // SV_BLITTER_SRC1
			*SV_BLITTER_FIFO = (long)(g_blitMask ? g_blitMask : src); // SV_BLITTER_SRC2
			*SV_BLITTER_FIFO = (long)dst;                             // SV_BLITTER_DST
			*SV_BLITTER_FIFO = w - 1;                                 // SV_BLITTER_COUNT
			*SV_BLITTER_FIFO = srcStride;                             // SV_BLITTER_SRC1_OFFSET
			*SV_BLITTER_FIFO = srcStride;                             // SV_BLITTER_SRC2_OFFSET
			*SV_BLITTER_FIFO = dstStride;                             // SV_BLITTER_DST_OFFSET
			*SV_BLITTER_FIFO = h;                                     // SV_BLITTER_MASK_AND_LINES
			*SV_BLITTER_FIFO = 0x03;                                  // SV_BLITTER_CONTROL
		} else {
			// make sure the blitter is idle
			while (*SV_BLITTER_CONTROL & 1);

			*SV_BLITTER_SRC1           = (long)src;
			*SV_BLITTER_SRC2           = (long)(g_blitMask ? g_blitMask : src);
			*SV_BLITTER_DST            = (long)dst;
			*SV_BLITTER_COUNT          = w - 1;
			*SV_BLITTER_SRC1_OFFSET    = srcStride;
			*SV_BLITTER_SRC2_OFFSET    = srcStride;
			*SV_BLITTER_DST_OFFSET     = dstStride;
			*SV_BLITTER_MASK_AND_LINES = h;
			*SV_BLITTER_CONTROL        = 0x03;
		}

		SyncSuperBlitter();
	} else
#endif
	{
		// software fallback: copy pixel by pixel, skipping the key colour
		for (uint row = 0; row < h; ++row) {
			for (uint col = 0; col < w; ++col) {
				const uint32 pixel = *src++;

				if (pixel != key)
					*dst = pixel;
				dst++;
			}

			src += srcDelta;
			dst += dstDelta;
		}
	}
}
// Function to blit a rect (version optimized for Atari Falcon with SuperVidel's SuperBlitter)
//
// Copies h rows of (w * bytesPerPixel) bytes from src to dst; consecutive rows
// start srcPitch resp. dstPitch bytes apart. Depending on the build flags and
// the arguments one of three strategies is used:
//  - USE_SV_BLITTER and both pointers at or above 0xA0000000: the copy is
//    handed to the SuperBlitter (through its FIFO when
//    g_superVidelFwVersion >= 9, through the individual registers otherwise);
//  - USE_MOVE16, hasMove16() and 16-byte aligned operands: unrolled move16
//    loops with a move.b tail for the remaining (size % 16) bytes;
//  - otherwise memcpy(), once for tightly packed rows or once per row.
//
// Calling with dst == src or with an empty rectangle (w == 0 or h == 0) does
// nothing.
//
// NOTE(review): the asm loops count with dbra, which only uses the low 16 bits
// of its counter register, so they presumably rely on h <= 65536 and on less
// than 16 MiB in the packed case - confirm against the callers.
void copyBlit(byte *dst, const byte *src,
			  const uint dstPitch, const uint srcPitch,
			  const uint w, const uint h,
			  const uint bytesPerPixel) {
	if (dst == src)
		return;

	// An empty rectangle must be a no-op. Without this guard 'h - 1' and
	// 'w * bytesPerPixel - 1' below would wrap around: the per-row asm loop
	// would run 65536 times and the blitter would be given a count of -1.
	if (w == 0 || h == 0)
		return;

#ifdef USE_SV_BLITTER
	if ((uintptr)src >= 0xA0000000 && (uintptr)dst >= 0xA0000000) {
		if (g_superVidelFwVersion >= 9) {
			*SV_BLITTER_FIFO = (long)src;				// SV_BLITTER_SRC1
			*SV_BLITTER_FIFO = 0x00000000;				// SV_BLITTER_SRC2
			*SV_BLITTER_FIFO = (long)dst;				// SV_BLITTER_DST
			*SV_BLITTER_FIFO = w * bytesPerPixel - 1;	// SV_BLITTER_COUNT
			*SV_BLITTER_FIFO = srcPitch;				// SV_BLITTER_SRC1_OFFSET
			*SV_BLITTER_FIFO = 0x00000000;				// SV_BLITTER_SRC2_OFFSET
			*SV_BLITTER_FIFO = dstPitch;				// SV_BLITTER_DST_OFFSET
			*SV_BLITTER_FIFO = h;						// SV_BLITTER_MASK_AND_LINES
			*SV_BLITTER_FIFO = 0x01;					// SV_BLITTER_CONTROL
		} else {
			// make sure the blitter is idle
			while (*SV_BLITTER_CONTROL & 1);

			*SV_BLITTER_SRC1           = (long)src;
			*SV_BLITTER_SRC2           = 0x00000000;
			*SV_BLITTER_DST            = (long)dst;
			*SV_BLITTER_COUNT          = w * bytesPerPixel - 1;
			*SV_BLITTER_SRC1_OFFSET    = srcPitch;
			*SV_BLITTER_SRC2_OFFSET    = 0x00000000;
			*SV_BLITTER_DST_OFFSET     = dstPitch;
			*SV_BLITTER_MASK_AND_LINES = h;
			*SV_BLITTER_CONTROL        = 0x01;
		}

		SyncSuperBlitter();
	} else
#endif
	if (dstPitch == srcPitch && dstPitch == (w * bytesPerPixel)) {
		// rows are tightly packed: the whole surface is one contiguous run
#ifdef USE_MOVE16
		if (hasMove16() && isAligned(src) && isAligned(dst)) {
			// The asm post-increments both pointers. GCC requires operands that
			// are modified to be declared read-write, so scratch copies are
			// passed as "+a" operands instead of the (input-only) parameters.
			const byte *srcPtr = src;
			byte *dstPtr = dst;

			__asm__ volatile(
			// d0 = number of 16-byte chunks
			" move.l %2,%%d0\n"
			" lsr.l #4,%%d0\n"
			" beq.b 3f\n"
			// jump into the unrolled block so that (chunks % 16) move16
			// are executed first, then loop (chunks / 16) full rounds
			" moveq #0x0f,%%d1\n"
			" and.l %%d0,%%d1\n"
			" neg.l %%d1\n"
			" lsr.l #4,%%d0\n"
			" jmp (2f,%%pc,%%d1.l*4)\n"
			"1:\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			"2:\n"
			" dbra %%d0,1b\n"
			// handle also the case when 'dstPitch' is not
			// divisible by 16 but 'src' and 'dst' are
			"3:\n"
			" moveq #0x0f,%%d0\n"
			" and.l %2,%%d0\n"
			" neg.l %%d0\n"
			" jmp (4f,%%pc,%%d0.l*2)\n"
			// only 15x move.b as 16 would be handled above
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			"4:\n"
				: "+a"(srcPtr), "+a"(dstPtr) // outputs (read-write: advanced by the copy)
				: "g"(dstPitch * h) // inputs
				: "d0", "d1", "cc" AND_MEMORY
			);
		} else {
#else
		{
#endif
			memcpy(dst, src, dstPitch * h);
		}
	} else {
		// rows are separated by padding: copy row by row
#ifdef USE_MOVE16
		if (hasMove16() && isAligned(src) && isAligned(dst) && isAligned(srcPitch) && isAligned(dstPitch)) {
			// As above, everything the asm modifies (both pointers and the
			// dbra row counter) is passed as a read-write operand on a scratch
			// variable. h >= 1 is guaranteed by the guard at the top.
			const byte *srcPtr = src;
			byte *dstPtr = dst;
			uint rowsLeft = h - 1;

			__asm__ volatile(
			// a1 = entry point into the move.b tail for (row bytes % 16)
			" move.l %3,%%d0\n"
			" moveq #0x0f,%%d1\n"
			" and.l %%d0,%%d1\n"
			" neg.l %%d1\n"
			" lea (4f,%%pc,%%d1.l*2),%%a0\n"
			" move.l %%a0,%%a1\n"
			// rows shorter than 16 bytes: a0 stays at the tail entry as well
			" lsr.l #4,%%d0\n"
			" beq.b 3f\n"
			// a0 = entry point into the unrolled move16 block,
			// d1 = number of full 16x move16 rounds per row
			" moveq #0x0f,%%d1\n"
			" and.l %%d0,%%d1\n"
			" neg.l %%d1\n"
			" lea (2f,%%pc,%%d1.l*4),%%a0\n"
			" lsr.l #4,%%d0\n"
			" move.l %%d0,%%d1\n"
			// start of a row
			"0:\n"
			" move.l %%d1,%%d0\n"
			" jmp (%%a0)\n"
			"1:\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			" move16 (%0)+,(%1)+\n"
			"2:\n"
			" dbra %%d0,1b\n"
			// handle (w * bytesPerPixel) % 16
			"3:\n"
			" jmp (%%a1)\n"
			// only 15x move.b as 16 would be handled above
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			" move.b (%0)+,(%1)+\n"
			// end of a row: skip the padding and continue with the next row
			"4:\n"
			" add.l %4,%1\n"
			" add.l %5,%0\n"
			" dbra %2,0b\n"
				: "+a"(srcPtr), "+a"(dstPtr), "+d"(rowsLeft) // outputs (read-write)
				: "g"(w * bytesPerPixel),
				  "g"(dstPitch - w * bytesPerPixel), "g"(srcPitch - w * bytesPerPixel) // inputs
				: "d0", "d1", "a0", "a1", "cc" AND_MEMORY
			);
		} else {
#else
		{
#endif
			for (uint i = 0; i < h; ++i) {
				memcpy(dst, src, w * bytesPerPixel);
				dst += dstPitch;
				src += srcPitch;
			}
		}
	}
}
} // End of namespace Graphics

330
graphics/blit/blit-avx2.cpp Normal file
View File

@@ -0,0 +1,330 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "common/scummsys.h"
#include "graphics/blit/blit-alpha.h"
#include "graphics/pixelformat.h"
#include <immintrin.h>
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("avx2"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("avx2")
#endif
namespace Graphics {
class BlendBlitImpl_AVX2 : public BlendBlitImpl_Base {
friend class BlendBlit;
// AVX2 version of the alpha blend: adds a simd() method that blends eight
// 32-bit pixels at once on top of what the base-class functor provides.
template<bool rgbmod, bool alphamod>
struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
public:
constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
// Blends eight source pixels over eight destination pixels.
// Lanes whose effective source alpha is zero return the destination pixel
// unchanged; in all other lanes the alpha bits of the result are fully set.
inline __m256i simd(__m256i src, __m256i dst) const {
// Effective alpha per lane: the source alpha bits, scaled by ca / 256 when
// alpha modulation is on. The masked value is used directly as a number,
// which assumes the alpha bits are the lowest bits of the pixel - TODO confirm.
__m256i ina;
if (alphamod)
ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
else
ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
// all-ones in every lane that is fully transparent
__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
if (rgbmod) {
// With colour modulation every channel is handled on its own.
// Isolate each channel as a plain number.
__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
// dst * (255 - alpha), left as a 16-bit product and moved so that its
// upper byte lines up with the channel's position in the pixel
dstR = _mm256_slli_epi32(_mm256_mullo_epi16(dstR, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
dstG = _mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
dstB = _mm256_mullo_epi16(dstB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina));
// add ((src * alpha) >> 8) * modulation colour, positioned the same way
srcR = _mm256_add_epi32(dstR, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcR, ina), 8), _mm256_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
srcG = _mm256_add_epi32(dstG, _mm256_slli_epi32(_mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8), _mm256_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
srcB = _mm256_add_epi32(dstB, _mm256_mullo_epi16(_mm256_srli_epi32(_mm256_mullo_epi16(srcB, ina), 8), _mm256_set1_epi32(this->cb)));
// keep only each channel's own bits and force the alpha bits to all ones
src = _mm256_or_si256(_mm256_and_si256(srcB, _mm256_set1_epi32(BlendBlit::kBModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), src);
src = _mm256_or_si256(_mm256_and_si256(srcR, _mm256_set1_epi32(BlendBlit::kRModMask)), src);
} else {
// Without colour modulation red and blue are processed together in one
// register (both shifted down by kBModShift), green separately.
__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
// (dst * (255 - alpha)) >> 8
dstRB = _mm256_srli_epi32(_mm256_mullo_epi32(dstRB, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
dstG = _mm256_srli_epi32(_mm256_mullo_epi16(dstG, _mm256_sub_epi32(_mm256_set1_epi32(255), ina)), 8);
// + (src * alpha) >> 8, then moved back to the channel positions
srcRB = _mm256_slli_epi32(_mm256_add_epi32(dstRB, _mm256_srli_epi32(_mm256_mullo_epi32(srcRB, ina), 8)), BlendBlit::kBModShift);
srcG = _mm256_slli_epi32(_mm256_add_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
// keep only each channel's own bits and force the alpha bits to all ones
src = _mm256_or_si256(_mm256_and_si256(srcG, _mm256_set1_epi32(BlendBlit::kGModMask)), _mm256_set1_epi32(BlendBlit::kAModMask));
src = _mm256_or_si256(_mm256_and_si256(srcRB, _mm256_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
}
// per lane: transparent -> original destination, otherwise -> blended pixel
dst = _mm256_and_si256(alphaMask, dst);
src = _mm256_andnot_si256(alphaMask, src);
return _mm256_or_si256(dst, src);
}
};
// AVX2 version of the multiply blend: adds a simd() method that processes
// eight 32-bit pixels at once on top of what the base-class functor provides.
template<bool rgbmod, bool alphamod>
struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
public:
constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
// Multiplies eight destination pixels by eight (alpha-weighted, optionally
// colour-modulated) source pixels and merges the result with the destination
// according to alphaMask.
inline __m256i simd(__m256i src, __m256i dst) const {
__m256i ina, alphaMask;
if (alphamod) {
// effective alpha = (source alpha * ca) >> 8; lanes where it is zero
// keep the whole destination pixel
ina = _mm256_srli_epi32(_mm256_mullo_epi16(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
alphaMask = _mm256_cmpeq_epi32(ina, _mm256_setzero_si256());
} else {
// Here the mask is a constant covering only the alpha bits, so the final
// merge keeps the destination's alpha bits and takes all other bits from
// the computed value.
ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
alphaMask = _mm256_set1_epi32(BlendBlit::kAModMask);
}
if (rgbmod) {
// isolate every channel as a plain number
__m256i srcB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i srcR = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m256i dstB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i dstR = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// dst * ((src * modulation colour * alpha) >> 16), shifted so that the
// upper byte of the product lands on the channel's bits, then masked
srcB = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstB, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcB, _mm256_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm256_set1_epi32(BlendBlit::kBModMask));
srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstG, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcG, _mm256_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm256_set1_epi32(BlendBlit::kGModMask));
srcR = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi32(dstR, _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi16(srcR, _mm256_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm256_set1_epi32(BlendBlit::kRModMask));
// result = source alpha bits + the three computed channels
src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
src = _mm256_or_si256(src, _mm256_or_si256(srcB, _mm256_or_si256(srcG, srcR)));
} else {
// Without colour modulation red and blue share one register (both
// shifted down by kBModShift); green is handled separately.
constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
__m256i dstRB = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i srcRB = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m256i dstG = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m256i srcG = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
// dst * ((src * alpha) >> 8), masked down to the channel bits
srcG = _mm256_and_si256(_mm256_slli_epi32(_mm256_mullo_epi16(dstG, _mm256_srli_epi32(_mm256_mullo_epi16(srcG, ina), 8)), 8), _mm256_set1_epi32(BlendBlit::kGModMask));
srcRB = _mm256_and_si256(_mm256_mullo_epi16(dstRB, _mm256_srli_epi32(_mm256_and_si256(_mm256_mullo_epi32(srcRB, ina), _mm256_set1_epi32(rbMask)), 8)), _mm256_set1_epi32(rbMask));
// result = source alpha bits + the computed channels
src = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));
src = _mm256_or_si256(src, _mm256_or_si256(srcRB, srcG));
}
// bits set in alphaMask come from the destination, all others from the result
dst = _mm256_and_si256(alphaMask, dst);
src = _mm256_andnot_si256(alphaMask, src);
return _mm256_or_si256(dst, src);
}
};
// AVX2 version of the opaque blend: eight source pixels simply replace the
// destination, with their alpha bits forced to all ones.
template<bool rgbmod, bool alphamod>
struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
public:
	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}

	// 'dst' is accepted for interface symmetry with the other blends; it is not read.
	inline __m256i simd(__m256i src, __m256i dst) const {
		const __m256i fullAlpha = _mm256_set1_epi32(BlendBlit::kAModMask);
		return _mm256_or_si256(src, fullAlpha);
	}
};
// AVX2 version of the binary (on/off) alpha blend for eight pixels at once:
// a source pixel whose alpha bits are all zero leaves the destination pixel
// untouched, any other source pixel replaces it with its alpha bits forced to
// all ones.
template<bool rgbmod, bool alphamod>
struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
public:
	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}

	inline __m256i simd(__m256i src, __m256i dst) const {
		const __m256i alphaBits = _mm256_set1_epi32(BlendBlit::kAModMask);

		// all-ones in every lane whose source pixel is fully transparent
		const __m256i transparent = _mm256_cmpeq_epi32(_mm256_and_si256(src, alphaBits), _mm256_setzero_si256());

		const __m256i keptDst = _mm256_and_si256(dst, transparent);
		const __m256i opaqueSrc = _mm256_andnot_si256(transparent, _mm256_or_si256(src, alphaBits));

		return _mm256_or_si256(opaqueSrc, keptDst);
	}
};
// AVX2 version of the additive blend: adds a simd() method that processes
// eight 32-bit pixels at once on top of what the base-class functor provides.
template<bool rgbmod, bool alphamod>
struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
public:
	constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}

	// Adds eight (alpha-weighted, optionally colour-modulated) source pixels
	// to eight destination pixels. The result carries the destination's alpha
	// bits; lanes whose effective source alpha is zero return the destination
	// pixel unchanged.
	//
	// NOTE(review): in the rgbmod and alphamod branches some destination
	// channels are shifted down before being added and the sum is then masked
	// with the channel's in-place mask; whether that matches the scalar
	// implementation depends on the kXModShift / kXModMask values, which are
	// defined elsewhere - verify against BlendBlitImpl_Base::AdditiveBlend.
	inline __m256i simd(__m256i src, __m256i dst) const {
		// effective alpha per lane, scaled by ca / 256 when alpha modulation is on
		__m256i ina;
		if (alphamod)
			ina = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask)), _mm256_set1_epi32(this->ca)), 8);
		else
			ina = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kAModMask));

		// all-ones in every lane that is fully transparent
		__m256i alphaMask = _mm256_cmpeq_epi32(ina, _mm256_set1_epi32(0));

		if (rgbmod) {
			__m256i srcb = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kBModMask));
			__m256i srcg = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
			__m256i srcr = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
			__m256i dstb = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kBModMask));
			__m256i dstg = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
			__m256i dstr = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);

			// dst + src * modulation colour * alpha, per channel
			srcb = _mm256_and_si256(_mm256_add_epi32(dstb, _mm256_srli_epi32(_mm256_mullo_epi32(srcb, _mm256_mullo_epi32(_mm256_set1_epi32(this->cb), ina)), 16)), _mm256_set1_epi32(BlendBlit::kBModMask));
			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_mullo_epi32(srcg, _mm256_mullo_epi32(_mm256_set1_epi32(this->cg), ina))), _mm256_set1_epi32(BlendBlit::kGModMask));
			srcr = _mm256_and_si256(_mm256_add_epi32(dstr, _mm256_srli_epi32(_mm256_mullo_epi32(srcr, _mm256_mullo_epi32(_mm256_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm256_set1_epi32(BlendBlit::kRModMask));

			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
			// Merge all three channels. The original merged 'srcb' twice and
			// dropped the computed red channel.
			src = _mm256_or_si256(src, _mm256_or_si256(srcb, _mm256_or_si256(srcg, srcr)));
		} else if (alphamod) {
			// no colour modulation: red and blue share one register, green is separate
			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);

			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, _mm256_srli_epi32(_mm256_mullo_epi32(srcg, ina), 8)), _mm256_set1_epi32(BlendBlit::kGModMask));
			srcrb = _mm256_and_si256(_mm256_add_epi32(dstrb, _mm256_mullo_epi32(srcrb, ina)), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));

			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
			src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
		} else {
			// neither colour nor alpha modulation: plain per-channel addition
			__m256i srcg = _mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kGModMask));
			__m256i srcrb = _mm256_srli_epi32(_mm256_and_si256(src, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
			__m256i dstg = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kGModMask));
			__m256i dstrb = _mm256_srli_epi32(_mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);

			srcg = _mm256_and_si256(_mm256_add_epi32(dstg, srcg), _mm256_set1_epi32(BlendBlit::kGModMask));
			srcrb = _mm256_and_si256(_mm256_slli_epi32(_mm256_add_epi32(dstrb, srcrb), 8), _mm256_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));

			src = _mm256_and_si256(dst, _mm256_set1_epi32(BlendBlit::kAModMask));
			src = _mm256_or_si256(src, _mm256_or_si256(srcrb, srcg));
		}

		// per lane: transparent -> original destination, otherwise -> summed pixel
		dst = _mm256_and_si256(alphaMask, dst);
		src = _mm256_andnot_si256(alphaMask, src);
		return _mm256_or_si256(dst, src);
	}
};
// AVX2 version of the subtractive blend: adds a simd() method that processes
// eight 32-bit pixels at once on top of what the base-class functor provides.
template<bool rgbmod, bool alphamod>
struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
public:
	constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}

	// For every colour channel computes
	//   max(dst - ((src * modulation colour * dst * alpha) >> 24), 0)
	// and returns the three channels with the alpha bits forced to all ones.
	// The modulation colour is applied regardless of the rgbmod flag.
	inline __m256i simd(__m256i src, __m256i dst) const {
		const __m256i zero = _mm256_set1_epi32(0);
		const __m256i maskA = _mm256_set1_epi32(BlendBlit::kAModMask);
		const __m256i maskB = _mm256_set1_epi32(BlendBlit::kBModMask);
		const __m256i maskG = _mm256_set1_epi32(BlendBlit::kGModMask);
		const __m256i maskR = _mm256_set1_epi32(BlendBlit::kRModMask);

		const __m256i ina = _mm256_and_si256(src, maskA);

		// every channel isolated as a plain number
		const __m256i inB = _mm256_srli_epi32(_mm256_and_si256(src, maskB), BlendBlit::kBModShift);
		const __m256i inG = _mm256_srli_epi32(_mm256_and_si256(src, maskG), BlendBlit::kGModShift);
		const __m256i inR = _mm256_srli_epi32(_mm256_and_si256(src, maskR), BlendBlit::kRModShift);
		const __m256i outB = _mm256_srli_epi32(_mm256_and_si256(dst, maskB), BlendBlit::kBModShift);
		const __m256i outG = _mm256_srli_epi32(_mm256_and_si256(dst, maskG), BlendBlit::kGModShift);
		const __m256i outR = _mm256_srli_epi32(_mm256_and_si256(dst, maskR), BlendBlit::kRModShift);

		// amount taken away from each destination channel
		const __m256i lossB = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(inB, _mm256_set1_epi32(this->cb)), _mm256_mullo_epi32(outB, ina)), 24);
		const __m256i lossG = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(inG, _mm256_set1_epi32(this->cg)), _mm256_mullo_epi32(outG, ina)), 24);
		const __m256i lossR = _mm256_srli_epi32(_mm256_mullo_epi32(_mm256_mullo_epi32(inR, _mm256_set1_epi32(this->cr)), _mm256_mullo_epi32(outR, ina)), 24);

		// clamp at zero, move back to the channel position and mask
		const __m256i newB = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(outB, lossB), zero), BlendBlit::kBModShift), maskB);
		const __m256i newG = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(outG, lossG), zero), BlendBlit::kGModShift), maskG);
		const __m256i newR = _mm256_and_si256(_mm256_slli_epi32(_mm256_max_epi16(_mm256_sub_epi32(outR, lossR), zero), BlendBlit::kRModShift), maskR);

		return _mm256_or_si256(maskA, _mm256_or_si256(newB, _mm256_or_si256(newG, newR)));
	}
};
public:
// Blits args.height rows of args.width 32-bit pixels using the given blend
// functor: eight pixels per iteration through PixelFunc::simd(), the remaining
// (width % 8) pixels of each row one by one through PixelFunc::normal().
//
// Template arguments: PixelFunc is the blend functor template, doscale selects
// the scaled source addressing, rgbmod / alphamod are forwarded to PixelFunc.
//
// Note that args is modified: args.outo is advanced by args.dstPitch per row
// and, for unscaled blits, args.ino by args.inoStep per row. For unscaled
// horizontally flipped blits args.ino is additionally moved back by 7 pixels
// up front and not restored.
template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
static void blitInnerLoop(BlendBlit::Args &args) {
const bool loaddst = true; // TODO: Only set this when necessary
const byte *in;
byte *out;
const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
int scaleXCtr, scaleYCtr = args.scaleYoff;
const byte *inBase;
// When flipping horizontally, start 7 pixels earlier so that each 32-byte
// load ends at the pixel 'in' would otherwise point to; the eight pixels
// are reversed after loading (presumably args.inStep is negative here).
if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 7;
for (uint32 i = 0; i < args.height; i++) {
if (doscale) {
// scaled: pick the source row from the vertical scale counter
inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
scaleXCtr = args.scaleXoff;
} else {
in = args.ino;
}
out = args.outo;
uint32 j = 0;
// vector part: eight pixels per iteration
for (; j + 8 <= args.width; j += 8) {
__m256i dstPixels, srcPixels;
if (loaddst) dstPixels = _mm256_loadu_si256((const __m256i *)out);
if (!doscale) {
srcPixels = _mm256_loadu_si256((const __m256i *)in);
} else {
// scaled: gather the eight source pixels individually, each one taken
// at its own horizontal scale counter position
srcPixels = _mm256_setr_epi32(
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 4) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 5) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 6) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 7) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
);
scaleXCtr += args.scaleX * 8;
}
if (!doscale && (args.flipping & FLIP_H)) {
// reverse the order of the eight pixels: first within each 128-bit
// half, then swap the two halves
srcPixels = _mm256_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
srcPixels = _mm256_permute2x128_si256(srcPixels, srcPixels, 0x01);
}
{
const __m256i res = pixelFunc.simd(srcPixels, dstPixels);
_mm256_storeu_si256((__m256i *)out, res);
}
if (!doscale) in += (ptrdiff_t)args.inStep * 8;
out += 4ULL * 8;
}
// undo the 7-pixel offset so that the scalar tail addresses single pixels
if (!doscale && (args.flipping & FLIP_H)) in += 4 * 7;
// scalar tail: the remaining pixels of the row, one at a time
for (; j < args.width; j++) {
if (doscale) {
in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
}
pixelFunc.normal(in, out);
if (doscale)
scaleXCtr += args.scaleX;
else
in += args.inStep;
out += 4;
}
// advance to the next row
if (doscale)
scaleYCtr += args.scaleY;
else
args.ino += args.inoStep;
args.outo += args.dstPitch;
}
}
}; // end of class BlendBlitImpl_AVX2
// AVX2 entry point: forwards to the generic blitT dispatcher, instantiated
// with the AVX2 implementation class.
void BlendBlit::blitAVX2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
blitT<BlendBlitImpl_AVX2>(args, blendMode, alphaType);
}
} // End of namespace Graphics
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif

138
graphics/blit/blit-fast.cpp Normal file
View File

@@ -0,0 +1,138 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "graphics/blit.h"
#include "graphics/pixelformat.h"
#include "common/endian.h"
#include "common/system.h"
namespace Graphics {
namespace {
// Copies a w x h rectangle of 32-bit pixels from src to dst, optionally
// byte-swapping each pixel (bswap) and then rotating it right by `rotate` bits.
// Both transformations are template parameters, so every combination gets its
// own specialised loop: larger code, but no per-pixel decisions at run time.
//
// dstPitch / srcPitch are the distances in bytes between the starts of
// consecutive rows in the destination and source buffers.
template<bool bswap, int rotate>
static void swapBlit(byte *dst, const byte *src,
					 const uint dstPitch, const uint srcPitch,
					 const uint w, const uint h) {
	// Bytes left over at the end of each row once w pixels have been consumed.
	const uint srcSkip = (srcPitch - w * sizeof(uint32));
	const uint dstSkip = (dstPitch - w * sizeof(uint32));

	for (uint row = 0; row < h; ++row, src += srcSkip, dst += dstSkip) {
		for (uint col = 0; col < w; ++col, src += sizeof(uint32), dst += sizeof(uint32)) {
			uint32 pixel = *(const uint32 *)src;

			if (bswap)
				pixel = SWAP_BYTES_32(pixel);
			if (rotate != 0)
				pixel = ROTATE_RIGHT_32(pixel, rotate);

			*(uint32 *)dst = pixel;
		}
	}
}
} // End of anonymous namespace
// TODO: Add fast 24<->32bpp conversion
// TODO: Add fast 16<->16bpp conversion
// One entry of a fast-blit lookup table: a conversion routine together with the
// exact source and destination pixel formats it converts between.
struct FastBlitLookup {
// Routine performing the srcFmt -> dstFmt conversion.
FastBlitFunc func;
// Formats that must both match for func to be selected.
Graphics::PixelFormat srcFmt, dstFmt;
};
// 32bpp -> 32bpp conversions that reduce to a byte swap and/or a rotation of the
// whole 32-bit pixel. Each entry pairs a swapBlit instantiation with the
// (source, destination) formats it handles; the table is searched linearly by
// getFastBlitFunc. The "rotate left" entries use a right rotation by 24 bits,
// which is the same as rotating a 32-bit value left by 8.
static const FastBlitLookup fastBlitFuncs_4to4[] = {
// 32-bit byteswap
{ swapBlit<true, 0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0) }, // ABGR8888 -> RGBA8888
{ swapBlit<true, 0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24) }, // RGBA8888 -> ABGR8888
{ swapBlit<true, 0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0) }, // ARGB8888 -> BGRA8888
{ swapBlit<true, 0>, Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24) }, // BGRA8888 -> ARGB8888
// 32-bit rotate right
{ swapBlit<false, 8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24) }, // RGBA8888 -> ARGB8888
{ swapBlit<false, 8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24) }, // BGRA8888 -> ABGR8888
// 32-bit rotate left
{ swapBlit<false, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0) }, // ABGR8888 -> BGRA8888
{ swapBlit<false, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0) }, // ARGB8888 -> RGBA8888
// 32-bit byteswap and rotate right
{ swapBlit<true, 8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24) }, // ABGR8888 -> ARGB8888
{ swapBlit<true, 8>, Graphics::PixelFormat(4, 8, 8, 8, 8, 16, 8, 0, 24), Graphics::PixelFormat(4, 8, 8, 8, 8, 0, 8, 16, 24) }, // ARGB8888 -> ABGR8888
// 32-bit byteswap and rotate left
{ swapBlit<true, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0) }, // RGBA8888 -> BGRA8888
{ swapBlit<true, 24>, Graphics::PixelFormat(4, 8, 8, 8, 8, 8, 16, 24, 0), Graphics::PixelFormat(4, 8, 8, 8, 8, 24, 16, 8, 0) } // BGRA8888 -> RGBA8888
};
#ifdef SCUMMVM_NEON
// Conversions implemented with NEON intrinsics. getFastBlitFunc only consults
// this table for 2-byte -> 2-byte conversions and only when the system reports
// the kFeatureCpuNEON feature.
static const FastBlitLookup fastBlitFuncs_NEON[] = {
// 16-bit with NEON
{ fastBlitNEON_XRGB1555_RGB565, Graphics::PixelFormat(2, 5, 5, 5, 0, 10, 5, 0, 0), Graphics::PixelFormat(2, 5, 6, 5, 0, 11, 5, 0, 0) }, // XRGB1555 -> RGB565
};
#endif
// Looks up a specialised conversion routine for blitting from srcFmt to dstFmt.
//
// Returns the matching routine, or nullptr when no fast path exists for this
// pair of formats. Only 4-byte -> 4-byte conversions (and, on NEON builds with a
// NEON-capable CPU, 2-byte -> 2-byte conversions) are considered.
FastBlitFunc getFastBlitFunc(const PixelFormat &dstFmt, const PixelFormat &srcFmt) {
	const uint dstBpp = dstFmt.bytesPerPixel;
	const uint srcBpp = srcFmt.bytesPerPixel;

	// Pick the candidate table first; the two conditions are mutually exclusive.
	const FastBlitLookup *candidates = nullptr;
	size_t count = 0;

	if (srcBpp == 4 && dstBpp == 4) {
		candidates = fastBlitFuncs_4to4;
		count = ARRAYSIZE(fastBlitFuncs_4to4);
	}
#ifdef SCUMMVM_NEON
	if (srcBpp == 2 && dstBpp == 2 && g_system->hasFeature(OSystem::kFeatureCpuNEON)) {
		candidates = fastBlitFuncs_NEON;
		count = ARRAYSIZE(fastBlitFuncs_NEON);
	}
#endif

	// Linear scan: both formats have to match the entry exactly.
	for (size_t idx = 0; idx < count; idx++) {
		const FastBlitLookup &entry = candidates[idx];
		if (srcFmt != entry.srcFmt || dstFmt != entry.dstFmt)
			continue;
		return entry.func;
	}

	return nullptr;
}
} // End of namespace Graphics

View File

@@ -0,0 +1,99 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "graphics/blit/blit-alpha.h"
#include "graphics/pixelformat.h"
namespace Graphics {
class BlendBlitImpl_Default : public BlendBlitImpl_Base {
friend class BlendBlit;
public:
template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
static inline void blitInnerLoop(BlendBlit::Args &args) {
const byte *in;
byte *out;
const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
int scaleXCtr, scaleYCtr = args.scaleYoff;
const byte *inBase;
for (uint32 i = 0; i < args.height; i++) {
if (doscale) {
inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
scaleXCtr = args.scaleXoff;
} else {
in = args.ino;
}
out = args.outo;
for (uint32 j = 0; j < args.width; j++) {
if (doscale) {
in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
}
pixelFunc.normal(in, out);
if (doscale)
scaleXCtr += args.scaleX;
else
in += args.inStep;
out += 4;
}
if (doscale)
scaleYCtr += args.scaleY;
else
args.ino += args.inoStep;
args.outo += args.dstPitch;
}
}
template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool rgbmod, bool alphamod>
static inline void fillInnerLoop(BlendBlit::Args &args) {
byte *out;
const PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
for (uint32 i = 0; i < args.height; i++) {
out = args.outo;
for (uint32 j = 0; j < args.width; j++) {
pixelFunc.fill(out);
out += 4;
}
args.outo += args.dstPitch;
}
}
}; // end of class BlendBlitImpl_Default
// Portable blit entry point: forwards the arguments unchanged to the blitT
// template instantiated with the scalar BlendBlitImpl_Default implementation.
void BlendBlit::blitGeneric(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
blitT<BlendBlitImpl_Default>(args, blendMode, alphaType);
}
// Portable fill entry point: forwards the arguments unchanged to the fillT
// template instantiated with the scalar BlendBlitImpl_Default implementation.
void BlendBlit::fillGeneric(Args &args, const TSpriteBlendMode &blendMode) {
fillT<BlendBlitImpl_Default>(args, blendMode);
}
} // End of namespace Graphics

391
graphics/blit/blit-neon.cpp Normal file
View File

@@ -0,0 +1,391 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "common/scummsys.h"
#ifdef SCUMMVM_NEON
#include "graphics/blit/blit-alpha.h"
#include "graphics/pixelformat.h"
#include <arm_neon.h>
#if !defined(__aarch64__) && !defined(__ARM_NEON)
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("neon"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#endif // !defined(__aarch64__) && !defined(__ARM_NEON)
namespace Graphics {
// NEON implementation of the blending inner loop. Each nested blend struct
// derives from its BlendBlitImpl_Base counterpart and adds a simd() member that
// processes four 32-bit pixels held in one uint32x4_t; blitInnerLoop uses simd()
// for groups of four pixels and the inherited scalar normal() for the rest.
class BlendBlitImpl_NEON : public BlendBlitImpl_Base {
friend class BlendBlit;
// Alpha blending of four pixels. ina is the source alpha field (multiplied by
// the modulation alpha ca and shifted right by 8 when alphamod is set). Lanes
// whose ina is zero return dst unchanged; all other lanes return the blended
// colour with every bit of kAModMask set.
template<bool rgbmod, bool alphamod>
struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
public:
constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
uint32x4_t ina;
if (alphamod)
ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
else
ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
// All-ones in lanes where the effective alpha is zero.
uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
if (rgbmod) {
// Each channel is handled separately so the source can also be scaled by cr/cg/cb.
uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), 16);
uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), 16);
uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), 8);
uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), 8);
uint32x4_t dstB = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
uint32x4_t srcB = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
dstR = vshrq_n_u32(vmulq_u32(dstR, vsubq_u32(vmovq_n_u32(255), ina)), 8);
dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
dstB = vshrq_n_u32(vmulq_u32(dstB, vsubq_u32(vmovq_n_u32(255), ina)), 8);
srcR = vaddq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, ina), vmovq_n_u32(this->cr)), 16));
srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, ina), vmovq_n_u32(this->cg)), 16));
srcB = vaddq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, ina), vmovq_n_u32(this->cb)), 16));
src = vorrq_u32(vandq_u32(srcB, vmovq_n_u32(BlendBlit::kBModMask)), vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(vandq_u32(vshlq_n_u32(srcG, 8), vmovq_n_u32(BlendBlit::kGModMask)), src);
src = vorrq_u32(vandq_u32(vshlq_n_u32(srcR, 16), vmovq_n_u32(BlendBlit::kRModMask)), src);
} else {
// Without colour modulation, red and blue are blended together in one
// multiply and green in another.
uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), 8);
uint32x4_t dstG = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
uint32x4_t srcG = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
dstRB = vmulq_u32(dstRB, vsubq_u32(vmovq_n_u32(255), ina));
dstG = vshrq_n_u32(vmulq_u32(dstG, vsubq_u32(vmovq_n_u32(255), ina)), 8);
srcRB = vaddq_u32(dstRB, vmulq_u32(srcRB, ina));
srcG = vaddq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8));
src = vorrq_u32(vandq_u32(srcG, vmovq_n_u32(BlendBlit::kGModMask)), vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(vandq_u32(srcRB, vmovq_n_u32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
}
// Per-lane select: dst where alpha was zero, blended result elsewhere.
dst = vandq_u32(alphaMask, dst);
src = vandq_u32(vmvnq_u32(alphaMask), src);
return vorrq_u32(dst, src);
}
};
// Multiplicative blending of four pixels: each destination channel is multiplied
// by the alpha-scaled (and, with rgbmod, colour-scaled) source channel.
template<bool rgbmod, bool alphamod>
struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
public:
constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
uint32x4_t ina, alphaMask;
if (alphamod) {
ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
} else {
ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
// Constant mask: the final select below then keeps dst's alpha bits and takes
// every other bit from the computed value.
alphaMask = vdupq_n_u32(BlendBlit::kAModMask);
}
if (rgbmod) {
uint32x4_t srcB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t srcR = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
uint32x4_t dstB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t dstR = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
srcB = vandq_u32(vshlq_n_u32(vmulq_u32(dstB, vshrq_n_u32(vmulq_u32(vmulq_u32(srcB, vmovq_n_u32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), vmovq_n_u32(BlendBlit::kBModMask));
srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(vmulq_u32(srcG, vmovq_n_u32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), vmovq_n_u32(BlendBlit::kGModMask));
srcR = vandq_u32(vshlq_n_u32(vmulq_u32(dstR, vshrq_n_u32(vmulq_u32(vmulq_u32(srcR, vmovq_n_u32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), vmovq_n_u32(BlendBlit::kRModMask));
src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(src, vorrq_u32(srcB, vorrq_u32(srcG, srcR)));
} else {
constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
uint32x4_t srcG = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t srcRB = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(rbMask)), BlendBlit::kBModShift);
uint32x4_t dstG = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t dstRB = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(rbMask)), BlendBlit::kBModShift);
srcG = vandq_u32(vshlq_n_u32(vmulq_u32(dstG, vshrq_n_u32(vmulq_u32(srcG, ina), 8)), 8), vmovq_n_u32(BlendBlit::kGModMask));
// Red and blue are multiplied as two independent 16-bit lanes per pixel.
srcRB = vandq_u32(vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(dstRB), vreinterpretq_u16_u32(vshrq_n_u32(vandq_u32(vmulq_u32(srcRB, ina), vmovq_n_u32(rbMask)), 8)))), vmovq_n_u32(rbMask));
src = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(src, vorrq_u32(srcRB, srcG));
}
dst = vandq_u32(alphaMask, dst);
src = vandq_u32(vmvnq_u32(alphaMask), src);
return vorrq_u32(dst, src);
}
};
// Opaque copy of four pixels: dst is ignored and the source is returned with
// every bit of kAModMask set.
template<bool rgbmod, bool alphamod>
struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
public:
constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
return vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
}
};
// Binary (on/off) transparency for four pixels: lanes whose source alpha field
// is zero keep dst, every other lane takes src with all kAModMask bits set.
template<bool rgbmod, bool alphamod>
struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
public:
constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
uint32x4_t alphaMask = vceqq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmovq_n_u32(0));
dst = vandq_u32(dst, alphaMask);
src = vandq_u32(vorrq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vmvnq_u32(alphaMask));
return vorrq_u32(dst, src);
}
};
// Additive blending of four pixels. There are three code paths: with colour
// modulation (rgbmod), with alpha modulation only, and with neither. In each of
// them the result takes its alpha bits from dst, and lanes whose effective alpha
// is zero return dst unchanged.
template<bool rgbmod, bool alphamod>
struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
public:
constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
uint32x4_t ina;
if (alphamod)
ina = vshrq_n_u32(vmulq_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask)), vdupq_n_u32(this->ca)), 8);
else
ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
uint32x4_t alphaMask = vceqq_u32(ina, vmovq_n_u32(0));
if (rgbmod) {
uint32x4_t srcb = vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask));
uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
uint32x4_t dstb = vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask));
uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// NOTE(review): the three channels below use different shift patterns (blue is
// shifted right by 16, green is not shifted, red by kRModShift - 16) — confirm
// against the scalar AdditiveBlend in BlendBlitImpl_Base that this is intended.
srcb = vandq_u32(vaddq_u32(dstb, vshrq_n_u32(vmulq_u32(srcb, vmulq_u32(vmovq_n_u32(this->cb), ina)), 16)), vmovq_n_u32(BlendBlit::kBModMask));
srcg = vandq_u32(vaddq_u32(dstg, vmulq_u32(srcg, vmulq_u32(vmovq_n_u32(this->cg), ina))), vmovq_n_u32(BlendBlit::kGModMask));
srcr = vandq_u32(vaddq_u32(dstr, vshrq_n_u32(vmulq_u32(srcr, vmulq_u32(vmovq_n_u32(this->cr), ina)), BlendBlit::kRModShift - 16)), vmovq_n_u32(BlendBlit::kRModMask));
src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(src, vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
} else if (alphamod) {
uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
srcg = vandq_u32(vaddq_u32(dstg, vshrq_n_u32(vmulq_u32(srcg, ina), 8)), vmovq_n_u32(BlendBlit::kGModMask));
srcrb = vandq_u32(vaddq_u32(dstrb, vmulq_u32(srcrb, ina)), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
} else {
// No modulation at all: plain per-channel addition of source and destination.
uint32x4_t srcg = vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask));
uint32x4_t srcrb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t dstg = vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask));
uint32x4_t dstrb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
srcg = vandq_u32(vaddq_u32(dstg, srcg), vmovq_n_u32(BlendBlit::kGModMask));
srcrb = vandq_u32(vshlq_n_u32(vaddq_u32(dstrb, srcrb), 8), vmovq_n_u32(BlendBlit::kRModMask | BlendBlit::kBModMask));
src = vandq_u32(dst, vmovq_n_u32(BlendBlit::kAModMask));
src = vorrq_u32(src, vorrq_u32(srcrb, srcg));
}
dst = vandq_u32(alphaMask, dst);
src = vandq_u32(vmvnq_u32(alphaMask), src);
return vorrq_u32(dst, src);
}
};
// Subtractive blending of four pixels. For each colour channel the result is
// max(dst - ((src * c * dst * ina) >> 24), 0), where c is the matching
// modulation component (cb/cg/cr). The returned pixels have every bit of
// kAModMask set.
template<bool rgbmod, bool alphamod>
struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
public:
constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
inline uint32x4_t simd(uint32x4_t src, uint32x4_t dst) const {
uint32x4_t ina = vandq_u32(src, vmovq_n_u32(BlendBlit::kAModMask));
uint32x4_t srcb = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t srcg = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t srcr = vshrq_n_u32(vandq_u32(src, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
uint32x4_t dstb = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
uint32x4_t dstg = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
uint32x4_t dstr = vshrq_n_u32(vandq_u32(dst, vmovq_n_u32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// The subtraction is done on signed lanes so that vmaxq_s32 can clamp negative
// results to zero.
srcb = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstb), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcb, vmovq_n_u32(this->cb)), vmulq_u32(dstb, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kBModShift), vmovq_n_u32(BlendBlit::kBModMask));
srcg = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstg), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcg, vmovq_n_u32(this->cg)), vmulq_u32(dstg, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kGModShift), vmovq_n_u32(BlendBlit::kGModMask));
srcr = vandq_u32(vshlq_n_u32(vreinterpretq_u32_s32(vmaxq_s32(vsubq_s32(vreinterpretq_s32_u32(dstr), vreinterpretq_s32_u32(vshrq_n_u32(vmulq_u32(vmulq_u32(srcr, vmovq_n_u32(this->cr)), vmulq_u32(dstr, ina)), 24))), vmovq_n_s32(0))), BlendBlit::kRModShift), vmovq_n_u32(BlendBlit::kRModMask));
return vorrq_u32(vmovq_n_u32(BlendBlit::kAModMask), vorrq_u32(srcb, vorrq_u32(srcg, srcr)));
}
};
public:
// Blends args.height rows of args.width pixels onto the destination. Pixels are
// processed four at a time through PixelFunc::simd; the remaining (width % 4)
// pixels of each row go through the scalar PixelFunc::normal.
//
// With doscale, source positions come from running counters divided by
// BlendBlit::SCALE_THRESHOLD. Without it, the source is walked by args.inStep
// per pixel and args.ino is advanced by args.inoStep per row. args.outo is
// advanced by args.dstPitch per row in both modes.
template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
static inline void blitInnerLoop(BlendBlit::Args &args) {
const bool loaddst = true; // TODO: Only set this when necessary
const byte *in;
byte *out;
PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
int scaleXCtr, scaleYCtr = args.scaleYoff;
const byte *inBase;
// Horizontal flip without scaling: back the row pointer up by three pixels so
// that each 4-pixel load ends at the current pixel.
// NOTE(review): presumably args.inStep is negative in this mode — confirm against the caller.
if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
for (uint32 i = 0; i < args.height; i++) {
if (doscale) {
inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
scaleXCtr = args.scaleXoff;
} else {
in = args.ino;
}
out = args.outo;
uint32 j = 0;
// Vectorised part: four pixels per iteration.
for (; j + 4 <= args.width; j += 4) {
uint32x4_t dstPixels;
if (loaddst) dstPixels = vld1q_u32((const uint32 *)out);
uint32x4_t srcPixels;
if (!doscale) {
srcPixels = vld1q_u32((const uint32 *)in);
} else {
// Scaling: gather the four source pixels one lane at a time.
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), vmovq_n_u32(0), 0);
scaleXCtr += args.scaleX;
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 1);
scaleXCtr += args.scaleX;
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 2);
scaleXCtr += args.scaleX;
srcPixels = vsetq_lane_u32(*(const uint32 *)(inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep), srcPixels, 3);
scaleXCtr += args.scaleX;
}
if (!doscale && (args.flipping & FLIP_H)) {
// Reverse the order of the four lanes: swap within each 64-bit half, then
// swap the two halves.
srcPixels = vrev64q_u32(srcPixels);
srcPixels = vcombine_u32(vget_high_u32(srcPixels), vget_low_u32(srcPixels));
}
{
const uint32x4_t res = pixelFunc.simd(srcPixels, dstPixels);
vst1q_u32((uint32 *)out, res);
}
if (!doscale) in += args.inStep * 4;
out += 4 * 4;
}
// Undo the three-pixel adjustment so the scalar tail reads single pixels from
// the right position.
if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
// Scalar tail: the pixels that did not fill a whole group of four.
for (; j < args.width; j++) {
if (doscale) {
in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
}
pixelFunc.normal(in, out);
if (doscale)
scaleXCtr += args.scaleX;
else
in += args.inStep;
out += 4;
}
if (doscale)
scaleYCtr += args.scaleY;
else
args.ino += args.inoStep;
args.outo += args.dstPitch;
}
}
}; // end of class BlendBlitImpl_NEON
// NEON entry point for blending blits: forwards the arguments unchanged to the
// blitT template instantiated with BlendBlitImpl_NEON.
void BlendBlit::blitNEON(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
blitT<BlendBlitImpl_NEON>(args, blendMode, alphaType);
}
// Converts a w x h rectangle of 16-bit XRGB1555 pixels into RGB565.
//
// Per pixel the conversion is
//   ((p & 0x7FE0) << 1) | ((p & 0x0200) >> 4) | (p & 0x001F)
// i.e. bits 5..14 move up by one bit, bit 9 is copied into bit 5 of the result,
// and the low five bits are kept as they are.
//
// Rows are processed four pixels at a time with NEON; the remaining (w % 4)
// pixels of each row are converted with scalar code. dstPitch / srcPitch are the
// row strides in bytes.
void fastBlitNEON_XRGB1555_RGB565(byte *dst, const byte *src,
								  const uint dstPitch, const uint srcPitch,
								  const uint w, const uint h) {
	// Bytes to skip at the end of every row once w pixels have been handled.
	const uint srcSkip = (srcPitch - w * 2);
	const uint dstSkip = (dstPitch - w * 2);

	for (uint rowsLeft = h; rowsLeft > 0; --rowsLeft) {
		uint pixelsLeft = w;

		// Vector part: four pixels per iteration.
		while (pixelsLeft >= 4) {
			const uint16x4_t in = vld1_u16((const uint16 *)src);

			const uint16x4_t upper = vshl_n_u16(vand_u16(in, vmov_n_u16(0x7FE0)), 1);
			const uint16x4_t filler = vshr_n_u16(vand_u16(in, vmov_n_u16(0x0200)), 4);
			const uint16x4_t lower = vand_u16(in, vmov_n_u16(0x001F));

			vst1_u16((uint16 *)dst, vorr_u16(vorr_u16(upper, filler), lower));

			src += 4 * 2;
			dst += 4 * 2;
			pixelsLeft -= 4;
		}

		// Scalar part: whatever did not fill a whole group of four.
		while (pixelsLeft > 0) {
			const uint16 *inPx = (const uint16 *)src;
			uint16 *outPx = (uint16 *)dst;

			*outPx = ((((*inPx) & 0x7FE0) << 1) | (((*inPx) & 0x0200) >> 4) | ((*inPx) & 0x001F));

			src += 2;
			dst += 2;
			--pixelsLeft;
		}

		src += srcSkip;
		dst += dstSkip;
	}
}
} // end of namespace Graphics
#if !defined(__aarch64__) && !defined(__ARM_NEON)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif // !defined(__aarch64__) && !defined(__ARM_NEON)
#endif // SCUMMVM_NEON

View File

@@ -0,0 +1,552 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*
* The bottom part of this file is adapted from SDL_rotozoom.c. The
* relevant copyright notice for those specific functions can be found at the
* top of that section.
*
*/
#include "graphics/blit.h"
#include "graphics/pixelformat.h"
#include "graphics/transform_struct.h"
#include "common/endian.h"
#include "common/rect.h"
#include "math/utils.h"
namespace Graphics {
namespace {
// Scales an image vertically only, using nearest-neighbour row selection: every
// destination row is a straight copy of w * bytesPerPixel bytes from the chosen
// source row. When FLIP_V is set in flip, the destination is filled from its
// last row upwards.
static void scaleVertical(byte *dst, const byte *src,
						  const uint dstPitch, const uint srcPitch,
						  const uint w, const uint dstH, const uint srcH,
						  const byte flip, const uint bytesPerPixel) {
	// Source row advance per destination row, in 16.16 fixed point.
	const uint32 rowStep = (srcH << 16) / dstH;

	int dstAdvance = static_cast<int>(dstPitch);
	if (flip & FLIP_V) {
		// Start at the last destination row and walk backwards.
		dst += (dstH - 1) * dstPitch;
		dstAdvance = -static_cast<int>(dstPitch);
	}

	uint32 srcPos = 0;
	for (uint32 row = 0; row < dstH; row++) {
		const byte *srcRow = src + ((srcPos >> 16) * srcPitch);
		memcpy(dst, srcRow, w * bytesPerPixel);

		dst += dstAdvance;
		srcPos += rowStep;
	}
}
// Nearest-neighbour scaling of a srcW x srcH image to dstW x dstH, with optional
// horizontal (FLIP_H) and vertical (FLIP_V) mirroring.
//
// Color is the integer type used to move one pixel and Size is the pixel size in
// bytes. When Size differs from sizeof(Color) (the 3-byte case), pixels are
// copied with memcpy instead of a typed load/store.
//
// dstPitch / srcPitch are the row strides in bytes. Source coordinates are
// tracked in 16.16 fixed point.
template <typename Color, int Size>
static void scaleNN(byte *dst, const byte *src,
					const uint dstPitch, const uint srcPitch,
					const uint dstW, const uint dstH,
					const uint srcW, const uint srcH,
					const byte flip) {
	const bool flipx = flip & FLIP_H;
	const bool flipy = flip & FLIP_V;

	// 16.16 fixed point
	const uint32 srcIncX = (srcW << 16) / dstW;
	const uint32 srcIncY = (srcH << 16) / dstH;

	// Mirroring is implemented by writing the destination backwards.
	const int dstIncX = (flipx ? -1 : 1);
	const int dstIncY = (flipy ? -static_cast<int>(dstPitch) : static_cast<int>(dstPitch));

	if (flipx) {
		dst += (dstW - 1) * Size;
	}

	if (flipy) {
		dst += (dstH - 1) * dstPitch;
	}

	for (uint32 y = 0, yoff = 0; y < dstH; y++, yoff += srcIncY) {
		const byte *srcP = src + ((yoff >> 16) * srcPitch);
		byte *dst1 = dst;

		for (uint32 x = 0, xoff = 0; x < dstW; x++, xoff += srcIncX) {
			const byte *src1 = srcP + ((xoff >> 16) * Size);

			if (Size == sizeof(Color)) {
				*(Color *)dst1 = *(const Color *)src1;
			} else {
				// Copy the sampled pixel (src1). Copying from `src` here would
				// replicate the first pixel of the image into every output pixel.
				memcpy(dst1, src1, Size);
			}

			dst1 += dstIncX * Size;
		}

		dst += dstIncY;
	}
}
} // End of anonymous namespace
// Scales (and optionally mirrors) a srcW x srcH image into a dstW x dstH
// destination using nearest-neighbour sampling.
//
// When the width is unchanged and no horizontal flip is requested, whole rows
// are copied instead: a plain copyBlit if the height is also unchanged and no
// vertical flip is requested, scaleVertical otherwise.
//
// Returns true when the blit was performed, false when fmt.bytesPerPixel is not
// one of 1, 2, 3 or 4.
bool scaleBlit(byte *dst, const byte *src,
			   const uint dstPitch, const uint srcPitch,
			   const uint dstW, const uint dstH,
			   const uint srcW, const uint srcH,
			   const Graphics::PixelFormat &fmt,
			   const byte flip) {
	// This should be OK since int16 is used in Graphics::Surface.
	assert(srcW <= 65535);
	assert(srcH <= 65535);

	const bool rowsCanBeCopied = (dstW == srcW) && !(flip & FLIP_H);
	if (rowsCanBeCopied) {
		const bool imageUnchanged = (dstH == srcH) && !(flip & FLIP_V);
		if (imageUnchanged)
			copyBlit(dst, src, dstPitch, srcPitch, dstW, dstH, fmt.bytesPerPixel);
		else
			scaleVertical(dst, src, dstPitch, srcPitch, dstW, dstH, srcH, flip, fmt.bytesPerPixel);
		return true;
	}

	// General case: pick the scaler matching the pixel size.
	const uint bpp = fmt.bytesPerPixel;
	if (bpp == 1)
		scaleNN<uint8, 1>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, flip);
	else if (bpp == 2)
		scaleNN<uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, flip);
	else if (bpp == 3)
		scaleNN<uint8, 3>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, flip);
	else if (bpp == 4)
		scaleNN<uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, flip);
	else
		return false;

	return true;
}
/*
The functions below are adapted from SDL_rotozoom.c,
taken from SDL_gfx-2.0.18.
Its copyright notice:
=============================================================================
SDL_rotozoom.c: rotozoomer, zoomer and shrinker for 32bit or 8bit surfaces
Copyright (C) 2001-2012 Andreas Schiffler
This software is provided 'as-is', without any express or implied
warranty. In no event will the authors be held liable for any damages
arising from the use of this software.
Permission is granted to anyone to use this software for any purpose,
including commercial applications, and to alter it and redistribute it
freely, subject to the following restrictions:
1. The origin of this software must not be misrepresented; you must not
claim that you wrote the original software. If you use this software
in a product, an acknowledgment in the product documentation would be
appreciated but is not required.
2. Altered source versions must be plainly marked as such, and must not be
misrepresented as being the original software.
3. This notice may not be removed or altered from any source
distribution.
Andreas Schiffler -- aschiffler at ferzkopp dot net
=============================================================================
The functions have been adapted for different structures, coordinate
systems and pixel formats.
*/
namespace {
// Reads one pixel of Size bytes from sp. When Size matches sizeof(Color) the
// pixel is loaded directly as a Color; otherwise it is read with READ_UINT24.
template <typename Color, int Size>
inline uint32 getPixel(const byte *sp) {
	if (Size != sizeof(Color))
		return READ_UINT24(sp);

	return *(const Color *)sp;
}
// Writes one pixel of Size bytes to pc. When Size matches sizeof(Color) the
// value is stored directly as a Color; otherwise it is written with
// WRITE_UINT24.
template <typename Color, int Size>
inline void setPixel(byte *pc, const uint32 pix) {
	if (Size != sizeof(Color)) {
		WRITE_UINT24(pc, pix);
		return;
	}

	*(Color *)pc = pix;
}
// Bilinear interpolation of a single 8-bit channel. ex and ey are the horizontal
// and vertical weights in 16-bit fixed point: c00 -> c01 and c10 -> c11 are each
// interpolated by ex, and the two results are then interpolated by ey.
inline byte scaleBlitBilinearInterpolate(byte c01, byte c00, byte c11, byte c10, int ex, int ey) {
	const int first = (c00 + (((c01 - c00) * ex) >> 16)) & 0xff;
	const int second = (c10 + (((c11 - c10) * ex) >> 16)) & 0xff;

	return first + (((second - first) * ey) >> 16);
}
// Bilinear interpolation of a whole pixel: the four neighbouring source pixels
// are unpacked into A/R/G/B components according to fmt, every component is
// interpolated separately with the weights ex / ey, and the repacked result is
// stored at dp.
template <typename ColorMask, typename Color, int Size>
void scaleBlitBilinearInterpolate(byte *dp, const byte *c01, const byte *c00, const byte *c11, const byte *c10, int ex, int ey,
								  const Graphics::PixelFormat &fmt) {
	// Unpack the four neighbours.
	byte a01, r01, g01, b01;
	byte a00, r00, g00, b00;
	byte a11, r11, g11, b11;
	byte a10, r10, g10, b10;

	fmt.colorToARGBT<ColorMask>(getPixel<Color, Size>(c01), a01, r01, g01, b01);
	fmt.colorToARGBT<ColorMask>(getPixel<Color, Size>(c00), a00, r00, g00, b00);
	fmt.colorToARGBT<ColorMask>(getPixel<Color, Size>(c11), a11, r11, g11, b11);
	fmt.colorToARGBT<ColorMask>(getPixel<Color, Size>(c10), a10, r10, g10, b10);

	// Interpolate each component on its own.
	const byte outA = scaleBlitBilinearInterpolate(a01, a00, a11, a10, ex, ey);
	const byte outR = scaleBlitBilinearInterpolate(r01, r00, r11, r10, ex, ey);
	const byte outG = scaleBlitBilinearInterpolate(g01, g00, g11, g10, ex, ey);
	const byte outB = scaleBlitBilinearInterpolate(b01, b00, b11, b10, ex, ey);

	// Repack and store.
	setPixel<Color, Size>(dp, fmt.ARGBToColorT<ColorMask>(outA, outR, outG, outB));
}
// Bilinear scaling of a srcW x srcH image to dstW x dstH with optional
// horizontal / vertical mirroring (FLIP_H / FLIP_V in flip).
//
// sax and say hold, per destination column and row respectively, a source
// coordinate whose upper bits (value >> 16) are the integer pixel index and
// whose low 16 bits are the interpolation weight. After each pixel/row the next
// entry is read to compute the step, so sax[dstW] and say[dstH] are read as
// well: the arrays need one entry more than the destination size.
//
// Mirroring is implemented by starting at the last source column/row and
// stepping the source pointer backwards.
template <typename ColorMask, typename Color, int Size>
void scaleBlitBilinearLogic(byte *dst, const byte *src,
const uint dstPitch, const uint srcPitch,
const uint dstW, const uint dstH,
const uint srcW, const uint srcH,
const Graphics::PixelFormat &fmt,
int *sax, int *say, byte flip) {
const bool flipx = flip & FLIP_H;
const bool flipy = flip & FLIP_V;
// Index of the last source column / row; used to avoid sampling past the edge.
int spixelw = (srcW - 1);
int spixelh = (srcH - 1);
const byte *sp = src;
if (flipx) {
sp += spixelw * Size;
}
if (flipy) {
sp += srcPitch * spixelh;
}
int *csay = say;
for (uint y = 0; y < dstH; y++) {
byte *dp = dst + (dstPitch * y);
// Remember the start of this source row; the row step below is applied to it.
const byte *csp = sp;
int *csax = sax;
for (uint x = 0; x < dstW; x++) {
/*
* Setup color source pointers
*/
int ex = (*csax & 0xffff);
int ey = (*csay & 0xffff);
int cx = (*csax >> 16);
int cy = (*csay >> 16);
const byte *c00, *c01, *c10, *c11;
c00 = c01 = c10 = sp;
// On the last source row/column the neighbour pointer is left on the current
// pixel, so the edge pixel is interpolated with itself.
if (cy < spixelh) {
if (flipy) {
c10 -= srcPitch;
} else {
c10 += srcPitch;
}
}
c11 = c10;
if (cx < spixelw) {
if (flipx) {
c01 -= Size;
c11 -= Size;
} else {
c01 += Size;
c11 += Size;
}
}
/*
* Draw and interpolate colors
*/
scaleBlitBilinearInterpolate<ColorMask, Color, Size>(dp, c01, c00, c11, c10, ex, ey, fmt);
/*
* Advance source pointer x
*/
int *salastx = csax;
csax++;
// Whole-pixel distance between this column's source position and the next one's.
int sstepx = (*csax >> 16) - (*salastx >> 16);
if (flipx) {
sp -= sstepx * Size;
} else {
sp += sstepx * Size;
}
/*
* Advance destination pointer x
*/
dp += Size;
}
/*
* Advance source pointer y
*/
int *salasty = csay;
csay++;
// Whole-row distance to the next source row, converted to bytes.
int sstepy = (*csay >> 16) - (*salasty >> 16);
sstepy *= srcPitch;
if (flipy) {
sp = csp - sstepy;
} else {
sp = csp + sstepy;
}
}
}
// Rotates and zooms src into dst.
//
// For every destination pixel the inverse transform (inverse rotation and
// inverse zoom, in 16.16 fixed point) gives the source coordinate to sample.
// Destination pixels whose source coordinate falls outside the source image
// are left untouched.
//
// dst/dstPitch/dstW/dstH   destination buffer, bytes per row, size in pixels
// src/srcPitch/srcW/srcH   source buffer, bytes per row, size in pixels
// fmt                      pixel format shared by both buffers
// transform                angle, zoom, flip flags and source hotspot
// newHotspot               position of the hotspot inside the destination
//
// With filtering == true the four neighbouring source pixels are blended
// bilinearly; otherwise the nearest source pixel is copied.
template<typename ColorMask, typename Color, int Size, bool filtering>
void rotoscaleBlitLogic(byte *dst, const byte *src,
		const uint dstPitch, const uint srcPitch,
		const uint dstW, const uint dstH,
		const uint srcW, const uint srcH,
		const Graphics::PixelFormat &fmt,
		const TransformStruct &transform,
		const Common::Point &newHotspot) {
	const bool flipx = transform._flip & FLIP_H;
	const bool flipy = transform._flip & FLIP_V;

	assert(transform._angle != kDefaultAngle); // This would not be ideal; rotoscale() should never be called in conditional branches where angle = 0 anyway.

	// A zero zoom would divide by zero below and has nothing to draw anyway.
	if (transform._zoom.x == 0 || transform._zoom.y == 0) {
		return;
	}

	uint32 invAngle = 360 - (transform._angle % 360);
	float invAngleRad = Math::deg2rad<uint32,float>(invAngle);
	float invCos = cos(invAngleRad);
	float invSin = sin(invAngleRad);

	// Per-destination-pixel source increments in 16.16 fixed point.
	int icosx = (int)(invCos * (65536.0f * kDefaultZoomX / transform._zoom.x));
	int isinx = (int)(invSin * (65536.0f * kDefaultZoomX / transform._zoom.x));
	int icosy = (int)(invCos * (65536.0f * kDefaultZoomY / transform._zoom.y));
	int isiny = (int)(invSin * (65536.0f * kDefaultZoomY / transform._zoom.y));

	int xd = transform._hotspot.x << 16;
	int yd = transform._hotspot.y << 16;
	int cx = newHotspot.x;
	int cy = newHotspot.y;

	int ax = -icosx * cx;
	int ay = -isiny * cx;
	int sw = srcW - 1;
	int sh = srcH - 1;

	for (uint y = 0; y < dstH; y++) {
		// Start every row at its pitch offset. The previous code advanced a
		// single pointer linearly through dst and ignored dstPitch, which is
		// only correct when dstPitch == dstW * Size.
		byte *pc = dst + (dstPitch * y);

		int t = cy - y;
		int sdx = ax + (isinx * t) + xd;
		int sdy = ay - (icosy * t) + yd;
		for (uint x = 0; x < dstW; x++) {
			int dx = (sdx >> 16);
			int dy = (sdy >> 16);
			if (flipx) {
				dx = sw - dx;
			}
			if (flipy) {
				dy = sh - dy;
			}

			if (filtering) {
				// Needs a right and a lower neighbour, hence the stricter
				// upper bounds (sw/sh are srcW - 1 / srcH - 1).
				if ((dx > -1) && (dy > -1) && (dx < sw) && (dy < sh)) {
					const byte *sp = src + dy * srcPitch + dx * Size;
					const byte *c00, *c01, *c10, *c11;
					c00 = sp;
					sp += Size;
					c01 = sp;
					sp += srcPitch;
					c11 = sp;
					sp -= Size;
					c10 = sp;
					// Flipping mirrors which neighbour the blend factors apply to.
					if (flipx) {
						SWAP(c00, c01);
						SWAP(c10, c11);
					}
					if (flipy) {
						SWAP(c00, c10);
						SWAP(c01, c11);
					}
					/*
					 * Interpolate colors
					 */
					int ex = (sdx & 0xffff);
					int ey = (sdy & 0xffff);
					scaleBlitBilinearInterpolate<ColorMask, Color, Size>(pc, c01, c00, c11, c10, ex, ey, fmt);
				}
			} else {
				if ((dx >= 0) && (dy >= 0) && (dx < (int)srcW) && (dy < (int)srcH)) {
					const byte *sp = src + dy * srcPitch + dx * Size;
					if (Size == sizeof(Color)) {
						*(Color *)pc = *(const Color *)sp;
					} else {
						memcpy(pc, sp, Size);
					}
				}
			}
			sdx += icosx;
			sdy += isiny;
			pc += Size;
		}
	}
}
} // End of anonymous namespace
// Scales src (srcW x srcH) into dst (dstW x dstH) with bilinear filtering.
//
// Both buffers use the pixel format fmt; dstPitch and srcPitch are the row
// sizes in bytes. flip may combine FLIP_H and FLIP_V.
// Returns false when fmt.bytesPerPixel is not 2, 3 or 4, true otherwise.
bool scaleBlitBilinear(byte *dst, const byte *src,
		const uint dstPitch, const uint srcPitch,
		const uint dstW, const uint dstH,
		const uint srcW, const uint srcH,
		const Graphics::PixelFormat &fmt,
		const byte flip) {
	if (fmt.bytesPerPixel != 2 && fmt.bytesPerPixel != 3 && fmt.bytesPerPixel != 4)
		return false;

	// One extra entry per axis: the blit loop reads entry [i + 1] to derive
	// the source step after pixel i.
	int *sax = new int[dstW + 1];
	int *say = new int[dstH + 1];
	assert(sax && say);

	/*
	 * Precalculate row increments
	 */
	int spixelw = (srcW - 1);
	int spixelh = (srcH - 1);

	// The step maps (dst size - 1) intervals onto (src size - 1) intervals.
	// With a single destination column/row there is no interval, and the
	// division would be by zero (the inf/NaN to int conversion is undefined),
	// so use a zero step and sample the first source column/row.
	int sx = 0;
	int sy = 0;
	if (dstW > 1)
		sx = (int)(65536.0f * (float) spixelw / (float) (dstW - 1));
	if (dstH > 1)
		sy = (int)(65536.0f * (float) spixelh / (float) (dstH - 1));

	/* Maximum scaled source size */
	int ssx = (srcW << 16) - 1;
	int ssy = (srcH << 16) - 1;

	/* Precalculate horizontal row increments */
	int csx = 0;
	int *csax = sax;
	for (uint x = 0; x <= dstW; x++) {
		*csax = csx;
		csax++;
		csx += sx;

		/* Guard from overflows */
		if (csx > ssx) {
			csx = ssx;
		}
	}

	/* Precalculate vertical row increments */
	int csy = 0;
	int *csay = say;
	for (uint y = 0; y <= dstH; y++) {
		*csay = csy;
		csay++;
		csy += sy;

		/* Guard from overflows */
		if (csy > ssy) {
			csy = ssy;
		}
	}

	// Known formats get compile-time colour masks; everything else goes
	// through the generic ColorMasks<0> path selected by pixel size.
	if (fmt == createPixelFormat<8888>()) {
		scaleBlitBilinearLogic<ColorMasks<8888>, uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt == createPixelFormat<888>()) {
		scaleBlitBilinearLogic<ColorMasks<888>, uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt == createPixelFormat<565>()) {
		scaleBlitBilinearLogic<ColorMasks<565>, uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt == createPixelFormat<555>()) {
		scaleBlitBilinearLogic<ColorMasks<555>, uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt.bytesPerPixel == 4) {
		scaleBlitBilinearLogic<ColorMasks<0>, uint32, 4>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt.bytesPerPixel == 3) {
		scaleBlitBilinearLogic<ColorMasks<0>, uint8, 3>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else if (fmt.bytesPerPixel == 2) {
		scaleBlitBilinearLogic<ColorMasks<0>, uint16, 2>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, sax, say, flip);
	} else {
		delete[] sax;
		delete[] say;

		return false;
	}

	delete[] sax;
	delete[] say;

	return true;
}
// Unfiltered (nearest-pixel) rotate-and-scale blit.
// Dispatches on the pixel size of fmt; returns false for sizes other than
// 1, 2, 3 or 4 bytes per pixel, true after a blit has been performed.
bool rotoscaleBlit(byte *dst, const byte *src,
		const uint dstPitch, const uint srcPitch,
		const uint dstW, const uint dstH,
		const uint srcW, const uint srcH,
		const Graphics::PixelFormat &fmt,
		const TransformStruct &transform,
		const Common::Point &newHotspot) {
	switch (fmt.bytesPerPixel) {
	case 4:
		rotoscaleBlitLogic<ColorMasks<0>, uint32, 4, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	case 3:
		rotoscaleBlitLogic<ColorMasks<0>, uint8, 3, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	case 2:
		rotoscaleBlitLogic<ColorMasks<0>, uint16, 2, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	case 1:
		rotoscaleBlitLogic<ColorMasks<0>, uint8, 1, false>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	default:
		return false;
	}
}
// Bilinear-filtered rotate-and-scale blit.
// The four well-known formats are checked first so they get compile-time
// colour masks; any other format falls back to the generic ColorMasks<0>
// path chosen by pixel size. Returns false when no path applies.
bool rotoscaleBlitBilinear(byte *dst, const byte *src,
		const uint dstPitch, const uint srcPitch,
		const uint dstW, const uint dstH,
		const uint srcW, const uint srcH,
		const Graphics::PixelFormat &fmt,
		const TransformStruct &transform,
		const Common::Point &newHotspot) {
	if (fmt == createPixelFormat<8888>()) {
		rotoscaleBlitLogic<ColorMasks<8888>, uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	}
	if (fmt == createPixelFormat<888>()) {
		rotoscaleBlitLogic<ColorMasks<888>, uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	}
	if (fmt == createPixelFormat<565>()) {
		rotoscaleBlitLogic<ColorMasks<565>, uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	}
	if (fmt == createPixelFormat<555>()) {
		rotoscaleBlitLogic<ColorMasks<555>, uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		return true;
	}

	switch (fmt.bytesPerPixel) {
	case 4:
		rotoscaleBlitLogic<ColorMasks<0>, uint32, 4, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		break;
	case 3:
		rotoscaleBlitLogic<ColorMasks<0>, uint8, 3, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		break;
	case 2:
		rotoscaleBlitLogic<ColorMasks<0>, uint16, 2, true>(dst, src, dstPitch, srcPitch, dstW, dstH, srcW, srcH, fmt, transform, newHotspot);
		break;
	default:
		return false;
	}

	return true;
}
} // End of namespace Graphics

339
graphics/blit/blit-sse2.cpp Normal file
View File

@@ -0,0 +1,339 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "common/scummsys.h"
#include "graphics/blit/blit-alpha.h"
#include "graphics/pixelformat.h"
#include <emmintrin.h>
#if !defined(__x86_64__)
#if defined(__clang__)
#pragma clang attribute push (__attribute__((target("sse2"))), apply_to=function)
#elif defined(__GNUC__)
#pragma GCC push_options
#pragma GCC target("sse2")
#endif
#endif // !defined(__x86_64__)
namespace Graphics {
// Lane-wise 32-bit multiply (low 32 bits of each product) built from SSE2
// primitives: _mm_mul_epu32 only multiplies lanes 0 and 2, so the odd lanes
// are shifted down by 4 bytes, multiplied separately, and the low halves of
// both result sets are interleaved back into lane order.
static FORCEINLINE __m128i sse2_mul32(__m128i a, __m128i b) {
	const __m128i prod02 = _mm_mul_epu32(a, b);
	const __m128i prod13 = _mm_mul_epu32(_mm_bsrli_si128(a, 4), _mm_bsrli_si128(b, 4));

	const __m128i even = _mm_shuffle_epi32(prod02, _MM_SHUFFLE(0, 0, 2, 0));
	const __m128i odd = _mm_shuffle_epi32(prod13, _MM_SHUFFLE(0, 0, 2, 0));

	return _mm_unpacklo_epi32(even, odd);
}
class BlendBlitImpl_SSE2 : public BlendBlitImpl_Base {
friend class BlendBlit;
// SSE2 alpha blending of four 32-bit pixels at a time.
// rgbmod selects per-channel modulation by this->cr/cg/cb, alphamod selects
// modulation of the source alpha by this->ca.
template<bool rgbmod, bool alphamod>
struct AlphaBlend : public BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod> {
public:
constexpr AlphaBlend(const uint32 color) : BlendBlitImpl_Base::AlphaBlend<rgbmod, alphamod>(color) {}
// Blends src over dst and returns the four resulting pixels.
// Lanes whose effective source alpha is zero return dst unchanged.
inline __m128i simd(__m128i src, __m128i dst) const {
// Effective source alpha per lane: the bits under kAModMask, scaled by
// ca / 256 when alphamod is set.
// NOTE(review): the masked value is used directly as a multiplier, so this
// presumably relies on alpha occupying the low byte - confirm in BlendBlit.
__m128i ina;
if (alphamod)
ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
else
ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
// All-ones in lanes where the effective alpha is zero.
__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
if (rgbmod) {
// Work on R, G and B separately so each can be scaled by its own
// modulation factor.
__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
// Destination contribution: channel * (255 - alpha), shifted so that the
// final channel mask extracts the scaled value.
dstR = _mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kRModShift - 8);
dstG = _mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), BlendBlit::kGModShift - 8);
dstB = _mm_mullo_epi16(dstB, _mm_sub_epi32(_mm_set1_epi32(255), ina));
// Source contribution: ((channel * alpha) >> 8) * colour factor, added
// to the destination contribution.
srcR = _mm_add_epi32(dstR, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcR, ina), 8), _mm_set1_epi32(this->cr)), BlendBlit::kRModShift - 8));
srcG = _mm_add_epi32(dstG, _mm_slli_epi32(_mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8), _mm_set1_epi32(this->cg)), BlendBlit::kGModShift - 8));
srcB = _mm_add_epi32(dstB, _mm_mullo_epi16(_mm_srli_epi32(_mm_mullo_epi16(srcB, ina), 8), _mm_set1_epi32(this->cb)));
// Reassemble the pixel; every alpha bit of the result is set.
src = _mm_or_si128(_mm_and_si128(srcB, _mm_set1_epi32(BlendBlit::kBModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), src);
src = _mm_or_si128(_mm_and_si128(srcR, _mm_set1_epi32(BlendBlit::kRModMask)), src);
} else {
// Without colour modulation R and B are processed together in one
// register (both masked, shifted down by kBModShift) and G on its own.
__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
// dst * (255 - alpha) >> 8; the combined R/B value needs the full
// 32-bit multiply, the single G byte fits the 16-bit one.
dstRB = _mm_srli_epi32(sse2_mul32(dstRB, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
dstG = _mm_srli_epi32(_mm_mullo_epi16(dstG, _mm_sub_epi32(_mm_set1_epi32(255), ina)), 8);
// Add src * alpha >> 8 and move the channels back into position.
srcRB = _mm_slli_epi32(_mm_add_epi32(dstRB, _mm_srli_epi32(sse2_mul32(srcRB, ina), 8)), BlendBlit::kBModShift);
srcG = _mm_slli_epi32(_mm_add_epi32(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), BlendBlit::kGModShift);
// Reassemble the pixel; every alpha bit of the result is set.
src = _mm_or_si128(_mm_and_si128(srcG, _mm_set1_epi32(BlendBlit::kGModMask)), _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(_mm_and_si128(srcRB, _mm_set1_epi32(BlendBlit::kBModMask | BlendBlit::kRModMask)), src);
}
// Per-lane select: original dst where alpha was zero, blended value elsewhere.
dst = _mm_and_si128(alphaMask, dst);
src = _mm_andnot_si128(alphaMask, src);
return _mm_or_si128(dst, src);
}
};
// SSE2 multiply blending of four 32-bit pixels at a time: each destination
// channel is multiplied by the alpha-scaled (and optionally colour-modulated)
// source channel.
template<bool rgbmod, bool alphamod>
struct MultiplyBlend : public BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod> {
public:
constexpr MultiplyBlend(const uint32 color) : BlendBlitImpl_Base::MultiplyBlend<rgbmod, alphamod>(color) {}
// Returns the four blended pixels for the given source and destination lanes.
inline __m128i simd(__m128i src, __m128i dst) const {
__m128i ina, alphaMask;
if (alphamod) {
// Effective alpha = source alpha * ca >> 8; lanes where it is zero keep dst.
ina = _mm_srli_epi32(_mm_mullo_epi16(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
alphaMask = _mm_cmpeq_epi32(ina, _mm_setzero_si128());
} else {
ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
// Here the mask is a constant bit mask rather than a per-lane select: the
// final merge takes the bits under kAModMask from dst and all other bits
// from the blended value.
alphaMask = _mm_set1_epi32(BlendBlit::kAModMask);
}
if (rgbmod) {
// Isolate each channel of both pixels as a small integer.
__m128i srcB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcR = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m128i dstB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i dstR = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// Per channel: dst * ((src * colour factor * alpha) >> 16), shifted back
// into the channel position and masked.
srcB = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstB, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcB, _mm_set1_epi32(this->cb)), ina), 16)), BlendBlit::kBModShift - 8), _mm_set1_epi32(BlendBlit::kBModMask));
srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcG, _mm_set1_epi32(this->cg)), ina), 16)), BlendBlit::kGModShift - 8), _mm_set1_epi32(BlendBlit::kGModMask));
srcR = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstR, _mm_srli_epi32(sse2_mul32(_mm_mullo_epi16(srcR, _mm_set1_epi32(this->cr)), ina), 16)), BlendBlit::kRModShift - 8), _mm_set1_epi32(BlendBlit::kRModMask));
// Keep the source alpha bits and merge in the computed colour channels.
src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(src, _mm_or_si128(srcB, _mm_or_si128(srcG, srcR)));
} else {
// Without colour modulation R and B share one register; G is separate.
constexpr uint32 rbMask = BlendBlit::kRModMask | BlendBlit::kBModMask;
__m128i srcG = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcRB = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);
__m128i dstG = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i dstRB = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(rbMask)), BlendBlit::kBModShift);
// dst * ((src * alpha) >> 8), masked back to the channel bits.
srcG = _mm_and_si128(_mm_slli_epi32(_mm_mullo_epi16(dstG, _mm_srli_epi32(_mm_mullo_epi16(srcG, ina), 8)), 8), _mm_set1_epi32(BlendBlit::kGModMask));
srcRB = _mm_and_si128(_mm_mullo_epi16(dstRB, _mm_srli_epi32(_mm_and_si128(sse2_mul32(srcRB, ina), _mm_set1_epi32(rbMask)), 8)), _mm_set1_epi32(rbMask));
// Keep the source alpha bits and merge in the computed colour channels.
src = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(src, _mm_or_si128(srcRB, srcG));
}
// Merge dst and the blended value according to alphaMask (see above for the
// two different meanings of the mask).
dst = _mm_and_si128(alphaMask, dst);
src = _mm_andnot_si128(alphaMask, src);
return _mm_or_si128(dst, src);
}
};
// SSE2 opaque blit: the destination is overwritten with the source colour.
template<bool rgbmod, bool alphamod>
struct OpaqueBlend : public BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod> {
public:
	constexpr OpaqueBlend(const uint32 color) : BlendBlitImpl_Base::OpaqueBlend<rgbmod, alphamod>(color) {}

	// Returns the source pixels with every alpha bit set; dst is not used.
	inline __m128i simd(__m128i src, __m128i dst) const {
		const __m128i fullAlpha = _mm_set1_epi32(BlendBlit::kAModMask);
		return _mm_or_si128(src, fullAlpha);
	}
};
// SSE2 binary (on/off) transparency: a source pixel either fully replaces the
// destination pixel or leaves it untouched.
template<bool rgbmod, bool alphamod>
struct BinaryBlend : public BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod> {
public:
	constexpr BinaryBlend(const uint32 color) : BlendBlitImpl_Base::BinaryBlend<rgbmod, alphamod>(color) {}

	// Lanes whose source alpha bits are all zero keep dst; every other lane
	// takes the source pixel with all alpha bits set.
	inline __m128i simd(__m128i src, __m128i dst) const {
		const __m128i alphaBits = _mm_set1_epi32(BlendBlit::kAModMask);
		const __m128i transparent = _mm_cmpeq_epi32(_mm_and_si128(src, alphaBits), _mm_setzero_si128());

		const __m128i keptDst = _mm_and_si128(dst, transparent);
		const __m128i opaqueSrc = _mm_andnot_si128(transparent, _mm_or_si128(src, alphaBits));

		return _mm_or_si128(opaqueSrc, keptDst);
	}
};
// SSE2 additive blending of four 32-bit pixels at a time: the (optionally
// modulated) source colour is added onto the destination colour, and the
// destination's alpha bits are carried into the result.
template<bool rgbmod, bool alphamod>
struct AdditiveBlend : public BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod> {
public:
constexpr AdditiveBlend(const uint32 color) : BlendBlitImpl_Base::AdditiveBlend<rgbmod, alphamod>(color) {}
// Returns the four blended pixels. Lanes whose effective source alpha is
// zero return dst unchanged.
inline __m128i simd(__m128i src, __m128i dst) const {
// Effective source alpha: bits under kAModMask, scaled by ca / 256 when
// alphamod is set.
__m128i ina;
if (alphamod)
ina = _mm_srli_epi32(sse2_mul32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask)), _mm_set1_epi32(this->ca)), 8);
else
ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
// All-ones in lanes where the effective alpha is zero.
__m128i alphaMask = _mm_cmpeq_epi32(ina, _mm_set1_epi32(0));
if (rgbmod) {
// Colour-modulated path: each channel is scaled by its colour factor and
// by alpha before being added to the destination channel.
// Note that B is only masked, not shifted, while G and R are shifted down.
__m128i srcb = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask));
__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m128i dstb = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask));
__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// NOTE(review): the three channels use different post-multiply shifts and
// dstg/dstr are added in their shifted-down form before the channel mask is
// applied - verify these against the scalar implementation.
srcb = _mm_and_si128(_mm_add_epi32(dstb, _mm_srli_epi32(sse2_mul32(srcb, sse2_mul32(_mm_set1_epi32(this->cb), ina)), 16)), _mm_set1_epi32(BlendBlit::kBModMask));
srcg = _mm_and_si128(_mm_add_epi32(dstg, sse2_mul32(srcg, sse2_mul32(_mm_set1_epi32(this->cg), ina))), _mm_set1_epi32(BlendBlit::kGModMask));
srcr = _mm_and_si128(_mm_add_epi32(dstr, _mm_srli_epi32(sse2_mul32(srcr, sse2_mul32(_mm_set1_epi32(this->cr), ina)), BlendBlit::kRModShift - 16)), _mm_set1_epi32(BlendBlit::kRModMask));
// Result alpha comes from the destination.
src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(src, _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
} else if (alphamod) {
// Alpha-modulated only: G is handled in place, R and B share a register
// shifted down by kBModShift; the source is scaled by alpha before adding.
__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
srcg = _mm_and_si128(_mm_add_epi32(dstg, _mm_srli_epi32(sse2_mul32(srcg, ina), 8)), _mm_set1_epi32(BlendBlit::kGModMask));
srcrb = _mm_and_si128(_mm_add_epi32(dstrb, sse2_mul32(srcrb, ina)), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
// Result alpha comes from the destination.
src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
} else {
// No modulation: plain per-channel addition of source and destination.
__m128i srcg = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask));
__m128i srcrb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i dstg = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask));
__m128i dstrb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask)), BlendBlit::kBModShift);
srcg = _mm_and_si128(_mm_add_epi32(dstg, srcg), _mm_set1_epi32(BlendBlit::kGModMask));
srcrb = _mm_and_si128(_mm_slli_epi32(_mm_add_epi32(dstrb, srcrb), 8), _mm_set1_epi32(BlendBlit::kRModMask | BlendBlit::kBModMask));
// Result alpha comes from the destination.
src = _mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kAModMask));
src = _mm_or_si128(src, _mm_or_si128(srcrb, srcg));
}
// Per-lane select: original dst where alpha was zero, blended value elsewhere.
dst = _mm_and_si128(alphaMask, dst);
src = _mm_andnot_si128(alphaMask, src);
return _mm_or_si128(dst, src);
}
};
// SSE2 subtractive blending of four 32-bit pixels at a time: a source- and
// alpha-dependent amount is subtracted from each destination channel, with
// the result clamped at zero.
template<bool rgbmod, bool alphamod>
struct SubtractiveBlend : public BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod> {
public:
constexpr SubtractiveBlend(const uint32 color) : BlendBlitImpl_Base::SubtractiveBlend<rgbmod, alphamod>(color) {}
// Returns the four blended pixels; every alpha bit of the result is set.
// The rgbmod/alphamod template flags are not consulted here: the colour
// factors cb/cg/cr are always applied and this->ca is never used.
inline __m128i simd(__m128i src, __m128i dst) const {
// Source alpha bits, unscaled.
__m128i ina = _mm_and_si128(src, _mm_set1_epi32(BlendBlit::kAModMask));
// Isolate each channel of both pixels as a small integer.
__m128i srcb = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i srcg = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i srcr = _mm_srli_epi32(_mm_and_si128(src, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
__m128i dstb = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kBModMask)), BlendBlit::kBModShift);
__m128i dstg = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kGModMask)), BlendBlit::kGModShift);
__m128i dstr = _mm_srli_epi32(_mm_and_si128(dst, _mm_set1_epi32(BlendBlit::kRModMask)), BlendBlit::kRModShift);
// Per channel: dst - (((src * colour factor) * (dst * alpha)) >> 24),
// clamped to >= 0 with a signed 16-bit max, then shifted back into place.
srcb = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstb, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcb, _mm_set1_epi32(this->cb)), sse2_mul32(dstb, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kBModShift), _mm_set1_epi32(BlendBlit::kBModMask));
srcg = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstg, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcg, _mm_set1_epi32(this->cg)), sse2_mul32(dstg, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kGModShift), _mm_set1_epi32(BlendBlit::kGModMask));
srcr = _mm_and_si128(_mm_slli_epi32(_mm_max_epi16(_mm_sub_epi32(dstr, _mm_srli_epi32(sse2_mul32(sse2_mul32(srcr, _mm_set1_epi32(this->cr)), sse2_mul32(dstr, ina)), 24)), _mm_set1_epi32(0)), BlendBlit::kRModShift), _mm_set1_epi32(BlendBlit::kRModMask));
return _mm_or_si128(_mm_set1_epi32(BlendBlit::kAModMask), _mm_or_si128(srcb, _mm_or_si128(srcg, srcr)));
}
};
public:
// Row/column driver shared by all SSE2 blend modes.
// PixelFunc supplies the blend (simd() for four pixels, normal() for one),
// doscale selects nearest-neighbour scaling via the scaleX/scaleY counters,
// rgbmod/alphamod are forwarded to PixelFunc.
// Each row is processed four pixels at a time with SSE2, then the remaining
// (width % 4) pixels one at a time. Pixels are 4 bytes wide in the output.
// Side effect: args.outo is advanced past the blitted rows, and args.ino is
// too when not scaling (including the FLIP_H pre-adjustment below).
template<template <bool RGBMOD, bool ALPHAMOD> class PixelFunc, bool doscale, bool rgbmod, bool alphamod>
static inline void blitInnerLoop(BlendBlit::Args &args) {
const bool loaddst = true; // TODO: Only set this when necessary
const byte *in;
byte *out;
PixelFunc<rgbmod, alphamod> pixelFunc(args.color);
int scaleXCtr, scaleYCtr = args.scaleYoff;
const byte *inBase;
// Horizontal flip without scaling: move the base back by three 4-byte pixels
// so that a 16-byte load covers the current pixel and the three before it.
// NOTE(review): presumably args.inStep is negative in this mode so `in`
// walks backwards - confirm against the caller.
if (!doscale && (args.flipping & FLIP_H)) args.ino -= 4 * 3;
for (uint32 i = 0; i < args.height; i++) {
if (doscale) {
// Source row selected by the integer part of the vertical scale counter.
inBase = args.ino + scaleYCtr / BlendBlit::SCALE_THRESHOLD * args.inoStep;
scaleXCtr = args.scaleXoff;
} else {
in = args.ino;
}
out = args.outo;
uint32 j = 0;
// Vectorised part: four pixels per iteration.
for (; j + 4 <= args.width; j += 4) {
__m128i dstPixels, srcPixels;
if (loaddst) dstPixels = _mm_loadu_si128((const __m128i *)out);
if (!doscale) {
srcPixels = _mm_loadu_si128((const __m128i *)in);
} else {
// Gather four source pixels at the positions given by the horizontal
// scale counter advanced 0..3 steps.
srcPixels = _mm_setr_epi32(
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 0) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 1) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 2) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep),
*(const uint32 *)(inBase + (ptrdiff_t)(scaleXCtr + args.scaleX * 3) / (ptrdiff_t)BlendBlit::SCALE_THRESHOLD * args.inStep)
);
scaleXCtr += args.scaleX * 4;
}
// Reverse the lane order so the loaded pixels appear mirrored.
if (!doscale && (args.flipping & FLIP_H)) {
srcPixels = _mm_shuffle_epi32(srcPixels, _MM_SHUFFLE(0, 1, 2, 3));
}
{
const __m128i res = pixelFunc.simd(srcPixels, dstPixels);
_mm_storeu_si128((__m128i *)out, res);
}
if (!doscale) in += (ptrdiff_t)args.inStep * 4;
out += 4ULL * 4;
}
// Undo the three-pixel offset so the scalar tail addresses single pixels.
if (!doscale && (args.flipping & FLIP_H)) in += 4 * 3;
// Scalar tail: remaining pixels of the row, one at a time.
for (; j < args.width; j++) {
if (doscale) {
in = inBase + scaleXCtr / BlendBlit::SCALE_THRESHOLD * args.inStep;
}
pixelFunc.normal(in, out);
if (doscale)
scaleXCtr += args.scaleX;
else
in += args.inStep;
out += 4;
}
// Advance to the next source row (via the scale counter when scaling) and
// the next destination row.
if (doscale)
scaleYCtr += args.scaleY;
else
args.ino += args.inoStep;
args.outo += args.dstPitch;
}
}
}; // End of class BlendBlitImpl_SSE2
// SSE2 entry point: forwards to the blitT dispatcher instantiated with the
// SSE2 implementation class defined above.
void BlendBlit::blitSSE2(Args &args, const TSpriteBlendMode &blendMode, const AlphaType &alphaType) {
blitT<BlendBlitImpl_SSE2>(args, blendMode, alphaType);
}
} // End of namespace Graphics
#if !defined(__x86_64__)
#if defined(__clang__)
#pragma clang attribute pop
#elif defined(__GNUC__)
#pragma GCC pop_options
#endif
#endif // !defined(__x86_64__)

501
graphics/blit/blit.cpp Normal file
View File

@@ -0,0 +1,501 @@
/* ScummVM - Graphic Adventure Engine
*
* ScummVM is the legal property of its developers, whose names
* are too numerous to list here. Please refer to the COPYRIGHT
* file distributed with this source distribution.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*
*/
#include "graphics/blit.h"
#include "graphics/pixelformat.h"
#include "common/endian.h"
namespace Graphics {
// see graphics/blit/blit-atari.cpp
#ifdef ATARI
extern void keyBlitLogicAtari(byte *dst, const byte *src, const uint w, const uint h,
const uint srcDelta, const uint dstDelta, const uint32 key);
#else
// Copies a w x h pixel rectangle from src to dst.
// dstPitch and srcPitch are the row sizes in bytes of the two buffers and
// bytesPerPixel the pixel size. Nothing is done when dst and src are the
// same pointer.
void copyBlit(byte *dst, const byte *src,
			   const uint dstPitch, const uint srcPitch,
			   const uint w, const uint h,
			   const uint bytesPerPixel) {
	if (dst == src)
		return;

	const uint rowBytes = w * bytesPerPixel;

	// Both buffers are tightly packed with identical pitch: the whole
	// rectangle is one contiguous run and can be copied in a single call.
	if (dstPitch == srcPitch && rowBytes == dstPitch) {
		memcpy(dst, src, dstPitch * h);
		return;
	}

	// Otherwise copy the used part of each row and step by the pitches.
	for (uint remaining = h; remaining != 0; --remaining) {
		memcpy(dst, src, rowBytes);
		dst += dstPitch;
		src += srcPitch;
	}
}
#endif
namespace {
// Copies a w x h rectangle while skipping every source pixel equal to key.
// srcDelta and dstDelta are the bytes to skip at the end of each row
// (pitch minus the bytes consumed by w pixels).
template<typename Color, int Size>
inline void keyBlitLogic(byte *dst, const byte *src, const uint w, const uint h,
						 const uint srcDelta, const uint dstDelta, const uint32 key) {
	// Byte view of the key for pixel sizes that have no matching integer type.
	const uint8 *keyBytes = (const uint8 *)&key;
#ifdef SCUMM_BIG_ENDIAN
	// On big-endian the three significant bytes start after the top byte.
	if (Size == 3)
		keyBytes++;
#endif

	for (uint row = 0; row < h; ++row, src += srcDelta, dst += dstDelta) {
		for (uint col = 0; col < w; ++col, src += Size, dst += Size) {
			if (Size != sizeof(Color)) {
				if (memcmp(src, keyBytes, Size) != 0)
					memcpy(dst, src, Size);
				continue;
			}

			const uint32 pixel = *(const Color *)src;
			if (pixel != key)
				*(Color *)dst = pixel;
		}
	}
}
#ifdef ATARI
// Atari builds only: 1-byte-per-pixel keyed blits are forwarded to
// keyBlitLogicAtari (declared extern above) instead of the generic loop.
template<>
inline void keyBlitLogic<uint8, 1>(byte *dst, const byte *src, const uint w, const uint h,
const uint srcDelta, const uint dstDelta, const uint32 key) {
keyBlitLogicAtari(dst, src, w, h, srcDelta, dstDelta, key);
}
#endif
} // End of anonymous namespace
// Blits a w x h rectangle, treating source pixels equal to key as transparent.
// Returns true on success (including the no-op case dst == src) and false
// when bytesPerPixel is not 1, 2, 3 or 4.
bool keyBlit(byte *dst, const byte *src,
			   const uint dstPitch, const uint srcPitch,
			   const uint w, const uint h,
			   const uint bytesPerPixel, const uint32 key) {
	if (dst == src)
		return true;

	// Bytes left over at the end of each row once w pixels have been consumed.
	const uint rowBytes = w * bytesPerPixel;
	const uint srcDelta = (srcPitch - rowBytes);
	const uint dstDelta = (dstPitch - rowBytes);

	// One specialised instantiation per pixel size: larger code, faster loops.
	switch (bytesPerPixel) {
	case 1:
		keyBlitLogic<uint8, 1>(dst, src, w, h, srcDelta, dstDelta, key);
		return true;
	case 2:
		keyBlitLogic<uint16, 2>(dst, src, w, h, srcDelta, dstDelta, key);
		return true;
	case 3:
		keyBlitLogic<uint8, 3>(dst, src, w, h, srcDelta, dstDelta, key);
		return true;
	case 4:
		keyBlitLogic<uint32, 4>(dst, src, w, h, srcDelta, dstDelta, key);
		return true;
	default:
		return false;
	}
}
namespace {
// Copies a w x h rectangle, writing only the pixels whose mask byte is
// non-zero. The mask holds one byte per pixel; srcDelta, dstDelta and
// maskDelta are the bytes to skip at the end of each row of the three buffers.
template<typename Color, int Size>
inline void maskBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
						  const uint srcDelta, const uint dstDelta, const uint maskDelta) {
	for (uint row = 0; row < h; ++row) {
		for (uint col = 0; col < w; ++col, src += Size, dst += Size, mask += 1) {
			if (!*mask)
				continue;

			if (Size == sizeof(Color))
				*(Color *)dst = *(const Color *)src;
			else
				memcpy(dst, src, Size);
		}

		src += srcDelta;
		dst += dstDelta;
		mask += maskDelta;
	}
}
} // End of anonymous namespace
// Blits a w x h rectangle through a one-byte-per-pixel mask: pixels whose
// mask byte is zero are left untouched in dst.
// Returns true on success (including the no-op case dst == src) and false
// when bytesPerPixel is not 1, 2, 3 or 4.
bool maskBlit(byte *dst, const byte *src, const byte *mask,
			   const uint dstPitch, const uint srcPitch, const uint maskPitch,
			   const uint w, const uint h,
			   const uint bytesPerPixel) {
	if (dst == src)
		return true;

	// Bytes left over at the end of each row once w pixels have been consumed.
	const uint rowBytes = w * bytesPerPixel;
	const uint srcDelta = (srcPitch - rowBytes);
	const uint dstDelta = (dstPitch - rowBytes);
	const uint maskDelta = (maskPitch - w);

	// One specialised instantiation per pixel size: larger code, faster loops.
	switch (bytesPerPixel) {
	case 1:
		maskBlitLogic<uint8, 1>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
		return true;
	case 2:
		maskBlitLogic<uint16, 2>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
		return true;
	case 3:
		maskBlitLogic<uint8, 3>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
		return true;
	case 4:
		maskBlitLogic<uint32, 4>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta);
		return true;
	default:
		return false;
	}
}
namespace {
// Converts a w x h rectangle from srcFmt to dstFmt pixel by pixel.
//
// SrcColor/SrcSize, DstColor/DstSize  integer type and byte size of a pixel in
//                                     each buffer (Size == 3 has no matching
//                                     integer type and is copied bytewise)
// backward   walk from the last pixel to the first; the caller must then pass
//            pointers to the last pixel of each buffer
// hasKey     skip source pixels whose raw value equals key
// hasMask    skip pixels whose mask byte (one per pixel) is zero
// srcDelta, dstDelta, maskDelta  bytes to skip at the end of each row
template<typename SrcColor, int SrcSize, typename DstColor, int DstSize, bool backward, bool hasKey, bool hasMask>
inline void crossBlitLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
						   const PixelFormat &srcFmt, const PixelFormat &dstFmt,
						   const uint srcDelta, const uint dstDelta, const uint maskDelta,
						   const uint32 key) {
	uint32 color;
	byte a, r, g, b;
	// Byte view of color used for pixel sizes without a matching integer type.
	uint8 *col = (uint8 *)&color;
#ifdef SCUMM_BIG_ENDIAN
	// On big-endian the three significant bytes start after the top byte.
	if (SrcSize == 3 || DstSize == 3)
		col++;
#endif

	for (uint y = 0; y < h; ++y) {
		for (uint x = 0; x < w; ++x) {
			if (SrcSize == sizeof(SrcColor)) {
				color = *(const SrcColor *)src;
			} else {
				// Only SrcSize bytes are filled in below. Clear the rest
				// first, otherwise the remaining byte is indeterminate on the
				// first pixel and stale (left over from the previous
				// destination colour) afterwards, which breaks the key test.
				color = 0;
				memcpy(col, src, SrcSize);
			}

			if ((!hasKey || color != key) && (!hasMask || *mask != 0)) {
				srcFmt.colorToARGB(color, a, r, g, b);
				color = dstFmt.ARGBToColor(a, r, g, b);

				if (DstSize == sizeof(DstColor))
					*(DstColor *)dst = color;
				else
					memcpy(dst, col, DstSize);
			}

			if (backward) {
				src -= SrcSize;
				dst -= DstSize;
				if (hasMask)
					mask -= 1;
			} else {
				src += SrcSize;
				dst += DstSize;
				if (hasMask)
					mask += 1;
			}
		}

		if (backward) {
			src -= srcDelta;
			dst -= dstDelta;
			if (hasMask)
				mask -= maskDelta;
		} else {
			src += srcDelta;
			dst += dstDelta;
			if (hasMask)
				mask += maskDelta;
		}
	}
}
// Picks the crossBlitLogic instantiation that matches the byte sizes of the
// source and destination formats and runs it over the w x h rect.
//
// Returns false, without writing anything, when the destination format is
// not 2, 3 or 4 bytes per pixel; returns true otherwise. A source format that
// is neither 2 nor 3 bytes per pixel is handled as a 4-byte format.
//
// Whenever the destination pixel is wider than the source pixel the rect is
// processed from its last pixel back to its first one. When source and
// destination share the same memory buffer, a front-to-back pass would
// overwrite source pixels that have not been converted yet.
template<bool hasKey, bool hasMask>
inline bool crossBlitHelper(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
							const PixelFormat &srcFmt, const PixelFormat &dstFmt,
							const uint srcPitch, const uint dstPitch, const uint maskPitch,
							const uint32 key) {
	// Bytes to skip at the end of every row.
	// Faster, but larger, to provide optimized handling for each case.
	const uint srcDelta = srcPitch - w * srcFmt.bytesPerPixel;
	const uint dstDelta = dstPitch - w * dstFmt.bytesPerPixel;
	const uint maskDelta = hasMask ? (maskPitch - w) : 0;

	// Offsets from the start of each buffer to the last pixel of the rect;
	// only applied by the backward cases below.
	const uint srcLast = h * srcPitch - srcDelta - srcFmt.bytesPerPixel;
	const uint dstLast = h * dstPitch - dstDelta - dstFmt.bytesPerPixel;
	const uint maskLast = hasMask ? (h * maskPitch - maskDelta - 1) : 0;

	// TODO: optimized cases for dstDelta of 0
	switch (dstFmt.bytesPerPixel) {
	case 2:
		switch (srcFmt.bytesPerPixel) {
		case 2:
			crossBlitLogic<uint16, 2, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		case 3:
			crossBlitLogic<uint8, 3, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		default:
			crossBlitLogic<uint32, 4, uint16, 2, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		}
		break;

	case 3:
		switch (srcFmt.bytesPerPixel) {
		case 2:
			// 2 -> 3 bytes widens the pixel: go from bottom right to top left.
			dst += dstLast;
			src += srcLast;
			if (hasMask)
				mask += maskLast;
			crossBlitLogic<uint16, 2, uint8, 3, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		case 3:
			crossBlitLogic<uint8, 3, uint8, 3, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		default:
			crossBlitLogic<uint32, 4, uint8, 3, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		}
		break;

	case 4:
		switch (srcFmt.bytesPerPixel) {
		case 2:
			// 2 -> 4 bytes widens the pixel: go from bottom right to top left.
			dst += dstLast;
			src += srcLast;
			if (hasMask)
				mask += maskLast;
			crossBlitLogic<uint16, 2, uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		case 3:
			// 3 -> 4 bytes widens the pixel: go from bottom right to top left.
			dst += dstLast;
			src += srcLast;
			if (hasMask)
				mask += maskLast;
			crossBlitLogic<uint8, 3, uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		default:
			crossBlitLogic<uint32, 4, uint32, 4, false, hasKey, hasMask>(dst, src, mask, w, h, srcFmt, dstFmt, srcDelta, dstDelta, maskDelta, key);
			break;
		}
		break;

	default:
		// Unsupported destination pixel size.
		return false;
	}

	return true;
}
} // End of anonymous namespace
// Blits a w x h rect from src to dst, converting every pixel from srcFmt to
// dstFmt.
//
// Returns false when either format uses zero or one byte per pixel.
// Identical formats are handed to copyBlit; otherwise a function obtained
// from getFastBlitFunc is used when one is available, and the generic
// per-pixel conversion is the fallback.
bool crossBlit(byte *dst, const byte *src,
			   const uint dstPitch, const uint srcPitch,
			   const uint w, const uint h,
			   const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt) {
	// Conversion is impossible for formats with 0 or 1 bytes per pixel.
	const bool srcUsable = srcFmt.bytesPerPixel != 0 && srcFmt.bytesPerPixel != 1;
	const bool dstUsable = dstFmt.bytesPerPixel != 0 && dstFmt.bytesPerPixel != 1;
	if (!(srcUsable && dstUsable))
		return false;

	if (srcFmt == dstFmt) {
		// Same format on both sides: nothing to convert, just copy.
		copyBlit(dst, src, dstPitch, srcPitch, w, h, dstFmt.bytesPerPixel);
		return true;
	}

	// Prefer a specialised routine for this format pair if there is one.
	FastBlitFunc fastPath = getFastBlitFunc(dstFmt, srcFmt);
	if (fastPath) {
		fastPath(dst, src, dstPitch, srcPitch, w, h);
		return true;
	}

	// Generic conversion, no color key and no mask.
	return crossBlitHelper<false, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, 0);
}
// Blits a w x h rect from src to dst, converting every pixel from srcFmt to
// dstFmt and treating key as the transparent source color.
//
// Returns false when either format uses zero or one byte per pixel.
// Identical formats are handed to keyBlit; everything else goes through the
// generic per-pixel conversion with color keying enabled.
bool crossKeyBlit(byte *dst, const byte *src,
				  const uint dstPitch, const uint srcPitch,
				  const uint w, const uint h,
				  const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt, const uint32 key) {
	// Conversion is impossible for formats with 0 or 1 bytes per pixel.
	const bool srcUsable = srcFmt.bytesPerPixel != 0 && srcFmt.bytesPerPixel != 1;
	const bool dstUsable = dstFmt.bytesPerPixel != 0 && dstFmt.bytesPerPixel != 1;
	if (!(srcUsable && dstUsable))
		return false;

	if (srcFmt == dstFmt) {
		// Same format on both sides: no conversion needed.
		keyBlit(dst, src, dstPitch, srcPitch, w, h, dstFmt.bytesPerPixel, key);
		return true;
	}

	// Generic conversion with a color key and no mask.
	return crossBlitHelper<true, false>(dst, src, nullptr, w, h, srcFmt, dstFmt, srcPitch, dstPitch, 0, key);
}
// Blits a w x h rect from src to dst, converting every pixel from srcFmt to
// dstFmt and using mask (one byte per pixel, rows maskPitch bytes apart) to
// decide which pixels are transferred.
//
// Returns false when either format uses zero or one byte per pixel.
// Identical formats are handed to maskBlit; everything else goes through the
// generic per-pixel conversion with masking enabled.
bool crossMaskBlit(byte *dst, const byte *src, const byte *mask,
				   const uint dstPitch, const uint srcPitch, const uint maskPitch,
				   const uint w, const uint h,
				   const Graphics::PixelFormat &dstFmt, const Graphics::PixelFormat &srcFmt) {
	// Conversion is impossible for formats with 0 or 1 bytes per pixel.
	const bool srcUsable = srcFmt.bytesPerPixel != 0 && srcFmt.bytesPerPixel != 1;
	const bool dstUsable = dstFmt.bytesPerPixel != 0 && dstFmt.bytesPerPixel != 1;
	if (!(srcUsable && dstUsable))
		return false;

	if (srcFmt == dstFmt) {
		// Same format on both sides: no conversion needed.
		maskBlit(dst, src, mask, dstPitch, srcPitch, maskPitch, w, h, dstFmt.bytesPerPixel);
		return true;
	}

	// Generic conversion with a mask and no color key.
	return crossBlitHelper<false, true>(dst, src, mask, w, h, srcFmt, dstFmt, srcPitch, dstPitch, maskPitch, 0);
}
namespace {
// Expands a w x h rect of 1-byte source pixels into DstSize-byte destination
// pixels by looking each source byte up in map.
//
// Template parameters:
//  DstColor/DstSize  integer type used to store a destination pixel and the
//                    pixel size in bytes; when they differ the pixel is
//                    stored with WRITE_UINT24.
//  backward          when true, all pointers are stepped towards lower
//                    addresses, so dst/src/mask must point at the LAST pixel
//                    of the rect on entry.
//  hasKey            when true, source bytes equal to key are not written.
//  hasMask           when true, pixels whose mask byte is 0 are not written.
//                    mask is never dereferenced when this is false.
template<typename DstColor, int DstSize, bool backward, bool hasKey, bool hasMask>
inline void crossBlitMapLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
							  const uint srcDelta, const uint dstDelta, const uint maskDelta, const uint32 *map, const uint32 key) {
	for (uint row = 0; row != h; ++row) {
		for (uint column = 0; column != w; ++column) {
			const byte index = *src;

			// A pixel is left untouched when it matches the key or when its
			// mask byte is zero.
			const bool skip = (hasKey && index == key) || (hasMask && *mask == 0);
			if (!skip) {
				const uint32 mapped = map[index];
				if (DstSize == sizeof(DstColor))
					*(DstColor *)dst = mapped;
				else
					WRITE_UINT24(dst, mapped);
			}

			// Move on to the neighbouring pixel.
			if (backward) {
				--src;
				dst -= DstSize;
				if (hasMask)
					--mask;
			} else {
				++src;
				dst += DstSize;
				if (hasMask)
					++mask;
			}
		}

		// Skip the row padding to reach the next row.
		if (backward) {
			src -= srcDelta;
			dst -= dstDelta;
			if (hasMask)
				mask -= maskDelta;
		} else {
			src += srcDelta;
			dst += dstDelta;
			if (hasMask)
				mask += maskDelta;
		}
	}
}
// Picks the crossBlitMapLogic instantiation for the requested destination
// pixel size and runs it over the w x h rect.
//
// Returns false, without writing anything, when bytesPerPixel is not in the
// range 1..4; returns true otherwise.
template<bool hasKey, bool hasMask>
inline bool crossBlitMapHelperLogic(byte *dst, const byte *src, const byte *mask, const uint w, const uint h,
									const uint bytesPerPixel, const uint32 *map,
									const uint srcPitch, const uint dstPitch, const uint maskPitch,
									const uint32 key) {
	// Bytes to skip at the end of every row (the source is 1 byte per pixel).
	// Faster, but larger, to provide optimized handling for each case.
	const uint srcDelta = srcPitch - w;
	const uint dstDelta = dstPitch - w * bytesPerPixel;
	const uint maskDelta = hasMask ? (maskPitch - w) : 0;

	switch (bytesPerPixel) {
	case 1:
		// Same pixel size on both sides: a plain front-to-back pass.
		crossBlitMapLogic<uint8, 1, false, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
		return true;
	case 2:
	case 3:
	case 4:
		break;
	default:
		return false;
	}

	// The destination pixel is wider than the source pixel, so the rect is
	// processed from bottom right to top left. When source and destination
	// share the same memory buffer, a front-to-back pass would overwrite
	// source pixels that have not been converted yet.
	// Move every pointer to the last pixel of the rect first.
	dst += h * dstPitch - dstDelta - bytesPerPixel;
	src += h * srcPitch - srcDelta - 1;
	if (hasMask)
		mask += h * maskPitch - maskDelta - 1;

	if (bytesPerPixel == 2)
		crossBlitMapLogic<uint16, 2, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
	else if (bytesPerPixel == 3)
		crossBlitMapLogic<uint8, 3, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);
	else
		crossBlitMapLogic<uint32, 4, true, hasKey, hasMask>(dst, src, mask, w, h, srcDelta, dstDelta, maskDelta, map, key);

	return true;
}
} // End of anonymous namespace
// Blits a w x h rect of 1-byte pixels from src to dst, replacing each source
// byte with its entry in map, stored as a bytesPerPixel-wide pixel.
//
// Returns false when bytesPerPixel is zero or not supported by the helper.
bool crossBlitMap(byte *dst, const byte *src,
				  const uint dstPitch, const uint srcPitch,
				  const uint w, const uint h,
				  const uint bytesPerPixel, const uint32 *map) {
	// A zero-sized destination pixel cannot be written.
	if (bytesPerPixel == 0)
		return false;

	// No mask and no color key are used by this variant.
	const byte *const noMask = nullptr;
	const uint noMaskPitch = 0;
	const uint32 noKey = 0;
	return crossBlitMapHelperLogic<false, false>(dst, src, noMask, w, h, bytesPerPixel, map, srcPitch, dstPitch, noMaskPitch, noKey);
}
// Blits a w x h rect of 1-byte pixels from src to dst, replacing each source
// byte with its entry in map, stored as a bytesPerPixel-wide pixel. Source
// bytes equal to key are treated as transparent and not written.
//
// Returns false when bytesPerPixel is zero or not supported by the helper.
bool crossKeyBlitMap(byte *dst, const byte *src,
					 const uint dstPitch, const uint srcPitch,
					 const uint w, const uint h,
					 const uint bytesPerPixel, const uint32 *map, const uint32 key) {
	// A zero-sized destination pixel cannot be written.
	if (bytesPerPixel == 0)
		return false;

	// No mask is used by this variant.
	const byte *const noMask = nullptr;
	const uint noMaskPitch = 0;
	return crossBlitMapHelperLogic<true, false>(dst, src, noMask, w, h, bytesPerPixel, map, srcPitch, dstPitch, noMaskPitch, key);
}
// Blits a w x h rect of 1-byte pixels from src to dst, replacing each source
// byte with its entry in map, stored as a bytesPerPixel-wide pixel. Pixels
// whose byte in mask (rows maskPitch bytes apart) is zero are not written.
//
// Returns false when bytesPerPixel is zero or not supported by the helper.
bool crossMaskBlitMap(byte *dst, const byte *src, const byte *mask,
					  const uint dstPitch, const uint srcPitch, const uint maskPitch,
					  const uint w, const uint h,
					  const uint bytesPerPixel, const uint32 *map) {
	// A zero-sized destination pixel cannot be written.
	if (bytesPerPixel == 0)
		return false;

	// No color key is used by this variant.
	const uint32 noKey = 0;
	return crossBlitMapHelperLogic<false, true>(dst, src, mask, w, h, bytesPerPixel, map, srcPitch, dstPitch, maskPitch, noKey);
}
} // End of namespace Graphics