summaryrefslogtreecommitdiff
path: root/fb/fbmmx.c
diff options
context:
space:
mode:
Diffstat (limited to 'fb/fbmmx.c')
-rw-r--r--fb/fbmmx.c880
1 files changed, 684 insertions, 196 deletions
diff --git a/fb/fbmmx.c b/fb/fbmmx.c
index 5bbede10b..0e9074648 100644
--- a/fb/fbmmx.c
+++ b/fb/fbmmx.c
@@ -85,30 +85,40 @@
typedef unsigned long long ullong;
+#ifdef __GNUC__
+typedef ullong mmxdatafield;
+#endif
+#ifdef _MSC_VER
+typedef unsigned __int64 ullong;
+typedef __m64 mmxdatafield;
+#endif
+
typedef struct
{
- ullong mmx_4x00ff;
- ullong mmx_4x0080;
- ullong mmx_565_rgb;
- ullong mmx_565_unpack_multiplier;
- ullong mmx_565_r;
- ullong mmx_565_g;
- ullong mmx_565_b;
- ullong mmx_mask_0;
- ullong mmx_mask_1;
- ullong mmx_mask_2;
- ullong mmx_mask_3;
- ullong mmx_full_alpha;
- ullong mmx_ffff0000ffff0000;
- ullong mmx_0000ffff00000000;
- ullong mmx_000000000000ffff;
+ mmxdatafield mmx_4x00ff;
+ mmxdatafield mmx_4x0080;
+ mmxdatafield mmx_565_rgb;
+ mmxdatafield mmx_565_unpack_multiplier;
+ mmxdatafield mmx_565_r;
+ mmxdatafield mmx_565_g;
+ mmxdatafield mmx_565_b;
+ mmxdatafield mmx_mask_0;
+ mmxdatafield mmx_mask_1;
+ mmxdatafield mmx_mask_2;
+ mmxdatafield mmx_mask_3;
+ mmxdatafield mmx_full_alpha;
+ mmxdatafield mmx_ffff0000ffff0000;
+ mmxdatafield mmx_0000ffff00000000;
+ mmxdatafield mmx_000000000000ffff;
} MMXData;
static const MMXData c =
{
+#ifdef __GNUC__
.mmx_4x00ff = 0x00ff00ff00ff00ffULL,
.mmx_4x0080 = 0x0080008000800080ULL,
.mmx_565_rgb = 0x000001f0003f001fULL,
+ .mmx_565_unpack_multiplier = 0x0000008404100840ULL,
.mmx_565_r = 0x000000f800000000ULL,
.mmx_565_g = 0x0000000000fc0000ULL,
.mmx_565_b = 0x00000000000000f8ULL,
@@ -117,15 +127,42 @@ static const MMXData c =
.mmx_mask_2 = 0xffff0000ffffffffULL,
.mmx_mask_3 = 0x0000ffffffffffffULL,
.mmx_full_alpha = 0x00ff000000000000ULL,
- .mmx_565_unpack_multiplier = 0x0000008404100840ULL,
.mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL,
.mmx_0000ffff00000000 = 0x0000ffff00000000ULL,
.mmx_000000000000ffff = 0x000000000000ffffULL,
+#endif
+#ifdef _MSC_VER
+ { 0x00ff00ff00ff00ffUI64 },
+ { 0x0080008000800080UI64 },
+ { 0x000001f0003f001fUI64 },
+ { 0x0000008404100840UI64 },
+ { 0x000000f800000000UI64 },
+ { 0x0000000000fc0000UI64 },
+ { 0x00000000000000f8UI64 },
+ { 0xffffffffffff0000UI64 },
+ { 0xffffffff0000ffffUI64 },
+ { 0xffff0000ffffffffUI64 },
+ { 0x0000ffffffffffffUI64 },
+ { 0x00ff000000000000UI64 },
+ { 0xffff0000ffff0000UI64 },
+ { 0x0000ffff00000000UI64 },
+ { 0x000000000000ffffUI64 },
+#endif
};
+#ifdef _MSC_VER
+#undef inline
+#define inline __forceinline
+#endif
+
+#ifdef __GNUC__
#define MC(x) ((__m64) c.mmx_##x)
+#endif
+#ifdef _MSC_VER
+#define MC(x) c.mmx_##x
+#endif
-static __inline__ __m64
+static inline __m64
shift (__m64 v, int s)
{
if (s > 0)
@@ -136,13 +173,13 @@ shift (__m64 v, int s)
return v;
}
-static __inline__ __m64
+static inline __m64
negate (__m64 mask)
{
return _mm_xor_si64 (mask, MC(4x00ff));
}
-static __inline__ __m64
+static inline __m64
pix_multiply (__m64 a, __m64 b)
{
__m64 res;
@@ -155,7 +192,7 @@ pix_multiply (__m64 a, __m64 b)
return res;
}
-static __inline__ __m64
+static inline __m64
pix_add (__m64 a, __m64 b)
{
return _mm_adds_pu8 (a, b);
@@ -163,19 +200,19 @@ pix_add (__m64 a, __m64 b)
#ifdef USE_SSE
-static __inline__ __m64
+static inline __m64
expand_alpha (__m64 pixel)
{
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
}
-static __inline__ __m64
+static inline __m64
expand_alpha_rev (__m64 pixel)
{
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
}
-static __inline__ __m64
+static inline __m64
invert_colors (__m64 pixel)
{
return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
@@ -183,7 +220,7 @@ invert_colors (__m64 pixel)
#else
-static __inline__ __m64
+static inline __m64
expand_alpha (__m64 pixel)
{
__m64 t1, t2;
@@ -197,7 +234,7 @@ expand_alpha (__m64 pixel)
return t1;
}
-static __inline__ __m64
+static inline __m64
expand_alpha_rev (__m64 pixel)
{
__m64 t1, t2;
@@ -214,7 +251,7 @@ expand_alpha_rev (__m64 pixel)
return t1;
}
-static __inline__ __m64
+static inline __m64
invert_colors (__m64 pixel)
{
__m64 x, y, z;
@@ -236,13 +273,13 @@ invert_colors (__m64 pixel)
#endif
-static __inline__ __m64
+static inline __m64
over (__m64 src, __m64 srca, __m64 dest)
{
return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
}
-static __inline__ __m64
+static inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
__m64 srca = expand_alpha (src);
@@ -251,14 +288,15 @@ over_rev_non_pre (__m64 src, __m64 dest)
return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
}
-static __inline__ __m64
+static inline __m64
in (__m64 src,
__m64 mask)
{
return pix_multiply (src, mask);
}
-static __inline__ __m64
+#ifndef _MSC_VER
+static inline __m64
in_over (__m64 src,
__m64 srca,
__m64 mask,
@@ -266,20 +304,23 @@ in_over (__m64 src,
{
return over(in(src, mask), pix_multiply(srca, mask), dest);
}
+#else
+#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest)
+#endif
-static __inline__ __m64
+static inline __m64
load8888 (CARD32 v)
{
return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
}
-static __inline__ __m64
+static inline __m64
pack8888 (__m64 lo, __m64 hi)
{
return _mm_packs_pu16 (lo, hi);
}
-static __inline__ CARD32
+static inline CARD32
store8888 (__m64 v)
{
return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
@@ -299,7 +340,7 @@ store8888 (__m64 v)
* Note the trick here - the top word is shifted by another nibble to
* avoid it bumping into the middle word
*/
-static __inline__ __m64
+static inline __m64
expand565 (__m64 pixel, int pos)
{
__m64 p = pixel;
@@ -319,7 +360,7 @@ expand565 (__m64 pixel, int pos)
return _mm_srli_pi16 (pixel, 8);
}
-static __inline__ __m64
+static inline __m64
expand8888 (__m64 in, int pos)
{
if (pos == 0)
@@ -328,7 +369,7 @@ expand8888 (__m64 in, int pos)
return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
}
-static __inline__ __m64
+static inline __m64
pack565 (__m64 pixel, __m64 target, int pos)
{
__m64 p = pixel;
@@ -358,20 +399,28 @@ pack565 (__m64 pixel, __m64 target, int pos)
return _mm_or_si64 (b, p);
}
-static __inline__ __m64
+#ifndef _MSC_VER
+static inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
- x = _mm_mullo_pi16 (x, a);
- y = _mm_mullo_pi16 (y, b);
- x = _mm_srli_pi16(x, 1);
- y = _mm_srli_pi16(y, 1);
- x = _mm_adds_pu16 (x, y);
- x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
+ x = _mm_mullo_pi16 (x, a);
+ y = _mm_mullo_pi16 (y, b);
x = _mm_adds_pu16 (x, MC(4x0080));
- x = _mm_srli_pi16 (x, 7);
+ x = _mm_adds_pu16 (x, y);
+ x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8));
+ x = _mm_srli_pi16 (x, 8);
return x;
}
+#else
+#define pix_add_mul(x, a, y, b) \
+( x = _mm_mullo_pi16 (x, a), \
+ y = _mm_mullo_pi16 (y, b), \
+ x = _mm_adds_pu16 (x, MC(4x0080)), \
+ x = _mm_adds_pu16 (x, y), \
+ x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \
+ _mm_srli_pi16 (x, 8) )
+#endif
/* --------------- MMX code patch for fbcompose.c --------------------- */
@@ -590,8 +639,8 @@ mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
CARD32 da = ~d >> 24;
if (sa > da) {
- __m64 msa = load8888(FbIntDiv(da, sa));
- msa = expand_alpha_rev(msa);
+ __m64 msa = load8888(FbIntDiv(da, sa) << 24);
+ msa = expand_alpha(msa);
ms = pix_multiply(ms, msa);
}
md = pix_add(md, ms);
@@ -1140,23 +1189,22 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op,
{
CARD32 *dstLine, *dst;
CARD32 *srcLine, *src;
- CARD8 *maskLine;
CARD32 mask;
__m64 vmask;
- FbStride dstStride, srcStride, maskStride;
+ FbStride dstStride, srcStride;
CARD16 w;
__m64 srca;
-
+
CHECKPOINT();
-
+
fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
- fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
- mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
+ fbComposeGetSolid (pMask, mask, pDst->format);
+ mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
vmask = load8888 (mask);
srca = MC(4x00ff);
-
+
while (height--)
{
dst = dstLine;
@@ -1169,9 +1217,93 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op,
{
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
-
+
+ *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+
+ w--;
+ dst++;
+ src++;
+ }
+
+ while (w >= 2)
+ {
+ __m64 vs = *(__m64 *)src;
+ __m64 vd = *(__m64 *)dst;
+ __m64 vsrc0 = expand8888 (vs, 0);
+ __m64 vsrc1 = expand8888 (vs, 1);
+
+ *(__m64 *)dst = pack8888 (
+ in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
+ in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
+
+ w -= 2;
+ dst += 2;
+ src += 2;
+ }
+
+ while (w)
+ {
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);
+
+ *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
+
+ w--;
+ dst++;
+ src++;
+ }
+ }
+
+ _mm_empty();
+}
+
+void
+fbCompositeSrc_x888x8x8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 *dstLine, *dst;
+ CARD32 *srcLine, *src;
+ CARD32 mask;
+ __m64 vmask;
+ FbStride dstStride, srcStride;
+ CARD16 w;
+ __m64 srca;
+
+ CHECKPOINT();
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
+ fbComposeGetSolid (pMask, mask, pDst->format);
+
+ mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
+ vmask = load8888 (mask);
+ srca = MC(4x00ff);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+
+ while (w && (unsigned long)dst & 7)
+ {
+ __m64 s = load8888 (*src);
+ __m64 d = load8888 (*dst);
+
*dst = store8888 (in_over (s, srca, vmask, d));
-
+
w--;
dst++;
src++;
@@ -1197,39 +1329,39 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op,
__m64 vs6 = *(__m64 *)(src + 12);
__m64 vs7 = *(__m64 *)(src + 14);
- vd0 = (__m64)pack8888 (
+ vd0 = pack8888 (
in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
-
- vd1 = (__m64)pack8888 (
+
+ vd1 = pack8888 (
in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
-
- vd2 = (__m64)pack8888 (
+
+ vd2 = pack8888 (
in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
-
- vd3 = (__m64)pack8888 (
+
+ vd3 = pack8888 (
in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
-
- vd4 = (__m64)pack8888 (
+
+ vd4 = pack8888 (
in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
-
- vd5 = (__m64)pack8888 (
+
+ vd5 = pack8888 (
in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
-
- vd6 = (__m64)pack8888 (
+
+ vd6 = pack8888 (
in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
-
- vd7 = (__m64)pack8888 (
+
+ vd7 = pack8888 (
in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
- *(__m64 *)(dst + 0) = vd0;
+ *(__m64 *)(dst + 0) = vd0;
*(__m64 *)(dst + 2) = vd1;
*(__m64 *)(dst + 4) = vd2;
*(__m64 *)(dst + 6) = vd3;
@@ -1237,26 +1369,26 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op,
*(__m64 *)(dst + 10) = vd5;
*(__m64 *)(dst + 12) = vd6;
*(__m64 *)(dst + 14) = vd7;
-
+
w -= 16;
dst += 16;
src += 16;
}
-
+
while (w)
{
__m64 s = load8888 (*src);
__m64 d = load8888 (*dst);
-
+
*dst = store8888 (in_over (s, srca, vmask, d));
-
+
w--;
dst++;
src++;
}
}
- _mm_empty();
+ _mm_empty();
}
void
@@ -1552,6 +1684,256 @@ fbCompositeSolidMask_nx8x8888mmx (CARD8 op,
}
+Bool
+fbSolidFillmmx (DrawablePtr pDraw,
+ int x,
+ int y,
+ int width,
+ int height,
+ FbBits xor)
+{
+ FbStride stride;
+ int bpp;
+ ullong fill;
+ __m64 vfill;
+ CARD32 byte_width;
+ CARD8 *byte_line;
+ FbBits *bits;
+ int xoff, yoff;
+ __m64 v1, v2, v3, v4, v5, v6, v7;
+
+ CHECKPOINT();
+
+ fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
+
+ if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
+ return FALSE;
+
+ if (bpp != 16 && bpp != 32)
+ return FALSE;
+
+ if (bpp == 16)
+ {
+ stride = stride * sizeof (FbBits) / 2;
+ byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff));
+ byte_width = 2 * width;
+ stride *= 2;
+ }
+ else
+ {
+ stride = stride * sizeof (FbBits) / 4;
+ byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff));
+ byte_width = 4 * width;
+ stride *= 4;
+ }
+
+ fill = ((ullong)xor << 32) | xor;
+ vfill = (__m64)fill;
+
+ __asm__ (
+ "movq %7, %0\n"
+ "movq %7, %1\n"
+ "movq %7, %2\n"
+ "movq %7, %3\n"
+ "movq %7, %4\n"
+ "movq %7, %5\n"
+ "movq %7, %6\n"
+ : "=y" (v1), "=y" (v2), "=y" (v3),
+ "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
+ : "y" (vfill));
+
+ while (height--)
+ {
+ int w;
+ CARD8 *d = byte_line;
+ byte_line += stride;
+ w = byte_width;
+
+ while (w >= 2 && ((unsigned long)d & 3))
+ {
+ *(CARD16 *)d = xor;
+ w -= 2;
+ d += 2;
+ }
+
+ while (w >= 4 && ((unsigned long)d & 7))
+ {
+ *(CARD32 *)d = xor;
+
+ w -= 4;
+ d += 4;
+ }
+
+ while (w >= 64)
+ {
+ __asm__ (
+ "movq %1, (%0)\n"
+ "movq %2, 8(%0)\n"
+ "movq %3, 16(%0)\n"
+ "movq %4, 24(%0)\n"
+ "movq %5, 32(%0)\n"
+ "movq %6, 40(%0)\n"
+ "movq %7, 48(%0)\n"
+ "movq %8, 56(%0)\n"
+ :
+ : "r" (d),
+ "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
+ "y" (v4), "y" (v5), "y" (v6), "y" (v7)
+ : "memory");
+
+ w -= 64;
+ d += 64;
+ }
+
+ while (w >= 4)
+ {
+ *(CARD32 *)d = xor;
+
+ w -= 4;
+ d += 4;
+ }
+ if (w >= 2)
+ {
+ *(CARD16 *)d = xor;
+ w -= 2;
+ d += 2;
+ }
+ }
+
+ _mm_empty();
+ return TRUE;
+}
+
+void
+fbCompositeSolidMaskSrc_nx8x8888mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD32 src, srca;
+ CARD32 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
+ FbStride dstStride, maskStride;
+ CARD16 w;
+ __m64 vsrc, vsrca;
+ ullong srcsrc;
+
+ CHECKPOINT();
+
+ fbComposeGetSolid(pSrc, src, pDst->format);
+
+ srca = src >> 24;
+ if (srca == 0)
+ {
+ fbSolidFillmmx (pDst->pDrawable, xDst, yDst, width, height, 0);
+ return;
+ }
+
+ srcsrc = (ullong)src << 32 | src;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ vsrc = load8888 (src);
+ vsrca = expand_alpha (vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+ CHECKPOINT();
+
+ while (w && (unsigned long)dst & 7)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ __m64 vdest = in(vsrc, expand_alpha_rev ((__m64)m));
+ *dst = store8888(vdest);
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+
+ CHECKPOINT();
+
+ while (w >= 2)
+ {
+ ullong m0, m1;
+ m0 = *mask;
+ m1 = *(mask + 1);
+
+ if (srca == 0xff && (m0 & m1) == 0xff)
+ {
+ *(ullong *)dst = srcsrc;
+ }
+ else if (m0 | m1)
+ {
+ __m64 vdest;
+ __m64 dest0, dest1;
+
+ vdest = *(__m64 *)dst;
+
+ dest0 = in(vsrc, expand_alpha_rev ((__m64)m0));
+ dest1 = in(vsrc, expand_alpha_rev ((__m64)m1));
+
+ *(__m64 *)dst = pack8888(dest0, dest1);
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ mask += 2;
+ dst += 2;
+ w -= 2;
+ }
+
+ CHECKPOINT();
+
+ while (w)
+ {
+ ullong m = *mask;
+
+ if (m)
+ {
+ __m64 vdest = load8888(*dst);
+ vdest = in(vsrc, expand_alpha_rev ((__m64)m));
+ *dst = store8888(vdest);
+ }
+ else
+ {
+ *dst = 0;
+ }
+
+ w--;
+ mask++;
+ dst++;
+ }
+ }
+
+ _mm_empty();
+}
+
void
fbCompositeSolidMask_nx8x0565mmx (CARD8 op,
PicturePtr pSrc,
@@ -2003,6 +2385,232 @@ fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
}
void
+fbCompositeIn_nx8x8mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD8 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
+ FbStride dstStride, maskStride;
+ CARD16 w;
+ CARD32 src;
+ CARD8 sa;
+ __m64 vsrc, vsrca;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ fbComposeGetSolid(pSrc, src, pDst->format);
+
+ sa = src >> 24;
+ if (sa == 0)
+ return;
+
+ vsrc = load8888(src);
+ vsrca = expand_alpha(vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+ if ((((unsigned long)pDst & 3) == 0) &&
+ (((unsigned long)pSrc & 3) == 0))
+ {
+ while (w >= 4)
+ {
+ CARD32 m;
+ __m64 vmask;
+ __m64 vdest;
+
+ m = 0;
+
+ vmask = load8888 (*(CARD32 *)mask);
+ vdest = load8888 (*(CARD32 *)dst);
+
+ *(CARD32 *)dst = store8888 (in (in (vsrca, vmask), vdest));
+
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+ }
+
+ while (w--)
+ {
+ CARD16 tmp;
+ CARD8 a;
+ CARD32 m, d;
+ CARD32 r;
+
+ a = *mask++;
+ d = *dst;
+
+ m = FbInU (sa, 0, a, tmp);
+ r = FbInU (m, 0, d, tmp);
+
+ *dst++ = r;
+ }
+ }
+
+ _mm_empty();
+}
+
+void
+fbCompositeIn_8x8mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD8 *dstLine, *dst;
+ CARD8 *srcLine, *src;
+ FbStride srcStride, dstStride;
+ CARD16 w;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
+ fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ src = srcLine;
+ srcLine += srcStride;
+ w = width;
+
+ if ((((unsigned long)pDst & 3) == 0) &&
+ (((unsigned long)pSrc & 3) == 0))
+ {
+ while (w >= 4)
+ {
+ CARD32 *s = (CARD32 *)src;
+ CARD32 *d = (CARD32 *)dst;
+
+ *d = store8888 (in (load8888 (*s), load8888 (*d)));
+
+ w -= 4;
+ dst += 4;
+ src += 4;
+ }
+ }
+
+ while (w--)
+ {
+ CARD8 s, d;
+ CARD16 tmp;
+
+ s = *src;
+ d = *dst;
+
+ *dst = FbInU (s, 0, d, tmp);
+
+ src++;
+ dst++;
+ }
+ }
+
+ _mm_empty ();
+}
+
+void
+fbCompositeSrcAdd_8888x8x8mmx (CARD8 op,
+ PicturePtr pSrc,
+ PicturePtr pMask,
+ PicturePtr pDst,
+ INT16 xSrc,
+ INT16 ySrc,
+ INT16 xMask,
+ INT16 yMask,
+ INT16 xDst,
+ INT16 yDst,
+ CARD16 width,
+ CARD16 height)
+{
+ CARD8 *dstLine, *dst;
+ CARD8 *maskLine, *mask;
+ FbStride dstStride, maskStride;
+ CARD16 w;
+ CARD32 src;
+ CARD8 sa;
+ __m64 vsrc, vsrca;
+
+ fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1);
+ fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
+
+ fbComposeGetSolid(pSrc, src, pDst->format);
+
+ sa = src >> 24;
+ if (sa == 0)
+ return;
+
+ vsrc = load8888(src);
+ vsrca = expand_alpha(vsrc);
+
+ while (height--)
+ {
+ dst = dstLine;
+ dstLine += dstStride;
+ mask = maskLine;
+ maskLine += maskStride;
+ w = width;
+
+ if ((((unsigned long)pMask & 3) == 0) &&
+ (((unsigned long)pDst & 3) == 0))
+ {
+ while (w >= 4)
+ {
+ __m64 vmask = load8888 (*(CARD32 *)mask);
+ __m64 vdest = load8888 (*(CARD32 *)dst);
+
+ *(CARD32 *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
+
+ w -= 4;
+ dst += 4;
+ mask += 4;
+ }
+ }
+
+ while (w--)
+ {
+ CARD16 tmp;
+ CARD16 a;
+ CARD32 m, d;
+ CARD32 r;
+
+ a = *mask++;
+ d = *dst;
+
+ m = FbInU (sa, 0, a, tmp);
+ r = FbAdd (m, d, 0, tmp);
+
+ *dst++ = r;
+ }
+ }
+
+ _mm_empty();
+}
+
+void
fbCompositeSrcAdd_8000x8000mmx (CARD8 op,
PicturePtr pSrc,
PicturePtr pMask,
@@ -2135,126 +2743,6 @@ fbCompositeSrcAdd_8888x8888mmx (CARD8 op,
}
Bool
-fbSolidFillmmx (DrawablePtr pDraw,
- int x,
- int y,
- int width,
- int height,
- FbBits xor)
-{
- FbStride stride;
- int bpp;
- ullong fill;
- __m64 vfill;
- CARD32 byte_width;
- CARD8 *byte_line;
- FbBits *bits;
- int xoff, yoff;
- __m64 v1, v2, v3, v4, v5, v6, v7;
-
- CHECKPOINT();
-
- fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
-
- if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
- return FALSE;
-
- if (bpp != 16 && bpp != 32)
- return FALSE;
-
- if (bpp == 16)
- {
- stride = stride * sizeof (FbBits) / 2;
- byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff));
- byte_width = 2 * width;
- stride *= 2;
- }
- else
- {
- stride = stride * sizeof (FbBits) / 4;
- byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff));
- byte_width = 4 * width;
- stride *= 4;
- }
-
- fill = ((ullong)xor << 32) | xor;
- vfill = (__m64)fill;
-
- __asm__ (
- "movq %7, %0\n"
- "movq %7, %1\n"
- "movq %7, %2\n"
- "movq %7, %3\n"
- "movq %7, %4\n"
- "movq %7, %5\n"
- "movq %7, %6\n"
- : "=y" (v1), "=y" (v2), "=y" (v3),
- "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7)
- : "y" (vfill));
-
- while (height--)
- {
- int w;
- CARD8 *d = byte_line;
- byte_line += stride;
- w = byte_width;
-
- while (w >= 2 && ((unsigned long)d & 3))
- {
- *(CARD16 *)d = xor;
- w -= 2;
- d += 2;
- }
-
- while (w >= 4 && ((unsigned long)d & 7))
- {
- *(CARD32 *)d = xor;
-
- w -= 4;
- d += 4;
- }
-
- while (w >= 64)
- {
- __asm__ (
- "movq %1, (%0)\n"
- "movq %2, 8(%0)\n"
- "movq %3, 16(%0)\n"
- "movq %4, 24(%0)\n"
- "movq %5, 32(%0)\n"
- "movq %6, 40(%0)\n"
- "movq %7, 48(%0)\n"
- "movq %8, 56(%0)\n"
- :
- : "r" (d),
- "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
- "y" (v4), "y" (v5), "y" (v6), "y" (v7)
- : "memory");
-
- w -= 64;
- d += 64;
- }
-
- while (w >= 4)
- {
- *(CARD32 *)d = xor;
-
- w -= 4;
- d += 4;
- }
- if (w >= 2)
- {
- *(CARD16 *)d = xor;
- w -= 2;
- d += 2;
- }
- }
-
- _mm_empty();
- return TRUE;
-}
-
-Bool
fbCopyAreammx (DrawablePtr pSrc,
DrawablePtr pDst,
int src_x,