diff options
Diffstat (limited to 'fb/fbmmx.c')
-rw-r--r-- | fb/fbmmx.c | 880 |
1 files changed, 684 insertions, 196 deletions
diff --git a/fb/fbmmx.c b/fb/fbmmx.c index 5bbede10b..0e9074648 100644 --- a/fb/fbmmx.c +++ b/fb/fbmmx.c @@ -85,30 +85,40 @@ typedef unsigned long long ullong; +#ifdef __GNUC__ +typedef ullong mmxdatafield; +#endif +#ifdef _MSC_VER +typedef unsigned __int64 ullong; +typedef __m64 mmxdatafield; +#endif + typedef struct { - ullong mmx_4x00ff; - ullong mmx_4x0080; - ullong mmx_565_rgb; - ullong mmx_565_unpack_multiplier; - ullong mmx_565_r; - ullong mmx_565_g; - ullong mmx_565_b; - ullong mmx_mask_0; - ullong mmx_mask_1; - ullong mmx_mask_2; - ullong mmx_mask_3; - ullong mmx_full_alpha; - ullong mmx_ffff0000ffff0000; - ullong mmx_0000ffff00000000; - ullong mmx_000000000000ffff; + mmxdatafield mmx_4x00ff; + mmxdatafield mmx_4x0080; + mmxdatafield mmx_565_rgb; + mmxdatafield mmx_565_unpack_multiplier; + mmxdatafield mmx_565_r; + mmxdatafield mmx_565_g; + mmxdatafield mmx_565_b; + mmxdatafield mmx_mask_0; + mmxdatafield mmx_mask_1; + mmxdatafield mmx_mask_2; + mmxdatafield mmx_mask_3; + mmxdatafield mmx_full_alpha; + mmxdatafield mmx_ffff0000ffff0000; + mmxdatafield mmx_0000ffff00000000; + mmxdatafield mmx_000000000000ffff; } MMXData; static const MMXData c = { +#ifdef __GNUC__ .mmx_4x00ff = 0x00ff00ff00ff00ffULL, .mmx_4x0080 = 0x0080008000800080ULL, .mmx_565_rgb = 0x000001f0003f001fULL, + .mmx_565_unpack_multiplier = 0x0000008404100840ULL, .mmx_565_r = 0x000000f800000000ULL, .mmx_565_g = 0x0000000000fc0000ULL, .mmx_565_b = 0x00000000000000f8ULL, @@ -117,15 +127,42 @@ static const MMXData c = .mmx_mask_2 = 0xffff0000ffffffffULL, .mmx_mask_3 = 0x0000ffffffffffffULL, .mmx_full_alpha = 0x00ff000000000000ULL, - .mmx_565_unpack_multiplier = 0x0000008404100840ULL, .mmx_ffff0000ffff0000 = 0xffff0000ffff0000ULL, .mmx_0000ffff00000000 = 0x0000ffff00000000ULL, .mmx_000000000000ffff = 0x000000000000ffffULL, +#endif +#ifdef _MSC_VER + { 0x00ff00ff00ff00ffUI64 }, + { 0x0080008000800080UI64 }, + { 0x000001f0003f001fUI64 }, + { 0x0000008404100840UI64 }, + { 0x000000f800000000UI64 }, + { 0x0000000000fc0000UI64 }, + { 0x00000000000000f8UI64 }, + { 0xffffffffffff0000UI64 }, + { 0xffffffff0000ffffUI64 }, + { 0xffff0000ffffffffUI64 }, + { 0x0000ffffffffffffUI64 }, + { 0x00ff000000000000UI64 }, + { 0xffff0000ffff0000UI64 }, + { 0x0000ffff00000000UI64 }, + { 0x000000000000ffffUI64 }, +#endif }; +#ifdef _MSC_VER +#undef inline +#define inline __forceinline +#endif + +#ifdef __GNUC__ #define MC(x) ((__m64) c.mmx_##x) +#endif +#ifdef _MSC_VER +#define MC(x) c.mmx_##x +#endif -static __inline__ __m64 +static inline __m64 shift (__m64 v, int s) { if (s > 0) @@ -136,13 +173,13 @@ shift (__m64 v, int s) return v; } -static __inline__ __m64 +static inline __m64 negate (__m64 mask) { return _mm_xor_si64 (mask, MC(4x00ff)); } -static __inline__ __m64 +static inline __m64 pix_multiply (__m64 a, __m64 b) { __m64 res; @@ -155,7 +192,7 @@ pix_multiply (__m64 a, __m64 b) return res; } -static __inline__ __m64 +static inline __m64 pix_add (__m64 a, __m64 b) { return _mm_adds_pu8 (a, b); @@ -163,19 +200,19 @@ pix_add (__m64 a, __m64 b) #ifdef USE_SSE -static __inline__ __m64 +static inline __m64 expand_alpha (__m64 pixel) { return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3)); } -static __inline__ __m64 +static inline __m64 expand_alpha_rev (__m64 pixel) { return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0)); } -static __inline__ __m64 +static inline __m64 invert_colors (__m64 pixel) { return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2)); @@ -183,7 +220,7 @@ invert_colors (__m64 pixel) #else -static __inline__ __m64 +static inline __m64 expand_alpha (__m64 pixel) { __m64 t1, t2; @@ -197,7 +234,7 @@ expand_alpha (__m64 pixel) return t1; } -static __inline__ __m64 +static inline __m64 expand_alpha_rev (__m64 pixel) { __m64 t1, t2; @@ -214,7 +251,7 @@ expand_alpha_rev (__m64 pixel) return t1; } -static __inline__ __m64 +static inline __m64 invert_colors (__m64 pixel) { __m64 x, y, z; @@ -236,13 +273,13 @@ invert_colors (__m64 pixel) #endif -static __inline__ __m64 +static inline __m64 over (__m64 src, __m64 srca, __m64 dest) { return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca))); } -static __inline__ __m64 +static inline __m64 over_rev_non_pre (__m64 src, __m64 dest) { __m64 srca = expand_alpha (src); @@ -251,14 +288,15 @@ over_rev_non_pre (__m64 src, __m64 dest) return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest); } -static __inline__ __m64 +static inline __m64 in (__m64 src, __m64 mask) { return pix_multiply (src, mask); } -static __inline__ __m64 +#ifndef _MSC_VER +static inline __m64 in_over (__m64 src, __m64 srca, __m64 mask, @@ -266,20 +304,23 @@ in_over (__m64 src, { return over(in(src, mask), pix_multiply(srca, mask), dest); } +#else +#define in_over(src, srca, mask, dest) over(in(src, mask), pix_multiply(srca, mask), dest) +#endif -static __inline__ __m64 +static inline __m64 load8888 (CARD32 v) { return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64()); } -static __inline__ __m64 +static inline __m64 pack8888 (__m64 lo, __m64 hi) { return _mm_packs_pu16 (lo, hi); } -static __inline__ CARD32 +static inline CARD32 store8888 (__m64 v) { return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64())); @@ -299,7 +340,7 @@ store8888 (__m64 v) * Note the trick here - the top word is shifted by another nibble to * avoid it bumping into the middle word */ -static __inline__ __m64 +static inline __m64 expand565 (__m64 pixel, int pos) { __m64 p = pixel; @@ -319,7 +360,7 @@ expand565 (__m64 pixel, int pos) return _mm_srli_pi16 (pixel, 8); } -static __inline__ __m64 +static inline __m64 expand8888 (__m64 in, int pos) { if (pos == 0) @@ -328,7 +369,7 @@ expand8888 (__m64 in, int pos) return _mm_unpackhi_pi8 (in, _mm_setzero_si64()); } -static __inline__ __m64 +static inline __m64 pack565 (__m64 pixel, __m64 target, int pos) { __m64 p = pixel; @@ -358,20 +399,28 @@ pack565 (__m64 pixel, __m64 target, int pos) return _mm_or_si64 (b, p); } -static __inline__ __m64 +#ifndef _MSC_VER +static inline __m64 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) { - x = _mm_mullo_pi16 (x, a); - y = _mm_mullo_pi16 (y, b); - x = _mm_srli_pi16(x, 1); - y = _mm_srli_pi16(y, 1); - x = _mm_adds_pu16 (x, y); - x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); + x = _mm_mullo_pi16 (x, a); + y = _mm_mullo_pi16 (y, b); x = _mm_adds_pu16 (x, MC(4x0080)); - x = _mm_srli_pi16 (x, 7); + x = _mm_adds_pu16 (x, y); + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)); + x = _mm_srli_pi16 (x, 8); return x; } +#else +#define pix_add_mul(x, a, y, b) \ +( x = _mm_mullo_pi16 (x, a), \ + y = _mm_mullo_pi16 (y, b), \ + x = _mm_adds_pu16 (x, MC(4x0080)), \ + x = _mm_adds_pu16 (x, y), \ + x = _mm_adds_pu16 (x, _mm_srli_pi16 (x, 8)), \ + _mm_srli_pi16 (x, 8) ) +#endif /* --------------- MMX code patch for fbcompose.c --------------------- */ @@ -590,8 +639,8 @@ mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width) CARD32 da = ~d >> 24; if (sa > da) { - __m64 msa = load8888(FbIntDiv(da, sa)); - msa = expand_alpha_rev(msa); + __m64 msa = load8888(FbIntDiv(da, sa) << 24); + msa = expand_alpha(msa); ms = pix_multiply(ms, msa); } md = pix_add(md, ms); @@ -1140,23 +1189,22 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op, { CARD32 *dstLine, *dst; CARD32 *srcLine, *src; - CARD8 *maskLine; CARD32 mask; __m64 vmask; - FbStride dstStride, srcStride, maskStride; + FbStride dstStride, srcStride; CARD16 w; __m64 srca; - + CHECKPOINT(); - + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); - fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); - mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine; + fbComposeGetSolid (pMask, mask, pDst->format); + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; vmask = load8888 (mask); srca = MC(4x00ff); - + while (height--) { dst = dstLine; @@ -1169,9 +1217,93 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op, { __m64 s = load8888 (*src); __m64 d = load8888 (*dst); - + + *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + + w--; + dst++; + src++; + } + + while (w >= 2) + { + __m64 vs = *(__m64 *)src; + __m64 vd = *(__m64 *)dst; + __m64 vsrc0 = expand8888 (vs, 0); + __m64 vsrc1 = expand8888 (vs, 1); + + *(__m64 *)dst = pack8888 ( + in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), + in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); + + w -= 2; + dst += 2; + src += 2; + } + + while (w) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + + *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); + + w--; + dst++; + src++; + } + } + + _mm_empty(); +} + +void +fbCompositeSrc_x888x8x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 *dstLine, *dst; + CARD32 *srcLine, *src; + CARD32 mask; + __m64 vmask; + FbStride dstStride, srcStride; + CARD16 w; + __m64 srca; + + CHECKPOINT(); + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1); + fbComposeGetSolid (pMask, mask, pDst->format); + + mask = mask | mask >> 8 | mask >> 16 | mask >> 24; + vmask = load8888 (mask); + srca = MC(4x00ff); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + while (w && (unsigned long)dst & 7) + { + __m64 s = load8888 (*src); + __m64 d = load8888 (*dst); + *dst = store8888 (in_over (s, srca, vmask, d)); - + w--; dst++; src++; @@ -1197,39 +1329,39 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op, __m64 vs6 = *(__m64 *)(src + 12); __m64 vs7 = *(__m64 *)(src + 14); - vd0 = (__m64)pack8888 ( + vd0 = pack8888 ( in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); - - vd1 = (__m64)pack8888 ( + + vd1 = pack8888 ( in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); - - vd2 = (__m64)pack8888 ( + + vd2 = pack8888 ( in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); - - vd3 = (__m64)pack8888 ( + + vd3 = pack8888 ( in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); - - vd4 = (__m64)pack8888 ( + + vd4 = pack8888 ( in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); - - vd5 = (__m64)pack8888 ( + + vd5 = pack8888 ( in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); - - vd6 = (__m64)pack8888 ( + + vd6 = pack8888 ( in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); - - vd7 = (__m64)pack8888 ( + + vd7 = pack8888 ( in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); - *(__m64 *)(dst + 0) = vd0; + *(__m64 *)(dst + 0) = vd0; *(__m64 *)(dst + 2) = vd1; *(__m64 *)(dst + 4) = vd2; *(__m64 *)(dst + 6) = vd3; @@ -1237,26 +1369,26 @@ fbCompositeSrc_8888x8x8888mmx (CARD8 op, *(__m64 *)(dst + 10) = vd5; *(__m64 *)(dst + 12) = vd6; *(__m64 *)(dst + 14) = vd7; - + w -= 16; dst += 16; src += 16; } - + while (w) { __m64 s = load8888 (*src); __m64 d = load8888 (*dst); - + *dst = store8888 (in_over (s, srca, vmask, d)); - + w--; dst++; src++; } } - _mm_empty(); + _mm_empty(); } void @@ -1552,6 +1684,256 @@ fbCompositeSolidMask_nx8x8888mmx (CARD8 op, } +Bool +fbSolidFillmmx (DrawablePtr pDraw, + int x, + int y, + int width, + int height, + FbBits xor) +{ + FbStride stride; + int bpp; + ullong fill; + __m64 vfill; + CARD32 byte_width; + CARD8 *byte_line; + FbBits *bits; + int xoff, yoff; + __m64 v1, v2, v3, v4, v5, v6, v7; + + CHECKPOINT(); + + fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); + + if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) + return FALSE; + + if (bpp != 16 && bpp != 32) + return FALSE; + + if (bpp == 16) + { + stride = stride * sizeof (FbBits) / 2; + byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff)); + byte_width = 2 * width; + stride *= 2; + } + else + { + stride = stride * sizeof (FbBits) / 4; + byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff)); + byte_width = 4 * width; + stride *= 4; + } + + fill = ((ullong)xor << 32) | xor; + vfill = (__m64)fill; + + __asm__ ( + "movq %7, %0\n" + "movq %7, %1\n" + "movq %7, %2\n" + "movq %7, %3\n" + "movq %7, %4\n" + "movq %7, %5\n" + "movq %7, %6\n" + : "=y" (v1), "=y" (v2), "=y" (v3), + "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7) + : "y" (vfill)); + + while (height--) + { + int w; + CARD8 *d = byte_line; + byte_line += stride; + w = byte_width; + + while (w >= 2 && ((unsigned long)d & 3)) + { + *(CARD16 *)d = xor; + w -= 2; + d += 2; + } + + while (w >= 4 && ((unsigned long)d & 7)) + { + *(CARD32 *)d = xor; + + w -= 4; + d += 4; + } + + while (w >= 64) + { + __asm__ ( + "movq %1, (%0)\n" + "movq %2, 8(%0)\n" + "movq %3, 16(%0)\n" + "movq %4, 24(%0)\n" + "movq %5, 32(%0)\n" + "movq %6, 40(%0)\n" + "movq %7, 48(%0)\n" + "movq %8, 56(%0)\n" + : + : "r" (d), + "y" (vfill), "y" (v1), "y" (v2), "y" (v3), + "y" (v4), "y" (v5), "y" (v6), "y" (v7) + : "memory"); + + w -= 64; + d += 64; + } + + while (w >= 4) + { + *(CARD32 *)d = xor; + + w -= 4; + d += 4; + } + if (w >= 2) + { + *(CARD16 *)d = xor; + w -= 2; + d += 2; + } + } + + _mm_empty(); + return TRUE; +} + +void +fbCompositeSolidMaskSrc_nx8x8888mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD32 src, srca; + CARD32 *dstLine, *dst; + CARD8 *maskLine, *mask; + FbStride dstStride, maskStride; + CARD16 w; + __m64 vsrc, vsrca; + ullong srcsrc; + + CHECKPOINT(); + + fbComposeGetSolid(pSrc, src, pDst->format); + + srca = src >> 24; + if (srca == 0) + { + fbSolidFillmmx (pDst->pDrawable, xDst, yDst, width, height, 0); + return; + } + + srcsrc = (ullong)src << 32 | src; + + fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + vsrc = load8888 (src); + vsrca = expand_alpha (vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + CHECKPOINT(); + + while (w && (unsigned long)dst & 7) + { + ullong m = *mask; + + if (m) + { + __m64 vdest = in(vsrc, expand_alpha_rev ((__m64)m)); + *dst = store8888(vdest); + } + else + { + *dst = 0; + } + + w--; + mask++; + dst++; + } + + CHECKPOINT(); + + while (w >= 2) + { + ullong m0, m1; + m0 = *mask; + m1 = *(mask + 1); + + if (srca == 0xff && (m0 & m1) == 0xff) + { + *(ullong *)dst = srcsrc; + } + else if (m0 | m1) + { + __m64 vdest; + __m64 dest0, dest1; + + vdest = *(__m64 *)dst; + + dest0 = in(vsrc, expand_alpha_rev ((__m64)m0)); + dest1 = in(vsrc, expand_alpha_rev ((__m64)m1)); + + *(__m64 *)dst = pack8888(dest0, dest1); + } + else + { + *dst = 0; + } + + mask += 2; + dst += 2; + w -= 2; + } + + CHECKPOINT(); + + while (w) + { + ullong m = *mask; + + if (m) + { + __m64 vdest = load8888(*dst); + vdest = in(vsrc, expand_alpha_rev ((__m64)m)); + *dst = store8888(vdest); + } + else + { + *dst = 0; + } + + w--; + mask++; + dst++; + } + } + + _mm_empty(); +} + void fbCompositeSolidMask_nx8x0565mmx (CARD8 op, PicturePtr pSrc, @@ -2003,6 +2385,232 @@ fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op, } void +fbCompositeIn_nx8x8mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD8 *dstLine, *dst; + CARD8 *maskLine, *mask; + FbStride dstStride, maskStride; + CARD16 w; + CARD32 src; + CARD8 sa; + __m64 vsrc, vsrca; + + fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + fbComposeGetSolid(pSrc, src, pDst->format); + + sa = src >> 24; + if (sa == 0) + return; + + vsrc = load8888(src); + vsrca = expand_alpha(vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + if ((((unsigned long)pDst & 3) == 0) && + (((unsigned long)pSrc & 3) == 0)) + { + while (w >= 4) + { + CARD32 m; + __m64 vmask; + __m64 vdest; + + m = 0; + + vmask = load8888 (*(CARD32 *)mask); + vdest = load8888 (*(CARD32 *)dst); + + *(CARD32 *)dst = store8888 (in (in (vsrca, vmask), vdest)); + + dst += 4; + mask += 4; + w -= 4; + } + } + + while (w--) + { + CARD16 tmp; + CARD8 a; + CARD32 m, d; + CARD32 r; + + a = *mask++; + d = *dst; + + m = FbInU (sa, 0, a, tmp); + r = FbInU (m, 0, d, tmp); + + *dst++ = r; + } + } + + _mm_empty(); +} + +void +fbCompositeIn_8x8mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD8 *dstLine, *dst; + CARD8 *srcLine, *src; + FbStride srcStride, dstStride; + CARD16 w; + + fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); + fbComposeGetStart (pSrc, xSrc, ySrc, CARD8, srcStride, srcLine, 1); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + src = srcLine; + srcLine += srcStride; + w = width; + + if ((((unsigned long)pDst & 3) == 0) && + (((unsigned long)pSrc & 3) == 0)) + { + while (w >= 4) + { + CARD32 *s = (CARD32 *)src; + CARD32 *d = (CARD32 *)dst; + + *d = store8888 (in (load8888 (*s), load8888 (*d))); + + w -= 4; + dst += 4; + src += 4; + } + } + + while (w--) + { + CARD8 s, d; + CARD16 tmp; + + s = *src; + d = *dst; + + *dst = FbInU (s, 0, d, tmp); + + src++; + dst++; + } + } + + _mm_empty (); +} + +void +fbCompositeSrcAdd_8888x8x8mmx (CARD8 op, + PicturePtr pSrc, + PicturePtr pMask, + PicturePtr pDst, + INT16 xSrc, + INT16 ySrc, + INT16 xMask, + INT16 yMask, + INT16 xDst, + INT16 yDst, + CARD16 width, + CARD16 height) +{ + CARD8 *dstLine, *dst; + CARD8 *maskLine, *mask; + FbStride dstStride, maskStride; + CARD16 w; + CARD32 src; + CARD8 sa; + __m64 vsrc, vsrca; + + fbComposeGetStart (pDst, xDst, yDst, CARD8, dstStride, dstLine, 1); + fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1); + + fbComposeGetSolid(pSrc, src, pDst->format); + + sa = src >> 24; + if (sa == 0) + return; + + vsrc = load8888(src); + vsrca = expand_alpha(vsrc); + + while (height--) + { + dst = dstLine; + dstLine += dstStride; + mask = maskLine; + maskLine += maskStride; + w = width; + + if ((((unsigned long)pMask & 3) == 0) && + (((unsigned long)pDst & 3) == 0)) + { + while (w >= 4) + { + __m64 vmask = load8888 (*(CARD32 *)mask); + __m64 vdest = load8888 (*(CARD32 *)dst); + + *(CARD32 *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); + + w -= 4; + dst += 4; + mask += 4; + } + } + + while (w--) + { + CARD16 tmp; + CARD16 a; + CARD32 m, d; + CARD32 r; + + a = *mask++; + d = *dst; + + m = FbInU (sa, 0, a, tmp); + r = FbAdd (m, d, 0, tmp); + + *dst++ = r; + } + } + + _mm_empty(); +} + +void fbCompositeSrcAdd_8000x8000mmx (CARD8 op, PicturePtr pSrc, PicturePtr pMask, @@ -2135,126 +2743,6 @@ fbCompositeSrcAdd_8888x8888mmx (CARD8 op, } Bool -fbSolidFillmmx (DrawablePtr pDraw, - int x, - int y, - int width, - int height, - FbBits xor) -{ - FbStride stride; - int bpp; - ullong fill; - __m64 vfill; - CARD32 byte_width; - CARD8 *byte_line; - FbBits *bits; - int xoff, yoff; - __m64 v1, v2, v3, v4, v5, v6, v7; - - CHECKPOINT(); - - fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); - - if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) - return FALSE; - - if (bpp != 16 && bpp != 32) - return FALSE; - - if (bpp == 16) - { - stride = stride * sizeof (FbBits) / 2; - byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y + yoff) + (x + xoff)); - byte_width = 2 * width; - stride *= 2; - } - else - { - stride = stride * sizeof (FbBits) / 4; - byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y + yoff) + (x + xoff)); - byte_width = 4 * width; - stride *= 4; - } - - fill = ((ullong)xor << 32) | xor; - vfill = (__m64)fill; - - __asm__ ( - "movq %7, %0\n" - "movq %7, %1\n" - "movq %7, %2\n" - "movq %7, %3\n" - "movq %7, %4\n" - "movq %7, %5\n" - "movq %7, %6\n" - : "=y" (v1), "=y" (v2), "=y" (v3), - "=y" (v4), "=y" (v5), "=y" (v6), "=y" (v7) - : "y" (vfill)); - - while (height--) - { - int w; - CARD8 *d = byte_line; - byte_line += stride; - w = byte_width; - - while (w >= 2 && ((unsigned long)d & 3)) - { - *(CARD16 *)d = xor; - w -= 2; - d += 2; - } - - while (w >= 4 && ((unsigned long)d & 7)) - { - *(CARD32 *)d = xor; - - w -= 4; - d += 4; - } - - while (w >= 64) - { - __asm__ ( - "movq %1, (%0)\n" - "movq %2, 8(%0)\n" - "movq %3, 16(%0)\n" - "movq %4, 24(%0)\n" - "movq %5, 32(%0)\n" - "movq %6, 40(%0)\n" - "movq %7, 48(%0)\n" - "movq %8, 56(%0)\n" - : - : "r" (d), - "y" (vfill), "y" (v1), "y" (v2), "y" (v3), - "y" (v4), "y" (v5), "y" (v6), "y" (v7) - : "memory"); - - w -= 64; - d += 64; - } - - while (w >= 4) - { - *(CARD32 *)d = xor; - - w -= 4; - d += 4; - } - if (w >= 2) - { - *(CARD16 *)d = xor; - w -= 2; - d += 2; - } - } - - _mm_empty(); - return TRUE; -} - -Bool fbCopyAreammx (DrawablePtr pSrc, DrawablePtr pDst, int src_x, |