summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Søren Sandmann Pedersen <ssp@redhat.com>, 2010-03-31 20:13:24 -0400
committer: Søren Sandmann Pedersen <ssp@redhat.com>, 2010-03-31 20:13:24 -0400
commit: cde17cb174cf01feffb5241b4449762b788593e9 (patch)
tree: 2750077f8c73f1f38f939336d5707e9e5e507abb
parent: efd41c62875d97c5127233cb6a4c353b4d495531 (diff)
Beginning of sse2 macro (ref: sse2-macro)
-rw-r--r-- pixman/pixman-sse2.c | 212
1 file changed, 212 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index 946e7ba3..4577aaee 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5596,6 +5596,218 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
_mm_empty ();
}
+/*
+ * MAKE_SSE2_FAST_PATH (name, op_name, src_format, mask_format, dest_format)
+ *
+ * Defines the static function sse2_composite_<name> with the same signature
+ * as the hand-written composite functions in this file.  This is still a
+ * skeleton: the generated function walks the destination (and the source and
+ * mask, when they carry per-pixel data) through three phases per row --
+ * single pixels until dst is 16-byte aligned, blocks of 128 / dst_bpp
+ * pixels, then the left-over pixels -- but performs no pixel arithmetic yet.
+ *
+ * op_name is accepted for future use and is deliberately not called "op":
+ * a macro parameter named "op" would also replace the generated function's
+ * own "op" parameter during expansion.
+ *
+ * src_format == PIXMAN_solid and mask_format == PIXMAN_solid / PIXMAN_null
+ * are treated as having no per-pixel data (bpp 0): no line is fetched and no
+ * pointer is advanced for them.
+ *
+ * NOTE(review): pointers advance by bpp / 8 bytes, so this assumes every
+ * format with per-pixel data is a whole number of bytes wide -- confirm
+ * before instantiating with sub-byte formats.
+ */
+#define MAKE_SSE2_FAST_PATH(name, op_name, src_format, mask_format, dest_format) \
+    static void \
+    sse2_composite_ ## name (pixman_implementation_t *imp, \
+                             pixman_op_t              op, \
+                             pixman_image_t *         src_image, \
+                             pixman_image_t *         mask_image, \
+                             pixman_image_t *         dst_image, \
+                             int32_t                  src_x, \
+                             int32_t                  src_y, \
+                             int32_t                  mask_x, \
+                             int32_t                  mask_y, \
+                             int32_t                  dest_x, \
+                             int32_t                  dest_y, \
+                             int32_t                  width, \
+                             int32_t                  height) \
+    { \
+        uint8_t *src_line = NULL, *src = NULL; \
+        uint8_t *mask_line = NULL, *mask = NULL; \
+        uint8_t *dst_line, *dst; \
+        int src_stride = 0, mask_stride = 0, dst_stride; \
+        int src_bpp, mask_bpp, dst_bpp; \
+        int32_t w, n_pixels; \
+        \
+        dst_bpp = PIXMAN_FORMAT_BPP (dest_format); \
+        \
+        /* Every *_bpp is assigned on all paths; 0 means "no pixel data". */ \
+        if (src_format == PIXMAN_solid) \
+        { \
+            src_bpp = 0; \
+        } \
+        else \
+        { \
+            src_bpp = PIXMAN_FORMAT_BPP (src_format); \
+        } \
+        \
+        if (mask_format == PIXMAN_solid || \
+            mask_format == PIXMAN_null) \
+        { \
+            mask_bpp = 0; \
+        } \
+        else \
+        { \
+            mask_bpp = PIXMAN_FORMAT_BPP (mask_format); \
+        } \
+        \
+        /* Number of destination pixels in one 128-bit register. */ \
+        n_pixels = 128 / dst_bpp; \
+        \
+        /* Lines are addressed in bytes; x is scaled by bytes per pixel. */ \
+        PIXMAN_IMAGE_GET_LINE ( \
+            dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, \
+            dst_bpp / 8); \
+        \
+        if (src_bpp) \
+        { \
+            PIXMAN_IMAGE_GET_LINE ( \
+                src_image, src_x, src_y, uint8_t, src_stride, src_line, \
+                src_bpp / 8); \
+        } \
+        \
+        if (mask_bpp) \
+        { \
+            PIXMAN_IMAGE_GET_LINE ( \
+                mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, \
+                mask_bpp / 8); \
+        } \
+        \
+        while (height--) \
+        { \
+            dst = dst_line; \
+            dst_line += dst_stride; \
+            \
+            if (src_bpp) \
+            { \
+                src = src_line; \
+                src_line += src_stride; \
+            } \
+            \
+            if (mask_bpp) \
+            { \
+                mask = mask_line; \
+                mask_line += mask_stride; \
+            } \
+            \
+            w = width; \
+            \
+            /* Head: single pixels until dst is 16-byte aligned. */ \
+            while (w && (unsigned long)dst & 15) \
+            { \
+                w--; \
+                dst += dst_bpp / 8; \
+                if (src_bpp) \
+                { \
+                    src += src_bpp / 8; \
+                } \
+                if (mask_bpp) \
+                { \
+                    mask += mask_bpp / 8; \
+                } \
+            } \
+            \
+            /* Body: one 128-bit destination block per iteration. */ \
+            while (w >= n_pixels) \
+            { \
+                w -= n_pixels; \
+                dst += n_pixels * dst_bpp / 8; \
+                \
+                if (src_bpp) \
+                { \
+                    src += n_pixels * src_bpp / 8; \
+                } \
+                \
+                if (mask_bpp) \
+                { \
+                    mask += n_pixels * mask_bpp / 8; \
+                } \
+            } \
+            \
+            /* Tail: left-over pixels, one at a time (this replaces the \
+             * former jump to an undefined "one_pixel" label). */ \
+            while (w) \
+            { \
+                w--; \
+                dst += dst_bpp / 8; \
+                if (src_bpp) \
+                { \
+                    src += src_bpp / 8; \
+                } \
+                if (mask_bpp) \
+                { \
+                    mask += mask_bpp / 8; \
+                } \
+            } \
+        } \
+    }
+/*
+ * Fast path for a 32-bit source whose top byte is ignored (it is forced to
+ * 0xff below), an 8-bit mask and a 32-bit destination.  imp and op are not
+ * referenced in the body.
+ *
+ * Each row is handled in three phases:
+ *   1. one pixel at a time until dst is 16-byte aligned (or the row ends);
+ *   2. four pixels at a time, with unaligned source loads and aligned
+ *      destination loads/stores;
+ *   3. one pixel at a time for the remaining pixels (fewer than four).
+ *
+ * The blend itself is delegated to in_over_1x64 / in_over_2x128, which are
+ * defined outside this hunk (presumably "(src IN mask) OVER dest" -- confirm
+ * against their definitions).
+ *
+ * NOTE(review): the hunk header shows this code being inserted right after a
+ * function that already has this name; if that definition is still present
+ * this is a redefinition -- confirm whether a rename was intended.
+ */
+static void
+sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
+ pixman_op_t op,
+ pixman_image_t * src_image,
+ pixman_image_t * mask_image,
+ pixman_image_t * dst_image,
+ int32_t src_x,
+ int32_t src_y,
+ int32_t mask_x,
+ int32_t mask_y,
+ int32_t dest_x,
+ int32_t dest_y,
+ int32_t width,
+ int32_t height)
+{
+ uint32_t *src, *src_line, s;
+ uint32_t *dst, *dst_line, d;
+ uint8_t *mask, *mask_line;
+ uint32_t m;
+ int src_stride, mask_stride, dst_stride;
+ int32_t w;
+ __m64 ms;
+
+ __m128i xmm_src, xmm_src_lo, xmm_src_hi;
+ __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
+ __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
+
+ PIXMAN_IMAGE_GET_LINE (
+ dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+ PIXMAN_IMAGE_GET_LINE (
+ src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+ while (height--) /* one iteration per row */
+ {
+ src = src_line;
+ src_line += src_stride;
+ dst = dst_line;
+ dst_line += dst_stride;
+ mask = mask_line;
+ mask_line += mask_stride;
+
+ w = width;
+
+ /* Prefetch hints for the start of this row. */
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
+ while (w && (unsigned long)dst & 15) /* phase 1: until dst is 16-byte aligned */
+ {
+ s = 0xff000000 | *src++; /* force the top byte of the source pixel to 0xff */
+ m = (uint32_t) *mask++;
+ d = *dst;
+ ms = unpack_32_1x64 (s);
+
+ if (m != 0xff) /* a mask byte of 0xff stores the source pixel unchanged */
+ {
+ __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ __m64 md = unpack_32_1x64 (d);
+
+ ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md);
+ }
+
+ *dst++ = pack_1x64_32 (ms);
+ w--;
+ }
+
+ /* Prefetch hints again, now that the pointers have moved past the head. */
+ cache_prefetch ((__m128i*)src);
+ cache_prefetch ((__m128i*)dst);
+ cache_prefetch ((__m128i*)mask);
+
+ while (w >= 4) /* phase 2: four pixels per iteration, dst is 16-byte aligned */
+ {
+ /* Prefetch hints for the data following the current position. */
+ cache_prefetch_next ((__m128i*)src);
+ cache_prefetch_next ((__m128i*)dst);
+ cache_prefetch_next ((__m128i*)mask);
+
+ m = *(uint32_t*) mask; /* four mask bytes read as one word; NOTE(review): nothing here establishes that mask is 4-byte aligned */
+ xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000);
+
+ if (m == 0xffffffff) /* all four mask bytes are 0xff: store the source as is */
+ {
+ save_128_aligned ((__m128i*)dst, xmm_src);
+ }
+ else
+ {
+ xmm_dst = load_128_aligned ((__m128i*)dst);
+
+ xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
+
+ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
+ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
+ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
+
+ expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
+
+ in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); /* the packed result is taken from xmm_dst_lo/hi below */
+
+ save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
+ }
+
+ src += 4;
+ dst += 4;
+ mask += 4;
+ w -= 4;
+ }
+
+ while (w) /* phase 3: left-over pixels, one at a time */
+ {
+ m = (uint32_t) *mask++;
+
+ if (m) /* a mask byte of 0 leaves the destination pixel untouched */
+ {
+ s = 0xff000000 | *src;
+
+ if (m == 0xff)
+ {
+ *dst = s;
+ }
+ else
+ {
+ __m64 ma, md, ms; /* this ms shadows the function-level ms */
+
+ d = *dst;
+
+ ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m));
+ md = unpack_32_1x64 (d);
+ ms = unpack_32_1x64 (s);
+
+ *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md));
+ }
+
+ }
+
+ src++;
+ dst++;
+ w--;
+ }
+ }
+
+ _mm_empty (); /* EMMS: reset MMX state after the __m64 operations above */
+}
+
static void
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
pixman_op_t op,