diff options
author | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-03-31 20:13:24 -0400 |
---|---|---|
committer | Søren Sandmann Pedersen <ssp@redhat.com> | 2010-03-31 20:13:24 -0400 |
commit | cde17cb174cf01feffb5241b4449762b788593e9 (patch) | |
tree | 2750077f8c73f1f38f939336d5707e9e5e507abb | |
parent | efd41c62875d97c5127233cb6a4c353b4d495531 (diff) |
Beginning of sse2 macro (sse2-macro)
-rw-r--r-- | pixman/pixman-sse2.c | 212 |
1 files changed, 212 insertions, 0 deletions
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c index 946e7ba3..4577aaee 100644 --- a/pixman/pixman-sse2.c +++ b/pixman/pixman-sse2.c @@ -5596,6 +5596,218 @@ sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, _mm_empty (); } +#define MAKE_SSE2_FAST_PATH(name, op, src_format, mask_format, dest_format) \ + static void \ + sse2_composite_ ## name (pixman_implementation_t *imp, \ + pixman_op_t op, \ + pixman_image_t * src_image, \ + pixman_image_t * mask_image, \ + pixman_image_t * dst_image, \ + int32_t src_x, \ + int32_t src_y, \ + int32_t mask_x, \ + int32_t mask_y, \ + int32_t dest_x, \ + int32_t dest_y, \ + int32_t width, \ + int32_t height) \ + { \ + int src_stride, mask_stride, dst_stride; \ + uint32_t w; \ + int n_pixels; \ + int src_bpp, mask_bpp, dst_bpp; \ + \ + dst_bpp = PIXMAN_FORMAT_BPP (dest_format); \ + \ + if (src_format == PIXMAN_solid) \ + src_bpp = 0; \ + \ + if (mask_format == PIXMAN_solid || \ + mask_format == PIXMAN_null) \ + { \ + mask_bpp = 0; \ + } \ + \ + n_pixels = 128 / PIXMAN_FORMAT_BPP (dest_format); \ + while (height--) \ + { \ + w = width; \ + \ + while (w && (unsigned long)dst & 15) \ + { \ + w--; \ + dst += dst_bpp / 8; \ + if (src_bpp) \ + src += src_bpp / 8; \ + if (mask_bpp) \ + mask += mask_bpp / 8; \ + } \ + \ + while (w >= n_pixels) \ + { \ + \ + w -= n_pixels; \ + dst += n_pixels * dst_bpp / 8; \ + \ + if (src_bpp) \ + src += n_pixels * src_bpp / 8; \ + \ + if (mask_bpp) \ + mask += n_pixels * mask_bpp / 8; \ + } \ + \ + if (w) \ + goto one_pixel; \ + } \ + } + +static void +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, + pixman_op_t op, + pixman_image_t * src_image, + pixman_image_t * mask_image, + pixman_image_t * dst_image, + int32_t src_x, + int32_t src_y, + int32_t mask_x, + int32_t mask_y, + int32_t dest_x, + int32_t dest_y, + int32_t width, + int32_t height) +{ + uint32_t *src, *src_line, s; + uint32_t *dst, *dst_line, d; + uint8_t *mask, *mask_line; + uint32_t m; + int src_stride, 
mask_stride, dst_stride; + int32_t w; + __m64 ms; + + __m128i xmm_src, xmm_src_lo, xmm_src_hi; + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; + + PIXMAN_IMAGE_GET_LINE ( + dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); + PIXMAN_IMAGE_GET_LINE ( + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); + PIXMAN_IMAGE_GET_LINE ( + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); + + while (height--) + { + src = src_line; + src_line += src_stride; + dst = dst_line; + dst_line += dst_stride; + mask = mask_line; + mask_line += mask_stride; + + w = width; + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w && (unsigned long)dst & 15) + { + s = 0xff000000 | *src++; + m = (uint32_t) *mask++; + d = *dst; + ms = unpack_32_1x64 (s); + + if (m != 0xff) + { + __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + __m64 md = unpack_32_1x64 (d); + + ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md); + } + + *dst++ = pack_1x64_32 (ms); + w--; + } + + /* call prefetch hint to optimize cache load*/ + cache_prefetch ((__m128i*)src); + cache_prefetch ((__m128i*)dst); + cache_prefetch ((__m128i*)mask); + + while (w >= 4) + { + /* fill cache line with next memory */ + cache_prefetch_next ((__m128i*)src); + cache_prefetch_next ((__m128i*)dst); + cache_prefetch_next ((__m128i*)mask); + + m = *(uint32_t*) mask; + xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); + + if (m == 0xffffffff) + { + save_128_aligned ((__m128i*)dst, xmm_src); + } + else + { + xmm_dst = load_128_aligned ((__m128i*)dst); + + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); + + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); + + expand_alpha_rev_2x128 
(xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); + + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); + + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); + } + + src += 4; + dst += 4; + mask += 4; + w -= 4; + } + + while (w) + { + m = (uint32_t) *mask++; + + if (m) + { + s = 0xff000000 | *src; + + if (m == 0xff) + { + *dst = s; + } + else + { + __m64 ma, md, ms; + + d = *dst; + + ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); + md = unpack_32_1x64 (d); + ms = unpack_32_1x64 (s); + + *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md)); + } + + } + + src++; + dst++; + w--; + } + } + + _mm_empty (); +} + static void sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, pixman_op_t op, |