/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <mmintrin.h>
#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"

#if defined(_MSC_VER) && defined(_M_AMD64)
/* Windows 64 doesn't allow MMX to be used, so
 * the pixman-x64-mmx-emulation.h file contains
 * implementations of those MMX intrinsics that
 * are used in the SSE2 implementation.
 */
#   include "pixman-x64-mmx-emulation.h"
#endif

#ifdef USE_SSE2

/* --------------------------------------------------------------------
 * Locals
 */

static __m64 mask_x0080;
static __m64 mask_x00ff;
static __m64 mask_x0101;
static __m64 mask_x_alpha;

static __m64 mask_x565_rgb;
static __m64 mask_x565_unpack;

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

/* ----------------------------------------------------------------------
 * SSE2 Inlines
 */
static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

/* Expand the 565 pixel held in each 32-bit lane into 8888: shift each
 * field into place, then replicate its top bits into the low bits.
 */
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t  = _mm_and_si128 (g, mask_565_fix_g);
    t  = _mm_srli_epi32 (t, 6);
    g  = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data,
_mm_setzero_si128 ()); hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); lo = unpack_565_to_8888 (lo); hi = unpack_565_to_8888 (hi); unpack_128_2x128 (lo, data0, data1); unpack_128_2x128 (hi, data2, data3); } static force_inline uint16_t pack_565_32_16 (uint32_t pixel) { return (uint16_t) (((pixel >> 8) & 0xf800) | ((pixel >> 5) & 0x07e0) | ((pixel >> 3) & 0x001f)); } static force_inline __m128i pack_2x128_128 (__m128i lo, __m128i hi) { return _mm_packus_epi16 (lo, hi); } static force_inline __m128i pack_565_2x128_128 (__m128i lo, __m128i hi) { __m128i data; __m128i r, g1, g2, b; data = pack_2x128_128 (lo, hi); r = _mm_and_si128 (data, mask_565_r); g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); } static force_inline __m128i pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) { return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), pack_565_2x128_128 (*xmm2, *xmm3)); } static force_inline int is_opaque (__m128i x) { __m128i ffs = _mm_cmpeq_epi8 (x, x); return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; } static force_inline int is_zero (__m128i x) { return _mm_movemask_epi8 ( _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; } static force_inline int is_transparent (__m128i x) { return (_mm_movemask_epi8 ( _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; } static force_inline __m128i expand_pixel_32_1x128 (uint32_t data) { return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); } static force_inline __m128i expand_alpha_1x128 (__m128i data) { return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 3, 3, 3)), _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline void expand_alpha_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi) { __m128i lo, hi; lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline void expand_alpha_rev_2x128 (__m128i data_lo, __m128i data_hi, __m128i* alpha_lo, __m128i* alpha_hi) { __m128i lo, hi; lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline void pix_multiply_2x128 (__m128i* data_lo, __m128i* data_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* ret_lo, __m128i* ret_hi) { __m128i lo, hi; lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); lo = _mm_adds_epu16 (lo, mask_0080); hi = _mm_adds_epu16 (hi, mask_0080); *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); } static force_inline void pix_add_multiply_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_dst_lo, __m128i* alpha_dst_hi, __m128i* dst_lo, __m128i* dst_hi, __m128i* alpha_src_lo, __m128i* alpha_src_hi, __m128i* ret_lo, __m128i* ret_hi) { __m128i t1_lo, t1_hi; __m128i t2_lo, t2_hi; pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); *ret_lo = _mm_adds_epu8 
(t1_lo, t2_lo); *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); } static force_inline void negate_2x128 (__m128i data_lo, __m128i data_hi, __m128i* neg_lo, __m128i* neg_hi) { *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); } static force_inline void invert_colors_2x128 (__m128i data_lo, __m128i data_hi, __m128i* inv_lo, __m128i* inv_hi) { __m128i lo, hi; lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline void over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* dst_lo, __m128i* dst_hi) { __m128i t1, t2; negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); } static force_inline void over_rev_non_pre_2x128 (__m128i src_lo, __m128i src_hi, __m128i* dst_lo, __m128i* dst_hi) { __m128i lo, hi; __m128i alpha_lo, alpha_hi; expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); lo = _mm_or_si128 (alpha_lo, mask_alpha); hi = _mm_or_si128 (alpha_hi, mask_alpha); invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); } static force_inline void in_over_2x128 (__m128i* src_lo, __m128i* src_hi, __m128i* alpha_lo, __m128i* alpha_hi, __m128i* mask_lo, __m128i* mask_hi, __m128i* dst_lo, __m128i* dst_hi) { __m128i s_lo, s_hi; __m128i a_lo, a_hi; pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); } static force_inline void cache_prefetch (__m128i* addr) { _mm_prefetch ((void const*)addr, _MM_HINT_T0); } static force_inline void cache_prefetch_next (__m128i* addr) { _mm_prefetch ((void const *)(addr + 4), _MM_HINT_T0); /* 64 bytes ahead */ } /* prefetching NULL is very slow on some systems. don't do that. 
*/ static force_inline void maybe_prefetch (__m128i* addr) { if (addr) cache_prefetch (addr); } static force_inline void maybe_prefetch_next (__m128i* addr) { if (addr) cache_prefetch_next (addr); } /* load 4 pixels from a 16-byte boundary aligned address */ static force_inline __m128i load_128_aligned (__m128i* src) { return _mm_load_si128 (src); } /* load 4 pixels from a unaligned address */ static force_inline __m128i load_128_unaligned (const __m128i* src) { return _mm_loadu_si128 (src); } /* save 4 pixels using Write Combining memory on a 16-byte * boundary aligned address */ static force_inline void save_128_write_combining (__m128i* dst, __m128i data) { _mm_stream_si128 (dst, data); } /* save 4 pixels on a 16-byte boundary aligned address */ static force_inline void save_128_aligned (__m128i* dst, __m128i data) { _mm_store_si128 (dst, data); } /* save 4 pixels on a unaligned address */ static force_inline void save_128_unaligned (__m128i* dst, __m128i data) { _mm_storeu_si128 (dst, data); } /* ------------------------------------------------------------------ * MMX inlines */ static force_inline __m64 unpack_32_1x64 (uint32_t data) { return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (data), _mm_setzero_si64 ()); } static force_inline __m64 expand_alpha_1x64 (__m64 data) { return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 3, 3, 3)); } static force_inline __m64 expand_alpha_rev_1x64 (__m64 data) { return _mm_shuffle_pi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 expand_pixel_8_1x64 (uint8_t data) { return _mm_shuffle_pi16 ( unpack_32_1x64 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); } static force_inline __m64 pix_multiply_1x64 (__m64 data, __m64 alpha) { return _mm_mulhi_pu16 (_mm_adds_pu16 (_mm_mullo_pi16 (data, alpha), mask_x0080), mask_x0101); } static force_inline __m64 pix_add_multiply_1x64 (__m64* src, __m64* alpha_dst, __m64* dst, __m64* alpha_src) { __m64 t1 = pix_multiply_1x64 (*src, *alpha_dst); __m64 t2 = pix_multiply_1x64 (*dst, *alpha_src); return _mm_adds_pu8 (t1, t2); } static force_inline __m64 negate_1x64 (__m64 data) { return _mm_xor_si64 (data, mask_x00ff); } static force_inline __m64 invert_colors_1x64 (__m64 data) { return _mm_shuffle_pi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); } static force_inline __m64 over_1x64 (__m64 src, __m64 alpha, __m64 dst) { return _mm_adds_pu8 (src, pix_multiply_1x64 (dst, negate_1x64 (alpha))); } static force_inline __m64 in_over_1x64 (__m64* src, __m64* alpha, __m64* mask, __m64* dst) { return over_1x64 (pix_multiply_1x64 (*src, *mask), pix_multiply_1x64 (*alpha, *mask), *dst); } static force_inline __m64 over_rev_non_pre_1x64 (__m64 src, __m64 dst) { __m64 alpha = expand_alpha_1x64 (src); return over_1x64 (pix_multiply_1x64 (invert_colors_1x64 (src), _mm_or_si64 (alpha, mask_x_alpha)), alpha, dst); } static force_inline uint32_t pack_1x64_32 (__m64 data) { return _mm_cvtsi64_si32 (_mm_packs_pu16 (data, _mm_setzero_si64 ())); } /* Expand 16 bits positioned at @pos (0-3) of a mmx register into * * 00RR00GG00BB * * --- Expanding 565 in the low word --- * * m = (m << (32 - 3)) | (m << (16 - 5)) | m; * m = m & (01f0003f001f); * m = m * (008404100840); * m = m >> 8; * * Note the trick here - the top word is shifted by another nibble to * avoid it bumping into the middle word */ static force_inline __m64 expand565_16_1x64 (uint16_t pixel) { __m64 p; __m64 t1, t2; p = _mm_cvtsi32_si64 ((uint32_t) pixel); t1 = _mm_slli_si64 (p, 36 - 11); t2 = _mm_slli_si64 (p, 16 - 5); p = _mm_or_si64 (t1, p); p = _mm_or_si64 (t2, p); p = _mm_and_si64 
(p, mask_x565_rgb); p = _mm_mullo_pi16 (p, mask_x565_unpack); return _mm_srli_pi16 (p, 8); } /* ---------------------------------------------------------------------------- * Compose Core transformations */ static force_inline uint32_t core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) { uint8_t a; __m64 ms; a = src >> 24; if (a == 0xff) { return src; } else if (src) { ms = unpack_32_1x64 (src); return pack_1x64_32 ( over_1x64 (ms, expand_alpha_1x64 (ms), unpack_32_1x64 (dst))); } return dst; } static force_inline uint32_t combine1 (const uint32_t *ps, const uint32_t *pm) { uint32_t s = *ps; if (pm) { __m64 ms, mm; mm = unpack_32_1x64 (*pm); mm = expand_alpha_1x64 (mm); ms = unpack_32_1x64 (s); ms = pix_multiply_1x64 (ms, mm); s = pack_1x64_32 (ms); } return s; } static force_inline __m128i combine4 (const __m128i *ps, const __m128i *pm) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_msk_lo, xmm_msk_hi; __m128i s; if (pm) { xmm_msk_lo = load_128_unaligned (pm); if (is_transparent (xmm_msk_lo)) return _mm_setzero_si128 (); } s = load_128_unaligned (ps); if (pm) { unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_msk_lo, &xmm_msk_hi, &xmm_src_lo, &xmm_src_hi); s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); } return s; } static force_inline void core_combine_over_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { d = *pd; s = combine1 (ps, pm); *pd++ = core_combine_over_u_pixel_sse2 (s, d); ps++; if (pm) pm++; w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); /* I'm loading unaligned because I'm not sure about * the address alignment. 
*/ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); if (is_opaque (xmm_src_hi)) { save_128_aligned ((__m128i*)pd, xmm_src_hi); } else if (!is_zero (xmm_src_hi)) { xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 ( xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi); /* rebuid the 4 pixel data and save*/ save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } w -= 4; ps += 4; pd += 4; if (pm) pm += 4; } while (w) { d = *pd; s = combine1 (ps, pm); *pd++ = core_combine_over_u_pixel_sse2 (s, d); ps++; if (pm) pm++; w--; } } static force_inline void core_combine_over_reverse_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)pd & 15)) { d = *pd; s = combine1 (ps, pm); *pd++ = core_combine_over_u_pixel_sse2 (d, s); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); /* I'm loading unaligned because I'm not sure * about the address alignment. */ xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi); over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_src_lo, &xmm_src_hi); /* rebuid the 4 pixel data and save*/ save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_src_lo, xmm_src_hi)); w -= 4; ps += 4; pd += 4; if (pm) pm += 4; } while (w) { d = *pd; s = combine1 (ps, pm); *pd++ = core_combine_over_u_pixel_sse2 (d, s); ps++; w--; if (pm) pm++; } } static force_inline uint32_t core_combine_in_u_pixelsse2 (uint32_t src, uint32_t dst) { uint32_t maska = src >> 24; if (maska == 0) { return 0; } else if (maska != 0xff) { return pack_1x64_32 ( pix_multiply_1x64 (unpack_32_1x64 (dst), expand_alpha_1x64 (unpack_32_1x64 (src)))); } return dst; } static force_inline void core_combine_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixelsse2 (d, s); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) 
pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ((__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixelsse2 (d, s); w--; ps++; if (pm) pm++; } } static force_inline void core_combine_reverse_in_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixelsse2 (s, d); ps++; w--; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_in_u_pixelsse2 (s, d); w--; ps++; if (pm) pm++; } } static force_inline void core_combine_reverse_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), negate_1x64 ( expand_alpha_1x64 (unpack_32_1x64 (s))))); if (pm) pm++; ps++; w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; if (pm) pm += 4; w -= 4; } while (w) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), negate_1x64 ( expand_alpha_1x64 (unpack_32_1x64 (s))))); 
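        /* Per-pixel OUT_REVERSE: each 8-bit channel of the destination is
         * scaled by the inverted source alpha, i.e. d = d * (255 - Sa) / 255.
         * The division by 255 inside pix_multiply_1x64 is the usual rounded
         * approximation: t = d * (255 - Sa) + 0x80, result = (t * 0x0101) >> 16,
         * which is what the adds/mulhi pair computes (assuming mask_x0080 and
         * mask_x0101 hold 0x0080 and 0x0101 in each 16-bit lane, as the names
         * suggest).
         */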
ps++; if (pm) pm++; w--; } } static force_inline void core_combine_out_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (s), negate_1x64 ( expand_alpha_1x64 (unpack_32_1x64 (d))))); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { uint32_t s = combine1 (ps, pm); uint32_t d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (s), negate_1x64 ( expand_alpha_1x64 (unpack_32_1x64 (d))))); w--; ps++; if (pm) pm++; } } static force_inline uint32_t core_combine_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 sa = negate_1x64 (expand_alpha_1x64 (s)); __m64 da = expand_alpha_1x64 (d); return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); } static force_inline void core_combine_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_atop_u_pixel_sse2 (s, d); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, 
xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_atop_u_pixel_sse2 (s, d); w--; ps++; if (pm) pm++; } } static force_inline uint32_t core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 sa = expand_alpha_1x64 (s); __m64 da = negate_1x64 (expand_alpha_1x64 (d)); return pack_1x64_32 (pix_add_multiply_1x64 (&s, &da, &d, &sa)); } static force_inline void core_combine_reverse_atop_u_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t* pm, int w) { uint32_t s, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); ps++; w--; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); ps++; w--; if (pm) pm++; } } static force_inline uint32_t core_combine_xor_u_pixel_sse2 (uint32_t src, uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 neg_d = negate_1x64 (expand_alpha_1x64 (d)); __m64 neg_s = negate_1x64 (expand_alpha_1x64 (s)); return pack_1x64_32 (pix_add_multiply_1x64 (&s, &neg_d, &d, &neg_s)); } static force_inline void core_combine_xor_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t *mask, int width) { int w = width; uint32_t s, d; uint32_t* pd = dst; const uint32_t* ps = src; const uint32_t* pm = mask; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && ((unsigned long) pd & 15)) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_xor_u_pixel_sse2 (s, d); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); 
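        /* The 4-pixel XOR step below follows the Porter-Duff definition
         * d = s * (255 - Da) / 255 + d * (255 - Sa) / 255: both alphas are
         * expanded and negated, then pix_add_multiply_2x128 forms the two
         * products and combines them with a saturating byte add.
         */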
maybe_prefetch_next ((__m128i*)pm); xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); xmm_dst = load_128_aligned ((__m128i*) pd); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_add_multiply_2x128 ( &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; w -= 4; if (pm) pm += 4; } while (w) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_xor_u_pixel_sse2 (s, d); w--; ps++; if (pm) pm++; } } static force_inline void core_combine_add_u_sse2 (uint32_t* dst, const uint32_t* src, const uint32_t* mask, int width) { int w = width; uint32_t s, d; uint32_t* pd = dst; const uint32_t* ps = src; const uint32_t* pm = mask; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); d = *pd; ps++; if (pm) pm++; *pd++ = _mm_cvtsi64_si32 ( _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { __m128i s; /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); s = combine4 ((__m128i*)ps, (__m128i*)pm); save_128_aligned ( (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); pd += 4; ps += 4; if (pm) pm += 4; w -= 4; } while (w--) { s = combine1 (ps, pm); d = *pd; ps++; *pd++ = _mm_cvtsi64_si32 ( _mm_adds_pu8 (_mm_cvtsi32_si64 (s), _mm_cvtsi32_si64 (d))); if (pm) pm++; } } static force_inline uint32_t core_combine_saturate_u_pixel_sse2 (uint32_t src, uint32_t dst) { __m64 ms = unpack_32_1x64 (src); __m64 md = unpack_32_1x64 (dst); uint32_t sa = src >> 24; uint32_t da = ~dst >> 24; if (sa > da) { ms = pix_multiply_1x64 ( ms, expand_alpha_1x64 (unpack_32_1x64 (DIV_UN8 (da, sa) << 24))); } return pack_1x64_32 (_mm_adds_pu16 (md, ms)); } static force_inline void core_combine_saturate_u_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, d; uint32_t pack_cmp; __m128i xmm_src, xmm_dst; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); w--; ps++; if (pm) pm++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); maybe_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); maybe_prefetch_next ((__m128i*)pm); xmm_dst = load_128_aligned ((__m128i*)pd); xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); pack_cmp = _mm_movemask_epi8 ( _mm_cmpgt_epi32 ( _mm_srli_epi32 (xmm_src, 24), _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, 
mask_ff000000), 24))); /* if some alpha src is grater than respective ~alpha dst */ if (pack_cmp) { s = combine1 (ps++, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; s = combine1 (ps++, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; s = combine1 (ps++, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; s = combine1 (ps++, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); if (pm) pm++; } else { save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); pd += 4; ps += 4; if (pm) pm += 4; } w -= 4; } while (w--) { s = combine1 (ps, pm); d = *pd; *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); ps++; if (pm) pm++; } } static force_inline void core_combine_src_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) { uint32_t s, m; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst_lo, xmm_dst_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; *pd++ = pack_1x64_32 ( pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; *pd++ = pack_1x64_32 ( pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m))); w--; } } static force_inline uint32_t core_combine_over_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { __m64 s = unpack_32_1x64 (src); __m64 expAlpha = expand_alpha_1x64 (s); __m64 unpk_mask = unpack_32_1x64 (mask); __m64 unpk_dst = unpack_32_1x64 (dst); return pack_1x64_32 (in_over_1x64 (&s, &expAlpha, &unpk_mask, &unpk_dst)); } static force_inline void core_combine_over_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 
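        /* Component-alpha OVER: with a per-channel mask m, each channel becomes
         * d = s * m / 255 + d * (255 - Sa * m / 255) / 255, i.e. "(src IN mask)
         * OVER dest".  in_over_2x128 below computes that from the unpacked
         * source, its expanded alpha, the mask and the destination.
         */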
unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); w--; } } static force_inline uint32_t core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { __m64 d = unpack_32_1x64 (dst); return pack_1x64_32 ( over_1x64 (d, expand_alpha_1x64 (d), pix_multiply_1x64 (unpack_32_1x64 (src), unpack_32_1x64 (mask)))); } static force_inline void core_combine_over_reverse_ca_sse2 (uint32_t* pd, const uint32_t* ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); over_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); w--; } } static force_inline void core_combine_in_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), expand_alpha_1x64 (unpack_32_1x64 (d)))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 
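        /* Component-alpha IN: each channel is d = (s * m / 255) * Da / 255;
         * the two pix_multiply_2x128 calls below first form s * m and then
         * scale the result by the expanded destination alpha.
         */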
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (m)), expand_alpha_1x64 (unpack_32_1x64 (d)))); w--; } } static force_inline void core_combine_in_reverse_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), pix_multiply_1x64 (unpack_32_1x64 (m), expand_alpha_1x64 (unpack_32_1x64 (s))))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), pix_multiply_1x64 (unpack_32_1x64 (m), expand_alpha_1x64 (unpack_32_1x64 (s))))); w--; } } static force_inline void core_combine_out_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (m)), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); 
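        /* Component-alpha OUT: each channel is
         * d = (s * m / 255) * (255 - Da) / 255; the destination alpha is
         * expanded and negated before the final multiply.
         */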
xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi); negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (m)), negate_1x64 (expand_alpha_1x64 (unpack_32_1x64 (d))))); w--; } } static force_inline void core_combine_out_reverse_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), negate_1x64 (pix_multiply_1x64 ( unpack_32_1x64 (m), expand_alpha_1x64 (unpack_32_1x64 (s)))))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask_lo, &xmm_mask_hi); negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (d), negate_1x64 (pix_multiply_1x64 ( unpack_32_1x64 (m), expand_alpha_1x64 (unpack_32_1x64 (s)))))); w--; } } static force_inline uint32_t core_combine_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 sa = expand_alpha_1x64 (s); __m64 da = expand_alpha_1x64 (d); s = pix_multiply_1x64 (s, m); m = negate_1x64 (pix_multiply_1x64 (m, sa)); return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } static force_inline void core_combine_atop_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 
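    /* Component-alpha ATOP: per channel,
     * d = (s * m / 255) * Da / 255 + d * (255 - Sa * m / 255) / 255,
     * matching core_combine_atop_ca_pixel_sse2 above; the vector loop below
     * mirrors that with pix_add_multiply_2x128.
     */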
__m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi); negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_add_multiply_2x128 ( &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); w--; } } static force_inline uint32_t core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { __m64 m = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 da = negate_1x64 (expand_alpha_1x64 (d)); __m64 sa = expand_alpha_1x64 (s); s = pix_multiply_1x64 (s, m); m = pix_multiply_1x64 (m, sa); return pack_1x64_32 (pix_add_multiply_1x64 (&d, &m, &s, &da)); } static force_inline void core_combine_reverse_atop_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 
(xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi); negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_add_multiply_2x128 ( &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); w--; } } static force_inline uint32_t core_combine_xor_ca_pixel_sse2 (uint32_t src, uint32_t mask, uint32_t dst) { __m64 a = unpack_32_1x64 (mask); __m64 s = unpack_32_1x64 (src); __m64 d = unpack_32_1x64 (dst); __m64 alpha_dst = negate_1x64 (pix_multiply_1x64 ( a, expand_alpha_1x64 (s))); __m64 dest = pix_multiply_1x64 (s, a); __m64 alpha_src = negate_1x64 (expand_alpha_1x64 (d)); return pack_1x64_32 (pix_add_multiply_1x64 (&d, &alpha_dst, &dest, &alpha_src)); } static force_inline void core_combine_xor_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; __m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi); expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, &xmm_mask_lo, &xmm_mask_hi); negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_add_multiply_2x128 ( &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); w--; } } static force_inline void core_combine_add_ca_sse2 (uint32_t * pd, const uint32_t *ps, const uint32_t *pm, int w) { uint32_t s, m, d; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; 
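    /* Component-alpha ADD: each channel is d = clamp (s * m / 255 + d, 0, 255);
     * the source is multiplied by the mask and then added to the destination
     * with saturating unsigned byte adds (_mm_adds_epu8 / _mm_adds_pu8).
     */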
__m128i xmm_mask_lo, xmm_mask_hi; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)ps); cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)ps); cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_src_hi = load_128_unaligned ((__m128i*)ps); xmm_mask_hi = load_128_unaligned ((__m128i*)pm); xmm_dst_hi = load_128_aligned ((__m128i*)pd); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_mask_lo, &xmm_mask_hi, &xmm_src_lo, &xmm_src_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 ( _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); ps += 4; pd += 4; pm += 4; w -= 4; } while (w) { s = *ps++; m = *pm++; d = *pd; *pd++ = pack_1x64_32 ( _mm_adds_pu8 (pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } } /* --------------------------------------------------- * fb_compose_setup_sSE2 */ static force_inline __m64 create_mask_16_64 (uint16_t mask) { return _mm_set1_pi16 (mask); } static force_inline __m128i create_mask_16_128 (uint16_t mask) { return _mm_set1_epi16 (mask); } static force_inline __m64 create_mask_2x32_64 (uint32_t mask0, uint32_t mask1) { return _mm_set_pi32 (mask0, mask1); } /* Work around a code generation bug in Sun Studio 12. 
*/ #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) # define create_mask_2x32_128(mask0, mask1) \ (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) #else static force_inline __m128i create_mask_2x32_128 (uint32_t mask0, uint32_t mask1) { return _mm_set_epi32 (mask0, mask1, mask0, mask1); } #endif /* SSE2 code patch for fbcompose.c */ static void sse2_combine_over_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_over_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_over_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_over_reverse_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_in_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_in_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_in_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_reverse_in_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_out_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_out_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_out_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_reverse_out_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_atop_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_atop_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_atop_reverse_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_reverse_atop_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_xor_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_xor_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_add_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_add_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_saturate_u (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_saturate_u_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_src_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_src_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_over_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_over_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_over_reverse_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_over_reverse_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_in_ca (pixman_implementation_t *imp, pixman_op_t 
op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_in_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_in_reverse_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_in_reverse_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_out_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_out_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_out_reverse_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_out_reverse_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_atop_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_atop_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_reverse_atop_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_xor_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_xor_ca_sse2 (dst, src, mask, width); _mm_empty (); } static void sse2_combine_add_ca (pixman_implementation_t *imp, pixman_op_t op, uint32_t * dst, const uint32_t * src, const uint32_t * mask, int width) { core_combine_add_ca_sse2 (dst, src, mask, width); _mm_empty (); } /* ------------------------------------------------------------------- * composite_over_n_8888 */ static void sse2_composite_over_n_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src; uint32_t *dst_line, *dst, d; uint16_t w; int dst_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); while (height--) { dst = dst_line; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); dst_line += dst_stride; w = width; while (w && (unsigned long)dst & 15) { d = *dst; *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (d))); w--; } cache_prefetch ((__m128i*)dst); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)dst); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst_lo, &xmm_dst_hi); /* rebuid the 4 pixel data and save*/ save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); w -= 4; dst += 4; } while (w) { d = *dst; *dst++ = pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (d))); w--; } } _mm_empty (); } /* --------------------------------------------------------------------- * composite_over_n_0565 */ static void 
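/* OVER of a solid source onto an r5g6b5 destination.  Pixels are handled
 * one at a time with the 64-bit helpers until dst reaches a 16-byte
 * boundary, then eight 565 pixels per iteration are unpacked to 8888,
 * blended with over_2x128 and repacked, with a scalar tail for the rest. */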
sse2_composite_over_n_0565 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src; uint16_t *dst_line, *dst, d; uint16_t w; int dst_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); while (height--) { dst = dst_line; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); dst_line += dst_stride; w = width; while (w && (unsigned long)dst & 15) { d = *dst; *dst++ = pack_565_32_16 ( pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), _mm_movepi64_pi64 (xmm_alpha), expand565_16_1x64 (d)))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); while (w >= 8) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)dst); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst0, &xmm_dst1); over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_dst2, &xmm_dst3); xmm_dst = pack_565_4x128_128 ( &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); save_128_aligned ((__m128i*)dst, xmm_dst); dst += 8; w -= 8; } while (w--) { d = *dst; *dst++ = pack_565_32_16 ( pack_1x64_32 (over_1x64 (_mm_movepi64_pi64 (xmm_src), _mm_movepi64_pi64 (xmm_alpha), expand565_16_1x64 (d)))); } } _mm_empty (); } /* ------------------------------ * composite_add_n_8888_8888_ca */ static void sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src, srca; uint32_t *dst_line, d; uint32_t *mask_line, m; uint32_t pack_cmp; int dst_stride, mask_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); xmm_alpha = expand_alpha_1x128 (xmm_src); mmx_src = _mm_movepi64_pi64 (xmm_src); mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { int w = width; const uint32_t *pm = (uint32_t *)mask_line; uint32_t *pd = (uint32_t *)dst_line; dst_line += dst_stride; mask_line += mask_stride; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x64 (m); mmx_dest = unpack_32_1x64 (d); *pd = pack_1x64_32 ( _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); } pd++; w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 
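/* Main loop: four mask/destination pixels per iteration.  Each result is
 * dest plus src * mask with per-byte saturating addition; a block whose
 * mask is entirely zero is skipped via the movemask test below. */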
4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = _mm_movemask_epi8 ( _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ if (pack_cmp != 0xffff) { xmm_dst = load_128_aligned ((__m128i*)pd); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); pix_multiply_2x128 (&xmm_src, &xmm_src, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); save_128_aligned ( (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); } pd += 4; pm += 4; w -= 4; } while (w) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x64 (m); mmx_dest = unpack_32_1x64 (d); *pd = pack_1x64_32 ( _mm_adds_pu8 (pix_multiply_1x64 (mmx_mask, mmx_src), mmx_dest)); } pd++; w--; } } _mm_empty (); } /* --------------------------------------------------------------------------- * composite_over_n_8888_8888_ca */ static void sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src; uint32_t *dst_line, d; uint32_t *mask_line, m; uint32_t pack_cmp; int dst_stride, mask_stride; __m128i xmm_src, xmm_alpha; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = _mm_unpacklo_epi8 ( create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); xmm_alpha = expand_alpha_1x128 (xmm_src); mmx_src = _mm_movepi64_pi64 (xmm_src); mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { int w = width; const uint32_t *pm = (uint32_t *)mask_line; uint32_t *pd = (uint32_t *)dst_line; dst_line += dst_stride; mask_line += mask_stride; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w && (unsigned long)pd & 15) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x64 (m); mmx_dest = unpack_32_1x64 (d); *pd = pack_1x64_32 (in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); } pd++; w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)pd); cache_prefetch ((__m128i*)pm); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)pd); cache_prefetch_next ((__m128i*)pm); xmm_mask = load_128_unaligned ((__m128i*)pm); pack_cmp = _mm_movemask_epi8 ( _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ if (pack_cmp != 0xffff) { xmm_dst = load_128_aligned ((__m128i*)pd); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } pd += 4; pm += 4; w -= 4; } while (w) { m = *pm++; if (m) { d = *pd; mmx_mask = unpack_32_1x64 (m); mmx_dest = unpack_32_1x64 (d); *pd = pack_1x64_32 ( in_over_1x64 (&mmx_src, 
&mmx_alpha, &mmx_mask, &mmx_dest)); } pd++; w--; } } _mm_empty (); } /*--------------------------------------------------------------------- * composite_over_8888_n_8888 */ static void sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; uint16_t w; int dst_stride, src_stride; __m128i xmm_mask; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_alpha_lo, xmm_alpha_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); xmm_mask = create_mask_16_128 (mask >> 24); while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)src); while (w && (unsigned long)dst & 15) { uint32_t s = *src++; uint32_t d = *dst; __m64 ms = unpack_32_1x64 (s); __m64 alpha = expand_alpha_1x64 (ms); __m64 dest = _mm_movepi64_pi64 (xmm_mask); __m64 alpha_dst = unpack_32_1x64 (d); *dst++ = pack_1x64_32 ( in_over_1x64 (&ms, &alpha, &dest, &alpha_dst)); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)src); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)dst); cache_prefetch_next ((__m128i*)src); xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); dst += 4; src += 4; w -= 4; } while (w) { uint32_t s = *src++; uint32_t d = *dst; __m64 ms = unpack_32_1x64 (s); __m64 alpha = expand_alpha_1x64 (ms); __m64 mask = _mm_movepi64_pi64 (xmm_mask); __m64 dest = unpack_32_1x64 (d); *dst++ = pack_1x64_32 ( in_over_1x64 (&ms, &alpha, &mask, &dest)); w--; } } _mm_empty (); } /* --------------------------------------------------------------------- * composite_over_x888_n_8888 */ static void sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t *dst_line, *dst; uint32_t *src_line, *src; uint32_t mask; int dst_stride, src_stride; uint16_t w; __m128i xmm_mask, xmm_alpha; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); xmm_mask = create_mask_16_128 (mask >> 24); xmm_alpha = mask_00ff; while (height--) { dst = dst_line; dst_line += dst_stride; 
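/* Per scanline: advance one pixel at a time until dst is 16-byte aligned,
 * then process four pixels per iteration.  The x8r8g8b8 source is made
 * opaque by OR-ing in 0xff000000 before the in_over step against the
 * solid mask alpha. */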
src = src_line; src_line += src_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)src); while (w && (unsigned long)dst & 15) { uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; __m64 src = unpack_32_1x64 (s); __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); __m64 mask = _mm_movepi64_pi64 (xmm_mask); __m64 dest = unpack_32_1x64 (d); *dst++ = pack_1x64_32 ( in_over_1x64 (&src, &alpha, &mask, &dest)); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)src); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)dst); cache_prefetch_next ((__m128i*)src); xmm_src = _mm_or_si128 ( load_128_unaligned ((__m128i*)src), mask_ff000000); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha, &xmm_alpha, &xmm_mask, &xmm_mask, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); dst += 4; src += 4; w -= 4; } while (w) { uint32_t s = (*src++) | 0xff000000; uint32_t d = *dst; __m64 src = unpack_32_1x64 (s); __m64 alpha = _mm_movepi64_pi64 (xmm_alpha); __m64 mask = _mm_movepi64_pi64 (xmm_mask); __m64 dest = unpack_32_1x64 (d); *dst++ = pack_1x64_32 ( in_over_1x64 (&src, &alpha, &mask, &dest)); w--; } } _mm_empty (); } /* -------------------------------------------------------------------- * composite_over_8888_8888 */ static void sse2_composite_over_8888_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { int dst_stride, src_stride; uint32_t *dst_line, *dst; uint32_t *src_line, *src; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); dst = dst_line; src = src_line; while (height--) { core_combine_over_u_sse2 (dst, src, NULL, width); dst += dst_stride; src += src_stride; } _mm_empty (); } /* ------------------------------------------------------------------ * composite_over_8888_0565 */ static force_inline uint16_t composite_over_8888_0565pixel (uint32_t src, uint16_t dst) { __m64 ms; ms = unpack_32_1x64 (src); return pack_565_32_16 ( pack_1x64_32 ( over_1x64 ( ms, expand_alpha_1x64 (ms), expand565_16_1x64 (dst)))); } static void sse2_composite_over_8888_0565 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint16_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; uint16_t w; __m128i xmm_alpha_lo, xmm_alpha_hi; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME * * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
*/ assert (src_image->drawable == mask_image->drawable); #endif while (height--) { dst = dst_line; src = src_line; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); dst_line += dst_stride; src_line += src_stride; w = width; /* Align dst on a 16-byte boundary */ while (w && ((unsigned long)dst & 15)) { s = *src++; d = *dst; *dst++ = composite_over_8888_0565pixel (s, d); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); /* It's a 8 pixel loop */ while (w >= 8) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)src); cache_prefetch_next ((__m128i*)dst); /* I'm loading unaligned because I'm not sure * about the address alignment. */ xmm_src = load_128_unaligned ((__m128i*) src); xmm_dst = load_128_aligned ((__m128i*) dst); /* Unpacking */ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); /* I'm loading next 4 pixels from memory * before to optimze the memory read. */ xmm_src = load_128_unaligned ((__m128i*) (src + 4)); over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst0, &xmm_dst1); /* Unpacking */ unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, &xmm_dst2, &xmm_dst3); save_128_aligned ( (__m128i*)dst, pack_565_4x128_128 ( &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); w -= 8; dst += 8; src += 8; } while (w--) { s = *src++; d = *dst; *dst++ = composite_over_8888_0565pixel (s, d); } } _mm_empty (); } /* ----------------------------------------------------------------- * composite_over_n_8_8888 */ static void sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; uint16_t w; uint32_t m, d; __m128i xmm_src, xmm_alpha, xmm_def; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); xmm_def = create_mask_2x32_128 (src, src); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); mmx_src = _mm_movepi64_pi64 (xmm_src); mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; if (m) { d = *dst; mmx_mask = expand_pixel_8_1x64 (m); mmx_dest = unpack_32_1x64 (d); *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); } w--; dst++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch 
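/* The loop below fetches four a8 mask bytes as a single 32-bit load: an
 * opaque source with all four bytes 0xff stores the premade solid value
 * xmm_def directly, an all-zero mask leaves the destination untouched,
 * and anything else goes through the expand/in_over path. */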
((__m128i*)dst); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) { save_128_aligned ((__m128i*)dst, xmm_def); } else if (m) { xmm_dst = load_128_aligned ((__m128i*) dst); xmm_mask = unpack_32_1x128 (m); xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); /* Unpacking */ unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } w -= 4; dst += 4; mask += 4; } while (w) { uint8_t m = *mask++; if (m) { d = *dst; mmx_mask = expand_pixel_8_1x64 (m); mmx_dest = unpack_32_1x64 (d); *dst = pack_1x64_32 (in_over_1x64 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); } w--; dst++; } } _mm_empty (); } /* ---------------------------------------------------------------- * composite_over_n_8_8888 */ pixman_bool_t pixman_fill_sse2 (uint32_t *bits, int stride, int bpp, int x, int y, int width, int height, uint32_t data) { uint32_t byte_width; uint8_t *byte_line; __m128i xmm_def; if (bpp == 16 && (data >> 16 != (data & 0xffff))) return FALSE; if (bpp != 16 && bpp != 32) return FALSE; if (bpp == 16) { stride = stride * (int) sizeof (uint32_t) / 2; byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); byte_width = 2 * width; stride *= 2; } else { stride = stride * (int) sizeof (uint32_t) / 4; byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); byte_width = 4 * width; stride *= 4; } cache_prefetch ((__m128i*)byte_line); xmm_def = create_mask_2x32_128 (data, data); while (height--) { int w; uint8_t *d = byte_line; byte_line += stride; w = byte_width; cache_prefetch_next ((__m128i*)d); while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = data; w -= 2; d += 2; } while (w >= 4 && ((unsigned long)d & 15)) { *(uint32_t *)d = data; w -= 4; d += 4; } cache_prefetch_next ((__m128i*)d); while (w >= 128) { cache_prefetch (((__m128i*)d) + 12); save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); save_128_aligned ((__m128i*)(d + 48), xmm_def); save_128_aligned ((__m128i*)(d + 64), xmm_def); save_128_aligned ((__m128i*)(d + 80), xmm_def); save_128_aligned ((__m128i*)(d + 96), xmm_def); save_128_aligned ((__m128i*)(d + 112), xmm_def); d += 128; w -= 128; } if (w >= 64) { cache_prefetch (((__m128i*)d) + 8); save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); save_128_aligned ((__m128i*)(d + 32), xmm_def); save_128_aligned ((__m128i*)(d + 48), xmm_def); d += 64; w -= 64; } cache_prefetch_next ((__m128i*)d); if (w >= 32) { save_128_aligned ((__m128i*)(d), xmm_def); save_128_aligned ((__m128i*)(d + 16), xmm_def); d += 32; w -= 32; } if (w >= 16) { save_128_aligned ((__m128i*)(d), xmm_def); d += 16; w -= 16; } cache_prefetch_next ((__m128i*)d); while (w >= 4) { *(uint32_t *)d = data; w -= 4; d += 4; } if (w >= 2) { *(uint16_t *)d = data; w -= 2; d += 2; } } _mm_empty (); return TRUE; } static void sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, 
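/* SRC of a solid source through an a8 mask into an 8888 destination:
 * each pixel becomes src * mask, and a zero mask writes zero rather than
 * leaving the destination alone, since SRC replaces rather than blends. */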
int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src, srca; uint32_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; uint16_t w; uint32_t m; __m128i xmm_src, xmm_def; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) { pixman_fill_sse2 (dst_image->bits.bits, dst_image->bits.rowstride, PIXMAN_FORMAT_BPP (dst_image->bits.format), dest_x, dest_y, width, height, 0); return; } PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); xmm_def = create_mask_2x32_128 (src, src); xmm_src = expand_pixel_32_1x128 (src); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && (unsigned long)dst & 15) { uint8_t m = *mask++; if (m) { *dst = pack_1x64_32 ( pix_multiply_1x64 ( _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); } else { *dst = 0; } w--; dst++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); m = *((uint32_t*)mask); if (srca == 0xff && m == 0xffffffff) { save_128_aligned ((__m128i*)dst, xmm_def); } else if (m) { xmm_mask = unpack_32_1x128 (m); xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); /* Unpacking */ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_multiply_2x128 (&xmm_src, &xmm_src, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); } else { save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); } w -= 4; dst += 4; mask += 4; } while (w) { uint8_t m = *mask++; if (m) { *dst = pack_1x64_32 ( pix_multiply_1x64 ( _mm_movepi64_pi64 (xmm_src), expand_pixel_8_1x64 (m))); } else { *dst = 0; } w--; dst++; } } _mm_empty (); } /*----------------------------------------------------------------------- * composite_over_n_8_0565 */ static void sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src, srca; uint16_t *dst_line, *dst, d; uint8_t *mask_line, *mask; int dst_stride, mask_stride; uint16_t w; uint32_t m; __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; __m128i xmm_src, xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; src = _pixman_image_get_solid (src_image, dst_image->bits.format); srca = src >> 24; if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); mmx_src = _mm_movepi64_pi64 (xmm_src); mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; /* 
call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && (unsigned long)dst & 15) { m = *mask++; if (m) { d = *dst; mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); mmx_dest = expand565_16_1x64 (d); *dst = pack_565_32_16 ( pack_1x64_32 ( in_over_1x64 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } w--; dst++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w >= 8) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); xmm_dst = load_128_aligned ((__m128i*) dst); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); m = *((uint32_t*)mask); mask += 4; if (m) { xmm_mask = unpack_32_1x128 (m); xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); /* Unpacking */ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1); } m = *((uint32_t*)mask); mask += 4; if (m) { xmm_mask = unpack_32_1x128 (m); xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); /* Unpacking */ unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3); } save_128_aligned ( (__m128i*)dst, pack_565_4x128_128 ( &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); w -= 8; dst += 8; } while (w) { m = *mask++; if (m) { d = *dst; mmx_mask = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); mmx_dest = expand565_16_1x64 (d); *dst = pack_565_32_16 ( pack_1x64_32 ( in_over_1x64 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } w--; dst++; } } _mm_empty (); } /* ----------------------------------------------------------------------- * composite_over_pixbuf_0565 */ static void sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint16_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; uint16_t w; uint32_t opaque, zero; __m64 ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME * * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
*/ assert (src_image->drawable == mask_image->drawable); #endif while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w && (unsigned long)dst & 15) { s = *src++; d = *dst; ms = unpack_32_1x64 (s); *dst++ = pack_565_32_16 ( pack_1x64_32 ( over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w >= 8) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)src); cache_prefetch_next ((__m128i*)dst); /* First round */ xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); opaque = is_opaque (xmm_src); zero = is_zero (xmm_src); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); /* preload next round*/ xmm_src = load_128_unaligned ((__m128i*)(src + 4)); if (opaque) { invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1); } else if (!zero) { over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst0, &xmm_dst1); } /* Second round */ opaque = is_opaque (xmm_src); zero = is_zero (xmm_src); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); if (opaque) { invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3); } else if (!zero) { over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst2, &xmm_dst3); } save_128_aligned ( (__m128i*)dst, pack_565_4x128_128 ( &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); w -= 8; src += 8; dst += 8; } while (w) { s = *src++; d = *dst; ms = unpack_32_1x64 (s); *dst++ = pack_565_32_16 ( pack_1x64_32 ( over_rev_non_pre_1x64 (ms, expand565_16_1x64 (d)))); w--; } } _mm_empty (); } /* ------------------------------------------------------------------------- * composite_over_pixbuf_8888 */ static void sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t *dst_line, *dst, d; uint32_t *src_line, *src, s; int dst_stride, src_stride; uint16_t w; uint32_t opaque, zero; __m128i xmm_src_lo, xmm_src_hi; __m128i xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); #if 0 /* FIXME * * I copy the code from MMX one and keep the fixme. * If it's a problem there, probably is a problem here. 
*/ assert (src_image->drawable == mask_image->drawable); #endif while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w && (unsigned long)dst & 15) { s = *src++; d = *dst; *dst++ = pack_1x64_32 ( over_rev_non_pre_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)src); cache_prefetch_next ((__m128i*)dst); xmm_src_hi = load_128_unaligned ((__m128i*)src); opaque = is_opaque (xmm_src_hi); zero = is_zero (xmm_src_hi); unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); if (opaque) { invert_colors_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } else if (!zero) { xmm_dst_hi = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } w -= 4; dst += 4; src += 4; } while (w) { s = *src++; d = *dst; *dst++ = pack_1x64_32 ( over_rev_non_pre_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (d))); w--; } } _mm_empty (); } /* ------------------------------------------------------------------------------------------------- * composite_over_n_8888_0565_ca */ static void sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t src; uint16_t *dst_line, *dst, d; uint32_t *mask_line, *mask, m; int dst_stride, mask_stride; int w; uint32_t pack_cmp; __m128i xmm_src, xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; __m64 mmx_src, mmx_alpha, mmx_mask, mmx_dest; src = _pixman_image_get_solid (src_image, dst_image->bits.format); if (src == 0) return; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); xmm_src = expand_pixel_32_1x128 (src); xmm_alpha = expand_alpha_1x128 (xmm_src); mmx_src = _mm_movepi64_pi64 (xmm_src); mmx_alpha = _mm_movepi64_pi64 (xmm_alpha); while (height--) { w = width; mask = mask_line; dst = dst_line; mask_line += mask_stride; dst_line += dst_stride; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && ((unsigned long)dst & 15)) { m = *(uint32_t *) mask; if (m) { d = *dst; mmx_mask = unpack_32_1x64 (m); mmx_dest = expand565_16_1x64 (d); *dst = pack_565_32_16 ( pack_1x64_32 ( in_over_1x64 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } w--; dst++; mask++; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w >= 8) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); /* First round */ xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); pack_cmp = _mm_movemask_epi8 ( _mm_cmpeq_epi32 
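/* Comparing each 32-bit mask value with zero and collecting the byte
 * sign bits yields 0xffff only when all four mask pixels are zero, so
 * each half of the 8-pixel block can be skipped when fully transparent. */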
(xmm_mask, _mm_setzero_si128 ())); unpack_565_128_4x128 (xmm_dst, &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); /* preload next round */ xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); /* preload next round */ if (pack_cmp != 0xffff) { in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst0, &xmm_dst1); } /* Second round */ pack_cmp = _mm_movemask_epi8 ( _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); if (pack_cmp != 0xffff) { in_over_2x128 (&xmm_src, &xmm_src, &xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst2, &xmm_dst3); } save_128_aligned ( (__m128i*)dst, pack_565_4x128_128 ( &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); w -= 8; dst += 8; mask += 8; } while (w) { m = *(uint32_t *) mask; if (m) { d = *dst; mmx_mask = unpack_32_1x64 (m); mmx_dest = expand565_16_1x64 (d); *dst = pack_565_32_16 ( pack_1x64_32 ( in_over_1x64 ( &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); } w--; dst++; mask++; } } _mm_empty (); } /* ----------------------------------------------------------------------- * composite_in_n_8_8 */ static void sse2_composite_in_n_8_8 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; uint16_t w, d, m; uint32_t src; uint8_t sa; __m128i xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 (_mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w >= 16) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); mask += 16; dst += 16; w -= 16; } while (w) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( pix_multiply_1x64 ( pix_multiply_1x64 ( _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } } _mm_empty (); } /* 
--------------------------------------------------------------------------- * composite_in_8_8 */ static void sse2_composite_in_8_8 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint8_t *dst_line, *dst; uint8_t *src_line, *src; int src_stride, dst_stride; uint16_t w; uint32_t s, d; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w && ((unsigned long)dst & 15)) { s = (uint32_t) *src++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( pix_multiply_1x64 ( unpack_32_1x64 (s), unpack_32_1x64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); while (w >= 16) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)src); cache_prefetch_next ((__m128i*)dst); xmm_src = load_128_unaligned ((__m128i*)src); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_dst_lo, &xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); src += 16; dst += 16; w -= 16; } while (w) { s = (uint32_t) *src++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( pix_multiply_1x64 (unpack_32_1x64 (s), unpack_32_1x64 (d))); w--; } } _mm_empty (); } /* ------------------------------------------------------------------------- * composite_add_n_8_8 */ static void sse2_composite_add_n_8_8 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint8_t *dst_line, *dst; uint8_t *mask_line, *mask; int dst_stride, mask_stride; uint16_t w; uint32_t src; uint8_t sa; uint32_t m, d; __m128i xmm_alpha; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); src = _pixman_image_get_solid (src_image, dst_image->bits.format); sa = src >> 24; xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); while (height--) { dst = dst_line; dst_line += dst_stride; mask = mask_line; mask_line += mask_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w && ((unsigned long)dst & 15)) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( _mm_adds_pu16 ( pix_multiply_1x64 ( _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)mask); cache_prefetch ((__m128i*)dst); while (w >= 16) { /* fill cache line with 
next memory */ cache_prefetch_next ((__m128i*)mask); cache_prefetch_next ((__m128i*)dst); xmm_mask = load_128_unaligned ((__m128i*)mask); xmm_dst = load_128_aligned ((__m128i*)dst); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, &xmm_mask_lo, &xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); save_128_aligned ( (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); mask += 16; dst += 16; w -= 16; } while (w) { m = (uint32_t) *mask++; d = (uint32_t) *dst; *dst++ = (uint8_t) pack_1x64_32 ( _mm_adds_pu16 ( pix_multiply_1x64 ( _mm_movepi64_pi64 (xmm_alpha), unpack_32_1x64 (m)), unpack_32_1x64 (d))); w--; } } _mm_empty (); } /* ---------------------------------------------------------------------- * composite_add_8000_8000 */ static void sse2_composite_add_8000_8000 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint8_t *dst_line, *dst; uint8_t *src_line, *src; int dst_stride, src_stride; uint16_t w; uint16_t t; PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); while (height--) { dst = dst_line; src = src_line; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); dst_line += dst_stride; src_line += src_stride; w = width; /* Small head */ while (w && (unsigned long)dst & 3) { t = (*dst) + (*src++); *dst++ = t | (0 - (t >> 8)); w--; } core_combine_add_u_sse2 ((uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); /* Small tail */ dst += w & 0xfffc; src += w & 0xfffc; w &= 3; while (w) { t = (*dst) + (*src++); *dst++ = t | (0 - (t >> 8)); w--; } } _mm_empty (); } /* --------------------------------------------------------------------- * composite_add_8888_8888 */ static void sse2_composite_add_8888_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t *dst_line, *dst; uint32_t *src_line, *src; int dst_stride, src_stride; PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); while (height--) { dst = dst_line; dst_line += dst_stride; src = src_line; src_line += src_stride; core_combine_add_u_sse2 (dst, src, NULL, width); } _mm_empty (); } /* ------------------------------------------------------------------------------------------------- * sse2_composite_copy_area */ static pixman_bool_t pixman_blt_sse2 (uint32_t *src_bits, uint32_t *dst_bits, int src_stride, int dst_stride, int src_bpp, int dst_bpp, int src_x, int src_y, int dst_x, int dst_y, int width, int height) { uint8_t * src_bytes; uint8_t * dst_bytes; int byte_width; if (src_bpp != dst_bpp) return FALSE; if (src_bpp == 16) { src_stride = src_stride * (int) sizeof (uint32_t) / 2; dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); dst_bytes = 
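/* The stride arguments are in uint32_t units; they are converted to
 * 16-bit pixel units here to compute the start addresses and scaled to
 * bytes further down for the copy loop. */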
(uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); byte_width = 2 * width; src_stride *= 2; dst_stride *= 2; } else if (src_bpp == 32) { src_stride = src_stride * (int) sizeof (uint32_t) / 4; dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x)); byte_width = 4 * width; src_stride *= 4; dst_stride *= 4; } else { return FALSE; } cache_prefetch ((__m128i*)src_bytes); cache_prefetch ((__m128i*)dst_bytes); while (height--) { int w; uint8_t *s = src_bytes; uint8_t *d = dst_bytes; src_bytes += src_stride; dst_bytes += dst_stride; w = byte_width; cache_prefetch_next ((__m128i*)s); cache_prefetch_next ((__m128i*)d); while (w >= 2 && ((unsigned long)d & 3)) { *(uint16_t *)d = *(uint16_t *)s; w -= 2; s += 2; d += 2; } while (w >= 4 && ((unsigned long)d & 15)) { *(uint32_t *)d = *(uint32_t *)s; w -= 4; s += 4; d += 4; } cache_prefetch_next ((__m128i*)s); cache_prefetch_next ((__m128i*)d); while (w >= 64) { __m128i xmm0, xmm1, xmm2, xmm3; /* 128 bytes ahead */ cache_prefetch (((__m128i*)s) + 8); cache_prefetch (((__m128i*)d) + 8); xmm0 = load_128_unaligned ((__m128i*)(s)); xmm1 = load_128_unaligned ((__m128i*)(s + 16)); xmm2 = load_128_unaligned ((__m128i*)(s + 32)); xmm3 = load_128_unaligned ((__m128i*)(s + 48)); save_128_aligned ((__m128i*)(d), xmm0); save_128_aligned ((__m128i*)(d + 16), xmm1); save_128_aligned ((__m128i*)(d + 32), xmm2); save_128_aligned ((__m128i*)(d + 48), xmm3); s += 64; d += 64; w -= 64; } cache_prefetch_next ((__m128i*)s); cache_prefetch_next ((__m128i*)d); while (w >= 16) { save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); w -= 16; d += 16; s += 16; } cache_prefetch_next ((__m128i*)s); cache_prefetch_next ((__m128i*)d); while (w >= 4) { *(uint32_t *)d = *(uint32_t *)s; w -= 4; s += 4; d += 4; } if (w >= 2) { *(uint16_t *)d = *(uint16_t *)s; w -= 2; s += 2; d += 2; } } _mm_empty (); return TRUE; } static void sse2_composite_copy_area (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { pixman_blt_sse2 (src_image->bits.bits, dst_image->bits.bits, src_image->bits.rowstride, dst_image->bits.rowstride, PIXMAN_FORMAT_BPP (src_image->bits.format), PIXMAN_FORMAT_BPP (dst_image->bits.format), src_x, src_y, dest_x, dest_y, width, height); } static void sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, pixman_op_t op, pixman_image_t * src_image, pixman_image_t * mask_image, pixman_image_t * dst_image, int32_t src_x, int32_t src_y, int32_t mask_x, int32_t mask_y, int32_t dest_x, int32_t dest_y, int32_t width, int32_t height) { uint32_t *src, *src_line, s; uint32_t *dst, *dst_line, d; uint8_t *mask, *mask_line; uint32_t m; int src_stride, mask_stride, dst_stride; uint16_t w; __m64 ms; __m128i xmm_src, xmm_src_lo, xmm_src_hi; __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; PIXMAN_IMAGE_GET_LINE ( dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); PIXMAN_IMAGE_GET_LINE ( mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); PIXMAN_IMAGE_GET_LINE ( src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); while (height--) { src = src_line; src_line += src_stride; dst = dst_line; dst_line += dst_stride; mask = 
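/* x888 source OVER an 8888 destination through an a8 mask: the source is
 * forced opaque with 0xff000000, a fully set mask copies it straight
 * through, and partial coverage goes through in_over with mask_00ff
 * standing in for the (opaque) source alpha. */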
mask_line; mask_line += mask_stride; w = width; /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)mask); while (w && (unsigned long)dst & 15) { s = 0xff000000 | *src++; m = (uint32_t) *mask++; d = *dst; ms = unpack_32_1x64 (s); if (m != 0xff) { __m64 ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); __m64 md = unpack_32_1x64 (d); ms = in_over_1x64 (&ms, &mask_x00ff, &ma, &md); } *dst++ = pack_1x64_32 (ms); w--; } /* call prefetch hint to optimize cache load*/ cache_prefetch ((__m128i*)src); cache_prefetch ((__m128i*)dst); cache_prefetch ((__m128i*)mask); while (w >= 4) { /* fill cache line with next memory */ cache_prefetch_next ((__m128i*)src); cache_prefetch_next ((__m128i*)dst); cache_prefetch_next ((__m128i*)mask); m = *(uint32_t*) mask; xmm_src = _mm_or_si128 (load_128_unaligned ((__m128i*)src), mask_ff000000); if (m == 0xffffffff) { save_128_aligned ((__m128i*)dst, xmm_src); } else { xmm_dst = load_128_aligned ((__m128i*)dst); xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); } src += 4; dst += 4; mask += 4; w -= 4; } while (w) { m = (uint32_t) *mask++; if (m) { s = 0xff000000 | *src; if (m == 0xff) { *dst = s; } else { __m64 ma, md, ms; d = *dst; ma = expand_alpha_rev_1x64 (unpack_32_1x64 (m)); md = unpack_32_1x64 (d); ms = unpack_32_1x64 (s); *dst = pack_1x64_32 (in_over_1x64 (&ms, &mask_x00ff, &ma, &md)); } } src++; dst++; w--; } } _mm_empty (); } static const pixman_fast_path_t sse2_fast_paths[] = { { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_r5g6b5, sse2_composite_over_n_8_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_b5g6r5, sse2_composite_over_n_8_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_n_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_over_8888_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_over_8888_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_over_8888_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_over_8888_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_over_8888_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_over_8888_0565, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, { PIXMAN_OP_OVER, 
PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_8_8888, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_x888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_a8r8g8b8, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_a8b8g8r8, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_over_8888_n_8888, NEED_SOLID_MASK }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_n_8888_8888_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_solid, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_n_8888_0565_ca, NEED_COMPONENT_ALPHA }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_a8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_x8r8g8b8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_a8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_x8b8g8r8, sse2_composite_over_pixbuf_8888, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8r8g8b8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_a8b8g8r8, PIXMAN_r5g6b5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8r8g8b8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_a8b8g8r8, PIXMAN_b5g6r5, sse2_composite_over_pixbuf_0565, NEED_PIXBUF }, { PIXMAN_OP_OVER, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 }, { PIXMAN_OP_OVER, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 }, { 
      PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8r8g8b8, PIXMAN_a8r8g8b8, sse2_composite_add_n_8888_8888_ca, NEED_COMPONENT_ALPHA },
    { PIXMAN_OP_ADD, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_add_8000_8000, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_add_8888_8888, 0 },
    { PIXMAN_OP_ADD, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_add_8888_8888, 0 },
    { PIXMAN_OP_ADD, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_add_n_8_8, 0 },

    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8r8g8b8, sse2_composite_src_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8r8g8b8, sse2_composite_src_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8b8g8r8, sse2_composite_src_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_solid, PIXMAN_a8, PIXMAN_x8b8g8r8, sse2_composite_src_n_8_8888, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_a8r8g8b8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_a8b8g8r8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_a8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8r8g8b8, PIXMAN_null, PIXMAN_x8r8g8b8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_x8b8g8r8, PIXMAN_null, PIXMAN_x8b8g8r8, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_r5g6b5, PIXMAN_null, PIXMAN_r5g6b5, sse2_composite_copy_area, 0 },
    { PIXMAN_OP_SRC, PIXMAN_b5g6r5, PIXMAN_null, PIXMAN_b5g6r5, sse2_composite_copy_area, 0 },

    { PIXMAN_OP_IN, PIXMAN_a8, PIXMAN_null, PIXMAN_a8, sse2_composite_in_8_8, 0 },
    { PIXMAN_OP_IN, PIXMAN_solid, PIXMAN_a8, PIXMAN_a8, sse2_composite_in_n_8_8, 0 },

    { PIXMAN_OP_NONE },
};

/*
 * Work around GCC bug causing crashes in Mozilla with SSE2
 *
 * When using -msse, gcc generates movdqa instructions assuming that
 * the stack is 16 byte aligned. Unfortunately some applications, such
 * as Mozilla and Mono, end up aligning the stack to 4 bytes, which
 * causes the movdqa instructions to fail.
 *
 * The __force_align_arg_pointer__ makes gcc generate a prologue that
 * realigns the stack pointer to 16 bytes.
 *
 * On x86-64 this is not necessary because the standard ABI already
 * calls for a 16 byte aligned stack.
 *
 * See https://bugs.freedesktop.org/show_bug.cgi?id=15693
 */
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static void
sse2_composite (pixman_implementation_t *imp,
                pixman_op_t              op,
                pixman_image_t *         src,
                pixman_image_t *         mask,
                pixman_image_t *         dest,
                int32_t                  src_x,
                int32_t                  src_y,
                int32_t                  mask_x,
                int32_t                  mask_y,
                int32_t                  dest_x,
                int32_t                  dest_y,
                int32_t                  width,
                int32_t                  height)
{
    if (_pixman_run_fast_path (sse2_fast_paths, imp,
                               op, src, mask, dest,
                               src_x, src_y,
                               mask_x, mask_y,
                               dest_x, dest_y,
                               width, height))
    {
        return;
    }

    _pixman_implementation_composite (imp->delegate, op,
                                      src, mask, dest,
                                      src_x, src_y,
                                      mask_x, mask_y,
                                      dest_x, dest_y,
                                      width, height);
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_blt (pixman_implementation_t *imp,
          uint32_t *               src_bits,
          uint32_t *               dst_bits,
          int                      src_stride,
          int                      dst_stride,
          int                      src_bpp,
          int                      dst_bpp,
          int                      src_x,
          int                      src_y,
          int                      dst_x,
          int                      dst_y,
          int                      width,
          int                      height)
{
    if (!pixman_blt_sse2 (
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height))
    {
        return _pixman_implementation_blt (
            imp->delegate,
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
            src_x, src_y, dst_x, dst_y, width, height);
    }

    return TRUE;
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
static pixman_bool_t
sse2_fill (pixman_implementation_t *imp,
           uint32_t *               bits,
           int                      stride,
           int                      bpp,
           int                      x,
           int                      y,
           int                      width,
           int                      height,
           uint32_t                 xor)
{
    if (!pixman_fill_sse2 (bits, stride, bpp, x, y, width, height, xor))
    {
        return _pixman_implementation_fill (
            imp->delegate, bits, stride, bpp, x, y, width, height, xor);
    }

    return TRUE;
}

#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (void)
{
    pixman_implementation_t *mmx = _pixman_implementation_create_mmx ();
    pixman_implementation_t *imp = _pixman_implementation_create (mmx);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);

    /* MMX constants */
    mask_x565_rgb = create_mask_2x32_64 (0x000001f0, 0x003f001f);
    mask_x565_unpack = create_mask_2x32_64 (0x00000084, 0x04100840);

    mask_x0080 = create_mask_16_64 (0x0080);
    mask_x00ff = create_mask_16_64 (0x00ff);
    mask_x0101 = create_mask_16_64 (0x0101);
    mask_x_alpha = create_mask_2x32_64 (0x00ff0000, 0x00000000);

    _mm_empty ();

    /* Set up function pointers */

    /* SSE code patch for fbcompose.c */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] =
        sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->composite = sse2_composite;
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    return imp;
}

#endif /* USE_SSE2 */
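
/*
 * Usage sketch (illustrative only, not part of this file): the
 * implementation returned by _pixman_implementation_create_sse2 () sits at
 * the head of a delegate chain (SSE2 -> MMX -> general), so callers only
 * need to create it when SSE2 is actually available and every operation
 * without an SSE2 fast path falls through imp->delegate automatically.
 * The dispatch below is a minimal sketch of that idea, assuming a CPU
 * feature check named pixman_have_sse2 (); it does not reproduce the real
 * selection logic in pixman-cpu.c.
 */
#if 0
static pixman_implementation_t *
choose_implementation (void)
{
    /* Prefer the SSE2 chain when the CPU supports it ... */
    if (pixman_have_sse2 ())
        return _pixman_implementation_create_sse2 ();

    /* ... otherwise fall back to the portable general implementation. */
    return _pixman_implementation_create_general ();
}
#endif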