diff options
author | Matt Turner <mattst88@gmail.com> | 2012-02-21 23:33:02 -0500 |
---|---|---|
committer | Matt Turner <mattst88@gmail.com> | 2012-02-24 08:46:48 -0500 |
commit | 350e231b3f01d6f82a2fdc7d9a9945234c404d0a (patch) | |
tree | a58498318b78321315798138950bb7dc2bedf417 | |
parent | ab68316eda91bbf6bb41158c622347723e1fa8c4 (diff) |
mmx: make load8888 take a pointer to data instead of the data itself
Allows us to tune how we load data into the vector registers.
Signed-off-by: Matt Turner <mattst88@gmail.com>
And squashed in:
mmx: define and use load8888u function
For unaligned loads.
Signed-off-by: Matt Turner <mattst88@gmail.com>
-rw-r--r-- | pixman/pixman-mmx.c | 277 |
1 files changed, 148 insertions, 129 deletions
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c index 6ffeb4c..bd44f63 100644 --- a/pixman/pixman-mmx.c +++ b/pixman/pixman-mmx.c @@ -347,5 +347,12 @@ static __inline__ uint32_t ldl_u(uint32_t *p) static force_inline __m64 -load8888 (uint32_t v) +load8888 (const uint32_t *v) { - return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); + return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (*v), _mm_setzero_si64 ()); +} + +static force_inline __m64 +load8888u (const uint32_t *v) +{ + uint32_t l = ldl_u(v); + return load8888(&l); } @@ -473,4 +480,4 @@ combine (const uint32_t *src, const uint32_t *mask) { - __m64 m = load8888 (*mask); - __m64 s = load8888 (ssrc); + __m64 m = load8888 (mask); + __m64 s = load8888 (&ssrc); @@ -507,5 +514,5 @@ mmx_combine_over_u (pixman_implementation_t *imp, __m64 s, sa; - s = load8888 (ssrc); + s = load8888 (&ssrc); sa = expand_alpha (s); - store8888 (dest, over (s, sa, load8888 (*dest))); + store8888 (dest, over (s, sa, load8888 (dest))); } @@ -535,5 +542,5 @@ mmx_combine_over_reverse_u (pixman_implementation_t *imp, - d = load8888 (*dest); + d = load8888 (dest); da = expand_alpha (d); - store8888 (dest, over (d, da, load8888 (s))); + store8888 (dest, over (d, da, load8888 (&s))); @@ -560,5 +567,6 @@ mmx_combine_in_u (pixman_implementation_t *imp, __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); @@ -589,5 +597,6 @@ mmx_combine_in_reverse_u (pixman_implementation_t *imp, __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); @@ -617,5 +626,6 @@ mmx_combine_out_u (pixman_implementation_t *imp, __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (combine (src, mask)); - a = load8888 (*dest); + x = load8888 (&ssrc); + a = load8888 (dest); a = expand_alpha (a); 
@@ -646,5 +656,6 @@ mmx_combine_out_reverse_u (pixman_implementation_t *imp, __m64 x, a; + uint32_t ssrc = combine (src, mask); - x = load8888 (*dest); - a = load8888 (combine (src, mask)); + x = load8888 (dest); + a = load8888 (&ssrc); a = expand_alpha (a); @@ -676,5 +687,6 @@ mmx_combine_atop_u (pixman_implementation_t *imp, __m64 s, da, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); @@ -708,5 +720,6 @@ mmx_combine_atop_reverse_u (pixman_implementation_t *imp, __m64 s, dia, d, sa; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sa = expand_alpha (s); @@ -738,5 +751,6 @@ mmx_combine_xor_u (pixman_implementation_t *imp, __m64 s, dia, d, sia; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); sia = expand_alpha (s); @@ -769,5 +783,6 @@ mmx_combine_add_u (pixman_implementation_t *imp, __m64 s, d; + uint32_t ssrc = combine (src, mask); - s = load8888 (combine (src, mask)); - d = load8888 (*dest); + s = load8888 (&ssrc); + d = load8888 (dest); s = pix_add (s, d); @@ -797,4 +812,4 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, uint32_t d = *dest; - __m64 ms = load8888 (s); - __m64 md = load8888 (d); + __m64 ms = load8888 (&s); + __m64 md = load8888 (&d); uint32_t sa = s >> 24; @@ -804,3 +819,4 @@ mmx_combine_saturate_u (pixman_implementation_t *imp, { - __m64 msa = load8888 (DIV_UN8 (da, sa) << 24); + uint32_t quot = DIV_UN8 (da, sa) << 24; + __m64 msa = load8888 (&quot); msa = expand_alpha (msa); @@ -832,4 +848,4 @@ mmx_combine_src_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); @@ -857,5 +873,5 @@ mmx_combine_over_ca 
(pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); @@ -883,5 +899,5 @@ mmx_combine_over_reverse_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -909,5 +925,5 @@ mmx_combine_in_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -937,5 +953,5 @@ mmx_combine_in_reverse_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); @@ -965,5 +981,5 @@ mmx_combine_out_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -994,5 +1010,5 @@ mmx_combine_out_reverse_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 sa = expand_alpha (s); @@ -1023,5 +1039,5 @@ mmx_combine_atop_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -1054,5 +1070,5 @@ mmx_combine_atop_reverse_ca (pixman_implementation_t 
*imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -1085,5 +1101,5 @@ mmx_combine_xor_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); __m64 da = expand_alpha (d); @@ -1117,5 +1133,5 @@ mmx_combine_add_ca (pixman_implementation_t *imp, { - __m64 a = load8888 (*mask); - __m64 s = load8888 (*src); - __m64 d = load8888 (*dest); + __m64 a = load8888 (mask); + __m64 s = load8888 (src); + __m64 d = load8888 (dest); @@ -1154,3 +1170,3 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -1167,3 +1183,3 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, { - store8888 (dst, over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); @@ -1193,3 +1209,3 @@ mmx_composite_over_n_8888 (pixman_implementation_t *imp, { - store8888 (dst, over (vsrc, vsrca, load8888 (*dst))); + store8888 (dst, over (vsrc, vsrca, load8888 (dst))); } @@ -1220,3 +1236,3 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -1299,3 +1315,3 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -1314,4 +1330,4 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); store8888 (q, vdest); @@ -1335,5 +1351,5 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, - dest0 = in_over 
(vsrc, vsrca, load8888 (m0), + dest0 = in_over (vsrc, vsrca, load8888 (&m0), expand8888 (vdest, 0)); - dest1 = in_over (vsrc, vsrca, load8888 (m1), + dest1 = in_over (vsrc, vsrca, load8888 (&m1), expand8888 (vdest, 1)); @@ -1354,4 +1370,4 @@ mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, { - __m64 vdest = load8888 (*q); - vdest = in_over (vsrc, vsrca, load8888 (m), vdest); + __m64 vdest = load8888 (q); + vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); store8888 (q, vdest); @@ -1391,3 +1407,3 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); @@ -1403,4 +1419,4 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); @@ -1431,4 +1447,4 @@ mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); @@ -1462,3 +1478,3 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, mask = mask | mask >> 8 | mask >> 16 | mask >> 24; - vmask = load8888 (mask); + vmask = load8888 (&mask); srca = MC (4x00ff); @@ -1475,4 +1491,5 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); @@ -1553,4 +1570,5 @@ mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src | 0xff000000); - __m64 d = load8888 (*dst); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); + __m64 d = load8888 (dst); @@ -1604,5 +1622,5 @@ mmx_composite_over_8888_8888 (pixman_implementation_t *imp, __m64 ms, sa; - ms = load8888 (s); + ms = load8888 (&s); sa = expand_alpha (ms); - store8888 (dst, over (ms, sa, 
load8888 (*dst))); + store8888 (dst, over (ms, sa, load8888 (dst))); } @@ -1647,3 +1665,3 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; @@ -1668,6 +1686,6 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, - vsrc0 = load8888 (*(src + 0)); - vsrc1 = load8888 (*(src + 1)); - vsrc2 = load8888 (*(src + 2)); - vsrc3 = load8888 (*(src + 3)); + vsrc0 = load8888 ((src + 0)); + vsrc1 = load8888 ((src + 1)); + vsrc2 = load8888 ((src + 2)); + vsrc3 = load8888 ((src + 3)); @@ -1691,3 +1709,3 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp, { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; @@ -1734,3 +1752,3 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -1755,3 +1773,3 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, expand_alpha_rev (to_m64 (m)), - load8888 (*dst)); + load8888 (dst)); @@ -1806,3 +1824,3 @@ mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); @@ -1999,3 +2017,3 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); @@ -2070,3 +2088,3 @@ mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, { - __m64 vdest = load8888 (*dst); + __m64 vdest = load8888 (dst); @@ -2109,3 +2127,3 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -2248,3 +2266,3 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; @@ -2281,6 +2299,6 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, __m64 vdest; - vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0); - vdest = 
pack_565 (invert_colors (load8888 (s1)), vdest, 1); - vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2); - vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3); + vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0); + vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1); + vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2); + vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3); @@ -2292,6 +2310,6 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, - vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3); @@ -2309,3 +2327,3 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, { - __m64 vsrc = load8888 (*src); + __m64 vsrc = load8888 (src); uint64_t d = *dst; @@ -2356,4 +2374,4 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); @@ -2368,3 +2386,3 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { - uint64_t s0, s1; + uint32_t s0, s1; unsigned char a0, a1; @@ -2380,4 +2398,4 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { - d0 = invert_colors (load8888 (s0)); - d1 = invert_colors (load8888 (s1)); + d0 = invert_colors (load8888 (&s0)); + d1 = invert_colors (load8888 (&s1)); @@ -2389,4 +2407,4 
@@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, - d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); - d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); @@ -2402,4 +2420,4 @@ mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src); - __m64 d = load8888 (*dst); + __m64 s = load8888 (src); + __m64 d = load8888 (dst); @@ -2433,3 +2451,3 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -2450,3 +2468,3 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); @@ -2472,6 +2490,6 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3); @@ -2493,3 +2511,3 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, __m64 vdest = expand565 (to_m64 (d), 0); - vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), 
vdest), vdest, 0); + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); *q = to_uint64 (vdest); @@ -2529,3 +2547,3 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -2561,4 +2579,4 @@ mmx_composite_in_n_8_8 (pixman_implementation_t *imp, - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); @@ -2631,3 +2649,3 @@ mmx_composite_in_8_8 (pixman_implementation_t *imp, - store8888 (d, in (load8888 (ldl_u((uint32_t *)s)), load8888 (*d))); + store8888 (d, in (load8888u (s), load8888 (d))); @@ -2679,3 +2697,3 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, - vsrc = load8888 (src); + vsrc = load8888 (&src); vsrca = expand_alpha (vsrc); @@ -2712,4 +2730,4 @@ mmx_composite_add_n_8_8 (pixman_implementation_t *imp, - vmask = load8888 (ldl_u((uint32_t *)mask)); - vdest = load8888 (*(uint32_t *)dst); + vmask = load8888u ((uint32_t *)mask); + vdest = load8888 ((uint32_t *)dst); @@ -3056,3 +3074,4 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, { - __m64 s = load8888 (*src | 0xff000000); + uint32_t ssrc = *src | 0xff000000; + __m64 s = load8888 (&ssrc); @@ -3066,3 +3085,3 @@ mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, __m64 vm = expand_alpha_rev (to_m64 (m)); - __m64 vdest = in_over (s, sa, vm, load8888 (*dst)); + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); |