summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Turner <mattst88@gmail.com>2012-05-09 19:20:55 -0400
committerMatt Turner <mattst88@gmail.com>2012-05-10 16:21:07 -0400
commit7d4beedc612a32b73d7673bbf6447de0f3fca298 (patch)
treecc9cbb0cd391c3f2ff108f1a4bc22e7d52621c7d
parent2beabd9fed76de0023eb36b0c938b8803aa8d129 (diff)
mmx: add and use pack_4x565 function
The pack_4x565 function makes use of the pack_4xpacked565 function, which uses
pmadd. Some of the speed-up is probably attributable to removing the artificial
serialization imposed by the

    vdest = pack_565 (..., vdest, 0);
    vdest = pack_565 (..., vdest, 1);
    ...

pattern.

Loongson:
over_n_0565 =          L1:  16.44  L2:  16.42  M: 13.83 (  9.85%)  HT: 12.83  VT: 12.61  R: 12.34  RT:  8.90 (  93Kops/s)
over_n_0565 =          L1:  42.48  L2:  42.53  M: 29.83 ( 21.20%)  HT: 23.39  VT: 23.72  R: 21.80  RT: 11.60 ( 113Kops/s)
over_8888_0565 =       L1:  15.61  L2:  15.42  M: 12.11 ( 25.79%)  HT: 11.07  VT: 10.70  R: 10.37  RT:  7.25 (  82Kops/s)
over_8888_0565 =       L1:  35.01  L2:  35.20  M: 21.42 ( 45.57%)  HT: 18.12  VT: 17.61  R: 16.09  RT:  9.01 (  97Kops/s)
over_n_8_0565 =        L1:  15.17  L2:  14.94  M: 12.57 ( 17.86%)  HT: 11.96  VT: 11.52  R: 10.79  RT:  7.31 (  79Kops/s)
over_n_8_0565 =        L1:  29.83  L2:  29.79  M: 21.85 ( 30.94%)  HT: 18.82  VT: 18.25  R: 16.15  RT:  8.72 (  91Kops/s)
over_n_8888_0565_ca =  L1:  15.25  L2:  15.02  M: 11.64 ( 41.39%)  HT: 11.08  VT: 10.72  R: 10.02  RT:  7.00 (  77Kops/s)
over_n_8888_0565_ca =  L1:  30.12  L2:  29.99  M: 19.47 ( 68.99%)  HT: 17.05  VT: 16.55  R: 14.67  RT:  8.38 (  88Kops/s)

ARM/iwMMXt:
over_n_0565 =          L1:  19.29  L2:  19.88  M: 17.38 ( 10.54%)  HT: 15.53  VT: 16.11  R: 13.69  RT: 11.00 (  96Kops/s)
over_n_0565 =          L1:  36.02  L2:  34.85  M: 28.04 ( 16.97%)  HT: 22.12  VT: 24.21  R: 22.36  RT: 12.22 ( 103Kops/s)
over_8888_0565 =       L1:  18.38  L2:  16.59  M: 12.34 ( 22.29%)  HT: 11.67  VT: 11.71  R: 11.02  RT:  6.89 (  72Kops/s)
over_8888_0565 =       L1:  24.96  L2:  22.17  M: 15.11 ( 26.81%)  HT: 14.14  VT: 13.71  R: 13.18  RT:  8.13 (  78Kops/s)
over_n_8_0565 =        L1:  14.65  L2:  12.44  M: 11.56 ( 14.50%)  HT: 10.93  VT: 10.39  R: 10.06  RT:  7.05 (  70Kops/s)
over_n_8_0565 =        L1:  18.37  L2:  14.98  M: 13.97 ( 16.51%)  HT: 12.67  VT: 10.35  R: 11.80  RT:  8.14 (  74Kops/s)
over_n_8888_0565_ca =  L1:  14.27  L2:  12.93  M: 10.52 ( 33.23%)  HT:  9.70  VT:  9.90  R:  9.31  RT:  6.34 (  65Kops/s)
over_n_8888_0565_ca =  L1:  19.69  L2:  17.58  M: 13.40 ( 42.35%)  HT: 11.75  VT: 11.33  R: 11.17  RT:  7.49 (  73Kops/s)
-rw-r--r--pixman/pixman-mmx.c107
1 files changed, 52 insertions, 55 deletions
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index b14201a4..01a2bc93 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -598,6 +598,12 @@ pack_4xpacked565 (__m64 a, __m64 b)
#endif
}
+static force_inline __m64
+pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
+{
+ return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
+}
+
#ifndef _MSC_VER
static force_inline __m64
@@ -1396,16 +1402,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
- __m64 vdest;
+ __m64 vdest = *(__m64 *)dst;
- vdest = *(__m64 *)dst;
-
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0));
+ __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1));
+ __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2));
+ __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
dst += 4;
w -= 4;
@@ -1818,22 +1822,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
- __m64 vsrc0, vsrc1, vsrc2, vsrc3;
- __m64 vdest;
+ __m64 vdest = *(__m64 *)dst;
- vsrc0 = load8888 ((src + 0));
- vsrc1 = load8888 ((src + 1));
- vsrc2 = load8888 ((src + 2));
- vsrc3 = load8888 ((src + 3));
+ __m64 vsrc0 = load8888 ((src + 0));
+ __m64 vsrc1 = load8888 ((src + 1));
+ __m64 vsrc2 = load8888 ((src + 2));
+ __m64 vsrc3 = load8888 ((src + 3));
- vdest = *(__m64 *)dst;
-
- vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0));
+ __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1));
+ __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2));
+ __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
w -= 4;
dst += 4;
@@ -2368,25 +2369,22 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
}
else if (m0 | m1 | m2 | m3)
{
- __m64 vdest;
- __m64 vm0, vm1, vm2, vm3;
-
- vdest = *(__m64 *)dst;
+ __m64 vdest = *(__m64 *)dst;
- vm0 = to_m64 (m0);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
- expand565 (vdest, 0)), vdest, 0);
- vm1 = to_m64 (m1);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
- expand565 (vdest, 1)), vdest, 1);
- vm2 = to_m64 (m2);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
- expand565 (vdest, 2)), vdest, 2);
- vm3 = to_m64 (m3);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
- expand565 (vdest, 3)), vdest, 3);
-
- *(__m64 *)dst = vdest;
+ __m64 vm0 = to_m64 (m0);
+ __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0),
+ expand565 (vdest, 0));
+ __m64 vm1 = to_m64 (m1);
+ __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1),
+ expand565 (vdest, 1));
+ __m64 vm2 = to_m64 (m2);
+ __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2),
+ expand565 (vdest, 2));
+ __m64 vm3 = to_m64 (m3);
+ __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3),
+ expand565 (vdest, 3));
+
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
w -= 4;
@@ -2483,24 +2481,23 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
if ((a0 & a1 & a2 & a3) == 0xFF)
{
- __m64 vdest;
- vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0);
- vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1);
- vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2);
- vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3);
+ __m64 v0 = invert_colors (load8888 (&s0));
+ __m64 v1 = invert_colors (load8888 (&s1));
+ __m64 v2 = invert_colors (load8888 (&s2));
+ __m64 v3 = invert_colors (load8888 (&s3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
else if (s0 | s1 | s2 | s3)
{
__m64 vdest = *(__m64 *)dst;
- vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0));
+ __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1));
+ __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2));
+ __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
w -= 4;
@@ -2675,12 +2672,12 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
{
__m64 vdest = *(__m64 *)q;
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0));
+ __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1));
+ __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2));
+ __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3));
- *(__m64 *)q = vdest;
+ *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
}
twidth -= 4;
p += 4;