author     Matt Turner <mattst88@gmail.com>    2011-09-26 23:37:39 -0400
committer  Matt Turner <mattst88@gmail.com>    2011-09-26 23:49:42 -0400
commit     07a46f61756f6df2d00f8c187b726d10b95ebc02 (patch)
tree       e5792fd49d67dba2684924134a14185708d284aa
parent     a826d75fa58a7dcacfeafe0a3df3329a9aad8d13 (diff)
mmx: add ARM/iwmmxt inline assembly blit code  (iwmmxt-optimizations6)
The alignment check ensures that if src is unaligned we can handle it efficiently with walignr instructions, and if it is aligned we can skip them entirely.

Signed-off-by: Matt Turner <mattst88@gmail.com>
-rw-r--r--    pixman/pixman-mmx.c    80
1 file changed, 72 insertions(+), 8 deletions(-)
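
The scheme described in the commit message can be illustrated outside the diff. The sketch below is not the patch itself: plain uint64_t shifts stand in for the walignr / _mm_align_si64 primitive, a little-endian byte layout is assumed, the loop copies 8 bytes per iteration instead of the patch's 64-byte unrolled chunks, and the helper name copy_unaligned_src is made up for illustration.

/*
 * Minimal standalone sketch of the alignment scheme from the commit message:
 * round the source down to an 8-byte boundary, perform only aligned quadword
 * loads, and stitch each output quadword together from two neighbouring
 * loads.  Plain uint64_t shifts stand in for walignr / _mm_align_si64,
 * little-endian byte order is assumed, and the 8-bytes-per-iteration loop is
 * a simplification of the patch's 64-byte unrolled loop.
 */
#include <stdint.h>
#include <string.h>

void
copy_unaligned_src (uint8_t *d, const uint8_t *s, int w)
{
    int align = (uintptr_t) s & 7;                /* source offset within a quadword */
    const uint8_t *align_s =
        (const uint8_t *) ((uintptr_t) s & ~(uintptr_t) 7);
    uint64_t v0, v1;

    memcpy (&v0, align_s, 8);                     /* first aligned quadword */
    align_s += 8;

    while (w >= 8)
    {
        memcpy (&v1, align_s, 8);                 /* next aligned quadword */

        /* Little-endian equivalent of _mm_align_si64 (v0, v1, align):
         * the upper (8 - align) bytes of v0 followed by the lower align
         * bytes of v1 are the 8 unaligned source bytes we need. */
        uint64_t out = align ? (v0 >> (8 * align)) | (v1 << (8 * (8 - align)))
                             : v0;
        memcpy (d, &out, 8);

        v0 = v1;                                  /* reuse the load, as the patch reuses v8 */
        align_s += 8;
        d += 8;
        w -= 8;
    }
    /* The remaining (w & 7) bytes would be handled by the narrower tail
     * loops; like the patch's aligned loads, the final quadword load may
     * read a few bytes past s + w. */
}

On iWMMXt hardware the shift/or pair above collapses into a single walignr, which is why the patch favours aligned loads combined with _mm_align_si64 over byte-wise unaligned access.
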
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f848ab4..b462cf5 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2899,6 +2899,11 @@ pixman_blt_mmx (uint32_t *src_bits,
 
     while (height--)
     {
+#ifdef USE_ARM_IWMMXT
+        int align;
+        uint8_t *align_s;
+        __m64 v0;
+#endif
         int w;
         uint8_t *s = src_bytes;
         uint8_t *d = dst_bytes;
@@ -2931,6 +2936,40 @@ pixman_blt_mmx (uint32_t *src_bits,
             d += 4;
         }
 
+#ifdef USE_ARM_IWMMXT
+        align = ((unsigned long)s & 7);
+        align_s = (uint8_t *)((unsigned long)s & ~7);
+        v0 = *(__m64 *)align_s;
+        align_s += 8;
+
+        if (align == 0) {
+            while (w >= 64)
+            {
+                __m64 v1, v2, v3, v4, v5, v6, v7, v8;
+                v1 = *(__m64 *)(s + 0);
+                v2 = *(__m64 *)(s + 8);
+                v3 = *(__m64 *)(s + 16);
+                v4 = *(__m64 *)(s + 24);
+                v5 = *(__m64 *)(s + 32);
+                v6 = *(__m64 *)(s + 40);
+                v7 = *(__m64 *)(s + 48);
+                v8 = *(__m64 *)(s + 56);
+
+                *(__m64 *)(d + 0) = v1;
+                *(__m64 *)(d + 8) = v2;
+                *(__m64 *)(d + 16) = v3;
+                *(__m64 *)(d + 24) = v4;
+                *(__m64 *)(d + 32) = v5;
+                *(__m64 *)(d + 40) = v6;
+                *(__m64 *)(d + 48) = v7;
+                *(__m64 *)(d + 56) = v8;
+
+                w -= 64;
+                s += 64;
+                d += 64;
+            }
+        } else {
+#endif
         while (w >= 64)
         {
 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
@@ -2958,14 +2997,34 @@ pixman_blt_mmx (uint32_t *src_bits,
               "%mm0", "%mm1", "%mm2", "%mm3",
               "%mm4", "%mm5", "%mm6", "%mm7");
 #else
-            __m64 v0 = ldq_u((uint64_t *)(s + 0));
-            __m64 v1 = ldq_u((uint64_t *)(s + 8));
-            __m64 v2 = ldq_u((uint64_t *)(s + 16));
-            __m64 v3 = ldq_u((uint64_t *)(s + 24));
-            __m64 v4 = ldq_u((uint64_t *)(s + 32));
-            __m64 v5 = ldq_u((uint64_t *)(s + 40));
-            __m64 v6 = ldq_u((uint64_t *)(s + 48));
-            __m64 v7 = ldq_u((uint64_t *)(s + 56));
+# if defined USE_ARM_IWMMXT
+            __m64 v1 = *(__m64 *)(align_s + 0);
+            __m64 v2 = *(__m64 *)(align_s + 8);
+            __m64 v3 = *(__m64 *)(align_s + 16);
+            __m64 v4 = *(__m64 *)(align_s + 24);
+            __m64 v5 = *(__m64 *)(align_s + 32);
+            __m64 v6 = *(__m64 *)(align_s + 40);
+            __m64 v7 = *(__m64 *)(align_s + 48);
+            __m64 v8 = *(__m64 *)(align_s + 56);
+
+            v0 = _mm_align_si64(v0, v1, align);
+            v1 = _mm_align_si64(v1, v2, align);
+            v2 = _mm_align_si64(v2, v3, align);
+            v3 = _mm_align_si64(v3, v4, align);
+            v4 = _mm_align_si64(v4, v5, align);
+            v5 = _mm_align_si64(v5, v6, align);
+            v6 = _mm_align_si64(v6, v7, align);
+            v7 = _mm_align_si64(v7, v8, align);
+# else
+            __m64 v0 = *(__m64 *)(s + 0);
+            __m64 v1 = *(__m64 *)(s + 8);
+            __m64 v2 = *(__m64 *)(s + 16);
+            __m64 v3 = *(__m64 *)(s + 24);
+            __m64 v4 = *(__m64 *)(s + 32);
+            __m64 v5 = *(__m64 *)(s + 40);
+            __m64 v6 = *(__m64 *)(s + 48);
+            __m64 v7 = *(__m64 *)(s + 56);
+# endif
             *(__m64 *)(d + 0) = v0;
             *(__m64 *)(d + 8) = v1;
             *(__m64 *)(d + 16) = v2;
@@ -2979,6 +3038,11 @@ pixman_blt_mmx (uint32_t *src_bits,
             w -= 64;
             s += 64;
             d += 64;
+#if defined USE_ARM_IWMMXT
+            align_s += 64;
+            v0 = v8;
+        }
+#endif
         }
         while (w >= 4)
         {