| author | Matt Turner <mattst88@gmail.com> | 2011-09-26 23:37:39 -0400 |
|---|---|---|
| committer | Matt Turner <mattst88@gmail.com> | 2011-09-26 23:49:42 -0400 |
| commit | 07a46f61756f6df2d00f8c187b726d10b95ebc02 (patch) | |
| tree | e5792fd49d67dba2684924134a14185708d284aa | |
| parent | a826d75fa58a7dcacfeafe0a3df3329a9aad8d13 (diff) | |
mmx: add ARM/iwmmxt inline assembly blit code (iwmmxt-optimizations6)
The alignment check ensures that if src is unaligned we can handle it
efficiently with walignr instructions, and that if it is already aligned
we skip the walignr fixup entirely.
Signed-off-by: Matt Turner <mattst88@gmail.com>
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | pixman/pixman-mmx.c | 80 |

1 file changed, 72 insertions, 8 deletions
```diff
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index f848ab4..b462cf5 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -2899,6 +2899,11 @@ pixman_blt_mmx (uint32_t *src_bits,
 
     while (height--)
     {
+#ifdef USE_ARM_IWMMXT
+	int align;
+	uint8_t *align_s;
+	__m64 v0;
+#endif
 	int w;
 	uint8_t *s = src_bytes;
 	uint8_t *d = dst_bytes;
@@ -2931,6 +2936,40 @@ pixman_blt_mmx (uint32_t *src_bits,
 	    d += 4;
 	}
 
+#ifdef USE_ARM_IWMMXT
+	align = ((unsigned long)s & 7);
+	align_s = (uint8_t *)((unsigned long)s & ~7);
+	v0 = *(__m64 *)align_s;
+	align_s += 8;
+
+	if (align == 0) {
+	    while (w >= 64)
+	    {
+		__m64 v1, v2, v3, v4, v5, v6, v7, v8;
+		v1 = *(__m64 *)(s + 0);
+		v2 = *(__m64 *)(s + 8);
+		v3 = *(__m64 *)(s + 16);
+		v4 = *(__m64 *)(s + 24);
+		v5 = *(__m64 *)(s + 32);
+		v6 = *(__m64 *)(s + 40);
+		v7 = *(__m64 *)(s + 48);
+		v8 = *(__m64 *)(s + 56);
+
+		*(__m64 *)(d + 0) = v1;
+		*(__m64 *)(d + 8) = v2;
+		*(__m64 *)(d + 16) = v3;
+		*(__m64 *)(d + 24) = v4;
+		*(__m64 *)(d + 32) = v5;
+		*(__m64 *)(d + 40) = v6;
+		*(__m64 *)(d + 48) = v7;
+		*(__m64 *)(d + 56) = v8;
+
+		w -= 64;
+		s += 64;
+		d += 64;
+	    }
+	} else {
+#endif
 	while (w >= 64)
 	{
 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
@@ -2958,14 +2997,34 @@ pixman_blt_mmx (uint32_t *src_bits,
 		  "%mm0", "%mm1", "%mm2", "%mm3",
 		  "%mm4", "%mm5", "%mm6", "%mm7");
 #else
-	    __m64 v0 = ldq_u((uint64_t *)(s + 0));
-	    __m64 v1 = ldq_u((uint64_t *)(s + 8));
-	    __m64 v2 = ldq_u((uint64_t *)(s + 16));
-	    __m64 v3 = ldq_u((uint64_t *)(s + 24));
-	    __m64 v4 = ldq_u((uint64_t *)(s + 32));
-	    __m64 v5 = ldq_u((uint64_t *)(s + 40));
-	    __m64 v6 = ldq_u((uint64_t *)(s + 48));
-	    __m64 v7 = ldq_u((uint64_t *)(s + 56));
+# if defined USE_ARM_IWMMXT
+	    __m64 v1 = *(__m64 *)(align_s + 0);
+	    __m64 v2 = *(__m64 *)(align_s + 8);
+	    __m64 v3 = *(__m64 *)(align_s + 16);
+	    __m64 v4 = *(__m64 *)(align_s + 24);
+	    __m64 v5 = *(__m64 *)(align_s + 32);
+	    __m64 v6 = *(__m64 *)(align_s + 40);
+	    __m64 v7 = *(__m64 *)(align_s + 48);
+	    __m64 v8 = *(__m64 *)(align_s + 56);
+
+	    v0 = _mm_align_si64(v0, v1, align);
+	    v1 = _mm_align_si64(v1, v2, align);
+	    v2 = _mm_align_si64(v2, v3, align);
+	    v3 = _mm_align_si64(v3, v4, align);
+	    v4 = _mm_align_si64(v4, v5, align);
+	    v5 = _mm_align_si64(v5, v6, align);
+	    v6 = _mm_align_si64(v6, v7, align);
+	    v7 = _mm_align_si64(v7, v8, align);
+# else
+	    __m64 v0 = *(__m64 *)(s + 0);
+	    __m64 v1 = *(__m64 *)(s + 8);
+	    __m64 v2 = *(__m64 *)(s + 16);
+	    __m64 v3 = *(__m64 *)(s + 24);
+	    __m64 v4 = *(__m64 *)(s + 32);
+	    __m64 v5 = *(__m64 *)(s + 40);
+	    __m64 v6 = *(__m64 *)(s + 48);
+	    __m64 v7 = *(__m64 *)(s + 56);
+# endif
 	    *(__m64 *)(d + 0) = v0;
 	    *(__m64 *)(d + 8) = v1;
 	    *(__m64 *)(d + 16) = v2;
@@ -2979,6 +3038,11 @@ pixman_blt_mmx (uint32_t *src_bits,
 	    w -= 64;
 	    s += 64;
 	    d += 64;
+#if defined USE_ARM_IWMMXT
+	    align_s += 64;
+	    v0 = v8;
+	}
+#endif
 	}
 	while (w >= 4)
 	{
```
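For readers unfamiliar with the iWMMXt realignment trick described in the commit message: the unaligned path keeps the previous aligned quadword in v0 and combines it with the next aligned load via _mm_align_si64(), which extracts eight bytes starting at the misalignment offset from the concatenation of its two operands (the walignr operation). Below is a minimal sketch of that pattern, assuming an ARM/iWMMXt toolchain whose mmintrin.h provides __m64 and _mm_align_si64(); copy_quads() is a hypothetical helper for illustration, not part of pixman.

```c
#include <stdint.h>
#include <mmintrin.h>   /* __m64, _mm_align_si64 on ARM/iWMMXt toolchains */

/* Copy n quadwords (8 bytes each) from a possibly unaligned src to an
 * 8-byte-aligned dst.  Like the patch, the unaligned path reads whole
 * aligned quadwords, so it may touch up to 7 bytes beyond src + n * 8. */
static void
copy_quads (uint8_t *dst, const uint8_t *src, int n)
{
    int align = (unsigned long)src & 7;   /* misalignment of src in bytes */
    const uint8_t *align_s = (const uint8_t *)((unsigned long)src & ~7UL);

    if (align == 0)
    {
	/* Aligned source: plain 64-bit loads, no realignment needed. */
	while (n--)
	{
	    *(__m64 *)dst = *(const __m64 *)src;
	    src += 8;
	    dst += 8;
	}
    }
    else
    {
	/* Unaligned source: v0 holds the previous aligned quadword; each
	 * iteration loads one new aligned quadword and extracts the eight
	 * bytes that start at the misalignment offset (walignr). */
	__m64 v0 = *(const __m64 *)align_s;
	align_s += 8;

	while (n--)
	{
	    __m64 v1 = *(const __m64 *)align_s;
	    *(__m64 *)dst = _mm_align_si64 (v0, v1, align);
	    v0 = v1;
	    align_s += 8;
	    dst += 8;
	}
    }
}
```

Carrying v0 across iterations means each output quadword costs only one new aligned load plus one realignment, which is the payoff of the up-front alignment check the commit message describes.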