diff options
author | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-01 11:36:24 +0300 |
---|---|---|
committer | Siarhei Siamashka <siarhei.siamashka@gmail.com> | 2016-04-01 13:15:51 +0300 |
commit | 5b54fe8ca51e47e33827cce93fcfea657e80a631 (patch) | |
tree | cc917bb5480828d2319393768ac1621feaba525f | |
parent | 8e4881eb47987e01c12eb2e7f938d603ee12eea8 (diff) |
AArch64: Remove LSL instructions from bilinear src_8888_8888 [20160331-arm64-review]
They are not needed in the 32-bit ARM NEON code either.
Benchmarks on ARM Cortex-A53 @1.15GHz (PINE64 board):
== before the patch ==
bilinear src_8888_8888 = L1: 72.77 L2: 70.66 M: 69.49
== after the patch ==
bilinear src_8888_8888 = L1: 77.49 L2: 75.12 M: 73.04
== 32-bit implementation for comparison ==
bilinear src_8888_8888 = L1: 78.85 L2: 77.69 M: 75.86
Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- | pixman/pixman-arma64-neon-asm.S | 24 |
1 files changed, 8 insertions, 16 deletions
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S index ff5c20a..2b08766 100644 --- a/pixman/pixman-arma64-neon-asm.S +++ b/pixman/pixman-arma64-neon-asm.S @@ -3577,19 +3577,16 @@ pixman_asm_function fname .macro bilinear_interpolate_four_pixels_8888_8888_head asr TMP1, X, #16 add X, X, UX - lsl TMP4, TMP1, #2 - add TMP1, TOP, TMP4 + add TMP1, TOP, TMP1, lsl #2 asr TMP2, X, #16 add X, X, UX - lsl TMP4, TMP2, #2 - add TMP2, TOP, TMP4 + add TMP2, TOP, TMP2, lsl #2 ld1 {v22.2s}, [TMP1], STRIDE ld1 {v23.2s}, [TMP1] asr TMP3, X, #16 add X, X, UX - lsl TMP4, TMP3, #2 - add TMP3, TOP, TMP4 + add TMP3, TOP, TMP3, lsl #2 umull v8.8h, v22.8b, v28.8b umlal v8.8h, v23.8b, v29.8b @@ -3597,8 +3594,7 @@ pixman_asm_function fname ld1 {v23.2s}, [TMP2] asr TMP4, X, #16 add X, X, UX - lsl TMP1, TMP4, #2 - add TMP4, TOP, TMP1 + add TMP4, TOP, TMP4, lsl #2 umull v9.8h, v22.8b, v28.8b umlal v9.8h, v23.8b, v29.8b @@ -3646,12 +3642,10 @@ pixman_asm_function fname .macro bilinear_interpolate_four_pixels_8888_8888_tail_head asr TMP1, X, #16 add X, X, UX - lsl TMP2, TMP1, #2 - add TMP1, TOP, TMP2 + add TMP1, TOP, TMP1, lsl #2 asr TMP2, X, #16 add X, X, UX - lsl TMP3, TMP2, #2 - add TMP2, TOP, TMP3 + add TMP2, TOP, TMP2, lsl #2 umlal2 v1.4s, v9.8h, v15.h[4] ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS @@ -3674,12 +3668,10 @@ pixman_asm_function fname umull v9.8h, v22.8b, v28.8b asr TMP3, X, #16 add X, X, UX - lsl TMP4, TMP3, #2 - add TMP3, TOP, TMP4 + add TMP3, TOP, TMP3, lsl #2 asr TMP4, X, #16 add X, X, UX - lsl TMP1, TMP4, #2 - add TMP4, TOP, TMP1 + add TMP4, TOP, TMP4, lsl #2 umlal v9.8h, v23.8b, v29.8b ld1 {v22.2s}, [TMP3], STRIDE ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS) |