summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2016-04-01 11:36:24 +0300
committer: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2016-04-01 13:15:51 +0300
commit: 5b54fe8ca51e47e33827cce93fcfea657e80a631 (patch)
tree: cc917bb5480828d2319393768ac1621feaba525f
parent: 8e4881eb47987e01c12eb2e7f938d603ee12eea8 (diff)
AArch64: Remove LSL instructions from bilinear src_8888_8888 (branch: 20160331-arm64-review)
They are not needed in the 32-bit ARM NEON code either.

Benchmarks on ARM Cortex-A53 @1.15GHz (PINE64 board):

== before the patch ==
  bilinear src_8888_8888 = L1: 72.77 L2: 70.66 M: 69.49

== after the patch ==
  bilinear src_8888_8888 = L1: 77.49 L2: 75.12 M: 73.04

== 32-bit implementation for comparison ==
  bilinear src_8888_8888 = L1: 78.85 L2: 77.69 M: 75.86

Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
-rw-r--r-- pixman/pixman-arma64-neon-asm.S | 24
1 files changed, 8 insertions, 16 deletions
diff --git a/pixman/pixman-arma64-neon-asm.S b/pixman/pixman-arma64-neon-asm.S
index ff5c20a..2b08766 100644
--- a/pixman/pixman-arma64-neon-asm.S
+++ b/pixman/pixman-arma64-neon-asm.S
@@ -3577,19 +3577,16 @@ pixman_asm_function fname
.macro bilinear_interpolate_four_pixels_8888_8888_head
asr TMP1, X, #16
add X, X, UX
- lsl TMP4, TMP1, #2
- add TMP1, TOP, TMP4
+ add TMP1, TOP, TMP1, lsl #2
asr TMP2, X, #16
add X, X, UX
- lsl TMP4, TMP2, #2
- add TMP2, TOP, TMP4
+ add TMP2, TOP, TMP2, lsl #2
ld1 {v22.2s}, [TMP1], STRIDE
ld1 {v23.2s}, [TMP1]
asr TMP3, X, #16
add X, X, UX
- lsl TMP4, TMP3, #2
- add TMP3, TOP, TMP4
+ add TMP3, TOP, TMP3, lsl #2
umull v8.8h, v22.8b, v28.8b
umlal v8.8h, v23.8b, v29.8b
@@ -3597,8 +3594,7 @@ pixman_asm_function fname
ld1 {v23.2s}, [TMP2]
asr TMP4, X, #16
add X, X, UX
- lsl TMP1, TMP4, #2
- add TMP4, TOP, TMP1
+ add TMP4, TOP, TMP4, lsl #2
umull v9.8h, v22.8b, v28.8b
umlal v9.8h, v23.8b, v29.8b
@@ -3646,12 +3642,10 @@ pixman_asm_function fname
.macro bilinear_interpolate_four_pixels_8888_8888_tail_head
asr TMP1, X, #16
add X, X, UX
- lsl TMP2, TMP1, #2
- add TMP1, TOP, TMP2
+ add TMP1, TOP, TMP1, lsl #2
asr TMP2, X, #16
add X, X, UX
- lsl TMP3, TMP2, #2
- add TMP2, TOP, TMP3
+ add TMP2, TOP, TMP2, lsl #2
umlal2 v1.4s, v9.8h, v15.h[4]
ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)
ushll v2.4s, v10.4h, #BILINEAR_INTERPOLATION_BITS
@@ -3674,12 +3668,10 @@ pixman_asm_function fname
umull v9.8h, v22.8b, v28.8b
asr TMP3, X, #16
add X, X, UX
- lsl TMP4, TMP3, #2
- add TMP3, TOP, TMP4
+ add TMP3, TOP, TMP3, lsl #2
asr TMP4, X, #16
add X, X, UX
- lsl TMP1, TMP4, #2
- add TMP4, TOP, TMP1
+ add TMP4, TOP, TMP4, lsl #2
umlal v9.8h, v23.8b, v29.8b
ld1 {v22.2s}, [TMP3], STRIDE
ushr v15.8h, v12.8h, #(16 - BILINEAR_INTERPOLATION_BITS)