summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Ben Avison <bavison@riscosopen.org> 2014-03-28 11:13:21 +0200
committer: Siarhei Siamashka <siarhei.siamashka@gmail.com> 2014-04-02 12:46:24 +0300
commit: 4ee85b008333a95d4dfc27e7b04c444dcbb3a7e7 (patch)
tree: 29be193b83370a79f32b71bb126877295eee36f4
parent: 56622140e3a8175c8ccc82c9717adf8372043364 (diff)
ARMv6: Add fast path for over_reverse_n_8888
Benchmark results, "before" is upstream commit c343846
lowlevel-blt-bench: add in_reverse_8888_8888 test
and "after" is with this patch only added on top.

lowlevel-blt-bench, over_reverse_n_8888, 100 iterations:

               Before            After
            Mean  StdDev     Mean  StdDev   Confidence     Change
    L1      15.1     0.1    274.5     2.3      100.00%   +1718.9%
    L2      12.8     0.3    181.8     0.7      100.00%   +1315.5%
    M       10.8     0.0     77.9     0.0      100.00%    +621.2%
    HT       9.7     0.0     29.4     0.2      100.00%    +204.9%
    VT       9.5     0.0     26.7     0.1      100.00%    +179.3%
    R        9.3     0.0     25.3     0.1      100.00%    +173.6%
    RT       6.0     0.1     11.0     0.2      100.00%     +82.9%

At most 16 outliers rejected per case per set.

cairo-perf-trace with trimmed traces, 30 iterations:

                                        Before          After
                                     Mean  StdDev    Mean  StdDev   Confidence   Change
    t-poppler.trace                  12.9     0.1     9.7     0.0      100.00%   +32.6%
    t-firefox-talos-gfx.trace        33.2     0.7    32.9     0.4       95.23%    +0.9%  (insignificant)
    t-firefox-particles.trace        27.4     0.1    27.3     0.2       99.65%    +0.4%
    t-firefox-canvas-alpha.trace     20.5     0.3    20.5     0.3       57.51%    +0.3%  (insignificant)
    t-poppler-reseau.trace           22.4     0.1    22.4     0.1       95.69%    +0.3%  (insignificant)
    t-firefox-fishtank.trace         13.2     0.0    13.2     0.0       99.84%    +0.1%
    t-swfdec-giant-steps.trace       14.9     0.0    14.9     0.0       87.68%    +0.1%  (insignificant)
    t-swfdec-youtube.trace            7.8     0.0     7.8     0.0       35.22%    +0.1%  (insignificant)
    t-firefox-planet-gnome.trace     11.5     0.0    11.5     0.0       29.37%    +0.0%  (insignificant)
    t-firefox-fishbowl.trace         21.2     0.0    21.2     0.0       18.09%    +0.0%  (insignificant)
    t-grads-heat-map.trace            4.4     0.0     4.4     0.0        1.84%    +0.0%  (insignificant)
    t-firefox-paintball.trace        18.0     0.0    18.0     0.0       33.43%    -0.0%  (insignificant)
    t-firefox-talos-svg.trace        20.5     0.0    20.5     0.1       68.56%    -0.1%  (insignificant)
    t-midori-zoomed.trace             8.0     0.0     8.0     0.0       99.98%    -0.1%
    t-firefox-canvas-swscroll.trace  32.1     0.1    32.1     0.1       85.27%    -0.1%  (insignificant)
    t-gnome-system-monitor.trace     17.2     0.0    17.2     0.0       99.97%    -0.2%
    t-firefox-chalkboard.trace       36.5     0.0    36.6     0.0      100.00%    -0.2%
    t-firefox-asteroids.trace        11.1     0.0    11.1     0.0      100.00%    -0.2%
    t-firefox-canvas.trace           17.9     0.0    18.0     0.0      100.00%    -0.3%
    t-chromium-tabs.trace             4.9     0.0     4.9     0.0       97.95%    -0.3%  (insignificant)
    t-xfce4-terminal-a1.trace         4.8     0.0     4.8     0.0      100.00%    -0.4%
    t-firefox-scrolling.trace        31.1     0.1    31.2     0.1      100.00%    -0.5%
    t-evolution.trace                13.7     0.1    13.8     0.1       99.99%    -0.6%
    t-gnome-terminal-vim.trace       22.0     0.2    22.2     0.1       99.99%    -0.7%
    t-gvim.trace                     33.2     0.2    33.5     0.2      100.00%    -0.8%

At most 6 outliers rejected per case per set.

Cairo perf reports the running time, but the change is computed for
operations per second instead (inverse of running time).

Changes on the order of +/- 1% can be attributed to measurement error,
even if they are deemed to be statistically significant. This claim is
based on comparing two 30-iteration identical "before" runs using the
exact same binaries, and observing changes from -0.4% to +0.5% with
>=99% confidence.

Confidence is based on Welch's t-test.

v4, Pekka Paalanen <pekka.paalanen@collabora.co.uk>:
    Rebased, re-benchmarked on Raspberry Pi, commit message.
-rw-r--r-- pixman/pixman-arm-simd-asm.S 78
-rw-r--r-- pixman/pixman-arm-simd.c 6
2 files changed, 84 insertions, 0 deletions
diff --git a/pixman/pixman-arm-simd-asm.S b/pixman/pixman-arm-simd-asm.S
index c209688..dd6f788 100644
--- a/pixman/pixman-arm-simd-asm.S
+++ b/pixman/pixman-arm-simd-asm.S
@@ -611,3 +611,81 @@ generate_composite_function \
/******************************************************************************/
+.macro over_reverse_n_8888_init /* Setup: fetch the source pixel, split it into two byte-pair halves, and prime the GE flags used by SEL */
+ ldr SRC, [sp, #ARGS_STACK_OFFSET] /* SRC = source pixel, read from the stack at ARGS_STACK_OFFSET */
+ ldr MASK, =0x00800080 /* 0x80 in each 16-bit lane; used as the addend of the MLAs in over_reverse_n_8888_1pixel */
+ /* Split source pixel into RB/AG parts: STRIDE_S = bytes 0 and 2, STRIDE_M = bytes 1 and 3, each zero-extended to a 16-bit lane */
+ uxtb16 STRIDE_S, SRC
+ uxtb16 STRIDE_M, SRC, ror #8
+ /* Set GE[3:0] to 0101 so SEL instructions do what we want: 0x80 + 0x80 carries out of bytes 0 and 2 only */
+ uadd8 SCRATCH, MASK, MASK /* the sum written to SCRATCH is not read again in this macro; only the GE flags matter */
+ line_saved_regs STRIDE_D, ORIG_W /* NOTE(review): line_saved_regs is defined elsewhere; presumably marks these registers to be preserved per line - confirm */
+.endm
+
+.macro over_reverse_n_8888_newline /* Passed to generate_composite_function as the newline hook; NOTE(review): presumably run at the start of each line - confirm */
+ mov STRIDE_D, #0xFF /* STRIDE_D = 0xFF, the value from which over_reverse_n_8888_1pixel clears the destination alpha bits (BICS) */
+.endm
+
+.macro over_reverse_n_8888_process_head cond, numbytes, firstreg, unaligned_src, unaligned_mask, preload /* Head stage: only issues the destination load; cond, unaligned_src, unaligned_mask and preload are not referenced */
+ pixld , numbytes, firstreg, DST, 0 /* NOTE(review): pixld is defined elsewhere; presumably loads numbytes bytes from DST into WK registers starting at firstreg - confirm */
+.endm
+
+.macro over_reverse_n_8888_1pixel d, is_only /* Blend the destination pixel in WK register d with the source, in place; is_only == 1 makes an unchanged pixel branch to label 49 (defined in over_reverse_n_8888_tail) instead of label 9 */
+ teq WK&d, #0 /* is the whole destination pixel zero? */
+ beq 8f /* replace with source */
+ bics ORIG_W, STRIDE_D, WK&d, lsr #24 /* ORIG_W = STRIDE_D & ~(dest >> 24); with STRIDE_D = 0xFF this is 255 - dest alpha, and Z is set when that is 0 */
+ .if is_only == 1
+ beq 49f /* skip store */
+ .else
+ beq 9f /* write same value back */
+ .endif
+ mla SCRATCH, STRIDE_S, ORIG_W, MASK /* red/blue: each 16-bit lane = source byte * (255 - alpha) + 0x80 */
+ mla ORIG_W, STRIDE_M, ORIG_W, MASK /* alpha/green: same product for source bytes 1 and 3; overwrites the multiplier, so this MLA must come second */
+ uxtab16 SCRATCH, SCRATCH, SCRATCH, ror #8 /* per lane: t += t >> 8 (adds the lane's own high byte) */
+ uxtab16 ORIG_W, ORIG_W, ORIG_W, ror #8 /* same fold for the alpha/green lanes; results are now in the high byte of each lane */
+ mov SCRATCH, SCRATCH, ror #8 /* move the red/blue results from bytes 1 and 3 down to bytes 0 and 2 */
+ sel ORIG_W, SCRATCH, ORIG_W /* bytes 0 and 2 from SCRATCH, bytes 1 and 3 from ORIG_W; relies on the GE = 0101 pattern set in over_reverse_n_8888_init */
+ uqadd8 WK&d, WK&d, ORIG_W /* dest += scaled source, per byte with unsigned saturation */
+ b 9f
+8: mov WK&d, SRC /* destination was zero: result is the source pixel */
+9:
+.endm
+
+.macro over_reverse_n_8888_tail numbytes, reg1, reg2, reg3, reg4 /* Blend 1 pixel (numbytes == 4), 4 pixels (numbytes == 16) or otherwise 2 pixels held in the named WK registers, then store them unless nothing can have changed */
+ .if numbytes == 4
+ over_reverse_n_8888_1pixel reg1, 1 /* single pixel: the macro itself branches to 49 when the pixel is unchanged */
+ .else
+ and SCRATCH, WK&reg1, WK&reg2 /* AND all pixels together so the top byte is 0xFF only if every alpha is 0xFF */
+ .if numbytes == 16
+ and SCRATCH, SCRATCH, WK&reg3
+ and SCRATCH, SCRATCH, WK&reg4
+ .endif
+ mvns SCRATCH, SCRATCH, asr #24 /* ~(x asr 24) is zero only when the top byte of x is 0xFF; sets Z in that case */
+ beq 49f /* skip store if all opaque */
+ over_reverse_n_8888_1pixel reg1, 0 /* is_only = 0: an unchanged pixel falls through to the common store below */
+ over_reverse_n_8888_1pixel reg2, 0
+ .if numbytes == 16
+ over_reverse_n_8888_1pixel reg3, 0
+ over_reverse_n_8888_1pixel reg4, 0
+ .endif
+ .endif
+ pixst , numbytes, reg1, DST /* NOTE(review): pixst is defined elsewhere; presumably stores numbytes bytes from WK registers starting at reg1 to DST - confirm */
+49: /* target of the skip-store branches above and in over_reverse_n_8888_1pixel */
+.endm
+
+.macro over_reverse_n_8888_process_tail cond, numbytes, firstreg /* Tail stage: forwards to over_reverse_n_8888_tail; cond is not referenced */
+ over_reverse_n_8888_tail numbytes, firstreg, %(firstreg+1), %(firstreg+2), %(firstreg+3) /* pass four consecutive register numbers starting at firstreg; the callee only uses as many as numbytes requires */
+.endm
+
+generate_composite_function \
+ pixman_composite_over_reverse_n_8888_asm_armv6, 0, 0, 32 /* function name, then NOTE(review): presumably source, mask and destination bits per pixel - confirm against generate_composite_function */ \
+ FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE | FLAG_SPILL_LINE_VARS | FLAG_PROCESS_CORRUPTS_SCRATCH, /* flag meanings are defined elsewhere; the process macros above do read DST, alter condition flags, write SCRATCH and issue their own pixst */ \
+ 3, /* prefetch distance */ \
+ over_reverse_n_8888_init, \
+ over_reverse_n_8888_newline, \
+ nop_macro, /* cleanup */ \
+ over_reverse_n_8888_process_head, \
+ over_reverse_n_8888_process_tail
+
+/******************************************************************************/
+
diff --git a/pixman/pixman-arm-simd.c b/pixman/pixman-arm-simd.c
index af062e1..8fbc439 100644
--- a/pixman/pixman-arm-simd.c
+++ b/pixman/pixman-arm-simd.c
@@ -47,6 +47,9 @@ PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, add_8_8,
PIXMAN_ARM_BIND_FAST_PATH_SRC_DST (armv6, over_8888_8888,
uint32_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_DST (0, armv6, over_reverse_n_8888, /* NOTE(review): presumably declares armv6_composite_over_reverse_n_8888, the name used in arm_simd_fast_paths; the leading 0 is a flags argument (cf. SKIP_ZERO_MASK on a neighbouring binding) - confirm */
+ uint32_t, 1) /* NOTE(review): presumably the destination element type and its stride factor - confirm against the macro definition */
+
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, armv6, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -225,6 +228,9 @@ static const pixman_fast_path_t arm_simd_fast_paths[] =
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, armv6_composite_over_8888_n_8888),
PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, armv6_composite_over_8888_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, armv6_composite_over_reverse_n_8888),
+ PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, armv6_composite_over_reverse_n_8888),
+
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, armv6_composite_add_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, armv6_composite_over_n_8_8888),