summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCarsten Haitzler (Rasterman) <raster@rasterman.com>2019-01-29 16:55:30 +0000
committerEmil Velikov <emil.l.velikov@gmail.com>2019-01-29 19:30:12 +0000
commitb280cdb59e38af5d10e148fb5f2ff5e29503bf10 (patch)
tree5f35504d988ad49ec3d45fcda456ffb25fc832e7
parent3b9e9e4723106dc0e8f3fb049026f3ef66f4f6c0 (diff)
vc4: Use named parameters for the NEON inline asm.
This makes the asm code more intelligible and clarifies the functional change in the next commit. (commit message and commit squashing by anholt) (cherry picked from commiti 522f68847152e9111def094f7fb35b44f3d0fc80) [Emil: apply the patch to vc4_tiling_lt.c instead of v3d_cpu_tiling.h] Signed-off-by: Emil Velikov <emil.velikov@collabora.com> Conflicts: src/broadcom/common/v3d_cpu_tiling.h
-rw-r--r--src/gallium/drivers/vc4/vc4_tiling_lt.c180
1 files changed, 100 insertions, 80 deletions
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index ec42a3dc2f7..df6236be423 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -73,20 +73,22 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "vldm %0, {q0, q1, q2, q3}\n"
+ "vldm %[gpu], {q0, q1, q2, q3}\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
- "vst1.8 d0, [%1], %2\n"
- "vst1.8 d1, [%1], %2\n"
- "vst1.8 d2, [%1], %2\n"
- "vst1.8 d3, [%1], %2\n"
- "vst1.8 d4, [%1], %2\n"
- "vst1.8 d5, [%1], %2\n"
- "vst1.8 d6, [%1], %2\n"
- "vst1.8 d7, [%1]\n"
+ "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d1, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d3, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d7, [%[cpu]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
} else {
assert(gpu_stride == 16);
@@ -94,21 +96,24 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "vldm %0, {q0, q1, q2, q3};\n"
+ "vldm %[gpu], {q0, q1, q2, q3};\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (vld1 can only store one d-register
* at a time).
*/
- "vst1.8 d0, [%1], %3\n"
- "vst1.8 d1, [%2], %3\n"
- "vst1.8 d2, [%1], %3\n"
- "vst1.8 d3, [%2], %3\n"
- "vst1.8 d4, [%1], %3\n"
- "vst1.8 d5, [%2], %3\n"
- "vst1.8 d6, [%1]\n"
- "vst1.8 d7, [%2]\n"
+ "vst1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+ "vst1.8 d6, [%[cpu]]\n"
+ "vst1.8 d7, [%[cpu2]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu2] "r"(cpu + 8),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
}
#elif defined (PIPE_ARCH_AARCH64)
@@ -117,20 +122,22 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 8-byte line to cpu-side destination,
* incrementing it by the stride each time.
*/
- "st1 {v0.D}[0], [%1], %2\n"
- "st1 {v0.D}[1], [%1], %2\n"
- "st1 {v1.D}[0], [%1], %2\n"
- "st1 {v1.D}[1], [%1], %2\n"
- "st1 {v2.D}[0], [%1], %2\n"
- "st1 {v2.D}[1], [%1], %2\n"
- "st1 {v3.D}[0], [%1], %2\n"
- "st1 {v3.D}[1], [%1]\n"
+ "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v3.D}[1], [%[cpu]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
} else {
assert(gpu_stride == 16);
@@ -138,21 +145,24 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
/* Store each 16-byte line in 2 parts to the cpu-side
* destination. (vld1 can only store one d-register
* at a time).
*/
- "st1 {v0.D}[0], [%1], %3\n"
- "st1 {v0.D}[1], [%2], %3\n"
- "st1 {v1.D}[0], [%1], %3\n"
- "st1 {v1.D}[1], [%2], %3\n"
- "st1 {v2.D}[0], [%1], %3\n"
- "st1 {v2.D}[1], [%2], %3\n"
- "st1 {v3.D}[0], [%1]\n"
- "st1 {v3.D}[1], [%2]\n"
+ "st1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "st1 {v3.D}[0], [%[cpu]]\n"
+ "st1 {v3.D}[1], [%[cpu2]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu2] "r"(cpu + 8),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
}
#else
@@ -174,20 +184,22 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
- "vld1.8 d0, [%1], %2\n"
- "vld1.8 d1, [%1], %2\n"
- "vld1.8 d2, [%1], %2\n"
- "vld1.8 d3, [%1], %2\n"
- "vld1.8 d4, [%1], %2\n"
- "vld1.8 d5, [%1], %2\n"
- "vld1.8 d6, [%1], %2\n"
- "vld1.8 d7, [%1]\n"
+ "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d1, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d3, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d5, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d6, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d7, [%[cpu]]\n"
/* Load from the GPU in one shot, no interleave, to
* d0-d7.
*/
- "vstm %0, {q0, q1, q2, q3}\n"
+ "vstm %[gpu], {q0, q1, q2, q3}\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
} else {
assert(gpu_stride == 16);
@@ -196,18 +208,21 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
* destination. (vld1 can only store one d-register
* at a time).
*/
- "vld1.8 d0, [%1], %3\n"
- "vld1.8 d1, [%2], %3\n"
- "vld1.8 d2, [%1], %3\n"
- "vld1.8 d3, [%2], %3\n"
- "vld1.8 d4, [%1], %3\n"
- "vld1.8 d5, [%2], %3\n"
- "vld1.8 d6, [%1]\n"
- "vld1.8 d7, [%2]\n"
+ "vld1.8 d0, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d1, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d2, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d3, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d4, [%[cpu]], %[cpu_stride]\n"
+ "vld1.8 d5, [%[cpu2]],%[cpu_stride]\n"
+ "vld1.8 d6, [%[cpu]]\n"
+ "vld1.8 d7, [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
- "vstm %0, {q0, q1, q2, q3}\n"
+ "vstm %[gpu], {q0, q1, q2, q3}\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu2] "r"(cpu + 8),
+ [cpu_stride] "r"(cpu_stride)
: "q0", "q1", "q2", "q3");
}
#elif defined (PIPE_ARCH_AARCH64)
@@ -216,18 +231,20 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
/* Load each 8-byte line from cpu-side source,
* incrementing it by the stride each time.
*/
- "ld1 {v0.D}[0], [%1], %2\n"
- "ld1 {v0.D}[1], [%1], %2\n"
- "ld1 {v1.D}[0], [%1], %2\n"
- "ld1 {v1.D}[1], [%1], %2\n"
- "ld1 {v2.D}[0], [%1], %2\n"
- "ld1 {v2.D}[1], [%1], %2\n"
- "ld1 {v3.D}[0], [%1], %2\n"
- "ld1 {v3.D}[1], [%1]\n"
+ "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v0.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v3.D}[1], [%[cpu]]\n"
/* Store to the GPU in one shot, no interleave. */
- "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
} else {
assert(gpu_stride == 16);
@@ -236,18 +253,21 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
* destination. (vld1 can only store one d-register
* at a time).
*/
- "ld1 {v0.D}[0], [%1], %3\n"
- "ld1 {v0.D}[1], [%2], %3\n"
- "ld1 {v1.D}[0], [%1], %3\n"
- "ld1 {v1.D}[1], [%2], %3\n"
- "ld1 {v2.D}[0], [%1], %3\n"
- "ld1 {v2.D}[1], [%2], %3\n"
- "ld1 {v3.D}[0], [%1]\n"
- "ld1 {v3.D}[1], [%2]\n"
+ "ld1 {v0.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v0.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v1.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v1.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v2.D}[0], [%[cpu]], %[cpu_stride]\n"
+ "ld1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
+ "ld1 {v3.D}[0], [%[cpu]]\n"
+ "ld1 {v3.D}[1], [%[cpu2]]\n"
/* Store to the GPU in one shot, no interleave. */
- "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%0]\n"
+ "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
:
- : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
+ : [gpu] "r"(gpu),
+ [cpu] "r"(cpu),
+ [cpu2] "r"(cpu + 8),
+ [cpu_stride] "r"(cpu_stride)
: "v0", "v1", "v2", "v3");
}
#else