vc4: Declare the cpu pointers as being modified in NEON asm.

Otherwise, the compiler is free to reuse the register containing the input for another call and assume that the value hasn't been modified. Fixes crashes on texture upload/download with current gcc.

We now have to have a temporary for the cpu2 value, since outputs must be lvalues.

(commit message by anholt)

Fixes: 4d30024238ef ("vc4: Use NEON to speed up utile loads on Pi2.")
(cherry picked from commit 300d3ae8b1445b5060f92c77c0f577f4b7b2c7d6)
[Emil: apply the patch to vc4_tiling_lt.c instead of v3d_cpu_tiling.h]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>

Conflicts:
	src/broadcom/common/v3d_cpu_tiling.h

Squashed with commit:

vc4: Declare the last cpu pointer as being modified in NEON asm.

Earlier commit addressed 7 of the 8 instances available.

v2: Rebase patch back to master (by anholt)

Cc: Carsten Haitzler (Rasterman) <raster@rasterman.com>
Cc: Eric Anholt <eric@anholt.net>
Fixes: 300d3ae8b14 ("vc4: Declare the cpu pointers as being modified in NEON asm.")
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
(cherry picked from commit 385843ac3ce1b868d9e24fcb2dbc0c8d5f5a7c99)

Conflicts:
	src/broadcom/common/v3d_cpu_tiling.h
author: Carsten Haitzler (Rasterman) <raster@rasterman.com> 2019-01-08 16:28:30 +0000
committer: Emil Velikov <emil.l.velikov@gmail.com> 2019-01-30 17:33:23 +0000
commit: 813f0a82960ed637e862ee596cef23c6574a7888 (patch)
tree: 8b8989fa9d6b06e9b785d95dd6d0f3801743317f
parent: b280cdb59e38af5d10e148fb5f2ff5e29503bf10 (diff)
1 files changed, 16 insertions, 20 deletions
diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index df6236be423..324a6334668 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -85,13 +85,13 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "vst1.8 d5, [%[cpu]], %[cpu_stride]\n"
                         "vst1.8 d6, [%[cpu]], %[cpu_stride]\n"
                         "vst1.8 d7, [%[cpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
@@ -109,10 +109,9 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "vst1.8 d5, [%[cpu2]],%[cpu_stride]\n"
                         "vst1.8 d6, [%[cpu]]\n"
                         "vst1.8 d7, [%[cpu2]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
@@ -134,13 +133,13 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "st1 {v2.D}[1], [%[cpu]], %[cpu_stride]\n"
                         "st1 {v3.D}[0], [%[cpu]], %[cpu_stride]\n"
                         "st1 {v3.D}[1], [%[cpu]]\n"
-			:
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load from the GPU in one shot, no interleave, to
                          * d0-d7.
@@ -158,10 +157,9 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         "st1 {v2.D}[1], [%[cpu2]],%[cpu_stride]\n"
                         "st1 {v3.D}[0], [%[cpu]]\n"
                         "st1 {v3.D}[1], [%[cpu2]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         }
@@ -196,13 +194,13 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                          * d0-d7.
                          */
                         "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
@@ -218,10 +216,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "vld1.8 d7, [%[cpu2]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "vstm %[gpu], {q0, q1, q2, q3}\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
@@ -241,13 +238,13 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "ld1 {v3.D}[1], [%[cpu]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         } else {
                 assert(gpu_stride == 16);
+                void *cpu2 = cpu + 8;
                 __asm__ volatile (
                         /* Load each 16-byte line in 2 parts from the cpu-side
                          * destination.  (vld1 can only store one d-register
@@ -263,10 +260,9 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         "ld1 {v3.D}[1], [%[cpu2]]\n"
                         /* Store to the GPU in one shot, no interleave. */
                         "st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [%[gpu]]\n"
-                        :
+                        : [cpu]         "+r"(cpu),
+                          [cpu2]        "+r"(cpu2)
                         : [gpu]         "r"(gpu),
-                          [cpu]         "r"(cpu),
-                          [cpu2]        "r"(cpu + 8),
                           [cpu_stride]  "r"(cpu_stride)
                         : "v0", "v1", "v2", "v3");
         }
author	Carsten Haitzler (Rasterman) <raster@rasterman.com>	2019-01-08 16:28:30 +0000
committer	Emil Velikov <emil.l.velikov@gmail.com>	2019-01-30 17:33:23 +0000
commit	813f0a82960ed637e862ee596cef23c6574a7888 (patch)
tree	8b8989fa9d6b06e9b785d95dd6d0f3801743317f
parent	b280cdb59e38af5d10e148fb5f2ff5e29503bf10 (diff)