author     Jason Ekstrand <jason@jlekstrand.net>          2019-07-25 18:28:44 -0500
committer  Juan A. Suarez Romero <jasuarez@igalia.com>    2019-07-31 08:06:48 +0000
commit     f522c7ca9e196a38a9a88965ee6a74d157c6b8c9 (patch)
tree       a75ae7529f21fc88a9ace7af5ea31f7bc393cbe2
parent     ac7f03caed981149786e83c6b8b23ee81b321e9b (diff)
intel/fs: Use ALIGN16 instructions for all derivatives on gen <= 7
The issue here was discovered by a set of Vulkan CTS tests:

    dEQP-VK.glsl.derivate.*.dynamic_*

These tests use ballot ops to construct a branch condition that takes the
same path for each 2x2 quad but may not be uniform across the whole
subgroup.  They then test that derivatives work and give the correct value
even when executed inside such a branch.  Because the derivative isn't
executed in uniform control-flow and the values coming into the derivative
aren't smooth (or worse, linear), they nicely catch bugs that aren't
uncovered by simpler derivative tests.

Unfortunately, these tests require Vulkan and the equivalent GL test would
require the GL_ARB_shader_ballot extension which requires int64.  Because
the requirements for these tests are so high, it's not easy to test on
older hardware and the bug is only proven to exist on gen7; gen4-6 are a
conjecture.

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Matt Turner <mattst88@gmail.com>
(cherry picked from commit 499d760c6e8a81d87bc4ea37c3de2ee9b9da2aec)
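For context (an illustration, not part of the patch): fine and coarse x
derivatives are defined purely over the four lanes of a single 2x2 pixel
quad, which is why a branch that is uniform per quad, but not per subgroup,
must still produce correct results.  A minimal CPU-side sketch of those
definitions, with hypothetical names and lane ordering
{0: top-left, 1: top-right, 2: bottom-left, 3: bottom-right}:

#include <array>

// One fragment-shader input sampled at the four lanes of a 2x2 pixel quad.
using Quad = std::array<float, 4>;

// "Fine" ddx: one difference per row, so the top and bottom rows may differ.
Quad ddx_fine(const Quad &v)
{
   float top = v[1] - v[0];
   float bot = v[3] - v[2];
   return {top, top, bot, bot};
}

// "Coarse" ddx: the top row's difference is replicated to all four lanes.
Quad ddx_coarse(const Quad &v)
{
   float d = v[1] - v[0];
   return {d, d, d, d};
}

Nothing outside the quad enters the computation, so the hardware region or
swizzle used to pick the lanes is what the patch below changes, not the math.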
-rw-r--r--  src/intel/compiler/brw_fs.cpp              6
-rw-r--r--  src/intel/compiler/brw_fs_generator.cpp   83
2 files changed, 65 insertions, 24 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 2ea01aa6777..af4ea33eef6 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -6130,9 +6130,6 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
case FS_OPCODE_LINTERP:
case SHADER_OPCODE_GET_BUFFER_SIZE:
- case FS_OPCODE_DDX_COARSE:
- case FS_OPCODE_DDX_FINE:
- case FS_OPCODE_DDY_COARSE:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
@@ -6149,6 +6146,9 @@ get_lowered_simd_width(const struct gen_device_info *devinfo,
*/
return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
+ case FS_OPCODE_DDX_COARSE:
+ case FS_OPCODE_DDX_FINE:
+ case FS_OPCODE_DDY_COARSE:
case FS_OPCODE_DDY_FINE:
/* The implementation of this virtual opcode may require emitting
* compressed Align16 instructions, which are severely limited on some
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 406e0a046e7..67740c783f1 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1206,27 +1206,50 @@ fs_generator::generate_ddx(const fs_inst *inst,
{
unsigned vstride, width;
- if (inst->opcode == FS_OPCODE_DDX_FINE) {
- /* produce accurate derivatives */
- vstride = BRW_VERTICAL_STRIDE_2;
- width = BRW_WIDTH_2;
- } else {
- /* replicate the derivative at the top-left pixel to other pixels */
- vstride = BRW_VERTICAL_STRIDE_4;
- width = BRW_WIDTH_4;
- }
+ if (devinfo->gen >= 8) {
+ if (inst->opcode == FS_OPCODE_DDX_FINE) {
+ /* produce accurate derivatives */
+ vstride = BRW_VERTICAL_STRIDE_2;
+ width = BRW_WIDTH_2;
+ } else {
+ /* replicate the derivative at the top-left pixel to other pixels */
+ vstride = BRW_VERTICAL_STRIDE_4;
+ width = BRW_WIDTH_4;
+ }
+
+ struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
+ struct brw_reg src1 = src;
- struct brw_reg src0 = byte_offset(src, type_sz(src.type));;
- struct brw_reg src1 = src;
+ src0.vstride = vstride;
+ src0.width = width;
+ src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+ src1.vstride = vstride;
+ src1.width = width;
+ src1.hstride = BRW_HORIZONTAL_STRIDE_0;
- src0.vstride = vstride;
- src0.width = width;
- src0.hstride = BRW_HORIZONTAL_STRIDE_0;
- src1.vstride = vstride;
- src1.width = width;
- src1.hstride = BRW_HORIZONTAL_STRIDE_0;
+ brw_ADD(p, dst, src0, negate(src1));
+ } else {
+ /* On Haswell and earlier, the region used above appears to not work
+ * correctly for compressed instructions. At least on Haswell and
+ * Iron Lake, compressed ALIGN16 instructions do work. Since we
+ * would have to split to SIMD8 no matter which method we choose, we
+ * may as well use ALIGN16 on all platforms gen7 and earlier.
+ */
+ struct brw_reg src0 = stride(src, 4, 4, 1);
+ struct brw_reg src1 = stride(src, 4, 4, 1);
+ if (inst->opcode == FS_OPCODE_DDX_FINE) {
+ src0.swizzle = BRW_SWIZZLE_XXZZ;
+ src1.swizzle = BRW_SWIZZLE_YYWW;
+ } else {
+ src0.swizzle = BRW_SWIZZLE_XXXX;
+ src1.swizzle = BRW_SWIZZLE_YYYY;
+ }
- brw_ADD(p, dst, src0, negate(src1));
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_ADD(p, dst, negate(src0), src1);
+ brw_pop_insn_state(p);
+ }
}
/* The negate_value boolean is used to negate the derivative computation for
@@ -1279,10 +1302,28 @@ fs_generator::generate_ddy(const fs_inst *inst,
}
} else {
/* replicate the derivative at the top-left pixel to other pixels */
- struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
- struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
+ if (devinfo->gen >= 8) {
+ struct brw_reg src0 = byte_offset(stride(src, 4, 4, 0), 0 * type_size);
+ struct brw_reg src1 = byte_offset(stride(src, 4, 4, 0), 2 * type_size);
- brw_ADD(p, dst, negate(src0), src1);
+ brw_ADD(p, dst, negate(src0), src1);
+ } else {
+ /* On Haswell and earlier, the region used above appears to not work
+ * correctly for compressed instructions. At least on Haswell and
+ * Iron Lake, compressed ALIGN16 instructions do work. Since we
+ * would have to split to SIMD8 no matter which method we choose, we
+ * may as well use ALIGN16 on all platforms gen7 and earlier.
+ */
+ struct brw_reg src0 = stride(src, 4, 4, 1);
+ struct brw_reg src1 = stride(src, 4, 4, 1);
+ src0.swizzle = BRW_SWIZZLE_XXXX;
+ src1.swizzle = BRW_SWIZZLE_ZZZZ;
+
+ brw_push_insn_state(p);
+ brw_set_default_access_mode(p, BRW_ALIGN_16);
+ brw_ADD(p, dst, negate(src0), src1);
+ brw_pop_insn_state(p);
+ }
}
}
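As an aside (again an illustration, not Mesa code): in Align16 mode the
swizzles select channels within each group of four lanes, so with the quad
ordered {X: top-left, Y: top-right, Z: bottom-left, W: bottom-right}, the
single ADD with a negated first source computes src1 - src0 per lane.  The
XXZZ/YYWW pair gives {Y-X, Y-X, W-Z, W-Z} for fine ddx, XXXX/YYYY gives the
replicated Y-X for coarse ddx, and XXXX/ZZZZ gives the replicated Z-X for
coarse ddy.  A rough C++ sketch of that arithmetic, with hypothetical helper
names:

#include <array>

using Quad = std::array<float, 4>; // one channel's values at lanes X, Y, Z, W

// Map an Align16 swizzle letter to its channel index.
static int chan(char c)
{
   switch (c) {
   case 'X': return 0;
   case 'Y': return 1;
   case 'Z': return 2;
   default:  return 3; // 'W'
   }
}

// Apply a 4-letter swizzle such as "XXZZ" or "YYWW" to one quad of values.
static Quad swizzle(const Quad &v, const char *s)
{
   Quad out;
   for (int i = 0; i < 4; i++)
      out[i] = v[chan(s[i])];
   return out;
}

// dst = src1 - src0: the ADD with a negated first source used by the patch.
static Quad add_negate(const Quad &src0, const Quad &src1)
{
   Quad out;
   for (int i = 0; i < 4; i++)
      out[i] = src1[i] - src0[i];
   return out;
}

// Fine ddx over one quad, matching the XXZZ / YYWW swizzle pair.
static Quad ddx_fine_align16(const Quad &v)
{
   return add_negate(swizzle(v, "XXZZ"), swizzle(v, "YYWW"));
}

// Coarse ddy over one quad, matching the XXXX / ZZZZ swizzle pair.
static Quad ddy_coarse_align16(const Quad &v)
{
   return add_negate(swizzle(v, "XXXX"), swizzle(v, "ZZZZ"));
}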