intel/fs: Add support for SLM fence in Gen11

Gen11 SLM is not on L3 anymore, so now the hardware has two separate fences. Add a way to control which fence types to use. At this time, we don't have enough information in NIR to control the visibility of the memory being fenced, so for now be conservative and assume that fences will need a stall. With more information later we'll be able to reduce those. Fixes Vulkan CTS tests in ICL: dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_nonlocal.workgroup.guard_local.buffer.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.device.payload_local.image.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.buffer.guard_nonlocal.workgroup.comp dEQP-VK.memory_model.message_passing.core11.u32.coherent.fence_fence.atomicwrite.workgroup.payload_local.image.guard_nonlocal.workgroup.comp The whole set of supported tests in dEQP-VK.memory_model.* group should be passing in ICL now. v2: Pass BTI around instead of having an enum. (Jason) Emit two SHADER_OPCODE_MEMORY_FENCE instead of one that gets transformed into two. (Jason) List tests fixed. (Lionel) v3: For clarity, split the decision of which fences to emit from the emission code. (Jason) Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> Acked-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
author: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> 2019-07-10 12:02:23 -0700
committer: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com> 2019-07-11 08:29:32 -0700
commit: b390ff35170fdc2b7f1fb1709a79d81edcd56981 (patch)
tree: 7de3a499d1cab78265da8ff678c2cd3bb486537e
parent: 838374b6dd818b6ca09808c4f922d7486ecbd329 (diff)
6 files changed, 66 insertions, 12 deletions
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index dd504cc25fb..c4dba558ecd 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -1117,7 +1117,8 @@ brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall);
+                 bool stall,
+                 unsigned bti);
 
 void
 brw_pixel_interpolator_query(struct brw_codegen *p,
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index e8ca7ff8b98..1d4c0b83c87 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -452,6 +452,17 @@ enum opcode {
    SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL,
    SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL,
 
+   /**
+    * Memory fence messages.
+    *
+    * Source 0: Must be register g0, used as header.
+    * Source 1: Immediate bool to indicate whether or not we need to stall
+    *           until memory transactions prior to the fence are completed.
+    * Source 2: Immediate byte indicating which memory to fence.  Zero means
+    *           global memory; GEN7_BTI_SLM means SLM (for Gen11+ only).
+    *
+    * Vec4 backend only uses Source 0.
+    */
    SHADER_OPCODE_MEMORY_FENCE,
 
    SHADER_OPCODE_GEN4_SCRATCH_READ,
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
index 8e7263ce447..60761e83c62 100644
--- a/src/intel/compiler/brw_eu_emit.c
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -3012,7 +3012,8 @@ static void
 brw_set_memory_fence_message(struct brw_codegen *p,
                              struct brw_inst *insn,
                              enum brw_message_target sfid,
-                             bool commit_enable)
+                             bool commit_enable,
+                             unsigned bti)
 {
    const struct gen_device_info *devinfo = p->devinfo;
 
@@ -3034,6 +3035,9 @@ brw_set_memory_fence_message(struct brw_codegen *p,
 
    if (commit_enable)
       brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
+
+   assert(devinfo->gen >= 11 || bti == 0);
+   brw_inst_set_binding_table_index(devinfo, insn, bti);
 }
 
 void
@@ -3041,7 +3045,8 @@ brw_memory_fence(struct brw_codegen *p,
                  struct brw_reg dst,
                  struct brw_reg src,
                  enum opcode send_op,
-                 bool stall)
+                 bool stall,
+                 unsigned bti)
 {
    const struct gen_device_info *devinfo = p->devinfo;
    const bool commit_enable = stall ||
@@ -3062,7 +3067,7 @@ brw_memory_fence(struct brw_codegen *p,
    brw_set_dest(p, insn, dst);
    brw_set_src0(p, insn, src);
    brw_set_memory_fence_message(p, insn, GEN7_SFID_DATAPORT_DATA_CACHE,
-                                commit_enable);
+                                commit_enable, bti);
 
    if (devinfo->gen == 7 && !devinfo->is_haswell) {
       /* IVB does typed surface access through the render cache, so we need to
@@ -3073,7 +3078,7 @@ brw_memory_fence(struct brw_codegen *p,
       brw_set_dest(p, insn, offset(dst, 1));
       brw_set_src0(p, insn, src);
       brw_set_memory_fence_message(p, insn, GEN6_SFID_DATAPORT_RENDER_CACHE,
-                                   commit_enable);
+                                   commit_enable, bti);
 
       /* Now write the response of the second message into the response of the
        * first to trigger a pipeline stall -- This way future render and data
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index be6a00e8476..88de5189064 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -2069,13 +2069,14 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
 
       case SHADER_OPCODE_MEMORY_FENCE:
          assert(src[1].file == BRW_IMMEDIATE_VALUE);
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud);
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, src[1].ud, src[2].ud);
          break;
 
       case SHADER_OPCODE_INTERLOCK:
          assert(devinfo->gen >= 9);
          /* The interlock is basically a memory fence issued via sendc */
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SENDC, false, /* bti */ 0);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 00ce6af23c7..aeebaaeb62c 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -4416,11 +4416,47 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
    case nir_intrinsic_memory_barrier_buffer:
    case nir_intrinsic_memory_barrier_image:
    case nir_intrinsic_memory_barrier: {
+      bool l3_fence, slm_fence;
+      if (devinfo->gen >= 11) {
+         l3_fence = instr->intrinsic != nir_intrinsic_memory_barrier_shared;
+         slm_fence = instr->intrinsic == nir_intrinsic_group_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier ||
+                     instr->intrinsic == nir_intrinsic_memory_barrier_shared;
+      } else {
+         /* Prior to gen11, we only have one kind of fence. */
+         l3_fence = true;
+         slm_fence = false;
+      }
+
+      /* Be conservative in Gen11+ and always stall in a fence.  Since there
+       * are two different fences, and shader might want to synchronize
+       * between them.
+       *
+       * TODO: Improve NIR so that scope and visibility information for the
+       * barriers is available here to make a better decision.
+       *
+       * TODO: When emitting more than one fence, it might help emit all
+       * the fences first and then generate the stall moves.
+       */
+      const bool stall = devinfo->gen >= 11;
+
       const fs_builder ubld = bld.group(8, 0);
       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(0))
-         ->size_written = 2 * REG_SIZE;
+
+      if (l3_fence) {
+         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
+                   /* bti */ brw_imm_ud(0))
+            ->size_written = 2 * REG_SIZE;
+      }
+
+      if (slm_fence) {
+         ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
+                   brw_vec8_grf(0, 0), brw_imm_ud(stall),
+                   brw_imm_ud(GEN7_BTI_SLM))
+            ->size_written = 2 * REG_SIZE;
+      }
+
       break;
    }
 
@@ -5238,7 +5274,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       const fs_builder ubld = bld.group(8, 0);
       const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
       ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp,
-                brw_vec8_grf(0, 0), brw_imm_ud(1))
+                brw_vec8_grf(0, 0), brw_imm_ud(1), brw_imm_ud(0))
          ->size_written = 2 * REG_SIZE;
       break;
    }
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
index 8f9e4f16677..d85e3c43241 100644
--- a/src/intel/compiler/brw_vec4_generator.cpp
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -1886,7 +1886,7 @@ generate_code(struct brw_codegen *p,
          break;
 
       case SHADER_OPCODE_MEMORY_FENCE:
-         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false);
+         brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, false, /* bti */ 0);
          break;
 
       case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
author	Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>	2019-07-10 12:02:23 -0700
committer	Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>	2019-07-11 08:29:32 -0700
commit	b390ff35170fdc2b7f1fb1709a79d81edcd56981 (patch)
tree	7de3a499d1cab78265da8ff678c2cd3bb486537e
parent	838374b6dd818b6ca09808c4f922d7486ecbd329 (diff)