R600: Make sure OQAP defs and uses happen in the same clause

Reading the special OQAP register pops the top value off the LDS input queue and returns it to the instruction. This queue is invalidated at the end of an ALU clause and leaving values in the queue can lead to GPU hangs. This means that if we load a value into the queue, we must use it before the end of the clause. This fixes some hangs in the OpenCV test suite.
author: Tom Stellard <thomas.stellard@amd.com> 2013-09-05 08:59:32 -0700
committer: Tom Stellard <thomas.stellard@amd.com> 2013-10-14 13:19:20 -0700
commit: c1db95cbd8d82274d3be14404cabef1ec82c4c67 (patch)
tree: e057d0af2af8071debd225930bb872d0e6cb77e5
parent: a8dc6c77e618c62a0afbe94caf965a0225b6ce4e (diff)
3 files changed, 41 insertions, 14 deletions
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 6c26d9ece40..611b7f488a0 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -93,11 +93,12 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   }
 
 
-  // We want to scheduled AR defs as soon as possible to make sure they aren't
-  // put in a different ALU clause from their uses.
-  if (!SU && !UnscheduledARDefs.empty()) {
-      SU = UnscheduledARDefs[0];
-      UnscheduledARDefs.erase(UnscheduledARDefs.begin());
+  // We want to scheduled defs that cannot be live outside of this clause 
+  // as soon as possible to make sure they aren't put in a different
+  // ALU clause from their uses.
+  if (!SU && !UnscheduledNoLiveOutDefs.empty()) {
+      SU = UnscheduledNoLiveOutDefs[0];
+      UnscheduledNoLiveOutDefs.erase(UnscheduledNoLiveOutDefs.begin());
       NextInstKind = IDAlu;
   }
 
@@ -132,9 +133,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
 
   // We want to schedule the AR uses as late as possible to make sure that
   // the AR defs have been released.
-  if (!SU && !UnscheduledARUses.empty()) {
-      SU = UnscheduledARUses[0];
-      UnscheduledARUses.erase(UnscheduledARUses.begin());
+  if (!SU && !UnscheduledNoLiveOutUses.empty()) {
+      SU = UnscheduledNoLiveOutUses[0];
+      UnscheduledNoLiveOutUses.erase(UnscheduledNoLiveOutUses.begin());
       NextInstKind = IDAlu;
   }
 
@@ -217,15 +218,15 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
 
   int IK = getInstKind(SU);
 
-  // Check for AR register defines
+  // Check for registers that do not live across ALU clauses.
   for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
                                         E = SU->getInstr()->operands_end();
                                         I != E; ++I) {
-    if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
+    if (I->isReg() && (I->getReg() == AMDGPU::AR_X || I->getReg() == AMDGPU::OQAP)) {
       if (I->isDef()) {
-        UnscheduledARDefs.push_back(SU);
+        UnscheduledNoLiveOutDefs.push_back(SU);
       } else {
-        UnscheduledARUses.push_back(SU);
+        UnscheduledNoLiveOutUses.push_back(SU);
       }
       return;
     }
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index 0a6f1204a4d..db2e188d665 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -53,8 +53,8 @@ class R600SchedStrategy : public MachineSchedStrategy {
 
   std::vector<SUnit *> Available[IDLast], Pending[IDLast];
   std::vector<SUnit *> AvailableAlus[AluLast];
-  std::vector<SUnit *> UnscheduledARDefs;
-  std::vector<SUnit *> UnscheduledARUses;
+  std::vector<SUnit *> UnscheduledNoLiveOutDefs;
+  std::vector<SUnit *> UnscheduledNoLiveOutUses;
   std::vector<SUnit *> PhysicalRegCopy;
 
   InstKind CurInstKind;
diff --git a/test/CodeGen/R600/lds-input-queue.ll b/test/CodeGen/R600/lds-input-queue.ll
new file mode 100644
index 00000000000..548b41ca775
--- /dev/null
+++ b/test/CodeGen/R600/lds-input-queue.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;
+; This test checks that the lds input queue will is empty at the end of
+; the ALU clause.
+
+; CHECK-LABEL: @lds_input_queue
+; CHECK: LDS_READ_RET * OQAP
+; CHECK-NOT: ALU clause
+; CHECK: MOV T{{[0-9]\.[XYZW]}}, OQAP
+
+@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4
+
+define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+entry:
+  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+  %1 = load i32 addrspace(3)* %0
+  call void @llvm.AMDGPU.barrier.local()
+
+  ; This will start a new clause for the vertex fetch
+  %2 = load i32 addrspace(1)* %in
+  %3 = add i32 %1, %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
author	Tom Stellard <thomas.stellard@amd.com>	2013-09-05 08:59:32 -0700
committer	Tom Stellard <thomas.stellard@amd.com>	2013-10-14 13:19:20 -0700
commit	c1db95cbd8d82274d3be14404cabef1ec82c4c67 (patch)
tree	e057d0af2af8071debd225930bb872d0e6cb77e5
parent	a8dc6c77e618c62a0afbe94caf965a0225b6ce4e (diff)