diff options
author | Tom Stellard <thomas.stellard@amd.com> | 2013-09-05 08:59:32 -0700 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2013-10-14 13:19:20 -0700 |
commit | c1db95cbd8d82274d3be14404cabef1ec82c4c67 (patch) | |
tree | e057d0af2af8071debd225930bb872d0e6cb77e5 | |
parent | a8dc6c77e618c62a0afbe94caf965a0225b6ce4e (diff) |
R600: Make sure OQAP defs and uses happen in the same clause
Reading the special OQAP register pops the top value off the LDS
input queue and returns it to the instruction. This queue is
invalidated at the end of an ALU clause and leaving values in the queue
can lead to GPU hangs. This means that if we load a value into the queue,
we must use it before the end of the clause.
This fixes some hangs in the OpenCV test suite.
-rw-r--r-- | lib/Target/R600/R600MachineScheduler.cpp | 25 | ||||
-rw-r--r-- | lib/Target/R600/R600MachineScheduler.h | 4 | ||||
-rw-r--r-- | test/CodeGen/R600/lds-input-queue.ll | 26 |
3 files changed, 41 insertions, 14 deletions
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 6c26d9ece40..611b7f488a0 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -93,11 +93,12 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { } - // We want to scheduled AR defs as soon as possible to make sure they aren't - // put in a different ALU clause from their uses. - if (!SU && !UnscheduledARDefs.empty()) { - SU = UnscheduledARDefs[0]; - UnscheduledARDefs.erase(UnscheduledARDefs.begin()); + // We want to scheduled defs that cannot be live outside of this clause + // as soon as possible to make sure they aren't put in a different + // ALU clause from their uses. + if (!SU && !UnscheduledNoLiveOutDefs.empty()) { + SU = UnscheduledNoLiveOutDefs[0]; + UnscheduledNoLiveOutDefs.erase(UnscheduledNoLiveOutDefs.begin()); NextInstKind = IDAlu; } @@ -132,9 +133,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) { // We want to schedule the AR uses as late as possible to make sure that // the AR defs have been released. - if (!SU && !UnscheduledARUses.empty()) { - SU = UnscheduledARUses[0]; - UnscheduledARUses.erase(UnscheduledARUses.begin()); + if (!SU && !UnscheduledNoLiveOutUses.empty()) { + SU = UnscheduledNoLiveOutUses[0]; + UnscheduledNoLiveOutUses.erase(UnscheduledNoLiveOutUses.begin()); NextInstKind = IDAlu; } @@ -217,15 +218,15 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) { int IK = getInstKind(SU); - // Check for AR register defines + // Check for registers that do not live across ALU clauses. for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(), E = SU->getInstr()->operands_end(); I != E; ++I) { - if (I->isReg() && I->getReg() == AMDGPU::AR_X) { + if (I->isReg() && (I->getReg() == AMDGPU::AR_X || I->getReg() == AMDGPU::OQAP)) { if (I->isDef()) { - UnscheduledARDefs.push_back(SU); + UnscheduledNoLiveOutDefs.push_back(SU); } else { - UnscheduledARUses.push_back(SU); + UnscheduledNoLiveOutUses.push_back(SU); } return; } diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h index 0a6f1204a4d..db2e188d665 100644 --- a/lib/Target/R600/R600MachineScheduler.h +++ b/lib/Target/R600/R600MachineScheduler.h @@ -53,8 +53,8 @@ class R600SchedStrategy : public MachineSchedStrategy { std::vector<SUnit *> Available[IDLast], Pending[IDLast]; std::vector<SUnit *> AvailableAlus[AluLast]; - std::vector<SUnit *> UnscheduledARDefs; - std::vector<SUnit *> UnscheduledARUses; + std::vector<SUnit *> UnscheduledNoLiveOutDefs; + std::vector<SUnit *> UnscheduledNoLiveOutUses; std::vector<SUnit *> PhysicalRegCopy; InstKind CurInstKind; diff --git a/test/CodeGen/R600/lds-input-queue.ll b/test/CodeGen/R600/lds-input-queue.ll new file mode 100644 index 00000000000..548b41ca775 --- /dev/null +++ b/test/CodeGen/R600/lds-input-queue.ll @@ -0,0 +1,26 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +; +; This test checks that the lds input queue will is empty at the end of +; the ALU clause. + +; CHECK-LABEL: @lds_input_queue +; CHECK: LDS_READ_RET * OQAP +; CHECK-NOT: ALU clause +; CHECK: MOV T{{[0-9]\.[XYZW]}}, OQAP + +@local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4 + +define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) { +entry: + %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index + %1 = load i32 addrspace(3)* %0 + call void @llvm.AMDGPU.barrier.local() + + ; This will start a new clause for the vertex fetch + %2 = load i32 addrspace(1)* %in + %3 = add i32 %1, %2 + store i32 %3, i32 addrspace(1)* %out + ret void +} + +declare void @llvm.AMDGPU.barrier.local() |