lib/Target/AMDGPU/SILowerFlowControl.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161

//===-- SILowerFlowControl.cpp - Use predicates for flow control ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass lowers the pseudo flow control instructions (SI_IF_NZ, ELSE, ENDIF)
// to predicated instructions.
//
// All flow control (except loops) is handled using predicated instructions and
// a predicate stack.  Each Scalar ALU controls the operations of 64 Vector
// ALUs.  The Scalar ALU can update the predicate for any of the Vector ALUs
// by writing to the 64-bit EXEC register (each bit corresponds to a
// single vector ALU).  Typically, for predicates, a vector ALU will write
// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, one bit for
// each Vector ALU) and then the Scalar ALU will AND the VCC register with the
// EXEC to update the predicates.
//
// For example:
// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
// SI_IF_NZ %VCC
//   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
// ELSE
//   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
// ENDIF
//
// becomes:
//
// %SGPR0 = S_MOV_B64 %EXEC          // Save the current exec mask
// %EXEC = S_AND_B64 %VCC, %EXEC     // Update the exec mask
// S_CBRANCH_EXECZ label0            // This instruction is an
//                                   // optimization which allows us to
//                                   // branch if all the bits of
//                                   // EXEC are zero.
// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
//
// label0:
// %EXEC = S_NOT_B64 %EXEC            // Invert the exec mask for the
//                                    // ELSE block.
// %EXEC = S_AND_B64 %SGPR0, %EXEC
// S_CBRANCH_EXECZ label1             // Use our branch optimization
//                                    // instruction again.
// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0  // Do the ELSE block
// label1:
// %EXEC = S_MOV_B64 %SGPR0           // Restore the old EXEC value
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

using namespace llvm;

namespace {

/// Machine function pass that rewrites the SI_IF_NZ, ELSE and ENDIF pseudo
/// instructions into explicit reads and writes of the EXEC register.
class SILowerFlowControlPass : public MachineFunctionPass {

public:
  /// Builds the pass and caches the instruction info obtained from \p tm.
  SILowerFlowControlPass(TargetMachine &tm)
    : MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }

  /// Lowers the flow control pseudo instructions found in \p MF.
  virtual bool runOnMachineFunction(MachineFunction &MF);

  /// Human readable name of this pass.
  const char *getPassName() const {
    return "SI Lower flow control instructions";
  }

private:
  static char ID;

  /// Used to look up the descriptions of the instructions that get emitted.
  const TargetInstrInfo *TII;

  /// Registers that currently hold a saved copy of EXEC; back() belongs to
  /// the most recently pushed mask.
  std::vector<unsigned> PredicateStack;

  /// Registers that pushExecMask() may still hand out.
  std::vector<unsigned> UnusedRegisters;

  /// Saves EXEC into a free register, emitting the copy in front of \p I.
  void pushExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

  /// Restores EXEC from the top of the predicate stack, emitting the copy in
  /// front of \p I, and releases the register.
  void popExecMask(MachineBasicBlock &MBB, MachineBasicBlock::iterator I);

};

} // End anonymous namespace

// Storage for the static pass ID member.  The constructor of
// SILowerFlowControlPass hands it to the MachineFunctionPass base class.
char SILowerFlowControlPass::ID = 0;

/// Factory for the SI flow control lowering pass.
///
/// \param tm Target machine forwarded to the pass constructor.
/// \returns A pass object allocated with new; nothing in this function
///          retains or frees it.
FunctionPass *llvm::createSILowerFlowControlPass(TargetMachine &tm) {
  SILowerFlowControlPass *Pass = new SILowerFlowControlPass(tm);
  return Pass;
}

/// Lowers every SI_IF_NZ, ELSE and ENDIF pseudo instruction in \p MF to
/// scalar instructions that save, narrow, invert and restore the EXEC mask.
///
/// \param MF The machine function to rewrite in place.
/// \returns true if at least one pseudo instruction was replaced, false if
///          the function was left untouched.
bool SILowerFlowControlPass::runOnMachineFunction(MachineFunction &MF) {

  // Both containers are members, so anything left in them by an earlier call
  // would otherwise leak into this function: a register that was free there
  // may well be in use here.
  PredicateStack.clear();
  UnusedRegisters.clear();

  // Find all the unused registers that can be used for the predicate stack.
  // The class is walked back to front, so the first unused register of the
  // class ends up at UnusedRegisters.back() and is handed out first.
  for (TargetRegisterClass::iterator S = AMDGPU::SReg_64RegClass.begin(),
                                     I = AMDGPU::SReg_64RegClass.end();
                                     I != S; ) {
    // Step back before dereferencing: end() is one past the last register
    // and must not be read, while begin() is a valid element that has to be
    // visited as well.
    --I;
    unsigned Reg = *I;
    if (!MF.getRegInfo().isPhysRegOrOverlapUsed(Reg)) {
      UnusedRegisters.push_back(Reg);
    }
  }

  bool Changed = false;

  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
                                                  BB != BB_E; ++BB) {
    MachineBasicBlock &MBB = *BB;
    for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
      MachineInstr &MI = *I;
      // Remember the successor now, because MI may be erased below.  This is
      // only computed for a valid I, so the iterator is never advanced past
      // MBB.end() (which also covers empty blocks).
      MachineBasicBlock::iterator Next = llvm::next(I);

      switch (MI.getOpcode()) {
        default: break;
        case AMDGPU::SI_IF_NZ:
          // Save EXEC, then keep only the lanes whose condition bit is set.
          pushExecMask(MBB, I);
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
                  AMDGPU::EXEC)
                  .addOperand(MI.getOperand(0)) // VCC
                  .addReg(AMDGPU::EXEC);
          MI.eraseFromParent();
          Changed = true;
          break;
        case AMDGPU::ELSE:
          assert(!PredicateStack.empty() &&
                 "ELSE without an enclosing SI_IF_NZ");
          // Invert the mask used by the IF side and limit it to the lanes
          // that were active when the mask was saved.
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_NOT_B64),
                  AMDGPU::EXEC)
                  .addReg(AMDGPU::EXEC);
          BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_B64),
                  AMDGPU::EXEC)
                  .addReg(PredicateStack.back())
                  .addReg(AMDGPU::EXEC);
          MI.eraseFromParent();
          Changed = true;
          break;
        case AMDGPU::ENDIF:
          // Restore the EXEC value saved by the matching SI_IF_NZ.
          popExecMask(MBB, I);
          MI.eraseFromParent();
          Changed = true;
          break;
      }

      I = Next;
    }
  }
  return Changed;
}

/// Saves the current EXEC value in a free register and makes that register
/// the new top of the predicate stack.
///
/// \param MBB Block that receives the copy.
/// \param I   Position passed to BuildMI as the insertion point.
void SILowerFlowControlPass::pushExecMask(MachineBasicBlock &MBB,
                                          MachineBasicBlock::iterator I) {

  assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack");

  // Move one register from the free pool onto the predicate stack.
  PredicateStack.push_back(UnusedRegisters.back());
  UnusedRegisters.pop_back();
  const unsigned SaveReg = PredicateStack.back();

  // SaveReg = S_MOV_B64 EXEC
  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64), SaveReg)
      .addReg(AMDGPU::EXEC);
}

/// Restores EXEC from the register on top of the predicate stack and returns
/// that register to the pool of unused registers.
///
/// \param MBB Block that receives the copy.
/// \param I   Position passed to BuildMI as the insertion point.
void SILowerFlowControlPass::popExecMask(MachineBasicBlock &MBB,
                                        MachineBasicBlock::iterator I) {
  // Mirrors the guard in pushExecMask(): calling back() / pop_back() on an
  // empty vector is undefined, so catch an unbalanced pop in asserts builds.
  assert(!PredicateStack.empty() && "Popping an empty predicate stack");
  unsigned StackReg = PredicateStack.back();
  PredicateStack.pop_back();
  // The register no longer holds a live mask and may be handed out again.
  UnusedRegisters.push_back(StackReg);
  // EXEC = S_MOV_B64 StackReg
  BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B64),
          AMDGPU::EXEC)
          .addReg(StackReg);
}