//===-- R600MachineScheduler.cpp - R600 Scheduler Interface -*- C++ -*-----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file
/// \brief R600 Machine Scheduler interface
// TODO: Scheduling is optimized for the VLIW4 architecture; extend it to
// support the TRANS slot.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "misched"

#define TOP_DOWN 0

#include "R600MachineScheduler.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/Pass.h"
#include "llvm/PassManager.h"
#include "llvm/Support/raw_ostream.h"
#include <set>

using namespace llvm;

static cl::opt<bool> TopDown("r600-topdown", cl::Hidden,
    cl::desc("Enable top-down scheduling"), cl::init(false));
static unsigned getShortIndex(unsigned Reg, unsigned SubReg) {
  unsigned ShortReg = TargetRegisterInfo::virtReg2Index(Reg);
  return (ShortReg << 4) | SubReg;
}

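// Subregister indices for the four 32-bit channels; entry 0 is the
// "no subregister" sentinel so the table can be indexed 1..4.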
static const unsigned SubRegName[5] = {
  AMDGPU::NoRegister,
  AMDGPU::sub0,
  AMDGPU::sub1,
  AMDGPU::sub2,
  AMDGPU::sub3
};

class DependencyFix : public ScheduleDAGMutation {
private:

  struct SubVReg2SUnit {
    unsigned VirtReg;
    unsigned SubReg;
    SUnit *SU;

    SubVReg2SUnit(unsigned reg, unsigned sr, SUnit *su):
        VirtReg(reg), SubReg(sr), SU(su) {}

    unsigned getSparseSetIndex() const {
      return getShortIndex(VirtReg, SubReg);
    }
  };

  typedef SparseSet<SubVReg2SUnit> SubVReg2SUnitMap;

  SubVReg2SUnitMap VRegDefs;

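  // Record SU as the def of (Reg, SubReg) closest to the region top seen so
  // far. The region is walked bottom-up, so an existing entry is a later def
  // and receives an Output edge on SU.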
  void addOutputDep(unsigned Reg, unsigned SubReg, unsigned ShortIndex,
      SUnit *SU) {
    SubVReg2SUnitMap::iterator DefI = VRegDefs.find(ShortIndex);
    if (DefI == VRegDefs.end())
      VRegDefs.insert(SubVReg2SUnit(Reg, SubReg, SU));
    else {
      SUnit *DefSU = DefI->SU;
      if (DefSU != SU) {
        SDep Dep(SU, SDep::Output, Reg);
        DefSU->addPred(Dep);
      }
      DefI->SU = SU;
    }
  }

  void addVRegDefDeps(MachineRegisterInfo &MRI, SUnit *SU, unsigned OperIdx) {
    const MachineInstr *MI = SU->getInstr();
    unsigned Reg = MI->getOperand(OperIdx).getReg();
    unsigned SubReg = MI->getOperand(OperIdx).getSubReg();

    // Singly defined vregs do not have output/anti dependencies.
    // The current operand is a def, so we have at least one.
    // Check here if there are any others...
    if (MRI.hasOneDef(Reg))
      return;

    addOutputDep(Reg, SubReg, getShortIndex(Reg, SubReg), SU);
    if (SubReg == AMDGPU::NoRegister) {
      // A write to the full register defines every channel: record it under
      // each channel key too.
      for (unsigned i = 1; i < 5; i++) {
        addOutputDep(Reg, SubRegName[i], getShortIndex(Reg, SubRegName[i]),
            SU);
      }
    }
  }

  void addAntiDep(unsigned Reg, unsigned SubReg, SUnit *SU) {
    SubVReg2SUnitMap::iterator DefI =
        VRegDefs.find(getShortIndex(Reg, SubReg));
    if (DefI != VRegDefs.end() && DefI->SU != SU)
      DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
  }
  
  void addVRegUseDeps(ScheduleDAGMI *dag, MachineInstr *RegionBegin, SUnit *SU,
      unsigned OperIdx) {
    MachineInstr *MI = SU->getInstr();
    unsigned Reg = MI->getOperand(OperIdx).getReg();
    unsigned SubReg = MI->getOperand(OperIdx).getSubReg();

    // Add antidependence to the following def of the vreg it uses.
    addAntiDep(Reg, SubReg, SU);
    if (SubReg == AMDGPU::NoRegister) {
      // The no-subreg key was handled above; cover the four channel keys.
      for (unsigned i = 1; i < 5; i++) {
        addAntiDep(Reg, SubRegName[i], SU);
      }
    }

    // Look up this operand's reaching definition by scanning backwards from
    // this instruction to the start of the region.
    // TODO: Use LiveInterval to avoid parsing the whole MBB.
    for (MachineBasicBlock::iterator MII = SU->getInstr(), E = RegionBegin;
        MII != E; --MII) {
      MachineInstr *DefMI = llvm::prior(MII);
      for (unsigned j = 0, n = DefMI->getNumOperands(); j != n; ++j) {
        const MachineOperand &MO = DefMI->getOperand(j);
        if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
          continue;
        if (SubReg == AMDGPU::NoRegister) {
          // A full-register use reads every channel: depend on each def we
          // find and keep scanning.
          SUnit *DefSU = dag->getSUnit(DefMI);
          if (DefSU != SU) {
            SDep dep(DefSU, SDep::Data, Reg);
            SU->addPred(dep);
          }
          continue;
        } else if (MO.getSubReg() == AMDGPU::NoRegister ||
            SubReg == MO.getSubReg()) {
          // The first def covering the used channel is the reaching def.
          SUnit *DefSU = dag->getSUnit(DefMI);
          if (DefSU != SU) {
            SDep dep(DefSU, SDep::Data, Reg);
            SU->addPred(dep);
          }
          return;
        }
      }
    }
  }

public:
  DependencyFix() { }

  /// \brief Recompute Output and Anti dependencies of the incoming DAG.
  /// ScheduleDAGInstrs has a conservative policy about subreg dependencies:
  /// all subreg writes of the same superreg are chained by Output/Anti deps.
  /// These artificial deps delay the release of MIs and thus reduce
  /// parallelism opportunities. This function recomputes the ScheduleDAG's
  /// dependencies to make them properly subreg aware.
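  ///
  /// For example (illustrative MI syntax), the two writes
  ///   %vreg5:sub0 = MOV ...
  ///   %vreg5:sub1 = MOV ...
  /// touch different channels and could share an instruction group, yet the
  /// conservative policy chains them with an Output dependency.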
  void apply(ScheduleDAGMI *dag) {
    VRegDefs.clear();
    // Remove all register dependencies on virtual registers; they are
    // recomputed below. removePred erases from SU.Preds, so iterate by index
    // and only advance past edges we keep.
    for (unsigned i = 0; i < dag->SUnits.size(); ++i) {
      SUnit &SU = dag->SUnits[i];
      for (unsigned j = 0; j < SU.Preds.size();) {
        SDep SD = SU.Preds[j];
        if (SD.getKind() != SDep::Order &&
            TargetRegisterInfo::isVirtualRegister(SD.getReg())) {
          SU.removePred(SD);
        } else {
          ++j;
        }
      }
    }

    MachineBasicBlock *MBB = dag->SUnits.front().getInstr()->getParent();
    MachineBasicBlock::iterator RegionBegin = MBB->begin(),
        RegionEnd = MBB->end();

    // Narrow [RegionBegin, RegionEnd] to the first and last instructions
    // that have a SUnit; both bounds are inclusive from here on.
    while (!dag->getSUnit(RegionBegin))
      RegionBegin++;

    while (!dag->getSUnit(RegionEnd))
      RegionEnd--;

    VRegDefs.setUniverse((dag->MRI.getNumVirtRegs() << 4) | 15);
    // Now recompute register dependencies, walking the region bottom-up.
    // Both bounds are inclusive, so RegionBegin itself must be processed too.
    for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
         true; --MII) {
      MachineInstr *MI = MII;
      SUnit *SU = dag->getSUnit(MI);
      for (unsigned j = 0, n = MI->getNumOperands(); j != n; ++j) {
        const MachineOperand &MO = MI->getOperand(j);
        if (!MO.isReg()) continue;
        unsigned Reg = MO.getReg();
        if (Reg == 0) continue;

        if (TargetRegisterInfo::isVirtualRegister(Reg)) {
          if (MO.isDef()) {
            addVRegDefDeps(dag->MRI, SU, j);
          } else if (MO.readsReg()) { // Ignore undef operands.
            addVRegUseDeps(dag, RegionBegin, SU, j);
          }
        }
      }
      if (MII == MIE)
        break;
    }
    DEBUG(
    for (unsigned i = 0, e = dag->SUnits.size(); i < e; ++i) {
      SUnit &SU = dag->SUnits[i];
      dbgs() << "\n Dependencies for SUnit ";
      SU.dump(dag);
      dbgs() << "Preds:\n";
      for (unsigned j = 0, e2 = SU.Preds.size(); j < e2; j++) {
        SUnit *Pred = SU.Preds[j].getSUnit();
        dbgs() << " - (" << SU.Preds[j].getKind() << ")";
        Pred->dump(dag);
      }
      dbgs() << "Succs:\n";
      for (unsigned j = 0, e2 = SU.Succs.size(); j < e2; j++) {
        SUnit *Succ = SU.Succs[j].getSUnit();
        if (Succ == &dag->ExitSU)
          continue;
        dbgs() << " - (" << SU.Succs[j].getKind() << ")";
        Succ->dump(dag);
      }
    }
    );
  }
};

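// Factory for the R600 machine scheduler: a generic ScheduleDAGMI driven by
// R600SchedStrategy, with the subreg-aware DependencyFix mutation applied.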
ScheduleDAGInstrs *llvm::createR600MachineScheduler(MachineSchedContext *C) {
  ScheduleDAGMI *dag = new ScheduleDAGMI(C, new R600SchedStrategy());
  dag->addMutation(new DependencyFix);
  return dag;
}

void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
  DAG = dag;
  TII = static_cast<const R600InstrInfo*>(DAG->TII);
  TRI = static_cast<const R600RegisterInfo*>(DAG->TRI);
  MRI = &DAG->MRI;
  CurInstKind = IDOther;
  CurEmitted = 0;
  OccupedSlotsMask = 15; // Force pickAlu to start a fresh instruction group.
  InstKindLimit[IDAlu] = TII->getMaxAlusPerClause();
  InstKindLimit[IDOther] = 32;

  const AMDGPUSubtarget &ST = DAG->TM.getSubtarget<AMDGPUSubtarget>();
  InstKindLimit[IDFetch] = ST.getTexVTXClauseSize();
}

void R600SchedStrategy::MoveUnits(std::vector<SUnit *> &QSrc,
                                  std::vector<SUnit *> &QDst) {
  if (QSrc.empty())
    return;
  QDst.insert(QDst.end(), QSrc.begin(), QSrc.end());
  QSrc.clear();
}

SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
  SUnit *SU = 0;
  NextInstKind = IDOther;

  if (!FakeCopy.empty()) {
    IsTopNode = true;
    SU = *FakeCopy.begin();
    FakeCopy.erase(FakeCopy.begin());
    return SU;
  }

  IsTopNode = TOP_DOWN;

  // Check whether we might want to switch the current clause type.
  bool AllowSwitchToAlu = (CurEmitted > InstKindLimit[CurInstKind]) ||
      (Available[CurInstKind].empty());
  bool AllowSwitchFromAlu = (CurEmitted > InstKindLimit[CurInstKind]) &&
      (!Available[IDFetch].empty() || !Available[IDOther].empty());

  if ((AllowSwitchToAlu && CurInstKind != IDAlu) ||
      (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
    // try to pick ALU
    SU = pickAlu();
    if (SU) {
      if (CurEmitted >= InstKindLimit[IDAlu])
        CurEmitted = 0;
      NextInstKind = IDAlu;
    }
  }

  if (!SU) {
    // try to pick FETCH
    SU = pickOther(IDFetch);
    if (SU)
      NextInstKind = IDFetch;
  }

  // try to pick other
  if (!SU) {
    SU = pickOther(IDOther);
    if (SU)
      NextInstKind = IDOther;
  }

  DEBUG(
      if (SU) {
        dbgs() << " ** Pick node **\n";
        SU->dump(DAG);
      } else {
        dbgs() << "NO NODE \n";
        for (unsigned i = 0; i < DAG->SUnits.size(); i++) {
          const SUnit &S = DAG->SUnits[i];
          if (!S.isScheduled)
            S.dump(DAG);
        }
      }
  );

  return SU;
}

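// Update clause accounting once a node has been scheduled: a whole-group ALU
// instruction consumes four slots and each inline literal operand consumes an
// extra one; other clause types count one instruction each.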
void R600SchedStrategy::schedNode(SUnit *SU, bool IsTopNode) {
  if (NextInstKind != CurInstKind) {
    DEBUG(dbgs() << "Instruction Type Switch\n");
    if (NextInstKind != IDAlu)
      OccupedSlotsMask = 15;
    CurEmitted = 0;
    CurInstKind = NextInstKind;
  }

  if (CurInstKind == IDAlu) {
    switch (getAluKind(SU)) {
    case AluT_XYZW:
      CurEmitted += 4;
      break;
    case AluDiscarded:
      break;
    default: {
      ++CurEmitted;
      for (MachineInstr::mop_iterator It = SU->getInstr()->operands_begin(),
          E = SU->getInstr()->operands_end(); It != E; ++It) {
        MachineOperand &MO = *It;
        if (MO.isReg() && MO.getReg() == AMDGPU::ALU_LITERAL_X)
          ++CurEmitted;
      }
    }
    }
  } else {
    ++CurEmitted;
  }

  DEBUG(dbgs() << CurEmitted << " Instructions Emitted in this clause\n");

  if (CurInstKind != IDFetch) {
    MoveUnits(Pending[IDFetch], Available[IDFetch]);
  }
  MoveUnits(Pending[IDOther], Available[IDOther]);
}

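// A "fake copy" is a COPY from a physical register other than the constant
// ZERO. The register allocator is expected to discard such copies, so
// pickNode emits them eagerly before anything else.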
static bool
isFakeCopy(MachineInstr *MI) {
  if (MI->getOpcode() != AMDGPU::COPY)
    return false;
  if (!TargetRegisterInfo::isPhysicalRegister(MI->getOperand(1).getReg()))
    return false;
  if (MI->getOperand(1).getReg() == AMDGPU::ZERO)
    return false;
  return true;
}

void R600SchedStrategy::releaseTopNode(SUnit *SU) {
  DEBUG(dbgs() << "Top Releasing "; SU->dump(DAG););
  if (isFakeCopy(SU->getInstr())) {
    FakeCopy.push_back(SU);
    return;
  }

  if (TOP_DOWN) {
    int IK = getInstKind(SU);

    Pending[IK].push_back(SU);
  }
}

void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
  DEBUG(dbgs() << "Bottom Releasing "; SU->dump(DAG););
  if (isFakeCopy(SU->getInstr())) {
    return;
  }
  if (!TOP_DOWN) {
    int IK = getInstKind(SU);

    if (IK == IDOther)
      Available[IDOther].push_back(SU);
    else
      Pending[IK].push_back(SU);
  }
}

bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
                                          const TargetRegisterClass *RC) const {
  if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
    return RC->contains(Reg);
  } else {
    return MRI->getRegClass(Reg) == RC;
  }
}

R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
  MachineInstr *MI = SU->getInstr();

  switch (MI->getOpcode()) {
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::PRED_X:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
    return AluT_XYZW;
  case AMDGPU::COPY:
    if (MI->getOperand(1).isUndef()) {
      // MI will become a KILL; don't consider it in scheduling.
      return AluDiscarded;
    }
    // Fall through: a regular COPY is classified like any other instruction.
  default:
    break;
  }

  // Does the instruction take a whole instruction group (IG)?
  if (TII->isVector(*MI) || TII->isTransOnly(MI) ||
      TII->isCubeOp(MI->getOpcode()) ||
      TII->isReductionOp(MI->getOpcode()))
    return AluT_XYZW;

  // Is the result already assigned to a channel?
  unsigned DestSubReg = MI->getOperand(0).getSubReg();
  switch (DestSubReg) {
  case AMDGPU::sub0:
    return AluT_X;
  case AMDGPU::sub1:
    return AluT_Y;
  case AMDGPU::sub2:
    return AluT_Z;
  case AMDGPU::sub3:
    return AluT_W;
  default:
    break;
  }

  // Is the result already a member of an X/Y/Z/W class?
  unsigned DestReg = MI->getOperand(0).getReg();
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_XRegClass) ||
      regBelongsToClass(DestReg, &AMDGPU::R600_AddrRegClass))
    return AluT_X;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_YRegClass))
    return AluT_Y;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass))
    return AluT_Z;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_TReg32_WRegClass))
    return AluT_W;
  if (regBelongsToClass(DestReg, &AMDGPU::R600_Reg128RegClass))
    return AluT_XYZW;

  return AluAny;
}

int R600SchedStrategy::getInstKind(SUnit* SU) {
  int Opcode = SU->getInstr()->getOpcode();

  if (TII->usesTextureCache(Opcode) || TII->usesVertexCache(Opcode))
    return IDFetch;

  if (TII->isALUInstr(Opcode)) {
    return IDAlu;
  }

  switch (Opcode) {
  case AMDGPU::COPY:
  case AMDGPU::CONST_COPY:
  case AMDGPU::INTERP_PAIR_XY:
  case AMDGPU::INTERP_PAIR_ZW:
  case AMDGPU::INTERP_VEC_LOAD:
  case AMDGPU::DOT4_eg_pseudo:
  case AMDGPU::DOT4_r600_pseudo:
    return IDAlu;
  default:
    return IDOther;
  }
}

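// Pop the first SU in Q whose instruction can legally join the instruction
// group currently being assembled (checked with TII->canBundle against the
// tentative group).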
SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
  if (Q.empty())
    return NULL;
  for (std::vector<SUnit *>::iterator It = Q.begin(), E = Q.end();
      It != E; ++It) {
    SUnit *SU = *It;
    InstructionsGroupCandidate.push_back(SU->getInstr());
    if (TII->canBundle(InstructionsGroupCandidate)) {
      InstructionsGroupCandidate.pop_back();
      Q.erase(It);
      return SU;
    } else {
      InstructionsGroupCandidate.pop_back();
    }
  }
  return NULL;
}

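// Sort every pending ALU instruction into the per-channel bucket chosen by
// getAluKind.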
void R600SchedStrategy::LoadAlu() {
  std::vector<SUnit *> &QSrc = Pending[IDAlu];
  for (unsigned i = 0, e = QSrc.size(); i < e; ++i) {
    AluKind AK = getAluKind(QSrc[i]);
    AvailableAlus[AK].push_back(QSrc[i]);
  }
  QSrc.clear();
}

void R600SchedStrategy::PrepareNextSlot() {
  DEBUG(dbgs() << "New Slot\n");
  assert(OccupedSlotsMask && "Slot wasn't filled");
  OccupedSlotsMask = 0;
  InstructionsGroupCandidate.clear();
  LoadAlu();
}

void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
  unsigned DestReg = MI->getOperand(0).getReg();
  // PressureRegister crashes if an operand is both defined and used in the
  // same instruction and we try to constrain its regclass, so bail out in
  // that case.
  for (MachineInstr::mop_iterator It = MI->operands_begin(),
      E = MI->operands_end(); It != E; ++It) {
    MachineOperand &MO = *It;
    if (MO.isReg() && !MO.isDef() && MO.getReg() == DestReg)
      return;
  }
  // Constrain the regclass of DestReg so it is assigned to Slot.
  switch (Slot) {
  case 0:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_XRegClass);
    break;
  case 1:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_YRegClass);
    break;
  case 2:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_ZRegClass);
    break;
  case 3:
    MRI->constrainRegClass(DestReg, &AMDGPU::R600_TReg32_WRegClass);
    break;
  }
}

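// Prefer an instruction already pinned to Slot's channel; failing that, take
// a channel-agnostic one and constrain its destination register to the slot.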
SUnit *R600SchedStrategy::AttemptFillSlot(unsigned Slot) {
  static const AluKind IndexToID[] = {AluT_X, AluT_Y, AluT_Z, AluT_W};
  SUnit *SlottedSU = PopInst(AvailableAlus[IndexToID[Slot]]);
  if (SlottedSU)
    return SlottedSU;
  SUnit *UnslottedSU = PopInst(AvailableAlus[AluAny]);
  if (UnslottedSU)
    AssignSlot(UnslottedSU->getInstr(), Slot);
  return UnslottedSU;
}

bool R600SchedStrategy::isAvailablesAluEmpty() const {
  return Pending[IDAlu].empty() && AvailableAlus[AluAny].empty() &&
      AvailableAlus[AluT_XYZW].empty() && AvailableAlus[AluT_X].empty() &&
      AvailableAlus[AluT_Y].empty() && AvailableAlus[AluT_Z].empty() &&
      AvailableAlus[AluT_W].empty() && AvailableAlus[AluDiscarded].empty();
}

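// Pick the next ALU instruction, filling the current instruction group from
// slot W down to slot X; whole-group instructions (and discardable copies)
// are only issued when the group is empty.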
SUnit* R600SchedStrategy::pickAlu() {
  while (!isAvailablesAluEmpty()) {
    if (!OccupedSlotsMask) {
      // Flush physical reg copies (RA will discard them)
      if (!AvailableAlus[AluDiscarded].empty()) {
        OccupedSlotsMask = 15;
        return PopInst(AvailableAlus[AluDiscarded]);
      }
      // If there is a T_XYZW alu available, use it
      if (!AvailableAlus[AluT_XYZW].empty()) {
        OccupedSlotsMask = 15;
        return PopInst(AvailableAlus[AluT_XYZW]);
      }
    }
    for (int Chan = 3; Chan > -1; --Chan) {
      bool isOccupied = OccupedSlotsMask & (1 << Chan);
      if (!isOccupied) {
        SUnit *SU = AttemptFillSlot(Chan);
        if (SU) {
          OccupedSlotsMask |= (1 << Chan);
          InstructionsGroupCandidate.push_back(SU->getInstr());
          return SU;
        }
      }
    }
    PrepareNextSlot();
  }
  return NULL;
}

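// FIFO pick from the given queue, refilling it from Pending when it runs dry.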
SUnit* R600SchedStrategy::pickOther(int QID) {
  SUnit *SU = 0;
  std::vector<SUnit *> &AQ = Available[QID];

  if (AQ.empty()) {
    MoveUnits(Pending[QID], AQ);
  }
  if (!AQ.empty()) {
    SU = *AQ.begin();
    AQ.erase(AQ.begin());
  }
  return SU;
}