1/*
2 * Copyright 2011 Christoph Bumiller
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be included in
12 * all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
18 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20 * OTHER DEALINGS IN THE SOFTWARE.
21 */
22
23#include "codegen/nv50_ir.h"
24#include "codegen/nv50_ir_target.h"
25
26#include <algorithm>
27#include <stack>
28#include <limits>
29#if __cplusplus >= 201103L
30#include <unordered_map>
31#else
32#include <tr1/unordered_map>
33#endif
34
35namespace nv50_ir {
36
37#if __cplusplus >= 201103L
38using std::hash;
39using std::unordered_map;
40#else
41using std::tr1::hash;
42using std::tr1::unordered_map;
43#endif
44
// Upper bound on per-file register-unit count; RegisterSet::init() asserts
// that every target file fits below this.
#define MAX_REGISTER_FILE_SIZE 256
46
// Tracks which registers of each register file are occupied. Occupancy is
// kept at the granularity of the file's allocation unit (see 'unit' below):
// one bit per unit, set = occupied.
class RegisterSet
{
public:
   RegisterSet(const Target *);

   void init(const Target *);
   // Clear all occupancy bits of file f; if resetMax is set, also reset the
   // high-water mark of occupied units.
   void reset(DataFile, bool resetMax = false);

   void periodicMask(DataFile f, uint32_t lock, uint32_t unlock);
   void intersect(DataFile f, const RegisterSet *);

   // Find a free range of 'size' units in file f (search bounded by maxReg)
   // and store its start in 'reg'; returns false (reg negative) on failure.
   bool assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg);
   void release(DataFile f, int32_t reg, unsigned int size);
   void occupy(DataFile f, int32_t reg, unsigned int size);
   void occupy(const Value *);
   void occupyMask(DataFile f, int32_t reg, uint8_t mask);
   bool isOccupied(DataFile f, int32_t reg, unsigned int size) const;
   bool testOccupy(const Value *);
   bool testOccupy(DataFile f, int32_t reg, unsigned int size);

   // Highest unit ever occupied in file f (-1 if none yet).
   inline int getMaxAssigned(DataFile f) const { return fill[f]; }

   inline unsigned int getFileSize(DataFile f) const
   {
      return last[f] + 1;
   }

   // Convert a size in bytes to a count of allocation units of file f.
   inline unsigned int units(DataFile f, unsigned int size) const
   {
      return size >> unit[f];
   }
   // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary)
   inline unsigned int idToBytes(const Value *v) const
   {
      return v->reg.data.id * MIN2(v->reg.size, 4);
   }
   inline unsigned int idToUnits(const Value *v) const
   {
      return units(v->reg.file, idToBytes(v));
   }
   inline int bytesToId(Value *v, unsigned int bytes) const
   {
      if (v->reg.size < 4)
         return units(v->reg.file, bytes);
      return bytes / 4;
   }
   inline int unitsToId(DataFile f, int u, uint8_t size) const
   {
      if (u < 0)
         return -1;
      return (size < 4) ? u : ((u << unit[f]) / 4);
   }

   void print(DataFile f) const;

   // True on pre-nvc0 chips (see ctor): with an embedded immediate only half
   // of the GPR range is encodable.
   const bool restrictedGPR16Range;

private:
   BitSet bits[LAST_REGISTER_FILE + 1]; // set bit = occupied unit

   int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity

   int last[LAST_REGISTER_FILE + 1]; // highest valid unit index per file
   int fill[LAST_REGISTER_FILE + 1]; // high-water mark of occupied units
};
112
113void
114RegisterSet::reset(DataFile f, bool resetMax)
115{
116   bits[f].fill(0);
117   if (resetMax)
118      fill[f] = -1;
119}
120
121void
122RegisterSet::init(const Target *targ)
123{
124   for (unsigned int rf = 0; rf <= LAST_REGISTER_FILE; ++rf) {
125      DataFile f = static_cast<DataFile>(rf);
126      last[rf] = targ->getFileSize(f) - 1;
127      unit[rf] = targ->getFileUnit(f);
128      fill[rf] = -1;
129      assert(last[rf] < MAX_REGISTER_FILE_SIZE);
130      bits[rf].allocate(last[rf] + 1, true);
131   }
132}
133
134RegisterSet::RegisterSet(const Target *targ)
135  : restrictedGPR16Range(targ->getChipset() < 0xc0)
136{
137   init(targ);
138   for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i)
139      reset(static_cast<DataFile>(i));
140}
141
// Delegate to BitSet::periodicMask32 — presumably sets the 'lock' bits and
// clears the 'unlock' bits with a 32-unit period across the whole file
// (verify against the BitSet implementation).
void
RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock)
{
   bits[f].periodicMask32(lock, unlock);
}
147
// Intersect the set of *free* registers of file f with that of 'set'.
// Since the bitmaps track occupied units, the intersection of the free
// registers is the union (OR) of the occupied ones.
void
RegisterSet::intersect(DataFile f, const RegisterSet *set)
{
   bits[f] |= set->bits[f];
}
153
// Dump the occupancy bitmap of file f (debug aid).
// Note: the label always reads "GPR:", regardless of which file is printed.
void
RegisterSet::print(DataFile f) const
{
   INFO("GPR:");
   bits[f].print();
   INFO("\n");
}
161
162bool
163RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size, unsigned int maxReg)
164{
165   reg = bits[f].findFreeRange(size, maxReg);
166   if (reg < 0)
167      return false;
168   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
169   return true;
170}
171
// Check whether any unit in [reg, reg + size) of file f is occupied.
bool
RegisterSet::isOccupied(DataFile f, int32_t reg, unsigned int size) const
{
   return bits[f].testRange(reg, size);
}
177
// Occupy the allocation units covered by v's assigned register
// (v->reg.data.id must be valid).
void
RegisterSet::occupy(const Value *v)
{
   occupy(v->reg.file, idToUnits(v), v->reg.size >> unit[v->reg.file]);
}
183
// Occupy individual units selected by 'mask', anchored at the aligned
// 32-unit word containing 'reg' and shifted to reg's position within it.
void
RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask)
{
   bits[f].setMask(reg & ~31, static_cast<uint32_t>(mask) << (reg % 32));
}
189
190void
191RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size)
192{
193   bits[f].setRange(reg, size);
194
195   INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size);
196
197   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
198}
199
// Try to occupy the register units assigned to v; returns false without
// modifying anything if any of them is already taken.
bool
RegisterSet::testOccupy(const Value *v)
{
   return testOccupy(v->reg.file,
                     idToUnits(v), v->reg.size >> unit[v->reg.file]);
}
206
207bool
208RegisterSet::testOccupy(DataFile f, int32_t reg, unsigned int size)
209{
210   if (isOccupied(f, reg, size))
211      return false;
212   occupy(f, reg, size);
213   return true;
214}
215
// Mark [reg, reg + size) of file f as free again.
// Note: the high-water mark (fill) is intentionally not lowered here.
void
RegisterSet::release(DataFile f, int32_t reg, unsigned int size)
{
   bits[f].clrRange(reg, size);

   INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size);
}
223
// Top-level register-allocation driver: runs the preparation passes below
// (phi/argument moves, constraint insertion), builds live sets/intervals,
// and performs the allocation via exec()/execFunc().
class RegAlloc
{
public:
   RegAlloc(Program *program) : prog(program), func(NULL), sequence(0) { }

   bool exec();
   bool execFunc();

private:
   // Splits critical edges and inserts MOVs for phi operands (see the
   // comments on splitEdges() and visit() below).
   class PhiMovesPass : public Pass {
   private:
      virtual bool visit(BasicBlock *);
      inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p);
      inline void splitEdges(BasicBlock *b);
   };

   // Binds OP_CALL inputs/outputs to the physical registers the callee
   // uses, adding MOVs and dummy clobber defs.
   class ArgumentMovesPass : public Pass {
   private:
      virtual bool visit(BasicBlock *);
   };

   // Derives per-value live ranges (Value::livei) from the per-BB live sets.
   class BuildIntervalsPass : public Pass {
   private:
      virtual bool visit(BasicBlock *);
      void collectLiveValues(BasicBlock *);
      void addLiveRange(Value *, const BasicBlock *, int end);
   };

   // Inserts moves and constraints for instructions whose operands must
   // occupy consecutive registers (e.g. texture instructions, merge/split).
   class InsertConstraintsPass : public Pass {
   public:
      InsertConstraintsPass() : targ(NULL) { }
      bool exec(Function *func);
   private:
      virtual bool visit(BasicBlock *);

      void insertConstraintMove(Instruction *, int s);
      bool insertConstraintMoves();

      void condenseDefs(Instruction *);
      void condenseDefs(Instruction *, const int first, const int last);
      void condenseSrcs(Instruction *, const int first, const int last);

      void addHazard(Instruction *i, const ValueRef *src);
      void textureMask(TexInstruction *);
      void addConstraint(Instruction *, int s, int n);
      bool detectConflict(Instruction *, int s);

      // target specific functions, TODO: put in subclass or Target
      void texConstraintNV50(TexInstruction *);
      void texConstraintNVC0(TexInstruction *);
      void texConstraintNVE0(TexInstruction *);
      void texConstraintGM107(TexInstruction *);

      bool isScalarTexGM107(TexInstruction *);
      void handleScalarTexGM107(TexInstruction *);

      std::list<Instruction *> constrList; // instructions with constraints

      const Target *targ;
   };

   bool buildLiveSets(BasicBlock *);

private:
   Program *prog;
   Function *func;

   // instructions in control flow / chronological order
   ArrayList insns;

   int sequence; // for manual passes through CFG
};
296
// A pair of related values; used for the list of values that must be
// spilled (see GCRA::mustSpill and SpillCodeInserter::run).
typedef std::pair<Value *, Value *> ValuePair;
298
// Cache of the def lists of values that get coalesced ("joined") during
// allocation. Lists are edited in the cache first and only written back
// into the values themselves by merge().
class MergedDefs
{
private:
   // Return the cached def list for val, seeding it from val->defs on
   // first access.
   std::list<ValueDef *>& entry(Value *val) {
      auto it = defs.find(val);

      if (it == defs.end()) {
         std::list<ValueDef *> &res = defs[val];
         res = val->defs;
         return res;
      } else {
         return (*it).second;
      }
   }

   std::unordered_map<Value *, std::list<ValueDef *> > defs;

public:
   std::list<ValueDef *>& operator()(Value *val) {
      return entry(val);
   }

   // Append vals to the cached def list of val.
   void add(Value *val, const std::list<ValueDef *> &vals) {
      assert(val);
      std::list<ValueDef *> &valdefs = entry(val);
      valdefs.insert(valdefs.end(), vals.begin(), vals.end());
   }

   // Purge everything related to insn's defs: drop cache entries keyed by
   // the defined values and remove the ValueDef pointers from all remaining
   // lists (presumably because insn is about to be deleted).
   void removeDefsOfInstruction(Instruction *insn) {
      for (int d = 0; insn->defExists(d); ++d) {
         ValueDef *def = &insn->def(d);
         defs.erase(def->get());
         for (auto &p : defs)
            p.second.remove(def);
      }
   }

   // Commit the cached def lists back into the values.
   void merge() {
      for (auto &p : defs)
         p.first->defs = p.second;
   }
};
341
// Assigns stack slots to spilled values and inserts the corresponding
// spill (store) and unspill (load) instructions.
class SpillCodeInserter
{
public:
   SpillCodeInserter(Function *fn, MergedDefs &mergedDefs) : func(fn), mergedDefs(mergedDefs), stackSize(0), stackBase(0) { }

   // Insert spill code for every value pair in the given list.
   bool run(const std::list<ValuePair>&);

   Symbol *assignSlot(const Interval&, const unsigned int size);
   Value *offsetSlot(Value *, const LValue *);
   inline int32_t getStackSize() const { return stackSize; }

private:
   Function *func;
   MergedDefs &mergedDefs;

   // A stack slot, shareable by several spilled values whose live ranges
   // do not overlap.
   struct SpillSlot
   {
      Interval occup;               // when the slot is in use
      std::list<Value *> residents; // needed to recalculate occup
      Symbol *sym;
      int32_t offset;
      inline uint8_t size() const { return sym->reg.size; }
   };
   std::list<SpillSlot> slots;
   int32_t stackSize;
   int32_t stackBase;

   // Reload from / store to the slot around the using/defining instruction.
   LValue *unspill(Instruction *usei, LValue *, Value *slot);
   void spill(Instruction *defi, Value *slot, LValue *);
};
372
373void
374RegAlloc::BuildIntervalsPass::addLiveRange(Value *val,
375                                           const BasicBlock *bb,
376                                           int end)
377{
378   Instruction *insn = val->getUniqueInsn();
379
380   if (!insn)
381      insn = bb->getFirst();
382
383   assert(bb->getFirst()->serial <= bb->getExit()->serial);
384   assert(bb->getExit()->serial + 1 >= end);
385
386   int begin = insn->serial;
387   if (begin < bb->getEntry()->serial || begin > bb->getExit()->serial)
388      begin = bb->getEntry()->serial;
389
390   INFO_DBG(prog->dbgFlags, REG_ALLOC, "%%%i <- live range [%i(%i), %i)\n",
391            val->id, begin, insn->serial, end);
392
393   if (begin != end) // empty ranges are only added as hazards for fixed regs
394      val->livei.extend(begin, end);
395}
396
397bool
398RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)
399{
400   if (b->cfg.incidentCount() <= 1)
401      return false;
402
403   int n = 0;
404   for (Graph::EdgeIterator ei = p->cfg.outgoing(); !ei.end(); ei.next())
405      if (ei.getType() == Graph::Edge::TREE ||
406          ei.getType() == Graph::Edge::FORWARD)
407         ++n;
408   return (n == 2);
409}
410
411struct PhiMapHash {
412   size_t operator()(const std::pair<Instruction *, BasicBlock *>& val) const {
413      return hash<Instruction*>()(val.first) * 31 +
414         hash<BasicBlock*>()(val.second);
415   }
416};
417
// Maps (phi instruction, predecessor BB) -> the phi source arriving via
// that predecessor; keeps sources identifiable while edges are reordered.
typedef unordered_map<
   std::pair<Instruction *, BasicBlock *>, Value *, PhiMapHash> PhiMap;
420
421// Critical edges need to be split up so that work can be inserted along
422// specific edge transitions. Unfortunately manipulating incident edges into a
423// BB invalidates all the PHI nodes since their sources are implicitly ordered
424// by incident edge order.
425//
426// TODO: Make it so that that is not the case, and PHI nodes store pointers to
427// the original BBs.
void
RegAlloc::PhiMovesPass::splitEdges(BasicBlock *bb)
{
   BasicBlock *pb, *pn;
   Instruction *phi;
   Graph::EdgeIterator ei;
   std::stack<BasicBlock *> stack;
   int j = 0;

   // Collect the predecessors whose edge into bb is critical.
   for (ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      pb = BasicBlock::get(ei.getNode());
      assert(pb);
      if (needNewElseBlock(bb, pb))
         stack.push(pb);
   }

   // No critical edges were found, no need to perform any work.
   if (stack.empty())
      return;

   // We're about to, potentially, reorder the inbound edges. This means that
   // we need to hold on to the (phi, bb) -> src mapping, and fix up the phi
   // nodes after the graph has been modified.
   PhiMap phis;

   // Snapshot the current phi sources, keyed by (phi, predecessor block).
   j = 0;
   for (ei = bb->cfg.incident(); !ei.end(); ei.next(), j++) {
      pb = BasicBlock::get(ei.getNode());
      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next)
         phis.insert(std::make_pair(std::make_pair(phi, pb), phi->getSrc(j)));
   }

   while (!stack.empty()) {
      pb = stack.top();
      pn = new BasicBlock(func);
      stack.pop();

      // Insert the new block between pb and bb: pb -> pn -> bb.
      pb->cfg.detach(&bb->cfg);
      pb->cfg.attach(&pn->cfg, Graph::Edge::TREE);
      pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD);

      // Redirect pb's branch target to the new intermediate block.
      assert(pb->getExit()->op != OP_CALL);
      if (pb->getExit()->asFlow()->target.bb == bb)
         pb->getExit()->asFlow()->target.bb = pn;

      // Re-key the snapshot: bb's predecessor is now pn instead of pb.
      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
         PhiMap::iterator it = phis.find(std::make_pair(phi, pb));
         assert(it != phis.end());
         phis.insert(std::make_pair(std::make_pair(phi, pn), it->second));
         phis.erase(it);
      }
   }

   // Now go through and fix up all of the phi node sources.
   j = 0;
   for (ei = bb->cfg.incident(); !ei.end(); ei.next(), j++) {
      pb = BasicBlock::get(ei.getNode());
      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
         PhiMap::const_iterator it = phis.find(std::make_pair(phi, pb));
         assert(it != phis.end());

         phi->setSrc(j, it->second);
      }
   }
}
493
494// For each operand of each PHI in b, generate a new value by inserting a MOV
495// at the end of the block it is coming from and replace the operand with its
496// result. This eliminates liveness conflicts and enables us to let values be
497// copied to the right register if such a conflict exists nonetheless.
498//
499// These MOVs are also crucial in making sure the live intervals of phi srces
500// are extended until the end of the loop, since they are not included in the
501// live-in sets.
bool
RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
{
   Instruction *phi, *mov;

   splitEdges(bb);

   // insert MOVs (phi->src(j) should stem from j-th in-BB)
   int j = 0;
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *pb = BasicBlock::get(ei.getNode());
      // Make sure the predecessor ends in a flow instruction so the MOV can
      // be placed in front of its exit.
      if (!pb->isTerminated())
         pb->insertTail(new_FlowInstruction(func, OP_BRA, bb));

      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
         // tmp <- phi->src(j) at the end of the j-th predecessor; tmp then
         // becomes the new j-th source of the phi.
         LValue *tmp = new_LValue(func, phi->getDef(0)->asLValue());
         mov = new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));

         mov->setSrc(0, phi->getSrc(j));
         mov->setDef(0, tmp);
         phi->setSrc(j, tmp);

         pb->insertBefore(pb->getExit(), mov);
      }
      ++j;
   }

   return true;
}
531
bool
RegAlloc::ArgumentMovesPass::visit(BasicBlock *bb)
{
   // Bind function call inputs/outputs to the same physical register
   // the callee uses, inserting moves as appropriate for the case a
   // conflict arises.
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      FlowInstruction *cal = i->asFlow();
      // TODO: Handle indirect calls.
      // Right now they should only be generated for builtins.
      if (!cal || cal->op != OP_CALL || cal->builtin || cal->indirect)
         continue;
      RegisterSet clobberSet(prog->getTarget());

      // Bind input values.
      // NOTE: cal->indirect is always false here (indirect calls are skipped
      // above), so the s/t offset adjustment is currently inert.
      for (int s = cal->indirect ? 1 : 0; cal->srcExists(s); ++s) {
         const int t = cal->indirect ? (s - 1) : s;
         // tmp is fixed to the register id the callee expects the t-th
         // parameter in; a MOV copies the argument into it.
         LValue *tmp = new_LValue(func, cal->getSrc(s)->asLValue());
         tmp->reg.data.id = cal->target.fn->ins[t].rep()->reg.data.id;

         Instruction *mov =
            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
         mov->setDef(0, tmp);
         mov->setSrc(0, cal->getSrc(s));
         cal->setSrc(s, tmp);

         bb->insertBefore(cal, mov);
      }

      // Bind output values.
      for (int d = 0; cal->defExists(d); ++d) {
         // tmp is fixed to the register id the callee returns the d-th
         // result in; a MOV after the call copies it out.
         LValue *tmp = new_LValue(func, cal->getDef(d)->asLValue());
         tmp->reg.data.id = cal->target.fn->outs[d].rep()->reg.data.id;

         Instruction *mov =
            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
         mov->setSrc(0, tmp);
         mov->setDef(0, cal->getDef(d));
         cal->setDef(d, tmp);

         bb->insertAfter(cal, mov);
         clobberSet.occupy(tmp);
      }

      // Bind clobbered values.
      // Dummy defs for callee-clobbered registers not already covered by the
      // outputs bound above (testOccupy fails for those).
      for (std::deque<Value *>::iterator it = cal->target.fn->clobbers.begin();
           it != cal->target.fn->clobbers.end();
           ++it) {
         if (clobberSet.testOccupy(*it)) {
            Value *tmp = new_LValue(func, (*it)->asLValue());
            tmp->reg.data.id = (*it)->reg.data.id;
            cal->setDef(cal->defCount(), tmp);
         }
      }
   }

   // Update the clobber set of the function.
   if (BasicBlock::get(func->cfgExit) == bb) {
      func->buildDefSets();
      for (unsigned int i = 0; i < bb->defSet.getSize(); ++i)
         if (bb->defSet.test(i))
            func->clobbers.push_back(func->getLValue(i));
   }

   return true;
}
598
// Build the set of live-in variables of bb.
bool
RegAlloc::buildLiveSets(BasicBlock *bb)
{
   Function *f = bb->getFunction();
   BasicBlock *bn;
   Instruction *i;
   unsigned int s, d;

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "buildLiveSets(BB:%i)\n", bb->getId());

   bb->liveSet.allocate(func->allLValues.getSize(), false);

   // Recurse into the successors first ('sequence' guards against
   // revisiting) and accumulate the union of their live-in sets.
   int n = 0;
   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      bn = BasicBlock::get(ei.getNode());
      if (bn == bb)
         continue;
      if (bn->cfg.visit(sequence))
         if (!buildLiveSets(bn))
            return false;
      // The first contribution may be a plain copy, unless bb already holds
      // a partial result from an earlier visit (marker set).
      if (n++ || bb->liveSet.marker)
         bb->liveSet |= bn->liveSet;
      else
         bb->liveSet = bn->liveSet;
   }
   if (!n && !bb->liveSet.marker)
      bb->liveSet.fill(0);
   bb->liveSet.marker = true;

   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
      INFO("BB:%i live set of out blocks:\n", bb->getId());
      bb->liveSet.print();
   }

   // if (!bb->getEntry())
   //   return true;

   // Function outputs are live at the exit block.
   if (bb == BasicBlock::get(f->cfgExit)) {
      for (std::deque<ValueRef>::iterator it = f->outs.begin();
           it != f->outs.end(); ++it) {
         assert(it->get()->asLValue());
         bb->liveSet.set(it->get()->id);
      }
   }

   // Walk the instructions backwards: defs kill liveness, sources create it.
   for (i = bb->getExit(); i && i != bb->getEntry()->prev; i = i->prev) {
      for (d = 0; i->defExists(d); ++d)
         bb->liveSet.clr(i->getDef(d)->id);
      for (s = 0; i->srcExists(s); ++s)
         if (i->getSrc(s)->asLValue())
            bb->liveSet.set(i->getSrc(s)->id);
   }
   // Phi defs are not live-in either; their sources were turned into MOVs in
   // the predecessors by PhiMovesPass.
   for (i = bb->getPhi(); i && i->op == OP_PHI; i = i->next)
      bb->liveSet.clr(i->getDef(0)->id);

   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
      INFO("BB:%i live set after propagation:\n", bb->getId());
      bb->liveSet.print();
   }

   return true;
}
662
// Initialize bb's liveSet to the union of its successors' live sets (or to
// the empty set if bb has only predecessors).
void
RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb)
{
   BasicBlock *bbA = NULL, *bbB = NULL;

   if (bb->cfg.outgoingCount()) {
      // trickery to save a loop of OR'ing liveSets
      // aliasing works fine with BitSet::setOr
      //
      // bbB lags one successor behind the iterator and bbA two; each setOr
      // in the loop folds a pair into bb->liveSet (bbA is then aliased to
      // bb itself), and the final setOr handles the remaining successor —
      // with bbA == NULL it degenerates to a plain copy of bbB's set.
      for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
         if (bbA) {
            bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet);
            bbA = bb;
         } else {
            bbA = bbB;
         }
         bbB = BasicBlock::get(ei.getNode());
      }
      bb->liveSet.setOr(&bbB->liveSet, bbA ? &bbA->liveSet : NULL);
   } else
   if (bb->cfg.incidentCount()) {
      bb->liveSet.fill(0);
   }
}
686
bool
RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb)
{
   collectLiveValues(bb);

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "BuildIntervals(BB:%i)\n", bb->getId());

   // go through out blocks and delete phi sources that do not originate from
   // the current block from the live set
   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      BasicBlock *out = BasicBlock::get(ei.getNode());

      for (Instruction *i = out->getPhi(); i && i->op == OP_PHI; i = i->next) {
         bb->liveSet.clr(i->getDef(0)->id);

         for (int s = 0; i->srcExists(s); ++s) {
            assert(i->src(s).getInsn());
            if (i->getSrc(s)->getUniqueInsn()->bb == bb) // XXX: reachableBy ?
               bb->liveSet.set(i->getSrc(s)->id);
            else
               bb->liveSet.clr(i->getSrc(s)->id);
         }
      }
   }

   // remaining live-outs are live until end
   if (bb->getExit()) {
      for (unsigned int j = 0; j < bb->liveSet.getSize(); ++j)
         if (bb->liveSet.test(j))
            addLiveRange(func->getLValue(j), bb, bb->getExit()->serial + 1);
   }

   // Walk the instructions backwards: a value's range is recorded when it is
   // first (i.e. last in program order) seen live at one of its uses.
   for (Instruction *i = bb->getExit(); i && i->op != OP_PHI; i = i->prev) {
      for (int d = 0; i->defExists(d); ++d) {
         bb->liveSet.clr(i->getDef(d)->id);
         if (i->getDef(d)->reg.data.id >= 0) // add hazard for fixed regs
            i->getDef(d)->livei.extend(i->serial, i->serial);
      }

      for (int s = 0; i->srcExists(s); ++s) {
         if (!i->getSrc(s)->asLValue())
            continue;
         if (!bb->liveSet.test(i->getSrc(s)->id)) {
            bb->liveSet.set(i->getSrc(s)->id);
            addLiveRange(i->getSrc(s), bb, i->serial);
         }
      }
   }

   // Fixed-register function inputs get a hazard range at function entry.
   if (bb == BasicBlock::get(func->cfg.getRoot())) {
      for (std::deque<ValueDef>::iterator it = func->ins.begin();
           it != func->ins.end(); ++it) {
         if (it->get()->reg.data.id >= 0) // add hazard for fixed regs
            it->get()->livei.extend(0, 1);
      }
   }

   return true;
}
746
747
// Bitmask of coalescing strategies selected per chipset in GCRA::coalesce()
// and dispatched by GCRA::doCoalesce().
#define JOIN_MASK_PHI        (1 << 0)
#define JOIN_MASK_UNION      (1 << 1)
#define JOIN_MASK_MOV        (1 << 2)
#define JOIN_MASK_TEX        (1 << 3)
752
// Graph-colouring register allocator: builds a register interference graph
// (RIG) from the values' live intervals, coalesces values, simplifies the
// graph and selects registers; values that cannot be coloured are handed to
// the SpillCodeInserter.
class GCRA
{
public:
   GCRA(Function *, SpillCodeInserter&, MergedDefs&);
   ~GCRA();

   bool allocateRegisters(ArrayList& insns);

   void printNodeInfo() const;

private:
   // One RIG node per (coalesced) LValue; edges model interference.
   class RIG_Node : public Graph::Node
   {
   public:
      RIG_Node();

      void init(const RegisterSet&, LValue *);

      void addInterference(RIG_Node *);
      void addRegPreference(RIG_Node *);

      inline LValue *getValue() const
      {
         return reinterpret_cast<LValue *>(data);
      }
      inline void setValue(LValue *lval) { data = lval; }

      // Mask of the allocation units this node covers, positioned at its
      // assigned register (within an 8-unit window).
      inline uint8_t getCompMask() const
      {
         return ((1 << colors) - 1) << (reg & 7);
      }

      static inline RIG_Node *get(const Graph::EdgeIterator& ei)
      {
         return static_cast<RIG_Node *>(ei.getNode());
      }

   public:
      uint32_t degree;
      uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable
      uint16_t maxReg;      // upper bound on usable units (see init())
      uint16_t colors;      // number of allocation units the value needs

      DataFile f;
      int32_t reg;          // assigned register unit, -1 while uncoloured

      float weight;         // spill cost; init() starts it at infinity

      // list pointers for simplify() phase
      RIG_Node *next;
      RIG_Node *prev;

      // union of the live intervals of all coalesced values (we want to retain
      //  the separate intervals for testing interference of compound values)
      Interval livei;

      std::list<RIG_Node *> prefRegs;
   };

private:
   inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; }

   void buildRIG(ArrayList&);
   bool coalesce(ArrayList&);
   bool doCoalesce(ArrayList&, unsigned int mask);
   void calculateSpillWeights();
   bool simplify();
   bool selectRegisters();
   void cleanup(const bool success);

   void simplifyEdge(RIG_Node *, RIG_Node *);
   void simplifyNode(RIG_Node *);

   bool coalesceValues(Value *, Value *, bool force);
   void resolveSplitsAndMerges();
   void makeCompound(Instruction *, bool isSplit);

   inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&);

   inline void insertOrderedTail(std::list<RIG_Node *>&, RIG_Node *);
   void checkList(std::list<RIG_Node *>&);

private:
   std::stack<uint32_t> stack; // work stack of node ids for simplify/select

   // list headers for simplify() phase
   RIG_Node lo[2];
   RIG_Node hi;

   Graph RIG;
   RIG_Node *nodes;
   unsigned int nodeCount;

   Function *func;
   Program *prog;

   // relDegree[i][j] = j * ceil(i / j): degree contribution of a j-color
   // neighbour to a node that needs i colors.
   struct RelDegree {
      uint8_t data[17][17];

      RelDegree() {
         for (int i = 1; i <= 16; ++i)
            for (int j = 1; j <= 16; ++j)
               data[i][j] = j * ((i + j - 1) / j);
      }

      const uint8_t* operator[](std::size_t i) const {
         return data[i];
      }
   };

   static const RelDegree relDegree;

   RegisterSet regs;

   // need to fixup register id for participants of OP_MERGE/SPLIT
   std::list<Instruction *> merges;
   std::list<Instruction *> splits;

   SpillCodeInserter& spill;
   std::list<ValuePair> mustSpill;

   MergedDefs &mergedDefs;
};
876
// Out-of-class definition of the static degree-contribution table.
const GCRA::RelDegree GCRA::relDegree;
878
// Default node: no value attached, zero colors; next/prev point to the node
// itself so it can double as an empty list header for the simplify() phase.
GCRA::RIG_Node::RIG_Node() : Node(NULL), degree(0), degreeLimit(0), maxReg(0),
   colors(0), f(FILE_NULL), reg(0), weight(0), next(this), prev(this)
{
}
883
884void
885GCRA::printNodeInfo() const
886{
887   for (unsigned int i = 0; i < nodeCount; ++i) {
888      if (!nodes[i].colors)
889         continue;
890      INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X",
891           i,
892           nodes[i].f,nodes[i].reg,nodes[i].colors,
893           nodes[i].weight,
894           nodes[i].degree, nodes[i].degreeLimit);
895
896      for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next())
897         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
898      for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next())
899         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
900      INFO("\n");
901   }
902}
903
904static bool
905isShortRegOp(Instruction *insn)
906{
907   // Immediates are always in src1 (except zeroes, which end up getting
908   // replaced with a zero reg). Every other situation can be resolved by
909   // using a long encoding.
910   return insn->srcExists(1) && insn->src(1).getFile() == FILE_IMMEDIATE &&
911      insn->getSrc(1)->reg.data.u64;
912}
913
914// Check if this LValue is ever used in an instruction that can't be encoded
915// with long registers (i.e. > r63)
916static bool
917isShortRegVal(LValue *lval)
918{
919   if (lval->getInsn() == NULL)
920      return false;
921   for (Value::DefCIterator def = lval->defs.begin();
922        def != lval->defs.end(); ++def)
923      if (isShortRegOp((*def)->getInsn()))
924         return true;
925   for (Value::UseCIterator use = lval->uses.begin();
926        use != lval->uses.end(); ++use)
927      if (isShortRegOp((*use)->getInsn()))
928         return true;
929   return false;
930}
931
// Set up the RIG node for lval: required colors, fixed-register binding,
// register bound and degree limit, and its live interval.
void
GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
{
   setValue(lval);
   // A pre-assigned register id means the value is fixed — never spill it.
   if (lval->reg.data.id >= 0)
      lval->noSpill = lval->fixedReg = 1;

   colors = regs.units(lval->reg.file, lval->reg.size);
   f = lval->reg.file;
   reg = -1;
   if (lval->reg.data.id >= 0)
      reg = regs.idToUnits(lval);

   // Start with infinite spill weight (i.e. not spillable for now).
   weight = std::numeric_limits<float>::infinity();
   degree = 0;
   maxReg = regs.getFileSize(f);
   // On nv50, we lose a bit of gpr encoding when there's an embedded
   // immediate.
   if (regs.restrictedGPR16Range && f == FILE_GPR && (lval->reg.size == 2 || isShortRegVal(lval)))
      maxReg /= 2;
   degreeLimit = maxReg;
   degreeLimit -= relDegree[1][colors] - 1;

   livei.insert(lval->livei);
}
957
// Try to coalesce src into dst so both end up in the same register.
// Returns false (without changing anything) if the values are incompatible;
// with 'force' set, incompatibilities are only warned about.
bool
GCRA::coalesceValues(Value *dst, Value *src, bool force)
{
   LValue *rep = dst->join->asLValue();
   LValue *val = src->join->asLValue();

   // Prefer the value that already has a fixed register as representative.
   if (!force && val->reg.data.id >= 0) {
      rep = src->join->asLValue();
      val = dst->join->asLValue();
   }
   RIG_Node *nRep = &nodes[rep->id];
   RIG_Node *nVal = &nodes[val->id];

   if (src->reg.file != dst->reg.file) {
      if (!force)
         return false;
      WARN("forced coalescing of values in different files !\n");
   }
   if (!force && dst->reg.size != src->reg.size)
      return false;

   // If rep is bound to a fixed register that differs from val's, joining is
   // only possible when val is not itself fixed and does not interfere with
   // any value occupying rep's register.
   if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) {
      if (force) {
         if (val->reg.data.id >= 0)
            WARN("forced coalescing of values in different fixed regs !\n");
      } else {
         if (val->reg.data.id >= 0)
            return false;
         // make sure that there is no overlap with the fixed register of rep
         for (ArrayList::Iterator it = func->allLValues.iterator();
              !it.end(); it.next()) {
            Value *reg = reinterpret_cast<Value *>(it.get())->asLValue();
            assert(reg);
            if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei))
               return false;
         }
      }
   }

   // Live intervals of joined values must not overlap (unless forced).
   if (!force && nRep->livei.overlaps(nVal->livei))
      return false;

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n",
            rep->id, rep->reg.data.id, val->id);

   // set join pointer of all values joined with val
   const std::list<ValueDef *> &defs = mergedDefs(val);
   for (ValueDef *def : defs)
      def->get()->join = rep;
   assert(rep->join == rep && val->join == rep);

   // add val's definitions to rep and extend the live interval of its RIG node
   mergedDefs.add(rep, defs);
   nRep->livei.unify(nVal->livei);
   nRep->degreeLimit = MIN2(nRep->degreeLimit, nVal->degreeLimit);
   nRep->maxReg = MIN2(nRep->maxReg, nVal->maxReg);
   return true;
}
1016
1017bool
1018GCRA::coalesce(ArrayList& insns)
1019{
1020   bool ret = doCoalesce(insns, JOIN_MASK_PHI);
1021   if (!ret)
1022      return false;
1023   switch (func->getProgram()->getTarget()->getChipset() & ~0xf) {
1024   case 0x50:
1025   case 0x80:
1026   case 0x90:
1027   case 0xa0:
1028      ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX);
1029      break;
1030   case 0xc0:
1031   case 0xd0:
1032   case 0xe0:
1033   case 0xf0:
1034   case 0x100:
1035   case 0x110:
1036   case 0x120:
1037   case 0x130:
1038   case 0x140:
1039   case 0x160:
1040      ret = doCoalesce(insns, JOIN_MASK_UNION);
1041      break;
1042   default:
1043      break;
1044   }
1045   if (!ret)
1046      return false;
1047   return doCoalesce(insns, JOIN_MASK_MOV);
1048}
1049
// Build the 8-bit component mask for a sub-value occupying allocation units
// [base, base + size) of a compound value that is compSize units large.
// The per-half replication matches how sub-registers may be placed within
// an aligned register tuple.
static inline uint8_t makeCompMask(int compSize, int base, int size)
{
   uint8_t bits = ((1 << size) - 1) << base;

   if (compSize == 1)
      return 0xff;

   if (compSize == 2) {
      // replicate across all four 2-unit positions
      bits |= bits << 2;
      bits |= bits << 4;
      return bits;
   }

   if (compSize == 3 || compSize == 4) {
      // replicate across both 4-unit halves
      bits |= bits << 4;
      return bits;
   }

   assert(compSize <= 8);
   return bits;
}
1068
1069// Used when coalescing moves. The non-compound value will become one, e.g.:
1070// mov b32 $r0 $r2            / merge b64 $r0d { $r0 $r1 }
1071// split b64 { $r0 $r1 } $r0d / mov b64 $r0d f64 $r2d
1072static inline void copyCompound(Value *dst, Value *src)
1073{
1074   LValue *ldst = dst->asLValue();
1075   LValue *lsrc = src->asLValue();
1076
1077   if (ldst->compound && !lsrc->compound) {
1078      LValue *swap = lsrc;
1079      lsrc = ldst;
1080      ldst = swap;
1081   }
1082
1083   ldst->compound = lsrc->compound;
1084   ldst->compMask = lsrc->compMask;
1085}
1086
// Mark the values merged into (or split out of) @insn as parts of a single
// compound value, assigning each part the component mask that covers its
// position within the compound register.
// For a split the source is the compound value, for a merge the destination.
void
GCRA::makeCompound(Instruction *insn, bool split)
{
   LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue();

   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
      INFO("makeCompound(split = %i): ", split);
      insn->print();
   }

   // total size of the compound value in allocation units (colors)
   const unsigned int size = getNode(rep)->colors;
   unsigned int base = 0;

   if (!rep->compound)
      rep->compMask = 0xff;
   rep->compound = 1;

   // Walk the parts in order; each occupies [base, base + colors) of the
   // compound. Intersect with any mask the part may already carry.
   for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) {
      LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue();

      val->compound = 1;
      if (!val->compMask)
         val->compMask = 0xff;
      val->compMask &= makeCompMask(size, base, getNode(val)->colors);
      assert(val->compMask);

      INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n",
           rep->id, rep->compMask, val->id, val->compMask);

      base += getNode(val)->colors;
   }
   // all parts together must exactly cover the compound
   assert(base == size);
}
1120
// Perform one coalescing pass over all instructions, joining the values
// selected by the JOIN_MASK_* bits in @mask (phi operands, unions / merges /
// splits, texture defs with their sources, plain moves).
// Returns false only if a mandatory (phi) coalesce fails.
bool
GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
{
   int c, n;

   for (n = 0; n < insns.getSize(); ++n) {
      Instruction *i;
      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(n));

      switch (insn->op) {
      case OP_PHI:
         if (!(mask & JOIN_MASK_PHI))
            break;
         // all phi sources must end up in the same register as the def
         for (c = 0; insn->srcExists(c); ++c)
            if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) {
               // this is bad
               ERROR("failed to coalesce phi operands\n");
               return false;
            }
         break;
      case OP_UNION:
      case OP_MERGE:
         if (!(mask & JOIN_MASK_UNION))
            break;
         for (c = 0; insn->srcExists(c); ++c)
            coalesceValues(insn->getDef(0), insn->getSrc(c), true);
         if (insn->op == OP_MERGE) {
            merges.push_back(insn);
            // a merge of more than one source forms a compound value
            if (insn->srcExists(1))
               makeCompound(insn, false);
         }
         break;
      case OP_SPLIT:
         if (!(mask & JOIN_MASK_UNION))
            break;
         splits.push_back(insn);
         // the split defs share the (compound) source's register range
         for (c = 0; insn->defExists(c); ++c)
            coalesceValues(insn->getSrc(0), insn->getDef(c), true);
         makeCompound(insn, true);
         break;
      case OP_MOV:
         if (!(mask & JOIN_MASK_MOV))
            break;
         i = NULL;
         if (!insn->getDef(0)->uses.empty())
            i = (*insn->getDef(0)->uses.begin())->getInsn();
         // if this is a constraint-move there will only be a single use
         if (i && i->op == OP_MERGE) // do we really still need this ?
            break;
         i = insn->getSrc(0)->getUniqueInsn();
         if (i && !i->constrainedDefs()) {
            // join (not forced); on success carry over compound info so
            // the sub-register masks stay consistent
            if (coalesceValues(insn->getDef(0), insn->getSrc(0), false))
               copyCompound(insn->getSrc(0), insn->getDef(0));
         }
         break;
      case OP_TEX:
      case OP_TXB:
      case OP_TXL:
      case OP_TXF:
      case OP_TXQ:
      case OP_TXD:
      case OP_TXG:
      case OP_TXLQ:
      case OP_TEXCSAA:
      case OP_TEXPREP:
         if (!(mask & JOIN_MASK_TEX))
            break;
         // tex defs must reuse the source registers (skip the predicate src)
         for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)
            coalesceValues(insn->getDef(c), insn->getSrc(c), true);
         break;
      default:
         break;
      }
   }
   return true;
}
1197
1198void
1199GCRA::RIG_Node::addInterference(RIG_Node *node)
1200{
1201   this->degree += relDegree[node->colors][colors];
1202   node->degree += relDegree[colors][node->colors];
1203
1204   this->attach(node, Graph::Edge::CROSS);
1205}
1206
// Record that this node would like to be assigned the same register as
// @node (consulted in selectRegisters() before a free register is picked).
void
GCRA::RIG_Node::addRegPreference(RIG_Node *node)
{
   prefRegs.push_back(node);
}
1212
// The RIG node array itself is allocated lazily in allocateRegisters(),
// once the number of LValues is known.
GCRA::GCRA(Function *fn, SpillCodeInserter& spill, MergedDefs& mergedDefs) :
   nodes(NULL),
   nodeCount(0),
   func(fn),
   regs(fn->getProgram()->getTarget()),
   spill(spill),
   mergedDefs(mergedDefs)
{
   prog = func->getProgram();
}
1223
1224GCRA::~GCRA()
1225{
1226   if (nodes)
1227      delete[] nodes;
1228}
1229
1230void
1231GCRA::checkList(std::list<RIG_Node *>& lst)
1232{
1233   GCRA::RIG_Node *prev = NULL;
1234
1235   for (std::list<RIG_Node *>::iterator it = lst.begin();
1236        it != lst.end();
1237        ++it) {
1238      assert((*it)->getValue()->join == (*it)->getValue());
1239      if (prev)
1240         assert(prev->livei.begin() <= (*it)->livei.begin());
1241      prev = *it;
1242   }
1243}
1244
1245void
1246GCRA::insertOrderedTail(std::list<RIG_Node *>& list, RIG_Node *node)
1247{
1248   if (node->livei.isEmpty())
1249      return;
1250   // only the intervals of joined values don't necessarily arrive in order
1251   std::list<RIG_Node *>::iterator prev, it;
1252   for (it = list.end(); it != list.begin(); it = prev) {
1253      prev = it;
1254      --prev;
1255      if ((*prev)->livei.begin() <= node->livei.begin())
1256         break;
1257   }
1258   list.insert(it, node);
1259}
1260
// Build the register interference graph with a linear sweep over the live
// intervals: collect one RIG node per representative value ordered by
// interval start, then test each against the still-active intervals.
void
GCRA::buildRIG(ArrayList& insns)
{
   std::list<RIG_Node *> values, active;

   // function inputs are live from the very start
   for (std::deque<ValueDef>::iterator it = func->ins.begin();
        it != func->ins.end(); ++it)
      insertOrderedTail(values, getNode(it->get()->asLValue()));

   // gather representatives (rep() == self) of allocatable defs, in order
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(i));
      for (int d = 0; insn->defExists(d); ++d)
         if (insn->getDef(d)->reg.file <= LAST_REGISTER_FILE &&
             insn->getDef(d)->rep() == insn->getDef(d))
            insertOrderedTail(values, getNode(insn->getDef(d)->asLValue()));
   }
   checkList(values);

   while (!values.empty()) {
      RIG_Node *cur = values.front();

      for (std::list<RIG_Node *>::iterator it = active.begin();
           it != active.end();) {
         RIG_Node *node = *it;

         // an interval ending before cur starts can never interfere again,
         // since the remaining nodes all start at or after cur
         if (node->livei.end() <= cur->livei.begin()) {
            it = active.erase(it);
         } else {
            // same register file and overlapping lifetimes -> interference
            if (node->f == cur->f && node->livei.overlaps(cur->livei))
               cur->addInterference(node);
            ++it;
         }
      }
      values.pop_front();
      active.push_back(cur);
   }
}
1298
// Compute a spill weight for every RIG node and distribute the nodes over
// the simplification work lists: lo[0]/lo[1] for nodes guaranteed colorable
// (degree < degreeLimit; lo[1] holds values larger than 32 bit), hi for
// potential spill candidates.
void
GCRA::calculateSpillWeights()
{
   for (unsigned int i = 0; i < nodeCount; ++i) {
      RIG_Node *const n = &nodes[i];
      // skip unallocatable or dead nodes
      if (!nodes[i].colors || nodes[i].livei.isEmpty())
         continue;
      if (nodes[i].reg >= 0) {
         // update max reg
         regs.occupy(n->f, n->reg, n->colors);
         continue;
      }
      LValue *val = nodes[i].getValue();

      if (!val->noSpill) {
         // weight = refCount^2 / live range extent: many references over a
         // short interval make a node expensive to spill (see simplify(),
         // which picks the candidate with the lowest weight/degree score)
         int rc = 0;
         for (ValueDef *def : mergedDefs(val))
            rc += def->get()->refCount();

         nodes[i].weight =
            (float)rc * (float)rc / (float)nodes[i].livei.extent();
      }

      if (nodes[i].degree < nodes[i].degreeLimit) {
         int l = 0;
         if (val->reg.size > 4)
            l = 1;
         DLLIST_ADDHEAD(&lo[l], &nodes[i]);
      } else {
         DLLIST_ADDHEAD(&hi, &nodes[i]);
      }
   }
   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
      printNodeInfo();
}
1334
// Node @a is being removed from the graph: subtract its contribution to
// @b's degree. If that pushes @b below its degree limit (i.e. it just
// became guaranteed colorable), move @b from the hi list to the proper
// lo list.
void
GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b)
{
   // remember whether b was above the limit before the update
   bool move = b->degree >= b->degreeLimit;

   INFO_DBG(prog->dbgFlags, REG_ALLOC,
            "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n",
            a->getValue()->id, a->degree, a->degreeLimit,
            b->getValue()->id, b->degree, b->degreeLimit);

   b->degree -= relDegree[a->colors][b->colors];

   // only relist b if it crossed the limit with this update
   move = move && b->degree < b->degreeLimit;
   if (move && !DLLIST_EMPTY(b)) {
      int l = (b->getValue()->reg.size > 4) ? 1 : 0;
      DLLIST_DEL(b);
      DLLIST_ADDTAIL(&lo[l], b);
   }
}
1354
// Remove @node from the interference graph's perspective: lower the degree
// of all its neighbours, unlink it from its work list and push it onto the
// select stack (colors are assigned in reverse order of removal).
void
GCRA::simplifyNode(RIG_Node *node)
{
   // edges may be stored in either direction, visit both
   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
      simplifyEdge(node, RIG_Node::get(ei));

   for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
      simplifyEdge(node, RIG_Node::get(ei));

   DLLIST_DEL(node);
   stack.push(node->getValue()->id);

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n",
            node->getValue()->id,
            (node->degree < node->degreeLimit) ? "" : "(spill)");
}
1371
// Chaitin/Briggs-style SIMPLIFY phase: repeatedly push guaranteed-colorable
// nodes (the lo lists) onto the select stack; when only high-degree nodes
// remain, optimistically push the cheapest spill candidate from hi.
bool
GCRA::simplify()
{
   for (;;) {
      if (!DLLIST_EMPTY(&lo[0])) {
         // drain small (<= 32 bit) colorable values first
         do {
            simplifyNode(lo[0].next);
         } while (!DLLIST_EMPTY(&lo[0]));
      } else
      if (!DLLIST_EMPTY(&lo[1])) {
         // larger values one at a time, re-checking lo[0] in between
         simplifyNode(lo[1].next);
      } else
      if (!DLLIST_EMPTY(&hi)) {
         RIG_Node *best = hi.next;
         unsigned bestMaxReg = best->maxReg;
         float bestScore = best->weight / (float)best->degree;
         // Spill candidate. First go through the ones with the highest max
         // register, then the ones with lower. That way the ones with the
         // lowest requirement will be allocated first, since it's a stack.
         for (RIG_Node *it = best->next; it != &hi; it = it->next) {
            float score = it->weight / (float)it->degree;
            if (score < bestScore || it->maxReg > bestMaxReg) {
               best = it;
               bestScore = score;
               bestMaxReg = it->maxReg;
            }
         }
         // NOTE(review): an infinite score presumably marks unspillable
         // (noSpill) nodes — weight init is outside this chunk; confirm
         if (isinf(bestScore)) {
            ERROR("no viable spill candidates left\n");
            return false;
         }
         simplifyNode(best);
      } else {
         // all work lists empty: every node is on the stack
         return true;
      }
   }
}
1409
// Mark the registers of an already-colored neighbour (edge @ei) of @node as
// occupied. For compound values, only block the sub-registers whose parts
// actually overlap @node's parts in time, using the per-part component
// masks; otherwise block the neighbour's whole register range.
void
GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei)
{
   const RIG_Node *intf = RIG_Node::get(ei);

   // neighbour not colored yet -> nothing to block
   if (intf->reg < 0)
      return;
   LValue *vA = node->getValue();
   LValue *vB = intf->getValue();

   // mask of the neighbour's units within its aligned 8-unit group
   const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7);

   if (vA->compound | vB->compound) {
      // NOTE: this only works for >aligned< register tuples !
      // compare every def merged into vA against every def merged into vB
      for (const ValueDef *D : mergedDefs(vA)) {
      for (const ValueDef *d : mergedDefs(vB)) {
         const LValue *vD = D->get()->asLValue();
         const LValue *vd = d->get()->asLValue();

         // parts that are never live at the same time don't conflict
         if (!vD->livei.overlaps(vd->livei)) {
            INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n",
                     vD->id, vd->id);
            continue;
         }

         uint8_t mask = vD->compound ? vD->compMask : ~0;
         if (vd->compound) {
            assert(vB->compound);
            mask &= vd->compMask & vB->compMask;
         } else {
            mask &= intfMask;
         }

         INFO_DBG(prog->dbgFlags, REG_ALLOC,
                  "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n",
                  vD->id,
                  vD->compound ? vD->compMask : 0xff,
                  vd->id,
                  vd->compound ? vd->compMask : intfMask,
                  vB->compMask, intf->reg & ~7, mask);
         if (mask)
            regs.occupyMask(node->f, intf->reg & ~7, mask);
      }
      }
   } else {
      // simple case: block the neighbour's full register range
      INFO_DBG(prog->dbgFlags, REG_ALLOC,
               "(%%%i) X (%%%i): $r%i + %u\n",
               vA->id, vB->id, intf->reg, intf->colors);
      regs.occupy(node->f, intf->reg, intf->colors);
   }
}
1461
// SELECT phase: pop nodes off the simplification stack (reverse removal
// order), block the registers of already-colored neighbours, then try to
// assign a register — honoring any recorded preference first. Nodes that
// cannot be colored are queued in mustSpill; returns false if any were.
bool
GCRA::selectRegisters()
{
   INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n");

   while (!stack.empty()) {
      RIG_Node *node = &nodes[stack.top()];
      stack.pop();

      regs.reset(node->f);

      INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n",
               node->getValue()->id, node->colors);

      // block the registers of all colored neighbours (both edge directions)
      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
         checkInterference(node, ei);
      for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
         checkInterference(node, ei);

      // try a preferred register (e.g. from addRegPreference) first
      if (!node->prefRegs.empty()) {
         for (std::list<RIG_Node *>::const_iterator it = node->prefRegs.begin();
              it != node->prefRegs.end();
              ++it) {
            if ((*it)->reg >= 0 &&
                regs.testOccupy(node->f, (*it)->reg, node->colors)) {
               node->reg = (*it)->reg;
               break;
            }
         }
      }
      if (node->reg >= 0)
         continue;
      LValue *lval = node->getValue();
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         regs.print(node->f);
      bool ret = regs.assign(node->reg, node->f, node->colors, node->maxReg);
      if (ret) {
         INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
         lval->compMask = node->getCompMask();
      } else {
         INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n",
                  lval->id, lval->reg.size);
         Symbol *slot = NULL;
         // only GPRs get a memory slot; other files spill to GPRs (see run())
         if (lval->reg.file == FILE_GPR)
            slot = spill.assignSlot(node->livei, lval->reg.size);
         mustSpill.push_back(ValuePair(lval, slot));
      }
   }
   if (!mustSpill.empty())
      return false;
   // success: write the chosen register ids back to the values
   for (unsigned int i = 0; i < nodeCount; ++i) {
      LValue *lval = nodes[i].getValue();
      if (nodes[i].reg >= 0 && nodes[i].colors > 0)
         lval->reg.data.id =
            regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size);
   }
   return true;
}
1520
// Top-level graph-coloring run for one function: build RIG nodes for all
// LValues, coalesce, build the interference graph, simplify, and select
// registers. On coloring failure, spill code is inserted and false is
// returned so the caller (RegAlloc::execFunc) can retry.
bool
GCRA::allocateRegisters(ArrayList& insns)
{
   bool ret;

   INFO_DBG(prog->dbgFlags, REG_ALLOC,
            "allocateRegisters to %u instructions\n", insns.getSize());

   nodeCount = func->allLValues.getSize();
   nodes = new RIG_Node[nodeCount];
   if (!nodes)
      return false;
   for (unsigned int i = 0; i < nodeCount; ++i) {
      LValue *lval = reinterpret_cast<LValue *>(func->allLValues.get(i));
      if (lval) {
         nodes[i].init(regs, lval);
         RIG.insert(&nodes[i]);

         // record a dst == src2 register preference for MAD-like ops,
         // which enables shorter/immediate encodings later
         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL) {
            Instruction *insn = lval->getInsn();
            if (insn->op != OP_MAD && insn->op != OP_FMA && insn->op != OP_SAD)
               continue;
            // For both of the cases below, we only want to add the preference
            // if all arguments are in registers.
            if (insn->src(0).getFile() != FILE_GPR ||
                insn->src(1).getFile() != FILE_GPR ||
                insn->src(2).getFile() != FILE_GPR)
               continue;
            if (prog->getTarget()->getChipset() < 0xc0) {
               // Outputting a flag is not supported with short encodings nor
               // with immediate arguments.
               // See handleMADforNV50.
               if (insn->flagsDef >= 0)
                  continue;
            } else {
               // We can only fold immediate arguments if dst == src2. This
               // only matters if one of the first two arguments is an
               // immediate. This form is also only supported for floats.
               // See handleMADforNVC0.
               ImmediateValue imm;
               if (insn->dType != TYPE_F32)
                  continue;
               if (!insn->src(0).getImmediate(imm) &&
                   !insn->src(1).getImmediate(imm))
                  continue;
            }

            nodes[i].addRegPreference(getNode(insn->getSrc(2)->asLValue()));
         }
      }
   }

   // coalesce first, we use only 1 RIG node for a group of joined values
   ret = coalesce(insns);
   if (!ret)
      goto out;

   if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
      func->printLiveIntervals();

   buildRIG(insns);
   calculateSpillWeights();
   ret = simplify();
   if (!ret)
      goto out;

   ret = selectRegisters();
   if (!ret) {
      // coloring failed: insert spill code; caller will rebuild and retry
      INFO_DBG(prog->dbgFlags, REG_ALLOC,
               "selectRegisters failed, inserting spill code ...\n");
      regs.reset(FILE_GPR, true);
      spill.run(mustSpill);
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         func->print();
   } else {
      mergedDefs.merge();
      prog->maxGPR = std::max(prog->maxGPR, regs.getMaxAssigned(FILE_GPR));
   }

out:
   cleanup(ret);
   return ret;
}
1604
// Reset all per-attempt state. On success, propagate the representative's
// register id to every joined value and fix up split/merge operands; on
// failure, undo the coalescing so the next attempt starts clean.
void
GCRA::cleanup(const bool success)
{
   mustSpill.clear();

   for (ArrayList::Iterator it = func->allLValues.iterator();
        !it.end(); it.next()) {
      LValue *lval =  reinterpret_cast<LValue *>(it.get());

      lval->livei.clear();

      lval->compound = 0;
      lval->compMask = 0;

      // representatives already carry their own id
      if (lval->join == lval)
         continue;

      if (success)
         lval->reg.data.id = lval->join->reg.data.id;
      else
         lval->join = lval;
   }

   if (success)
      resolveSplitsAndMerges();
   splits.clear(); // avoid duplicate entries on next coalesce pass
   merges.clear();

   delete[] nodes;
   nodes = NULL;
   // re-initialize the (circular) work lists for the next run
   hi.next = hi.prev = &hi;
   lo[0].next = lo[0].prev = &lo[0];
   lo[1].next = lo[1].prev = &lo[1];
}
1639
1640Symbol *
1641SpillCodeInserter::assignSlot(const Interval &livei, const unsigned int size)
1642{
1643   SpillSlot slot;
1644   int32_t offsetBase = stackSize;
1645   int32_t offset;
1646   std::list<SpillSlot>::iterator pos = slots.end(), it = slots.begin();
1647
1648   if (offsetBase % size)
1649      offsetBase += size - (offsetBase % size);
1650
1651   slot.sym = NULL;
1652
1653   for (offset = offsetBase; offset < stackSize; offset += size) {
1654      const int32_t entryEnd = offset + size;
1655      while (it != slots.end() && it->offset < offset)
1656         ++it;
1657      if (it == slots.end()) // no slots left
1658         break;
1659      std::list<SpillSlot>::iterator bgn = it;
1660
1661      while (it != slots.end() && it->offset < entryEnd) {
1662         it->occup.print();
1663         if (it->occup.overlaps(livei))
1664            break;
1665         ++it;
1666      }
1667      if (it == slots.end() || it->offset >= entryEnd) {
1668         // fits
1669         for (; bgn != slots.end() && bgn->offset < entryEnd; ++bgn) {
1670            bgn->occup.insert(livei);
1671            if (bgn->size() == size)
1672               slot.sym = bgn->sym;
1673         }
1674         break;
1675      }
1676   }
1677   if (!slot.sym) {
1678      stackSize = offset + size;
1679      slot.offset = offset;
1680      slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL);
1681      if (!func->stackPtr)
1682         offset += func->tlsBase;
1683      slot.sym->setAddress(NULL, offset);
1684      slot.sym->reg.size = size;
1685      slots.insert(pos, slot)->occup.insert(livei);
1686   }
1687   return slot.sym;
1688}
1689
1690Value *
1691SpillCodeInserter::offsetSlot(Value *base, const LValue *lval)
1692{
1693   if (!lval->compound || (lval->compMask & 0x1))
1694      return base;
1695   Value *slot = cloneShallow(func, base);
1696
1697   slot->reg.data.offset += (ffs(lval->compMask) - 1) * lval->reg.size;
1698   slot->reg.size = lval->reg.size;
1699
1700   return slot;
1701}
1702
// Insert code right after definition @defi that saves @lval to its spill
// @slot. Local-memory slots use stores (96-bit values are split into 32-bit
// stores); spilling to another register file uses a move/conversion.
void
SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
{
   const DataType ty = typeOfSize(lval->reg.size);

   slot = offsetSlot(slot, lval);

   Instruction *st;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      // don't ever try to spill this value again
      lval->noSpill = 1;
      if (ty != TYPE_B96) {
         st = new_Instruction(func, OP_STORE, ty);
         st->setSrc(0, slot);
         st->setSrc(1, lval);
      } else {
         // TYPE_B96 can't be stored in one go: split into 32-bit parts
         // and emit one store per part
         st = new_Instruction(func, OP_SPLIT, ty);
         st->setSrc(0, lval);
         for (int d = 0; d < lval->reg.size / 4; ++d)
            st->setDef(d, new_LValue(func, FILE_GPR));

         // insert the stores in reverse; since each is inserted right after
         // defi (and the split is inserted last, below), they end up in
         // ascending order after the split
         for (int d = lval->reg.size / 4 - 1; d >= 0; --d) {
            Value *tmp = cloneShallow(func, slot);
            tmp->reg.size = 4;
            tmp->reg.data.offset += 4 * d;

            Instruction *s = new_Instruction(func, OP_STORE, TYPE_U32);
            s->setSrc(0, tmp);
            s->setSrc(1, st->getDef(d));
            defi->bb->insertAfter(defi, s);
         }
      }
   } else {
      // spill to another register file via a conversion/move
      st = new_Instruction(func, OP_CVT, ty);
      st->setDef(0, slot);
      st->setSrc(0, lval);
      if (lval->reg.file == FILE_FLAGS)
         st->flagsSrc = 0;
   }
   defi->bb->insertAfter(defi, st);
}
1743
// Insert code right before use @usei that reloads @lval from its spill
// @slot, returning a fresh temporary holding the value. Local-memory slots
// use loads (96-bit values are re-merged from 32-bit loads); other files
// use a move/conversion.
LValue *
SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
{
   const DataType ty = typeOfSize(lval->reg.size);

   slot = offsetSlot(slot, lval);
   // reload into a fresh temporary, not the original value
   lval = cloneShallow(func, lval);

   Instruction *ld;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      lval->noSpill = 1;
      if (ty != TYPE_B96) {
         ld = new_Instruction(func, OP_LOAD, ty);
      } else {
         // TYPE_B96 can't be loaded in one go: load 32-bit parts and merge
         ld = new_Instruction(func, OP_MERGE, ty);
         for (int d = 0; d < lval->reg.size / 4; ++d) {
            Value *tmp = cloneShallow(func, slot);
            LValue *val;
            tmp->reg.size = 4;
            tmp->reg.data.offset += 4 * d;

            Instruction *l = new_Instruction(func, OP_LOAD, TYPE_U32);
            l->setDef(0, (val = new_LValue(func, FILE_GPR)));
            l->setSrc(0, tmp);
            usei->bb->insertBefore(usei, l);
            ld->setSrc(d, val);
            val->noSpill = 1;
         }
         ld->setDef(0, lval);
         usei->bb->insertBefore(usei, ld);
         return lval;
      }
   } else {
      // reload from another register file via a conversion/move
      ld = new_Instruction(func, OP_CVT, ty);
   }
   ld->setDef(0, lval);
   ld->setSrc(0, slot);
   if (lval->reg.file == FILE_FLAGS)
      ld->flagsDef = 0;

   usei->bb->insertBefore(usei, ld);
   return lval;
}
1787
1788static bool
1789value_cmp(ValueRef *a, ValueRef *b) {
1790   Instruction *ai = a->getInsn(), *bi = b->getInsn();
1791   if (ai->bb != bi->bb)
1792      return ai->bb->getId() < bi->bb->getId();
1793   return ai->serial < bi->serial;
1794}
1795
1796// For each value that is to be spilled, go through all its definitions.
1797// A value can have multiple definitions if it has been coalesced before.
1798// For each definition, first go through all its uses and insert an unspill
1799// instruction before it, then replace the use with the temporary register.
1800// Unspill can be either a load from memory or simply a move to another
1801// register file.
1802// For "Pseudo" instructions (like PHI, SPLIT, MERGE) we can erase the use
1803// if we have spilled to a memory location, or simply with the new register.
1804// No load or conversion instruction should be needed.
1805bool
1806SpillCodeInserter::run(const std::list<ValuePair>& lst)
1807{
1808   for (std::list<ValuePair>::const_iterator it = lst.begin(); it != lst.end();
1809        ++it) {
1810      LValue *lval = it->first->asLValue();
1811      Symbol *mem = it->second ? it->second->asSym() : NULL;
1812
1813      // Keep track of which instructions to delete later. Deleting them
1814      // inside the loop is unsafe since a single instruction may have
1815      // multiple destinations that all need to be spilled (like OP_SPLIT).
1816      unordered_set<Instruction *> to_del;
1817
1818      std::list<ValueDef *> &defs = mergedDefs(lval);
1819      for (Value::DefIterator d = defs.begin(); d != defs.end();
1820           ++d) {
1821         Value *slot = mem ?
1822            static_cast<Value *>(mem) : new_LValue(func, FILE_GPR);
1823         Value *tmp = NULL;
1824         Instruction *last = NULL;
1825
1826         LValue *dval = (*d)->get()->asLValue();
1827         Instruction *defi = (*d)->getInsn();
1828
1829         // Sort all the uses by BB/instruction so that we don't unspill
1830         // multiple times in a row, and also remove a source of
1831         // non-determinism.
1832         std::vector<ValueRef *> refs(dval->uses.begin(), dval->uses.end());
1833         std::sort(refs.begin(), refs.end(), value_cmp);
1834
1835         // Unspill at each use *before* inserting spill instructions,
1836         // we don't want to have the spill instructions in the use list here.
1837         for (std::vector<ValueRef*>::const_iterator it = refs.begin();
1838              it != refs.end(); ++it) {
1839            ValueRef *u = *it;
1840            Instruction *usei = u->getInsn();
1841            assert(usei);
1842            if (usei->isPseudo()) {
1843               tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot;
1844               last = NULL;
1845            } else {
1846               if (!last || (usei != last->next && usei != last))
1847                  tmp = unspill(usei, dval, slot);
1848               last = usei;
1849            }
1850            u->set(tmp);
1851         }
1852
1853         assert(defi);
1854         if (defi->isPseudo()) {
1855            d = defs.erase(d);
1856            --d;
1857            if (slot->reg.file == FILE_MEMORY_LOCAL)
1858               to_del.insert(defi);
1859            else
1860               defi->setDef(0, slot);
1861         } else {
1862            spill(defi, slot, dval);
1863         }
1864      }
1865
1866      for (unordered_set<Instruction *>::const_iterator it = to_del.begin();
1867           it != to_del.end(); ++it) {
1868         mergedDefs.removeDefsOfInstruction(*it);
1869         delete_Instruction(func->getProgram(), *it);
1870      }
1871   }
1872
1873   // TODO: We're not trying to reuse old slots in a potential next iteration.
1874   //  We have to update the slots' livei intervals to be able to do that.
1875   stackBase = stackSize;
1876   slots.clear();
1877   return true;
1878}
1879
1880bool
1881RegAlloc::exec()
1882{
1883   for (IteratorRef it = prog->calls.iteratorDFS(false);
1884        !it->end(); it->next()) {
1885      func = Function::get(reinterpret_cast<Graph::Node *>(it->get()));
1886
1887      func->tlsBase = prog->tlsSize;
1888      if (!execFunc())
1889         return false;
1890      prog->tlsSize += func->tlsSize;
1891   }
1892   return true;
1893}
1894
// Run register allocation for a single function: insert constraint, phi and
// argument moves, then iterate (rebuild live sets -> order instructions ->
// build intervals -> color), inserting spill code and retrying on failure.
bool
RegAlloc::execFunc()
{
   MergedDefs mergedDefs;
   InsertConstraintsPass insertConstr;
   PhiMovesPass insertPhiMoves;
   ArgumentMovesPass insertArgMoves;
   BuildIntervalsPass buildIntervals;
   SpillCodeInserter insertSpills(func, mergedDefs);

   GCRA gcra(func, insertSpills, mergedDefs);

   unsigned int i, retries;
   bool ret;

   if (!func->ins.empty()) {
      // Insert a nop at the entry so inputs only used by the first instruction
      // don't count as having an empty live range.
      Instruction *nop = new_Instruction(func, OP_NOP, TYPE_NONE);
      BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
   }

   ret = insertConstr.exec(func);
   if (!ret)
      goto out;

   ret = insertPhiMoves.run(func);
   if (!ret)
      goto out;

   ret = insertArgMoves.run(func);
   if (!ret)
      goto out;

   // TODO: need to fix up spill slot usage ranges to support > 1 retry
   for (retries = 0; retries < 3; ++retries) {
      if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC))
         INFO("Retry: %i\n", retries);
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         func->print();

      // spilling to registers may add live ranges, need to rebuild everything
      ret = true;
      // iterate once per loop nesting level so live sets reach a fixed point
      for (sequence = func->cfg.nextSequence(), i = 0;
           ret && i <= func->loopNestingBound;
           sequence = func->cfg.nextSequence(), ++i)
         ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot()));
      // reset marker
      for (ArrayList::Iterator bi = func->allBBlocks.iterator();
           !bi.end(); bi.next())
         BasicBlock::get(bi)->liveSet.marker = false;
      if (!ret)
         break;
      func->orderInstructions(this->insns);

      ret = buildIntervals.run(func);
      if (!ret)
         break;
      // allocateRegisters inserts spill code itself when it fails
      ret = gcra.allocateRegisters(insns);
      if (ret)
         break; // success
   }
   INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret);

   // spill slots consumed by this function become part of its TLS size
   func->tlsSize = insertSpills.getStackSize();
out:
   return ret;
}
1963
// TODO: check if modifying Instruction::join here breaks anything
// After successful allocation, assign the individual split defs / merge
// srcs the consecutive sub-registers of the compound value they were
// coalesced into, so subsequent passes see the correct register ids.
void
GCRA::resolveSplitsAndMerges()
{
   for (std::list<Instruction *>::iterator it = splits.begin();
        it != splits.end();
        ++it) {
      Instruction *split = *it;
      // start at the compound source's register and hand out sub-ranges
      unsigned int reg = regs.idToBytes(split->getSrc(0));
      for (int d = 0; split->defExists(d); ++d) {
         Value *v = split->getDef(d);
         v->reg.data.id = regs.bytesToId(v, reg);
         v->join = v;
         reg += v->reg.size;
      }
   }
   splits.clear();

   for (std::list<Instruction *>::iterator it = merges.begin();
        it != merges.end();
        ++it) {
      Instruction *merge = *it;
      // start at the compound destination's register and hand out sub-ranges
      unsigned int reg = regs.idToBytes(merge->getDef(0));
      for (int s = 0; merge->srcExists(s); ++s) {
         Value *v = merge->getSrc(s);
         v->reg.data.id = regs.bytesToId(v, reg);
         v->join = v;
         // If the value is defined by a phi/union node, we also need to
         // perform the same fixup on that node's sources, since after RA
         // their registers should be identical.
         if (v->getInsn()->op == OP_PHI || v->getInsn()->op == OP_UNION) {
            Instruction *phi = v->getInsn();
            for (int phis = 0; phi->srcExists(phis); ++phis) {
               phi->getSrc(phis)->join = v;
               phi->getSrc(phis)->reg.data.id = v->reg.data.id;
            }
         }
         reg += v->reg.size;
      }
   }
   merges.clear();
}
2006
2007bool Program::registerAllocation()
2008{
2009   RegAlloc ra(this);
2010   return ra.exec();
2011}
2012
2013bool
2014RegAlloc::InsertConstraintsPass::exec(Function *ir)
2015{
2016   constrList.clear();
2017
2018   bool ret = run(ir, true, true);
2019   if (ret)
2020      ret = insertConstraintMoves();
2021   return ret;
2022}
2023
2024// TODO: make part of texture insn
2025void
2026RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
2027{
2028   Value *def[4];
2029   int c, k, d;
2030   uint8_t mask = 0;
2031
2032   for (d = 0, k = 0, c = 0; c < 4; ++c) {
2033      if (!(tex->tex.mask & (1 << c)))
2034         continue;
2035      if (tex->getDef(k)->refCount()) {
2036         mask |= 1 << c;
2037         def[d++] = tex->getDef(k);
2038      }
2039      ++k;
2040   }
2041   tex->tex.mask = mask;
2042
2043   for (c = 0; c < d; ++c)
2044      tex->setDef(c, def[c]);
2045   for (; c < 4; ++c)
2046      tex->setDef(c, NULL);
2047}
2048
2049bool
2050RegAlloc::InsertConstraintsPass::detectConflict(Instruction *cst, int s)
2051{
2052   Value *v = cst->getSrc(s);
2053
2054   // current register allocation can't handle it if a value participates in
2055   // multiple constraints
2056   for (Value::UseIterator it = v->uses.begin(); it != v->uses.end(); ++it) {
2057      if (cst != (*it)->getInsn())
2058         return true;
2059   }
2060
2061   // can start at s + 1 because detectConflict is called on all sources
2062   for (int c = s + 1; cst->srcExists(c); ++c)
2063      if (v == cst->getSrc(c))
2064         return true;
2065
2066   Instruction *defi = v->getInsn();
2067
2068   return (!defi || defi->constrainedDefs());
2069}
2070
// Force the n sources of i starting at index s into consecutive registers:
// route them through an OP_CONSTRAINT instruction (reusing an existing,
// identical one when its block dominates i) and make i read the
// constraint's defs instead.
void
RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n)
{
   Instruction *cst;
   int d;

   // first, look for an existing identical constraint op
   for (std::list<Instruction *>::iterator it = constrList.begin();
        it != constrList.end();
        ++it) {
      cst = (*it);
      // stop searching at the first constraint whose block does not
      // dominate i — presumably constrList is kept in dominance order, so
      // no later entry could be reused either; TODO confirm
      if (!i->bb->dominatedBy(cst->bb))
         break;
      // compare all n sources against the candidate's sources
      for (d = 0; d < n; ++d)
         if (cst->getSrc(d) != i->getSrc(d + s))
            break;
      if (d >= n) {
         // exact match: just redirect i to the existing constraint's defs
         for (d = 0; d < n; ++d, ++s)
            i->setSrc(s, cst->getDef(d));
         return;
      }
   }
   cst = new_Instruction(func, OP_CONSTRAINT, i->dType);

   // no reusable constraint: copy each source into a fresh GPR LValue and
   // let i consume those
   for (d = 0; d < n; ++s, ++d) {
      cst->setDef(d, new_LValue(func, FILE_GPR));
      cst->setSrc(d, i->getSrc(s));
      i->setSrc(s, cst->getDef(d));
   }
   i->bb->insertBefore(i, cst);

   constrList.push_back(cst);
}
2104
2105// Add a dummy use of the pointer source of >= 8 byte loads after the load
2106// to prevent it from being assigned a register which overlapping the load's
2107// destination, which would produce random corruptions.
2108void
2109RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src)
2110{
2111   Instruction *hzd = new_Instruction(func, OP_NOP, TYPE_NONE);
2112   hzd->setSrc(0, src->get());
2113   i->bb->insertAfter(i, hzd);
2114
2115}
2116
2117// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q
2118void
2119RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
2120{
2121   int n;
2122   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n);
2123   condenseDefs(insn, 0, n - 1);
2124}
2125
// Replace defs a..b of insn with a single wide GPR value and emit an
// OP_SPLIT after insn that breaks the wide value back into the original
// defs, forcing the individual results into consecutive registers.
void
RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn,
                                              const int a, const int b)
{
   uint8_t size = 0;
   // nothing to condense for zero or one defs
   if (a >= b)
      return;
   for (int s = a; s <= b; ++s)
      size += insn->getDef(s)->reg.size;
   if (!size)
      return;

   // the wide value covering defs a..b
   LValue *lval = new_LValue(func, FILE_GPR);
   lval->reg.size = size;

   Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size));
   split->setSrc(0, lval);
   // move the original defs over to the split, clear them on insn
   for (int d = a; d <= b; ++d) {
      split->setDef(d - a, insn->getDef(d));
      insn->setDef(d, NULL);
   }
   insn->setDef(a, lval);

   // compact any remaining defs (after b) down to directly follow the new
   // wide def
   for (int k = a + 1, d = b + 1; insn->defExists(d); ++d, ++k) {
      insn->setDef(k, insn->getDef(d));
      insn->setDef(d, NULL);
   }
   // carry over predicate if any (mainly for OP_UNION uses)
   split->setPredicate(insn->cc, insn->getPredicate());

   insn->bb->insertAfter(insn, split);
   constrList.push_back(split);
}
2159
// Replace srcs a..b of insn with a single wide GPR value defined by an
// OP_MERGE inserted before insn, forcing the original sources into
// consecutive registers.
void
RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
                                              const int a, const int b)
{
   uint8_t size = 0;
   // nothing to condense for zero or one sources
   if (a >= b)
      return;
   for (int s = a; s <= b; ++s)
      size += insn->getSrc(s)->reg.size;
   if (!size)
      return;
   // the wide value covering srcs a..b
   LValue *lval = new_LValue(func, FILE_GPR);
   lval->reg.size = size;

   // detach the extra sources so the source shuffling below does not
   // disturb them; restored after the rewrite
   Value *save[3];
   insn->takeExtraSources(0, save);

   Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size));
   merge->setDef(0, lval);
   for (int s = a, i = 0; s <= b; ++s, ++i) {
      merge->setSrc(i, insn->getSrc(s));
   }
   // shift trailing sources down by (b - a) slots so the first one lands
   // right after the merged slot (note the negative offset)
   insn->moveSources(b + 1, a - b);
   insn->setSrc(a, lval);
   insn->bb->insertBefore(insn, merge);

   insn->putExtraSources(0, save);

   constrList.push_back(merge);
}
2190
// Check whether this texture fetch qualifies for the GM107 scalar
// encodings (TEXS/TLDS/TLD4S), which support only a restricted set of
// ops, targets, result masks and modes — see the variant list below.
bool
RegAlloc::InsertConstraintsPass::isScalarTexGM107(TexInstruction *tex)
{
   // no indirect sampler/resource handles and no full derivatives
   if (tex->tex.sIndirectSrc >= 0 ||
       tex->tex.rIndirectSrc >= 0 ||
       tex->tex.derivAll)
      return false;

   // result masks 0x5 and 0x6 are not expressible in the scalar forms
   if (tex->tex.mask == 5 || tex->tex.mask == 6)
      return false;

   switch (tex->op) {
   case OP_TEX:
   case OP_TXF:
   case OP_TXG:
   case OP_TXL:
      break;
   default:
      return false;
   }

   // legal variants:
   // TEXS.1D.LZ
   // TEXS.2D
   // TEXS.2D.LZ
   // TEXS.2D.LL
   // TEXS.2D.DC
   // TEXS.2D.LL.DC
   // TEXS.2D.LZ.DC
   // TEXS.A2D
   // TEXS.A2D.LZ
   // TEXS.A2D.LZ.DC
   // TEXS.3D
   // TEXS.3D.LZ
   // TEXS.CUBE
   // TEXS.CUBE.LL

   // TLDS.1D.LZ
   // TLDS.1D.LL
   // TLDS.2D.LZ
   // TLSD.2D.LZ.AOFFI
   // TLDS.2D.LZ.MZ
   // TLDS.2D.LL
   // TLDS.2D.LL.AOFFI
   // TLDS.A2D.LZ
   // TLDS.3D.LZ

   // TLD4S: all 2D/RECT variants and only offset

   switch (tex->op) {
   case OP_TEX:
      if (tex->tex.useOffsets)
         return false;

      switch (tex->tex.target.getEnum()) {
      case TEX_TARGET_1D:
      case TEX_TARGET_2D_ARRAY_SHADOW:
         return tex->tex.levelZero;
      case TEX_TARGET_CUBE:
         return !tex->tex.levelZero;
      case TEX_TARGET_2D:
      case TEX_TARGET_2D_ARRAY:
      case TEX_TARGET_2D_SHADOW:
      case TEX_TARGET_3D:
      case TEX_TARGET_RECT:
      case TEX_TARGET_RECT_SHADOW:
         return true;
      default:
         return false;
      }

   case OP_TXL:
      if (tex->tex.useOffsets)
         return false;

      switch (tex->tex.target.getEnum()) {
      case TEX_TARGET_2D:
      case TEX_TARGET_2D_SHADOW:
      case TEX_TARGET_RECT:
      case TEX_TARGET_RECT_SHADOW:
      case TEX_TARGET_CUBE:
         return true;
      default:
         return false;
      }

   case OP_TXF:
      switch (tex->tex.target.getEnum()) {
      case TEX_TARGET_1D:
         return !tex->tex.useOffsets;
      case TEX_TARGET_2D:
      case TEX_TARGET_RECT:
         return true;
      case TEX_TARGET_2D_ARRAY:
      case TEX_TARGET_2D_MS:
      case TEX_TARGET_3D:
         return !tex->tex.useOffsets && tex->tex.levelZero;
      default:
         return false;
      }

   case OP_TXG:
      if (tex->tex.useOffsets > 1)
         return false;
      if (tex->tex.mask != 0x3 && tex->tex.mask != 0xf)
         return false;

      switch (tex->tex.target.getEnum()) {
      case TEX_TARGET_2D:
      case TEX_TARGET_2D_MS:
      case TEX_TARGET_2D_SHADOW:
      case TEX_TARGET_RECT:
      case TEX_TARGET_RECT_SHADOW:
         return true;
      default:
         return false;
      }

   default:
      return false;
   }
}
2313
2314void
2315RegAlloc::InsertConstraintsPass::handleScalarTexGM107(TexInstruction *tex)
2316{
2317   int defCount = tex->defCount(0xff);
2318   int srcCount = tex->srcCount(0xff);
2319
2320   tex->tex.scalar = true;
2321
2322   // 1. handle defs
2323   if (defCount > 3)
2324      condenseDefs(tex, 2, 3);
2325   if (defCount > 1)
2326      condenseDefs(tex, 0, 1);
2327
2328   // 2. handle srcs
2329   // special case for TXF.A2D
2330   if (tex->op == OP_TXF && tex->tex.target == TEX_TARGET_2D_ARRAY) {
2331      assert(srcCount >= 3);
2332      condenseSrcs(tex, 1, 2);
2333   } else {
2334      if (srcCount > 3)
2335         condenseSrcs(tex, 2, 3);
2336      // only if we have more than 2 sources
2337      if (srcCount > 2)
2338         condenseSrcs(tex, 0, 1);
2339   }
2340
2341   assert(!tex->defExists(2) && !tex->srcExists(2));
2342}
2343
// Insert register constraints for a GM107+ (Maxwell and later) texture or
// surface instruction.
void
RegAlloc::InsertConstraintsPass::texConstraintGM107(TexInstruction *tex)
{
   int n, s;

   if (isTextureOp(tex->op))
      textureMask(tex);

   if (targ->getChipset() < NVISA_GV100_CHIPSET) {
      // pre-GV100: prefer the scalar TEXS/TLDS encodings when legal
      if (isScalarTexGM107(tex)) {
         handleScalarTexGM107(tex);
         return;
      }

      assert(!tex->tex.scalar);
      condenseDefs(tex);
   } else {
      if (isTextureOp(tex->op)) {
         // GV100+: condense texture defs into groups of at most two
         int defCount = tex->defCount(0xff);
         if (defCount > 3)
            condenseDefs(tex, 2, 3);
         if (defCount > 1)
            condenseDefs(tex, 0, 1);
      } else {
         condenseDefs(tex);
      }
   }

   if (isSurfaceOp(tex->op)) {
      // NOTE: these locals shadow the outer n/s used by the texture path
      int s = tex->tex.target.getDim() +
         (tex->tex.target.isArray() || tex->tex.target.isCube());
      int n = 0;

      switch (tex->op) {
      case OP_SUSTB:
      case OP_SUSTP:
         n = 4; // the four data values being stored
         break;
      case OP_SUREDB:
      case OP_SUREDP:
         if (tex->subOp == NV50_IR_SUBOP_ATOM_CAS)
            n = 2; // compare value + swap value
         break;
      default:
         break;
      }

      if (s > 1)
         condenseSrcs(tex, 0, s - 1);
      if (n > 1)
         condenseSrcs(tex, 1, n); // do not condense the tex handle
   } else
   if (isTextureOp(tex->op)) {
      if (tex->op != OP_TXQ) {
         s = tex->tex.target.getArgCount() - tex->tex.target.isMS();
         if (tex->op == OP_TXD) {
            // Indirect handle belongs in the first arg
            if (tex->tex.rIndirectSrc >= 0)
               s++;
            if (!tex->tex.target.isArray() && tex->tex.useOffsets)
               s++;
         }
         n = tex->srcCount(0xff, true) - s;
         // TODO: Is this necessary? Perhaps just has to be aligned to the
         // level that the first arg is, not necessarily to 4. This
         // requirement has not been rigorously verified, as it has been on
         // Kepler.
         if (n > 0 && n < 3) {
            if (tex->srcExists(n + s)) // move potential predicate out of the way
               tex->moveSources(n + s, 3 - n);
            // pad the second argument group to 3 values (mutates n)
            while (n < 3)
               tex->setSrc(s + n++, new_LValue(func, FILE_GPR));
         }
      } else {
         s = tex->srcCount(0xff, true);
         n = 0;
      }

      if (s > 1)
         condenseSrcs(tex, 0, s - 1);
      if (n > 1) // NOTE: first call modified positions already
         condenseSrcs(tex, 1, n);
   }
}
2428
// Insert register constraints for an NVE0 (Kepler) texture instruction:
// arguments are packed into a first group of up to 4 consecutive registers
// and a second group holding the remaining values.
void
RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
{
   if (isTextureOp(tex->op))
      textureMask(tex);
   condenseDefs(tex);

   if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
      // surface stores: the 4 data values go in consecutive registers
      condenseSrcs(tex, 3, 6);
   } else
   if (isTextureOp(tex->op)) {
      int n = tex->srcCount(0xff, true);
      int s = n > 4 ? 4 : n; // size of the first group (max 4)
      if (n > 4 && n < 7) {
         if (tex->srcExists(n)) // move potential predicate out of the way
            tex->moveSources(n, 7 - n);

         // pad the second group with dummy values; NOTE: this bumps n to 7,
         // which the condenseSrcs call below relies on
         while (n < 7)
            tex->setSrc(n++, new_LValue(func, FILE_GPR));
      }
      if (s > 1)
         condenseSrcs(tex, 0, s - 1);
      if (n > 4)
         condenseSrcs(tex, 1, n - s);
   }
}
2455
// Insert register constraints for an NVC0 (Fermi) texture instruction:
// the first s sources and the following n sources must each end up in
// consecutive registers.
void
RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
{
   int n, s;

   if (isTextureOp(tex->op))
      textureMask(tex);

   if (tex->op == OP_TXQ) {
      // queries: all sources form a single group
      s = tex->srcCount(0xff);
      n = 0;
   } else if (isSurfaceOp(tex->op)) {
      // surface ops: coordinates plus array/cube layer, and for stores a
      // second group of 4 data values
      s = tex->tex.target.getDim() + (tex->tex.target.isArray() || tex->tex.target.isCube());
      if (tex->op == OP_SUSTB || tex->op == OP_SUSTP)
         n = 4;
      else
         n = 0;
   } else {
      s = tex->tex.target.getArgCount() - tex->tex.target.isMS();
      // count the indirect handle with the first group on non-array targets
      if (!tex->tex.target.isArray() &&
          (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
         ++s;
      if (tex->op == OP_TXD && tex->tex.useOffsets)
         ++s;
      n = tex->srcCount(0xff) - s;
      assert(n <= 4);
   }

   if (s > 1)
      condenseSrcs(tex, 0, s - 1);
   if (n > 1) // NOTE: first call modified positions already
      condenseSrcs(tex, 1, n);

   condenseDefs(tex);
}
2491
// Insert register constraints for an NV50 texture instruction: sources and
// defs are padded to equal counts and each condensed into one block of
// consecutive registers.
void
RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex)
{
   // temporarily strip the predicate so it is not treated as a source by
   // the loop below; restored afterwards
   Value *pred = tex->getPredicate();
   if (pred)
      tex->setPredicate(tex->cc, NULL);

   textureMask(tex);

   assert(tex->defExists(0) && tex->srcExists(0));
   // make src and def count match
   int c;
   for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
      if (!tex->srcExists(c))
         tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
      else
         insertConstraintMove(tex, c);
      if (!tex->defExists(c))
         tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
   }
   if (pred)
      tex->setPredicate(tex->cc, pred);
   condenseDefs(tex);
   condenseSrcs(tex, 0, c - 1);
}
2517
// Insert constraint markers for instructions whose multiple sources must be
// located in consecutive registers.
bool
RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
{
   TexInstruction *tex;
   Instruction *next;
   int s, size;

   targ = bb->getProgram()->getTarget();

   // iterate via 'next' since the handlers below may insert instructions
   // around i
   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if ((tex = i->asTex())) {
         // mask off the low nibble of the chipset id to select the family
         switch (targ->getChipset() & ~0xf) {
         case 0x50:
         case 0x80:
         case 0x90:
         case 0xa0:
            texConstraintNV50(tex);
            break;
         case 0xc0:
         case 0xd0:
            texConstraintNVC0(tex);
            break;
         case 0xe0:
         case 0xf0:
         case 0x100:
            texConstraintNVE0(tex);
            break;
         case 0x110:
         case 0x120:
         case 0x130:
         case 0x140:
         case 0x160:
            texConstraintGM107(tex);
            break;
         default:
            break;
         }
      } else
      if (i->op == OP_EXPORT || i->op == OP_STORE) {
         // the stored values (sources from index 1 on) must be consecutive;
         // count how many sources it takes to cover dType's size
         for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) {
            assert(i->srcExists(s));
            size -= i->getSrc(s)->reg.size;
         }
         condenseSrcs(i, 1, s - 1);
      } else
      if (i->op == OP_LOAD || i->op == OP_VFETCH) {
         condenseDefs(i);
         // for wide loads, keep the address registers from overlapping the
         // destination (see addHazard)
         if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
            addHazard(i, i->src(0).getIndirect(0));
         if (i->src(0).isIndirect(1) && typeSizeof(i->dType) >= 8)
            addHazard(i, i->src(0).getIndirect(1));
         if (i->op == OP_LOAD && i->fixed && targ->getChipset() < 0xc0) {
            // Add a hazard to make sure we keep the op around. These are used
            // for membars.
            Instruction *nop = new_Instruction(func, OP_NOP, i->dType);
            nop->setSrc(0, i->getDef(0));
            i->bb->insertAfter(i, nop);
         }
      } else
      if (i->op == OP_UNION ||
          i->op == OP_MERGE ||
          i->op == OP_SPLIT) {
         // resolved after RA; remember them for insertConstraintMoves()
         constrList.push_back(i);
      } else
      if (i->op == OP_ATOM && i->subOp == NV50_IR_SUBOP_ATOM_CAS &&
          targ->getChipset() < 0xc0) {
         // Like a hazard, but for a def.
         Instruction *nop = new_Instruction(func, OP_NOP, i->dType);
         nop->setSrc(0, i->getDef(0));
         i->bb->insertAfter(i, nop);
      }
   }
   return true;
}
2596
// Give source s of constraint instruction cst a private value by inserting
// a MOV right before cst — or, when the value comes from an immediate MOV
// or a direct constant-buffer load, by rematerializing/relocating that
// definition instead.
void
RegAlloc::InsertConstraintsPass::insertConstraintMove(Instruction *cst, int s)
{
   const uint8_t size = cst->src(s).getSize();

   assert(cst->getSrc(s)->defs.size() == 1); // still SSA

   Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();

   // definitions that are cheap to rematerialize at the constraint
   bool imm = defi->op == OP_MOV &&
      defi->src(0).getFile() == FILE_IMMEDIATE;
   bool load = defi->op == OP_LOAD &&
      defi->src(0).getFile() == FILE_MEMORY_CONST &&
      !defi->src(0).isIndirect(0);
   // catch some cases where don't really need MOVs
   if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs()) {
      if (imm || load) {
         // Move the defi right before the cst. No point in expanding
         // the range.
         defi->bb->remove(defi);
         cst->bb->insertBefore(cst, defi);
      }
      return;
   }

   LValue *lval = new_LValue(func, cst->src(s).getFile());
   lval->reg.size = size;

   Instruction *mov = new_Instruction(func, OP_MOV, typeOfSize(size));
   mov->setDef(0, lval);
   mov->setSrc(0, cst->getSrc(s));

   // rematerialize instead of copying when the def is an immediate move or
   // a direct constant load
   if (load) {
      mov->op = OP_LOAD;
      mov->setSrc(0, defi->getSrc(0));
   } else if (imm) {
      mov->setSrc(0, defi->getSrc(0));
   }

   // a predicated def must produce a predicated copy as well
   if (defi->getPredicate())
      mov->setPredicate(defi->cc, defi->getPredicate());

   cst->setSrc(s, mov->getDef(0));
   cst->bb->insertBefore(cst, mov);

   cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
}
2644
// Insert extra moves so that, if multiple register constraints on a value are
// in conflict, these conflicts can be resolved.
bool
RegAlloc::InsertConstraintsPass::insertConstraintMoves()
{
   for (std::list<Instruction *>::iterator it = constrList.begin();
        it != constrList.end();
        ++it) {
      Instruction *cst = *it;
      Instruction *mov;

      // NOTE: the "&& false" deliberately disables this branch; the code is
      // kept for reference
      if (cst->op == OP_SPLIT && false) {
         // spilling splits is annoying, just make sure they're separate
         for (int d = 0; cst->defExists(d); ++d) {
            if (!cst->getDef(d)->refCount())
               continue;
            LValue *lval = new_LValue(func, cst->def(d).getFile());
            const uint8_t size = cst->def(d).getSize();
            lval->reg.size = size;

            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
            mov->setSrc(0, lval);
            mov->setDef(0, cst->getDef(d));
            cst->setDef(d, mov->getSrc(0));
            cst->bb->insertAfter(cst, mov);

            cst->getSrc(0)->asLValue()->noSpill = 1;
            mov->getSrc(0)->asLValue()->noSpill = 1;
         }
      } else
      if (cst->op == OP_MERGE || cst->op == OP_UNION) {
         for (int s = 0; cst->srcExists(s); ++s) {
            const uint8_t size = cst->src(s).getSize();

            // a source with no definition (undefined value): give it a
            // dummy def via OP_NOP so RA has something to assign
            if (!cst->getSrc(s)->defs.size()) {
               mov = new_Instruction(func, OP_NOP, typeOfSize(size));
               mov->setDef(0, cst->getSrc(s));
               cst->bb->insertBefore(cst, mov);
               continue;
            }

            insertConstraintMove(cst, s);
         }
      }
   }

   return true;
}
2693
2694} // namespace nv50_ir
2695