1/*
2 * Copyright (c) 2017 Lima Project
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sub license,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the
12 * next paragraph) shall be included in all copies or substantial portions
13 * of the Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 *
23 */
24
25#include "util/ralloc.h"
26#include "util/register_allocate.h"
27#include "util/u_debug.h"
28
29#include "ppir.h"
30#include "lima_context.h"
31
32#define PPIR_REG_COUNT  (6 * 4)
33
34enum ppir_ra_reg_class {
35   ppir_ra_reg_class_vec1,
36   ppir_ra_reg_class_vec2,
37   ppir_ra_reg_class_vec3,
38   ppir_ra_reg_class_vec4,
39
40   /* 4 reg class for load/store instr regs:
41    * load/store instr has no swizzle field, so the (virtual) register
42    * must be allocated at the beginning of a (physical) register,
43    */
44   ppir_ra_reg_class_head_vec1,
45   ppir_ra_reg_class_head_vec2,
46   ppir_ra_reg_class_head_vec3,
47   ppir_ra_reg_class_head_vec4,
48
49   ppir_ra_reg_class_num,
50};
51
52struct ra_regs *ppir_regalloc_init(void *mem_ctx)
53{
54   struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false);
55   if (!ret)
56      return NULL;
57
58   /* Classes for contiguous 1-4 channel groups anywhere within a register. */
59   struct ra_class *classes[ppir_ra_reg_class_num];
60   for (int i = 0; i < ppir_ra_reg_class_head_vec1; i++) {
61      classes[i] = ra_alloc_contig_reg_class(ret, i + 1);
62
63      for (int j = 0; j < PPIR_REG_COUNT; j += 4) {
64         for (int swiz = 0; swiz < (4 - i); swiz++)
65            ra_class_add_reg(classes[i], j + swiz);
66      }
67   }
68
69   /* Classes for contiguous 1-4 channels with a start channel of .x */
70   for (int i = ppir_ra_reg_class_head_vec1; i < ppir_ra_reg_class_num; i++) {
71      classes[i] = ra_alloc_contig_reg_class(ret, i - ppir_ra_reg_class_head_vec1 + 1);
72
73      for (int j = 0; j < PPIR_REG_COUNT; j += 4)
74         ra_class_add_reg(classes[i], j);
75   }
76
77   ra_set_finalize(ret, NULL);
78   return ret;
79}
80
81static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp)
82{
83   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
84      list_for_each_entry(ppir_node, node, &block->node_list, list) {
85         if (node->is_end)
86            continue;
87
88         if (!node->instr || node->op == ppir_op_const)
89            continue;
90
91         ppir_dest *dest = ppir_node_get_dest(node);
92         if (dest) {
93            ppir_reg *reg = NULL;
94
95            if (dest->type == ppir_target_ssa) {
96               reg = &dest->ssa;
97               list_addtail(&reg->list, &comp->reg_list);
98               comp->reg_num++;
99            }
100         }
101      }
102   }
103}
104
105static void ppir_regalloc_print_result(ppir_compiler *comp)
106{
107   printf("======ppir regalloc result======\n");
108   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
109      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
110         printf("%03d:", instr->index);
111         for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
112            ppir_node *node = instr->slots[i];
113            if (!node)
114               continue;
115
116            printf(" (%d|", node->index);
117
118            ppir_dest *dest = ppir_node_get_dest(node);
119            if (dest)
120               printf("%d", ppir_target_get_dest_reg_index(dest));
121
122            printf("|");
123
124            for (int i = 0; i < ppir_node_get_src_num(node); i++) {
125               if (i)
126                  printf(" ");
127               printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i)));
128            }
129
130            printf(")");
131         }
132         printf("\n");
133      }
134   }
135   printf("--------------------------\n");
136}
137
138static bool create_new_instr_after(ppir_block *block, ppir_instr *ref,
139                                   ppir_node *node)
140{
141   ppir_instr *newinstr = ppir_instr_create(block);
142   if (unlikely(!newinstr))
143      return false;
144
145   list_del(&newinstr->list);
146   list_add(&newinstr->list, &ref->list);
147
148   if (!ppir_instr_insert_node(newinstr, node))
149      return false;
150
151   list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
152      instr->seq++;
153   }
154   newinstr->seq = ref->seq+1;
155   newinstr->scheduled = true;
156   return true;
157}
158
159static bool create_new_instr_before(ppir_block *block, ppir_instr *ref,
160                                    ppir_node *node)
161{
162   ppir_instr *newinstr = ppir_instr_create(block);
163   if (unlikely(!newinstr))
164      return false;
165
166   list_del(&newinstr->list);
167   list_addtail(&newinstr->list, &ref->list);
168
169   if (!ppir_instr_insert_node(newinstr, node))
170      return false;
171
172   list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) {
173      instr->seq++;
174   }
175   newinstr->seq = ref->seq-1;
176   newinstr->scheduled = true;
177   return true;
178}
179
180static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block,
181                                    ppir_node *node, ppir_src *src,
182                                    ppir_node **fill_node)
183{
184   /* nodes might have multiple references to the same value.
185    * avoid creating unnecessary loads for the same fill by
186    * saving the node resulting from the temporary load */
187   if (*fill_node)
188      goto update_src;
189
190   int num_components = src->reg->num_components;
191
192   /* alloc new node to load value */
193   ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
194   if (!load_node)
195      return false;
196   list_addtail(&load_node->list, &node->list);
197   comp->num_fills++;
198
199   ppir_load_node *load = ppir_node_to_load(load_node);
200
201   load->index = -comp->prog->state.stack_size; /* index sizes are negative */
202   load->num_components = num_components;
203
204   ppir_dest *ld_dest = &load->dest;
205   ld_dest->type = ppir_target_pipeline;
206   ld_dest->pipeline = ppir_pipeline_reg_uniform;
207   ld_dest->write_mask = u_bit_consecutive(0, num_components);
208
209   /* If the uniform slot is empty, we can insert the load_temp
210    * there and use it directly. Exceptionally, if the node is in the
211    * varying or texld slot, this doesn't work. */
212   if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] &&
213        node->instr_pos != PPIR_INSTR_SLOT_VARYING &&
214        node->instr_pos != PPIR_INSTR_SLOT_TEXLD) {
215      ppir_node_target_assign(src, load_node);
216      *fill_node = load_node;
217      return ppir_instr_insert_node(node->instr, load_node);
218   }
219
220   /* Uniform slot was taken, so fall back to a new instruction with a mov */
221   if (!create_new_instr_before(block, node->instr, load_node))
222      return false;
223
224   /* Create move node */
225   ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
226   if (unlikely(!move_node))
227      return false;
228   list_addtail(&move_node->list, &node->list);
229
230   ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
231
232   move_alu->num_src = 1;
233   move_alu->src->type = ppir_target_pipeline;
234   move_alu->src->pipeline = ppir_pipeline_reg_uniform;
235   for (int i = 0; i < 4; i++)
236      move_alu->src->swizzle[i] = i;
237
238   ppir_dest *alu_dest = &move_alu->dest;
239   alu_dest->type = ppir_target_ssa;
240   alu_dest->ssa.num_components = num_components;
241   alu_dest->ssa.spilled = true;
242   alu_dest->write_mask = u_bit_consecutive(0, num_components);
243
244   list_addtail(&alu_dest->ssa.list, &comp->reg_list);
245   comp->reg_num++;
246
247   if (!ppir_instr_insert_node(load_node->instr, move_node))
248      return false;
249
250   /* insert the new node as predecessor */
251   ppir_node_foreach_pred_safe(node, dep) {
252      ppir_node *pred = dep->pred;
253      ppir_node_remove_dep(dep);
254      ppir_node_add_dep(load_node, pred, ppir_dep_src);
255   }
256   ppir_node_add_dep(node, move_node, ppir_dep_src);
257   ppir_node_add_dep(move_node, load_node, ppir_dep_src);
258
259   *fill_node = move_node;
260
261update_src:
262   /* switch node src to use the fill node dest */
263   ppir_node_target_assign(src, *fill_node);
264
265   return true;
266}
267
268static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block,
269                                          ppir_node *node)
270{
271   ppir_dest *dest = ppir_node_get_dest(node);
272   assert(dest != NULL);
273   assert(dest->type == ppir_target_register);
274   ppir_reg *reg = dest->reg;
275   int num_components = reg->num_components;
276
277   /* alloc new node to load value */
278   ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0);
279   if (!load_node)
280      return NULL;
281   list_addtail(&load_node->list, &node->list);
282   comp->num_fills++;
283
284   ppir_load_node *load = ppir_node_to_load(load_node);
285
286   load->index = -comp->prog->state.stack_size; /* index sizes are negative */
287   load->num_components = num_components;
288
289   load->dest.type = ppir_target_pipeline;
290   load->dest.pipeline = ppir_pipeline_reg_uniform;
291   load->dest.write_mask = u_bit_consecutive(0, num_components);
292
293   /* New instruction is needed since we're updating a dest register
294    * and we can't write to the uniform pipeline reg */
295   if (!create_new_instr_before(block, node->instr, load_node))
296      return false;
297
298   /* Create move node */
299   ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0);
300   if (unlikely(!move_node))
301      return false;
302   list_addtail(&move_node->list, &node->list);
303
304   ppir_alu_node *move_alu = ppir_node_to_alu(move_node);
305
306   move_alu->num_src = 1;
307   move_alu->src->type = ppir_target_pipeline;
308   move_alu->src->pipeline = ppir_pipeline_reg_uniform;
309   for (int i = 0; i < 4; i++)
310      move_alu->src->swizzle[i] = i;
311
312   move_alu->dest.type = ppir_target_register;
313   move_alu->dest.reg = reg;
314   move_alu->dest.write_mask = u_bit_consecutive(0, num_components);
315
316   if (!ppir_instr_insert_node(load_node->instr, move_node))
317      return false;
318
319   ppir_node_foreach_pred_safe(node, dep) {
320      ppir_node *pred = dep->pred;
321      ppir_node_remove_dep(dep);
322      ppir_node_add_dep(load_node, pred, ppir_dep_src);
323   }
324   ppir_node_add_dep(node, move_node, ppir_dep_src);
325   ppir_node_add_dep(move_node, load_node, ppir_dep_src);
326
327   return true;
328}
329
330static bool ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block,
331                                     ppir_node *node)
332{
333   ppir_dest *dest = ppir_node_get_dest(node);
334   assert(dest != NULL);
335   ppir_reg *reg = ppir_dest_get_reg(dest);
336
337   /* alloc new node to store value */
338   ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0);
339   if (!store_node)
340      return false;
341   list_addtail(&store_node->list, &node->list);
342   comp->num_spills++;
343
344   ppir_store_node *store = ppir_node_to_store(store_node);
345
346   store->index = -comp->prog->state.stack_size; /* index sizes are negative */
347
348   ppir_node_target_assign(&store->src, node);
349   store->num_components = reg->num_components;
350
351   /* insert the new node as successor */
352   ppir_node_foreach_succ_safe(node, dep) {
353      ppir_node *succ = dep->succ;
354      ppir_node_remove_dep(dep);
355      ppir_node_add_dep(succ, store_node, ppir_dep_src);
356   }
357   ppir_node_add_dep(store_node, node, ppir_dep_src);
358
359   /* If the store temp slot is empty, we can insert the store_temp
360    * there and use it directly. Exceptionally, if the node is in the
361    * combine slot, this doesn't work. */
362   if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] &&
363        node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE)
364      return ppir_instr_insert_node(node->instr, store_node);
365
366   /* Not possible to merge store, so fall back to a new instruction */
367   return create_new_instr_after(block, node->instr, store_node);
368}
369
370static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen)
371{
372   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
373      list_for_each_entry(ppir_node, node, &block->node_list, list) {
374
375         ppir_dest *dest = ppir_node_get_dest(node);
376         if (dest && ppir_dest_get_reg(dest) == chosen) {
377            /* If dest is a register, it might be updating only some its
378             * components, so need to load the existing value first */
379            if (dest->type == ppir_target_register) {
380               if (!ppir_update_spilled_dest_load(comp, block, node))
381                  return false;
382            }
383            if (!ppir_update_spilled_dest(comp, block, node))
384               return false;
385         }
386
387         ppir_node *fill_node = NULL;
388         /* nodes might have multiple references to the same value.
389          * avoid creating unnecessary loads for the same fill by
390          * saving the node resulting from the temporary load */
391         for (int i = 0; i < ppir_node_get_src_num(node); i++) {
392            ppir_src *src = ppir_node_get_src(node, i);
393            ppir_reg *reg = ppir_src_get_reg(src);
394            if (reg == chosen) {
395               if (!ppir_update_spilled_src(comp, block, node, src, &fill_node))
396                  return false;
397            }
398         }
399      }
400   }
401
402   return true;
403}
404
405static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp,
406                                                 struct ra_graph *g)
407{
408   float spill_costs[comp->reg_num];
409   /* experimentally determined, it seems to be worth scaling cost of
410    * regs in instructions that have used uniform/store_temp slots,
411    * but not too much as to offset the num_components base cost. */
412   const float slot_scale = 1.1f;
413
414   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
415      if (reg->spilled) {
416         /* not considered for spilling */
417         spill_costs[reg->regalloc_index] = 0.0f;
418         continue;
419      }
420
421      /* It is beneficial to spill registers with higher component number,
422       * so increase the cost of spilling registers with few components */
423      float spill_cost = 4.0f / (float)reg->num_components;
424      spill_costs[reg->regalloc_index] = spill_cost;
425   }
426
427   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
428      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
429         if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) {
430            for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
431               ppir_node *node = instr->slots[i];
432               if (!node)
433                  continue;
434               for (int j = 0; j < ppir_node_get_src_num(node); j++) {
435                  ppir_src *src = ppir_node_get_src(node, j);
436                  if (!src)
437                     continue;
438                  ppir_reg *reg = ppir_src_get_reg(src);
439                  if (!reg)
440                     continue;
441
442                  spill_costs[reg->regalloc_index] *= slot_scale;
443               }
444            }
445         }
446         if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) {
447            for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) {
448               ppir_node *node = instr->slots[i];
449               if (!node)
450                  continue;
451               ppir_dest *dest = ppir_node_get_dest(node);
452               if (!dest)
453                  continue;
454               ppir_reg *reg = ppir_dest_get_reg(dest);
455               if (!reg)
456                  continue;
457
458               spill_costs[reg->regalloc_index] *= slot_scale;
459            }
460         }
461      }
462   }
463
464   for (int i = 0; i < comp->reg_num; i++)
465      ra_set_node_spill_cost(g, i, spill_costs[i]);
466
467   int r = ra_get_best_spill_node(g);
468   if (r == -1)
469      return NULL;
470
471   ppir_reg *chosen = NULL;
472   int i = 0;
473   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
474      if (i++ == r) {
475         chosen = reg;
476         break;
477      }
478   }
479   assert(chosen);
480   chosen->spilled = true;
481   chosen->is_head = true; /* store_temp unable to do swizzle */
482
483   return chosen;
484}
485
486static void ppir_regalloc_reset_liveness_info(ppir_compiler *comp)
487{
488   int idx = 0;
489
490   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
491      reg->regalloc_index = idx++;
492   }
493
494   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
495      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
496
497         if (instr->live_mask)
498            ralloc_free(instr->live_mask);
499         instr->live_mask = rzalloc_array(comp, uint8_t,
500                                          reg_mask_size(comp->reg_num));
501
502         if (instr->live_set)
503            ralloc_free(instr->live_set);
504         instr->live_set = rzalloc_array(comp, BITSET_WORD, comp->reg_num);
505
506         if (instr->live_internal)
507            ralloc_free(instr->live_internal);
508         instr->live_internal = rzalloc_array(comp, BITSET_WORD, comp->reg_num);
509      }
510   }
511}
512
513static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g,
514                                  BITSET_WORD *liveness)
515{
516   int i, j;
517   BITSET_FOREACH_SET(i, liveness, comp->reg_num) {
518      BITSET_FOREACH_SET(j, liveness, comp->reg_num) {
519         ra_add_node_interference(g, i, j);
520      }
521      BITSET_CLEAR(liveness, i);
522   }
523}
524
525int lima_ppir_force_spilling = 0;
526
527static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled)
528{
529   ppir_regalloc_reset_liveness_info(comp);
530
531   struct ra_graph *g = ra_alloc_interference_graph(
532      comp->ra, comp->reg_num);
533
534   int n = 0;
535   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
536      int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1);
537      if (reg->is_head)
538         c += 4;
539      ra_set_node_class(g, n++, ra_get_class_from_index(comp->ra, c));
540   }
541
542   ppir_liveness_analysis(comp);
543
544   list_for_each_entry(ppir_block, block, &comp->block_list, list) {
545      list_for_each_entry(ppir_instr, instr, &block->instr_list, list) {
546         int i;
547         BITSET_FOREACH_SET(i, instr->live_internal, comp->reg_num) {
548            BITSET_SET(instr->live_set, i);
549         }
550         ppir_all_interference(comp, g, instr->live_set);
551      }
552   }
553
554   *spilled = false;
555   bool ok = ra_allocate(g);
556   if (!ok || (comp->force_spilling-- > 0)) {
557      ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g);
558      if (chosen) {
559         /* stack_size will be used to assemble the frame reg in lima_draw.
560          * It is also be used in the spilling code, as negative indices
561          * starting from -1, to create stack addresses. */
562         comp->prog->state.stack_size++;
563         if (!ppir_regalloc_spill_reg(comp, chosen))
564            goto err_out;
565         /* Ask the outer loop to call back in. */
566         *spilled = true;
567
568         ppir_debug("spilled register %d/%d, num_components: %d\n",
569                    chosen->regalloc_index, comp->reg_num,
570                    chosen->num_components);
571         goto err_out;
572      }
573
574      ppir_error("regalloc fail\n");
575      goto err_out;
576   }
577
578   n = 0;
579   list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) {
580      reg->index = ra_get_node_reg(g, n++);
581   }
582
583   ralloc_free(g);
584
585   if (lima_debug & LIMA_DEBUG_PP)
586      ppir_regalloc_print_result(comp);
587
588   return true;
589
590err_out:
591   ralloc_free(g);
592   return false;
593}
594
595bool ppir_regalloc_prog(ppir_compiler *comp)
596{
597   bool spilled = false;
598   comp->prog->state.stack_size = 0;
599
600   /* Set from an environment variable to force spilling
601    * for debugging purposes, see lima_screen.c */
602   comp->force_spilling = lima_ppir_force_spilling;
603
604   ppir_regalloc_update_reglist_ssa(comp);
605
606   /* No registers? Probably shader consists of discard instruction */
607   if (list_is_empty(&comp->reg_list))
608      return true;
609
610   /* this will most likely succeed in the first
611    * try, except for very complicated shaders */
612   while (!ppir_regalloc_prog_try(comp, &spilled))
613      if (!spilled)
614         return false;
615
616   return true;
617}
618