/*
 * Copyright (c) 2017 Lima Project
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sub license,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
22 * 23 */ 24 25#include "util/ralloc.h" 26#include "util/register_allocate.h" 27#include "util/u_debug.h" 28 29#include "ppir.h" 30#include "lima_context.h" 31 32#define PPIR_REG_COUNT (6 * 4) 33 34enum ppir_ra_reg_class { 35 ppir_ra_reg_class_vec1, 36 ppir_ra_reg_class_vec2, 37 ppir_ra_reg_class_vec3, 38 ppir_ra_reg_class_vec4, 39 40 /* 4 reg class for load/store instr regs: 41 * load/store instr has no swizzle field, so the (virtual) register 42 * must be allocated at the beginning of a (physical) register, 43 */ 44 ppir_ra_reg_class_head_vec1, 45 ppir_ra_reg_class_head_vec2, 46 ppir_ra_reg_class_head_vec3, 47 ppir_ra_reg_class_head_vec4, 48 49 ppir_ra_reg_class_num, 50}; 51 52struct ra_regs *ppir_regalloc_init(void *mem_ctx) 53{ 54 struct ra_regs *ret = ra_alloc_reg_set(mem_ctx, PPIR_REG_COUNT, false); 55 if (!ret) 56 return NULL; 57 58 /* Classes for contiguous 1-4 channel groups anywhere within a register. */ 59 struct ra_class *classes[ppir_ra_reg_class_num]; 60 for (int i = 0; i < ppir_ra_reg_class_head_vec1; i++) { 61 classes[i] = ra_alloc_contig_reg_class(ret, i + 1); 62 63 for (int j = 0; j < PPIR_REG_COUNT; j += 4) { 64 for (int swiz = 0; swiz < (4 - i); swiz++) 65 ra_class_add_reg(classes[i], j + swiz); 66 } 67 } 68 69 /* Classes for contiguous 1-4 channels with a start channel of .x */ 70 for (int i = ppir_ra_reg_class_head_vec1; i < ppir_ra_reg_class_num; i++) { 71 classes[i] = ra_alloc_contig_reg_class(ret, i - ppir_ra_reg_class_head_vec1 + 1); 72 73 for (int j = 0; j < PPIR_REG_COUNT; j += 4) 74 ra_class_add_reg(classes[i], j); 75 } 76 77 ra_set_finalize(ret, NULL); 78 return ret; 79} 80 81static void ppir_regalloc_update_reglist_ssa(ppir_compiler *comp) 82{ 83 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 84 list_for_each_entry(ppir_node, node, &block->node_list, list) { 85 if (node->is_end) 86 continue; 87 88 if (!node->instr || node->op == ppir_op_const) 89 continue; 90 91 ppir_dest *dest = ppir_node_get_dest(node); 92 if 
(dest) { 93 ppir_reg *reg = NULL; 94 95 if (dest->type == ppir_target_ssa) { 96 reg = &dest->ssa; 97 list_addtail(®->list, &comp->reg_list); 98 comp->reg_num++; 99 } 100 } 101 } 102 } 103} 104 105static void ppir_regalloc_print_result(ppir_compiler *comp) 106{ 107 printf("======ppir regalloc result======\n"); 108 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 109 list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { 110 printf("%03d:", instr->index); 111 for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { 112 ppir_node *node = instr->slots[i]; 113 if (!node) 114 continue; 115 116 printf(" (%d|", node->index); 117 118 ppir_dest *dest = ppir_node_get_dest(node); 119 if (dest) 120 printf("%d", ppir_target_get_dest_reg_index(dest)); 121 122 printf("|"); 123 124 for (int i = 0; i < ppir_node_get_src_num(node); i++) { 125 if (i) 126 printf(" "); 127 printf("%d", ppir_target_get_src_reg_index(ppir_node_get_src(node, i))); 128 } 129 130 printf(")"); 131 } 132 printf("\n"); 133 } 134 } 135 printf("--------------------------\n"); 136} 137 138static bool create_new_instr_after(ppir_block *block, ppir_instr *ref, 139 ppir_node *node) 140{ 141 ppir_instr *newinstr = ppir_instr_create(block); 142 if (unlikely(!newinstr)) 143 return false; 144 145 list_del(&newinstr->list); 146 list_add(&newinstr->list, &ref->list); 147 148 if (!ppir_instr_insert_node(newinstr, node)) 149 return false; 150 151 list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) { 152 instr->seq++; 153 } 154 newinstr->seq = ref->seq+1; 155 newinstr->scheduled = true; 156 return true; 157} 158 159static bool create_new_instr_before(ppir_block *block, ppir_instr *ref, 160 ppir_node *node) 161{ 162 ppir_instr *newinstr = ppir_instr_create(block); 163 if (unlikely(!newinstr)) 164 return false; 165 166 list_del(&newinstr->list); 167 list_addtail(&newinstr->list, &ref->list); 168 169 if (!ppir_instr_insert_node(newinstr, node)) 170 return false; 171 172 
list_for_each_entry_from(ppir_instr, instr, ref, &block->instr_list, list) { 173 instr->seq++; 174 } 175 newinstr->seq = ref->seq-1; 176 newinstr->scheduled = true; 177 return true; 178} 179 180static bool ppir_update_spilled_src(ppir_compiler *comp, ppir_block *block, 181 ppir_node *node, ppir_src *src, 182 ppir_node **fill_node) 183{ 184 /* nodes might have multiple references to the same value. 185 * avoid creating unnecessary loads for the same fill by 186 * saving the node resulting from the temporary load */ 187 if (*fill_node) 188 goto update_src; 189 190 int num_components = src->reg->num_components; 191 192 /* alloc new node to load value */ 193 ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0); 194 if (!load_node) 195 return false; 196 list_addtail(&load_node->list, &node->list); 197 comp->num_fills++; 198 199 ppir_load_node *load = ppir_node_to_load(load_node); 200 201 load->index = -comp->prog->state.stack_size; /* index sizes are negative */ 202 load->num_components = num_components; 203 204 ppir_dest *ld_dest = &load->dest; 205 ld_dest->type = ppir_target_pipeline; 206 ld_dest->pipeline = ppir_pipeline_reg_uniform; 207 ld_dest->write_mask = u_bit_consecutive(0, num_components); 208 209 /* If the uniform slot is empty, we can insert the load_temp 210 * there and use it directly. Exceptionally, if the node is in the 211 * varying or texld slot, this doesn't work. 
*/ 212 if (!node->instr->slots[PPIR_INSTR_SLOT_UNIFORM] && 213 node->instr_pos != PPIR_INSTR_SLOT_VARYING && 214 node->instr_pos != PPIR_INSTR_SLOT_TEXLD) { 215 ppir_node_target_assign(src, load_node); 216 *fill_node = load_node; 217 return ppir_instr_insert_node(node->instr, load_node); 218 } 219 220 /* Uniform slot was taken, so fall back to a new instruction with a mov */ 221 if (!create_new_instr_before(block, node->instr, load_node)) 222 return false; 223 224 /* Create move node */ 225 ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0); 226 if (unlikely(!move_node)) 227 return false; 228 list_addtail(&move_node->list, &node->list); 229 230 ppir_alu_node *move_alu = ppir_node_to_alu(move_node); 231 232 move_alu->num_src = 1; 233 move_alu->src->type = ppir_target_pipeline; 234 move_alu->src->pipeline = ppir_pipeline_reg_uniform; 235 for (int i = 0; i < 4; i++) 236 move_alu->src->swizzle[i] = i; 237 238 ppir_dest *alu_dest = &move_alu->dest; 239 alu_dest->type = ppir_target_ssa; 240 alu_dest->ssa.num_components = num_components; 241 alu_dest->ssa.spilled = true; 242 alu_dest->write_mask = u_bit_consecutive(0, num_components); 243 244 list_addtail(&alu_dest->ssa.list, &comp->reg_list); 245 comp->reg_num++; 246 247 if (!ppir_instr_insert_node(load_node->instr, move_node)) 248 return false; 249 250 /* insert the new node as predecessor */ 251 ppir_node_foreach_pred_safe(node, dep) { 252 ppir_node *pred = dep->pred; 253 ppir_node_remove_dep(dep); 254 ppir_node_add_dep(load_node, pred, ppir_dep_src); 255 } 256 ppir_node_add_dep(node, move_node, ppir_dep_src); 257 ppir_node_add_dep(move_node, load_node, ppir_dep_src); 258 259 *fill_node = move_node; 260 261update_src: 262 /* switch node src to use the fill node dest */ 263 ppir_node_target_assign(src, *fill_node); 264 265 return true; 266} 267 268static bool ppir_update_spilled_dest_load(ppir_compiler *comp, ppir_block *block, 269 ppir_node *node) 270{ 271 ppir_dest *dest = ppir_node_get_dest(node); 
272 assert(dest != NULL); 273 assert(dest->type == ppir_target_register); 274 ppir_reg *reg = dest->reg; 275 int num_components = reg->num_components; 276 277 /* alloc new node to load value */ 278 ppir_node *load_node = ppir_node_create(block, ppir_op_load_temp, -1, 0); 279 if (!load_node) 280 return NULL; 281 list_addtail(&load_node->list, &node->list); 282 comp->num_fills++; 283 284 ppir_load_node *load = ppir_node_to_load(load_node); 285 286 load->index = -comp->prog->state.stack_size; /* index sizes are negative */ 287 load->num_components = num_components; 288 289 load->dest.type = ppir_target_pipeline; 290 load->dest.pipeline = ppir_pipeline_reg_uniform; 291 load->dest.write_mask = u_bit_consecutive(0, num_components); 292 293 /* New instruction is needed since we're updating a dest register 294 * and we can't write to the uniform pipeline reg */ 295 if (!create_new_instr_before(block, node->instr, load_node)) 296 return false; 297 298 /* Create move node */ 299 ppir_node *move_node = ppir_node_create(block, ppir_op_mov, -1 , 0); 300 if (unlikely(!move_node)) 301 return false; 302 list_addtail(&move_node->list, &node->list); 303 304 ppir_alu_node *move_alu = ppir_node_to_alu(move_node); 305 306 move_alu->num_src = 1; 307 move_alu->src->type = ppir_target_pipeline; 308 move_alu->src->pipeline = ppir_pipeline_reg_uniform; 309 for (int i = 0; i < 4; i++) 310 move_alu->src->swizzle[i] = i; 311 312 move_alu->dest.type = ppir_target_register; 313 move_alu->dest.reg = reg; 314 move_alu->dest.write_mask = u_bit_consecutive(0, num_components); 315 316 if (!ppir_instr_insert_node(load_node->instr, move_node)) 317 return false; 318 319 ppir_node_foreach_pred_safe(node, dep) { 320 ppir_node *pred = dep->pred; 321 ppir_node_remove_dep(dep); 322 ppir_node_add_dep(load_node, pred, ppir_dep_src); 323 } 324 ppir_node_add_dep(node, move_node, ppir_dep_src); 325 ppir_node_add_dep(move_node, load_node, ppir_dep_src); 326 327 return true; 328} 329 330static bool 
ppir_update_spilled_dest(ppir_compiler *comp, ppir_block *block, 331 ppir_node *node) 332{ 333 ppir_dest *dest = ppir_node_get_dest(node); 334 assert(dest != NULL); 335 ppir_reg *reg = ppir_dest_get_reg(dest); 336 337 /* alloc new node to store value */ 338 ppir_node *store_node = ppir_node_create(block, ppir_op_store_temp, -1, 0); 339 if (!store_node) 340 return false; 341 list_addtail(&store_node->list, &node->list); 342 comp->num_spills++; 343 344 ppir_store_node *store = ppir_node_to_store(store_node); 345 346 store->index = -comp->prog->state.stack_size; /* index sizes are negative */ 347 348 ppir_node_target_assign(&store->src, node); 349 store->num_components = reg->num_components; 350 351 /* insert the new node as successor */ 352 ppir_node_foreach_succ_safe(node, dep) { 353 ppir_node *succ = dep->succ; 354 ppir_node_remove_dep(dep); 355 ppir_node_add_dep(succ, store_node, ppir_dep_src); 356 } 357 ppir_node_add_dep(store_node, node, ppir_dep_src); 358 359 /* If the store temp slot is empty, we can insert the store_temp 360 * there and use it directly. Exceptionally, if the node is in the 361 * combine slot, this doesn't work. 
*/ 362 if (!node->instr->slots[PPIR_INSTR_SLOT_STORE_TEMP] && 363 node->instr_pos != PPIR_INSTR_SLOT_ALU_COMBINE) 364 return ppir_instr_insert_node(node->instr, store_node); 365 366 /* Not possible to merge store, so fall back to a new instruction */ 367 return create_new_instr_after(block, node->instr, store_node); 368} 369 370static bool ppir_regalloc_spill_reg(ppir_compiler *comp, ppir_reg *chosen) 371{ 372 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 373 list_for_each_entry(ppir_node, node, &block->node_list, list) { 374 375 ppir_dest *dest = ppir_node_get_dest(node); 376 if (dest && ppir_dest_get_reg(dest) == chosen) { 377 /* If dest is a register, it might be updating only some its 378 * components, so need to load the existing value first */ 379 if (dest->type == ppir_target_register) { 380 if (!ppir_update_spilled_dest_load(comp, block, node)) 381 return false; 382 } 383 if (!ppir_update_spilled_dest(comp, block, node)) 384 return false; 385 } 386 387 ppir_node *fill_node = NULL; 388 /* nodes might have multiple references to the same value. 389 * avoid creating unnecessary loads for the same fill by 390 * saving the node resulting from the temporary load */ 391 for (int i = 0; i < ppir_node_get_src_num(node); i++) { 392 ppir_src *src = ppir_node_get_src(node, i); 393 ppir_reg *reg = ppir_src_get_reg(src); 394 if (reg == chosen) { 395 if (!ppir_update_spilled_src(comp, block, node, src, &fill_node)) 396 return false; 397 } 398 } 399 } 400 } 401 402 return true; 403} 404 405static ppir_reg *ppir_regalloc_choose_spill_node(ppir_compiler *comp, 406 struct ra_graph *g) 407{ 408 float spill_costs[comp->reg_num]; 409 /* experimentally determined, it seems to be worth scaling cost of 410 * regs in instructions that have used uniform/store_temp slots, 411 * but not too much as to offset the num_components base cost. 
*/ 412 const float slot_scale = 1.1f; 413 414 list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { 415 if (reg->spilled) { 416 /* not considered for spilling */ 417 spill_costs[reg->regalloc_index] = 0.0f; 418 continue; 419 } 420 421 /* It is beneficial to spill registers with higher component number, 422 * so increase the cost of spilling registers with few components */ 423 float spill_cost = 4.0f / (float)reg->num_components; 424 spill_costs[reg->regalloc_index] = spill_cost; 425 } 426 427 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 428 list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { 429 if (instr->slots[PPIR_INSTR_SLOT_UNIFORM]) { 430 for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { 431 ppir_node *node = instr->slots[i]; 432 if (!node) 433 continue; 434 for (int j = 0; j < ppir_node_get_src_num(node); j++) { 435 ppir_src *src = ppir_node_get_src(node, j); 436 if (!src) 437 continue; 438 ppir_reg *reg = ppir_src_get_reg(src); 439 if (!reg) 440 continue; 441 442 spill_costs[reg->regalloc_index] *= slot_scale; 443 } 444 } 445 } 446 if (instr->slots[PPIR_INSTR_SLOT_STORE_TEMP]) { 447 for (int i = 0; i < PPIR_INSTR_SLOT_NUM; i++) { 448 ppir_node *node = instr->slots[i]; 449 if (!node) 450 continue; 451 ppir_dest *dest = ppir_node_get_dest(node); 452 if (!dest) 453 continue; 454 ppir_reg *reg = ppir_dest_get_reg(dest); 455 if (!reg) 456 continue; 457 458 spill_costs[reg->regalloc_index] *= slot_scale; 459 } 460 } 461 } 462 } 463 464 for (int i = 0; i < comp->reg_num; i++) 465 ra_set_node_spill_cost(g, i, spill_costs[i]); 466 467 int r = ra_get_best_spill_node(g); 468 if (r == -1) 469 return NULL; 470 471 ppir_reg *chosen = NULL; 472 int i = 0; 473 list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { 474 if (i++ == r) { 475 chosen = reg; 476 break; 477 } 478 } 479 assert(chosen); 480 chosen->spilled = true; 481 chosen->is_head = true; /* store_temp unable to do swizzle */ 482 483 return chosen; 484} 485 486static 
void ppir_regalloc_reset_liveness_info(ppir_compiler *comp) 487{ 488 int idx = 0; 489 490 list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { 491 reg->regalloc_index = idx++; 492 } 493 494 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 495 list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { 496 497 if (instr->live_mask) 498 ralloc_free(instr->live_mask); 499 instr->live_mask = rzalloc_array(comp, uint8_t, 500 reg_mask_size(comp->reg_num)); 501 502 if (instr->live_set) 503 ralloc_free(instr->live_set); 504 instr->live_set = rzalloc_array(comp, BITSET_WORD, comp->reg_num); 505 506 if (instr->live_internal) 507 ralloc_free(instr->live_internal); 508 instr->live_internal = rzalloc_array(comp, BITSET_WORD, comp->reg_num); 509 } 510 } 511} 512 513static void ppir_all_interference(ppir_compiler *comp, struct ra_graph *g, 514 BITSET_WORD *liveness) 515{ 516 int i, j; 517 BITSET_FOREACH_SET(i, liveness, comp->reg_num) { 518 BITSET_FOREACH_SET(j, liveness, comp->reg_num) { 519 ra_add_node_interference(g, i, j); 520 } 521 BITSET_CLEAR(liveness, i); 522 } 523} 524 525int lima_ppir_force_spilling = 0; 526 527static bool ppir_regalloc_prog_try(ppir_compiler *comp, bool *spilled) 528{ 529 ppir_regalloc_reset_liveness_info(comp); 530 531 struct ra_graph *g = ra_alloc_interference_graph( 532 comp->ra, comp->reg_num); 533 534 int n = 0; 535 list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { 536 int c = ppir_ra_reg_class_vec1 + (reg->num_components - 1); 537 if (reg->is_head) 538 c += 4; 539 ra_set_node_class(g, n++, ra_get_class_from_index(comp->ra, c)); 540 } 541 542 ppir_liveness_analysis(comp); 543 544 list_for_each_entry(ppir_block, block, &comp->block_list, list) { 545 list_for_each_entry(ppir_instr, instr, &block->instr_list, list) { 546 int i; 547 BITSET_FOREACH_SET(i, instr->live_internal, comp->reg_num) { 548 BITSET_SET(instr->live_set, i); 549 } 550 ppir_all_interference(comp, g, instr->live_set); 551 } 552 } 553 554 
*spilled = false; 555 bool ok = ra_allocate(g); 556 if (!ok || (comp->force_spilling-- > 0)) { 557 ppir_reg *chosen = ppir_regalloc_choose_spill_node(comp, g); 558 if (chosen) { 559 /* stack_size will be used to assemble the frame reg in lima_draw. 560 * It is also be used in the spilling code, as negative indices 561 * starting from -1, to create stack addresses. */ 562 comp->prog->state.stack_size++; 563 if (!ppir_regalloc_spill_reg(comp, chosen)) 564 goto err_out; 565 /* Ask the outer loop to call back in. */ 566 *spilled = true; 567 568 ppir_debug("spilled register %d/%d, num_components: %d\n", 569 chosen->regalloc_index, comp->reg_num, 570 chosen->num_components); 571 goto err_out; 572 } 573 574 ppir_error("regalloc fail\n"); 575 goto err_out; 576 } 577 578 n = 0; 579 list_for_each_entry(ppir_reg, reg, &comp->reg_list, list) { 580 reg->index = ra_get_node_reg(g, n++); 581 } 582 583 ralloc_free(g); 584 585 if (lima_debug & LIMA_DEBUG_PP) 586 ppir_regalloc_print_result(comp); 587 588 return true; 589 590err_out: 591 ralloc_free(g); 592 return false; 593} 594 595bool ppir_regalloc_prog(ppir_compiler *comp) 596{ 597 bool spilled = false; 598 comp->prog->state.stack_size = 0; 599 600 /* Set from an environment variable to force spilling 601 * for debugging purposes, see lima_screen.c */ 602 comp->force_spilling = lima_ppir_force_spilling; 603 604 ppir_regalloc_update_reglist_ssa(comp); 605 606 /* No registers? Probably shader consists of discard instruction */ 607 if (list_is_empty(&comp->reg_list)) 608 return true; 609 610 /* this will most likely succeed in the first 611 * try, except for very complicated shaders */ 612 while (!ppir_regalloc_prog_try(comp, &spilled)) 613 if (!spilled) 614 return false; 615 616 return true; 617} 618