ir3.c revision 7ec681f3
/*
 * Copyright (c) 2012 Rob Clark <robdclark@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "ir3.h"

#include <assert.h>
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "util/bitscan.h"
#include "util/half_float.h"
#include "util/ralloc.h"
#include "util/u_math.h"

#include "instr-a3xx.h"
#include "ir3_shader.h"

/* simple allocator to carve allocations out of an up-front allocated heap,
 * so that we can free everything easily in one shot.
 */
void *
ir3_alloc(struct ir3 *shader, int sz)
{
   return rzalloc_size(shader, sz); /* TODO: don't use rzalloc */
}

struct ir3 *
ir3_create(struct ir3_compiler *compiler, struct ir3_shader_variant *v)
{
   struct ir3 *shader = rzalloc(v, struct ir3);

   shader->compiler = compiler;
   shader->type = v->type;

   list_inithead(&shader->block_list);
   list_inithead(&shader->array_list);

   return shader;
}

void
ir3_destroy(struct ir3 *shader)
{
   ralloc_free(shader);
}

static void
collect_reg_info(struct ir3_instruction *instr, struct ir3_register *reg,
                 struct ir3_info *info)
{
   struct ir3_shader_variant *v = info->data;
   unsigned repeat = instr->repeat;

   if (reg->flags & IR3_REG_IMMED) {
      /* nothing to do */
      return;
   }

   if (!(reg->flags & IR3_REG_R)) {
      repeat = 0;
   }

   unsigned components;
   int16_t max;

   if (reg->flags & IR3_REG_RELATIV) {
      components = reg->size;
      max = (reg->array.base + components - 1);
   } else {
      components = util_last_bit(reg->wrmask);
      max = (reg->num + repeat + components - 1);
   }

   if (reg->flags & IR3_REG_CONST) {
      info->max_const = MAX2(info->max_const, max >> 2);
   } else if (max < regid(48, 0)) {
      if (reg->flags & IR3_REG_HALF) {
         if (v->mergedregs) {
            /* starting w/ a6xx, half regs conflict with full regs: */
            info->max_reg = MAX2(info->max_reg, max >> 3);
         } else {
            info->max_half_reg = MAX2(info->max_half_reg, max >> 2);
         }
      } else {
         info->max_reg = MAX2(info->max_reg, max >> 2);
      }
   }
}
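/* Worked example for the shifts above (illustrative, not from the original
 * source): reg->num packs the vec4 register index in the upper bits and the
 * component in the low two bits, so r3.z is regid(3, 2) = (3 << 2) | 2 = 14
 * and max >> 2 recovers register index 3.  With merged registers two half
 * regs alias one full reg, hence the extra shift: hr7.w = (7 << 2) | 3 = 31,
 * and 31 >> 3 = 3 is the conflicting full-register index.
 */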
bool
ir3_should_double_threadsize(struct ir3_shader_variant *v, unsigned regs_count)
{
   const struct ir3_compiler *compiler = v->shader->compiler;

   /* We can't support more than compiler->branchstack_size diverging threads
    * in a wave. Thus, doubling the threadsize is only possible if we don't
    * exceed the branchstack size limit.
    */
   if (MIN2(v->branchstack, compiler->threadsize_base * 2) >
       compiler->branchstack_size) {
      return false;
   }

   switch (v->type) {
   case MESA_SHADER_COMPUTE: {
      unsigned threads_per_wg =
         v->local_size[0] * v->local_size[1] * v->local_size[2];

      /* For a5xx, if the workgroup size is greater than the maximum number
       * of threads per core with 32 threads per wave (512) then we have to
       * use the doubled threadsize because otherwise the workgroup wouldn't
       * fit. For smaller workgroup sizes, we follow the blob and use the
       * smaller threadsize.
       */
      if (compiler->gen < 6) {
         return v->local_size_variable ||
                threads_per_wg >
                   compiler->threadsize_base * compiler->max_waves;
      }

      /* On a6xx, we prefer the larger threadsize unless the workgroup is
       * small enough that it would be useless. Note that because
       * threadsize_base is bumped to 64, we don't have to worry about the
       * workgroup fitting, unlike the a5xx case.
       */
      if (!v->local_size_variable) {
         if (threads_per_wg <= compiler->threadsize_base)
            return false;
      }
   }
      FALLTHROUGH;
   case MESA_SHADER_FRAGMENT: {
      /* Check that doubling the threadsize wouldn't exceed the regfile size */
      return regs_count * 2 <= compiler->reg_size_vec4;
   }

   default:
      /* On a6xx+, it's impossible to use a doubled wavesize in the geometry
       * stages - the bit doesn't exist. The blob never used it for the VS
       * on earlier gens anyway.
       */
      return false;
   }
}

/* Get the maximum number of waves that could be used even if this shader
 * didn't use any registers.
 */
unsigned
ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v,
                                  bool double_threadsize)
{
   const struct ir3_compiler *compiler = v->shader->compiler;
   unsigned max_waves = compiler->max_waves;

   /* If this is a compute shader, compute the limit based on shared size */
   if (v->type == MESA_SHADER_COMPUTE) {
      /* Shared is allocated in chunks of 1k */
      unsigned shared_per_wg = ALIGN_POT(v->shared_size, 1024);
      if (shared_per_wg > 0 && !v->local_size_variable) {
         unsigned wgs_per_core = compiler->local_mem_size / shared_per_wg;
         unsigned threads_per_wg =
            v->local_size[0] * v->local_size[1] * v->local_size[2];
         unsigned waves_per_wg =
            DIV_ROUND_UP(threads_per_wg, compiler->threadsize_base *
                                            (double_threadsize ? 2 : 1) *
                                            compiler->wave_granularity);
         max_waves = MIN2(max_waves, waves_per_wg * wgs_per_core *
                                        compiler->wave_granularity);
      }
   }

   /* Compute the limit based on branchstack */
   if (v->branchstack > 0) {
      unsigned branchstack_max_waves = compiler->branchstack_size /
                                          v->branchstack *
                                          compiler->wave_granularity;
      max_waves = MIN2(max_waves, branchstack_max_waves);
   }

   return max_waves;
}
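/* Worked example for the shared-size limit above (assumed values, not tied
 * to a specific gen): with local_mem_size = 32k and shared_size = 5k,
 * shared_per_wg rounds up to 6k, so wgs_per_core = 32k / 6k = 5.  A 16x16x1
 * workgroup (256 threads) with threadsize_base = 64, single threadsize and
 * wave_granularity = 2 gives waves_per_wg = DIV_ROUND_UP(256, 64 * 1 * 2)
 * = 2, capping max_waves at 2 * 5 * 2 = 20.
 */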
/* Get the maximum number of waves that could be launched limited by reg size.
 */
unsigned
ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler,
                                unsigned reg_count, bool double_threadsize)
{
   return reg_count ? (compiler->reg_size_vec4 /
                       (reg_count * (double_threadsize ? 2 : 1)) *
                       compiler->wave_granularity)
                    : compiler->max_waves;
}

void
ir3_collect_info(struct ir3_shader_variant *v)
{
   struct ir3_info *info = &v->info;
   struct ir3 *shader = v->ir;
   const struct ir3_compiler *compiler = v->shader->compiler;

   memset(info, 0, sizeof(*info));
   info->data = v;
   info->max_reg = -1;
   info->max_half_reg = -1;
   info->max_const = -1;
   info->multi_dword_ldp_stp = false;

   uint32_t instr_count = 0;
   foreach_block (block, &shader->block_list) {
      foreach_instr (instr, &block->instr_list) {
         instr_count++;
      }
   }

   v->instrlen = DIV_ROUND_UP(instr_count, compiler->instr_align);

   /* Pad out with NOPs to instrlen, including at least 4 so that cffdump
    * doesn't try to decode the following data as instructions (such as the
    * next stage's shader in turnip)
    */
   info->size = MAX2(v->instrlen * compiler->instr_align, instr_count + 4) * 8;
   info->sizedwords = info->size / 4;

   foreach_block (block, &shader->block_list) {
      int sfu_delay = 0;

      foreach_instr (instr, &block->instr_list) {

         foreach_src (reg, instr) {
            collect_reg_info(instr, reg, info);
         }

         foreach_dst (reg, instr) {
            if (is_dest_gpr(reg)) {
               collect_reg_info(instr, reg, info);
            }
         }

         if ((instr->opc == OPC_STP || instr->opc == OPC_LDP)) {
            unsigned components = instr->srcs[2]->uim_val;
            if (components * type_size(instr->cat6.type) > 32) {
               info->multi_dword_ldp_stp = true;
            }

            if (instr->opc == OPC_STP)
               info->stp_count += components;
            else
               info->ldp_count += components;
         }

         if ((instr->opc == OPC_BARY_F) && (instr->dsts[0]->flags & IR3_REG_EI))
            info->last_baryf = info->instrs_count;

         unsigned instrs_count = 1 + instr->repeat + instr->nop;
         unsigned nops_count = instr->nop;

         if (instr->opc == OPC_NOP) {
            nops_count = 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         } else {
            info->instrs_per_cat[opc_cat(instr->opc)] += 1 + instr->repeat;
            info->instrs_per_cat[0] += nops_count;
         }

         if (instr->opc == OPC_MOV) {
            if (instr->cat1.src_type == instr->cat1.dst_type) {
               info->mov_count += 1 + instr->repeat;
            } else {
               info->cov_count += 1 + instr->repeat;
            }
         }

         info->instrs_count += instrs_count;
         info->nops_count += nops_count;

         if (instr->flags & IR3_INSTR_SS) {
            info->ss++;
            info->sstall += sfu_delay;
            sfu_delay = 0;
         }

         if (instr->flags & IR3_INSTR_SY)
            info->sy++;

         if (is_sfu(instr)) {
            sfu_delay = 10;
         } else {
            int n = MIN2(sfu_delay, 1 + instr->repeat + instr->nop);
            sfu_delay -= n;
         }
      }
   }

   /* TODO: for a5xx and below, is there a separate regfile for
    * half-registers?
    */
   unsigned regs_count =
      info->max_reg + 1 +
      (compiler->gen >= 6 ? ((info->max_half_reg + 2) / 2) : 0);

   info->double_threadsize = ir3_should_double_threadsize(v, regs_count);
   unsigned reg_independent_max_waves =
      ir3_get_reg_independent_max_waves(v, info->double_threadsize);
   unsigned reg_dependent_max_waves = ir3_get_reg_dependent_max_waves(
      compiler, regs_count, info->double_threadsize);
   info->max_waves = MIN2(reg_independent_max_waves, reg_dependent_max_waves);
   assert(info->max_waves <= v->shader->compiler->max_waves);
}
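/* Worked example for the padding math above (illustrative numbers): with
 * instr_count = 10 and instr_align = 16, instrlen = 1 and
 * info->size = MAX2(16, 14) * 8 = 128 bytes (instructions are 64 bits
 * each), i.e. sizedwords = 32, leaving 6 trailing NOP slots, comfortably
 * more than the 4 that cffdump needs.
 */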
static struct ir3_register *
reg_create(struct ir3 *shader, int num, int flags)
{
   struct ir3_register *reg = ir3_alloc(shader, sizeof(struct ir3_register));
   reg->wrmask = 1;
   reg->flags = flags;
   reg->num = num;
   return reg;
}

static void
insert_instr(struct ir3_block *block, struct ir3_instruction *instr)
{
   struct ir3 *shader = block->shader;

   instr->serialno = ++shader->instr_count;

   list_addtail(&instr->node, &block->instr_list);

   if (is_input(instr))
      array_insert(shader, shader->baryfs, instr);
}

struct ir3_block *
ir3_block_create(struct ir3 *shader)
{
   struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
#ifdef DEBUG
   block->serialno = ++shader->block_count;
#endif
   block->shader = shader;
   list_inithead(&block->node);
   list_inithead(&block->instr_list);
   return block;
}

void
ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   array_insert(block, block->predecessors, pred);
}

void
ir3_block_add_physical_predecessor(struct ir3_block *block,
                                   struct ir3_block *pred)
{
   array_insert(block, block->physical_predecessors, pred);
}

void
ir3_block_remove_predecessor(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         if (i < block->predecessors_count - 1) {
            block->predecessors[i] =
               block->predecessors[block->predecessors_count - 1];
         }

         block->predecessors_count--;
         return;
      }
   }
}

void
ir3_block_remove_physical_predecessor(struct ir3_block *block,
                                      struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->physical_predecessors_count; i++) {
      if (block->physical_predecessors[i] == pred) {
         if (i < block->physical_predecessors_count - 1) {
            block->physical_predecessors[i] =
               block->physical_predecessors[block->physical_predecessors_count - 1];
         }

         block->physical_predecessors_count--;
         return;
      }
   }
}

unsigned
ir3_block_get_pred_index(struct ir3_block *block, struct ir3_block *pred)
{
   for (unsigned i = 0; i < block->predecessors_count; i++) {
      if (block->predecessors[i] == pred) {
         return i;
      }
   }

   unreachable("ir3_block_get_pred_index() invalid predecessor");
}
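/* Note on the removal helpers above: they do an unordered swap-with-last
 * removal, e.g. (illustrative) predecessors [A, B, C] become [C, B] after
 * removing A.  Predecessor order is therefore not stable across removal,
 * which matters if an index from ir3_block_get_pred_index() is cached.
 */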
static struct ir3_instruction *
instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   /* Add extra sources for array destinations and the address reg */
   if (1 <= opc_cat(opc))
      nsrc += 2;
   struct ir3_instruction *instr;
   unsigned sz = sizeof(*instr) + (ndst * sizeof(instr->dsts[0])) +
                 (nsrc * sizeof(instr->srcs[0]));
   char *ptr = ir3_alloc(block->shader, sz);

   instr = (struct ir3_instruction *)ptr;
   ptr += sizeof(*instr);
   instr->dsts = (struct ir3_register **)ptr;
   instr->srcs = instr->dsts + ndst;

#ifdef DEBUG
   instr->dsts_max = ndst;
   instr->srcs_max = nsrc;
#endif

   return instr;
}

struct ir3_instruction *
ir3_instr_create(struct ir3_block *block, opc_t opc, int ndst, int nsrc)
{
   struct ir3_instruction *instr = instr_create(block, opc, ndst, nsrc);
   instr->block = block;
   instr->opc = opc;
   insert_instr(block, instr);
   return instr;
}

struct ir3_instruction *
ir3_instr_clone(struct ir3_instruction *instr)
{
   struct ir3_instruction *new_instr = instr_create(
      instr->block, instr->opc, instr->dsts_count, instr->srcs_count);
   struct ir3_register **dsts, **srcs;

   dsts = new_instr->dsts;
   srcs = new_instr->srcs;
   *new_instr = *instr;
   new_instr->dsts = dsts;
   new_instr->srcs = srcs;

   insert_instr(instr->block, new_instr);

   /* clone registers: */
   new_instr->dsts_count = 0;
   new_instr->srcs_count = 0;
   foreach_dst (reg, instr) {
      struct ir3_register *new_reg =
         ir3_dst_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
      if (new_reg->instr)
         new_reg->instr = new_instr;
   }
   foreach_src (reg, instr) {
      struct ir3_register *new_reg =
         ir3_src_create(new_instr, reg->num, reg->flags);
      *new_reg = *reg;
   }

   if (instr->address) {
      assert(instr->srcs_count > 0);
      new_instr->address = new_instr->srcs[instr->srcs_count - 1];
   }

   return new_instr;
}

/* Add a false dependency to instruction, to ensure it is scheduled first: */
void
ir3_instr_add_dep(struct ir3_instruction *instr, struct ir3_instruction *dep)
{
   for (unsigned i = 0; i < instr->deps_count; i++) {
      if (instr->deps[i] == dep)
         return;
   }

   array_insert(instr, instr->deps, dep);
}
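/* Allocation layout used by instr_create() above (sketch):
 *
 *    [struct ir3_instruction][dsts[0..ndst-1]][srcs[0..nsrc-1]]
 *
 * i.e. a single ir3_alloc() carries the instruction plus both
 * register-pointer arrays, so dsts/srcs need no separate lifetime
 * management and everything is freed together with the shader.
 */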
struct ir3_register *
ir3_src_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->srcs_count < instr->srcs_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->srcs[instr->srcs_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_dst_create(struct ir3_instruction *instr, int num, int flags)
{
   struct ir3 *shader = instr->block->shader;
#ifdef DEBUG
   debug_assert(instr->dsts_count < instr->dsts_max);
#endif
   struct ir3_register *reg = reg_create(shader, num, flags);
   instr->dsts[instr->dsts_count++] = reg;
   return reg;
}

struct ir3_register *
ir3_reg_clone(struct ir3 *shader, struct ir3_register *reg)
{
   struct ir3_register *new_reg = reg_create(shader, 0, 0);
   *new_reg = *reg;
   return new_reg;
}

void
ir3_reg_set_last_array(struct ir3_instruction *instr, struct ir3_register *reg,
                       struct ir3_register *last_write)
{
   assert(reg->flags & IR3_REG_ARRAY);
   struct ir3_register *new_reg = ir3_src_create(instr, 0, 0);
   *new_reg = *reg;
   new_reg->def = last_write;
   ir3_reg_tie(reg, new_reg);
}

void
ir3_instr_set_address(struct ir3_instruction *instr,
                      struct ir3_instruction *addr)
{
   if (!instr->address) {
      struct ir3 *ir = instr->block->shader;

      debug_assert(instr->block == addr->block);

      instr->address =
         ir3_src_create(instr, addr->dsts[0]->num, addr->dsts[0]->flags);
      instr->address->def = addr->dsts[0];
      debug_assert(reg_num(addr->dsts[0]) == REG_A0);
      unsigned comp = reg_comp(addr->dsts[0]);
      if (comp == 0) {
         array_insert(ir, ir->a0_users, instr);
      } else {
         debug_assert(comp == 1);
         array_insert(ir, ir->a1_users, instr);
      }
   } else {
      debug_assert(instr->address->def->instr == addr);
   }
}

void
ir3_block_clear_mark(struct ir3_block *block)
{
   foreach_instr (instr, &block->instr_list)
      instr->flags &= ~IR3_INSTR_MARK;
}

void
ir3_clear_mark(struct ir3 *ir)
{
   foreach_block (block, &ir->block_list) {
      ir3_block_clear_mark(block);
   }
}

unsigned
ir3_count_instructions(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt;
   }
   return cnt;
}

/* When counting instructions for RA, we insert extra fake instructions at the
 * beginning of each block, where values become live, and at the end where
 * values die. This prevents problems where values live-in at the beginning or
 * live-out at the end of a block from being treated as if they were
 * live-in/live-out at the first/last instruction, which would be incorrect.
 * In ir3_legalize these ip's are assumed to be actual ip's of the final
 * program, so it would be incorrect to use this everywhere.
 */

unsigned
ir3_count_instructions_ra(struct ir3 *ir)
{
   unsigned cnt = 1;
   foreach_block (block, &ir->block_list) {
      block->start_ip = cnt++;
      foreach_instr (instr, &block->instr_list) {
         instr->ip = cnt++;
      }
      block->end_ip = cnt++;
   }
   return cnt;
}

struct ir3_array *
ir3_lookup_array(struct ir3 *ir, unsigned id)
{
   foreach_array (arr, &ir->array_list)
      if (arr->id == id)
         return arr;
   return NULL;
}

void
ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps)
{
   /* We could do this in a single pass if we can assume instructions
    * are always sorted. Which currently might not always be true.
    * (In particular after ir3_group pass, but maybe other places.)
    */
   foreach_block (block, &ir->block_list)
      foreach_instr (instr, &block->instr_list)
         instr->uses = NULL;

   foreach_block (block, &ir->block_list) {
      foreach_instr (instr, &block->instr_list) {
         foreach_ssa_src_n (src, n, instr) {
            if (__is_false_dep(instr, n) && !falsedeps)
               continue;
            if (!src->uses)
               src->uses = _mesa_pointer_set_create(mem_ctx);
            _mesa_set_add(src->uses, instr);
         }
      }
   }
}
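/* Illustrative IP numbering for a single block with two instructions,
 * i0 and i1:
 *
 *    ir3_count_instructions():    start_ip=1, i0.ip=1, i1.ip=2, end_ip=3
 *    ir3_count_instructions_ra(): start_ip=1, i0.ip=2, i1.ip=3, end_ip=4
 *
 * In the RA variant the block boundaries consume their own IPs (the "fake
 * instructions" described above), so live-in/live-out values get ranges
 * that don't collide with the first/last real instruction.
 */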
/**
 * Set the destination type of an instruction, for example if a
 * conversion is folded in, handling the special cases where the
 * instruction's dest type or opcode needs to be fixed up.
 */
void
ir3_set_dst_type(struct ir3_instruction *instr, bool half)
{
   if (half) {
      instr->dsts[0]->flags |= IR3_REG_HALF;
   } else {
      instr->dsts[0]->flags &= ~IR3_REG_HALF;
   }

   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (half) {
         instr->cat1.dst_type = half_type(instr->cat1.dst_type);
      } else {
         instr->cat1.dst_type = full_type(instr->cat1.dst_type);
      }
      break;
   case 4:
      if (half) {
         instr->opc = cat4_half_opc(instr->opc);
      } else {
         instr->opc = cat4_full_opc(instr->opc);
      }
      break;
   case 5:
      if (half) {
         instr->cat5.type = half_type(instr->cat5.type);
      } else {
         instr->cat5.type = full_type(instr->cat5.type);
      }
      break;
   }
}

/**
 * One-time fixup for instruction src-types. Other than cov's that
 * are folded, an instruction's src type does not change.
 */
void
ir3_fixup_src_type(struct ir3_instruction *instr)
{
   switch (opc_cat(instr->opc)) {
   case 1: /* move instructions */
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->cat1.src_type = half_type(instr->cat1.src_type);
      } else {
         instr->cat1.src_type = full_type(instr->cat1.src_type);
      }
      break;
   case 3:
      if (instr->srcs[0]->flags & IR3_REG_HALF) {
         instr->opc = cat3_half_opc(instr->opc);
      } else {
         instr->opc = cat3_full_opc(instr->opc);
      }
      break;
   }
}

/**
 * Map a floating point immed to FLUT (float lookup table) value,
 * returns negative for immediates that cannot be mapped.
 */
int
ir3_flut(struct ir3_register *src_reg)
{
   static const struct {
      uint32_t f32;
      uint16_t f16;
   } flut[] = {
      { .f32 = 0x00000000, .f16 = 0x0000 }, /* 0.0 */
      { .f32 = 0x3f000000, .f16 = 0x3800 }, /* 0.5 */
      { .f32 = 0x3f800000, .f16 = 0x3c00 }, /* 1.0 */
      { .f32 = 0x40000000, .f16 = 0x4000 }, /* 2.0 */
      { .f32 = 0x402df854, .f16 = 0x4170 }, /* e */
      { .f32 = 0x40490fdb, .f16 = 0x4248 }, /* pi */
      { .f32 = 0x3ea2f983, .f16 = 0x3518 }, /* 1/pi */
      { .f32 = 0x3f317218, .f16 = 0x398c }, /* 1/log2(e) */
      { .f32 = 0x3fb8aa3b, .f16 = 0x3dc5 }, /* log2(e) */
      { .f32 = 0x3e9a209b, .f16 = 0x34d1 }, /* 1/log2(10) */
      { .f32 = 0x40549a78, .f16 = 0x42a5 }, /* log2(10) */
      { .f32 = 0x40800000, .f16 = 0x4400 }, /* 4.0 */
   };

   if (src_reg->flags & IR3_REG_HALF) {
      /* Note that half-float immeds are already lowered to 16b in nir: */
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f16 == imm) {
            return i;
         }
      }
   } else {
      uint32_t imm = src_reg->uim_val;
      for (unsigned i = 0; i < ARRAY_SIZE(flut); i++) {
         if (flut[i].f32 == imm) {
            return i;
         }
      }
   }

   return -1;
}
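/* Example (illustrative): an f32 immediate of 0x3f800000 (1.0) maps to FLUT
 * index 2, so a source of 1.0 can be encoded via the lookup table instead
 * of occupying a full 32-bit literal, while 0x3f99999a (1.2) is not in the
 * table and ir3_flut() returns -1 for it.
 */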
static unsigned
cp_flags(unsigned flags)
{
   /* only considering these flags (at least for now): */
   flags &= (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_FNEG | IR3_REG_FABS |
             IR3_REG_SNEG | IR3_REG_SABS | IR3_REG_BNOT | IR3_REG_RELATIV |
             IR3_REG_SHARED);
   return flags;
}

bool
ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags)
{
   struct ir3_compiler *compiler = instr->block->shader->compiler;
   unsigned valid_flags;

   if ((flags & IR3_REG_SHARED) && opc_cat(instr->opc) > 3)
      return false;

   flags = cp_flags(flags);

   /* If destination is indirect, then source cannot be.. at least
    * I don't think so..
    */
   if (instr->dsts_count > 0 && (instr->dsts[0]->flags & IR3_REG_RELATIV) &&
       (flags & IR3_REG_RELATIV))
      return false;

   if (flags & IR3_REG_RELATIV) {
      /* TODO need to test on earlier gens.. pretty sure the earlier
       * problem was just that we didn't check that the src was from
       * same block (since we can't propagate address register values
       * across blocks currently)
       */
      if (compiler->gen < 6)
         return false;

      /* NOTE in the special try_swap_mad_two_srcs() case we can be
       * called on a src that has already had an indirect load folded
       * in, in which case ssa() returns NULL
       */
      if (instr->srcs[n]->flags & IR3_REG_SSA) {
         struct ir3_instruction *src = ssa(instr->srcs[n]);
         if (src->address->def->instr->block != instr->block)
            return false;
      }
   }

   if (is_meta(instr)) {
      /* collect and phi nodes support const/immed sources, which will be
       * turned into move instructions, but not anything else.
       */
      if (flags & ~(IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_SHARED))
         return false;

      if ((flags & IR3_REG_SHARED) && !(instr->dsts[0]->flags & IR3_REG_SHARED))
         return false;

      return true;
   }

   switch (opc_cat(instr->opc)) {
   case 0: /* end, chmask */
      return flags == 0;
   case 1:
      switch (instr->opc) {
      case OPC_MOVMSK:
      case OPC_SWZ:
      case OPC_SCT:
      case OPC_GAT:
         valid_flags = IR3_REG_SHARED;
         break;
      default:
         valid_flags =
            IR3_REG_IMMED | IR3_REG_CONST | IR3_REG_RELATIV | IR3_REG_SHARED;
      }
      if (flags & ~valid_flags)
         return false;
      break;
   case 2:
      valid_flags = ir3_cat2_absneg(instr->opc) | IR3_REG_CONST |
                    IR3_REG_RELATIV | IR3_REG_IMMED | IR3_REG_SHARED;

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_SHARED)) {
         unsigned m = n ^ 1;
         /* cannot deal w/ const or shared in both srcs:
          * (note that some cat2 actually only have a single src)
          */
         if (m < instr->srcs_count) {
            struct ir3_register *reg = instr->srcs[m];
            if ((flags & (IR3_REG_CONST | IR3_REG_SHARED)) &&
                (reg->flags & (IR3_REG_CONST | IR3_REG_SHARED)))
               return false;
            if ((flags & IR3_REG_IMMED) && reg->flags & (IR3_REG_IMMED))
               return false;
         }
      }
      break;
   case 3:
      valid_flags =
         ir3_cat3_absneg(instr->opc) | IR3_REG_RELATIV | IR3_REG_SHARED;

      if (instr->opc == OPC_SHLG_B16) {
         valid_flags |= IR3_REG_IMMED;
         /* shlg.b16 can be RELATIV+CONST but not CONST: */
         if (flags & IR3_REG_RELATIV)
            valid_flags |= IR3_REG_CONST;
      } else {
         valid_flags |= IR3_REG_CONST;
      }

      if (flags & ~valid_flags)
         return false;

      if (flags & (IR3_REG_CONST | IR3_REG_SHARED | IR3_REG_RELATIV)) {
         /* cannot deal w/ const/shared/relativ in 2nd src: */
         if (n == 1)
            return false;
      }

      break;
   case 4:
      /* seems like blob compiler avoids const as src.. */
      /* TODO double check if this is still the case on a4xx */
      if (flags & (IR3_REG_CONST | IR3_REG_IMMED))
         return false;
      if (flags & (IR3_REG_SABS | IR3_REG_SNEG))
         return false;
      break;
   case 5:
      /* no flags allowed */
      if (flags)
         return false;
      break;
   case 6:
      valid_flags = IR3_REG_IMMED;
      if (flags & ~valid_flags)
         return false;

      if (flags & IR3_REG_IMMED) {
         /* doesn't seem like we can have immediate src for store
          * instructions:
          *
          * TODO this restriction could also apply to load instructions,
          * but for load instructions this arg is the address (and not
          * really sure any good way to test a hard-coded immed addr src)
          */
         if (is_store(instr) && (instr->opc != OPC_STG) && (n == 1))
            return false;

         if ((instr->opc == OPC_LDL) && (n == 0))
            return false;

         if ((instr->opc == OPC_STL) && (n != 2))
            return false;

         if ((instr->opc == OPC_LDP) && (n == 0))
            return false;

         if ((instr->opc == OPC_STP) && (n != 2))
            return false;

         if (instr->opc == OPC_STLW && n == 0)
            return false;

         if (instr->opc == OPC_LDLW && n == 0)
            return false;

         /* disallow immediates in anything but the SSBO slot argument for
          * cat6 instructions:
          */
         if (is_atomic(instr->opc) && (n != 0))
            return false;

         if (is_atomic(instr->opc) && !(instr->flags & IR3_INSTR_G))
            return false;

         if (instr->opc == OPC_STG && (n == 2))
            return false;

         if (instr->opc == OPC_STG_A && (n == 4))
            return false;

         /* as with atomics, these cat6 instrs can only have an immediate
          * for SSBO/IBO slot argument
          */
         switch (instr->opc) {
         case OPC_LDIB:
         case OPC_STIB:
         case OPC_RESINFO:
            if (n != 0)
               return false;
            break;
         default:
            break;
         }
      }

      break;
   }

   return true;
}
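/* Example of the cat2 "const in both srcs" rule above (illustrative): for
 * add.f, folding c0.x into src0 is valid while src1 is a GPR, but if src1
 * is already c1.y then ir3_valid_flags() rejects the fold and copy-prop
 * has to leave one of the two constants in a register.
 */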
bool
ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed)
{
   if (instr->opc == OPC_MOV || is_meta(instr))
      return true;

   if (is_mem(instr)) {
      switch (instr->opc) {
      /* Some load/store instructions have a 13-bit offset and size which must
       * always be an immediate and the rest of the sources cannot be
       * immediates, so the frontend is responsible for checking the size:
       */
      case OPC_LDL:
      case OPC_STL:
      case OPC_LDP:
      case OPC_STP:
      case OPC_LDG:
      case OPC_STG:
      case OPC_SPILL_MACRO:
      case OPC_RELOAD_MACRO:
      case OPC_LDG_A:
      case OPC_STG_A:
      case OPC_LDLW:
      case OPC_STLW:
      case OPC_LDLV:
         return true;
      default:
         /* most cat6 src immediates can only encode 8 bits: */
         return !(immed & ~0xff);
      }
   }

   /* Other than cat1 (mov) we can only encode up to 10 bits, sign-extended: */
   return !(immed & ~0x1ff) || !(-immed & ~0x1ff);
}
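/* Illustrative values for the checks above: for a generic cat6 source,
 * immed = 0xff passes the 8-bit check while 0x100 fails; for the final
 * sign-extended check, 511 and -511 are accepted but 512 and -512 are not
 * (both leave bits set outside ~0x1ff), so larger constants have to be
 * loaded with a mov first.
 */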