/*
 * Copyright (c) 2013 Rob Clark <robdclark (at) gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef IR3_H_
#define IR3_H_

#include <stdbool.h>
#include <stdint.h>

#include "compiler/shader_enums.h"

#include "util/bitscan.h"
#include "util/list.h"
#include "util/set.h"
#include "util/u_debug.h"

#include "instr-a3xx.h"

/* low level intermediate representation of an adreno shader program */

struct ir3_compiler;
struct ir3;
struct ir3_instruction;
struct ir3_block;

/* Size information and statistics (instruction counts, register usage,
 * sync-bit counts) describing one shader.
 */
struct ir3_info {
   void *data; /* used internally in ir3 assembler */
   /* Size in bytes of the shader binary, including NIR constants and
    * padding
    */
   uint32_t size;
   /* byte offset from start of the shader to the NIR constant data. */
   uint32_t constant_data_offset;
   /* Size in dwords of the instructions. */
   uint16_t sizedwords;
   uint16_t instrs_count; /* expanded to account for rpt's */
   uint16_t nops_count;   /* # of nop instructions, including nopN */
   uint16_t mov_count;
   uint16_t cov_count;
   uint16_t stp_count;
   uint16_t ldp_count;
   /* NOTE: max_reg, etc, does not include registers not touched
    * by the shader (ie. vertex fetched via VFD_DECODE but not
    * touched by shader)
    */
   int8_t max_reg; /* highest GPR # used by shader */
   int8_t max_half_reg;
   int16_t max_const;
   /* This is the maximum # of waves that can be executed at once in one
    * core, assuming that they are all executing this shader.
    */
   int8_t max_waves;
   bool double_threadsize;
   bool multi_dword_ldp_stp;

   /* number of sync bits: */
   uint16_t ss, sy;

   /* estimate of number of cycles stalled on (ss) */
   uint16_t sstall;

   uint16_t last_baryf; /* instruction # of last varying fetch */

   /* Number of instructions of a given category: */
   uint16_t instrs_per_cat[8];
};

/* NOTE(review): undocumented in this header.  From the fields alone this
 * groups a list of registers (regs/regs_count) together with a preferred
 * register, a size and an alignment; presumably consumed by register
 * allocation -- confirm against the users of ir3_register::merge_set.
 */
struct ir3_merge_set {
   uint16_t preferred_reg;
   uint16_t size;
   uint16_t alignment;

   unsigned interval_start;
   unsigned spill_slot;

   unsigned regs_count;
   struct ir3_register **regs;
};

/* One source or destination operand of an ir3_instruction (instructions
 * hold arrays of pointers to these in 'srcs' and 'dsts').
 */
struct ir3_register {
   enum {
      IR3_REG_CONST = 0x001,
      IR3_REG_IMMED = 0x002,
      IR3_REG_HALF = 0x004,
      /* Shared registers have the same value for all threads when read.
       * They can only be written when one thread is active (that is, inside
       * a "getone" block).
       */
      IR3_REG_SHARED = 0x008,
      IR3_REG_RELATIV = 0x010,
      IR3_REG_R = 0x020,
      /* Most instructions, it seems, can do float abs/neg but not
       * integer.  The CP pass needs to know what is intended (int or
       * float) in order to do the right thing.  For this reason the
       * abs/neg flags are split out into float and int variants.
In addition, for .b (bitwise) operations, the negate is actually a
       * bitwise not, so split that out into a new flag to make it
       * more clear.
       */
      IR3_REG_FNEG = 0x040,
      IR3_REG_FABS = 0x080,
      IR3_REG_SNEG = 0x100,
      IR3_REG_SABS = 0x200,
      IR3_REG_BNOT = 0x400,
      /* (ei) flag, end-input?  Set on last bary, presumably to signal
       * that the shader needs no more input:
       */
      IR3_REG_EI = 0x2000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */
      IR3_REG_ARRAY = 0x8000,

      /* Set on a use whenever the SSA value becomes dead after the current
       * instruction.
       */
      IR3_REG_KILL = 0x10000,

      /* Similar to IR3_REG_KILL, except that if there are multiple uses of the
       * same SSA value in a single instruction, this is only set on the first
       * use.
       */
      IR3_REG_FIRST_KILL = 0x20000,

      /* Set when a destination doesn't have any uses and is dead immediately
       * after the instruction. This can happen even after optimizations for
       * corner cases such as destinations of atomic instructions.
       */
      IR3_REG_UNUSED = 0x40000,
   } flags;

   unsigned name;

   /* used for cat5 instructions, but also for internal/IR level
    * tracking of what registers are read/written by an instruction.
    * wrmask may be a bad name since it is used to represent both
    * src and dst that touch multiple adjacent registers.
    */
   unsigned wrmask : 16; /* up to vec16 */

   /* for relative addressing, 32bits for array size is too small,
    * but otoh we don't need to deal with disjoint sets, so instead
    * use a simple size field (number of scalar components).
    *
    * Note the size field isn't important for relative const (since
    * we don't have to do register allocation for constants).
    */
   unsigned size : 16;

   /* normal registers:
    * the component is in the low two bits of the reg #, so
    * rN.x becomes: (N << 2) | x
    */
   uint16_t num;
   union {
      /* immediate: */
      int32_t iim_val;
      uint32_t uim_val;
      float fim_val;
      /* relative: */
      struct {
         uint16_t id;
         int16_t offset;
         uint16_t base;
      } array;
   };

   /* For IR3_REG_DEST, pointer back to the instruction containing this
    * register.
    *
    * NOTE(review): the flags enum of this struct does not define an
    * IR3_REG_DEST value, so the wording above may be stale -- presumably it
    * means "for destination registers"; confirm.
    */
   struct ir3_instruction *instr;

   /* For IR3_REG_SSA, src registers contain ptr back to assigning
    * instruction.
    *
    * For IR3_REG_ARRAY, the pointer is back to the last dependent
    * array access (although the net effect is the same, it points
    * back to a previous instruction that we depend on).
    */
   struct ir3_register *def;

   /* Pointer to another register in the instruction that must share the same
    * physical register. Each destination can be tied with one source, and
    * they must have "tied" pointing to each other.
    */
   struct ir3_register *tied;

   unsigned spill_slot, next_use;

   unsigned merge_set_offset;
   struct ir3_merge_set *merge_set;
   unsigned interval_start, interval_end;
};

/*
 * Stupid/simple growable array implementation:
 *
 * DECLARE_ARRAY(type, name) declares three members: the element pointer
 * 'name' plus the counters 'name_count' (elements in use) and 'name_sz'
 * (allocated capacity).
 */
#define DECLARE_ARRAY(type, name)                                              \
   unsigned name##_count, name##_sz;                                           \
   type *name;

/* Append one element to an array declared with DECLARE_ARRAY.
 *
 * 'arr' must be the plain member expression used as the array name, because
 * the macro token-pastes _count/_sz onto it (and therefore evaluates it
 * several times).  When the array is full the capacity is doubled, with a
 * minimum of 16 entries, via reralloc_size(ctx, ...).  The result of
 * reralloc_size is stored without being checked.
 *
 * NOTE(review): MAX2 and reralloc_size are not declared by any line of this
 * header that is visible here; users presumably need the headers providing
 * them in scope -- confirm.
 */
#define array_insert(ctx, arr, ...)                                            \
   do {                                                                        \
      if (arr##_count == arr##_sz) {                                           \
         arr##_sz = MAX2(2 * arr##_sz, 16);                                    \
         arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0]));             \
      }                                                                        \
      arr[arr##_count++] = __VA_ARGS__;                                        \
   } while (0)

struct ir3_instruction {
   struct ir3_block *block;
   opc_t opc;
   enum {
      /* (sy) flag is set on first instruction, and after sample
       * instructions (probably just on RAW hazard).
*/
      IR3_INSTR_SY = 0x001,
      /* (ss) flag is set on first instruction, and first instruction
       * to depend on the result of "long" instructions (RAW hazard):
       *
       *   rcp, rsq, log2, exp2, sin, cos, sqrt
       *
       * It seems to synchronize until all in-flight instructions are
       * completed, for example:
       *
       *   rsq hr1.w, hr1.w
       *   add.f hr2.z, (neg)hr2.z, hc0.y
       *   mul.f hr2.w, (neg)hr2.y, (neg)hr2.y
       *   rsq hr2.x, hr2.x
       *   (rpt1)nop
       *   mad.f16 hr2.w, hr2.z, hr2.z, hr2.w
       *   nop
       *   mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w
       *   (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w
       *   (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x
       *
       * The last mul.f does not have (ss) set, presumably because the
       * (ss) on the previous instruction does the job.
       *
       * The blob driver also seems to set it on WAR hazards, although
       * not really clear if this is needed or just blob compiler being
       * sloppy.  So far I haven't found a case where removing the (ss)
       * causes problems for WAR hazard, but I could just be getting
       * lucky:
       *
       *   rcp r1.y, r3.y
       *   (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z
       *
       */
      IR3_INSTR_SS = 0x002,
      /* (jp) flag is set on jump targets:
       */
      IR3_INSTR_JP = 0x004,
      IR3_INSTR_UL = 0x008,
      IR3_INSTR_3D = 0x010,
      IR3_INSTR_A = 0x020,
      IR3_INSTR_O = 0x040,
      IR3_INSTR_P = 0x080,
      IR3_INSTR_S = 0x100,
      IR3_INSTR_S2EN = 0x200,
      IR3_INSTR_G = 0x400,
      IR3_INSTR_SAT = 0x800,
      /* (cat5/cat6) Bindless */
      IR3_INSTR_B = 0x1000,
      /* (cat5/cat6) nonuniform */
      IR3_INSTR_NONUNIF = 0x02000,
      /* (cat5-only) Get some parts of the encoding from a1.x */
      IR3_INSTR_A1EN = 0x04000,
      /* meta-flags, for intermediate stages of IR, ie.
       * before register assignment is done:
       */
      IR3_INSTR_MARK = 0x08000,
      IR3_INSTR_UNUSED = 0x10000,
   } flags;
   uint8_t repeat;
   uint8_t nop;
#ifdef DEBUG
   unsigned srcs_max, dsts_max;
#endif
   unsigned srcs_count, dsts_count;
   struct ir3_register **dsts;
   struct ir3_register **srcs;
   /* Per-category / per-meta-instruction payload; which member is meaningful
    * depends on the instruction (the member names follow the instruction
    * categories and meta opcodes).
    */
   union {
      struct {
         char inv1, inv2;
         char comp1, comp2;
         int immed;
         struct ir3_block *target;
         const char *target_label;
         brtype_t brtype;
         unsigned idx; /* for brac.N */
      } cat0;
      struct {
         type_t src_type, dst_type;
         round_t round;
      } cat1;
      struct {
         enum {
            IR3_COND_LT = 0,
            IR3_COND_LE = 1,
            IR3_COND_GT = 2,
            IR3_COND_GE = 3,
            IR3_COND_EQ = 4,
            IR3_COND_NE = 5,
         } condition;
      } cat2;
      struct {
         unsigned samp, tex;
         unsigned tex_base : 3;
         type_t type;
      } cat5;
      struct {
         type_t type;
         /* TODO remove dst_offset and handle as a ir3_register
          * which might be IMMED, similar to how src_offset is
          * handled.
          */
         int dst_offset;
         /* NOTE(review): a signed 3-bit field can only represent -4..3, and
          * the signedness of a plain 'int' bit-field is implementation-
          * defined; confirm the component counts stored here fit.
          */
         int iim_val : 3; /* for ldgb/stgb, # of components */
         unsigned d : 3;  /* for ldc, component offset */
         bool typed : 1;
         unsigned base : 3;
      } cat6;
      struct {
         unsigned w : 1; /* write */
         unsigned r : 1; /* read */
         unsigned l : 1; /* local */
         unsigned g : 1; /* global */
      } cat7;
      /* for meta-instructions, just used to hold extra data
       * before instruction scheduling, etc
       */
      struct {
         int off; /* component/offset */
      } split;
      struct {
         /* Per-source index back to the entry in the
          * ir3_shader_variant::outputs table.
          */
         unsigned *outidxs;
      } end;
      struct {
         /* used to temporarily hold reference to nir_phi_instr
          * until we resolve the phi srcs
          */
         void *nphi;
      } phi;
      struct {
         unsigned samp, tex;
         unsigned input_offset;
         unsigned samp_base : 3;
         unsigned tex_base : 3;
      } prefetch;
      struct {
         /* maps back to entry in ir3_shader_variant::inputs table: */
         int inidx;
         /* for sysvals, identifies the sysval type.  Mostly so we can
          * identify the special cases where a sysval should not be DCE'd
          * (currently, just pre-fs texture fetch)
          */
         gl_system_value sysval;
      } input;
   };

   /* For assigning jump offsets, we need instruction's position: */
   uint32_t ip;

   /* used for per-pass extra instruction data.
    *
    * TODO we should remove the per-pass data like this and 'use_count'
    * and do something similar to what RA does w/ ir3_ra_instr_data..
    * ie. use the ir3_count_instructions pass, and then use instr->ip
    * to index into a table of pass-private data.
    */
   void *data;

   /**
    * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use()
    */
   struct set *uses;

   int use_count; /* currently just updated/used by cp */

   /* an instruction can reference at most one address register amongst
    * its src/dst registers.  Beyond that, you need to insert mov's.
    *
    * NOTE: do not write this directly, use ir3_instr_set_address()
    */
   struct ir3_register *address;

   /* Tracking for additional dependent instructions.  Used to handle
    * barriers, WAR hazards for arrays/SSBOs/etc.
    */
   DECLARE_ARRAY(struct ir3_instruction *, deps);

   /*
    * From PoV of instruction scheduling, not execution (ie. ignores global/
    * local distinction):
    *                            shared  image  atomic  SSBO  everything
    *   barrier()/            -   R/W     R/W    R/W     R/W       X
    *     groupMemoryBarrier()
    *   memoryBarrier()
    *     (but only images declared coherent?)
    *   memoryBarrierAtomic() -                  R/W
    *   memoryBarrierBuffer() -                          R/W
    *   memoryBarrierImage()  -           R/W
    *   memoryBarrierShared() -   R/W
    *
    * TODO I think for SSBO/image/shared, in cases where we can determine
    * which variable is accessed, we don't need to care about accesses to
    * different variables (unless declared coherent??)
    */
   enum {
      IR3_BARRIER_EVERYTHING = 1 << 0,
      IR3_BARRIER_SHARED_R = 1 << 1,
      IR3_BARRIER_SHARED_W = 1 << 2,
      IR3_BARRIER_IMAGE_R = 1 << 3,
      IR3_BARRIER_IMAGE_W = 1 << 4,
      IR3_BARRIER_BUFFER_R = 1 << 5,
      IR3_BARRIER_BUFFER_W = 1 << 6,
      IR3_BARRIER_ARRAY_R = 1 << 7,
      IR3_BARRIER_ARRAY_W = 1 << 8,
      IR3_BARRIER_PRIVATE_R = 1 << 9,
      IR3_BARRIER_PRIVATE_W = 1 << 10,
   } barrier_class,
      barrier_conflict;

   /* Entry in ir3_block's instruction list: */
   struct list_head node;

   uint32_t serialno;

   // TODO only computerator/assembler:
   int line;
};

/* Top-level container for one shader's IR: the block list, the array list
 * and several per-shader lists of instructions of special interest.
 */
struct ir3 {
   struct ir3_compiler *compiler;
   gl_shader_stage type;

   DECLARE_ARRAY(struct ir3_instruction *, inputs);

   /* Track bary.f (and ldlv) instructions.. this is needed in
    * scheduling to ensure that all varying fetches happen before
    * any potential kill instructions.  The hw gets grumpy if all
    * threads in a group are killed before the last bary.f gets
    * a chance to signal end of input (ei).
    */
   DECLARE_ARRAY(struct ir3_instruction *, baryfs);

   /* Track all indirect instructions (read and write).  To avoid
    * deadlock scenario where an address register gets scheduled,
    * but other dependent src instructions cannot be scheduled due
    * to dependency on a *different* address register value, the
    * scheduler needs to ensure that all dependencies of the
    * instruction other than the address register are scheduled
    * before the one that writes the address register.
Having a convenient list of instructions that reference some address
    * register simplifies this.
    */
   DECLARE_ARRAY(struct ir3_instruction *, a0_users);

   /* same for a1.x: */
   DECLARE_ARRAY(struct ir3_instruction *, a1_users);

   /* and same for instructions that consume predicate register: */
   DECLARE_ARRAY(struct ir3_instruction *, predicates);

   /* Track texture sample instructions which need texture state
    * patched in (for astc-srgb workaround):
    */
   DECLARE_ARRAY(struct ir3_instruction *, astc_srgb);

   /* List of blocks: */
   struct list_head block_list;

   /* List of ir3_array's: */
   struct list_head array_list;

#ifdef DEBUG
   unsigned block_count;
#endif
   unsigned instr_count;
};

/* One array, linked into ir3::array_list through 'node' and looked up by
 * 'id' (see ir3_lookup_array()).
 */
struct ir3_array {
   struct list_head node;
   unsigned length;
   unsigned id;

   struct nir_register *r;

   /* To avoid array writes from getting DCE'd, keep track of the
    * most recent write.  Any array access depends on the most
    * recent write.  This way, nothing depends on writes after the
    * last read.  But all the writes that happen before that have
    * something depending on them.
    */
   struct ir3_register *last_write;

   /* extra stuff used in RA pass: */
   unsigned base; /* base vreg name */
   unsigned reg;  /* base physical reg */
   uint16_t start_ip, end_ip;

   /* Indicates if half-precision */
   bool half;

   bool unused;
};

struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id);

/* How the condition of a two-successor block is evaluated
 * (see ir3_block::brtype).
 */
enum ir3_branch_type {
   IR3_BRANCH_COND,   /* condition */
   IR3_BRANCH_ANY,    /* subgroupAny(condition) */
   IR3_BRANCH_ALL,    /* subgroupAll(condition) */
   IR3_BRANCH_GETONE, /* subgroupElect() */
};

/* A basic block: a list of instructions plus its CFG edges. */
struct ir3_block {
   struct list_head node;
   struct ir3 *shader;

   const struct nir_block *nblock;

   struct list_head instr_list; /* list of ir3_instruction */

   /* The actual branch condition, if there are two successors */
   enum ir3_branch_type brtype;

   /* each block has either one or two successors.. in case of two
    * successors, 'condition' decides which one to follow.  A block preceding
    * an if/else has two successors.
    *
    * In some cases the path that the machine actually takes through the
    * program may not match the per-thread view of the CFG. In particular
    * this is the case for if/else, where the machine jumps from the end of
    * the if to the beginning of the else and switches active lanes. While
    * most things only care about the per-thread view, we need to use the
    * "physical" view when allocating shared registers. "successors" contains
    * the per-thread successors, and "physical_successors" contains the
    * physical successors which includes the fallthrough edge from the if to
    * the else.
567 */ 568 struct ir3_instruction *condition; 569 struct ir3_block *successors[2]; 570 struct ir3_block *physical_successors[2]; 571 572 DECLARE_ARRAY(struct ir3_block *, predecessors); 573 DECLARE_ARRAY(struct ir3_block *, physical_predecessors); 574 575 uint16_t start_ip, end_ip; 576 577 /* Track instructions which do not write a register but other- 578 * wise must not be discarded (such as kill, stg, etc) 579 */ 580 DECLARE_ARRAY(struct ir3_instruction *, keeps); 581 582 /* used for per-pass extra block data. Mainly used right 583 * now in RA step to track livein/liveout. 584 */ 585 void *data; 586 587 uint32_t index; 588 589 struct ir3_block *imm_dom; 590 DECLARE_ARRAY(struct ir3_block *, dom_children); 591 592 uint32_t dom_pre_index; 593 uint32_t dom_post_index; 594 595 uint32_t loop_id; 596 uint32_t loop_depth; 597 598 #ifdef DEBUG 599 uint32_t serialno; 600 #endif 601 }; 602 603 static inline uint32_t 604 block_id(struct ir3_block *block) 605 { 606 #ifdef DEBUG 607 return block->serialno; 608 #else 609 return (uint32_t)(unsigned long)block; 610 #endif 611 } 612 613 static inline struct ir3_block * 614 ir3_start_block(struct ir3 *ir) 615 { 616 return list_first_entry(&ir->block_list, struct ir3_block, node); 617 } 618 619 void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred); 620 void ir3_block_add_physical_predecessor(struct ir3_block *block, 621 struct ir3_block *pred); 622 void ir3_block_remove_predecessor(struct ir3_block *block, 623 struct ir3_block *pred); 624 void ir3_block_remove_physical_predecessor(struct ir3_block *block, 625 struct ir3_block *pred); 626 unsigned ir3_block_get_pred_index(struct ir3_block *block, 627 struct ir3_block *pred); 628 629 void ir3_calc_dominance(struct ir3 *ir); 630 bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b); 631 632 struct ir3_shader_variant; 633 634 struct ir3 *ir3_create(struct ir3_compiler *compiler, 635 struct ir3_shader_variant *v); 636 void ir3_destroy(struct ir3 
*shader); 637 638 void ir3_collect_info(struct ir3_shader_variant *v); 639 void *ir3_alloc(struct ir3 *shader, int sz); 640 641 unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler, 642 unsigned reg_count, 643 bool double_threadsize); 644 645 unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, 646 bool double_threadsize); 647 648 bool ir3_should_double_threadsize(struct ir3_shader_variant *v, 649 unsigned regs_count); 650 651 struct ir3_block *ir3_block_create(struct ir3 *shader); 652 653 struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc, 654 int ndst, int nsrc); 655 struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr); 656 void ir3_instr_add_dep(struct ir3_instruction *instr, 657 struct ir3_instruction *dep); 658 const char *ir3_instr_name(struct ir3_instruction *instr); 659 660 struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num, 661 int flags); 662 struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num, 663 int flags); 664 struct ir3_register *ir3_reg_clone(struct ir3 *shader, 665 struct ir3_register *reg); 666 667 static inline void 668 ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src) 669 { 670 assert(!dst->tied && !src->tied); 671 dst->tied = src; 672 src->tied = dst; 673 } 674 675 void ir3_reg_set_last_array(struct ir3_instruction *instr, 676 struct ir3_register *reg, 677 struct ir3_register *last_write); 678 679 void ir3_instr_set_address(struct ir3_instruction *instr, 680 struct ir3_instruction *addr); 681 682 static inline bool 683 ir3_instr_check_mark(struct ir3_instruction *instr) 684 { 685 if (instr->flags & IR3_INSTR_MARK) 686 return true; /* already visited */ 687 instr->flags |= IR3_INSTR_MARK; 688 return false; 689 } 690 691 void ir3_block_clear_mark(struct ir3_block *block); 692 void ir3_clear_mark(struct ir3 *shader); 693 694 unsigned ir3_count_instructions(struct ir3 *ir); 695 unsigned 
ir3_count_instructions_ra(struct ir3 *ir); 696 697 /** 698 * Move 'instr' to just before 'after' 699 */ 700 static inline void 701 ir3_instr_move_before(struct ir3_instruction *instr, 702 struct ir3_instruction *after) 703 { 704 list_delinit(&instr->node); 705 list_addtail(&instr->node, &after->node); 706 } 707 708 /** 709 * Move 'instr' to just after 'before': 710 */ 711 static inline void 712 ir3_instr_move_after(struct ir3_instruction *instr, 713 struct ir3_instruction *before) 714 { 715 list_delinit(&instr->node); 716 list_add(&instr->node, &before->node); 717 } 718 719 /** 720 * Move 'instr' to the beginning of the block: 721 */ 722 static inline void 723 ir3_instr_move_before_block(struct ir3_instruction *instr, 724 struct ir3_block *block) 725 { 726 list_delinit(&instr->node); 727 list_add(&instr->node, &block->instr_list); 728 } 729 730 void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); 731 732 void ir3_set_dst_type(struct ir3_instruction *instr, bool half); 733 void ir3_fixup_src_type(struct ir3_instruction *instr); 734 735 int ir3_flut(struct ir3_register *src_reg); 736 737 bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags); 738 739 bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed); 740 741 #include "util/set.h" 742 #define foreach_ssa_use(__use, __instr) \ 743 for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses; \ 744 __use = NULL) \ 745 set_foreach ((__instr)->uses, __entry) \ 746 if ((__use = (void *)__entry->key)) 747 748 static inline uint32_t 749 reg_num(const struct ir3_register *reg) 750 { 751 return reg->num >> 2; 752 } 753 754 static inline uint32_t 755 reg_comp(const struct ir3_register *reg) 756 { 757 return reg->num & 0x3; 758 } 759 760 static inline bool 761 is_flow(struct ir3_instruction *instr) 762 { 763 return (opc_cat(instr->opc) == 0); 764 } 765 766 static inline bool 767 is_kill_or_demote(struct ir3_instruction *instr) 768 { 769 return 
instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE; 770 } 771 772 static inline bool 773 is_nop(struct ir3_instruction *instr) 774 { 775 return instr->opc == OPC_NOP; 776 } 777 778 static inline bool 779 is_same_type_reg(struct ir3_register *dst, struct ir3_register *src) 780 { 781 unsigned dst_type = (dst->flags & IR3_REG_HALF); 782 unsigned src_type = (src->flags & IR3_REG_HALF); 783 784 /* Treat shared->normal copies as same-type, because they can generally be 785 * folded, but not normal->shared copies. 786 */ 787 if (dst_type != src_type || 788 ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED))) 789 return false; 790 else 791 return true; 792 } 793 794 /* Is it a non-transformative (ie. not type changing) mov? This can 795 * also include absneg.s/absneg.f, which for the most part can be 796 * treated as a mov (single src argument). 797 */ 798 static inline bool 799 is_same_type_mov(struct ir3_instruction *instr) 800 { 801 struct ir3_register *dst; 802 803 switch (instr->opc) { 804 case OPC_MOV: 805 if (instr->cat1.src_type != instr->cat1.dst_type) 806 return false; 807 /* If the type of dest reg and src reg are different, 808 * it shouldn't be considered as same type mov 809 */ 810 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0])) 811 return false; 812 break; 813 case OPC_ABSNEG_F: 814 case OPC_ABSNEG_S: 815 if (instr->flags & IR3_INSTR_SAT) 816 return false; 817 /* If the type of dest reg and src reg are different, 818 * it shouldn't be considered as same type mov 819 */ 820 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0])) 821 return false; 822 break; 823 case OPC_META_PHI: 824 return instr->srcs_count == 1; 825 default: 826 return false; 827 } 828 829 dst = instr->dsts[0]; 830 831 /* mov's that write to a0 or p0.x are special: */ 832 if (dst->num == regid(REG_P0, 0)) 833 return false; 834 if (reg_num(dst) == REG_A0) 835 return false; 836 837 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) 838 return false; 839 840 return true; 
841 } 842 843 /* A move from const, which changes size but not type, can also be 844 * folded into dest instruction in some cases. 845 */ 846 static inline bool 847 is_const_mov(struct ir3_instruction *instr) 848 { 849 if (instr->opc != OPC_MOV) 850 return false; 851 852 if (!(instr->srcs[0]->flags & IR3_REG_CONST)) 853 return false; 854 855 type_t src_type = instr->cat1.src_type; 856 type_t dst_type = instr->cat1.dst_type; 857 858 return (type_float(src_type) && type_float(dst_type)) || 859 (type_uint(src_type) && type_uint(dst_type)) || 860 (type_sint(src_type) && type_sint(dst_type)); 861 } 862 863 static inline bool 864 is_alu(struct ir3_instruction *instr) 865 { 866 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3); 867 } 868 869 static inline bool 870 is_sfu(struct ir3_instruction *instr) 871 { 872 return (opc_cat(instr->opc) == 4); 873 } 874 875 static inline bool 876 is_tex(struct ir3_instruction *instr) 877 { 878 return (opc_cat(instr->opc) == 5); 879 } 880 881 static inline bool 882 is_tex_or_prefetch(struct ir3_instruction *instr) 883 { 884 return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH); 885 } 886 887 static inline bool 888 is_mem(struct ir3_instruction *instr) 889 { 890 return (opc_cat(instr->opc) == 6); 891 } 892 893 static inline bool 894 is_barrier(struct ir3_instruction *instr) 895 { 896 return (opc_cat(instr->opc) == 7); 897 } 898 899 static inline bool 900 is_half(struct ir3_instruction *instr) 901 { 902 return !!(instr->dsts[0]->flags & IR3_REG_HALF); 903 } 904 905 static inline bool 906 is_shared(struct ir3_instruction *instr) 907 { 908 return !!(instr->dsts[0]->flags & IR3_REG_SHARED); 909 } 910 911 static inline bool 912 is_store(struct ir3_instruction *instr) 913 { 914 /* these instructions, the "destination" register is 915 * actually a source, the address to store to. 
916 */ 917 switch (instr->opc) { 918 case OPC_STG: 919 case OPC_STG_A: 920 case OPC_STGB: 921 case OPC_STIB: 922 case OPC_STP: 923 case OPC_STL: 924 case OPC_STLW: 925 case OPC_L2G: 926 case OPC_G2L: 927 return true; 928 default: 929 return false; 930 } 931 } 932 933 static inline bool 934 is_load(struct ir3_instruction *instr) 935 { 936 switch (instr->opc) { 937 case OPC_LDG: 938 case OPC_LDG_A: 939 case OPC_LDGB: 940 case OPC_LDIB: 941 case OPC_LDL: 942 case OPC_LDP: 943 case OPC_L2G: 944 case OPC_LDLW: 945 case OPC_LDC: 946 case OPC_LDLV: 947 /* probably some others too.. */ 948 return true; 949 default: 950 return false; 951 } 952 } 953 954 static inline bool 955 is_input(struct ir3_instruction *instr) 956 { 957 /* in some cases, ldlv is used to fetch varying without 958 * interpolation.. fortunately inloc is the first src 959 * register in either case 960 */ 961 switch (instr->opc) { 962 case OPC_LDLV: 963 case OPC_BARY_F: 964 return true; 965 default: 966 return false; 967 } 968 } 969 970 static inline bool 971 is_bool(struct ir3_instruction *instr) 972 { 973 switch (instr->opc) { 974 case OPC_CMPS_F: 975 case OPC_CMPS_S: 976 case OPC_CMPS_U: 977 return true; 978 default: 979 return false; 980 } 981 } 982 983 static inline opc_t 984 cat3_half_opc(opc_t opc) 985 { 986 switch (opc) { 987 case OPC_MAD_F32: 988 return OPC_MAD_F16; 989 case OPC_SEL_B32: 990 return OPC_SEL_B16; 991 case OPC_SEL_S32: 992 return OPC_SEL_S16; 993 case OPC_SEL_F32: 994 return OPC_SEL_F16; 995 case OPC_SAD_S32: 996 return OPC_SAD_S16; 997 default: 998 return opc; 999 } 1000 } 1001 1002 static inline opc_t 1003 cat3_full_opc(opc_t opc) 1004 { 1005 switch (opc) { 1006 case OPC_MAD_F16: 1007 return OPC_MAD_F32; 1008 case OPC_SEL_B16: 1009 return OPC_SEL_B32; 1010 case OPC_SEL_S16: 1011 return OPC_SEL_S32; 1012 case OPC_SEL_F16: 1013 return OPC_SEL_F32; 1014 case OPC_SAD_S16: 1015 return OPC_SAD_S32; 1016 default: 1017 return opc; 1018 } 1019 } 1020 1021 static inline opc_t 1022 
cat4_half_opc(opc_t opc) 1023 { 1024 switch (opc) { 1025 case OPC_RSQ: 1026 return OPC_HRSQ; 1027 case OPC_LOG2: 1028 return OPC_HLOG2; 1029 case OPC_EXP2: 1030 return OPC_HEXP2; 1031 default: 1032 return opc; 1033 } 1034 } 1035 1036 static inline opc_t 1037 cat4_full_opc(opc_t opc) 1038 { 1039 switch (opc) { 1040 case OPC_HRSQ: 1041 return OPC_RSQ; 1042 case OPC_HLOG2: 1043 return OPC_LOG2; 1044 case OPC_HEXP2: 1045 return OPC_EXP2; 1046 default: 1047 return opc; 1048 } 1049 } 1050 1051 static inline bool 1052 is_meta(struct ir3_instruction *instr) 1053 { 1054 return (opc_cat(instr->opc) == -1); 1055 } 1056 1057 static inline unsigned 1058 reg_elems(const struct ir3_register *reg) 1059 { 1060 if (reg->flags & IR3_REG_ARRAY) 1061 return reg->size; 1062 else 1063 return util_last_bit(reg->wrmask); 1064 } 1065 1066 static inline unsigned 1067 reg_elem_size(const struct ir3_register *reg) 1068 { 1069 return (reg->flags & IR3_REG_HALF) ? 1 : 2; 1070 } 1071 1072 static inline unsigned 1073 reg_size(const struct ir3_register *reg) 1074 { 1075 return reg_elems(reg) * reg_elem_size(reg); 1076 } 1077 1078 static inline unsigned 1079 dest_regs(struct ir3_instruction *instr) 1080 { 1081 if (instr->dsts_count == 0) 1082 return 0; 1083 1084 debug_assert(instr->dsts_count == 1); 1085 return util_last_bit(instr->dsts[0]->wrmask); 1086 } 1087 1088 /* is dst a normal temp register: */ 1089 static inline bool 1090 is_dest_gpr(struct ir3_register *dst) 1091 { 1092 if (dst->wrmask == 0) 1093 return false; 1094 if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0))) 1095 return false; 1096 return true; 1097 } 1098 1099 static inline bool 1100 writes_gpr(struct ir3_instruction *instr) 1101 { 1102 if (dest_regs(instr) == 0) 1103 return false; 1104 return is_dest_gpr(instr->dsts[0]); 1105 } 1106 1107 static inline bool 1108 writes_addr0(struct ir3_instruction *instr) 1109 { 1110 /* Note: only the first dest can write to a0.x */ 1111 if (instr->dsts_count > 0) { 1112 struct 
ir3_register *dst = instr->dsts[0]; 1113 return dst->num == regid(REG_A0, 0); 1114 } 1115 return false; 1116 } 1117 1118 static inline bool 1119 writes_addr1(struct ir3_instruction *instr) 1120 { 1121 /* Note: only the first dest can write to a1.x */ 1122 if (instr->dsts_count > 0) { 1123 struct ir3_register *dst = instr->dsts[0]; 1124 return dst->num == regid(REG_A0, 1); 1125 } 1126 return false; 1127 } 1128 1129 static inline bool 1130 writes_pred(struct ir3_instruction *instr) 1131 { 1132 /* Note: only the first dest can write to p0.x */ 1133 if (instr->dsts_count > 0) { 1134 struct ir3_register *dst = instr->dsts[0]; 1135 return reg_num(dst) == REG_P0; 1136 } 1137 return false; 1138 } 1139 1140 /* Is it something other than a normal register. Shared regs, p0, and a0/a1 1141 * are considered special here. Special registers are always accessed with one 1142 * size and never alias normal registers, even though a naive calculation 1143 * would sometimes make it seem like e.g. r30.z aliases a0.x. 1144 */ 1145 static inline bool 1146 is_reg_special(const struct ir3_register *reg) 1147 { 1148 return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) || 1149 (reg_num(reg) == REG_P0); 1150 } 1151 1152 /* Same as above but in cases where we don't have a register. r48.x and above 1153 * are shared/special. 
1154 */ 1155 static inline bool 1156 is_reg_num_special(unsigned num) 1157 { 1158 return num >= 48 * 4; 1159 } 1160 1161 /* returns defining instruction for reg */ 1162 /* TODO better name */ 1163 static inline struct ir3_instruction * 1164 ssa(struct ir3_register *reg) 1165 { 1166 if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def) 1167 return reg->def->instr; 1168 return NULL; 1169 } 1170 1171 static inline bool 1172 conflicts(struct ir3_register *a, struct ir3_register *b) 1173 { 1174 return (a && b) && (a->def != b->def); 1175 } 1176 1177 static inline bool 1178 reg_gpr(struct ir3_register *r) 1179 { 1180 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) 1181 return false; 1182 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) 1183 return false; 1184 return true; 1185 } 1186 1187 static inline type_t 1188 half_type(type_t type) 1189 { 1190 switch (type) { 1191 case TYPE_F32: 1192 return TYPE_F16; 1193 case TYPE_U32: 1194 return TYPE_U16; 1195 case TYPE_S32: 1196 return TYPE_S16; 1197 case TYPE_F16: 1198 case TYPE_U16: 1199 case TYPE_S16: 1200 return type; 1201 default: 1202 assert(0); 1203 return ~0; 1204 } 1205 } 1206 1207 static inline type_t 1208 full_type(type_t type) 1209 { 1210 switch (type) { 1211 case TYPE_F16: 1212 return TYPE_F32; 1213 case TYPE_U16: 1214 return TYPE_U32; 1215 case TYPE_S16: 1216 return TYPE_S32; 1217 case TYPE_F32: 1218 case TYPE_U32: 1219 case TYPE_S32: 1220 return type; 1221 default: 1222 assert(0); 1223 return ~0; 1224 } 1225 } 1226 1227 /* some cat2 instructions (ie. 
those which are not float) can embed an 1228 * immediate: 1229 */ 1230 static inline bool 1231 ir3_cat2_int(opc_t opc) 1232 { 1233 switch (opc) { 1234 case OPC_ADD_U: 1235 case OPC_ADD_S: 1236 case OPC_SUB_U: 1237 case OPC_SUB_S: 1238 case OPC_CMPS_U: 1239 case OPC_CMPS_S: 1240 case OPC_MIN_U: 1241 case OPC_MIN_S: 1242 case OPC_MAX_U: 1243 case OPC_MAX_S: 1244 case OPC_CMPV_U: 1245 case OPC_CMPV_S: 1246 case OPC_MUL_U24: 1247 case OPC_MUL_S24: 1248 case OPC_MULL_U: 1249 case OPC_CLZ_S: 1250 case OPC_ABSNEG_S: 1251 case OPC_AND_B: 1252 case OPC_OR_B: 1253 case OPC_NOT_B: 1254 case OPC_XOR_B: 1255 case OPC_BFREV_B: 1256 case OPC_CLZ_B: 1257 case OPC_SHL_B: 1258 case OPC_SHR_B: 1259 case OPC_ASHR_B: 1260 case OPC_MGEN_B: 1261 case OPC_GETBIT_B: 1262 case OPC_CBITS_B: 1263 case OPC_BARY_F: 1264 return true; 1265 1266 default: 1267 return false; 1268 } 1269 } 1270 1271 /* map cat2 instruction to valid abs/neg flags: */ 1272 static inline unsigned 1273 ir3_cat2_absneg(opc_t opc) 1274 { 1275 switch (opc) { 1276 case OPC_ADD_F: 1277 case OPC_MIN_F: 1278 case OPC_MAX_F: 1279 case OPC_MUL_F: 1280 case OPC_SIGN_F: 1281 case OPC_CMPS_F: 1282 case OPC_ABSNEG_F: 1283 case OPC_CMPV_F: 1284 case OPC_FLOOR_F: 1285 case OPC_CEIL_F: 1286 case OPC_RNDNE_F: 1287 case OPC_RNDAZ_F: 1288 case OPC_TRUNC_F: 1289 case OPC_BARY_F: 1290 return IR3_REG_FABS | IR3_REG_FNEG; 1291 1292 case OPC_ADD_U: 1293 case OPC_ADD_S: 1294 case OPC_SUB_U: 1295 case OPC_SUB_S: 1296 case OPC_CMPS_U: 1297 case OPC_CMPS_S: 1298 case OPC_MIN_U: 1299 case OPC_MIN_S: 1300 case OPC_MAX_U: 1301 case OPC_MAX_S: 1302 case OPC_CMPV_U: 1303 case OPC_CMPV_S: 1304 case OPC_MUL_U24: 1305 case OPC_MUL_S24: 1306 case OPC_MULL_U: 1307 case OPC_CLZ_S: 1308 return 0; 1309 1310 case OPC_ABSNEG_S: 1311 return IR3_REG_SABS | IR3_REG_SNEG; 1312 1313 case OPC_AND_B: 1314 case OPC_OR_B: 1315 case OPC_NOT_B: 1316 case OPC_XOR_B: 1317 case OPC_BFREV_B: 1318 case OPC_CLZ_B: 1319 case OPC_SHL_B: 1320 case OPC_SHR_B: 1321 case OPC_ASHR_B: 
1322 case OPC_MGEN_B: 1323 case OPC_GETBIT_B: 1324 case OPC_CBITS_B: 1325 return IR3_REG_BNOT; 1326 1327 default: 1328 return 0; 1329 } 1330 } 1331 1332 /* map cat3 instructions to valid abs/neg flags: */ 1333 static inline unsigned 1334 ir3_cat3_absneg(opc_t opc) 1335 { 1336 switch (opc) { 1337 case OPC_MAD_F16: 1338 case OPC_MAD_F32: 1339 case OPC_SEL_F16: 1340 case OPC_SEL_F32: 1341 return IR3_REG_FNEG; 1342 1343 case OPC_MAD_U16: 1344 case OPC_MADSH_U16: 1345 case OPC_MAD_S16: 1346 case OPC_MADSH_M16: 1347 case OPC_MAD_U24: 1348 case OPC_MAD_S24: 1349 case OPC_SEL_S16: 1350 case OPC_SEL_S32: 1351 case OPC_SAD_S16: 1352 case OPC_SAD_S32: 1353 /* neg *may* work on 3rd src.. */ 1354 1355 case OPC_SEL_B16: 1356 case OPC_SEL_B32: 1357 1358 case OPC_SHLG_B16: 1359 1360 default: 1361 return 0; 1362 } 1363 } 1364 1365 /* Return the type (float, int, or uint) the op uses when converting from the 1366 * internal result of the op (which is assumed to be the same size as the 1367 * sources) to the destination when they are not the same size. If F32 it does 1368 * a floating-point conversion, if U32 it does a truncation/zero-extension, if 1369 * S32 it does a truncation/sign-extension. "can_fold" will be false if it 1370 * doesn't do anything sensible or is unknown. 1371 */ 1372 static inline type_t 1373 ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold) 1374 { 1375 *can_fold = true; 1376 switch (instr->opc) { 1377 case OPC_ADD_F: 1378 case OPC_MUL_F: 1379 case OPC_BARY_F: 1380 case OPC_MAD_F32: 1381 case OPC_MAD_F16: 1382 return TYPE_F32; 1383 1384 case OPC_ADD_U: 1385 case OPC_SUB_U: 1386 case OPC_MIN_U: 1387 case OPC_MAX_U: 1388 case OPC_AND_B: 1389 case OPC_OR_B: 1390 case OPC_NOT_B: 1391 case OPC_XOR_B: 1392 case OPC_MUL_U24: 1393 case OPC_MULL_U: 1394 case OPC_SHL_B: 1395 case OPC_SHR_B: 1396 case OPC_ASHR_B: 1397 case OPC_MAD_U24: 1398 /* Comparison ops zero-extend/truncate their results, so consider them as 1399 * unsigned here. 
1400 */ 1401 case OPC_CMPS_F: 1402 case OPC_CMPV_F: 1403 case OPC_CMPS_U: 1404 case OPC_CMPS_S: 1405 return TYPE_U32; 1406 1407 case OPC_ADD_S: 1408 case OPC_SUB_S: 1409 case OPC_MIN_S: 1410 case OPC_MAX_S: 1411 case OPC_ABSNEG_S: 1412 case OPC_MUL_S24: 1413 case OPC_MAD_S24: 1414 return TYPE_S32; 1415 1416 /* We assume that any move->move folding that could be done was done by 1417 * NIR. 1418 */ 1419 case OPC_MOV: 1420 default: 1421 *can_fold = false; 1422 return TYPE_U32; 1423 } 1424 } 1425 1426 /* Return the src and dst types for the conversion which is already folded 1427 * into the op. We can assume that instr has folded in a conversion from 1428 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense 1429 * to call if ir3_output_conv_type() returns can_fold = true. 1430 */ 1431 static inline type_t 1432 ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type) 1433 { 1434 switch (instr->opc) { 1435 case OPC_CMPS_F: 1436 case OPC_CMPV_F: 1437 case OPC_CMPS_U: 1438 case OPC_CMPS_S: 1439 /* Comparisons only return 0/1 and the size of the comparison sources 1440 * is irrelevant, never consider them as having an output conversion 1441 * by returning a type with the dest size here: 1442 */ 1443 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) 1444 : full_type(base_type); 1445 1446 case OPC_BARY_F: 1447 /* bary.f doesn't have an explicit source, but we can assume here that 1448 * the varying data it reads is in fp32. 1449 * 1450 * This may be fp16 on older gen's depending on some register 1451 * settings, but it's probably not worth plumbing that through for a 1452 * small improvement that NIR would hopefully handle for us anyway. 1453 */ 1454 return TYPE_F32; 1455 1456 default: 1457 return (instr->srcs[0]->flags & IR3_REG_HALF) ? 
half_type(base_type) 1458 : full_type(base_type); 1459 } 1460 } 1461 1462 static inline type_t 1463 ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type) 1464 { 1465 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) 1466 : full_type(base_type); 1467 } 1468 1469 /* Some instructions have signed/unsigned variants which are identical except 1470 * for whether the folded conversion sign-extends or zero-extends, and we can 1471 * fold in a mismatching move by rewriting the opcode. Return the opcode to 1472 * switch signedness, and whether one exists. 1473 */ 1474 static inline opc_t 1475 ir3_try_swap_signedness(opc_t opc, bool *can_swap) 1476 { 1477 switch (opc) { 1478 #define PAIR(u, s) \ 1479 case OPC_##u: \ 1480 return OPC_##s; \ 1481 case OPC_##s: \ 1482 return OPC_##u; 1483 PAIR(ADD_U, ADD_S) 1484 PAIR(SUB_U, SUB_S) 1485 /* Note: these are only identical when the sources are half, but that's 1486 * the only case we call this function for anyway. 1487 */ 1488 PAIR(MUL_U24, MUL_S24) 1489 1490 default: 1491 *can_swap = false; 1492 return opc; 1493 } 1494 } 1495 1496 #define MASK(n) ((1 << (n)) - 1) 1497 1498 /* iterator for an instructions's sources (reg), also returns src #: */ 1499 #define foreach_src_n(__srcreg, __n, __instr) \ 1500 if ((__instr)->srcs_count) \ 1501 for (struct ir3_register *__srcreg = (void *)~0; __srcreg; \ 1502 __srcreg = NULL) \ 1503 for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt; \ 1504 __n++) \ 1505 if ((__srcreg = (__instr)->srcs[__n])) 1506 1507 /* iterator for an instructions's sources (reg): */ 1508 #define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr) 1509 1510 /* iterator for an instructions's destinations (reg), also returns dst #: */ 1511 #define foreach_dst_n(__dstreg, __n, __instr) \ 1512 if ((__instr)->dsts_count) \ 1513 for (struct ir3_register *__dstreg = (void *)~0; __dstreg; \ 1514 __dstreg = NULL) \ 1515 for (unsigned __cnt = 
(__instr)->dsts_count, __n = 0; __n < __cnt; \ 1516 __n++) \ 1517 if ((__dstreg = (__instr)->dsts[__n])) 1518 1519 /* iterator for an instructions's destinations (reg): */ 1520 #define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr) 1521 1522 static inline unsigned 1523 __ssa_src_cnt(struct ir3_instruction *instr) 1524 { 1525 return instr->srcs_count + instr->deps_count; 1526 } 1527 1528 static inline bool 1529 __is_false_dep(struct ir3_instruction *instr, unsigned n) 1530 { 1531 if (n >= instr->srcs_count) 1532 return true; 1533 return false; 1534 } 1535 1536 static inline struct ir3_instruction ** 1537 __ssa_srcp_n(struct ir3_instruction *instr, unsigned n) 1538 { 1539 if (__is_false_dep(instr, n)) 1540 return &instr->deps[n - instr->srcs_count]; 1541 if (ssa(instr->srcs[n])) 1542 return &instr->srcs[n]->def->instr; 1543 return NULL; 1544 } 1545 1546 #define foreach_ssa_srcp_n(__srcp, __n, __instr) \ 1547 for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \ 1548 for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; \ 1549 __n++) \ 1550 if ((__srcp = __ssa_srcp_n(__instr, __n))) 1551 1552 #define foreach_ssa_srcp(__srcp, __instr) \ 1553 foreach_ssa_srcp_n (__srcp, __i, __instr) 1554 1555 /* iterator for an instruction's SSA sources (instr), also returns src #: */ 1556 #define foreach_ssa_src_n(__srcinst, __n, __instr) \ 1557 for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst; \ 1558 __srcinst = NULL) \ 1559 foreach_ssa_srcp_n (__srcp, __n, __instr) \ 1560 if ((__srcinst = *__srcp)) 1561 1562 /* iterator for an instruction's SSA sources (instr): */ 1563 #define foreach_ssa_src(__srcinst, __instr) \ 1564 foreach_ssa_src_n (__srcinst, __i, __instr) 1565 1566 /* iterators for shader inputs: */ 1567 #define foreach_input_n(__ininstr, __cnt, __ir) \ 1568 for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr; \ 1569 __ininstr = NULL) \ 1570 for (unsigned __cnt = 0; __cnt < 
(__ir)->inputs_count; __cnt++) \ 1571 if ((__ininstr = (__ir)->inputs[__cnt])) 1572 #define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir) 1573 1574 /* iterators for instructions: */ 1575 #define foreach_instr(__instr, __list) \ 1576 list_for_each_entry (struct ir3_instruction, __instr, __list, node) 1577 #define foreach_instr_rev(__instr, __list) \ 1578 list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node) 1579 #define foreach_instr_safe(__instr, __list) \ 1580 list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node) 1581 #define foreach_instr_from_safe(__instr, __start, __list) \ 1582 list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, \ 1583 __list, node) 1584 1585 /* iterators for blocks: */ 1586 #define foreach_block(__block, __list) \ 1587 list_for_each_entry (struct ir3_block, __block, __list, node) 1588 #define foreach_block_safe(__block, __list) \ 1589 list_for_each_entry_safe (struct ir3_block, __block, __list, node) 1590 #define foreach_block_rev(__block, __list) \ 1591 list_for_each_entry_rev (struct ir3_block, __block, __list, node) 1592 1593 /* iterators for arrays: */ 1594 #define foreach_array(__array, __list) \ 1595 list_for_each_entry (struct ir3_array, __array, __list, node) 1596 #define foreach_array_safe(__array, __list) \ 1597 list_for_each_entry_safe (struct ir3_array, __array, __list, node) 1598 1599 #define IR3_PASS(ir, pass, ...) 
\ 1600 ({ \ 1601 bool progress = pass(ir, ##__VA_ARGS__); \ 1602 if (progress) { \ 1603 ir3_debug_print(ir, "AFTER: " #pass); \ 1604 ir3_validate(ir); \ 1605 } \ 1606 progress; \ 1607 }) 1608 1609 /* validate: */ 1610 void ir3_validate(struct ir3 *ir); 1611 1612 /* dump: */ 1613 void ir3_print(struct ir3 *ir); 1614 void ir3_print_instr(struct ir3_instruction *instr); 1615 1616 struct log_stream; 1617 void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr); 1618 1619 /* delay calculation: */ 1620 int ir3_delayslots(struct ir3_instruction *assigner, 1621 struct ir3_instruction *consumer, unsigned n, bool soft); 1622 unsigned ir3_delay_calc_prera(struct ir3_block *block, 1623 struct ir3_instruction *instr); 1624 unsigned ir3_delay_calc_postra(struct ir3_block *block, 1625 struct ir3_instruction *instr, bool soft, 1626 bool mergedregs); 1627 unsigned ir3_delay_calc_exact(struct ir3_block *block, 1628 struct ir3_instruction *instr, bool mergedregs); 1629 void ir3_remove_nops(struct ir3 *ir); 1630 1631 /* unreachable block elimination: */ 1632 bool ir3_remove_unreachable(struct ir3 *ir); 1633 1634 /* dead code elimination: */ 1635 struct ir3_shader_variant; 1636 bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so); 1637 1638 /* fp16 conversion folding */ 1639 bool ir3_cf(struct ir3 *ir); 1640 1641 /* copy-propagate: */ 1642 bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); 1643 bool ir3_cp_postsched(struct ir3 *ir); 1644 1645 /* common subexpression elimination: */ 1646 bool ir3_cse(struct ir3 *ir); 1647 1648 /* Make arrays SSA */ 1649 bool ir3_array_to_ssa(struct ir3 *ir); 1650 1651 /* scheduling: */ 1652 bool ir3_sched_add_deps(struct ir3 *ir); 1653 int ir3_sched(struct ir3 *ir); 1654 1655 struct ir3_context; 1656 bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); 1657 1658 /* register assignment: */ 1659 int ir3_ra(struct ir3_shader_variant *v); 1660 1661 /* lower subgroup ops: */ 1662 bool 
ir3_lower_subgroups(struct ir3 *ir); 1663 1664 /* legalize: */ 1665 bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary); 1666 1667 static inline bool 1668 ir3_has_latency_to_hide(struct ir3 *ir) 1669 { 1670 /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't 1671 * know the nature of the fragment shader. Just assume it will have 1672 * latency to hide: 1673 */ 1674 if (ir->type != MESA_SHADER_FRAGMENT) 1675 return true; 1676 1677 foreach_block (block, &ir->block_list) { 1678 foreach_instr (instr, &block->instr_list) { 1679 if (is_tex_or_prefetch(instr)) 1680 return true; 1681 1682 if (is_load(instr)) { 1683 switch (instr->opc) { 1684 case OPC_LDLV: 1685 case OPC_LDL: 1686 case OPC_LDLW: 1687 break; 1688 default: 1689 return true; 1690 } 1691 } 1692 } 1693 } 1694 1695 return false; 1696 } 1697 1698 /* ************************************************************************* */ 1699 /* instruction helpers */ 1700 1701 /* creates SSA src of correct type (ie. half vs full precision) */ 1702 static inline struct ir3_register * 1703 __ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src, 1704 unsigned flags) 1705 { 1706 struct ir3_register *reg; 1707 if (src->dsts[0]->flags & IR3_REG_HALF) 1708 flags |= IR3_REG_HALF; 1709 reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags); 1710 reg->def = src->dsts[0]; 1711 reg->wrmask = src->dsts[0]->wrmask; 1712 return reg; 1713 } 1714 1715 static inline struct ir3_register * 1716 __ssa_dst(struct ir3_instruction *instr) 1717 { 1718 struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA); 1719 reg->instr = instr; 1720 return reg; 1721 } 1722 1723 static inline struct ir3_instruction * 1724 create_immed_typed(struct ir3_block *block, uint32_t val, type_t type) 1725 { 1726 struct ir3_instruction *mov; 1727 unsigned flags = (type_size(type) < 32) ? 
IR3_REG_HALF : 0; 1728 1729 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1730 mov->cat1.src_type = type; 1731 mov->cat1.dst_type = type; 1732 __ssa_dst(mov)->flags |= flags; 1733 ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val; 1734 1735 return mov; 1736 } 1737 1738 static inline struct ir3_instruction * 1739 create_immed(struct ir3_block *block, uint32_t val) 1740 { 1741 return create_immed_typed(block, val, TYPE_U32); 1742 } 1743 1744 static inline struct ir3_instruction * 1745 create_uniform_typed(struct ir3_block *block, unsigned n, type_t type) 1746 { 1747 struct ir3_instruction *mov; 1748 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; 1749 1750 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1751 mov->cat1.src_type = type; 1752 mov->cat1.dst_type = type; 1753 __ssa_dst(mov)->flags |= flags; 1754 ir3_src_create(mov, n, IR3_REG_CONST | flags); 1755 1756 return mov; 1757 } 1758 1759 static inline struct ir3_instruction * 1760 create_uniform(struct ir3_block *block, unsigned n) 1761 { 1762 return create_uniform_typed(block, n, TYPE_F32); 1763 } 1764 1765 static inline struct ir3_instruction * 1766 create_uniform_indirect(struct ir3_block *block, int n, type_t type, 1767 struct ir3_instruction *address) 1768 { 1769 struct ir3_instruction *mov; 1770 1771 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1772 mov->cat1.src_type = type; 1773 mov->cat1.dst_type = type; 1774 __ssa_dst(mov); 1775 ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; 1776 1777 ir3_instr_set_address(mov, address); 1778 1779 return mov; 1780 } 1781 1782 static inline struct ir3_instruction * 1783 ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) 1784 { 1785 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1); 1786 unsigned flags = (type_size(type) < 32) ? 
IR3_REG_HALF : 0; 1787 1788 __ssa_dst(instr)->flags |= flags; 1789 if (src->dsts[0]->flags & IR3_REG_ARRAY) { 1790 struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY); 1791 src_reg->array = src->dsts[0]->array; 1792 } else { 1793 __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED); 1794 } 1795 debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV)); 1796 instr->cat1.src_type = type; 1797 instr->cat1.dst_type = type; 1798 return instr; 1799 } 1800 1801 static inline struct ir3_instruction * 1802 ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type, 1803 type_t dst_type) 1804 { 1805 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1); 1806 unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0; 1807 unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0; 1808 1809 debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags); 1810 1811 __ssa_dst(instr)->flags |= dst_flags; 1812 __ssa_src(instr, src, 0); 1813 instr->cat1.src_type = src_type; 1814 instr->cat1.dst_type = dst_type; 1815 debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY)); 1816 return instr; 1817 } 1818 1819 static inline struct ir3_instruction * 1820 ir3_MOVMSK(struct ir3_block *block, unsigned components) 1821 { 1822 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0); 1823 1824 struct ir3_register *dst = __ssa_dst(instr); 1825 dst->flags |= IR3_REG_SHARED; 1826 dst->wrmask = (1 << components) - 1; 1827 instr->repeat = components - 1; 1828 return instr; 1829 } 1830 1831 static inline struct ir3_instruction * 1832 ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src, 1833 unsigned components) 1834 { 1835 struct ir3_instruction *instr = 1836 ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1); 1837 1838 struct ir3_register *dst = __ssa_dst(instr); 1839 dst->flags |= IR3_REG_SHARED; 1840 dst->wrmask = (1 << components) - 1; 1841 1842 __ssa_src(instr, src, 0); 1843 1844 return 
instr; 1845 } 1846 1847 static inline struct ir3_instruction * 1848 ir3_NOP(struct ir3_block *block) 1849 { 1850 return ir3_instr_create(block, OPC_NOP, 0, 0); 1851 } 1852 1853 #define IR3_INSTR_0 0 1854 1855 /* clang-format off */ 1856 #define __INSTR0(flag, name, opc) \ 1857 static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \ 1858 { \ 1859 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \ 1860 instr->flags |= flag; \ 1861 return instr; \ 1862 } 1863 /* clang-format on */ 1864 #define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name) 1865 #define INSTR0(name) __INSTR0(0, name, OPC_##name) 1866 1867 /* clang-format off */ 1868 #define __INSTR1(flag, dst_count, name, opc) \ 1869 static inline struct ir3_instruction *ir3_##name( \ 1870 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags) \ 1871 { \ 1872 struct ir3_instruction *instr = \ 1873 ir3_instr_create(block, opc, dst_count, 1); \ 1874 for (unsigned i = 0; i < dst_count; i++) \ 1875 __ssa_dst(instr); \ 1876 __ssa_src(instr, a, aflags); \ 1877 instr->flags |= flag; \ 1878 return instr; \ 1879 } 1880 /* clang-format on */ 1881 #define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1882 #define INSTR1(name) __INSTR1(0, 1, name, OPC_##name) 1883 #define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name) 1884 1885 /* clang-format off */ 1886 #define __INSTR2(flag, name, opc) \ 1887 static inline struct ir3_instruction *ir3_##name( \ 1888 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1889 struct ir3_instruction *b, unsigned bflags) \ 1890 { \ 1891 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2); \ 1892 __ssa_dst(instr); \ 1893 __ssa_src(instr, a, aflags); \ 1894 __ssa_src(instr, b, bflags); \ 1895 instr->flags |= flag; \ 1896 return instr; \ 1897 } 1898 /* clang-format on */ 1899 #define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name) 1900 #define INSTR2(name) 
__INSTR2(0, name, OPC_##name) 1901 1902 /* clang-format off */ 1903 #define __INSTR3(flag, dst_count, name, opc) \ 1904 static inline struct ir3_instruction *ir3_##name( \ 1905 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1906 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1907 unsigned cflags) \ 1908 { \ 1909 struct ir3_instruction *instr = \ 1910 ir3_instr_create(block, opc, dst_count, 3); \ 1911 for (unsigned i = 0; i < dst_count; i++) \ 1912 __ssa_dst(instr); \ 1913 __ssa_src(instr, a, aflags); \ 1914 __ssa_src(instr, b, bflags); \ 1915 __ssa_src(instr, c, cflags); \ 1916 instr->flags |= flag; \ 1917 return instr; \ 1918 } 1919 /* clang-format on */ 1920 #define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1921 #define INSTR3(name) __INSTR3(0, 1, name, OPC_##name) 1922 #define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name) 1923 1924 /* clang-format off */ 1925 #define __INSTR4(flag, dst_count, name, opc) \ 1926 static inline struct ir3_instruction *ir3_##name( \ 1927 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1928 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1929 unsigned cflags, struct ir3_instruction *d, unsigned dflags) \ 1930 { \ 1931 struct ir3_instruction *instr = \ 1932 ir3_instr_create(block, opc, dst_count, 4); \ 1933 for (unsigned i = 0; i < dst_count; i++) \ 1934 __ssa_dst(instr); \ 1935 __ssa_src(instr, a, aflags); \ 1936 __ssa_src(instr, b, bflags); \ 1937 __ssa_src(instr, c, cflags); \ 1938 __ssa_src(instr, d, dflags); \ 1939 instr->flags |= flag; \ 1940 return instr; \ 1941 } 1942 /* clang-format on */ 1943 #define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1944 #define INSTR4(name) __INSTR4(0, 1, name, OPC_##name) 1945 #define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name) 1946 1947 /* clang-format off */ 1948 #define __INSTR5(flag, name, opc) \ 1949 static inline struct 
ir3_instruction *ir3_##name( \ 1950 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1951 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1952 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \ 1953 struct ir3_instruction *e, unsigned eflags) \ 1954 { \ 1955 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5); \ 1956 __ssa_dst(instr); \ 1957 __ssa_src(instr, a, aflags); \ 1958 __ssa_src(instr, b, bflags); \ 1959 __ssa_src(instr, c, cflags); \ 1960 __ssa_src(instr, d, dflags); \ 1961 __ssa_src(instr, e, eflags); \ 1962 instr->flags |= flag; \ 1963 return instr; \ 1964 } 1965 /* clang-format on */ 1966 #define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name) 1967 #define INSTR5(name) __INSTR5(0, name, OPC_##name) 1968 1969 /* clang-format off */ 1970 #define __INSTR6(flag, dst_count, name, opc) \ 1971 static inline struct ir3_instruction *ir3_##name( \ 1972 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1973 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1974 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \ 1975 struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f, \ 1976 unsigned fflags) \ 1977 { \ 1978 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 6); \ 1979 for (unsigned i = 0; i < dst_count; i++) \ 1980 __ssa_dst(instr); \ 1981 __ssa_src(instr, a, aflags); \ 1982 __ssa_src(instr, b, bflags); \ 1983 __ssa_src(instr, c, cflags); \ 1984 __ssa_src(instr, d, dflags); \ 1985 __ssa_src(instr, e, eflags); \ 1986 __ssa_src(instr, f, fflags); \ 1987 instr->flags |= flag; \ 1988 return instr; \ 1989 } 1990 /* clang-format on */ 1991 #define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1992 #define INSTR6(name) __INSTR6(0, 1, name, OPC_##name) 1993 #define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name) 1994 1995 /* cat0 instructions: */ 1996 INSTR1NODST(B) 1997 
INSTR0(JUMP) 1998 INSTR1NODST(KILL) 1999 INSTR1NODST(DEMOTE) 2000 INSTR0(END) 2001 INSTR0(CHSH) 2002 INSTR0(CHMASK) 2003 INSTR1NODST(PREDT) 2004 INSTR0(PREDF) 2005 INSTR0(PREDE) 2006 INSTR0(GETONE) 2007 2008 /* cat1 macros */ 2009 INSTR1(ANY_MACRO) 2010 INSTR1(ALL_MACRO) 2011 INSTR1(READ_FIRST_MACRO) 2012 INSTR2(READ_COND_MACRO) 2013 2014 static inline struct ir3_instruction * 2015 ir3_ELECT_MACRO(struct ir3_block *block) 2016 { 2017 struct ir3_instruction *instr = 2018 ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0); 2019 __ssa_dst(instr); 2020 return instr; 2021 } 2022 2023 /* cat2 instructions, most 2 src but some 1 src: */ 2024 INSTR2(ADD_F) 2025 INSTR2(MIN_F) 2026 INSTR2(MAX_F) 2027 INSTR2(MUL_F) 2028 INSTR1(SIGN_F) 2029 INSTR2(CMPS_F) 2030 INSTR1(ABSNEG_F) 2031 INSTR2(CMPV_F) 2032 INSTR1(FLOOR_F) 2033 INSTR1(CEIL_F) 2034 INSTR1(RNDNE_F) 2035 INSTR1(RNDAZ_F) 2036 INSTR1(TRUNC_F) 2037 INSTR2(ADD_U) 2038 INSTR2(ADD_S) 2039 INSTR2(SUB_U) 2040 INSTR2(SUB_S) 2041 INSTR2(CMPS_U) 2042 INSTR2(CMPS_S) 2043 INSTR2(MIN_U) 2044 INSTR2(MIN_S) 2045 INSTR2(MAX_U) 2046 INSTR2(MAX_S) 2047 INSTR1(ABSNEG_S) 2048 INSTR2(AND_B) 2049 INSTR2(OR_B) 2050 INSTR1(NOT_B) 2051 INSTR2(XOR_B) 2052 INSTR2(CMPV_U) 2053 INSTR2(CMPV_S) 2054 INSTR2(MUL_U24) 2055 INSTR2(MUL_S24) 2056 INSTR2(MULL_U) 2057 INSTR1(BFREV_B) 2058 INSTR1(CLZ_S) 2059 INSTR1(CLZ_B) 2060 INSTR2(SHL_B) 2061 INSTR2(SHR_B) 2062 INSTR2(ASHR_B) 2063 INSTR2(BARY_F) 2064 INSTR2(MGEN_B) 2065 INSTR2(GETBIT_B) 2066 INSTR1(SETRM) 2067 INSTR1(CBITS_B) 2068 INSTR2(SHB) 2069 INSTR2(MSAD) 2070 2071 /* cat3 instructions: */ 2072 INSTR3(MAD_U16) 2073 INSTR3(MADSH_U16) 2074 INSTR3(MAD_S16) 2075 INSTR3(MADSH_M16) 2076 INSTR3(MAD_U24) 2077 INSTR3(MAD_S24) 2078 INSTR3(MAD_F16) 2079 INSTR3(MAD_F32) 2080 /* NOTE: SEL_B32 checks for zero vs nonzero */ 2081 INSTR3(SEL_B16) 2082 INSTR3(SEL_B32) 2083 INSTR3(SEL_S16) 2084 INSTR3(SEL_S32) 2085 INSTR3(SEL_F16) 2086 INSTR3(SEL_F32) 2087 INSTR3(SAD_S16) 2088 INSTR3(SAD_S32) 2089 2090 /* cat4 instructions: 
*/ 2091 INSTR1(RCP) 2092 INSTR1(RSQ) 2093 INSTR1(HRSQ) 2094 INSTR1(LOG2) 2095 INSTR1(HLOG2) 2096 INSTR1(EXP2) 2097 INSTR1(HEXP2) 2098 INSTR1(SIN) 2099 INSTR1(COS) 2100 INSTR1(SQRT) 2101 2102 /* cat5 instructions: */ 2103 INSTR1(DSX) 2104 INSTR1(DSXPP_MACRO) 2105 INSTR1(DSY) 2106 INSTR1(DSYPP_MACRO) 2107 INSTR1F(3D, DSX) 2108 INSTR1F(3D, DSY) 2109 INSTR1(RGETPOS) 2110 2111 static inline struct ir3_instruction * 2112 ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask, 2113 unsigned flags, struct ir3_instruction *samp_tex, 2114 struct ir3_instruction *src0, struct ir3_instruction *src1) 2115 { 2116 struct ir3_instruction *sam; 2117 unsigned nreg = 0; 2118 2119 if (flags & IR3_INSTR_S2EN) { 2120 nreg++; 2121 } 2122 if (src0) { 2123 nreg++; 2124 } 2125 if (src1) { 2126 nreg++; 2127 } 2128 2129 sam = ir3_instr_create(block, opc, 1, nreg); 2130 sam->flags |= flags; 2131 __ssa_dst(sam)->wrmask = wrmask; 2132 if (flags & IR3_INSTR_S2EN) { 2133 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 
0 : IR3_REG_HALF); 2134 } 2135 if (src0) { 2136 __ssa_src(sam, src0, 0); 2137 } 2138 if (src1) { 2139 __ssa_src(sam, src1, 0); 2140 } 2141 sam->cat5.type = type; 2142 2143 return sam; 2144 } 2145 2146 /* cat6 instructions: */ 2147 INSTR2(LDLV) 2148 INSTR3(LDG) 2149 INSTR3(LDL) 2150 INSTR3(LDLW) 2151 INSTR3(LDP) 2152 INSTR4NODST(STG) 2153 INSTR3NODST(STL) 2154 INSTR3NODST(STLW) 2155 INSTR3NODST(STP) 2156 INSTR1(RESINFO) 2157 INSTR1(RESFMT) 2158 INSTR2(ATOMIC_ADD) 2159 INSTR2(ATOMIC_SUB) 2160 INSTR2(ATOMIC_XCHG) 2161 INSTR2(ATOMIC_INC) 2162 INSTR2(ATOMIC_DEC) 2163 INSTR2(ATOMIC_CMPXCHG) 2164 INSTR2(ATOMIC_MIN) 2165 INSTR2(ATOMIC_MAX) 2166 INSTR2(ATOMIC_AND) 2167 INSTR2(ATOMIC_OR) 2168 INSTR2(ATOMIC_XOR) 2169 INSTR2(LDC) 2170 #if GPU >= 600 2171 INSTR3NODST(STIB); 2172 INSTR2(LDIB); 2173 INSTR5(LDG_A); 2174 INSTR6NODST(STG_A); 2175 INSTR3F(G, ATOMIC_ADD) 2176 INSTR3F(G, ATOMIC_SUB) 2177 INSTR3F(G, ATOMIC_XCHG) 2178 INSTR3F(G, ATOMIC_INC) 2179 INSTR3F(G, ATOMIC_DEC) 2180 INSTR3F(G, ATOMIC_CMPXCHG) 2181 INSTR3F(G, ATOMIC_MIN) 2182 INSTR3F(G, ATOMIC_MAX) 2183 INSTR3F(G, ATOMIC_AND) 2184 INSTR3F(G, ATOMIC_OR) 2185 INSTR3F(G, ATOMIC_XOR) 2186 #elif GPU >= 400 2187 INSTR3(LDGB) 2188 #if GPU >= 500 2189 INSTR3(LDIB) 2190 #endif 2191 INSTR4NODST(STGB) 2192 INSTR4NODST(STIB) 2193 INSTR4F(G, ATOMIC_ADD) 2194 INSTR4F(G, ATOMIC_SUB) 2195 INSTR4F(G, ATOMIC_XCHG) 2196 INSTR4F(G, ATOMIC_INC) 2197 INSTR4F(G, ATOMIC_DEC) 2198 INSTR4F(G, ATOMIC_CMPXCHG) 2199 INSTR4F(G, ATOMIC_MIN) 2200 INSTR4F(G, ATOMIC_MAX) 2201 INSTR4F(G, ATOMIC_AND) 2202 INSTR4F(G, ATOMIC_OR) 2203 INSTR4F(G, ATOMIC_XOR) 2204 #endif 2205 2206 /* cat7 instructions: */ 2207 INSTR0(BAR) 2208 INSTR0(FENCE) 2209 2210 /* ************************************************************************* */ 2211 #include "bitset.h" 2212 2213 #define MAX_REG 256 2214 2215 typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG); 2216 2217 typedef struct { 2218 bool mergedregs; 2219 regmaskstate_t mask; 2220 } regmask_t; 2221 2222 static 
inline bool 2223 __regmask_get(regmask_t *regmask, bool half, unsigned n) 2224 { 2225 if (regmask->mergedregs) { 2226 /* a6xx+ case, with merged register file, we track things in terms 2227 * of half-precision registers, with a full precisions register 2228 * using two half-precision slots. 2229 * 2230 * Pretend that special regs (a0.x, a1.x, etc.) are full registers to 2231 * avoid having them alias normal full regs. 2232 */ 2233 if (half && !is_reg_num_special(n)) { 2234 return BITSET_TEST(regmask->mask, n); 2235 } else { 2236 n *= 2; 2237 return BITSET_TEST(regmask->mask, n) || 2238 BITSET_TEST(regmask->mask, n + 1); 2239 } 2240 } else { 2241 /* pre a6xx case, with separate register file for half and full 2242 * precision: 2243 */ 2244 if (half) 2245 n += MAX_REG; 2246 return BITSET_TEST(regmask->mask, n); 2247 } 2248 } 2249 2250 static inline void 2251 __regmask_set(regmask_t *regmask, bool half, unsigned n) 2252 { 2253 if (regmask->mergedregs) { 2254 /* a6xx+ case, with merged register file, we track things in terms 2255 * of half-precision registers, with a full precisions register 2256 * using two half-precision slots: 2257 */ 2258 if (half && !is_reg_num_special(n)) { 2259 BITSET_SET(regmask->mask, n); 2260 } else { 2261 n *= 2; 2262 BITSET_SET(regmask->mask, n); 2263 BITSET_SET(regmask->mask, n + 1); 2264 } 2265 } else { 2266 /* pre a6xx case, with separate register file for half and full 2267 * precision: 2268 */ 2269 if (half) 2270 n += MAX_REG; 2271 BITSET_SET(regmask->mask, n); 2272 } 2273 } 2274 2275 static inline void 2276 __regmask_clear(regmask_t *regmask, bool half, unsigned n) 2277 { 2278 if (regmask->mergedregs) { 2279 /* a6xx+ case, with merged register file, we track things in terms 2280 * of half-precision registers, with a full precisions register 2281 * using two half-precision slots: 2282 */ 2283 if (half && !is_reg_num_special(n)) { 2284 BITSET_CLEAR(regmask->mask, n); 2285 } else { 2286 n *= 2; 2287 BITSET_CLEAR(regmask->mask, n); 2288 
BITSET_CLEAR(regmask->mask, n + 1); 2289 } 2290 } else { 2291 /* pre a6xx case, with separate register file for half and full 2292 * precision: 2293 */ 2294 if (half) 2295 n += MAX_REG; 2296 BITSET_CLEAR(regmask->mask, n); 2297 } 2298 } 2299 2300 static inline void 2301 regmask_init(regmask_t *regmask, bool mergedregs) 2302 { 2303 memset(®mask->mask, 0, sizeof(regmask->mask)); 2304 regmask->mergedregs = mergedregs; 2305 } 2306 2307 static inline void 2308 regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) 2309 { 2310 assert(dst->mergedregs == a->mergedregs); 2311 assert(dst->mergedregs == b->mergedregs); 2312 2313 for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++) 2314 dst->mask[i] = a->mask[i] | b->mask[i]; 2315 } 2316 2317 2318 static inline void 2319 regmask_set(regmask_t *regmask, struct ir3_register *reg) 2320 { 2321 bool half = reg->flags & IR3_REG_HALF; 2322 if (reg->flags & IR3_REG_RELATIV) { 2323 for (unsigned i = 0; i < reg->size; i++) 2324 __regmask_set(regmask, half, reg->array.base + i); 2325 } else { 2326 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++) 2327 if (mask & 1) 2328 __regmask_set(regmask, half, n); 2329 } 2330 } 2331 2332 static inline bool 2333 regmask_get(regmask_t *regmask, struct ir3_register *reg) 2334 { 2335 bool half = reg->flags & IR3_REG_HALF; 2336 if (reg->flags & IR3_REG_RELATIV) { 2337 for (unsigned i = 0; i < reg->size; i++) 2338 if (__regmask_get(regmask, half, reg->array.base + i)) 2339 return true; 2340 } else { 2341 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++) 2342 if (mask & 1) 2343 if (__regmask_get(regmask, half, n)) 2344 return true; 2345 } 2346 return false; 2347 } 2348 /* ************************************************************************* */ 2349 2350 #endif /* IR3_H_ */ 2351