ir3.h revision 7ec681f3
1/* 2 * Copyright (c) 2013 Rob Clark <robdclark@gmail.com> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#ifndef IR3_H_ 25#define IR3_H_ 26 27#include <stdbool.h> 28#include <stdint.h> 29 30#include "compiler/shader_enums.h" 31 32#include "util/bitscan.h" 33#include "util/list.h" 34#include "util/set.h" 35#include "util/u_debug.h" 36 37#include "instr-a3xx.h" 38 39/* low level intermediate representation of an adreno shader program */ 40 41struct ir3_compiler; 42struct ir3; 43struct ir3_instruction; 44struct ir3_block; 45 46struct ir3_info { 47 void *data; /* used internally in ir3 assembler */ 48 /* Size in bytes of the shader binary, including NIR constants and 49 * padding 50 */ 51 uint32_t size; 52 /* byte offset from start of the shader to the NIR constant data. */ 53 uint32_t constant_data_offset; 54 /* Size in dwords of the instructions. */ 55 uint16_t sizedwords; 56 uint16_t instrs_count; /* expanded to account for rpt's */ 57 uint16_t nops_count; /* # of nop instructions, including nopN */ 58 uint16_t mov_count; 59 uint16_t cov_count; 60 uint16_t stp_count; 61 uint16_t ldp_count; 62 /* NOTE: max_reg, etc, does not include registers not touched 63 * by the shader (ie. vertex fetched via VFD_DECODE but not 64 * touched by shader) 65 */ 66 int8_t max_reg; /* highest GPR # used by shader */ 67 int8_t max_half_reg; 68 int16_t max_const; 69 /* This is the maximum # of waves that can executed at once in one core, 70 * assuming that they are all executing this shader. 71 */ 72 int8_t max_waves; 73 bool double_threadsize; 74 bool multi_dword_ldp_stp; 75 76 /* number of sync bits: */ 77 uint16_t ss, sy; 78 79 /* estimate of number of cycles stalled on (ss) */ 80 uint16_t sstall; 81 82 uint16_t last_baryf; /* instruction # of last varying fetch */ 83 84 /* Number of instructions of a given category: */ 85 uint16_t instrs_per_cat[8]; 86}; 87 88struct ir3_merge_set { 89 uint16_t preferred_reg; 90 uint16_t size; 91 uint16_t alignment; 92 93 unsigned interval_start; 94 unsigned spill_slot; 95 96 unsigned regs_count; 97 struct ir3_register **regs; 98}; 99 100struct ir3_register { 101 enum { 102 IR3_REG_CONST = 0x001, 103 IR3_REG_IMMED = 0x002, 104 IR3_REG_HALF = 0x004, 105 /* Shared registers have the same value for all threads when read. 106 * They can only be written when one thread is active (that is, inside 107 * a "getone" block). 108 */ 109 IR3_REG_SHARED = 0x008, 110 IR3_REG_RELATIV = 0x010, 111 IR3_REG_R = 0x020, 112 /* Most instructions, it seems, can do float abs/neg but not 113 * integer. The CP pass needs to know what is intended (int or 114 * float) in order to do the right thing. For this reason the 115 * abs/neg flags are split out into float and int variants. In 116 * addition, .b (bitwise) operations, the negate is actually a 117 * bitwise not, so split that out into a new flag to make it 118 * more clear. 119 */ 120 IR3_REG_FNEG = 0x040, 121 IR3_REG_FABS = 0x080, 122 IR3_REG_SNEG = 0x100, 123 IR3_REG_SABS = 0x200, 124 IR3_REG_BNOT = 0x400, 125 /* (ei) flag, end-input? Set on last bary, presumably to signal 126 * that the shader needs no more input: 127 */ 128 IR3_REG_EI = 0x2000, 129 /* meta-flags, for intermediate stages of IR, ie. 130 * before register assignment is done: 131 */ 132 IR3_REG_SSA = 0x4000, /* 'def' is ptr to assigning destination */ 133 IR3_REG_ARRAY = 0x8000, 134 135 /* Set on a use whenever the SSA value becomes dead after the current 136 * instruction. 137 */ 138 IR3_REG_KILL = 0x10000, 139 140 /* Similar to IR3_REG_KILL, except that if there are multiple uses of the 141 * same SSA value in a single instruction, this is only set on the first 142 * use. 143 */ 144 IR3_REG_FIRST_KILL = 0x20000, 145 146 /* Set when a destination doesn't have any uses and is dead immediately 147 * after the instruction. This can happen even after optimizations for 148 * corner cases such as destinations of atomic instructions. 149 */ 150 IR3_REG_UNUSED = 0x40000, 151 } flags; 152 153 unsigned name; 154 155 /* used for cat5 instructions, but also for internal/IR level 156 * tracking of what registers are read/written by an instruction. 157 * wrmask may be a bad name since it is used to represent both 158 * src and dst that touch multiple adjacent registers. 159 */ 160 unsigned wrmask : 16; /* up to vec16 */ 161 162 /* for relative addressing, 32bits for array size is too small, 163 * but otoh we don't need to deal with disjoint sets, so instead 164 * use a simple size field (number of scalar components). 165 * 166 * Note the size field isn't important for relative const (since 167 * we don't have to do register allocation for constants). 168 */ 169 unsigned size : 16; 170 171 /* normal registers: 172 * the component is in the low two bits of the reg #, so 173 * rN.x becomes: (N << 2) | x 174 */ 175 uint16_t num; 176 union { 177 /* immediate: */ 178 int32_t iim_val; 179 uint32_t uim_val; 180 float fim_val; 181 /* relative: */ 182 struct { 183 uint16_t id; 184 int16_t offset; 185 uint16_t base; 186 } array; 187 }; 188 189 /* For IR3_REG_DEST, pointer back to the instruction containing this 190 * register. 191 */ 192 struct ir3_instruction *instr; 193 194 /* For IR3_REG_SSA, src registers contain ptr back to assigning 195 * instruction. 196 * 197 * For IR3_REG_ARRAY, the pointer is back to the last dependent 198 * array access (although the net effect is the same, it points 199 * back to a previous instruction that we depend on). 200 */ 201 struct ir3_register *def; 202 203 /* Pointer to another register in the instruction that must share the same 204 * physical register. Each destination can be tied with one source, and 205 * they must have "tied" pointing to each other. 206 */ 207 struct ir3_register *tied; 208 209 unsigned spill_slot, next_use; 210 211 unsigned merge_set_offset; 212 struct ir3_merge_set *merge_set; 213 unsigned interval_start, interval_end; 214}; 215 216/* 217 * Stupid/simple growable array implementation: 218 */ 219#define DECLARE_ARRAY(type, name) \ 220 unsigned name##_count, name##_sz; \ 221 type *name; 222 223#define array_insert(ctx, arr, ...) \ 224 do { \ 225 if (arr##_count == arr##_sz) { \ 226 arr##_sz = MAX2(2 * arr##_sz, 16); \ 227 arr = reralloc_size(ctx, arr, arr##_sz * sizeof(arr[0])); \ 228 } \ 229 arr[arr##_count++] = __VA_ARGS__; \ 230 } while (0) 231 232struct ir3_instruction { 233 struct ir3_block *block; 234 opc_t opc; 235 enum { 236 /* (sy) flag is set on first instruction, and after sample 237 * instructions (probably just on RAW hazard). 238 */ 239 IR3_INSTR_SY = 0x001, 240 /* (ss) flag is set on first instruction, and first instruction 241 * to depend on the result of "long" instructions (RAW hazard): 242 * 243 * rcp, rsq, log2, exp2, sin, cos, sqrt 244 * 245 * It seems to synchronize until all in-flight instructions are 246 * completed, for example: 247 * 248 * rsq hr1.w, hr1.w 249 * add.f hr2.z, (neg)hr2.z, hc0.y 250 * mul.f hr2.w, (neg)hr2.y, (neg)hr2.y 251 * rsq hr2.x, hr2.x 252 * (rpt1)nop 253 * mad.f16 hr2.w, hr2.z, hr2.z, hr2.w 254 * nop 255 * mad.f16 hr2.w, (neg)hr0.w, (neg)hr0.w, hr2.w 256 * (ss)(rpt2)mul.f hr1.x, (r)hr1.x, hr1.w 257 * (rpt2)mul.f hr0.x, (neg)(r)hr0.x, hr2.x 258 * 259 * The last mul.f does not have (ss) set, presumably because the 260 * (ss) on the previous instruction does the job. 261 * 262 * The blob driver also seems to set it on WAR hazards, although 263 * not really clear if this is needed or just blob compiler being 264 * sloppy. So far I haven't found a case where removing the (ss) 265 * causes problems for WAR hazard, but I could just be getting 266 * lucky: 267 * 268 * rcp r1.y, r3.y 269 * (ss)(rpt2)mad.f32 r3.y, (r)c9.x, r1.x, (r)r3.z 270 * 271 */ 272 IR3_INSTR_SS = 0x002, 273 /* (jp) flag is set on jump targets: 274 */ 275 IR3_INSTR_JP = 0x004, 276 IR3_INSTR_UL = 0x008, 277 IR3_INSTR_3D = 0x010, 278 IR3_INSTR_A = 0x020, 279 IR3_INSTR_O = 0x040, 280 IR3_INSTR_P = 0x080, 281 IR3_INSTR_S = 0x100, 282 IR3_INSTR_S2EN = 0x200, 283 IR3_INSTR_G = 0x400, 284 IR3_INSTR_SAT = 0x800, 285 /* (cat5/cat6) Bindless */ 286 IR3_INSTR_B = 0x1000, 287 /* (cat5/cat6) nonuniform */ 288 IR3_INSTR_NONUNIF = 0x02000, 289 /* (cat5-only) Get some parts of the encoding from a1.x */ 290 IR3_INSTR_A1EN = 0x04000, 291 /* meta-flags, for intermediate stages of IR, ie. 292 * before register assignment is done: 293 */ 294 IR3_INSTR_MARK = 0x08000, 295 IR3_INSTR_UNUSED = 0x10000, 296 } flags; 297 uint8_t repeat; 298 uint8_t nop; 299#ifdef DEBUG 300 unsigned srcs_max, dsts_max; 301#endif 302 unsigned srcs_count, dsts_count; 303 struct ir3_register **dsts; 304 struct ir3_register **srcs; 305 union { 306 struct { 307 char inv1, inv2; 308 char comp1, comp2; 309 int immed; 310 struct ir3_block *target; 311 const char *target_label; 312 brtype_t brtype; 313 unsigned idx; /* for brac.N */ 314 } cat0; 315 struct { 316 type_t src_type, dst_type; 317 round_t round; 318 } cat1; 319 struct { 320 enum { 321 IR3_COND_LT = 0, 322 IR3_COND_LE = 1, 323 IR3_COND_GT = 2, 324 IR3_COND_GE = 3, 325 IR3_COND_EQ = 4, 326 IR3_COND_NE = 5, 327 } condition; 328 } cat2; 329 struct { 330 unsigned samp, tex; 331 unsigned tex_base : 3; 332 type_t type; 333 } cat5; 334 struct { 335 type_t type; 336 /* TODO remove dst_offset and handle as a ir3_register 337 * which might be IMMED, similar to how src_offset is 338 * handled. 339 */ 340 int dst_offset; 341 int iim_val : 3; /* for ldgb/stgb, # of components */ 342 unsigned d : 3; /* for ldc, component offset */ 343 bool typed : 1; 344 unsigned base : 3; 345 } cat6; 346 struct { 347 unsigned w : 1; /* write */ 348 unsigned r : 1; /* read */ 349 unsigned l : 1; /* local */ 350 unsigned g : 1; /* global */ 351 } cat7; 352 /* for meta-instructions, just used to hold extra data 353 * before instruction scheduling, etc 354 */ 355 struct { 356 int off; /* component/offset */ 357 } split; 358 struct { 359 /* Per-source index back to the entry in the 360 * ir3_shader_variant::outputs table. 361 */ 362 unsigned *outidxs; 363 } end; 364 struct { 365 /* used to temporarily hold reference to nir_phi_instr 366 * until we resolve the phi srcs 367 */ 368 void *nphi; 369 } phi; 370 struct { 371 unsigned samp, tex; 372 unsigned input_offset; 373 unsigned samp_base : 3; 374 unsigned tex_base : 3; 375 } prefetch; 376 struct { 377 /* maps back to entry in ir3_shader_variant::inputs table: */ 378 int inidx; 379 /* for sysvals, identifies the sysval type. Mostly so we can 380 * identify the special cases where a sysval should not be DCE'd 381 * (currently, just pre-fs texture fetch) 382 */ 383 gl_system_value sysval; 384 } input; 385 }; 386 387 /* For assigning jump offsets, we need instruction's position: */ 388 uint32_t ip; 389 390 /* used for per-pass extra instruction data. 391 * 392 * TODO we should remove the per-pass data like this and 'use_count' 393 * and do something similar to what RA does w/ ir3_ra_instr_data.. 394 * ie. use the ir3_count_instructions pass, and then use instr->ip 395 * to index into a table of pass-private data. 396 */ 397 void *data; 398 399 /** 400 * Valid if pass calls ir3_find_ssa_uses().. see foreach_ssa_use() 401 */ 402 struct set *uses; 403 404 int use_count; /* currently just updated/used by cp */ 405 406 /* an instruction can reference at most one address register amongst 407 * it's src/dst registers. Beyond that, you need to insert mov's. 408 * 409 * NOTE: do not write this directly, use ir3_instr_set_address() 410 */ 411 struct ir3_register *address; 412 413 /* Tracking for additional dependent instructions. Used to handle 414 * barriers, WAR hazards for arrays/SSBOs/etc. 415 */ 416 DECLARE_ARRAY(struct ir3_instruction *, deps); 417 418 /* 419 * From PoV of instruction scheduling, not execution (ie. ignores global/ 420 * local distinction): 421 * shared image atomic SSBO everything 422 * barrier()/ - R/W R/W R/W R/W X 423 * groupMemoryBarrier() 424 * memoryBarrier() 425 * (but only images declared coherent?) 426 * memoryBarrierAtomic() - R/W 427 * memoryBarrierBuffer() - R/W 428 * memoryBarrierImage() - R/W 429 * memoryBarrierShared() - R/W 430 * 431 * TODO I think for SSBO/image/shared, in cases where we can determine 432 * which variable is accessed, we don't need to care about accesses to 433 * different variables (unless declared coherent??) 434 */ 435 enum { 436 IR3_BARRIER_EVERYTHING = 1 << 0, 437 IR3_BARRIER_SHARED_R = 1 << 1, 438 IR3_BARRIER_SHARED_W = 1 << 2, 439 IR3_BARRIER_IMAGE_R = 1 << 3, 440 IR3_BARRIER_IMAGE_W = 1 << 4, 441 IR3_BARRIER_BUFFER_R = 1 << 5, 442 IR3_BARRIER_BUFFER_W = 1 << 6, 443 IR3_BARRIER_ARRAY_R = 1 << 7, 444 IR3_BARRIER_ARRAY_W = 1 << 8, 445 IR3_BARRIER_PRIVATE_R = 1 << 9, 446 IR3_BARRIER_PRIVATE_W = 1 << 10, 447 } barrier_class, 448 barrier_conflict; 449 450 /* Entry in ir3_block's instruction list: */ 451 struct list_head node; 452 453 uint32_t serialno; 454 455 // TODO only computerator/assembler: 456 int line; 457}; 458 459struct ir3 { 460 struct ir3_compiler *compiler; 461 gl_shader_stage type; 462 463 DECLARE_ARRAY(struct ir3_instruction *, inputs); 464 465 /* Track bary.f (and ldlv) instructions.. this is needed in 466 * scheduling to ensure that all varying fetches happen before 467 * any potential kill instructions. The hw gets grumpy if all 468 * threads in a group are killed before the last bary.f gets 469 * a chance to signal end of input (ei). 470 */ 471 DECLARE_ARRAY(struct ir3_instruction *, baryfs); 472 473 /* Track all indirect instructions (read and write). To avoid 474 * deadlock scenario where an address register gets scheduled, 475 * but other dependent src instructions cannot be scheduled due 476 * to dependency on a *different* address register value, the 477 * scheduler needs to ensure that all dependencies other than 478 * the instruction other than the address register are scheduled 479 * before the one that writes the address register. Having a 480 * convenient list of instructions that reference some address 481 * register simplifies this. 482 */ 483 DECLARE_ARRAY(struct ir3_instruction *, a0_users); 484 485 /* same for a1.x: */ 486 DECLARE_ARRAY(struct ir3_instruction *, a1_users); 487 488 /* and same for instructions that consume predicate register: */ 489 DECLARE_ARRAY(struct ir3_instruction *, predicates); 490 491 /* Track texture sample instructions which need texture state 492 * patched in (for astc-srgb workaround): 493 */ 494 DECLARE_ARRAY(struct ir3_instruction *, astc_srgb); 495 496 /* List of blocks: */ 497 struct list_head block_list; 498 499 /* List of ir3_array's: */ 500 struct list_head array_list; 501 502#ifdef DEBUG 503 unsigned block_count; 504#endif 505 unsigned instr_count; 506}; 507 508struct ir3_array { 509 struct list_head node; 510 unsigned length; 511 unsigned id; 512 513 struct nir_register *r; 514 515 /* To avoid array write's from getting DCE'd, keep track of the 516 * most recent write. Any array access depends on the most 517 * recent write. This way, nothing depends on writes after the 518 * last read. But all the writes that happen before that have 519 * something depending on them 520 */ 521 struct ir3_register *last_write; 522 523 /* extra stuff used in RA pass: */ 524 unsigned base; /* base vreg name */ 525 unsigned reg; /* base physical reg */ 526 uint16_t start_ip, end_ip; 527 528 /* Indicates if half-precision */ 529 bool half; 530 531 bool unused; 532}; 533 534struct ir3_array *ir3_lookup_array(struct ir3 *ir, unsigned id); 535 536enum ir3_branch_type { 537 IR3_BRANCH_COND, /* condition */ 538 IR3_BRANCH_ANY, /* subgroupAny(condition) */ 539 IR3_BRANCH_ALL, /* subgroupAll(condition) */ 540 IR3_BRANCH_GETONE, /* subgroupElect() */ 541}; 542 543struct ir3_block { 544 struct list_head node; 545 struct ir3 *shader; 546 547 const struct nir_block *nblock; 548 549 struct list_head instr_list; /* list of ir3_instruction */ 550 551 /* The actual branch condition, if there are two successors */ 552 enum ir3_branch_type brtype; 553 554 /* each block has either one or two successors.. in case of two 555 * successors, 'condition' decides which one to follow. A block preceding 556 * an if/else has two successors. 557 * 558 * In some cases the path that the machine actually takes through the 559 * program may not match the per-thread view of the CFG. In particular 560 * this is the case for if/else, where the machine jumps from the end of 561 * the if to the beginning of the else and switches active lanes. While 562 * most things only care about the per-thread view, we need to use the 563 * "physical" view when allocating shared registers. "successors" contains 564 * the per-thread successors, and "physical_successors" contains the 565 * physical successors which includes the fallthrough edge from the if to 566 * the else. 567 */ 568 struct ir3_instruction *condition; 569 struct ir3_block *successors[2]; 570 struct ir3_block *physical_successors[2]; 571 572 DECLARE_ARRAY(struct ir3_block *, predecessors); 573 DECLARE_ARRAY(struct ir3_block *, physical_predecessors); 574 575 uint16_t start_ip, end_ip; 576 577 /* Track instructions which do not write a register but other- 578 * wise must not be discarded (such as kill, stg, etc) 579 */ 580 DECLARE_ARRAY(struct ir3_instruction *, keeps); 581 582 /* used for per-pass extra block data. Mainly used right 583 * now in RA step to track livein/liveout. 584 */ 585 void *data; 586 587 uint32_t index; 588 589 struct ir3_block *imm_dom; 590 DECLARE_ARRAY(struct ir3_block *, dom_children); 591 592 uint32_t dom_pre_index; 593 uint32_t dom_post_index; 594 595 uint32_t loop_id; 596 uint32_t loop_depth; 597 598#ifdef DEBUG 599 uint32_t serialno; 600#endif 601}; 602 603static inline uint32_t 604block_id(struct ir3_block *block) 605{ 606#ifdef DEBUG 607 return block->serialno; 608#else 609 return (uint32_t)(unsigned long)block; 610#endif 611} 612 613static inline struct ir3_block * 614ir3_start_block(struct ir3 *ir) 615{ 616 return list_first_entry(&ir->block_list, struct ir3_block, node); 617} 618 619void ir3_block_add_predecessor(struct ir3_block *block, struct ir3_block *pred); 620void ir3_block_add_physical_predecessor(struct ir3_block *block, 621 struct ir3_block *pred); 622void ir3_block_remove_predecessor(struct ir3_block *block, 623 struct ir3_block *pred); 624void ir3_block_remove_physical_predecessor(struct ir3_block *block, 625 struct ir3_block *pred); 626unsigned ir3_block_get_pred_index(struct ir3_block *block, 627 struct ir3_block *pred); 628 629void ir3_calc_dominance(struct ir3 *ir); 630bool ir3_block_dominates(struct ir3_block *a, struct ir3_block *b); 631 632struct ir3_shader_variant; 633 634struct ir3 *ir3_create(struct ir3_compiler *compiler, 635 struct ir3_shader_variant *v); 636void ir3_destroy(struct ir3 *shader); 637 638void ir3_collect_info(struct ir3_shader_variant *v); 639void *ir3_alloc(struct ir3 *shader, int sz); 640 641unsigned ir3_get_reg_dependent_max_waves(const struct ir3_compiler *compiler, 642 unsigned reg_count, 643 bool double_threadsize); 644 645unsigned ir3_get_reg_independent_max_waves(struct ir3_shader_variant *v, 646 bool double_threadsize); 647 648bool ir3_should_double_threadsize(struct ir3_shader_variant *v, 649 unsigned regs_count); 650 651struct ir3_block *ir3_block_create(struct ir3 *shader); 652 653struct ir3_instruction *ir3_instr_create(struct ir3_block *block, opc_t opc, 654 int ndst, int nsrc); 655struct ir3_instruction *ir3_instr_clone(struct ir3_instruction *instr); 656void ir3_instr_add_dep(struct ir3_instruction *instr, 657 struct ir3_instruction *dep); 658const char *ir3_instr_name(struct ir3_instruction *instr); 659 660struct ir3_register *ir3_src_create(struct ir3_instruction *instr, int num, 661 int flags); 662struct ir3_register *ir3_dst_create(struct ir3_instruction *instr, int num, 663 int flags); 664struct ir3_register *ir3_reg_clone(struct ir3 *shader, 665 struct ir3_register *reg); 666 667static inline void 668ir3_reg_tie(struct ir3_register *dst, struct ir3_register *src) 669{ 670 assert(!dst->tied && !src->tied); 671 dst->tied = src; 672 src->tied = dst; 673} 674 675void ir3_reg_set_last_array(struct ir3_instruction *instr, 676 struct ir3_register *reg, 677 struct ir3_register *last_write); 678 679void ir3_instr_set_address(struct ir3_instruction *instr, 680 struct ir3_instruction *addr); 681 682static inline bool 683ir3_instr_check_mark(struct ir3_instruction *instr) 684{ 685 if (instr->flags & IR3_INSTR_MARK) 686 return true; /* already visited */ 687 instr->flags |= IR3_INSTR_MARK; 688 return false; 689} 690 691void ir3_block_clear_mark(struct ir3_block *block); 692void ir3_clear_mark(struct ir3 *shader); 693 694unsigned ir3_count_instructions(struct ir3 *ir); 695unsigned ir3_count_instructions_ra(struct ir3 *ir); 696 697/** 698 * Move 'instr' to just before 'after' 699 */ 700static inline void 701ir3_instr_move_before(struct ir3_instruction *instr, 702 struct ir3_instruction *after) 703{ 704 list_delinit(&instr->node); 705 list_addtail(&instr->node, &after->node); 706} 707 708/** 709 * Move 'instr' to just after 'before': 710 */ 711static inline void 712ir3_instr_move_after(struct ir3_instruction *instr, 713 struct ir3_instruction *before) 714{ 715 list_delinit(&instr->node); 716 list_add(&instr->node, &before->node); 717} 718 719/** 720 * Move 'instr' to the beginning of the block: 721 */ 722static inline void 723ir3_instr_move_before_block(struct ir3_instruction *instr, 724 struct ir3_block *block) 725{ 726 list_delinit(&instr->node); 727 list_add(&instr->node, &block->instr_list); 728} 729 730void ir3_find_ssa_uses(struct ir3 *ir, void *mem_ctx, bool falsedeps); 731 732void ir3_set_dst_type(struct ir3_instruction *instr, bool half); 733void ir3_fixup_src_type(struct ir3_instruction *instr); 734 735int ir3_flut(struct ir3_register *src_reg); 736 737bool ir3_valid_flags(struct ir3_instruction *instr, unsigned n, unsigned flags); 738 739bool ir3_valid_immediate(struct ir3_instruction *instr, int32_t immed); 740 741#include "util/set.h" 742#define foreach_ssa_use(__use, __instr) \ 743 for (struct ir3_instruction *__use = (void *)~0; __use && (__instr)->uses; \ 744 __use = NULL) \ 745 set_foreach ((__instr)->uses, __entry) \ 746 if ((__use = (void *)__entry->key)) 747 748static inline uint32_t 749reg_num(const struct ir3_register *reg) 750{ 751 return reg->num >> 2; 752} 753 754static inline uint32_t 755reg_comp(const struct ir3_register *reg) 756{ 757 return reg->num & 0x3; 758} 759 760static inline bool 761is_flow(struct ir3_instruction *instr) 762{ 763 return (opc_cat(instr->opc) == 0); 764} 765 766static inline bool 767is_kill_or_demote(struct ir3_instruction *instr) 768{ 769 return instr->opc == OPC_KILL || instr->opc == OPC_DEMOTE; 770} 771 772static inline bool 773is_nop(struct ir3_instruction *instr) 774{ 775 return instr->opc == OPC_NOP; 776} 777 778static inline bool 779is_same_type_reg(struct ir3_register *dst, struct ir3_register *src) 780{ 781 unsigned dst_type = (dst->flags & IR3_REG_HALF); 782 unsigned src_type = (src->flags & IR3_REG_HALF); 783 784 /* Treat shared->normal copies as same-type, because they can generally be 785 * folded, but not normal->shared copies. 786 */ 787 if (dst_type != src_type || 788 ((dst->flags & IR3_REG_SHARED) && !(src->flags & IR3_REG_SHARED))) 789 return false; 790 else 791 return true; 792} 793 794/* Is it a non-transformative (ie. not type changing) mov? This can 795 * also include absneg.s/absneg.f, which for the most part can be 796 * treated as a mov (single src argument). 797 */ 798static inline bool 799is_same_type_mov(struct ir3_instruction *instr) 800{ 801 struct ir3_register *dst; 802 803 switch (instr->opc) { 804 case OPC_MOV: 805 if (instr->cat1.src_type != instr->cat1.dst_type) 806 return false; 807 /* If the type of dest reg and src reg are different, 808 * it shouldn't be considered as same type mov 809 */ 810 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0])) 811 return false; 812 break; 813 case OPC_ABSNEG_F: 814 case OPC_ABSNEG_S: 815 if (instr->flags & IR3_INSTR_SAT) 816 return false; 817 /* If the type of dest reg and src reg are different, 818 * it shouldn't be considered as same type mov 819 */ 820 if (!is_same_type_reg(instr->dsts[0], instr->srcs[0])) 821 return false; 822 break; 823 case OPC_META_PHI: 824 return instr->srcs_count == 1; 825 default: 826 return false; 827 } 828 829 dst = instr->dsts[0]; 830 831 /* mov's that write to a0 or p0.x are special: */ 832 if (dst->num == regid(REG_P0, 0)) 833 return false; 834 if (reg_num(dst) == REG_A0) 835 return false; 836 837 if (dst->flags & (IR3_REG_RELATIV | IR3_REG_ARRAY)) 838 return false; 839 840 return true; 841} 842 843/* A move from const, which changes size but not type, can also be 844 * folded into dest instruction in some cases. 845 */ 846static inline bool 847is_const_mov(struct ir3_instruction *instr) 848{ 849 if (instr->opc != OPC_MOV) 850 return false; 851 852 if (!(instr->srcs[0]->flags & IR3_REG_CONST)) 853 return false; 854 855 type_t src_type = instr->cat1.src_type; 856 type_t dst_type = instr->cat1.dst_type; 857 858 return (type_float(src_type) && type_float(dst_type)) || 859 (type_uint(src_type) && type_uint(dst_type)) || 860 (type_sint(src_type) && type_sint(dst_type)); 861} 862 863static inline bool 864is_alu(struct ir3_instruction *instr) 865{ 866 return (1 <= opc_cat(instr->opc)) && (opc_cat(instr->opc) <= 3); 867} 868 869static inline bool 870is_sfu(struct ir3_instruction *instr) 871{ 872 return (opc_cat(instr->opc) == 4); 873} 874 875static inline bool 876is_tex(struct ir3_instruction *instr) 877{ 878 return (opc_cat(instr->opc) == 5); 879} 880 881static inline bool 882is_tex_or_prefetch(struct ir3_instruction *instr) 883{ 884 return is_tex(instr) || (instr->opc == OPC_META_TEX_PREFETCH); 885} 886 887static inline bool 888is_mem(struct ir3_instruction *instr) 889{ 890 return (opc_cat(instr->opc) == 6); 891} 892 893static inline bool 894is_barrier(struct ir3_instruction *instr) 895{ 896 return (opc_cat(instr->opc) == 7); 897} 898 899static inline bool 900is_half(struct ir3_instruction *instr) 901{ 902 return !!(instr->dsts[0]->flags & IR3_REG_HALF); 903} 904 905static inline bool 906is_shared(struct ir3_instruction *instr) 907{ 908 return !!(instr->dsts[0]->flags & IR3_REG_SHARED); 909} 910 911static inline bool 912is_store(struct ir3_instruction *instr) 913{ 914 /* these instructions, the "destination" register is 915 * actually a source, the address to store to. 916 */ 917 switch (instr->opc) { 918 case OPC_STG: 919 case OPC_STG_A: 920 case OPC_STGB: 921 case OPC_STIB: 922 case OPC_STP: 923 case OPC_STL: 924 case OPC_STLW: 925 case OPC_L2G: 926 case OPC_G2L: 927 return true; 928 default: 929 return false; 930 } 931} 932 933static inline bool 934is_load(struct ir3_instruction *instr) 935{ 936 switch (instr->opc) { 937 case OPC_LDG: 938 case OPC_LDG_A: 939 case OPC_LDGB: 940 case OPC_LDIB: 941 case OPC_LDL: 942 case OPC_LDP: 943 case OPC_L2G: 944 case OPC_LDLW: 945 case OPC_LDC: 946 case OPC_LDLV: 947 /* probably some others too.. */ 948 return true; 949 default: 950 return false; 951 } 952} 953 954static inline bool 955is_input(struct ir3_instruction *instr) 956{ 957 /* in some cases, ldlv is used to fetch varying without 958 * interpolation.. fortunately inloc is the first src 959 * register in either case 960 */ 961 switch (instr->opc) { 962 case OPC_LDLV: 963 case OPC_BARY_F: 964 return true; 965 default: 966 return false; 967 } 968} 969 970static inline bool 971is_bool(struct ir3_instruction *instr) 972{ 973 switch (instr->opc) { 974 case OPC_CMPS_F: 975 case OPC_CMPS_S: 976 case OPC_CMPS_U: 977 return true; 978 default: 979 return false; 980 } 981} 982 983static inline opc_t 984cat3_half_opc(opc_t opc) 985{ 986 switch (opc) { 987 case OPC_MAD_F32: 988 return OPC_MAD_F16; 989 case OPC_SEL_B32: 990 return OPC_SEL_B16; 991 case OPC_SEL_S32: 992 return OPC_SEL_S16; 993 case OPC_SEL_F32: 994 return OPC_SEL_F16; 995 case OPC_SAD_S32: 996 return OPC_SAD_S16; 997 default: 998 return opc; 999 } 1000} 1001 1002static inline opc_t 1003cat3_full_opc(opc_t opc) 1004{ 1005 switch (opc) { 1006 case OPC_MAD_F16: 1007 return OPC_MAD_F32; 1008 case OPC_SEL_B16: 1009 return OPC_SEL_B32; 1010 case OPC_SEL_S16: 1011 return OPC_SEL_S32; 1012 case OPC_SEL_F16: 1013 return OPC_SEL_F32; 1014 case OPC_SAD_S16: 1015 return OPC_SAD_S32; 1016 default: 1017 return opc; 1018 } 1019} 1020 1021static inline opc_t 1022cat4_half_opc(opc_t opc) 1023{ 1024 switch (opc) { 1025 case OPC_RSQ: 1026 return OPC_HRSQ; 1027 case OPC_LOG2: 1028 return OPC_HLOG2; 1029 case OPC_EXP2: 1030 return OPC_HEXP2; 1031 default: 1032 return opc; 1033 } 1034} 1035 1036static inline opc_t 1037cat4_full_opc(opc_t opc) 1038{ 1039 switch (opc) { 1040 case OPC_HRSQ: 1041 return OPC_RSQ; 1042 case OPC_HLOG2: 1043 return OPC_LOG2; 1044 case OPC_HEXP2: 1045 return OPC_EXP2; 1046 default: 1047 return opc; 1048 } 1049} 1050 1051static inline bool 1052is_meta(struct ir3_instruction *instr) 1053{ 1054 return (opc_cat(instr->opc) == -1); 1055} 1056 1057static inline unsigned 1058reg_elems(const struct ir3_register *reg) 1059{ 1060 if (reg->flags & IR3_REG_ARRAY) 1061 return reg->size; 1062 else 1063 return util_last_bit(reg->wrmask); 1064} 1065 1066static inline unsigned 1067reg_elem_size(const struct ir3_register *reg) 1068{ 1069 return (reg->flags & IR3_REG_HALF) ? 1 : 2; 1070} 1071 1072static inline unsigned 1073reg_size(const struct ir3_register *reg) 1074{ 1075 return reg_elems(reg) * reg_elem_size(reg); 1076} 1077 1078static inline unsigned 1079dest_regs(struct ir3_instruction *instr) 1080{ 1081 if (instr->dsts_count == 0) 1082 return 0; 1083 1084 debug_assert(instr->dsts_count == 1); 1085 return util_last_bit(instr->dsts[0]->wrmask); 1086} 1087 1088/* is dst a normal temp register: */ 1089static inline bool 1090is_dest_gpr(struct ir3_register *dst) 1091{ 1092 if (dst->wrmask == 0) 1093 return false; 1094 if ((reg_num(dst) == REG_A0) || (dst->num == regid(REG_P0, 0))) 1095 return false; 1096 return true; 1097} 1098 1099static inline bool 1100writes_gpr(struct ir3_instruction *instr) 1101{ 1102 if (dest_regs(instr) == 0) 1103 return false; 1104 return is_dest_gpr(instr->dsts[0]); 1105} 1106 1107static inline bool 1108writes_addr0(struct ir3_instruction *instr) 1109{ 1110 /* Note: only the first dest can write to a0.x */ 1111 if (instr->dsts_count > 0) { 1112 struct ir3_register *dst = instr->dsts[0]; 1113 return dst->num == regid(REG_A0, 0); 1114 } 1115 return false; 1116} 1117 1118static inline bool 1119writes_addr1(struct ir3_instruction *instr) 1120{ 1121 /* Note: only the first dest can write to a1.x */ 1122 if (instr->dsts_count > 0) { 1123 struct ir3_register *dst = instr->dsts[0]; 1124 return dst->num == regid(REG_A0, 1); 1125 } 1126 return false; 1127} 1128 1129static inline bool 1130writes_pred(struct ir3_instruction *instr) 1131{ 1132 /* Note: only the first dest can write to p0.x */ 1133 if (instr->dsts_count > 0) { 1134 struct ir3_register *dst = instr->dsts[0]; 1135 return reg_num(dst) == REG_P0; 1136 } 1137 return false; 1138} 1139 1140/* Is it something other than a normal register. Shared regs, p0, and a0/a1 1141 * are considered special here. Special registers are always accessed with one 1142 * size and never alias normal registers, even though a naive calculation 1143 * would sometimes make it seem like e.g. r30.z aliases a0.x. 1144 */ 1145static inline bool 1146is_reg_special(const struct ir3_register *reg) 1147{ 1148 return (reg->flags & IR3_REG_SHARED) || (reg_num(reg) == REG_A0) || 1149 (reg_num(reg) == REG_P0); 1150} 1151 1152/* Same as above but in cases where we don't have a register. r48.x and above 1153 * are shared/special. 1154 */ 1155static inline bool 1156is_reg_num_special(unsigned num) 1157{ 1158 return num >= 48 * 4; 1159} 1160 1161/* returns defining instruction for reg */ 1162/* TODO better name */ 1163static inline struct ir3_instruction * 1164ssa(struct ir3_register *reg) 1165{ 1166 if ((reg->flags & (IR3_REG_SSA | IR3_REG_ARRAY)) && reg->def) 1167 return reg->def->instr; 1168 return NULL; 1169} 1170 1171static inline bool 1172conflicts(struct ir3_register *a, struct ir3_register *b) 1173{ 1174 return (a && b) && (a->def != b->def); 1175} 1176 1177static inline bool 1178reg_gpr(struct ir3_register *r) 1179{ 1180 if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED)) 1181 return false; 1182 if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0)) 1183 return false; 1184 return true; 1185} 1186 1187static inline type_t 1188half_type(type_t type) 1189{ 1190 switch (type) { 1191 case TYPE_F32: 1192 return TYPE_F16; 1193 case TYPE_U32: 1194 return TYPE_U16; 1195 case TYPE_S32: 1196 return TYPE_S16; 1197 case TYPE_F16: 1198 case TYPE_U16: 1199 case TYPE_S16: 1200 return type; 1201 default: 1202 assert(0); 1203 return ~0; 1204 } 1205} 1206 1207static inline type_t 1208full_type(type_t type) 1209{ 1210 switch (type) { 1211 case TYPE_F16: 1212 return TYPE_F32; 1213 case TYPE_U16: 1214 return TYPE_U32; 1215 case TYPE_S16: 1216 return TYPE_S32; 1217 case TYPE_F32: 1218 case TYPE_U32: 1219 case TYPE_S32: 1220 return type; 1221 default: 1222 assert(0); 1223 return ~0; 1224 } 1225} 1226 1227/* some cat2 instructions (ie. those which are not float) can embed an 1228 * immediate: 1229 */ 1230static inline bool 1231ir3_cat2_int(opc_t opc) 1232{ 1233 switch (opc) { 1234 case OPC_ADD_U: 1235 case OPC_ADD_S: 1236 case OPC_SUB_U: 1237 case OPC_SUB_S: 1238 case OPC_CMPS_U: 1239 case OPC_CMPS_S: 1240 case OPC_MIN_U: 1241 case OPC_MIN_S: 1242 case OPC_MAX_U: 1243 case OPC_MAX_S: 1244 case OPC_CMPV_U: 1245 case OPC_CMPV_S: 1246 case OPC_MUL_U24: 1247 case OPC_MUL_S24: 1248 case OPC_MULL_U: 1249 case OPC_CLZ_S: 1250 case OPC_ABSNEG_S: 1251 case OPC_AND_B: 1252 case OPC_OR_B: 1253 case OPC_NOT_B: 1254 case OPC_XOR_B: 1255 case OPC_BFREV_B: 1256 case OPC_CLZ_B: 1257 case OPC_SHL_B: 1258 case OPC_SHR_B: 1259 case OPC_ASHR_B: 1260 case OPC_MGEN_B: 1261 case OPC_GETBIT_B: 1262 case OPC_CBITS_B: 1263 case OPC_BARY_F: 1264 return true; 1265 1266 default: 1267 return false; 1268 } 1269} 1270 1271/* map cat2 instruction to valid abs/neg flags: */ 1272static inline unsigned 1273ir3_cat2_absneg(opc_t opc) 1274{ 1275 switch (opc) { 1276 case OPC_ADD_F: 1277 case OPC_MIN_F: 1278 case OPC_MAX_F: 1279 case OPC_MUL_F: 1280 case OPC_SIGN_F: 1281 case OPC_CMPS_F: 1282 case OPC_ABSNEG_F: 1283 case OPC_CMPV_F: 1284 case OPC_FLOOR_F: 1285 case OPC_CEIL_F: 1286 case OPC_RNDNE_F: 1287 case OPC_RNDAZ_F: 1288 case OPC_TRUNC_F: 1289 case OPC_BARY_F: 1290 return IR3_REG_FABS | IR3_REG_FNEG; 1291 1292 case OPC_ADD_U: 1293 case OPC_ADD_S: 1294 case OPC_SUB_U: 1295 case OPC_SUB_S: 1296 case OPC_CMPS_U: 1297 case OPC_CMPS_S: 1298 case OPC_MIN_U: 1299 case OPC_MIN_S: 1300 case OPC_MAX_U: 1301 case OPC_MAX_S: 1302 case OPC_CMPV_U: 1303 case OPC_CMPV_S: 1304 case OPC_MUL_U24: 1305 case OPC_MUL_S24: 1306 case OPC_MULL_U: 1307 case OPC_CLZ_S: 1308 return 0; 1309 1310 case OPC_ABSNEG_S: 1311 return IR3_REG_SABS | IR3_REG_SNEG; 1312 1313 case OPC_AND_B: 1314 case OPC_OR_B: 1315 case OPC_NOT_B: 1316 case OPC_XOR_B: 1317 case OPC_BFREV_B: 1318 case OPC_CLZ_B: 1319 case OPC_SHL_B: 1320 case OPC_SHR_B: 1321 case OPC_ASHR_B: 1322 case OPC_MGEN_B: 1323 case OPC_GETBIT_B: 1324 case OPC_CBITS_B: 1325 return IR3_REG_BNOT; 1326 1327 default: 1328 return 0; 1329 } 1330} 1331 1332/* map cat3 instructions to valid abs/neg flags: */ 1333static inline unsigned 1334ir3_cat3_absneg(opc_t opc) 1335{ 1336 switch (opc) { 1337 case OPC_MAD_F16: 1338 case OPC_MAD_F32: 1339 case OPC_SEL_F16: 1340 case OPC_SEL_F32: 1341 return IR3_REG_FNEG; 1342 1343 case OPC_MAD_U16: 1344 case OPC_MADSH_U16: 1345 case OPC_MAD_S16: 1346 case OPC_MADSH_M16: 1347 case OPC_MAD_U24: 1348 case OPC_MAD_S24: 1349 case OPC_SEL_S16: 1350 case OPC_SEL_S32: 1351 case OPC_SAD_S16: 1352 case OPC_SAD_S32: 1353 /* neg *may* work on 3rd src.. */ 1354 1355 case OPC_SEL_B16: 1356 case OPC_SEL_B32: 1357 1358 case OPC_SHLG_B16: 1359 1360 default: 1361 return 0; 1362 } 1363} 1364 1365/* Return the type (float, int, or uint) the op uses when converting from the 1366 * internal result of the op (which is assumed to be the same size as the 1367 * sources) to the destination when they are not the same size. If F32 it does 1368 * a floating-point conversion, if U32 it does a truncation/zero-extension, if 1369 * S32 it does a truncation/sign-extension. "can_fold" will be false if it 1370 * doesn't do anything sensible or is unknown. 1371 */ 1372static inline type_t 1373ir3_output_conv_type(struct ir3_instruction *instr, bool *can_fold) 1374{ 1375 *can_fold = true; 1376 switch (instr->opc) { 1377 case OPC_ADD_F: 1378 case OPC_MUL_F: 1379 case OPC_BARY_F: 1380 case OPC_MAD_F32: 1381 case OPC_MAD_F16: 1382 return TYPE_F32; 1383 1384 case OPC_ADD_U: 1385 case OPC_SUB_U: 1386 case OPC_MIN_U: 1387 case OPC_MAX_U: 1388 case OPC_AND_B: 1389 case OPC_OR_B: 1390 case OPC_NOT_B: 1391 case OPC_XOR_B: 1392 case OPC_MUL_U24: 1393 case OPC_MULL_U: 1394 case OPC_SHL_B: 1395 case OPC_SHR_B: 1396 case OPC_ASHR_B: 1397 case OPC_MAD_U24: 1398 /* Comparison ops zero-extend/truncate their results, so consider them as 1399 * unsigned here. 1400 */ 1401 case OPC_CMPS_F: 1402 case OPC_CMPV_F: 1403 case OPC_CMPS_U: 1404 case OPC_CMPS_S: 1405 return TYPE_U32; 1406 1407 case OPC_ADD_S: 1408 case OPC_SUB_S: 1409 case OPC_MIN_S: 1410 case OPC_MAX_S: 1411 case OPC_ABSNEG_S: 1412 case OPC_MUL_S24: 1413 case OPC_MAD_S24: 1414 return TYPE_S32; 1415 1416 /* We assume that any move->move folding that could be done was done by 1417 * NIR. 1418 */ 1419 case OPC_MOV: 1420 default: 1421 *can_fold = false; 1422 return TYPE_U32; 1423 } 1424} 1425 1426/* Return the src and dst types for the conversion which is already folded 1427 * into the op. We can assume that instr has folded in a conversion from 1428 * ir3_output_conv_src_type() to ir3_output_conv_dst_type(). Only makes sense 1429 * to call if ir3_output_conv_type() returns can_fold = true. 1430 */ 1431static inline type_t 1432ir3_output_conv_src_type(struct ir3_instruction *instr, type_t base_type) 1433{ 1434 switch (instr->opc) { 1435 case OPC_CMPS_F: 1436 case OPC_CMPV_F: 1437 case OPC_CMPS_U: 1438 case OPC_CMPS_S: 1439 /* Comparisons only return 0/1 and the size of the comparison sources 1440 * is irrelevant, never consider them as having an output conversion 1441 * by returning a type with the dest size here: 1442 */ 1443 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) 1444 : full_type(base_type); 1445 1446 case OPC_BARY_F: 1447 /* bary.f doesn't have an explicit source, but we can assume here that 1448 * the varying data it reads is in fp32. 1449 * 1450 * This may be fp16 on older gen's depending on some register 1451 * settings, but it's probably not worth plumbing that through for a 1452 * small improvement that NIR would hopefully handle for us anyway. 1453 */ 1454 return TYPE_F32; 1455 1456 default: 1457 return (instr->srcs[0]->flags & IR3_REG_HALF) ? half_type(base_type) 1458 : full_type(base_type); 1459 } 1460} 1461 1462static inline type_t 1463ir3_output_conv_dst_type(struct ir3_instruction *instr, type_t base_type) 1464{ 1465 return (instr->dsts[0]->flags & IR3_REG_HALF) ? half_type(base_type) 1466 : full_type(base_type); 1467} 1468 1469/* Some instructions have signed/unsigned variants which are identical except 1470 * for whether the folded conversion sign-extends or zero-extends, and we can 1471 * fold in a mismatching move by rewriting the opcode. Return the opcode to 1472 * switch signedness, and whether one exists. 1473 */ 1474static inline opc_t 1475ir3_try_swap_signedness(opc_t opc, bool *can_swap) 1476{ 1477 switch (opc) { 1478#define PAIR(u, s) \ 1479 case OPC_##u: \ 1480 return OPC_##s; \ 1481 case OPC_##s: \ 1482 return OPC_##u; 1483 PAIR(ADD_U, ADD_S) 1484 PAIR(SUB_U, SUB_S) 1485 /* Note: these are only identical when the sources are half, but that's 1486 * the only case we call this function for anyway. 1487 */ 1488 PAIR(MUL_U24, MUL_S24) 1489 1490 default: 1491 *can_swap = false; 1492 return opc; 1493 } 1494} 1495 1496#define MASK(n) ((1 << (n)) - 1) 1497 1498/* iterator for an instructions's sources (reg), also returns src #: */ 1499#define foreach_src_n(__srcreg, __n, __instr) \ 1500 if ((__instr)->srcs_count) \ 1501 for (struct ir3_register *__srcreg = (void *)~0; __srcreg; \ 1502 __srcreg = NULL) \ 1503 for (unsigned __cnt = (__instr)->srcs_count, __n = 0; __n < __cnt; \ 1504 __n++) \ 1505 if ((__srcreg = (__instr)->srcs[__n])) 1506 1507/* iterator for an instructions's sources (reg): */ 1508#define foreach_src(__srcreg, __instr) foreach_src_n (__srcreg, __i, __instr) 1509 1510/* iterator for an instructions's destinations (reg), also returns dst #: */ 1511#define foreach_dst_n(__dstreg, __n, __instr) \ 1512 if ((__instr)->dsts_count) \ 1513 for (struct ir3_register *__dstreg = (void *)~0; __dstreg; \ 1514 __dstreg = NULL) \ 1515 for (unsigned __cnt = (__instr)->dsts_count, __n = 0; __n < __cnt; \ 1516 __n++) \ 1517 if ((__dstreg = (__instr)->dsts[__n])) 1518 1519/* iterator for an instructions's destinations (reg): */ 1520#define foreach_dst(__dstreg, __instr) foreach_dst_n (__dstreg, __i, __instr) 1521 1522static inline unsigned 1523__ssa_src_cnt(struct ir3_instruction *instr) 1524{ 1525 return instr->srcs_count + instr->deps_count; 1526} 1527 1528static inline bool 1529__is_false_dep(struct ir3_instruction *instr, unsigned n) 1530{ 1531 if (n >= instr->srcs_count) 1532 return true; 1533 return false; 1534} 1535 1536static inline struct ir3_instruction ** 1537__ssa_srcp_n(struct ir3_instruction *instr, unsigned n) 1538{ 1539 if (__is_false_dep(instr, n)) 1540 return &instr->deps[n - instr->srcs_count]; 1541 if (ssa(instr->srcs[n])) 1542 return &instr->srcs[n]->def->instr; 1543 return NULL; 1544} 1545 1546#define foreach_ssa_srcp_n(__srcp, __n, __instr) \ 1547 for (struct ir3_instruction **__srcp = (void *)~0; __srcp; __srcp = NULL) \ 1548 for (unsigned __cnt = __ssa_src_cnt(__instr), __n = 0; __n < __cnt; \ 1549 __n++) \ 1550 if ((__srcp = __ssa_srcp_n(__instr, __n))) 1551 1552#define foreach_ssa_srcp(__srcp, __instr) \ 1553 foreach_ssa_srcp_n (__srcp, __i, __instr) 1554 1555/* iterator for an instruction's SSA sources (instr), also returns src #: */ 1556#define foreach_ssa_src_n(__srcinst, __n, __instr) \ 1557 for (struct ir3_instruction *__srcinst = (void *)~0; __srcinst; \ 1558 __srcinst = NULL) \ 1559 foreach_ssa_srcp_n (__srcp, __n, __instr) \ 1560 if ((__srcinst = *__srcp)) 1561 1562/* iterator for an instruction's SSA sources (instr): */ 1563#define foreach_ssa_src(__srcinst, __instr) \ 1564 foreach_ssa_src_n (__srcinst, __i, __instr) 1565 1566/* iterators for shader inputs: */ 1567#define foreach_input_n(__ininstr, __cnt, __ir) \ 1568 for (struct ir3_instruction *__ininstr = (void *)~0; __ininstr; \ 1569 __ininstr = NULL) \ 1570 for (unsigned __cnt = 0; __cnt < (__ir)->inputs_count; __cnt++) \ 1571 if ((__ininstr = (__ir)->inputs[__cnt])) 1572#define foreach_input(__ininstr, __ir) foreach_input_n (__ininstr, __i, __ir) 1573 1574/* iterators for instructions: */ 1575#define foreach_instr(__instr, __list) \ 1576 list_for_each_entry (struct ir3_instruction, __instr, __list, node) 1577#define foreach_instr_rev(__instr, __list) \ 1578 list_for_each_entry_rev (struct ir3_instruction, __instr, __list, node) 1579#define foreach_instr_safe(__instr, __list) \ 1580 list_for_each_entry_safe (struct ir3_instruction, __instr, __list, node) 1581#define foreach_instr_from_safe(__instr, __start, __list) \ 1582 list_for_each_entry_from_safe(struct ir3_instruction, __instr, __start, \ 1583 __list, node) 1584 1585/* iterators for blocks: */ 1586#define foreach_block(__block, __list) \ 1587 list_for_each_entry (struct ir3_block, __block, __list, node) 1588#define foreach_block_safe(__block, __list) \ 1589 list_for_each_entry_safe (struct ir3_block, __block, __list, node) 1590#define foreach_block_rev(__block, __list) \ 1591 list_for_each_entry_rev (struct ir3_block, __block, __list, node) 1592 1593/* iterators for arrays: */ 1594#define foreach_array(__array, __list) \ 1595 list_for_each_entry (struct ir3_array, __array, __list, node) 1596#define foreach_array_safe(__array, __list) \ 1597 list_for_each_entry_safe (struct ir3_array, __array, __list, node) 1598 1599#define IR3_PASS(ir, pass, ...) \ 1600 ({ \ 1601 bool progress = pass(ir, ##__VA_ARGS__); \ 1602 if (progress) { \ 1603 ir3_debug_print(ir, "AFTER: " #pass); \ 1604 ir3_validate(ir); \ 1605 } \ 1606 progress; \ 1607 }) 1608 1609/* validate: */ 1610void ir3_validate(struct ir3 *ir); 1611 1612/* dump: */ 1613void ir3_print(struct ir3 *ir); 1614void ir3_print_instr(struct ir3_instruction *instr); 1615 1616struct log_stream; 1617void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr); 1618 1619/* delay calculation: */ 1620int ir3_delayslots(struct ir3_instruction *assigner, 1621 struct ir3_instruction *consumer, unsigned n, bool soft); 1622unsigned ir3_delay_calc_prera(struct ir3_block *block, 1623 struct ir3_instruction *instr); 1624unsigned ir3_delay_calc_postra(struct ir3_block *block, 1625 struct ir3_instruction *instr, bool soft, 1626 bool mergedregs); 1627unsigned ir3_delay_calc_exact(struct ir3_block *block, 1628 struct ir3_instruction *instr, bool mergedregs); 1629void ir3_remove_nops(struct ir3 *ir); 1630 1631/* unreachable block elimination: */ 1632bool ir3_remove_unreachable(struct ir3 *ir); 1633 1634/* dead code elimination: */ 1635struct ir3_shader_variant; 1636bool ir3_dce(struct ir3 *ir, struct ir3_shader_variant *so); 1637 1638/* fp16 conversion folding */ 1639bool ir3_cf(struct ir3 *ir); 1640 1641/* copy-propagate: */ 1642bool ir3_cp(struct ir3 *ir, struct ir3_shader_variant *so); 1643bool ir3_cp_postsched(struct ir3 *ir); 1644 1645/* common subexpression elimination: */ 1646bool ir3_cse(struct ir3 *ir); 1647 1648/* Make arrays SSA */ 1649bool ir3_array_to_ssa(struct ir3 *ir); 1650 1651/* scheduling: */ 1652bool ir3_sched_add_deps(struct ir3 *ir); 1653int ir3_sched(struct ir3 *ir); 1654 1655struct ir3_context; 1656bool ir3_postsched(struct ir3 *ir, struct ir3_shader_variant *v); 1657 1658/* register assignment: */ 1659int ir3_ra(struct ir3_shader_variant *v); 1660 1661/* lower subgroup ops: */ 1662bool ir3_lower_subgroups(struct ir3 *ir); 1663 1664/* legalize: */ 1665bool ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary); 1666 1667static inline bool 1668ir3_has_latency_to_hide(struct ir3 *ir) 1669{ 1670 /* VS/GS/TCS/TESS co-exist with frag shader invocations, but we don't 1671 * know the nature of the fragment shader. Just assume it will have 1672 * latency to hide: 1673 */ 1674 if (ir->type != MESA_SHADER_FRAGMENT) 1675 return true; 1676 1677 foreach_block (block, &ir->block_list) { 1678 foreach_instr (instr, &block->instr_list) { 1679 if (is_tex_or_prefetch(instr)) 1680 return true; 1681 1682 if (is_load(instr)) { 1683 switch (instr->opc) { 1684 case OPC_LDLV: 1685 case OPC_LDL: 1686 case OPC_LDLW: 1687 break; 1688 default: 1689 return true; 1690 } 1691 } 1692 } 1693 } 1694 1695 return false; 1696} 1697 1698/* ************************************************************************* */ 1699/* instruction helpers */ 1700 1701/* creates SSA src of correct type (ie. half vs full precision) */ 1702static inline struct ir3_register * 1703__ssa_src(struct ir3_instruction *instr, struct ir3_instruction *src, 1704 unsigned flags) 1705{ 1706 struct ir3_register *reg; 1707 if (src->dsts[0]->flags & IR3_REG_HALF) 1708 flags |= IR3_REG_HALF; 1709 reg = ir3_src_create(instr, INVALID_REG, IR3_REG_SSA | flags); 1710 reg->def = src->dsts[0]; 1711 reg->wrmask = src->dsts[0]->wrmask; 1712 return reg; 1713} 1714 1715static inline struct ir3_register * 1716__ssa_dst(struct ir3_instruction *instr) 1717{ 1718 struct ir3_register *reg = ir3_dst_create(instr, INVALID_REG, IR3_REG_SSA); 1719 reg->instr = instr; 1720 return reg; 1721} 1722 1723static inline struct ir3_instruction * 1724create_immed_typed(struct ir3_block *block, uint32_t val, type_t type) 1725{ 1726 struct ir3_instruction *mov; 1727 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; 1728 1729 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1730 mov->cat1.src_type = type; 1731 mov->cat1.dst_type = type; 1732 __ssa_dst(mov)->flags |= flags; 1733 ir3_src_create(mov, 0, IR3_REG_IMMED | flags)->uim_val = val; 1734 1735 return mov; 1736} 1737 1738static inline struct ir3_instruction * 1739create_immed(struct ir3_block *block, uint32_t val) 1740{ 1741 return create_immed_typed(block, val, TYPE_U32); 1742} 1743 1744static inline struct ir3_instruction * 1745create_uniform_typed(struct ir3_block *block, unsigned n, type_t type) 1746{ 1747 struct ir3_instruction *mov; 1748 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; 1749 1750 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1751 mov->cat1.src_type = type; 1752 mov->cat1.dst_type = type; 1753 __ssa_dst(mov)->flags |= flags; 1754 ir3_src_create(mov, n, IR3_REG_CONST | flags); 1755 1756 return mov; 1757} 1758 1759static inline struct ir3_instruction * 1760create_uniform(struct ir3_block *block, unsigned n) 1761{ 1762 return create_uniform_typed(block, n, TYPE_F32); 1763} 1764 1765static inline struct ir3_instruction * 1766create_uniform_indirect(struct ir3_block *block, int n, type_t type, 1767 struct ir3_instruction *address) 1768{ 1769 struct ir3_instruction *mov; 1770 1771 mov = ir3_instr_create(block, OPC_MOV, 1, 1); 1772 mov->cat1.src_type = type; 1773 mov->cat1.dst_type = type; 1774 __ssa_dst(mov); 1775 ir3_src_create(mov, 0, IR3_REG_CONST | IR3_REG_RELATIV)->array.offset = n; 1776 1777 ir3_instr_set_address(mov, address); 1778 1779 return mov; 1780} 1781 1782static inline struct ir3_instruction * 1783ir3_MOV(struct ir3_block *block, struct ir3_instruction *src, type_t type) 1784{ 1785 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1); 1786 unsigned flags = (type_size(type) < 32) ? IR3_REG_HALF : 0; 1787 1788 __ssa_dst(instr)->flags |= flags; 1789 if (src->dsts[0]->flags & IR3_REG_ARRAY) { 1790 struct ir3_register *src_reg = __ssa_src(instr, src, IR3_REG_ARRAY); 1791 src_reg->array = src->dsts[0]->array; 1792 } else { 1793 __ssa_src(instr, src, src->dsts[0]->flags & IR3_REG_SHARED); 1794 } 1795 debug_assert(!(src->dsts[0]->flags & IR3_REG_RELATIV)); 1796 instr->cat1.src_type = type; 1797 instr->cat1.dst_type = type; 1798 return instr; 1799} 1800 1801static inline struct ir3_instruction * 1802ir3_COV(struct ir3_block *block, struct ir3_instruction *src, type_t src_type, 1803 type_t dst_type) 1804{ 1805 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOV, 1, 1); 1806 unsigned dst_flags = (type_size(dst_type) < 32) ? IR3_REG_HALF : 0; 1807 unsigned src_flags = (type_size(src_type) < 32) ? IR3_REG_HALF : 0; 1808 1809 debug_assert((src->dsts[0]->flags & IR3_REG_HALF) == src_flags); 1810 1811 __ssa_dst(instr)->flags |= dst_flags; 1812 __ssa_src(instr, src, 0); 1813 instr->cat1.src_type = src_type; 1814 instr->cat1.dst_type = dst_type; 1815 debug_assert(!(src->dsts[0]->flags & IR3_REG_ARRAY)); 1816 return instr; 1817} 1818 1819static inline struct ir3_instruction * 1820ir3_MOVMSK(struct ir3_block *block, unsigned components) 1821{ 1822 struct ir3_instruction *instr = ir3_instr_create(block, OPC_MOVMSK, 1, 0); 1823 1824 struct ir3_register *dst = __ssa_dst(instr); 1825 dst->flags |= IR3_REG_SHARED; 1826 dst->wrmask = (1 << components) - 1; 1827 instr->repeat = components - 1; 1828 return instr; 1829} 1830 1831static inline struct ir3_instruction * 1832ir3_BALLOT_MACRO(struct ir3_block *block, struct ir3_instruction *src, 1833 unsigned components) 1834{ 1835 struct ir3_instruction *instr = 1836 ir3_instr_create(block, OPC_BALLOT_MACRO, 1, 1); 1837 1838 struct ir3_register *dst = __ssa_dst(instr); 1839 dst->flags |= IR3_REG_SHARED; 1840 dst->wrmask = (1 << components) - 1; 1841 1842 __ssa_src(instr, src, 0); 1843 1844 return instr; 1845} 1846 1847static inline struct ir3_instruction * 1848ir3_NOP(struct ir3_block *block) 1849{ 1850 return ir3_instr_create(block, OPC_NOP, 0, 0); 1851} 1852 1853#define IR3_INSTR_0 0 1854 1855/* clang-format off */ 1856#define __INSTR0(flag, name, opc) \ 1857static inline struct ir3_instruction *ir3_##name(struct ir3_block *block) \ 1858{ \ 1859 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 0); \ 1860 instr->flags |= flag; \ 1861 return instr; \ 1862} 1863/* clang-format on */ 1864#define INSTR0F(f, name) __INSTR0(IR3_INSTR_##f, name##_##f, OPC_##name) 1865#define INSTR0(name) __INSTR0(0, name, OPC_##name) 1866 1867/* clang-format off */ 1868#define __INSTR1(flag, dst_count, name, opc) \ 1869static inline struct ir3_instruction *ir3_##name( \ 1870 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags) \ 1871{ \ 1872 struct ir3_instruction *instr = \ 1873 ir3_instr_create(block, opc, dst_count, 1); \ 1874 for (unsigned i = 0; i < dst_count; i++) \ 1875 __ssa_dst(instr); \ 1876 __ssa_src(instr, a, aflags); \ 1877 instr->flags |= flag; \ 1878 return instr; \ 1879} 1880/* clang-format on */ 1881#define INSTR1F(f, name) __INSTR1(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1882#define INSTR1(name) __INSTR1(0, 1, name, OPC_##name) 1883#define INSTR1NODST(name) __INSTR1(0, 0, name, OPC_##name) 1884 1885/* clang-format off */ 1886#define __INSTR2(flag, name, opc) \ 1887static inline struct ir3_instruction *ir3_##name( \ 1888 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1889 struct ir3_instruction *b, unsigned bflags) \ 1890{ \ 1891 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 2); \ 1892 __ssa_dst(instr); \ 1893 __ssa_src(instr, a, aflags); \ 1894 __ssa_src(instr, b, bflags); \ 1895 instr->flags |= flag; \ 1896 return instr; \ 1897} 1898/* clang-format on */ 1899#define INSTR2F(f, name) __INSTR2(IR3_INSTR_##f, name##_##f, OPC_##name) 1900#define INSTR2(name) __INSTR2(0, name, OPC_##name) 1901 1902/* clang-format off */ 1903#define __INSTR3(flag, dst_count, name, opc) \ 1904static inline struct ir3_instruction *ir3_##name( \ 1905 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1906 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1907 unsigned cflags) \ 1908{ \ 1909 struct ir3_instruction *instr = \ 1910 ir3_instr_create(block, opc, dst_count, 3); \ 1911 for (unsigned i = 0; i < dst_count; i++) \ 1912 __ssa_dst(instr); \ 1913 __ssa_src(instr, a, aflags); \ 1914 __ssa_src(instr, b, bflags); \ 1915 __ssa_src(instr, c, cflags); \ 1916 instr->flags |= flag; \ 1917 return instr; \ 1918} 1919/* clang-format on */ 1920#define INSTR3F(f, name) __INSTR3(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1921#define INSTR3(name) __INSTR3(0, 1, name, OPC_##name) 1922#define INSTR3NODST(name) __INSTR3(0, 0, name, OPC_##name) 1923 1924/* clang-format off */ 1925#define __INSTR4(flag, dst_count, name, opc) \ 1926static inline struct ir3_instruction *ir3_##name( \ 1927 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1928 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1929 unsigned cflags, struct ir3_instruction *d, unsigned dflags) \ 1930{ \ 1931 struct ir3_instruction *instr = \ 1932 ir3_instr_create(block, opc, dst_count, 4); \ 1933 for (unsigned i = 0; i < dst_count; i++) \ 1934 __ssa_dst(instr); \ 1935 __ssa_src(instr, a, aflags); \ 1936 __ssa_src(instr, b, bflags); \ 1937 __ssa_src(instr, c, cflags); \ 1938 __ssa_src(instr, d, dflags); \ 1939 instr->flags |= flag; \ 1940 return instr; \ 1941} 1942/* clang-format on */ 1943#define INSTR4F(f, name) __INSTR4(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1944#define INSTR4(name) __INSTR4(0, 1, name, OPC_##name) 1945#define INSTR4NODST(name) __INSTR4(0, 0, name, OPC_##name) 1946 1947/* clang-format off */ 1948#define __INSTR5(flag, name, opc) \ 1949static inline struct ir3_instruction *ir3_##name( \ 1950 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1951 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1952 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \ 1953 struct ir3_instruction *e, unsigned eflags) \ 1954{ \ 1955 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 5); \ 1956 __ssa_dst(instr); \ 1957 __ssa_src(instr, a, aflags); \ 1958 __ssa_src(instr, b, bflags); \ 1959 __ssa_src(instr, c, cflags); \ 1960 __ssa_src(instr, d, dflags); \ 1961 __ssa_src(instr, e, eflags); \ 1962 instr->flags |= flag; \ 1963 return instr; \ 1964} 1965/* clang-format on */ 1966#define INSTR5F(f, name) __INSTR5(IR3_INSTR_##f, name##_##f, OPC_##name) 1967#define INSTR5(name) __INSTR5(0, name, OPC_##name) 1968 1969/* clang-format off */ 1970#define __INSTR6(flag, dst_count, name, opc) \ 1971static inline struct ir3_instruction *ir3_##name( \ 1972 struct ir3_block *block, struct ir3_instruction *a, unsigned aflags, \ 1973 struct ir3_instruction *b, unsigned bflags, struct ir3_instruction *c, \ 1974 unsigned cflags, struct ir3_instruction *d, unsigned dflags, \ 1975 struct ir3_instruction *e, unsigned eflags, struct ir3_instruction *f, \ 1976 unsigned fflags) \ 1977{ \ 1978 struct ir3_instruction *instr = ir3_instr_create(block, opc, 1, 6); \ 1979 for (unsigned i = 0; i < dst_count; i++) \ 1980 __ssa_dst(instr); \ 1981 __ssa_src(instr, a, aflags); \ 1982 __ssa_src(instr, b, bflags); \ 1983 __ssa_src(instr, c, cflags); \ 1984 __ssa_src(instr, d, dflags); \ 1985 __ssa_src(instr, e, eflags); \ 1986 __ssa_src(instr, f, fflags); \ 1987 instr->flags |= flag; \ 1988 return instr; \ 1989} 1990/* clang-format on */ 1991#define INSTR6F(f, name) __INSTR6(IR3_INSTR_##f, 1, name##_##f, OPC_##name) 1992#define INSTR6(name) __INSTR6(0, 1, name, OPC_##name) 1993#define INSTR6NODST(name) __INSTR6(0, 0, name, OPC_##name) 1994 1995/* cat0 instructions: */ 1996INSTR1NODST(B) 1997INSTR0(JUMP) 1998INSTR1NODST(KILL) 1999INSTR1NODST(DEMOTE) 2000INSTR0(END) 2001INSTR0(CHSH) 2002INSTR0(CHMASK) 2003INSTR1NODST(PREDT) 2004INSTR0(PREDF) 2005INSTR0(PREDE) 2006INSTR0(GETONE) 2007 2008/* cat1 macros */ 2009INSTR1(ANY_MACRO) 2010INSTR1(ALL_MACRO) 2011INSTR1(READ_FIRST_MACRO) 2012INSTR2(READ_COND_MACRO) 2013 2014static inline struct ir3_instruction * 2015ir3_ELECT_MACRO(struct ir3_block *block) 2016{ 2017 struct ir3_instruction *instr = 2018 ir3_instr_create(block, OPC_ELECT_MACRO, 1, 0); 2019 __ssa_dst(instr); 2020 return instr; 2021} 2022 2023/* cat2 instructions, most 2 src but some 1 src: */ 2024INSTR2(ADD_F) 2025INSTR2(MIN_F) 2026INSTR2(MAX_F) 2027INSTR2(MUL_F) 2028INSTR1(SIGN_F) 2029INSTR2(CMPS_F) 2030INSTR1(ABSNEG_F) 2031INSTR2(CMPV_F) 2032INSTR1(FLOOR_F) 2033INSTR1(CEIL_F) 2034INSTR1(RNDNE_F) 2035INSTR1(RNDAZ_F) 2036INSTR1(TRUNC_F) 2037INSTR2(ADD_U) 2038INSTR2(ADD_S) 2039INSTR2(SUB_U) 2040INSTR2(SUB_S) 2041INSTR2(CMPS_U) 2042INSTR2(CMPS_S) 2043INSTR2(MIN_U) 2044INSTR2(MIN_S) 2045INSTR2(MAX_U) 2046INSTR2(MAX_S) 2047INSTR1(ABSNEG_S) 2048INSTR2(AND_B) 2049INSTR2(OR_B) 2050INSTR1(NOT_B) 2051INSTR2(XOR_B) 2052INSTR2(CMPV_U) 2053INSTR2(CMPV_S) 2054INSTR2(MUL_U24) 2055INSTR2(MUL_S24) 2056INSTR2(MULL_U) 2057INSTR1(BFREV_B) 2058INSTR1(CLZ_S) 2059INSTR1(CLZ_B) 2060INSTR2(SHL_B) 2061INSTR2(SHR_B) 2062INSTR2(ASHR_B) 2063INSTR2(BARY_F) 2064INSTR2(MGEN_B) 2065INSTR2(GETBIT_B) 2066INSTR1(SETRM) 2067INSTR1(CBITS_B) 2068INSTR2(SHB) 2069INSTR2(MSAD) 2070 2071/* cat3 instructions: */ 2072INSTR3(MAD_U16) 2073INSTR3(MADSH_U16) 2074INSTR3(MAD_S16) 2075INSTR3(MADSH_M16) 2076INSTR3(MAD_U24) 2077INSTR3(MAD_S24) 2078INSTR3(MAD_F16) 2079INSTR3(MAD_F32) 2080/* NOTE: SEL_B32 checks for zero vs nonzero */ 2081INSTR3(SEL_B16) 2082INSTR3(SEL_B32) 2083INSTR3(SEL_S16) 2084INSTR3(SEL_S32) 2085INSTR3(SEL_F16) 2086INSTR3(SEL_F32) 2087INSTR3(SAD_S16) 2088INSTR3(SAD_S32) 2089 2090/* cat4 instructions: */ 2091INSTR1(RCP) 2092INSTR1(RSQ) 2093INSTR1(HRSQ) 2094INSTR1(LOG2) 2095INSTR1(HLOG2) 2096INSTR1(EXP2) 2097INSTR1(HEXP2) 2098INSTR1(SIN) 2099INSTR1(COS) 2100INSTR1(SQRT) 2101 2102/* cat5 instructions: */ 2103INSTR1(DSX) 2104INSTR1(DSXPP_MACRO) 2105INSTR1(DSY) 2106INSTR1(DSYPP_MACRO) 2107INSTR1F(3D, DSX) 2108INSTR1F(3D, DSY) 2109INSTR1(RGETPOS) 2110 2111static inline struct ir3_instruction * 2112ir3_SAM(struct ir3_block *block, opc_t opc, type_t type, unsigned wrmask, 2113 unsigned flags, struct ir3_instruction *samp_tex, 2114 struct ir3_instruction *src0, struct ir3_instruction *src1) 2115{ 2116 struct ir3_instruction *sam; 2117 unsigned nreg = 0; 2118 2119 if (flags & IR3_INSTR_S2EN) { 2120 nreg++; 2121 } 2122 if (src0) { 2123 nreg++; 2124 } 2125 if (src1) { 2126 nreg++; 2127 } 2128 2129 sam = ir3_instr_create(block, opc, 1, nreg); 2130 sam->flags |= flags; 2131 __ssa_dst(sam)->wrmask = wrmask; 2132 if (flags & IR3_INSTR_S2EN) { 2133 __ssa_src(sam, samp_tex, (flags & IR3_INSTR_B) ? 0 : IR3_REG_HALF); 2134 } 2135 if (src0) { 2136 __ssa_src(sam, src0, 0); 2137 } 2138 if (src1) { 2139 __ssa_src(sam, src1, 0); 2140 } 2141 sam->cat5.type = type; 2142 2143 return sam; 2144} 2145 2146/* cat6 instructions: */ 2147INSTR2(LDLV) 2148INSTR3(LDG) 2149INSTR3(LDL) 2150INSTR3(LDLW) 2151INSTR3(LDP) 2152INSTR4NODST(STG) 2153INSTR3NODST(STL) 2154INSTR3NODST(STLW) 2155INSTR3NODST(STP) 2156INSTR1(RESINFO) 2157INSTR1(RESFMT) 2158INSTR2(ATOMIC_ADD) 2159INSTR2(ATOMIC_SUB) 2160INSTR2(ATOMIC_XCHG) 2161INSTR2(ATOMIC_INC) 2162INSTR2(ATOMIC_DEC) 2163INSTR2(ATOMIC_CMPXCHG) 2164INSTR2(ATOMIC_MIN) 2165INSTR2(ATOMIC_MAX) 2166INSTR2(ATOMIC_AND) 2167INSTR2(ATOMIC_OR) 2168INSTR2(ATOMIC_XOR) 2169INSTR2(LDC) 2170#if GPU >= 600 2171INSTR3NODST(STIB); 2172INSTR2(LDIB); 2173INSTR5(LDG_A); 2174INSTR6NODST(STG_A); 2175INSTR3F(G, ATOMIC_ADD) 2176INSTR3F(G, ATOMIC_SUB) 2177INSTR3F(G, ATOMIC_XCHG) 2178INSTR3F(G, ATOMIC_INC) 2179INSTR3F(G, ATOMIC_DEC) 2180INSTR3F(G, ATOMIC_CMPXCHG) 2181INSTR3F(G, ATOMIC_MIN) 2182INSTR3F(G, ATOMIC_MAX) 2183INSTR3F(G, ATOMIC_AND) 2184INSTR3F(G, ATOMIC_OR) 2185INSTR3F(G, ATOMIC_XOR) 2186#elif GPU >= 400 2187INSTR3(LDGB) 2188#if GPU >= 500 2189INSTR3(LDIB) 2190#endif 2191INSTR4NODST(STGB) 2192INSTR4NODST(STIB) 2193INSTR4F(G, ATOMIC_ADD) 2194INSTR4F(G, ATOMIC_SUB) 2195INSTR4F(G, ATOMIC_XCHG) 2196INSTR4F(G, ATOMIC_INC) 2197INSTR4F(G, ATOMIC_DEC) 2198INSTR4F(G, ATOMIC_CMPXCHG) 2199INSTR4F(G, ATOMIC_MIN) 2200INSTR4F(G, ATOMIC_MAX) 2201INSTR4F(G, ATOMIC_AND) 2202INSTR4F(G, ATOMIC_OR) 2203INSTR4F(G, ATOMIC_XOR) 2204#endif 2205 2206/* cat7 instructions: */ 2207INSTR0(BAR) 2208INSTR0(FENCE) 2209 2210/* ************************************************************************* */ 2211#include "bitset.h" 2212 2213#define MAX_REG 256 2214 2215typedef BITSET_DECLARE(regmaskstate_t, 2 * MAX_REG); 2216 2217typedef struct { 2218 bool mergedregs; 2219 regmaskstate_t mask; 2220} regmask_t; 2221 2222static inline bool 2223__regmask_get(regmask_t *regmask, bool half, unsigned n) 2224{ 2225 if (regmask->mergedregs) { 2226 /* a6xx+ case, with merged register file, we track things in terms 2227 * of half-precision registers, with a full precisions register 2228 * using two half-precision slots. 2229 * 2230 * Pretend that special regs (a0.x, a1.x, etc.) are full registers to 2231 * avoid having them alias normal full regs. 2232 */ 2233 if (half && !is_reg_num_special(n)) { 2234 return BITSET_TEST(regmask->mask, n); 2235 } else { 2236 n *= 2; 2237 return BITSET_TEST(regmask->mask, n) || 2238 BITSET_TEST(regmask->mask, n + 1); 2239 } 2240 } else { 2241 /* pre a6xx case, with separate register file for half and full 2242 * precision: 2243 */ 2244 if (half) 2245 n += MAX_REG; 2246 return BITSET_TEST(regmask->mask, n); 2247 } 2248} 2249 2250static inline void 2251__regmask_set(regmask_t *regmask, bool half, unsigned n) 2252{ 2253 if (regmask->mergedregs) { 2254 /* a6xx+ case, with merged register file, we track things in terms 2255 * of half-precision registers, with a full precisions register 2256 * using two half-precision slots: 2257 */ 2258 if (half && !is_reg_num_special(n)) { 2259 BITSET_SET(regmask->mask, n); 2260 } else { 2261 n *= 2; 2262 BITSET_SET(regmask->mask, n); 2263 BITSET_SET(regmask->mask, n + 1); 2264 } 2265 } else { 2266 /* pre a6xx case, with separate register file for half and full 2267 * precision: 2268 */ 2269 if (half) 2270 n += MAX_REG; 2271 BITSET_SET(regmask->mask, n); 2272 } 2273} 2274 2275static inline void 2276__regmask_clear(regmask_t *regmask, bool half, unsigned n) 2277{ 2278 if (regmask->mergedregs) { 2279 /* a6xx+ case, with merged register file, we track things in terms 2280 * of half-precision registers, with a full precisions register 2281 * using two half-precision slots: 2282 */ 2283 if (half && !is_reg_num_special(n)) { 2284 BITSET_CLEAR(regmask->mask, n); 2285 } else { 2286 n *= 2; 2287 BITSET_CLEAR(regmask->mask, n); 2288 BITSET_CLEAR(regmask->mask, n + 1); 2289 } 2290 } else { 2291 /* pre a6xx case, with separate register file for half and full 2292 * precision: 2293 */ 2294 if (half) 2295 n += MAX_REG; 2296 BITSET_CLEAR(regmask->mask, n); 2297 } 2298} 2299 2300static inline void 2301regmask_init(regmask_t *regmask, bool mergedregs) 2302{ 2303 memset(®mask->mask, 0, sizeof(regmask->mask)); 2304 regmask->mergedregs = mergedregs; 2305} 2306 2307static inline void 2308regmask_or(regmask_t *dst, regmask_t *a, regmask_t *b) 2309{ 2310 assert(dst->mergedregs == a->mergedregs); 2311 assert(dst->mergedregs == b->mergedregs); 2312 2313 for (unsigned i = 0; i < ARRAY_SIZE(dst->mask); i++) 2314 dst->mask[i] = a->mask[i] | b->mask[i]; 2315} 2316 2317 2318static inline void 2319regmask_set(regmask_t *regmask, struct ir3_register *reg) 2320{ 2321 bool half = reg->flags & IR3_REG_HALF; 2322 if (reg->flags & IR3_REG_RELATIV) { 2323 for (unsigned i = 0; i < reg->size; i++) 2324 __regmask_set(regmask, half, reg->array.base + i); 2325 } else { 2326 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++) 2327 if (mask & 1) 2328 __regmask_set(regmask, half, n); 2329 } 2330} 2331 2332static inline bool 2333regmask_get(regmask_t *regmask, struct ir3_register *reg) 2334{ 2335 bool half = reg->flags & IR3_REG_HALF; 2336 if (reg->flags & IR3_REG_RELATIV) { 2337 for (unsigned i = 0; i < reg->size; i++) 2338 if (__regmask_get(regmask, half, reg->array.base + i)) 2339 return true; 2340 } else { 2341 for (unsigned mask = reg->wrmask, n = reg->num; mask; mask >>= 1, n++) 2342 if (mask & 1) 2343 if (__regmask_get(regmask, half, n)) 2344 return true; 2345 } 2346 return false; 2347} 2348/* ************************************************************************* */ 2349 2350#endif /* IR3_H_ */ 2351