1/* 2 * Copyright (C) 2018 Alyssa Rosenzweig <alyssa@rosenzweig.io> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 */ 23 24#include <sys/types.h> 25#include <sys/stat.h> 26#include <sys/mman.h> 27#include <fcntl.h> 28#include <stdint.h> 29#include <stdlib.h> 30#include <stdio.h> 31#include <err.h> 32 33#include "main/mtypes.h" 34#include "compiler/glsl/glsl_to_nir.h" 35#include "compiler/nir_types.h" 36#include "main/imports.h" 37#include "compiler/nir/nir_builder.h" 38#include "util/half_float.h" 39#include "util/register_allocate.h" 40#include "util/u_debug.h" 41#include "util/u_dynarray.h" 42#include "util/list.h" 43#include "main/mtypes.h" 44 45#include "midgard.h" 46#include "midgard_nir.h" 47#include "midgard_compile.h" 48#include "helpers.h" 49 50#include "disassemble.h" 51 52static const struct debug_named_value debug_options[] = { 53 {"msgs", MIDGARD_DBG_MSGS, "Print debug messages"}, 54 {"shaders", MIDGARD_DBG_SHADERS, "Dump shaders in NIR and MIR"}, 55 DEBUG_NAMED_VALUE_END 56}; 57 58DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", debug_options, 0) 59 60int midgard_debug = 0; 61 62#define DBG(fmt, ...) \ 63 do { if (midgard_debug & MIDGARD_DBG_MSGS) \ 64 fprintf(stderr, "%s:%d: "fmt, \ 65 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 66 67/* Instruction arguments represented as block-local SSA indices, rather than 68 * registers. Negative values mean unused. */ 69 70typedef struct { 71 int src0; 72 int src1; 73 int dest; 74 75 /* src1 is -not- SSA but instead a 16-bit inline constant to be smudged 76 * in. Only valid for ALU ops. */ 77 bool inline_constant; 78} ssa_args; 79 80/* Forward declare so midgard_branch can reference */ 81struct midgard_block; 82 83/* Target types. Defaults to TARGET_GOTO (the type corresponding directly to 84 * the hardware), hence why that must be zero. TARGET_DISCARD signals this 85 * instruction is actually a discard op. */ 86 87#define TARGET_GOTO 0 88#define TARGET_BREAK 1 89#define TARGET_CONTINUE 2 90#define TARGET_DISCARD 3 91 92typedef struct midgard_branch { 93 /* If conditional, the condition is specified in r31.w */ 94 bool conditional; 95 96 /* For conditionals, if this is true, we branch on FALSE. If false, we branch on TRUE. */ 97 bool invert_conditional; 98 99 /* Branch targets: the start of a block, the start of a loop (continue), the end of a loop (break). 
Value is one of TARGET_ */ 100 unsigned target_type; 101 102 /* The actual target */ 103 union { 104 int target_block; 105 int target_break; 106 int target_continue; 107 }; 108} midgard_branch; 109 110static bool 111midgard_is_branch_unit(unsigned unit) 112{ 113 return (unit == ALU_ENAB_BRANCH) || (unit == ALU_ENAB_BR_COMPACT); 114} 115 116/* Generic in-memory data type repesenting a single logical instruction, rather 117 * than a single instruction group. This is the preferred form for code gen. 118 * Multiple midgard_insturctions will later be combined during scheduling, 119 * though this is not represented in this structure. Its format bridges 120 * the low-level binary representation with the higher level semantic meaning. 121 * 122 * Notably, it allows registers to be specified as block local SSA, for code 123 * emitted before the register allocation pass. 124 */ 125 126typedef struct midgard_instruction { 127 /* Must be first for casting */ 128 struct list_head link; 129 130 unsigned type; /* ALU, load/store, texture */ 131 132 /* If the register allocator has not run yet... */ 133 ssa_args ssa_args; 134 135 /* Special fields for an ALU instruction */ 136 midgard_reg_info registers; 137 138 /* I.e. (1 << alu_bit) */ 139 int unit; 140 141 bool has_constants; 142 float constants[4]; 143 uint16_t inline_constant; 144 bool has_blend_constant; 145 146 bool compact_branch; 147 bool writeout; 148 bool prepacked_branch; 149 150 union { 151 midgard_load_store_word load_store; 152 midgard_vector_alu alu; 153 midgard_texture_word texture; 154 midgard_branch_extended branch_extended; 155 uint16_t br_compact; 156 157 /* General branch, rather than packed br_compact. Higher level 158 * than the other components */ 159 midgard_branch branch; 160 }; 161} midgard_instruction; 162 163typedef struct midgard_block { 164 /* Link to next block. Must be first for mir_get_block */ 165 struct list_head link; 166 167 /* List of midgard_instructions emitted for the current block */ 168 struct list_head instructions; 169 170 bool is_scheduled; 171 172 /* List of midgard_bundles emitted (after the scheduler has run) */ 173 struct util_dynarray bundles; 174 175 /* Number of quadwords _actually_ emitted, as determined after scheduling */ 176 unsigned quadword_count; 177 178 /* Successors: always one forward (the block after us), maybe 179 * one backwards (for a backward branch). No need for a second 180 * forward, since graph traversal would get there eventually 181 * anyway */ 182 struct midgard_block *successors[2]; 183 unsigned nr_successors; 184 185 /* The successors pointer form a graph, and in the case of 186 * complex control flow, this graph has a cycles. To aid 187 * traversal during liveness analysis, we have a visited? 188 * boolean for passes to use as they see fit, provided they 189 * clean up later */ 190 bool visited; 191} midgard_block; 192 193static void 194midgard_block_add_successor(midgard_block *block, midgard_block *successor) 195{ 196 block->successors[block->nr_successors++] = successor; 197 assert(block->nr_successors <= ARRAY_SIZE(block->successors)); 198} 199 200/* Helpers to generate midgard_instruction's using macro magic, since every 201 * driver seems to do it that way */ 202 203#define EMIT(op, ...) 
emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); 204#define SWIZZLE_XYZW SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W) 205 206#define M_LOAD_STORE(name, rname, uname) \ 207 static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ 208 midgard_instruction i = { \ 209 .type = TAG_LOAD_STORE_4, \ 210 .ssa_args = { \ 211 .rname = ssa, \ 212 .uname = -1, \ 213 .src1 = -1 \ 214 }, \ 215 .load_store = { \ 216 .op = midgard_op_##name, \ 217 .mask = 0xF, \ 218 .swizzle = SWIZZLE_XYZW, \ 219 .address = address \ 220 } \ 221 }; \ 222 \ 223 return i; \ 224 } 225 226#define M_LOAD(name) M_LOAD_STORE(name, dest, src0) 227#define M_STORE(name) M_LOAD_STORE(name, src0, dest) 228 229const midgard_vector_alu_src blank_alu_src = { 230 .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), 231}; 232 233const midgard_vector_alu_src blank_alu_src_xxxx = { 234 .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_X, COMPONENT_X, COMPONENT_X), 235}; 236 237const midgard_scalar_alu_src blank_scalar_alu_src = { 238 .full = true 239}; 240 241/* Used for encoding the unused source of 1-op instructions */ 242const midgard_vector_alu_src zero_alu_src = { 0 }; 243 244/* Coerce structs to integer */ 245 246static unsigned 247vector_alu_srco_unsigned(midgard_vector_alu_src src) 248{ 249 unsigned u; 250 memcpy(&u, &src, sizeof(src)); 251 return u; 252} 253 254static midgard_vector_alu_src 255vector_alu_from_unsigned(unsigned u) 256{ 257 midgard_vector_alu_src s; 258 memcpy(&s, &u, sizeof(s)); 259 return s; 260} 261 262/* Inputs a NIR ALU source, with modifiers attached if necessary, and outputs 263 * the corresponding Midgard source */ 264 265static midgard_vector_alu_src 266vector_alu_modifiers(nir_alu_src *src, bool is_int) 267{ 268 if (!src) return blank_alu_src; 269 270 midgard_vector_alu_src alu_src = { 271 .rep_low = 0, 272 .rep_high = 0, 273 .half = 0, /* TODO */ 274 .swizzle = SWIZZLE_FROM_ARRAY(src->swizzle) 275 }; 276 277 if (is_int) { 278 /* TODO: sign-extend/zero-extend */ 279 alu_src.mod = midgard_int_normal; 280 281 /* These should have been lowered away */ 282 assert(!(src->abs || src->negate)); 283 } else { 284 alu_src.mod = (src->abs << 0) | (src->negate << 1); 285 } 286 287 return alu_src; 288} 289 290static bool 291mir_nontrivial_mod(midgard_vector_alu_src src, bool is_int, unsigned mask) 292{ 293 /* abs or neg */ 294 if (!is_int && src.mod) return true; 295 296 /* swizzle */ 297 for (unsigned c = 0; c < 4; ++c) { 298 if (!(mask & (1 << c))) continue; 299 if (((src.swizzle >> (2*c)) & 3) != c) return true; 300 } 301 302 return false; 303} 304 305/* 'Intrinsic' move for misc aliasing uses independent of actual NIR ALU code */ 306 307static midgard_instruction 308v_fmov(unsigned src, midgard_vector_alu_src mod, unsigned dest) 309{ 310 midgard_instruction ins = { 311 .type = TAG_ALU_4, 312 .ssa_args = { 313 .src0 = SSA_UNUSED_1, 314 .src1 = src, 315 .dest = dest, 316 }, 317 .alu = { 318 .op = midgard_alu_op_fmov, 319 .reg_mode = midgard_reg_mode_32, 320 .dest_override = midgard_dest_override_none, 321 .mask = 0xFF, 322 .src1 = vector_alu_srco_unsigned(zero_alu_src), 323 .src2 = vector_alu_srco_unsigned(mod) 324 }, 325 }; 326 327 return ins; 328} 329 330/* load/store instructions have both 32-bit and 16-bit variants, depending on 331 * whether we are using vectors composed of highp or mediump. At the moment, we 332 * don't support half-floats -- this requires changes in other parts of the 333 * compiler -- therefore the 16-bit versions are commented out. 
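 * Each M_LOAD / M_STORE use below expands (via M_LOAD_STORE above) into a
 * small constructor: M_LOAD treats the given SSA index as the destination,
 * while M_STORE treats it as src0.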
*/ 334 335//M_LOAD(load_attr_16); 336M_LOAD(load_attr_32); 337//M_LOAD(load_vary_16); 338M_LOAD(load_vary_32); 339//M_LOAD(load_uniform_16); 340M_LOAD(load_uniform_32); 341M_LOAD(load_color_buffer_8); 342//M_STORE(store_vary_16); 343M_STORE(store_vary_32); 344M_STORE(store_cubemap_coords); 345 346static midgard_instruction 347v_alu_br_compact_cond(midgard_jmp_writeout_op op, unsigned tag, signed offset, unsigned cond) 348{ 349 midgard_branch_cond branch = { 350 .op = op, 351 .dest_tag = tag, 352 .offset = offset, 353 .cond = cond 354 }; 355 356 uint16_t compact; 357 memcpy(&compact, &branch, sizeof(branch)); 358 359 midgard_instruction ins = { 360 .type = TAG_ALU_4, 361 .unit = ALU_ENAB_BR_COMPACT, 362 .prepacked_branch = true, 363 .compact_branch = true, 364 .br_compact = compact 365 }; 366 367 if (op == midgard_jmp_writeout_op_writeout) 368 ins.writeout = true; 369 370 return ins; 371} 372 373static midgard_instruction 374v_branch(bool conditional, bool invert) 375{ 376 midgard_instruction ins = { 377 .type = TAG_ALU_4, 378 .unit = ALU_ENAB_BRANCH, 379 .compact_branch = true, 380 .branch = { 381 .conditional = conditional, 382 .invert_conditional = invert 383 } 384 }; 385 386 return ins; 387} 388 389static midgard_branch_extended 390midgard_create_branch_extended( midgard_condition cond, 391 midgard_jmp_writeout_op op, 392 unsigned dest_tag, 393 signed quadword_offset) 394{ 395 /* For unclear reasons, the condition code is repeated 8 times */ 396 uint16_t duplicated_cond = 397 (cond << 14) | 398 (cond << 12) | 399 (cond << 10) | 400 (cond << 8) | 401 (cond << 6) | 402 (cond << 4) | 403 (cond << 2) | 404 (cond << 0); 405 406 midgard_branch_extended branch = { 407 .op = op, 408 .dest_tag = dest_tag, 409 .offset = quadword_offset, 410 .cond = duplicated_cond 411 }; 412 413 return branch; 414} 415 416typedef struct midgard_bundle { 417 /* Tag for the overall bundle */ 418 int tag; 419 420 /* Instructions contained by the bundle */ 421 int instruction_count; 422 midgard_instruction instructions[5]; 423 424 /* Bundle-wide ALU configuration */ 425 int padding; 426 int control; 427 bool has_embedded_constants; 428 float constants[4]; 429 bool has_blend_constant; 430 431 uint16_t register_words[8]; 432 int register_words_count; 433 434 uint64_t body_words[8]; 435 size_t body_size[8]; 436 int body_words_count; 437} midgard_bundle; 438 439typedef struct compiler_context { 440 nir_shader *nir; 441 gl_shader_stage stage; 442 443 /* Is internally a blend shader? Depends on stage == FRAGMENT */ 444 bool is_blend; 445 446 /* Tracking for blend constant patching */ 447 int blend_constant_number; 448 int blend_constant_offset; 449 450 /* Current NIR function */ 451 nir_function *func; 452 453 /* Unordered list of midgard_blocks */ 454 int block_count; 455 struct list_head blocks; 456 457 midgard_block *initial_block; 458 midgard_block *previous_source_block; 459 midgard_block *final_block; 460 461 /* List of midgard_instructions emitted for the current block */ 462 midgard_block *current_block; 463 464 /* The current "depth" of the loop, for disambiguating breaks/continues 465 * when using nested loops */ 466 int current_loop_depth; 467 468 /* Constants which have been loaded, for later inlining */ 469 struct hash_table_u64 *ssa_constants; 470 471 /* SSA indices to be outputted to corresponding varying offset */ 472 struct hash_table_u64 *ssa_varyings; 473 474 /* SSA values / registers which have been aliased. 
Naively, these 475 * demand a fmov output; instead, we alias them in a later pass to 476 * avoid the wasted op. 477 * 478 * A note on encoding: to avoid dynamic memory management here, rather 479 * than ampping to a pointer, we map to the source index; the key 480 * itself is just the destination index. */ 481 482 struct hash_table_u64 *ssa_to_alias; 483 struct set *leftover_ssa_to_alias; 484 485 /* Actual SSA-to-register for RA */ 486 struct hash_table_u64 *ssa_to_register; 487 488 /* Mapping of hashes computed from NIR indices to the sequential temp indices ultimately used in MIR */ 489 struct hash_table_u64 *hash_to_temp; 490 int temp_count; 491 int max_hash; 492 493 /* Just the count of the max register used. Higher count => higher 494 * register pressure */ 495 int work_registers; 496 497 /* Used for cont/last hinting. Increase when a tex op is added. 498 * Decrease when a tex op is removed. */ 499 int texture_op_count; 500 501 /* Mapping of texture register -> SSA index for unaliasing */ 502 int texture_index[2]; 503 504 /* If any path hits a discard instruction */ 505 bool can_discard; 506 507 /* The number of uniforms allowable for the fast path */ 508 int uniform_cutoff; 509 510 /* Count of instructions emitted from NIR overall, across all blocks */ 511 int instruction_count; 512 513 /* Alpha ref value passed in */ 514 float alpha_ref; 515 516 /* The index corresponding to the fragment output */ 517 unsigned fragment_output; 518 519 /* The mapping of sysvals to uniforms, the count, and the off-by-one inverse */ 520 unsigned sysvals[MAX_SYSVAL_COUNT]; 521 unsigned sysval_count; 522 struct hash_table_u64 *sysval_to_id; 523} compiler_context; 524 525/* Append instruction to end of current block */ 526 527static midgard_instruction * 528mir_upload_ins(struct midgard_instruction ins) 529{ 530 midgard_instruction *heap = malloc(sizeof(ins)); 531 memcpy(heap, &ins, sizeof(ins)); 532 return heap; 533} 534 535static void 536emit_mir_instruction(struct compiler_context *ctx, struct midgard_instruction ins) 537{ 538 list_addtail(&(mir_upload_ins(ins))->link, &ctx->current_block->instructions); 539} 540 541static void 542mir_insert_instruction_before(struct midgard_instruction *tag, struct midgard_instruction ins) 543{ 544 list_addtail(&(mir_upload_ins(ins))->link, &tag->link); 545} 546 547static void 548mir_remove_instruction(struct midgard_instruction *ins) 549{ 550 list_del(&ins->link); 551} 552 553static midgard_instruction* 554mir_prev_op(struct midgard_instruction *ins) 555{ 556 return list_last_entry(&(ins->link), midgard_instruction, link); 557} 558 559static midgard_instruction* 560mir_next_op(struct midgard_instruction *ins) 561{ 562 return list_first_entry(&(ins->link), midgard_instruction, link); 563} 564 565#define mir_foreach_block(ctx, v) list_for_each_entry(struct midgard_block, v, &ctx->blocks, link) 566#define mir_foreach_block_from(ctx, from, v) list_for_each_entry_from(struct midgard_block, v, from, &ctx->blocks, link) 567 568#define mir_foreach_instr(ctx, v) list_for_each_entry(struct midgard_instruction, v, &ctx->current_block->instructions, link) 569#define mir_foreach_instr_safe(ctx, v) list_for_each_entry_safe(struct midgard_instruction, v, &ctx->current_block->instructions, link) 570#define mir_foreach_instr_in_block(block, v) list_for_each_entry(struct midgard_instruction, v, &block->instructions, link) 571#define mir_foreach_instr_in_block_safe(block, v) list_for_each_entry_safe(struct midgard_instruction, v, &block->instructions, link) 572#define 
mir_foreach_instr_in_block_safe_rev(block, v) list_for_each_entry_safe_rev(struct midgard_instruction, v, &block->instructions, link) 573#define mir_foreach_instr_in_block_from(block, v, from) list_for_each_entry_from(struct midgard_instruction, v, from, &block->instructions, link) 574#define mir_foreach_instr_in_block_from_rev(block, v, from) list_for_each_entry_from_rev(struct midgard_instruction, v, from, &block->instructions, link) 575 576 577static midgard_instruction * 578mir_last_in_block(struct midgard_block *block) 579{ 580 return list_last_entry(&block->instructions, struct midgard_instruction, link); 581} 582 583static midgard_block * 584mir_get_block(compiler_context *ctx, int idx) 585{ 586 struct list_head *lst = &ctx->blocks; 587 588 while ((idx--) + 1) 589 lst = lst->next; 590 591 return (struct midgard_block *) lst; 592} 593 594/* Pretty printer for internal Midgard IR */ 595 596static void 597print_mir_source(int source) 598{ 599 if (source >= SSA_FIXED_MINIMUM) { 600 /* Specific register */ 601 int reg = SSA_REG_FROM_FIXED(source); 602 603 /* TODO: Moving threshold */ 604 if (reg > 16 && reg < 24) 605 printf("u%d", 23 - reg); 606 else 607 printf("r%d", reg); 608 } else { 609 printf("%d", source); 610 } 611} 612 613static void 614print_mir_instruction(midgard_instruction *ins) 615{ 616 printf("\t"); 617 618 switch (ins->type) { 619 case TAG_ALU_4: { 620 midgard_alu_op op = ins->alu.op; 621 const char *name = alu_opcode_props[op].name; 622 623 if (ins->unit) 624 printf("%d.", ins->unit); 625 626 printf("%s", name ? name : "??"); 627 break; 628 } 629 630 case TAG_LOAD_STORE_4: { 631 midgard_load_store_op op = ins->load_store.op; 632 const char *name = load_store_opcode_names[op]; 633 634 assert(name); 635 printf("%s", name); 636 break; 637 } 638 639 case TAG_TEXTURE_4: { 640 printf("texture"); 641 break; 642 } 643 644 default: 645 assert(0); 646 } 647 648 ssa_args *args = &ins->ssa_args; 649 650 printf(" %d, ", args->dest); 651 652 print_mir_source(args->src0); 653 printf(", "); 654 655 if (args->inline_constant) 656 printf("#%d", ins->inline_constant); 657 else 658 print_mir_source(args->src1); 659 660 if (ins->has_constants) 661 printf(" <%f, %f, %f, %f>", ins->constants[0], ins->constants[1], ins->constants[2], ins->constants[3]); 662 663 printf("\n"); 664} 665 666static void 667print_mir_block(midgard_block *block) 668{ 669 printf("{\n"); 670 671 mir_foreach_instr_in_block(block, ins) { 672 print_mir_instruction(ins); 673 } 674 675 printf("}\n"); 676} 677 678static void 679attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) 680{ 681 ins->has_constants = true; 682 memcpy(&ins->constants, constants, 16); 683 684 /* If this is the special blend constant, mark this instruction */ 685 686 if (ctx->is_blend && ctx->blend_constant_number == name) 687 ins->has_blend_constant = true; 688} 689 690static int 691glsl_type_size(const struct glsl_type *type, bool bindless) 692{ 693 return glsl_count_attribute_slots(type, false); 694} 695 696/* Lower fdot2 to a vector multiplication followed by channel addition */ 697static void 698midgard_nir_lower_fdot2_body(nir_builder *b, nir_alu_instr *alu) 699{ 700 if (alu->op != nir_op_fdot2) 701 return; 702 703 b->cursor = nir_before_instr(&alu->instr); 704 705 nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0); 706 nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1); 707 708 nir_ssa_def *product = nir_fmul(b, src0, src1); 709 710 nir_ssa_def *sum = nir_fadd(b, 711 nir_channel(b, product, 0), 712 
nir_channel(b, product, 1)); 713 714 /* Replace the fdot2 with this sum */ 715 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(sum)); 716} 717 718/* Lower csel with mixed condition channels to mulitple csel instructions. For 719 * context, the csel ops on Midgard are vector in *outputs*, but not in 720 * *conditions*. So, if the condition is e.g. yyyy, a single op can select a 721 * vec4. But if the condition is e.g. xyzw, four ops are needed as the ISA 722 * can't cope with the divergent channels.*/ 723 724static void 725midgard_nir_lower_mixed_csel_body(nir_builder *b, nir_alu_instr *alu) 726{ 727 if (alu->op != nir_op_bcsel) 728 return; 729 730 b->cursor = nir_before_instr(&alu->instr); 731 732 /* Must be run before registering */ 733 assert(alu->dest.dest.is_ssa); 734 735 /* Check for mixed condition */ 736 737 unsigned comp = alu->src[0].swizzle[0]; 738 unsigned nr_components = alu->dest.dest.ssa.num_components; 739 740 bool mixed = false; 741 742 for (unsigned c = 1; c < nr_components; ++c) 743 mixed |= (alu->src[0].swizzle[c] != comp); 744 745 if (!mixed) 746 return; 747 748 /* We're mixed, so lower */ 749 750 assert(nr_components <= 4); 751 nir_ssa_def *results[4]; 752 753 nir_ssa_def *cond = nir_ssa_for_alu_src(b, alu, 0); 754 nir_ssa_def *choice0 = nir_ssa_for_alu_src(b, alu, 1); 755 nir_ssa_def *choice1 = nir_ssa_for_alu_src(b, alu, 2); 756 757 for (unsigned c = 0; c < nr_components; ++c) { 758 results[c] = nir_bcsel(b, 759 nir_channel(b, cond, c), 760 nir_channel(b, choice0, c), 761 nir_channel(b, choice1, c)); 762 } 763 764 /* Replace with our scalarized version */ 765 766 nir_ssa_def *result = nir_vec(b, results, nr_components); 767 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, nir_src_for_ssa(result)); 768} 769 770static int 771midgard_nir_sysval_for_intrinsic(nir_intrinsic_instr *instr) 772{ 773 switch (instr->intrinsic) { 774 case nir_intrinsic_load_viewport_scale: 775 return PAN_SYSVAL_VIEWPORT_SCALE; 776 case nir_intrinsic_load_viewport_offset: 777 return PAN_SYSVAL_VIEWPORT_OFFSET; 778 default: 779 return -1; 780 } 781} 782 783static void 784midgard_nir_assign_sysval_body(compiler_context *ctx, nir_instr *instr) 785{ 786 int sysval = -1; 787 788 if (instr->type == nir_instr_type_intrinsic) { 789 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 790 sysval = midgard_nir_sysval_for_intrinsic(intr); 791 } 792 793 if (sysval < 0) 794 return; 795 796 /* We have a sysval load; check if it's already been assigned */ 797 798 if (_mesa_hash_table_u64_search(ctx->sysval_to_id, sysval)) 799 return; 800 801 /* It hasn't -- so assign it now! 
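 * The id is stored off-by-one in sysval_to_id so that a NULL lookup can
 * unambiguously mean "not assigned yet" (see emit_sysval_read).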
*/ 802 803 unsigned id = ctx->sysval_count++; 804 _mesa_hash_table_u64_insert(ctx->sysval_to_id, sysval, (void *) ((uintptr_t) id + 1)); 805 ctx->sysvals[id] = sysval; 806} 807 808static void 809midgard_nir_assign_sysvals(compiler_context *ctx, nir_shader *shader) 810{ 811 ctx->sysval_count = 0; 812 813 nir_foreach_function(function, shader) { 814 if (!function->impl) continue; 815 816 nir_foreach_block(block, function->impl) { 817 nir_foreach_instr_safe(instr, block) { 818 midgard_nir_assign_sysval_body(ctx, instr); 819 } 820 } 821 } 822} 823 824static bool 825midgard_nir_lower_fdot2(nir_shader *shader) 826{ 827 bool progress = false; 828 829 nir_foreach_function(function, shader) { 830 if (!function->impl) continue; 831 832 nir_builder _b; 833 nir_builder *b = &_b; 834 nir_builder_init(b, function->impl); 835 836 nir_foreach_block(block, function->impl) { 837 nir_foreach_instr_safe(instr, block) { 838 if (instr->type != nir_instr_type_alu) continue; 839 840 nir_alu_instr *alu = nir_instr_as_alu(instr); 841 midgard_nir_lower_fdot2_body(b, alu); 842 843 progress |= true; 844 } 845 } 846 847 nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); 848 849 } 850 851 return progress; 852} 853 854static bool 855midgard_nir_lower_mixed_csel(nir_shader *shader) 856{ 857 bool progress = false; 858 859 nir_foreach_function(function, shader) { 860 if (!function->impl) continue; 861 862 nir_builder _b; 863 nir_builder *b = &_b; 864 nir_builder_init(b, function->impl); 865 866 nir_foreach_block(block, function->impl) { 867 nir_foreach_instr_safe(instr, block) { 868 if (instr->type != nir_instr_type_alu) continue; 869 870 nir_alu_instr *alu = nir_instr_as_alu(instr); 871 midgard_nir_lower_mixed_csel_body(b, alu); 872 873 progress |= true; 874 } 875 } 876 877 nir_metadata_preserve(function->impl, nir_metadata_block_index | nir_metadata_dominance); 878 879 } 880 881 return progress; 882} 883 884static void 885optimise_nir(nir_shader *nir) 886{ 887 bool progress; 888 889 NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 890 NIR_PASS(progress, nir, midgard_nir_lower_fdot2); 891 NIR_PASS(progress, nir, midgard_nir_lower_mixed_csel); 892 893 nir_lower_tex_options lower_tex_options = { 894 .lower_rect = true 895 }; 896 897 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 898 899 do { 900 progress = false; 901 902 NIR_PASS(progress, nir, nir_lower_var_copies); 903 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 904 905 NIR_PASS(progress, nir, nir_copy_prop); 906 NIR_PASS(progress, nir, nir_opt_dce); 907 NIR_PASS(progress, nir, nir_opt_dead_cf); 908 NIR_PASS(progress, nir, nir_opt_cse); 909 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 910 NIR_PASS(progress, nir, nir_opt_algebraic); 911 NIR_PASS(progress, nir, nir_opt_constant_folding); 912 NIR_PASS(progress, nir, nir_opt_undef); 913 NIR_PASS(progress, nir, nir_opt_loop_unroll, 914 nir_var_shader_in | 915 nir_var_shader_out | 916 nir_var_function_temp); 917 918 /* TODO: Enable vectorize when merged upstream */ 919 // NIR_PASS(progress, nir, nir_opt_vectorize); 920 } while (progress); 921 922 /* Must be run at the end to prevent creation of fsin/fcos ops */ 923 NIR_PASS(progress, nir, midgard_nir_scale_trig); 924 925 do { 926 progress = false; 927 928 NIR_PASS(progress, nir, nir_opt_dce); 929 NIR_PASS(progress, nir, nir_opt_algebraic); 930 NIR_PASS(progress, nir, nir_opt_constant_folding); 931 NIR_PASS(progress, nir, nir_copy_prop); 932 } while (progress); 933 934 NIR_PASS(progress, nir, 
nir_opt_algebraic_late); 935 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); 936 937 /* Lower mods for float ops only. Integer ops don't support modifiers 938 * (saturate doesn't make sense on integers, neg/abs require dedicated 939 * instructions) */ 940 941 NIR_PASS(progress, nir, nir_lower_to_source_mods, nir_lower_float_source_mods); 942 NIR_PASS(progress, nir, nir_copy_prop); 943 NIR_PASS(progress, nir, nir_opt_dce); 944 945 /* We implement booleans as 32-bit 0/~0 */ 946 NIR_PASS(progress, nir, nir_lower_bool_to_int32); 947 948 /* Take us out of SSA */ 949 NIR_PASS(progress, nir, nir_lower_locals_to_regs); 950 NIR_PASS(progress, nir, nir_convert_from_ssa, true); 951 952 /* We are a vector architecture; write combine where possible */ 953 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); 954 NIR_PASS(progress, nir, nir_lower_vec_to_movs); 955 956 NIR_PASS(progress, nir, nir_opt_dce); 957} 958 959/* Front-half of aliasing the SSA slots, merely by inserting the flag in the 960 * appropriate hash table. Intentional off-by-one to avoid confusing NULL with 961 * r0. See the comments in compiler_context */ 962 963static void 964alias_ssa(compiler_context *ctx, int dest, int src) 965{ 966 _mesa_hash_table_u64_insert(ctx->ssa_to_alias, dest + 1, (void *) ((uintptr_t) src + 1)); 967 _mesa_set_add(ctx->leftover_ssa_to_alias, (void *) (uintptr_t) (dest + 1)); 968} 969 970/* ...or undo it, after which the original index will be used (dummy move should be emitted alongside this) */ 971 972static void 973unalias_ssa(compiler_context *ctx, int dest) 974{ 975 _mesa_hash_table_u64_remove(ctx->ssa_to_alias, dest + 1); 976 /* TODO: Remove from leftover or no? */ 977} 978 979static void 980midgard_pin_output(compiler_context *ctx, int index, int reg) 981{ 982 _mesa_hash_table_u64_insert(ctx->ssa_to_register, index + 1, (void *) ((uintptr_t) reg + 1)); 983} 984 985static bool 986midgard_is_pinned(compiler_context *ctx, int index) 987{ 988 return _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1) != NULL; 989} 990 991/* Do not actually emit a load; instead, cache the constant for inlining */ 992 993static void 994emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) 995{ 996 nir_ssa_def def = instr->def; 997 998 float *v = rzalloc_array(NULL, float, 4); 999 nir_const_load_to_arr(v, instr, f32); 1000 _mesa_hash_table_u64_insert(ctx->ssa_constants, def.index + 1, v); 1001} 1002 1003/* Duplicate bits to convert sane 4-bit writemask to obscure 8-bit format (or 1004 * do the inverse) */ 1005 1006static unsigned 1007expand_writemask(unsigned mask) 1008{ 1009 unsigned o = 0; 1010 1011 for (int i = 0; i < 4; ++i) 1012 if (mask & (1 << i)) 1013 o |= (3 << (2 * i)); 1014 1015 return o; 1016} 1017 1018static unsigned 1019squeeze_writemask(unsigned mask) 1020{ 1021 unsigned o = 0; 1022 1023 for (int i = 0; i < 4; ++i) 1024 if (mask & (3 << (2 * i))) 1025 o |= (1 << i); 1026 1027 return o; 1028 1029} 1030 1031/* Determines effective writemask, taking quirks and expansion into account */ 1032static unsigned 1033effective_writemask(midgard_vector_alu *alu) 1034{ 1035 /* Channel count is off-by-one to fit in two-bits (0 channel makes no 1036 * sense) */ 1037 1038 unsigned channel_count = GET_CHANNEL_COUNT(alu_opcode_props[alu->op].props); 1039 1040 /* If there is a fixed channel count, construct the appropriate mask */ 1041 1042 if (channel_count) 1043 return (1 << channel_count) - 1; 1044 1045 /* Otherwise, just squeeze the existing mask */ 1046 return squeeze_writemask(alu->mask); 
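        /* Worked example for the mask helpers above (illustrative note): a
         * 4-bit writemask 0b0101 (x and z) expands to 0b00110011 in the
         * 8-bit ALU encoding, and squeezing that value gives back 0b0101,
         * i.e. squeeze_writemask(expand_writemask(m)) == m. */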
1047} 1048 1049static unsigned 1050find_or_allocate_temp(compiler_context *ctx, unsigned hash) 1051{ 1052 if ((hash < 0) || (hash >= SSA_FIXED_MINIMUM)) 1053 return hash; 1054 1055 unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->hash_to_temp, hash + 1); 1056 1057 if (temp) 1058 return temp - 1; 1059 1060 /* If no temp is find, allocate one */ 1061 temp = ctx->temp_count++; 1062 ctx->max_hash = MAX2(ctx->max_hash, hash); 1063 1064 _mesa_hash_table_u64_insert(ctx->hash_to_temp, hash + 1, (void *) ((uintptr_t) temp + 1)); 1065 1066 return temp; 1067} 1068 1069static unsigned 1070nir_src_index(compiler_context *ctx, nir_src *src) 1071{ 1072 if (src->is_ssa) 1073 return src->ssa->index; 1074 else { 1075 assert(!src->reg.indirect); 1076 return ctx->func->impl->ssa_alloc + src->reg.reg->index; 1077 } 1078} 1079 1080static unsigned 1081nir_dest_index(compiler_context *ctx, nir_dest *dst) 1082{ 1083 if (dst->is_ssa) 1084 return dst->ssa.index; 1085 else { 1086 assert(!dst->reg.indirect); 1087 return ctx->func->impl->ssa_alloc + dst->reg.reg->index; 1088 } 1089} 1090 1091static unsigned 1092nir_alu_src_index(compiler_context *ctx, nir_alu_src *src) 1093{ 1094 return nir_src_index(ctx, &src->src); 1095} 1096 1097/* Midgard puts conditionals in r31.w; move an arbitrary source (the output of 1098 * a conditional test) into that register */ 1099 1100static void 1101emit_condition(compiler_context *ctx, nir_src *src, bool for_branch, unsigned component) 1102{ 1103 int condition = nir_src_index(ctx, src); 1104 1105 /* Source to swizzle the desired component into w */ 1106 1107 const midgard_vector_alu_src alu_src = { 1108 .swizzle = SWIZZLE(component, component, component, component), 1109 }; 1110 1111 /* There is no boolean move instruction. Instead, we simulate a move by 1112 * ANDing the condition with itself to get it into r31.w */ 1113 1114 midgard_instruction ins = { 1115 .type = TAG_ALU_4, 1116 .unit = for_branch ? UNIT_SMUL : UNIT_SADD, /* TODO: DEDUCE THIS */ 1117 .ssa_args = { 1118 .src0 = condition, 1119 .src1 = condition, 1120 .dest = SSA_FIXED_REGISTER(31), 1121 }, 1122 .alu = { 1123 .op = midgard_alu_op_iand, 1124 .reg_mode = midgard_reg_mode_32, 1125 .dest_override = midgard_dest_override_none, 1126 .mask = (0x3 << 6), /* w */ 1127 .src1 = vector_alu_srco_unsigned(alu_src), 1128 .src2 = vector_alu_srco_unsigned(alu_src) 1129 }, 1130 }; 1131 1132 emit_mir_instruction(ctx, ins); 1133} 1134 1135/* Likewise, indirect offsets are put in r27.w. 
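 That register is addressed below via the REGISTER_OFFSET alias.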
TODO: Allow componentwise 1136 * pinning to eliminate this move in all known cases */ 1137 1138static void 1139emit_indirect_offset(compiler_context *ctx, nir_src *src) 1140{ 1141 int offset = nir_src_index(ctx, src); 1142 1143 midgard_instruction ins = { 1144 .type = TAG_ALU_4, 1145 .ssa_args = { 1146 .src0 = SSA_UNUSED_1, 1147 .src1 = offset, 1148 .dest = SSA_FIXED_REGISTER(REGISTER_OFFSET), 1149 }, 1150 .alu = { 1151 .op = midgard_alu_op_imov, 1152 .reg_mode = midgard_reg_mode_32, 1153 .dest_override = midgard_dest_override_none, 1154 .mask = (0x3 << 6), /* w */ 1155 .src1 = vector_alu_srco_unsigned(zero_alu_src), 1156 .src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx) 1157 }, 1158 }; 1159 1160 emit_mir_instruction(ctx, ins); 1161} 1162 1163#define ALU_CASE(nir, _op) \ 1164 case nir_op_##nir: \ 1165 op = midgard_alu_op_##_op; \ 1166 break; 1167 1168static bool 1169nir_is_fzero_constant(nir_src src) 1170{ 1171 if (!nir_src_is_const(src)) 1172 return false; 1173 1174 for (unsigned c = 0; c < nir_src_num_components(src); ++c) { 1175 if (nir_src_comp_as_float(src, c) != 0.0) 1176 return false; 1177 } 1178 1179 return true; 1180} 1181 1182static void 1183emit_alu(compiler_context *ctx, nir_alu_instr *instr) 1184{ 1185 bool is_ssa = instr->dest.dest.is_ssa; 1186 1187 unsigned dest = nir_dest_index(ctx, &instr->dest.dest); 1188 unsigned nr_components = is_ssa ? instr->dest.dest.ssa.num_components : instr->dest.dest.reg.reg->num_components; 1189 unsigned nr_inputs = nir_op_infos[instr->op].num_inputs; 1190 1191 /* Most Midgard ALU ops have a 1:1 correspondance to NIR ops; these are 1192 * supported. A few do not and are commented for now. Also, there are a 1193 * number of NIR ops which Midgard does not support and need to be 1194 * lowered, also TODO. This switch block emits the opcode and calling 1195 * convention of the Midgard instruction; actual packing is done in 1196 * emit_alu below */ 1197 1198 unsigned op; 1199 1200 switch (instr->op) { 1201 ALU_CASE(fadd, fadd); 1202 ALU_CASE(fmul, fmul); 1203 ALU_CASE(fmin, fmin); 1204 ALU_CASE(fmax, fmax); 1205 ALU_CASE(imin, imin); 1206 ALU_CASE(imax, imax); 1207 ALU_CASE(umin, umin); 1208 ALU_CASE(umax, umax); 1209 ALU_CASE(fmov, fmov); 1210 ALU_CASE(ffloor, ffloor); 1211 ALU_CASE(fround_even, froundeven); 1212 ALU_CASE(ftrunc, ftrunc); 1213 ALU_CASE(fceil, fceil); 1214 ALU_CASE(fdot3, fdot3); 1215 ALU_CASE(fdot4, fdot4); 1216 ALU_CASE(iadd, iadd); 1217 ALU_CASE(isub, isub); 1218 ALU_CASE(imul, imul); 1219 ALU_CASE(iabs, iabs); 1220 ALU_CASE(imov, imov); 1221 1222 ALU_CASE(feq32, feq); 1223 ALU_CASE(fne32, fne); 1224 ALU_CASE(flt32, flt); 1225 ALU_CASE(ieq32, ieq); 1226 ALU_CASE(ine32, ine); 1227 ALU_CASE(ilt32, ilt); 1228 ALU_CASE(ult32, ult); 1229 1230 /* We don't have a native b2f32 instruction. Instead, like many 1231 * GPUs, we exploit booleans as 0/~0 for false/true, and 1232 * correspondingly AND 1233 * by 1.0 to do the type conversion. For the moment, prime us 1234 * to emit: 1235 * 1236 * iand [whatever], #0 1237 * 1238 * At the end of emit_alu (as MIR), we'll fix-up the constant 1239 */ 1240 1241 ALU_CASE(b2f32, iand); 1242 ALU_CASE(b2i32, iand); 1243 1244 /* Likewise, we don't have a dedicated f2b32 instruction, but 1245 * we can do a "not equal to 0.0" test. 
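 * (The 0.0 -- or integer 0 -- comparand is attached as an embedded
 * constant in the late fixup at the end of emit_alu.)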
*/ 1246 1247 ALU_CASE(f2b32, fne); 1248 ALU_CASE(i2b32, ine); 1249 1250 ALU_CASE(frcp, frcp); 1251 ALU_CASE(frsq, frsqrt); 1252 ALU_CASE(fsqrt, fsqrt); 1253 ALU_CASE(fexp2, fexp2); 1254 ALU_CASE(flog2, flog2); 1255 1256 ALU_CASE(f2i32, f2i); 1257 ALU_CASE(f2u32, f2u); 1258 ALU_CASE(i2f32, i2f); 1259 ALU_CASE(u2f32, u2f); 1260 1261 ALU_CASE(fsin, fsin); 1262 ALU_CASE(fcos, fcos); 1263 1264 ALU_CASE(iand, iand); 1265 ALU_CASE(ior, ior); 1266 ALU_CASE(ixor, ixor); 1267 ALU_CASE(inot, inand); 1268 ALU_CASE(ishl, ishl); 1269 ALU_CASE(ishr, iasr); 1270 ALU_CASE(ushr, ilsr); 1271 1272 ALU_CASE(b32all_fequal2, fball_eq); 1273 ALU_CASE(b32all_fequal3, fball_eq); 1274 ALU_CASE(b32all_fequal4, fball_eq); 1275 1276 ALU_CASE(b32any_fnequal2, fbany_neq); 1277 ALU_CASE(b32any_fnequal3, fbany_neq); 1278 ALU_CASE(b32any_fnequal4, fbany_neq); 1279 1280 ALU_CASE(b32all_iequal2, iball_eq); 1281 ALU_CASE(b32all_iequal3, iball_eq); 1282 ALU_CASE(b32all_iequal4, iball_eq); 1283 1284 ALU_CASE(b32any_inequal2, ibany_neq); 1285 ALU_CASE(b32any_inequal3, ibany_neq); 1286 ALU_CASE(b32any_inequal4, ibany_neq); 1287 1288 /* For greater-or-equal, we lower to less-or-equal and flip the 1289 * arguments */ 1290 1291 case nir_op_fge: 1292 case nir_op_fge32: 1293 case nir_op_ige32: 1294 case nir_op_uge32: { 1295 op = 1296 instr->op == nir_op_fge ? midgard_alu_op_fle : 1297 instr->op == nir_op_fge32 ? midgard_alu_op_fle : 1298 instr->op == nir_op_ige32 ? midgard_alu_op_ile : 1299 instr->op == nir_op_uge32 ? midgard_alu_op_ule : 1300 0; 1301 1302 /* Swap via temporary */ 1303 nir_alu_src temp = instr->src[1]; 1304 instr->src[1] = instr->src[0]; 1305 instr->src[0] = temp; 1306 1307 break; 1308 } 1309 1310 /* For a few special csel cases not handled by NIR, we can opt to 1311 * bitwise. Otherwise, we emit the condition and do a real csel */ 1312 1313 case nir_op_b32csel: { 1314 if (nir_is_fzero_constant(instr->src[2].src)) { 1315 /* (b ? v : 0) = (b & v) */ 1316 op = midgard_alu_op_iand; 1317 nr_inputs = 2; 1318 } else if (nir_is_fzero_constant(instr->src[1].src)) { 1319 /* (b ? 0 : v) = (!b ? v : 0) = (~b & v) = (v & ~b) */ 1320 op = midgard_alu_op_iandnot; 1321 nr_inputs = 2; 1322 instr->src[1] = instr->src[0]; 1323 instr->src[0] = instr->src[2]; 1324 } else { 1325 /* Midgard features both fcsel and icsel, depending on 1326 * the type of the arguments/output. However, as long 1327 * as we're careful we can _always_ use icsel and 1328 * _never_ need fcsel, since the latter does additional 1329 * floating-point-specific processing whereas the 1330 * former just moves bits on the wire. 
It's not obvious 1331 * why these are separate opcodes, save for the ability 1332 * to do things like sat/pos/abs/neg for free */ 1333 1334 op = midgard_alu_op_icsel; 1335 1336 /* csel works as a two-arg in Midgard, since the condition is hardcoded in r31.w */ 1337 nr_inputs = 2; 1338 1339 /* Figure out which component the condition is in */ 1340 1341 unsigned comp = instr->src[0].swizzle[0]; 1342 1343 /* Make sure NIR isn't throwing a mixed condition at us */ 1344 1345 for (unsigned c = 1; c < nr_components; ++c) 1346 assert(instr->src[0].swizzle[c] == comp); 1347 1348 /* Emit the condition into r31.w */ 1349 emit_condition(ctx, &instr->src[0].src, false, comp); 1350 1351 /* The condition is the first argument; move the other 1352 * arguments up one to be a binary instruction for 1353 * Midgard */ 1354 1355 memmove(instr->src, instr->src + 1, 2 * sizeof(nir_alu_src)); 1356 } 1357 break; 1358 } 1359 1360 default: 1361 DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 1362 assert(0); 1363 return; 1364 } 1365 1366 /* Midgard can perform certain modifiers on output ofa n ALU op */ 1367 midgard_outmod outmod = 1368 instr->dest.saturate ? midgard_outmod_sat : midgard_outmod_none; 1369 1370 /* fmax(a, 0.0) can turn into a .pos modifier as an optimization */ 1371 1372 if (instr->op == nir_op_fmax) { 1373 if (nir_is_fzero_constant(instr->src[0].src)) { 1374 op = midgard_alu_op_fmov; 1375 nr_inputs = 1; 1376 outmod = midgard_outmod_pos; 1377 instr->src[0] = instr->src[1]; 1378 } else if (nir_is_fzero_constant(instr->src[1].src)) { 1379 op = midgard_alu_op_fmov; 1380 nr_inputs = 1; 1381 outmod = midgard_outmod_pos; 1382 } 1383 } 1384 1385 /* Fetch unit, quirks, etc information */ 1386 unsigned opcode_props = alu_opcode_props[op].props; 1387 bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; 1388 1389 /* src0 will always exist afaik, but src1 will not for 1-argument 1390 * instructions. The latter can only be fetched if the instruction 1391 * needs it, or else we may segfault. */ 1392 1393 unsigned src0 = nir_alu_src_index(ctx, &instr->src[0]); 1394 unsigned src1 = nr_inputs == 2 ? nir_alu_src_index(ctx, &instr->src[1]) : SSA_UNUSED_0; 1395 1396 /* Rather than use the instruction generation helpers, we do it 1397 * ourselves here to avoid the mess */ 1398 1399 midgard_instruction ins = { 1400 .type = TAG_ALU_4, 1401 .ssa_args = { 1402 .src0 = quirk_flipped_r24 ? SSA_UNUSED_1 : src0, 1403 .src1 = quirk_flipped_r24 ? 
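                        /* ops with QUIRK_FLIPPED_R24 take their (single)
                         * source in src1, so route src0 there; src0 itself
                         * is left unused */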
src0 : src1, 1404 .dest = dest, 1405 } 1406 }; 1407 1408 nir_alu_src *nirmods[2] = { NULL }; 1409 1410 if (nr_inputs == 2) { 1411 nirmods[0] = &instr->src[0]; 1412 nirmods[1] = &instr->src[1]; 1413 } else if (nr_inputs == 1) { 1414 nirmods[quirk_flipped_r24] = &instr->src[0]; 1415 } else { 1416 assert(0); 1417 } 1418 1419 bool is_int = midgard_is_integer_op(op); 1420 1421 midgard_vector_alu alu = { 1422 .op = op, 1423 .reg_mode = midgard_reg_mode_32, 1424 .dest_override = midgard_dest_override_none, 1425 .outmod = outmod, 1426 1427 /* Writemask only valid for non-SSA NIR */ 1428 .mask = expand_writemask((1 << nr_components) - 1), 1429 1430 .src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int)), 1431 .src2 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[1], is_int)), 1432 }; 1433 1434 /* Apply writemask if non-SSA, keeping in mind that we can't write to components that don't exist */ 1435 1436 if (!is_ssa) 1437 alu.mask &= expand_writemask(instr->dest.write_mask); 1438 1439 ins.alu = alu; 1440 1441 /* Late fixup for emulated instructions */ 1442 1443 if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { 1444 /* Presently, our second argument is an inline #0 constant. 1445 * Switch over to an embedded 1.0 constant (that can't fit 1446 * inline, since we're 32-bit, not 16-bit like the inline 1447 * constants) */ 1448 1449 ins.ssa_args.inline_constant = false; 1450 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1451 ins.has_constants = true; 1452 1453 if (instr->op == nir_op_b2f32) { 1454 ins.constants[0] = 1.0f; 1455 } else { 1456 /* Type pun it into place */ 1457 uint32_t one = 0x1; 1458 memcpy(&ins.constants[0], &one, sizeof(uint32_t)); 1459 } 1460 1461 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); 1462 } else if (instr->op == nir_op_f2b32 || instr->op == nir_op_i2b32) { 1463 ins.ssa_args.inline_constant = false; 1464 ins.ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1465 ins.has_constants = true; 1466 ins.constants[0] = 0.0f; 1467 ins.alu.src2 = vector_alu_srco_unsigned(blank_alu_src_xxxx); 1468 } else if (instr->op == nir_op_inot) { 1469 /* ~b = ~(b & b), so duplicate the source */ 1470 ins.ssa_args.src1 = ins.ssa_args.src0; 1471 ins.alu.src2 = ins.alu.src1; 1472 } 1473 1474 if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { 1475 /* To avoid duplicating the lookup tables (probably), true LUT 1476 * instructions can only operate as if they were scalars. Lower 1477 * them here by changing the component. */ 1478 1479 uint8_t original_swizzle[4]; 1480 memcpy(original_swizzle, nirmods[0]->swizzle, sizeof(nirmods[0]->swizzle)); 1481 1482 for (int i = 0; i < nr_components; ++i) { 1483 ins.alu.mask = (0x3) << (2 * i); /* Mask the associated component */ 1484 1485 for (int j = 0; j < 4; ++j) 1486 nirmods[0]->swizzle[j] = original_swizzle[i]; /* Pull from the correct component */ 1487 1488 ins.alu.src1 = vector_alu_srco_unsigned(vector_alu_modifiers(nirmods[0], is_int)); 1489 emit_mir_instruction(ctx, ins); 1490 } 1491 } else { 1492 emit_mir_instruction(ctx, ins); 1493 } 1494} 1495 1496#undef ALU_CASE 1497 1498static void 1499emit_uniform_read(compiler_context *ctx, unsigned dest, unsigned offset, nir_src *indirect_offset) 1500{ 1501 /* TODO: half-floats */ 1502 1503 if (!indirect_offset && offset < ctx->uniform_cutoff) { 1504 /* Fast path: For the first 16 uniforms, direct accesses are 1505 * 0-cycle, since they're just a register fetch in the usual 1506 * case. 
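         * (Uniform slot i maps onto register 23 - i here.)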
So, we alias the registers while we're still in 1507 * SSA-space */ 1508 1509 int reg_slot = 23 - offset; 1510 alias_ssa(ctx, dest, SSA_FIXED_REGISTER(reg_slot)); 1511 } else { 1512 /* Otherwise, read from the 'special' UBO to access 1513 * higher-indexed uniforms, at a performance cost. More 1514 * generally, we're emitting a UBO read instruction. */ 1515 1516 midgard_instruction ins = m_load_uniform_32(dest, offset); 1517 1518 /* TODO: Don't split */ 1519 ins.load_store.varying_parameters = (offset & 7) << 7; 1520 ins.load_store.address = offset >> 3; 1521 1522 if (indirect_offset) { 1523 emit_indirect_offset(ctx, indirect_offset); 1524 ins.load_store.unknown = 0x8700; /* xxx: what is this? */ 1525 } else { 1526 ins.load_store.unknown = 0x1E00; /* xxx: what is this? */ 1527 } 1528 1529 emit_mir_instruction(ctx, ins); 1530 } 1531} 1532 1533static void 1534emit_sysval_read(compiler_context *ctx, nir_intrinsic_instr *instr) 1535{ 1536 /* First, pull out the destination */ 1537 unsigned dest = nir_dest_index(ctx, &instr->dest); 1538 1539 /* Now, figure out which uniform this is */ 1540 int sysval = midgard_nir_sysval_for_intrinsic(instr); 1541 void *val = _mesa_hash_table_u64_search(ctx->sysval_to_id, sysval); 1542 1543 /* Sysvals are prefix uniforms */ 1544 unsigned uniform = ((uintptr_t) val) - 1; 1545 1546 /* Emit the read itself -- this is never indirect */ 1547 emit_uniform_read(ctx, dest, uniform, NULL); 1548} 1549 1550static void 1551emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) 1552{ 1553 unsigned offset, reg; 1554 1555 switch (instr->intrinsic) { 1556 case nir_intrinsic_discard_if: 1557 emit_condition(ctx, &instr->src[0], true, COMPONENT_X); 1558 1559 /* fallthrough */ 1560 1561 case nir_intrinsic_discard: { 1562 bool conditional = instr->intrinsic == nir_intrinsic_discard_if; 1563 struct midgard_instruction discard = v_branch(conditional, false); 1564 discard.branch.target_type = TARGET_DISCARD; 1565 emit_mir_instruction(ctx, discard); 1566 1567 ctx->can_discard = true; 1568 break; 1569 } 1570 1571 case nir_intrinsic_load_uniform: 1572 case nir_intrinsic_load_input: 1573 offset = nir_intrinsic_base(instr); 1574 1575 bool direct = nir_src_is_const(instr->src[0]); 1576 1577 if (direct) { 1578 offset += nir_src_as_uint(instr->src[0]); 1579 } 1580 1581 reg = nir_dest_index(ctx, &instr->dest); 1582 1583 if (instr->intrinsic == nir_intrinsic_load_uniform && !ctx->is_blend) { 1584 emit_uniform_read(ctx, reg, ctx->sysval_count + offset, !direct ? &instr->src[0] : NULL); 1585 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->is_blend) { 1586 /* XXX: Half-floats? */ 1587 /* TODO: swizzle, mask */ 1588 1589 midgard_instruction ins = m_load_vary_32(reg, offset); 1590 1591 midgard_varying_parameter p = { 1592 .is_varying = 1, 1593 .interpolation = midgard_interp_default, 1594 .flat = /*var->data.interpolation == INTERP_MODE_FLAT*/ 0 1595 }; 1596 1597 unsigned u; 1598 memcpy(&u, &p, sizeof(p)); 1599 ins.load_store.varying_parameters = u; 1600 1601 if (direct) { 1602 /* We have the offset totally ready */ 1603 ins.load_store.unknown = 0x1e9e; /* xxx: what is this? */ 1604 } else { 1605 /* We have it partially ready, but we need to 1606 * add in the dynamic index, moved to r27.w */ 1607 emit_indirect_offset(ctx, &instr->src[0]); 1608 ins.load_store.unknown = 0x79e; /* xxx: what is this? 
*/ 1609 } 1610 1611 emit_mir_instruction(ctx, ins); 1612 } else if (ctx->is_blend && instr->intrinsic == nir_intrinsic_load_uniform) { 1613 /* Constant encoded as a pinned constant */ 1614 1615 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, reg); 1616 ins.has_constants = true; 1617 ins.has_blend_constant = true; 1618 emit_mir_instruction(ctx, ins); 1619 } else if (ctx->is_blend) { 1620 /* For blend shaders, a load might be 1621 * translated various ways depending on what 1622 * we're loading. Figure out how this is used */ 1623 1624 nir_variable *out = NULL; 1625 1626 nir_foreach_variable(var, &ctx->nir->inputs) { 1627 int drvloc = var->data.driver_location; 1628 1629 if (nir_intrinsic_base(instr) == drvloc) { 1630 out = var; 1631 break; 1632 } 1633 } 1634 1635 assert(out); 1636 1637 if (out->data.location == VARYING_SLOT_COL0) { 1638 /* Source color preloaded to r0 */ 1639 1640 midgard_pin_output(ctx, reg, 0); 1641 } else if (out->data.location == VARYING_SLOT_COL1) { 1642 /* Destination color must be read from framebuffer */ 1643 1644 midgard_instruction ins = m_load_color_buffer_8(reg, 0); 1645 ins.load_store.swizzle = 0; /* xxxx */ 1646 1647 /* Read each component sequentially */ 1648 1649 for (int c = 0; c < 4; ++c) { 1650 ins.load_store.mask = (1 << c); 1651 ins.load_store.unknown = c; 1652 emit_mir_instruction(ctx, ins); 1653 } 1654 1655 /* vadd.u2f hr2, zext(hr2), #0 */ 1656 1657 midgard_vector_alu_src alu_src = blank_alu_src; 1658 alu_src.mod = midgard_int_zero_extend; 1659 alu_src.half = true; 1660 1661 midgard_instruction u2f = { 1662 .type = TAG_ALU_4, 1663 .ssa_args = { 1664 .src0 = reg, 1665 .src1 = SSA_UNUSED_0, 1666 .dest = reg, 1667 .inline_constant = true 1668 }, 1669 .alu = { 1670 .op = midgard_alu_op_u2f, 1671 .reg_mode = midgard_reg_mode_16, 1672 .dest_override = midgard_dest_override_none, 1673 .mask = 0xF, 1674 .src1 = vector_alu_srco_unsigned(alu_src), 1675 .src2 = vector_alu_srco_unsigned(blank_alu_src), 1676 } 1677 }; 1678 1679 emit_mir_instruction(ctx, u2f); 1680 1681 /* vmul.fmul.sat r1, hr2, #0.00392151 */ 1682 1683 alu_src.mod = 0; 1684 1685 midgard_instruction fmul = { 1686 .type = TAG_ALU_4, 1687 .inline_constant = _mesa_float_to_half(1.0 / 255.0), 1688 .ssa_args = { 1689 .src0 = reg, 1690 .dest = reg, 1691 .src1 = SSA_UNUSED_0, 1692 .inline_constant = true 1693 }, 1694 .alu = { 1695 .op = midgard_alu_op_fmul, 1696 .reg_mode = midgard_reg_mode_32, 1697 .dest_override = midgard_dest_override_none, 1698 .outmod = midgard_outmod_sat, 1699 .mask = 0xFF, 1700 .src1 = vector_alu_srco_unsigned(alu_src), 1701 .src2 = vector_alu_srco_unsigned(blank_alu_src), 1702 } 1703 }; 1704 1705 emit_mir_instruction(ctx, fmul); 1706 } else { 1707 DBG("Unknown input in blend shader\n"); 1708 assert(0); 1709 } 1710 } else if (ctx->stage == MESA_SHADER_VERTEX) { 1711 midgard_instruction ins = m_load_attr_32(reg, offset); 1712 ins.load_store.unknown = 0x1E1E; /* XXX: What is this? */ 1713 ins.load_store.mask = (1 << instr->num_components) - 1; 1714 emit_mir_instruction(ctx, ins); 1715 } else { 1716 DBG("Unknown load\n"); 1717 assert(0); 1718 } 1719 1720 break; 1721 1722 case nir_intrinsic_store_output: 1723 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); 1724 1725 offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); 1726 1727 reg = nir_src_index(ctx, &instr->src[0]); 1728 1729 if (ctx->stage == MESA_SHADER_FRAGMENT) { 1730 /* gl_FragColor is not emitted with load/store 1731 * instructions. 
Instead, it gets plonked into 1732 * r0 at the end of the shader and we do the 1733 * framebuffer writeout dance. TODO: Defer 1734 * writes */ 1735 1736 midgard_pin_output(ctx, reg, 0); 1737 1738 /* Save the index we're writing to for later reference 1739 * in the epilogue */ 1740 1741 ctx->fragment_output = reg; 1742 } else if (ctx->stage == MESA_SHADER_VERTEX) { 1743 /* Varyings are written into one of two special 1744 * varying registers, r26 or r27. The register is selected in the st_vary 1745 * instruction as an index relative to r26; e.g. write into r27 and then call st_vary(1). 1746 * 1747 * Normally emitting fmovs is frowned upon, 1748 * but due to unique constraints of 1749 * REGISTER_VARYING, fmov emission + a 1750 * dedicated cleanup pass is the only way to 1751 * guarantee correctness when considering some 1752 * (common) edge cases. XXX: FIXME */ 1753 1754 /* If this varying corresponds to a constant (why?!), 1755 * emit that now since it won't get picked up by 1756 * hoisting (since there is no corresponding move 1757 * emitted otherwise) */ 1758 1759 void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, reg + 1); 1760 1761 if (constant_value) { 1762 /* Special case: emit the varying write 1763 * directly to r26 (looks funny in asm but it's 1764 * fine) and emit the store _now_. Possibly 1765 * slightly slower, but this is a really stupid 1766 * special case anyway (why on earth would you 1767 * have a constant varying? Your own fault for 1768 * slightly worse perf :P) */ 1769 1770 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(26)); 1771 attach_constants(ctx, &ins, constant_value, reg + 1); 1772 emit_mir_instruction(ctx, ins); 1773 1774 midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(0), offset); 1775 st.load_store.unknown = 0x1E9E; /* XXX: What is this?
*/ 1776 emit_mir_instruction(ctx, st); 1777 } else { 1778 /* Do not emit the varying yet -- instead, just mark down that we need to later */ 1779 1780 _mesa_hash_table_u64_insert(ctx->ssa_varyings, reg + 1, (void *) ((uintptr_t) (offset + 1))); 1781 } 1782 } else { 1783 DBG("Unknown store\n"); 1784 assert(0); 1785 } 1786 1787 break; 1788 1789 case nir_intrinsic_load_alpha_ref_float: 1790 assert(instr->dest.is_ssa); 1791 1792 float ref_value = ctx->alpha_ref; 1793 1794 float *v = ralloc_array(NULL, float, 4); 1795 memcpy(v, &ref_value, sizeof(float)); 1796 _mesa_hash_table_u64_insert(ctx->ssa_constants, instr->dest.ssa.index + 1, v); 1797 break; 1798 1799 case nir_intrinsic_load_viewport_scale: 1800 case nir_intrinsic_load_viewport_offset: 1801 emit_sysval_read(ctx, instr); 1802 break; 1803 1804 default: 1805 printf ("Unhandled intrinsic\n"); 1806 assert(0); 1807 break; 1808 } 1809} 1810 1811static unsigned 1812midgard_tex_format(enum glsl_sampler_dim dim) 1813{ 1814 switch (dim) { 1815 case GLSL_SAMPLER_DIM_2D: 1816 case GLSL_SAMPLER_DIM_EXTERNAL: 1817 return TEXTURE_2D; 1818 1819 case GLSL_SAMPLER_DIM_3D: 1820 return TEXTURE_3D; 1821 1822 case GLSL_SAMPLER_DIM_CUBE: 1823 return TEXTURE_CUBE; 1824 1825 default: 1826 DBG("Unknown sampler dim type\n"); 1827 assert(0); 1828 return 0; 1829 } 1830} 1831 1832static void 1833emit_tex(compiler_context *ctx, nir_tex_instr *instr) 1834{ 1835 /* TODO */ 1836 //assert (!instr->sampler); 1837 //assert (!instr->texture_array_size); 1838 assert (instr->op == nir_texop_tex); 1839 1840 /* Allocate registers via a round robin scheme to alternate between the two registers */ 1841 int reg = ctx->texture_op_count & 1; 1842 int in_reg = reg, out_reg = reg; 1843 1844 /* Make room for the reg */ 1845 1846 if (ctx->texture_index[reg] > -1) 1847 unalias_ssa(ctx, ctx->texture_index[reg]); 1848 1849 int texture_index = instr->texture_index; 1850 int sampler_index = texture_index; 1851 1852 for (unsigned i = 0; i < instr->num_srcs; ++i) { 1853 switch (instr->src[i].src_type) { 1854 case nir_tex_src_coord: { 1855 int index = nir_src_index(ctx, &instr->src[i].src); 1856 1857 midgard_vector_alu_src alu_src = blank_alu_src; 1858 1859 int reg = SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + in_reg); 1860 1861 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 1862 /* For cubemaps, we need to load coords into 1863 * special r27, and then use a special ld/st op 1864 * to copy into the texture register */ 1865 1866 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_X); 1867 1868 midgard_instruction move = v_fmov(index, alu_src, SSA_FIXED_REGISTER(27)); 1869 emit_mir_instruction(ctx, move); 1870 1871 midgard_instruction st = m_store_cubemap_coords(reg, 0); 1872 st.load_store.unknown = 0x24; /* XXX: What is this? */ 1873 st.load_store.mask = 0x3; /* xy? 
*/ 1874 st.load_store.swizzle = alu_src.swizzle; 1875 emit_mir_instruction(ctx, st); 1876 1877 } else { 1878 alu_src.swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_X, COMPONENT_X); 1879 1880 midgard_instruction ins = v_fmov(index, alu_src, reg); 1881 emit_mir_instruction(ctx, ins); 1882 } 1883 1884 break; 1885 } 1886 1887 default: { 1888 DBG("Unknown source type\n"); 1889 //assert(0); 1890 break; 1891 } 1892 } 1893 } 1894 1895 /* No helper to build texture words -- we do it all here */ 1896 midgard_instruction ins = { 1897 .type = TAG_TEXTURE_4, 1898 .texture = { 1899 .op = TEXTURE_OP_NORMAL, 1900 .format = midgard_tex_format(instr->sampler_dim), 1901 .texture_handle = texture_index, 1902 .sampler_handle = sampler_index, 1903 1904 /* TODO: Don't force xyzw */ 1905 .swizzle = SWIZZLE(COMPONENT_X, COMPONENT_Y, COMPONENT_Z, COMPONENT_W), 1906 .mask = 0xF, 1907 1908 /* TODO: half */ 1909 //.in_reg_full = 1, 1910 .out_full = 1, 1911 1912 .filter = 1, 1913 1914 /* Always 1 */ 1915 .unknown7 = 1, 1916 1917 /* Assume we can continue; hint it out later */ 1918 .cont = 1, 1919 } 1920 }; 1921 1922 /* Set registers to read and write from the same place */ 1923 ins.texture.in_reg_select = in_reg; 1924 ins.texture.out_reg_select = out_reg; 1925 1926 /* TODO: Dynamic swizzle input selection, half-swizzles? */ 1927 if (instr->sampler_dim == GLSL_SAMPLER_DIM_3D) { 1928 ins.texture.in_reg_swizzle_right = COMPONENT_X; 1929 ins.texture.in_reg_swizzle_left = COMPONENT_Y; 1930 //ins.texture.in_reg_swizzle_third = COMPONENT_Z; 1931 } else { 1932 ins.texture.in_reg_swizzle_left = COMPONENT_X; 1933 ins.texture.in_reg_swizzle_right = COMPONENT_Y; 1934 //ins.texture.in_reg_swizzle_third = COMPONENT_X; 1935 } 1936 1937 emit_mir_instruction(ctx, ins); 1938 1939 /* Simultaneously alias the destination and emit a move for it. 
The move will be eliminated if possible */ 1940 1941 int o_reg = REGISTER_TEXTURE_BASE + out_reg, o_index = nir_dest_index(ctx, &instr->dest); 1942 alias_ssa(ctx, o_index, SSA_FIXED_REGISTER(o_reg)); 1943 ctx->texture_index[reg] = o_index; 1944 1945 midgard_instruction ins2 = v_fmov(SSA_FIXED_REGISTER(o_reg), blank_alu_src, o_index); 1946 emit_mir_instruction(ctx, ins2); 1947 1948 /* Used for .cont and .last hinting */ 1949 ctx->texture_op_count++; 1950} 1951 1952static void 1953emit_jump(compiler_context *ctx, nir_jump_instr *instr) 1954{ 1955 switch (instr->type) { 1956 case nir_jump_break: { 1957 /* Emit a branch out of the loop */ 1958 struct midgard_instruction br = v_branch(false, false); 1959 br.branch.target_type = TARGET_BREAK; 1960 br.branch.target_break = ctx->current_loop_depth; 1961 emit_mir_instruction(ctx, br); 1962 1963 DBG("break..\n"); 1964 break; 1965 } 1966 1967 default: 1968 DBG("Unknown jump type %d\n", instr->type); 1969 break; 1970 } 1971} 1972 1973static void 1974emit_instr(compiler_context *ctx, struct nir_instr *instr) 1975{ 1976 switch (instr->type) { 1977 case nir_instr_type_load_const: 1978 emit_load_const(ctx, nir_instr_as_load_const(instr)); 1979 break; 1980 1981 case nir_instr_type_intrinsic: 1982 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); 1983 break; 1984 1985 case nir_instr_type_alu: 1986 emit_alu(ctx, nir_instr_as_alu(instr)); 1987 break; 1988 1989 case nir_instr_type_tex: 1990 emit_tex(ctx, nir_instr_as_tex(instr)); 1991 break; 1992 1993 case nir_instr_type_jump: 1994 emit_jump(ctx, nir_instr_as_jump(instr)); 1995 break; 1996 1997 case nir_instr_type_ssa_undef: 1998 /* Spurious */ 1999 break; 2000 2001 default: 2002 DBG("Unhandled instruction type\n"); 2003 break; 2004 } 2005} 2006 2007/* Determine the actual hardware from the index based on the RA results or special values */ 2008 2009static int 2010dealias_register(compiler_context *ctx, struct ra_graph *g, int reg, int maxreg) 2011{ 2012 if (reg >= SSA_FIXED_MINIMUM) 2013 return SSA_REG_FROM_FIXED(reg); 2014 2015 if (reg >= 0) { 2016 assert(reg < maxreg); 2017 assert(g); 2018 int r = ra_get_node_reg(g, reg); 2019 ctx->work_registers = MAX2(ctx->work_registers, r); 2020 return r; 2021 } 2022 2023 switch (reg) { 2024 /* fmov style unused */ 2025 case SSA_UNUSED_0: 2026 return REGISTER_UNUSED; 2027 2028 /* lut style unused */ 2029 case SSA_UNUSED_1: 2030 return REGISTER_UNUSED; 2031 2032 default: 2033 DBG("Unknown SSA register alias %d\n", reg); 2034 assert(0); 2035 return 31; 2036 } 2037} 2038 2039static unsigned int 2040midgard_ra_select_callback(struct ra_graph *g, BITSET_WORD *regs, void *data) 2041{ 2042 /* Choose the first available register to minimise reported register pressure */ 2043 2044 for (int i = 0; i < 16; ++i) { 2045 if (BITSET_TEST(regs, i)) { 2046 return i; 2047 } 2048 } 2049 2050 assert(0); 2051 return 0; 2052} 2053 2054static bool 2055midgard_is_live_in_instr(midgard_instruction *ins, int src) 2056{ 2057 if (ins->ssa_args.src0 == src) return true; 2058 if (ins->ssa_args.src1 == src) return true; 2059 2060 return false; 2061} 2062 2063/* Determine if a variable is live in the successors of a block */ 2064static bool 2065is_live_after_successors(compiler_context *ctx, midgard_block *bl, int src) 2066{ 2067 for (unsigned i = 0; i < bl->nr_successors; ++i) { 2068 midgard_block *succ = bl->successors[i]; 2069 2070 /* If we already visited, the value we're seeking 2071 * isn't down this path (or we would have short 2072 * circuited */ 2073 2074 if (succ->visited) continue; 2075 
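                /* Note: this is a depth-first walk over the successor graph;
                 * the visited flags are what keep the cycles introduced by
                 * loop back-edges from recursing forever, and is_live_after
                 * clears them again once the whole query finishes. */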
2076 /* Otherwise (it's visited *now*), check the block */ 2077 2078 succ->visited = true; 2079 2080 mir_foreach_instr_in_block(succ, ins) { 2081 if (midgard_is_live_in_instr(ins, src)) 2082 return true; 2083 } 2084 2085 /* ...and also, check *its* successors */ 2086 if (is_live_after_successors(ctx, succ, src)) 2087 return true; 2088 2089 } 2090 2091 /* Welp. We're really not live. */ 2092 2093 return false; 2094} 2095 2096static bool 2097is_live_after(compiler_context *ctx, midgard_block *block, midgard_instruction *start, int src) 2098{ 2099 /* Check the rest of the block for liveness */ 2100 2101 mir_foreach_instr_in_block_from(block, ins, mir_next_op(start)) { 2102 if (midgard_is_live_in_instr(ins, src)) 2103 return true; 2104 } 2105 2106 /* Check the rest of the blocks for liveness recursively */ 2107 2108 bool succ = is_live_after_successors(ctx, block, src); 2109 2110 mir_foreach_block(ctx, block) { 2111 block->visited = false; 2112 } 2113 2114 return succ; 2115} 2116 2117/* Once registers have been decided via register allocation 2118 * (allocate_registers), we need to rewrite the MIR to use registers instead of 2119 * SSA */ 2120 2121static void 2122install_registers(compiler_context *ctx, struct ra_graph *g) 2123{ 2124 mir_foreach_block(ctx, block) { 2125 mir_foreach_instr_in_block(block, ins) { 2126 if (ins->compact_branch) continue; 2127 2128 ssa_args args = ins->ssa_args; 2129 2130 switch (ins->type) { 2131 case TAG_ALU_4: 2132 ins->registers.src1_reg = dealias_register(ctx, g, args.src0, ctx->temp_count); 2133 2134 ins->registers.src2_imm = args.inline_constant; 2135 2136 if (args.inline_constant) { 2137 /* Encode inline 16-bit constant as a vector by default */ 2138 2139 ins->registers.src2_reg = ins->inline_constant >> 11; 2140 2141 int lower_11 = ins->inline_constant & ((1 << 12) - 1); 2142 2143 uint16_t imm = ((lower_11 >> 8) & 0x7) | ((lower_11 & 0xFF) << 3); 2144 ins->alu.src2 = imm << 2; 2145 } else { 2146 ins->registers.src2_reg = dealias_register(ctx, g, args.src1, ctx->temp_count); 2147 } 2148 2149 ins->registers.out_reg = dealias_register(ctx, g, args.dest, ctx->temp_count); 2150 2151 break; 2152 2153 case TAG_LOAD_STORE_4: { 2154 if (OP_IS_STORE_VARY(ins->load_store.op)) { 2155 /* TODO: use ssa_args for store_vary */ 2156 ins->load_store.reg = 0; 2157 } else { 2158 bool has_dest = args.dest >= 0; 2159 int ssa_arg = has_dest ? args.dest : args.src0; 2160 2161 ins->load_store.reg = dealias_register(ctx, g, ssa_arg, ctx->temp_count); 2162 } 2163 2164 break; 2165 } 2166 2167 default: 2168 break; 2169 } 2170 } 2171 } 2172 2173} 2174 2175/* This routine performs the actual register allocation. 
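 * (Register classes and the interference graph come from Mesa's
 * util/register_allocate helpers; liveness is approximated here with
 * per-node live ranges over a single global instruction ordering rather
 * than a full dataflow analysis.)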
It should be succeeded 2176 * by install_registers */ 2177 2178static struct ra_graph * 2179allocate_registers(compiler_context *ctx) 2180{ 2181 /* First, initialize the RA */ 2182 struct ra_regs *regs = ra_alloc_reg_set(NULL, 32, true); 2183 2184 /* Create a primary (general purpose) class, as well as special purpose 2185 * pipeline register classes */ 2186 2187 int primary_class = ra_alloc_reg_class(regs); 2188 int varying_class = ra_alloc_reg_class(regs); 2189 2190 /* Add the full set of work registers */ 2191 int work_count = 16 - MAX2((ctx->uniform_cutoff - 8), 0); 2192 for (int i = 0; i < work_count; ++i) 2193 ra_class_add_reg(regs, primary_class, i); 2194 2195 /* Add special registers */ 2196 ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE); 2197 ra_class_add_reg(regs, varying_class, REGISTER_VARYING_BASE + 1); 2198 2199 /* We're done setting up */ 2200 ra_set_finalize(regs, NULL); 2201 2202 /* Transform the MIR into squeezed index form */ 2203 mir_foreach_block(ctx, block) { 2204 mir_foreach_instr_in_block(block, ins) { 2205 if (ins->compact_branch) continue; 2206 2207 ins->ssa_args.src0 = find_or_allocate_temp(ctx, ins->ssa_args.src0); 2208 ins->ssa_args.src1 = find_or_allocate_temp(ctx, ins->ssa_args.src1); 2209 ins->ssa_args.dest = find_or_allocate_temp(ctx, ins->ssa_args.dest); 2210 } 2211 if (midgard_debug & MIDGARD_DBG_SHADERS) 2212 print_mir_block(block); 2213 } 2214 2215 /* No register allocation to do with no SSA */ 2216 2217 if (!ctx->temp_count) 2218 return NULL; 2219 2220 /* Let's actually do register allocation */ 2221 int nodes = ctx->temp_count; 2222 struct ra_graph *g = ra_alloc_interference_graph(regs, nodes); 2223 2224 /* Set everything to the work register class, unless it has somewhere 2225 * special to go */ 2226 2227 mir_foreach_block(ctx, block) { 2228 mir_foreach_instr_in_block(block, ins) { 2229 if (ins->compact_branch) continue; 2230 2231 if (ins->ssa_args.dest < 0) continue; 2232 2233 if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; 2234 2235 int class = primary_class; 2236 2237 ra_set_node_class(g, ins->ssa_args.dest, class); 2238 } 2239 } 2240 2241 for (int index = 0; index <= ctx->max_hash; ++index) { 2242 unsigned temp = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_register, index + 1); 2243 2244 if (temp) { 2245 unsigned reg = temp - 1; 2246 int t = find_or_allocate_temp(ctx, index); 2247 ra_set_node_reg(g, t, reg); 2248 } 2249 } 2250 2251 /* Determine liveness */ 2252 2253 int *live_start = malloc(nodes * sizeof(int)); 2254 int *live_end = malloc(nodes * sizeof(int)); 2255 2256 /* Initialize as non-existent */ 2257 2258 for (int i = 0; i < nodes; ++i) { 2259 live_start[i] = live_end[i] = -1; 2260 } 2261 2262 int d = 0; 2263 2264 mir_foreach_block(ctx, block) { 2265 mir_foreach_instr_in_block(block, ins) { 2266 if (ins->compact_branch) continue; 2267 2268 /* Dest is < 0 for store_vary instructions, which break 2269 * the usual SSA conventions. Liveness analysis doesn't 2270 * make sense on these instructions, so skip them to 2271 * avoid memory corruption */ 2272 2273 if (ins->ssa_args.dest < 0) continue; 2274 2275 if (ins->ssa_args.dest < SSA_FIXED_MINIMUM) { 2276 /* If this destination is not yet live, it is now since we just wrote it */ 2277 2278 int dest = ins->ssa_args.dest; 2279 2280 if (live_start[dest] == -1) 2281 live_start[dest] = d; 2282 } 2283 2284 /* Since we just used a source, the source might be 2285 * dead now. 
Scan the rest of the block for 2286 * invocations, and if there are none, the source dies 2287 * */ 2288 2289 int sources[2] = { ins->ssa_args.src0, ins->ssa_args.src1 }; 2290 2291 for (int src = 0; src < 2; ++src) { 2292 int s = sources[src]; 2293 2294 if (s < 0) continue; 2295 2296 if (s >= SSA_FIXED_MINIMUM) continue; 2297 2298 if (!is_live_after(ctx, block, ins, s)) { 2299 live_end[s] = d; 2300 } 2301 } 2302 2303 ++d; 2304 } 2305 } 2306 2307 /* If a node still hasn't been killed, kill it now */ 2308 2309 for (int i = 0; i < nodes; ++i) { 2310 /* live_start == -1 most likely indicates a pinned output */ 2311 2312 if (live_end[i] == -1) 2313 live_end[i] = d; 2314 } 2315 2316 /* Setup interference between nodes that are live at the same time */ 2317 2318 for (int i = 0; i < nodes; ++i) { 2319 for (int j = i + 1; j < nodes; ++j) { 2320 if (!(live_start[i] >= live_end[j] || live_start[j] >= live_end[i])) 2321 ra_add_node_interference(g, i, j); 2322 } 2323 } 2324 2325 ra_set_select_reg_callback(g, midgard_ra_select_callback, NULL); 2326 2327 if (!ra_allocate(g)) { 2328 DBG("Error allocating registers\n"); 2329 assert(0); 2330 } 2331 2332 /* Cleanup */ 2333 free(live_start); 2334 free(live_end); 2335 2336 return g; 2337} 2338 2339/* Midgard IR only knows vector ALU types, but we sometimes need to actually 2340 * use scalar ALU instructions, for functional or performance reasons. To do 2341 * this, we just demote vector ALU payloads to scalar. */ 2342 2343static int 2344component_from_mask(unsigned mask) 2345{ 2346 for (int c = 0; c < 4; ++c) { 2347 if (mask & (3 << (2 * c))) 2348 return c; 2349 } 2350 2351 assert(0); 2352 return 0; 2353} 2354 2355static bool 2356is_single_component_mask(unsigned mask) 2357{ 2358 int components = 0; 2359 2360 for (int c = 0; c < 4; ++c) 2361 if (mask & (3 << (2 * c))) 2362 components++; 2363 2364 return components == 1; 2365} 2366 2367/* Create a mask of accessed components from a swizzle to figure out vector 2368 * dependencies */ 2369 2370static unsigned 2371swizzle_to_access_mask(unsigned swizzle) 2372{ 2373 unsigned component_mask = 0; 2374 2375 for (int i = 0; i < 4; ++i) { 2376 unsigned c = (swizzle >> (2 * i)) & 3; 2377 component_mask |= (1 << c); 2378 } 2379 2380 return component_mask; 2381} 2382 2383static unsigned 2384vector_to_scalar_source(unsigned u, bool is_int) 2385{ 2386 midgard_vector_alu_src v; 2387 memcpy(&v, &u, sizeof(v)); 2388 2389 /* TODO: Integers */ 2390 2391 midgard_scalar_alu_src s = { 2392 .full = !v.half, 2393 .component = (v.swizzle & 3) << 1 2394 }; 2395 2396 if (is_int) { 2397 /* TODO */ 2398 } else { 2399 s.abs = v.mod & MIDGARD_FLOAT_MOD_ABS; 2400 s.negate = v.mod & MIDGARD_FLOAT_MOD_NEG; 2401 } 2402 2403 unsigned o; 2404 memcpy(&o, &s, sizeof(s)); 2405 2406 return o & ((1 << 6) - 1); 2407} 2408 2409static midgard_scalar_alu 2410vector_to_scalar_alu(midgard_vector_alu v, midgard_instruction *ins) 2411{ 2412 bool is_int = midgard_is_integer_op(v.op); 2413 2414 /* The output component is from the mask */ 2415 midgard_scalar_alu s = { 2416 .op = v.op, 2417 .src1 = vector_to_scalar_source(v.src1, is_int), 2418 .src2 = vector_to_scalar_source(v.src2, is_int), 2419 .unknown = 0, 2420 .outmod = v.outmod, 2421 .output_full = 1, /* TODO: Half */ 2422 .output_component = component_from_mask(v.mask) << 1, 2423 }; 2424 2425 /* Inline constant is passed along rather than trying to extract it 2426 * from v */ 2427 2428 if (ins->ssa_args.inline_constant) { 2429 uint16_t imm = 0; 2430 int lower_11 = ins->inline_constant & ((1 << 12) - 1); 
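                /* The scalar ALU word packs the inline constant differently
                 * from the vector word, so scramble the low bits to match:
                 * bits 10:9 land in imm[1:0], bit 8 in imm[2], bits 7:5 in
                 * imm[5:3] and bits 5:0 in imm[11:6].  Worked example with a
                 * made-up value, purely illustrative:
                 *
                 *    lower_11 = 0x5A3
                 *    (0x5A3 >> 9) & 0x3  = 0x002
                 *    (0x5A3 >> 6) & 0x4  = 0x004
                 *    (0x5A3 >> 2) & 0x38 = 0x028
                 *    (0x5A3 & 0x3F) << 6 = 0x8C0
                 *                    imm = 0x8EE
                 */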
2431 imm |= (lower_11 >> 9) & 3; 2432 imm |= (lower_11 >> 6) & 4; 2433 imm |= (lower_11 >> 2) & 0x38; 2434 imm |= (lower_11 & 63) << 6; 2435 2436 s.src2 = imm; 2437 } 2438 2439 return s; 2440} 2441 2442/* Midgard prefetches instruction types, so during emission we need to 2443 * lookahead too. Unless this is the last instruction, in which we return 1. Or 2444 * if this is the second to last and the last is an ALU, then it's also 1... */ 2445 2446#define IS_ALU(tag) (tag == TAG_ALU_4 || tag == TAG_ALU_8 || \ 2447 tag == TAG_ALU_12 || tag == TAG_ALU_16) 2448 2449#define EMIT_AND_COUNT(type, val) util_dynarray_append(emission, type, val); \ 2450 bytes_emitted += sizeof(type) 2451 2452static void 2453emit_binary_vector_instruction(midgard_instruction *ains, 2454 uint16_t *register_words, int *register_words_count, 2455 uint64_t *body_words, size_t *body_size, int *body_words_count, 2456 size_t *bytes_emitted) 2457{ 2458 memcpy(®ister_words[(*register_words_count)++], &ains->registers, sizeof(ains->registers)); 2459 *bytes_emitted += sizeof(midgard_reg_info); 2460 2461 body_size[*body_words_count] = sizeof(midgard_vector_alu); 2462 memcpy(&body_words[(*body_words_count)++], &ains->alu, sizeof(ains->alu)); 2463 *bytes_emitted += sizeof(midgard_vector_alu); 2464} 2465 2466/* Checks for an SSA data hazard between two adjacent instructions, keeping in 2467 * mind that we are a vector architecture and we can write to different 2468 * components simultaneously */ 2469 2470static bool 2471can_run_concurrent_ssa(midgard_instruction *first, midgard_instruction *second) 2472{ 2473 /* Each instruction reads some registers and writes to a register. See 2474 * where the first writes */ 2475 2476 /* Figure out where exactly we wrote to */ 2477 int source = first->ssa_args.dest; 2478 int source_mask = first->type == TAG_ALU_4 ? squeeze_writemask(first->alu.mask) : 0xF; 2479 2480 /* As long as the second doesn't read from the first, we're okay */ 2481 if (second->ssa_args.src0 == source) { 2482 if (first->type == TAG_ALU_4) { 2483 /* Figure out which components we just read from */ 2484 2485 int q = second->alu.src1; 2486 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; 2487 2488 /* Check if there are components in common, and fail if so */ 2489 if (swizzle_to_access_mask(m->swizzle) & source_mask) 2490 return false; 2491 } else 2492 return false; 2493 2494 } 2495 2496 if (second->ssa_args.src1 == source) 2497 return false; 2498 2499 /* Otherwise, it's safe in that regard. Another data hazard is both 2500 * writing to the same place, of course */ 2501 2502 if (second->ssa_args.dest == source) { 2503 /* ...but only if the components overlap */ 2504 int dest_mask = second->type == TAG_ALU_4 ? squeeze_writemask(second->alu.mask) : 0xF; 2505 2506 if (dest_mask & source_mask) 2507 return false; 2508 } 2509 2510 /* ...That's it */ 2511 return true; 2512} 2513 2514static bool 2515midgard_has_hazard( 2516 midgard_instruction **segment, unsigned segment_size, 2517 midgard_instruction *ains) 2518{ 2519 for (int s = 0; s < segment_size; ++s) 2520 if (!can_run_concurrent_ssa(segment[s], ains)) 2521 return true; 2522 2523 return false; 2524 2525 2526} 2527 2528/* Schedules, but does not emit, a single basic block. 
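 * A bundle groups at most one instruction per ALU slot (vmul, sadd,
 * vadd, smul, lut, branch) together with at most one set of embedded
 * constants; the ALU case below greedily packs instructions into those
 * slots until a structural or data hazard forces a break.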
After scheduling, the 2529 * final tag and size of the block are known, which are necessary for branching 2530 * */ 2531 2532static midgard_bundle 2533schedule_bundle(compiler_context *ctx, midgard_block *block, midgard_instruction *ins, int *skip) 2534{ 2535 int instructions_emitted = 0, instructions_consumed = -1; 2536 midgard_bundle bundle = { 0 }; 2537 2538 uint8_t tag = ins->type; 2539 2540 /* Default to the instruction's tag */ 2541 bundle.tag = tag; 2542 2543 switch (ins->type) { 2544 case TAG_ALU_4: { 2545 uint32_t control = 0; 2546 size_t bytes_emitted = sizeof(control); 2547 2548 /* TODO: Constant combining */ 2549 int index = 0, last_unit = 0; 2550 2551 /* Previous instructions, for the purpose of parallelism */ 2552 midgard_instruction *segment[4] = {0}; 2553 int segment_size = 0; 2554 2555 instructions_emitted = -1; 2556 midgard_instruction *pins = ins; 2557 2558 for (;;) { 2559 midgard_instruction *ains = pins; 2560 2561 /* Advance instruction pointer */ 2562 if (index) { 2563 ains = mir_next_op(pins); 2564 pins = ains; 2565 } 2566 2567 /* Out-of-work condition */ 2568 if ((struct list_head *) ains == &block->instructions) 2569 break; 2570 2571 /* Ensure that the chain can continue */ 2572 if (ains->type != TAG_ALU_4) break; 2573 2574 /* According to the presentation "The ARM 2575 * Mali-T880 Mobile GPU" from HotChips 27, 2576 * there are two pipeline stages. Branching 2577 * position determined experimentally. Lines 2578 * are executed in parallel: 2579 * 2580 * [ VMUL ] [ SADD ] 2581 * [ VADD ] [ SMUL ] [ LUT ] [ BRANCH ] 2582 * 2583 * Verify that there are no ordering dependencies here. 2584 * 2585 * TODO: Allow for parallelism!!! 2586 */ 2587 2588 /* Pick a unit for it if it doesn't force a particular unit */ 2589 2590 int unit = ains->unit; 2591 2592 if (!unit) { 2593 int op = ains->alu.op; 2594 int units = alu_opcode_props[op].props; 2595 2596 /* TODO: Promotion of scalars to vectors */ 2597 int vector = ((!is_single_component_mask(ains->alu.mask)) || ((units & UNITS_SCALAR) == 0)) && (units & UNITS_ANY_VECTOR); 2598 2599 if (!vector) 2600 assert(units & UNITS_SCALAR); 2601 2602 if (vector) { 2603 if (last_unit >= UNIT_VADD) { 2604 if (units & UNIT_VLUT) 2605 unit = UNIT_VLUT; 2606 else 2607 break; 2608 } else { 2609 if ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) 2610 unit = UNIT_VMUL; 2611 else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) 2612 unit = UNIT_VADD; 2613 else if (units & UNIT_VLUT) 2614 unit = UNIT_VLUT; 2615 else 2616 break; 2617 } 2618 } else { 2619 if (last_unit >= UNIT_VADD) { 2620 if ((units & UNIT_SMUL) && !(control & UNIT_SMUL)) 2621 unit = UNIT_SMUL; 2622 else if (units & UNIT_VLUT) 2623 unit = UNIT_VLUT; 2624 else 2625 break; 2626 } else { 2627 if ((units & UNIT_SADD) && !(control & UNIT_SADD) && !midgard_has_hazard(segment, segment_size, ains)) 2628 unit = UNIT_SADD; 2629 else if (units & UNIT_SMUL) 2630 unit = ((units & UNIT_VMUL) && !(control & UNIT_VMUL)) ? 
UNIT_VMUL : UNIT_SMUL; 2631 else if ((units & UNIT_VADD) && !(control & UNIT_VADD)) 2632 unit = UNIT_VADD; 2633 else 2634 break; 2635 } 2636 } 2637 2638 assert(unit & units); 2639 } 2640 2641 /* Late unit check, this time for encoding (not parallelism) */ 2642 if (unit <= last_unit) break; 2643 2644 /* Clear the segment */ 2645 if (last_unit < UNIT_VADD && unit >= UNIT_VADD) 2646 segment_size = 0; 2647 2648 if (midgard_has_hazard(segment, segment_size, ains)) 2649 break; 2650 2651 /* We're good to go -- emit the instruction */ 2652 ains->unit = unit; 2653 2654 segment[segment_size++] = ains; 2655 2656 /* Only one set of embedded constants per 2657 * bundle possible; if we have more, we must 2658 * break the chain early, unfortunately */ 2659 2660 if (ains->has_constants) { 2661 if (bundle.has_embedded_constants) { 2662 /* ...but if there are already 2663 * constants but these are the 2664 * *same* constants, we let it 2665 * through */ 2666 2667 if (memcmp(bundle.constants, ains->constants, sizeof(bundle.constants))) 2668 break; 2669 } else { 2670 bundle.has_embedded_constants = true; 2671 memcpy(bundle.constants, ains->constants, sizeof(bundle.constants)); 2672 2673 /* If this is a blend shader special constant, track it for patching */ 2674 if (ains->has_blend_constant) 2675 bundle.has_blend_constant = true; 2676 } 2677 } 2678 2679 if (ains->unit & UNITS_ANY_VECTOR) { 2680 emit_binary_vector_instruction(ains, bundle.register_words, 2681 &bundle.register_words_count, bundle.body_words, 2682 bundle.body_size, &bundle.body_words_count, &bytes_emitted); 2683 } else if (ains->compact_branch) { 2684 /* All of r0 has to be written out 2685 * along with the branch writeout. 2686 * (slow!) */ 2687 2688 if (ains->writeout) { 2689 if (index == 0) { 2690 midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); 2691 ins.unit = UNIT_VMUL; 2692 2693 control |= ins.unit; 2694 2695 emit_binary_vector_instruction(&ins, bundle.register_words, 2696 &bundle.register_words_count, bundle.body_words, 2697 bundle.body_size, &bundle.body_words_count, &bytes_emitted); 2698 } else { 2699 /* Analyse the group to see if r0 is written in full, on-time, without hanging dependencies*/ 2700 bool written_late = false; 2701 bool components[4] = { 0 }; 2702 uint16_t register_dep_mask = 0; 2703 uint16_t written_mask = 0; 2704 2705 midgard_instruction *qins = ins; 2706 for (int t = 0; t < index; ++t) { 2707 if (qins->registers.out_reg != 0) { 2708 /* Mark down writes */ 2709 2710 written_mask |= (1 << qins->registers.out_reg); 2711 } else { 2712 /* Mark down the register dependencies for errata check */ 2713 2714 if (qins->registers.src1_reg < 16) 2715 register_dep_mask |= (1 << qins->registers.src1_reg); 2716 2717 if (qins->registers.src2_reg < 16) 2718 register_dep_mask |= (1 << qins->registers.src2_reg); 2719 2720 int mask = qins->alu.mask; 2721 2722 for (int c = 0; c < 4; ++c) 2723 if (mask & (0x3 << (2 * c))) 2724 components[c] = true; 2725 2726 /* ..but if the writeout is too late, we have to break up anyway... 
for some reason */ 2727 2728 if (qins->unit == UNIT_VLUT) 2729 written_late = true; 2730 } 2731 2732 /* Advance instruction pointer */ 2733 qins = mir_next_op(qins); 2734 } 2735 2736 2737 /* ERRATA (?): In a bundle ending in a fragment writeout, the register dependencies of r0 cannot be written within this bundle (discovered in -bshading:shading=phong) */ 2738 if (register_dep_mask & written_mask) { 2739 DBG("ERRATA WORKAROUND: Breakup for writeout dependency masks %X vs %X (common %X)\n", register_dep_mask, written_mask, register_dep_mask & written_mask); 2740 break; 2741 } 2742 2743 if (written_late) 2744 break; 2745 2746 /* If even a single component is not written, break it up (conservative check). */ 2747 bool breakup = false; 2748 2749 for (int c = 0; c < 4; ++c) 2750 if (!components[c]) 2751 breakup = true; 2752 2753 if (breakup) 2754 break; 2755 2756 /* Otherwise, we're free to proceed */ 2757 } 2758 } 2759 2760 if (ains->unit == ALU_ENAB_BRANCH) { 2761 bundle.body_size[bundle.body_words_count] = sizeof(midgard_branch_extended); 2762 memcpy(&bundle.body_words[bundle.body_words_count++], &ains->branch_extended, sizeof(midgard_branch_extended)); 2763 bytes_emitted += sizeof(midgard_branch_extended); 2764 } else { 2765 bundle.body_size[bundle.body_words_count] = sizeof(ains->br_compact); 2766 memcpy(&bundle.body_words[bundle.body_words_count++], &ains->br_compact, sizeof(ains->br_compact)); 2767 bytes_emitted += sizeof(ains->br_compact); 2768 } 2769 } else { 2770 memcpy(&bundle.register_words[bundle.register_words_count++], &ains->registers, sizeof(ains->registers)); 2771 bytes_emitted += sizeof(midgard_reg_info); 2772 2773 bundle.body_size[bundle.body_words_count] = sizeof(midgard_scalar_alu); 2774 bundle.body_words_count++; 2775 bytes_emitted += sizeof(midgard_scalar_alu); 2776 } 2777 2778 /* Defer marking until after writing to allow for break */ 2779 control |= ains->unit; 2780 last_unit = ains->unit; 2781 ++instructions_emitted; 2782 ++index; 2783 } 2784 2785 /* Bubble up the number of instructions for skipping */ 2786 instructions_consumed = index - 1; 2787 2788 int padding = 0; 2789 2790 /* Pad ALU op to nearest word */ 2791 2792 if (bytes_emitted & 15) { 2793 padding = 16 - (bytes_emitted & 15); 2794 bytes_emitted += padding; 2795 } 2796 2797 /* Constants must always be quadwords */ 2798 if (bundle.has_embedded_constants) 2799 bytes_emitted += 16; 2800 2801 /* Size ALU instruction for tag */ 2802 bundle.tag = (TAG_ALU_4) + (bytes_emitted / 16) - 1; 2803 bundle.padding = padding; 2804 bundle.control = bundle.tag | control; 2805 2806 break; 2807 } 2808 2809 case TAG_LOAD_STORE_4: { 2810 /* Load store instructions have two words at once. If 2811 * we only have one queued up, we need to NOP pad. 2812 * Otherwise, we store both in succession to save space 2813 * and cycles -- letting them go in parallel -- skip 2814 * the next. The usefulness of this optimisation is 2815 * greatly dependent on the quality of the instruction 2816 * scheduler. 
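 * Pairing matters because a load/store bundle occupies one quadword
 * whether it carries one word or two, so an unpaired op wastes half
 * the bundle on a NOP.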
2817 */ 2818 2819 midgard_instruction *next_op = mir_next_op(ins); 2820 2821 if ((struct list_head *) next_op != &block->instructions && next_op->type == TAG_LOAD_STORE_4) { 2822 /* As the two operate concurrently, make sure 2823 * they are not dependent */ 2824 2825 if (can_run_concurrent_ssa(ins, next_op) || true) { 2826 /* Skip ahead, since it's redundant with the pair */ 2827 instructions_consumed = 1 + (instructions_emitted++); 2828 } 2829 } 2830 2831 break; 2832 } 2833 2834 default: 2835 /* Texture ops default to single-op-per-bundle scheduling */ 2836 break; 2837 } 2838 2839 /* Copy the instructions into the bundle */ 2840 bundle.instruction_count = instructions_emitted + 1; 2841 2842 int used_idx = 0; 2843 2844 midgard_instruction *uins = ins; 2845 for (int i = 0; used_idx < bundle.instruction_count; ++i) { 2846 bundle.instructions[used_idx++] = *uins; 2847 uins = mir_next_op(uins); 2848 } 2849 2850 *skip = (instructions_consumed == -1) ? instructions_emitted : instructions_consumed; 2851 2852 return bundle; 2853} 2854 2855static int 2856quadword_size(int tag) 2857{ 2858 switch (tag) { 2859 case TAG_ALU_4: 2860 return 1; 2861 2862 case TAG_ALU_8: 2863 return 2; 2864 2865 case TAG_ALU_12: 2866 return 3; 2867 2868 case TAG_ALU_16: 2869 return 4; 2870 2871 case TAG_LOAD_STORE_4: 2872 return 1; 2873 2874 case TAG_TEXTURE_4: 2875 return 1; 2876 2877 default: 2878 assert(0); 2879 return 0; 2880 } 2881} 2882 2883/* Schedule a single block by iterating its instruction to create bundles. 2884 * While we go, tally about the bundle sizes to compute the block size. */ 2885 2886static void 2887schedule_block(compiler_context *ctx, midgard_block *block) 2888{ 2889 util_dynarray_init(&block->bundles, NULL); 2890 2891 block->quadword_count = 0; 2892 2893 mir_foreach_instr_in_block(block, ins) { 2894 int skip; 2895 midgard_bundle bundle = schedule_bundle(ctx, block, ins, &skip); 2896 util_dynarray_append(&block->bundles, midgard_bundle, bundle); 2897 2898 if (bundle.has_blend_constant) { 2899 /* TODO: Multiblock? 
*/ 2900 int quadwords_within_block = block->quadword_count + quadword_size(bundle.tag) - 1; 2901 ctx->blend_constant_offset = quadwords_within_block * 0x10; 2902 } 2903 2904 while(skip--) 2905 ins = mir_next_op(ins); 2906 2907 block->quadword_count += quadword_size(bundle.tag); 2908 } 2909 2910 block->is_scheduled = true; 2911} 2912 2913static void 2914schedule_program(compiler_context *ctx) 2915{ 2916 /* We run RA prior to scheduling */ 2917 struct ra_graph *g = allocate_registers(ctx); 2918 install_registers(ctx, g); 2919 2920 mir_foreach_block(ctx, block) { 2921 schedule_block(ctx, block); 2922 } 2923} 2924 2925/* After everything is scheduled, emit whole bundles at a time */ 2926 2927static void 2928emit_binary_bundle(compiler_context *ctx, midgard_bundle *bundle, struct util_dynarray *emission, int next_tag) 2929{ 2930 int lookahead = next_tag << 4; 2931 2932 switch (bundle->tag) { 2933 case TAG_ALU_4: 2934 case TAG_ALU_8: 2935 case TAG_ALU_12: 2936 case TAG_ALU_16: { 2937 /* Actually emit each component */ 2938 util_dynarray_append(emission, uint32_t, bundle->control | lookahead); 2939 2940 for (int i = 0; i < bundle->register_words_count; ++i) 2941 util_dynarray_append(emission, uint16_t, bundle->register_words[i]); 2942 2943 /* Emit body words based on the instructions bundled */ 2944 for (int i = 0; i < bundle->instruction_count; ++i) { 2945 midgard_instruction *ins = &bundle->instructions[i]; 2946 2947 if (ins->unit & UNITS_ANY_VECTOR) { 2948 memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins->alu, sizeof(midgard_vector_alu)); 2949 } else if (ins->compact_branch) { 2950 /* Dummy move, XXX DRY */ 2951 if ((i == 0) && ins->writeout) { 2952 midgard_instruction ins = v_fmov(0, blank_alu_src, SSA_FIXED_REGISTER(0)); 2953 memcpy(util_dynarray_grow(emission, sizeof(midgard_vector_alu)), &ins.alu, sizeof(midgard_vector_alu)); 2954 } 2955 2956 if (ins->unit == ALU_ENAB_BR_COMPACT) { 2957 memcpy(util_dynarray_grow(emission, sizeof(ins->br_compact)), &ins->br_compact, sizeof(ins->br_compact)); 2958 } else { 2959 memcpy(util_dynarray_grow(emission, sizeof(ins->branch_extended)), &ins->branch_extended, sizeof(ins->branch_extended)); 2960 } 2961 } else { 2962 /* Scalar */ 2963 midgard_scalar_alu scalarised = vector_to_scalar_alu(ins->alu, ins); 2964 memcpy(util_dynarray_grow(emission, sizeof(scalarised)), &scalarised, sizeof(scalarised)); 2965 } 2966 } 2967 2968 /* Emit padding (all zero) */ 2969 memset(util_dynarray_grow(emission, bundle->padding), 0, bundle->padding); 2970 2971 /* Tack on constants */ 2972 2973 if (bundle->has_embedded_constants) { 2974 util_dynarray_append(emission, float, bundle->constants[0]); 2975 util_dynarray_append(emission, float, bundle->constants[1]); 2976 util_dynarray_append(emission, float, bundle->constants[2]); 2977 util_dynarray_append(emission, float, bundle->constants[3]); 2978 } 2979 2980 break; 2981 } 2982 2983 case TAG_LOAD_STORE_4: { 2984 /* One or two composing instructions */ 2985 2986 uint64_t current64, next64 = LDST_NOP; 2987 2988 memcpy(¤t64, &bundle->instructions[0].load_store, sizeof(current64)); 2989 2990 if (bundle->instruction_count == 2) 2991 memcpy(&next64, &bundle->instructions[1].load_store, sizeof(next64)); 2992 2993 midgard_load_store instruction = { 2994 .type = bundle->tag, 2995 .next_type = next_tag, 2996 .word1 = current64, 2997 .word2 = next64 2998 }; 2999 3000 util_dynarray_append(emission, midgard_load_store, instruction); 3001 3002 break; 3003 } 3004 3005 case TAG_TEXTURE_4: { 3006 /* Texture instructions are 
easy, since there is no 3007 * pipelining nor VLIW to worry about. We may need to set the .last flag */ 3008 3009 midgard_instruction *ins = &bundle->instructions[0]; 3010 3011 ins->texture.type = TAG_TEXTURE_4; 3012 ins->texture.next_type = next_tag; 3013 3014 ctx->texture_op_count--; 3015 3016 if (!ctx->texture_op_count) { 3017 ins->texture.cont = 0; 3018 ins->texture.last = 1; 3019 } 3020 3021 util_dynarray_append(emission, midgard_texture_word, ins->texture); 3022 break; 3023 } 3024 3025 default: 3026 DBG("Unknown midgard instruction type\n"); 3027 assert(0); 3028 break; 3029 } 3030} 3031 3032 3033/* ALU instructions can inline or embed constants, which decreases register 3034 * pressure and saves space. */ 3035 3036#define CONDITIONAL_ATTACH(src) { \ 3037 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src + 1); \ 3038\ 3039 if (entry) { \ 3040 attach_constants(ctx, alu, entry, alu->ssa_args.src + 1); \ 3041 alu->ssa_args.src = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ 3042 } \ 3043} 3044 3045static void 3046inline_alu_constants(compiler_context *ctx) 3047{ 3048 mir_foreach_instr(ctx, alu) { 3049 /* Other instructions cannot inline constants */ 3050 if (alu->type != TAG_ALU_4) continue; 3051 3052 /* If there is already a constant here, we can do nothing */ 3053 if (alu->has_constants) continue; 3054 3055 /* It makes no sense to inline constants on a branch */ 3056 if (alu->compact_branch || alu->prepacked_branch) continue; 3057 3058 CONDITIONAL_ATTACH(src0); 3059 3060 if (!alu->has_constants) { 3061 CONDITIONAL_ATTACH(src1) 3062 } else if (!alu->inline_constant) { 3063 /* Corner case: _two_ vec4 constants, for instance with a 3064 * csel. For this case, we can only use a constant 3065 * register for one, we'll have to emit a move for the 3066 * other. Note, if both arguments are constants, then 3067 * necessarily neither argument depends on the value of 3068 * any particular register. As the destination register 3069 * will be wiped, that means we can spill the constant 3070 * to the destination register. 3071 */ 3072 3073 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->ssa_args.src1 + 1); 3074 unsigned scratch = alu->ssa_args.dest; 3075 3076 if (entry) { 3077 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, scratch); 3078 attach_constants(ctx, &ins, entry, alu->ssa_args.src1 + 1); 3079 3080 /* Force a break XXX Defer r31 writes */ 3081 ins.unit = UNIT_VLUT; 3082 3083 /* Set the source */ 3084 alu->ssa_args.src1 = scratch; 3085 3086 /* Inject us -before- the last instruction which set r31 */ 3087 mir_insert_instruction_before(mir_prev_op(alu), ins); 3088 } 3089 } 3090 } 3091} 3092 3093/* Midgard supports two types of constants, embedded constants (128-bit) and 3094 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded 3095 * constants can be demoted to inline constants, for space savings and 3096 * sometimes a performance boost */ 3097 3098static void 3099embedded_to_inline_constant(compiler_context *ctx) 3100{ 3101 mir_foreach_instr(ctx, ins) { 3102 if (!ins->has_constants) continue; 3103 3104 if (ins->ssa_args.inline_constant) continue; 3105 3106 /* Blend constants must not be inlined by definition */ 3107 if (ins->has_blend_constant) continue; 3108 3109 /* src1 cannot be an inline constant due to encoding 3110 * restrictions. 
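 * (Here 'src1' seems to mean the hardware's first source slot, the one
 * install_registers fills from ssa_args.src0; the inline constant can
 * only occupy the second slot.)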
So, if possible we try to flip the arguments 3111 * in that case */ 3112 3113 int op = ins->alu.op; 3114 3115 if (ins->ssa_args.src0 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { 3116 switch (op) { 3117 /* These ops require an operational change to flip 3118 * their arguments TODO */ 3119 case midgard_alu_op_flt: 3120 case midgard_alu_op_fle: 3121 case midgard_alu_op_ilt: 3122 case midgard_alu_op_ile: 3123 case midgard_alu_op_fcsel: 3124 case midgard_alu_op_icsel: 3125 DBG("Missed non-commutative flip (%s)\n", alu_opcode_props[op].name); 3126 default: 3127 break; 3128 } 3129 3130 if (alu_opcode_props[op].props & OP_COMMUTES) { 3131 /* Flip the SSA numbers */ 3132 ins->ssa_args.src0 = ins->ssa_args.src1; 3133 ins->ssa_args.src1 = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 3134 3135 /* And flip the modifiers */ 3136 3137 unsigned src_temp; 3138 3139 src_temp = ins->alu.src2; 3140 ins->alu.src2 = ins->alu.src1; 3141 ins->alu.src1 = src_temp; 3142 } 3143 } 3144 3145 if (ins->ssa_args.src1 == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { 3146 /* Extract the source information */ 3147 3148 midgard_vector_alu_src *src; 3149 int q = ins->alu.src2; 3150 midgard_vector_alu_src *m = (midgard_vector_alu_src *) &q; 3151 src = m; 3152 3153 /* Component is from the swizzle, e.g. r26.w -> w component. TODO: What if x is masked out? */ 3154 int component = src->swizzle & 3; 3155 3156 /* Scale constant appropriately, if we can legally */ 3157 uint16_t scaled_constant = 0; 3158 3159 /* XXX: Check legality */ 3160 if (midgard_is_integer_op(op)) { 3161 /* TODO: Inline integer */ 3162 continue; 3163 3164 unsigned int *iconstants = (unsigned int *) ins->constants; 3165 scaled_constant = (uint16_t) iconstants[component]; 3166 3167 /* Constant overflow after resize */ 3168 if (scaled_constant != iconstants[component]) 3169 continue; 3170 } else { 3171 float original = (float) ins->constants[component]; 3172 scaled_constant = _mesa_float_to_half(original); 3173 3174 /* Check for loss of precision. If this is 3175 * mediump, we don't care, but for a highp 3176 * shader, we need to pay attention. NIR 3177 * doesn't yet tell us which mode we're in! 3178 * Practically this prevents most constants 3179 * from being inlined, sadly. */ 3180 3181 float fp32 = _mesa_half_to_float(scaled_constant); 3182 3183 if (fp32 != original) 3184 continue; 3185 } 3186 3187 /* We don't know how to handle these with a constant */ 3188 3189 if (src->mod || src->half || src->rep_low || src->rep_high) { 3190 DBG("Bailing inline constant...\n"); 3191 continue; 3192 } 3193 3194 /* Make sure that the constant is not itself a 3195 * vector by checking if all accessed values 3196 * (by the swizzle) are the same. 
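 * For example, a vec4 constant (4.0, 4.0, 4.0, 4.0) is effectively the
 * scalar 4.0 under any swizzle and can be inlined, whereas
 * (1.0, 2.0, 3.0, 4.0) read as .xyzw cannot; only components the
 * writemask actually uses are considered.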
*/ 3197 3198 uint32_t *cons = (uint32_t *) ins->constants; 3199 uint32_t value = cons[component]; 3200 3201 bool is_vector = false; 3202 unsigned mask = effective_writemask(&ins->alu); 3203 3204 for (int c = 1; c < 4; ++c) { 3205 /* We only care if this component is actually used */ 3206 if (!(mask & (1 << c))) 3207 continue; 3208 3209 uint32_t test = cons[(src->swizzle >> (2 * c)) & 3]; 3210 3211 if (test != value) { 3212 is_vector = true; 3213 break; 3214 } 3215 } 3216 3217 if (is_vector) 3218 continue; 3219 3220 /* Get rid of the embedded constant */ 3221 ins->has_constants = false; 3222 ins->ssa_args.src1 = SSA_UNUSED_0; 3223 ins->ssa_args.inline_constant = true; 3224 ins->inline_constant = scaled_constant; 3225 } 3226 } 3227} 3228 3229/* Map normal SSA sources to other SSA sources / fixed registers (like 3230 * uniforms) */ 3231 3232static void 3233map_ssa_to_alias(compiler_context *ctx, int *ref) 3234{ 3235 unsigned int alias = (uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_to_alias, *ref + 1); 3236 3237 if (alias) { 3238 /* Remove entry in leftovers to avoid a redunant fmov */ 3239 3240 struct set_entry *leftover = _mesa_set_search(ctx->leftover_ssa_to_alias, ((void *) (uintptr_t) (*ref + 1))); 3241 3242 if (leftover) 3243 _mesa_set_remove(ctx->leftover_ssa_to_alias, leftover); 3244 3245 /* Assign the alias map */ 3246 *ref = alias - 1; 3247 return; 3248 } 3249} 3250 3251/* Basic dead code elimination on the MIR itself, which cleans up e.g. the 3252 * texture pipeline */ 3253 3254static bool 3255midgard_opt_dead_code_eliminate(compiler_context *ctx, midgard_block *block) 3256{ 3257 bool progress = false; 3258 3259 mir_foreach_instr_in_block_safe(block, ins) { 3260 if (ins->type != TAG_ALU_4) continue; 3261 if (ins->compact_branch) continue; 3262 3263 if (ins->ssa_args.dest >= SSA_FIXED_MINIMUM) continue; 3264 if (midgard_is_pinned(ctx, ins->ssa_args.dest)) continue; 3265 if (is_live_after(ctx, block, ins, ins->ssa_args.dest)) continue; 3266 3267 mir_remove_instruction(ins); 3268 progress = true; 3269 } 3270 3271 return progress; 3272} 3273 3274static bool 3275midgard_opt_copy_prop(compiler_context *ctx, midgard_block *block) 3276{ 3277 bool progress = false; 3278 3279 mir_foreach_instr_in_block_safe(block, ins) { 3280 if (ins->type != TAG_ALU_4) continue; 3281 if (!OP_IS_MOVE(ins->alu.op)) continue; 3282 3283 unsigned from = ins->ssa_args.src1; 3284 unsigned to = ins->ssa_args.dest; 3285 3286 /* We only work on pure SSA */ 3287 3288 if (to >= SSA_FIXED_MINIMUM) continue; 3289 if (from >= SSA_FIXED_MINIMUM) continue; 3290 if (to >= ctx->func->impl->ssa_alloc) continue; 3291 if (from >= ctx->func->impl->ssa_alloc) continue; 3292 3293 /* Also, if the move has side effects, we're helpless */ 3294 3295 midgard_vector_alu_src src = 3296 vector_alu_from_unsigned(ins->alu.src2); 3297 unsigned mask = squeeze_writemask(ins->alu.mask); 3298 bool is_int = midgard_is_integer_op(ins->alu.op); 3299 3300 if (mir_nontrivial_mod(src, is_int, mask)) continue; 3301 if (ins->alu.outmod != midgard_outmod_none) continue; 3302 3303 mir_foreach_instr_in_block_from(block, v, mir_next_op(ins)) { 3304 if (v->ssa_args.src0 == to) { 3305 v->ssa_args.src0 = from; 3306 progress = true; 3307 } 3308 3309 if (v->ssa_args.src1 == to && !v->ssa_args.inline_constant) { 3310 v->ssa_args.src1 = from; 3311 progress = true; 3312 } 3313 } 3314 } 3315 3316 return progress; 3317} 3318 3319static bool 3320midgard_opt_copy_prop_tex(compiler_context *ctx, midgard_block *block) 3321{ 3322 bool progress = false; 3323 3324 
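        /* This pass looks for moves whose destination is one of the fixed
         * texture source registers (REGISTER_TEXTURE_BASE and the one above
         * it) and tries to make the instruction that produced the source
         * write that register directly, so the copy itself can be removed. */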
mir_foreach_instr_in_block_safe(block, ins) { 3325 if (ins->type != TAG_ALU_4) continue; 3326 if (!OP_IS_MOVE(ins->alu.op)) continue; 3327 3328 unsigned from = ins->ssa_args.src1; 3329 unsigned to = ins->ssa_args.dest; 3330 3331 /* Make sure it's simple enough for us to handle */ 3332 3333 if (from >= SSA_FIXED_MINIMUM) continue; 3334 if (from >= ctx->func->impl->ssa_alloc) continue; 3335 if (to < SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE)) continue; 3336 if (to > SSA_FIXED_REGISTER(REGISTER_TEXTURE_BASE + 1)) continue; 3337 3338 bool eliminated = false; 3339 3340 mir_foreach_instr_in_block_from_rev(block, v, mir_prev_op(ins)) { 3341 /* The texture registers are not SSA so be careful. 3342 * Conservatively, just stop if we hit a texture op 3343 * (even if it may not write) to where we are */ 3344 3345 if (v->type != TAG_ALU_4) 3346 break; 3347 3348 if (v->ssa_args.dest == from) { 3349 /* We don't want to track partial writes ... */ 3350 if (v->alu.mask == 0xF) { 3351 v->ssa_args.dest = to; 3352 eliminated = true; 3353 } 3354 3355 break; 3356 } 3357 } 3358 3359 if (eliminated) 3360 mir_remove_instruction(ins); 3361 3362 progress |= eliminated; 3363 } 3364 3365 return progress; 3366} 3367 3368/* We don't really understand the imov/fmov split, so always use fmov (but let 3369 * it be imov in the IR so we don't do unsafe floating point "optimizations" 3370 * and break things */ 3371 3372static void 3373midgard_imov_workaround(compiler_context *ctx, midgard_block *block) 3374{ 3375 mir_foreach_instr_in_block_safe(block, ins) { 3376 if (ins->type != TAG_ALU_4) continue; 3377 if (ins->alu.op != midgard_alu_op_imov) continue; 3378 3379 ins->alu.op = midgard_alu_op_fmov; 3380 ins->alu.outmod = midgard_outmod_none; 3381 3382 /* Remove flags that don't make sense */ 3383 3384 midgard_vector_alu_src s = 3385 vector_alu_from_unsigned(ins->alu.src2); 3386 3387 s.mod = 0; 3388 3389 ins->alu.src2 = vector_alu_srco_unsigned(s); 3390 } 3391} 3392 3393/* The following passes reorder MIR instructions to enable better scheduling */ 3394 3395static void 3396midgard_pair_load_store(compiler_context *ctx, midgard_block *block) 3397{ 3398 mir_foreach_instr_in_block_safe(block, ins) { 3399 if (ins->type != TAG_LOAD_STORE_4) continue; 3400 3401 /* We've found a load/store op. Check if next is also load/store. */ 3402 midgard_instruction *next_op = mir_next_op(ins); 3403 if (&next_op->link != &block->instructions) { 3404 if (next_op->type == TAG_LOAD_STORE_4) { 3405 /* If so, we're done since we're a pair */ 3406 ins = mir_next_op(ins); 3407 continue; 3408 } 3409 3410 /* Maximum search distance to pair, to avoid register pressure disasters */ 3411 int search_distance = 8; 3412 3413 /* Otherwise, we have an orphaned load/store -- search for another load */ 3414 mir_foreach_instr_in_block_from(block, c, mir_next_op(ins)) { 3415 /* Terminate search if necessary */ 3416 if (!(search_distance--)) break; 3417 3418 if (c->type != TAG_LOAD_STORE_4) continue; 3419 3420 /* Stores cannot be reordered, since they have 3421 * dependencies. For the same reason, indirect 3422 * loads cannot be reordered as their index is 3423 * loaded in r27.w */ 3424 3425 if (OP_IS_STORE(c->load_store.op)) continue; 3426 3427 /* It appears the 0x800 bit is set whenever a 3428 * load is direct, unset when it is indirect. 3429 * Skip indirect loads. */ 3430 3431 if (!(c->load_store.unknown & 0x800)) continue; 3432 3433 /* We found one! 
Move it up to pair and remove it from the old location */ 3434 3435 mir_insert_instruction_before(ins, *c); 3436 mir_remove_instruction(c); 3437 3438 break; 3439 } 3440 } 3441 } 3442} 3443 3444/* Emit varying stores late */ 3445 3446static void 3447midgard_emit_store(compiler_context *ctx, midgard_block *block) { 3448 /* Iterate in reverse to get the final write, rather than the first */ 3449 3450 mir_foreach_instr_in_block_safe_rev(block, ins) { 3451 /* Check if what we just wrote needs a store */ 3452 int idx = ins->ssa_args.dest; 3453 uintptr_t varying = ((uintptr_t) _mesa_hash_table_u64_search(ctx->ssa_varyings, idx + 1)); 3454 3455 if (!varying) continue; 3456 3457 varying -= 1; 3458 3459 /* We need to store to the appropriate varying, so emit the 3460 * move/store */ 3461 3462 /* TODO: Integrate with special purpose RA (and scheduler?) */ 3463 bool high_varying_register = false; 3464 3465 midgard_instruction mov = v_fmov(idx, blank_alu_src, SSA_FIXED_REGISTER(REGISTER_VARYING_BASE + high_varying_register)); 3466 3467 midgard_instruction st = m_store_vary_32(SSA_FIXED_REGISTER(high_varying_register), varying); 3468 st.load_store.unknown = 0x1E9E; /* XXX: What is this? */ 3469 3470 mir_insert_instruction_before(mir_next_op(ins), st); 3471 mir_insert_instruction_before(mir_next_op(ins), mov); 3472 3473 /* We no longer need to store this varying */ 3474 _mesa_hash_table_u64_remove(ctx->ssa_varyings, idx + 1); 3475 } 3476} 3477 3478/* If there are leftovers after the below pass, emit actual fmov 3479 * instructions for the slow-but-correct path */ 3480 3481static void 3482emit_leftover_move(compiler_context *ctx) 3483{ 3484 set_foreach(ctx->leftover_ssa_to_alias, leftover) { 3485 int base = ((uintptr_t) leftover->key) - 1; 3486 int mapped = base; 3487 3488 map_ssa_to_alias(ctx, &mapped); 3489 EMIT(fmov, mapped, blank_alu_src, base); 3490 } 3491} 3492 3493static void 3494actualise_ssa_to_alias(compiler_context *ctx) 3495{ 3496 mir_foreach_instr(ctx, ins) { 3497 map_ssa_to_alias(ctx, &ins->ssa_args.src0); 3498 map_ssa_to_alias(ctx, &ins->ssa_args.src1); 3499 } 3500 3501 emit_leftover_move(ctx); 3502} 3503 3504static void 3505emit_fragment_epilogue(compiler_context *ctx) 3506{ 3507 /* Special case: writing out constants requires us to include the move 3508 * explicitly now, so shove it into r0 */ 3509 3510 void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, ctx->fragment_output + 1); 3511 3512 if (constant_value) { 3513 midgard_instruction ins = v_fmov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), blank_alu_src, SSA_FIXED_REGISTER(0)); 3514 attach_constants(ctx, &ins, constant_value, ctx->fragment_output + 1); 3515 emit_mir_instruction(ctx, ins); 3516 } 3517 3518 /* Perform the actual fragment writeout. We have two writeout/branch 3519 * instructions, forming a loop until writeout is successful as per the 3520 * docs. TODO: gl_FragDepth */ 3521 3522 EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always); 3523 EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); 3524} 3525 3526/* For the blend epilogue, we need to convert the blended fragment vec4 (stored 3527 * in r0) to a RGBA8888 value by scaling and type converting. 
We then output it 3528 * with the int8 analogue to the fragment epilogue */ 3529 3530static void 3531emit_blend_epilogue(compiler_context *ctx) 3532{ 3533 /* vmul.fmul.none.fulllow hr48, r0, #255 */ 3534 3535 midgard_instruction scale = { 3536 .type = TAG_ALU_4, 3537 .unit = UNIT_VMUL, 3538 .inline_constant = _mesa_float_to_half(255.0), 3539 .ssa_args = { 3540 .src0 = SSA_FIXED_REGISTER(0), 3541 .src1 = SSA_UNUSED_0, 3542 .dest = SSA_FIXED_REGISTER(24), 3543 .inline_constant = true 3544 }, 3545 .alu = { 3546 .op = midgard_alu_op_fmul, 3547 .reg_mode = midgard_reg_mode_32, 3548 .dest_override = midgard_dest_override_lower, 3549 .mask = 0xFF, 3550 .src1 = vector_alu_srco_unsigned(blank_alu_src), 3551 .src2 = vector_alu_srco_unsigned(blank_alu_src), 3552 } 3553 }; 3554 3555 emit_mir_instruction(ctx, scale); 3556 3557 /* vadd.f2u8.pos.low hr0, hr48, #0 */ 3558 3559 midgard_vector_alu_src alu_src = blank_alu_src; 3560 alu_src.half = true; 3561 3562 midgard_instruction f2u8 = { 3563 .type = TAG_ALU_4, 3564 .ssa_args = { 3565 .src0 = SSA_FIXED_REGISTER(24), 3566 .src1 = SSA_UNUSED_0, 3567 .dest = SSA_FIXED_REGISTER(0), 3568 .inline_constant = true 3569 }, 3570 .alu = { 3571 .op = midgard_alu_op_f2u8, 3572 .reg_mode = midgard_reg_mode_16, 3573 .dest_override = midgard_dest_override_lower, 3574 .outmod = midgard_outmod_pos, 3575 .mask = 0xF, 3576 .src1 = vector_alu_srco_unsigned(alu_src), 3577 .src2 = vector_alu_srco_unsigned(blank_alu_src), 3578 } 3579 }; 3580 3581 emit_mir_instruction(ctx, f2u8); 3582 3583 /* vmul.imov.quarter r0, r0, r0 */ 3584 3585 midgard_instruction imov_8 = { 3586 .type = TAG_ALU_4, 3587 .ssa_args = { 3588 .src0 = SSA_UNUSED_1, 3589 .src1 = SSA_FIXED_REGISTER(0), 3590 .dest = SSA_FIXED_REGISTER(0), 3591 }, 3592 .alu = { 3593 .op = midgard_alu_op_imov, 3594 .reg_mode = midgard_reg_mode_8, 3595 .dest_override = midgard_dest_override_none, 3596 .mask = 0xFF, 3597 .src1 = vector_alu_srco_unsigned(blank_alu_src), 3598 .src2 = vector_alu_srco_unsigned(blank_alu_src), 3599 } 3600 }; 3601 3602 /* Emit branch epilogue with the 8-bit move as the source */ 3603 3604 emit_mir_instruction(ctx, imov_8); 3605 EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, 0, midgard_condition_always); 3606 3607 emit_mir_instruction(ctx, imov_8); 3608 EMIT(alu_br_compact_cond, midgard_jmp_writeout_op_writeout, TAG_ALU_4, -1, midgard_condition_always); 3609} 3610 3611static midgard_block * 3612emit_block(compiler_context *ctx, nir_block *block) 3613{ 3614 midgard_block *this_block = calloc(sizeof(midgard_block), 1); 3615 list_addtail(&this_block->link, &ctx->blocks); 3616 3617 this_block->is_scheduled = false; 3618 ++ctx->block_count; 3619 3620 ctx->texture_index[0] = -1; 3621 ctx->texture_index[1] = -1; 3622 3623 /* Add us as a successor to the block we are following */ 3624 if (ctx->current_block) 3625 midgard_block_add_successor(ctx->current_block, this_block); 3626 3627 /* Set up current block */ 3628 list_inithead(&this_block->instructions); 3629 ctx->current_block = this_block; 3630 3631 nir_foreach_instr(instr, block) { 3632 emit_instr(ctx, instr); 3633 ++ctx->instruction_count; 3634 } 3635 3636 inline_alu_constants(ctx); 3637 embedded_to_inline_constant(ctx); 3638 3639 /* Perform heavylifting for aliasing */ 3640 actualise_ssa_to_alias(ctx); 3641 3642 midgard_emit_store(ctx, this_block); 3643 midgard_pair_load_store(ctx, this_block); 3644 midgard_imov_workaround(ctx, this_block); 3645 3646 /* Append fragment shader epilogue (value writeout) */ 3647 if (ctx->stage == 
MESA_SHADER_FRAGMENT) { 3648 if (block == nir_impl_last_block(ctx->func->impl)) { 3649 if (ctx->is_blend) 3650 emit_blend_epilogue(ctx); 3651 else 3652 emit_fragment_epilogue(ctx); 3653 } 3654 } 3655 3656 if (block == nir_start_block(ctx->func->impl)) 3657 ctx->initial_block = this_block; 3658 3659 if (block == nir_impl_last_block(ctx->func->impl)) 3660 ctx->final_block = this_block; 3661 3662 /* Allow the next control flow to access us retroactively, for 3663 * branching etc */ 3664 ctx->current_block = this_block; 3665 3666 /* Document the fallthrough chain */ 3667 ctx->previous_source_block = this_block; 3668 3669 return this_block; 3670} 3671 3672static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); 3673 3674static void 3675emit_if(struct compiler_context *ctx, nir_if *nif) 3676{ 3677 /* Conditional branches expect the condition in r31.w; emit a move for 3678 * that in the _previous_ block (which is the current block). */ 3679 emit_condition(ctx, &nif->condition, true, COMPONENT_X); 3680 3681 /* Speculatively emit the branch, but we can't fill it in until later */ 3682 EMIT(branch, true, true); 3683 midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); 3684 3685 /* Emit the two subblocks */ 3686 midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); 3687 3688 /* Emit a jump from the end of the then block to the end of the else */ 3689 EMIT(branch, false, false); 3690 midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); 3691 3692 /* Emit second block, and check if it's empty */ 3693 3694 int else_idx = ctx->block_count; 3695 int count_in = ctx->instruction_count; 3696 midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); 3697 int after_else_idx = ctx->block_count; 3698 3699 /* Now that we have the subblocks emitted, fix up the branches */ 3700 3701 assert(then_block); 3702 assert(else_block); 3703 3704 if (ctx->instruction_count == count_in) { 3705 /* The else block is empty, so don't emit an exit jump */ 3706 mir_remove_instruction(then_exit); 3707 then_branch->branch.target_block = after_else_idx; 3708 } else { 3709 then_branch->branch.target_block = else_idx; 3710 then_exit->branch.target_block = after_else_idx; 3711 } 3712} 3713 3714static void 3715emit_loop(struct compiler_context *ctx, nir_loop *nloop) 3716{ 3717 /* Remember where we are */ 3718 midgard_block *start_block = ctx->current_block; 3719 3720 /* Allocate a loop number, growing the current inner loop depth */ 3721 int loop_idx = ++ctx->current_loop_depth; 3722 3723 /* Get index from before the body so we can loop back later */ 3724 int start_idx = ctx->block_count; 3725 3726 /* Emit the body itself */ 3727 emit_cf_list(ctx, &nloop->body); 3728 3729 /* Branch back to loop back */ 3730 struct midgard_instruction br_back = v_branch(false, false); 3731 br_back.branch.target_block = start_idx; 3732 emit_mir_instruction(ctx, br_back); 3733 3734 /* Mark down that branch in the graph. Note that we're really branching 3735 * to the block *after* we started in. TODO: Why doesn't the branch 3736 * itself have an off-by-one then...? 
*/ 3737 midgard_block_add_successor(ctx->current_block, start_block->successors[0]); 3738 3739 /* Find the index of the block about to follow us (note: we don't add 3740 * one; blocks are 0-indexed so we get a fencepost problem) */ 3741 int break_block_idx = ctx->block_count; 3742 3743 /* Fix up the break statements we emitted to point to the right place, 3744 * now that we can allocate a block number for them */ 3745 3746 list_for_each_entry_from(struct midgard_block, block, start_block, &ctx->blocks, link) { 3747 mir_foreach_instr_in_block(block, ins) { 3748 if (ins->type != TAG_ALU_4) continue; 3749 if (!ins->compact_branch) continue; 3750 if (ins->prepacked_branch) continue; 3751 3752 /* We found a branch -- check the type to see if we need to do anything */ 3753 if (ins->branch.target_type != TARGET_BREAK) continue; 3754 3755 /* It's a break! Check if it's our break */ 3756 if (ins->branch.target_break != loop_idx) continue; 3757 3758 /* Okay, cool, we're breaking out of this loop. 3759 * Rewrite from a break to a goto */ 3760 3761 ins->branch.target_type = TARGET_GOTO; 3762 ins->branch.target_block = break_block_idx; 3763 } 3764 } 3765 3766 /* Now that we've finished emitting the loop, free up the depth again 3767 * so we play nice with recursion amid nested loops */ 3768 --ctx->current_loop_depth; 3769} 3770 3771static midgard_block * 3772emit_cf_list(struct compiler_context *ctx, struct exec_list *list) 3773{ 3774 midgard_block *start_block = NULL; 3775 3776 foreach_list_typed(nir_cf_node, node, node, list) { 3777 switch (node->type) { 3778 case nir_cf_node_block: { 3779 midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 3780 3781 if (!start_block) 3782 start_block = block; 3783 3784 break; 3785 } 3786 3787 case nir_cf_node_if: 3788 emit_if(ctx, nir_cf_node_as_if(node)); 3789 break; 3790 3791 case nir_cf_node_loop: 3792 emit_loop(ctx, nir_cf_node_as_loop(node)); 3793 break; 3794 3795 case nir_cf_node_function: 3796 assert(0); 3797 break; 3798 } 3799 } 3800 3801 return start_block; 3802} 3803 3804/* Due to lookahead, we need to report the first tag executed in the command 3805 * stream and in branch targets. 
An initial block might be empty, so iterate 3806 * until we find one that 'works' */ 3807 3808static unsigned 3809midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) 3810{ 3811 midgard_block *initial_block = mir_get_block(ctx, block_idx); 3812 3813 unsigned first_tag = 0; 3814 3815 do { 3816 midgard_bundle *initial_bundle = util_dynarray_element(&initial_block->bundles, midgard_bundle, 0); 3817 3818 if (initial_bundle) { 3819 first_tag = initial_bundle->tag; 3820 break; 3821 } 3822 3823 /* Initial block is empty, try the next block */ 3824 initial_block = list_first_entry(&(initial_block->link), midgard_block, link); 3825 } while(initial_block != NULL); 3826 3827 assert(first_tag); 3828 return first_tag; 3829} 3830 3831int 3832midgard_compile_shader_nir(nir_shader *nir, midgard_program *program, bool is_blend) 3833{ 3834 struct util_dynarray *compiled = &program->compiled; 3835 3836 midgard_debug = debug_get_option_midgard_debug(); 3837 3838 compiler_context ictx = { 3839 .nir = nir, 3840 .stage = nir->info.stage, 3841 3842 .is_blend = is_blend, 3843 .blend_constant_offset = -1, 3844 3845 .alpha_ref = program->alpha_ref 3846 }; 3847 3848 compiler_context *ctx = &ictx; 3849 3850 /* TODO: Decide this at runtime */ 3851 ctx->uniform_cutoff = 8; 3852 3853 /* Assign var locations early, so the epilogue can use them if necessary */ 3854 3855 nir_assign_var_locations(&nir->outputs, &nir->num_outputs, glsl_type_size); 3856 nir_assign_var_locations(&nir->inputs, &nir->num_inputs, glsl_type_size); 3857 nir_assign_var_locations(&nir->uniforms, &nir->num_uniforms, glsl_type_size); 3858 3859 /* Initialize at a global (not block) level hash tables */ 3860 3861 ctx->ssa_constants = _mesa_hash_table_u64_create(NULL); 3862 ctx->ssa_varyings = _mesa_hash_table_u64_create(NULL); 3863 ctx->ssa_to_alias = _mesa_hash_table_u64_create(NULL); 3864 ctx->ssa_to_register = _mesa_hash_table_u64_create(NULL); 3865 ctx->hash_to_temp = _mesa_hash_table_u64_create(NULL); 3866 ctx->sysval_to_id = _mesa_hash_table_u64_create(NULL); 3867 ctx->leftover_ssa_to_alias = _mesa_set_create(NULL, _mesa_hash_pointer, _mesa_key_pointer_equal); 3868 3869 /* Record the varying mapping for the command stream's bookkeeping */ 3870 3871 struct exec_list *varyings = 3872 ctx->stage == MESA_SHADER_VERTEX ? 
        struct exec_list *varyings =
                        ctx->stage == MESA_SHADER_VERTEX ? &nir->outputs : &nir->inputs;

        nir_foreach_variable(var, varyings) {
                unsigned loc = var->data.driver_location;
                unsigned sz = glsl_type_size(var->type, FALSE);

                for (int c = 0; c < sz; ++c) {
                        program->varyings[loc + c] = var->data.location;
                }
        }

        /* Lower gl_Position pre-optimisation */

        if (ctx->stage == MESA_SHADER_VERTEX)
                NIR_PASS_V(nir, nir_lower_viewport_transform);

        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
        NIR_PASS_V(nir, nir_split_var_copies);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        NIR_PASS_V(nir, nir_lower_io, nir_var_all, glsl_type_size, 0);

        /* Optimisation passes */

        optimise_nir(nir);

        if (midgard_debug & MIDGARD_DBG_SHADERS) {
                nir_print_shader(nir, stdout);
        }

        /* Assign sysvals and counts, now that we're sure
         * (post-optimisation) */

        midgard_nir_assign_sysvals(ctx, nir);

        program->uniform_count = nir->num_uniforms;
        program->sysval_count = ctx->sysval_count;
        memcpy(program->sysvals, ctx->sysvals, sizeof(ctx->sysvals[0]) * ctx->sysval_count);

        program->attribute_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_inputs : 0;
        program->varying_count = (ctx->stage == MESA_SHADER_VERTEX) ? nir->num_outputs : ((ctx->stage == MESA_SHADER_FRAGMENT) ? nir->num_inputs : 0);

        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;

                list_inithead(&ctx->blocks);
                ctx->block_count = 0;
                ctx->func = func;

                emit_cf_list(ctx, &func->impl->body);
                emit_block(ctx, func->impl->end_block);

                break; /* TODO: Multi-function shaders */
        }

        util_dynarray_init(compiled, NULL);

        /* MIR-level optimizations */

        bool progress = false;

        do {
                progress = false;

                mir_foreach_block(ctx, block) {
                        progress |= midgard_opt_copy_prop(ctx, block);
                        progress |= midgard_opt_copy_prop_tex(ctx, block);
                        progress |= midgard_opt_dead_code_eliminate(ctx, block);
                }
        } while (progress);

        /* Schedule! */
        schedule_program(ctx);

        /* Now that all the bundles are scheduled and we can calculate block
         * sizes, emit actual branch instructions rather than placeholders */

        int br_block_idx = 0;

        mir_foreach_block(ctx, block) {
                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                        for (int c = 0; c < bundle->instruction_count; ++c) {
                                midgard_instruction *ins = &bundle->instructions[c];

                                if (!midgard_is_branch_unit(ins->unit)) continue;

                                if (ins->prepacked_branch) continue;

                                /* Parse some basic branch info */
                                bool is_compact = ins->unit == ALU_ENAB_BR_COMPACT;
                                bool is_conditional = ins->branch.conditional;
                                bool is_inverted = ins->branch.invert_conditional;
                                bool is_discard = ins->branch.target_type == TARGET_DISCARD;

                                /* Determine the block we're jumping to */
                                int target_number = ins->branch.target_block;

                                /* Report the destination tag. Discards don't need this */
                                int dest_tag = is_discard ? 0 : midgard_get_first_tag_from_block(ctx, target_number);

                                /* Count up the number of quadwords we're
                                 * jumping over. That is, the number of
                                 * quadwords in each of the blocks between
                                 * (br_block_idx, target_number) */
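                                /* Worked example for intuition (editorial,
                                 * hypothetical sizes): jumping forward from
                                 * block 1 to block 4 over blocks 2 and 3 of
                                 * 3 and 5 quadwords respectively yields an
                                 * offset of +8; backward jumps instead
                                 * accumulate a negative offset that includes
                                 * both the current and the target block. */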
                                int quadword_offset = 0;

                                if (is_discard) {
                                        /* Jump to the end of the shader. We
                                         * need to include not only the
                                         * following blocks, but also the
                                         * contents of our current block (since
                                         * discard can come in the middle of
                                         * the block) */

                                        midgard_block *blk = mir_get_block(ctx, br_block_idx + 1);

                                        for (midgard_bundle *bun = bundle + 1; bun < (midgard_bundle *) ((char *) block->bundles.data + block->bundles.size); ++bun) {
                                                quadword_offset += quadword_size(bun->tag);
                                        }

                                        mir_foreach_block_from(ctx, blk, b) {
                                                quadword_offset += b->quadword_count;
                                        }

                                } else if (target_number > br_block_idx) {
                                        /* Jump forward */

                                        for (int idx = br_block_idx + 1; idx < target_number; ++idx) {
                                                midgard_block *blk = mir_get_block(ctx, idx);
                                                assert(blk);

                                                quadword_offset += blk->quadword_count;
                                        }
                                } else {
                                        /* Jump backwards */

                                        for (int idx = br_block_idx; idx >= target_number; --idx) {
                                                midgard_block *blk = mir_get_block(ctx, idx);
                                                assert(blk);

                                                quadword_offset -= blk->quadword_count;
                                        }
                                }

                                /* Unconditional extended branches (far jumps)
                                 * have issues, so we always use a conditional
                                 * branch, setting the condition to always for
                                 * unconditional. For compact unconditional
                                 * branches, cond isn't used so it doesn't
                                 * matter what we pick. */

                                midgard_condition cond =
                                        !is_conditional ? midgard_condition_always :
                                        is_inverted ? midgard_condition_false :
                                        midgard_condition_true;

                                midgard_jmp_writeout_op op =
                                        is_discard ? midgard_jmp_writeout_op_discard :
                                        (is_compact && !is_conditional) ? midgard_jmp_writeout_op_branch_uncond :
                                        midgard_jmp_writeout_op_branch_cond;

                                if (!is_compact) {
                                        midgard_branch_extended branch =
                                                midgard_create_branch_extended(
                                                        cond, op,
                                                        dest_tag,
                                                        quadword_offset);

                                        memcpy(&ins->branch_extended, &branch, sizeof(branch));
                                } else if (is_conditional || is_discard) {
                                        midgard_branch_cond branch = {
                                                .op = op,
                                                .dest_tag = dest_tag,
                                                .offset = quadword_offset,
                                                .cond = cond
                                        };

                                        assert(branch.offset == quadword_offset);

                                        memcpy(&ins->br_compact, &branch, sizeof(branch));
                                } else {
                                        assert(op == midgard_jmp_writeout_op_branch_uncond);

                                        midgard_branch_uncond branch = {
                                                .op = op,
                                                .dest_tag = dest_tag,
                                                .offset = quadword_offset,
                                                .unknown = 1
                                        };

                                        assert(branch.offset == quadword_offset);

                                        memcpy(&ins->br_compact, &branch, sizeof(branch));
                                }
                        }
                }

                ++br_block_idx;
        }
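        /* Editorial summary of the fixup above: non-compact branches always
         * take the extended encoding, with the condition forced to 'always'
         * when unconditional; compact conditional (and discard) branches use
         * midgard_branch_cond; the remaining compact unconditional case uses
         * midgard_branch_uncond. */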
        /* Emit flat binary from the instruction arrays. Iterate each block in
         * sequence. Save instruction boundaries such that lookahead tags can
         * be assigned easily */

        /* Cache _all_ bundles in source order for lookahead across failed branches */

        int bundle_count = 0;
        mir_foreach_block(ctx, block) {
                bundle_count += block->bundles.size / sizeof(midgard_bundle);
        }
        midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count);
        int bundle_idx = 0;
        mir_foreach_block(ctx, block) {
                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                        source_order_bundles[bundle_idx++] = bundle;
                }
        }

        int current_bundle = 0;

        mir_foreach_block(ctx, block) {
                util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) {
                        int lookahead = 1;

                        if (current_bundle + 1 < bundle_count) {
                                uint8_t next = source_order_bundles[current_bundle + 1]->tag;

                                if (!(current_bundle + 2 < bundle_count) && IS_ALU(next)) {
                                        lookahead = 1;
                                } else {
                                        lookahead = next;
                                }
                        }

                        emit_binary_bundle(ctx, bundle, compiled, lookahead);
                        ++current_bundle;
                }

                /* TODO: Free deeper */
                //util_dynarray_fini(&block->instructions);
        }

        free(source_order_bundles);

        /* Report the very first tag executed */
        program->first_tag = midgard_get_first_tag_from_block(ctx, 0);

        /* Deal with off-by-one related to the fencepost problem */
        program->work_register_count = ctx->work_registers + 1;

        program->can_discard = ctx->can_discard;
        program->uniform_cutoff = ctx->uniform_cutoff;

        program->blend_patch_offset = ctx->blend_constant_offset;

        if (midgard_debug & MIDGARD_DBG_SHADERS)
                disassemble_midgard(program->compiled.data, program->compiled.size);

        return 0;
}
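/* Illustrative usage sketch (editorial, not part of this file): a driver
 * holding a lowered NIR shader would call the compiler roughly as follows,
 * where `shader_nir` and `upload_to_command_stream` are placeholders:
 *
 *     midgard_program program = { 0 };
 *     midgard_compile_shader_nir(shader_nir, &program, false);
 *     upload_to_command_stream(program.compiled.data, program.compiled.size);
 *
 * Only the compiled util_dynarray and the metadata fields assigned above
 * (first_tag, work_register_count, etc.) are produced here. */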