/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "compiler.h"
#include "bi_builder.h"

/* Arguments common to worklist, passed by value for convenience */

struct bi_worklist {
        /* # of instructions in the block */
        unsigned count;

        /* Instructions in the block */
        bi_instr **instructions;

        /* Bitset of instructions in the block ready for scheduling */
        BITSET_WORD *worklist;

        /* The backwards dependency graph. nr_dependencies is the number of
         * unscheduled instructions that must still be scheduled after (before)
         * this instruction. dependents are which instructions need to be
         * scheduled before (after) this instruction. */
        unsigned *dep_counts;
        BITSET_WORD **dependents;
};

/* State of a single tuple and clause under construction */

struct bi_reg_state {
        /* Number of register writes */
        unsigned nr_writes;

        /* Register reads, expressed as (equivalence classes of)
         * sources. Only 3 reads are allowed, but up to 2 may spill as
         * "forced" for the next scheduled tuple, provided such a tuple
         * can be constructed */
        bi_index reads[5];
        unsigned nr_reads;

        /* The previous tuple scheduled (= the next tuple executed in the
         * program) may require certain writes, in order to bypass the register
         * file and use a temporary passthrough for the value. Up to 2 such
         * constraints are architecturally satisfiable */
        unsigned forced_count;
        bi_index forceds[2];
};

struct bi_tuple_state {
        /* Is this the last tuple in the clause */
        bool last;

        /* Scheduled ADD instruction, or null if none */
        bi_instr *add;

        /* Reads for previous (succeeding) tuple */
        bi_index prev_reads[5];
        unsigned nr_prev_reads;
        bi_tuple *prev;

        /* Register slot state for current tuple */
        struct bi_reg_state reg;

        /* Constants are shared in the tuple. If constant_count is nonzero, it
         * is the number of constants in use. Otherwise, fau is the slot read
         * from FAU, or zero if none is assigned.
         * Ordinarily FAU slot 0 reads zero, but within a tuple, that should
         * be encoded as constant_count != 0 and
         * constants[0] = constants[1] = 0 */
        unsigned constant_count;

        union {
                uint32_t constants[2];
                enum bir_fau fau;
        };

        unsigned pcrel_idx;
};

struct bi_const_state {
        unsigned constant_count;
        bool pcrel; /* applies to first const */
        uint32_t constants[2];

        /* Index of the constant into the clause */
        unsigned word_idx;
};

struct bi_clause_state {
        /* Has a message-passing instruction already been assigned? */
        bool message;

        /* Indices already accessed, this needs to be tracked to avoid hazards
         * around message-passing instructions */
        unsigned access_count;
        bi_index accesses[(BI_MAX_SRCS + 2) * 16];

        unsigned tuple_count;
        struct bi_const_state consts[8];
};

/* Determines message type by checking the table and a few special cases. The
 * only case missing is tilebuffer instructions that access depth/stencil,
 * which require a Z_STENCIL message (to implement
 * ARM_shader_framebuffer_fetch_depth_stencil) */

static enum bifrost_message_type
bi_message_type_for_instr(bi_instr *ins)
{
        enum bifrost_message_type msg = bi_opcode_props[ins->op].message;
        bool ld_var_special = (ins->op == BI_OPCODE_LD_VAR_SPECIAL);

        if (ld_var_special && ins->varying_name == BI_VARYING_NAME_FRAG_Z)
                return BIFROST_MESSAGE_Z_STENCIL;

        if (msg == BIFROST_MESSAGE_LOAD && ins->seg == BI_SEG_UBO)
                return BIFROST_MESSAGE_ATTRIBUTE;

        return msg;
}

/* Attribute, texture, and UBO load (attribute message) instructions support
 * bindless, so just check the message type */

ASSERTED static bool
bi_supports_dtsel(bi_instr *ins)
{
        switch (bi_message_type_for_instr(ins)) {
        case BIFROST_MESSAGE_ATTRIBUTE:
                return ins->op != BI_OPCODE_LD_GCLK_U64;
        case BIFROST_MESSAGE_TEX:
                return true;
        default:
                return false;
        }
}

/* Adds an edge to the dependency graph */

static void
bi_push_dependency(unsigned parent, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        if (!BITSET_TEST(dependents[parent], child)) {
                BITSET_SET(dependents[parent], child);
                dep_counts[child]++;
        }
}

static void
add_dependency(struct util_dynarray *table, unsigned index, unsigned child,
                BITSET_WORD **dependents, unsigned *dep_counts)
{
        assert(index < 64);
        util_dynarray_foreach(table + index, unsigned, parent)
                bi_push_dependency(*parent, child, dependents, dep_counts);
}

static void
mark_access(struct util_dynarray *table, unsigned index, unsigned parent)
{
        assert(index < 64);
        util_dynarray_append(&table[index], unsigned, parent);
}

static bool
bi_is_sched_barrier(bi_instr *I)
{
        switch (I->op) {
        case BI_OPCODE_BARRIER:
        case BI_OPCODE_DISCARD_F32:
                return true;
        default:
                return false;
        }
}

static void
bi_create_dependency_graph(struct bi_worklist st, bool inorder, bool is_blend)
{
        struct util_dynarray last_read[64], last_write[64];

        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_init(&last_read[i], NULL);
                util_dynarray_init(&last_write[i], NULL);
        }

        /* Initialize dependency graph */
        for (unsigned i = 0; i < st.count; ++i) {
                st.dependents[i] =
                        calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

                st.dep_counts[i] = 0;
        }

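        /* Index of the most recently seen message-passing instruction (the
         * next one in program order, since we walk backwards), or ~0 if none
         * has been seen yet */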
        unsigned prev_msg = ~0;

        /* Populate dependency graph */
        for (signed i = st.count - 1; i >= 0; --i) {
                bi_instr *ins = st.instructions[i];

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;
                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                add_dependency(last_write, ins->src[s].value + c, i, st.dependents, st.dep_counts);
                }

                /* Keep message-passing ops in order. (This pass only cares
                 * about bundling; reordering of message-passing instructions
                 * happens during earlier scheduling.) */

                if (bi_message_type_for_instr(ins)) {
                        if (prev_msg != ~0)
                                bi_push_dependency(prev_msg, i, st.dependents, st.dep_counts);

                        prev_msg = i;
                }

                /* Handle schedule barriers, adding all the deps */
                if (inorder || bi_is_sched_barrier(ins)) {
                        for (unsigned j = 0; j < st.count; ++j) {
                                if (i == j) continue;

                                bi_push_dependency(MAX2(i, j), MIN2(i, j),
                                                st.dependents, st.dep_counts);
                        }
                }

                bi_foreach_dest(ins, d) {
                        if (ins->dest[d].type != BI_INDEX_REGISTER) continue;
                        unsigned dest = ins->dest[d].value;

                        unsigned count = bi_count_write_registers(ins, d);

                        for (unsigned c = 0; c < count; ++c) {
                                add_dependency(last_read, dest + c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, dest + c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, dest + c, i);
                        }
                }

                /* Blend shaders are allowed to clobber R0-R15. Treat these
                 * registers like extra destinations for scheduling purposes.
                 */
                if (ins->op == BI_OPCODE_BLEND && !is_blend) {
                        for (unsigned c = 0; c < 16; ++c) {
                                add_dependency(last_read, c, i, st.dependents, st.dep_counts);
                                add_dependency(last_write, c, i, st.dependents, st.dep_counts);
                                mark_access(last_write, c, i);
                        }
                }

                bi_foreach_src(ins, s) {
                        if (ins->src[s].type != BI_INDEX_REGISTER) continue;

                        unsigned count = bi_count_read_registers(ins, s);

                        for (unsigned c = 0; c < count; ++c)
                                mark_access(last_read, ins->src[s].value + c, i);
                }
        }

        /* If there is a branch, all instructions depend on it, as interblock
         * execution must be purely in-order */

        bi_instr *last = st.instructions[st.count - 1];
        if (last->branch_target || last->op == BI_OPCODE_JUMP) {
                for (signed i = st.count - 2; i >= 0; --i)
                        bi_push_dependency(st.count - 1, i, st.dependents, st.dep_counts);
        }

        /* Free the intermediate structures */
        for (unsigned i = 0; i < 64; ++i) {
                util_dynarray_fini(&last_read[i]);
                util_dynarray_fini(&last_write[i]);
        }
}

/* Scheduler pseudoinstruction lowerings to enable instruction pairings.
 * Currently only supports CUBEFACE -> *CUBEFACE1/+CUBEFACE2
 */

static bi_instr *
bi_lower_cubeface(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *cubeface1 = bi_cubeface1_to(&b, pinstr->dest[0],
                        pinstr->src[0], pinstr->src[1], pinstr->src[2]);

        pinstr->op = BI_OPCODE_CUBEFACE2;
        pinstr->dest[0] = pinstr->dest[1];
        pinstr->dest[1] = bi_null();
        pinstr->src[0] = cubeface1->dest[0];
        pinstr->src[1] = bi_null();
        pinstr->src[2] = bi_null();

        return cubeface1;
}

/* Pseudo arguments are (rbase, address lo, address hi). We need *ATOM_C.i32 to
 * have the arguments (address lo, address hi, rbase), and +ATOM_CX to have the
 * arguments (rbase, address lo, address hi, rbase) */

static bi_instr *
bi_lower_atom_c(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c_return_i32(&b,
                        pinstr->src[1], pinstr->src[2], pinstr->src[0],
                        pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[3] = atom_c->src[2];

        return atom_c;
}

static bi_instr *
bi_lower_atom_c1(bi_context *ctx, struct bi_clause_state *clause, struct
                bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));
        bi_instr *atom_c = bi_atom_c1_return_i32(&b,
                        pinstr->src[0], pinstr->src[1], pinstr->atom_opc);

        if (bi_is_null(pinstr->dest[0]))
                atom_c->op = BI_OPCODE_ATOM_C1_I32;

        pinstr->op = BI_OPCODE_ATOM_CX;
        pinstr->src[2] = pinstr->src[1];
        pinstr->src[1] = pinstr->src[0];
        pinstr->src[3] = bi_dontcare();
        pinstr->src[0] = bi_null();

        return atom_c;
}

static bi_instr *
bi_lower_seg_add(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *pinstr = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(pinstr));

        bi_instr *fma = bi_seg_add_to(&b, pinstr->dest[0], pinstr->src[0],
                        pinstr->preserve_null, pinstr->seg);

        pinstr->op = BI_OPCODE_SEG_ADD;
        pinstr->src[0] = pinstr->src[1];
        pinstr->src[1] = bi_null();

        assert(pinstr->dest[0].type == BI_INDEX_REGISTER);
        pinstr->dest[0].value += 1;

        return fma;
}

static bi_instr *
bi_lower_dtsel(bi_context *ctx,
                struct bi_clause_state *clause, struct bi_tuple_state *tuple)
{
        bi_instr *add = tuple->add;
        bi_builder b = bi_init_builder(ctx, bi_before_instr(add));

        bi_instr *dtsel = bi_dtsel_imm_to(&b, bi_temp(b.shader),
                        add->src[0], add->table);
        add->src[0] = dtsel->dest[0];

        assert(bi_supports_dtsel(add));
        return dtsel;
}

/* Flatten linked list to array for O(1) indexing */

static bi_instr **
bi_flatten_block(bi_block *block, unsigned *len)
{
        if (list_is_empty(&block->instructions))
                return NULL;

        *len = list_length(&block->instructions);
        bi_instr **instructions = malloc(sizeof(bi_instr *) * (*len));

        unsigned i = 0;

        bi_foreach_instr_in_block(block, ins)
                instructions[i++] = ins;

        return instructions;
}

/* The worklist tracks instructions without outstanding dependencies. For
 * debug, in-order scheduling can be forced by making every instruction depend
 * on its predecessors (see bi_create_dependency_graph).
 */

static struct bi_worklist
bi_initialize_worklist(bi_block *block, bool inorder, bool is_blend)
{
        struct bi_worklist st = { };
        st.instructions = bi_flatten_block(block, &st.count);

        if (!st.count)
                return st;

        st.dependents = calloc(st.count, sizeof(st.dependents[0]));
        st.dep_counts = calloc(st.count, sizeof(st.dep_counts[0]));

        bi_create_dependency_graph(st, inorder, is_blend);
        st.worklist = calloc(BITSET_WORDS(st.count), sizeof(BITSET_WORD));

        for (unsigned i = 0; i < st.count; ++i) {
                if (st.dep_counts[i] == 0)
                        BITSET_SET(st.worklist, i);
        }

        return st;
}

static void
bi_free_worklist(struct bi_worklist st)
{
        free(st.dep_counts);
        free(st.dependents);
        free(st.instructions);
        free(st.worklist);
}

static void
bi_update_worklist(struct bi_worklist st, unsigned idx)
{
        assert(st.dep_counts[idx] == 0);

        if (!st.dependents[idx])
                return;

        /* Iterate each dependent to remove one dependency (`done`),
         * adding dependents to the worklist where possible. */

        unsigned i;
        BITSET_FOREACH_SET(i, st.dependents[idx], st.count) {
                assert(st.dep_counts[i] != 0);
                unsigned new_deps = --st.dep_counts[i];

                if (new_deps == 0)
                        BITSET_SET(st.worklist, i);
        }

        free(st.dependents[idx]);
}

/* Scheduler predicates */

/* IADDC.i32 can implement IADD.u32 if no saturation or swizzling is in use */
static bool
bi_can_iaddc(bi_instr *ins)
{
        return (ins->op == BI_OPCODE_IADD_U32 && !ins->saturate &&
                ins->src[0].swizzle == BI_SWIZZLE_H01 &&
                ins->src[1].swizzle == BI_SWIZZLE_H01);
}

/*
 * The encoding of *FADD.v2f16 only specifies a single abs flag. All abs
 * encodings are permitted by swapping operands; however, this scheme fails if
 * both operands are equal. Test for this case.
 */
static bool
bi_impacted_abs(bi_instr *I)
{
        return I->src[0].abs && I->src[1].abs &&
               bi_is_word_equiv(I->src[0], I->src[1]);
}

bool
bi_can_fma(bi_instr *ins)
{
        /* +IADD.i32 -> *IADDC.i32 */
        if (bi_can_iaddc(ins))
                return true;

        /* *FADD.v2f16 has restricted abs modifiers, use +FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && bi_impacted_abs(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].fma;
}

static bool
bi_impacted_fadd_widens(bi_instr *I)
{
        enum bi_swizzle swz0 = I->src[0].swizzle;
        enum bi_swizzle swz1 = I->src[1].swizzle;

        return (swz0 == BI_SWIZZLE_H00 && swz1 == BI_SWIZZLE_H11) ||
               (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H11) ||
               (swz0 == BI_SWIZZLE_H11 && swz1 == BI_SWIZZLE_H00);
}

bool
bi_can_add(bi_instr *ins)
{
        /* +FADD.v2f16 lacks clamp modifier, use *FADD.v2f16 instead */
        if (ins->op == BI_OPCODE_FADD_V2F16 && ins->clamp)
                return false;

        /* +FCMP.v2f16 lacks abs modifier, use *FCMP.v2f16 instead */
        if (ins->op == BI_OPCODE_FCMP_V2F16 && (ins->src[0].abs || ins->src[1].abs))
                return false;

        /* +FADD.f32 has restricted widens, use *FADD.f32 for the full set */
        if (ins->op == BI_OPCODE_FADD_F32 && bi_impacted_fadd_widens(ins))
                return false;

        /* TODO: some additional fp16 constraints */
        return bi_opcode_props[ins->op].add;
}

/* Architecturally, no single instruction has a "not last" constraint.
 * However, pseudoinstructions writing multiple destinations (expanding to
 * multiple paired instructions) can run afoul of the "no two writes in the
 * last tuple" constraint, so we check for that here.
 */

static bool
bi_must_not_last(bi_instr *ins)
{
        return !bi_is_null(ins->dest[0]) && !bi_is_null(ins->dest[1]);
}

/* Check for a message-passing instruction. +DISCARD.f32 is special-cased; we
 * treat it as a message-passing instruction for the purpose of scheduling
 * despite passing no logical message. Otherwise invalid encoding faults may be
 * raised for unknown reasons (possibly an erratum).
 */

bool
bi_must_message(bi_instr *ins)
{
        return (bi_opcode_props[ins->op].message != BIFROST_MESSAGE_NONE) ||
               (ins->op == BI_OPCODE_DISCARD_F32);
}

static bool
bi_fma_atomic(enum bi_opcode op)
{
        switch (op) {
        case BI_OPCODE_ATOM_C_I32:
        case BI_OPCODE_ATOM_C_I64:
        case BI_OPCODE_ATOM_C1_I32:
        case BI_OPCODE_ATOM_C1_I64:
        case BI_OPCODE_ATOM_C1_RETURN_I32:
        case BI_OPCODE_ATOM_C1_RETURN_I64:
        case BI_OPCODE_ATOM_C_RETURN_I32:
        case BI_OPCODE_ATOM_C_RETURN_I64:
        case BI_OPCODE_ATOM_POST_I32:
        case BI_OPCODE_ATOM_POST_I64:
        case BI_OPCODE_ATOM_PRE_I64:
                return true;
        default:
                return false;
        }
}

bool
bi_reads_zero(bi_instr *ins)
{
        return !(bi_fma_atomic(ins->op) || ins->op == BI_OPCODE_IMULD);
}

bool
bi_reads_temps(bi_instr *ins, unsigned src)
{
        switch (ins->op) {
        /* Cannot permute a temporary */
        case BI_OPCODE_CLPER_I32:
        case BI_OPCODE_CLPER_V6_I32:
                return src != 0;
        case BI_OPCODE_IMULD:
                return false;
        default:
                return true;
        }
}

static bool
bi_impacted_t_modifiers(bi_instr *I, unsigned src)
{
        enum bi_swizzle swizzle = I->src[src].swizzle;

        switch (I->op) {
        case BI_OPCODE_F16_TO_F32:
        case BI_OPCODE_F16_TO_S32:
        case BI_OPCODE_F16_TO_U32:
        case BI_OPCODE_MKVEC_V2I16:
        case BI_OPCODE_S16_TO_F32:
        case BI_OPCODE_S16_TO_S32:
        case BI_OPCODE_U16_TO_F32:
        case BI_OPCODE_U16_TO_U32:
                return (swizzle != BI_SWIZZLE_H00);

        case BI_OPCODE_BRANCH_F32:
        case BI_OPCODE_LOGB_F32:
        case BI_OPCODE_ILOGB_F32:
        case BI_OPCODE_FADD_F32:
        case BI_OPCODE_FCMP_F32:
        case BI_OPCODE_FREXPE_F32:
        case BI_OPCODE_FREXPM_F32:
        case BI_OPCODE_FROUND_F32:
                return (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_IADD_S32:
        case BI_OPCODE_IADD_U32:
        case BI_OPCODE_ISUB_S32:
        case BI_OPCODE_ISUB_U32:
        case BI_OPCODE_IADD_V4S8:
        case BI_OPCODE_IADD_V4U8:
        case BI_OPCODE_ISUB_V4S8:
        case BI_OPCODE_ISUB_V4U8:
                return (src == 1) && (swizzle != BI_SWIZZLE_H01);

        case BI_OPCODE_S8_TO_F32:
        case BI_OPCODE_S8_TO_S32:
        case BI_OPCODE_U8_TO_F32:
        case BI_OPCODE_U8_TO_U32:
                return (swizzle != BI_SWIZZLE_B0000);

        case BI_OPCODE_V2S8_TO_V2F16:
        case BI_OPCODE_V2S8_TO_V2S16:
        case BI_OPCODE_V2U8_TO_V2F16:
        case BI_OPCODE_V2U8_TO_V2U16:
                return (swizzle != BI_SWIZZLE_B0022);

        case BI_OPCODE_IADD_V2S16:
        case BI_OPCODE_IADD_V2U16:
        case BI_OPCODE_ISUB_V2S16:
        case BI_OPCODE_ISUB_V2U16:
                return (src == 1) && (swizzle >= BI_SWIZZLE_H11);

#if 0
        /* Restriction on IADD in 64-bit clauses on G72 */
        case BI_OPCODE_IADD_S64:
        case BI_OPCODE_IADD_U64:
                return (src == 1) && (swizzle != BI_SWIZZLE_D0);
#endif

        default:
                return false;
        }
}
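
/* Determines whether a given source of an instruction may be read as a
 * passthrough (temporary) value rather than from the register file, subject
 * to the per-opcode restrictions checked below */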
bool
bi_reads_t(bi_instr *ins, unsigned src)
{
        /* Branch offset cannot come from passthrough */
        if (bi_opcode_props[ins->op].branch)
                return src != 2;

        /* Table can never read passthrough */
        if (bi_opcode_props[ins->op].table)
                return false;

        /* Staging register reads may happen before the succeeding register
         * block encodes a write, so effectively there is no passthrough */
        if (src == 0 && bi_opcode_props[ins->op].sr_read)
                return false;

        /* Bifrost cores newer than Mali G71 have restrictions on swizzles on
         * same-cycle temporaries. Check the list for these hazards. */
        if (bi_impacted_t_modifiers(ins, src))
                return false;

        /* Descriptor must not come from a passthrough */
        switch (ins->op) {
        case BI_OPCODE_LD_CVT:
        case BI_OPCODE_LD_TILE:
        case BI_OPCODE_ST_CVT:
        case BI_OPCODE_ST_TILE:
        case BI_OPCODE_TEXC:
                return src != 2;
        case BI_OPCODE_BLEND:
                return src != 2 && src != 3;

        /* Else, just check if we can read any temps */
        default:
                return bi_reads_temps(ins, src);
        }
}

/* Counts the number of 64-bit constants required by a clause. TODO: We
 * might want to account for merging, right now we overestimate, but
 * that's probably fine most of the time */

static unsigned
bi_nconstants(struct bi_clause_state *clause)
{
        unsigned count_32 = 0;

        for (unsigned i = 0; i < ARRAY_SIZE(clause->consts); ++i)
                count_32 += clause->consts[i].constant_count;

        return DIV_ROUND_UP(count_32, 2);
}

/* Would there be space for constants if we added one tuple? */

static bool
bi_space_for_more_constants(struct bi_clause_state *clause)
{
        return (bi_nconstants(clause) < 13 - (clause->tuple_count + 1));
}

/* Updates the FAU assignment for a tuple. A valid FAU assignment must be
 * possible (as a precondition), though not necessarily on the selected unit;
 * this is guaranteed per-instruction by bi_lower_fau and per-tuple by
 * bi_instr_schedulable */

static bool
bi_update_fau(struct bi_clause_state *clause,
              struct bi_tuple_state *tuple,
              bi_instr *instr, bool fma, bool destructive)
{
        /* Maintain our own constants, for nondestructive mode */
        uint32_t copied_constants[2], copied_count;
        unsigned *constant_count = &tuple->constant_count;
        uint32_t *constants = tuple->constants;
        enum bir_fau fau = tuple->fau;

        if (!destructive) {
                memcpy(copied_constants, tuple->constants,
                                (*constant_count) * sizeof(constants[0]));
                copied_count = tuple->constant_count;

                constant_count = &copied_count;
                constants = copied_constants;
        }

        bi_foreach_src(instr, s) {
                bi_index src = instr->src[s];

                if (src.type == BI_INDEX_FAU) {
                        bool no_constants = *constant_count == 0;
                        bool no_other_fau = (fau == src.value) || !fau;
                        bool mergable = no_constants && no_other_fau;

                        if (destructive) {
                                assert(mergable);
                                tuple->fau = src.value;
                        } else if (!mergable) {
                                return false;
                        }

                        fau = src.value;
                } else if (src.type == BI_INDEX_CONSTANT) {
                        /* No need to reserve space if we have a fast 0 */
                        if (src.value == 0 && fma && bi_reads_zero(instr))
                                continue;

                        /* If there is a branch target, #0 by convention is the
                         * PC-relative offset to the target */
                        bool pcrel = instr->branch_target && src.value == 0;
                        bool found = false;

                        for (unsigned i = 0; i < *constant_count; ++i) {
                                found |= (constants[i] == src.value) &&
                                         (i != tuple->pcrel_idx);
                        }

                        /* pcrel constants are unique, so don't match */
                        if (found && !pcrel)
                                continue;

                        bool no_fau = (*constant_count > 0) || !fau;
                        bool mergable = no_fau && ((*constant_count) < 2);

                        if (destructive) {
                                assert(mergable);

                                if (pcrel)
                                        tuple->pcrel_idx = *constant_count;
                        } else if (!mergable)
                                return false;

                        constants[(*constant_count)++] = src.value;
                }
        }

        /* Constants per clause may be limited by tuple count */
        bool room_for_constants = (*constant_count == 0) ||
                bi_space_for_more_constants(clause);

        if (destructive)
                assert(room_for_constants);
        else if (!room_for_constants)
                return false;

        return true;
}

/* Given an in-progress tuple, a candidate new instruction to add to the tuple,
 * and a source (index) from that candidate, determine whether this source is
 * "new", in the sense of requiring an additional read slot.
 * That is, checks whether the specified source reads from the register file
 * via a read slot (determined by its type and placement) and whether the
 * source was already specified by a prior read slot (to avoid double
 * counting) */

static bool
bi_tuple_is_new_src(bi_instr *instr, struct bi_reg_state *reg, unsigned src_idx)
{
        bi_index src = instr->src[src_idx];

        /* Only consider sources which come from the register file */
        if (!(src.type == BI_INDEX_NORMAL || src.type == BI_INDEX_REGISTER))
                return false;

        /* Staging register reads bypass the usual register file mechanism */
        if (src_idx == 0 && bi_opcode_props[instr->op].sr_read)
                return false;

        /* If a source is already read in the tuple, it is already counted */
        for (unsigned t = 0; t < reg->nr_reads; ++t)
                if (bi_is_word_equiv(src, reg->reads[t]))
                        return false;

        /* If a source is read in _this instruction_, it is already counted */
        for (unsigned t = 0; t < src_idx; ++t)
                if (bi_is_word_equiv(src, instr->src[t]))
                        return false;

        return true;
}

/* Given two tuples in source order, count the number of register reads of the
 * successor, determined as the number of unique words accessed that aren't
 * written by the predecessor (since those are tempable).
 */

static unsigned
bi_count_succ_reads(bi_index t0, bi_index t1,
                bi_index *succ_reads, unsigned nr_succ_reads)
{
        unsigned reads = 0;

        for (unsigned i = 0; i < nr_succ_reads; ++i) {
                bool unique = true;

                for (unsigned j = 0; j < i; ++j)
                        if (bi_is_word_equiv(succ_reads[i], succ_reads[j]))
                                unique = false;

                if (!unique)
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t0))
                        continue;

                if (bi_is_word_equiv(succ_reads[i], t1))
                        continue;

                reads++;
        }

        return reads;
}

/* Not all instructions can read from the staging passthrough (as determined by
 * reads_t), check if a given pair of instructions has such a restriction. Note
 * we also use this mechanism to prevent data races around staging register
 * reads, so we allow the input source to potentially be vector-valued */

static bool
bi_has_staging_passthrough_hazard(bi_index fma, bi_instr *add)
{
        bi_foreach_src(add, s) {
                bi_index src = add->src[s];

                if (src.type != BI_INDEX_REGISTER)
                        continue;

                unsigned count = bi_count_read_registers(add, s);
                bool read = false;

                for (unsigned d = 0; d < count; ++d)
                        read |= bi_is_equiv(fma, bi_register(src.value + d));

                if (read && !bi_reads_t(add, s))
                        return true;
        }

        return false;
}

/* Likewise for cross-tuple passthrough (reads_temps) */

static bool
bi_has_cross_passthrough_hazard(bi_tuple *succ, bi_instr *ins)
{
        bi_foreach_instr_in_tuple(succ, pins) {
                bi_foreach_src(pins, s) {
                        if (bi_is_word_equiv(ins->dest[0], pins->src[s]) &&
                            !bi_reads_temps(pins, s))
                                return true;
                }
        }

        return false;
}

/* Is a register written other than the staging mechanism? ATEST is special,
 * writing to both a staging register and a regular register (fixed packing).
 * BLEND is special since it has to write r48 the normal way even if it never
 * gets read. This depends on liveness analysis, as a register is not needed
 * for a write that will be discarded after one tuple. */

static unsigned
bi_write_count(bi_instr *instr, uint64_t live_after_temp)
{
        if (instr->op == BI_OPCODE_ATEST || instr->op == BI_OPCODE_BLEND)
                return 1;

        unsigned count = 0;

        bi_foreach_dest(instr, d) {
                if (d == 0 && bi_opcode_props[instr->op].sr_write)
                        continue;

                if (bi_is_null(instr->dest[d]))
                        continue;

                assert(instr->dest[d].type == BI_INDEX_REGISTER);
                if (live_after_temp & BITFIELD64_BIT(instr->dest[d].value))
                        count++;
        }

        return count;
}

/* Instruction placement entails two questions: what subset of instructions in
 * the block can legally be scheduled? and of those which is the best? That is,
 * we seek to maximize a cost function on a subset of the worklist satisfying a
 * particular predicate. The necessary predicate is determined entirely by
 * Bifrost's architectural limitations and is described in the accompanying
 * whitepaper. The cost function is a heuristic. */

static bool
bi_instr_schedulable(bi_instr *instr,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        /* The units must match */
        if ((fma && !bi_can_fma(instr)) || (!fma && !bi_can_add(instr)))
                return false;

        /* There can only be one message-passing instruction per clause */
        if (bi_must_message(instr) && clause->message)
                return false;

        /* Some instructions have placement requirements */
        if (bi_opcode_props[instr->op].last && !tuple->last)
                return false;

        if (bi_must_not_last(instr) && tuple->last)
                return false;

        /* Message-passing instructions are not guaranteed to write within the
         * same clause (most likely they will not), so if a later instruction
         * in the clause accesses the destination, the message-passing
         * instruction can't be scheduled */
        if (bi_opcode_props[instr->op].sr_write && !bi_is_null(instr->dest[0])) {
                unsigned nr = bi_count_write_registers(instr, 0);
                assert(instr->dest[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->dest[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        if (bi_opcode_props[instr->op].sr_read && !bi_is_null(instr->src[0])) {
                unsigned nr = bi_count_read_registers(instr, 0);
                assert(instr->src[0].type == BI_INDEX_REGISTER);
                unsigned reg = instr->src[0].value;

                for (unsigned i = 0; i < clause->access_count; ++i) {
                        bi_index idx = clause->accesses[i];
                        for (unsigned d = 0; d < nr; ++d) {
                                if (bi_is_equiv(bi_register(reg + d), idx))
                                        return false;
                        }
                }
        }

        /* If FAU is already assigned, we may not disrupt that. Do a
         * non-disruptive test update */
        if (!bi_update_fau(clause, tuple, instr, fma, false))
                return false;

        /* If this choice of FMA would force a staging passthrough, the ADD
         * instruction must support such a passthrough */
        if (tuple->add && bi_has_staging_passthrough_hazard(instr->dest[0], tuple->add))
                return false;

        /* If this choice of destination would force a cross-tuple passthrough,
         * the next tuple must support that */
        if (tuple->prev && bi_has_cross_passthrough_hazard(tuple->prev, instr))
                return false;

        /* Register file writes are limited */
        unsigned total_writes = tuple->reg.nr_writes;
        total_writes += bi_write_count(instr, live_after_temp);

        /* Last tuple in a clause can only write a single value */
        if (tuple->last && total_writes > 1)
                return false;

        /* Register file reads are limited, so count unique */

        unsigned unique_new_srcs = 0;

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        unique_new_srcs++;
        }

        unsigned total_srcs = tuple->reg.nr_reads + unique_new_srcs;

        bool can_spill_to_moves = (!tuple->add);
        can_spill_to_moves &= (bi_nconstants(clause) < 13 - (clause->tuple_count + 2));
        can_spill_to_moves &= (clause->tuple_count < 7);

        /* However, we can get an extra 1 or 2 sources by inserting moves */
        if (total_srcs > (can_spill_to_moves ? 4 : 3))
                return false;

        /* Count effective reads for the successor */
        unsigned succ_reads = bi_count_succ_reads(instr->dest[0],
                        tuple->add ? tuple->add->dest[0] : bi_null(),
                        tuple->prev_reads, tuple->nr_prev_reads);

        /* Successor must satisfy R+W <= 4, so we require W <= 4-R */
        if ((signed) total_writes > (4 - (signed) succ_reads))
                return false;

        return true;
}

static signed
bi_instr_cost(bi_instr *instr, struct bi_tuple_state *tuple)
{
        signed cost = 0;

        /* Instructions that can schedule to either FMA or to ADD should be
         * deprioritized since they're easier to reschedule elsewhere */
        if (bi_can_fma(instr) && bi_can_add(instr))
                cost++;

        /* Message-passing instructions impose constraints on the registers
         * later in the clause, so schedule them as late within a clause as
         * possible (<==> prioritize them since we're backwards <==> decrease
         * cost) */
        if (bi_must_message(instr))
                cost--;

        /* Last instructions are big constraints (XXX: no effect on shader-db) */
        if (bi_opcode_props[instr->op].last)
                cost -= 2;

        return cost;
}
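
/* Pick the cheapest instruction on the worklist that can legally be scheduled
 * into the current tuple, returning ~0 if nothing is schedulable */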
static unsigned
bi_choose_index(struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        unsigned i, best_idx = ~0;
        signed best_cost = INT_MAX;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_instr *instr = st.instructions[i];

                if (!bi_instr_schedulable(instr, clause, tuple, live_after_temp, fma))
                        continue;

                signed cost = bi_instr_cost(instr, tuple);

                /* Tie break in favour of later instructions, under the
                 * assumption this promotes temporary usage (reducing pressure
                 * on the register file). This is a side effect of a prepass
                 * scheduling for pressure. */

                if (cost <= best_cost) {
                        best_idx = i;
                        best_cost = cost;
                }
        }

        return best_idx;
}

static void
bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                bi_instr *instr, uint64_t live_after_temp, bool fma)
{
        bi_update_fau(clause, tuple, instr, fma, true);

        /* TODO: maybe opt a bit? or maybe doesn't matter */
        assert(clause->access_count + BI_MAX_SRCS + BI_MAX_DESTS <= ARRAY_SIZE(clause->accesses));
        memcpy(clause->accesses + clause->access_count, instr->src, sizeof(instr->src));
        clause->access_count += BI_MAX_SRCS;
        memcpy(clause->accesses + clause->access_count, instr->dest, sizeof(instr->dest));
        clause->access_count += BI_MAX_DESTS;
        tuple->reg.nr_writes += bi_write_count(instr, live_after_temp);

        bi_foreach_src(instr, s) {
                if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                        tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
        }
}

/* Choose the best instruction and pop it off the worklist. Returns NULL if no
 * instruction is available. This function is destructive. */

static bi_instr *
bi_take_instr(bi_context *ctx, struct bi_worklist st,
                struct bi_clause_state *clause,
                struct bi_tuple_state *tuple,
                uint64_t live_after_temp,
                bool fma)
{
        if (tuple->add && tuple->add->op == BI_OPCODE_CUBEFACE)
                return bi_lower_cubeface(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C_I32)
                return bi_lower_atom_c(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_PATOM_C1_I32)
                return bi_lower_atom_c1(ctx, clause, tuple);
        else if (tuple->add && tuple->add->op == BI_OPCODE_SEG_ADD_I64)
                return bi_lower_seg_add(ctx, clause, tuple);
        else if (tuple->add && tuple->add->table)
                return bi_lower_dtsel(ctx, clause, tuple);

        /* TODO: Optimize these moves */
        if (!fma && tuple->nr_prev_reads > 3) {
                /* Only spill by one source for now */
                assert(tuple->nr_prev_reads == 4);

                /* Pick a source to spill */
                bi_index src = tuple->prev_reads[0];

                /* Schedule the spill */
                bi_builder b = bi_init_builder(ctx, bi_before_tuple(tuple->prev));
                bi_instr *mov = bi_mov_i32_to(&b, src, src);
                bi_pop_instr(clause, tuple, mov, live_after_temp, fma);
                return mov;
        }

#ifndef NDEBUG
        /* Don't pair instructions if debugging */
        if ((bifrost_debug & BIFROST_DBG_NOSCHED) && tuple->add)
                return NULL;
#endif

        unsigned idx = bi_choose_index(st, clause, tuple, live_after_temp, fma);

        if (idx >= st.count)
                return NULL;

        /* Update state to reflect taking the instruction */
        bi_instr *instr = st.instructions[idx];

        BITSET_CLEAR(st.worklist, idx);
        bi_update_worklist(st, idx);
        bi_pop_instr(clause, tuple, instr, live_after_temp, fma);

        /* Fixups */
        if (instr->op == BI_OPCODE_IADD_U32 && fma) {
                assert(bi_can_iaddc(instr));
                instr->op = BI_OPCODE_IADDC_I32;
                instr->src[2] = bi_zero();
        }

        return instr;
}

/* Variant of bi_rewrite_index_src_single that uses word-equivalence, rewriting
 * to a passthrough register. If except_zero is true, the zeroth (first) source
 * is skipped, so staging register reads are not accidentally encoded as
 * passthrough (which is impossible) */

static void
bi_use_passthrough(bi_instr *ins, bi_index old,
                enum bifrost_packed_src new,
                bool except_zero)
{
        /* Optional for convenience */
        if (!ins || bi_is_null(old))
                return;

        bi_foreach_src(ins, i) {
                if (i == 0 && except_zero)
                        continue;

                if (bi_is_word_equiv(ins->src[i], old)) {
                        ins->src[i].type = BI_INDEX_PASS;
                        ins->src[i].value = new;
                        ins->src[i].reg = false;
                        ins->src[i].offset = 0;
                }
        }
}

/* Rewrites an adjacent pair of tuples _prec_eding and _succ_eding to use
 * intertuple passthroughs where necessary. Passthroughs are allowed as a
 * post-condition of scheduling. Note we rewrite ADD first, FMA second --
 * opposite the order of execution. This is deliberate -- if both FMA and ADD
 * write to the same logical register, the next executed tuple will get the
 * latter result. There's no interference issue under the assumption of correct
 * register allocation. */

static void
bi_rewrite_passthrough(bi_tuple prec, bi_tuple succ)
{
        bool sr_read = succ.add ? bi_opcode_props[succ.add->op].sr_read : false;

        if (prec.add) {
                bi_use_passthrough(succ.fma, prec.add->dest[0], BIFROST_SRC_PASS_ADD, false);
                bi_use_passthrough(succ.add, prec.add->dest[0], BIFROST_SRC_PASS_ADD, sr_read);
        }

        if (prec.fma) {
                bi_use_passthrough(succ.fma, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, false);
                bi_use_passthrough(succ.add, prec.fma->dest[0], BIFROST_SRC_PASS_FMA, sr_read);
        }
}

static void
bi_rewrite_fau_to_pass(bi_tuple *tuple)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_FAU) continue;

                bi_index pass = bi_passthrough(ins->src[s].offset ?
                                BIFROST_SRC_FAU_HI : BIFROST_SRC_FAU_LO);

                ins->src[s] = bi_replace_index(ins->src[s], pass);
        }
}

static void
bi_rewrite_zero(bi_instr *ins, bool fma)
{
        bi_index zero = bi_passthrough(fma ? BIFROST_SRC_STAGE : BIFROST_SRC_FAU_LO);

        bi_foreach_src(ins, s) {
                bi_index src = ins->src[s];

                if (src.type == BI_INDEX_CONSTANT && src.value == 0)
                        ins->src[s] = bi_replace_index(src, zero);
        }
}

/* Assumes #0 to {T, FAU} rewrite has already occurred */

static void
bi_rewrite_constants_to_pass(bi_tuple *tuple, uint64_t constant, bool pcrel)
{
        bi_foreach_instr_and_src_in_tuple(tuple, ins, s) {
                if (ins->src[s].type != BI_INDEX_CONSTANT) continue;

                uint32_t cons = ins->src[s].value;

                ASSERTED bool lo = (cons == (constant & 0xffffffff));
                bool hi = (cons == (constant >> 32ull));

                /* PC offsets always live in the upper half, set to zero by
                 * convention before pack time. (This is safe, since if you
                 * wanted to compare against zero, you would use a BRANCHZ
                 * instruction instead.) */
                if (cons == 0 && ins->branch_target != NULL) {
                        assert(pcrel);
                        hi = true;
                        lo = false;
                } else if (pcrel) {
                        hi = false;
                }

                assert(lo || hi);

                ins->src[s] = bi_replace_index(ins->src[s],
                                bi_passthrough(hi ? BIFROST_SRC_FAU_HI :
                                        BIFROST_SRC_FAU_LO));
        }
}

/* Constructs a constant state given a tuple state. This has the postcondition
 * that pcrel applies to the first constant by convention, and PC-relative
 * constants will be #0 by convention here, so swap to match if needed */

static struct bi_const_state
bi_get_const_state(struct bi_tuple_state *tuple)
{
        struct bi_const_state consts = {
                .constant_count = tuple->constant_count,
                .constants[0] = tuple->constants[0],
                .constants[1] = tuple->constants[1],
                .pcrel = tuple->add && tuple->add->branch_target,
        };

        /* pcrel applies to the first constant by convention, and
         * PC-relative constants will be #0 by convention here, so swap
         * to match if needed */
        if (consts.pcrel && consts.constants[0]) {
                assert(consts.constant_count == 2);
                assert(consts.constants[1] == 0);

                consts.constants[1] = consts.constants[0];
                consts.constants[0] = 0;
        }

        return consts;
}

/* Merges constants in a clause, satisfying the following rules, assuming no
 * more than one tuple has pcrel:
 *
 * 1. If a tuple has two constants, they must be packed together. If one is
 * pcrel, it must be the high constant to use the M1=4 modification [sx64(E0) +
 * (PC << 32)]. Otherwise choose an arbitrary order.
 *
 * 2. If a tuple has one constant, it may be shared with an existing
 * pair that already contains that constant, or it may be combined with another
 * (distinct) tuple of a single constant.
 *
 * This guarantees a packing is possible. The next routine handles modification
 * related swapping, to satisfy format 12 and the lack of modification for
 * tuple count 5/8 in EC0.
 */

static uint64_t
bi_merge_u32(uint32_t c0, uint32_t c1, bool pcrel)
{
        /* At this point in the constant merge algorithm, pcrel constants are
         * treated as zero, so pcrel implies at least one constant is zero */
        assert(!pcrel || (c0 == 0 || c1 == 0));

        /* Order: pcrel, maximum non-pcrel, minimum non-pcrel */
        uint32_t hi = pcrel ? 0 : MAX2(c0, c1);
        uint32_t lo = (c0 == hi) ? c1 : c0;

        /* Merge in the selected order */
        return lo | (((uint64_t) hi) << 32ull);
}
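
/* Assign each tuple with two constants to a 64-bit constant word, reusing an
 * existing word when the merged value matches (and is not the pcrel pair) */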
static unsigned
bi_merge_pairs(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *merged, unsigned *pcrel_pair)
{
        unsigned merge_count = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 2) continue;

                unsigned idx = ~0;
                uint64_t val = bi_merge_u32(consts[t].constants[0],
                                consts[t].constants[1], consts[t].pcrel);

                /* Skip the pcrel pair if assigned, because if one is assigned,
                 * this one is not pcrel by uniqueness so it's a mismatch */
                for (unsigned s = 0; s < merge_count; ++s) {
                        if (merged[s] == val && (*pcrel_pair) != s) {
                                idx = s;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = merge_count++;
                        merged[idx] = val;

                        if (consts[t].pcrel)
                                (*pcrel_pair) = idx;
                }

                consts[t].word_idx = idx;
        }

        return merge_count;
}

static unsigned
bi_merge_singles(struct bi_const_state *consts, unsigned tuple_count,
                uint64_t *pairs, unsigned pair_count, unsigned *pcrel_pair)
{
        bool pending = false, pending_pcrel = false;
        uint32_t pending_single = 0;

        for (unsigned t = 0; t < tuple_count; ++t) {
                if (consts[t].constant_count != 1) continue;

                uint32_t val = consts[t].constants[0];
                unsigned idx = ~0;

                /* Try to match, but don't match pcrel with non-pcrel, even
                 * though we can merge a pcrel with a non-pcrel single */
                for (unsigned i = 0; i < pair_count; ++i) {
                        bool lo = ((pairs[i] & 0xffffffff) == val);
                        bool hi = ((pairs[i] >> 32) == val);
                        bool match = (lo || hi);
                        match &= ((*pcrel_pair) != i);
                        if (match && !consts[t].pcrel) {
                                idx = i;
                                break;
                        }
                }

                if (idx == ~0) {
                        idx = pair_count;

                        if (pending && pending_single != val) {
                                assert(!(pending_pcrel && consts[t].pcrel));
                                bool pcrel = pending_pcrel || consts[t].pcrel;

                                if (pcrel)
                                        *pcrel_pair = idx;

                                pairs[pair_count++] = bi_merge_u32(pending_single, val, pcrel);

                                pending = pending_pcrel = false;
                        } else {
                                pending = true;
                                pending_pcrel = consts[t].pcrel;
                                pending_single = val;
                        }
                }

                consts[t].word_idx = idx;
        }

        /* Shift so it works whether pending_pcrel is set or not */
        if (pending) {
                if (pending_pcrel)
                        *pcrel_pair = pair_count;

                pairs[pair_count++] = ((uint64_t) pending_single) << 32ull;
        }

        return pair_count;
}

static unsigned
bi_merge_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned *pcrel_idx)
{
        unsigned pair_count = bi_merge_pairs(consts, 8, pairs, pcrel_idx);
        return bi_merge_singles(consts, 8, pairs, pair_count, pcrel_idx);
}

/* Swap two constants at word i and i+1 by swapping their actual positions and
 * swapping all references so the meaning of the clause is preserved */

static void
bi_swap_constants(struct bi_const_state *consts, uint64_t *pairs, unsigned i)
{
        uint64_t tmp_pair = pairs[i + 0];
        pairs[i + 0] = pairs[i + 1];
        pairs[i + 1] = tmp_pair;

        for (unsigned t = 0; t < 8; ++t) {
                if (consts[t].word_idx == i)
                        consts[t].word_idx = (i + 1);
                else if (consts[t].word_idx == (i + 1))
                        consts[t].word_idx = i;
        }
}

/* Given merged constants, one of which might be PC-relative, fix up the M
 * values so the PC-relative constant (if it exists) has the M1=4 modification
 * and other constants are used as-is (which might require swapping) */

static unsigned
bi_apply_constant_modifiers(struct bi_const_state *consts,
                uint64_t *pairs, unsigned *pcrel_idx,
                unsigned tuple_count, unsigned constant_count)
{
        unsigned start = bi_ec0_packed(tuple_count) ? 1 : 0;

        /* Clauses with these tuple counts lack an M field for the packed EC0,
         * so EC0 cannot be PC-relative, which might require swapping (and
         * possibly adding an unused constant) to fit */

        if (*pcrel_idx == 0 && (tuple_count == 5 || tuple_count == 8)) {
                constant_count = MAX2(constant_count, 2);
                *pcrel_idx = 1;
                bi_swap_constants(consts, pairs, 0);
        }

        /* EC0 might be packed free, after that constants are packed in pairs
         * (with clause format 12), with M1 values computed from the pair */

        for (unsigned i = start; i < constant_count; i += 2) {
                bool swap = false;
                bool last = (i + 1) == constant_count;

                unsigned A1 = (pairs[i] >> 60);
                unsigned B1 = (pairs[i + 1] >> 60);

                if (*pcrel_idx == i || *pcrel_idx == (i + 1)) {
                        /* PC-relative constant must be E0, not E1 */
                        swap = (*pcrel_idx == (i + 1));

                        /* Set M1 = 4 by noting (A - B) mod 16 = 4 is
                         * equivalent to A = (B + 4) mod 16 and that we can
                         * control A */
                        unsigned B = swap ? A1 : B1;
                        unsigned A = (B + 4) & 0xF;
                        pairs[*pcrel_idx] |= ((uint64_t) A) << 60;

                        /* Swapped if swap set, identity if swap not set */
                        *pcrel_idx = i;
                } else {
                        /* Compute M1 value if we don't swap */
                        unsigned M1 = (16 + A1 - B1) & 0xF;

                        /* For M1 = 0 or M1 >= 8, the constants are unchanged,
                         * we have 0 < (A1 - B1) % 16 < 8, which implies (B1 -
                         * A1) % 16 >= 8, so swapping will let them be used
                         * unchanged */
                        swap = (M1 != 0) && (M1 < 8);

                        /* However, we can't swap the last constant, so we
                         * force M1 = 0 instead for this case */
                        if (last && swap) {
                                pairs[i + 1] |= pairs[i] & (0xfull << 60);
                                swap = false;
                        }
                }

                if (swap) {
                        assert(!last);
                        bi_swap_constants(consts, pairs, i);
                }
        }

        return constant_count;
}

/* Schedule a single clause. If no instructions remain, return NULL. */

static bi_clause *
bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint64_t *live)
{
        struct bi_clause_state clause_state = { 0 };
        bi_clause *clause = rzalloc(ctx, bi_clause);
        bi_tuple *tuple = NULL;

        const unsigned max_tuples = ARRAY_SIZE(clause->tuples);

        /* TODO: Decide flow control better */
        clause->flow_control = BIFROST_FLOW_NBTB;

        /* The last tuple in a clause can only write a single register, so
         * initialize that register state */
        struct bi_reg_state reg_state = {};
        bi_index prev_reads[5] = { bi_null() };
        unsigned nr_prev_reads = 0;

        /* We need to track future liveness. The main *live set tracks what is
         * live at the current point in the program we are scheduling, but to
         * determine temp eligibility, we instead want what will be live after
         * the next tuple in the program. If you scheduled forwards, you'd need
         * a crystal ball for this. Luckily we schedule backwards, so we just
         * delay updates to the live_after_temp by an extra tuple. */
        uint64_t live_after_temp = *live;
        uint64_t live_next_tuple = live_after_temp;

        do {
                struct bi_tuple_state tuple_state = {
                        .last = (clause->tuple_count == 0),
                        .reg = reg_state,
                        .nr_prev_reads = nr_prev_reads,
                        .prev = tuple,
                        .pcrel_idx = ~0,
                };

                assert(nr_prev_reads < ARRAY_SIZE(prev_reads));
                memcpy(tuple_state.prev_reads, prev_reads, sizeof(prev_reads));

                unsigned idx = max_tuples - clause->tuple_count - 1;

                tuple = &clause->tuples[idx];

                if (clause->message && bi_opcode_props[clause->message->op].sr_read &&
                    !bi_is_null(clause->message->src[0])) {
                        unsigned nr = bi_count_read_registers(clause->message, 0);
                        live_after_temp |= (BITFIELD64_MASK(nr) << clause->message->src[0].value);
                }

                /* Since we schedule backwards, we schedule ADD first */
                tuple_state.add = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, false);
                tuple->fma = bi_take_instr(ctx, st, &clause_state, &tuple_state, live_after_temp, true);
                tuple->add = tuple_state.add;

                /* Update liveness from the new instructions */
                if (tuple->add)
                        *live = bi_postra_liveness_ins(*live, tuple->add);

                if (tuple->fma)
                        *live = bi_postra_liveness_ins(*live, tuple->fma);

                /* Rotate in the new per-tuple liveness */
                live_after_temp = live_next_tuple;
                live_next_tuple = *live;

                /* We may have a message, but only one per clause */
                if (tuple->add && bi_must_message(tuple->add)) {
                        assert(!clause_state.message);
                        clause_state.message = true;

                        clause->message_type =
                                bi_message_type_for_instr(tuple->add);
                        clause->message = tuple->add;

                        /* We don't need to set dependencies for blend shaders
                         * because the BLEND instruction in the fragment
                         * shader should have already done the wait */
                        if (!ctx->inputs->is_blend) {
                                switch (tuple->add->op) {
                                case BI_OPCODE_ATEST:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                        break;
                                case BI_OPCODE_LD_TILE:
                                case BI_OPCODE_ST_TILE:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                        break;
                                case BI_OPCODE_BLEND:
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_DEPTH);
                                        clause->dependencies |= (1 << BIFROST_SLOT_ELDEST_COLOUR);
                                        break;
                                default:
                                        break;
                                }
                        }
                }

                clause_state.consts[idx] = bi_get_const_state(&tuple_state);

                /* Before merging constants, eliminate zeroes, otherwise the
                 * merging will fight over the #0 that never gets read (and is
                 * never marked as read by update_fau) */
                if (tuple->fma && bi_reads_zero(tuple->fma))
                        bi_rewrite_zero(tuple->fma, true);

                /* Rewrite away FAU, constant write is deferred */
                if (!tuple_state.constant_count) {
                        tuple->fau_idx = tuple_state.fau;
                        bi_rewrite_fau_to_pass(tuple);
                }

                /* Use passthrough register for cross-stage accesses. Since
                 * there are just FMA and ADD stages, that means we rewrite to
                 * passthrough the sources of the ADD that read from the
                 * destination of the FMA */

                if (tuple->fma) {
                        bi_use_passthrough(tuple->add, tuple->fma->dest[0],
                                        BIFROST_SRC_STAGE, false);
                }

                /* Don't add an empty tuple, unless the worklist has nothing
                 * but a (pseudo)instruction failing to schedule due to a "not
                 * last instruction" constraint */

                int some_instruction = __bitset_ffs(st.worklist, BITSET_WORDS(st.count));
                bool not_last = (some_instruction > 0) &&
                        bi_must_not_last(st.instructions[some_instruction - 1]);

                bool insert_empty = tuple_state.last && not_last;

                if (!(tuple->fma || tuple->add || insert_empty))
                        break;

                clause->tuple_count++;

                /* Adding enough tuples might overflow constants */
                if (!bi_space_for_more_constants(&clause_state))
                        break;

#ifndef NDEBUG
                /* Don't schedule more than 1 tuple if debugging */
                if ((bifrost_debug & BIFROST_DBG_NOSCHED) && !insert_empty)
                        break;
#endif

                /* Link through the register state */
                STATIC_ASSERT(sizeof(prev_reads) == sizeof(tuple_state.reg.reads));
                memcpy(prev_reads, tuple_state.reg.reads, sizeof(prev_reads));
                nr_prev_reads = tuple_state.reg.nr_reads;
                clause_state.tuple_count++;
        } while (clause->tuple_count < 8);

        /* Don't schedule an empty clause */
        if (!clause->tuple_count)
                return NULL;

        /* Before merging, rewrite away any tuples that read only zero */
        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];
                struct bi_const_state *st = &clause_state.consts[i];

                if (st->constant_count == 0 || st->constants[0] || st->constants[1] || st->pcrel)
                        continue;

                bi_foreach_instr_in_tuple(tuple, ins)
                        bi_rewrite_zero(ins, false);

                /* Constant has been demoted to FAU, so don't pack it separately */
                st->constant_count = 0;

                /* Default */
                assert(tuple->fau_idx == BIR_FAU_ZERO);
        }

        uint64_t constant_pairs[8] = { 0 };
        unsigned pcrel_idx = ~0;
        unsigned constant_words =
                bi_merge_constants(clause_state.consts, constant_pairs, &pcrel_idx);

        constant_words = bi_apply_constant_modifiers(clause_state.consts,
                        constant_pairs, &pcrel_idx, clause->tuple_count,
                        constant_words);

        clause->pcrel_idx = pcrel_idx;

        for (unsigned i = max_tuples - clause->tuple_count; i < max_tuples; ++i) {
                bi_tuple *tuple = &clause->tuples[i];

                /* If no constants, leave FAU as it is, possibly defaulting to 0 */
                if (clause_state.consts[i].constant_count == 0)
                        continue;

                /* FAU is already handled */
                assert(!tuple->fau_idx);

                unsigned word_idx = clause_state.consts[i].word_idx;
                assert(word_idx <= 8);

                /* We could try to merge regardless of bottom bits as well, but
                 * that's probably diminishing returns */
                uint64_t pair = constant_pairs[word_idx];
                unsigned lo = pair & 0xF;

                tuple->fau_idx = bi_constant_field(word_idx) | lo;
                bi_rewrite_constants_to_pass(tuple, pair, word_idx == pcrel_idx);
        }

        clause->constant_count = constant_words;
        memcpy(clause->constants, constant_pairs, sizeof(constant_pairs));

        /* Branches must be last, so this can be factored out */
        bi_instr *last = clause->tuples[max_tuples - 1].add;
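        /* The next clause is prefetched by default; the prefetch is disabled
         * when the clause ends in a JUMP, whose successor is not known
         * statically */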
        clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
        clause->block = block;

        /* TODO: scoreboard assignment post-sched */
        clause->dependencies |= (1 << 0);

        /* We emit in reverse, to the back of the tuples array, so move it up
         * front for easy indexing */
        memmove(clause->tuples,
                        clause->tuples + (max_tuples - clause->tuple_count),
                        clause->tuple_count * sizeof(clause->tuples[0]));

        /* Use passthrough register for cross-tuple accesses. Note this is
         * after the memmove, so this is forwards. Skip the first tuple since
         * there is nothing before it to passthrough */

        for (unsigned t = 1; t < clause->tuple_count; ++t)
                bi_rewrite_passthrough(clause->tuples[t - 1], clause->tuples[t]);

        return clause;
}

static void
bi_schedule_block(bi_context *ctx, bi_block *block)
{
        list_inithead(&block->clauses);

        /* Copy list to dynamic array */
        struct bi_worklist st = bi_initialize_worklist(block,
                        bifrost_debug & BIFROST_DBG_INORDER,
                        ctx->inputs->is_blend);

        if (!st.count) {
                bi_free_worklist(st);
                return;
        }

        /* We need to track liveness during scheduling in order to determine
         * whether we can use temporary (passthrough) registers */
        uint64_t live = block->reg_live_out;

        /* Schedule as many clauses as needed to fill the block */
        bi_clause *u = NULL;
        while ((u = bi_schedule_clause(ctx, block, st, &live)))
                list_add(&u->link, &block->clauses);

        /* Back-to-back bit affects only the last clause of a block,
         * the rest are implicitly true */
        if (!list_is_empty(&block->clauses)) {
                bi_clause *last_clause = list_last_entry(&block->clauses, bi_clause, link);
                if (bi_reconverge_branches(block))
                        last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL;
        }

        /* Reorder instructions to match the new schedule. First remove
         * existing instructions and then recreate the list */

        bi_foreach_instr_in_block_safe(block, ins) {
                list_del(&ins->link);
        }

        bi_foreach_clause_in_block(block, clause) {
                for (unsigned i = 0; i < clause->tuple_count; ++i) {
                        bi_foreach_instr_in_tuple(&clause->tuples[i], ins) {
                                list_addtail(&ins->link, &block->instructions);
                        }
                }
        }

        block->scheduled = true;

#ifndef NDEBUG
        unsigned i;
        bool incomplete = false;

        BITSET_FOREACH_SET(i, st.worklist, st.count) {
                bi_print_instr(st.instructions[i], stderr);
                incomplete = true;
        }

        if (incomplete)
                unreachable("The above instructions failed to schedule.");
#endif

        bi_free_worklist(st);
}

static bool
bi_check_fau_src(bi_instr *ins, unsigned s, uint32_t *constants, unsigned *cwords, bi_index *fau)
{
        bi_index src = ins->src[s];

        /* Staging registers can't have FAU accesses */
        if (s == 0 && bi_opcode_props[ins->op].sr_read)
                return (src.type != BI_INDEX_CONSTANT) && (src.type != BI_INDEX_FAU);

        if (src.type == BI_INDEX_CONSTANT) {
                /* Allow fast zero */
                if (src.value == 0 && bi_opcode_props[ins->op].fma && bi_reads_zero(ins))
                        return true;

                if (!bi_is_null(*fau))
                        return false;

                /* Else, try to inline a constant */
                for (unsigned i = 0; i < *cwords; ++i) {
                        if (src.value == constants[i])
                                return true;
                }

                if (*cwords >= 2)
                        return false;

                constants[(*cwords)++] = src.value;
        } else if (src.type == BI_INDEX_FAU) {
                if (*cwords != 0)
                        return false;

                /* Can only read from one pair of FAU words */
                if (!bi_is_null(*fau) && (src.value != fau->value))
                        return false;

                /* If there is a target, we'll need a PC-relative constant */
                if (ins->branch_target)
                        return false;

                *fau = src;
        }

        return true;
}

void
bi_lower_fau(bi_context *ctx)
{
        bi_foreach_instr_global_safe(ctx, ins) {
                bi_builder b = bi_init_builder(ctx, bi_before_instr(ins));

                uint32_t constants[2];
                unsigned cwords = 0;
                bi_index fau = bi_null();

                /* ATEST must have the ATEST datum encoded, not any other
                 * uniform. See to it this is the case. */
                if (ins->op == BI_OPCODE_ATEST)
                        fau = ins->src[2];

                bi_foreach_src(ins, s) {
                        if (bi_check_fau_src(ins, s, constants, &cwords, &fau)) continue;

                        bi_index copy = bi_mov_i32(&b, ins->src[s]);
                        ins->src[s] = bi_replace_index(ins->src[s], copy);
                }
        }
}

/* Only v7 allows specifying a dependency on the tilebuffer for the first
 * clause of a shader. v6 requires adding a NOP clause with the dependency. */

static void
bi_add_nop_for_atest(bi_context *ctx)
{
        /* Only needed on v6 */
        if (ctx->arch >= 7)
                return;

        if (list_is_empty(&ctx->blocks))
                return;

        /* Fetch the first clause of the shader */
        bi_block *block = list_first_entry(&ctx->blocks, bi_block, link);
        bi_clause *clause = bi_next_clause(ctx, block, NULL);

        if (!clause || !(clause->dependencies & ((1 << BIFROST_SLOT_ELDEST_DEPTH) |
                                        (1 << BIFROST_SLOT_ELDEST_COLOUR))))
                return;

        /* Add a NOP so we can wait for the dependencies required by the first
         * clause */

        bi_instr *I = rzalloc(ctx, bi_instr);
        I->op = BI_OPCODE_NOP;
        I->dest[0] = bi_null();

        bi_clause *new_clause = ralloc(ctx, bi_clause);
        *new_clause = (bi_clause) {
                .flow_control = BIFROST_FLOW_NBTB,
                .next_clause_prefetch = true,
                .block = clause->block,

                .tuple_count = 1,
                .tuples[0] = { .fma = I, },
        };

        list_add(&new_clause->link, &clause->block->clauses);
}

void
bi_schedule(bi_context *ctx)
{
        /* Fed into both scheduling and DCE */
        bi_postra_liveness(ctx);

        bi_foreach_block(ctx, block) {
                bi_schedule_block(ctx, block);
        }

        bi_opt_dce_post_ra(ctx);
        bi_add_nop_for_atest(ctx);
}