/*
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
#include "util/u_prim.h"
#include "compiler/nir/nir_schedule.h"
#include "compiler/nir/nir_builder.h"

/**
 * Returns the number of source operands this instruction consumes:
 * 0 for branches, otherwise the source count of the ALU op being used
 * (the add op if it is not a NOP, the mul op otherwise).
 */
int
vir_get_nsrc(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return 0;
        case V3D_QPU_INSTR_TYPE_ALU:
                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                else
                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
        }

        return 0;
}

/**
 * Returns whether the instruction has any side effects that must be
 * preserved.
 */
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                /* Add-ALU ops that write state beyond their destination
                 * (MSF/REVF flags, VPM stores/setup, TMU write fence).
                 */
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                /* MULTOP feeds hidden state consumed by a later UMUL24. */
                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

        /* Signals that pop from a FIFO, advance a stream, or switch
         * threads cannot be eliminated or reordered away.
         */
        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.ldtlbu ||
            inst->qpu.sig.ldtlb ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        /* ldunifa works like ldunif: it reads an element and advances the
         * pointer, so each read has a side effect (we don't care for ldunif
         * because we reconstruct the uniform stream buffer after compiling
         * with the surviving uniforms), so allowing DCE to remove
         * one would break follow-up loads. We could fix this by emitting a
         * unifa for each ldunifa, but each unifa requires 3 delay slots
         * before a ldunifa, so that would be quite expensive.
         */
        if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
                return true;

        return false;
}

/**
 * Returns whether the instruction is a plain MOV/FMOV on the mul ALU with
 * no pack/unpack modifiers and no condition, i.e. a bit-exact copy.
 */
bool
vir_is_raw_mov(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
                return false;
        }

        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
                return false;

        return true;
}

/** Returns whether the instruction uses the add ALU (non-NOP add op). */
bool
vir_is_add(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}

/** Returns whether the instruction uses the mul ALU (non-NOP mul op). */
bool
vir_is_mul(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}

/**
 * Returns whether the instruction talks to the TMU: either it writes a
 * TMU magic register, or it is a TMUWT fence.
 */
bool
vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        if (inst->dst.file == QFILE_MAGIC)
                return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        return false;
}

/**
 * Returns whether the instruction implicitly clobbers accumulator r3:
 * VPM-file sources, or (before V3D 4.1) the ldvary/ldtlb/ldtlbu/ldvpm
 * signals.
 */
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_VPM:
                        return true;
                default:
                        break;
                }
        }

        if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
                                  inst->qpu.sig.ldtlb ||
                                  inst->qpu.sig.ldtlbu ||
                                  inst->qpu.sig.ldvpm)) {
                return true;
        }

        return false;
}

bool
/* Returns whether the instruction implicitly clobbers accumulator r4:
 * SFU magic-register writes (RECIP/RSQRT/EXP/LOG/SIN deliver their result
 * in r4), or (before V3D 4.1) the ldtmu signal.
 */
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        switch (inst->dst.file) {
        case QFILE_MAGIC:
                switch (inst->dst.index) {
                case V3D_QPU_WADDR_RECIP:
                case V3D_QPU_WADDR_RSQRT:
                case V3D_QPU_WADDR_EXP:
                case V3D_QPU_WADDR_LOG:
                case V3D_QPU_WADDR_SIN:
                        return true;
                }
                break;
        default:
                break;
        }

        if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
                return true;

        return false;
}

/**
 * Sets the input unpack mode for source @src (0 or 1) on whichever ALU
 * half (add or mul) this instruction uses.
 */
void
vir_set_unpack(struct qinst *inst, int src,
               enum v3d_qpu_input_unpack unpack)
{
        assert(src == 0 || src == 1);

        if (vir_is_add(inst)) {
                if (src == 0)
                        inst->qpu.alu.add.a_unpack = unpack;
                else
                        inst->qpu.alu.add.b_unpack = unpack;
        } else {
                assert(vir_is_mul(inst));
                if (src == 0)
                        inst->qpu.alu.mul.a_unpack = unpack;
                else
                        inst->qpu.alu.mul.b_unpack = unpack;
        }
}

/** Sets the output pack mode on whichever ALU half the instruction uses. */
void
vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
{
        if (vir_is_add(inst)) {
                inst->qpu.alu.add.output_pack = pack;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.alu.mul.output_pack = pack;
        }
}

/** Sets the condition code on whichever ALU half the instruction uses. */
void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.ac = cond;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mc = cond;
        }
}

/** Returns the condition code of the ALU half in use (NONE for a NOP). */
enum v3d_qpu_cond
vir_get_cond(struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);

        if (vir_is_add(inst))
                return inst->qpu.flags.ac;
        else if (vir_is_mul(inst))
                return inst->qpu.flags.mc;
        else /* NOP */
                return V3D_QPU_COND_NONE;
}

/**
 * Sets the "push flags" field on the ALU half in use and resets the
 * compile-wide tracked flags temp, since the flags contents change.
 */
void
vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.apf = pf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mpf = pf;
        }
}

/**
 * Sets the "update flags" field on the ALU half in use and resets the
 * compile-wide tracked flags temp, since the flags contents change.
 */
void
vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.auf = uf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.muf = uf;
        }
}

#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif

/**
 * Allocates a fresh temporary register, growing the defs and spillable
 * arrays as needed.  Newly created temps default to being spillable.
 */
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                /* Grow geometrically to amortize reallocation cost. */
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}

/* Allocates a new add-ALU instruction.  The uniform slot starts unused
 * (~0).  Ownership passes to the caller / emit path.
 */
struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0,
             struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.add.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

/* Allocates a new mul-ALU instruction; mirrors vir_add_inst. */
struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.mul.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

/* Allocates a new branch instruction with relative destinations and a
 * zero-constant uniform as the branch target payload.
 */
struct qinst *
vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        inst->dst = vir_nop_reg();
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);

        return inst;
}

/* Inserts inst at the cursor (before or after, depending on cursor mode),
 * advances the cursor past it, and invalidates cached liveness.
 */
static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
        switch (c->cursor.mode) {
        case vir_cursor_add:
                list_add(&inst->link, c->cursor.link);
                break;
        case vir_cursor_addtail:
                list_addtail(&inst->link, c->cursor.link);
                break;
        }

        c->cursor = vir_after_inst(inst);
        c->live_intervals_valid = false;
}

/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}

/* Emits inst as-is.  If it overwrites a temp, that temp no longer has a
 * single def, so drop it from the defs table.
 */
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = NULL;

        vir_emit(c, inst);

        return inst;
}

/* Allocates a new, empty basic block with a unique index. */
struct qblock *
vir_new_block(struct v3d_compile *c)
{
        struct qblock *block = rzalloc(c, struct qblock);

        list_inithead(&block->instructions);

        block->predecessors = _mesa_set_create(block,
                                               _mesa_hash_pointer,
                                               _mesa_key_pointer_equal);

        block->index = c->next_block_index++;

        return block;
}

/* Makes block the current emission target and appends it to the CFG's
 * block list.
 */
void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
        c->cur_block = block;
        c->cursor = vir_after_block(block);
        list_addtail(&block->link, &c->blocks);
}

struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}

struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}

/* Records a CFG edge.  A block has at most two successors; the second
 * slot is used once the first is taken.
 */
void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
        _mesa_set_add(successor->predecessors, predecessor);
        if (predecessor->successors[0]) {
                assert(!predecessor->successors[1]);
                predecessor->successors[1] = successor;
        } else {
                predecessor->successors[0] = successor;
        }
}

/* Creates the long-lived compiler object for a device and sets up its
 * register sets.  Returns NULL on allocation or init failure.
 */
const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
        if
            (!compiler)
                return NULL;

        compiler->devinfo = devinfo;

        if (!vir_init_reg_sets(compiler)) {
                ralloc_free(compiler);
                return NULL;
        }

        return compiler;
}

void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}

/* Creates the per-shader compile context: copies in the compile options,
 * clones the NIR shader under the context's ralloc tree, and sets up the
 * initial (empty) block plus bookkeeping tables.
 */
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t max_threads,
                 uint32_t min_threads_for_reg_alloc,
                 bool tmu_spilling_allowed,
                 bool disable_loop_unrolling,
                 bool disable_constant_ubo_load_sorting,
                 bool disable_tmu_pipelining,
                 bool fallback_scheduler)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->threads = max_threads;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;
        c->compilation_result = V3D_COMPILATION_SUCCEEDED;
        c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
        c->tmu_spilling_allowed = tmu_spilling_allowed;
        c->fallback_scheduler = fallback_scheduler;
        c->disable_tmu_pipelining = disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
        /* The debug env var force-disables unrolling regardless of the
         * caller's request.
         */
        c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
                ? true : disable_loop_unrolling;

        /* Clone so our lowering passes don't mutate the caller's shader. */
        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        c->output_position_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
        c->flags_temp = -1;

        return c;
}

/* I/O type-size callback for nir_lower_io: one slot per vec4 attribute. */
static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
        return glsl_count_attribute_slots(type, false);
}

/* Applies the device/key-specific NIR texture and sysval lowering. */
static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
        for (int i = 0; i < c->key->num_tex_used; i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
        }

        assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
        for (int i = 0; i < c->key->num_samplers_used; i++) {
                if (c->key->sampler[i].return_size == 16) {
                        tex_options.lower_tex_packing[i] =
                                nir_lower_tex_packing_16;
                }
        }

        /* CS textures may not have return_size reflecting the shadow state.
         */
        nir_foreach_uniform_variable(var, c->s) {
                const struct glsl_type *type = glsl_without_array(var->type);
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);

                if (!glsl_type_is_sampler(type) ||
                    !glsl_sampler_type_is_shadow(type))
                        continue;

                /* Shadow samplers return 16-bit values; lower each array
                 * element's binding accordingly.
                 */
                for (int i = 0; i < array_len; i++) {
                        tex_options.lower_tex_packing[var->data.binding + i] =
                                nir_lower_tex_packing_16;
                }
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
        NIR_PASS_V(c->s, nir_lower_system_values);
        NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);

        NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
                   nir_var_function_temp,
                   0,
                   glsl_get_natural_size_align_bytes);
        NIR_PASS_V(c->s, v3d_nir_lower_scratch);
}

/* Copies the compile-time uniform stream (data + contents) into prog_data,
 * allocated under prog_data's ralloc context.
 */
static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
                           struct v3d_prog_data *prog_data)
{
        int count = c->num_uniforms;
        struct v3d_uniform_list *ulist = &prog_data->uniforms;

        ulist->count = count;
        ulist->data = ralloc_array(prog_data, uint32_t, count);
        memcpy(ulist->data, c->uniform_data,
               count * sizeof(*ulist->data));
        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
        memcpy(ulist->contents, c->uniform_contents,
               count * sizeof(*ulist->contents));
}

/* Fills in the vertex-shader-specific prog_data: attribute/VPM sizing,
 * sysval usage, and the VCM cache size.
 */
static void
v3d_vs_set_prog_data(struct v3d_compile *c,
                     struct v3d_vs_prog_data *prog_data)
{
        /* The vertex data gets format converted by the VPM so that
         * each attribute channel takes up a VPM column. Precompute
         * the sizes for the shader record.
         */
        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
                prog_data->vpm_input_size += c->vattr_sizes[i];
        }

        memset(prog_data->driver_location_map, -1,
               sizeof(prog_data->driver_location_map));

        nir_foreach_shader_in_variable(var, c->s) {
                prog_data->driver_location_map[var->data.location] =
                        var->data.driver_location;
        }

        prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);

        prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
                                           SYSTEM_VALUE_BASE_INSTANCE);

        prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_INDEX);

        /* Each used ID sysval occupies one extra VPM input column. */
        if (prog_data->uses_vid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_biid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        /* Input/output segment size are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Set us up for shared input/output segments. This is apparently
         * necessary for our VCM setup to avoid varying corruption.
         */
        prog_data->separate_segments = false;
        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
                                          prog_data->vpm_input_size);
        prog_data->vpm_input_size = 0;

        /* Compute VCM cache size. We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space. We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM). The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
        assert(c->devinfo->vpm_size);
        int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

/* Fills in the geometry-shader-specific prog_data. */
static void
v3d_gs_set_prog_data(struct v3d_compile *c,
                     struct v3d_gs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
         * VPM output header automatically and the shader will overwrite
         * it after reading it if necessary, so it doesn't add to the VPM
         * size requirements.
         */
        prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_PRIMITIVE_ID);

        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Compute SIMD dispatch width and update VPM output size accordingly
         * to ensure we can fit our program in memory. Available widths are
         * 16, 8, 4, 1.
         *
         * Notice that at draw time we will have to consider VPM memory
         * requirements from other stages and choose a smaller dispatch
         * width if needed to fit the program in VPM memory.
         */
        prog_data->simd_width = 16;
        /* Halve the width (and with it the per-lane VPM requirement) until
         * the output segment fits; width 2 is not a valid dispatch width,
         * so skip past it.
         */
        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
               prog_data->simd_width == 2) {
                prog_data->simd_width >>= 1;
                prog_data->vpm_output_size =
                        align(prog_data->vpm_output_size, 2) / 2;
        }
        assert(prog_data->vpm_output_size <= 16);
        assert(prog_data->simd_width != 2);

        prog_data->out_prim_type = c->s->info.gs.output_primitive;
        prog_data->num_invocations = c->s->info.gs.invocations;

        prog_data->writes_psiz =
            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
}

/* Copies FS input slots and packs the per-input interpolation flag
 * bitfields (24 inputs per word, as the hardware expects).
 */
static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

/* Fills in the fragment-shader-specific prog_data. */
static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = c->writes_z;
        prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
        prog_data->uses_center_w = c->uses_center_w;
        prog_data->uses_implicit_point_line_varyings =
                c->uses_implicit_point_line_varyings;
        prog_data->lock_scoreboard_on_first_thrsw =
                c->lock_scoreboard_on_first_thrsw;
        prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
        prog_data->uses_pid = c->fs_uses_primitive_id;
}

/* Fills in the compute-shader-specific prog_data. */
static void
v3d_cs_set_prog_data(struct v3d_compile *c,
                     struct v3d_compute_prog_data *prog_data)
{
        prog_data->shared_size = c->s->info.shared_size;

        prog_data->local_size[0] = c->s->info.workgroup_size[0];
        prog_data->local_size[1] = c->s->info.workgroup_size[1];
        prog_data->local_size[2] = c->s->info.workgroup_size[2];

        prog_data->has_subgroups = c->has_subgroups;
}

/* Fills in the stage-independent prog_data fields and dispatches to the
 * per-stage setter.
 */
static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;
        prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
        prog_data->has_control_barrier = c->s->info.uses_control_barrier;

        v3d_set_prog_data_uniforms(c, prog_data);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
                break;
        case MESA_SHADER_COMPUTE:
                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
                break;
        default:
                unreachable("unsupported shader stage");
        }
}

/* Returns a malloc'ed copy of the assembled QPU instructions (NULL on
 * allocation failure) and destroys the compile context.  The caller owns
 * and must free the returned buffer.
 */
static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

/* Early (pre-optimization) VS-specific NIR lowering. */
static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        /* Build a per-component slot mask of the outputs the next stage
         * actually consumes, from the key.
         */
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->vs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains and do a constant folding pass
         * on the code it generated.
         */
        NIR_PASS_V(c->s, nir_opt_dce);
        NIR_PASS_V(c->s, nir_opt_constant_folding);
}

/* Early (pre-optimization) GS-specific NIR lowering; parallels
 * v3d_nir_lower_vs_early but keyed off gs_key.
 */
static void
v3d_nir_lower_gs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->gs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains */
        NIR_PASS_V(c->s, nir_opt_dce);
}

/* Retypes float FS color outputs to int/uint to match integer render
 * buffers declared in the FS key.
 */
static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        /* gl_FragColor applies to every render target. */
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

/* Early (pre-optimization) FS-specific NIR lowering. */
static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);

        NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);

        if (c->fs_key->line_smoothing) {
                v3d_nir_lower_line_smooth(c->s);
                NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
                /* The lowering pass can introduce new sysval reads */
                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
        }
}

/* Late (post-optimization) GS-specific NIR lowering. */
static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
                           false, NULL);
        }

        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

/* Late (post-optimization) VS-specific NIR lowering. */
static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
                           false, false, NULL);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

/* Late (post-optimization) FS-specific NIR lowering. */
static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
         * Vulkan allows it, in which case the SPIR-V compiler will declare
         * VARYING_SLOT_CLIP_DIST0 as compact array variable. Pass true as
         * the last parameter to always operate with a compact array in both
         * OpenGL and Vulkan so we don't have to care about the API we
         * are using.
         */
        if (c->key->ucp_enables)
                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);

        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}

/* Returns the maximum number of temps simultaneously live at any
 * instruction, computed from the temp_start/temp_end live ranges.
 */
static uint32_t
vir_get_max_temps(struct v3d_compile *c)
{
        /* Count instructions to size the per-IP pressure array. */
        int max_ip = 0;
        vir_for_each_inst_inorder(inst, c)
                max_ip++;

        uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);

        for (int t = 0; t < c->num_temps; t++) {
                for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
                                                i < max_ip); i++) {
                        if (i > max_ip)
                                break;
                        pressure[i]++;
                }
        }

        uint32_t max_temps = 0;
        for (int i = 0; i < max_ip; i++)
                max_temps = MAX2(max_temps, pressure[i]);

        ralloc_free(pressure);

        return max_temps;
}

/* Dependency classes handed to nir_schedule to serialize accesses that
 * share hidden state.
 */
enum v3d_dependency_class {
        V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
};

/* nir_schedule callback: flags intrinsics that must stay ordered because
 * GS VPM output word 0 doubles as the incoming gl_PrimitiveID.
 */
static bool
v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
                            nir_schedule_dependency *dep,
                            void *user_data)
{
        struct v3d_compile *c = user_data;

        switch (intr->intrinsic) {
        case nir_intrinsic_store_output:
                /* Writing to location 0 overwrites the value passed in for
                 * gl_PrimitiveID on geometry shaders
                 */
                if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
                    nir_intrinsic_base(intr) != 0)
                        break;

                nir_const_value *const_value =
                        nir_src_as_const_value(intr->src[1]);

                if (const_value == NULL)
                        break;

                uint64_t offset =
                        nir_const_value_as_uint(*const_value,
                                                nir_src_bit_size(intr->src[1]));
                if (offset != 0)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
                return true;

        case nir_intrinsic_load_primitive_id:
                if (c->s->info.stage != MESA_SHADER_GEOMETRY)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
                return true;

        default:
                break;
        }

        return false;
}

/* nir_lower_wrmasks filter: split partial writemasks on memory stores. */
static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
        switch (intr->intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_global:
        case nir_intrinsic_store_scratch:
                return true;
        default:
                return false;
        }
}

/* Returns inst as a UBO load intrinsic with a constant offset (and
 * asserted-constant index), or NULL if it is not one.
 */
static nir_intrinsic_instr *
nir_instr_as_constant_ubo_load(nir_instr *inst)
{
        if (inst->type != nir_instr_type_intrinsic)
                return NULL;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
        if (intr->intrinsic != nir_intrinsic_load_ubo)
                return NULL;

        assert(nir_src_is_const(intr->src[0]));
        if (!nir_src_is_const(intr->src[1]))
                return NULL;

        return intr;
}

/* Reorders constant UBO loads in block around the reference load so
 * nearby offsets for the same UBO index form unifa-friendly sequences.
 */
static bool
v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
{
        bool progress = false;

        nir_instr *ref_inst = &ref->instr;
        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
        uint32_t ref_index = nir_src_as_uint(ref->src[0]);

        /* Go through all instructions after ref searching for constant UBO
         * loads for the same UBO index.
         */
        bool seq_break = false;
        nir_instr *inst = &ref->instr;
        nir_instr *next_inst = NULL;
        while (true) {
                inst = next_inst ?
next_inst : nir_instr_next(inst); 1159 if (!inst) 1160 break; 1161 1162 next_inst = NULL; 1163 1164 if (inst->type != nir_instr_type_intrinsic) 1165 continue; 1166 1167 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst); 1168 if (intr->intrinsic != nir_intrinsic_load_ubo) 1169 continue; 1170 1171 /* We only produce unifa sequences for non-divergent loads */ 1172 if (nir_src_is_divergent(intr->src[1])) 1173 continue; 1174 1175 /* If there are any UBO loads that are not constant or that 1176 * use a different UBO index in between the reference load and 1177 * any other constant load for the same index, they would break 1178 * the unifa sequence. We will flag that so we can then move 1179 * all constant UBO loads for the reference index before these 1180 * and not just the ones that are not ordered to avoid breaking 1181 * the sequence and reduce unifa writes. 1182 */ 1183 if (!nir_src_is_const(intr->src[1])) { 1184 seq_break = true; 1185 continue; 1186 } 1187 uint32_t offset = nir_src_as_uint(intr->src[1]); 1188 1189 assert(nir_src_is_const(intr->src[0])); 1190 uint32_t index = nir_src_as_uint(intr->src[0]); 1191 if (index != ref_index) { 1192 seq_break = true; 1193 continue; 1194 } 1195 1196 /* Only move loads with an offset that is close enough to the 1197 * reference offset, since otherwise we would not be able to 1198 * skip the unifa write for them. See ntq_emit_load_ubo_unifa. 1199 */ 1200 if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE) 1201 continue; 1202 1203 /* We will move this load if its offset is smaller than ref's 1204 * (in which case we will move it before ref) or if the offset 1205 * is larger than ref's but there are sequence breakers in 1206 * in between (in which case we will move it after ref and 1207 * before the sequence breakers). 
1208 */ 1209 if (!seq_break && offset >= ref_offset) 1210 continue; 1211 1212 /* Find where exactly we want to move this load: 1213 * 1214 * If we are moving it before ref, we want to check any other 1215 * UBO loads we placed before ref and make sure we insert this 1216 * one properly ordered with them. Likewise, if we are moving 1217 * it after ref. 1218 */ 1219 nir_instr *pos = ref_inst; 1220 nir_instr *tmp = pos; 1221 do { 1222 if (offset < ref_offset) 1223 tmp = nir_instr_prev(tmp); 1224 else 1225 tmp = nir_instr_next(tmp); 1226 1227 if (!tmp || tmp == inst) 1228 break; 1229 1230 /* Ignore non-unifa UBO loads */ 1231 if (tmp->type != nir_instr_type_intrinsic) 1232 continue; 1233 1234 nir_intrinsic_instr *tmp_intr = 1235 nir_instr_as_intrinsic(tmp); 1236 if (tmp_intr->intrinsic != nir_intrinsic_load_ubo) 1237 continue; 1238 1239 if (nir_src_is_divergent(tmp_intr->src[1])) 1240 continue; 1241 1242 /* Stop if we find a unifa UBO load that breaks the 1243 * sequence. 1244 */ 1245 if (!nir_src_is_const(tmp_intr->src[1])) 1246 break; 1247 1248 if (nir_src_as_uint(tmp_intr->src[0]) != index) 1249 break; 1250 1251 uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]); 1252 if (offset < ref_offset) { 1253 if (tmp_offset < offset || 1254 tmp_offset >= ref_offset) { 1255 break; 1256 } else { 1257 pos = tmp; 1258 } 1259 } else { 1260 if (tmp_offset > offset || 1261 tmp_offset <= ref_offset) { 1262 break; 1263 } else { 1264 pos = tmp; 1265 } 1266 } 1267 } while (true); 1268 1269 /* We can't move the UBO load before the instruction that 1270 * defines its constant offset. If that instruction is placed 1271 * in between the new location (pos) and the current location 1272 * of this load, we will have to move that instruction too. 1273 * 1274 * We don't care about the UBO index definition because that 1275 * is optimized to be reused by all UBO loads for the same 1276 * index and therefore is certain to be defined before the 1277 * first UBO load that uses it. 
1278 */ 1279 nir_instr *offset_inst = NULL; 1280 tmp = inst; 1281 while ((tmp = nir_instr_prev(tmp)) != NULL) { 1282 if (pos == tmp) { 1283 /* We reached the target location without 1284 * finding the instruction that defines the 1285 * offset, so that instruction must be before 1286 * the new position and we don't have to fix it. 1287 */ 1288 break; 1289 } 1290 if (intr->src[1].ssa->parent_instr == tmp) { 1291 offset_inst = tmp; 1292 break; 1293 } 1294 } 1295 1296 if (offset_inst) { 1297 exec_node_remove(&offset_inst->node); 1298 exec_node_insert_node_before(&pos->node, 1299 &offset_inst->node); 1300 } 1301 1302 /* Since we are moving the instruction before its current 1303 * location, grab its successor before the move so that 1304 * we can continue the next iteration of the main loop from 1305 * that instruction. 1306 */ 1307 next_inst = nir_instr_next(inst); 1308 1309 /* Move this load to the selected location */ 1310 exec_node_remove(&inst->node); 1311 if (offset < ref_offset) 1312 exec_node_insert_node_before(&pos->node, &inst->node); 1313 else 1314 exec_node_insert_after(&pos->node, &inst->node); 1315 1316 progress = true; 1317 } 1318 1319 return progress; 1320} 1321 1322static bool 1323v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c, 1324 nir_block *block) 1325{ 1326 bool progress = false; 1327 bool local_progress; 1328 do { 1329 local_progress = false; 1330 nir_foreach_instr_safe(inst, block) { 1331 nir_intrinsic_instr *intr = 1332 nir_instr_as_constant_ubo_load(inst); 1333 if (intr) { 1334 local_progress |= 1335 v3d_nir_sort_constant_ubo_load(block, intr); 1336 } 1337 } 1338 progress |= local_progress; 1339 } while (local_progress); 1340 1341 return progress; 1342} 1343 1344/** 1345 * Sorts constant UBO loads in each block by offset to maximize chances of 1346 * skipping unifa writes when converting to VIR. This can increase register 1347 * pressure. 
1348 */ 1349static bool 1350v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) 1351{ 1352 nir_foreach_function(function, s) { 1353 if (function->impl) { 1354 nir_foreach_block(block, function->impl) { 1355 c->sorted_any_ubo_loads |= 1356 v3d_nir_sort_constant_ubo_loads_block(c, block); 1357 } 1358 nir_metadata_preserve(function->impl, 1359 nir_metadata_block_index | 1360 nir_metadata_dominance); 1361 } 1362 } 1363 return c->sorted_any_ubo_loads; 1364} 1365 1366static void 1367lower_load_num_subgroups(struct v3d_compile *c, 1368 nir_builder *b, 1369 nir_intrinsic_instr *intr) 1370{ 1371 assert(c->s->info.stage == MESA_SHADER_COMPUTE); 1372 assert(intr->intrinsic == nir_intrinsic_load_num_subgroups); 1373 1374 b->cursor = nir_after_instr(&intr->instr); 1375 uint32_t num_subgroups = 1376 DIV_ROUND_UP(c->s->info.workgroup_size[0] * 1377 c->s->info.workgroup_size[1] * 1378 c->s->info.workgroup_size[2], V3D_CHANNELS); 1379 nir_ssa_def *result = nir_imm_int(b, num_subgroups); 1380 nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 1381 nir_instr_remove(&intr->instr); 1382} 1383 1384static bool 1385lower_subgroup_intrinsics(struct v3d_compile *c, 1386 nir_block *block, nir_builder *b) 1387{ 1388 bool progress = false; 1389 nir_foreach_instr_safe(inst, block) { 1390 if (inst->type != nir_instr_type_intrinsic) 1391 continue;; 1392 1393 nir_intrinsic_instr *intr = 1394 nir_instr_as_intrinsic(inst); 1395 if (!intr) 1396 continue; 1397 1398 switch (intr->intrinsic) { 1399 case nir_intrinsic_load_num_subgroups: 1400 lower_load_num_subgroups(c, b, intr); 1401 progress = true; 1402 FALLTHROUGH; 1403 case nir_intrinsic_load_subgroup_id: 1404 case nir_intrinsic_load_subgroup_size: 1405 case nir_intrinsic_load_subgroup_invocation: 1406 case nir_intrinsic_elect: 1407 c->has_subgroups = true; 1408 break; 1409 default: 1410 break; 1411 } 1412 } 1413 1414 return progress; 1415} 1416 1417static bool 1418v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile 
*c)
{
        bool progress = false;
        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_builder b;
                        nir_builder_init(&b, function->impl);

                        nir_foreach_block(block, function->impl)
                                progress |= lower_subgroup_intrinsics(c, block, &b);

                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }
        return progress;
}

/* Runs the full NIR lowering and optimization pipeline for the current
 * compile strategy (stage-specific early/late lowerings, v3d lowerings,
 * late algebraic cleanup, scheduling) and then translates to VIR.
 */
static void
v3d_attempt_compile(struct v3d_compile *c)
{
        /* Stash the stage-specific key pointer for convenience. */
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                c->vs_key = (struct v3d_vs_key *) c->key;
                break;
        case MESA_SHADER_GEOMETRY:
                c->gs_key = (struct v3d_gs_key *) c->key;
                break;
        case MESA_SHADER_FRAGMENT:
                c->fs_key = (struct v3d_fs_key *) c->key;
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_early(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_early(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_early(c);
                break;
        default:
                break;
        }

        v3d_lower_nir(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_late(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_late(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_late(c);
                break;
        default:
                break;
        }

        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);

        if (c->key->robust_buffer_access) {
                /* v3d_nir_lower_robust_buffer_access assumes constant buffer
                 * indices on ubo/ssbo intrinsics so run copy propagation and
                 * constant folding passes before we run the lowering to warrant
                 * this. We also want to run the lowering before v3d_optimize to
                 * clean-up redundant get_buffer_size calls produced in the pass.
                 */
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
        }

        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);

        NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);

        v3d_optimize_nir(c, c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic. Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_dce);
                NIR_PASS_V(c->s, nir_opt_cse);
        }

        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
        nir_convert_to_lcssa(c->s, true, true);
        NIR_PASS_V(c->s, nir_divergence_analysis);
        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        struct nir_schedule_options schedule_options = {
                /* Schedule for about half our register space, to enable more
                 * shaders to hit 4 threads.
                 */
                .threshold = 24,

                /* Vertex shaders share the same memory for inputs and outputs,
                 * fragment and geometry shaders do not.
                 */
                .stages_with_shared_io_memory =
                        (((1 << MESA_ALL_SHADER_STAGES) - 1) &
                         ~((1 << MESA_SHADER_FRAGMENT) |
                           (1 << MESA_SHADER_GEOMETRY))),

                .fallback = c->fallback_scheduler,

                .intrinsic_cb = v3d_intrinsic_dependency_cb,
                .intrinsic_cb_data = c,
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);

        if (!c->disable_constant_ubo_load_sorting)
                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);

        v3d_nir_to_vir(c);
}

/* Returns the size of the stage-specific prog_data struct for 'stage'. */
uint32_t
v3d_prog_data_size(gl_shader_stage stage)
{
        static const int prog_data_size[] = {
                [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
                [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
                [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
                [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
        };

        assert(stage >= 0 &&
               stage < ARRAY_SIZE(prog_data_size) &&
               prog_data_size[stage]);

        return prog_data_size[stage];
}

/* Formats a shader-db statistics line for a successfully compiled shader
 * into *shaderdb_str (heap-allocated by asprintf; caller frees). Returns
 * the asprintf result, or -1 if the compile did not succeed.
 */
int v3d_shaderdb_dump(struct v3d_compile *c,
                      char **shaderdb_str)
{
        if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
                return -1;

        return asprintf(shaderdb_str,
                        "%s shader: %d inst, %d threads, %d loops, "
                        "%d uniforms, %d max-temps, %d:%d spills:fills, "
                        "%d sfu-stalls, %d inst-and-stalls, %d nops",
                        vir_get_stage_name(c),
                        c->qpu_inst_count,
                        c->threads,
                        c->loops,
                        c->num_uniforms,
                        vir_get_max_temps(c),
                        c->spills,
                        c->fills,
                        c->qpu_inst_stalled_count,
                        c->qpu_inst_count + c->qpu_inst_stalled_count,
                        c->nop_count);
}

/* This is a list of incremental changes to the compilation strategy
 * that will be used to try to compile the shader successfully. The
 * default strategy is to enable all optimizations which will have
 * the highest register pressure but is expected to produce most
 * optimal code. Following strategies incrementally disable specific
 * optimizations that are known to contribute to register pressure
 * in order to be able to compile the shader successfully while meeting
 * thread count requirements.
 *
 * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
 * cover previous hardware as well (meaning that we are not limiting
 * register allocation to any particular thread count). This is fine
 * because v3d_nir_to_vir will cap this to the actual minimum.
 */
struct v3d_compiler_strategy {
        const char *name;
        uint32_t max_threads;
        uint32_t min_threads;
        bool disable_loop_unrolling;
        bool disable_ubo_load_sorting;
        bool disable_tmu_pipelining;
        bool tmu_spilling_allowed;
} static const strategies[] = {
        /*0*/ { "default",                        4, 4, false, false, false, false },
        /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
        /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
        /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
        /*4*/ { "lower thread count",             2, 1, false, false, false, false },
        /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, false },
        /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, false },
        /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true },
        /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true }
};

/**
 * If a particular optimization didn't make any progress during a compile
 * attempt disabling it alone won't allow us to compile the shader successfully,
 * since we'll end up with the same code. Detect these scenarios so we can
 * avoid wasting time with useless compiles. We should also consider if the
 * strategy changes other aspects of the compilation process though, like
 * spilling, and not skip it in that case.
1641 */ 1642static bool 1643skip_compile_strategy(struct v3d_compile *c, uint32_t idx) 1644{ 1645 /* We decide if we can skip a strategy based on the optimizations that 1646 * were active in the previous strategy, so we should only be calling this 1647 * for strategies after the first. 1648 */ 1649 assert(idx > 0); 1650 1651 /* Don't skip a strategy that changes spilling behavior */ 1652 if (strategies[idx].tmu_spilling_allowed != 1653 strategies[idx - 1].tmu_spilling_allowed) { 1654 return false; 1655 } 1656 1657 switch (idx) { 1658 /* Loop unrolling: skip if we didn't unroll any loops */ 1659 case 1: 1660 case 5: 1661 return !c->unrolled_any_loops; 1662 /* UBO load sorting: skip if we didn't sort any loads */ 1663 case 2: 1664 case 6: 1665 return !c->sorted_any_ubo_loads; 1666 /* TMU pipelining: skip if we didn't pipeline any TMU ops */ 1667 case 3: 1668 case 7: 1669 return !c->pipelined_any_tmu; 1670 /* Lower thread count: skip if we already tried less that 4 threads */ 1671 case 4: 1672 return c->threads < 4; 1673 default: 1674 return false; 1675 }; 1676} 1677uint64_t *v3d_compile(const struct v3d_compiler *compiler, 1678 struct v3d_key *key, 1679 struct v3d_prog_data **out_prog_data, 1680 nir_shader *s, 1681 void (*debug_output)(const char *msg, 1682 void *debug_output_data), 1683 void *debug_output_data, 1684 int program_id, int variant_id, 1685 uint32_t *final_assembly_size) 1686{ 1687 struct v3d_compile *c = NULL; 1688 for (int i = 0; i < ARRAY_SIZE(strategies); i++) { 1689 /* Fallback strategy */ 1690 if (i > 0) { 1691 assert(c); 1692 if (skip_compile_strategy(c, i)) 1693 continue; 1694 1695 char *debug_msg; 1696 int ret = asprintf(&debug_msg, 1697 "Falling back to strategy '%s' for %s", 1698 strategies[i].name, 1699 vir_get_stage_name(c)); 1700 1701 if (ret >= 0) { 1702 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) 1703 fprintf(stderr, "%s\n", debug_msg); 1704 1705 c->debug_output(debug_msg, c->debug_output_data); 1706 free(debug_msg); 1707 } 1708 1709 
vir_compile_destroy(c); 1710 } 1711 1712 c = vir_compile_init(compiler, key, s, 1713 debug_output, debug_output_data, 1714 program_id, variant_id, 1715 strategies[i].max_threads, 1716 strategies[i].min_threads, 1717 strategies[i].tmu_spilling_allowed, 1718 strategies[i].disable_loop_unrolling, 1719 strategies[i].disable_ubo_load_sorting, 1720 strategies[i].disable_tmu_pipelining, 1721 i == ARRAY_SIZE(strategies) - 1); 1722 1723 v3d_attempt_compile(c); 1724 1725 if (i >= ARRAY_SIZE(strategies) - 1 || 1726 c->compilation_result != 1727 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { 1728 break; 1729 } 1730 } 1731 1732 if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && 1733 c->compilation_result != 1734 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && 1735 c->spills > 0) { 1736 char *debug_msg; 1737 int ret = asprintf(&debug_msg, 1738 "Compiled %s with %d spills and %d fills", 1739 vir_get_stage_name(c), 1740 c->spills, c->fills); 1741 fprintf(stderr, "%s\n", debug_msg); 1742 1743 if (ret >= 0) { 1744 c->debug_output(debug_msg, c->debug_output_data); 1745 free(debug_msg); 1746 } 1747 } 1748 1749 if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { 1750 fprintf(stderr, "Failed to compile %s with any strategy.\n", 1751 vir_get_stage_name(c)); 1752 } 1753 1754 struct v3d_prog_data *prog_data; 1755 1756 prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage)); 1757 1758 v3d_set_prog_data(c, prog_data); 1759 1760 *out_prog_data = prog_data; 1761 1762 char *shaderdb; 1763 int ret = v3d_shaderdb_dump(c, &shaderdb); 1764 if (ret >= 0) { 1765 if (V3D_DEBUG & V3D_DEBUG_SHADERDB) 1766 fprintf(stderr, "SHADER-DB: %s\n", shaderdb); 1767 1768 c->debug_output(shaderdb, c->debug_output_data); 1769 free(shaderdb); 1770 } 1771 1772 return v3d_return_qpu_insts(c, final_assembly_size); 1773} 1774 1775void 1776vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) 1777{ 1778 if (qinst->dst.file == QFILE_TEMP) 1779 c->defs[qinst->dst.index] = NULL; 1780 1781 
        /* Removing the instruction the emit cursor points at would leave the
         * cursor dangling (see vir_compile_destroy, which defuses this).
         */
        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

/* Currently a no-op that returns 'reg' unchanged; the MOV-chasing
 * implementation is disabled (see the XXX comment below).
 */
struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

/* Frees the compile context, removing every instruction in every block first
 * so that per-instruction bookkeeping (c->defs) is torn down consistently.
 */
void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_is_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

/* Returns the uniform stream slot for (contents, data), reusing an existing
 * slot if one matches and growing the uniform arrays as needed.
 */
uint32_t
vir_get_uniform_index(struct v3d_compile *c,
                      enum quniform_contents contents,
                      uint32_t data)
{
        /* Linear dedup search over the uniforms emitted so far. */
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return i;
                }
        }

        uint32_t uniform = c->num_uniforms++;

        if (uniform >= c->uniform_array_size) {
                /* Grow geometrically, with a minimum capacity of 16. */
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return uniform;
}

/* Looks back into the current block to find the ldunif that wrote the uniform
 * at the requested index. If it finds it, it returns true and writes the
 * destination register of the ldunif instruction to 'unif'.
 *
 * This can impact register pressure and end up leading to worse code, so we
 * limit the number of instructions we are willing to look back through to
 * strike a good balance.
 */
static bool
try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
{
        /* Look back at most 20 instructions. */
        uint32_t count = 20;
        struct qinst *prev_inst = NULL;
        assert(c->cur_block);

#ifdef DEBUG
        /* We can only reuse a uniform if it was emitted in the same block,
         * so callers must make sure the current instruction is being emitted
         * in the current block.
         */
        bool found = false;
        vir_for_each_inst(inst, c->cur_block) {
                if (&inst->link == c->cursor.link) {
                        found = true;
                        break;
                }
        }

        assert(found || &c->cur_block->instructions == c->cursor.link);
#endif

        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
                                     &c->cur_block->instructions, link) {
                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
                    inst->uniform == index) {
                        prev_inst = inst;
                        break;
                }

                if (--count == 0)
                        break;
        }

        if (!prev_inst)
                return false;

        /* The destination must not have been overwritten between the ldunif
         * and the cursor, or the cached value would be stale.
         */
        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
                                 &c->cur_block->instructions, link) {
                if (inst->dst.file == prev_inst->dst.file &&
                    inst->dst.index == prev_inst->dst.index) {
                        return false;
                }
        }

        *unif = prev_inst->dst;
        return true;
}

/* Returns a register holding the requested uniform, either by reusing a
 * recent ldunif of the same slot (see try_opt_ldunif) or by emitting a new
 * ldunif into a fresh temp.
 */
struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        const int num_uniforms = c->num_uniforms;
        const int index = vir_get_uniform_index(c, contents, data);

        /* If this is not the first time we see this uniform try to reuse the
         * result of the last ldunif that loaded it.
         */
        const bool is_new_uniform = num_uniforms != c->num_uniforms;
        if (!is_new_uniform && !c->disable_ldunif_opt) {
                struct qreg ldunif_dst;
                if (try_opt_ldunif(c, index, &ldunif_dst))
                        return ldunif_dst;
        }

        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.ldunif = true;
        inst->uniform = index;
        inst->dst = vir_get_temp(c);
        c->defs[inst->dst.index] = inst;
        return inst->dst;
}

/* Runs one VIR optimization pass and records progress; expects 'c',
 * 'progress', 'print_opt_debug' and 'pass' in the enclosing scope.
 */
#define OPTPASS(func)                                                   \
        do {                                                            \
                bool stage_progress = func(c);                          \
                if (stage_progress) {                                   \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

/* Runs the VIR-level optimization passes to a fixed point. */
void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);
                OPTPASS(vir_opt_constant_alu);

                if (!progress)
                        break;

                pass++;
        }
}

/* Returns a human-readable stage name, distinguishing the binning variants
 * of vertex and geometry shaders.
 */
const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_VERTEX_BIN";
        else if (c->gs_key && c->gs_key->is_coord)
                return "MESA_SHADER_GEOMETRY_BIN";
        else
                return gl_shader_stage_name(c->s->info.stage);
}

/* Converts the device's VPM size in bytes to sectors (8 rows of V3D_CHANNELS
 * 32-bit words each).
 */
static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
{
        assert(devinfo->vpm_size > 0);
        const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        return devinfo->vpm_size / sector_size;
}

/* Computes various parameters affecting VPM memory configuration for programs
 * involving geometry shaders to ensure the program fits in memory and honors
 * requirements described in section "VPM usage" of the programming manual.
 */
static bool
compute_vpm_config_gs(struct v3d_device_info *devinfo,
                      struct v3d_vs_prog_data *vs,
                      struct v3d_gs_prog_data *gs,
                      struct vpm_config *vpm_cfg_out)
{
        const uint32_t A = vs->separate_segments ? 1 : 0;
        const uint32_t Ad = vs->vpm_input_size;
        const uint32_t Vd = vs->vpm_output_size;

        const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);

        /* Try to fit program into our VPM memory budget by adjusting
         * configurable parameters iteratively. We do this in two phases:
         * the first phase tries to fit the program into the total available
         * VPM memory. If we succeed at that, then the second phase attempts
         * to fit the program into half of that budget so we can run bin and
         * render programs in parallel.
         */
        struct vpm_config vpm_cfg[2];
        struct vpm_config *final_vpm_cfg = NULL;
        uint32_t phase = 0;

        vpm_cfg[phase].As = 1;
        vpm_cfg[phase].Gs = 1;
        vpm_cfg[phase].Gd = gs->vpm_output_size;
        vpm_cfg[phase].gs_width = gs->simd_width;

        /* While there is a requirement that Vc >= [Vn / 16], this is
         * always the case when tessellation is not present because in that
         * case Vn can only be 6 at most (when input primitive is triangles
         * with adjacency).
         *
         * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
         * and Broadcom has not found it worth it to increase it beyond this
         * in general. Increasing Vc also increases VPM memory pressure which
         * can turn up being detrimental for performance in some scenarios.
         */
        vpm_cfg[phase].Vc = 2;

        /* Gv is a constraint on the hardware to not exceed the
         * specified number of vertex segments per GS batch. If adding a
         * new primitive to a GS batch would result in a range of more
         * than Gv vertex segments being referenced by the batch, then
         * the hardware will flush the batch and start a new one. This
         * means that we can choose any value we want, we just need to
         * be aware that larger values improve GS batch utilization
         * at the expense of more VPM memory pressure (which can affect
         * other performance aspects, such as GS dispatch width).
         * We start with the largest value, and will reduce it if we
         * find that total memory pressure is too high.
         */
        vpm_cfg[phase].Gv = 3;
        do {
                /* When GS is present in absence of TES, then we need to satisfy
                 * that Ve >= Gv. We go with the smallest value of Ve to avoid
                 * increasing memory pressure.
                 */
                vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;

                uint32_t vpm_sectors =
                        A * vpm_cfg[phase].As * Ad +
                        (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
                        vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;

                /* Ideally we want to use no more than half of the available
                 * memory so we can execute a bin and render program in parallel
                 * without stalls. If we achieved that then we are done.
                 */
                if (vpm_sectors <= vpm_size / 2) {
                        final_vpm_cfg = &vpm_cfg[phase];
                        break;
                }

                /* At the very least, we should not allocate more than the
                 * total available VPM memory. If we have a configuration that
                 * succeeds at this we save it and continue to see if we can
                 * meet the half-memory-use criteria too.
                 */
                if (phase == 0 && vpm_sectors <= vpm_size) {
                        vpm_cfg[1] = vpm_cfg[0];
                        phase = 1;
                }

                /* Try lowering Gv */
                if (vpm_cfg[phase].Gv > 0) {
                        vpm_cfg[phase].Gv--;
                        continue;
                }

                /* Try lowering GS dispatch width */
                if (vpm_cfg[phase].gs_width > 1) {
                        /* NOTE(review): the inner loop halves gs_width and
                         * repeats while it equals 2, so width goes 16->8->4->1,
                         * skipping 2 — presumably width 2 is not a valid GS
                         * dispatch width; confirm against the hardware docs.
                         */
                        do {
                                vpm_cfg[phase].gs_width >>= 1;
                                vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
                        } while (vpm_cfg[phase].gs_width == 2);

                        /* Reset Gv to max after dropping dispatch width */
                        vpm_cfg[phase].Gv = 3;
                        continue;
                }

                /* We ran out of options to reduce memory pressure. If we
                 * are at phase 1 we have at least a valid configuration, so
                 * we use that (it was preserved in vpm_cfg[0] when we entered
                 * phase 1).
                 */
                if (phase == 1)
                        final_vpm_cfg = &vpm_cfg[0];
                break;
        } while (true);

        if (!final_vpm_cfg)
                return false;

        assert(final_vpm_cfg);
        assert(final_vpm_cfg->Gd <= 16);
        assert(final_vpm_cfg->Gv < 4);
        assert(final_vpm_cfg->Ve < 4);
        assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
        assert(final_vpm_cfg->gs_width == 1 ||
               final_vpm_cfg->gs_width == 4 ||
               final_vpm_cfg->gs_width == 8 ||
               final_vpm_cfg->gs_width == 16);

        *vpm_cfg_out = *final_vpm_cfg;
        return true;
}

/* Fills the bin and render VPM configs for a VS(+GS) pipeline. Without a GS
 * the fixed VS-only settings are used; with a GS both configs are computed by
 * compute_vpm_config_gs. Returns false if no valid configuration exists.
 */
bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg)
{
        assert(vs && vs_bin);
        assert((gs != NULL) == (gs_bin != NULL));

        if (!gs) {
                vpm_cfg_bin->As = 1;
                vpm_cfg_bin->Ve = 0;
                vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;

                vpm_cfg->As = 1;
                vpm_cfg->Ve = 0;
                vpm_cfg->Vc = vs->vcm_cache_size;
        } else {
                if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
                        return false;

                if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
                        return false;
        }

        return true;
}