nir_to_vir.c revision 01e04c3f
1/* 2 * Copyright © 2016 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 
22 */ 23 24#include <inttypes.h> 25#include "util/u_format.h" 26#include "util/u_math.h" 27#include "util/u_memory.h" 28#include "util/ralloc.h" 29#include "util/hash_table.h" 30#include "compiler/nir/nir.h" 31#include "compiler/nir/nir_builder.h" 32#include "common/v3d_device_info.h" 33#include "v3d_compiler.h" 34 35static void 36ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 37 38static void 39resize_qreg_array(struct v3d_compile *c, 40 struct qreg **regs, 41 uint32_t *size, 42 uint32_t decl_size) 43{ 44 if (*size >= decl_size) 45 return; 46 47 uint32_t old_size = *size; 48 *size = MAX2(*size * 2, decl_size); 49 *regs = reralloc(c, *regs, struct qreg, *size); 50 if (!*regs) { 51 fprintf(stderr, "Malloc failure\n"); 52 abort(); 53 } 54 55 for (uint32_t i = old_size; i < *size; i++) 56 (*regs)[i] = c->undef; 57} 58 59void 60vir_emit_thrsw(struct v3d_compile *c) 61{ 62 if (c->threads == 1) 63 return; 64 65 /* Always thread switch after each texture operation for now. 66 * 67 * We could do better by batching a bunch of texture fetches up and 68 * then doing one thread switch and collecting all their results 69 * afterward. 70 */ 71 c->last_thrsw = vir_NOP(c); 72 c->last_thrsw->qpu.sig.thrsw = true; 73 c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); 74} 75 76static struct qreg 77indirect_uniform_load(struct v3d_compile *c, nir_intrinsic_instr *intr) 78{ 79 struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); 80 uint32_t offset = nir_intrinsic_base(intr); 81 struct v3d_ubo_range *range = NULL; 82 unsigned i; 83 84 for (i = 0; i < c->num_ubo_ranges; i++) { 85 range = &c->ubo_ranges[i]; 86 if (offset >= range->src_offset && 87 offset < range->src_offset + range->size) { 88 break; 89 } 90 } 91 /* The driver-location-based offset always has to be within a declared 92 * uniform range. 
93 */ 94 assert(i != c->num_ubo_ranges); 95 if (!c->ubo_range_used[i]) { 96 c->ubo_range_used[i] = true; 97 range->dst_offset = c->next_ubo_dst_offset; 98 c->next_ubo_dst_offset += range->size; 99 } 100 101 offset -= range->src_offset; 102 103 if (range->dst_offset + offset != 0) { 104 indirect_offset = vir_ADD(c, indirect_offset, 105 vir_uniform_ui(c, range->dst_offset + 106 offset)); 107 } 108 109 /* Adjust for where we stored the TGSI register base. */ 110 vir_ADD_dest(c, 111 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), 112 vir_uniform(c, QUNIFORM_UBO_ADDR, 0), 113 indirect_offset); 114 115 vir_emit_thrsw(c); 116 return vir_LDTMU(c); 117} 118 119static struct qreg * 120ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) 121{ 122 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 123 def->num_components); 124 _mesa_hash_table_insert(c->def_ht, def, qregs); 125 return qregs; 126} 127 128/** 129 * This function is responsible for getting VIR results into the associated 130 * storage for a NIR instruction. 131 * 132 * If it's a NIR SSA def, then we just set the associated hash table entry to 133 * the new result. 134 * 135 * If it's a NIR reg, then we need to update the existing qreg assigned to the 136 * NIR destination with the incoming value. To do that without introducing 137 * new MOVs, we require that the incoming qreg either be a uniform, or be 138 * SSA-defined by the previous VIR instruction in the block and rewritable by 139 * this function. 
That lets us sneak ahead and insert the SF flag beforehand 140 * (knowing that the previous instruction doesn't depend on flags) and rewrite 141 * its destination to be the NIR reg's destination 142 */ 143void 144ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, 145 struct qreg result) 146{ 147 struct qinst *last_inst = NULL; 148 if (!list_empty(&c->cur_block->instructions)) 149 last_inst = (struct qinst *)c->cur_block->instructions.prev; 150 151 assert(result.file == QFILE_UNIF || 152 (result.file == QFILE_TEMP && 153 last_inst && last_inst == c->defs[result.index])); 154 155 if (dest->is_ssa) { 156 assert(chan < dest->ssa.num_components); 157 158 struct qreg *qregs; 159 struct hash_entry *entry = 160 _mesa_hash_table_search(c->def_ht, &dest->ssa); 161 162 if (entry) 163 qregs = entry->data; 164 else 165 qregs = ntq_init_ssa_def(c, &dest->ssa); 166 167 qregs[chan] = result; 168 } else { 169 nir_register *reg = dest->reg.reg; 170 assert(dest->reg.base_offset == 0); 171 assert(reg->num_array_elems == 0); 172 struct hash_entry *entry = 173 _mesa_hash_table_search(c->def_ht, reg); 174 struct qreg *qregs = entry->data; 175 176 /* Insert a MOV if the source wasn't an SSA def in the 177 * previous instruction. 178 */ 179 if (result.file == QFILE_UNIF) { 180 result = vir_MOV(c, result); 181 last_inst = c->defs[result.index]; 182 } 183 184 /* We know they're both temps, so just rewrite index. */ 185 c->defs[last_inst->dst.index] = NULL; 186 last_inst->dst.index = qregs[chan].index; 187 188 /* If we're in control flow, then make this update of the reg 189 * conditional on the execution mask. 190 */ 191 if (c->execute.file != QFILE_NULL) { 192 last_inst->dst.index = qregs[chan].index; 193 194 /* Set the flags to the current exec mask. 
195 */ 196 c->cursor = vir_before_inst(last_inst); 197 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 198 c->cursor = vir_after_inst(last_inst); 199 200 vir_set_cond(last_inst, V3D_QPU_COND_IFA); 201 last_inst->cond_is_exec_mask = true; 202 } 203 } 204} 205 206struct qreg 207ntq_get_src(struct v3d_compile *c, nir_src src, int i) 208{ 209 struct hash_entry *entry; 210 if (src.is_ssa) { 211 entry = _mesa_hash_table_search(c->def_ht, src.ssa); 212 assert(i < src.ssa->num_components); 213 } else { 214 nir_register *reg = src.reg.reg; 215 entry = _mesa_hash_table_search(c->def_ht, reg); 216 assert(reg->num_array_elems == 0); 217 assert(src.reg.base_offset == 0); 218 assert(i < reg->num_components); 219 } 220 221 struct qreg *qregs = entry->data; 222 return qregs[i]; 223} 224 225static struct qreg 226ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, 227 unsigned src) 228{ 229 assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 230 unsigned chan = ffs(instr->dest.write_mask) - 1; 231 struct qreg r = ntq_get_src(c, instr->src[src].src, 232 instr->src[src].swizzle[chan]); 233 234 assert(!instr->src[src].abs); 235 assert(!instr->src[src].negate); 236 237 return r; 238}; 239 240static struct qreg 241ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) 242{ 243 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); 244} 245 246static void 247ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) 248{ 249 unsigned unit = instr->texture_index; 250 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); 251 int dest_size = nir_tex_instr_dest_size(instr); 252 253 struct qreg lod = c->undef; 254 if (lod_index != -1) 255 lod = ntq_get_src(c, instr->src[lod_index].src, 0); 256 257 for (int i = 0; i < dest_size; i++) { 258 assert(i < 3); 259 enum quniform_contents contents; 260 261 if (instr->is_array && i == dest_size - 1) 262 contents = QUNIFORM_TEXTURE_ARRAY_SIZE; 263 else 264 contents = QUNIFORM_TEXTURE_WIDTH + i; 265 266 
struct qreg size = vir_uniform(c, contents, unit); 267 268 switch (instr->sampler_dim) { 269 case GLSL_SAMPLER_DIM_1D: 270 case GLSL_SAMPLER_DIM_2D: 271 case GLSL_SAMPLER_DIM_3D: 272 case GLSL_SAMPLER_DIM_CUBE: 273 /* Don't minify the array size. */ 274 if (!(instr->is_array && i == dest_size - 1)) { 275 size = ntq_minify(c, size, lod); 276 } 277 break; 278 279 case GLSL_SAMPLER_DIM_RECT: 280 /* There's no LOD field for rects */ 281 break; 282 283 default: 284 unreachable("Bad sampler type"); 285 } 286 287 ntq_store_dest(c, &instr->dest, i, size); 288 } 289} 290 291static void 292ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) 293{ 294 unsigned unit = instr->texture_index; 295 296 /* Since each texture sampling op requires uploading uniforms to 297 * reference the texture, there's no HW support for texture size and 298 * you just upload uniforms containing the size. 299 */ 300 switch (instr->op) { 301 case nir_texop_query_levels: 302 ntq_store_dest(c, &instr->dest, 0, 303 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); 304 return; 305 case nir_texop_txs: 306 ntq_emit_txs(c, instr); 307 return; 308 default: 309 break; 310 } 311 312 if (c->devinfo->ver >= 40) 313 v3d40_vir_emit_tex(c, instr); 314 else 315 v3d33_vir_emit_tex(c, instr); 316} 317 318static struct qreg 319ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) 320{ 321 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); 322 if (is_cos) 323 input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); 324 325 struct qreg periods = vir_FROUND(c, input); 326 struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods)); 327 return vir_XOR(c, sin_output, vir_SHL(c, 328 vir_FTOIN(c, periods), 329 vir_uniform_ui(c, -1))); 330} 331 332static struct qreg 333ntq_fsign(struct v3d_compile *c, struct qreg src) 334{ 335 struct qreg t = vir_get_temp(c); 336 337 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); 338 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHZ); 339 vir_MOV_cond(c, 
V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); 340 vir_PF(c, vir_FMOV(c, src), V3D_QPU_PF_PUSHN); 341 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); 342 return vir_MOV(c, t); 343} 344 345static struct qreg 346ntq_isign(struct v3d_compile *c, struct qreg src) 347{ 348 struct qreg t = vir_get_temp(c); 349 350 vir_MOV_dest(c, t, vir_uniform_ui(c, 0)); 351 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHZ); 352 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_ui(c, 1)); 353 vir_PF(c, vir_MOV(c, src), V3D_QPU_PF_PUSHN); 354 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_ui(c, -1)); 355 return vir_MOV(c, t); 356} 357 358static void 359emit_fragcoord_input(struct v3d_compile *c, int attr) 360{ 361 c->inputs[attr * 4 + 0] = vir_FXCD(c); 362 c->inputs[attr * 4 + 1] = vir_FYCD(c); 363 c->inputs[attr * 4 + 2] = c->payload_z; 364 c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w); 365} 366 367static struct qreg 368emit_fragment_varying(struct v3d_compile *c, nir_variable *var, 369 uint8_t swizzle) 370{ 371 struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); 372 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); 373 374 struct qreg vary; 375 if (c->devinfo->ver >= 41) { 376 struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, 377 c->undef, c->undef); 378 ldvary->qpu.sig.ldvary = true; 379 vary = vir_emit_def(c, ldvary); 380 } else { 381 vir_NOP(c)->qpu.sig.ldvary = true; 382 vary = r3; 383 } 384 385 /* For gl_PointCoord input or distance along a line, we'll be called 386 * with no nir_variable, and we don't count toward VPM size so we 387 * don't track an input slot. 
388 */ 389 if (!var) { 390 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); 391 } 392 393 int i = c->num_inputs++; 394 c->input_slots[i] = v3d_slot_from_slot_and_component(var->data.location, 395 swizzle); 396 397 switch (var->data.interpolation) { 398 case INTERP_MODE_NONE: 399 /* If a gl_FrontColor or gl_BackColor input has no interp 400 * qualifier, then if we're using glShadeModel(GL_FLAT) it 401 * needs to be flat shaded. 402 */ 403 switch (var->data.location) { 404 case VARYING_SLOT_COL0: 405 case VARYING_SLOT_COL1: 406 case VARYING_SLOT_BFC0: 407 case VARYING_SLOT_BFC1: 408 if (c->fs_key->shade_model_flat) { 409 BITSET_SET(c->flat_shade_flags, i); 410 vir_MOV_dest(c, c->undef, vary); 411 return vir_MOV(c, r5); 412 } else { 413 return vir_FADD(c, vir_FMUL(c, vary, 414 c->payload_w), r5); 415 } 416 default: 417 break; 418 } 419 /* FALLTHROUGH */ 420 case INTERP_MODE_SMOOTH: 421 if (var->data.centroid) { 422 BITSET_SET(c->centroid_flags, i); 423 return vir_FADD(c, vir_FMUL(c, vary, 424 c->payload_w_centroid), r5); 425 } else { 426 return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5); 427 } 428 case INTERP_MODE_NOPERSPECTIVE: 429 BITSET_SET(c->noperspective_flags, i); 430 return vir_FADD(c, vir_MOV(c, vary), r5); 431 case INTERP_MODE_FLAT: 432 BITSET_SET(c->flat_shade_flags, i); 433 vir_MOV_dest(c, c->undef, vary); 434 return vir_MOV(c, r5); 435 default: 436 unreachable("Bad interp mode"); 437 } 438} 439 440static void 441emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var) 442{ 443 for (int i = 0; i < glsl_get_vector_elements(var->type); i++) { 444 int chan = var->data.location_frac + i; 445 c->inputs[attr * 4 + chan] = 446 emit_fragment_varying(c, var, chan); 447 } 448} 449 450static void 451add_output(struct v3d_compile *c, 452 uint32_t decl_offset, 453 uint8_t slot, 454 uint8_t swizzle) 455{ 456 uint32_t old_array_size = c->outputs_array_size; 457 resize_qreg_array(c, &c->outputs, &c->outputs_array_size, 458 decl_offset + 1); 
459 460 if (old_array_size != c->outputs_array_size) { 461 c->output_slots = reralloc(c, 462 c->output_slots, 463 struct v3d_varying_slot, 464 c->outputs_array_size); 465 } 466 467 c->output_slots[decl_offset] = 468 v3d_slot_from_slot_and_component(slot, swizzle); 469} 470 471static void 472declare_uniform_range(struct v3d_compile *c, uint32_t start, uint32_t size) 473{ 474 unsigned array_id = c->num_ubo_ranges++; 475 if (array_id >= c->ubo_ranges_array_size) { 476 c->ubo_ranges_array_size = MAX2(c->ubo_ranges_array_size * 2, 477 array_id + 1); 478 c->ubo_ranges = reralloc(c, c->ubo_ranges, 479 struct v3d_ubo_range, 480 c->ubo_ranges_array_size); 481 c->ubo_range_used = reralloc(c, c->ubo_range_used, 482 bool, 483 c->ubo_ranges_array_size); 484 } 485 486 c->ubo_ranges[array_id].dst_offset = 0; 487 c->ubo_ranges[array_id].src_offset = start; 488 c->ubo_ranges[array_id].size = size; 489 c->ubo_range_used[array_id] = false; 490} 491 492/** 493 * If compare_instr is a valid comparison instruction, emits the 494 * compare_instr's comparison and returns the sel_instr's return value based 495 * on the compare_instr's result. 
496 */ 497static bool 498ntq_emit_comparison(struct v3d_compile *c, struct qreg *dest, 499 nir_alu_instr *compare_instr, 500 nir_alu_instr *sel_instr) 501{ 502 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); 503 struct qreg src1; 504 if (nir_op_infos[compare_instr->op].num_inputs > 1) 505 src1 = ntq_get_alu_src(c, compare_instr, 1); 506 bool cond_invert = false; 507 508 switch (compare_instr->op) { 509 case nir_op_feq: 510 case nir_op_seq: 511 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); 512 break; 513 case nir_op_ieq: 514 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); 515 break; 516 517 case nir_op_fne: 518 case nir_op_sne: 519 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHZ); 520 cond_invert = true; 521 break; 522 case nir_op_ine: 523 vir_PF(c, vir_XOR(c, src0, src1), V3D_QPU_PF_PUSHZ); 524 cond_invert = true; 525 break; 526 527 case nir_op_fge: 528 case nir_op_sge: 529 vir_PF(c, vir_FCMP(c, src1, src0), V3D_QPU_PF_PUSHC); 530 break; 531 case nir_op_ige: 532 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); 533 cond_invert = true; 534 break; 535 case nir_op_uge: 536 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); 537 cond_invert = true; 538 break; 539 540 case nir_op_slt: 541 case nir_op_flt: 542 vir_PF(c, vir_FCMP(c, src0, src1), V3D_QPU_PF_PUSHN); 543 break; 544 case nir_op_ilt: 545 vir_PF(c, vir_MIN(c, src1, src0), V3D_QPU_PF_PUSHC); 546 break; 547 case nir_op_ult: 548 vir_PF(c, vir_SUB(c, src0, src1), V3D_QPU_PF_PUSHC); 549 break; 550 551 default: 552 return false; 553 } 554 555 enum v3d_qpu_cond cond = (cond_invert ? 
556 V3D_QPU_COND_IFNA : 557 V3D_QPU_COND_IFA); 558 559 switch (sel_instr->op) { 560 case nir_op_seq: 561 case nir_op_sne: 562 case nir_op_sge: 563 case nir_op_slt: 564 *dest = vir_SEL(c, cond, 565 vir_uniform_f(c, 1.0), vir_uniform_f(c, 0.0)); 566 break; 567 568 case nir_op_bcsel: 569 *dest = vir_SEL(c, cond, 570 ntq_get_alu_src(c, sel_instr, 1), 571 ntq_get_alu_src(c, sel_instr, 2)); 572 break; 573 574 default: 575 *dest = vir_SEL(c, cond, 576 vir_uniform_ui(c, ~0), vir_uniform_ui(c, 0)); 577 break; 578 } 579 580 /* Make the temporary for nir_store_dest(). */ 581 *dest = vir_MOV(c, *dest); 582 583 return true; 584} 585 586/** 587 * Attempts to fold a comparison generating a boolean result into the 588 * condition code for selecting between two values, instead of comparing the 589 * boolean result against 0 to generate the condition code. 590 */ 591static struct qreg ntq_emit_bcsel(struct v3d_compile *c, nir_alu_instr *instr, 592 struct qreg *src) 593{ 594 if (!instr->src[0].src.is_ssa) 595 goto out; 596 if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) 597 goto out; 598 nir_alu_instr *compare = 599 nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); 600 if (!compare) 601 goto out; 602 603 struct qreg dest; 604 if (ntq_emit_comparison(c, &dest, compare, instr)) 605 return dest; 606 607out: 608 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); 609 return vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, src[1], src[2])); 610} 611 612 613static void 614ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) 615{ 616 /* This should always be lowered to ALU operations for V3D. */ 617 assert(!instr->dest.saturate); 618 619 /* Vectors are special in that they have non-scalarized writemasks, 620 * and just take the first swizzle channel for each argument in order 621 * into each writemask channel. 
622 */ 623 if (instr->op == nir_op_vec2 || 624 instr->op == nir_op_vec3 || 625 instr->op == nir_op_vec4) { 626 struct qreg srcs[4]; 627 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 628 srcs[i] = ntq_get_src(c, instr->src[i].src, 629 instr->src[i].swizzle[0]); 630 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 631 ntq_store_dest(c, &instr->dest.dest, i, 632 vir_MOV(c, srcs[i])); 633 return; 634 } 635 636 /* General case: We can just grab the one used channel per src. */ 637 struct qreg src[nir_op_infos[instr->op].num_inputs]; 638 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 639 src[i] = ntq_get_alu_src(c, instr, i); 640 } 641 642 struct qreg result; 643 644 switch (instr->op) { 645 case nir_op_fmov: 646 case nir_op_imov: 647 result = vir_MOV(c, src[0]); 648 break; 649 650 case nir_op_fneg: 651 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31)); 652 break; 653 case nir_op_ineg: 654 result = vir_NEG(c, src[0]); 655 break; 656 657 case nir_op_fmul: 658 result = vir_FMUL(c, src[0], src[1]); 659 break; 660 case nir_op_fadd: 661 result = vir_FADD(c, src[0], src[1]); 662 break; 663 case nir_op_fsub: 664 result = vir_FSUB(c, src[0], src[1]); 665 break; 666 case nir_op_fmin: 667 result = vir_FMIN(c, src[0], src[1]); 668 break; 669 case nir_op_fmax: 670 result = vir_FMAX(c, src[0], src[1]); 671 break; 672 673 case nir_op_f2i32: 674 result = vir_FTOIZ(c, src[0]); 675 break; 676 case nir_op_f2u32: 677 result = vir_FTOUZ(c, src[0]); 678 break; 679 case nir_op_i2f32: 680 result = vir_ITOF(c, src[0]); 681 break; 682 case nir_op_u2f32: 683 result = vir_UTOF(c, src[0]); 684 break; 685 case nir_op_b2f: 686 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); 687 break; 688 case nir_op_b2i: 689 result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); 690 break; 691 case nir_op_i2b: 692 case nir_op_f2b: 693 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); 694 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, 695 vir_uniform_ui(c, ~0), 696 
vir_uniform_ui(c, 0))); 697 break; 698 699 case nir_op_iadd: 700 result = vir_ADD(c, src[0], src[1]); 701 break; 702 case nir_op_ushr: 703 result = vir_SHR(c, src[0], src[1]); 704 break; 705 case nir_op_isub: 706 result = vir_SUB(c, src[0], src[1]); 707 break; 708 case nir_op_ishr: 709 result = vir_ASR(c, src[0], src[1]); 710 break; 711 case nir_op_ishl: 712 result = vir_SHL(c, src[0], src[1]); 713 break; 714 case nir_op_imin: 715 result = vir_MIN(c, src[0], src[1]); 716 break; 717 case nir_op_umin: 718 result = vir_UMIN(c, src[0], src[1]); 719 break; 720 case nir_op_imax: 721 result = vir_MAX(c, src[0], src[1]); 722 break; 723 case nir_op_umax: 724 result = vir_UMAX(c, src[0], src[1]); 725 break; 726 case nir_op_iand: 727 result = vir_AND(c, src[0], src[1]); 728 break; 729 case nir_op_ior: 730 result = vir_OR(c, src[0], src[1]); 731 break; 732 case nir_op_ixor: 733 result = vir_XOR(c, src[0], src[1]); 734 break; 735 case nir_op_inot: 736 result = vir_NOT(c, src[0]); 737 break; 738 739 case nir_op_ufind_msb: 740 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0])); 741 break; 742 743 case nir_op_imul: 744 result = vir_UMUL(c, src[0], src[1]); 745 break; 746 747 case nir_op_seq: 748 case nir_op_sne: 749 case nir_op_sge: 750 case nir_op_slt: 751 case nir_op_feq: 752 case nir_op_fne: 753 case nir_op_fge: 754 case nir_op_flt: 755 case nir_op_ieq: 756 case nir_op_ine: 757 case nir_op_ige: 758 case nir_op_uge: 759 case nir_op_ilt: 760 case nir_op_ult: 761 if (!ntq_emit_comparison(c, &result, instr, instr)) { 762 fprintf(stderr, "Bad comparison instruction\n"); 763 } 764 break; 765 766 case nir_op_bcsel: 767 result = ntq_emit_bcsel(c, instr, src); 768 break; 769 case nir_op_fcsel: 770 vir_PF(c, src[0], V3D_QPU_PF_PUSHZ); 771 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, 772 src[1], src[2])); 773 break; 774 775 case nir_op_frcp: 776 result = vir_RECIP(c, src[0]); 777 break; 778 case nir_op_frsq: 779 result = vir_RSQRT(c, src[0]); 780 break; 781 case 
nir_op_fexp2: 782 result = vir_EXP(c, src[0]); 783 break; 784 case nir_op_flog2: 785 result = vir_LOG(c, src[0]); 786 break; 787 788 case nir_op_fceil: 789 result = vir_FCEIL(c, src[0]); 790 break; 791 case nir_op_ffloor: 792 result = vir_FFLOOR(c, src[0]); 793 break; 794 case nir_op_fround_even: 795 result = vir_FROUND(c, src[0]); 796 break; 797 case nir_op_ftrunc: 798 result = vir_FTRUNC(c, src[0]); 799 break; 800 case nir_op_ffract: 801 result = vir_FSUB(c, src[0], vir_FFLOOR(c, src[0])); 802 break; 803 804 case nir_op_fsin: 805 result = ntq_fsincos(c, src[0], false); 806 break; 807 case nir_op_fcos: 808 result = ntq_fsincos(c, src[0], true); 809 break; 810 811 case nir_op_fsign: 812 result = ntq_fsign(c, src[0]); 813 break; 814 case nir_op_isign: 815 result = ntq_isign(c, src[0]); 816 break; 817 818 case nir_op_fabs: { 819 result = vir_FMOV(c, src[0]); 820 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); 821 break; 822 } 823 824 case nir_op_iabs: 825 result = vir_MAX(c, src[0], 826 vir_SUB(c, vir_uniform_ui(c, 0), src[0])); 827 break; 828 829 case nir_op_fddx: 830 case nir_op_fddx_coarse: 831 case nir_op_fddx_fine: 832 result = vir_FDX(c, src[0]); 833 break; 834 835 case nir_op_fddy: 836 case nir_op_fddy_coarse: 837 case nir_op_fddy_fine: 838 result = vir_FDY(c, src[0]); 839 break; 840 841 case nir_op_uadd_carry: 842 vir_PF(c, vir_ADD(c, src[0], src[1]), V3D_QPU_PF_PUSHC); 843 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, 844 vir_uniform_ui(c, ~0), 845 vir_uniform_ui(c, 0))); 846 break; 847 848 case nir_op_pack_half_2x16_split: 849 result = vir_VFPACK(c, src[0], src[1]); 850 break; 851 852 case nir_op_unpack_half_2x16_split_x: 853 result = vir_FMOV(c, src[0]); 854 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); 855 break; 856 857 case nir_op_unpack_half_2x16_split_y: 858 result = vir_FMOV(c, src[0]); 859 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); 860 break; 861 862 default: 863 fprintf(stderr, "unknown NIR ALU inst: 
"); 864 nir_print_instr(&instr->instr, stderr); 865 fprintf(stderr, "\n"); 866 abort(); 867 } 868 869 /* We have a scalar result, so the instruction should only have a 870 * single channel written to. 871 */ 872 assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 873 ntq_store_dest(c, &instr->dest.dest, 874 ffs(instr->dest.write_mask) - 1, result); 875} 876 877/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit 878 * specifier. They come from a register that's preloaded with 0xffffffff 879 * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low 880 * 8 bits are shifted off the bottom and 0xff shifted in from the top. 881 */ 882#define TLB_TYPE_F16_COLOR (3 << 6) 883#define TLB_TYPE_I32_COLOR (1 << 6) 884#define TLB_TYPE_F32_COLOR (0 << 6) 885#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */ 886#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2) 887#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2) 888#define TLB_F16_SWAP_HI_LO (1 << 1) 889#define TLB_VEC_SIZE_4_F16 (1 << 0) 890#define TLB_VEC_SIZE_2_F16 (0 << 0) 891#define TLB_VEC_SIZE_MINUS_1_SHIFT 0 892 893/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z" 894 * flag is set. 895 */ 896#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4)) 897#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */ 898#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */ 899 900/* Stencil is a single 32-bit write. 
*/ 901#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4)) 902 903static void 904emit_frag_end(struct v3d_compile *c) 905{ 906 /* XXX 907 if (c->output_sample_mask_index != -1) { 908 vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); 909 } 910 */ 911 912 bool has_any_tlb_color_write = false; 913 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) { 914 if (c->output_color_var[rt]) 915 has_any_tlb_color_write = true; 916 } 917 918 if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) { 919 struct nir_variable *var = c->output_color_var[0]; 920 struct qreg *color = &c->outputs[var->data.driver_location * 4]; 921 922 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), 923 vir_AND(c, 924 vir_MSF(c), 925 vir_FTOC(c, color[3]))); 926 } 927 928 if (c->output_position_index != -1) { 929 struct qinst *inst = vir_MOV_dest(c, 930 vir_reg(QFILE_TLBU, 0), 931 c->outputs[c->output_position_index]); 932 933 inst->src[vir_get_implicit_uniform_src(inst)] = 934 vir_uniform_ui(c, 935 TLB_TYPE_DEPTH | 936 TLB_DEPTH_TYPE_PER_PIXEL | 937 0xffffff00); 938 } else if (c->s->info.fs.uses_discard || 939 c->fs_key->sample_alpha_to_coverage || 940 !has_any_tlb_color_write) { 941 /* Emit passthrough Z if it needed to be delayed until shader 942 * end due to potential discards. 943 * 944 * Since (single-threaded) fragment shaders always need a TLB 945 * write, emit passthrouh Z if we didn't have any color 946 * buffers and flag us as potentially discarding, so that we 947 * can use Z as the TLB write. 
948 */ 949 c->s->info.fs.uses_discard = true; 950 951 struct qinst *inst = vir_MOV_dest(c, 952 vir_reg(QFILE_TLBU, 0), 953 vir_reg(QFILE_NULL, 0)); 954 955 inst->src[vir_get_implicit_uniform_src(inst)] = 956 vir_uniform_ui(c, 957 TLB_TYPE_DEPTH | 958 TLB_DEPTH_TYPE_INVARIANT | 959 0xffffff00); 960 } 961 962 /* XXX: Performance improvement: Merge Z write and color writes TLB 963 * uniform setup 964 */ 965 966 for (int rt = 0; rt < c->fs_key->nr_cbufs; rt++) { 967 if (!c->output_color_var[rt]) 968 continue; 969 970 nir_variable *var = c->output_color_var[rt]; 971 struct qreg *color = &c->outputs[var->data.driver_location * 4]; 972 int num_components = glsl_get_vector_elements(var->type); 973 uint32_t conf = 0xffffff00; 974 struct qinst *inst; 975 976 conf |= TLB_SAMPLE_MODE_PER_PIXEL; 977 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; 978 979 if (c->fs_key->swap_color_rb & (1 << rt)) 980 num_components = MAX2(num_components, 3); 981 982 assert(num_components != 0); 983 switch (glsl_get_base_type(var->type)) { 984 case GLSL_TYPE_UINT: 985 case GLSL_TYPE_INT: 986 /* The F32 vs I32 distinction was dropped in 4.2. 
*/ 987 if (c->devinfo->ver < 42) 988 conf |= TLB_TYPE_I32_COLOR; 989 else 990 conf |= TLB_TYPE_F32_COLOR; 991 conf |= ((num_components - 1) << 992 TLB_VEC_SIZE_MINUS_1_SHIFT); 993 994 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), color[0]); 995 inst->src[vir_get_implicit_uniform_src(inst)] = 996 vir_uniform_ui(c, conf); 997 998 for (int i = 1; i < num_components; i++) { 999 inst = vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), 1000 color[i]); 1001 } 1002 break; 1003 1004 default: { 1005 struct qreg r = color[0]; 1006 struct qreg g = color[1]; 1007 struct qreg b = color[2]; 1008 struct qreg a = color[3]; 1009 1010 if (c->fs_key->f32_color_rb & (1 << rt)) { 1011 conf |= TLB_TYPE_F32_COLOR; 1012 conf |= ((num_components - 1) << 1013 TLB_VEC_SIZE_MINUS_1_SHIFT); 1014 } else { 1015 conf |= TLB_TYPE_F16_COLOR; 1016 conf |= TLB_F16_SWAP_HI_LO; 1017 if (num_components >= 3) 1018 conf |= TLB_VEC_SIZE_4_F16; 1019 else 1020 conf |= TLB_VEC_SIZE_2_F16; 1021 } 1022 1023 if (c->fs_key->swap_color_rb & (1 << rt)) { 1024 r = color[2]; 1025 b = color[0]; 1026 } 1027 1028 if (c->fs_key->sample_alpha_to_one) 1029 a = vir_uniform_f(c, 1.0); 1030 1031 if (c->fs_key->f32_color_rb & (1 << rt)) { 1032 inst = vir_MOV_dest(c, vir_reg(QFILE_TLBU, 0), r); 1033 inst->src[vir_get_implicit_uniform_src(inst)] = 1034 vir_uniform_ui(c, conf); 1035 1036 if (num_components >= 2) 1037 vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), g); 1038 if (num_components >= 3) 1039 vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), b); 1040 if (num_components >= 4) 1041 vir_MOV_dest(c, vir_reg(QFILE_TLB, 0), a); 1042 } else { 1043 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), r, g); 1044 if (conf != ~0) { 1045 inst->dst.file = QFILE_TLBU; 1046 inst->src[vir_get_implicit_uniform_src(inst)] = 1047 vir_uniform_ui(c, conf); 1048 } 1049 1050 if (num_components >= 3) 1051 inst = vir_VFPACK_dest(c, vir_reg(QFILE_TLB, 0), b, a); 1052 } 1053 break; 1054 } 1055 } 1056 } 1057} 1058 1059static void 1060vir_VPM_WRITE(struct v3d_compile *c, struct 
qreg val, uint32_t *vpm_index) 1061{ 1062 if (c->devinfo->ver >= 40) { 1063 vir_STVPMV(c, vir_uniform_ui(c, *vpm_index), val); 1064 *vpm_index = *vpm_index + 1; 1065 } else { 1066 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); 1067 } 1068 1069 c->num_vpm_writes++; 1070} 1071 1072static void 1073emit_scaled_viewport_write(struct v3d_compile *c, struct qreg rcp_w, 1074 uint32_t *vpm_index) 1075{ 1076 for (int i = 0; i < 2; i++) { 1077 struct qreg coord = c->outputs[c->output_position_index + i]; 1078 coord = vir_FMUL(c, coord, 1079 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 1080 0)); 1081 coord = vir_FMUL(c, coord, rcp_w); 1082 vir_VPM_WRITE(c, vir_FTOIN(c, coord), vpm_index); 1083 } 1084 1085} 1086 1087static void 1088emit_zs_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) 1089{ 1090 struct qreg zscale = vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0); 1091 struct qreg zoffset = vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0); 1092 1093 struct qreg z = c->outputs[c->output_position_index + 2]; 1094 z = vir_FMUL(c, z, zscale); 1095 z = vir_FMUL(c, z, rcp_w); 1096 z = vir_FADD(c, z, zoffset); 1097 vir_VPM_WRITE(c, z, vpm_index); 1098} 1099 1100static void 1101emit_rcp_wc_write(struct v3d_compile *c, struct qreg rcp_w, uint32_t *vpm_index) 1102{ 1103 vir_VPM_WRITE(c, rcp_w, vpm_index); 1104} 1105 1106static void 1107emit_point_size_write(struct v3d_compile *c, uint32_t *vpm_index) 1108{ 1109 struct qreg point_size; 1110 1111 if (c->output_point_size_index != -1) 1112 point_size = c->outputs[c->output_point_size_index]; 1113 else 1114 point_size = vir_uniform_f(c, 1.0); 1115 1116 /* Workaround: HW-2726 PTB does not handle zero-size points (BCM2835, 1117 * BCM21553). 
1118 */ 1119 point_size = vir_FMAX(c, point_size, vir_uniform_f(c, .125)); 1120 1121 vir_VPM_WRITE(c, point_size, vpm_index); 1122} 1123 1124static void 1125emit_vpm_write_setup(struct v3d_compile *c) 1126{ 1127 if (c->devinfo->ver >= 40) 1128 return; 1129 1130 v3d33_vir_vpm_write_setup(c); 1131} 1132 1133/** 1134 * Sets up c->outputs[c->output_position_index] for the vertex shader 1135 * epilogue, if an output vertex position wasn't specified in the user's 1136 * shader. This may be the case for transform feedback with rasterizer 1137 * discard enabled. 1138 */ 1139static void 1140setup_default_position(struct v3d_compile *c) 1141{ 1142 if (c->output_position_index != -1) 1143 return; 1144 1145 c->output_position_index = c->outputs_array_size; 1146 for (int i = 0; i < 4; i++) { 1147 add_output(c, 1148 c->output_position_index + i, 1149 VARYING_SLOT_POS, i); 1150 } 1151} 1152 1153static void 1154emit_vert_end(struct v3d_compile *c) 1155{ 1156 setup_default_position(c); 1157 1158 uint32_t vpm_index = 0; 1159 struct qreg rcp_w = vir_RECIP(c, 1160 c->outputs[c->output_position_index + 3]); 1161 1162 emit_vpm_write_setup(c); 1163 1164 if (c->vs_key->is_coord) { 1165 for (int i = 0; i < 4; i++) 1166 vir_VPM_WRITE(c, c->outputs[c->output_position_index + i], 1167 &vpm_index); 1168 emit_scaled_viewport_write(c, rcp_w, &vpm_index); 1169 if (c->vs_key->per_vertex_point_size) { 1170 emit_point_size_write(c, &vpm_index); 1171 /* emit_rcp_wc_write(c, rcp_w); */ 1172 } 1173 /* XXX: Z-only rendering */ 1174 if (0) 1175 emit_zs_write(c, rcp_w, &vpm_index); 1176 } else { 1177 emit_scaled_viewport_write(c, rcp_w, &vpm_index); 1178 emit_zs_write(c, rcp_w, &vpm_index); 1179 emit_rcp_wc_write(c, rcp_w, &vpm_index); 1180 if (c->vs_key->per_vertex_point_size) 1181 emit_point_size_write(c, &vpm_index); 1182 } 1183 1184 for (int i = 0; i < c->vs_key->num_fs_inputs; i++) { 1185 struct v3d_varying_slot input = c->vs_key->fs_inputs[i]; 1186 int j; 1187 1188 for (j = 0; j < c->num_outputs; 
j++) {
                        struct v3d_varying_slot output = c->output_slots[j];

                        /* Varying slots are plain structs; byte-compare to
                         * match this FS input against a declared VS output.
                         */
                        if (!memcmp(&input, &output, sizeof(input))) {
                                vir_VPM_WRITE(c, c->outputs[j],
                                              &vpm_index);
                                break;
                        }
                }
                /* Emit padding if we didn't find a declared VS output for
                 * this FS input.
                 */
                if (j == c->num_outputs)
                        vir_VPM_WRITE(c, vir_uniform_f(c, 0.0),
                                      &vpm_index);
        }

        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
         */
        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
                vir_VPMWT(c);
}

/* Runs the NIR optimization loop until it reaches a fixed point (no pass
 * reports progress), then hoists UBO loads.  The `progress` result of the
 * final pass is intentionally unused: it runs once, after the loop.
 */
void
v3d_optimize_nir(struct nir_shader *s)
{
        bool progress;

        do {
                progress = false;

                NIR_PASS_V(s, nir_lower_vars_to_ssa);
                NIR_PASS(progress, s, nir_lower_alu_to_scalar);
                NIR_PASS(progress, s, nir_lower_phis_to_scalar);
                NIR_PASS(progress, s, nir_copy_prop);
                NIR_PASS(progress, s, nir_opt_remove_phis);
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);
                NIR_PASS(progress, s, nir_opt_undef);
        } while (progress);

        NIR_PASS(progress, s, nir_opt_move_load_ubo);
}

/* qsort() comparator ordering nir_variable pointers by driver_location. */
static int
driver_location_compare(const void *in_a, const void *in_b)
{
        const nir_variable *const *a = in_a;
        const nir_variable *const *b = in_b;

        return (*a)->data.driver_location - (*b)->data.driver_location;
}

/* Reads one 32-bit component from the VPM.  On V3D 4.x this is an addressed
 * load (LDVPMV) and we just count components; on 3.x reads are sequential,
 * so we batch up to 32 components per read setup and drain the queue.
 */
static struct qreg
ntq_emit_vpm_read(struct v3d_compile *c,
                  uint32_t *num_components_queued,
                  uint32_t *remaining,
                  uint32_t vpm_index)
{
        struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);

        if (c->devinfo->ver >= 40) {
                return vir_LDVPMV_IN(c,
                                     vir_uniform_ui(c,
                                                    (*num_components_queued)++));
        }

        if (*num_components_queued !=
0) { 1260 (*num_components_queued)--; 1261 c->num_inputs++; 1262 return vir_MOV(c, vpm); 1263 } 1264 1265 uint32_t num_components = MIN2(*remaining, 32); 1266 1267 v3d33_vir_vpm_read_setup(c, num_components); 1268 1269 *num_components_queued = num_components - 1; 1270 *remaining -= num_components; 1271 c->num_inputs++; 1272 1273 return vir_MOV(c, vpm); 1274} 1275 1276static void 1277ntq_setup_vpm_inputs(struct v3d_compile *c) 1278{ 1279 /* Figure out how many components of each vertex attribute the shader 1280 * uses. Each variable should have been split to individual 1281 * components and unused ones DCEed. The vertex fetcher will load 1282 * from the start of the attribute to the number of components we 1283 * declare we need in c->vattr_sizes[]. 1284 */ 1285 nir_foreach_variable(var, &c->s->inputs) { 1286 /* No VS attribute array support. */ 1287 assert(MAX2(glsl_get_length(var->type), 1) == 1); 1288 1289 unsigned loc = var->data.driver_location; 1290 int start_component = var->data.location_frac; 1291 int num_components = glsl_get_components(var->type); 1292 1293 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc], 1294 start_component + num_components); 1295 } 1296 1297 unsigned num_components = 0; 1298 uint32_t vpm_components_queued = 0; 1299 bool uses_iid = c->s->info.system_values_read & 1300 (1ull << SYSTEM_VALUE_INSTANCE_ID); 1301 bool uses_vid = c->s->info.system_values_read & 1302 (1ull << SYSTEM_VALUE_VERTEX_ID); 1303 num_components += uses_iid; 1304 num_components += uses_vid; 1305 1306 for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) 1307 num_components += c->vattr_sizes[i]; 1308 1309 if (uses_iid) { 1310 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, 1311 &num_components, ~0); 1312 } 1313 1314 if (uses_vid) { 1315 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, 1316 &num_components, ~0); 1317 } 1318 1319 for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { 1320 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1321 (loc + 
1) * 4); 1322 1323 for (int i = 0; i < c->vattr_sizes[loc]; i++) { 1324 c->inputs[loc * 4 + i] = 1325 ntq_emit_vpm_read(c, 1326 &vpm_components_queued, 1327 &num_components, 1328 loc * 4 + i); 1329 1330 } 1331 } 1332 1333 if (c->devinfo->ver >= 40) { 1334 assert(vpm_components_queued == num_components); 1335 } else { 1336 assert(vpm_components_queued == 0); 1337 assert(num_components == 0); 1338 } 1339} 1340 1341static void 1342ntq_setup_fs_inputs(struct v3d_compile *c) 1343{ 1344 unsigned num_entries = 0; 1345 unsigned num_components = 0; 1346 nir_foreach_variable(var, &c->s->inputs) { 1347 num_entries++; 1348 num_components += glsl_get_components(var->type); 1349 } 1350 1351 nir_variable *vars[num_entries]; 1352 1353 unsigned i = 0; 1354 nir_foreach_variable(var, &c->s->inputs) 1355 vars[i++] = var; 1356 1357 /* Sort the variables so that we emit the input setup in 1358 * driver_location order. This is required for VPM reads, whose data 1359 * is fetched into the VPM in driver_location (TGSI register index) 1360 * order. 
1361 */ 1362 qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); 1363 1364 for (unsigned i = 0; i < num_entries; i++) { 1365 nir_variable *var = vars[i]; 1366 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1367 unsigned loc = var->data.driver_location; 1368 1369 assert(array_len == 1); 1370 (void)array_len; 1371 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1372 (loc + 1) * 4); 1373 1374 if (var->data.location == VARYING_SLOT_POS) { 1375 emit_fragcoord_input(c, loc); 1376 } else if (var->data.location == VARYING_SLOT_PNTC || 1377 (var->data.location >= VARYING_SLOT_VAR0 && 1378 (c->fs_key->point_sprite_mask & 1379 (1 << (var->data.location - 1380 VARYING_SLOT_VAR0))))) { 1381 c->inputs[loc * 4 + 0] = c->point_x; 1382 c->inputs[loc * 4 + 1] = c->point_y; 1383 } else { 1384 emit_fragment_input(c, loc, var); 1385 } 1386 } 1387} 1388 1389static void 1390ntq_setup_outputs(struct v3d_compile *c) 1391{ 1392 nir_foreach_variable(var, &c->s->outputs) { 1393 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1394 unsigned loc = var->data.driver_location * 4; 1395 1396 assert(array_len == 1); 1397 (void)array_len; 1398 1399 for (int i = 0; i < 4 - var->data.location_frac; i++) { 1400 add_output(c, loc + var->data.location_frac + i, 1401 var->data.location, 1402 var->data.location_frac + i); 1403 } 1404 1405 if (c->s->info.stage == MESA_SHADER_FRAGMENT) { 1406 switch (var->data.location) { 1407 case FRAG_RESULT_COLOR: 1408 c->output_color_var[0] = var; 1409 c->output_color_var[1] = var; 1410 c->output_color_var[2] = var; 1411 c->output_color_var[3] = var; 1412 break; 1413 case FRAG_RESULT_DATA0: 1414 case FRAG_RESULT_DATA1: 1415 case FRAG_RESULT_DATA2: 1416 case FRAG_RESULT_DATA3: 1417 c->output_color_var[var->data.location - 1418 FRAG_RESULT_DATA0] = var; 1419 break; 1420 case FRAG_RESULT_DEPTH: 1421 c->output_position_index = loc; 1422 break; 1423 case FRAG_RESULT_SAMPLE_MASK: 1424 c->output_sample_mask_index = loc; 1425 break; 1426 
                        }
                } else {
                        /* Vertex shader: remember where position and point
                         * size land in c->outputs[] for the VS epilogue.
                         */
                        switch (var->data.location) {
                        case VARYING_SLOT_POS:
                                c->output_position_index = loc;
                                break;
                        case VARYING_SLOT_PSIZ:
                                c->output_point_size_index = loc;
                                break;
                        }
                }
        }
}

/* Declares a uniform range (in bytes) for each uniform variable, so that
 * indirect uniform loads can be matched against a declared range later.
 */
static void
ntq_setup_uniforms(struct v3d_compile *c)
{
        nir_foreach_variable(var, &c->s->uniforms) {
                uint32_t vec4_count = glsl_count_attribute_slots(var->type,
                                                                 false);
                unsigned vec4_size = 4 * sizeof(float);

                declare_uniform_range(c, var->data.driver_location * vec4_size,
                                      vec4_count * vec4_size);

        }
}

/**
 * Sets up the mapping from nir_register to struct qreg *.
 *
 * Each nir_register gets a struct qreg per 32-bit component being stored.
 */
static void
ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_register, nir_reg, node, list) {
                unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
                /* Allocate against def_ht so the qregs' lifetime is tied to
                 * the hash table that owns them.
                 */
                struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                                  array_len *
                                                  nir_reg->num_components);

                _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);

                for (int i = 0; i < array_len * nir_reg->num_components; i++)
                        qregs[i] = vir_get_temp(c);
        }
}

/* Loads a NIR constant by emitting one uniform per component. */
static void
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
{
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = vir_uniform_ui(c, instr->value.u32[i]);

        /* NOTE(review): ntq_init_ssa_def() presumably already registers
         * qregs for this def in c->def_ht, which would make this second
         * insert redundant — verify against ntq_init_ssa_def's definition.
         */
        _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}

/* Materializes an SSA undef as zeroes. */
static void
ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
{
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);

        /* VIR needs there to be *some* value, so pick 0 (same as for
         * ntq_setup_registers().
1492 */ 1493 for (int i = 0; i < instr->def.num_components; i++) 1494 qregs[i] = vir_uniform_ui(c, 0); 1495} 1496 1497static void 1498ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) 1499{ 1500 nir_const_value *const_offset; 1501 unsigned offset; 1502 1503 switch (instr->intrinsic) { 1504 case nir_intrinsic_load_uniform: 1505 assert(instr->num_components == 1); 1506 const_offset = nir_src_as_const_value(instr->src[0]); 1507 if (const_offset) { 1508 offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1509 assert(offset % 4 == 0); 1510 /* We need dwords */ 1511 offset = offset / 4; 1512 ntq_store_dest(c, &instr->dest, 0, 1513 vir_uniform(c, QUNIFORM_UNIFORM, 1514 offset)); 1515 } else { 1516 ntq_store_dest(c, &instr->dest, 0, 1517 indirect_uniform_load(c, instr)); 1518 } 1519 break; 1520 1521 case nir_intrinsic_load_ubo: 1522 for (int i = 0; i < instr->num_components; i++) { 1523 int ubo = nir_src_as_const_value(instr->src[0])->u32[0]; 1524 1525 /* Adjust for where we stored the TGSI register base. 
*/ 1526 vir_ADD_dest(c, 1527 vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA), 1528 vir_uniform(c, QUNIFORM_UBO_ADDR, 1 + ubo), 1529 vir_ADD(c, 1530 ntq_get_src(c, instr->src[1], 0), 1531 vir_uniform_ui(c, i * 4))); 1532 1533 vir_emit_thrsw(c); 1534 1535 ntq_store_dest(c, &instr->dest, i, vir_LDTMU(c)); 1536 } 1537 break; 1538 1539 const_offset = nir_src_as_const_value(instr->src[0]); 1540 if (const_offset) { 1541 offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1542 assert(offset % 4 == 0); 1543 /* We need dwords */ 1544 offset = offset / 4; 1545 ntq_store_dest(c, &instr->dest, 0, 1546 vir_uniform(c, QUNIFORM_UNIFORM, 1547 offset)); 1548 } else { 1549 ntq_store_dest(c, &instr->dest, 0, 1550 indirect_uniform_load(c, instr)); 1551 } 1552 break; 1553 1554 case nir_intrinsic_load_user_clip_plane: 1555 for (int i = 0; i < instr->num_components; i++) { 1556 ntq_store_dest(c, &instr->dest, i, 1557 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 1558 nir_intrinsic_ucp_id(instr) * 1559 4 + i)); 1560 } 1561 break; 1562 1563 case nir_intrinsic_load_alpha_ref_float: 1564 ntq_store_dest(c, &instr->dest, 0, 1565 vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); 1566 break; 1567 1568 case nir_intrinsic_load_sample_mask_in: 1569 ntq_store_dest(c, &instr->dest, 0, 1570 vir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); 1571 break; 1572 1573 case nir_intrinsic_load_front_face: 1574 /* The register contains 0 (front) or 1 (back), and we need to 1575 * turn it into a NIR bool where true means front. 
1576 */ 1577 ntq_store_dest(c, &instr->dest, 0, 1578 vir_ADD(c, 1579 vir_uniform_ui(c, -1), 1580 vir_REVF(c))); 1581 break; 1582 1583 case nir_intrinsic_load_instance_id: 1584 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); 1585 break; 1586 1587 case nir_intrinsic_load_vertex_id: 1588 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); 1589 break; 1590 1591 case nir_intrinsic_load_input: 1592 const_offset = nir_src_as_const_value(instr->src[0]); 1593 assert(const_offset && "v3d doesn't support indirect inputs"); 1594 for (int i = 0; i < instr->num_components; i++) { 1595 offset = nir_intrinsic_base(instr) + const_offset->u32[0]; 1596 int comp = nir_intrinsic_component(instr) + i; 1597 ntq_store_dest(c, &instr->dest, i, 1598 vir_MOV(c, c->inputs[offset * 4 + comp])); 1599 } 1600 break; 1601 1602 case nir_intrinsic_store_output: 1603 const_offset = nir_src_as_const_value(instr->src[1]); 1604 assert(const_offset && "v3d doesn't support indirect outputs"); 1605 offset = ((nir_intrinsic_base(instr) + 1606 const_offset->u32[0]) * 4 + 1607 nir_intrinsic_component(instr)); 1608 1609 for (int i = 0; i < instr->num_components; i++) { 1610 c->outputs[offset + i] = 1611 vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 1612 } 1613 c->num_outputs = MAX2(c->num_outputs, 1614 offset + instr->num_components); 1615 break; 1616 1617 case nir_intrinsic_discard: 1618 if (c->execute.file != QFILE_NULL) { 1619 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1620 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), 1621 vir_uniform_ui(c, 0)), 1622 V3D_QPU_COND_IFA); 1623 } else { 1624 vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), 1625 vir_uniform_ui(c, 0)); 1626 } 1627 break; 1628 1629 case nir_intrinsic_discard_if: { 1630 /* true (~0) if we're discarding */ 1631 struct qreg cond = ntq_get_src(c, instr->src[0], 0); 1632 1633 if (c->execute.file != QFILE_NULL) { 1634 /* execute == 0 means the channel is active. 
Invert 1635 * the condition so that we can use zero as "executing 1636 * and discarding." 1637 */ 1638 vir_PF(c, vir_OR(c, c->execute, vir_NOT(c, cond)), 1639 V3D_QPU_PF_PUSHZ); 1640 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), 1641 vir_uniform_ui(c, 0)), 1642 V3D_QPU_COND_IFA); 1643 } else { 1644 vir_PF(c, cond, V3D_QPU_PF_PUSHZ); 1645 vir_set_cond(vir_SETMSF_dest(c, vir_reg(QFILE_NULL, 0), 1646 vir_uniform_ui(c, 0)), 1647 V3D_QPU_COND_IFNA); 1648 } 1649 1650 break; 1651 } 1652 1653 default: 1654 fprintf(stderr, "Unknown intrinsic: "); 1655 nir_print_instr(&instr->instr, stderr); 1656 fprintf(stderr, "\n"); 1657 break; 1658 } 1659} 1660 1661/* Clears (activates) the execute flags for any channels whose jump target 1662 * matches this block. 1663 */ 1664static void 1665ntq_activate_execute_for_block(struct v3d_compile *c) 1666{ 1667 vir_PF(c, vir_XOR(c, c->execute, vir_uniform_ui(c, c->cur_block->index)), 1668 V3D_QPU_PF_PUSHZ); 1669 1670 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 1671} 1672 1673static void 1674ntq_emit_if(struct v3d_compile *c, nir_if *if_stmt) 1675{ 1676 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1677 bool empty_else_block = 1678 (nir_else_block == nir_if_last_else_block(if_stmt) && 1679 exec_list_is_empty(&nir_else_block->instr_list)); 1680 1681 struct qblock *then_block = vir_new_block(c); 1682 struct qblock *after_block = vir_new_block(c); 1683 struct qblock *else_block; 1684 if (empty_else_block) 1685 else_block = after_block; 1686 else 1687 else_block = vir_new_block(c); 1688 1689 bool was_top_level = false; 1690 if (c->execute.file == QFILE_NULL) { 1691 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 1692 was_top_level = true; 1693 } 1694 1695 /* Set A for executing (execute == 0) and jumping (if->condition == 1696 * 0) channels, and then update execute flags for those to point to 1697 * the ELSE block. 
1698 */ 1699 vir_PF(c, vir_OR(c, 1700 c->execute, 1701 ntq_get_src(c, if_stmt->condition, 0)), 1702 V3D_QPU_PF_PUSHZ); 1703 vir_MOV_cond(c, V3D_QPU_COND_IFA, 1704 c->execute, 1705 vir_uniform_ui(c, else_block->index)); 1706 1707 /* Jump to ELSE if nothing is active for THEN, otherwise fall 1708 * through. 1709 */ 1710 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1711 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 1712 vir_link_blocks(c->cur_block, else_block); 1713 vir_link_blocks(c->cur_block, then_block); 1714 1715 /* Process the THEN block. */ 1716 vir_set_emit_block(c, then_block); 1717 ntq_emit_cf_list(c, &if_stmt->then_list); 1718 1719 if (!empty_else_block) { 1720 /* Handle the end of the THEN block. First, all currently 1721 * active channels update their execute flags to point to 1722 * ENDIF 1723 */ 1724 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1725 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 1726 vir_uniform_ui(c, after_block->index)); 1727 1728 /* If everything points at ENDIF, then jump there immediately. 
*/ 1729 vir_PF(c, vir_XOR(c, c->execute, 1730 vir_uniform_ui(c, after_block->index)), 1731 V3D_QPU_PF_PUSHZ); 1732 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 1733 vir_link_blocks(c->cur_block, after_block); 1734 vir_link_blocks(c->cur_block, else_block); 1735 1736 vir_set_emit_block(c, else_block); 1737 ntq_activate_execute_for_block(c); 1738 ntq_emit_cf_list(c, &if_stmt->else_list); 1739 } 1740 1741 vir_link_blocks(c->cur_block, after_block); 1742 1743 vir_set_emit_block(c, after_block); 1744 if (was_top_level) 1745 c->execute = c->undef; 1746 else 1747 ntq_activate_execute_for_block(c); 1748} 1749 1750static void 1751ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 1752{ 1753 switch (jump->type) { 1754 case nir_jump_break: 1755 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1756 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 1757 vir_uniform_ui(c, c->loop_break_block->index)); 1758 break; 1759 1760 case nir_jump_continue: 1761 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1762 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 1763 vir_uniform_ui(c, c->loop_cont_block->index)); 1764 break; 1765 1766 case nir_jump_return: 1767 unreachable("All returns shouold be lowered\n"); 1768 } 1769} 1770 1771static void 1772ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) 1773{ 1774 switch (instr->type) { 1775 case nir_instr_type_alu: 1776 ntq_emit_alu(c, nir_instr_as_alu(instr)); 1777 break; 1778 1779 case nir_instr_type_intrinsic: 1780 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 1781 break; 1782 1783 case nir_instr_type_load_const: 1784 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 1785 break; 1786 1787 case nir_instr_type_ssa_undef: 1788 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 1789 break; 1790 1791 case nir_instr_type_tex: 1792 ntq_emit_tex(c, nir_instr_as_tex(instr)); 1793 break; 1794 1795 case nir_instr_type_jump: 1796 ntq_emit_jump(c, nir_instr_as_jump(instr)); 1797 break; 1798 1799 default: 1800 fprintf(stderr, "Unknown NIR instr type: 
"); 1801 nir_print_instr(instr, stderr); 1802 fprintf(stderr, "\n"); 1803 abort(); 1804 } 1805} 1806 1807static void 1808ntq_emit_block(struct v3d_compile *c, nir_block *block) 1809{ 1810 nir_foreach_instr(instr, block) { 1811 ntq_emit_instr(c, instr); 1812 } 1813} 1814 1815static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 1816 1817static void 1818ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 1819{ 1820 bool was_top_level = false; 1821 if (c->execute.file == QFILE_NULL) { 1822 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 1823 was_top_level = true; 1824 } 1825 1826 struct qblock *save_loop_cont_block = c->loop_cont_block; 1827 struct qblock *save_loop_break_block = c->loop_break_block; 1828 1829 c->loop_cont_block = vir_new_block(c); 1830 c->loop_break_block = vir_new_block(c); 1831 1832 vir_link_blocks(c->cur_block, c->loop_cont_block); 1833 vir_set_emit_block(c, c->loop_cont_block); 1834 ntq_activate_execute_for_block(c); 1835 1836 ntq_emit_cf_list(c, &loop->body); 1837 1838 /* Re-enable any previous continues now, so our ANYA check below 1839 * works. 1840 * 1841 * XXX: Use the .ORZ flags update, instead. 1842 */ 1843 vir_PF(c, vir_XOR(c, 1844 c->execute, 1845 vir_uniform_ui(c, c->loop_cont_block->index)), 1846 V3D_QPU_PF_PUSHZ); 1847 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 1848 1849 vir_PF(c, c->execute, V3D_QPU_PF_PUSHZ); 1850 1851 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 1852 /* Pixels that were not dispatched or have been discarded should not 1853 * contribute to looping again. 
1854 */ 1855 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 1856 vir_link_blocks(c->cur_block, c->loop_cont_block); 1857 vir_link_blocks(c->cur_block, c->loop_break_block); 1858 1859 vir_set_emit_block(c, c->loop_break_block); 1860 if (was_top_level) 1861 c->execute = c->undef; 1862 else 1863 ntq_activate_execute_for_block(c); 1864 1865 c->loop_break_block = save_loop_break_block; 1866 c->loop_cont_block = save_loop_cont_block; 1867} 1868 1869static void 1870ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) 1871{ 1872 fprintf(stderr, "FUNCTIONS not handled.\n"); 1873 abort(); 1874} 1875 1876static void 1877ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) 1878{ 1879 foreach_list_typed(nir_cf_node, node, node, list) { 1880 switch (node->type) { 1881 case nir_cf_node_block: 1882 ntq_emit_block(c, nir_cf_node_as_block(node)); 1883 break; 1884 1885 case nir_cf_node_if: 1886 ntq_emit_if(c, nir_cf_node_as_if(node)); 1887 break; 1888 1889 case nir_cf_node_loop: 1890 ntq_emit_loop(c, nir_cf_node_as_loop(node)); 1891 break; 1892 1893 case nir_cf_node_function: 1894 ntq_emit_function(c, nir_cf_node_as_function(node)); 1895 break; 1896 1897 default: 1898 fprintf(stderr, "Unknown NIR node type\n"); 1899 abort(); 1900 } 1901 } 1902} 1903 1904static void 1905ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) 1906{ 1907 ntq_setup_registers(c, &impl->registers); 1908 ntq_emit_cf_list(c, &impl->body); 1909} 1910 1911static void 1912nir_to_vir(struct v3d_compile *c) 1913{ 1914 if (c->s->info.stage == MESA_SHADER_FRAGMENT) { 1915 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); 1916 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); 1917 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); 1918 1919 if (c->fs_key->is_points) { 1920 c->point_x = emit_fragment_varying(c, NULL, 0); 1921 c->point_y = emit_fragment_varying(c, NULL, 0); 1922 } else if (c->fs_key->is_lines) { 1923 c->line_x = emit_fragment_varying(c, NULL, 0); 1924 } 1925 } 1926 
1927 if (c->s->info.stage == MESA_SHADER_FRAGMENT) 1928 ntq_setup_fs_inputs(c); 1929 else 1930 ntq_setup_vpm_inputs(c); 1931 1932 ntq_setup_outputs(c); 1933 ntq_setup_uniforms(c); 1934 ntq_setup_registers(c, &c->s->registers); 1935 1936 /* Find the main function and emit the body. */ 1937 nir_foreach_function(function, c->s) { 1938 assert(strcmp(function->name, "main") == 0); 1939 assert(function->impl); 1940 ntq_emit_impl(c, function->impl); 1941 } 1942} 1943 1944const nir_shader_compiler_options v3d_nir_options = { 1945 .lower_all_io_to_temps = true, 1946 .lower_extract_byte = true, 1947 .lower_extract_word = true, 1948 .lower_bfm = true, 1949 .lower_bitfield_insert_to_shifts = true, 1950 .lower_bitfield_extract_to_shifts = true, 1951 .lower_bitfield_reverse = true, 1952 .lower_bit_count = true, 1953 .lower_pack_unorm_2x16 = true, 1954 .lower_pack_snorm_2x16 = true, 1955 .lower_pack_unorm_4x8 = true, 1956 .lower_pack_snorm_4x8 = true, 1957 .lower_unpack_unorm_4x8 = true, 1958 .lower_unpack_snorm_4x8 = true, 1959 .lower_pack_half_2x16 = true, 1960 .lower_unpack_half_2x16 = true, 1961 .lower_fdiv = true, 1962 .lower_find_lsb = true, 1963 .lower_ffma = true, 1964 .lower_flrp32 = true, 1965 .lower_fpow = true, 1966 .lower_fsat = true, 1967 .lower_fsqrt = true, 1968 .lower_ifind_msb = true, 1969 .lower_ldexp = true, 1970 .lower_mul_high = true, 1971 .lower_wpos_pntc = true, 1972 .native_integers = true, 1973}; 1974 1975 1976#if 0 1977static int 1978count_nir_instrs(nir_shader *nir) 1979{ 1980 int count = 0; 1981 nir_foreach_function(function, nir) { 1982 if (!function->impl) 1983 continue; 1984 nir_foreach_block(block, function->impl) { 1985 nir_foreach_instr(instr, block) 1986 count++; 1987 } 1988 } 1989 return count; 1990} 1991#endif 1992 1993/** 1994 * When demoting a shader down to single-threaded, removes the THRSW 1995 * instructions (one will still be inserted at v3d_vir_to_qpu() for the 1996 * program end). 
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        /* Safe iteration: we delete instructions while walking. */
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

/* Ensures the program ends with a correctly-flagged last THRSW, or demotes
 * the shader to single-threaded when threading isn't possible.
 */
void
vir_emit_last_thrsw(struct v3d_compile *c)
{
        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                return;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used.  It should be harmless to set even if we only use
 * center W for varyings.
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        /* Payload W only exists for fragment shaders. */
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        /* QFILE_REG index 0 is the payload W register (see nir_to_vir()'s
         * payload setup); any source reading it means center W is used.
         */
        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }

}

/* Top-level entry point: translates the NIR shader in c->s to VIR, runs the
 * VIR optimization and uniform-lowering passes, register-allocates (reducing
 * thread count on failure), and finally emits QPU code.
 */
void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        /* Emit the last THRSW before STVPM and TLB writes. */
        vir_emit_last_thrsw(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);
        vir_lower_uniforms(c);

        vir_check_payload_w(c);

        /* XXX: vir_schedule_instructions(c); */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                /* Spilling changed the program; retry allocation from
                 * scratch at the same thread count.
                 */
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads) {
                        fprintf(stderr, "Failed to register allocate at %d threads:\n",
                                c->threads);
                        vir_dump(c);
                        c->failed = true;
                        return;
                }

                /* Halve the thread count to free up registers; at one
                 * thread, thread switches must be stripped out.
                 */
                c->threads /= 2;

                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        v3d_vir_to_qpu(c, temp_registers);
}