nir_to_vir.c revision b8e80941
/*
 * Copyright © 2016 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include <inttypes.h>
#include "util/u_format.h"
#include "util/u_math.h"
#include "util/u_memory.h"
#include "util/ralloc.h"
#include "util/hash_table.h"
#include "compiler/nir/nir.h"
#include "compiler/nir/nir_builder.h"
#include "common/v3d_device_info.h"
#include "v3d_compiler.h"

#define GENERAL_TMU_LOOKUP_PER_QUAD                 (0 << 7)
#define GENERAL_TMU_LOOKUP_PER_PIXEL                (1 << 7)
#define GENERAL_TMU_READ_OP_PREFETCH                (0 << 3)
#define GENERAL_TMU_READ_OP_CACHE_CLEAR             (1 << 3)
#define GENERAL_TMU_READ_OP_CACHE_FLUSH             (3 << 3)
#define GENERAL_TMU_READ_OP_CACHE_CLEAN             (3 << 3)
#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR         (4 << 3)
#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3)
#define GENERAL_TMU_READ_OP_ATOMIC_INC              (8 << 3)
#define GENERAL_TMU_READ_OP_ATOMIC_DEC              (9 << 3)
#define GENERAL_TMU_READ_OP_ATOMIC_NOT              (10 << 3)
#define GENERAL_TMU_READ_OP_READ                    (15 << 3)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I              (0 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I             (1 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC2                (2 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC3                (3 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_VEC4                (4 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI             (5 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI            (6 << 0)
#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI            (7 << 0)

#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP        (0 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP        (1 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG            (2 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG         (3 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN            (4 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX            (5 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN            (6 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX            (7 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_AND             (8 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_OR              (9 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR             (10 << 3)
#define GENERAL_TMU_WRITE_OP_WRITE                  (15 << 3)

#define V3D_TSY_SET_QUORUM          0
#define V3D_TSY_INC_WAITERS         1
#define V3D_TSY_DEC_WAITERS         2
#define V3D_TSY_INC_QUORUM          3
#define V3D_TSY_DEC_QUORUM          4
#define V3D_TSY_FREE_ALL            5
#define V3D_TSY_RELEASE             6
#define V3D_TSY_ACQUIRE             7
#define V3D_TSY_WAIT                8
#define V3D_TSY_WAIT_INC            9
#define V3D_TSY_WAIT_CHECK          10
#define V3D_TSY_WAIT_INC_CHECK      11
#define V3D_TSY_WAIT_CV             12
#define V3D_TSY_INC_SEMAPHORE       13
#define V3D_TSY_DEC_SEMAPHORE       14
#define V3D_TSY_SET_QUORUM_FREE_ALL 15

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;
}

static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
                return GENERAL_TMU_READ_OP_READ;
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_scratch:
                return GENERAL_TMU_WRITE_OP_WRITE;
        case nir_intrinsic_ssbo_atomic_add:
        case nir_intrinsic_shared_atomic_add:
                return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_shared_atomic_imin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_shared_atomic_umin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_shared_atomic_imax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_shared_atomic_umax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_shared_atomic_and:
                return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_shared_atomic_or:
                return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_shared_atomic_xor:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_shared_atomic_exchange:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_shared_atomic_comp_swap:
                return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch)
{
        /* XXX perf: We should turn add/sub of 1 to inc/dec.  Perhaps NIR
         * wants to have support for inc/dec?
         */

        uint32_t tmu_op = v3d_general_tmu_op(instr);
        bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                         instr->intrinsic == nir_intrinsic_store_scratch ||
                         instr->intrinsic == nir_intrinsic_store_shared);
        bool has_index = !is_shared_or_scratch;

        int offset_src;
        int tmu_writes = 1; /* address */
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                offset_src = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_ubo ||
                   instr->intrinsic == nir_intrinsic_load_scratch ||
                   instr->intrinsic == nir_intrinsic_load_shared) {
                offset_src = 0 + has_index;
        } else if (is_store) {
                offset_src = 1 + has_index;
                for (int i = 0; i < instr->num_components; i++) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[0], i));
                        tmu_writes++;
                }
        } else {
                offset_src = 0 + has_index;
                vir_MOV_dest(c,
                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             ntq_get_src(c, instr->src[1 + has_index], 0));
                tmu_writes++;
                if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[2 + has_index],
                                                 0));
                        tmu_writes++;
                }
        }

        bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
        uint32_t const_offset = 0;
        if (!dynamic_src)
                const_offset = nir_src_as_uint(instr->src[offset_src]);

        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
         * storing at the same time.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        struct qreg offset;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                const_offset += nir_intrinsic_base(instr);
                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                     v3d_unit_data_create(0, const_offset));
                const_offset = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                 * 1 (0 is gallium's constant buffer 0).
                 */
                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                     v3d_unit_data_create(index, const_offset));
                const_offset = 0;
        } else if (is_shared_or_scratch) {
                /* Shared and scratch variables have no buffer index, and all
                 * start from a common base that we set up at the start of
                 * dispatch.
                 */
                if (instr->intrinsic == nir_intrinsic_load_scratch ||
                    instr->intrinsic == nir_intrinsic_store_scratch) {
                        offset = c->spill_base;
                } else {
                        offset = c->cs_shared_offset;
                        const_offset += nir_intrinsic_base(instr);
                }
        } else {
                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                     nir_src_as_uint(instr->src[is_store ?
                                                                1 : 0]));
        }

        /* The spec says that for atomics, the TYPE field is ignored, but that
         * doesn't seem to be the case for CMPXCHG.  Just use the number of
         * tmud writes we did to decide the type (or choose "32bit" for atomic
         * reads, which has been fine.
         */
        int num_components;
        if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG)
                num_components = 2;
        else
                num_components = instr->num_components;

        uint32_t config = (0xffffff00 |
                           tmu_op |
                           GENERAL_TMU_LOOKUP_PER_PIXEL);
        if (num_components == 1) {
                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
        } else {
                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
        }

        if (vir_in_nonuniform_control_flow(c)) {
                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        struct qreg tmua;
        if (config == ~0)
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
        else
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);

        struct qinst *tmu;
        if (dynamic_src) {
                if (const_offset != 0) {
                        offset = vir_ADD(c, offset,
                                         vir_uniform_ui(c, const_offset));
                }
                tmu = vir_ADD_dest(c, tmua, offset,
                                   ntq_get_src(c, instr->src[offset_src], 0));
        } else {
                if (const_offset != 0) {
                        tmu = vir_ADD_dest(c, tmua, offset,
                                           vir_uniform_ui(c, const_offset));
                } else {
                        tmu = vir_MOV_dest(c, tmua, offset);
                }
        }

        if (config != ~0) {
                tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                     config);
        }

        if (vir_in_nonuniform_control_flow(c))
                vir_set_cond(tmu, V3D_QPU_COND_IFA);

        vir_emit_thrsw(c);

        /* Read the result, or wait for the TMU op to complete. */
        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));

        if (nir_intrinsic_dest_components(instr) == 0)
                vir_TMUWT(c);
}
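
/* A sketch of the config word assembled above, as encoded by the
 * GENERAL_TMU_* defines at the top of this file (an observation about this
 * code rather than a statement about the TMU spec):
 *
 *     0xffffff00 | tmu_op | GENERAL_TMU_LOOKUP_PER_PIXEL | lookup_type
 *
 * When every field is left at its default (config == ~0), the address is
 * written to plain TMUA instead of TMUAU, so no per-lookup config uniform is
 * consumed.
 */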

static struct qreg *
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
{
        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                          def->num_components);
        _mesa_hash_table_insert(c->def_ht, def, qregs);
        return qregs;
}

/**
 * This function is responsible for getting VIR results into the associated
 * storage for a NIR instruction.
 *
 * If it's a NIR SSA def, then we just set the associated hash table entry to
 * the new result.
 *
 * If it's a NIR reg, then we need to update the existing qreg assigned to the
 * NIR destination with the incoming value.  To do that without introducing
 * new MOVs, we require that the incoming qreg either be a uniform, or be
 * SSA-defined by the previous VIR instruction in the block and rewritable by
 * this function.  That lets us sneak ahead and insert the SF flag beforehand
 * (knowing that the previous instruction doesn't depend on flags) and rewrite
 * its destination to be the NIR reg's destination
 */
void
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
               struct qreg result)
{
        struct qinst *last_inst = NULL;
        if (!list_empty(&c->cur_block->instructions))
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

        assert((result.file == QFILE_TEMP &&
                last_inst && last_inst == c->defs[result.index]));

        if (dest->is_ssa) {
                assert(chan < dest->ssa.num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, &dest->ssa);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, &dest->ssa);

                qregs[chan] = result;
        } else {
                nir_register *reg = dest->reg.reg;
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                struct qreg *qregs = entry->data;

                /* Insert a MOV if the source wasn't an SSA def in the
                 * previous instruction.
                 */
                if ((vir_in_nonuniform_control_flow(c) &&
                     c->defs[last_inst->dst.index]->qpu.sig.ldunif)) {
                        result = vir_MOV(c, result);
                        last_inst = c->defs[result.index];
                }

                /* We know they're both temps, so just rewrite index. */
                c->defs[last_inst->dst.index] = NULL;
                last_inst->dst.index = qregs[chan].index;

                /* If we're in control flow, then make this update of the reg
                 * conditional on the execution mask.
                 */
                if (vir_in_nonuniform_control_flow(c)) {
                        last_inst->dst.index = qregs[chan].index;

                        /* Set the flags to the current exec mask. */
                        c->cursor = vir_before_inst(last_inst);
                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                   V3D_QPU_PF_PUSHZ);
                        c->cursor = vir_after_inst(last_inst);

                        vir_set_cond(last_inst, V3D_QPU_COND_IFA);
                }
        }
}

struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
        struct hash_entry *entry;
        if (src.is_ssa) {
                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                assert(i < src.ssa->num_components);
        } else {
                nir_register *reg = src.reg.reg;
                entry = _mesa_hash_table_search(c->def_ht, reg);
                assert(reg->num_array_elems == 0);
                assert(src.reg.base_offset == 0);
                assert(i < reg->num_components);
        }

        struct qreg *qregs = entry->data;
        return qregs[i];
}

static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
                unsigned src)
{
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        unsigned chan = ffs(instr->dest.write_mask) - 1;
        struct qreg r = ntq_get_src(c, instr->src[src].src,
                                    instr->src[src].swizzle[chan]);

        assert(!instr->src[src].abs);
        assert(!instr->src[src].negate);

        return r;
};

static struct qreg
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
{
        return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
}

static void
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;
        int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
        int dest_size = nir_tex_instr_dest_size(instr);

        struct qreg lod = c->undef;
        if (lod_index != -1)
                lod = ntq_get_src(c, instr->src[lod_index].src, 0);

        for (int i = 0; i < dest_size; i++) {
                assert(i < 3);
                enum quniform_contents contents;

                if (instr->is_array && i == dest_size - 1)
                        contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
                else
                        contents = QUNIFORM_TEXTURE_WIDTH + i;

                struct qreg size = vir_uniform(c, contents, unit);

                switch (instr->sampler_dim) {
                case GLSL_SAMPLER_DIM_1D:
                case GLSL_SAMPLER_DIM_2D:
                case GLSL_SAMPLER_DIM_MS:
                case GLSL_SAMPLER_DIM_3D:
                case GLSL_SAMPLER_DIM_CUBE:
                        /* Don't minify the array size. */
                        if (!(instr->is_array && i == dest_size - 1)) {
                                size = ntq_minify(c, size, lod);
                        }
                        break;

                case GLSL_SAMPLER_DIM_RECT:
                        /* There's no LOD field for rects */
                        break;

                default:
                        unreachable("Bad sampler type");
                }

                ntq_store_dest(c, &instr->dest, i, size);
        }
}

static void
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;

        /* Since each texture sampling op requires uploading uniforms to
         * reference the texture, there's no HW support for texture size and
         * you just upload uniforms containing the size.
         */
        switch (instr->op) {
        case nir_texop_query_levels:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
                return;
        case nir_texop_txs:
                ntq_emit_txs(c, instr);
                return;
        default:
                break;
        }

        if (c->devinfo->ver >= 40)
                v3d40_vir_emit_tex(c, instr);
        else
                v3d33_vir_emit_tex(c, instr);
}

static struct qreg
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
{
        struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
        if (is_cos)
                input = vir_FADD(c, input, vir_uniform_f(c, 0.5));

        struct qreg periods = vir_FROUND(c, input);
        struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods));
        return vir_XOR(c, sin_output, vir_SHL(c,
                                              vir_FTOIN(c, periods),
                                              vir_uniform_ui(c, -1)));
}
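
/* A sketch of the math above (an observation about this code, not taken from
 * the HW documentation): the input is scaled by 1/pi (plus half a period for
 * cosine), FROUND picks the nearest whole period, and SIN evaluates the
 * remaining fraction of a period.  The final XOR with FTOIN(periods) shifted
 * left by -1 presumably relies on the QPU masking shift counts to five bits,
 * i.e. a shift by 31 that moves the period count's parity bit into the float
 * sign bit, flipping the sign for odd periods.
 */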

static struct qreg
ntq_fsign(struct v3d_compile *c, struct qreg src)
{
        struct qreg t = vir_get_temp(c);

        vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
        vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
        vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
        return vir_MOV(c, t);
}

static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = vir_FXCD(c);
        c->inputs[attr * 4 + 1] = vir_FYCD(c);
        c->inputs[attr * 4 + 2] = c->payload_z;
        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
}

static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                      uint8_t swizzle, int array_index)
{
        struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
        struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);

        struct qreg vary;
        if (c->devinfo->ver >= 41) {
                struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                                    c->undef, c->undef);
                ldvary->qpu.sig.ldvary = true;
                vary = vir_emit_def(c, ldvary);
        } else {
                vir_NOP(c)->qpu.sig.ldvary = true;
                vary = r3;
        }

        /* For gl_PointCoord input or distance along a line, we'll be called
         * with no nir_variable, and we don't count toward VPM size so we
         * don't track an input slot.
         */
        if (!var) {
                return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
        }

        int i = c->num_inputs++;
        c->input_slots[i] =
                v3d_slot_from_slot_and_component(var->data.location +
                                                 array_index, swizzle);

        switch (var->data.interpolation) {
        case INTERP_MODE_NONE:
                /* If a gl_FrontColor or gl_BackColor input has no interp
                 * qualifier, then if we're using glShadeModel(GL_FLAT) it
                 * needs to be flat shaded.
                 */
                switch (var->data.location + array_index) {
                case VARYING_SLOT_COL0:
                case VARYING_SLOT_COL1:
                case VARYING_SLOT_BFC0:
                case VARYING_SLOT_BFC1:
                        if (c->fs_key->shade_model_flat) {
                                BITSET_SET(c->flat_shade_flags, i);
                                vir_MOV_dest(c, c->undef, vary);
                                return vir_MOV(c, r5);
                        } else {
                                return vir_FADD(c, vir_FMUL(c, vary,
                                                            c->payload_w), r5);
                        }
                default:
                        break;
                }
                /* FALLTHROUGH */
        case INTERP_MODE_SMOOTH:
                if (var->data.centroid) {
                        BITSET_SET(c->centroid_flags, i);
                        return vir_FADD(c, vir_FMUL(c, vary,
                                                    c->payload_w_centroid), r5);
                } else {
                        return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
                }
        case INTERP_MODE_NOPERSPECTIVE:
                BITSET_SET(c->noperspective_flags, i);
                return vir_FADD(c, vir_MOV(c, vary), r5);
        case INTERP_MODE_FLAT:
                BITSET_SET(c->flat_shade_flags, i);
                vir_MOV_dest(c, c->undef, vary);
                return vir_MOV(c, r5);
        default:
                unreachable("Bad interp mode");
        }
}

static void
emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
                    int array_index)
{
        for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
                int chan = var->data.location_frac + i;
                c->inputs[attr * 4 + chan] =
                        emit_fragment_varying(c, var, chan, array_index);
        }
}

static void
add_output(struct v3d_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct v3d_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset] =
                v3d_slot_from_slot_and_component(slot, swizzle);
}

/**
 * If compare_instr is a valid comparison instruction, emits the
 * compare_instr's comparison and returns the sel_instr's return value based
 * on the compare_instr's result.
 */
static bool
ntq_emit_comparison(struct v3d_compile *c,
                    nir_alu_instr *compare_instr,
                    enum v3d_qpu_cond *out_cond)
{
        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1;
        if (nir_op_infos[compare_instr->op].num_inputs > 1)
                src1 = ntq_get_alu_src(c, compare_instr, 1);
        bool cond_invert = false;
        struct qreg nop = vir_nop_reg();

        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_seq:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;
        case nir_op_ieq32:
                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;

        case nir_op_fne32:
        case nir_op_sne:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;
        case nir_op_ine32:
                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_fge32:
        case nir_op_sge:
                vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ige32:
                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;
        case nir_op_uge32:
                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;

        case nir_op_slt:
        case nir_op_flt32:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                break;
        case nir_op_ilt32:
                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ult32:
                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                break;

        case nir_op_i2b32:
                vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_f2b32:
                vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        default:
                return false;
        }

        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;

        return true;
}

/* Finds an ALU instruction that generates our src value that could
 * (potentially) be greedily emitted in the consuming instruction.
 */
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
        if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
                return NULL;
        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
        if (!instr)
                return NULL;

        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
         * moving emission of the ALU instr down past another write of the
         * src.
         */
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                if (!instr->src[i].src.is_ssa)
                        return NULL;
        }

        return instr;
}

/* Turns a NIR bool into a condition code to predicate on. */
static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
        nir_alu_instr *compare = ntq_get_alu_parent(src);
        if (!compare)
                goto out;

        enum v3d_qpu_cond cond;
        if (ntq_emit_comparison(c, compare, &cond))
                return cond;

out:
        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
                   V3D_QPU_PF_PUSHZ);
        return V3D_QPU_COND_IFNA;
}

static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
        /* This should always be lowered to ALU operations for V3D. */
        assert(!instr->dest.saturate);

        /* Vectors are special in that they have non-scalarized writemasks,
         * and just take the first swizzle channel for each argument in order
         * into each writemask channel.
         */
        if (instr->op == nir_op_vec2 ||
            instr->op == nir_op_vec3 ||
            instr->op == nir_op_vec4) {
                struct qreg srcs[4];
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        srcs[i] = ntq_get_src(c, instr->src[i].src,
                                              instr->src[i].swizzle[0]);
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        ntq_store_dest(c, &instr->dest.dest, i,
                                       vir_MOV(c, srcs[i]));
                return;
        }

        /* General case: We can just grab the one used channel per src. */
        struct qreg src[nir_op_infos[instr->op].num_inputs];
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                src[i] = ntq_get_alu_src(c, instr, i);
        }

        struct qreg result;

        switch (instr->op) {
        case nir_op_fmov:
        case nir_op_imov:
                result = vir_MOV(c, src[0]);
                break;

        case nir_op_fneg:
                result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31));
                break;
        case nir_op_ineg:
                result = vir_NEG(c, src[0]);
                break;

        case nir_op_fmul:
                result = vir_FMUL(c, src[0], src[1]);
                break;
        case nir_op_fadd:
                result = vir_FADD(c, src[0], src[1]);
                break;
        case nir_op_fsub:
                result = vir_FSUB(c, src[0], src[1]);
                break;
        case nir_op_fmin:
                result = vir_FMIN(c, src[0], src[1]);
                break;
        case nir_op_fmax:
                result = vir_FMAX(c, src[0], src[1]);
                break;

        case nir_op_f2i32: {
                nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src);
                if (src0_alu && src0_alu->op == nir_op_fround_even) {
                        result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0));
                } else {
                        result = vir_FTOIZ(c, src[0]);
                }
                break;
        }

        case nir_op_f2u32:
                result = vir_FTOUZ(c, src[0]);
                break;
        case nir_op_i2f32:
                result = vir_ITOF(c, src[0]);
                break;
        case nir_op_u2f32:
                result = vir_UTOF(c, src[0]);
                break;
        case nir_op_b2f32:
                result = vir_AND(c, src[0], vir_uniform_f(c, 1.0));
                break;
        case nir_op_b2i32:
                result = vir_AND(c, src[0], vir_uniform_ui(c, 1));
                break;

        case nir_op_iadd:
                result = vir_ADD(c, src[0], src[1]);
                break;
        case nir_op_ushr:
                result = vir_SHR(c, src[0], src[1]);
                break;
        case nir_op_isub:
                result = vir_SUB(c, src[0], src[1]);
                break;
        case nir_op_ishr:
                result = vir_ASR(c, src[0], src[1]);
                break;
        case nir_op_ishl:
                result = vir_SHL(c, src[0], src[1]);
                break;
        case nir_op_imin:
                result = vir_MIN(c, src[0], src[1]);
                break;
        case nir_op_umin:
                result = vir_UMIN(c, src[0], src[1]);
                break;
        case nir_op_imax:
                result = vir_MAX(c, src[0], src[1]);
                break;
        case nir_op_umax:
                result = vir_UMAX(c, src[0], src[1]);
                break;
        case nir_op_iand:
                result = vir_AND(c, src[0], src[1]);
                break;
        case nir_op_ior:
                result = vir_OR(c, src[0], src[1]);
                break;
        case nir_op_ixor:
                result = vir_XOR(c, src[0], src[1]);
                break;
        case nir_op_inot:
                result = vir_NOT(c, src[0]);
                break;

        case nir_op_ufind_msb:
                result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0]));
                break;

        case nir_op_imul:
                result = vir_UMUL(c, src[0], src[1]);
                break;

        case nir_op_seq:
        case nir_op_sne:
        case nir_op_sge:
        case nir_op_slt: {
                enum v3d_qpu_cond cond;
                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
                assert(ok);
                result = vir_MOV(c, vir_SEL(c, cond,
                                            vir_uniform_f(c, 1.0),
                                            vir_uniform_f(c, 0.0)));
                break;
        }

        case nir_op_i2b32:
        case nir_op_f2b32:
        case nir_op_feq32:
        case nir_op_fne32:
        case nir_op_fge32:
        case nir_op_flt32:
        case nir_op_ieq32:
        case nir_op_ine32:
        case nir_op_ige32:
        case nir_op_uge32:
        case nir_op_ilt32:
        case nir_op_ult32: {
                enum v3d_qpu_cond cond;
                MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond);
                assert(ok);
                result = vir_MOV(c, vir_SEL(c, cond,
                                            vir_uniform_ui(c, ~0),
                                            vir_uniform_ui(c, 0)));
                break;
        }

        case nir_op_b32csel:
                result = vir_MOV(c,
                                 vir_SEL(c,
                                         ntq_emit_bool_to_cond(c, instr->src[0].src),
                                         src[1], src[2]));
                break;

        case nir_op_fcsel:
                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]),
                           V3D_QPU_PF_PUSHZ);
                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA,
                                            src[1], src[2]));
                break;

        case nir_op_frcp:
                result = vir_RECIP(c, src[0]);
                break;
        case nir_op_frsq:
                result = vir_RSQRT(c, src[0]);
                break;
        case nir_op_fexp2:
                result = vir_EXP(c, src[0]);
                break;
        case nir_op_flog2:
                result = vir_LOG(c, src[0]);
                break;

        case nir_op_fceil:
                result = vir_FCEIL(c, src[0]);
                break;
        case nir_op_ffloor:
                result = vir_FFLOOR(c, src[0]);
                break;
        case nir_op_fround_even:
                result = vir_FROUND(c, src[0]);
                break;
        case nir_op_ftrunc:
                result = vir_FTRUNC(c, src[0]);
                break;

        case nir_op_fsin:
                result = ntq_fsincos(c, src[0], false);
                break;
        case nir_op_fcos:
                result = ntq_fsincos(c, src[0], true);
                break;

        case nir_op_fsign:
                result = ntq_fsign(c, src[0]);
                break;

        case nir_op_fabs: {
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS);
                break;
        }

        case nir_op_iabs:
                result = vir_MAX(c, src[0], vir_NEG(c, src[0]));
                break;

        case nir_op_fddx:
        case nir_op_fddx_coarse:
        case nir_op_fddx_fine:
                result = vir_FDX(c, src[0]);
                break;

        case nir_op_fddy:
        case nir_op_fddy_coarse:
        case nir_op_fddy_fine:
                result = vir_FDY(c, src[0]);
                break;

        case nir_op_uadd_carry:
                vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]),
                           V3D_QPU_PF_PUSHC);
                result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
                                            vir_uniform_ui(c, ~0),
                                            vir_uniform_ui(c, 0)));
                break;

        case nir_op_pack_half_2x16_split:
                result = vir_VFPACK(c, src[0], src[1]);
                break;

        case nir_op_unpack_half_2x16_split_x:
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L);
                break;

        case nir_op_unpack_half_2x16_split_y:
                result = vir_FMOV(c, src[0]);
                vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H);
                break;

        default:
                fprintf(stderr, "unknown NIR ALU inst: ");
                nir_print_instr(&instr->instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }

        /* We have a scalar result, so the instruction should only have a
         * single channel written to.
         */
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        ntq_store_dest(c, &instr->dest.dest,
                       ffs(instr->dest.write_mask) - 1, result);
}

/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
 * specifier.  They come from a register that's preloaded with 0xffffffff
 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
 */
#define TLB_TYPE_F16_COLOR           (3 << 6)
#define TLB_TYPE_I32_COLOR           (1 << 6)
#define TLB_TYPE_F32_COLOR           (0 << 6)
#define TLB_RENDER_TARGET_SHIFT      3 /* Reversed!  7 = RT 0, 0 = RT 7. */
#define TLB_SAMPLE_MODE_PER_SAMPLE   (0 << 2)
#define TLB_SAMPLE_MODE_PER_PIXEL    (1 << 2)
#define TLB_F16_SWAP_HI_LO           (1 << 1)
#define TLB_VEC_SIZE_4_F16           (1 << 0)
#define TLB_VEC_SIZE_2_F16           (0 << 0)
#define TLB_VEC_SIZE_MINUS_1_SHIFT   0

/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
 * flag is set.
 */
#define TLB_TYPE_DEPTH               ((2 << 6) | (0 << 4))
#define TLB_DEPTH_TYPE_INVARIANT     (0 << 2) /* Unmodified sideband input used */
#define TLB_DEPTH_TYPE_PER_PIXEL     (1 << 2) /* QPU result used */
#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */

/* Stencil is a single 32-bit write. */
#define TLB_TYPE_STENCIL_ALPHA       ((2 << 6) | (1 << 4))

static void
emit_frag_end(struct v3d_compile *c)
{
        /* XXX
        if (c->output_sample_mask_index != -1) {
                vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
        }
        */

        bool has_any_tlb_color_write = false;
        for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
                if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
                        has_any_tlb_color_write = true;
        }

        if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
                struct nir_variable *var = c->output_color_var[0];
                struct qreg *color = &c->outputs[var->data.driver_location * 4];

                vir_SETMSF_dest(c, vir_nop_reg(),
                                vir_AND(c,
                                        vir_MSF(c),
                                        vir_FTOC(c, color[3])));
        }

        struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
        struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
        if (c->output_position_index != -1) {
                struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
                                                  c->outputs[c->output_position_index]);
                uint8_t tlb_specifier = TLB_TYPE_DEPTH;

                if (c->devinfo->ver >= 42) {
                        tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
                                          TLB_SAMPLE_MODE_PER_PIXEL);
                } else
                        tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;

                inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                      tlb_specifier |
                                                      0xffffff00);
                c->writes_z = true;
        } else if (c->s->info.fs.uses_discard ||
                   !c->s->info.fs.early_fragment_tests ||
                   c->fs_key->sample_alpha_to_coverage ||
                   !has_any_tlb_color_write) {
                /* Emit passthrough Z if it needed to be delayed until shader
                 * end due to potential discards.
                 *
                 * Since (single-threaded) fragment shaders always need a TLB
                 * write, emit passthrough Z if we didn't have any color
                 * buffers and flag us as potentially discarding, so that we
                 * can use Z as the TLB write.
                 */
                c->s->info.fs.uses_discard = true;

                struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
                                                  vir_nop_reg());
                uint8_t tlb_specifier = TLB_TYPE_DEPTH;

                if (c->devinfo->ver >= 42) {
                        /* The spec says the PER_PIXEL flag is ignored for
                         * invariant writes, but the simulator demands it.
                         */
                        tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT |
                                          TLB_SAMPLE_MODE_PER_PIXEL);
                } else {
                        tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT;
                }

                inst->uniform = vir_get_uniform_index(c,
                                                      QUNIFORM_CONSTANT,
                                                      tlb_specifier |
                                                      0xffffff00);
                c->writes_z = true;
        }

        /* XXX: Performance improvement: Merge Z write and color writes TLB
         * uniform setup
         */

        for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
                if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
                        continue;

                nir_variable *var = c->output_color_var[rt];
                struct qreg *color = &c->outputs[var->data.driver_location * 4];
                int num_components = glsl_get_vector_elements(var->type);
                uint32_t conf = 0xffffff00;
                struct qinst *inst;

                conf |= TLB_SAMPLE_MODE_PER_PIXEL;
                conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

                if (c->fs_key->swap_color_rb & (1 << rt))
                        num_components = MAX2(num_components, 3);

                assert(num_components != 0);
                switch (glsl_get_base_type(var->type)) {
                case GLSL_TYPE_UINT:
                case GLSL_TYPE_INT:
                        /* The F32 vs I32 distinction was dropped in 4.2. */
                        if (c->devinfo->ver < 42)
                                conf |= TLB_TYPE_I32_COLOR;
                        else
                                conf |= TLB_TYPE_F32_COLOR;
                        conf |= ((num_components - 1) <<
                                 TLB_VEC_SIZE_MINUS_1_SHIFT);

                        inst = vir_MOV_dest(c, tlbu_reg, color[0]);
                        inst->uniform = vir_get_uniform_index(c,
                                                              QUNIFORM_CONSTANT,
                                                              conf);

                        for (int i = 1; i < num_components; i++) {
                                inst = vir_MOV_dest(c, tlb_reg, color[i]);
                        }
                        break;

                default: {
                        struct qreg r = color[0];
                        struct qreg g = color[1];
                        struct qreg b = color[2];
                        struct qreg a = color[3];

                        if (c->fs_key->f32_color_rb & (1 << rt)) {
                                conf |= TLB_TYPE_F32_COLOR;
                                conf |= ((num_components - 1) <<
                                         TLB_VEC_SIZE_MINUS_1_SHIFT);
                        } else {
                                conf |= TLB_TYPE_F16_COLOR;
                                conf |= TLB_F16_SWAP_HI_LO;
                                if (num_components >= 3)
                                        conf |= TLB_VEC_SIZE_4_F16;
                                else
                                        conf |= TLB_VEC_SIZE_2_F16;
                        }

                        if (c->fs_key->swap_color_rb & (1 << rt)) {
                                r = color[2];
                                b = color[0];
                        }

                        if (c->fs_key->sample_alpha_to_one)
                                a = vir_uniform_f(c, 1.0);

                        if (c->fs_key->f32_color_rb & (1 << rt)) {
                                inst = vir_MOV_dest(c, tlbu_reg, r);
                                inst->uniform = vir_get_uniform_index(c,
                                                                      QUNIFORM_CONSTANT,
                                                                      conf);

                                if (num_components >= 2)
                                        vir_MOV_dest(c, tlb_reg, g);
                                if (num_components >= 3)
                                        vir_MOV_dest(c, tlb_reg, b);
                                if (num_components >= 4)
                                        vir_MOV_dest(c, tlb_reg, a);
                        } else {
                                inst = vir_VFPACK_dest(c, tlb_reg, r, g);
                                if (conf != ~0) {
                                        inst->dst = tlbu_reg;
                                        inst->uniform = vir_get_uniform_index(c,
                                                                              QUNIFORM_CONSTANT,
                                                                              conf);
                                }

                                if (num_components >= 3)
                                        inst = vir_VFPACK_dest(c, tlb_reg, b, a);
                        }
                        break;
                }
                }
        }
}
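
/* For reference (derived from emit_frag_end() above rather than from the TLB
 * documentation): a render-target write's config byte ends up as
 *
 *     0xffffff00 | TLB_SAMPLE_MODE_PER_PIXEL |
 *     ((7 - rt) << TLB_RENDER_TARGET_SHIFT) | type bits | vec-size bits
 *
 * with the first write for each RT going to TLBU to pick up that uniform and
 * the remaining components going to plain TLB.
 */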

static void
vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index)
{
        if (c->devinfo->ver >= 40) {
                vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val);
        } else {
                /* XXX: v3d33_vir_vpm_write_setup(c); */
                vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val);
        }
}

static void
emit_vert_end(struct v3d_compile *c)
{
        /* GFXH-1684: VPM writes need to be complete by the end of the shader.
         */
        if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42)
                vir_VPMWT(c);
}

void
v3d_optimize_nir(struct nir_shader *s)
{
        bool progress;

        do {
                progress = false;

                NIR_PASS_V(s, nir_lower_vars_to_ssa);
                NIR_PASS(progress, s, nir_lower_alu_to_scalar);
                NIR_PASS(progress, s, nir_lower_phis_to_scalar);
                NIR_PASS(progress, s, nir_copy_prop);
                NIR_PASS(progress, s, nir_opt_remove_phis);
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);
                NIR_PASS(progress, s, nir_opt_undef);
        } while (progress);

        NIR_PASS(progress, s, nir_opt_move_load_ubo);
}

static int
driver_location_compare(const void *in_a, const void *in_b)
{
        const nir_variable *const *a = in_a;
        const nir_variable *const *b = in_b;

        return (*a)->data.driver_location - (*b)->data.driver_location;
}

static struct qreg
ntq_emit_vpm_read(struct v3d_compile *c,
                  uint32_t *num_components_queued,
                  uint32_t *remaining,
                  uint32_t vpm_index)
{
        struct qreg vpm = vir_reg(QFILE_VPM, vpm_index);

        if (c->devinfo->ver >= 40) {
                return vir_LDVPMV_IN(c,
                                     vir_uniform_ui(c,
                                                    (*num_components_queued)++));
        }

        if (*num_components_queued != 0) {
                (*num_components_queued)--;
                return vir_MOV(c, vpm);
        }

        uint32_t num_components = MIN2(*remaining, 32);

        v3d33_vir_vpm_read_setup(c, num_components);

        *num_components_queued = num_components - 1;
        *remaining -= num_components;

        return vir_MOV(c, vpm);
}

static void
ntq_setup_vpm_inputs(struct v3d_compile *c)
{
        /* Figure out how many components of each vertex attribute the shader
         * uses.  Each variable should have been split to individual
         * components and unused ones DCEed.  The vertex fetcher will load
         * from the start of the attribute to the number of components we
         * declare we need in c->vattr_sizes[].
         */
        nir_foreach_variable(var, &c->s->inputs) {
                /* No VS attribute array support. */
                assert(MAX2(glsl_get_length(var->type), 1) == 1);

                unsigned loc = var->data.driver_location;
                int start_component = var->data.location_frac;
                int num_components = glsl_get_components(var->type);

                c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc],
                                           start_component + num_components);
        }

        unsigned num_components = 0;
        uint32_t vpm_components_queued = 0;
        bool uses_iid = c->s->info.system_values_read &
                (1ull << SYSTEM_VALUE_INSTANCE_ID);
        bool uses_vid = c->s->info.system_values_read &
                (1ull << SYSTEM_VALUE_VERTEX_ID);
        num_components += uses_iid;
        num_components += uses_vid;

        for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++)
                num_components += c->vattr_sizes[i];

        if (uses_iid) {
                c->iid = ntq_emit_vpm_read(c, &vpm_components_queued,
                                           &num_components, ~0);
        }

        if (uses_vid) {
                c->vid = ntq_emit_vpm_read(c, &vpm_components_queued,
                                           &num_components, ~0);
        }

        /* The actual loads will happen directly in nir_intrinsic_load_input
         * on newer versions.
         */
        if (c->devinfo->ver >= 40)
                return;

        for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) {
                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                  (loc + 1) * 4);

                for (int i = 0; i < c->vattr_sizes[loc]; i++) {
                        c->inputs[loc * 4 + i] =
                                ntq_emit_vpm_read(c,
                                                  &vpm_components_queued,
                                                  &num_components,
                                                  loc * 4 + i);

                }
        }

        if (c->devinfo->ver >= 40) {
                assert(vpm_components_queued == num_components);
        } else {
                assert(vpm_components_queued == 0);
                assert(num_components == 0);
        }
}

static void
ntq_setup_fs_inputs(struct v3d_compile *c)
{
        unsigned num_entries = 0;
        unsigned num_components = 0;
        nir_foreach_variable(var, &c->s->inputs) {
                num_entries++;
                num_components += glsl_get_components(var->type);
        }

        nir_variable *vars[num_entries];

        unsigned i = 0;
        nir_foreach_variable(var, &c->s->inputs)
                vars[i++] = var;

        /* Sort the variables so that we emit the input setup in
         * driver_location order.  This is required for VPM reads, whose data
         * is fetched into the VPM in driver_location (TGSI register index)
         * order.
         */
        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);

        for (unsigned i = 0; i < num_entries; i++) {
                nir_variable *var = vars[i];
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                unsigned loc = var->data.driver_location;

                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                  (loc + array_len) * 4);

                if (var->data.location == VARYING_SLOT_POS) {
                        emit_fragcoord_input(c, loc);
                } else if (var->data.location == VARYING_SLOT_PNTC ||
                           (var->data.location >= VARYING_SLOT_VAR0 &&
                            (c->fs_key->point_sprite_mask &
                             (1 << (var->data.location -
                                    VARYING_SLOT_VAR0))))) {
                        c->inputs[loc * 4 + 0] = c->point_x;
                        c->inputs[loc * 4 + 1] = c->point_y;
                } else {
                        for (int j = 0; j < array_len; j++)
                                emit_fragment_input(c, loc + j, var, j);
                }
        }
}

static void
ntq_setup_outputs(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        nir_foreach_variable(var, &c->s->outputs) {
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                unsigned loc = var->data.driver_location * 4;

                assert(array_len == 1);
                (void)array_len;

                for (int i = 0; i < 4 - var->data.location_frac; i++) {
                        add_output(c, loc + var->data.location_frac + i,
                                   var->data.location,
                                   var->data.location_frac + i);
                }

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        c->output_color_var[0] = var;
                        c->output_color_var[1] = var;
                        c->output_color_var[2] = var;
                        c->output_color_var[3] = var;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        c->output_color_var[var->data.location -
                                            FRAG_RESULT_DATA0] = var;
                        break;
                case FRAG_RESULT_DEPTH:
                        c->output_position_index = loc;
                        break;
                case FRAG_RESULT_SAMPLE_MASK:
                        c->output_sample_mask_index = loc;
                        break;
                }
        }
}

/**
 * Sets up the mapping from nir_register to struct qreg *.
 *
 * Each nir_register gets a struct qreg per 32-bit component being stored.
 */
static void
ntq_setup_registers(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_register, nir_reg, node, list) {
                unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
                struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                                  array_len *
                                                  nir_reg->num_components);

                _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);

                for (int i = 0; i < array_len * nir_reg->num_components; i++)
                        qregs[i] = vir_get_temp(c);
        }
}

static void
ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr)
{
        /* XXX perf: Experiment with using immediate loads to avoid having
         * these end up in the uniform stream.  Watch out for breaking the
         * small immediates optimization in the process!
         */
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = vir_uniform_ui(c, instr->value[i].u32);

        _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}

static void
ntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr)
{
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);

        /* VIR needs there to be *some* value, so pick 0 (same as for
         * ntq_setup_registers().
         */
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = vir_uniform_ui(c, 0);
}

static void
ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        assert(instr->intrinsic == nir_intrinsic_image_deref_size);
        nir_variable *var = nir_intrinsic_get_var(instr, 0);
        unsigned image_index = var->data.driver_location;
        const struct glsl_type *sampler_type = glsl_without_array(var->type);
        bool is_array = glsl_sampler_type_is_array(sampler_type);

        ntq_store_dest(c, &instr->dest, 0,
                       vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index));
        if (instr->num_components > 1) {
                ntq_store_dest(c, &instr->dest, 1,
                               vir_uniform(c, QUNIFORM_IMAGE_HEIGHT,
                                           image_index));
        }
        if (instr->num_components > 2) {
                ntq_store_dest(c, &instr->dest, 2,
                               vir_uniform(c, is_array ?
                                              QUNIFORM_IMAGE_ARRAY_SIZE :
                                              QUNIFORM_IMAGE_DEPTH,
                                           image_index));
        }
}

static void
ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr)
{
        unsigned offset;

        switch (instr->intrinsic) {
        case nir_intrinsic_load_uniform:
                if (nir_src_is_const(instr->src[0])) {
                        int offset = (nir_intrinsic_base(instr) +
                                      nir_src_as_uint(instr->src[0]));
                        assert(offset % 4 == 0);
                        /* We need dwords */
                        offset = offset / 4;
                        for (int i = 0; i < instr->num_components; i++) {
                                ntq_store_dest(c, &instr->dest, i,
                                               vir_uniform(c, QUNIFORM_UNIFORM,
                                                           offset + i));
                        }
                } else {
                        ntq_emit_tmu_general(c, instr, false);
                }
                break;

        case nir_intrinsic_load_ubo:
                ntq_emit_tmu_general(c, instr, false);
                break;

        case nir_intrinsic_ssbo_atomic_add:
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_store_ssbo:
                ntq_emit_tmu_general(c, instr, false);
                break;

        case nir_intrinsic_shared_atomic_add:
        case nir_intrinsic_shared_atomic_imin:
        case nir_intrinsic_shared_atomic_umin:
        case nir_intrinsic_shared_atomic_imax:
        case nir_intrinsic_shared_atomic_umax:
        case nir_intrinsic_shared_atomic_and:
        case nir_intrinsic_shared_atomic_or:
        case nir_intrinsic_shared_atomic_xor:
        case nir_intrinsic_shared_atomic_exchange:
        case nir_intrinsic_shared_atomic_comp_swap:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_load_scratch:
        case nir_intrinsic_store_scratch:
                ntq_emit_tmu_general(c, instr, true);
                break;

        case nir_intrinsic_image_deref_load:
        case nir_intrinsic_image_deref_store:
        case nir_intrinsic_image_deref_atomic_add:
        case nir_intrinsic_image_deref_atomic_min:
        case nir_intrinsic_image_deref_atomic_max:
        case nir_intrinsic_image_deref_atomic_and:
        case nir_intrinsic_image_deref_atomic_or:
        case nir_intrinsic_image_deref_atomic_xor:
        case nir_intrinsic_image_deref_atomic_exchange:
        case nir_intrinsic_image_deref_atomic_comp_swap:
                v3d40_vir_emit_image_load_store(c, instr);
                break;

        case nir_intrinsic_get_buffer_size:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE,
                                           nir_src_as_uint(instr->src[0])));
                break;

        case nir_intrinsic_load_user_clip_plane:
                for (int i = 0; i < instr->num_components; i++) {
                        ntq_store_dest(c, &instr->dest, i,
                                       vir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
                                                   nir_intrinsic_ucp_id(instr) *
                                                   4 + i));
                }
                break;

        case nir_intrinsic_load_viewport_x_scale:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0));
                break;

        case nir_intrinsic_load_viewport_y_scale:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0));
                break;

        case nir_intrinsic_load_viewport_z_scale:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0));
                break;

        case nir_intrinsic_load_viewport_z_offset:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0));
                break;

        case nir_intrinsic_load_alpha_ref_float:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_ALPHA_REF, 0));
                break;

        case nir_intrinsic_load_sample_mask_in:
                ntq_store_dest(c, &instr->dest, 0, vir_MSF(c));
                break;

        case nir_intrinsic_load_helper_invocation:
                vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ);
                ntq_store_dest(c, &instr->dest, 0,
                               vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA,
                                                  vir_uniform_ui(c, ~0),
                                                  vir_uniform_ui(c, 0))));
                break;

        case nir_intrinsic_load_front_face:
                /* The register contains 0 (front) or 1 (back), and we need to
                 * turn it into a NIR bool where true means front.
                 */
                ntq_store_dest(c, &instr->dest, 0,
                               vir_ADD(c,
                                       vir_uniform_ui(c, -1),
                                       vir_REVF(c)));
                break;

        case nir_intrinsic_load_instance_id:
                ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid));
                break;

        case nir_intrinsic_load_vertex_id:
                ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid));
                break;

        case nir_intrinsic_load_input:
                /* Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset)
                 * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR.
                 */
                offset = (nir_intrinsic_base(instr) +
                          nir_src_as_uint(instr->src[0]));
                if (c->s->info.stage != MESA_SHADER_FRAGMENT &&
                    c->devinfo->ver >= 40) {
                        /* Emit the LDVPM directly now, rather than at the top
                         * of the shader like we did for V3D 3.x (which needs
                         * vpmsetup when not just taking the next offset).
                         *
                         * Note that delaying like this may introduce stalls,
                         * as LDVPMV takes a minimum of 1 instruction but may
                         * be slower if the VPM unit is busy with another QPU.
                         */
                        int index = 0;
                        if (c->s->info.system_values_read &
                            (1ull << SYSTEM_VALUE_INSTANCE_ID)) {
                                index++;
                        }
                        if (c->s->info.system_values_read &
                            (1ull << SYSTEM_VALUE_VERTEX_ID)) {
                                index++;
                        }
                        for (int i = 0; i < offset; i++)
                                index += c->vattr_sizes[i];
                        index += nir_intrinsic_component(instr);
                        for (int i = 0; i < instr->num_components; i++) {
                                struct qreg vpm_offset =
                                        vir_uniform_ui(c, index++);
                                ntq_store_dest(c, &instr->dest, i,
                                               vir_LDVPMV_IN(c, vpm_offset));
                        }
                } else {
                        for (int i = 0; i < instr->num_components; i++) {
                                int comp = nir_intrinsic_component(instr) + i;
                                ntq_store_dest(c, &instr->dest, i,
                                               vir_MOV(c, c->inputs[offset * 4 +
                                                                    comp]));
                        }
                }
                break;

        case nir_intrinsic_store_output:
                /* XXX perf: Use stvpmv with uniform non-constant offsets and
                 * stvpmd with non-uniform offsets and enable
                 * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR.
                 */
                if (c->s->info.stage == MESA_SHADER_FRAGMENT) {
                        offset = ((nir_intrinsic_base(instr) +
                                   nir_src_as_uint(instr->src[1])) * 4 +
                                  nir_intrinsic_component(instr));
                        for (int i = 0; i < instr->num_components; i++) {
                                c->outputs[offset + i] =
                                        vir_MOV(c,
                                                ntq_get_src(c,
                                                            instr->src[0], i));
                        }
                } else {
                        assert(instr->num_components == 1);

                        vir_VPM_WRITE(c,
                                      ntq_get_src(c, instr->src[0], 0),
                                      nir_intrinsic_base(instr));
                }
                break;

        case nir_intrinsic_image_deref_size:
                ntq_emit_image_size(c, instr);
                break;

        case nir_intrinsic_discard:
                if (vir_in_nonuniform_control_flow(c)) {
                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                   V3D_QPU_PF_PUSHZ);
                        vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
                                                     vir_uniform_ui(c, 0)),
                                     V3D_QPU_COND_IFA);
                } else {
                        vir_SETMSF_dest(c, vir_nop_reg(),
                                        vir_uniform_ui(c, 0));
                }
                break;

        case nir_intrinsic_discard_if: {
                enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]);

                if (vir_in_nonuniform_control_flow(c)) {
                        struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(),
                                                               c->execute);
                        if (cond == V3D_QPU_COND_IFA) {
                                vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ);
                        } else {
                                vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ);
                                cond = V3D_QPU_COND_IFA;
                        }
                }

                vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(),
                                             vir_uniform_ui(c, 0)), cond);

                break;
        }

        case nir_intrinsic_memory_barrier:
        case nir_intrinsic_memory_barrier_atomic_counter:
        case nir_intrinsic_memory_barrier_buffer:
        case nir_intrinsic_memory_barrier_image:
        case nir_intrinsic_memory_barrier_shared:
        case nir_intrinsic_group_memory_barrier:
                /* We don't do any instruction scheduling of these NIR
                 * instructions between each other, so we just need to make
                 * sure that the TMU operations before the barrier are flushed
                 * before the ones after the barrier.  That is currently
                 * handled by having a THRSW in each of them and a LDTMU
                 * series or a TMUWT after.
                 */
                break;

        case nir_intrinsic_barrier:
                /* Emit a TSY op to get all invocations in the workgroup
                 * (actually supergroup) to block until the last invocation
                 * reaches the TSY op.
                 */
                if (c->devinfo->ver >= 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNCB));
                } else {
                        struct qinst *sync =
                                vir_BARRIERID_dest(c,
                                                   vir_reg(QFILE_MAGIC,
                                                           V3D_QPU_WADDR_SYNCU));
                        sync->uniform =
                                vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                      0xffffff00 |
                                                      V3D_TSY_WAIT_INC_CHECK);

                }

                /* The blocking of a TSY op only happens at the next thread
                 * switch.  No texturing may be outstanding at the time of a
                 * TSY blocking operation.
                 */
                vir_emit_thrsw(c);
                break;

        case nir_intrinsic_load_num_work_groups:
                for (int i = 0; i < 3; i++) {
                        ntq_store_dest(c, &instr->dest, i,
                                       vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS,
                                                   i));
                }
                break;

        case nir_intrinsic_load_local_invocation_index:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_SHR(c, c->cs_payload[1],
                                       vir_uniform_ui(c, 32 - c->local_invocation_index_bits)));
                break;

        case nir_intrinsic_load_work_group_id:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_AND(c, c->cs_payload[0],
                                       vir_uniform_ui(c, 0xffff)));
                ntq_store_dest(c, &instr->dest, 1,
                               vir_SHR(c, c->cs_payload[0],
                                       vir_uniform_ui(c, 16)));
                ntq_store_dest(c, &instr->dest, 2,
                               vir_AND(c, c->cs_payload[1],
                                       vir_uniform_ui(c, 0xffff)));
                break;

        case nir_intrinsic_load_subgroup_id:
                ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c));
                break;

        default:
                fprintf(stderr, "Unknown intrinsic: ");
                nir_print_instr(&instr->instr, stderr);
                fprintf(stderr, "\n");
                break;
        }
}

/* Clears (activates) the execute flags for any channels whose jump target
 * matches this block.
 *
 * XXX perf: Could we be using flpush/flpop somehow for our execution channel
 * enabling?
 *
 * XXX perf: For uniform control flow, we should be able to skip c->execute
 * handling entirely.
 */
static void
ntq_activate_execute_for_block(struct v3d_compile *c)
{
        vir_set_pf(vir_XOR_dest(c, vir_nop_reg(),
                                c->execute, vir_uniform_ui(c, c->cur_block->index)),
                   V3D_QPU_PF_PUSHZ);

        vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0));
}
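
/* Convention used by the non-uniform control flow code below (as can be read
 * from ntq_emit_nonuniform_if() and ntq_activate_execute_for_block() rather
 * than stated anywhere authoritative): c->execute holds, per channel, the
 * index of the block that channel is waiting to execute, with 0 meaning
 * "currently active".  Conditional stores and discards therefore test
 * c->execute against zero with PUSHZ.
 */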
1882 */ 1883 vir_emit_thrsw(c); 1884 break; 1885 1886 case nir_intrinsic_load_num_work_groups: 1887 for (int i = 0; i < 3; i++) { 1888 ntq_store_dest(c, &instr->dest, i, 1889 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, 1890 i)); 1891 } 1892 break; 1893 1894 case nir_intrinsic_load_local_invocation_index: 1895 ntq_store_dest(c, &instr->dest, 0, 1896 vir_SHR(c, c->cs_payload[1], 1897 vir_uniform_ui(c, 32 - c->local_invocation_index_bits))); 1898 break; 1899 1900 case nir_intrinsic_load_work_group_id: 1901 ntq_store_dest(c, &instr->dest, 0, 1902 vir_AND(c, c->cs_payload[0], 1903 vir_uniform_ui(c, 0xffff))); 1904 ntq_store_dest(c, &instr->dest, 1, 1905 vir_SHR(c, c->cs_payload[0], 1906 vir_uniform_ui(c, 16))); 1907 ntq_store_dest(c, &instr->dest, 2, 1908 vir_AND(c, c->cs_payload[1], 1909 vir_uniform_ui(c, 0xffff))); 1910 break; 1911 1912 case nir_intrinsic_load_subgroup_id: 1913 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); 1914 break; 1915 1916 default: 1917 fprintf(stderr, "Unknown intrinsic: "); 1918 nir_print_instr(&instr->instr, stderr); 1919 fprintf(stderr, "\n"); 1920 break; 1921 } 1922} 1923 1924/* Clears (activates) the execute flags for any channels whose jump target 1925 * matches this block. 1926 * 1927 * XXX perf: Could we be using flpush/flpop somehow for our execution channel 1928 * enabling? 1929 * 1930 * XXX perf: For uniform control flow, we should be able to skip c->execute 1931 * handling entirely. 1932 */ 1933static void 1934ntq_activate_execute_for_block(struct v3d_compile *c) 1935{ 1936 vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), 1937 c->execute, vir_uniform_ui(c, c->cur_block->index)), 1938 V3D_QPU_PF_PUSHZ); 1939 1940 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 1941} 1942 1943static void 1944ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) 1945{ 1946 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1947 bool empty_else_block = 1948 (nir_else_block == nir_if_last_else_block(if_stmt) && 1949 exec_list_is_empty(&nir_else_block->instr_list)); 1950 1951 struct qblock *then_block = vir_new_block(c); 1952 struct qblock *after_block = vir_new_block(c); 1953 struct qblock *else_block; 1954 if (empty_else_block) 1955 else_block = after_block; 1956 else 1957 else_block = vir_new_block(c); 1958 1959 /* Set up the flags for the IF condition (taking the THEN branch). */ 1960 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 1961 1962 /* Jump to ELSE. */ 1963 vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 1964 V3D_QPU_BRANCH_COND_ALLNA : 1965 V3D_QPU_BRANCH_COND_ALLA); 1966 vir_link_blocks(c->cur_block, else_block); 1967 vir_link_blocks(c->cur_block, then_block); 1968 1969 /* Process the THEN block. */ 1970 vir_set_emit_block(c, then_block); 1971 ntq_emit_cf_list(c, &if_stmt->then_list); 1972 1973 if (!empty_else_block) { 1974 /* At the end of the THEN block, jump to ENDIF */ 1975 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 1976 vir_link_blocks(c->cur_block, after_block); 1977 1978 /* Emit the else block. 
*/ 1979 vir_set_emit_block(c, else_block); 1980 ntq_activate_execute_for_block(c); 1981 ntq_emit_cf_list(c, &if_stmt->else_list); 1982 } 1983 1984 vir_link_blocks(c->cur_block, after_block); 1985 1986 vir_set_emit_block(c, after_block); 1987} 1988 1989static void 1990ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) 1991{ 1992 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1993 bool empty_else_block = 1994 (nir_else_block == nir_if_last_else_block(if_stmt) && 1995 exec_list_is_empty(&nir_else_block->instr_list)); 1996 1997 struct qblock *then_block = vir_new_block(c); 1998 struct qblock *after_block = vir_new_block(c); 1999 struct qblock *else_block; 2000 if (empty_else_block) 2001 else_block = after_block; 2002 else 2003 else_block = vir_new_block(c); 2004 2005 bool was_uniform_control_flow = false; 2006 if (!vir_in_nonuniform_control_flow(c)) { 2007 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 2008 was_uniform_control_flow = true; 2009 } 2010 2011 /* Set up the flags for the IF condition (taking the THEN branch). */ 2012 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 2013 2014 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and 2015 * was previously active (execute Z)", for updating the exec flags. 2016 */ 2017 if (was_uniform_control_flow) { 2018 cond = v3d_qpu_cond_invert(cond); 2019 } else { 2020 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); 2021 if (cond == V3D_QPU_COND_IFA) { 2022 vir_set_uf(inst, V3D_QPU_UF_NORNZ); 2023 } else { 2024 vir_set_uf(inst, V3D_QPU_UF_ANDZ); 2025 cond = V3D_QPU_COND_IFA; 2026 } 2027 } 2028 2029 vir_MOV_cond(c, cond, 2030 c->execute, 2031 vir_uniform_ui(c, else_block->index)); 2032 2033 /* Jump to ELSE if nothing is active for THEN, otherwise fall 2034 * through. 2035 */ 2036 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 2037 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 2038 vir_link_blocks(c->cur_block, else_block); 2039 vir_link_blocks(c->cur_block, then_block); 2040 2041 /* Process the THEN block. */ 2042 vir_set_emit_block(c, then_block); 2043 ntq_emit_cf_list(c, &if_stmt->then_list); 2044 2045 if (!empty_else_block) { 2046 /* Handle the end of the THEN block. First, all currently 2047 * active channels update their execute flags to point to 2048 * ENDIF. 2049 */ 2050 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2051 V3D_QPU_PF_PUSHZ); 2052 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2053 vir_uniform_ui(c, after_block->index)); 2054 2055 /* If everything points at ENDIF, then jump there immediately.
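 * (That is, no channel still wants the ELSE block, so it can be skipped.)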
*/ 2056 vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), 2057 c->execute, 2058 vir_uniform_ui(c, after_block->index)), 2059 V3D_QPU_PF_PUSHZ); 2060 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 2061 vir_link_blocks(c->cur_block, after_block); 2062 vir_link_blocks(c->cur_block, else_block); 2063 2064 vir_set_emit_block(c, else_block); 2065 ntq_activate_execute_for_block(c); 2066 ntq_emit_cf_list(c, &if_stmt->else_list); 2067 } 2068 2069 vir_link_blocks(c->cur_block, after_block); 2070 2071 vir_set_emit_block(c, after_block); 2072 if (was_uniform_control_flow) 2073 c->execute = c->undef; 2074 else 2075 ntq_activate_execute_for_block(c); 2076} 2077 2078static void 2079ntq_emit_if(struct v3d_compile *c, nir_if *nif) 2080{ 2081 bool was_in_control_flow = c->in_control_flow; 2082 c->in_control_flow = true; 2083 if (!vir_in_nonuniform_control_flow(c) && 2084 nir_src_is_dynamically_uniform(nif->condition)) { 2085 ntq_emit_uniform_if(c, nif); 2086 } else { 2087 ntq_emit_nonuniform_if(c, nif); 2088 } 2089 c->in_control_flow = was_in_control_flow; 2090} 2091 2092static void 2093ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 2094{ 2095 switch (jump->type) { 2096 case nir_jump_break: 2097 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2098 V3D_QPU_PF_PUSHZ); 2099 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2100 vir_uniform_ui(c, c->loop_break_block->index)); 2101 break; 2102 2103 case nir_jump_continue: 2104 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2105 V3D_QPU_PF_PUSHZ); 2106 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2107 vir_uniform_ui(c, c->loop_cont_block->index)); 2108 break; 2109 2110 case nir_jump_return: 2111 unreachable("All returns should be lowered\n"); 2112 } 2113} 2114 2115static void 2116ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) 2117{ 2118 switch (instr->type) { 2119 case nir_instr_type_deref: 2120 /* ignored, will be walked by the intrinsic using it.
*/ 2121 break; 2122 2123 case nir_instr_type_alu: 2124 ntq_emit_alu(c, nir_instr_as_alu(instr)); 2125 break; 2126 2127 case nir_instr_type_intrinsic: 2128 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 2129 break; 2130 2131 case nir_instr_type_load_const: 2132 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 2133 break; 2134 2135 case nir_instr_type_ssa_undef: 2136 ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 2137 break; 2138 2139 case nir_instr_type_tex: 2140 ntq_emit_tex(c, nir_instr_as_tex(instr)); 2141 break; 2142 2143 case nir_instr_type_jump: 2144 ntq_emit_jump(c, nir_instr_as_jump(instr)); 2145 break; 2146 2147 default: 2148 fprintf(stderr, "Unknown NIR instr type: "); 2149 nir_print_instr(instr, stderr); 2150 fprintf(stderr, "\n"); 2151 abort(); 2152 } 2153} 2154 2155static void 2156ntq_emit_block(struct v3d_compile *c, nir_block *block) 2157{ 2158 nir_foreach_instr(instr, block) { 2159 ntq_emit_instr(c, instr); 2160 } 2161} 2162 2163static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 2164 2165static void 2166ntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 2167{ 2168 bool was_in_control_flow = c->in_control_flow; 2169 c->in_control_flow = true; 2170 2171 bool was_uniform_control_flow = false; 2172 if (!vir_in_nonuniform_control_flow(c)) { 2173 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 2174 was_uniform_control_flow = true; 2175 } 2176 2177 struct qblock *save_loop_cont_block = c->loop_cont_block; 2178 struct qblock *save_loop_break_block = c->loop_break_block; 2179 2180 c->loop_cont_block = vir_new_block(c); 2181 c->loop_break_block = vir_new_block(c); 2182 2183 vir_link_blocks(c->cur_block, c->loop_cont_block); 2184 vir_set_emit_block(c, c->loop_cont_block); 2185 ntq_activate_execute_for_block(c); 2186 2187 ntq_emit_cf_list(c, &loop->body); 2188 2189 /* Re-enable any previous continues now, so our ANYA check below 2190 * works. 2191 * 2192 * XXX: Use the .ORZ flags update, instead. 2193 */ 2194 vir_set_pf(vir_XOR_dest(c, 2195 vir_nop_reg(), 2196 c->execute, 2197 vir_uniform_ui(c, c->loop_cont_block->index)), 2198 V3D_QPU_PF_PUSHZ); 2199 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 2200 2201 vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 2202 2203 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 2204 /* Pixels that were not dispatched or have been discarded should not 2205 * contribute to looping again. 
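 * The msfign setting on the branch below is what keeps those channels from affecting the ANYA condition.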
2206 */ 2207 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 2208 vir_link_blocks(c->cur_block, c->loop_cont_block); 2209 vir_link_blocks(c->cur_block, c->loop_break_block); 2210 2211 vir_set_emit_block(c, c->loop_break_block); 2212 if (was_uniform_control_flow) 2213 c->execute = c->undef; 2214 else 2215 ntq_activate_execute_for_block(c); 2216 2217 c->loop_break_block = save_loop_break_block; 2218 c->loop_cont_block = save_loop_cont_block; 2219 2220 c->loops++; 2221 2222 c->in_control_flow = was_in_control_flow; 2223} 2224 2225static void 2226ntq_emit_function(struct v3d_compile *c, nir_function_impl *func) 2227{ 2228 fprintf(stderr, "FUNCTIONS not handled.\n"); 2229 abort(); 2230} 2231 2232static void 2233ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) 2234{ 2235 foreach_list_typed(nir_cf_node, node, node, list) { 2236 switch (node->type) { 2237 case nir_cf_node_block: 2238 ntq_emit_block(c, nir_cf_node_as_block(node)); 2239 break; 2240 2241 case nir_cf_node_if: 2242 ntq_emit_if(c, nir_cf_node_as_if(node)); 2243 break; 2244 2245 case nir_cf_node_loop: 2246 ntq_emit_loop(c, nir_cf_node_as_loop(node)); 2247 break; 2248 2249 case nir_cf_node_function: 2250 ntq_emit_function(c, nir_cf_node_as_function(node)); 2251 break; 2252 2253 default: 2254 fprintf(stderr, "Unknown NIR node type\n"); 2255 abort(); 2256 } 2257 } 2258} 2259 2260static void 2261ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl) 2262{ 2263 ntq_setup_registers(c, &impl->registers); 2264 ntq_emit_cf_list(c, &impl->body); 2265} 2266 2267static void 2268nir_to_vir(struct v3d_compile *c) 2269{ 2270 switch (c->s->info.stage) { 2271 case MESA_SHADER_FRAGMENT: 2272 c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0)); 2273 c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1)); 2274 c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2)); 2275 2276 /* XXX perf: We could set the "disable implicit point/line 2277 * varyings" field in the shader record and not emit these, if 2278 * they're not going to be used. 2279 */ 2280 if (c->fs_key->is_points) { 2281 c->point_x = emit_fragment_varying(c, NULL, 0, 0); 2282 c->point_y = emit_fragment_varying(c, NULL, 0, 0); 2283 } else if (c->fs_key->is_lines) { 2284 c->line_x = emit_fragment_varying(c, NULL, 0, 0); 2285 } 2286 break; 2287 case MESA_SHADER_COMPUTE: 2288 /* Set up the TSO for barriers, assuming we do some. */ 2289 if (c->devinfo->ver < 42) { 2290 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 2291 V3D_QPU_WADDR_SYNC)); 2292 } 2293 2294 c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0)); 2295 c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2)); 2296 2297 /* Set up the division between gl_LocalInvocationIndex and 2298 * wg_in_mem in the payload reg. 
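 * The top local_invocation_index_bits bits of the register hold the invocation index, and the bits below them (still within the top half) hold wg_in_mem; e.g. an 8x8x1 workgroup gives wg_size = 64 and 6 index bits.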
2299 */ 2300 int wg_size = (c->s->info.cs.local_size[0] * 2301 c->s->info.cs.local_size[1] * 2302 c->s->info.cs.local_size[2]); 2303 c->local_invocation_index_bits = 2304 ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1; 2305 assert(c->local_invocation_index_bits <= 8); 2306 2307 if (c->s->info.cs.shared_size) { 2308 struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1], 2309 vir_uniform_ui(c, 16)); 2310 if (c->s->info.cs.local_size[0] != 1 || 2311 c->s->info.cs.local_size[1] != 1 || 2312 c->s->info.cs.local_size[2] != 1) { 2313 int wg_bits = (16 - 2314 c->local_invocation_index_bits); 2315 int wg_mask = (1 << wg_bits) - 1; 2316 wg_in_mem = vir_AND(c, wg_in_mem, 2317 vir_uniform_ui(c, wg_mask)); 2318 } 2319 struct qreg shared_per_wg = 2320 vir_uniform_ui(c, c->s->info.cs.shared_size); 2321 2322 c->cs_shared_offset = 2323 vir_ADD(c, 2324 vir_uniform(c, QUNIFORM_SHARED_OFFSET,0), 2325 vir_UMUL(c, wg_in_mem, shared_per_wg)); 2326 } 2327 break; 2328 default: 2329 break; 2330 } 2331 2332 if (c->s->scratch_size) { 2333 v3d_setup_spill_base(c); 2334 c->spill_size += V3D_CHANNELS * c->s->scratch_size; 2335 } 2336 2337 if (c->s->info.stage == MESA_SHADER_FRAGMENT) 2338 ntq_setup_fs_inputs(c); 2339 else 2340 ntq_setup_vpm_inputs(c); 2341 2342 ntq_setup_outputs(c); 2343 2344 /* Find the main function and emit the body. */ 2345 nir_foreach_function(function, c->s) { 2346 assert(strcmp(function->name, "main") == 0); 2347 assert(function->impl); 2348 ntq_emit_impl(c, function->impl); 2349 } 2350} 2351 2352const nir_shader_compiler_options v3d_nir_options = { 2353 .lower_all_io_to_temps = true, 2354 .lower_extract_byte = true, 2355 .lower_extract_word = true, 2356 .lower_bfm = true, 2357 .lower_bitfield_insert_to_shifts = true, 2358 .lower_bitfield_extract_to_shifts = true, 2359 .lower_bitfield_reverse = true, 2360 .lower_bit_count = true, 2361 .lower_cs_local_id_from_index = true, 2362 .lower_ffract = true, 2363 .lower_pack_unorm_2x16 = true, 2364 .lower_pack_snorm_2x16 = true, 2365 .lower_pack_unorm_4x8 = true, 2366 .lower_pack_snorm_4x8 = true, 2367 .lower_unpack_unorm_4x8 = true, 2368 .lower_unpack_snorm_4x8 = true, 2369 .lower_pack_half_2x16 = true, 2370 .lower_unpack_half_2x16 = true, 2371 .lower_fdiv = true, 2372 .lower_find_lsb = true, 2373 .lower_ffma = true, 2374 .lower_flrp32 = true, 2375 .lower_fpow = true, 2376 .lower_fsat = true, 2377 .lower_fsqrt = true, 2378 .lower_ifind_msb = true, 2379 .lower_isign = true, 2380 .lower_ldexp = true, 2381 .lower_mul_high = true, 2382 .lower_wpos_pntc = true, 2383 .native_integers = true, 2384}; 2385 2386/** 2387 * When demoting a shader down to single-threaded, removes the THRSW 2388 * instructions (one will still be inserted at v3d_vir_to_qpu() for the 2389 * program end). 2390 */ 2391static void 2392vir_remove_thrsw(struct v3d_compile *c) 2393{ 2394 vir_for_each_block(block, c) { 2395 vir_for_each_inst_safe(inst, block) { 2396 if (inst->qpu.sig.thrsw) 2397 vir_remove_instruction(c, inst); 2398 } 2399 } 2400 2401 c->last_thrsw = NULL; 2402} 2403 2404void 2405vir_emit_last_thrsw(struct v3d_compile *c) 2406{ 2407 /* On V3D before 4.1, we need a TMU op to be outstanding when thread 2408 * switching, so disable threads if we didn't do any TMU ops (each of 2409 * which would have emitted a THRSW). 
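 * In that case we drop back to a single thread and remove any THRSW instructions already emitted (vir_remove_thrsw()).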
2410 */ 2411 if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) { 2412 c->threads = 1; 2413 if (c->last_thrsw) 2414 vir_remove_thrsw(c); 2415 return; 2416 } 2417 2418 /* If we're threaded and the last THRSW was in conditional code, then 2419 * we need to emit another one so that we can flag it as the last 2420 * thrsw. 2421 */ 2422 if (c->last_thrsw && !c->last_thrsw_at_top_level) { 2423 assert(c->devinfo->ver >= 41); 2424 vir_emit_thrsw(c); 2425 } 2426 2427 /* If we're threaded, then we need to mark the last THRSW instruction 2428 * so we can emit a pair of them at QPU emit time. 2429 * 2430 * For V3D 4.x, we can spawn the non-fragment shaders already in the 2431 * post-last-THRSW state, so we can skip this. 2432 */ 2433 if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) { 2434 assert(c->devinfo->ver >= 41); 2435 vir_emit_thrsw(c); 2436 } 2437 2438 if (c->last_thrsw) 2439 c->last_thrsw->is_last_thrsw = true; 2440} 2441 2442/* There's a flag in the shader for "center W is needed for reasons other than 2443 * non-centroid varyings", so we just walk the program after VIR optimization 2444 * to see if it's used. It should be harmless to set even if we only use 2445 * center W for varyings. 2446 */ 2447static void 2448vir_check_payload_w(struct v3d_compile *c) 2449{ 2450 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 2451 return; 2452 2453 vir_for_each_inst_inorder(inst, c) { 2454 for (int i = 0; i < vir_get_nsrc(inst); i++) { 2455 if (inst->src[i].file == QFILE_REG && 2456 inst->src[i].index == 0) { 2457 c->uses_center_w = true; 2458 return; 2459 } 2460 } 2461 } 2462 2463} 2464 2465void 2466v3d_nir_to_vir(struct v3d_compile *c) 2467{ 2468 if (V3D_DEBUG & (V3D_DEBUG_NIR | 2469 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 2470 fprintf(stderr, "%s prog %d/%d NIR:\n", 2471 vir_get_stage_name(c), 2472 c->program_id, c->variant_id); 2473 nir_print_shader(c->s, stderr); 2474 } 2475 2476 nir_to_vir(c); 2477 2478 /* Emit the last THRSW before STVPM and TLB writes. */ 2479 vir_emit_last_thrsw(c); 2480 2481 switch (c->s->info.stage) { 2482 case MESA_SHADER_FRAGMENT: 2483 emit_frag_end(c); 2484 break; 2485 case MESA_SHADER_VERTEX: 2486 emit_vert_end(c); 2487 break; 2488 case MESA_SHADER_COMPUTE: 2489 break; 2490 default: 2491 unreachable("bad stage"); 2492 } 2493 2494 if (V3D_DEBUG & (V3D_DEBUG_VIR | 2495 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 2496 fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n", 2497 vir_get_stage_name(c), 2498 c->program_id, c->variant_id); 2499 vir_dump(c); 2500 fprintf(stderr, "\n"); 2501 } 2502 2503 vir_optimize(c); 2504 2505 vir_check_payload_w(c); 2506 2507 /* XXX perf: On VC4, we do VIR-level instruction scheduling here. 2508 * We used that on that platform to pipeline TMU writes and reduce the 2509 * number of thread switches, as well as try (mostly successfully) to 2510 * reduce maximum register pressure to allow more threads. We should 2511 * do something of that sort for V3D -- either instruction scheduling 2512 * here, or delay the THRSW and LDTMUs from our texture 2513 * instructions until the results are needed. 2514 */ 2515 2516 if (V3D_DEBUG & (V3D_DEBUG_VIR | 2517 v3d_debug_flag_for_shader_stage(c->s->info.stage))) { 2518 fprintf(stderr, "%s prog %d/%d VIR:\n", 2519 vir_get_stage_name(c), 2520 c->program_id, c->variant_id); 2521 vir_dump(c); 2522 fprintf(stderr, "\n"); 2523 } 2524 2525 /* Attempt to allocate registers for the temporaries. If we fail, 2526 * reduce thread count and try again.
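 * Halving c->threads leaves more of the register file to each thread; V3D 4.1+ cannot drop below two threads, while earlier parts can fall back to one.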
2527 */ 2528 int min_threads = (c->devinfo->ver >= 41) ? 2 : 1; 2529 struct qpu_reg *temp_registers; 2530 while (true) { 2531 bool spilled; 2532 temp_registers = v3d_register_allocate(c, &spilled); 2533 if (spilled) 2534 continue; 2535 2536 if (temp_registers) 2537 break; 2538 2539 if (c->threads == min_threads) { 2540 fprintf(stderr, "Failed to register allocate at %d threads:\n", 2541 c->threads); 2542 vir_dump(c); 2543 c->failed = true; 2544 return; 2545 } 2546 2547 c->threads /= 2; 2548 2549 if (c->threads == 1) 2550 vir_remove_thrsw(c); 2551 } 2552 2553 if (c->spills && 2554 (V3D_DEBUG & (V3D_DEBUG_VIR | 2555 v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { 2556 fprintf(stderr, "%s prog %d/%d spilled VIR:\n", 2557 vir_get_stage_name(c), 2558 c->program_id, c->variant_id); 2559 vir_dump(c); 2560 fprintf(stderr, "\n"); 2561 } 2562 2563 v3d_vir_to_qpu(c, temp_registers); 2564} 2565