1/* 2 * Copyright © 2016 Broadcom 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 */ 23 24#include <inttypes.h> 25#include "util/format/u_format.h" 26#include "util/u_helpers.h" 27#include "util/u_math.h" 28#include "util/u_memory.h" 29#include "util/ralloc.h" 30#include "util/hash_table.h" 31#include "compiler/nir/nir.h" 32#include "compiler/nir/nir_builder.h" 33#include "common/v3d_device_info.h" 34#include "v3d_compiler.h" 35 36/* We don't do any address packing. */ 37#define __gen_user_data void 38#define __gen_address_type uint32_t 39#define __gen_address_offset(reloc) (*reloc) 40#define __gen_emit_reloc(cl, reloc) 41#include "cle/v3d_packet_v41_pack.h" 42 43#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) 44#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) 45#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0) 46#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0) 47#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0) 48#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0) 49#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0) 50#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0) 51#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0) 52#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0) 53 54#define V3D_TSY_SET_QUORUM 0 55#define V3D_TSY_INC_WAITERS 1 56#define V3D_TSY_DEC_WAITERS 2 57#define V3D_TSY_INC_QUORUM 3 58#define V3D_TSY_DEC_QUORUM 4 59#define V3D_TSY_FREE_ALL 5 60#define V3D_TSY_RELEASE 6 61#define V3D_TSY_ACQUIRE 7 62#define V3D_TSY_WAIT 8 63#define V3D_TSY_WAIT_INC 9 64#define V3D_TSY_WAIT_CHECK 10 65#define V3D_TSY_WAIT_INC_CHECK 11 66#define V3D_TSY_WAIT_CV 12 67#define V3D_TSY_INC_SEMAPHORE 13 68#define V3D_TSY_DEC_SEMAPHORE 14 69#define V3D_TSY_SET_QUORUM_FREE_ALL 15 70 71enum v3d_tmu_op_type 72{ 73 V3D_TMU_OP_TYPE_REGULAR, 74 V3D_TMU_OP_TYPE_ATOMIC, 75 V3D_TMU_OP_TYPE_CACHE 76}; 77 78static enum v3d_tmu_op_type 79v3d_tmu_get_type_from_op(uint32_t tmu_op, bool is_write) 80{ 81 switch(tmu_op) { 82 case V3D_TMU_OP_WRITE_ADD_READ_PREFETCH: 83 case V3D_TMU_OP_WRITE_SUB_READ_CLEAR: 84 case V3D_TMU_OP_WRITE_XCHG_READ_FLUSH: 85 case V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH: 86 case V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR: 87 return is_write ? 
                V3D_TMU_OP_TYPE_ATOMIC : V3D_TMU_OP_TYPE_CACHE;
        case V3D_TMU_OP_WRITE_UMAX:
        case V3D_TMU_OP_WRITE_SMIN:
        case V3D_TMU_OP_WRITE_SMAX:
                assert(is_write);
                FALLTHROUGH;
        case V3D_TMU_OP_WRITE_AND_READ_INC:
        case V3D_TMU_OP_WRITE_OR_READ_DEC:
        case V3D_TMU_OP_WRITE_XOR_READ_NOT:
                return V3D_TMU_OP_TYPE_ATOMIC;
        case V3D_TMU_OP_REGULAR:
                return V3D_TMU_OP_TYPE_REGULAR;

        default:
                unreachable("Unknown tmu_op\n");
        }
}

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

static void
resize_interp_array(struct v3d_compile *c,
                    struct v3d_interp_input **regs,
                    uint32_t *size,
                    uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct v3d_interp_input, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        for (uint32_t i = old_size; i < *size; i++) {
                (*regs)[i].vp = c->undef;
                (*regs)[i].C = c->undef;
        }
}

void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;

        /* We need to lock the scoreboard before any tlb access happens. If this
         * thread switch comes after we have emitted a tlb load, then it means
         * that we can't lock on the last thread switch any more.
170 */ 171 if (c->emitted_tlb_load) 172 c->lock_scoreboard_on_first_thrsw = true; 173} 174 175uint32_t 176v3d_get_op_for_atomic_add(nir_intrinsic_instr *instr, unsigned src) 177{ 178 if (nir_src_is_const(instr->src[src])) { 179 int64_t add_val = nir_src_as_int(instr->src[src]); 180 if (add_val == 1) 181 return V3D_TMU_OP_WRITE_AND_READ_INC; 182 else if (add_val == -1) 183 return V3D_TMU_OP_WRITE_OR_READ_DEC; 184 } 185 186 return V3D_TMU_OP_WRITE_ADD_READ_PREFETCH; 187} 188 189static uint32_t 190v3d_general_tmu_op(nir_intrinsic_instr *instr) 191{ 192 switch (instr->intrinsic) { 193 case nir_intrinsic_load_ssbo: 194 case nir_intrinsic_load_ubo: 195 case nir_intrinsic_load_uniform: 196 case nir_intrinsic_load_shared: 197 case nir_intrinsic_load_scratch: 198 case nir_intrinsic_store_ssbo: 199 case nir_intrinsic_store_shared: 200 case nir_intrinsic_store_scratch: 201 return V3D_TMU_OP_REGULAR; 202 case nir_intrinsic_ssbo_atomic_add: 203 return v3d_get_op_for_atomic_add(instr, 2); 204 case nir_intrinsic_shared_atomic_add: 205 return v3d_get_op_for_atomic_add(instr, 1); 206 case nir_intrinsic_ssbo_atomic_imin: 207 case nir_intrinsic_shared_atomic_imin: 208 return V3D_TMU_OP_WRITE_SMIN; 209 case nir_intrinsic_ssbo_atomic_umin: 210 case nir_intrinsic_shared_atomic_umin: 211 return V3D_TMU_OP_WRITE_UMIN_FULL_L1_CLEAR; 212 case nir_intrinsic_ssbo_atomic_imax: 213 case nir_intrinsic_shared_atomic_imax: 214 return V3D_TMU_OP_WRITE_SMAX; 215 case nir_intrinsic_ssbo_atomic_umax: 216 case nir_intrinsic_shared_atomic_umax: 217 return V3D_TMU_OP_WRITE_UMAX; 218 case nir_intrinsic_ssbo_atomic_and: 219 case nir_intrinsic_shared_atomic_and: 220 return V3D_TMU_OP_WRITE_AND_READ_INC; 221 case nir_intrinsic_ssbo_atomic_or: 222 case nir_intrinsic_shared_atomic_or: 223 return V3D_TMU_OP_WRITE_OR_READ_DEC; 224 case nir_intrinsic_ssbo_atomic_xor: 225 case nir_intrinsic_shared_atomic_xor: 226 return V3D_TMU_OP_WRITE_XOR_READ_NOT; 227 case nir_intrinsic_ssbo_atomic_exchange: 228 case nir_intrinsic_shared_atomic_exchange: 229 return V3D_TMU_OP_WRITE_XCHG_READ_FLUSH; 230 case nir_intrinsic_ssbo_atomic_comp_swap: 231 case nir_intrinsic_shared_atomic_comp_swap: 232 return V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH; 233 default: 234 unreachable("unknown intrinsic op"); 235 } 236} 237 238/** 239 * Checks if pipelining a new TMU operation requiring 'components' LDTMUs 240 * would overflow the Output TMU fifo. 241 * 242 * It is not allowed to overflow the Output fifo, however, we can overflow 243 * Input and Config fifos. Doing that makes the shader stall, but only for as 244 * long as it needs to be able to continue so it is better for pipelining to 245 * let the QPU stall on these if needed than trying to emit TMU flushes in the 246 * driver. 247 */ 248bool 249ntq_tmu_fifo_overflow(struct v3d_compile *c, uint32_t components) 250{ 251 if (c->tmu.flush_count >= MAX_TMU_QUEUE_SIZE) 252 return true; 253 254 return components > 0 && 255 c->tmu.output_fifo_size + components > 16 / c->threads; 256} 257 258/** 259 * Emits the thread switch and LDTMU/TMUWT for all outstanding TMU operations, 260 * popping all TMU fifo entries. 
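 *
 * For example, if a vec4 UBO load and an SSBO store have been queued, this
 * emits one thread switch, then four LDTMUs for the load's components and a
 * single TMUWT for the store.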
 */
void
ntq_flush_tmu(struct v3d_compile *c)
{
        if (c->tmu.flush_count == 0)
                return;

        vir_emit_thrsw(c);

        bool emitted_tmuwt = false;
        for (int i = 0; i < c->tmu.flush_count; i++) {
                if (c->tmu.flush[i].component_mask > 0) {
                        nir_dest *dest = c->tmu.flush[i].dest;
                        assert(dest);

                        for (int j = 0; j < 4; j++) {
                                if (c->tmu.flush[i].component_mask & (1 << j)) {
                                        ntq_store_dest(c, dest, j,
                                                       vir_MOV(c, vir_LDTMU(c)));
                                }
                        }
                } else if (!emitted_tmuwt) {
                        vir_TMUWT(c);
                        emitted_tmuwt = true;
                }
        }

        c->tmu.output_fifo_size = 0;
        c->tmu.flush_count = 0;
        _mesa_set_clear(c->tmu.outstanding_regs, NULL);
}

/**
 * Queues a pending thread switch + LDTMU/TMUWT for a TMU operation. The caller
 * is responsible for ensuring that doing this doesn't overflow the TMU fifos,
 * and more specifically, the output fifo, since that can't stall.
 */
void
ntq_add_pending_tmu_flush(struct v3d_compile *c,
                          nir_dest *dest,
                          uint32_t component_mask)
{
        const uint32_t num_components = util_bitcount(component_mask);
        assert(!ntq_tmu_fifo_overflow(c, num_components));

        if (num_components > 0) {
                c->tmu.output_fifo_size += num_components;
                if (!dest->is_ssa)
                        _mesa_set_add(c->tmu.outstanding_regs, dest->reg.reg);
        }

        c->tmu.flush[c->tmu.flush_count].dest = dest;
        c->tmu.flush[c->tmu.flush_count].component_mask = component_mask;
        c->tmu.flush_count++;

        if (c->disable_tmu_pipelining)
                ntq_flush_tmu(c);
        else if (c->tmu.flush_count > 1)
                c->pipelined_any_tmu = true;
}

enum emit_mode {
    MODE_COUNT = 0,
    MODE_EMIT,
    MODE_LAST,
};

/**
 * For a TMU general store instruction:
 *
 * In MODE_COUNT mode, records the number of TMU writes required and flushes
 * any outstanding TMU operations the instruction depends on, but it doesn't
 * emit any actual register writes.
 *
 * In MODE_EMIT mode, emits the data register writes required by the
 * instruction.
 */
static void
emit_tmu_general_store_writes(struct v3d_compile *c,
                              enum emit_mode mode,
                              nir_intrinsic_instr *instr,
                              uint32_t base_const_offset,
                              uint32_t *writemask,
                              uint32_t *const_offset,
                              uint32_t *tmu_writes)
{
        struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD);

        /* Find the first set of consecutive components that
         * are enabled in the writemask and emit the TMUD
         * instructions for them.
         */
        assert(*writemask != 0);
        uint32_t first_component = ffs(*writemask) - 1;
        uint32_t last_component = first_component;
        while (*writemask & BITFIELD_BIT(last_component + 1))
                last_component++;

        assert(first_component <= last_component &&
               last_component < instr->num_components);

        for (int i = first_component; i <= last_component; i++) {
                struct qreg data = ntq_get_src(c, instr->src[0], i);
                if (mode == MODE_COUNT)
                        (*tmu_writes)++;
                else
                        vir_MOV_dest(c, tmud, data);
        }

        if (mode == MODE_EMIT) {
                /* Update the offset for the TMU write based on the
                 * first component we are writing.
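                 * For example, a writemask of 0x6 (components y and z) emits
                 * two TMUD writes above and bumps the address const offset by
                 * first_component * 4 = 4 bytes.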
373 */ 374 *const_offset = base_const_offset + first_component * 4; 375 376 /* Clear these components from the writemask */ 377 uint32_t written_mask = 378 BITFIELD_RANGE(first_component, *tmu_writes); 379 (*writemask) &= ~written_mask; 380 } 381} 382 383/** 384 * For a TMU general atomic instruction: 385 * 386 * In MODE_COUNT mode, records the number of TMU writes required and flushes 387 * any outstanding TMU operations the instruction depends on, but it doesn't 388 * emit any actual register writes. 389 * 390 * In MODE_EMIT mode, emits the data register writes required by the 391 * instruction. 392 */ 393static void 394emit_tmu_general_atomic_writes(struct v3d_compile *c, 395 enum emit_mode mode, 396 nir_intrinsic_instr *instr, 397 uint32_t tmu_op, 398 bool has_index, 399 uint32_t *tmu_writes) 400{ 401 struct qreg tmud = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD); 402 403 struct qreg data = ntq_get_src(c, instr->src[1 + has_index], 0); 404 if (mode == MODE_COUNT) 405 (*tmu_writes)++; 406 else 407 vir_MOV_dest(c, tmud, data); 408 409 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { 410 data = ntq_get_src(c, instr->src[2 + has_index], 0); 411 if (mode == MODE_COUNT) 412 (*tmu_writes)++; 413 else 414 vir_MOV_dest(c, tmud, data); 415 } 416} 417 418/** 419 * For any TMU general instruction: 420 * 421 * In MODE_COUNT mode, records the number of TMU writes required to emit the 422 * address parameter and flushes any outstanding TMU operations the instruction 423 * depends on, but it doesn't emit any actual register writes. 424 * 425 * In MODE_EMIT mode, emits register writes required to emit the address. 426 */ 427static void 428emit_tmu_general_address_write(struct v3d_compile *c, 429 enum emit_mode mode, 430 nir_intrinsic_instr *instr, 431 uint32_t config, 432 bool dynamic_src, 433 int offset_src, 434 struct qreg base_offset, 435 uint32_t const_offset, 436 uint32_t *tmu_writes) 437{ 438 if (mode == MODE_COUNT) { 439 (*tmu_writes)++; 440 if (dynamic_src) 441 ntq_get_src(c, instr->src[offset_src], 0); 442 return; 443 } 444 445 if (vir_in_nonuniform_control_flow(c)) { 446 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 447 V3D_QPU_PF_PUSHZ); 448 } 449 450 struct qreg tmua; 451 if (config == ~0) 452 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA); 453 else 454 tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU); 455 456 struct qinst *tmu; 457 if (dynamic_src) { 458 struct qreg offset = base_offset; 459 if (const_offset != 0) { 460 offset = vir_ADD(c, offset, 461 vir_uniform_ui(c, const_offset)); 462 } 463 struct qreg data = ntq_get_src(c, instr->src[offset_src], 0); 464 tmu = vir_ADD_dest(c, tmua, offset, data); 465 } else { 466 if (const_offset != 0) { 467 tmu = vir_ADD_dest(c, tmua, base_offset, 468 vir_uniform_ui(c, const_offset)); 469 } else { 470 tmu = vir_MOV_dest(c, tmua, base_offset); 471 } 472 } 473 474 if (config != ~0) { 475 tmu->uniform = 476 vir_get_uniform_index(c, QUNIFORM_CONSTANT, config); 477 } 478 479 if (vir_in_nonuniform_control_flow(c)) 480 vir_set_cond(tmu, V3D_QPU_COND_IFA); 481} 482 483/** 484 * Implements indirect uniform loads and SSBO accesses through the TMU general 485 * memory access interface. 
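 * It is also used for UBO, shared and scratch loads/stores and for
 * SSBO/shared atomics, since they all go through the same TMU config,
 * address and data register writes.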
486 */ 487static void 488ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr, 489 bool is_shared_or_scratch) 490{ 491 uint32_t tmu_op = v3d_general_tmu_op(instr); 492 493 /* If we were able to replace atomic_add for an inc/dec, then we 494 * need/can to do things slightly different, like not loading the 495 * amount to add/sub, as that is implicit. 496 */ 497 bool atomic_add_replaced = 498 ((instr->intrinsic == nir_intrinsic_ssbo_atomic_add || 499 instr->intrinsic == nir_intrinsic_shared_atomic_add) && 500 (tmu_op == V3D_TMU_OP_WRITE_AND_READ_INC || 501 tmu_op == V3D_TMU_OP_WRITE_OR_READ_DEC)); 502 503 bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo || 504 instr->intrinsic == nir_intrinsic_store_scratch || 505 instr->intrinsic == nir_intrinsic_store_shared); 506 507 bool is_load = (instr->intrinsic == nir_intrinsic_load_uniform || 508 instr->intrinsic == nir_intrinsic_load_ubo || 509 instr->intrinsic == nir_intrinsic_load_ssbo || 510 instr->intrinsic == nir_intrinsic_load_scratch || 511 instr->intrinsic == nir_intrinsic_load_shared); 512 513 if (!is_load) 514 c->tmu_dirty_rcl = true; 515 516 bool has_index = !is_shared_or_scratch; 517 518 int offset_src; 519 if (instr->intrinsic == nir_intrinsic_load_uniform) { 520 offset_src = 0; 521 } else if (instr->intrinsic == nir_intrinsic_load_ssbo || 522 instr->intrinsic == nir_intrinsic_load_ubo || 523 instr->intrinsic == nir_intrinsic_load_scratch || 524 instr->intrinsic == nir_intrinsic_load_shared || 525 atomic_add_replaced) { 526 offset_src = 0 + has_index; 527 } else if (is_store) { 528 offset_src = 1 + has_index; 529 } else { 530 offset_src = 0 + has_index; 531 } 532 533 bool dynamic_src = !nir_src_is_const(instr->src[offset_src]); 534 uint32_t const_offset = 0; 535 if (!dynamic_src) 536 const_offset = nir_src_as_uint(instr->src[offset_src]); 537 538 struct qreg base_offset; 539 if (instr->intrinsic == nir_intrinsic_load_uniform) { 540 const_offset += nir_intrinsic_base(instr); 541 base_offset = vir_uniform(c, QUNIFORM_UBO_ADDR, 542 v3d_unit_data_create(0, const_offset)); 543 const_offset = 0; 544 } else if (instr->intrinsic == nir_intrinsic_load_ubo) { 545 uint32_t index = nir_src_as_uint(instr->src[0]); 546 /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index 547 * shifted up by 1 (0 is gallium's constant buffer 0). 548 */ 549 if (c->key->environment == V3D_ENVIRONMENT_OPENGL) 550 index++; 551 552 base_offset = 553 vir_uniform(c, QUNIFORM_UBO_ADDR, 554 v3d_unit_data_create(index, const_offset)); 555 const_offset = 0; 556 } else if (is_shared_or_scratch) { 557 /* Shared and scratch variables have no buffer index, and all 558 * start from a common base that we set up at the start of 559 * dispatch. 560 */ 561 if (instr->intrinsic == nir_intrinsic_load_scratch || 562 instr->intrinsic == nir_intrinsic_store_scratch) { 563 base_offset = c->spill_base; 564 } else { 565 base_offset = c->cs_shared_offset; 566 const_offset += nir_intrinsic_base(instr); 567 } 568 } else { 569 base_offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET, 570 nir_src_as_uint(instr->src[is_store ? 571 1 : 0])); 572 } 573 574 /* We are ready to emit TMU register writes now, but before we actually 575 * emit them we need to flush outstanding TMU operations if any of our 576 * writes reads from the result of an outstanding TMU operation before 577 * we start the TMU sequence for this operation, since otherwise the 578 * flush could happen in the middle of the TMU sequence we are about to 579 * emit, which is illegal. 
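 * (the flush emits a thread switch and LDTMUs, which must not land between
 * our TMUD/TMUA register writes).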
To do this we run this logic twice, the 580 * first time it will count required register writes and flush pending 581 * TMU requests if necessary due to a dependency, and the second one 582 * will emit the actual TMU writes. 583 */ 584 const uint32_t dest_components = nir_intrinsic_dest_components(instr); 585 uint32_t base_const_offset = const_offset; 586 uint32_t writemask = is_store ? nir_intrinsic_write_mask(instr) : 0; 587 uint32_t tmu_writes = 0; 588 for (enum emit_mode mode = MODE_COUNT; mode != MODE_LAST; mode++) { 589 assert(mode == MODE_COUNT || tmu_writes > 0); 590 591 if (is_store) { 592 emit_tmu_general_store_writes(c, mode, instr, 593 base_const_offset, 594 &writemask, 595 &const_offset, 596 &tmu_writes); 597 } else if (!is_load && !atomic_add_replaced) { 598 emit_tmu_general_atomic_writes(c, mode, instr, 599 tmu_op, has_index, 600 &tmu_writes); 601 } 602 603 /* For atomics we use 32bit except for CMPXCHG, that we need 604 * to use VEC2. For the rest of the cases we use the number of 605 * tmud writes we did to decide the type. For cache operations 606 * the type is ignored. 607 */ 608 uint32_t config = 0; 609 if (mode == MODE_EMIT) { 610 uint32_t num_components; 611 if (is_load || atomic_add_replaced) { 612 num_components = instr->num_components; 613 } else { 614 assert(tmu_writes > 0); 615 num_components = tmu_writes - 1; 616 } 617 bool is_atomic = 618 v3d_tmu_get_type_from_op(tmu_op, !is_load) == 619 V3D_TMU_OP_TYPE_ATOMIC; 620 621 uint32_t perquad = 622 is_load && !vir_in_nonuniform_control_flow(c) 623 ? GENERAL_TMU_LOOKUP_PER_QUAD 624 : GENERAL_TMU_LOOKUP_PER_PIXEL; 625 config = 0xffffff00 | tmu_op << 3 | perquad; 626 627 if (tmu_op == V3D_TMU_OP_WRITE_CMPXCHG_READ_FLUSH) { 628 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2; 629 } else if (is_atomic || num_components == 1) { 630 config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI; 631 } else { 632 config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + 633 num_components - 2; 634 } 635 } 636 637 emit_tmu_general_address_write(c, mode, instr, config, 638 dynamic_src, offset_src, 639 base_offset, const_offset, 640 &tmu_writes); 641 642 assert(tmu_writes > 0); 643 if (mode == MODE_COUNT) { 644 /* Make sure we won't exceed the 16-entry TMU 645 * fifo if each thread is storing at the same 646 * time. 647 */ 648 while (tmu_writes > 16 / c->threads) 649 c->threads /= 2; 650 651 /* If pipelining this TMU operation would 652 * overflow TMU fifos, we need to flush. 653 */ 654 if (ntq_tmu_fifo_overflow(c, dest_components)) 655 ntq_flush_tmu(c); 656 } else { 657 /* Delay emission of the thread switch and 658 * LDTMU/TMUWT until we really need to do it to 659 * improve pipelining. 660 */ 661 const uint32_t component_mask = 662 (1 << dest_components) - 1; 663 ntq_add_pending_tmu_flush(c, &instr->dest, 664 component_mask); 665 } 666 } 667 668 /* nir_lower_wrmasks should've ensured that any writemask on a store 669 * operation only has consecutive bits set, in which case we should've 670 * processed the full writemask above. 
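 * (e.g. a writemask of 0x3 is handled in one pass, while something like 0x5
 * would have been split into separate store intrinsics by that lowering).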
671 */ 672 assert(writemask == 0); 673} 674 675static struct qreg * 676ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def) 677{ 678 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 679 def->num_components); 680 _mesa_hash_table_insert(c->def_ht, def, qregs); 681 return qregs; 682} 683 684static bool 685is_ld_signal(const struct v3d_qpu_sig *sig) 686{ 687 return (sig->ldunif || 688 sig->ldunifa || 689 sig->ldunifrf || 690 sig->ldunifarf || 691 sig->ldtmu || 692 sig->ldvary || 693 sig->ldvpm || 694 sig->ldtlb || 695 sig->ldtlbu); 696} 697 698static inline bool 699is_ldunif_signal(const struct v3d_qpu_sig *sig) 700{ 701 return sig->ldunif || sig->ldunifrf; 702} 703 704/** 705 * This function is responsible for getting VIR results into the associated 706 * storage for a NIR instruction. 707 * 708 * If it's a NIR SSA def, then we just set the associated hash table entry to 709 * the new result. 710 * 711 * If it's a NIR reg, then we need to update the existing qreg assigned to the 712 * NIR destination with the incoming value. To do that without introducing 713 * new MOVs, we require that the incoming qreg either be a uniform, or be 714 * SSA-defined by the previous VIR instruction in the block and rewritable by 715 * this function. That lets us sneak ahead and insert the SF flag beforehand 716 * (knowing that the previous instruction doesn't depend on flags) and rewrite 717 * its destination to be the NIR reg's destination 718 */ 719void 720ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan, 721 struct qreg result) 722{ 723 struct qinst *last_inst = NULL; 724 if (!list_is_empty(&c->cur_block->instructions)) 725 last_inst = (struct qinst *)c->cur_block->instructions.prev; 726 727 bool is_reused_uniform = 728 is_ldunif_signal(&c->defs[result.index]->qpu.sig) && 729 last_inst != c->defs[result.index]; 730 731 assert(result.file == QFILE_TEMP && last_inst && 732 (last_inst == c->defs[result.index] || is_reused_uniform)); 733 734 if (dest->is_ssa) { 735 assert(chan < dest->ssa.num_components); 736 737 struct qreg *qregs; 738 struct hash_entry *entry = 739 _mesa_hash_table_search(c->def_ht, &dest->ssa); 740 741 if (entry) 742 qregs = entry->data; 743 else 744 qregs = ntq_init_ssa_def(c, &dest->ssa); 745 746 qregs[chan] = result; 747 } else { 748 nir_register *reg = dest->reg.reg; 749 assert(dest->reg.base_offset == 0); 750 assert(reg->num_array_elems == 0); 751 struct hash_entry *entry = 752 _mesa_hash_table_search(c->def_ht, reg); 753 struct qreg *qregs = entry->data; 754 755 /* If the previous instruction can't be predicated for 756 * the store into the nir_register, then emit a MOV 757 * that can be. 758 */ 759 if (is_reused_uniform || 760 (vir_in_nonuniform_control_flow(c) && 761 is_ld_signal(&c->defs[last_inst->dst.index]->qpu.sig))) { 762 result = vir_MOV(c, result); 763 last_inst = c->defs[result.index]; 764 } 765 766 /* We know they're both temps, so just rewrite index. */ 767 c->defs[last_inst->dst.index] = NULL; 768 last_inst->dst.index = qregs[chan].index; 769 770 /* If we're in control flow, then make this update of the reg 771 * conditional on the execution mask. 772 */ 773 if (vir_in_nonuniform_control_flow(c)) { 774 last_inst->dst.index = qregs[chan].index; 775 776 /* Set the flags to the current exec mask. 
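                 * c->execute holds 0 for channels that should run this block,
                 * so PUSHZ sets the A flag for exactly the active lanes.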
777 */ 778 c->cursor = vir_before_inst(last_inst); 779 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 780 V3D_QPU_PF_PUSHZ); 781 c->cursor = vir_after_inst(last_inst); 782 783 vir_set_cond(last_inst, V3D_QPU_COND_IFA); 784 } 785 } 786} 787 788/** 789 * This looks up the qreg associated with a particular ssa/reg used as a source 790 * in any instruction. 791 * 792 * It is expected that the definition for any NIR value read as a source has 793 * been emitted by a previous instruction, however, in the case of TMU 794 * operations we may have postponed emission of the thread switch and LDTMUs 795 * required to read the TMU results until the results are actually used to 796 * improve pipelining, which then would lead to us not finding them here 797 * (for SSA defs) or finding them in the list of registers awaiting a TMU flush 798 * (for registers), meaning that we need to flush outstanding TMU operations 799 * to read the correct value. 800 */ 801struct qreg 802ntq_get_src(struct v3d_compile *c, nir_src src, int i) 803{ 804 struct hash_entry *entry; 805 if (src.is_ssa) { 806 assert(i < src.ssa->num_components); 807 808 entry = _mesa_hash_table_search(c->def_ht, src.ssa); 809 if (!entry) { 810 ntq_flush_tmu(c); 811 entry = _mesa_hash_table_search(c->def_ht, src.ssa); 812 } 813 } else { 814 nir_register *reg = src.reg.reg; 815 assert(reg->num_array_elems == 0); 816 assert(src.reg.base_offset == 0); 817 assert(i < reg->num_components); 818 819 if (_mesa_set_search(c->tmu.outstanding_regs, reg)) 820 ntq_flush_tmu(c); 821 entry = _mesa_hash_table_search(c->def_ht, reg); 822 } 823 assert(entry); 824 825 struct qreg *qregs = entry->data; 826 return qregs[i]; 827} 828 829static struct qreg 830ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr, 831 unsigned src) 832{ 833 assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 834 unsigned chan = ffs(instr->dest.write_mask) - 1; 835 struct qreg r = ntq_get_src(c, instr->src[src].src, 836 instr->src[src].swizzle[chan]); 837 838 assert(!instr->src[src].abs); 839 assert(!instr->src[src].negate); 840 841 return r; 842}; 843 844static struct qreg 845ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level) 846{ 847 return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1)); 848} 849 850static void 851ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr) 852{ 853 unsigned unit = instr->texture_index; 854 int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod); 855 int dest_size = nir_tex_instr_dest_size(instr); 856 857 struct qreg lod = c->undef; 858 if (lod_index != -1) 859 lod = ntq_get_src(c, instr->src[lod_index].src, 0); 860 861 for (int i = 0; i < dest_size; i++) { 862 assert(i < 3); 863 enum quniform_contents contents; 864 865 if (instr->is_array && i == dest_size - 1) 866 contents = QUNIFORM_TEXTURE_ARRAY_SIZE; 867 else 868 contents = QUNIFORM_TEXTURE_WIDTH + i; 869 870 struct qreg size = vir_uniform(c, contents, unit); 871 872 switch (instr->sampler_dim) { 873 case GLSL_SAMPLER_DIM_1D: 874 case GLSL_SAMPLER_DIM_2D: 875 case GLSL_SAMPLER_DIM_MS: 876 case GLSL_SAMPLER_DIM_3D: 877 case GLSL_SAMPLER_DIM_CUBE: 878 case GLSL_SAMPLER_DIM_BUF: 879 /* Don't minify the array size. 
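                 * Minification only applies to the width/height/depth
                 * dimensions; the layer count is the same at every LOD.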
*/ 880 if (!(instr->is_array && i == dest_size - 1)) { 881 size = ntq_minify(c, size, lod); 882 } 883 break; 884 885 case GLSL_SAMPLER_DIM_RECT: 886 /* There's no LOD field for rects */ 887 break; 888 889 default: 890 unreachable("Bad sampler type"); 891 } 892 893 ntq_store_dest(c, &instr->dest, i, size); 894 } 895} 896 897static void 898ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr) 899{ 900 unsigned unit = instr->texture_index; 901 902 /* Since each texture sampling op requires uploading uniforms to 903 * reference the texture, there's no HW support for texture size and 904 * you just upload uniforms containing the size. 905 */ 906 switch (instr->op) { 907 case nir_texop_query_levels: 908 ntq_store_dest(c, &instr->dest, 0, 909 vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit)); 910 return; 911 case nir_texop_texture_samples: 912 ntq_store_dest(c, &instr->dest, 0, 913 vir_uniform(c, QUNIFORM_TEXTURE_SAMPLES, unit)); 914 return; 915 case nir_texop_txs: 916 ntq_emit_txs(c, instr); 917 return; 918 default: 919 break; 920 } 921 922 if (c->devinfo->ver >= 40) 923 v3d40_vir_emit_tex(c, instr); 924 else 925 v3d33_vir_emit_tex(c, instr); 926} 927 928static struct qreg 929ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos) 930{ 931 struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI)); 932 if (is_cos) 933 input = vir_FADD(c, input, vir_uniform_f(c, 0.5)); 934 935 struct qreg periods = vir_FROUND(c, input); 936 struct qreg sin_output = vir_SIN(c, vir_FSUB(c, input, periods)); 937 return vir_XOR(c, sin_output, vir_SHL(c, 938 vir_FTOIN(c, periods), 939 vir_uniform_ui(c, -1))); 940} 941 942static struct qreg 943ntq_fsign(struct v3d_compile *c, struct qreg src) 944{ 945 struct qreg t = vir_get_temp(c); 946 947 vir_MOV_dest(c, t, vir_uniform_f(c, 0.0)); 948 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ); 949 vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0)); 950 vir_set_pf(c, vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN); 951 vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0)); 952 return vir_MOV(c, t); 953} 954 955static void 956emit_fragcoord_input(struct v3d_compile *c, int attr) 957{ 958 c->inputs[attr * 4 + 0] = vir_FXCD(c); 959 c->inputs[attr * 4 + 1] = vir_FYCD(c); 960 c->inputs[attr * 4 + 2] = c->payload_z; 961 c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w); 962} 963 964static struct qreg 965emit_smooth_varying(struct v3d_compile *c, 966 struct qreg vary, struct qreg w, struct qreg r5) 967{ 968 return vir_FADD(c, vir_FMUL(c, vary, w), r5); 969} 970 971static struct qreg 972emit_noperspective_varying(struct v3d_compile *c, 973 struct qreg vary, struct qreg r5) 974{ 975 return vir_FADD(c, vir_MOV(c, vary), r5); 976} 977 978static struct qreg 979emit_flat_varying(struct v3d_compile *c, 980 struct qreg vary, struct qreg r5) 981{ 982 vir_MOV_dest(c, c->undef, vary); 983 return vir_MOV(c, r5); 984} 985 986static struct qreg 987emit_fragment_varying(struct v3d_compile *c, nir_variable *var, 988 int8_t input_idx, uint8_t swizzle, int array_index) 989{ 990 struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3); 991 struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5); 992 993 struct qinst *ldvary = NULL; 994 struct qreg vary; 995 if (c->devinfo->ver >= 41) { 996 ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef, 997 c->undef, c->undef); 998 ldvary->qpu.sig.ldvary = true; 999 vary = vir_emit_def(c, ldvary); 1000 } else { 1001 vir_NOP(c)->qpu.sig.ldvary = true; 1002 vary = r3; 1003 } 1004 1005 /* Store the input 
value before interpolation so we can implement 1006 * GLSL's interpolateAt functions if the shader uses them. 1007 */ 1008 if (input_idx >= 0) { 1009 assert(var); 1010 c->interp[input_idx].vp = vary; 1011 c->interp[input_idx].C = vir_MOV(c, r5); 1012 c->interp[input_idx].mode = var->data.interpolation; 1013 } 1014 1015 /* For gl_PointCoord input or distance along a line, we'll be called 1016 * with no nir_variable, and we don't count toward VPM size so we 1017 * don't track an input slot. 1018 */ 1019 if (!var) { 1020 assert(input_idx < 0); 1021 return emit_smooth_varying(c, vary, c->payload_w, r5); 1022 } 1023 1024 int i = c->num_inputs++; 1025 c->input_slots[i] = 1026 v3d_slot_from_slot_and_component(var->data.location + 1027 array_index, swizzle); 1028 1029 struct qreg result; 1030 switch (var->data.interpolation) { 1031 case INTERP_MODE_NONE: 1032 case INTERP_MODE_SMOOTH: 1033 if (var->data.centroid) { 1034 BITSET_SET(c->centroid_flags, i); 1035 result = emit_smooth_varying(c, vary, 1036 c->payload_w_centroid, r5); 1037 } else { 1038 result = emit_smooth_varying(c, vary, c->payload_w, r5); 1039 } 1040 break; 1041 1042 case INTERP_MODE_NOPERSPECTIVE: 1043 BITSET_SET(c->noperspective_flags, i); 1044 result = emit_noperspective_varying(c, vary, r5); 1045 break; 1046 1047 case INTERP_MODE_FLAT: 1048 BITSET_SET(c->flat_shade_flags, i); 1049 result = emit_flat_varying(c, vary, r5); 1050 break; 1051 1052 default: 1053 unreachable("Bad interp mode"); 1054 } 1055 1056 if (input_idx >= 0) 1057 c->inputs[input_idx] = result; 1058 return result; 1059} 1060 1061static void 1062emit_fragment_input(struct v3d_compile *c, int base_attr, nir_variable *var, 1063 int array_index, unsigned nelem) 1064{ 1065 for (int i = 0; i < nelem ; i++) { 1066 int chan = var->data.location_frac + i; 1067 int input_idx = (base_attr + array_index) * 4 + chan; 1068 emit_fragment_varying(c, var, input_idx, chan, array_index); 1069 } 1070} 1071 1072static void 1073emit_compact_fragment_input(struct v3d_compile *c, int attr, nir_variable *var, 1074 int array_index) 1075{ 1076 /* Compact variables are scalar arrays where each set of 4 elements 1077 * consumes a single location. 1078 */ 1079 int loc_offset = array_index / 4; 1080 int chan = var->data.location_frac + array_index % 4; 1081 int input_idx = (attr + loc_offset) * 4 + chan; 1082 emit_fragment_varying(c, var, input_idx, chan, loc_offset); 1083} 1084 1085static void 1086add_output(struct v3d_compile *c, 1087 uint32_t decl_offset, 1088 uint8_t slot, 1089 uint8_t swizzle) 1090{ 1091 uint32_t old_array_size = c->outputs_array_size; 1092 resize_qreg_array(c, &c->outputs, &c->outputs_array_size, 1093 decl_offset + 1); 1094 1095 if (old_array_size != c->outputs_array_size) { 1096 c->output_slots = reralloc(c, 1097 c->output_slots, 1098 struct v3d_varying_slot, 1099 c->outputs_array_size); 1100 } 1101 1102 c->output_slots[decl_offset] = 1103 v3d_slot_from_slot_and_component(slot, swizzle); 1104} 1105 1106/** 1107 * If compare_instr is a valid comparison instruction, emits the 1108 * compare_instr's comparison and returns the sel_instr's return value based 1109 * on the compare_instr's result. 
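 *
 * Returns false if compare_instr is not a comparison we can handle here; on
 * success, *out_cond is the QPU condition (IFA or IFNA) that is true when
 * the comparison passes.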
1110 */ 1111static bool 1112ntq_emit_comparison(struct v3d_compile *c, 1113 nir_alu_instr *compare_instr, 1114 enum v3d_qpu_cond *out_cond) 1115{ 1116 struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0); 1117 struct qreg src1; 1118 if (nir_op_infos[compare_instr->op].num_inputs > 1) 1119 src1 = ntq_get_alu_src(c, compare_instr, 1); 1120 bool cond_invert = false; 1121 struct qreg nop = vir_nop_reg(); 1122 1123 switch (compare_instr->op) { 1124 case nir_op_feq32: 1125 case nir_op_seq: 1126 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1127 break; 1128 case nir_op_ieq32: 1129 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1130 break; 1131 1132 case nir_op_fneu32: 1133 case nir_op_sne: 1134 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1135 cond_invert = true; 1136 break; 1137 case nir_op_ine32: 1138 vir_set_pf(c, vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ); 1139 cond_invert = true; 1140 break; 1141 1142 case nir_op_fge32: 1143 case nir_op_sge: 1144 vir_set_pf(c, vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1145 break; 1146 case nir_op_ige32: 1147 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1148 cond_invert = true; 1149 break; 1150 case nir_op_uge32: 1151 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); 1152 cond_invert = true; 1153 break; 1154 1155 case nir_op_slt: 1156 case nir_op_flt32: 1157 vir_set_pf(c, vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN); 1158 break; 1159 case nir_op_ilt32: 1160 vir_set_pf(c, vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC); 1161 break; 1162 case nir_op_ult32: 1163 vir_set_pf(c, vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC); 1164 break; 1165 1166 case nir_op_i2b32: 1167 vir_set_pf(c, vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); 1168 cond_invert = true; 1169 break; 1170 1171 case nir_op_f2b32: 1172 vir_set_pf(c, vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ); 1173 cond_invert = true; 1174 break; 1175 1176 default: 1177 return false; 1178 } 1179 1180 *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA; 1181 1182 return true; 1183} 1184 1185/* Finds an ALU instruction that generates our src value that could 1186 * (potentially) be greedily emitted in the consuming instruction. 1187 */ 1188static struct nir_alu_instr * 1189ntq_get_alu_parent(nir_src src) 1190{ 1191 if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu) 1192 return NULL; 1193 nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr); 1194 if (!instr) 1195 return NULL; 1196 1197 /* If the ALU instr's srcs are non-SSA, then we would have to avoid 1198 * moving emission of the ALU instr down past another write of the 1199 * src. 1200 */ 1201 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1202 if (!instr->src[i].src.is_ssa) 1203 return NULL; 1204 } 1205 1206 return instr; 1207} 1208 1209/* Turns a NIR bool into a condition code to predicate on. 
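 *
 * If the flags already contain this bool we just reuse them, and if the bool
 * was produced by a comparison we re-emit the comparison itself; otherwise we
 * fall back to testing the 0/~0 value with PUSHZ.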
*/ 1210static enum v3d_qpu_cond 1211ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src) 1212{ 1213 struct qreg qsrc = ntq_get_src(c, src, 0); 1214 /* skip if we already have src in the flags */ 1215 if (qsrc.file == QFILE_TEMP && c->flags_temp == qsrc.index) 1216 return c->flags_cond; 1217 1218 nir_alu_instr *compare = ntq_get_alu_parent(src); 1219 if (!compare) 1220 goto out; 1221 1222 enum v3d_qpu_cond cond; 1223 if (ntq_emit_comparison(c, compare, &cond)) 1224 return cond; 1225 1226out: 1227 1228 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)), 1229 V3D_QPU_PF_PUSHZ); 1230 return V3D_QPU_COND_IFNA; 1231} 1232 1233static struct qreg 1234ntq_emit_cond_to_bool(struct v3d_compile *c, enum v3d_qpu_cond cond) 1235{ 1236 struct qreg result = 1237 vir_MOV(c, vir_SEL(c, cond, 1238 vir_uniform_ui(c, ~0), 1239 vir_uniform_ui(c, 0))); 1240 c->flags_temp = result.index; 1241 c->flags_cond = cond; 1242 return result; 1243} 1244 1245static void 1246ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr) 1247{ 1248 /* This should always be lowered to ALU operations for V3D. */ 1249 assert(!instr->dest.saturate); 1250 1251 /* Vectors are special in that they have non-scalarized writemasks, 1252 * and just take the first swizzle channel for each argument in order 1253 * into each writemask channel. 1254 */ 1255 if (instr->op == nir_op_vec2 || 1256 instr->op == nir_op_vec3 || 1257 instr->op == nir_op_vec4) { 1258 struct qreg srcs[4]; 1259 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1260 srcs[i] = ntq_get_src(c, instr->src[i].src, 1261 instr->src[i].swizzle[0]); 1262 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 1263 ntq_store_dest(c, &instr->dest.dest, i, 1264 vir_MOV(c, srcs[i])); 1265 return; 1266 } 1267 1268 /* General case: We can just grab the one used channel per src. 
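         * (ALU instructions were scalarized by nir_lower_alu_to_scalar in
         * v3d_optimize_nir, so only a single writemask bit is set.)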
*/ 1269 struct qreg src[nir_op_infos[instr->op].num_inputs]; 1270 for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 1271 src[i] = ntq_get_alu_src(c, instr, i); 1272 } 1273 1274 struct qreg result; 1275 1276 switch (instr->op) { 1277 case nir_op_mov: 1278 result = vir_MOV(c, src[0]); 1279 break; 1280 1281 case nir_op_fneg: 1282 result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31)); 1283 break; 1284 case nir_op_ineg: 1285 result = vir_NEG(c, src[0]); 1286 break; 1287 1288 case nir_op_fmul: 1289 result = vir_FMUL(c, src[0], src[1]); 1290 break; 1291 case nir_op_fadd: 1292 result = vir_FADD(c, src[0], src[1]); 1293 break; 1294 case nir_op_fsub: 1295 result = vir_FSUB(c, src[0], src[1]); 1296 break; 1297 case nir_op_fmin: 1298 result = vir_FMIN(c, src[0], src[1]); 1299 break; 1300 case nir_op_fmax: 1301 result = vir_FMAX(c, src[0], src[1]); 1302 break; 1303 1304 case nir_op_f2i32: { 1305 nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src); 1306 if (src0_alu && src0_alu->op == nir_op_fround_even) { 1307 result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0)); 1308 } else { 1309 result = vir_FTOIZ(c, src[0]); 1310 } 1311 break; 1312 } 1313 1314 case nir_op_f2u32: 1315 result = vir_FTOUZ(c, src[0]); 1316 break; 1317 case nir_op_i2f32: 1318 result = vir_ITOF(c, src[0]); 1319 break; 1320 case nir_op_u2f32: 1321 result = vir_UTOF(c, src[0]); 1322 break; 1323 case nir_op_b2f32: 1324 result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); 1325 break; 1326 case nir_op_b2i32: 1327 result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); 1328 break; 1329 1330 case nir_op_iadd: 1331 result = vir_ADD(c, src[0], src[1]); 1332 break; 1333 case nir_op_ushr: 1334 result = vir_SHR(c, src[0], src[1]); 1335 break; 1336 case nir_op_isub: 1337 result = vir_SUB(c, src[0], src[1]); 1338 break; 1339 case nir_op_ishr: 1340 result = vir_ASR(c, src[0], src[1]); 1341 break; 1342 case nir_op_ishl: 1343 result = vir_SHL(c, src[0], src[1]); 1344 break; 1345 case nir_op_imin: 1346 result = vir_MIN(c, src[0], src[1]); 1347 break; 1348 case nir_op_umin: 1349 result = vir_UMIN(c, src[0], src[1]); 1350 break; 1351 case nir_op_imax: 1352 result = vir_MAX(c, src[0], src[1]); 1353 break; 1354 case nir_op_umax: 1355 result = vir_UMAX(c, src[0], src[1]); 1356 break; 1357 case nir_op_iand: 1358 result = vir_AND(c, src[0], src[1]); 1359 break; 1360 case nir_op_ior: 1361 result = vir_OR(c, src[0], src[1]); 1362 break; 1363 case nir_op_ixor: 1364 result = vir_XOR(c, src[0], src[1]); 1365 break; 1366 case nir_op_inot: 1367 result = vir_NOT(c, src[0]); 1368 break; 1369 1370 case nir_op_ufind_msb: 1371 result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0])); 1372 break; 1373 1374 case nir_op_imul: 1375 result = vir_UMUL(c, src[0], src[1]); 1376 break; 1377 1378 case nir_op_seq: 1379 case nir_op_sne: 1380 case nir_op_sge: 1381 case nir_op_slt: { 1382 enum v3d_qpu_cond cond; 1383 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); 1384 assert(ok); 1385 result = vir_MOV(c, vir_SEL(c, cond, 1386 vir_uniform_f(c, 1.0), 1387 vir_uniform_f(c, 0.0))); 1388 c->flags_temp = result.index; 1389 c->flags_cond = cond; 1390 break; 1391 } 1392 1393 case nir_op_i2b32: 1394 case nir_op_f2b32: 1395 case nir_op_feq32: 1396 case nir_op_fneu32: 1397 case nir_op_fge32: 1398 case nir_op_flt32: 1399 case nir_op_ieq32: 1400 case nir_op_ine32: 1401 case nir_op_ige32: 1402 case nir_op_uge32: 1403 case nir_op_ilt32: 1404 case nir_op_ult32: { 1405 enum v3d_qpu_cond cond; 1406 ASSERTED bool ok = ntq_emit_comparison(c, instr, &cond); 1407 
assert(ok); 1408 result = ntq_emit_cond_to_bool(c, cond); 1409 break; 1410 } 1411 1412 case nir_op_b32csel: 1413 result = vir_MOV(c, 1414 vir_SEL(c, 1415 ntq_emit_bool_to_cond(c, instr->src[0].src), 1416 src[1], src[2])); 1417 break; 1418 1419 case nir_op_fcsel: 1420 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), src[0]), 1421 V3D_QPU_PF_PUSHZ); 1422 result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, 1423 src[1], src[2])); 1424 break; 1425 1426 case nir_op_frcp: 1427 result = vir_RECIP(c, src[0]); 1428 break; 1429 case nir_op_frsq: 1430 result = vir_RSQRT(c, src[0]); 1431 break; 1432 case nir_op_fexp2: 1433 result = vir_EXP(c, src[0]); 1434 break; 1435 case nir_op_flog2: 1436 result = vir_LOG(c, src[0]); 1437 break; 1438 1439 case nir_op_fceil: 1440 result = vir_FCEIL(c, src[0]); 1441 break; 1442 case nir_op_ffloor: 1443 result = vir_FFLOOR(c, src[0]); 1444 break; 1445 case nir_op_fround_even: 1446 result = vir_FROUND(c, src[0]); 1447 break; 1448 case nir_op_ftrunc: 1449 result = vir_FTRUNC(c, src[0]); 1450 break; 1451 1452 case nir_op_fsin: 1453 result = ntq_fsincos(c, src[0], false); 1454 break; 1455 case nir_op_fcos: 1456 result = ntq_fsincos(c, src[0], true); 1457 break; 1458 1459 case nir_op_fsign: 1460 result = ntq_fsign(c, src[0]); 1461 break; 1462 1463 case nir_op_fabs: { 1464 result = vir_FMOV(c, src[0]); 1465 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); 1466 break; 1467 } 1468 1469 case nir_op_iabs: 1470 result = vir_MAX(c, src[0], vir_NEG(c, src[0])); 1471 break; 1472 1473 case nir_op_fddx: 1474 case nir_op_fddx_coarse: 1475 case nir_op_fddx_fine: 1476 result = vir_FDX(c, src[0]); 1477 break; 1478 1479 case nir_op_fddy: 1480 case nir_op_fddy_coarse: 1481 case nir_op_fddy_fine: 1482 result = vir_FDY(c, src[0]); 1483 break; 1484 1485 case nir_op_uadd_carry: 1486 vir_set_pf(c, vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), 1487 V3D_QPU_PF_PUSHC); 1488 result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 1489 break; 1490 1491 case nir_op_pack_half_2x16_split: 1492 result = vir_VFPACK(c, src[0], src[1]); 1493 break; 1494 1495 case nir_op_unpack_half_2x16_split_x: 1496 result = vir_FMOV(c, src[0]); 1497 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); 1498 break; 1499 1500 case nir_op_unpack_half_2x16_split_y: 1501 result = vir_FMOV(c, src[0]); 1502 vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); 1503 break; 1504 1505 case nir_op_fquantize2f16: { 1506 /* F32 -> F16 -> F32 conversion */ 1507 struct qreg tmp = vir_FMOV(c, src[0]); 1508 vir_set_pack(c->defs[tmp.index], V3D_QPU_PACK_L); 1509 tmp = vir_FMOV(c, tmp); 1510 vir_set_unpack(c->defs[tmp.index], 0, V3D_QPU_UNPACK_L); 1511 1512 /* Check for denorm */ 1513 struct qreg abs_src = vir_FMOV(c, src[0]); 1514 vir_set_unpack(c->defs[abs_src.index], 0, V3D_QPU_UNPACK_ABS); 1515 struct qreg threshold = vir_uniform_f(c, ldexpf(1.0, -14)); 1516 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), abs_src, threshold), 1517 V3D_QPU_PF_PUSHC); 1518 1519 /* Return +/-0 for denorms */ 1520 struct qreg zero = 1521 vir_AND(c, src[0], vir_uniform_ui(c, 0x80000000)); 1522 result = vir_FMOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, tmp, zero)); 1523 break; 1524 } 1525 1526 default: 1527 fprintf(stderr, "unknown NIR ALU inst: "); 1528 nir_print_instr(&instr->instr, stderr); 1529 fprintf(stderr, "\n"); 1530 abort(); 1531 } 1532 1533 /* We have a scalar result, so the instruction should only have a 1534 * single channel written to. 
 */
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        ntq_store_dest(c, &instr->dest.dest,
                       ffs(instr->dest.write_mask) - 1, result);
}

/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit
 * specifier. They come from a register that's preloaded with 0xffffffff
 * (0xff gets you normal vec4 f16 RT0 writes), and when one is needed the low
 * 8 bits are shifted off the bottom and 0xff shifted in from the top.
 */
#define TLB_TYPE_F16_COLOR (3 << 6)
#define TLB_TYPE_I32_COLOR (1 << 6)
#define TLB_TYPE_F32_COLOR (0 << 6)
#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed!  7 = RT 0, 0 = RT 7. */
#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2)
#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2)
#define TLB_F16_SWAP_HI_LO (1 << 1)
#define TLB_VEC_SIZE_4_F16 (1 << 0)
#define TLB_VEC_SIZE_2_F16 (0 << 0)
#define TLB_VEC_SIZE_MINUS_1_SHIFT 0

/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z"
 * flag is set.
 */
#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4))
#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */
#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */
#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */
#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */

/* Stencil is a single 32-bit write. */
#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4))

static void
vir_emit_tlb_color_write(struct v3d_compile *c, unsigned rt)
{
        if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt])
                return;

        struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB);
        struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);

        nir_variable *var = c->output_color_var[rt];
        int num_components = glsl_get_vector_elements(var->type);
        uint32_t conf = 0xffffff00;
        struct qinst *inst;

        conf |= c->msaa_per_sample_output ? TLB_SAMPLE_MODE_PER_SAMPLE :
                                            TLB_SAMPLE_MODE_PER_PIXEL;
        conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT;

        if (c->fs_key->swap_color_rb & (1 << rt))
                num_components = MAX2(num_components, 3);
        assert(num_components != 0);

        enum glsl_base_type type = glsl_get_base_type(var->type);
        bool is_int_format = type == GLSL_TYPE_INT || type == GLSL_TYPE_UINT;
        bool is_32b_tlb_format = is_int_format ||
                                 (c->fs_key->f32_color_rb & (1 << rt));

        if (is_int_format) {
                /* The F32 vs I32 distinction was dropped in 4.2. */
                if (c->devinfo->ver < 42)
                        conf |= TLB_TYPE_I32_COLOR;
                else
                        conf |= TLB_TYPE_F32_COLOR;
                conf |= ((num_components - 1) << TLB_VEC_SIZE_MINUS_1_SHIFT);
        } else {
                if (c->fs_key->f32_color_rb & (1 << rt)) {
                        conf |= TLB_TYPE_F32_COLOR;
                        conf |= ((num_components - 1) <<
                                 TLB_VEC_SIZE_MINUS_1_SHIFT);
                } else {
                        conf |= TLB_TYPE_F16_COLOR;
                        conf |= TLB_F16_SWAP_HI_LO;
                        if (num_components >= 3)
                                conf |= TLB_VEC_SIZE_4_F16;
                        else
                                conf |= TLB_VEC_SIZE_2_F16;
                }
        }

        int num_samples = c->msaa_per_sample_output ? V3D_MAX_SAMPLES : 1;
        for (int i = 0; i < num_samples; i++) {
                struct qreg *color = c->msaa_per_sample_output ?
                        &c->sample_colors[(rt * V3D_MAX_SAMPLES + i) * 4] :
                        &c->outputs[var->data.driver_location * 4];

                struct qreg r = color[0];
                struct qreg g = color[1];
                struct qreg b = color[2];
                struct qreg a = color[3];

                if (c->fs_key->swap_color_rb & (1 << rt)) {
                        r = color[2];
                        b = color[0];
                }

                if (c->fs_key->sample_alpha_to_one)
                        a = vir_uniform_f(c, 1.0);

                if (is_32b_tlb_format) {
                        if (i == 0) {
                                inst = vir_MOV_dest(c, tlbu_reg, r);
                                inst->uniform =
                                        vir_get_uniform_index(c,
                                                              QUNIFORM_CONSTANT,
                                                              conf);
                        } else {
                                vir_MOV_dest(c, tlb_reg, r);
                        }

                        if (num_components >= 2)
                                vir_MOV_dest(c, tlb_reg, g);
                        if (num_components >= 3)
                                vir_MOV_dest(c, tlb_reg, b);
                        if (num_components >= 4)
                                vir_MOV_dest(c, tlb_reg, a);
                } else {
                        inst = vir_VFPACK_dest(c, tlb_reg, r, g);
                        if (conf != ~0 && i == 0) {
                                inst->dst = tlbu_reg;
                                inst->uniform =
                                        vir_get_uniform_index(c,
                                                              QUNIFORM_CONSTANT,
                                                              conf);
                        }

                        if (num_components >= 3)
                                vir_VFPACK_dest(c, tlb_reg, b, a);
                }
        }
}

static void
emit_frag_end(struct v3d_compile *c)
{
        /* If the shader has no non-TLB side effects and doesn't write Z
         * we can promote it to enabling early_fragment_tests even
         * if the user didn't.
         */
        if (c->output_position_index == -1 &&
            !(c->s->info.num_images || c->s->info.num_ssbos)) {
                c->s->info.fs.early_fragment_tests = true;
        }

        if (c->output_sample_mask_index != -1) {
                vir_SETMSF_dest(c, vir_nop_reg(),
                                vir_AND(c,
                                        vir_MSF(c),
                                        c->outputs[c->output_sample_mask_index]));
        }

        bool has_any_tlb_color_write = false;
        for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) {
                if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt])
                        has_any_tlb_color_write = true;
        }

        if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) {
                struct nir_variable *var = c->output_color_var[0];
                struct qreg *color = &c->outputs[var->data.driver_location * 4];

                vir_SETMSF_dest(c, vir_nop_reg(),
                                vir_AND(c,
                                        vir_MSF(c),
                                        vir_FTOC(c, color[3])));
        }

        struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU);
        if (c->output_position_index != -1 &&
            !c->s->info.fs.early_fragment_tests) {
                struct qinst *inst = vir_MOV_dest(c, tlbu_reg,
                                                  c->outputs[c->output_position_index]);
                uint8_t tlb_specifier = TLB_TYPE_DEPTH;

                if (c->devinfo->ver >= 42) {
                        tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL |
                                          TLB_SAMPLE_MODE_PER_PIXEL);
                } else
                        tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL;

                inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                      tlb_specifier |
                                                      0xffffff00);
                c->writes_z = true;
        } else if (c->s->info.fs.uses_discard ||
                   !c->s->info.fs.early_fragment_tests ||
                   c->fs_key->sample_alpha_to_coverage ||
                   !has_any_tlb_color_write) {
                /* Emit passthrough Z if it needed to be delayed until shader
                 * end due to potential discards.
                 *
                 * Since (single-threaded) fragment shaders always need a TLB
                 * write, emit passthrough Z if we didn't have any color
                 * buffers and flag us as potentially discarding, so that we
                 * can use Z as the TLB write.
1733 */ 1734 c->s->info.fs.uses_discard = true; 1735 1736 struct qinst *inst = vir_MOV_dest(c, tlbu_reg, 1737 vir_nop_reg()); 1738 uint8_t tlb_specifier = TLB_TYPE_DEPTH; 1739 1740 if (c->devinfo->ver >= 42) { 1741 /* The spec says the PER_PIXEL flag is ignored for 1742 * invariant writes, but the simulator demands it. 1743 */ 1744 tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | 1745 TLB_SAMPLE_MODE_PER_PIXEL); 1746 } else { 1747 tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; 1748 } 1749 1750 inst->uniform = vir_get_uniform_index(c, 1751 QUNIFORM_CONSTANT, 1752 tlb_specifier | 1753 0xffffff00); 1754 c->writes_z = true; 1755 } 1756 1757 /* XXX: Performance improvement: Merge Z write and color writes TLB 1758 * uniform setup 1759 */ 1760 for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) 1761 vir_emit_tlb_color_write(c, rt); 1762} 1763 1764static inline void 1765vir_VPM_WRITE_indirect(struct v3d_compile *c, 1766 struct qreg val, 1767 struct qreg vpm_index, 1768 bool uniform_vpm_index) 1769{ 1770 assert(c->devinfo->ver >= 40); 1771 if (uniform_vpm_index) 1772 vir_STVPMV(c, vpm_index, val); 1773 else 1774 vir_STVPMD(c, vpm_index, val); 1775} 1776 1777static void 1778vir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) 1779{ 1780 if (c->devinfo->ver >= 40) { 1781 vir_VPM_WRITE_indirect(c, val, 1782 vir_uniform_ui(c, vpm_index), true); 1783 } else { 1784 /* XXX: v3d33_vir_vpm_write_setup(c); */ 1785 vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); 1786 } 1787} 1788 1789static void 1790emit_vert_end(struct v3d_compile *c) 1791{ 1792 /* GFXH-1684: VPM writes need to be complete by the end of the shader. 1793 */ 1794 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) 1795 vir_VPMWT(c); 1796} 1797 1798static void 1799emit_geom_end(struct v3d_compile *c) 1800{ 1801 /* GFXH-1684: VPM writes need to be complete by the end of the shader. 1802 */ 1803 if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) 1804 vir_VPMWT(c); 1805} 1806 1807static bool 1808mem_vectorize_callback(unsigned align_mul, unsigned align_offset, 1809 unsigned bit_size, 1810 unsigned num_components, 1811 nir_intrinsic_instr *low, 1812 nir_intrinsic_instr *high, 1813 void *data) 1814{ 1815 /* Our backend is 32-bit only at present */ 1816 if (bit_size != 32) 1817 return false; 1818 1819 if (align_mul % 4 != 0 || align_offset % 4 != 0) 1820 return false; 1821 1822 /* Vector accesses wrap at 16-byte boundaries so we can't vectorize 1823 * if the resulting vector crosses a 16-byte boundary. 1824 */ 1825 assert(util_is_power_of_two_nonzero(align_mul)); 1826 align_mul = MIN2(align_mul, 16); 1827 align_offset &= 0xf; 1828 if (16 - align_mul + align_offset + num_components * 4 > 16) 1829 return false; 1830 1831 return true; 1832} 1833 1834void 1835v3d_optimize_nir(struct v3d_compile *c, struct nir_shader *s) 1836{ 1837 bool progress; 1838 unsigned lower_flrp = 1839 (s->options->lower_flrp16 ? 16 : 0) | 1840 (s->options->lower_flrp32 ? 32 : 0) | 1841 (s->options->lower_flrp64 ? 
64 : 0); 1842 1843 do { 1844 progress = false; 1845 1846 NIR_PASS_V(s, nir_lower_vars_to_ssa); 1847 NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL); 1848 NIR_PASS(progress, s, nir_lower_phis_to_scalar, false); 1849 NIR_PASS(progress, s, nir_copy_prop); 1850 NIR_PASS(progress, s, nir_opt_remove_phis); 1851 NIR_PASS(progress, s, nir_opt_dce); 1852 NIR_PASS(progress, s, nir_opt_dead_cf); 1853 NIR_PASS(progress, s, nir_opt_cse); 1854 NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); 1855 NIR_PASS(progress, s, nir_opt_algebraic); 1856 NIR_PASS(progress, s, nir_opt_constant_folding); 1857 1858 nir_load_store_vectorize_options vectorize_opts = { 1859 .modes = nir_var_mem_ssbo | nir_var_mem_ubo | 1860 nir_var_mem_push_const | nir_var_mem_shared | 1861 nir_var_mem_global, 1862 .callback = mem_vectorize_callback, 1863 .robust_modes = 0, 1864 }; 1865 NIR_PASS(progress, s, nir_opt_load_store_vectorize, &vectorize_opts); 1866 1867 if (lower_flrp != 0) { 1868 bool lower_flrp_progress = false; 1869 1870 NIR_PASS(lower_flrp_progress, s, nir_lower_flrp, 1871 lower_flrp, 1872 false /* always_precise */); 1873 if (lower_flrp_progress) { 1874 NIR_PASS(progress, s, nir_opt_constant_folding); 1875 progress = true; 1876 } 1877 1878 /* Nothing should rematerialize any flrps, so we only 1879 * need to do this lowering once. 1880 */ 1881 lower_flrp = 0; 1882 } 1883 1884 NIR_PASS(progress, s, nir_opt_undef); 1885 NIR_PASS(progress, s, nir_lower_undef_to_zero); 1886 1887 if (c && !c->disable_loop_unrolling && 1888 s->options->max_unroll_iterations > 0) { 1889 bool local_progress = false; 1890 NIR_PASS(local_progress, s, nir_opt_loop_unroll); 1891 c->unrolled_any_loops |= local_progress; 1892 progress |= local_progress; 1893 } 1894 } while (progress); 1895 1896 nir_move_options sink_opts = 1897 nir_move_const_undef | nir_move_comparisons | nir_move_copies | 1898 nir_move_load_ubo; 1899 NIR_PASS(progress, s, nir_opt_sink, sink_opts); 1900 1901 NIR_PASS(progress, s, nir_opt_move, nir_move_load_ubo); 1902} 1903 1904static int 1905driver_location_compare(const nir_variable *a, const nir_variable *b) 1906{ 1907 return a->data.driver_location == b->data.driver_location ? 1908 a->data.location_frac - b->data.location_frac : 1909 a->data.driver_location - b->data.driver_location; 1910} 1911 1912static struct qreg 1913ntq_emit_vpm_read(struct v3d_compile *c, 1914 uint32_t *num_components_queued, 1915 uint32_t *remaining, 1916 uint32_t vpm_index) 1917{ 1918 struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); 1919 1920 if (c->devinfo->ver >= 40 ) { 1921 return vir_LDVPMV_IN(c, 1922 vir_uniform_ui(c, 1923 (*num_components_queued)++)); 1924 } 1925 1926 if (*num_components_queued != 0) { 1927 (*num_components_queued)--; 1928 return vir_MOV(c, vpm); 1929 } 1930 1931 uint32_t num_components = MIN2(*remaining, 32); 1932 1933 v3d33_vir_vpm_read_setup(c, num_components); 1934 1935 *num_components_queued = num_components - 1; 1936 *remaining -= num_components; 1937 1938 return vir_MOV(c, vpm); 1939} 1940 1941static void 1942ntq_setup_vs_inputs(struct v3d_compile *c) 1943{ 1944 /* Figure out how many components of each vertex attribute the shader 1945 * uses. Each variable should have been split to individual 1946 * components and unused ones DCEed. The vertex fetcher will load 1947 * from the start of the attribute to the number of components we 1948 * declare we need in c->vattr_sizes[]. 
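         * For example, a vec2 attribute with location_frac 1 results in
         * vattr_sizes[loc] = 3.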
1949 * 1950 * BGRA vertex attributes are a bit special: since we implement these 1951 * as RGBA swapping R/B components we always need at least 3 components 1952 * if component 0 is read. 1953 */ 1954 nir_foreach_shader_in_variable(var, c->s) { 1955 /* No VS attribute array support. */ 1956 assert(MAX2(glsl_get_length(var->type), 1) == 1); 1957 1958 unsigned loc = var->data.driver_location; 1959 int start_component = var->data.location_frac; 1960 int num_components = glsl_get_components(var->type); 1961 1962 c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc], 1963 start_component + num_components); 1964 1965 /* Handle BGRA inputs */ 1966 if (start_component == 0 && 1967 c->vs_key->va_swap_rb_mask & (1 << var->data.location)) { 1968 c->vattr_sizes[loc] = MAX2(3, c->vattr_sizes[loc]); 1969 } 1970 } 1971 1972 unsigned num_components = 0; 1973 uint32_t vpm_components_queued = 0; 1974 bool uses_iid = BITSET_TEST(c->s->info.system_values_read, 1975 SYSTEM_VALUE_INSTANCE_ID) || 1976 BITSET_TEST(c->s->info.system_values_read, 1977 SYSTEM_VALUE_INSTANCE_INDEX); 1978 bool uses_biid = BITSET_TEST(c->s->info.system_values_read, 1979 SYSTEM_VALUE_BASE_INSTANCE); 1980 bool uses_vid = BITSET_TEST(c->s->info.system_values_read, 1981 SYSTEM_VALUE_VERTEX_ID) || 1982 BITSET_TEST(c->s->info.system_values_read, 1983 SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 1984 1985 num_components += uses_iid; 1986 num_components += uses_biid; 1987 num_components += uses_vid; 1988 1989 for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) 1990 num_components += c->vattr_sizes[i]; 1991 1992 if (uses_iid) { 1993 c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, 1994 &num_components, ~0); 1995 } 1996 1997 if (uses_biid) { 1998 c->biid = ntq_emit_vpm_read(c, &vpm_components_queued, 1999 &num_components, ~0); 2000 } 2001 2002 if (uses_vid) { 2003 c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, 2004 &num_components, ~0); 2005 } 2006 2007 /* The actual loads will happen directly in nir_intrinsic_load_input 2008 * on newer versions. 2009 */ 2010 if (c->devinfo->ver >= 40) 2011 return; 2012 2013 for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { 2014 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2015 (loc + 1) * 4); 2016 2017 for (int i = 0; i < c->vattr_sizes[loc]; i++) { 2018 c->inputs[loc * 4 + i] = 2019 ntq_emit_vpm_read(c, 2020 &vpm_components_queued, 2021 &num_components, 2022 loc * 4 + i); 2023 2024 } 2025 } 2026 2027 if (c->devinfo->ver >= 40) { 2028 assert(vpm_components_queued == num_components); 2029 } else { 2030 assert(vpm_components_queued == 0); 2031 assert(num_components == 0); 2032 } 2033} 2034 2035static bool 2036program_reads_point_coord(struct v3d_compile *c) 2037{ 2038 nir_foreach_shader_in_variable(var, c->s) { 2039 if (util_varying_is_point_coord(var->data.location, 2040 c->fs_key->point_sprite_mask)) { 2041 return true; 2042 } 2043 } 2044 2045 return false; 2046} 2047 2048static void 2049ntq_setup_gs_inputs(struct v3d_compile *c) 2050{ 2051 nir_sort_variables_with_modes(c->s, driver_location_compare, 2052 nir_var_shader_in); 2053 2054 nir_foreach_shader_in_variable(var, c->s) { 2055 /* All GS inputs are arrays with as many entries as vertices 2056 * in the input primitive, but here we only care about the 2057 * per-vertex input type. 
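                 *
                 * For example (illustrative): a triangle-input GS declaring
                 * "in vec4 foo[]" sees var->type here as an array of three
                 * vec4s, and the per-vertex type extracted below is the
                 * vec4 itself.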
2058 */ 2059 assert(glsl_type_is_array(var->type)); 2060 const struct glsl_type *type = glsl_get_array_element(var->type); 2061 unsigned array_len = MAX2(glsl_get_length(type), 1); 2062 unsigned loc = var->data.driver_location; 2063 2064 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2065 (loc + array_len) * 4); 2066 2067 if (var->data.compact) { 2068 for (unsigned j = 0; j < array_len; j++) { 2069 unsigned input_idx = c->num_inputs++; 2070 unsigned loc_frac = var->data.location_frac + j; 2071 unsigned loc = var->data.location + loc_frac / 4; 2072 unsigned comp = loc_frac % 4; 2073 c->input_slots[input_idx] = 2074 v3d_slot_from_slot_and_component(loc, comp); 2075 } 2076 continue; 2077 } 2078 2079 for (unsigned j = 0; j < array_len; j++) { 2080 unsigned num_elements = glsl_get_vector_elements(type); 2081 for (unsigned k = 0; k < num_elements; k++) { 2082 unsigned chan = var->data.location_frac + k; 2083 unsigned input_idx = c->num_inputs++; 2084 struct v3d_varying_slot slot = 2085 v3d_slot_from_slot_and_component(var->data.location + j, chan); 2086 c->input_slots[input_idx] = slot; 2087 } 2088 } 2089 } 2090} 2091 2092 2093static void 2094ntq_setup_fs_inputs(struct v3d_compile *c) 2095{ 2096 nir_sort_variables_with_modes(c->s, driver_location_compare, 2097 nir_var_shader_in); 2098 2099 nir_foreach_shader_in_variable(var, c->s) { 2100 unsigned var_len = glsl_count_vec4_slots(var->type, false, false); 2101 unsigned loc = var->data.driver_location; 2102 2103 uint32_t inputs_array_size = c->inputs_array_size; 2104 uint32_t inputs_array_required_size = (loc + var_len) * 4; 2105 resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 2106 inputs_array_required_size); 2107 resize_interp_array(c, &c->interp, &inputs_array_size, 2108 inputs_array_required_size); 2109 2110 if (var->data.location == VARYING_SLOT_POS) { 2111 emit_fragcoord_input(c, loc); 2112 } else if (var->data.location == VARYING_SLOT_PRIMITIVE_ID && 2113 !c->fs_key->has_gs) { 2114 /* If the fragment shader reads gl_PrimitiveID and we 2115 * don't have a geometry shader in the pipeline to write 2116 * it then we program the hardware to inject it as 2117 * an implicit varying. Take it from there. 
2118 */ 2119 c->inputs[loc * 4] = c->primitive_id; 2120 } else if (util_varying_is_point_coord(var->data.location, 2121 c->fs_key->point_sprite_mask)) { 2122 c->inputs[loc * 4 + 0] = c->point_x; 2123 c->inputs[loc * 4 + 1] = c->point_y; 2124 } else if (var->data.compact) { 2125 for (int j = 0; j < var_len; j++) 2126 emit_compact_fragment_input(c, loc, var, j); 2127 } else if (glsl_type_is_struct(var->type)) { 2128 for (int j = 0; j < var_len; j++) { 2129 emit_fragment_input(c, loc, var, j, 4); 2130 } 2131 } else { 2132 for (int j = 0; j < var_len; j++) { 2133 emit_fragment_input(c, loc, var, j, glsl_get_vector_elements(var->type)); 2134 } 2135 } 2136 } 2137} 2138 2139static void 2140ntq_setup_outputs(struct v3d_compile *c) 2141{ 2142 if (c->s->info.stage != MESA_SHADER_FRAGMENT) 2143 return; 2144 2145 nir_foreach_shader_out_variable(var, c->s) { 2146 unsigned array_len = MAX2(glsl_get_length(var->type), 1); 2147 unsigned loc = var->data.driver_location * 4; 2148 2149 assert(array_len == 1); 2150 (void)array_len; 2151 2152 for (int i = 0; i < 4 - var->data.location_frac; i++) { 2153 add_output(c, loc + var->data.location_frac + i, 2154 var->data.location, 2155 var->data.location_frac + i); 2156 } 2157 2158 switch (var->data.location) { 2159 case FRAG_RESULT_COLOR: 2160 c->output_color_var[0] = var; 2161 c->output_color_var[1] = var; 2162 c->output_color_var[2] = var; 2163 c->output_color_var[3] = var; 2164 break; 2165 case FRAG_RESULT_DATA0: 2166 case FRAG_RESULT_DATA1: 2167 case FRAG_RESULT_DATA2: 2168 case FRAG_RESULT_DATA3: 2169 c->output_color_var[var->data.location - 2170 FRAG_RESULT_DATA0] = var; 2171 break; 2172 case FRAG_RESULT_DEPTH: 2173 c->output_position_index = loc; 2174 break; 2175 case FRAG_RESULT_SAMPLE_MASK: 2176 c->output_sample_mask_index = loc; 2177 break; 2178 } 2179 } 2180} 2181 2182/** 2183 * Sets up the mapping from nir_register to struct qreg *. 2184 * 2185 * Each nir_register gets a struct qreg per 32-bit component being stored. 2186 */ 2187static void 2188ntq_setup_registers(struct v3d_compile *c, struct exec_list *list) 2189{ 2190 foreach_list_typed(nir_register, nir_reg, node, list) { 2191 unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 2192 struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 2193 array_len * 2194 nir_reg->num_components); 2195 2196 _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 2197 2198 for (int i = 0; i < array_len * nir_reg->num_components; i++) 2199 qregs[i] = vir_get_temp(c); 2200 } 2201} 2202 2203static void 2204ntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) 2205{ 2206 /* XXX perf: Experiment with using immediate loads to avoid having 2207 * these end up in the uniform stream. Watch out for breaking the 2208 * small immediates optimization in the process! 
2209 */ 2210 struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 2211 for (int i = 0; i < instr->def.num_components; i++) 2212 qregs[i] = vir_uniform_ui(c, instr->value[i].u32); 2213 2214 _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); 2215} 2216 2217static void 2218ntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) 2219{ 2220 unsigned image_index = nir_src_as_uint(instr->src[0]); 2221 bool is_array = nir_intrinsic_image_array(instr); 2222 2223 assert(nir_src_as_uint(instr->src[1]) == 0); 2224 2225 ntq_store_dest(c, &instr->dest, 0, 2226 vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); 2227 if (instr->num_components > 1) { 2228 ntq_store_dest(c, &instr->dest, 1, 2229 vir_uniform(c, 2230 instr->num_components == 2 && is_array ? 2231 QUNIFORM_IMAGE_ARRAY_SIZE : 2232 QUNIFORM_IMAGE_HEIGHT, 2233 image_index)); 2234 } 2235 if (instr->num_components > 2) { 2236 ntq_store_dest(c, &instr->dest, 2, 2237 vir_uniform(c, 2238 is_array ? 2239 QUNIFORM_IMAGE_ARRAY_SIZE : 2240 QUNIFORM_IMAGE_DEPTH, 2241 image_index)); 2242 } 2243} 2244 2245static void 2246vir_emit_tlb_color_read(struct v3d_compile *c, nir_intrinsic_instr *instr) 2247{ 2248 assert(c->s->info.stage == MESA_SHADER_FRAGMENT); 2249 2250 int rt = nir_src_as_uint(instr->src[0]); 2251 assert(rt < V3D_MAX_DRAW_BUFFERS); 2252 2253 int sample_index = nir_intrinsic_base(instr) ; 2254 assert(sample_index < V3D_MAX_SAMPLES); 2255 2256 int component = nir_intrinsic_component(instr); 2257 assert(component < 4); 2258 2259 /* We need to emit our TLB reads after we have acquired the scoreboard 2260 * lock, or the GPU will hang. Usually, we do our scoreboard locking on 2261 * the last thread switch to improve parallelism, however, that is only 2262 * guaranteed to happen before the tlb color writes. 2263 * 2264 * To fix that, we make sure we always emit a thread switch before the 2265 * first tlb color read. If that happens to be the last thread switch 2266 * we emit, then everything is fine, but otherwsie, if any code after 2267 * this point needs to emit additional thread switches, then we will 2268 * switch the strategy to locking the scoreboard on the first thread 2269 * switch instead -- see vir_emit_thrsw(). 2270 */ 2271 if (!c->emitted_tlb_load) { 2272 if (!c->last_thrsw_at_top_level) { 2273 assert(c->devinfo->ver >= 41); 2274 vir_emit_thrsw(c); 2275 } 2276 2277 c->emitted_tlb_load = true; 2278 } 2279 2280 struct qreg *color_reads_for_sample = 2281 &c->color_reads[(rt * V3D_MAX_SAMPLES + sample_index) * 4]; 2282 2283 if (color_reads_for_sample[component].file == QFILE_NULL) { 2284 enum pipe_format rt_format = c->fs_key->color_fmt[rt].format; 2285 int num_components = 2286 util_format_get_nr_components(rt_format); 2287 2288 const bool swap_rb = c->fs_key->swap_color_rb & (1 << rt); 2289 if (swap_rb) 2290 num_components = MAX2(num_components, 3); 2291 2292 nir_variable *var = c->output_color_var[rt]; 2293 enum glsl_base_type type = glsl_get_base_type(var->type); 2294 2295 bool is_int_format = type == GLSL_TYPE_INT || 2296 type == GLSL_TYPE_UINT; 2297 2298 bool is_32b_tlb_format = is_int_format || 2299 (c->fs_key->f32_color_rb & (1 << rt)); 2300 2301 int num_samples = c->fs_key->msaa ? V3D_MAX_SAMPLES : 1; 2302 2303 uint32_t conf = 0xffffff00; 2304 conf |= c->fs_key->msaa ? TLB_SAMPLE_MODE_PER_SAMPLE : 2305 TLB_SAMPLE_MODE_PER_PIXEL; 2306 conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; 2307 2308 if (is_32b_tlb_format) { 2309 /* The F32 vs I32 distinction was dropped in 4.2. 
*/ 2310 conf |= (c->devinfo->ver < 42 && is_int_format) ? 2311 TLB_TYPE_I32_COLOR : TLB_TYPE_F32_COLOR; 2312 2313 conf |= ((num_components - 1) << 2314 TLB_VEC_SIZE_MINUS_1_SHIFT); 2315 } else { 2316 conf |= TLB_TYPE_F16_COLOR; 2317 conf |= TLB_F16_SWAP_HI_LO; 2318 2319 if (num_components >= 3) 2320 conf |= TLB_VEC_SIZE_4_F16; 2321 else 2322 conf |= TLB_VEC_SIZE_2_F16; 2323 } 2324 2325 2326 for (int i = 0; i < num_samples; i++) { 2327 struct qreg r, g, b, a; 2328 if (is_32b_tlb_format) { 2329 r = conf != 0xffffffff && i == 0? 2330 vir_TLBU_COLOR_READ(c, conf) : 2331 vir_TLB_COLOR_READ(c); 2332 if (num_components >= 2) 2333 g = vir_TLB_COLOR_READ(c); 2334 if (num_components >= 3) 2335 b = vir_TLB_COLOR_READ(c); 2336 if (num_components >= 4) 2337 a = vir_TLB_COLOR_READ(c); 2338 } else { 2339 struct qreg rg = conf != 0xffffffff && i == 0 ? 2340 vir_TLBU_COLOR_READ(c, conf) : 2341 vir_TLB_COLOR_READ(c); 2342 r = vir_FMOV(c, rg); 2343 vir_set_unpack(c->defs[r.index], 0, 2344 V3D_QPU_UNPACK_L); 2345 g = vir_FMOV(c, rg); 2346 vir_set_unpack(c->defs[g.index], 0, 2347 V3D_QPU_UNPACK_H); 2348 2349 if (num_components > 2) { 2350 struct qreg ba = vir_TLB_COLOR_READ(c); 2351 b = vir_FMOV(c, ba); 2352 vir_set_unpack(c->defs[b.index], 0, 2353 V3D_QPU_UNPACK_L); 2354 a = vir_FMOV(c, ba); 2355 vir_set_unpack(c->defs[a.index], 0, 2356 V3D_QPU_UNPACK_H); 2357 } 2358 } 2359 2360 struct qreg *color_reads = 2361 &c->color_reads[(rt * V3D_MAX_SAMPLES + i) * 4]; 2362 2363 color_reads[0] = swap_rb ? b : r; 2364 if (num_components >= 2) 2365 color_reads[1] = g; 2366 if (num_components >= 3) 2367 color_reads[2] = swap_rb ? r : b; 2368 if (num_components >= 4) 2369 color_reads[3] = a; 2370 } 2371 } 2372 2373 assert(color_reads_for_sample[component].file != QFILE_NULL); 2374 ntq_store_dest(c, &instr->dest, 0, 2375 vir_MOV(c, color_reads_for_sample[component])); 2376} 2377 2378static void 2379ntq_emit_load_uniform(struct v3d_compile *c, nir_intrinsic_instr *instr) 2380{ 2381 if (nir_src_is_const(instr->src[0])) { 2382 int offset = (nir_intrinsic_base(instr) + 2383 nir_src_as_uint(instr->src[0])); 2384 assert(offset % 4 == 0); 2385 /* We need dwords */ 2386 offset = offset / 4; 2387 for (int i = 0; i < instr->num_components; i++) { 2388 ntq_store_dest(c, &instr->dest, i, 2389 vir_uniform(c, QUNIFORM_UNIFORM, 2390 offset + i)); 2391 } 2392 } else { 2393 ntq_emit_tmu_general(c, instr, false); 2394 } 2395} 2396 2397static void 2398ntq_emit_load_input(struct v3d_compile *c, nir_intrinsic_instr *instr) 2399{ 2400 /* XXX: Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset). 2401 * 2402 * Right now the driver sets PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR even 2403 * if we don't support non-uniform offsets because we also set the 2404 * lower_all_io_to_temps option in the NIR compiler. This ensures that 2405 * any indirect indexing on in/out variables is turned into indirect 2406 * indexing on temporary variables instead, that we handle by lowering 2407 * to scratch. If we implement non-uniform offset here we might be able 2408 * to avoid the temp and scratch lowering, which involves copying from 2409 * the input to the temp variable, possibly making code more optimal. 2410 */ 2411 unsigned offset = 2412 nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[0]); 2413 2414 if (c->s->info.stage != MESA_SHADER_FRAGMENT && c->devinfo->ver >= 40) { 2415 /* Emit the LDVPM directly now, rather than at the top 2416 * of the shader like we did for V3D 3.x (which needs 2417 * vpmsetup when not just taking the next offset). 
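                 *
                 * The attribute's VPM index is computed below by first
                 * skipping the implicitly read IID/BIID/VID slots and then
                 * the sizes of all the attributes before this one, matching
                 * the layout set up in ntq_setup_vs_inputs().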
2418 * 2419 * Note that delaying like this may introduce stalls, 2420 * as LDVPMV takes a minimum of 1 instruction but may 2421 * be slower if the VPM unit is busy with another QPU. 2422 */ 2423 int index = 0; 2424 if (BITSET_TEST(c->s->info.system_values_read, 2425 SYSTEM_VALUE_INSTANCE_ID)) { 2426 index++; 2427 } 2428 if (BITSET_TEST(c->s->info.system_values_read, 2429 SYSTEM_VALUE_BASE_INSTANCE)) { 2430 index++; 2431 } 2432 if (BITSET_TEST(c->s->info.system_values_read, 2433 SYSTEM_VALUE_VERTEX_ID)) { 2434 index++; 2435 } 2436 for (int i = 0; i < offset; i++) 2437 index += c->vattr_sizes[i]; 2438 index += nir_intrinsic_component(instr); 2439 for (int i = 0; i < instr->num_components; i++) { 2440 struct qreg vpm_offset = vir_uniform_ui(c, index++); 2441 ntq_store_dest(c, &instr->dest, i, 2442 vir_LDVPMV_IN(c, vpm_offset)); 2443 } 2444 } else { 2445 for (int i = 0; i < instr->num_components; i++) { 2446 int comp = nir_intrinsic_component(instr) + i; 2447 ntq_store_dest(c, &instr->dest, i, 2448 vir_MOV(c, c->inputs[offset * 4 + comp])); 2449 } 2450 } 2451} 2452 2453static void 2454ntq_emit_per_sample_color_write(struct v3d_compile *c, 2455 nir_intrinsic_instr *instr) 2456{ 2457 assert(instr->intrinsic == nir_intrinsic_store_tlb_sample_color_v3d); 2458 2459 unsigned rt = nir_src_as_uint(instr->src[1]); 2460 assert(rt < V3D_MAX_DRAW_BUFFERS); 2461 2462 unsigned sample_idx = nir_intrinsic_base(instr); 2463 assert(sample_idx < V3D_MAX_SAMPLES); 2464 2465 unsigned offset = (rt * V3D_MAX_SAMPLES + sample_idx) * 4; 2466 for (int i = 0; i < instr->num_components; i++) { 2467 c->sample_colors[offset + i] = 2468 vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2469 } 2470} 2471 2472static void 2473ntq_emit_color_write(struct v3d_compile *c, 2474 nir_intrinsic_instr *instr) 2475{ 2476 unsigned offset = (nir_intrinsic_base(instr) + 2477 nir_src_as_uint(instr->src[1])) * 4 + 2478 nir_intrinsic_component(instr); 2479 for (int i = 0; i < instr->num_components; i++) { 2480 c->outputs[offset + i] = 2481 vir_MOV(c, ntq_get_src(c, instr->src[0], i)); 2482 } 2483} 2484 2485static void 2486emit_store_output_gs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2487{ 2488 assert(instr->num_components == 1); 2489 2490 struct qreg offset = ntq_get_src(c, instr->src[1], 0); 2491 2492 uint32_t base_offset = nir_intrinsic_base(instr); 2493 2494 if (base_offset) 2495 offset = vir_ADD(c, vir_uniform_ui(c, base_offset), offset); 2496 2497 /* Usually, for VS or FS, we only emit outputs once at program end so 2498 * our VPM writes are never in non-uniform control flow, but this 2499 * is not true for GS, where we are emitting multiple vertices. 2500 */ 2501 if (vir_in_nonuniform_control_flow(c)) { 2502 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 2503 V3D_QPU_PF_PUSHZ); 2504 } 2505 2506 struct qreg val = ntq_get_src(c, instr->src[0], 0); 2507 2508 /* The offset isn’t necessarily dynamically uniform for a geometry 2509 * shader. This can happen if the shader sometimes doesn’t emit one of 2510 * the vertices. In that case subsequent vertices will be written to 2511 * different offsets in the VPM and we need to use the scatter write 2512 * instruction to have a different offset for each lane. 
2513 */ 2514 bool is_uniform_offset = 2515 !vir_in_nonuniform_control_flow(c) && 2516 !nir_src_is_divergent(instr->src[1]); 2517 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2518 2519 if (vir_in_nonuniform_control_flow(c)) { 2520 struct qinst *last_inst = 2521 (struct qinst *)c->cur_block->instructions.prev; 2522 vir_set_cond(last_inst, V3D_QPU_COND_IFA); 2523 } 2524} 2525 2526static void 2527emit_store_output_vs(struct v3d_compile *c, nir_intrinsic_instr *instr) 2528{ 2529 assert(c->s->info.stage == MESA_SHADER_VERTEX); 2530 assert(instr->num_components == 1); 2531 2532 uint32_t base = nir_intrinsic_base(instr); 2533 struct qreg val = ntq_get_src(c, instr->src[0], 0); 2534 2535 if (nir_src_is_const(instr->src[1])) { 2536 vir_VPM_WRITE(c, val, 2537 base + nir_src_as_uint(instr->src[1])); 2538 } else { 2539 struct qreg offset = vir_ADD(c, 2540 ntq_get_src(c, instr->src[1], 1), 2541 vir_uniform_ui(c, base)); 2542 bool is_uniform_offset = 2543 !vir_in_nonuniform_control_flow(c) && 2544 !nir_src_is_divergent(instr->src[1]); 2545 vir_VPM_WRITE_indirect(c, val, offset, is_uniform_offset); 2546 } 2547} 2548 2549static void 2550ntq_emit_store_output(struct v3d_compile *c, nir_intrinsic_instr *instr) 2551{ 2552 if (c->s->info.stage == MESA_SHADER_FRAGMENT) 2553 ntq_emit_color_write(c, instr); 2554 else if (c->s->info.stage == MESA_SHADER_GEOMETRY) 2555 emit_store_output_gs(c, instr); 2556 else 2557 emit_store_output_vs(c, instr); 2558} 2559 2560/** 2561 * This implementation is based on v3d_sample_{x,y}_offset() from 2562 * v3d_sample_offset.h. 2563 */ 2564static void 2565ntq_get_sample_offset(struct v3d_compile *c, struct qreg sample_idx, 2566 struct qreg *sx, struct qreg *sy) 2567{ 2568 sample_idx = vir_ITOF(c, sample_idx); 2569 2570 struct qreg offset_x = 2571 vir_FADD(c, vir_uniform_f(c, -0.125f), 2572 vir_FMUL(c, sample_idx, 2573 vir_uniform_f(c, 0.5f))); 2574 vir_set_pf(c, vir_FCMP_dest(c, vir_nop_reg(), 2575 vir_uniform_f(c, 2.0f), sample_idx), 2576 V3D_QPU_PF_PUSHC); 2577 offset_x = vir_SEL(c, V3D_QPU_COND_IFA, 2578 vir_FSUB(c, offset_x, vir_uniform_f(c, 1.25f)), 2579 offset_x); 2580 2581 struct qreg offset_y = 2582 vir_FADD(c, vir_uniform_f(c, -0.375f), 2583 vir_FMUL(c, sample_idx, 2584 vir_uniform_f(c, 0.25f))); 2585 *sx = offset_x; 2586 *sy = offset_y; 2587} 2588 2589/** 2590 * This implementation is based on get_centroid_offset() from fep.c. 
2591 */ 2592static void 2593ntq_get_barycentric_centroid(struct v3d_compile *c, 2594 struct qreg *out_x, 2595 struct qreg *out_y) 2596{ 2597 struct qreg sample_mask; 2598 if (c->output_sample_mask_index != -1) 2599 sample_mask = c->outputs[c->output_sample_mask_index]; 2600 else 2601 sample_mask = vir_MSF(c); 2602 2603 struct qreg i0 = vir_uniform_ui(c, 0); 2604 struct qreg i1 = vir_uniform_ui(c, 1); 2605 struct qreg i2 = vir_uniform_ui(c, 2); 2606 struct qreg i3 = vir_uniform_ui(c, 3); 2607 struct qreg i4 = vir_uniform_ui(c, 4); 2608 struct qreg i8 = vir_uniform_ui(c, 8); 2609 2610 /* sN = TRUE if sample N enabled in sample mask, FALSE otherwise */ 2611 struct qreg F = vir_uniform_ui(c, 0); 2612 struct qreg T = vir_uniform_ui(c, ~0); 2613 struct qreg s0 = vir_XOR(c, vir_AND(c, sample_mask, i1), i1); 2614 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 2615 s0 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 2616 struct qreg s1 = vir_XOR(c, vir_AND(c, sample_mask, i2), i2); 2617 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 2618 s1 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 2619 struct qreg s2 = vir_XOR(c, vir_AND(c, sample_mask, i4), i4); 2620 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 2621 s2 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 2622 struct qreg s3 = vir_XOR(c, vir_AND(c, sample_mask, i8), i8); 2623 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s3), V3D_QPU_PF_PUSHZ); 2624 s3 = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 2625 2626 /* sample_idx = s0 ? 0 : s2 ? 2 : s1 ? 1 : 3 */ 2627 struct qreg sample_idx = i3; 2628 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s1), V3D_QPU_PF_PUSHZ); 2629 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i1, sample_idx); 2630 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s2), V3D_QPU_PF_PUSHZ); 2631 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i2, sample_idx); 2632 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), s0), V3D_QPU_PF_PUSHZ); 2633 sample_idx = vir_SEL(c, V3D_QPU_COND_IFNA, i0, sample_idx); 2634 2635 /* Get offset at selected sample index */ 2636 struct qreg offset_x, offset_y; 2637 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 2638 2639 /* Select pixel center [offset=(0,0)] if two opposing samples (or none) 2640 * are selected. 
2641 */ 2642 struct qreg s0_and_s3 = vir_AND(c, s0, s3); 2643 struct qreg s1_and_s2 = vir_AND(c, s1, s2); 2644 2645 struct qreg use_center = vir_XOR(c, sample_mask, vir_uniform_ui(c, 0)); 2646 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 2647 use_center = vir_SEL(c, V3D_QPU_COND_IFA, T, F); 2648 use_center = vir_OR(c, use_center, s0_and_s3); 2649 use_center = vir_OR(c, use_center, s1_and_s2); 2650 2651 struct qreg zero = vir_uniform_f(c, 0.0f); 2652 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), use_center), V3D_QPU_PF_PUSHZ); 2653 offset_x = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_x); 2654 offset_y = vir_SEL(c, V3D_QPU_COND_IFNA, zero, offset_y); 2655 2656 *out_x = offset_x; 2657 *out_y = offset_y; 2658} 2659 2660static struct qreg 2661ntq_emit_load_interpolated_input(struct v3d_compile *c, 2662 struct qreg p, 2663 struct qreg C, 2664 struct qreg offset_x, 2665 struct qreg offset_y, 2666 unsigned mode) 2667{ 2668 if (mode == INTERP_MODE_FLAT) 2669 return C; 2670 2671 struct qreg sample_offset_x = 2672 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 2673 struct qreg sample_offset_y = 2674 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 2675 2676 struct qreg scaleX = 2677 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_x), 2678 offset_x); 2679 struct qreg scaleY = 2680 vir_FADD(c, vir_FSUB(c, vir_uniform_f(c, 0.5f), sample_offset_y), 2681 offset_y); 2682 2683 struct qreg pInterp = 2684 vir_FADD(c, p, vir_FADD(c, vir_FMUL(c, vir_FDX(c, p), scaleX), 2685 vir_FMUL(c, vir_FDY(c, p), scaleY))); 2686 2687 if (mode == INTERP_MODE_NOPERSPECTIVE) 2688 return vir_FADD(c, pInterp, C); 2689 2690 struct qreg w = c->payload_w; 2691 struct qreg wInterp = 2692 vir_FADD(c, w, vir_FADD(c, vir_FMUL(c, vir_FDX(c, w), scaleX), 2693 vir_FMUL(c, vir_FDY(c, w), scaleY))); 2694 2695 return vir_FADD(c, vir_FMUL(c, pInterp, wInterp), C); 2696} 2697 2698static void 2699emit_ldunifa(struct v3d_compile *c, struct qreg *result) 2700{ 2701 struct qinst *ldunifa = 2702 vir_add_inst(V3D_QPU_A_NOP, c->undef, c->undef, c->undef); 2703 ldunifa->qpu.sig.ldunifa = true; 2704 if (result) 2705 *result = vir_emit_def(c, ldunifa); 2706 else 2707 vir_emit_nondef(c, ldunifa); 2708 c->current_unifa_offset += 4; 2709} 2710 2711static void 2712ntq_emit_load_ubo_unifa(struct v3d_compile *c, nir_intrinsic_instr *instr) 2713{ 2714 /* Every ldunifa auto-increments the unifa address by 4 bytes, so our 2715 * current unifa offset is 4 bytes ahead of the offset of the last load. 2716 */ 2717 static const int32_t max_unifa_skip_dist = 2718 MAX_UNIFA_SKIP_DISTANCE - 4; 2719 2720 bool dynamic_src = !nir_src_is_const(instr->src[1]); 2721 uint32_t const_offset = 2722 dynamic_src ? 0 : nir_src_as_uint(instr->src[1]); 2723 2724 /* On OpenGL QUNIFORM_UBO_ADDR takes a UBO index 2725 * shifted up by 1 (0 is gallium's constant buffer 0). 2726 */ 2727 uint32_t index = nir_src_as_uint(instr->src[0]); 2728 if (c->key->environment == V3D_ENVIRONMENT_OPENGL) 2729 index++; 2730 2731 /* We can only keep track of the last unifa address we used with 2732 * constant offset loads. If the new load targets the same UBO and 2733 * is close enough to the previous load, we can skip the unifa register 2734 * write by emitting dummy ldunifa instructions to update the unifa 2735 * address. 
2736 */ 2737 bool skip_unifa = false; 2738 uint32_t ldunifa_skips = 0; 2739 if (dynamic_src) { 2740 c->current_unifa_block = NULL; 2741 } else if (c->cur_block == c->current_unifa_block && 2742 c->current_unifa_index == index && 2743 c->current_unifa_offset <= const_offset && 2744 c->current_unifa_offset + max_unifa_skip_dist >= const_offset) { 2745 skip_unifa = true; 2746 ldunifa_skips = (const_offset - c->current_unifa_offset) / 4; 2747 } else { 2748 c->current_unifa_block = c->cur_block; 2749 c->current_unifa_index = index; 2750 c->current_unifa_offset = const_offset; 2751 } 2752 2753 if (!skip_unifa) { 2754 struct qreg base_offset = 2755 vir_uniform(c, QUNIFORM_UBO_ADDR, 2756 v3d_unit_data_create(index, const_offset)); 2757 2758 struct qreg unifa = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_UNIFA); 2759 if (!dynamic_src) { 2760 vir_MOV_dest(c, unifa, base_offset); 2761 } else { 2762 vir_ADD_dest(c, unifa, base_offset, 2763 ntq_get_src(c, instr->src[1], 0)); 2764 } 2765 } else { 2766 for (int i = 0; i < ldunifa_skips; i++) 2767 emit_ldunifa(c, NULL); 2768 } 2769 2770 for (uint32_t i = 0; i < nir_intrinsic_dest_components(instr); i++) { 2771 struct qreg data; 2772 emit_ldunifa(c, &data); 2773 ntq_store_dest(c, &instr->dest, i, vir_MOV(c, data)); 2774 } 2775} 2776 2777static inline struct qreg 2778emit_load_local_invocation_index(struct v3d_compile *c) 2779{ 2780 return vir_SHR(c, c->cs_payload[1], 2781 vir_uniform_ui(c, 32 - c->local_invocation_index_bits)); 2782} 2783 2784/* Various subgroup operations rely on the A flags, so this helper ensures that 2785 * A flags represents currently active lanes in the subgroup. 2786 */ 2787static void 2788set_a_flags_for_subgroup(struct v3d_compile *c) 2789{ 2790 /* MSF returns 0 for disabled lanes in compute shaders so 2791 * PUSHZ will set A=1 for disabled lanes. We want the inverse 2792 * of this but we don't have any means to negate the A flags 2793 * directly, but we can do it by repeating the same operation 2794 * with NORZ (A = ~A & ~Z). 2795 */ 2796 assert(c->s->info.stage == MESA_SHADER_COMPUTE); 2797 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 2798 vir_set_uf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_UF_NORZ); 2799 2800 /* If we are under non-uniform control flow we also need to 2801 * AND the A flags with the current execute mask. 
2802 */ 2803 if (vir_in_nonuniform_control_flow(c)) { 2804 const uint32_t bidx = c->cur_block->index; 2805 vir_set_uf(c, vir_XOR_dest(c, vir_nop_reg(), 2806 c->execute, 2807 vir_uniform_ui(c, bidx)), 2808 V3D_QPU_UF_ANDZ); 2809 } 2810} 2811 2812static void 2813ntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) 2814{ 2815 switch (instr->intrinsic) { 2816 case nir_intrinsic_load_uniform: 2817 ntq_emit_load_uniform(c, instr); 2818 break; 2819 2820 case nir_intrinsic_load_ubo: 2821 if (!nir_src_is_divergent(instr->src[1])) 2822 ntq_emit_load_ubo_unifa(c, instr); 2823 else 2824 ntq_emit_tmu_general(c, instr, false); 2825 break; 2826 2827 case nir_intrinsic_ssbo_atomic_add: 2828 case nir_intrinsic_ssbo_atomic_imin: 2829 case nir_intrinsic_ssbo_atomic_umin: 2830 case nir_intrinsic_ssbo_atomic_imax: 2831 case nir_intrinsic_ssbo_atomic_umax: 2832 case nir_intrinsic_ssbo_atomic_and: 2833 case nir_intrinsic_ssbo_atomic_or: 2834 case nir_intrinsic_ssbo_atomic_xor: 2835 case nir_intrinsic_ssbo_atomic_exchange: 2836 case nir_intrinsic_ssbo_atomic_comp_swap: 2837 case nir_intrinsic_load_ssbo: 2838 case nir_intrinsic_store_ssbo: 2839 ntq_emit_tmu_general(c, instr, false); 2840 break; 2841 2842 case nir_intrinsic_shared_atomic_add: 2843 case nir_intrinsic_shared_atomic_imin: 2844 case nir_intrinsic_shared_atomic_umin: 2845 case nir_intrinsic_shared_atomic_imax: 2846 case nir_intrinsic_shared_atomic_umax: 2847 case nir_intrinsic_shared_atomic_and: 2848 case nir_intrinsic_shared_atomic_or: 2849 case nir_intrinsic_shared_atomic_xor: 2850 case nir_intrinsic_shared_atomic_exchange: 2851 case nir_intrinsic_shared_atomic_comp_swap: 2852 case nir_intrinsic_load_shared: 2853 case nir_intrinsic_store_shared: 2854 case nir_intrinsic_load_scratch: 2855 case nir_intrinsic_store_scratch: 2856 ntq_emit_tmu_general(c, instr, true); 2857 break; 2858 2859 case nir_intrinsic_image_load: 2860 case nir_intrinsic_image_store: 2861 case nir_intrinsic_image_atomic_add: 2862 case nir_intrinsic_image_atomic_imin: 2863 case nir_intrinsic_image_atomic_umin: 2864 case nir_intrinsic_image_atomic_imax: 2865 case nir_intrinsic_image_atomic_umax: 2866 case nir_intrinsic_image_atomic_and: 2867 case nir_intrinsic_image_atomic_or: 2868 case nir_intrinsic_image_atomic_xor: 2869 case nir_intrinsic_image_atomic_exchange: 2870 case nir_intrinsic_image_atomic_comp_swap: 2871 v3d40_vir_emit_image_load_store(c, instr); 2872 break; 2873 2874 case nir_intrinsic_get_ssbo_size: 2875 ntq_store_dest(c, &instr->dest, 0, 2876 vir_uniform(c, QUNIFORM_GET_SSBO_SIZE, 2877 nir_src_comp_as_uint(instr->src[0], 0))); 2878 break; 2879 2880 case nir_intrinsic_get_ubo_size: 2881 ntq_store_dest(c, &instr->dest, 0, 2882 vir_uniform(c, QUNIFORM_GET_UBO_SIZE, 2883 nir_src_comp_as_uint(instr->src[0], 0))); 2884 break; 2885 2886 case nir_intrinsic_load_user_clip_plane: 2887 for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { 2888 ntq_store_dest(c, &instr->dest, i, 2889 vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 2890 nir_intrinsic_ucp_id(instr) * 2891 4 + i)); 2892 } 2893 break; 2894 2895 case nir_intrinsic_load_viewport_x_scale: 2896 ntq_store_dest(c, &instr->dest, 0, 2897 vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); 2898 break; 2899 2900 case nir_intrinsic_load_viewport_y_scale: 2901 ntq_store_dest(c, &instr->dest, 0, 2902 vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); 2903 break; 2904 2905 case nir_intrinsic_load_viewport_z_scale: 2906 ntq_store_dest(c, &instr->dest, 0, 2907 vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); 2908 break; 2909 
2910 case nir_intrinsic_load_viewport_z_offset: 2911 ntq_store_dest(c, &instr->dest, 0, 2912 vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); 2913 break; 2914 2915 case nir_intrinsic_load_line_coord: 2916 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->line_x)); 2917 break; 2918 2919 case nir_intrinsic_load_line_width: 2920 ntq_store_dest(c, &instr->dest, 0, 2921 vir_uniform(c, QUNIFORM_LINE_WIDTH, 0)); 2922 break; 2923 2924 case nir_intrinsic_load_aa_line_width: 2925 ntq_store_dest(c, &instr->dest, 0, 2926 vir_uniform(c, QUNIFORM_AA_LINE_WIDTH, 0)); 2927 break; 2928 2929 case nir_intrinsic_load_sample_mask_in: 2930 ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); 2931 break; 2932 2933 case nir_intrinsic_load_helper_invocation: 2934 vir_set_pf(c, vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 2935 struct qreg qdest = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 2936 ntq_store_dest(c, &instr->dest, 0, qdest); 2937 break; 2938 2939 case nir_intrinsic_load_front_face: 2940 /* The register contains 0 (front) or 1 (back), and we need to 2941 * turn it into a NIR bool where true means front. 2942 */ 2943 ntq_store_dest(c, &instr->dest, 0, 2944 vir_ADD(c, 2945 vir_uniform_ui(c, -1), 2946 vir_REVF(c))); 2947 break; 2948 2949 case nir_intrinsic_load_base_instance: 2950 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->biid)); 2951 break; 2952 2953 case nir_intrinsic_load_instance_id: 2954 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); 2955 break; 2956 2957 case nir_intrinsic_load_vertex_id: 2958 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); 2959 break; 2960 2961 case nir_intrinsic_load_tlb_color_v3d: 2962 vir_emit_tlb_color_read(c, instr); 2963 break; 2964 2965 case nir_intrinsic_load_input: 2966 ntq_emit_load_input(c, instr); 2967 break; 2968 2969 case nir_intrinsic_store_tlb_sample_color_v3d: 2970 ntq_emit_per_sample_color_write(c, instr); 2971 break; 2972 2973 case nir_intrinsic_store_output: 2974 ntq_emit_store_output(c, instr); 2975 break; 2976 2977 case nir_intrinsic_image_size: 2978 ntq_emit_image_size(c, instr); 2979 break; 2980 2981 case nir_intrinsic_discard: 2982 ntq_flush_tmu(c); 2983 2984 if (vir_in_nonuniform_control_flow(c)) { 2985 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 2986 V3D_QPU_PF_PUSHZ); 2987 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 2988 vir_uniform_ui(c, 0)), 2989 V3D_QPU_COND_IFA); 2990 } else { 2991 vir_SETMSF_dest(c, vir_nop_reg(), 2992 vir_uniform_ui(c, 0)); 2993 } 2994 break; 2995 2996 case nir_intrinsic_discard_if: { 2997 ntq_flush_tmu(c); 2998 2999 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); 3000 3001 if (vir_in_nonuniform_control_flow(c)) { 3002 struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(), 3003 c->execute); 3004 if (cond == V3D_QPU_COND_IFA) { 3005 vir_set_uf(c, exec_flag, V3D_QPU_UF_ANDZ); 3006 } else { 3007 vir_set_uf(c, exec_flag, V3D_QPU_UF_NORNZ); 3008 cond = V3D_QPU_COND_IFA; 3009 } 3010 } 3011 3012 vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 3013 vir_uniform_ui(c, 0)), cond); 3014 3015 break; 3016 } 3017 3018 case nir_intrinsic_memory_barrier: 3019 case nir_intrinsic_memory_barrier_buffer: 3020 case nir_intrinsic_memory_barrier_image: 3021 case nir_intrinsic_memory_barrier_shared: 3022 case nir_intrinsic_memory_barrier_tcs_patch: 3023 case nir_intrinsic_group_memory_barrier: 3024 /* We don't do any instruction scheduling of these NIR 3025 * instructions between each other, so we just need to make 3026 * sure that the TMU operations before the barrier are flushed 3027 * before the 
ones after the barrier. 3028 */ 3029 ntq_flush_tmu(c); 3030 break; 3031 3032 case nir_intrinsic_control_barrier: 3033 /* Emit a TSY op to get all invocations in the workgroup 3034 * (actually supergroup) to block until the last invocation 3035 * reaches the TSY op. 3036 */ 3037 ntq_flush_tmu(c); 3038 3039 if (c->devinfo->ver >= 42) { 3040 vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 3041 V3D_QPU_WADDR_SYNCB)); 3042 } else { 3043 struct qinst *sync = 3044 vir_BARRIERID_dest(c, 3045 vir_reg(QFILE_MAGIC, 3046 V3D_QPU_WADDR_SYNCU)); 3047 sync->uniform = 3048 vir_get_uniform_index(c, QUNIFORM_CONSTANT, 3049 0xffffff00 | 3050 V3D_TSY_WAIT_INC_CHECK); 3051 3052 } 3053 3054 /* The blocking of a TSY op only happens at the next thread 3055 * switch. No texturing may be outstanding at the time of a 3056 * TSY blocking operation. 3057 */ 3058 vir_emit_thrsw(c); 3059 break; 3060 3061 case nir_intrinsic_load_num_workgroups: 3062 for (int i = 0; i < 3; i++) { 3063 ntq_store_dest(c, &instr->dest, i, 3064 vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, 3065 i)); 3066 } 3067 break; 3068 3069 case nir_intrinsic_load_workgroup_id: { 3070 struct qreg x = vir_AND(c, c->cs_payload[0], 3071 vir_uniform_ui(c, 0xffff)); 3072 3073 struct qreg y = vir_SHR(c, c->cs_payload[0], 3074 vir_uniform_ui(c, 16)); 3075 3076 struct qreg z = vir_AND(c, c->cs_payload[1], 3077 vir_uniform_ui(c, 0xffff)); 3078 3079 /* We only support dispatch base in Vulkan */ 3080 if (c->key->environment == V3D_ENVIRONMENT_VULKAN) { 3081 x = vir_ADD(c, x, 3082 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 0)); 3083 y = vir_ADD(c, y, 3084 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 1)); 3085 z = vir_ADD(c, z, 3086 vir_uniform(c, QUNIFORM_WORK_GROUP_BASE, 2)); 3087 } 3088 3089 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, x)); 3090 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, y)); 3091 ntq_store_dest(c, &instr->dest, 2, vir_MOV(c, z)); 3092 break; 3093 } 3094 3095 case nir_intrinsic_load_local_invocation_index: 3096 ntq_store_dest(c, &instr->dest, 0, 3097 emit_load_local_invocation_index(c)); 3098 break; 3099 3100 case nir_intrinsic_load_subgroup_id: { 3101 /* This is basically the batch index, which is the Local 3102 * Invocation Index divided by the SIMD width). 3103 */ 3104 STATIC_ASSERT(util_is_power_of_two_nonzero(V3D_CHANNELS)); 3105 const uint32_t divide_shift = ffs(V3D_CHANNELS) - 1; 3106 struct qreg lii = emit_load_local_invocation_index(c); 3107 ntq_store_dest(c, &instr->dest, 0, 3108 vir_SHR(c, lii, 3109 vir_uniform_ui(c, divide_shift))); 3110 break; 3111 } 3112 3113 case nir_intrinsic_load_per_vertex_input: { 3114 /* The vertex shader writes all its used outputs into 3115 * consecutive VPM offsets, so if any output component is 3116 * unused, its VPM offset is used by the next used 3117 * component. This means that we can't assume that each 3118 * location will use 4 consecutive scalar offsets in the VPM 3119 * and we need to compute the VPM offset for each input by 3120 * going through the inputs and finding the one that matches 3121 * our location and component. 
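                 *
                 * For example (illustrative): if the VS only writes
                 * components y and w of a varying, they occupy two
                 * consecutive VPM slots, so reading component w here
                 * resolves to the second of those slots rather than to a
                 * fixed offset of 3.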
3122 * 3123 * col: vertex index, row = varying index 3124 */ 3125 assert(nir_src_is_const(instr->src[1])); 3126 uint32_t location = 3127 nir_intrinsic_io_semantics(instr).location + 3128 nir_src_as_uint(instr->src[1]); 3129 uint32_t component = nir_intrinsic_component(instr); 3130 3131 int32_t row_idx = -1; 3132 for (int i = 0; i < c->num_inputs; i++) { 3133 struct v3d_varying_slot slot = c->input_slots[i]; 3134 if (v3d_slot_get_slot(slot) == location && 3135 v3d_slot_get_component(slot) == component) { 3136 row_idx = i; 3137 break; 3138 } 3139 } 3140 3141 assert(row_idx != -1); 3142 3143 struct qreg col = ntq_get_src(c, instr->src[0], 0); 3144 for (int i = 0; i < instr->num_components; i++) { 3145 struct qreg row = vir_uniform_ui(c, row_idx++); 3146 ntq_store_dest(c, &instr->dest, i, 3147 vir_LDVPMG_IN(c, row, col)); 3148 } 3149 break; 3150 } 3151 3152 case nir_intrinsic_emit_vertex: 3153 case nir_intrinsic_end_primitive: 3154 unreachable("Should have been lowered in v3d_nir_lower_io"); 3155 break; 3156 3157 case nir_intrinsic_load_primitive_id: { 3158 /* gl_PrimitiveIdIn is written by the GBG in the first word of 3159 * VPM output header. According to docs, we should read this 3160 * using ldvpm(v,d)_in (See Table 71). 3161 */ 3162 assert(c->s->info.stage == MESA_SHADER_GEOMETRY); 3163 ntq_store_dest(c, &instr->dest, 0, 3164 vir_LDVPMV_IN(c, vir_uniform_ui(c, 0))); 3165 break; 3166 } 3167 3168 case nir_intrinsic_load_invocation_id: 3169 ntq_store_dest(c, &instr->dest, 0, vir_IID(c)); 3170 break; 3171 3172 case nir_intrinsic_load_fb_layers_v3d: 3173 ntq_store_dest(c, &instr->dest, 0, 3174 vir_uniform(c, QUNIFORM_FB_LAYERS, 0)); 3175 break; 3176 3177 case nir_intrinsic_load_sample_id: 3178 ntq_store_dest(c, &instr->dest, 0, vir_SAMPID(c)); 3179 break; 3180 3181 case nir_intrinsic_load_sample_pos: 3182 ntq_store_dest(c, &instr->dest, 0, 3183 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c)))); 3184 ntq_store_dest(c, &instr->dest, 1, 3185 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c)))); 3186 break; 3187 3188 case nir_intrinsic_load_barycentric_at_offset: 3189 ntq_store_dest(c, &instr->dest, 0, 3190 vir_MOV(c, ntq_get_src(c, instr->src[0], 0))); 3191 ntq_store_dest(c, &instr->dest, 1, 3192 vir_MOV(c, ntq_get_src(c, instr->src[0], 1))); 3193 break; 3194 3195 case nir_intrinsic_load_barycentric_pixel: 3196 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3197 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3198 break; 3199 3200 case nir_intrinsic_load_barycentric_at_sample: { 3201 if (!c->fs_key->msaa) { 3202 ntq_store_dest(c, &instr->dest, 0, vir_uniform_f(c, 0.0f)); 3203 ntq_store_dest(c, &instr->dest, 1, vir_uniform_f(c, 0.0f)); 3204 return; 3205 } 3206 3207 struct qreg offset_x, offset_y; 3208 struct qreg sample_idx = ntq_get_src(c, instr->src[0], 0); 3209 ntq_get_sample_offset(c, sample_idx, &offset_x, &offset_y); 3210 3211 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3212 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3213 break; 3214 } 3215 3216 case nir_intrinsic_load_barycentric_sample: { 3217 struct qreg offset_x = 3218 vir_FSUB(c, vir_FXCD(c), vir_ITOF(c, vir_XCD(c))); 3219 struct qreg offset_y = 3220 vir_FSUB(c, vir_FYCD(c), vir_ITOF(c, vir_YCD(c))); 3221 3222 ntq_store_dest(c, &instr->dest, 0, 3223 vir_FSUB(c, offset_x, vir_uniform_f(c, 0.5f))); 3224 ntq_store_dest(c, &instr->dest, 1, 3225 vir_FSUB(c, offset_y, vir_uniform_f(c, 0.5f))); 3226 break; 3227 } 3228 3229 case nir_intrinsic_load_barycentric_centroid: { 3230 struct qreg 
offset_x, offset_y; 3231 ntq_get_barycentric_centroid(c, &offset_x, &offset_y); 3232 ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, offset_x)); 3233 ntq_store_dest(c, &instr->dest, 1, vir_MOV(c, offset_y)); 3234 break; 3235 } 3236 3237 case nir_intrinsic_load_interpolated_input: { 3238 assert(nir_src_is_const(instr->src[1])); 3239 const uint32_t offset = nir_src_as_uint(instr->src[1]); 3240 3241 for (int i = 0; i < instr->num_components; i++) { 3242 const uint32_t input_idx = 3243 (nir_intrinsic_base(instr) + offset) * 4 + 3244 nir_intrinsic_component(instr) + i; 3245 3246 /* If we are not in MSAA or if we are not interpolating 3247 * a user varying, just return the pre-computed 3248 * interpolated input. 3249 */ 3250 if (!c->fs_key->msaa || 3251 c->interp[input_idx].vp.file == QFILE_NULL) { 3252 ntq_store_dest(c, &instr->dest, i, 3253 vir_MOV(c, c->inputs[input_idx])); 3254 continue; 3255 } 3256 3257 /* Otherwise compute interpolation at the specified 3258 * offset. 3259 */ 3260 struct qreg p = c->interp[input_idx].vp; 3261 struct qreg C = c->interp[input_idx].C; 3262 unsigned interp_mode = c->interp[input_idx].mode; 3263 3264 struct qreg offset_x = ntq_get_src(c, instr->src[0], 0); 3265 struct qreg offset_y = ntq_get_src(c, instr->src[0], 1); 3266 3267 struct qreg result = 3268 ntq_emit_load_interpolated_input(c, p, C, 3269 offset_x, offset_y, 3270 interp_mode); 3271 ntq_store_dest(c, &instr->dest, i, result); 3272 } 3273 break; 3274 } 3275 3276 case nir_intrinsic_load_subgroup_size: 3277 ntq_store_dest(c, &instr->dest, 0, 3278 vir_uniform_ui(c, V3D_CHANNELS)); 3279 break; 3280 3281 case nir_intrinsic_load_subgroup_invocation: 3282 ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); 3283 break; 3284 3285 case nir_intrinsic_elect: { 3286 set_a_flags_for_subgroup(c); 3287 struct qreg first = vir_FLAFIRST(c); 3288 3289 /* Produce a boolean result from Flafirst */ 3290 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3291 first, vir_uniform_ui(c, 1)), 3292 V3D_QPU_PF_PUSHZ); 3293 struct qreg result = ntq_emit_cond_to_bool(c, V3D_QPU_COND_IFA); 3294 ntq_store_dest(c, &instr->dest, 0, result); 3295 break; 3296 } 3297 3298 case nir_intrinsic_load_num_subgroups: 3299 unreachable("Should have been lowered"); 3300 break; 3301 3302 case nir_intrinsic_load_view_index: 3303 ntq_store_dest(c, &instr->dest, 0, 3304 vir_uniform(c, QUNIFORM_VIEW_INDEX, 0)); 3305 break; 3306 3307 default: 3308 fprintf(stderr, "Unknown intrinsic: "); 3309 nir_print_instr(&instr->instr, stderr); 3310 fprintf(stderr, "\n"); 3311 break; 3312 } 3313} 3314 3315/* Clears (activates) the execute flags for any channels whose jump target 3316 * matches this block. 3317 * 3318 * XXX perf: Could we be using flpush/flpop somehow for our execution channel 3319 * enabling? 
3320 * 3321 */ 3322static void 3323ntq_activate_execute_for_block(struct v3d_compile *c) 3324{ 3325 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3326 c->execute, vir_uniform_ui(c, c->cur_block->index)), 3327 V3D_QPU_PF_PUSHZ); 3328 3329 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 3330} 3331 3332static void 3333ntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) 3334{ 3335 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3336 bool empty_else_block = 3337 (nir_else_block == nir_if_last_else_block(if_stmt) && 3338 exec_list_is_empty(&nir_else_block->instr_list)); 3339 3340 struct qblock *then_block = vir_new_block(c); 3341 struct qblock *after_block = vir_new_block(c); 3342 struct qblock *else_block; 3343 if (empty_else_block) 3344 else_block = after_block; 3345 else 3346 else_block = vir_new_block(c); 3347 3348 /* Check if this if statement is really just a conditional jump with 3349 * the form: 3350 * 3351 * if (cond) { 3352 * break/continue; 3353 * } else { 3354 * } 3355 * 3356 * In which case we can skip the jump to ELSE we emit before the THEN 3357 * block and instead just emit the break/continue directly. 3358 */ 3359 nir_jump_instr *conditional_jump = NULL; 3360 if (empty_else_block) { 3361 nir_block *nir_then_block = nir_if_first_then_block(if_stmt); 3362 struct nir_instr *inst = nir_block_first_instr(nir_then_block); 3363 if (inst && inst->type == nir_instr_type_jump) 3364 conditional_jump = nir_instr_as_jump(inst); 3365 } 3366 3367 /* Set up the flags for the IF condition (taking the THEN branch). */ 3368 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3369 3370 if (!conditional_jump) { 3371 /* Jump to ELSE. */ 3372 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3373 V3D_QPU_BRANCH_COND_ANYNA : 3374 V3D_QPU_BRANCH_COND_ANYA); 3375 /* Pixels that were not dispatched or have been discarded 3376 * should not contribute to the ANYA/ANYNA condition. 3377 */ 3378 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3379 3380 vir_link_blocks(c->cur_block, else_block); 3381 vir_link_blocks(c->cur_block, then_block); 3382 3383 /* Process the THEN block. */ 3384 vir_set_emit_block(c, then_block); 3385 ntq_emit_cf_list(c, &if_stmt->then_list); 3386 3387 if (!empty_else_block) { 3388 /* At the end of the THEN block, jump to ENDIF, unless 3389 * the block ended in a break or continue. 3390 */ 3391 if (!c->cur_block->branch_emitted) { 3392 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 3393 vir_link_blocks(c->cur_block, after_block); 3394 } 3395 3396 /* Emit the else block. */ 3397 vir_set_emit_block(c, else_block); 3398 ntq_emit_cf_list(c, &if_stmt->else_list); 3399 } 3400 } else { 3401 /* Emit the conditional jump directly. 3402 * 3403 * Use ALL with breaks and ANY with continues to ensure that 3404 * we always break and never continue when all lanes have been 3405 * disabled (for example because of discards) to prevent 3406 * infinite loops. 3407 */ 3408 assert(conditional_jump && 3409 (conditional_jump->type == nir_jump_continue || 3410 conditional_jump->type == nir_jump_break)); 3411 3412 struct qinst *branch = vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 3413 (conditional_jump->type == nir_jump_break ? 3414 V3D_QPU_BRANCH_COND_ALLA : 3415 V3D_QPU_BRANCH_COND_ANYA) : 3416 (conditional_jump->type == nir_jump_break ? 3417 V3D_QPU_BRANCH_COND_ALLNA : 3418 V3D_QPU_BRANCH_COND_ANYNA)); 3419 branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 3420 3421 vir_link_blocks(c->cur_block, 3422 conditional_jump->type == nir_jump_break ? 
3423 c->loop_break_block : 3424 c->loop_cont_block); 3425 } 3426 3427 vir_link_blocks(c->cur_block, after_block); 3428 3429 vir_set_emit_block(c, after_block); 3430} 3431 3432static void 3433ntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) 3434{ 3435 nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 3436 bool empty_else_block = 3437 (nir_else_block == nir_if_last_else_block(if_stmt) && 3438 exec_list_is_empty(&nir_else_block->instr_list)); 3439 3440 struct qblock *then_block = vir_new_block(c); 3441 struct qblock *after_block = vir_new_block(c); 3442 struct qblock *else_block; 3443 if (empty_else_block) 3444 else_block = after_block; 3445 else 3446 else_block = vir_new_block(c); 3447 3448 bool was_uniform_control_flow = false; 3449 if (!vir_in_nonuniform_control_flow(c)) { 3450 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 3451 was_uniform_control_flow = true; 3452 } 3453 3454 /* Set up the flags for the IF condition (taking the THEN branch). */ 3455 enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 3456 3457 /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and 3458 * was previously active (execute Z) for updating the exec flags. 3459 */ 3460 if (was_uniform_control_flow) { 3461 cond = v3d_qpu_cond_invert(cond); 3462 } else { 3463 struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); 3464 if (cond == V3D_QPU_COND_IFA) { 3465 vir_set_uf(c, inst, V3D_QPU_UF_NORNZ); 3466 } else { 3467 vir_set_uf(c, inst, V3D_QPU_UF_ANDZ); 3468 cond = V3D_QPU_COND_IFA; 3469 } 3470 } 3471 3472 vir_MOV_cond(c, cond, 3473 c->execute, 3474 vir_uniform_ui(c, else_block->index)); 3475 3476 /* Jump to ELSE if nothing is active for THEN, otherwise fall 3477 * through. 3478 */ 3479 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 3480 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 3481 vir_link_blocks(c->cur_block, else_block); 3482 vir_link_blocks(c->cur_block, then_block); 3483 3484 /* Process the THEN block. */ 3485 vir_set_emit_block(c, then_block); 3486 ntq_emit_cf_list(c, &if_stmt->then_list); 3487 3488 if (!empty_else_block) { 3489 /* Handle the end of the THEN block. First, all currently 3490 * active channels update their execute flags to point to 3491 * ENDIF 3492 */ 3493 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 3494 V3D_QPU_PF_PUSHZ); 3495 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 3496 vir_uniform_ui(c, after_block->index)); 3497 3498 /* If everything points at ENDIF, then jump there immediately. 
*/ 3499 vir_set_pf(c, vir_XOR_dest(c, vir_nop_reg(), 3500 c->execute, 3501 vir_uniform_ui(c, after_block->index)), 3502 V3D_QPU_PF_PUSHZ); 3503 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 3504 vir_link_blocks(c->cur_block, after_block); 3505 vir_link_blocks(c->cur_block, else_block); 3506 3507 vir_set_emit_block(c, else_block); 3508 ntq_activate_execute_for_block(c); 3509 ntq_emit_cf_list(c, &if_stmt->else_list); 3510 } 3511 3512 vir_link_blocks(c->cur_block, after_block); 3513 3514 vir_set_emit_block(c, after_block); 3515 if (was_uniform_control_flow) 3516 c->execute = c->undef; 3517 else 3518 ntq_activate_execute_for_block(c); 3519} 3520 3521static void 3522ntq_emit_if(struct v3d_compile *c, nir_if *nif) 3523{ 3524 bool was_in_control_flow = c->in_control_flow; 3525 c->in_control_flow = true; 3526 if (!vir_in_nonuniform_control_flow(c) && 3527 !nir_src_is_divergent(nif->condition)) { 3528 ntq_emit_uniform_if(c, nif); 3529 } else { 3530 ntq_emit_nonuniform_if(c, nif); 3531 } 3532 c->in_control_flow = was_in_control_flow; 3533} 3534 3535static void 3536ntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 3537{ 3538 switch (jump->type) { 3539 case nir_jump_break: 3540 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 3541 V3D_QPU_PF_PUSHZ); 3542 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 3543 vir_uniform_ui(c, c->loop_break_block->index)); 3544 break; 3545 3546 case nir_jump_continue: 3547 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), 3548 V3D_QPU_PF_PUSHZ); 3549 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 3550 vir_uniform_ui(c, c->loop_cont_block->index)); 3551 break; 3552 3553 case nir_jump_return: 3554 unreachable("All returns should be lowered\n"); 3555 break; 3556 3557 case nir_jump_halt: 3558 case nir_jump_goto: 3559 case nir_jump_goto_if: 3560 unreachable("not supported\n"); 3561 break; 3562 } 3563} 3564 3565static void 3566ntq_emit_uniform_jump(struct v3d_compile *c, nir_jump_instr *jump) 3567{ 3568 switch (jump->type) { 3569 case nir_jump_break: 3570 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 3571 vir_link_blocks(c->cur_block, c->loop_break_block); 3572 c->cur_block->branch_emitted = true; 3573 break; 3574 case nir_jump_continue: 3575 vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 3576 vir_link_blocks(c->cur_block, c->loop_cont_block); 3577 c->cur_block->branch_emitted = true; 3578 break; 3579 3580 case nir_jump_return: 3581 unreachable("All returns should be lowered\n"); 3582 break; 3583 3584 case nir_jump_halt: 3585 case nir_jump_goto: 3586 case nir_jump_goto_if: 3587 unreachable("not supported\n"); 3588 break; 3589 } 3590} 3591 3592static void 3593ntq_emit_instr(struct v3d_compile *c, nir_instr *instr) 3594{ 3595 switch (instr->type) { 3596 case nir_instr_type_alu: 3597 ntq_emit_alu(c, nir_instr_as_alu(instr)); 3598 break; 3599 3600 case nir_instr_type_intrinsic: 3601 ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 3602 break; 3603 3604 case nir_instr_type_load_const: 3605 ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 3606 break; 3607 3608 case nir_instr_type_ssa_undef: 3609 unreachable("Should've been lowered by nir_lower_undef_to_zero"); 3610 break; 3611 3612 case nir_instr_type_tex: 3613 ntq_emit_tex(c, nir_instr_as_tex(instr)); 3614 break; 3615 3616 case nir_instr_type_jump: 3617 /* Always flush TMU before jumping to another block, for the 3618 * same reasons as in ntq_emit_block. 
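                 * (A pending TMU sequence must be completed in the block
                 * that started it; see the comment at the end of
                 * ntq_emit_block().)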
3619 */ 3620 ntq_flush_tmu(c); 3621 if (vir_in_nonuniform_control_flow(c)) 3622 ntq_emit_jump(c, nir_instr_as_jump(instr)); 3623 else 3624 ntq_emit_uniform_jump(c, nir_instr_as_jump(instr)); 3625 break; 3626 3627 default: 3628 fprintf(stderr, "Unknown NIR instr type: "); 3629 nir_print_instr(instr, stderr); 3630 fprintf(stderr, "\n"); 3631 abort(); 3632 } 3633} 3634 3635static void 3636ntq_emit_block(struct v3d_compile *c, nir_block *block) 3637{ 3638 nir_foreach_instr(instr, block) { 3639 ntq_emit_instr(c, instr); 3640 } 3641 3642 /* Always process pending TMU operations in the same block they were 3643 * emitted: we can't emit TMU operations in a block and then emit a 3644 * thread switch and LDTMU/TMUWT for them in another block, possibly 3645 * under control flow. 3646 */ 3647 ntq_flush_tmu(c); 3648} 3649 3650static void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 3651 3652static void 3653ntq_emit_nonuniform_loop(struct v3d_compile *c, nir_loop *loop) 3654{ 3655 bool was_uniform_control_flow = false; 3656 if (!vir_in_nonuniform_control_flow(c)) { 3657 c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 3658 was_uniform_control_flow = true; 3659 } 3660 3661 c->loop_cont_block = vir_new_block(c); 3662 c->loop_break_block = vir_new_block(c); 3663 3664 vir_link_blocks(c->cur_block, c->loop_cont_block); 3665 vir_set_emit_block(c, c->loop_cont_block); 3666 ntq_activate_execute_for_block(c); 3667 3668 ntq_emit_cf_list(c, &loop->body); 3669 3670 /* Re-enable any previous continues now, so our ANYA check below 3671 * works. 3672 * 3673 * XXX: Use the .ORZ flags update, instead. 3674 */ 3675 vir_set_pf(c, vir_XOR_dest(c, 3676 vir_nop_reg(), 3677 c->execute, 3678 vir_uniform_ui(c, c->loop_cont_block->index)), 3679 V3D_QPU_PF_PUSHZ); 3680 vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 3681 3682 vir_set_pf(c, vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 3683 3684 struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 3685 /* Pixels that were not dispatched or have been discarded should not 3686 * contribute to looping again. 
         */
        branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P;
        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_link_blocks(c->cur_block, c->loop_break_block);

        vir_set_emit_block(c, c->loop_break_block);
        if (was_uniform_control_flow)
                c->execute = c->undef;
        else
                ntq_activate_execute_for_block(c);
}

static void
ntq_emit_uniform_loop(struct v3d_compile *c, nir_loop *loop)
{
        c->loop_cont_block = vir_new_block(c);
        c->loop_break_block = vir_new_block(c);

        vir_link_blocks(c->cur_block, c->loop_cont_block);
        vir_set_emit_block(c, c->loop_cont_block);

        ntq_emit_cf_list(c, &loop->body);

        if (!c->cur_block->branch_emitted) {
                vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS);
                vir_link_blocks(c->cur_block, c->loop_cont_block);
        }

        vir_set_emit_block(c, c->loop_break_block);
}

static void
ntq_emit_loop(struct v3d_compile *c, nir_loop *loop)
{
        bool was_in_control_flow = c->in_control_flow;
        c->in_control_flow = true;

        struct qblock *save_loop_cont_block = c->loop_cont_block;
        struct qblock *save_loop_break_block = c->loop_break_block;

        if (vir_in_nonuniform_control_flow(c) || loop->divergent) {
                ntq_emit_nonuniform_loop(c, loop);
        } else {
                ntq_emit_uniform_loop(c, loop);
        }

        c->loop_break_block = save_loop_break_block;
        c->loop_cont_block = save_loop_cont_block;

        c->loops++;

        c->in_control_flow = was_in_control_flow;
}

static void
ntq_emit_function(struct v3d_compile *c, nir_function_impl *func)
{
        fprintf(stderr, "FUNCTIONS not handled.\n");
        abort();
}

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_cf_node, node, node, list) {
                switch (node->type) {
                case nir_cf_node_block:
                        ntq_emit_block(c, nir_cf_node_as_block(node));
                        break;

                case nir_cf_node_if:
                        ntq_emit_if(c, nir_cf_node_as_if(node));
                        break;

                case nir_cf_node_loop:
                        ntq_emit_loop(c, nir_cf_node_as_loop(node));
                        break;

                case nir_cf_node_function:
                        ntq_emit_function(c, nir_cf_node_as_function(node));
                        break;

                default:
                        fprintf(stderr, "Unknown NIR node type\n");
                        abort();
                }
        }
}

static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

static void
nir_to_vir(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* V3D 4.x can disable implicit varyings if they are not used */
                c->fs_uses_primitive_id =
                        nir_find_variable_with_location(c->s, nir_var_shader_in,
                                                        VARYING_SLOT_PRIMITIVE_ID);
                if (c->fs_uses_primitive_id && !c->fs_key->has_gs) {
                        c->primitive_id =
                                emit_fragment_varying(c, NULL, -1, 0, 0);
                }

                if (c->fs_key->is_points &&
                    (c->devinfo->ver < 40 || program_reads_point_coord(c))) {
                        c->point_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->point_y = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                } else if (c->fs_key->is_lines &&
                           (c->devinfo->ver < 40 ||
                            BITSET_TEST(c->s->info.system_values_read,
                                        SYSTEM_VALUE_LINE_COORD))) {
                        c->line_x = emit_fragment_varying(c, NULL, -1, 0, 0);
                        c->uses_implicit_point_line_varyings = true;
                }

                c->force_per_sample_msaa =
                        c->s->info.fs.uses_sample_qualifier ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_ID) ||
                        BITSET_TEST(c->s->info.system_values_read,
                                    SYSTEM_VALUE_SAMPLE_POS);
                break;
        case MESA_SHADER_COMPUTE:
                /* Set up the TSO for barriers, assuming we do some. */
                if (c->devinfo->ver < 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNC));
                }

                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* Set up the division between gl_LocalInvocationIndex and
                 * wg_in_mem in the payload reg.
                 */
                int wg_size = (c->s->info.workgroup_size[0] *
                               c->s->info.workgroup_size[1] *
                               c->s->info.workgroup_size[2]);
                c->local_invocation_index_bits =
                        ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
                assert(c->local_invocation_index_bits <= 8);

                if (c->s->info.shared_size) {
                        struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
                                                        vir_uniform_ui(c, 16));
                        if (c->s->info.workgroup_size[0] != 1 ||
                            c->s->info.workgroup_size[1] != 1 ||
                            c->s->info.workgroup_size[2] != 1) {
                                int wg_bits = (16 -
                                               c->local_invocation_index_bits);
                                int wg_mask = (1 << wg_bits) - 1;
                                wg_in_mem = vir_AND(c, wg_in_mem,
                                                    vir_uniform_ui(c, wg_mask));
                        }
                        struct qreg shared_per_wg =
                                vir_uniform_ui(c, c->s->info.shared_size);

                        c->cs_shared_offset =
                                vir_ADD(c,
                                        vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
                                        vir_UMUL(c, wg_in_mem, shared_per_wg));
                }
                break;
        default:
                break;
        }

        if (c->s->scratch_size) {
                v3d_setup_spill_base(c);
                c->spill_size += V3D_CHANNELS * c->s->scratch_size;
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                ntq_setup_vs_inputs(c);
                break;
        case MESA_SHADER_GEOMETRY:
                ntq_setup_gs_inputs(c);
                break;
        case MESA_SHADER_FRAGMENT:
                ntq_setup_fs_inputs(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        ntq_setup_outputs(c);

        /* Find the main function and emit the body. */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}

/**
 * When demoting a shader down to single-threaded, removes the THRSW
 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
 * program end).
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

/**
 * This makes sure we have a top-level last thread switch which signals the
 * start of the last thread section, which may include adding a new thrsw
 * instruction if needed.
 *
 * We don't allow spilling in the last thread section, so if we need to do
 * any spills that inject additional thread switches later on, we ensure this
 * thread switch will still be the last thread switch in the program, which
 * makes last thread switch signalling a lot easier when we have spilling. If
 * in the end we don't need to spill to compile the program and we injected a
 * new thread switch instruction here only for that, we will eventually
 * restore the previous last thread switch and remove the one we added here.
 */
static void
vir_emit_last_thrsw(struct v3d_compile *c,
                    struct qinst **restore_last_thrsw,
                    bool *restore_scoreboard_lock)
{
        *restore_last_thrsw = c->last_thrsw;

        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                *restore_last_thrsw = NULL;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we have not inserted a last thread switch yet, do it now to
         * ensure any potential spilling we do happens before this. If we
         * don't spill in the end, we will restore the previous one.
         */
        if (*restore_last_thrsw == c->last_thrsw) {
                if (*restore_last_thrsw)
                        (*restore_last_thrsw)->is_last_thrsw = false;
                *restore_scoreboard_lock = c->lock_scoreboard_on_first_thrsw;
                vir_emit_thrsw(c);
        } else {
                *restore_last_thrsw = c->last_thrsw;
        }

        assert(c->last_thrsw);
        c->last_thrsw->is_last_thrsw = true;
}

static void
vir_restore_last_thrsw(struct v3d_compile *c,
                       struct qinst *thrsw,
                       bool scoreboard_lock)
{
        assert(c->last_thrsw);
        vir_remove_instruction(c, c->last_thrsw);
        c->last_thrsw = thrsw;
        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
        c->lock_scoreboard_on_first_thrsw = scoreboard_lock;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used.  It should be harmless to set even if we only use
 * center W for varyings.
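 *
 * (Center W is the fragment shader payload value copied from QFILE_REG 0 in
 * nir_to_vir(), which is why the walk below looks for sources that read
 * QFILE_REG index 0.)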
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }
}

void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        bool restore_scoreboard_lock = false;
        struct qinst *restore_last_thrsw;

        /* Emit the last THRSW before STVPM and TLB writes. */
        vir_emit_last_thrsw(c,
                            &restore_last_thrsw,
                            &restore_scoreboard_lock);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_GEOMETRY:
                emit_geom_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce
         * the number of thread switches, as well as try (mostly
         * successfully) to reduce maximum register pressure to allow more
         * threads.  We should do something of that sort for V3D -- either
         * instruction scheduling here, or delay the THRSW and LDTMUs from
         * our texture instructions until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads &&
                    (V3D_DEBUG & V3D_DEBUG_RA)) {
                        fprintf(stderr,
                                "Failed to register allocate using %s\n",
                                c->fallback_scheduler ?
"the fallback scheduler:" : 4106 "the normal scheduler: \n"); 4107 4108 vir_dump(c); 4109 4110 char *shaderdb; 4111 int ret = v3d_shaderdb_dump(c, &shaderdb); 4112 if (ret > 0) { 4113 fprintf(stderr, "%s\n", shaderdb); 4114 free(shaderdb); 4115 } 4116 } 4117 4118 if (c->threads <= MAX2(c->min_threads_for_reg_alloc, min_threads)) { 4119 if (V3D_DEBUG & V3D_DEBUG_PERF) { 4120 fprintf(stderr, 4121 "Failed to register allocate %s at " 4122 "%d threads.\n", vir_get_stage_name(c), 4123 c->threads); 4124 } 4125 c->compilation_result = 4126 V3D_COMPILATION_FAILED_REGISTER_ALLOCATION; 4127 return; 4128 } 4129 4130 c->spill_count = 0; 4131 c->threads /= 2; 4132 4133 if (c->threads == 1) 4134 vir_remove_thrsw(c); 4135 } 4136 4137 /* If we didn't spill, then remove the last thread switch we injected 4138 * artificially (if any) and restore the previous one. 4139 */ 4140 if (!c->spills && c->last_thrsw != restore_last_thrsw) 4141 vir_restore_last_thrsw(c, restore_last_thrsw, restore_scoreboard_lock); 4142 4143 if (c->spills && 4144 (V3D_DEBUG & (V3D_DEBUG_VIR | 4145 v3d_debug_flag_for_shader_stage(c->s->info.stage)))) { 4146 fprintf(stderr, "%s prog %d/%d spilled VIR:\n", 4147 vir_get_stage_name(c), 4148 c->program_id, c->variant_id); 4149 vir_dump(c); 4150 fprintf(stderr, "\n"); 4151 } 4152 4153 v3d_vir_to_qpu(c, temp_registers); 4154} 4155