1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2016 Broadcom 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21b8e80941Smrg * IN THE SOFTWARE. 
22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg#include <inttypes.h> 25b8e80941Smrg#include "util/u_format.h" 26b8e80941Smrg#include "util/u_math.h" 27b8e80941Smrg#include "util/u_memory.h" 28b8e80941Smrg#include "util/ralloc.h" 29b8e80941Smrg#include "util/hash_table.h" 30b8e80941Smrg#include "compiler/nir/nir.h" 31b8e80941Smrg#include "compiler/nir/nir_builder.h" 32b8e80941Smrg#include "common/v3d_device_info.h" 33b8e80941Smrg#include "v3d_compiler.h" 34b8e80941Smrg 35b8e80941Smrg#define GENERAL_TMU_LOOKUP_PER_QUAD (0 << 7) 36b8e80941Smrg#define GENERAL_TMU_LOOKUP_PER_PIXEL (1 << 7) 37b8e80941Smrg#define GENERAL_TMU_READ_OP_PREFETCH (0 << 3) 38b8e80941Smrg#define GENERAL_TMU_READ_OP_CACHE_CLEAR (1 << 3) 39b8e80941Smrg#define GENERAL_TMU_READ_OP_CACHE_FLUSH (3 << 3) 40b8e80941Smrg#define GENERAL_TMU_READ_OP_CACHE_CLEAN (3 << 3) 41b8e80941Smrg#define GENERAL_TMU_READ_OP_CACHE_L1T_CLEAR (4 << 3) 42b8e80941Smrg#define GENERAL_TMU_READ_OP_CACHE_L1T_FLUSH_AGGREGATION (5 << 3) 43b8e80941Smrg#define GENERAL_TMU_READ_OP_ATOMIC_INC (8 << 3) 44b8e80941Smrg#define GENERAL_TMU_READ_OP_ATOMIC_DEC (9 << 3) 45b8e80941Smrg#define GENERAL_TMU_READ_OP_ATOMIC_NOT (10 << 3) 46b8e80941Smrg#define GENERAL_TMU_READ_OP_READ (15 << 3) 47b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_8BIT_I (0 << 0) 48b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_16BIT_I (1 << 0) 49b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_VEC2 (2 << 0) 50b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_VEC3 (3 << 0) 51b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_VEC4 (4 << 0) 52b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_8BIT_UI (5 << 0) 53b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_16BIT_UI (6 << 0) 54b8e80941Smrg#define GENERAL_TMU_LOOKUP_TYPE_32BIT_UI (7 << 0) 55b8e80941Smrg 56b8e80941Smrg#define GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP (0 << 3) 57b8e80941Smrg#define GENERAL_TMU_WRITE_OP_ATOMIC_SUB_WRAP (1 << 3) 58b8e80941Smrg#define GENERAL_TMU_WRITE_OP_ATOMIC_XCHG (2 << 3) 59b8e80941Smrg#define GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG 
                                                    (3 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_UMIN            (4 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_UMAX            (5 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_SMIN            (6 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_SMAX            (7 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_AND             (8 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_OR              (9 << 3)
#define GENERAL_TMU_WRITE_OP_ATOMIC_XOR             (10 << 3)
#define GENERAL_TMU_WRITE_OP_WRITE                  (15 << 3)

/* TSY (synchronization) unit operation codes.  Presumably used for barrier
 * and semaphore lowering in compute shaders -- the users are outside this
 * chunk, so confirm against the rest of the file.
 */
#define V3D_TSY_SET_QUORUM          0
#define V3D_TSY_INC_WAITERS         1
#define V3D_TSY_DEC_WAITERS         2
#define V3D_TSY_INC_QUORUM          3
#define V3D_TSY_DEC_QUORUM          4
#define V3D_TSY_FREE_ALL            5
#define V3D_TSY_RELEASE             6
#define V3D_TSY_ACQUIRE             7
#define V3D_TSY_WAIT                8
#define V3D_TSY_WAIT_INC            9
#define V3D_TSY_WAIT_CHECK          10
#define V3D_TSY_WAIT_INC_CHECK      11
#define V3D_TSY_WAIT_CV             12
#define V3D_TSY_INC_SEMAPHORE       13
#define V3D_TSY_DEC_SEMAPHORE       14
#define V3D_TSY_SET_QUORUM_FREE_ALL 15

static void
ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list);

/* Grows *regs (current capacity *size) so it can hold at least decl_size
 * entries, doubling the capacity to amortize reallocation.  Newly exposed
 * slots are initialized to c->undef.  Aborts on allocation failure.
 */
static void
resize_qreg_array(struct v3d_compile *c,
                  struct qreg **regs,
                  uint32_t *size,
                  uint32_t decl_size)
{
        if (*size >= decl_size)
                return;

        uint32_t old_size = *size;
        *size = MAX2(*size * 2, decl_size);
        *regs = reralloc(c, *regs, struct qreg, *size);
        if (!*regs) {
                fprintf(stderr, "Malloc failure\n");
                abort();
        }

        /* Only the newly added tail needs (re)initialization. */
        for (uint32_t i = old_size; i < *size; i++)
                (*regs)[i] = c->undef;
}

/* Emits a thread-switch NOP (skipped for single-threaded shaders) and
 * records whether it happened outside of control flow, which matters for
 * the last-thrsw bookkeeping.
 */
void
vir_emit_thrsw(struct v3d_compile *c)
{
        if (c->threads == 1)
                return;

        /* Always thread switch after each texture operation for now.
         *
         * We could do better by batching a bunch of texture fetches up and
         * then doing one thread switch and collecting all their results
         * afterward.
         */
        c->last_thrsw = vir_NOP(c);
        c->last_thrsw->qpu.sig.thrsw = true;
        c->last_thrsw_at_top_level = !c->in_control_flow;
}

/* Maps a NIR load/store/atomic intrinsic to the op field of the general
 * TMU lookup configuration word.
 */
static uint32_t
v3d_general_tmu_op(nir_intrinsic_instr *instr)
{
        switch (instr->intrinsic) {
        case nir_intrinsic_load_ssbo:
        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_uniform:
        case nir_intrinsic_load_shared:
        case nir_intrinsic_load_scratch:
                return GENERAL_TMU_READ_OP_READ;
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_scratch:
                return GENERAL_TMU_WRITE_OP_WRITE;
        case nir_intrinsic_ssbo_atomic_add:
        case nir_intrinsic_shared_atomic_add:
                return GENERAL_TMU_WRITE_OP_ATOMIC_ADD_WRAP;
        case nir_intrinsic_ssbo_atomic_imin:
        case nir_intrinsic_shared_atomic_imin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMIN;
        case nir_intrinsic_ssbo_atomic_umin:
        case nir_intrinsic_shared_atomic_umin:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMIN;
        case nir_intrinsic_ssbo_atomic_imax:
        case nir_intrinsic_shared_atomic_imax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_SMAX;
        case nir_intrinsic_ssbo_atomic_umax:
        case nir_intrinsic_shared_atomic_umax:
                return GENERAL_TMU_WRITE_OP_ATOMIC_UMAX;
        case nir_intrinsic_ssbo_atomic_and:
        case nir_intrinsic_shared_atomic_and:
                return GENERAL_TMU_WRITE_OP_ATOMIC_AND;
        case nir_intrinsic_ssbo_atomic_or:
        case nir_intrinsic_shared_atomic_or:
                return GENERAL_TMU_WRITE_OP_ATOMIC_OR;
        case nir_intrinsic_ssbo_atomic_xor:
        case nir_intrinsic_shared_atomic_xor:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XOR;
        case nir_intrinsic_ssbo_atomic_exchange:
        case nir_intrinsic_shared_atomic_exchange:
                return GENERAL_TMU_WRITE_OP_ATOMIC_XCHG;
        case nir_intrinsic_ssbo_atomic_comp_swap:
        case nir_intrinsic_shared_atomic_comp_swap:
                return GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG;
        default:
                unreachable("unknown intrinsic op");
        }
}

/**
 * Implements indirect uniform loads and SSBO accesses through the TMU general
 * memory access interface.
 */
static void
ntq_emit_tmu_general(struct v3d_compile *c, nir_intrinsic_instr *instr,
                     bool is_shared_or_scratch)
{
        /* XXX perf: We should turn add/sub of 1 to inc/dec.  Perhaps NIR
         * wants to have support for inc/dec?
         */

        uint32_t tmu_op = v3d_general_tmu_op(instr);
        bool is_store = (instr->intrinsic == nir_intrinsic_store_ssbo ||
                         instr->intrinsic == nir_intrinsic_store_scratch ||
                         instr->intrinsic == nir_intrinsic_store_shared);
        /* Shared/scratch intrinsics carry no buffer-index source, so the
         * offset source moves down by one.
         */
        bool has_index = !is_shared_or_scratch;

        /* Pick which NIR source holds the offset, and write any data
         * operands to TMUD first.  tmu_writes counts TMU fifo entries this
         * op consumes (address plus data writes).
         */
        int offset_src;
        int tmu_writes = 1; /* address */
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                offset_src = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ssbo ||
                   instr->intrinsic == nir_intrinsic_load_ubo ||
                   instr->intrinsic == nir_intrinsic_load_scratch ||
                   instr->intrinsic == nir_intrinsic_load_shared) {
                offset_src = 0 + has_index;
        } else if (is_store) {
                offset_src = 1 + has_index;
                for (int i = 0; i < instr->num_components; i++) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[0], i));
                        tmu_writes++;
                }
        } else {
                /* Atomics: one data operand, plus a second one for the
                 * compare value of CMPXCHG.
                 */
                offset_src = 0 + has_index;
                vir_MOV_dest(c,
                             vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                             ntq_get_src(c, instr->src[1 + has_index], 0));
                tmu_writes++;
                if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG) {
                        vir_MOV_dest(c,
                                     vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUD),
                                     ntq_get_src(c, instr->src[2 + has_index],
                                                 0));
                        tmu_writes++;
                }
        }

        bool dynamic_src = !nir_src_is_const(instr->src[offset_src]);
        uint32_t const_offset = 0;
        if (!dynamic_src)
                const_offset = nir_src_as_uint(instr->src[offset_src]);

        /* Make sure we won't exceed the 16-entry TMU fifo if each thread is
         * storing at the same time.
         */
        while (tmu_writes > 16 / c->threads)
                c->threads /= 2;

        /* Compute the base address qreg; where possible the constant part
         * of the offset is folded into the uniform and zeroed out here.
         */
        struct qreg offset;
        if (instr->intrinsic == nir_intrinsic_load_uniform) {
                const_offset += nir_intrinsic_base(instr);
                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                     v3d_unit_data_create(0, const_offset));
                const_offset = 0;
        } else if (instr->intrinsic == nir_intrinsic_load_ubo) {
                uint32_t index = nir_src_as_uint(instr->src[0]) + 1;
                /* Note that QUNIFORM_UBO_ADDR takes a UBO index shifted up by
                 * 1 (0 is gallium's constant buffer 0).
                 */
                offset = vir_uniform(c, QUNIFORM_UBO_ADDR,
                                     v3d_unit_data_create(index, const_offset));
                const_offset = 0;
        } else if (is_shared_or_scratch) {
                /* Shared and scratch variables have no buffer index, and all
                 * start from a common base that we set up at the start of
                 * dispatch.
                 */
                if (instr->intrinsic == nir_intrinsic_load_scratch ||
                    instr->intrinsic == nir_intrinsic_store_scratch) {
                        offset = c->spill_base;
                } else {
                        offset = c->cs_shared_offset;
                        const_offset += nir_intrinsic_base(instr);
                }
        } else {
                offset = vir_uniform(c, QUNIFORM_SSBO_OFFSET,
                                     nir_src_as_uint(instr->src[is_store ?
                                                                1 : 0]));
        }

        /* The spec says that for atomics, the TYPE field is ignored, but that
         * doesn't seem to be the case for CMPXCHG.  Just use the number of
         * tmud writes we did to decide the type (or choose "32bit" for atomic
         * reads, which has been fine).
         */
        int num_components;
        if (tmu_op == GENERAL_TMU_WRITE_OP_ATOMIC_CMPXCHG)
                num_components = 2;
        else
                num_components = instr->num_components;

        uint32_t config = (0xffffff00 |
                           tmu_op |
                           GENERAL_TMU_LOOKUP_PER_PIXEL);
        if (num_components == 1) {
                config |= GENERAL_TMU_LOOKUP_TYPE_32BIT_UI;
        } else {
                config |= GENERAL_TMU_LOOKUP_TYPE_VEC2 + num_components - 2;
        }

        /* Push the execute mask into the flags so the address write below
         * can be predicated for non-uniform control flow.
         */
        if (vir_in_nonuniform_control_flow(c)) {
                vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                           V3D_QPU_PF_PUSHZ);
        }

        /* A config of all-ones matches TMUA's implicit default, so we can
         * skip the config-uniform write by using TMUA instead of TMUAU.
         * (config is uint32_t, so "== ~0" compares against 0xffffffff.)
         */
        struct qreg tmua;
        if (config == ~0)
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUA);
        else
                tmua = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_TMUAU);

        /* Write the address; this is what actually fires off the TMU op. */
        struct qinst *tmu;
        if (dynamic_src) {
                if (const_offset != 0) {
                        offset = vir_ADD(c, offset,
                                         vir_uniform_ui(c, const_offset));
                }
                tmu = vir_ADD_dest(c, tmua, offset,
                                   ntq_get_src(c, instr->src[offset_src], 0));
        } else {
                if (const_offset != 0) {
                        tmu = vir_ADD_dest(c, tmua, offset,
                                           vir_uniform_ui(c, const_offset));
                } else {
                        tmu = vir_MOV_dest(c, tmua, offset);
                }
        }

        if (config != ~0) {
                tmu->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT,
                                                     config);
        }

        if (vir_in_nonuniform_control_flow(c))
                vir_set_cond(tmu, V3D_QPU_COND_IFA);

        vir_emit_thrsw(c);

        /* Read the result, or wait for the TMU op to complete. */
        for (int i = 0; i < nir_intrinsic_dest_components(instr); i++)
                ntq_store_dest(c, &instr->dest, i, vir_MOV(c, vir_LDTMU(c)));

        if (nir_intrinsic_dest_components(instr) == 0)
                vir_TMUWT(c);
}

/* Allocates (in the def hash table's ralloc context) the per-component qreg
 * array for a NIR SSA def and registers it in c->def_ht.
 */
static struct qreg *
ntq_init_ssa_def(struct v3d_compile *c, nir_ssa_def *def)
{
        struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                          def->num_components);
        _mesa_hash_table_insert(c->def_ht, def, qregs);
        return qregs;
}

/**
 * This function is responsible for getting VIR results into the associated
 * storage for a NIR instruction.
 *
 * If it's a NIR SSA def, then we just set the associated hash table entry to
 * the new result.
 *
 * If it's a NIR reg, then we need to update the existing qreg assigned to the
 * NIR destination with the incoming value.  To do that without introducing
 * new MOVs, we require that the incoming qreg either be a uniform, or be
 * SSA-defined by the previous VIR instruction in the block and rewritable by
 * this function.
   That lets us sneak ahead and insert the SF flag beforehand
 * (knowing that the previous instruction doesn't depend on flags) and rewrite
 * its destination to be the NIR reg's destination
 */
void
ntq_store_dest(struct v3d_compile *c, nir_dest *dest, int chan,
               struct qreg result)
{
        struct qinst *last_inst = NULL;
        if (!list_empty(&c->cur_block->instructions))
                last_inst = (struct qinst *)c->cur_block->instructions.prev;

        /* The incoming value must be a temp SSA-defined by the last
         * instruction of the current block (see the comment above).
         */
        assert((result.file == QFILE_TEMP &&
                last_inst && last_inst == c->defs[result.index]));

        if (dest->is_ssa) {
                assert(chan < dest->ssa.num_components);

                struct qreg *qregs;
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, &dest->ssa);

                if (entry)
                        qregs = entry->data;
                else
                        qregs = ntq_init_ssa_def(c, &dest->ssa);

                qregs[chan] = result;
        } else {
                nir_register *reg = dest->reg.reg;
                assert(dest->reg.base_offset == 0);
                assert(reg->num_array_elems == 0);
                struct hash_entry *entry =
                        _mesa_hash_table_search(c->def_ht, reg);
                struct qreg *qregs = entry->data;

                /* Insert a MOV if the source wasn't an SSA def in the
                 * previous instruction.
                 *
                 * NOTE(review): the condition actually tested is "nonuniform
                 * control flow and the defining instruction is an ldunif" --
                 * presumably because ldunif can't be predicated below --
                 * which doesn't match this comment.  Confirm against git
                 * history before relying on either description.
                 */
                if ((vir_in_nonuniform_control_flow(c) &&
                     c->defs[last_inst->dst.index]->qpu.sig.ldunif)) {
                        result = vir_MOV(c, result);
                        last_inst = c->defs[result.index];
                }

                /* We know they're both temps, so just rewrite index.
                 */
                c->defs[last_inst->dst.index] = NULL;
                last_inst->dst.index = qregs[chan].index;

                /* If we're in control flow, then make this update of the reg
                 * conditional on the execution mask.
                 */
                if (vir_in_nonuniform_control_flow(c)) {
                        /* NOTE(review): this assignment is redundant -- the
                         * same index was already written unconditionally just
                         * above, and nothing changes it in between.
                         */
                        last_inst->dst.index = qregs[chan].index;

                        /* Set the flags to the current exec mask.
                         */
                        c->cursor = vir_before_inst(last_inst);
                        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute),
                                   V3D_QPU_PF_PUSHZ);
                        c->cursor = vir_after_inst(last_inst);

                        vir_set_cond(last_inst, V3D_QPU_COND_IFA);
                }
        }
}

/* Returns the qreg holding component i of NIR source src, looked up in the
 * def hash table (both SSA defs and registers live there).
 */
struct qreg
ntq_get_src(struct v3d_compile *c, nir_src src, int i)
{
        struct hash_entry *entry;
        if (src.is_ssa) {
                entry = _mesa_hash_table_search(c->def_ht, src.ssa);
                assert(i < src.ssa->num_components);
        } else {
                nir_register *reg = src.reg.reg;
                entry = _mesa_hash_table_search(c->def_ht, reg);
                assert(reg->num_array_elems == 0);
                assert(src.reg.base_offset == 0);
                assert(i < reg->num_components);
        }

        struct qreg *qregs = entry->data;
        return qregs[i];
}

/* Returns ALU source number src of instr, applying the swizzle for the
 * single enabled writemask channel (the backend is scalarized, so the
 * writemask must have at most one bit set).
 */
static struct qreg
ntq_get_alu_src(struct v3d_compile *c, nir_alu_instr *instr,
                unsigned src)
{
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        unsigned chan = ffs(instr->dest.write_mask) - 1;
        struct qreg r = ntq_get_src(c, instr->src[src].src,
                                    instr->src[src].swizzle[chan]);

        /* abs/negate modifiers should have been lowered away in NIR. */
        assert(!instr->src[src].abs);
        assert(!instr->src[src].negate);

        return r;
};

/* Computes max(size >> level, 1): one mipmap-level minification step. */
static struct qreg
ntq_minify(struct v3d_compile *c, struct qreg size, struct qreg level)
{
        return vir_MAX(c, vir_SHR(c, size, level), vir_uniform_ui(c, 1));
}

/* Emits a txs (texture size query) by reading the size from uniforms and
 * minifying each dimension by the requested LOD; the array dimension, if
 * any, is not minified.
 */
static void
ntq_emit_txs(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;
        int lod_index = nir_tex_instr_src_index(instr, nir_tex_src_lod);
        int dest_size = nir_tex_instr_dest_size(instr);

        struct qreg lod = c->undef;
        if (lod_index != -1)
                lod = ntq_get_src(c, instr->src[lod_index].src, 0);

        for (int i = 0; i < dest_size; i++) {
                assert(i < 3);
                enum quniform_contents contents;

                if (instr->is_array && i == dest_size - 1)
                        contents = QUNIFORM_TEXTURE_ARRAY_SIZE;
                else
                        contents = QUNIFORM_TEXTURE_WIDTH + i;

                struct qreg size = vir_uniform(c, contents, unit);

                switch (instr->sampler_dim) {
                case GLSL_SAMPLER_DIM_1D:
                case GLSL_SAMPLER_DIM_2D:
                case GLSL_SAMPLER_DIM_MS:
                case GLSL_SAMPLER_DIM_3D:
                case GLSL_SAMPLER_DIM_CUBE:
                        /* Don't minify the array size.
                         */
                        if (!(instr->is_array && i == dest_size - 1)) {
                                size = ntq_minify(c, size, lod);
                        }
                        break;

                case GLSL_SAMPLER_DIM_RECT:
                        /* There's no LOD field for rects */
                        break;

                default:
                        unreachable("Bad sampler type");
                }

                ntq_store_dest(c, &instr->dest, i, size);
        }
}

/* Dispatches a NIR texture instruction: size/levels queries are answered
 * from uniforms here; everything else goes to the per-generation backends.
 */
static void
ntq_emit_tex(struct v3d_compile *c, nir_tex_instr *instr)
{
        unsigned unit = instr->texture_index;

        /* Since each texture sampling op requires uploading uniforms to
         * reference the texture, there's no HW support for texture size and
         * you just upload uniforms containing the size.
         */
        switch (instr->op) {
        case nir_texop_query_levels:
                ntq_store_dest(c, &instr->dest, 0,
                               vir_uniform(c, QUNIFORM_TEXTURE_LEVELS, unit));
                return;
        case nir_texop_txs:
                ntq_emit_txs(c, instr);
                return;
        default:
                break;
        }

        if (c->devinfo->ver >= 40)
                v3d40_vir_emit_tex(c, instr);
        else
                v3d33_vir_emit_tex(c, instr);
}

/* Emits sin(x)/cos(x) using the hardware SFU SIN, which only handles the
 * range [-0.5, 0.5] periods: reduce the argument, evaluate, then flip the
 * sign for odd periods via the XOR of the sign bit.
 */
static struct qreg
ntq_fsincos(struct v3d_compile *c, struct qreg src, bool is_cos)
{
        struct qreg input = vir_FMUL(c, src, vir_uniform_f(c, 1.0f / M_PI));
        if (is_cos)
                input = vir_FADD(c, input, vir_uniform_f(c, 0.5));

        struct qreg periods = vir_FROUND(c, input);
        struct qreg sin_output = vir_SIN(c,
                                         vir_FSUB(c, input, periods));
        /* Shift amount -1 wraps to 31 in the QPU shifter, isolating the
         * sign bit of the (integer) period count -- TODO confirm against the
         * QPU shift semantics.
         */
        return vir_XOR(c, sin_output, vir_SHL(c,
                                              vir_FTOIN(c, periods),
                                              vir_uniform_ui(c, -1)));
}

/* Emits sign(x) for floats: 0.0, 1.0 or -1.0, selected with conditional
 * MOVs off the Z and N flags of src.
 */
static struct qreg
ntq_fsign(struct v3d_compile *c, struct qreg src)
{
        struct qreg t = vir_get_temp(c);

        vir_MOV_dest(c, t, vir_uniform_f(c, 0.0));
        vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHZ);
        vir_MOV_cond(c, V3D_QPU_COND_IFNA, t, vir_uniform_f(c, 1.0));
        vir_set_pf(vir_FMOV_dest(c, vir_nop_reg(), src), V3D_QPU_PF_PUSHN);
        vir_MOV_cond(c, V3D_QPU_COND_IFA, t, vir_uniform_f(c, -1.0));
        return vir_MOV(c, t);
}

/* Fills the four components of a gl_FragCoord input from the payload:
 * X/Y from the coordinate registers, Z directly, W as 1/payload_w.
 */
static void
emit_fragcoord_input(struct v3d_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = vir_FXCD(c);
        c->inputs[attr * 4 + 1] = vir_FYCD(c);
        c->inputs[attr * 4 + 2] = c->payload_z;
        c->inputs[attr * 4 + 3] = vir_RECIP(c, c->payload_w);
}

/* Emits the ldvary sequence for one component of a fragment shader input
 * varying and applies the interpolation mode.  The ldvary result pairs with
 * an implicit write of the C coefficient to r5, so the FMUL/FADD ordering
 * here is fixed by the hardware.
 */
static struct qreg
emit_fragment_varying(struct v3d_compile *c, nir_variable *var,
                      uint8_t swizzle, int array_index)
{
        struct qreg r3 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R3);
        struct qreg r5 = vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_R5);

        struct qreg vary;
        if (c->devinfo->ver >= 41) {
                struct qinst *ldvary = vir_add_inst(V3D_QPU_A_NOP, c->undef,
                                                    c->undef, c->undef);
                ldvary->qpu.sig.ldvary = true;
                vary = vir_emit_def(c, ldvary);
        } else {
                /* Pre-4.1 hardware delivers the varying in r3. */
                vir_NOP(c)->qpu.sig.ldvary = true;
                vary = r3;
        }

        /* For gl_PointCoord input or distance along a line, we'll be called
         * with no nir_variable, and we don't count toward VPM size so we
         * don't track an input slot.
         */
        if (!var) {
                return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
        }

        int i = c->num_inputs++;
        c->input_slots[i] =
                v3d_slot_from_slot_and_component(var->data.location +
                                                 array_index, swizzle);

        switch (var->data.interpolation) {
        case INTERP_MODE_NONE:
                /* If a gl_FrontColor or gl_BackColor input has no interp
                 * qualifier, then if we're using glShadeModel(GL_FLAT) it
                 * needs to be flat shaded.
                 */
                switch (var->data.location + array_index) {
                case VARYING_SLOT_COL0:
                case VARYING_SLOT_COL1:
                case VARYING_SLOT_BFC0:
                case VARYING_SLOT_BFC1:
                        if (c->fs_key->shade_model_flat) {
                                BITSET_SET(c->flat_shade_flags, i);
                                /* Consume the varying; the flat value is
                                 * delivered in r5.
                                 */
                                vir_MOV_dest(c, c->undef, vary);
                                return vir_MOV(c, r5);
                        } else {
                                return vir_FADD(c, vir_FMUL(c, vary,
                                                            c->payload_w), r5);
                        }
                default:
                        break;
                }
                /* FALLTHROUGH */
        case INTERP_MODE_SMOOTH:
                if (var->data.centroid) {
                        BITSET_SET(c->centroid_flags, i);
                        return vir_FADD(c, vir_FMUL(c, vary,
                                                    c->payload_w_centroid), r5);
                } else {
                        return vir_FADD(c, vir_FMUL(c, vary, c->payload_w), r5);
                }
        case INTERP_MODE_NOPERSPECTIVE:
                BITSET_SET(c->noperspective_flags, i);
                return vir_FADD(c, vir_MOV(c, vary), r5);
        case INTERP_MODE_FLAT:
                BITSET_SET(c->flat_shade_flags, i);
                vir_MOV_dest(c, c->undef, vary);
                return vir_MOV(c, r5);
        default:
                unreachable("Bad interp mode");
        }
}

/* Emits the varying loads for every component of fragment input var,
 * honoring its component offset (location_frac).
 */
static void
emit_fragment_input(struct v3d_compile *c, int attr, nir_variable *var,
                    int array_index)
{
        for (int i = 0; i < glsl_get_vector_elements(var->type); i++) {
                int chan = var->data.location_frac + i;
                c->inputs[attr * 4 + chan] =
                        emit_fragment_varying(c, var, chan, array_index);
        }
}

/* Records an output at decl_offset, growing both the qreg array and the
 * parallel output_slots array as needed.
 */
static void
add_output(struct v3d_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        /* Keep output_slots the same size as the outputs qreg array. */
        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct v3d_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset] =
                v3d_slot_from_slot_and_component(slot, swizzle);
}

/**
 * If compare_instr is a valid comparison instruction, emits the
 * compare_instr's comparison and returns the sel_instr's return value based
 * on the compare_instr's result.
 */
static bool
ntq_emit_comparison(struct v3d_compile *c,
                    nir_alu_instr *compare_instr,
                    enum v3d_qpu_cond *out_cond)
{
        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1;
        if (nir_op_infos[compare_instr->op].num_inputs > 1)
                src1 = ntq_get_alu_src(c, compare_instr, 1);
        bool cond_invert = false;
        struct qreg nop = vir_nop_reg();

        /* Each case pushes a flag (Z, N or C) from a throwaway ALU op;
         * cond_invert selects IFNA instead of IFA when the available flag
         * encodes the negation of the comparison.
         */
        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_seq:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;
        case nir_op_ieq32:
                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                break;

        case nir_op_fne32:
        case nir_op_sne:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;
        case nir_op_ine32:
                vir_set_pf(vir_XOR_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_fge32:
        case nir_op_sge:
                vir_set_pf(vir_FCMP_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ige32:
                /* Signed compares go through MIN: carry is set when
                 * src1 < src0, inverted to get src0 >= src1.
                 */
                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;
        case nir_op_uge32:
                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                cond_invert = true;
                break;

        case nir_op_slt:
        case nir_op_flt32:
                vir_set_pf(vir_FCMP_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHN);
                break;
        case nir_op_ilt32:
                vir_set_pf(vir_MIN_dest(c, nop, src1, src0), V3D_QPU_PF_PUSHC);
                break;
        case nir_op_ult32:
                vir_set_pf(vir_SUB_dest(c, nop, src0, src1), V3D_QPU_PF_PUSHC);
                break;

        case nir_op_i2b32:
                /* int-to-bool: Z set iff src0 == 0, inverted for "true". */
                vir_set_pf(vir_MOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        case nir_op_f2b32:
                vir_set_pf(vir_FMOV_dest(c, nop, src0), V3D_QPU_PF_PUSHZ);
                cond_invert = true;
                break;

        default:
                return false;
        }

        *out_cond = cond_invert ? V3D_QPU_COND_IFNA : V3D_QPU_COND_IFA;

        return true;
}

/* Finds an ALU instruction that generates our src value that could
 * (potentially) be greedily emitted in the consuming instruction.
 */
static struct nir_alu_instr *
ntq_get_alu_parent(nir_src src)
{
        if (!src.is_ssa || src.ssa->parent_instr->type != nir_instr_type_alu)
                return NULL;
        nir_alu_instr *instr = nir_instr_as_alu(src.ssa->parent_instr);
        if (!instr)
                return NULL;

        /* If the ALU instr's srcs are non-SSA, then we would have to avoid
         * moving emission of the ALU instr down past another write of the
         * src.
         */
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                if (!instr->src[i].src.is_ssa)
                        return NULL;
        }

        return instr;
}

/* Turns a NIR bool into a condition code to predicate on. */
static enum v3d_qpu_cond
ntq_emit_bool_to_cond(struct v3d_compile *c, nir_src src)
{
        /* Prefer re-emitting the comparison that produced the bool so we
         * can predicate directly on its flags.
         */
        nir_alu_instr *compare = ntq_get_alu_parent(src);
        if (!compare)
                goto out;

        enum v3d_qpu_cond cond;
        if (ntq_emit_comparison(c, compare, &cond))
                return cond;

out:
        /* Fallback: test the materialized bool value against zero. */
        vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), ntq_get_src(c, src, 0)),
                   V3D_QPU_PF_PUSHZ);
        return V3D_QPU_COND_IFNA;
}

static void
ntq_emit_alu(struct v3d_compile *c, nir_alu_instr *instr)
{
        /* This should always be lowered to ALU operations for V3D. */
        assert(!instr->dest.saturate);

        /* Vectors are special in that they have non-scalarized writemasks,
         * and just take the first swizzle channel for each argument in order
         * into each writemask channel.
807b8e80941Smrg */ 808b8e80941Smrg if (instr->op == nir_op_vec2 || 809b8e80941Smrg instr->op == nir_op_vec3 || 810b8e80941Smrg instr->op == nir_op_vec4) { 811b8e80941Smrg struct qreg srcs[4]; 812b8e80941Smrg for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 813b8e80941Smrg srcs[i] = ntq_get_src(c, instr->src[i].src, 814b8e80941Smrg instr->src[i].swizzle[0]); 815b8e80941Smrg for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) 816b8e80941Smrg ntq_store_dest(c, &instr->dest.dest, i, 817b8e80941Smrg vir_MOV(c, srcs[i])); 818b8e80941Smrg return; 819b8e80941Smrg } 820b8e80941Smrg 821b8e80941Smrg /* General case: We can just grab the one used channel per src. */ 822b8e80941Smrg struct qreg src[nir_op_infos[instr->op].num_inputs]; 823b8e80941Smrg for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 824b8e80941Smrg src[i] = ntq_get_alu_src(c, instr, i); 825b8e80941Smrg } 826b8e80941Smrg 827b8e80941Smrg struct qreg result; 828b8e80941Smrg 829b8e80941Smrg switch (instr->op) { 830b8e80941Smrg case nir_op_fmov: 831b8e80941Smrg case nir_op_imov: 832b8e80941Smrg result = vir_MOV(c, src[0]); 833b8e80941Smrg break; 834b8e80941Smrg 835b8e80941Smrg case nir_op_fneg: 836b8e80941Smrg result = vir_XOR(c, src[0], vir_uniform_ui(c, 1 << 31)); 837b8e80941Smrg break; 838b8e80941Smrg case nir_op_ineg: 839b8e80941Smrg result = vir_NEG(c, src[0]); 840b8e80941Smrg break; 841b8e80941Smrg 842b8e80941Smrg case nir_op_fmul: 843b8e80941Smrg result = vir_FMUL(c, src[0], src[1]); 844b8e80941Smrg break; 845b8e80941Smrg case nir_op_fadd: 846b8e80941Smrg result = vir_FADD(c, src[0], src[1]); 847b8e80941Smrg break; 848b8e80941Smrg case nir_op_fsub: 849b8e80941Smrg result = vir_FSUB(c, src[0], src[1]); 850b8e80941Smrg break; 851b8e80941Smrg case nir_op_fmin: 852b8e80941Smrg result = vir_FMIN(c, src[0], src[1]); 853b8e80941Smrg break; 854b8e80941Smrg case nir_op_fmax: 855b8e80941Smrg result = vir_FMAX(c, src[0], src[1]); 856b8e80941Smrg break; 857b8e80941Smrg 858b8e80941Smrg 
case nir_op_f2i32: { 859b8e80941Smrg nir_alu_instr *src0_alu = ntq_get_alu_parent(instr->src[0].src); 860b8e80941Smrg if (src0_alu && src0_alu->op == nir_op_fround_even) { 861b8e80941Smrg result = vir_FTOIN(c, ntq_get_alu_src(c, src0_alu, 0)); 862b8e80941Smrg } else { 863b8e80941Smrg result = vir_FTOIZ(c, src[0]); 864b8e80941Smrg } 865b8e80941Smrg break; 866b8e80941Smrg } 867b8e80941Smrg 868b8e80941Smrg case nir_op_f2u32: 869b8e80941Smrg result = vir_FTOUZ(c, src[0]); 870b8e80941Smrg break; 871b8e80941Smrg case nir_op_i2f32: 872b8e80941Smrg result = vir_ITOF(c, src[0]); 873b8e80941Smrg break; 874b8e80941Smrg case nir_op_u2f32: 875b8e80941Smrg result = vir_UTOF(c, src[0]); 876b8e80941Smrg break; 877b8e80941Smrg case nir_op_b2f32: 878b8e80941Smrg result = vir_AND(c, src[0], vir_uniform_f(c, 1.0)); 879b8e80941Smrg break; 880b8e80941Smrg case nir_op_b2i32: 881b8e80941Smrg result = vir_AND(c, src[0], vir_uniform_ui(c, 1)); 882b8e80941Smrg break; 883b8e80941Smrg 884b8e80941Smrg case nir_op_iadd: 885b8e80941Smrg result = vir_ADD(c, src[0], src[1]); 886b8e80941Smrg break; 887b8e80941Smrg case nir_op_ushr: 888b8e80941Smrg result = vir_SHR(c, src[0], src[1]); 889b8e80941Smrg break; 890b8e80941Smrg case nir_op_isub: 891b8e80941Smrg result = vir_SUB(c, src[0], src[1]); 892b8e80941Smrg break; 893b8e80941Smrg case nir_op_ishr: 894b8e80941Smrg result = vir_ASR(c, src[0], src[1]); 895b8e80941Smrg break; 896b8e80941Smrg case nir_op_ishl: 897b8e80941Smrg result = vir_SHL(c, src[0], src[1]); 898b8e80941Smrg break; 899b8e80941Smrg case nir_op_imin: 900b8e80941Smrg result = vir_MIN(c, src[0], src[1]); 901b8e80941Smrg break; 902b8e80941Smrg case nir_op_umin: 903b8e80941Smrg result = vir_UMIN(c, src[0], src[1]); 904b8e80941Smrg break; 905b8e80941Smrg case nir_op_imax: 906b8e80941Smrg result = vir_MAX(c, src[0], src[1]); 907b8e80941Smrg break; 908b8e80941Smrg case nir_op_umax: 909b8e80941Smrg result = vir_UMAX(c, src[0], src[1]); 910b8e80941Smrg break; 911b8e80941Smrg case nir_op_iand: 
912b8e80941Smrg result = vir_AND(c, src[0], src[1]); 913b8e80941Smrg break; 914b8e80941Smrg case nir_op_ior: 915b8e80941Smrg result = vir_OR(c, src[0], src[1]); 916b8e80941Smrg break; 917b8e80941Smrg case nir_op_ixor: 918b8e80941Smrg result = vir_XOR(c, src[0], src[1]); 919b8e80941Smrg break; 920b8e80941Smrg case nir_op_inot: 921b8e80941Smrg result = vir_NOT(c, src[0]); 922b8e80941Smrg break; 923b8e80941Smrg 924b8e80941Smrg case nir_op_ufind_msb: 925b8e80941Smrg result = vir_SUB(c, vir_uniform_ui(c, 31), vir_CLZ(c, src[0])); 926b8e80941Smrg break; 927b8e80941Smrg 928b8e80941Smrg case nir_op_imul: 929b8e80941Smrg result = vir_UMUL(c, src[0], src[1]); 930b8e80941Smrg break; 931b8e80941Smrg 932b8e80941Smrg case nir_op_seq: 933b8e80941Smrg case nir_op_sne: 934b8e80941Smrg case nir_op_sge: 935b8e80941Smrg case nir_op_slt: { 936b8e80941Smrg enum v3d_qpu_cond cond; 937b8e80941Smrg MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); 938b8e80941Smrg assert(ok); 939b8e80941Smrg result = vir_MOV(c, vir_SEL(c, cond, 940b8e80941Smrg vir_uniform_f(c, 1.0), 941b8e80941Smrg vir_uniform_f(c, 0.0))); 942b8e80941Smrg break; 943b8e80941Smrg } 944b8e80941Smrg 945b8e80941Smrg case nir_op_i2b32: 946b8e80941Smrg case nir_op_f2b32: 947b8e80941Smrg case nir_op_feq32: 948b8e80941Smrg case nir_op_fne32: 949b8e80941Smrg case nir_op_fge32: 950b8e80941Smrg case nir_op_flt32: 951b8e80941Smrg case nir_op_ieq32: 952b8e80941Smrg case nir_op_ine32: 953b8e80941Smrg case nir_op_ige32: 954b8e80941Smrg case nir_op_uge32: 955b8e80941Smrg case nir_op_ilt32: 956b8e80941Smrg case nir_op_ult32: { 957b8e80941Smrg enum v3d_qpu_cond cond; 958b8e80941Smrg MAYBE_UNUSED bool ok = ntq_emit_comparison(c, instr, &cond); 959b8e80941Smrg assert(ok); 960b8e80941Smrg result = vir_MOV(c, vir_SEL(c, cond, 961b8e80941Smrg vir_uniform_ui(c, ~0), 962b8e80941Smrg vir_uniform_ui(c, 0))); 963b8e80941Smrg break; 964b8e80941Smrg } 965b8e80941Smrg 966b8e80941Smrg case nir_op_b32csel: 967b8e80941Smrg result = vir_MOV(c, 
968b8e80941Smrg vir_SEL(c, 969b8e80941Smrg ntq_emit_bool_to_cond(c, instr->src[0].src), 970b8e80941Smrg src[1], src[2])); 971b8e80941Smrg break; 972b8e80941Smrg 973b8e80941Smrg case nir_op_fcsel: 974b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), src[0]), 975b8e80941Smrg V3D_QPU_PF_PUSHZ); 976b8e80941Smrg result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFNA, 977b8e80941Smrg src[1], src[2])); 978b8e80941Smrg break; 979b8e80941Smrg 980b8e80941Smrg case nir_op_frcp: 981b8e80941Smrg result = vir_RECIP(c, src[0]); 982b8e80941Smrg break; 983b8e80941Smrg case nir_op_frsq: 984b8e80941Smrg result = vir_RSQRT(c, src[0]); 985b8e80941Smrg break; 986b8e80941Smrg case nir_op_fexp2: 987b8e80941Smrg result = vir_EXP(c, src[0]); 988b8e80941Smrg break; 989b8e80941Smrg case nir_op_flog2: 990b8e80941Smrg result = vir_LOG(c, src[0]); 991b8e80941Smrg break; 992b8e80941Smrg 993b8e80941Smrg case nir_op_fceil: 994b8e80941Smrg result = vir_FCEIL(c, src[0]); 995b8e80941Smrg break; 996b8e80941Smrg case nir_op_ffloor: 997b8e80941Smrg result = vir_FFLOOR(c, src[0]); 998b8e80941Smrg break; 999b8e80941Smrg case nir_op_fround_even: 1000b8e80941Smrg result = vir_FROUND(c, src[0]); 1001b8e80941Smrg break; 1002b8e80941Smrg case nir_op_ftrunc: 1003b8e80941Smrg result = vir_FTRUNC(c, src[0]); 1004b8e80941Smrg break; 1005b8e80941Smrg 1006b8e80941Smrg case nir_op_fsin: 1007b8e80941Smrg result = ntq_fsincos(c, src[0], false); 1008b8e80941Smrg break; 1009b8e80941Smrg case nir_op_fcos: 1010b8e80941Smrg result = ntq_fsincos(c, src[0], true); 1011b8e80941Smrg break; 1012b8e80941Smrg 1013b8e80941Smrg case nir_op_fsign: 1014b8e80941Smrg result = ntq_fsign(c, src[0]); 1015b8e80941Smrg break; 1016b8e80941Smrg 1017b8e80941Smrg case nir_op_fabs: { 1018b8e80941Smrg result = vir_FMOV(c, src[0]); 1019b8e80941Smrg vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_ABS); 1020b8e80941Smrg break; 1021b8e80941Smrg } 1022b8e80941Smrg 1023b8e80941Smrg case nir_op_iabs: 1024b8e80941Smrg result = vir_MAX(c, src[0], 
vir_NEG(c, src[0])); 1025b8e80941Smrg break; 1026b8e80941Smrg 1027b8e80941Smrg case nir_op_fddx: 1028b8e80941Smrg case nir_op_fddx_coarse: 1029b8e80941Smrg case nir_op_fddx_fine: 1030b8e80941Smrg result = vir_FDX(c, src[0]); 1031b8e80941Smrg break; 1032b8e80941Smrg 1033b8e80941Smrg case nir_op_fddy: 1034b8e80941Smrg case nir_op_fddy_coarse: 1035b8e80941Smrg case nir_op_fddy_fine: 1036b8e80941Smrg result = vir_FDY(c, src[0]); 1037b8e80941Smrg break; 1038b8e80941Smrg 1039b8e80941Smrg case nir_op_uadd_carry: 1040b8e80941Smrg vir_set_pf(vir_ADD_dest(c, vir_nop_reg(), src[0], src[1]), 1041b8e80941Smrg V3D_QPU_PF_PUSHC); 1042b8e80941Smrg result = vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, 1043b8e80941Smrg vir_uniform_ui(c, ~0), 1044b8e80941Smrg vir_uniform_ui(c, 0))); 1045b8e80941Smrg break; 1046b8e80941Smrg 1047b8e80941Smrg case nir_op_pack_half_2x16_split: 1048b8e80941Smrg result = vir_VFPACK(c, src[0], src[1]); 1049b8e80941Smrg break; 1050b8e80941Smrg 1051b8e80941Smrg case nir_op_unpack_half_2x16_split_x: 1052b8e80941Smrg result = vir_FMOV(c, src[0]); 1053b8e80941Smrg vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_L); 1054b8e80941Smrg break; 1055b8e80941Smrg 1056b8e80941Smrg case nir_op_unpack_half_2x16_split_y: 1057b8e80941Smrg result = vir_FMOV(c, src[0]); 1058b8e80941Smrg vir_set_unpack(c->defs[result.index], 0, V3D_QPU_UNPACK_H); 1059b8e80941Smrg break; 1060b8e80941Smrg 1061b8e80941Smrg default: 1062b8e80941Smrg fprintf(stderr, "unknown NIR ALU inst: "); 1063b8e80941Smrg nir_print_instr(&instr->instr, stderr); 1064b8e80941Smrg fprintf(stderr, "\n"); 1065b8e80941Smrg abort(); 1066b8e80941Smrg } 1067b8e80941Smrg 1068b8e80941Smrg /* We have a scalar result, so the instruction should only have a 1069b8e80941Smrg * single channel written to. 
1070b8e80941Smrg */ 1071b8e80941Smrg assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 1072b8e80941Smrg ntq_store_dest(c, &instr->dest.dest, 1073b8e80941Smrg ffs(instr->dest.write_mask) - 1, result); 1074b8e80941Smrg} 1075b8e80941Smrg 1076b8e80941Smrg/* Each TLB read/write setup (a render target or depth buffer) takes an 8-bit 1077b8e80941Smrg * specifier. They come from a register that's preloaded with 0xffffffff 1078b8e80941Smrg * (0xff gets you normal vec4 f16 RT0 writes), and when one is neaded the low 1079b8e80941Smrg * 8 bits are shifted off the bottom and 0xff shifted in from the top. 1080b8e80941Smrg */ 1081b8e80941Smrg#define TLB_TYPE_F16_COLOR (3 << 6) 1082b8e80941Smrg#define TLB_TYPE_I32_COLOR (1 << 6) 1083b8e80941Smrg#define TLB_TYPE_F32_COLOR (0 << 6) 1084b8e80941Smrg#define TLB_RENDER_TARGET_SHIFT 3 /* Reversed! 7 = RT 0, 0 = RT 7. */ 1085b8e80941Smrg#define TLB_SAMPLE_MODE_PER_SAMPLE (0 << 2) 1086b8e80941Smrg#define TLB_SAMPLE_MODE_PER_PIXEL (1 << 2) 1087b8e80941Smrg#define TLB_F16_SWAP_HI_LO (1 << 1) 1088b8e80941Smrg#define TLB_VEC_SIZE_4_F16 (1 << 0) 1089b8e80941Smrg#define TLB_VEC_SIZE_2_F16 (0 << 0) 1090b8e80941Smrg#define TLB_VEC_SIZE_MINUS_1_SHIFT 0 1091b8e80941Smrg 1092b8e80941Smrg/* Triggers Z/Stencil testing, used when the shader state's "FS modifies Z" 1093b8e80941Smrg * flag is set. 1094b8e80941Smrg */ 1095b8e80941Smrg#define TLB_TYPE_DEPTH ((2 << 6) | (0 << 4)) 1096b8e80941Smrg#define TLB_DEPTH_TYPE_INVARIANT (0 << 2) /* Unmodified sideband input used */ 1097b8e80941Smrg#define TLB_DEPTH_TYPE_PER_PIXEL (1 << 2) /* QPU result used */ 1098b8e80941Smrg#define TLB_V42_DEPTH_TYPE_INVARIANT (0 << 3) /* Unmodified sideband input used */ 1099b8e80941Smrg#define TLB_V42_DEPTH_TYPE_PER_PIXEL (1 << 3) /* QPU result used */ 1100b8e80941Smrg 1101b8e80941Smrg/* Stencil is a single 32-bit write. 
*/ 1102b8e80941Smrg#define TLB_TYPE_STENCIL_ALPHA ((2 << 6) | (1 << 4)) 1103b8e80941Smrg 1104b8e80941Smrgstatic void 1105b8e80941Smrgemit_frag_end(struct v3d_compile *c) 1106b8e80941Smrg{ 1107b8e80941Smrg /* XXX 1108b8e80941Smrg if (c->output_sample_mask_index != -1) { 1109b8e80941Smrg vir_MS_MASK(c, c->outputs[c->output_sample_mask_index]); 1110b8e80941Smrg } 1111b8e80941Smrg */ 1112b8e80941Smrg 1113b8e80941Smrg bool has_any_tlb_color_write = false; 1114b8e80941Smrg for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) { 1115b8e80941Smrg if (c->fs_key->cbufs & (1 << rt) && c->output_color_var[rt]) 1116b8e80941Smrg has_any_tlb_color_write = true; 1117b8e80941Smrg } 1118b8e80941Smrg 1119b8e80941Smrg if (c->fs_key->sample_alpha_to_coverage && c->output_color_var[0]) { 1120b8e80941Smrg struct nir_variable *var = c->output_color_var[0]; 1121b8e80941Smrg struct qreg *color = &c->outputs[var->data.driver_location * 4]; 1122b8e80941Smrg 1123b8e80941Smrg vir_SETMSF_dest(c, vir_nop_reg(), 1124b8e80941Smrg vir_AND(c, 1125b8e80941Smrg vir_MSF(c), 1126b8e80941Smrg vir_FTOC(c, color[3]))); 1127b8e80941Smrg } 1128b8e80941Smrg 1129b8e80941Smrg struct qreg tlb_reg = vir_magic_reg(V3D_QPU_WADDR_TLB); 1130b8e80941Smrg struct qreg tlbu_reg = vir_magic_reg(V3D_QPU_WADDR_TLBU); 1131b8e80941Smrg if (c->output_position_index != -1) { 1132b8e80941Smrg struct qinst *inst = vir_MOV_dest(c, tlbu_reg, 1133b8e80941Smrg c->outputs[c->output_position_index]); 1134b8e80941Smrg uint8_t tlb_specifier = TLB_TYPE_DEPTH; 1135b8e80941Smrg 1136b8e80941Smrg if (c->devinfo->ver >= 42) { 1137b8e80941Smrg tlb_specifier |= (TLB_V42_DEPTH_TYPE_PER_PIXEL | 1138b8e80941Smrg TLB_SAMPLE_MODE_PER_PIXEL); 1139b8e80941Smrg } else 1140b8e80941Smrg tlb_specifier |= TLB_DEPTH_TYPE_PER_PIXEL; 1141b8e80941Smrg 1142b8e80941Smrg inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 1143b8e80941Smrg tlb_specifier | 1144b8e80941Smrg 0xffffff00); 1145b8e80941Smrg c->writes_z = true; 1146b8e80941Smrg } else if 
(c->s->info.fs.uses_discard || 1147b8e80941Smrg !c->s->info.fs.early_fragment_tests || 1148b8e80941Smrg c->fs_key->sample_alpha_to_coverage || 1149b8e80941Smrg !has_any_tlb_color_write) { 1150b8e80941Smrg /* Emit passthrough Z if it needed to be delayed until shader 1151b8e80941Smrg * end due to potential discards. 1152b8e80941Smrg * 1153b8e80941Smrg * Since (single-threaded) fragment shaders always need a TLB 1154b8e80941Smrg * write, emit passthrouh Z if we didn't have any color 1155b8e80941Smrg * buffers and flag us as potentially discarding, so that we 1156b8e80941Smrg * can use Z as the TLB write. 1157b8e80941Smrg */ 1158b8e80941Smrg c->s->info.fs.uses_discard = true; 1159b8e80941Smrg 1160b8e80941Smrg struct qinst *inst = vir_MOV_dest(c, tlbu_reg, 1161b8e80941Smrg vir_nop_reg()); 1162b8e80941Smrg uint8_t tlb_specifier = TLB_TYPE_DEPTH; 1163b8e80941Smrg 1164b8e80941Smrg if (c->devinfo->ver >= 42) { 1165b8e80941Smrg /* The spec says the PER_PIXEL flag is ignored for 1166b8e80941Smrg * invariant writes, but the simulator demands it. 
1167b8e80941Smrg */ 1168b8e80941Smrg tlb_specifier |= (TLB_V42_DEPTH_TYPE_INVARIANT | 1169b8e80941Smrg TLB_SAMPLE_MODE_PER_PIXEL); 1170b8e80941Smrg } else { 1171b8e80941Smrg tlb_specifier |= TLB_DEPTH_TYPE_INVARIANT; 1172b8e80941Smrg } 1173b8e80941Smrg 1174b8e80941Smrg inst->uniform = vir_get_uniform_index(c, 1175b8e80941Smrg QUNIFORM_CONSTANT, 1176b8e80941Smrg tlb_specifier | 1177b8e80941Smrg 0xffffff00); 1178b8e80941Smrg c->writes_z = true; 1179b8e80941Smrg } 1180b8e80941Smrg 1181b8e80941Smrg /* XXX: Performance improvement: Merge Z write and color writes TLB 1182b8e80941Smrg * uniform setup 1183b8e80941Smrg */ 1184b8e80941Smrg 1185b8e80941Smrg for (int rt = 0; rt < V3D_MAX_DRAW_BUFFERS; rt++) { 1186b8e80941Smrg if (!(c->fs_key->cbufs & (1 << rt)) || !c->output_color_var[rt]) 1187b8e80941Smrg continue; 1188b8e80941Smrg 1189b8e80941Smrg nir_variable *var = c->output_color_var[rt]; 1190b8e80941Smrg struct qreg *color = &c->outputs[var->data.driver_location * 4]; 1191b8e80941Smrg int num_components = glsl_get_vector_elements(var->type); 1192b8e80941Smrg uint32_t conf = 0xffffff00; 1193b8e80941Smrg struct qinst *inst; 1194b8e80941Smrg 1195b8e80941Smrg conf |= TLB_SAMPLE_MODE_PER_PIXEL; 1196b8e80941Smrg conf |= (7 - rt) << TLB_RENDER_TARGET_SHIFT; 1197b8e80941Smrg 1198b8e80941Smrg if (c->fs_key->swap_color_rb & (1 << rt)) 1199b8e80941Smrg num_components = MAX2(num_components, 3); 1200b8e80941Smrg 1201b8e80941Smrg assert(num_components != 0); 1202b8e80941Smrg switch (glsl_get_base_type(var->type)) { 1203b8e80941Smrg case GLSL_TYPE_UINT: 1204b8e80941Smrg case GLSL_TYPE_INT: 1205b8e80941Smrg /* The F32 vs I32 distinction was dropped in 4.2. 
*/ 1206b8e80941Smrg if (c->devinfo->ver < 42) 1207b8e80941Smrg conf |= TLB_TYPE_I32_COLOR; 1208b8e80941Smrg else 1209b8e80941Smrg conf |= TLB_TYPE_F32_COLOR; 1210b8e80941Smrg conf |= ((num_components - 1) << 1211b8e80941Smrg TLB_VEC_SIZE_MINUS_1_SHIFT); 1212b8e80941Smrg 1213b8e80941Smrg inst = vir_MOV_dest(c, tlbu_reg, color[0]); 1214b8e80941Smrg inst->uniform = vir_get_uniform_index(c, 1215b8e80941Smrg QUNIFORM_CONSTANT, 1216b8e80941Smrg conf); 1217b8e80941Smrg 1218b8e80941Smrg for (int i = 1; i < num_components; i++) { 1219b8e80941Smrg inst = vir_MOV_dest(c, tlb_reg, color[i]); 1220b8e80941Smrg } 1221b8e80941Smrg break; 1222b8e80941Smrg 1223b8e80941Smrg default: { 1224b8e80941Smrg struct qreg r = color[0]; 1225b8e80941Smrg struct qreg g = color[1]; 1226b8e80941Smrg struct qreg b = color[2]; 1227b8e80941Smrg struct qreg a = color[3]; 1228b8e80941Smrg 1229b8e80941Smrg if (c->fs_key->f32_color_rb & (1 << rt)) { 1230b8e80941Smrg conf |= TLB_TYPE_F32_COLOR; 1231b8e80941Smrg conf |= ((num_components - 1) << 1232b8e80941Smrg TLB_VEC_SIZE_MINUS_1_SHIFT); 1233b8e80941Smrg } else { 1234b8e80941Smrg conf |= TLB_TYPE_F16_COLOR; 1235b8e80941Smrg conf |= TLB_F16_SWAP_HI_LO; 1236b8e80941Smrg if (num_components >= 3) 1237b8e80941Smrg conf |= TLB_VEC_SIZE_4_F16; 1238b8e80941Smrg else 1239b8e80941Smrg conf |= TLB_VEC_SIZE_2_F16; 1240b8e80941Smrg } 1241b8e80941Smrg 1242b8e80941Smrg if (c->fs_key->swap_color_rb & (1 << rt)) { 1243b8e80941Smrg r = color[2]; 1244b8e80941Smrg b = color[0]; 1245b8e80941Smrg } 1246b8e80941Smrg 1247b8e80941Smrg if (c->fs_key->sample_alpha_to_one) 1248b8e80941Smrg a = vir_uniform_f(c, 1.0); 1249b8e80941Smrg 1250b8e80941Smrg if (c->fs_key->f32_color_rb & (1 << rt)) { 1251b8e80941Smrg inst = vir_MOV_dest(c, tlbu_reg, r); 1252b8e80941Smrg inst->uniform = vir_get_uniform_index(c, 1253b8e80941Smrg QUNIFORM_CONSTANT, 1254b8e80941Smrg conf); 1255b8e80941Smrg 1256b8e80941Smrg if (num_components >= 2) 1257b8e80941Smrg vir_MOV_dest(c, tlb_reg, g); 1258b8e80941Smrg 
if (num_components >= 3) 1259b8e80941Smrg vir_MOV_dest(c, tlb_reg, b); 1260b8e80941Smrg if (num_components >= 4) 1261b8e80941Smrg vir_MOV_dest(c, tlb_reg, a); 1262b8e80941Smrg } else { 1263b8e80941Smrg inst = vir_VFPACK_dest(c, tlb_reg, r, g); 1264b8e80941Smrg if (conf != ~0) { 1265b8e80941Smrg inst->dst = tlbu_reg; 1266b8e80941Smrg inst->uniform = vir_get_uniform_index(c, 1267b8e80941Smrg QUNIFORM_CONSTANT, 1268b8e80941Smrg conf); 1269b8e80941Smrg } 1270b8e80941Smrg 1271b8e80941Smrg if (num_components >= 3) 1272b8e80941Smrg inst = vir_VFPACK_dest(c, tlb_reg, b, a); 1273b8e80941Smrg } 1274b8e80941Smrg break; 1275b8e80941Smrg } 1276b8e80941Smrg } 1277b8e80941Smrg } 1278b8e80941Smrg} 1279b8e80941Smrg 1280b8e80941Smrgstatic void 1281b8e80941Smrgvir_VPM_WRITE(struct v3d_compile *c, struct qreg val, uint32_t vpm_index) 1282b8e80941Smrg{ 1283b8e80941Smrg if (c->devinfo->ver >= 40) { 1284b8e80941Smrg vir_STVPMV(c, vir_uniform_ui(c, vpm_index), val); 1285b8e80941Smrg } else { 1286b8e80941Smrg /* XXX: v3d33_vir_vpm_write_setup(c); */ 1287b8e80941Smrg vir_MOV_dest(c, vir_reg(QFILE_MAGIC, V3D_QPU_WADDR_VPM), val); 1288b8e80941Smrg } 1289b8e80941Smrg} 1290b8e80941Smrg 1291b8e80941Smrgstatic void 1292b8e80941Smrgemit_vert_end(struct v3d_compile *c) 1293b8e80941Smrg{ 1294b8e80941Smrg /* GFXH-1684: VPM writes need to be complete by the end of the shader. 
1295b8e80941Smrg */ 1296b8e80941Smrg if (c->devinfo->ver >= 40 && c->devinfo->ver <= 42) 1297b8e80941Smrg vir_VPMWT(c); 1298b8e80941Smrg} 1299b8e80941Smrg 1300b8e80941Smrgvoid 1301b8e80941Smrgv3d_optimize_nir(struct nir_shader *s) 1302b8e80941Smrg{ 1303b8e80941Smrg bool progress; 1304b8e80941Smrg 1305b8e80941Smrg do { 1306b8e80941Smrg progress = false; 1307b8e80941Smrg 1308b8e80941Smrg NIR_PASS_V(s, nir_lower_vars_to_ssa); 1309b8e80941Smrg NIR_PASS(progress, s, nir_lower_alu_to_scalar); 1310b8e80941Smrg NIR_PASS(progress, s, nir_lower_phis_to_scalar); 1311b8e80941Smrg NIR_PASS(progress, s, nir_copy_prop); 1312b8e80941Smrg NIR_PASS(progress, s, nir_opt_remove_phis); 1313b8e80941Smrg NIR_PASS(progress, s, nir_opt_dce); 1314b8e80941Smrg NIR_PASS(progress, s, nir_opt_dead_cf); 1315b8e80941Smrg NIR_PASS(progress, s, nir_opt_cse); 1316b8e80941Smrg NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true); 1317b8e80941Smrg NIR_PASS(progress, s, nir_opt_algebraic); 1318b8e80941Smrg NIR_PASS(progress, s, nir_opt_constant_folding); 1319b8e80941Smrg NIR_PASS(progress, s, nir_opt_undef); 1320b8e80941Smrg } while (progress); 1321b8e80941Smrg 1322b8e80941Smrg NIR_PASS(progress, s, nir_opt_move_load_ubo); 1323b8e80941Smrg} 1324b8e80941Smrg 1325b8e80941Smrgstatic int 1326b8e80941Smrgdriver_location_compare(const void *in_a, const void *in_b) 1327b8e80941Smrg{ 1328b8e80941Smrg const nir_variable *const *a = in_a; 1329b8e80941Smrg const nir_variable *const *b = in_b; 1330b8e80941Smrg 1331b8e80941Smrg return (*a)->data.driver_location - (*b)->data.driver_location; 1332b8e80941Smrg} 1333b8e80941Smrg 1334b8e80941Smrgstatic struct qreg 1335b8e80941Smrgntq_emit_vpm_read(struct v3d_compile *c, 1336b8e80941Smrg uint32_t *num_components_queued, 1337b8e80941Smrg uint32_t *remaining, 1338b8e80941Smrg uint32_t vpm_index) 1339b8e80941Smrg{ 1340b8e80941Smrg struct qreg vpm = vir_reg(QFILE_VPM, vpm_index); 1341b8e80941Smrg 1342b8e80941Smrg if (c->devinfo->ver >= 40 ) { 1343b8e80941Smrg return 
vir_LDVPMV_IN(c, 1344b8e80941Smrg vir_uniform_ui(c, 1345b8e80941Smrg (*num_components_queued)++)); 1346b8e80941Smrg } 1347b8e80941Smrg 1348b8e80941Smrg if (*num_components_queued != 0) { 1349b8e80941Smrg (*num_components_queued)--; 1350b8e80941Smrg return vir_MOV(c, vpm); 1351b8e80941Smrg } 1352b8e80941Smrg 1353b8e80941Smrg uint32_t num_components = MIN2(*remaining, 32); 1354b8e80941Smrg 1355b8e80941Smrg v3d33_vir_vpm_read_setup(c, num_components); 1356b8e80941Smrg 1357b8e80941Smrg *num_components_queued = num_components - 1; 1358b8e80941Smrg *remaining -= num_components; 1359b8e80941Smrg 1360b8e80941Smrg return vir_MOV(c, vpm); 1361b8e80941Smrg} 1362b8e80941Smrg 1363b8e80941Smrgstatic void 1364b8e80941Smrgntq_setup_vpm_inputs(struct v3d_compile *c) 1365b8e80941Smrg{ 1366b8e80941Smrg /* Figure out how many components of each vertex attribute the shader 1367b8e80941Smrg * uses. Each variable should have been split to individual 1368b8e80941Smrg * components and unused ones DCEed. The vertex fetcher will load 1369b8e80941Smrg * from the start of the attribute to the number of components we 1370b8e80941Smrg * declare we need in c->vattr_sizes[]. 1371b8e80941Smrg */ 1372b8e80941Smrg nir_foreach_variable(var, &c->s->inputs) { 1373b8e80941Smrg /* No VS attribute array support. 
*/ 1374b8e80941Smrg assert(MAX2(glsl_get_length(var->type), 1) == 1); 1375b8e80941Smrg 1376b8e80941Smrg unsigned loc = var->data.driver_location; 1377b8e80941Smrg int start_component = var->data.location_frac; 1378b8e80941Smrg int num_components = glsl_get_components(var->type); 1379b8e80941Smrg 1380b8e80941Smrg c->vattr_sizes[loc] = MAX2(c->vattr_sizes[loc], 1381b8e80941Smrg start_component + num_components); 1382b8e80941Smrg } 1383b8e80941Smrg 1384b8e80941Smrg unsigned num_components = 0; 1385b8e80941Smrg uint32_t vpm_components_queued = 0; 1386b8e80941Smrg bool uses_iid = c->s->info.system_values_read & 1387b8e80941Smrg (1ull << SYSTEM_VALUE_INSTANCE_ID); 1388b8e80941Smrg bool uses_vid = c->s->info.system_values_read & 1389b8e80941Smrg (1ull << SYSTEM_VALUE_VERTEX_ID); 1390b8e80941Smrg num_components += uses_iid; 1391b8e80941Smrg num_components += uses_vid; 1392b8e80941Smrg 1393b8e80941Smrg for (int i = 0; i < ARRAY_SIZE(c->vattr_sizes); i++) 1394b8e80941Smrg num_components += c->vattr_sizes[i]; 1395b8e80941Smrg 1396b8e80941Smrg if (uses_iid) { 1397b8e80941Smrg c->iid = ntq_emit_vpm_read(c, &vpm_components_queued, 1398b8e80941Smrg &num_components, ~0); 1399b8e80941Smrg } 1400b8e80941Smrg 1401b8e80941Smrg if (uses_vid) { 1402b8e80941Smrg c->vid = ntq_emit_vpm_read(c, &vpm_components_queued, 1403b8e80941Smrg &num_components, ~0); 1404b8e80941Smrg } 1405b8e80941Smrg 1406b8e80941Smrg /* The actual loads will happen directly in nir_intrinsic_load_input 1407b8e80941Smrg * on newer versions. 
1408b8e80941Smrg */ 1409b8e80941Smrg if (c->devinfo->ver >= 40) 1410b8e80941Smrg return; 1411b8e80941Smrg 1412b8e80941Smrg for (int loc = 0; loc < ARRAY_SIZE(c->vattr_sizes); loc++) { 1413b8e80941Smrg resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1414b8e80941Smrg (loc + 1) * 4); 1415b8e80941Smrg 1416b8e80941Smrg for (int i = 0; i < c->vattr_sizes[loc]; i++) { 1417b8e80941Smrg c->inputs[loc * 4 + i] = 1418b8e80941Smrg ntq_emit_vpm_read(c, 1419b8e80941Smrg &vpm_components_queued, 1420b8e80941Smrg &num_components, 1421b8e80941Smrg loc * 4 + i); 1422b8e80941Smrg 1423b8e80941Smrg } 1424b8e80941Smrg } 1425b8e80941Smrg 1426b8e80941Smrg if (c->devinfo->ver >= 40) { 1427b8e80941Smrg assert(vpm_components_queued == num_components); 1428b8e80941Smrg } else { 1429b8e80941Smrg assert(vpm_components_queued == 0); 1430b8e80941Smrg assert(num_components == 0); 1431b8e80941Smrg } 1432b8e80941Smrg} 1433b8e80941Smrg 1434b8e80941Smrgstatic void 1435b8e80941Smrgntq_setup_fs_inputs(struct v3d_compile *c) 1436b8e80941Smrg{ 1437b8e80941Smrg unsigned num_entries = 0; 1438b8e80941Smrg unsigned num_components = 0; 1439b8e80941Smrg nir_foreach_variable(var, &c->s->inputs) { 1440b8e80941Smrg num_entries++; 1441b8e80941Smrg num_components += glsl_get_components(var->type); 1442b8e80941Smrg } 1443b8e80941Smrg 1444b8e80941Smrg nir_variable *vars[num_entries]; 1445b8e80941Smrg 1446b8e80941Smrg unsigned i = 0; 1447b8e80941Smrg nir_foreach_variable(var, &c->s->inputs) 1448b8e80941Smrg vars[i++] = var; 1449b8e80941Smrg 1450b8e80941Smrg /* Sort the variables so that we emit the input setup in 1451b8e80941Smrg * driver_location order. This is required for VPM reads, whose data 1452b8e80941Smrg * is fetched into the VPM in driver_location (TGSI register index) 1453b8e80941Smrg * order. 
1454b8e80941Smrg */ 1455b8e80941Smrg qsort(&vars, num_entries, sizeof(*vars), driver_location_compare); 1456b8e80941Smrg 1457b8e80941Smrg for (unsigned i = 0; i < num_entries; i++) { 1458b8e80941Smrg nir_variable *var = vars[i]; 1459b8e80941Smrg unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1460b8e80941Smrg unsigned loc = var->data.driver_location; 1461b8e80941Smrg 1462b8e80941Smrg resize_qreg_array(c, &c->inputs, &c->inputs_array_size, 1463b8e80941Smrg (loc + array_len) * 4); 1464b8e80941Smrg 1465b8e80941Smrg if (var->data.location == VARYING_SLOT_POS) { 1466b8e80941Smrg emit_fragcoord_input(c, loc); 1467b8e80941Smrg } else if (var->data.location == VARYING_SLOT_PNTC || 1468b8e80941Smrg (var->data.location >= VARYING_SLOT_VAR0 && 1469b8e80941Smrg (c->fs_key->point_sprite_mask & 1470b8e80941Smrg (1 << (var->data.location - 1471b8e80941Smrg VARYING_SLOT_VAR0))))) { 1472b8e80941Smrg c->inputs[loc * 4 + 0] = c->point_x; 1473b8e80941Smrg c->inputs[loc * 4 + 1] = c->point_y; 1474b8e80941Smrg } else { 1475b8e80941Smrg for (int j = 0; j < array_len; j++) 1476b8e80941Smrg emit_fragment_input(c, loc + j, var, j); 1477b8e80941Smrg } 1478b8e80941Smrg } 1479b8e80941Smrg} 1480b8e80941Smrg 1481b8e80941Smrgstatic void 1482b8e80941Smrgntq_setup_outputs(struct v3d_compile *c) 1483b8e80941Smrg{ 1484b8e80941Smrg if (c->s->info.stage != MESA_SHADER_FRAGMENT) 1485b8e80941Smrg return; 1486b8e80941Smrg 1487b8e80941Smrg nir_foreach_variable(var, &c->s->outputs) { 1488b8e80941Smrg unsigned array_len = MAX2(glsl_get_length(var->type), 1); 1489b8e80941Smrg unsigned loc = var->data.driver_location * 4; 1490b8e80941Smrg 1491b8e80941Smrg assert(array_len == 1); 1492b8e80941Smrg (void)array_len; 1493b8e80941Smrg 1494b8e80941Smrg for (int i = 0; i < 4 - var->data.location_frac; i++) { 1495b8e80941Smrg add_output(c, loc + var->data.location_frac + i, 1496b8e80941Smrg var->data.location, 1497b8e80941Smrg var->data.location_frac + i); 1498b8e80941Smrg } 1499b8e80941Smrg 1500b8e80941Smrg 
switch (var->data.location) { 1501b8e80941Smrg case FRAG_RESULT_COLOR: 1502b8e80941Smrg c->output_color_var[0] = var; 1503b8e80941Smrg c->output_color_var[1] = var; 1504b8e80941Smrg c->output_color_var[2] = var; 1505b8e80941Smrg c->output_color_var[3] = var; 1506b8e80941Smrg break; 1507b8e80941Smrg case FRAG_RESULT_DATA0: 1508b8e80941Smrg case FRAG_RESULT_DATA1: 1509b8e80941Smrg case FRAG_RESULT_DATA2: 1510b8e80941Smrg case FRAG_RESULT_DATA3: 1511b8e80941Smrg c->output_color_var[var->data.location - 1512b8e80941Smrg FRAG_RESULT_DATA0] = var; 1513b8e80941Smrg break; 1514b8e80941Smrg case FRAG_RESULT_DEPTH: 1515b8e80941Smrg c->output_position_index = loc; 1516b8e80941Smrg break; 1517b8e80941Smrg case FRAG_RESULT_SAMPLE_MASK: 1518b8e80941Smrg c->output_sample_mask_index = loc; 1519b8e80941Smrg break; 1520b8e80941Smrg } 1521b8e80941Smrg } 1522b8e80941Smrg} 1523b8e80941Smrg 1524b8e80941Smrg/** 1525b8e80941Smrg * Sets up the mapping from nir_register to struct qreg *. 1526b8e80941Smrg * 1527b8e80941Smrg * Each nir_register gets a struct qreg per 32-bit component being stored. 
1528b8e80941Smrg */ 1529b8e80941Smrgstatic void 1530b8e80941Smrgntq_setup_registers(struct v3d_compile *c, struct exec_list *list) 1531b8e80941Smrg{ 1532b8e80941Smrg foreach_list_typed(nir_register, nir_reg, node, list) { 1533b8e80941Smrg unsigned array_len = MAX2(nir_reg->num_array_elems, 1); 1534b8e80941Smrg struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 1535b8e80941Smrg array_len * 1536b8e80941Smrg nir_reg->num_components); 1537b8e80941Smrg 1538b8e80941Smrg _mesa_hash_table_insert(c->def_ht, nir_reg, qregs); 1539b8e80941Smrg 1540b8e80941Smrg for (int i = 0; i < array_len * nir_reg->num_components; i++) 1541b8e80941Smrg qregs[i] = vir_get_temp(c); 1542b8e80941Smrg } 1543b8e80941Smrg} 1544b8e80941Smrg 1545b8e80941Smrgstatic void 1546b8e80941Smrgntq_emit_load_const(struct v3d_compile *c, nir_load_const_instr *instr) 1547b8e80941Smrg{ 1548b8e80941Smrg /* XXX perf: Experiment with using immediate loads to avoid having 1549b8e80941Smrg * these end up in the uniform stream. Watch out for breaking the 1550b8e80941Smrg * small immediates optimization in the process! 1551b8e80941Smrg */ 1552b8e80941Smrg struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1553b8e80941Smrg for (int i = 0; i < instr->def.num_components; i++) 1554b8e80941Smrg qregs[i] = vir_uniform_ui(c, instr->value[i].u32); 1555b8e80941Smrg 1556b8e80941Smrg _mesa_hash_table_insert(c->def_ht, &instr->def, qregs); 1557b8e80941Smrg} 1558b8e80941Smrg 1559b8e80941Smrgstatic void 1560b8e80941Smrgntq_emit_ssa_undef(struct v3d_compile *c, nir_ssa_undef_instr *instr) 1561b8e80941Smrg{ 1562b8e80941Smrg struct qreg *qregs = ntq_init_ssa_def(c, &instr->def); 1563b8e80941Smrg 1564b8e80941Smrg /* VIR needs there to be *some* value, so pick 0 (same as for 1565b8e80941Smrg * ntq_setup_registers(). 
1566b8e80941Smrg */ 1567b8e80941Smrg for (int i = 0; i < instr->def.num_components; i++) 1568b8e80941Smrg qregs[i] = vir_uniform_ui(c, 0); 1569b8e80941Smrg} 1570b8e80941Smrg 1571b8e80941Smrgstatic void 1572b8e80941Smrgntq_emit_image_size(struct v3d_compile *c, nir_intrinsic_instr *instr) 1573b8e80941Smrg{ 1574b8e80941Smrg assert(instr->intrinsic == nir_intrinsic_image_deref_size); 1575b8e80941Smrg nir_variable *var = nir_intrinsic_get_var(instr, 0); 1576b8e80941Smrg unsigned image_index = var->data.driver_location; 1577b8e80941Smrg const struct glsl_type *sampler_type = glsl_without_array(var->type); 1578b8e80941Smrg bool is_array = glsl_sampler_type_is_array(sampler_type); 1579b8e80941Smrg 1580b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1581b8e80941Smrg vir_uniform(c, QUNIFORM_IMAGE_WIDTH, image_index)); 1582b8e80941Smrg if (instr->num_components > 1) { 1583b8e80941Smrg ntq_store_dest(c, &instr->dest, 1, 1584b8e80941Smrg vir_uniform(c, QUNIFORM_IMAGE_HEIGHT, 1585b8e80941Smrg image_index)); 1586b8e80941Smrg } 1587b8e80941Smrg if (instr->num_components > 2) { 1588b8e80941Smrg ntq_store_dest(c, &instr->dest, 2, 1589b8e80941Smrg vir_uniform(c, 1590b8e80941Smrg is_array ? 
1591b8e80941Smrg QUNIFORM_IMAGE_ARRAY_SIZE : 1592b8e80941Smrg QUNIFORM_IMAGE_DEPTH, 1593b8e80941Smrg image_index)); 1594b8e80941Smrg } 1595b8e80941Smrg} 1596b8e80941Smrg 1597b8e80941Smrgstatic void 1598b8e80941Smrgntq_emit_intrinsic(struct v3d_compile *c, nir_intrinsic_instr *instr) 1599b8e80941Smrg{ 1600b8e80941Smrg unsigned offset; 1601b8e80941Smrg 1602b8e80941Smrg switch (instr->intrinsic) { 1603b8e80941Smrg case nir_intrinsic_load_uniform: 1604b8e80941Smrg if (nir_src_is_const(instr->src[0])) { 1605b8e80941Smrg int offset = (nir_intrinsic_base(instr) + 1606b8e80941Smrg nir_src_as_uint(instr->src[0])); 1607b8e80941Smrg assert(offset % 4 == 0); 1608b8e80941Smrg /* We need dwords */ 1609b8e80941Smrg offset = offset / 4; 1610b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 1611b8e80941Smrg ntq_store_dest(c, &instr->dest, i, 1612b8e80941Smrg vir_uniform(c, QUNIFORM_UNIFORM, 1613b8e80941Smrg offset + i)); 1614b8e80941Smrg } 1615b8e80941Smrg } else { 1616b8e80941Smrg ntq_emit_tmu_general(c, instr, false); 1617b8e80941Smrg } 1618b8e80941Smrg break; 1619b8e80941Smrg 1620b8e80941Smrg case nir_intrinsic_load_ubo: 1621b8e80941Smrg ntq_emit_tmu_general(c, instr, false); 1622b8e80941Smrg break; 1623b8e80941Smrg 1624b8e80941Smrg case nir_intrinsic_ssbo_atomic_add: 1625b8e80941Smrg case nir_intrinsic_ssbo_atomic_imin: 1626b8e80941Smrg case nir_intrinsic_ssbo_atomic_umin: 1627b8e80941Smrg case nir_intrinsic_ssbo_atomic_imax: 1628b8e80941Smrg case nir_intrinsic_ssbo_atomic_umax: 1629b8e80941Smrg case nir_intrinsic_ssbo_atomic_and: 1630b8e80941Smrg case nir_intrinsic_ssbo_atomic_or: 1631b8e80941Smrg case nir_intrinsic_ssbo_atomic_xor: 1632b8e80941Smrg case nir_intrinsic_ssbo_atomic_exchange: 1633b8e80941Smrg case nir_intrinsic_ssbo_atomic_comp_swap: 1634b8e80941Smrg case nir_intrinsic_load_ssbo: 1635b8e80941Smrg case nir_intrinsic_store_ssbo: 1636b8e80941Smrg ntq_emit_tmu_general(c, instr, false); 1637b8e80941Smrg break; 1638b8e80941Smrg 1639b8e80941Smrg case 
nir_intrinsic_shared_atomic_add: 1640b8e80941Smrg case nir_intrinsic_shared_atomic_imin: 1641b8e80941Smrg case nir_intrinsic_shared_atomic_umin: 1642b8e80941Smrg case nir_intrinsic_shared_atomic_imax: 1643b8e80941Smrg case nir_intrinsic_shared_atomic_umax: 1644b8e80941Smrg case nir_intrinsic_shared_atomic_and: 1645b8e80941Smrg case nir_intrinsic_shared_atomic_or: 1646b8e80941Smrg case nir_intrinsic_shared_atomic_xor: 1647b8e80941Smrg case nir_intrinsic_shared_atomic_exchange: 1648b8e80941Smrg case nir_intrinsic_shared_atomic_comp_swap: 1649b8e80941Smrg case nir_intrinsic_load_shared: 1650b8e80941Smrg case nir_intrinsic_store_shared: 1651b8e80941Smrg case nir_intrinsic_load_scratch: 1652b8e80941Smrg case nir_intrinsic_store_scratch: 1653b8e80941Smrg ntq_emit_tmu_general(c, instr, true); 1654b8e80941Smrg break; 1655b8e80941Smrg 1656b8e80941Smrg case nir_intrinsic_image_deref_load: 1657b8e80941Smrg case nir_intrinsic_image_deref_store: 1658b8e80941Smrg case nir_intrinsic_image_deref_atomic_add: 1659b8e80941Smrg case nir_intrinsic_image_deref_atomic_min: 1660b8e80941Smrg case nir_intrinsic_image_deref_atomic_max: 1661b8e80941Smrg case nir_intrinsic_image_deref_atomic_and: 1662b8e80941Smrg case nir_intrinsic_image_deref_atomic_or: 1663b8e80941Smrg case nir_intrinsic_image_deref_atomic_xor: 1664b8e80941Smrg case nir_intrinsic_image_deref_atomic_exchange: 1665b8e80941Smrg case nir_intrinsic_image_deref_atomic_comp_swap: 1666b8e80941Smrg v3d40_vir_emit_image_load_store(c, instr); 1667b8e80941Smrg break; 1668b8e80941Smrg 1669b8e80941Smrg case nir_intrinsic_get_buffer_size: 1670b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1671b8e80941Smrg vir_uniform(c, QUNIFORM_GET_BUFFER_SIZE, 1672b8e80941Smrg nir_src_as_uint(instr->src[0]))); 1673b8e80941Smrg break; 1674b8e80941Smrg 1675b8e80941Smrg case nir_intrinsic_load_user_clip_plane: 1676b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 1677b8e80941Smrg ntq_store_dest(c, &instr->dest, i, 1678b8e80941Smrg 
vir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 1679b8e80941Smrg nir_intrinsic_ucp_id(instr) * 1680b8e80941Smrg 4 + i)); 1681b8e80941Smrg } 1682b8e80941Smrg break; 1683b8e80941Smrg 1684b8e80941Smrg case nir_intrinsic_load_viewport_x_scale: 1685b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1686b8e80941Smrg vir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE, 0)); 1687b8e80941Smrg break; 1688b8e80941Smrg 1689b8e80941Smrg case nir_intrinsic_load_viewport_y_scale: 1690b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1691b8e80941Smrg vir_uniform(c, QUNIFORM_VIEWPORT_Y_SCALE, 0)); 1692b8e80941Smrg break; 1693b8e80941Smrg 1694b8e80941Smrg case nir_intrinsic_load_viewport_z_scale: 1695b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1696b8e80941Smrg vir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0)); 1697b8e80941Smrg break; 1698b8e80941Smrg 1699b8e80941Smrg case nir_intrinsic_load_viewport_z_offset: 1700b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1701b8e80941Smrg vir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0)); 1702b8e80941Smrg break; 1703b8e80941Smrg 1704b8e80941Smrg case nir_intrinsic_load_alpha_ref_float: 1705b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1706b8e80941Smrg vir_uniform(c, QUNIFORM_ALPHA_REF, 0)); 1707b8e80941Smrg break; 1708b8e80941Smrg 1709b8e80941Smrg case nir_intrinsic_load_sample_mask_in: 1710b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, vir_MSF(c)); 1711b8e80941Smrg break; 1712b8e80941Smrg 1713b8e80941Smrg case nir_intrinsic_load_helper_invocation: 1714b8e80941Smrg vir_set_pf(vir_MSF_dest(c, vir_nop_reg()), V3D_QPU_PF_PUSHZ); 1715b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1716b8e80941Smrg vir_MOV(c, vir_SEL(c, V3D_QPU_COND_IFA, 1717b8e80941Smrg vir_uniform_ui(c, ~0), 1718b8e80941Smrg vir_uniform_ui(c, 0)))); 1719b8e80941Smrg break; 1720b8e80941Smrg 1721b8e80941Smrg case nir_intrinsic_load_front_face: 1722b8e80941Smrg /* The register contains 0 (front) or 1 (back), and we need to 1723b8e80941Smrg * turn it into a NIR bool where true means front. 
1724b8e80941Smrg */ 1725b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1726b8e80941Smrg vir_ADD(c, 1727b8e80941Smrg vir_uniform_ui(c, -1), 1728b8e80941Smrg vir_REVF(c))); 1729b8e80941Smrg break; 1730b8e80941Smrg 1731b8e80941Smrg case nir_intrinsic_load_instance_id: 1732b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->iid)); 1733b8e80941Smrg break; 1734b8e80941Smrg 1735b8e80941Smrg case nir_intrinsic_load_vertex_id: 1736b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, vir_MOV(c, c->vid)); 1737b8e80941Smrg break; 1738b8e80941Smrg 1739b8e80941Smrg case nir_intrinsic_load_input: 1740b8e80941Smrg /* Use ldvpmv (uniform offset) or ldvpmd (non-uniform offset) 1741b8e80941Smrg * and enable PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR. 1742b8e80941Smrg */ 1743b8e80941Smrg offset = (nir_intrinsic_base(instr) + 1744b8e80941Smrg nir_src_as_uint(instr->src[0])); 1745b8e80941Smrg if (c->s->info.stage != MESA_SHADER_FRAGMENT && 1746b8e80941Smrg c->devinfo->ver >= 40) { 1747b8e80941Smrg /* Emit the LDVPM directly now, rather than at the top 1748b8e80941Smrg * of the shader like we did for V3D 3.x (which needs 1749b8e80941Smrg * vpmsetup when not just taking the next offset). 1750b8e80941Smrg * 1751b8e80941Smrg * Note that delaying like this may introduce stalls, 1752b8e80941Smrg * as LDVPMV takes a minimum of 1 instruction but may 1753b8e80941Smrg * be slower if the VPM unit is busy with another QPU. 
1754b8e80941Smrg */ 1755b8e80941Smrg int index = 0; 1756b8e80941Smrg if (c->s->info.system_values_read & 1757b8e80941Smrg (1ull << SYSTEM_VALUE_INSTANCE_ID)) { 1758b8e80941Smrg index++; 1759b8e80941Smrg } 1760b8e80941Smrg if (c->s->info.system_values_read & 1761b8e80941Smrg (1ull << SYSTEM_VALUE_VERTEX_ID)) { 1762b8e80941Smrg index++; 1763b8e80941Smrg } 1764b8e80941Smrg for (int i = 0; i < offset; i++) 1765b8e80941Smrg index += c->vattr_sizes[i]; 1766b8e80941Smrg index += nir_intrinsic_component(instr); 1767b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 1768b8e80941Smrg struct qreg vpm_offset = 1769b8e80941Smrg vir_uniform_ui(c, index++); 1770b8e80941Smrg ntq_store_dest(c, &instr->dest, i, 1771b8e80941Smrg vir_LDVPMV_IN(c, vpm_offset)); 1772b8e80941Smrg } 1773b8e80941Smrg } else { 1774b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 1775b8e80941Smrg int comp = nir_intrinsic_component(instr) + i; 1776b8e80941Smrg ntq_store_dest(c, &instr->dest, i, 1777b8e80941Smrg vir_MOV(c, c->inputs[offset * 4 + 1778b8e80941Smrg comp])); 1779b8e80941Smrg } 1780b8e80941Smrg } 1781b8e80941Smrg break; 1782b8e80941Smrg 1783b8e80941Smrg case nir_intrinsic_store_output: 1784b8e80941Smrg /* XXX perf: Use stvpmv with uniform non-constant offsets and 1785b8e80941Smrg * stvpmd with non-uniform offsets and enable 1786b8e80941Smrg * PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR. 
1787b8e80941Smrg */ 1788b8e80941Smrg if (c->s->info.stage == MESA_SHADER_FRAGMENT) { 1789b8e80941Smrg offset = ((nir_intrinsic_base(instr) + 1790b8e80941Smrg nir_src_as_uint(instr->src[1])) * 4 + 1791b8e80941Smrg nir_intrinsic_component(instr)); 1792b8e80941Smrg for (int i = 0; i < instr->num_components; i++) { 1793b8e80941Smrg c->outputs[offset + i] = 1794b8e80941Smrg vir_MOV(c, 1795b8e80941Smrg ntq_get_src(c, 1796b8e80941Smrg instr->src[0], i)); 1797b8e80941Smrg } 1798b8e80941Smrg } else { 1799b8e80941Smrg assert(instr->num_components == 1); 1800b8e80941Smrg 1801b8e80941Smrg vir_VPM_WRITE(c, 1802b8e80941Smrg ntq_get_src(c, instr->src[0], 0), 1803b8e80941Smrg nir_intrinsic_base(instr)); 1804b8e80941Smrg } 1805b8e80941Smrg break; 1806b8e80941Smrg 1807b8e80941Smrg case nir_intrinsic_image_deref_size: 1808b8e80941Smrg ntq_emit_image_size(c, instr); 1809b8e80941Smrg break; 1810b8e80941Smrg 1811b8e80941Smrg case nir_intrinsic_discard: 1812b8e80941Smrg if (vir_in_nonuniform_control_flow(c)) { 1813b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 1814b8e80941Smrg V3D_QPU_PF_PUSHZ); 1815b8e80941Smrg vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 1816b8e80941Smrg vir_uniform_ui(c, 0)), 1817b8e80941Smrg V3D_QPU_COND_IFA); 1818b8e80941Smrg } else { 1819b8e80941Smrg vir_SETMSF_dest(c, vir_nop_reg(), 1820b8e80941Smrg vir_uniform_ui(c, 0)); 1821b8e80941Smrg } 1822b8e80941Smrg break; 1823b8e80941Smrg 1824b8e80941Smrg case nir_intrinsic_discard_if: { 1825b8e80941Smrg enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, instr->src[0]); 1826b8e80941Smrg 1827b8e80941Smrg if (vir_in_nonuniform_control_flow(c)) { 1828b8e80941Smrg struct qinst *exec_flag = vir_MOV_dest(c, vir_nop_reg(), 1829b8e80941Smrg c->execute); 1830b8e80941Smrg if (cond == V3D_QPU_COND_IFA) { 1831b8e80941Smrg vir_set_uf(exec_flag, V3D_QPU_UF_ANDZ); 1832b8e80941Smrg } else { 1833b8e80941Smrg vir_set_uf(exec_flag, V3D_QPU_UF_NORNZ); 1834b8e80941Smrg cond = V3D_QPU_COND_IFA; 1835b8e80941Smrg } 
1836b8e80941Smrg } 1837b8e80941Smrg 1838b8e80941Smrg vir_set_cond(vir_SETMSF_dest(c, vir_nop_reg(), 1839b8e80941Smrg vir_uniform_ui(c, 0)), cond); 1840b8e80941Smrg 1841b8e80941Smrg break; 1842b8e80941Smrg } 1843b8e80941Smrg 1844b8e80941Smrg case nir_intrinsic_memory_barrier: 1845b8e80941Smrg case nir_intrinsic_memory_barrier_atomic_counter: 1846b8e80941Smrg case nir_intrinsic_memory_barrier_buffer: 1847b8e80941Smrg case nir_intrinsic_memory_barrier_image: 1848b8e80941Smrg case nir_intrinsic_memory_barrier_shared: 1849b8e80941Smrg case nir_intrinsic_group_memory_barrier: 1850b8e80941Smrg /* We don't do any instruction scheduling of these NIR 1851b8e80941Smrg * instructions between each other, so we just need to make 1852b8e80941Smrg * sure that the TMU operations before the barrier are flushed 1853b8e80941Smrg * before the ones after the barrier. That is currently 1854b8e80941Smrg * handled by having a THRSW in each of them and a LDTMU 1855b8e80941Smrg * series or a TMUWT after. 1856b8e80941Smrg */ 1857b8e80941Smrg break; 1858b8e80941Smrg 1859b8e80941Smrg case nir_intrinsic_barrier: 1860b8e80941Smrg /* Emit a TSY op to get all invocations in the workgroup 1861b8e80941Smrg * (actually supergroup) to block until the last invocation 1862b8e80941Smrg * reaches the TSY op. 1863b8e80941Smrg */ 1864b8e80941Smrg if (c->devinfo->ver >= 42) { 1865b8e80941Smrg vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC, 1866b8e80941Smrg V3D_QPU_WADDR_SYNCB)); 1867b8e80941Smrg } else { 1868b8e80941Smrg struct qinst *sync = 1869b8e80941Smrg vir_BARRIERID_dest(c, 1870b8e80941Smrg vir_reg(QFILE_MAGIC, 1871b8e80941Smrg V3D_QPU_WADDR_SYNCU)); 1872b8e80941Smrg sync->uniform = 1873b8e80941Smrg vir_get_uniform_index(c, QUNIFORM_CONSTANT, 1874b8e80941Smrg 0xffffff00 | 1875b8e80941Smrg V3D_TSY_WAIT_INC_CHECK); 1876b8e80941Smrg 1877b8e80941Smrg } 1878b8e80941Smrg 1879b8e80941Smrg /* The blocking of a TSY op only happens at the next thread 1880b8e80941Smrg * switch. 
No texturing may be outstanding at the time of a 1881b8e80941Smrg * TSY blocking operation. 1882b8e80941Smrg */ 1883b8e80941Smrg vir_emit_thrsw(c); 1884b8e80941Smrg break; 1885b8e80941Smrg 1886b8e80941Smrg case nir_intrinsic_load_num_work_groups: 1887b8e80941Smrg for (int i = 0; i < 3; i++) { 1888b8e80941Smrg ntq_store_dest(c, &instr->dest, i, 1889b8e80941Smrg vir_uniform(c, QUNIFORM_NUM_WORK_GROUPS, 1890b8e80941Smrg i)); 1891b8e80941Smrg } 1892b8e80941Smrg break; 1893b8e80941Smrg 1894b8e80941Smrg case nir_intrinsic_load_local_invocation_index: 1895b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1896b8e80941Smrg vir_SHR(c, c->cs_payload[1], 1897b8e80941Smrg vir_uniform_ui(c, 32 - c->local_invocation_index_bits))); 1898b8e80941Smrg break; 1899b8e80941Smrg 1900b8e80941Smrg case nir_intrinsic_load_work_group_id: 1901b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, 1902b8e80941Smrg vir_AND(c, c->cs_payload[0], 1903b8e80941Smrg vir_uniform_ui(c, 0xffff))); 1904b8e80941Smrg ntq_store_dest(c, &instr->dest, 1, 1905b8e80941Smrg vir_SHR(c, c->cs_payload[0], 1906b8e80941Smrg vir_uniform_ui(c, 16))); 1907b8e80941Smrg ntq_store_dest(c, &instr->dest, 2, 1908b8e80941Smrg vir_AND(c, c->cs_payload[1], 1909b8e80941Smrg vir_uniform_ui(c, 0xffff))); 1910b8e80941Smrg break; 1911b8e80941Smrg 1912b8e80941Smrg case nir_intrinsic_load_subgroup_id: 1913b8e80941Smrg ntq_store_dest(c, &instr->dest, 0, vir_EIDX(c)); 1914b8e80941Smrg break; 1915b8e80941Smrg 1916b8e80941Smrg default: 1917b8e80941Smrg fprintf(stderr, "Unknown intrinsic: "); 1918b8e80941Smrg nir_print_instr(&instr->instr, stderr); 1919b8e80941Smrg fprintf(stderr, "\n"); 1920b8e80941Smrg break; 1921b8e80941Smrg } 1922b8e80941Smrg} 1923b8e80941Smrg 1924b8e80941Smrg/* Clears (activates) the execute flags for any channels whose jump target 1925b8e80941Smrg * matches this block. 1926b8e80941Smrg * 1927b8e80941Smrg * XXX perf: Could we be using flpush/flpop somehow for our execution channel 1928b8e80941Smrg * enabling? 
1929b8e80941Smrg * 1930b8e80941Smrg * XXX perf: For uniform control flow, we should be able to skip c->execute 1931b8e80941Smrg * handling entirely. 1932b8e80941Smrg */ 1933b8e80941Smrgstatic void 1934b8e80941Smrgntq_activate_execute_for_block(struct v3d_compile *c) 1935b8e80941Smrg{ 1936b8e80941Smrg vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), 1937b8e80941Smrg c->execute, vir_uniform_ui(c, c->cur_block->index)), 1938b8e80941Smrg V3D_QPU_PF_PUSHZ); 1939b8e80941Smrg 1940b8e80941Smrg vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 1941b8e80941Smrg} 1942b8e80941Smrg 1943b8e80941Smrgstatic void 1944b8e80941Smrgntq_emit_uniform_if(struct v3d_compile *c, nir_if *if_stmt) 1945b8e80941Smrg{ 1946b8e80941Smrg nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1947b8e80941Smrg bool empty_else_block = 1948b8e80941Smrg (nir_else_block == nir_if_last_else_block(if_stmt) && 1949b8e80941Smrg exec_list_is_empty(&nir_else_block->instr_list)); 1950b8e80941Smrg 1951b8e80941Smrg struct qblock *then_block = vir_new_block(c); 1952b8e80941Smrg struct qblock *after_block = vir_new_block(c); 1953b8e80941Smrg struct qblock *else_block; 1954b8e80941Smrg if (empty_else_block) 1955b8e80941Smrg else_block = after_block; 1956b8e80941Smrg else 1957b8e80941Smrg else_block = vir_new_block(c); 1958b8e80941Smrg 1959b8e80941Smrg /* Set up the flags for the IF condition (taking the THEN branch). */ 1960b8e80941Smrg enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 1961b8e80941Smrg 1962b8e80941Smrg /* Jump to ELSE. */ 1963b8e80941Smrg vir_BRANCH(c, cond == V3D_QPU_COND_IFA ? 1964b8e80941Smrg V3D_QPU_BRANCH_COND_ALLNA : 1965b8e80941Smrg V3D_QPU_BRANCH_COND_ALLA); 1966b8e80941Smrg vir_link_blocks(c->cur_block, else_block); 1967b8e80941Smrg vir_link_blocks(c->cur_block, then_block); 1968b8e80941Smrg 1969b8e80941Smrg /* Process the THEN block. 
*/ 1970b8e80941Smrg vir_set_emit_block(c, then_block); 1971b8e80941Smrg ntq_emit_cf_list(c, &if_stmt->then_list); 1972b8e80941Smrg 1973b8e80941Smrg if (!empty_else_block) { 1974b8e80941Smrg /* At the end of the THEN block, jump to ENDIF */ 1975b8e80941Smrg vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALWAYS); 1976b8e80941Smrg vir_link_blocks(c->cur_block, after_block); 1977b8e80941Smrg 1978b8e80941Smrg /* Emit the else block. */ 1979b8e80941Smrg vir_set_emit_block(c, else_block); 1980b8e80941Smrg ntq_activate_execute_for_block(c); 1981b8e80941Smrg ntq_emit_cf_list(c, &if_stmt->else_list); 1982b8e80941Smrg } 1983b8e80941Smrg 1984b8e80941Smrg vir_link_blocks(c->cur_block, after_block); 1985b8e80941Smrg 1986b8e80941Smrg vir_set_emit_block(c, after_block); 1987b8e80941Smrg} 1988b8e80941Smrg 1989b8e80941Smrgstatic void 1990b8e80941Smrgntq_emit_nonuniform_if(struct v3d_compile *c, nir_if *if_stmt) 1991b8e80941Smrg{ 1992b8e80941Smrg nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 1993b8e80941Smrg bool empty_else_block = 1994b8e80941Smrg (nir_else_block == nir_if_last_else_block(if_stmt) && 1995b8e80941Smrg exec_list_is_empty(&nir_else_block->instr_list)); 1996b8e80941Smrg 1997b8e80941Smrg struct qblock *then_block = vir_new_block(c); 1998b8e80941Smrg struct qblock *after_block = vir_new_block(c); 1999b8e80941Smrg struct qblock *else_block; 2000b8e80941Smrg if (empty_else_block) 2001b8e80941Smrg else_block = after_block; 2002b8e80941Smrg else 2003b8e80941Smrg else_block = vir_new_block(c); 2004b8e80941Smrg 2005b8e80941Smrg bool was_uniform_control_flow = false; 2006b8e80941Smrg if (!vir_in_nonuniform_control_flow(c)) { 2007b8e80941Smrg c->execute = vir_MOV(c, vir_uniform_ui(c, 0)); 2008b8e80941Smrg was_uniform_control_flow = true; 2009b8e80941Smrg } 2010b8e80941Smrg 2011b8e80941Smrg /* Set up the flags for the IF condition (taking the THEN branch). 
*/ 2012b8e80941Smrg enum v3d_qpu_cond cond = ntq_emit_bool_to_cond(c, if_stmt->condition); 2013b8e80941Smrg 2014b8e80941Smrg /* Update the flags+cond to mean "Taking the ELSE branch (!cond) and 2015b8e80941Smrg * was previously active (execute Z) for updating the exec flags. 2016b8e80941Smrg */ 2017b8e80941Smrg if (was_uniform_control_flow) { 2018b8e80941Smrg cond = v3d_qpu_cond_invert(cond); 2019b8e80941Smrg } else { 2020b8e80941Smrg struct qinst *inst = vir_MOV_dest(c, vir_nop_reg(), c->execute); 2021b8e80941Smrg if (cond == V3D_QPU_COND_IFA) { 2022b8e80941Smrg vir_set_uf(inst, V3D_QPU_UF_NORNZ); 2023b8e80941Smrg } else { 2024b8e80941Smrg vir_set_uf(inst, V3D_QPU_UF_ANDZ); 2025b8e80941Smrg cond = V3D_QPU_COND_IFA; 2026b8e80941Smrg } 2027b8e80941Smrg } 2028b8e80941Smrg 2029b8e80941Smrg vir_MOV_cond(c, cond, 2030b8e80941Smrg c->execute, 2031b8e80941Smrg vir_uniform_ui(c, else_block->index)); 2032b8e80941Smrg 2033b8e80941Smrg /* Jump to ELSE if nothing is active for THEN, otherwise fall 2034b8e80941Smrg * through. 2035b8e80941Smrg */ 2036b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 2037b8e80941Smrg vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLNA); 2038b8e80941Smrg vir_link_blocks(c->cur_block, else_block); 2039b8e80941Smrg vir_link_blocks(c->cur_block, then_block); 2040b8e80941Smrg 2041b8e80941Smrg /* Process the THEN block. */ 2042b8e80941Smrg vir_set_emit_block(c, then_block); 2043b8e80941Smrg ntq_emit_cf_list(c, &if_stmt->then_list); 2044b8e80941Smrg 2045b8e80941Smrg if (!empty_else_block) { 2046b8e80941Smrg /* Handle the end of the THEN block. 
First, all currently 2047b8e80941Smrg * active channels update their execute flags to point to 2048b8e80941Smrg * ENDIF 2049b8e80941Smrg */ 2050b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2051b8e80941Smrg V3D_QPU_PF_PUSHZ); 2052b8e80941Smrg vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2053b8e80941Smrg vir_uniform_ui(c, after_block->index)); 2054b8e80941Smrg 2055b8e80941Smrg /* If everything points at ENDIF, then jump there immediately. */ 2056b8e80941Smrg vir_set_pf(vir_XOR_dest(c, vir_nop_reg(), 2057b8e80941Smrg c->execute, 2058b8e80941Smrg vir_uniform_ui(c, after_block->index)), 2059b8e80941Smrg V3D_QPU_PF_PUSHZ); 2060b8e80941Smrg vir_BRANCH(c, V3D_QPU_BRANCH_COND_ALLA); 2061b8e80941Smrg vir_link_blocks(c->cur_block, after_block); 2062b8e80941Smrg vir_link_blocks(c->cur_block, else_block); 2063b8e80941Smrg 2064b8e80941Smrg vir_set_emit_block(c, else_block); 2065b8e80941Smrg ntq_activate_execute_for_block(c); 2066b8e80941Smrg ntq_emit_cf_list(c, &if_stmt->else_list); 2067b8e80941Smrg } 2068b8e80941Smrg 2069b8e80941Smrg vir_link_blocks(c->cur_block, after_block); 2070b8e80941Smrg 2071b8e80941Smrg vir_set_emit_block(c, after_block); 2072b8e80941Smrg if (was_uniform_control_flow) 2073b8e80941Smrg c->execute = c->undef; 2074b8e80941Smrg else 2075b8e80941Smrg ntq_activate_execute_for_block(c); 2076b8e80941Smrg} 2077b8e80941Smrg 2078b8e80941Smrgstatic void 2079b8e80941Smrgntq_emit_if(struct v3d_compile *c, nir_if *nif) 2080b8e80941Smrg{ 2081b8e80941Smrg bool was_in_control_flow = c->in_control_flow; 2082b8e80941Smrg c->in_control_flow = true; 2083b8e80941Smrg if (!vir_in_nonuniform_control_flow(c) && 2084b8e80941Smrg nir_src_is_dynamically_uniform(nif->condition)) { 2085b8e80941Smrg ntq_emit_uniform_if(c, nif); 2086b8e80941Smrg } else { 2087b8e80941Smrg ntq_emit_nonuniform_if(c, nif); 2088b8e80941Smrg } 2089b8e80941Smrg c->in_control_flow = was_in_control_flow; 2090b8e80941Smrg} 2091b8e80941Smrg 2092b8e80941Smrgstatic void 
2093b8e80941Smrgntq_emit_jump(struct v3d_compile *c, nir_jump_instr *jump) 2094b8e80941Smrg{ 2095b8e80941Smrg switch (jump->type) { 2096b8e80941Smrg case nir_jump_break: 2097b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2098b8e80941Smrg V3D_QPU_PF_PUSHZ); 2099b8e80941Smrg vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2100b8e80941Smrg vir_uniform_ui(c, c->loop_break_block->index)); 2101b8e80941Smrg break; 2102b8e80941Smrg 2103b8e80941Smrg case nir_jump_continue: 2104b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), 2105b8e80941Smrg V3D_QPU_PF_PUSHZ); 2106b8e80941Smrg vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, 2107b8e80941Smrg vir_uniform_ui(c, c->loop_cont_block->index)); 2108b8e80941Smrg break; 2109b8e80941Smrg 2110b8e80941Smrg case nir_jump_return: 2111b8e80941Smrg unreachable("All returns shouold be lowered\n"); 2112b8e80941Smrg } 2113b8e80941Smrg} 2114b8e80941Smrg 2115b8e80941Smrgstatic void 2116b8e80941Smrgntq_emit_instr(struct v3d_compile *c, nir_instr *instr) 2117b8e80941Smrg{ 2118b8e80941Smrg switch (instr->type) { 2119b8e80941Smrg case nir_instr_type_deref: 2120b8e80941Smrg /* ignored, will be walked by the intrinsic using it. 
*/ 2121b8e80941Smrg break; 2122b8e80941Smrg 2123b8e80941Smrg case nir_instr_type_alu: 2124b8e80941Smrg ntq_emit_alu(c, nir_instr_as_alu(instr)); 2125b8e80941Smrg break; 2126b8e80941Smrg 2127b8e80941Smrg case nir_instr_type_intrinsic: 2128b8e80941Smrg ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 2129b8e80941Smrg break; 2130b8e80941Smrg 2131b8e80941Smrg case nir_instr_type_load_const: 2132b8e80941Smrg ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 2133b8e80941Smrg break; 2134b8e80941Smrg 2135b8e80941Smrg case nir_instr_type_ssa_undef: 2136b8e80941Smrg ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 2137b8e80941Smrg break; 2138b8e80941Smrg 2139b8e80941Smrg case nir_instr_type_tex: 2140b8e80941Smrg ntq_emit_tex(c, nir_instr_as_tex(instr)); 2141b8e80941Smrg break; 2142b8e80941Smrg 2143b8e80941Smrg case nir_instr_type_jump: 2144b8e80941Smrg ntq_emit_jump(c, nir_instr_as_jump(instr)); 2145b8e80941Smrg break; 2146b8e80941Smrg 2147b8e80941Smrg default: 2148b8e80941Smrg fprintf(stderr, "Unknown NIR instr type: "); 2149b8e80941Smrg nir_print_instr(instr, stderr); 2150b8e80941Smrg fprintf(stderr, "\n"); 2151b8e80941Smrg abort(); 2152b8e80941Smrg } 2153b8e80941Smrg} 2154b8e80941Smrg 2155b8e80941Smrgstatic void 2156b8e80941Smrgntq_emit_block(struct v3d_compile *c, nir_block *block) 2157b8e80941Smrg{ 2158b8e80941Smrg nir_foreach_instr(instr, block) { 2159b8e80941Smrg ntq_emit_instr(c, instr); 2160b8e80941Smrg } 2161b8e80941Smrg} 2162b8e80941Smrg 2163b8e80941Smrgstatic void ntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list); 2164b8e80941Smrg 2165b8e80941Smrgstatic void 2166b8e80941Smrgntq_emit_loop(struct v3d_compile *c, nir_loop *loop) 2167b8e80941Smrg{ 2168b8e80941Smrg bool was_in_control_flow = c->in_control_flow; 2169b8e80941Smrg c->in_control_flow = true; 2170b8e80941Smrg 2171b8e80941Smrg bool was_uniform_control_flow = false; 2172b8e80941Smrg if (!vir_in_nonuniform_control_flow(c)) { 2173b8e80941Smrg c->execute = vir_MOV(c, 
vir_uniform_ui(c, 0)); 2174b8e80941Smrg was_uniform_control_flow = true; 2175b8e80941Smrg } 2176b8e80941Smrg 2177b8e80941Smrg struct qblock *save_loop_cont_block = c->loop_cont_block; 2178b8e80941Smrg struct qblock *save_loop_break_block = c->loop_break_block; 2179b8e80941Smrg 2180b8e80941Smrg c->loop_cont_block = vir_new_block(c); 2181b8e80941Smrg c->loop_break_block = vir_new_block(c); 2182b8e80941Smrg 2183b8e80941Smrg vir_link_blocks(c->cur_block, c->loop_cont_block); 2184b8e80941Smrg vir_set_emit_block(c, c->loop_cont_block); 2185b8e80941Smrg ntq_activate_execute_for_block(c); 2186b8e80941Smrg 2187b8e80941Smrg ntq_emit_cf_list(c, &loop->body); 2188b8e80941Smrg 2189b8e80941Smrg /* Re-enable any previous continues now, so our ANYA check below 2190b8e80941Smrg * works. 2191b8e80941Smrg * 2192b8e80941Smrg * XXX: Use the .ORZ flags update, instead. 2193b8e80941Smrg */ 2194b8e80941Smrg vir_set_pf(vir_XOR_dest(c, 2195b8e80941Smrg vir_nop_reg(), 2196b8e80941Smrg c->execute, 2197b8e80941Smrg vir_uniform_ui(c, c->loop_cont_block->index)), 2198b8e80941Smrg V3D_QPU_PF_PUSHZ); 2199b8e80941Smrg vir_MOV_cond(c, V3D_QPU_COND_IFA, c->execute, vir_uniform_ui(c, 0)); 2200b8e80941Smrg 2201b8e80941Smrg vir_set_pf(vir_MOV_dest(c, vir_nop_reg(), c->execute), V3D_QPU_PF_PUSHZ); 2202b8e80941Smrg 2203b8e80941Smrg struct qinst *branch = vir_BRANCH(c, V3D_QPU_BRANCH_COND_ANYA); 2204b8e80941Smrg /* Pixels that were not dispatched or have been discarded should not 2205b8e80941Smrg * contribute to looping again. 
2206b8e80941Smrg */ 2207b8e80941Smrg branch->qpu.branch.msfign = V3D_QPU_MSFIGN_P; 2208b8e80941Smrg vir_link_blocks(c->cur_block, c->loop_cont_block); 2209b8e80941Smrg vir_link_blocks(c->cur_block, c->loop_break_block); 2210b8e80941Smrg 2211b8e80941Smrg vir_set_emit_block(c, c->loop_break_block); 2212b8e80941Smrg if (was_uniform_control_flow) 2213b8e80941Smrg c->execute = c->undef; 2214b8e80941Smrg else 2215b8e80941Smrg ntq_activate_execute_for_block(c); 2216b8e80941Smrg 2217b8e80941Smrg c->loop_break_block = save_loop_break_block; 2218b8e80941Smrg c->loop_cont_block = save_loop_cont_block; 2219b8e80941Smrg 2220b8e80941Smrg c->loops++; 2221b8e80941Smrg 2222b8e80941Smrg c->in_control_flow = was_in_control_flow; 2223b8e80941Smrg} 2224b8e80941Smrg 2225b8e80941Smrgstatic void 2226b8e80941Smrgntq_emit_function(struct v3d_compile *c, nir_function_impl *func) 2227b8e80941Smrg{ 2228b8e80941Smrg fprintf(stderr, "FUNCTIONS not handled.\n"); 2229b8e80941Smrg abort(); 2230b8e80941Smrg} 2231b8e80941Smrg 2232b8e80941Smrgstatic void 2233b8e80941Smrgntq_emit_cf_list(struct v3d_compile *c, struct exec_list *list) 2234b8e80941Smrg{ 2235b8e80941Smrg foreach_list_typed(nir_cf_node, node, node, list) { 2236b8e80941Smrg switch (node->type) { 2237b8e80941Smrg case nir_cf_node_block: 2238b8e80941Smrg ntq_emit_block(c, nir_cf_node_as_block(node)); 2239b8e80941Smrg break; 2240b8e80941Smrg 2241b8e80941Smrg case nir_cf_node_if: 2242b8e80941Smrg ntq_emit_if(c, nir_cf_node_as_if(node)); 2243b8e80941Smrg break; 2244b8e80941Smrg 2245b8e80941Smrg case nir_cf_node_loop: 2246b8e80941Smrg ntq_emit_loop(c, nir_cf_node_as_loop(node)); 2247b8e80941Smrg break; 2248b8e80941Smrg 2249b8e80941Smrg case nir_cf_node_function: 2250b8e80941Smrg ntq_emit_function(c, nir_cf_node_as_function(node)); 2251b8e80941Smrg break; 2252b8e80941Smrg 2253b8e80941Smrg default: 2254b8e80941Smrg fprintf(stderr, "Unknown NIR node type\n"); 2255b8e80941Smrg abort(); 2256b8e80941Smrg } 2257b8e80941Smrg } 2258b8e80941Smrg} 

/* Emits VIR for a NIR function implementation: sets up its registers, then
 * walks its control-flow body.
 */
static void
ntq_emit_impl(struct v3d_compile *c, nir_function_impl *impl)
{
        ntq_setup_registers(c, &impl->registers);
        ntq_emit_cf_list(c, &impl->body);
}

/* Top-level NIR-to-VIR translation: per-stage payload/shared-memory setup,
 * spill base setup, input/output setup, then emission of the main function
 * body.
 */
static void
nir_to_vir(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                /* Fragment payload values are read out of raw registers
                 * 0..2: W, centroid W, and Z respectively.
                 */
                c->payload_w = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->payload_w_centroid = vir_MOV(c, vir_reg(QFILE_REG, 1));
                c->payload_z = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* XXX perf: We could set the "disable implicit point/line
                 * varyings" field in the shader record and not emit these, if
                 * they're not going to be used.
                 */
                if (c->fs_key->is_points) {
                        c->point_x = emit_fragment_varying(c, NULL, 0, 0);
                        c->point_y = emit_fragment_varying(c, NULL, 0, 0);
                } else if (c->fs_key->is_lines) {
                        c->line_x = emit_fragment_varying(c, NULL, 0, 0);
                }
                break;
        case MESA_SHADER_COMPUTE:
                /* Set up the TSO for barriers, assuming we do some. */
                if (c->devinfo->ver < 42) {
                        vir_BARRIERID_dest(c, vir_reg(QFILE_MAGIC,
                                                      V3D_QPU_WADDR_SYNC));
                }

                /* Compute payload arrives in raw registers 0 and 2. */
                c->cs_payload[0] = vir_MOV(c, vir_reg(QFILE_REG, 0));
                c->cs_payload[1] = vir_MOV(c, vir_reg(QFILE_REG, 2));

                /* Set up the division between gl_LocalInvocationIndex and
                 * wg_in_mem in the payload reg.
                 */
                int wg_size = (c->s->info.cs.local_size[0] *
                               c->s->info.cs.local_size[1] *
                               c->s->info.cs.local_size[2]);
                /* Number of low payload bits that hold the local invocation
                 * index: log2 of the workgroup size rounded up to a power of
                 * two, with a floor of 64 (6 bits).
                 */
                c->local_invocation_index_bits =
                        ffs(util_next_power_of_two(MAX2(wg_size, 64))) - 1;
                assert(c->local_invocation_index_bits <= 8);

                if (c->s->info.cs.shared_size) {
                        /* The workgroup-in-memory index lives in the upper
                         * half of payload[1].
                         */
                        struct qreg wg_in_mem = vir_SHR(c, c->cs_payload[1],
                                                        vir_uniform_ui(c, 16));
                        if (c->s->info.cs.local_size[0] != 1 ||
                            c->s->info.cs.local_size[1] != 1 ||
                            c->s->info.cs.local_size[2] != 1) {
                                /* Mask off the local-invocation-index bits
                                 * that share the field.
                                 */
                                int wg_bits = (16 -
                                               c->local_invocation_index_bits);
                                int wg_mask = (1 << wg_bits) - 1;
                                wg_in_mem = vir_AND(c, wg_in_mem,
                                                    vir_uniform_ui(c, wg_mask));
                        }
                        struct qreg shared_per_wg =
                                vir_uniform_ui(c, c->s->info.cs.shared_size);

                        /* Each in-flight workgroup gets its own slice of the
                         * shared memory area: base + wg_in_mem * size.
                         */
                        c->cs_shared_offset =
                                vir_ADD(c,
                                        vir_uniform(c, QUNIFORM_SHARED_OFFSET, 0),
                                        vir_UMUL(c, wg_in_mem, shared_per_wg));
                }
                break;
        default:
                break;
        }

        if (c->s->scratch_size) {
                v3d_setup_spill_base(c);
                c->spill_size += V3D_CHANNELS * c->s->scratch_size;
        }

        if (c->s->info.stage == MESA_SHADER_FRAGMENT)
                ntq_setup_fs_inputs(c);
        else
                ntq_setup_vpm_inputs(c);

        ntq_setup_outputs(c);

        /* Find the main function and emit the body.
         */
        nir_foreach_function(function, c->s) {
                assert(strcmp(function->name, "main") == 0);
                assert(function->impl);
                ntq_emit_impl(c, function->impl);
        }
}

/* NIR lowering options requested by the v3d backend: operations the hardware
 * lacks are lowered to sequences it can execute.
 */
const nir_shader_compiler_options v3d_nir_options = {
        .lower_all_io_to_temps = true,
        .lower_extract_byte = true,
        .lower_extract_word = true,
        .lower_bfm = true,
        .lower_bitfield_insert_to_shifts = true,
        .lower_bitfield_extract_to_shifts = true,
        .lower_bitfield_reverse = true,
        .lower_bit_count = true,
        .lower_cs_local_id_from_index = true,
        .lower_ffract = true,
        .lower_pack_unorm_2x16 = true,
        .lower_pack_snorm_2x16 = true,
        .lower_pack_unorm_4x8 = true,
        .lower_pack_snorm_4x8 = true,
        .lower_unpack_unorm_4x8 = true,
        .lower_unpack_snorm_4x8 = true,
        .lower_pack_half_2x16 = true,
        .lower_unpack_half_2x16 = true,
        .lower_fdiv = true,
        .lower_find_lsb = true,
        .lower_ffma = true,
        .lower_flrp32 = true,
        .lower_fpow = true,
        .lower_fsat = true,
        .lower_fsqrt = true,
        .lower_ifind_msb = true,
        .lower_isign = true,
        .lower_ldexp = true,
        .lower_mul_high = true,
        .lower_wpos_pntc = true,
        .native_integers = true,
};

/**
 * When demoting a shader down to single-threaded, removes the THRSW
 * instructions (one will still be inserted at v3d_vir_to_qpu() for the
 * program end).
 */
static void
vir_remove_thrsw(struct v3d_compile *c)
{
        vir_for_each_block(block, c) {
                vir_for_each_inst_safe(inst, block) {
                        if (inst->qpu.sig.thrsw)
                                vir_remove_instruction(c, inst);
                }
        }

        c->last_thrsw = NULL;
}

/* Ensures the program's final thread switch is correctly placed and flagged,
 * or demotes the shader to single-threaded when the hardware can't support a
 * final THRSW.
 */
void
vir_emit_last_thrsw(struct v3d_compile *c)
{
        /* On V3D before 4.1, we need a TMU op to be outstanding when thread
         * switching, so disable threads if we didn't do any TMU ops (each of
         * which would have emitted a THRSW).
         */
        if (!c->last_thrsw_at_top_level && c->devinfo->ver < 41) {
                c->threads = 1;
                if (c->last_thrsw)
                        vir_remove_thrsw(c);
                return;
        }

        /* If we're threaded and the last THRSW was in conditional code, then
         * we need to emit another one so that we can flag it as the last
         * thrsw.
         */
        if (c->last_thrsw && !c->last_thrsw_at_top_level) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        /* If we're threaded, then we need to mark the last THRSW instruction
         * so we can emit a pair of them at QPU emit time.
         *
         * For V3D 4.x, we can spawn the non-fragment shaders already in the
         * post-last-THRSW state, so we can skip this.
         */
        if (!c->last_thrsw && c->s->info.stage == MESA_SHADER_FRAGMENT) {
                assert(c->devinfo->ver >= 41);
                vir_emit_thrsw(c);
        }

        if (c->last_thrsw)
                c->last_thrsw->is_last_thrsw = true;
}

/* There's a flag in the shader for "center W is needed for reasons other than
 * non-centroid varyings", so we just walk the program after VIR optimization
 * to see if it's used. It should be harmless to set even if we only use
 * center W for varyings.
 */
static void
vir_check_payload_w(struct v3d_compile *c)
{
        if (c->s->info.stage != MESA_SHADER_FRAGMENT)
                return;

        /* Look for any read of raw register 0 (the payload W register set
         * up in nir_to_vir()).
         */
        vir_for_each_inst_inorder(inst, c) {
                for (int i = 0; i < vir_get_nsrc(inst); i++) {
                        if (inst->src[i].file == QFILE_REG &&
                            inst->src[i].index == 0) {
                                c->uses_center_w = true;
                                return;
                        }
                }
        }
}

/* Main backend entry point: translates the NIR shader to VIR, optimizes,
 * register-allocates (reducing thread count on failure), and hands off to
 * QPU emission.
 */
void
v3d_nir_to_vir(struct v3d_compile *c)
{
        if (V3D_DEBUG & (V3D_DEBUG_NIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d NIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                nir_print_shader(c->s, stderr);
        }

        nir_to_vir(c);

        /* Emit the last THRSW before STVPM and TLB writes.
         */
        vir_emit_last_thrsw(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_FRAGMENT:
                emit_frag_end(c);
                break;
        case MESA_SHADER_VERTEX:
                emit_vert_end(c);
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("bad stage");
        }

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d pre-opt VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        vir_optimize(c);

        vir_check_payload_w(c);

        /* XXX perf: On VC4, we do a VIR-level instruction scheduling here.
         * We used that on that platform to pipeline TMU writes and reduce the
         * number of thread switches, as well as try (mostly successfully) to
         * reduce maximum register pressure to allow more threads.  We should
         * do something of that sort for V3D -- either instruction scheduling
         * here, or delay the THRSW and LDTMUs from our texture
         * instructions until the results are needed.
         */

        if (V3D_DEBUG & (V3D_DEBUG_VIR |
                         v3d_debug_flag_for_shader_stage(c->s->info.stage))) {
                fprintf(stderr, "%s prog %d/%d VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        /* Attempt to allocate registers for the temporaries.  If we fail,
         * reduce thread count and try again.
         */
        int min_threads = (c->devinfo->ver >= 41) ? 2 : 1;
        struct qpu_reg *temp_registers;
        while (true) {
                bool spilled;
                temp_registers = v3d_register_allocate(c, &spilled);
                /* Spilling changed the program, so retry allocation at the
                 * same thread count.
                 */
                if (spilled)
                        continue;

                if (temp_registers)
                        break;

                if (c->threads == min_threads) {
                        fprintf(stderr, "Failed to register allocate at %d threads:\n",
                                c->threads);
                        vir_dump(c);
                        c->failed = true;
                        return;
                }

                c->threads /= 2;

                /* Once demoted to single-threaded, the THRSW instructions
                 * must go away.
                 */
                if (c->threads == 1)
                        vir_remove_thrsw(c);
        }

        if (c->spills &&
            (V3D_DEBUG & (V3D_DEBUG_VIR |
                          v3d_debug_flag_for_shader_stage(c->s->info.stage)))) {
                fprintf(stderr, "%s prog %d/%d spilled VIR:\n",
                        vir_get_stage_name(c),
                        c->program_id, c->variant_id);
                vir_dump(c);
                fprintf(stderr, "\n");
        }

        v3d_vir_to_qpu(c, temp_registers);
}