/*
 * vir.c
 *
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg#include "broadcom/common/v3d_device_info.h" 2501e04c3fSmrg#include "v3d_compiler.h" 267ec681f3Smrg#include "util/u_prim.h" 277ec681f3Smrg#include "compiler/nir/nir_schedule.h" 287ec681f3Smrg#include "compiler/nir/nir_builder.h" 2901e04c3fSmrg 3001e04c3fSmrgint 31ed98bd31Smayavir_get_nsrc(struct qinst *inst) 3201e04c3fSmrg{ 3301e04c3fSmrg switch (inst->qpu.type) { 3401e04c3fSmrg case V3D_QPU_INSTR_TYPE_BRANCH: 3501e04c3fSmrg return 0; 3601e04c3fSmrg case V3D_QPU_INSTR_TYPE_ALU: 3701e04c3fSmrg if (inst->qpu.alu.add.op != V3D_QPU_A_NOP) 3801e04c3fSmrg return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op); 3901e04c3fSmrg else 4001e04c3fSmrg return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op); 4101e04c3fSmrg } 4201e04c3fSmrg 4301e04c3fSmrg return 0; 4401e04c3fSmrg} 4501e04c3fSmrg 4601e04c3fSmrg/** 4701e04c3fSmrg * Returns whether the instruction has any side effects that must be 4801e04c3fSmrg * preserved. 4901e04c3fSmrg */ 5001e04c3fSmrgbool 5101e04c3fSmrgvir_has_side_effects(struct v3d_compile *c, struct qinst *inst) 5201e04c3fSmrg{ 5301e04c3fSmrg switch (inst->qpu.type) { 5401e04c3fSmrg case V3D_QPU_INSTR_TYPE_BRANCH: 5501e04c3fSmrg return true; 5601e04c3fSmrg case V3D_QPU_INSTR_TYPE_ALU: 5701e04c3fSmrg switch (inst->qpu.alu.add.op) { 5801e04c3fSmrg case V3D_QPU_A_SETREVF: 5901e04c3fSmrg case V3D_QPU_A_SETMSF: 6001e04c3fSmrg case V3D_QPU_A_VPMSETUP: 6101e04c3fSmrg case V3D_QPU_A_STVPMV: 6201e04c3fSmrg case V3D_QPU_A_STVPMD: 6301e04c3fSmrg case V3D_QPU_A_STVPMP: 6401e04c3fSmrg case V3D_QPU_A_VPMWT: 6501e04c3fSmrg case V3D_QPU_A_TMUWT: 6601e04c3fSmrg return true; 6701e04c3fSmrg default: 6801e04c3fSmrg break; 6901e04c3fSmrg } 7001e04c3fSmrg 7101e04c3fSmrg switch (inst->qpu.alu.mul.op) { 7201e04c3fSmrg case V3D_QPU_M_MULTOP: 7301e04c3fSmrg return true; 7401e04c3fSmrg default: 7501e04c3fSmrg break; 7601e04c3fSmrg } 7701e04c3fSmrg } 7801e04c3fSmrg 7901e04c3fSmrg if (inst->qpu.sig.ldtmu || 8001e04c3fSmrg inst->qpu.sig.ldvary || 
817ec681f3Smrg inst->qpu.sig.ldtlbu || 827ec681f3Smrg inst->qpu.sig.ldtlb || 8301e04c3fSmrg inst->qpu.sig.wrtmuc || 8401e04c3fSmrg inst->qpu.sig.thrsw) { 8501e04c3fSmrg return true; 8601e04c3fSmrg } 8701e04c3fSmrg 887ec681f3Smrg /* ldunifa works like ldunif: it reads an element and advances the 897ec681f3Smrg * pointer, so each read has a side effect (we don't care for ldunif 907ec681f3Smrg * because we reconstruct the uniform stream buffer after compiling 917ec681f3Smrg * with the surviving uniforms), so allowing DCE to remove 927ec681f3Smrg * one would break follow-up loads. We could fix this by emiting a 937ec681f3Smrg * unifa for each ldunifa, but each unifa requires 3 delay slots 947ec681f3Smrg * before a ldunifa, so that would be quite expensive. 957ec681f3Smrg */ 967ec681f3Smrg if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf) 977ec681f3Smrg return true; 987ec681f3Smrg 9901e04c3fSmrg return false; 10001e04c3fSmrg} 10101e04c3fSmrg 10201e04c3fSmrgbool 10301e04c3fSmrgvir_is_raw_mov(struct qinst *inst) 10401e04c3fSmrg{ 10501e04c3fSmrg if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU || 10601e04c3fSmrg (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV && 10701e04c3fSmrg inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) { 10801e04c3fSmrg return false; 10901e04c3fSmrg } 11001e04c3fSmrg 11101e04c3fSmrg if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE || 11201e04c3fSmrg inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) { 11301e04c3fSmrg return false; 11401e04c3fSmrg } 11501e04c3fSmrg 116ed98bd31Smaya if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE || 117ed98bd31Smaya inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE || 118ed98bd31Smaya inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE || 119ed98bd31Smaya inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) { 120ed98bd31Smaya return false; 121ed98bd31Smaya } 122ed98bd31Smaya 12301e04c3fSmrg if (inst->qpu.flags.ac != V3D_QPU_COND_NONE || 12401e04c3fSmrg inst->qpu.flags.mc != V3D_QPU_COND_NONE) 12501e04c3fSmrg return false; 
12601e04c3fSmrg 12701e04c3fSmrg return true; 12801e04c3fSmrg} 12901e04c3fSmrg 13001e04c3fSmrgbool 13101e04c3fSmrgvir_is_add(struct qinst *inst) 13201e04c3fSmrg{ 13301e04c3fSmrg return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 13401e04c3fSmrg inst->qpu.alu.add.op != V3D_QPU_A_NOP); 13501e04c3fSmrg} 13601e04c3fSmrg 13701e04c3fSmrgbool 13801e04c3fSmrgvir_is_mul(struct qinst *inst) 13901e04c3fSmrg{ 14001e04c3fSmrg return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 14101e04c3fSmrg inst->qpu.alu.mul.op != V3D_QPU_M_NOP); 14201e04c3fSmrg} 14301e04c3fSmrg 14401e04c3fSmrgbool 1457ec681f3Smrgvir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst) 14601e04c3fSmrg{ 14701e04c3fSmrg if (inst->dst.file == QFILE_MAGIC) 1487ec681f3Smrg return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index); 14901e04c3fSmrg 15001e04c3fSmrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU && 15101e04c3fSmrg inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) { 15201e04c3fSmrg return true; 15301e04c3fSmrg } 15401e04c3fSmrg 15501e04c3fSmrg return false; 15601e04c3fSmrg} 15701e04c3fSmrg 15801e04c3fSmrgbool 15901e04c3fSmrgvir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst) 16001e04c3fSmrg{ 16101e04c3fSmrg for (int i = 0; i < vir_get_nsrc(inst); i++) { 16201e04c3fSmrg switch (inst->src[i].file) { 16301e04c3fSmrg case QFILE_VPM: 16401e04c3fSmrg return true; 16501e04c3fSmrg default: 16601e04c3fSmrg break; 16701e04c3fSmrg } 16801e04c3fSmrg } 16901e04c3fSmrg 17001e04c3fSmrg if (devinfo->ver < 41 && (inst->qpu.sig.ldvary || 17101e04c3fSmrg inst->qpu.sig.ldtlb || 17201e04c3fSmrg inst->qpu.sig.ldtlbu || 17301e04c3fSmrg inst->qpu.sig.ldvpm)) { 17401e04c3fSmrg return true; 17501e04c3fSmrg } 17601e04c3fSmrg 17701e04c3fSmrg return false; 17801e04c3fSmrg} 17901e04c3fSmrg 18001e04c3fSmrgbool 18101e04c3fSmrgvir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst) 18201e04c3fSmrg{ 18301e04c3fSmrg switch (inst->dst.file) { 18401e04c3fSmrg case QFILE_MAGIC: 18501e04c3fSmrg 
switch (inst->dst.index) { 18601e04c3fSmrg case V3D_QPU_WADDR_RECIP: 18701e04c3fSmrg case V3D_QPU_WADDR_RSQRT: 18801e04c3fSmrg case V3D_QPU_WADDR_EXP: 18901e04c3fSmrg case V3D_QPU_WADDR_LOG: 19001e04c3fSmrg case V3D_QPU_WADDR_SIN: 19101e04c3fSmrg return true; 19201e04c3fSmrg } 19301e04c3fSmrg break; 19401e04c3fSmrg default: 19501e04c3fSmrg break; 19601e04c3fSmrg } 19701e04c3fSmrg 19801e04c3fSmrg if (devinfo->ver < 41 && inst->qpu.sig.ldtmu) 19901e04c3fSmrg return true; 20001e04c3fSmrg 20101e04c3fSmrg return false; 20201e04c3fSmrg} 20301e04c3fSmrg 20401e04c3fSmrgvoid 20501e04c3fSmrgvir_set_unpack(struct qinst *inst, int src, 20601e04c3fSmrg enum v3d_qpu_input_unpack unpack) 20701e04c3fSmrg{ 20801e04c3fSmrg assert(src == 0 || src == 1); 20901e04c3fSmrg 21001e04c3fSmrg if (vir_is_add(inst)) { 21101e04c3fSmrg if (src == 0) 21201e04c3fSmrg inst->qpu.alu.add.a_unpack = unpack; 21301e04c3fSmrg else 21401e04c3fSmrg inst->qpu.alu.add.b_unpack = unpack; 21501e04c3fSmrg } else { 21601e04c3fSmrg assert(vir_is_mul(inst)); 21701e04c3fSmrg if (src == 0) 21801e04c3fSmrg inst->qpu.alu.mul.a_unpack = unpack; 21901e04c3fSmrg else 22001e04c3fSmrg inst->qpu.alu.mul.b_unpack = unpack; 22101e04c3fSmrg } 22201e04c3fSmrg} 22301e04c3fSmrg 2247ec681f3Smrgvoid 2257ec681f3Smrgvir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack) 2267ec681f3Smrg{ 2277ec681f3Smrg if (vir_is_add(inst)) { 2287ec681f3Smrg inst->qpu.alu.add.output_pack = pack; 2297ec681f3Smrg } else { 2307ec681f3Smrg assert(vir_is_mul(inst)); 2317ec681f3Smrg inst->qpu.alu.mul.output_pack = pack; 2327ec681f3Smrg } 2337ec681f3Smrg} 2347ec681f3Smrg 23501e04c3fSmrgvoid 23601e04c3fSmrgvir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond) 23701e04c3fSmrg{ 23801e04c3fSmrg if (vir_is_add(inst)) { 23901e04c3fSmrg inst->qpu.flags.ac = cond; 24001e04c3fSmrg } else { 24101e04c3fSmrg assert(vir_is_mul(inst)); 24201e04c3fSmrg inst->qpu.flags.mc = cond; 24301e04c3fSmrg } 24401e04c3fSmrg} 24501e04c3fSmrg 2467ec681f3Smrgenum 
v3d_qpu_cond 2477ec681f3Smrgvir_get_cond(struct qinst *inst) 2487ec681f3Smrg{ 2497ec681f3Smrg assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU); 2507ec681f3Smrg 2517ec681f3Smrg if (vir_is_add(inst)) 2527ec681f3Smrg return inst->qpu.flags.ac; 2537ec681f3Smrg else if (vir_is_mul(inst)) 2547ec681f3Smrg return inst->qpu.flags.mc; 2557ec681f3Smrg else /* NOP */ 2567ec681f3Smrg return V3D_QPU_COND_NONE; 2577ec681f3Smrg} 2587ec681f3Smrg 25901e04c3fSmrgvoid 2607ec681f3Smrgvir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf) 26101e04c3fSmrg{ 2627ec681f3Smrg c->flags_temp = -1; 26301e04c3fSmrg if (vir_is_add(inst)) { 26401e04c3fSmrg inst->qpu.flags.apf = pf; 26501e04c3fSmrg } else { 26601e04c3fSmrg assert(vir_is_mul(inst)); 26701e04c3fSmrg inst->qpu.flags.mpf = pf; 26801e04c3fSmrg } 26901e04c3fSmrg} 27001e04c3fSmrg 271ed98bd31Smayavoid 2727ec681f3Smrgvir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf) 273ed98bd31Smaya{ 2747ec681f3Smrg c->flags_temp = -1; 275ed98bd31Smaya if (vir_is_add(inst)) { 276ed98bd31Smaya inst->qpu.flags.auf = uf; 277ed98bd31Smaya } else { 278ed98bd31Smaya assert(vir_is_mul(inst)); 279ed98bd31Smaya inst->qpu.flags.muf = uf; 280ed98bd31Smaya } 281ed98bd31Smaya} 282ed98bd31Smaya 28301e04c3fSmrg#if 0 28401e04c3fSmrguint8_t 28501e04c3fSmrgvir_channels_written(struct qinst *inst) 28601e04c3fSmrg{ 28701e04c3fSmrg if (vir_is_mul(inst)) { 28801e04c3fSmrg switch (inst->dst.pack) { 28901e04c3fSmrg case QPU_PACK_MUL_NOP: 29001e04c3fSmrg case QPU_PACK_MUL_8888: 29101e04c3fSmrg return 0xf; 29201e04c3fSmrg case QPU_PACK_MUL_8A: 29301e04c3fSmrg return 0x1; 29401e04c3fSmrg case QPU_PACK_MUL_8B: 29501e04c3fSmrg return 0x2; 29601e04c3fSmrg case QPU_PACK_MUL_8C: 29701e04c3fSmrg return 0x4; 29801e04c3fSmrg case QPU_PACK_MUL_8D: 29901e04c3fSmrg return 0x8; 30001e04c3fSmrg } 30101e04c3fSmrg } else { 30201e04c3fSmrg switch (inst->dst.pack) { 30301e04c3fSmrg case QPU_PACK_A_NOP: 30401e04c3fSmrg case QPU_PACK_A_8888: 30501e04c3fSmrg 
case QPU_PACK_A_8888_SAT: 30601e04c3fSmrg case QPU_PACK_A_32_SAT: 30701e04c3fSmrg return 0xf; 30801e04c3fSmrg case QPU_PACK_A_8A: 30901e04c3fSmrg case QPU_PACK_A_8A_SAT: 31001e04c3fSmrg return 0x1; 31101e04c3fSmrg case QPU_PACK_A_8B: 31201e04c3fSmrg case QPU_PACK_A_8B_SAT: 31301e04c3fSmrg return 0x2; 31401e04c3fSmrg case QPU_PACK_A_8C: 31501e04c3fSmrg case QPU_PACK_A_8C_SAT: 31601e04c3fSmrg return 0x4; 31701e04c3fSmrg case QPU_PACK_A_8D: 31801e04c3fSmrg case QPU_PACK_A_8D_SAT: 31901e04c3fSmrg return 0x8; 32001e04c3fSmrg case QPU_PACK_A_16A: 32101e04c3fSmrg case QPU_PACK_A_16A_SAT: 32201e04c3fSmrg return 0x3; 32301e04c3fSmrg case QPU_PACK_A_16B: 32401e04c3fSmrg case QPU_PACK_A_16B_SAT: 32501e04c3fSmrg return 0xc; 32601e04c3fSmrg } 32701e04c3fSmrg } 32801e04c3fSmrg unreachable("Bad pack field"); 32901e04c3fSmrg} 33001e04c3fSmrg#endif 33101e04c3fSmrg 33201e04c3fSmrgstruct qreg 33301e04c3fSmrgvir_get_temp(struct v3d_compile *c) 33401e04c3fSmrg{ 33501e04c3fSmrg struct qreg reg; 33601e04c3fSmrg 33701e04c3fSmrg reg.file = QFILE_TEMP; 33801e04c3fSmrg reg.index = c->num_temps++; 33901e04c3fSmrg 34001e04c3fSmrg if (c->num_temps > c->defs_array_size) { 34101e04c3fSmrg uint32_t old_size = c->defs_array_size; 34201e04c3fSmrg c->defs_array_size = MAX2(old_size * 2, 16); 34301e04c3fSmrg 34401e04c3fSmrg c->defs = reralloc(c, c->defs, struct qinst *, 34501e04c3fSmrg c->defs_array_size); 34601e04c3fSmrg memset(&c->defs[old_size], 0, 34701e04c3fSmrg sizeof(c->defs[0]) * (c->defs_array_size - old_size)); 34801e04c3fSmrg 34901e04c3fSmrg c->spillable = reralloc(c, c->spillable, 35001e04c3fSmrg BITSET_WORD, 35101e04c3fSmrg BITSET_WORDS(c->defs_array_size)); 35201e04c3fSmrg for (int i = old_size; i < c->defs_array_size; i++) 35301e04c3fSmrg BITSET_SET(c->spillable, i); 35401e04c3fSmrg } 35501e04c3fSmrg 35601e04c3fSmrg return reg; 35701e04c3fSmrg} 35801e04c3fSmrg 35901e04c3fSmrgstruct qinst * 36001e04c3fSmrgvir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg 
src1) 36101e04c3fSmrg{ 36201e04c3fSmrg struct qinst *inst = calloc(1, sizeof(*inst)); 36301e04c3fSmrg 36401e04c3fSmrg inst->qpu = v3d_qpu_nop(); 36501e04c3fSmrg inst->qpu.alu.add.op = op; 36601e04c3fSmrg 36701e04c3fSmrg inst->dst = dst; 36801e04c3fSmrg inst->src[0] = src0; 36901e04c3fSmrg inst->src[1] = src1; 37001e04c3fSmrg inst->uniform = ~0; 37101e04c3fSmrg 37201e04c3fSmrg return inst; 37301e04c3fSmrg} 37401e04c3fSmrg 37501e04c3fSmrgstruct qinst * 37601e04c3fSmrgvir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1) 37701e04c3fSmrg{ 37801e04c3fSmrg struct qinst *inst = calloc(1, sizeof(*inst)); 37901e04c3fSmrg 38001e04c3fSmrg inst->qpu = v3d_qpu_nop(); 38101e04c3fSmrg inst->qpu.alu.mul.op = op; 38201e04c3fSmrg 38301e04c3fSmrg inst->dst = dst; 38401e04c3fSmrg inst->src[0] = src0; 38501e04c3fSmrg inst->src[1] = src1; 38601e04c3fSmrg inst->uniform = ~0; 38701e04c3fSmrg 38801e04c3fSmrg return inst; 38901e04c3fSmrg} 39001e04c3fSmrg 39101e04c3fSmrgstruct qinst * 392ed98bd31Smayavir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond) 39301e04c3fSmrg{ 39401e04c3fSmrg struct qinst *inst = calloc(1, sizeof(*inst)); 39501e04c3fSmrg 39601e04c3fSmrg inst->qpu = v3d_qpu_nop(); 39701e04c3fSmrg inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH; 39801e04c3fSmrg inst->qpu.branch.cond = cond; 39901e04c3fSmrg inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE; 40001e04c3fSmrg inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL; 40101e04c3fSmrg inst->qpu.branch.ub = true; 40201e04c3fSmrg inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL; 40301e04c3fSmrg 404ed98bd31Smaya inst->dst = vir_nop_reg(); 405ed98bd31Smaya inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0); 40601e04c3fSmrg 40701e04c3fSmrg return inst; 40801e04c3fSmrg} 40901e04c3fSmrg 41001e04c3fSmrgstatic void 41101e04c3fSmrgvir_emit(struct v3d_compile *c, struct qinst *inst) 41201e04c3fSmrg{ 41301e04c3fSmrg switch (c->cursor.mode) { 41401e04c3fSmrg case vir_cursor_add: 41501e04c3fSmrg 
list_add(&inst->link, c->cursor.link); 41601e04c3fSmrg break; 41701e04c3fSmrg case vir_cursor_addtail: 41801e04c3fSmrg list_addtail(&inst->link, c->cursor.link); 41901e04c3fSmrg break; 42001e04c3fSmrg } 42101e04c3fSmrg 42201e04c3fSmrg c->cursor = vir_after_inst(inst); 42301e04c3fSmrg c->live_intervals_valid = false; 42401e04c3fSmrg} 42501e04c3fSmrg 42601e04c3fSmrg/* Updates inst to write to a new temporary, emits it, and notes the def. */ 42701e04c3fSmrgstruct qreg 42801e04c3fSmrgvir_emit_def(struct v3d_compile *c, struct qinst *inst) 42901e04c3fSmrg{ 43001e04c3fSmrg assert(inst->dst.file == QFILE_NULL); 43101e04c3fSmrg 43201e04c3fSmrg /* If we're emitting an instruction that's a def, it had better be 43301e04c3fSmrg * writing a register. 43401e04c3fSmrg */ 43501e04c3fSmrg if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) { 43601e04c3fSmrg assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP || 43701e04c3fSmrg v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op)); 43801e04c3fSmrg assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP || 43901e04c3fSmrg v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op)); 44001e04c3fSmrg } 44101e04c3fSmrg 44201e04c3fSmrg inst->dst = vir_get_temp(c); 44301e04c3fSmrg 44401e04c3fSmrg if (inst->dst.file == QFILE_TEMP) 44501e04c3fSmrg c->defs[inst->dst.index] = inst; 44601e04c3fSmrg 44701e04c3fSmrg vir_emit(c, inst); 44801e04c3fSmrg 44901e04c3fSmrg return inst->dst; 45001e04c3fSmrg} 45101e04c3fSmrg 45201e04c3fSmrgstruct qinst * 45301e04c3fSmrgvir_emit_nondef(struct v3d_compile *c, struct qinst *inst) 45401e04c3fSmrg{ 45501e04c3fSmrg if (inst->dst.file == QFILE_TEMP) 45601e04c3fSmrg c->defs[inst->dst.index] = NULL; 45701e04c3fSmrg 45801e04c3fSmrg vir_emit(c, inst); 45901e04c3fSmrg 46001e04c3fSmrg return inst; 46101e04c3fSmrg} 46201e04c3fSmrg 46301e04c3fSmrgstruct qblock * 46401e04c3fSmrgvir_new_block(struct v3d_compile *c) 46501e04c3fSmrg{ 46601e04c3fSmrg struct qblock *block = rzalloc(c, struct qblock); 46701e04c3fSmrg 46801e04c3fSmrg list_inithead(&block->instructions); 
46901e04c3fSmrg 47001e04c3fSmrg block->predecessors = _mesa_set_create(block, 47101e04c3fSmrg _mesa_hash_pointer, 47201e04c3fSmrg _mesa_key_pointer_equal); 47301e04c3fSmrg 47401e04c3fSmrg block->index = c->next_block_index++; 47501e04c3fSmrg 47601e04c3fSmrg return block; 47701e04c3fSmrg} 47801e04c3fSmrg 47901e04c3fSmrgvoid 48001e04c3fSmrgvir_set_emit_block(struct v3d_compile *c, struct qblock *block) 48101e04c3fSmrg{ 48201e04c3fSmrg c->cur_block = block; 48301e04c3fSmrg c->cursor = vir_after_block(block); 48401e04c3fSmrg list_addtail(&block->link, &c->blocks); 48501e04c3fSmrg} 48601e04c3fSmrg 48701e04c3fSmrgstruct qblock * 48801e04c3fSmrgvir_entry_block(struct v3d_compile *c) 48901e04c3fSmrg{ 49001e04c3fSmrg return list_first_entry(&c->blocks, struct qblock, link); 49101e04c3fSmrg} 49201e04c3fSmrg 49301e04c3fSmrgstruct qblock * 49401e04c3fSmrgvir_exit_block(struct v3d_compile *c) 49501e04c3fSmrg{ 49601e04c3fSmrg return list_last_entry(&c->blocks, struct qblock, link); 49701e04c3fSmrg} 49801e04c3fSmrg 49901e04c3fSmrgvoid 50001e04c3fSmrgvir_link_blocks(struct qblock *predecessor, struct qblock *successor) 50101e04c3fSmrg{ 50201e04c3fSmrg _mesa_set_add(successor->predecessors, predecessor); 50301e04c3fSmrg if (predecessor->successors[0]) { 50401e04c3fSmrg assert(!predecessor->successors[1]); 50501e04c3fSmrg predecessor->successors[1] = successor; 50601e04c3fSmrg } else { 50701e04c3fSmrg predecessor->successors[0] = successor; 50801e04c3fSmrg } 50901e04c3fSmrg} 51001e04c3fSmrg 51101e04c3fSmrgconst struct v3d_compiler * 51201e04c3fSmrgv3d_compiler_init(const struct v3d_device_info *devinfo) 51301e04c3fSmrg{ 51401e04c3fSmrg struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler); 51501e04c3fSmrg if (!compiler) 51601e04c3fSmrg return NULL; 51701e04c3fSmrg 51801e04c3fSmrg compiler->devinfo = devinfo; 51901e04c3fSmrg 52001e04c3fSmrg if (!vir_init_reg_sets(compiler)) { 52101e04c3fSmrg ralloc_free(compiler); 52201e04c3fSmrg return NULL; 52301e04c3fSmrg } 
52401e04c3fSmrg 52501e04c3fSmrg return compiler; 52601e04c3fSmrg} 52701e04c3fSmrg 52801e04c3fSmrgvoid 52901e04c3fSmrgv3d_compiler_free(const struct v3d_compiler *compiler) 53001e04c3fSmrg{ 53101e04c3fSmrg ralloc_free((void *)compiler); 53201e04c3fSmrg} 53301e04c3fSmrg 53401e04c3fSmrgstatic struct v3d_compile * 53501e04c3fSmrgvir_compile_init(const struct v3d_compiler *compiler, 53601e04c3fSmrg struct v3d_key *key, 53701e04c3fSmrg nir_shader *s, 538ed98bd31Smaya void (*debug_output)(const char *msg, 539ed98bd31Smaya void *debug_output_data), 540ed98bd31Smaya void *debug_output_data, 5417ec681f3Smrg int program_id, int variant_id, 5427ec681f3Smrg uint32_t max_threads, 5437ec681f3Smrg uint32_t min_threads_for_reg_alloc, 5447ec681f3Smrg bool tmu_spilling_allowed, 5457ec681f3Smrg bool disable_loop_unrolling, 5467ec681f3Smrg bool disable_constant_ubo_load_sorting, 5477ec681f3Smrg bool disable_tmu_pipelining, 5487ec681f3Smrg bool fallback_scheduler) 54901e04c3fSmrg{ 55001e04c3fSmrg struct v3d_compile *c = rzalloc(NULL, struct v3d_compile); 55101e04c3fSmrg 55201e04c3fSmrg c->compiler = compiler; 55301e04c3fSmrg c->devinfo = compiler->devinfo; 55401e04c3fSmrg c->key = key; 55501e04c3fSmrg c->program_id = program_id; 55601e04c3fSmrg c->variant_id = variant_id; 5577ec681f3Smrg c->threads = max_threads; 558ed98bd31Smaya c->debug_output = debug_output; 559ed98bd31Smaya c->debug_output_data = debug_output_data; 5607ec681f3Smrg c->compilation_result = V3D_COMPILATION_SUCCEEDED; 5617ec681f3Smrg c->min_threads_for_reg_alloc = min_threads_for_reg_alloc; 5627ec681f3Smrg c->tmu_spilling_allowed = tmu_spilling_allowed; 5637ec681f3Smrg c->fallback_scheduler = fallback_scheduler; 5647ec681f3Smrg c->disable_tmu_pipelining = disable_tmu_pipelining; 5657ec681f3Smrg c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting; 5667ec681f3Smrg c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL 5677ec681f3Smrg ? 
true : disable_loop_unrolling; 56801e04c3fSmrg 56901e04c3fSmrg s = nir_shader_clone(c, s); 57001e04c3fSmrg c->s = s; 57101e04c3fSmrg 57201e04c3fSmrg list_inithead(&c->blocks); 57301e04c3fSmrg vir_set_emit_block(c, vir_new_block(c)); 57401e04c3fSmrg 57501e04c3fSmrg c->output_position_index = -1; 57601e04c3fSmrg c->output_sample_mask_index = -1; 57701e04c3fSmrg 57801e04c3fSmrg c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer, 57901e04c3fSmrg _mesa_key_pointer_equal); 58001e04c3fSmrg 5817ec681f3Smrg c->tmu.outstanding_regs = _mesa_pointer_set_create(c); 5827ec681f3Smrg c->flags_temp = -1; 5837ec681f3Smrg 58401e04c3fSmrg return c; 58501e04c3fSmrg} 58601e04c3fSmrg 58701e04c3fSmrgstatic int 588ed98bd31Smayatype_size_vec4(const struct glsl_type *type, bool bindless) 58901e04c3fSmrg{ 59001e04c3fSmrg return glsl_count_attribute_slots(type, false); 59101e04c3fSmrg} 59201e04c3fSmrg 59301e04c3fSmrgstatic void 59401e04c3fSmrgv3d_lower_nir(struct v3d_compile *c) 59501e04c3fSmrg{ 59601e04c3fSmrg struct nir_lower_tex_options tex_options = { 59701e04c3fSmrg .lower_txd = true, 598ed98bd31Smaya .lower_tg4_broadcom_swizzle = true, 599ed98bd31Smaya 60001e04c3fSmrg .lower_rect = false, /* XXX: Use this on V3D 3.x */ 60101e04c3fSmrg .lower_txp = ~0, 60201e04c3fSmrg /* Apply swizzles to all samplers. */ 60301e04c3fSmrg .swizzle_result = ~0, 60401e04c3fSmrg }; 60501e04c3fSmrg 60601e04c3fSmrg /* Lower the format swizzle and (for 32-bit returns) 60701e04c3fSmrg * ARB_texture_swizzle-style swizzle. 
60801e04c3fSmrg */ 6097ec681f3Smrg assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex)); 6107ec681f3Smrg for (int i = 0; i < c->key->num_tex_used; i++) { 61101e04c3fSmrg for (int j = 0; j < 4; j++) 61201e04c3fSmrg tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j]; 6137ec681f3Smrg } 61401e04c3fSmrg 6157ec681f3Smrg assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler)); 6167ec681f3Smrg for (int i = 0; i < c->key->num_samplers_used; i++) { 6177ec681f3Smrg if (c->key->sampler[i].return_size == 16) { 618ed98bd31Smaya tex_options.lower_tex_packing[i] = 619ed98bd31Smaya nir_lower_tex_packing_16; 620ed98bd31Smaya } 621ed98bd31Smaya } 622ed98bd31Smaya 623ed98bd31Smaya /* CS textures may not have return_size reflecting the shadow state. */ 6247ec681f3Smrg nir_foreach_uniform_variable(var, c->s) { 625ed98bd31Smaya const struct glsl_type *type = glsl_without_array(var->type); 626ed98bd31Smaya unsigned array_len = MAX2(glsl_get_length(var->type), 1); 627ed98bd31Smaya 628ed98bd31Smaya if (!glsl_type_is_sampler(type) || 629ed98bd31Smaya !glsl_sampler_type_is_shadow(type)) 630ed98bd31Smaya continue; 631ed98bd31Smaya 632ed98bd31Smaya for (int i = 0; i < array_len; i++) { 633ed98bd31Smaya tex_options.lower_tex_packing[var->data.binding + i] = 634ed98bd31Smaya nir_lower_tex_packing_16; 635ed98bd31Smaya } 63601e04c3fSmrg } 63701e04c3fSmrg 63801e04c3fSmrg NIR_PASS_V(c->s, nir_lower_tex, &tex_options); 639ed98bd31Smaya NIR_PASS_V(c->s, nir_lower_system_values); 6407ec681f3Smrg NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL); 64101e04c3fSmrg 642ed98bd31Smaya NIR_PASS_V(c->s, nir_lower_vars_to_scratch, 643ed98bd31Smaya nir_var_function_temp, 644ed98bd31Smaya 0, 645ed98bd31Smaya glsl_get_natural_size_align_bytes); 646ed98bd31Smaya NIR_PASS_V(c->s, v3d_nir_lower_scratch); 64701e04c3fSmrg} 64801e04c3fSmrg 64901e04c3fSmrgstatic void 65001e04c3fSmrgv3d_set_prog_data_uniforms(struct v3d_compile *c, 65101e04c3fSmrg struct v3d_prog_data *prog_data) 65201e04c3fSmrg{ 
65301e04c3fSmrg int count = c->num_uniforms; 65401e04c3fSmrg struct v3d_uniform_list *ulist = &prog_data->uniforms; 65501e04c3fSmrg 65601e04c3fSmrg ulist->count = count; 65701e04c3fSmrg ulist->data = ralloc_array(prog_data, uint32_t, count); 65801e04c3fSmrg memcpy(ulist->data, c->uniform_data, 65901e04c3fSmrg count * sizeof(*ulist->data)); 66001e04c3fSmrg ulist->contents = ralloc_array(prog_data, enum quniform_contents, count); 66101e04c3fSmrg memcpy(ulist->contents, c->uniform_contents, 66201e04c3fSmrg count * sizeof(*ulist->contents)); 66301e04c3fSmrg} 66401e04c3fSmrg 66501e04c3fSmrgstatic void 666ed98bd31Smayav3d_vs_set_prog_data(struct v3d_compile *c, 667ed98bd31Smaya struct v3d_vs_prog_data *prog_data) 66801e04c3fSmrg{ 66901e04c3fSmrg /* The vertex data gets format converted by the VPM so that 67001e04c3fSmrg * each attribute channel takes up a VPM column. Precompute 67101e04c3fSmrg * the sizes for the shader record. 67201e04c3fSmrg */ 67301e04c3fSmrg for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) { 67401e04c3fSmrg prog_data->vattr_sizes[i] = c->vattr_sizes[i]; 67501e04c3fSmrg prog_data->vpm_input_size += c->vattr_sizes[i]; 67601e04c3fSmrg } 67701e04c3fSmrg 6787ec681f3Smrg memset(prog_data->driver_location_map, -1, 6797ec681f3Smrg sizeof(prog_data->driver_location_map)); 6807ec681f3Smrg 6817ec681f3Smrg nir_foreach_shader_in_variable(var, c->s) { 6827ec681f3Smrg prog_data->driver_location_map[var->data.location] = 6837ec681f3Smrg var->data.driver_location; 6847ec681f3Smrg } 6857ec681f3Smrg 6867ec681f3Smrg prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read, 6877ec681f3Smrg SYSTEM_VALUE_VERTEX_ID) || 6887ec681f3Smrg BITSET_TEST(c->s->info.system_values_read, 6897ec681f3Smrg SYSTEM_VALUE_VERTEX_ID_ZERO_BASE); 6907ec681f3Smrg 6917ec681f3Smrg prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read, 6927ec681f3Smrg SYSTEM_VALUE_BASE_INSTANCE); 6937ec681f3Smrg 6947ec681f3Smrg prog_data->uses_iid = 
BITSET_TEST(c->s->info.system_values_read, 6957ec681f3Smrg SYSTEM_VALUE_INSTANCE_ID) || 6967ec681f3Smrg BITSET_TEST(c->s->info.system_values_read, 6977ec681f3Smrg SYSTEM_VALUE_INSTANCE_INDEX); 69801e04c3fSmrg 69901e04c3fSmrg if (prog_data->uses_vid) 70001e04c3fSmrg prog_data->vpm_input_size++; 7017ec681f3Smrg if (prog_data->uses_biid) 7027ec681f3Smrg prog_data->vpm_input_size++; 70301e04c3fSmrg if (prog_data->uses_iid) 70401e04c3fSmrg prog_data->vpm_input_size++; 70501e04c3fSmrg 70601e04c3fSmrg /* Input/output segment size are in sectors (8 rows of 32 bits per 70701e04c3fSmrg * channel). 70801e04c3fSmrg */ 70901e04c3fSmrg prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8; 710ed98bd31Smaya prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8; 711ed98bd31Smaya 712ed98bd31Smaya /* Set us up for shared input/output segments. This is apparently 713ed98bd31Smaya * necessary for our VCM setup to avoid varying corruption. 714ed98bd31Smaya */ 715ed98bd31Smaya prog_data->separate_segments = false; 716ed98bd31Smaya prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size, 717ed98bd31Smaya prog_data->vpm_input_size); 718ed98bd31Smaya prog_data->vpm_input_size = 0; 71901e04c3fSmrg 72001e04c3fSmrg /* Compute VCM cache size. We set up our program to take up less than 72101e04c3fSmrg * half of the VPM, so that any set of bin and render programs won't 72201e04c3fSmrg * run out of space. We need space for at least one input segment, 72301e04c3fSmrg * and then allocate the rest to output segments (one for the current 72401e04c3fSmrg * program, the rest to VCM). The valid range of the VCM cache size 72501e04c3fSmrg * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4 72601e04c3fSmrg * batches. 
         */
        assert(c->devinfo->vpm_size);
        /* A VPM sector is 8 rows of V3D_CHANNELS 32-bit words. */
        int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

/* Fills in the GS-specific prog_data from the compile context: input slot
 * map, VPM output segment size and the SIMD dispatch width that lets the
 * output segment fit in VPM memory.
 */
static void
v3d_gs_set_prog_data(struct v3d_compile *c,
                     struct v3d_gs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
         * VPM output header automatically and the shader will overwrite
         * it after reading it if necessary, so it doesn't add to the VPM
         * size requirements.
         */
        prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_PRIMITIVE_ID);

        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Compute SIMD dispatch width and update VPM output size accordingly
         * to ensure we can fit our program in memory. Available widths are
         * 16, 8, 4, 1.
         *
         * Notice that at draw time we will have to consider VPM memory
         * requirements from other stages and choose a smaller dispatch
         * width if needed to fit the program in VPM memory.
         */
        prog_data->simd_width = 16;
        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
               prog_data->simd_width == 2) {
                /* Halving the dispatch width roughly halves the output
                 * segment size (rounded up to an even sector count first).
                 */
                prog_data->simd_width >>= 1;
                prog_data->vpm_output_size =
                        align(prog_data->vpm_output_size, 2) / 2;
        }
        assert(prog_data->vpm_output_size <= 16);
        assert(prog_data->simd_width != 2); /* 2 is not a valid dispatch width */

        prog_data->out_prim_type = c->s->info.gs.output_primitive;
        prog_data->num_invocations = c->s->info.gs.invocations;

        prog_data->writes_psiz =
            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
}

/* Copies the FS input slot map and the per-input interpolation flags
 * (flat, noperspective, centroid) into prog_data, packed 24 inputs per
 * 32-bit word.
 */
static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

/* Fills in the FS-specific prog_data fields from the compile context. */
static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = c->writes_z;
        /* Early-Z stays disabled unless the shader opted in explicitly. */
        prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
        prog_data->uses_center_w = c->uses_center_w;
        prog_data->uses_implicit_point_line_varyings =
                c->uses_implicit_point_line_varyings;
        prog_data->lock_scoreboard_on_first_thrsw =
                c->lock_scoreboard_on_first_thrsw;
        prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
        prog_data->uses_pid = c->fs_uses_primitive_id;
}

/* Fills in the compute-shader-specific prog_data fields. */
static void
v3d_cs_set_prog_data(struct v3d_compile *c,
                     struct v3d_compute_prog_data *prog_data)
{
        prog_data->shared_size = c->s->info.shared_size;

        prog_data->local_size[0] = c->s->info.workgroup_size[0];
        prog_data->local_size[1] = c->s->info.workgroup_size[1];
        prog_data->local_size[2] = c->s->info.workgroup_size[2];

        prog_data->has_subgroups = c->has_subgroups;
}

/* Fills in the stage-independent prog_data fields and dispatches to the
 * per-stage helper for the rest.
 */
static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;
        prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
        prog_data->has_control_barrier = c->s->info.uses_control_barrier;

        v3d_set_prog_data_uniforms(c, prog_data);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
                break;
        case MESA_SHADER_COMPUTE:
                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
                break;
        default:
                unreachable("unsupported shader stage");
        }
}

/* Returns a malloc'ed copy of the final QPU assembly and stores its size
 * in bytes in *final_assembly_size. Destroys the compile context, so the
 * caller must not use c afterwards. Returns NULL on allocation failure
 * (in which case c is NOT destroyed).
 */
static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

/* Early (pre-optimization) NIR lowering for vertex shaders: drops VS
 * outputs not consumed according to the VS key and lowers I/O variables
 * to offset-based intrinsics.
 */
static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        /* One 64-bit slot mask per component (outputs were scalarized
         * above, so each used output is a (slot, component) pair).
         */
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->vs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains and do a constant folding pass
         * on the code it generated.
         */
        NIR_PASS_V(c->s, nir_opt_dce);
        NIR_PASS_V(c->s, nir_opt_constant_folding);
}

/* Early (pre-optimization) NIR lowering for geometry shaders; mirrors
 * v3d_nir_lower_vs_early but driven by the GS key.
 */
static void
v3d_nir_lower_gs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        /* One 64-bit slot mask per component, as in v3d_nir_lower_vs_early. */
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->gs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains */
        NIR_PASS_V(c->s, nir_opt_dce);
}

/* Rewrites FS color output variable types to (u)int vectors for render
 * targets flagged as integer in the FS key, so stores are emitted as raw
 * integer data instead of going through float conversion.
 */
static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        /* gl_FragColor broadcasts to all render targets. */
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

/* Early (pre-optimization) NIR lowering for fragment shaders. */
static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);

        NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);

        if (c->fs_key->line_smoothing) {
                v3d_nir_lower_line_smooth(c->s);
                NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
                /* The lowering pass can introduce new sysval reads */
                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
        }
}

/* Late (post-lowering) NIR passes for geometry shaders: user clip plane
 * lowering followed by output scalarization.
 */
static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
                           false, NULL);
        }

        /* Note: GS output scalarizing must happen after nir_lower_clip_gs.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

/* Late (post-lowering) NIR passes for vertex shaders. */
static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
                           false, false, NULL);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

/* Late (post-lowering) NIR passes for fragment shaders. */
static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
         * Vulkan allows it, in which case the SPIR-V compiler will declare
         * VARYING_SLOT_CLIP_DIST0 as compact array variable. Pass true as
         * the last parameter to always operate with a compact array in both
         * OpenGL and Vulkan so we don't have to care about the API we
         * are using.
         */
        if (c->key->ucp_enables)
                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);

        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}

/* Returns the maximum number of temporaries simultaneously live at any
 * instruction, i.e. the peak register pressure of the program.
 */
static uint32_t
vir_get_max_temps(struct v3d_compile *c)
{
        /* Count instructions to size the per-IP pressure array. */
        int max_ip = 0;
        vir_for_each_inst_inorder(inst, c)
                max_ip++;

        uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);

        /* Accumulate, for each instruction index, how many temps are live
         * there according to the [temp_start, temp_end) ranges.
         */
        for (int t = 0; t < c->num_temps; t++) {
                for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
                                                i < max_ip); i++) {
                        /* NOTE(review): this check looks redundant — the loop
                         * condition already enforces i < max_ip.
                         */
                        if (i > max_ip)
                                break;
                        pressure[i]++;
                }
        }

        uint32_t max_temps = 0;
        for (int i = 0; i < max_ip; i++)
                max_temps = MAX2(max_temps, pressure[i]);

        ralloc_free(pressure);

        return max_temps;
}

/* Scheduling dependency class used to serialize GS VPM header accesses. */
enum v3d_dependency_class {
        V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
};

/* nir_schedule intrinsic callback: makes GS stores to VPM offset 0 and
 * reads of gl_PrimitiveID depend on each other, since the primitive ID
 * arrives pre-written in the first word of the VPM output header.
 */
static bool
v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
                            nir_schedule_dependency *dep,
                            void *user_data)
{
        struct v3d_compile *c = user_data;

        switch (intr->intrinsic) {
        case nir_intrinsic_store_output:
                /* Writing to location 0 overwrites the value passed in for
                 * gl_PrimitiveID on geometry shaders
                 */
                if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
                    nir_intrinsic_base(intr) != 0)
                        break;

                nir_const_value *const_value =
                        nir_src_as_const_value(intr->src[1]);

                /* Non-constant offsets are not tracked. */
                if (const_value == NULL)
                        break;

                uint64_t offset =
                        nir_const_value_as_uint(*const_value,
                                                nir_src_bit_size(intr->src[1]));
                if (offset != 0)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
                return true;

        case nir_intrinsic_load_primitive_id:
                if (c->s->info.stage != MESA_SHADER_GEOMETRY)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
                return true;

        default:
                break;
        }

        return false;
}

/* nir_lower_wrmasks predicate: split partial writemasks only for the
 * store intrinsics listed below.
 */
static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
        switch (intr->intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_global:
        case nir_intrinsic_store_scratch:
                return true;
        default:
                return false;
        }
}

/* Returns inst as a constant-offset (unifa-eligible) UBO load intrinsic,
 * or NULL if it is not one.
 */
static nir_intrinsic_instr *
nir_instr_as_constant_ubo_load(nir_instr *inst)
{
        if (inst->type != nir_instr_type_intrinsic)
                return NULL;

        nir_intrinsic_instr *intr =
nir_instr_as_intrinsic(inst);
        if (intr->intrinsic != nir_intrinsic_load_ubo)
                return NULL;

        /* The UBO index is expected to be constant; only the offset varies. */
        assert(nir_src_is_const(intr->src[0]));
        if (!nir_src_is_const(intr->src[1]))
                return NULL;

        return intr;
}

/* Moves constant-offset UBO loads for the same UBO index as ref next to
 * it, ordered by offset, so consecutive loads can skip re-writing the
 * unifa register. Returns true if any instruction was moved.
 */
static bool
v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
{
        bool progress = false;

        nir_instr *ref_inst = &ref->instr;
        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
        uint32_t ref_index = nir_src_as_uint(ref->src[0]);

        /* Go through all instructions after ref searching for constant UBO
         * loads for the same UBO index.
         */
        bool seq_break = false;
        nir_instr *inst = &ref->instr;
        nir_instr *next_inst = NULL;
        while (true) {
                /* next_inst is set when the previous iteration moved inst,
                 * in which case nir_instr_next(inst) would no longer walk
                 * forward from the old position.
                 */
                inst = next_inst ? next_inst : nir_instr_next(inst);
                if (!inst)
                        break;

                next_inst = NULL;

                if (inst->type != nir_instr_type_intrinsic)
                        continue;

                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
                if (intr->intrinsic != nir_intrinsic_load_ubo)
                        continue;

                /* We only produce unifa sequences for non-divergent loads */
                if (nir_src_is_divergent(intr->src[1]))
                        continue;

                /* If there are any UBO loads that are not constant or that
                 * use a different UBO index in between the reference load and
                 * any other constant load for the same index, they would break
                 * the unifa sequence. We will flag that so we can then move
                 * all constant UBO loads for the reference index before these
                 * and not just the ones that are not ordered to avoid breaking
                 * the sequence and reduce unifa writes.
                 */
                if (!nir_src_is_const(intr->src[1])) {
                        seq_break = true;
                        continue;
                }
                uint32_t offset = nir_src_as_uint(intr->src[1]);

                assert(nir_src_is_const(intr->src[0]));
                uint32_t index = nir_src_as_uint(intr->src[0]);
                if (index != ref_index) {
                        seq_break = true;
                        continue;
                }

                /* Only move loads with an offset that is close enough to the
                 * reference offset, since otherwise we would not be able to
                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
                 *
                 * NOTE(review): ref_offset - offset is unsigned; the wrap
                 * converted through abs() yields |diff| only while the
                 * distance fits in int — presumably fine given
                 * MAX_UNIFA_SKIP_DISTANCE is small, but worth confirming.
                 */
                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
                        continue;

                /* We will move this load if its offset is smaller than ref's
                 * (in which case we will move it before ref) or if the offset
                 * is larger than ref's but there are sequence breakers in
                 * in between (in which case we will move it after ref and
                 * before the sequence breakers).
                 */
                if (!seq_break && offset >= ref_offset)
                        continue;

                /* Find where exactly we want to move this load:
                 *
                 * If we are moving it before ref, we want to check any other
                 * UBO loads we placed before ref and make sure we insert this
                 * one properly ordered with them. Likewise, if we are moving
                 * it after ref.
                 */
                nir_instr *pos = ref_inst;
                nir_instr *tmp = pos;
                do {
                        /* Walk away from ref in the direction we will insert. */
                        if (offset < ref_offset)
                                tmp = nir_instr_prev(tmp);
                        else
                                tmp = nir_instr_next(tmp);

                        if (!tmp || tmp == inst)
                                break;

                        /* Ignore non-unifa UBO loads */
                        if (tmp->type != nir_instr_type_intrinsic)
                                continue;

                        nir_intrinsic_instr *tmp_intr =
                                nir_instr_as_intrinsic(tmp);
                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
                                continue;

                        if (nir_src_is_divergent(tmp_intr->src[1]))
                                continue;

                        /* Stop if we find a unifa UBO load that breaks the
                         * sequence.
                         */
                        if (!nir_src_is_const(tmp_intr->src[1]))
                                break;

                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
                                break;

                        /* Keep walking while the loads remain offset-ordered
                         * between our offset and ref's.
                         */
                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
                        if (offset < ref_offset) {
                                if (tmp_offset < offset ||
                                    tmp_offset >= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        } else {
                                if (tmp_offset > offset ||
                                    tmp_offset <= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        }
                } while (true);

                /* We can't move the UBO load before the instruction that
                 * defines its constant offset. If that instruction is placed
                 * in between the new location (pos) and the current location
                 * of this load, we will have to move that instruction too.
                 *
                 * We don't care about the UBO index definition because that
                 * is optimized to be reused by all UBO loads for the same
                 * index and therefore is certain to be defined before the
                 * first UBO load that uses it.
                 */
                nir_instr *offset_inst = NULL;
                tmp = inst;
                while ((tmp = nir_instr_prev(tmp)) != NULL) {
                        if (pos == tmp) {
                                /* We reached the target location without
                                 * finding the instruction that defines the
                                 * offset, so that instruction must be before
                                 * the new position and we don't have to fix it.
                                 */
                                break;
                        }
                        if (intr->src[1].ssa->parent_instr == tmp) {
                                offset_inst = tmp;
                                break;
                        }
                }

                if (offset_inst) {
                        exec_node_remove(&offset_inst->node);
                        exec_node_insert_node_before(&pos->node,
                                                     &offset_inst->node);
                }

                /* Since we are moving the instruction before its current
                 * location, grab its successor before the move so that
                 * we can continue the next iteration of the main loop from
                 * that instruction.
                 */
                next_inst = nir_instr_next(inst);

                /* Move this load to the selected location */
                exec_node_remove(&inst->node);
                if (offset < ref_offset)
                        exec_node_insert_node_before(&pos->node, &inst->node);
                else
                        exec_node_insert_after(&pos->node, &inst->node);

                progress = true;
        }

        return progress;
}

/* Re-runs the per-load sort over the block until it reaches a fixed
 * point. Returns true if anything moved.
 */
static bool
v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
                                      nir_block *block)
{
        bool progress = false;
        bool local_progress;
        do {
                local_progress = false;
                nir_foreach_instr_safe(inst, block) {
                        nir_intrinsic_instr *intr =
                                nir_instr_as_constant_ubo_load(inst);
                        if (intr) {
                                local_progress |=
                                        v3d_nir_sort_constant_ubo_load(block, intr);
                        }
                }
                progress |= local_progress;
        } while (local_progress);
13407ec681f3Smrg 13417ec681f3Smrg return progress; 13427ec681f3Smrg} 13437ec681f3Smrg 13447ec681f3Smrg/** 13457ec681f3Smrg * Sorts constant UBO loads in each block by offset to maximize chances of 13467ec681f3Smrg * skipping unifa writes when converting to VIR. This can increase register 13477ec681f3Smrg * pressure. 13487ec681f3Smrg */ 13497ec681f3Smrgstatic bool 13507ec681f3Smrgv3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c) 13517ec681f3Smrg{ 13527ec681f3Smrg nir_foreach_function(function, s) { 13537ec681f3Smrg if (function->impl) { 13547ec681f3Smrg nir_foreach_block(block, function->impl) { 13557ec681f3Smrg c->sorted_any_ubo_loads |= 13567ec681f3Smrg v3d_nir_sort_constant_ubo_loads_block(c, block); 13577ec681f3Smrg } 13587ec681f3Smrg nir_metadata_preserve(function->impl, 13597ec681f3Smrg nir_metadata_block_index | 13607ec681f3Smrg nir_metadata_dominance); 13617ec681f3Smrg } 13627ec681f3Smrg } 13637ec681f3Smrg return c->sorted_any_ubo_loads; 13647ec681f3Smrg} 13657ec681f3Smrg 13667ec681f3Smrgstatic void 13677ec681f3Smrglower_load_num_subgroups(struct v3d_compile *c, 13687ec681f3Smrg nir_builder *b, 13697ec681f3Smrg nir_intrinsic_instr *intr) 13707ec681f3Smrg{ 13717ec681f3Smrg assert(c->s->info.stage == MESA_SHADER_COMPUTE); 13727ec681f3Smrg assert(intr->intrinsic == nir_intrinsic_load_num_subgroups); 13737ec681f3Smrg 13747ec681f3Smrg b->cursor = nir_after_instr(&intr->instr); 13757ec681f3Smrg uint32_t num_subgroups = 13767ec681f3Smrg DIV_ROUND_UP(c->s->info.workgroup_size[0] * 13777ec681f3Smrg c->s->info.workgroup_size[1] * 13787ec681f3Smrg c->s->info.workgroup_size[2], V3D_CHANNELS); 13797ec681f3Smrg nir_ssa_def *result = nir_imm_int(b, num_subgroups); 13807ec681f3Smrg nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 13817ec681f3Smrg nir_instr_remove(&intr->instr); 13827ec681f3Smrg} 13837ec681f3Smrg 13847ec681f3Smrgstatic bool 13857ec681f3Smrglower_subgroup_intrinsics(struct v3d_compile *c, 13867ec681f3Smrg nir_block *block, nir_builder 
*b) 13877ec681f3Smrg{ 13887ec681f3Smrg bool progress = false; 13897ec681f3Smrg nir_foreach_instr_safe(inst, block) { 13907ec681f3Smrg if (inst->type != nir_instr_type_intrinsic) 13917ec681f3Smrg continue;; 13927ec681f3Smrg 13937ec681f3Smrg nir_intrinsic_instr *intr = 13947ec681f3Smrg nir_instr_as_intrinsic(inst); 13957ec681f3Smrg if (!intr) 13967ec681f3Smrg continue; 13977ec681f3Smrg 13987ec681f3Smrg switch (intr->intrinsic) { 13997ec681f3Smrg case nir_intrinsic_load_num_subgroups: 14007ec681f3Smrg lower_load_num_subgroups(c, b, intr); 14017ec681f3Smrg progress = true; 14027ec681f3Smrg FALLTHROUGH; 14037ec681f3Smrg case nir_intrinsic_load_subgroup_id: 14047ec681f3Smrg case nir_intrinsic_load_subgroup_size: 14057ec681f3Smrg case nir_intrinsic_load_subgroup_invocation: 14067ec681f3Smrg case nir_intrinsic_elect: 14077ec681f3Smrg c->has_subgroups = true; 14087ec681f3Smrg break; 14097ec681f3Smrg default: 14107ec681f3Smrg break; 14117ec681f3Smrg } 14127ec681f3Smrg } 14137ec681f3Smrg 14147ec681f3Smrg return progress; 14157ec681f3Smrg} 14167ec681f3Smrg 14177ec681f3Smrgstatic bool 14187ec681f3Smrgv3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c) 14197ec681f3Smrg{ 14207ec681f3Smrg bool progress = false; 14217ec681f3Smrg nir_foreach_function(function, s) { 14227ec681f3Smrg if (function->impl) { 14237ec681f3Smrg nir_builder b; 14247ec681f3Smrg nir_builder_init(&b, function->impl); 14257ec681f3Smrg 14267ec681f3Smrg nir_foreach_block(block, function->impl) 14277ec681f3Smrg progress |= lower_subgroup_intrinsics(c, block, &b); 14287ec681f3Smrg 14297ec681f3Smrg nir_metadata_preserve(function->impl, 14307ec681f3Smrg nir_metadata_block_index | 14317ec681f3Smrg nir_metadata_dominance); 14327ec681f3Smrg } 14337ec681f3Smrg } 14347ec681f3Smrg return progress; 14357ec681f3Smrg} 14367ec681f3Smrg 14377ec681f3Smrgstatic void 14387ec681f3Smrgv3d_attempt_compile(struct v3d_compile *c) 14397ec681f3Smrg{ 1440ed98bd31Smaya switch (c->s->info.stage) { 1441ed98bd31Smaya case 
MESA_SHADER_VERTEX:
                c->vs_key = (struct v3d_vs_key *) c->key;
                break;
        case MESA_SHADER_GEOMETRY:
                c->gs_key = (struct v3d_gs_key *) c->key;
                break;
        case MESA_SHADER_FRAGMENT:
                c->fs_key = (struct v3d_fs_key *) c->key;
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        /* Stage-specific early lowering, before the generic lowering. */
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_early(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_early(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_early(c);
                break;
        default:
                break;
        }

        v3d_lower_nir(c);

        /* Stage-specific late lowering, after the generic lowering. */
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_late(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_late(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_late(c);
                break;
        default:
                break;
        }

        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);

        if (c->key->robust_buffer_access) {
                /* v3d_nir_lower_robust_buffer_access assumes constant buffer
                 * indices on ubo/ssbo intrinsics so run copy propagation and
                 * constant folding passes before we run the lowering to warrant
                 * this. We also want to run the lowering before v3d_optimize to
                 * clean-up redundant get_buffer_size calls produced in the pass.
                 */
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
        }

        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);

        NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);

        v3d_optimize_nir(c, c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic. Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_dce);
                NIR_PASS_V(c->s, nir_opt_cse);
        }

        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
        /* Divergence analysis (used by UBO load sorting) requires LCSSA. */
        nir_convert_to_lcssa(c->s, true, true);
        NIR_PASS_V(c->s, nir_divergence_analysis);
        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        struct nir_schedule_options schedule_options = {
                /* Schedule for about half our register space, to enable more
                 * shaders to hit 4 threads.
                 */
                .threshold = 24,

                /* Vertex shaders share the same memory for inputs and outputs,
                 * fragment and geometry shaders do not.
                 */
                .stages_with_shared_io_memory =
                        (((1 << MESA_ALL_SHADER_STAGES) - 1) &
                         ~((1 << MESA_SHADER_FRAGMENT) |
                           (1 << MESA_SHADER_GEOMETRY))),

                .fallback = c->fallback_scheduler,

                .intrinsic_cb = v3d_intrinsic_dependency_cb,
                .intrinsic_cb_data = c,
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);

        if (!c->disable_constant_ubo_load_sorting)
                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);

        v3d_nir_to_vir(c);
}

/* Returns the size in bytes of the stage-specific prog_data struct. */
uint32_t
v3d_prog_data_size(gl_shader_stage stage)
{
        static const int prog_data_size[] = {
                [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
                [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
                [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
                [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
        };

        assert(stage >= 0 &&
               stage < ARRAY_SIZE(prog_data_size) &&
               prog_data_size[stage]);

        return prog_data_size[stage];
}

/* Formats a shader-db statistics line into *shaderdb_str (allocated by
 * asprintf; the caller owns and frees it). Returns the character count
 * from asprintf, or -1 if the compile did not succeed.
 */
int v3d_shaderdb_dump(struct v3d_compile *c,
                      char **shaderdb_str)
{
        if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
                return -1;

        return asprintf(shaderdb_str,
                        "%s shader: %d inst, %d threads, %d loops, "
                        "%d uniforms, %d max-temps, %d:%d spills:fills, "
                        "%d sfu-stalls, %d inst-and-stalls, %d nops",
                        vir_get_stage_name(c),
15887ec681f3Smrg c->qpu_inst_count, 15897ec681f3Smrg c->threads, 15907ec681f3Smrg c->loops, 15917ec681f3Smrg c->num_uniforms, 15927ec681f3Smrg vir_get_max_temps(c), 15937ec681f3Smrg c->spills, 15947ec681f3Smrg c->fills, 15957ec681f3Smrg c->qpu_inst_stalled_count, 15967ec681f3Smrg c->qpu_inst_count + c->qpu_inst_stalled_count, 15977ec681f3Smrg c->nop_count); 15987ec681f3Smrg} 15997ec681f3Smrg 16007ec681f3Smrg/* This is a list of incremental changes to the compilation strategy 16017ec681f3Smrg * that will be used to try to compile the shader successfully. The 16027ec681f3Smrg * default strategy is to enable all optimizations which will have 16037ec681f3Smrg * the highest register pressure but is expected to produce most 16047ec681f3Smrg * optimal code. Following strategies incrementally disable specific 16057ec681f3Smrg * optimizations that are known to contribute to register pressure 16067ec681f3Smrg * in order to be able to compile the shader successfully while meeting 16077ec681f3Smrg * thread count requirements. 16087ec681f3Smrg * 16097ec681f3Smrg * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also 16107ec681f3Smrg * cover previous hardware as well (meaning that we are not limiting 16117ec681f3Smrg * register allocation to any particular thread count). This is fine 16127ec681f3Smrg * because v3d_nir_to_vir will cap this to the actual minimum. 
16137ec681f3Smrg */ 16147ec681f3Smrgstruct v3d_compiler_strategy { 16157ec681f3Smrg const char *name; 16167ec681f3Smrg uint32_t max_threads; 16177ec681f3Smrg uint32_t min_threads; 16187ec681f3Smrg bool disable_loop_unrolling; 16197ec681f3Smrg bool disable_ubo_load_sorting; 16207ec681f3Smrg bool disable_tmu_pipelining; 16217ec681f3Smrg bool tmu_spilling_allowed; 16227ec681f3Smrg} static const strategies[] = { 16237ec681f3Smrg /*0*/ { "default", 4, 4, false, false, false, false }, 16247ec681f3Smrg /*1*/ { "disable loop unrolling", 4, 4, true, false, false, false }, 16257ec681f3Smrg /*2*/ { "disable UBO load sorting", 4, 4, true, true, false, false }, 16267ec681f3Smrg /*3*/ { "disable TMU pipelining", 4, 4, true, true, true, false }, 16277ec681f3Smrg /*4*/ { "lower thread count", 2, 1, false, false, false, false }, 16287ec681f3Smrg /*5*/ { "disable loop unrolling (ltc)", 2, 1, true, false, false, false }, 16297ec681f3Smrg /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true, true, false, false }, 16307ec681f3Smrg /*7*/ { "disable TMU pipelining (ltc)", 2, 1, true, true, true, true }, 16317ec681f3Smrg /*8*/ { "fallback scheduler", 2, 1, true, true, true, true } 16327ec681f3Smrg}; 16337ec681f3Smrg 16347ec681f3Smrg/** 16357ec681f3Smrg * If a particular optimization didn't make any progress during a compile 16367ec681f3Smrg * attempt disabling it alone won't allow us to compile the shader successfuly, 16377ec681f3Smrg * since we'll end up with the same code. Detect these scenarios so we can 16387ec681f3Smrg * avoid wasting time with useless compiles. We should also consider if the 16397ec681f3Smrg * strategy changes other aspects of the compilation process though, like 16407ec681f3Smrg * spilling, and not skip it in that case. 
16417ec681f3Smrg */ 16427ec681f3Smrgstatic bool 16437ec681f3Smrgskip_compile_strategy(struct v3d_compile *c, uint32_t idx) 16447ec681f3Smrg{ 16457ec681f3Smrg /* We decide if we can skip a strategy based on the optimizations that 16467ec681f3Smrg * were active in the previous strategy, so we should only be calling this 16477ec681f3Smrg * for strategies after the first. 16487ec681f3Smrg */ 16497ec681f3Smrg assert(idx > 0); 16507ec681f3Smrg 16517ec681f3Smrg /* Don't skip a strategy that changes spilling behavior */ 16527ec681f3Smrg if (strategies[idx].tmu_spilling_allowed != 16537ec681f3Smrg strategies[idx - 1].tmu_spilling_allowed) { 16547ec681f3Smrg return false; 16557ec681f3Smrg } 16567ec681f3Smrg 16577ec681f3Smrg switch (idx) { 16587ec681f3Smrg /* Loop unrolling: skip if we didn't unroll any loops */ 16597ec681f3Smrg case 1: 16607ec681f3Smrg case 5: 16617ec681f3Smrg return !c->unrolled_any_loops; 16627ec681f3Smrg /* UBO load sorting: skip if we didn't sort any loads */ 16637ec681f3Smrg case 2: 16647ec681f3Smrg case 6: 16657ec681f3Smrg return !c->sorted_any_ubo_loads; 16667ec681f3Smrg /* TMU pipelining: skip if we didn't pipeline any TMU ops */ 16677ec681f3Smrg case 3: 16687ec681f3Smrg case 7: 16697ec681f3Smrg return !c->pipelined_any_tmu; 16707ec681f3Smrg /* Lower thread count: skip if we already tried less that 4 threads */ 16717ec681f3Smrg case 4: 16727ec681f3Smrg return c->threads < 4; 16737ec681f3Smrg default: 16747ec681f3Smrg return false; 16757ec681f3Smrg }; 16767ec681f3Smrg} 16777ec681f3Smrguint64_t *v3d_compile(const struct v3d_compiler *compiler, 16787ec681f3Smrg struct v3d_key *key, 16797ec681f3Smrg struct v3d_prog_data **out_prog_data, 16807ec681f3Smrg nir_shader *s, 16817ec681f3Smrg void (*debug_output)(const char *msg, 16827ec681f3Smrg void *debug_output_data), 16837ec681f3Smrg void *debug_output_data, 16847ec681f3Smrg int program_id, int variant_id, 16857ec681f3Smrg uint32_t *final_assembly_size) 16867ec681f3Smrg{ 16877ec681f3Smrg struct v3d_compile 
*c = NULL; 16887ec681f3Smrg for (int i = 0; i < ARRAY_SIZE(strategies); i++) { 16897ec681f3Smrg /* Fallback strategy */ 16907ec681f3Smrg if (i > 0) { 16917ec681f3Smrg assert(c); 16927ec681f3Smrg if (skip_compile_strategy(c, i)) 16937ec681f3Smrg continue; 16947ec681f3Smrg 16957ec681f3Smrg char *debug_msg; 16967ec681f3Smrg int ret = asprintf(&debug_msg, 16977ec681f3Smrg "Falling back to strategy '%s' for %s", 16987ec681f3Smrg strategies[i].name, 16997ec681f3Smrg vir_get_stage_name(c)); 17007ec681f3Smrg 17017ec681f3Smrg if (ret >= 0) { 17027ec681f3Smrg if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF)) 17037ec681f3Smrg fprintf(stderr, "%s\n", debug_msg); 17047ec681f3Smrg 17057ec681f3Smrg c->debug_output(debug_msg, c->debug_output_data); 17067ec681f3Smrg free(debug_msg); 17077ec681f3Smrg } 17087ec681f3Smrg 17097ec681f3Smrg vir_compile_destroy(c); 17107ec681f3Smrg } 17117ec681f3Smrg 17127ec681f3Smrg c = vir_compile_init(compiler, key, s, 17137ec681f3Smrg debug_output, debug_output_data, 17147ec681f3Smrg program_id, variant_id, 17157ec681f3Smrg strategies[i].max_threads, 17167ec681f3Smrg strategies[i].min_threads, 17177ec681f3Smrg strategies[i].tmu_spilling_allowed, 17187ec681f3Smrg strategies[i].disable_loop_unrolling, 17197ec681f3Smrg strategies[i].disable_ubo_load_sorting, 17207ec681f3Smrg strategies[i].disable_tmu_pipelining, 17217ec681f3Smrg i == ARRAY_SIZE(strategies) - 1); 17227ec681f3Smrg 17237ec681f3Smrg v3d_attempt_compile(c); 17247ec681f3Smrg 17257ec681f3Smrg if (i >= ARRAY_SIZE(strategies) - 1 || 17267ec681f3Smrg c->compilation_result != 17277ec681f3Smrg V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) { 17287ec681f3Smrg break; 17297ec681f3Smrg } 17307ec681f3Smrg } 17317ec681f3Smrg 17327ec681f3Smrg if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) && 17337ec681f3Smrg c->compilation_result != 17347ec681f3Smrg V3D_COMPILATION_FAILED_REGISTER_ALLOCATION && 17357ec681f3Smrg c->spills > 0) { 17367ec681f3Smrg char *debug_msg; 17377ec681f3Smrg int ret = asprintf(&debug_msg, 
17387ec681f3Smrg "Compiled %s with %d spills and %d fills", 17397ec681f3Smrg vir_get_stage_name(c), 17407ec681f3Smrg c->spills, c->fills); 17417ec681f3Smrg fprintf(stderr, "%s\n", debug_msg); 17427ec681f3Smrg 17437ec681f3Smrg if (ret >= 0) { 17447ec681f3Smrg c->debug_output(debug_msg, c->debug_output_data); 17457ec681f3Smrg free(debug_msg); 17467ec681f3Smrg } 17477ec681f3Smrg } 17487ec681f3Smrg 17497ec681f3Smrg if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) { 17507ec681f3Smrg fprintf(stderr, "Failed to compile %s with any strategy.\n", 17517ec681f3Smrg vir_get_stage_name(c)); 17527ec681f3Smrg } 17537ec681f3Smrg 17547ec681f3Smrg struct v3d_prog_data *prog_data; 17557ec681f3Smrg 17567ec681f3Smrg prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage)); 175701e04c3fSmrg 1758ed98bd31Smaya v3d_set_prog_data(c, prog_data); 1759ed98bd31Smaya 1760ed98bd31Smaya *out_prog_data = prog_data; 1761ed98bd31Smaya 1762ed98bd31Smaya char *shaderdb; 17637ec681f3Smrg int ret = v3d_shaderdb_dump(c, &shaderdb); 1764ed98bd31Smaya if (ret >= 0) { 1765ed98bd31Smaya if (V3D_DEBUG & V3D_DEBUG_SHADERDB) 1766ed98bd31Smaya fprintf(stderr, "SHADER-DB: %s\n", shaderdb); 1767ed98bd31Smaya 1768ed98bd31Smaya c->debug_output(shaderdb, c->debug_output_data); 1769ed98bd31Smaya free(shaderdb); 1770ed98bd31Smaya } 177101e04c3fSmrg 1772ed98bd31Smaya return v3d_return_qpu_insts(c, final_assembly_size); 177301e04c3fSmrg} 177401e04c3fSmrg 177501e04c3fSmrgvoid 177601e04c3fSmrgvir_remove_instruction(struct v3d_compile *c, struct qinst *qinst) 177701e04c3fSmrg{ 177801e04c3fSmrg if (qinst->dst.file == QFILE_TEMP) 177901e04c3fSmrg c->defs[qinst->dst.index] = NULL; 178001e04c3fSmrg 178101e04c3fSmrg assert(&qinst->link != c->cursor.link); 178201e04c3fSmrg 178301e04c3fSmrg list_del(&qinst->link); 178401e04c3fSmrg free(qinst); 178501e04c3fSmrg 178601e04c3fSmrg c->live_intervals_valid = false; 178701e04c3fSmrg} 178801e04c3fSmrg 178901e04c3fSmrgstruct qreg 179001e04c3fSmrgvir_follow_movs(struct 
v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

/**
 * Frees the compile context and every instruction in every block.
 */
void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_is_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

/**
 * Returns the index of the uniform with the given contents/data pair,
 * reusing an existing identical entry if one is present and otherwise
 * appending a new one (growing the uniform arrays as needed).
 */
uint32_t
vir_get_uniform_index(struct v3d_compile *c,
                      enum quniform_contents contents,
                      uint32_t data)
{
        /* Reuse an existing slot with identical contents, if any. */
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return i;
                }
        }

        uint32_t uniform = c->num_uniforms++;

        /* Grow the arrays (at least 16 entries, doubling after that). */
        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return uniform;
}

/* Looks back into the current block to find the ldunif that wrote the uniform
 * at the requested index. If it finds it, it returns true and writes the
 * destination register of the ldunif instruction to 'unif'.
 *
 * This can impact register pressure and end up leading to worse code, so we
 * limit the number of instructions we are willing to look back through to
 * strike a good balance.
 */
static bool
try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
{
        uint32_t count = 20;
        struct qinst *prev_inst = NULL;
        assert(c->cur_block);

#ifdef DEBUG
        /* We can only reuse a uniform if it was emitted in the same block,
         * so callers must make sure the current instruction is being emitted
         * in the current block.
         */
        bool found = false;
        vir_for_each_inst(inst, c->cur_block) {
                if (&inst->link == c->cursor.link) {
                        found = true;
                        break;
                }
        }

        assert(found || &c->cur_block->instructions == c->cursor.link);
#endif

        /* Walk backwards from the cursor, up to 'count' instructions, looking
         * for an ldunif/ldunifrf of the same uniform index.
         */
        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
                                     &c->cur_block->instructions, link) {
                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
                    inst->uniform == index) {
                        prev_inst = inst;
                        break;
                }

                if (--count == 0)
                        break;
        }

        if (!prev_inst)
                return false;

        /* Give up if any later instruction in the block overwrote the
         * ldunif's destination register.
         */
        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
                                 &c->cur_block->instructions, link) {
                if (inst->dst.file == prev_inst->dst.file &&
                    inst->dst.index == prev_inst->dst.index) {
                        return false;
                }
        }

        *unif = prev_inst->dst;
        return true;
}

/**
 * Returns a register holding the requested uniform, either by reusing a
 * recent ldunif of the same uniform or by emitting a new ldunif into a
 * fresh temp.
 */
struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        const int num_uniforms = c->num_uniforms;
        const int index = vir_get_uniform_index(c, contents, data);

        /* If this is not the first time we see this uniform try to reuse the
         * result of the last ldunif that loaded it.
         */
        const bool is_new_uniform = num_uniforms != c->num_uniforms;
        if (!is_new_uniform && !c->disable_ldunif_opt) {
                struct qreg ldunif_dst;
                if (try_opt_ldunif(c, index, &ldunif_dst))
                        return ldunif_dst;
        }

        /* Emit a NOP carrying the ldunif signal, writing to a new temp. */
        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.ldunif = true;
        inst->uniform = index;
        inst->dst = vir_get_temp(c);
        c->defs[inst->dst.index] = inst;
        return inst->dst;
}

/* Runs one VIR optimization pass, accumulating into 'progress' and
 * optionally logging which pass made progress.
 */
#define OPTPASS(func)                                                  \
        do {                                                           \
                bool stage_progress = func(c);                         \
                if (stage_progress) {                                  \
                        progress = true;                               \
                        if (print_opt_debug) {                         \
                                fprintf(stderr,                        \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                  \
                        }                                              \
                        /*XXX vir_validate(c);*/                       \
                }                                                      \
        } while (0)

/**
 * Runs the VIR optimization passes repeatedly until none of them makes
 * further progress.
 */
void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);
                OPTPASS(vir_opt_constant_alu);

                if (!progress)
                        break;

                pass++;
        }
}

/**
 * Returns a human-readable stage name for the compile, distinguishing the
 * binning (coordinate) variants of vertex and geometry shaders.
 */
const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_VERTEX_BIN";
        else if (c->gs_key && c->gs_key->is_coord)
                return "MESA_SHADER_GEOMETRY_BIN";
        else
                return gl_shader_stage_name(c->s->info.stage);
}

/* Returns the VPM size expressed in sectors (a sector being 8 rows of
 * V3D_CHANNELS 32-bit words).
 */
static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
{
        assert(devinfo->vpm_size > 0);
        const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        return devinfo->vpm_size / sector_size;
}

/* Computes various parameters affecting VPM memory configuration for programs
 * involving geometry shaders to ensure the program fits in memory and honors
 * requirements described in section "VPM usage" of the programming manual.
 *
 * Returns false if no configuration fits in the available VPM memory.
 */
static bool
compute_vpm_config_gs(struct v3d_device_info *devinfo,
                      struct v3d_vs_prog_data *vs,
                      struct v3d_gs_prog_data *gs,
                      struct vpm_config *vpm_cfg_out)
{
        const uint32_t A = vs->separate_segments ? 1 : 0;
        const uint32_t Ad = vs->vpm_input_size;
        const uint32_t Vd = vs->vpm_output_size;

        const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);

        /* Try to fit program into our VPM memory budget by adjusting
         * configurable parameters iteratively. We do this in two phases:
         * the first phase tries to fit the program into the total available
         * VPM memory. If we succeed at that, then the second phase attempts
         * to fit the program into half of that budget so we can run bin and
         * render programs in parallel.
         */
        struct vpm_config vpm_cfg[2];
        struct vpm_config *final_vpm_cfg = NULL;
        uint32_t phase = 0;

        vpm_cfg[phase].As = 1;
        vpm_cfg[phase].Gs = 1;
        vpm_cfg[phase].Gd = gs->vpm_output_size;
        vpm_cfg[phase].gs_width = gs->simd_width;

        /* While there is a requirement that Vc >= [Vn / 16], this is
         * always the case when tessellation is not present because in that
         * case Vn can only be 6 at most (when input primitive is triangles
         * with adjacency).
         *
         * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
         * and Broadcom has not found it worth it to increase it beyond this
         * in general. Increasing Vc also increases VPM memory pressure which
         * can turn up being detrimental for performance in some scenarios.
         */
        vpm_cfg[phase].Vc = 2;

        /* Gv is a constraint on the hardware to not exceed the
         * specified number of vertex segments per GS batch. If adding a
         * new primitive to a GS batch would result in a range of more
         * than Gv vertex segments being referenced by the batch, then
         * the hardware will flush the batch and start a new one. This
         * means that we can choose any value we want, we just need to
         * be aware that larger values improve GS batch utilization
         * at the expense of more VPM memory pressure (which can affect
         * other performance aspects, such as GS dispatch width).
         * We start with the largest value, and will reduce it if we
         * find that total memory pressure is too high.
         */
        vpm_cfg[phase].Gv = 3;
        do {
                /* When GS is present in absence of TES, then we need to satisfy
                 * that Ve >= Gv. We go with the smallest value of Ve to avoid
                 * increasing memory pressure.
                 */
                vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;

                uint32_t vpm_sectors =
                        A * vpm_cfg[phase].As * Ad +
                        (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
                        vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;

                /* Ideally we want to use no more than half of the available
                 * memory so we can execute a bin and render program in parallel
                 * without stalls. If we achieved that then we are done.
                 */
                if (vpm_sectors <= vpm_size / 2) {
                        final_vpm_cfg = &vpm_cfg[phase];
                        break;
                }

                /* At the very least, we should not allocate more than the
                 * total available VPM memory. If we have a configuration that
                 * succeeds at this we save it and continue to see if we can
                 * meet the half-memory-use criteria too.
                 */
                if (phase == 0 && vpm_sectors <= vpm_size) {
                        vpm_cfg[1] = vpm_cfg[0];
                        phase = 1;
                }

                /* Try lowering Gv */
                if (vpm_cfg[phase].Gv > 0) {
                        vpm_cfg[phase].Gv--;
                        continue;
                }

                /* Try lowering GS dispatch width (valid widths are 16, 8, 4,
                 * 1, so a width of 2 is halved again).
                 */
                if (vpm_cfg[phase].gs_width > 1) {
                        do {
                                vpm_cfg[phase].gs_width >>= 1;
                                vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
                        } while (vpm_cfg[phase].gs_width == 2);

                        /* Reset Gv to max after dropping dispatch width */
                        vpm_cfg[phase].Gv = 3;
                        continue;
                }

                /* We ran out of options to reduce memory pressure. If we
                 * are at phase 1 we have at least a valid configuration, so
                 * we use that.
                 */
                if (phase == 1)
                        final_vpm_cfg = &vpm_cfg[0];
                break;
        } while (true);

        if (!final_vpm_cfg)
                return false;

        assert(final_vpm_cfg);
        assert(final_vpm_cfg->Gd <= 16);
        assert(final_vpm_cfg->Gv < 4);
        assert(final_vpm_cfg->Ve < 4);
        assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
        assert(final_vpm_cfg->gs_width == 1 ||
               final_vpm_cfg->gs_width == 4 ||
               final_vpm_cfg->gs_width == 8 ||
               final_vpm_cfg->gs_width == 16);

        *vpm_cfg_out = *final_vpm_cfg;
        return true;
}

/**
 * Fills in the VPM configurations for the bin and render programs.
 *
 * Without a GS, the configuration is fixed from the VS data; with a GS,
 * each configuration is computed by compute_vpm_config_gs(), and false is
 * returned if either program cannot be made to fit in VPM memory.
 */
bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg)
{
        assert(vs && vs_bin);
        assert((gs != NULL) == (gs_bin != NULL));

        if (!gs) {
                vpm_cfg_bin->As = 1;
                vpm_cfg_bin->Ve = 0;
                vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;

                vpm_cfg->As = 1;
                vpm_cfg->Ve = 0;
                vpm_cfg->Vc = vs->vcm_cache_size;
        } else {
                if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
                        return false;

                if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
                        return false;
        }

        return true;
}