1af69d88dSmrg/* 2af69d88dSmrg * Copyright (c) 2014 Scott Mansell 3af69d88dSmrg * Copyright © 2014 Broadcom 4af69d88dSmrg * 5af69d88dSmrg * Permission is hereby granted, free of charge, to any person obtaining a 6af69d88dSmrg * copy of this software and associated documentation files (the "Software"), 7af69d88dSmrg * to deal in the Software without restriction, including without limitation 8af69d88dSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9af69d88dSmrg * and/or sell copies of the Software, and to permit persons to whom the 10af69d88dSmrg * Software is furnished to do so, subject to the following conditions: 11af69d88dSmrg * 12af69d88dSmrg * The above copyright notice and this permission notice (including the next 13af69d88dSmrg * paragraph) shall be included in all copies or substantial portions of the 14af69d88dSmrg * Software. 15af69d88dSmrg * 16af69d88dSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17af69d88dSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18af69d88dSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19af69d88dSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20af69d88dSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21af69d88dSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22af69d88dSmrg * IN THE SOFTWARE. 
23af69d88dSmrg */ 24af69d88dSmrg 25af69d88dSmrg#include <inttypes.h> 267ec681f3Smrg#include "util/format/u_format.h" 2701e04c3fSmrg#include "util/crc32.h" 287ec681f3Smrg#include "util/u_helpers.h" 2901e04c3fSmrg#include "util/u_math.h" 30af69d88dSmrg#include "util/u_memory.h" 3101e04c3fSmrg#include "util/ralloc.h" 3201e04c3fSmrg#include "util/hash_table.h" 33af69d88dSmrg#include "tgsi/tgsi_dump.h" 3401e04c3fSmrg#include "tgsi/tgsi_parse.h" 3501e04c3fSmrg#include "compiler/nir/nir.h" 3601e04c3fSmrg#include "compiler/nir/nir_builder.h" 3701e04c3fSmrg#include "compiler/nir_types.h" 3801e04c3fSmrg#include "nir/tgsi_to_nir.h" 39af69d88dSmrg#include "vc4_context.h" 40af69d88dSmrg#include "vc4_qpu.h" 41af69d88dSmrg#include "vc4_qir.h" 42af69d88dSmrg 4301e04c3fSmrgstatic struct qreg 4401e04c3fSmrgntq_get_src(struct vc4_compile *c, nir_src src, int i); 4501e04c3fSmrgstatic void 4601e04c3fSmrgntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 47af69d88dSmrg 4801e04c3fSmrgstatic int 499f464c52Smayatype_size(const struct glsl_type *type, bool bindless) 5001e04c3fSmrg{ 5101e04c3fSmrg return glsl_count_attribute_slots(type, false); 5201e04c3fSmrg} 53af69d88dSmrg 5401e04c3fSmrgstatic void 5501e04c3fSmrgresize_qreg_array(struct vc4_compile *c, 5601e04c3fSmrg struct qreg **regs, 5701e04c3fSmrg uint32_t *size, 5801e04c3fSmrg uint32_t decl_size) 5901e04c3fSmrg{ 6001e04c3fSmrg if (*size >= decl_size) 6101e04c3fSmrg return; 6201e04c3fSmrg 6301e04c3fSmrg uint32_t old_size = *size; 6401e04c3fSmrg *size = MAX2(*size * 2, decl_size); 6501e04c3fSmrg *regs = reralloc(c, *regs, struct qreg, *size); 6601e04c3fSmrg if (!*regs) { 6701e04c3fSmrg fprintf(stderr, "Malloc failure\n"); 6801e04c3fSmrg abort(); 6901e04c3fSmrg } 7001e04c3fSmrg 7101e04c3fSmrg for (uint32_t i = old_size; i < *size; i++) 7201e04c3fSmrg (*regs)[i] = c->undef; 7301e04c3fSmrg} 7401e04c3fSmrg 7501e04c3fSmrgstatic void 7601e04c3fSmrgntq_emit_thrsw(struct vc4_compile *c) 7701e04c3fSmrg{ 7801e04c3fSmrg if 
(!c->fs_threaded) 7901e04c3fSmrg return; 8001e04c3fSmrg 8101e04c3fSmrg /* Always thread switch after each texture operation for now. 8201e04c3fSmrg * 8301e04c3fSmrg * We could do better by batching a bunch of texture fetches up and 8401e04c3fSmrg * then doing one thread switch and collecting all their results 8501e04c3fSmrg * afterward. 8601e04c3fSmrg */ 8701e04c3fSmrg qir_emit_nondef(c, qir_inst(QOP_THRSW, c->undef, 8801e04c3fSmrg c->undef, c->undef)); 8901e04c3fSmrg c->last_thrsw_at_top_level = (c->execute.file == QFILE_NULL); 9001e04c3fSmrg} 91af69d88dSmrg 92af69d88dSmrgstatic struct qreg 9301e04c3fSmrgindirect_uniform_load(struct vc4_compile *c, nir_intrinsic_instr *intr) 94af69d88dSmrg{ 9501e04c3fSmrg struct qreg indirect_offset = ntq_get_src(c, intr->src[0], 0); 9601e04c3fSmrg 9701e04c3fSmrg /* Clamp to [0, array size). Note that MIN/MAX are signed. */ 989f464c52Smaya uint32_t range = nir_intrinsic_range(intr); 9901e04c3fSmrg indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0)); 10001e04c3fSmrg indirect_offset = qir_MIN_NOIMM(c, indirect_offset, 1019f464c52Smaya qir_uniform_ui(c, range - 4)); 10201e04c3fSmrg 10301e04c3fSmrg qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 10401e04c3fSmrg indirect_offset, 1059f464c52Smaya qir_uniform(c, QUNIFORM_UBO0_ADDR, 1069f464c52Smaya nir_intrinsic_base(intr))); 10701e04c3fSmrg 10801e04c3fSmrg c->num_texture_samples++; 109af69d88dSmrg 11001e04c3fSmrg ntq_emit_thrsw(c); 111af69d88dSmrg 11201e04c3fSmrg return qir_TEX_RESULT(c); 113af69d88dSmrg} 114af69d88dSmrg 115af69d88dSmrgstatic struct qreg 11601e04c3fSmrgvc4_ubo_load(struct vc4_compile *c, nir_intrinsic_instr *intr) 117af69d88dSmrg{ 1187ec681f3Smrg ASSERTED int buffer_index = nir_src_as_uint(intr->src[0]); 1199f464c52Smaya assert(buffer_index == 1); 12001e04c3fSmrg assert(c->stage == QSTAGE_FRAG); 121af69d88dSmrg 12201e04c3fSmrg struct qreg offset = ntq_get_src(c, intr->src[1], 0); 12301e04c3fSmrg 12401e04c3fSmrg /* Clamp to [0, array size). 
Note that MIN/MAX are signed. */ 12501e04c3fSmrg offset = qir_MAX(c, offset, qir_uniform_ui(c, 0)); 12601e04c3fSmrg offset = qir_MIN_NOIMM(c, offset, 12701e04c3fSmrg qir_uniform_ui(c, c->fs_key->ubo_1_size - 4)); 12801e04c3fSmrg 12901e04c3fSmrg qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 13001e04c3fSmrg offset, 1319f464c52Smaya qir_uniform(c, QUNIFORM_UBO1_ADDR, 0)); 13201e04c3fSmrg 13301e04c3fSmrg c->num_texture_samples++; 134af69d88dSmrg 13501e04c3fSmrg ntq_emit_thrsw(c); 136af69d88dSmrg 13701e04c3fSmrg return qir_TEX_RESULT(c); 138af69d88dSmrg} 139af69d88dSmrg 14001e04c3fSmrgnir_ssa_def * 14101e04c3fSmrgvc4_nir_get_swizzled_channel(nir_builder *b, nir_ssa_def **srcs, int swiz) 142af69d88dSmrg{ 14301e04c3fSmrg switch (swiz) { 14401e04c3fSmrg default: 14501e04c3fSmrg case PIPE_SWIZZLE_NONE: 14601e04c3fSmrg fprintf(stderr, "warning: unknown swizzle\n"); 1477ec681f3Smrg FALLTHROUGH; 14801e04c3fSmrg case PIPE_SWIZZLE_0: 14901e04c3fSmrg return nir_imm_float(b, 0.0); 15001e04c3fSmrg case PIPE_SWIZZLE_1: 15101e04c3fSmrg return nir_imm_float(b, 1.0); 15201e04c3fSmrg case PIPE_SWIZZLE_X: 15301e04c3fSmrg case PIPE_SWIZZLE_Y: 15401e04c3fSmrg case PIPE_SWIZZLE_Z: 15501e04c3fSmrg case PIPE_SWIZZLE_W: 15601e04c3fSmrg return srcs[swiz]; 15701e04c3fSmrg } 158af69d88dSmrg} 159af69d88dSmrg 16001e04c3fSmrgstatic struct qreg * 16101e04c3fSmrgntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def) 162af69d88dSmrg{ 16301e04c3fSmrg struct qreg *qregs = ralloc_array(c->def_ht, struct qreg, 16401e04c3fSmrg def->num_components); 16501e04c3fSmrg _mesa_hash_table_insert(c->def_ht, def, qregs); 16601e04c3fSmrg return qregs; 167af69d88dSmrg} 168af69d88dSmrg 16901e04c3fSmrg/** 17001e04c3fSmrg * This function is responsible for getting QIR results into the associated 17101e04c3fSmrg * storage for a NIR instruction. 17201e04c3fSmrg * 17301e04c3fSmrg * If it's a NIR SSA def, then we just set the associated hash table entry to 17401e04c3fSmrg * the new result. 
17501e04c3fSmrg * 17601e04c3fSmrg * If it's a NIR reg, then we need to update the existing qreg assigned to the 17701e04c3fSmrg * NIR destination with the incoming value. To do that without introducing 17801e04c3fSmrg * new MOVs, we require that the incoming qreg either be a uniform, or be 17901e04c3fSmrg * SSA-defined by the previous QIR instruction in the block and rewritable by 18001e04c3fSmrg * this function. That lets us sneak ahead and insert the SF flag beforehand 18101e04c3fSmrg * (knowing that the previous instruction doesn't depend on flags) and rewrite 18201e04c3fSmrg * its destination to be the NIR reg's destination 18301e04c3fSmrg */ 18401e04c3fSmrgstatic void 18501e04c3fSmrgntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan, 18601e04c3fSmrg struct qreg result) 187af69d88dSmrg{ 18801e04c3fSmrg struct qinst *last_inst = NULL; 1897ec681f3Smrg if (!list_is_empty(&c->cur_block->instructions)) 19001e04c3fSmrg last_inst = (struct qinst *)c->cur_block->instructions.prev; 191af69d88dSmrg 19201e04c3fSmrg assert(result.file == QFILE_UNIF || 19301e04c3fSmrg (result.file == QFILE_TEMP && 19401e04c3fSmrg last_inst && last_inst == c->defs[result.index])); 195af69d88dSmrg 19601e04c3fSmrg if (dest->is_ssa) { 19701e04c3fSmrg assert(chan < dest->ssa.num_components); 198af69d88dSmrg 19901e04c3fSmrg struct qreg *qregs; 20001e04c3fSmrg struct hash_entry *entry = 20101e04c3fSmrg _mesa_hash_table_search(c->def_ht, &dest->ssa); 202af69d88dSmrg 20301e04c3fSmrg if (entry) 20401e04c3fSmrg qregs = entry->data; 20501e04c3fSmrg else 20601e04c3fSmrg qregs = ntq_init_ssa_def(c, &dest->ssa); 207af69d88dSmrg 20801e04c3fSmrg qregs[chan] = result; 20901e04c3fSmrg } else { 21001e04c3fSmrg nir_register *reg = dest->reg.reg; 21101e04c3fSmrg assert(dest->reg.base_offset == 0); 21201e04c3fSmrg assert(reg->num_array_elems == 0); 21301e04c3fSmrg struct hash_entry *entry = 21401e04c3fSmrg _mesa_hash_table_search(c->def_ht, reg); 21501e04c3fSmrg struct qreg *qregs = entry->data; 
21601e04c3fSmrg 21701e04c3fSmrg /* Insert a MOV if the source wasn't an SSA def in the 21801e04c3fSmrg * previous instruction. 21901e04c3fSmrg */ 22001e04c3fSmrg if (result.file == QFILE_UNIF) { 22101e04c3fSmrg result = qir_MOV(c, result); 22201e04c3fSmrg last_inst = c->defs[result.index]; 22301e04c3fSmrg } 224af69d88dSmrg 22501e04c3fSmrg /* We know they're both temps, so just rewrite index. */ 22601e04c3fSmrg c->defs[last_inst->dst.index] = NULL; 22701e04c3fSmrg last_inst->dst.index = qregs[chan].index; 22801e04c3fSmrg 22901e04c3fSmrg /* If we're in control flow, then make this update of the reg 23001e04c3fSmrg * conditional on the execution mask. 23101e04c3fSmrg */ 23201e04c3fSmrg if (c->execute.file != QFILE_NULL) { 23301e04c3fSmrg last_inst->dst.index = qregs[chan].index; 23401e04c3fSmrg 23501e04c3fSmrg /* Set the flags to the current exec mask. To insert 23601e04c3fSmrg * the SF, we temporarily remove our SSA instruction. 23701e04c3fSmrg */ 23801e04c3fSmrg list_del(&last_inst->link); 23901e04c3fSmrg qir_SF(c, c->execute); 24001e04c3fSmrg list_addtail(&last_inst->link, 24101e04c3fSmrg &c->cur_block->instructions); 24201e04c3fSmrg 24301e04c3fSmrg last_inst->cond = QPU_COND_ZS; 24401e04c3fSmrg last_inst->cond_is_exec_mask = true; 24501e04c3fSmrg } 24601e04c3fSmrg } 24701e04c3fSmrg} 248af69d88dSmrg 24901e04c3fSmrgstatic struct qreg * 25001e04c3fSmrgntq_get_dest(struct vc4_compile *c, nir_dest *dest) 25101e04c3fSmrg{ 25201e04c3fSmrg if (dest->is_ssa) { 25301e04c3fSmrg struct qreg *qregs = ntq_init_ssa_def(c, &dest->ssa); 25401e04c3fSmrg for (int i = 0; i < dest->ssa.num_components; i++) 25501e04c3fSmrg qregs[i] = c->undef; 25601e04c3fSmrg return qregs; 25701e04c3fSmrg } else { 25801e04c3fSmrg nir_register *reg = dest->reg.reg; 25901e04c3fSmrg assert(dest->reg.base_offset == 0); 26001e04c3fSmrg assert(reg->num_array_elems == 0); 26101e04c3fSmrg struct hash_entry *entry = 26201e04c3fSmrg _mesa_hash_table_search(c->def_ht, reg); 26301e04c3fSmrg return entry->data; 
26401e04c3fSmrg } 26501e04c3fSmrg} 266af69d88dSmrg 26701e04c3fSmrgstatic struct qreg 26801e04c3fSmrgntq_get_src(struct vc4_compile *c, nir_src src, int i) 269af69d88dSmrg{ 27001e04c3fSmrg struct hash_entry *entry; 27101e04c3fSmrg if (src.is_ssa) { 27201e04c3fSmrg entry = _mesa_hash_table_search(c->def_ht, src.ssa); 27301e04c3fSmrg assert(i < src.ssa->num_components); 27401e04c3fSmrg } else { 27501e04c3fSmrg nir_register *reg = src.reg.reg; 27601e04c3fSmrg entry = _mesa_hash_table_search(c->def_ht, reg); 27701e04c3fSmrg assert(reg->num_array_elems == 0); 27801e04c3fSmrg assert(src.reg.base_offset == 0); 27901e04c3fSmrg assert(i < reg->num_components); 28001e04c3fSmrg } 281af69d88dSmrg 28201e04c3fSmrg struct qreg *qregs = entry->data; 28301e04c3fSmrg return qregs[i]; 28401e04c3fSmrg} 285af69d88dSmrg 28601e04c3fSmrgstatic struct qreg 28701e04c3fSmrgntq_get_alu_src(struct vc4_compile *c, nir_alu_instr *instr, 28801e04c3fSmrg unsigned src) 28901e04c3fSmrg{ 29001e04c3fSmrg assert(util_is_power_of_two_or_zero(instr->dest.write_mask)); 29101e04c3fSmrg unsigned chan = ffs(instr->dest.write_mask) - 1; 29201e04c3fSmrg struct qreg r = ntq_get_src(c, instr->src[src].src, 29301e04c3fSmrg instr->src[src].swizzle[chan]); 29401e04c3fSmrg 29501e04c3fSmrg assert(!instr->src[src].abs); 29601e04c3fSmrg assert(!instr->src[src].negate); 29701e04c3fSmrg 29801e04c3fSmrg return r; 299af69d88dSmrg}; 300af69d88dSmrg 30101e04c3fSmrgstatic inline struct qreg 30201e04c3fSmrgqir_SAT(struct vc4_compile *c, struct qreg val) 303af69d88dSmrg{ 30401e04c3fSmrg return qir_FMAX(c, 30501e04c3fSmrg qir_FMIN(c, val, qir_uniform_f(c, 1.0)), 30601e04c3fSmrg qir_uniform_f(c, 0.0)); 307af69d88dSmrg} 308af69d88dSmrg 309af69d88dSmrgstatic struct qreg 31001e04c3fSmrgntq_rcp(struct vc4_compile *c, struct qreg x) 311af69d88dSmrg{ 31201e04c3fSmrg struct qreg r = qir_RCP(c, x); 31301e04c3fSmrg 31401e04c3fSmrg /* Apply a Newton-Raphson step to improve the accuracy. 
*/ 31501e04c3fSmrg r = qir_FMUL(c, r, qir_FSUB(c, 31601e04c3fSmrg qir_uniform_f(c, 2.0), 31701e04c3fSmrg qir_FMUL(c, x, r))); 31801e04c3fSmrg 31901e04c3fSmrg return r; 320af69d88dSmrg} 321af69d88dSmrg 322af69d88dSmrgstatic struct qreg 32301e04c3fSmrgntq_rsq(struct vc4_compile *c, struct qreg x) 324af69d88dSmrg{ 32501e04c3fSmrg struct qreg r = qir_RSQ(c, x); 32601e04c3fSmrg 32701e04c3fSmrg /* Apply a Newton-Raphson step to improve the accuracy. */ 32801e04c3fSmrg r = qir_FMUL(c, r, qir_FSUB(c, 32901e04c3fSmrg qir_uniform_f(c, 1.5), 33001e04c3fSmrg qir_FMUL(c, 33101e04c3fSmrg qir_uniform_f(c, 0.5), 33201e04c3fSmrg qir_FMUL(c, x, 33301e04c3fSmrg qir_FMUL(c, r, r))))); 33401e04c3fSmrg 33501e04c3fSmrg return r; 336af69d88dSmrg} 337af69d88dSmrg 338af69d88dSmrgstatic struct qreg 33901e04c3fSmrgntq_umul(struct vc4_compile *c, struct qreg src0, struct qreg src1) 34001e04c3fSmrg{ 34101e04c3fSmrg struct qreg src0_hi = qir_SHR(c, src0, 34201e04c3fSmrg qir_uniform_ui(c, 24)); 34301e04c3fSmrg struct qreg src1_hi = qir_SHR(c, src1, 34401e04c3fSmrg qir_uniform_ui(c, 24)); 34501e04c3fSmrg 34601e04c3fSmrg struct qreg hilo = qir_MUL24(c, src0_hi, src1); 34701e04c3fSmrg struct qreg lohi = qir_MUL24(c, src0, src1_hi); 34801e04c3fSmrg struct qreg lolo = qir_MUL24(c, src0, src1); 34901e04c3fSmrg 35001e04c3fSmrg return qir_ADD(c, lolo, qir_SHL(c, 35101e04c3fSmrg qir_ADD(c, hilo, lohi), 35201e04c3fSmrg qir_uniform_ui(c, 24))); 353af69d88dSmrg} 354af69d88dSmrg 355af69d88dSmrgstatic struct qreg 35601e04c3fSmrgntq_scale_depth_texture(struct vc4_compile *c, struct qreg src) 35701e04c3fSmrg{ 35801e04c3fSmrg struct qreg depthf = qir_ITOF(c, qir_SHR(c, src, 35901e04c3fSmrg qir_uniform_ui(c, 8))); 36001e04c3fSmrg return qir_FMUL(c, depthf, qir_uniform_f(c, 1.0f/0xffffff)); 36101e04c3fSmrg} 362af69d88dSmrg 36301e04c3fSmrg/** 36401e04c3fSmrg * Emits a lowered TXF_MS from an MSAA texture. 
36501e04c3fSmrg * 36601e04c3fSmrg * The addressing math has been lowered in NIR, and now we just need to read 36701e04c3fSmrg * it like a UBO. 36801e04c3fSmrg */ 36901e04c3fSmrgstatic void 37001e04c3fSmrgntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr) 37101e04c3fSmrg{ 37201e04c3fSmrg uint32_t tile_width = 32; 37301e04c3fSmrg uint32_t tile_height = 32; 37401e04c3fSmrg uint32_t tile_size = (tile_height * tile_width * 37501e04c3fSmrg VC4_MAX_SAMPLES * sizeof(uint32_t)); 37601e04c3fSmrg 37701e04c3fSmrg unsigned unit = instr->texture_index; 37801e04c3fSmrg uint32_t w = align(c->key->tex[unit].msaa_width, tile_width); 37901e04c3fSmrg uint32_t w_tiles = w / tile_width; 38001e04c3fSmrg uint32_t h = align(c->key->tex[unit].msaa_height, tile_height); 38101e04c3fSmrg uint32_t h_tiles = h / tile_height; 38201e04c3fSmrg uint32_t size = w_tiles * h_tiles * tile_size; 38301e04c3fSmrg 38401e04c3fSmrg struct qreg addr; 38501e04c3fSmrg assert(instr->num_srcs == 1); 38601e04c3fSmrg assert(instr->src[0].src_type == nir_tex_src_coord); 38701e04c3fSmrg addr = ntq_get_src(c, instr->src[0].src, 0); 38801e04c3fSmrg 38901e04c3fSmrg /* Perform the clamping required by kernel validation. 
*/ 39001e04c3fSmrg addr = qir_MAX(c, addr, qir_uniform_ui(c, 0)); 39101e04c3fSmrg addr = qir_MIN_NOIMM(c, addr, qir_uniform_ui(c, size - 4)); 39201e04c3fSmrg 39301e04c3fSmrg qir_ADD_dest(c, qir_reg(QFILE_TEX_S_DIRECT, 0), 39401e04c3fSmrg addr, qir_uniform(c, QUNIFORM_TEXTURE_MSAA_ADDR, unit)); 39501e04c3fSmrg 39601e04c3fSmrg ntq_emit_thrsw(c); 39701e04c3fSmrg 39801e04c3fSmrg struct qreg tex = qir_TEX_RESULT(c); 39901e04c3fSmrg c->num_texture_samples++; 40001e04c3fSmrg 40101e04c3fSmrg enum pipe_format format = c->key->tex[unit].format; 40201e04c3fSmrg if (util_format_is_depth_or_stencil(format)) { 40301e04c3fSmrg struct qreg scaled = ntq_scale_depth_texture(c, tex); 40401e04c3fSmrg for (int i = 0; i < 4; i++) 40501e04c3fSmrg ntq_store_dest(c, &instr->dest, i, qir_MOV(c, scaled)); 40601e04c3fSmrg } else { 40701e04c3fSmrg for (int i = 0; i < 4; i++) 40801e04c3fSmrg ntq_store_dest(c, &instr->dest, i, 40901e04c3fSmrg qir_UNPACK_8_F(c, tex, i)); 41001e04c3fSmrg } 411af69d88dSmrg} 412af69d88dSmrg 413af69d88dSmrgstatic void 41401e04c3fSmrgntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr) 415af69d88dSmrg{ 41601e04c3fSmrg struct qreg s, t, r, lod, compare; 41701e04c3fSmrg bool is_txb = false, is_txl = false; 41801e04c3fSmrg unsigned unit = instr->texture_index; 41901e04c3fSmrg 42001e04c3fSmrg if (instr->op == nir_texop_txf) { 42101e04c3fSmrg ntq_emit_txf(c, instr); 42201e04c3fSmrg return; 42301e04c3fSmrg } 424af69d88dSmrg 42501e04c3fSmrg for (unsigned i = 0; i < instr->num_srcs; i++) { 42601e04c3fSmrg switch (instr->src[i].src_type) { 42701e04c3fSmrg case nir_tex_src_coord: 42801e04c3fSmrg s = ntq_get_src(c, instr->src[i].src, 0); 42901e04c3fSmrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D) 43001e04c3fSmrg t = qir_uniform_f(c, 0.5); 43101e04c3fSmrg else 43201e04c3fSmrg t = ntq_get_src(c, instr->src[i].src, 1); 43301e04c3fSmrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) 43401e04c3fSmrg r = ntq_get_src(c, instr->src[i].src, 2); 43501e04c3fSmrg break; 
43601e04c3fSmrg case nir_tex_src_bias: 43701e04c3fSmrg lod = ntq_get_src(c, instr->src[i].src, 0); 43801e04c3fSmrg is_txb = true; 43901e04c3fSmrg break; 44001e04c3fSmrg case nir_tex_src_lod: 44101e04c3fSmrg lod = ntq_get_src(c, instr->src[i].src, 0); 44201e04c3fSmrg is_txl = true; 44301e04c3fSmrg break; 44401e04c3fSmrg case nir_tex_src_comparator: 44501e04c3fSmrg compare = ntq_get_src(c, instr->src[i].src, 0); 44601e04c3fSmrg break; 44701e04c3fSmrg default: 44801e04c3fSmrg unreachable("unknown texture source"); 44901e04c3fSmrg } 45001e04c3fSmrg } 451af69d88dSmrg 45201e04c3fSmrg if (c->stage != QSTAGE_FRAG && !is_txl) { 45301e04c3fSmrg /* From the GLSL 1.20 spec: 45401e04c3fSmrg * 45501e04c3fSmrg * "If it is mip-mapped and running on the vertex shader, 45601e04c3fSmrg * then the base texture is used." 45701e04c3fSmrg */ 45801e04c3fSmrg is_txl = true; 45901e04c3fSmrg lod = qir_uniform_ui(c, 0); 46001e04c3fSmrg } 461af69d88dSmrg 46201e04c3fSmrg if (c->key->tex[unit].force_first_level) { 46301e04c3fSmrg lod = qir_uniform(c, QUNIFORM_TEXTURE_FIRST_LEVEL, unit); 46401e04c3fSmrg is_txl = true; 46501e04c3fSmrg is_txb = false; 466af69d88dSmrg } 467af69d88dSmrg 46801e04c3fSmrg struct qreg texture_u[] = { 46901e04c3fSmrg qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P0, unit), 47001e04c3fSmrg qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P1, unit), 47101e04c3fSmrg qir_uniform(c, QUNIFORM_CONSTANT, 0), 47201e04c3fSmrg qir_uniform(c, QUNIFORM_CONSTANT, 0), 47301e04c3fSmrg }; 47401e04c3fSmrg uint32_t next_texture_u = 0; 47501e04c3fSmrg 47601e04c3fSmrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE || is_txl) { 47701e04c3fSmrg texture_u[2] = qir_uniform(c, QUNIFORM_TEXTURE_CONFIG_P2, 47801e04c3fSmrg unit | (is_txl << 16)); 479af69d88dSmrg } 480af69d88dSmrg 48101e04c3fSmrg struct qinst *tmu; 48201e04c3fSmrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 48301e04c3fSmrg tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), r); 48401e04c3fSmrg tmu->src[qir_get_tex_uniform_src(tmu)] = 
48501e04c3fSmrg texture_u[next_texture_u++]; 48601e04c3fSmrg } else if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 48701e04c3fSmrg c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP || 48801e04c3fSmrg c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP_TO_BORDER || 48901e04c3fSmrg c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 49001e04c3fSmrg tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_R, 0), 49101e04c3fSmrg qir_uniform(c, QUNIFORM_TEXTURE_BORDER_COLOR, 49201e04c3fSmrg unit)); 49301e04c3fSmrg tmu->src[qir_get_tex_uniform_src(tmu)] = 49401e04c3fSmrg texture_u[next_texture_u++]; 49501e04c3fSmrg } 496af69d88dSmrg 49701e04c3fSmrg if (c->key->tex[unit].wrap_s == PIPE_TEX_WRAP_CLAMP) { 49801e04c3fSmrg s = qir_SAT(c, s); 49901e04c3fSmrg } 500af69d88dSmrg 50101e04c3fSmrg if (c->key->tex[unit].wrap_t == PIPE_TEX_WRAP_CLAMP) { 50201e04c3fSmrg t = qir_SAT(c, t); 50301e04c3fSmrg } 504af69d88dSmrg 50501e04c3fSmrg tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_T, 0), t); 50601e04c3fSmrg tmu->src[qir_get_tex_uniform_src(tmu)] = 50701e04c3fSmrg texture_u[next_texture_u++]; 508af69d88dSmrg 50901e04c3fSmrg if (is_txl || is_txb) { 51001e04c3fSmrg tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_B, 0), lod); 51101e04c3fSmrg tmu->src[qir_get_tex_uniform_src(tmu)] = 51201e04c3fSmrg texture_u[next_texture_u++]; 513af69d88dSmrg } 514af69d88dSmrg 51501e04c3fSmrg tmu = qir_MOV_dest(c, qir_reg(QFILE_TEX_S, 0), s); 51601e04c3fSmrg tmu->src[qir_get_tex_uniform_src(tmu)] = texture_u[next_texture_u++]; 51701e04c3fSmrg 51801e04c3fSmrg c->num_texture_samples++; 51901e04c3fSmrg 52001e04c3fSmrg ntq_emit_thrsw(c); 52101e04c3fSmrg 52201e04c3fSmrg struct qreg tex = qir_TEX_RESULT(c); 52301e04c3fSmrg 52401e04c3fSmrg enum pipe_format format = c->key->tex[unit].format; 52501e04c3fSmrg 52601e04c3fSmrg struct qreg *dest = ntq_get_dest(c, &instr->dest); 52701e04c3fSmrg if (util_format_is_depth_or_stencil(format)) { 52801e04c3fSmrg struct qreg normalized = ntq_scale_depth_texture(c, tex); 52901e04c3fSmrg struct 
qreg depth_output; 53001e04c3fSmrg 53101e04c3fSmrg struct qreg u0 = qir_uniform_f(c, 0.0f); 53201e04c3fSmrg struct qreg u1 = qir_uniform_f(c, 1.0f); 53301e04c3fSmrg if (c->key->tex[unit].compare_mode) { 53401e04c3fSmrg /* From the GL_ARB_shadow spec: 53501e04c3fSmrg * 53601e04c3fSmrg * "Let Dt (D subscript t) be the depth texture 53701e04c3fSmrg * value, in the range [0, 1]. Let R be the 53801e04c3fSmrg * interpolated texture coordinate clamped to the 53901e04c3fSmrg * range [0, 1]." 54001e04c3fSmrg */ 54101e04c3fSmrg compare = qir_SAT(c, compare); 54201e04c3fSmrg 54301e04c3fSmrg switch (c->key->tex[unit].compare_func) { 54401e04c3fSmrg case PIPE_FUNC_NEVER: 54501e04c3fSmrg depth_output = qir_uniform_f(c, 0.0f); 54601e04c3fSmrg break; 54701e04c3fSmrg case PIPE_FUNC_ALWAYS: 54801e04c3fSmrg depth_output = u1; 54901e04c3fSmrg break; 55001e04c3fSmrg case PIPE_FUNC_EQUAL: 55101e04c3fSmrg qir_SF(c, qir_FSUB(c, compare, normalized)); 55201e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_ZS, u1, u0); 55301e04c3fSmrg break; 55401e04c3fSmrg case PIPE_FUNC_NOTEQUAL: 55501e04c3fSmrg qir_SF(c, qir_FSUB(c, compare, normalized)); 55601e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_ZC, u1, u0); 55701e04c3fSmrg break; 55801e04c3fSmrg case PIPE_FUNC_GREATER: 55901e04c3fSmrg qir_SF(c, qir_FSUB(c, compare, normalized)); 56001e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 56101e04c3fSmrg break; 56201e04c3fSmrg case PIPE_FUNC_GEQUAL: 56301e04c3fSmrg qir_SF(c, qir_FSUB(c, normalized, compare)); 56401e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 56501e04c3fSmrg break; 56601e04c3fSmrg case PIPE_FUNC_LESS: 56701e04c3fSmrg qir_SF(c, qir_FSUB(c, compare, normalized)); 56801e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_NS, u1, u0); 56901e04c3fSmrg break; 57001e04c3fSmrg case PIPE_FUNC_LEQUAL: 57101e04c3fSmrg qir_SF(c, qir_FSUB(c, normalized, compare)); 57201e04c3fSmrg depth_output = qir_SEL(c, QPU_COND_NC, u1, u0); 57301e04c3fSmrg break; 57401e04c3fSmrg } 57501e04c3fSmrg } 
else { 57601e04c3fSmrg depth_output = normalized; 57701e04c3fSmrg } 578af69d88dSmrg 57901e04c3fSmrg for (int i = 0; i < 4; i++) 58001e04c3fSmrg dest[i] = depth_output; 58101e04c3fSmrg } else { 58201e04c3fSmrg for (int i = 0; i < 4; i++) 58301e04c3fSmrg dest[i] = qir_UNPACK_8_F(c, tex, i); 58401e04c3fSmrg } 585af69d88dSmrg} 586af69d88dSmrg 587af69d88dSmrg/** 588af69d88dSmrg * Computes x - floor(x), which is tricky because our FTOI truncates (rounds 589af69d88dSmrg * to zero). 590af69d88dSmrg */ 591af69d88dSmrgstatic struct qreg 59201e04c3fSmrgntq_ffract(struct vc4_compile *c, struct qreg src) 593af69d88dSmrg{ 59401e04c3fSmrg struct qreg trunc = qir_ITOF(c, qir_FTOI(c, src)); 59501e04c3fSmrg struct qreg diff = qir_FSUB(c, src, trunc); 59601e04c3fSmrg qir_SF(c, diff); 59701e04c3fSmrg 59801e04c3fSmrg qir_FADD_dest(c, diff, 59901e04c3fSmrg diff, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 60001e04c3fSmrg 60101e04c3fSmrg return qir_MOV(c, diff); 602af69d88dSmrg} 603af69d88dSmrg 604af69d88dSmrg/** 605af69d88dSmrg * Computes floor(x), which is tricky because our FTOI truncates (rounds to 606af69d88dSmrg * zero). 607af69d88dSmrg */ 608af69d88dSmrgstatic struct qreg 60901e04c3fSmrgntq_ffloor(struct vc4_compile *c, struct qreg src) 610af69d88dSmrg{ 61101e04c3fSmrg struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 612af69d88dSmrg 61301e04c3fSmrg /* This will be < 0 if we truncated and the truncation was of a value 61401e04c3fSmrg * that was < 0 in the first place. 61501e04c3fSmrg */ 61601e04c3fSmrg qir_SF(c, qir_FSUB(c, src, result)); 617af69d88dSmrg 61801e04c3fSmrg struct qinst *sub = qir_FSUB_dest(c, result, 61901e04c3fSmrg result, qir_uniform_f(c, 1.0)); 62001e04c3fSmrg sub->cond = QPU_COND_NS; 621af69d88dSmrg 62201e04c3fSmrg return qir_MOV(c, result); 623af69d88dSmrg} 624af69d88dSmrg 62501e04c3fSmrg/** 62601e04c3fSmrg * Computes ceil(x), which is tricky because our FTOI truncates (rounds to 62701e04c3fSmrg * zero). 
62801e04c3fSmrg */ 629af69d88dSmrgstatic struct qreg 63001e04c3fSmrgntq_fceil(struct vc4_compile *c, struct qreg src) 631af69d88dSmrg{ 63201e04c3fSmrg struct qreg result = qir_ITOF(c, qir_FTOI(c, src)); 633af69d88dSmrg 63401e04c3fSmrg /* This will be < 0 if we truncated and the truncation was of a value 63501e04c3fSmrg * that was > 0 in the first place. 63601e04c3fSmrg */ 63701e04c3fSmrg qir_SF(c, qir_FSUB(c, result, src)); 63801e04c3fSmrg 63901e04c3fSmrg qir_FADD_dest(c, result, 64001e04c3fSmrg result, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 64101e04c3fSmrg 64201e04c3fSmrg return qir_MOV(c, result); 643af69d88dSmrg} 644af69d88dSmrg 645af69d88dSmrgstatic struct qreg 64601e04c3fSmrgntq_shrink_sincos_input_range(struct vc4_compile *c, struct qreg x) 647af69d88dSmrg{ 64801e04c3fSmrg /* Since we're using a Taylor approximation, we want to have a small 64901e04c3fSmrg * number of coefficients and take advantage of sin/cos repeating 65001e04c3fSmrg * every 2pi. We keep our x as close to 0 as we can, since the series 65101e04c3fSmrg * will be less accurate as |x| increases. (Also, be careful of 65201e04c3fSmrg * shifting the input x value to be tricky with sin/cos relations, 65301e04c3fSmrg * because getting accurate values for x==0 is very important for SDL 65401e04c3fSmrg * rendering) 65501e04c3fSmrg */ 65601e04c3fSmrg struct qreg scaled_x = 65701e04c3fSmrg qir_FMUL(c, x, 65801e04c3fSmrg qir_uniform_f(c, 1.0f / (M_PI * 2.0f))); 65901e04c3fSmrg /* Note: FTOI truncates toward 0. 
*/ 66001e04c3fSmrg struct qreg x_frac = qir_FSUB(c, scaled_x, 66101e04c3fSmrg qir_ITOF(c, qir_FTOI(c, scaled_x))); 66201e04c3fSmrg /* Map [0.5, 1] to [-0.5, 0] */ 66301e04c3fSmrg qir_SF(c, qir_FSUB(c, x_frac, qir_uniform_f(c, 0.5))); 66401e04c3fSmrg qir_FSUB_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NC; 66501e04c3fSmrg /* Map [-1, -0.5] to [0, 0.5] */ 66601e04c3fSmrg qir_SF(c, qir_FADD(c, x_frac, qir_uniform_f(c, 0.5))); 66701e04c3fSmrg qir_FADD_dest(c, x_frac, x_frac, qir_uniform_f(c, 1.0))->cond = QPU_COND_NS; 66801e04c3fSmrg 66901e04c3fSmrg return x_frac; 670af69d88dSmrg} 671af69d88dSmrg 672af69d88dSmrgstatic struct qreg 67301e04c3fSmrgntq_fsin(struct vc4_compile *c, struct qreg src) 674af69d88dSmrg{ 675af69d88dSmrg float coeff[] = { 676af69d88dSmrg 2.0 * M_PI, 677af69d88dSmrg -pow(2.0 * M_PI, 3) / (3 * 2 * 1), 678af69d88dSmrg pow(2.0 * M_PI, 5) / (5 * 4 * 3 * 2 * 1), 679af69d88dSmrg -pow(2.0 * M_PI, 7) / (7 * 6 * 5 * 4 * 3 * 2 * 1), 68001e04c3fSmrg pow(2.0 * M_PI, 9) / (9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1), 681af69d88dSmrg }; 682af69d88dSmrg 68301e04c3fSmrg struct qreg x = ntq_shrink_sincos_input_range(c, src); 684af69d88dSmrg struct qreg x2 = qir_FMUL(c, x, x); 68501e04c3fSmrg struct qreg sum = qir_FMUL(c, x, qir_uniform_f(c, coeff[0])); 686af69d88dSmrg for (int i = 1; i < ARRAY_SIZE(coeff); i++) { 687af69d88dSmrg x = qir_FMUL(c, x, x2); 688af69d88dSmrg sum = qir_FADD(c, 689af69d88dSmrg sum, 690af69d88dSmrg qir_FMUL(c, 691af69d88dSmrg x, 69201e04c3fSmrg qir_uniform_f(c, coeff[i]))); 693af69d88dSmrg } 694af69d88dSmrg return sum; 695af69d88dSmrg} 696af69d88dSmrg 697af69d88dSmrgstatic struct qreg 69801e04c3fSmrgntq_fcos(struct vc4_compile *c, struct qreg src) 699af69d88dSmrg{ 700af69d88dSmrg float coeff[] = { 701af69d88dSmrg 1.0f, 702af69d88dSmrg -pow(2.0 * M_PI, 2) / (2 * 1), 703af69d88dSmrg pow(2.0 * M_PI, 4) / (4 * 3 * 2 * 1), 704af69d88dSmrg -pow(2.0 * M_PI, 6) / (6 * 5 * 4 * 3 * 2 * 1), 70501e04c3fSmrg pow(2.0 * M_PI, 8) / (8 * 7 * 6 * 5 
 * 4 * 3 * 2 * 1),
                -pow(2.0 * M_PI, 10) / (10 * 9 * 8 * 7 * 6 * 5 * 4 * 3 * 2 * 1),
        };

        struct qreg x_frac = ntq_shrink_sincos_input_range(c, src);
        struct qreg sum = qir_uniform_f(c, coeff[0]);
        struct qreg x2 = qir_FMUL(c, x_frac, x_frac);
        struct qreg x = x2; /* Current x^2, x^4, or x^6 */
        for (int i = 1; i < ARRAY_SIZE(coeff); i++) {
                if (i != 1)
                        x = qir_FMUL(c, x, x2);

                sum = qir_FADD(c, qir_FMUL(c,
                                           x,
                                           qir_uniform_f(c, coeff[i])),
                               sum);
        }
        return sum;
}

/**
 * Emits sign(src): 0.0, 1.0, or -1.0.
 *
 * Sets flags on src, then starts from 0.0 and conditionally overwrites the
 * temp with 1.0 (Z clear, i.e. src != 0) and then -1.0 (N set, i.e. src
 * negative), so a negative src ends up at -1.0.
 */
static struct qreg
ntq_fsign(struct vc4_compile *c, struct qreg src)
{
        struct qreg t = qir_get_temp(c);

        qir_SF(c, src);
        qir_MOV_dest(c, t, qir_uniform_f(c, 0.0));
        qir_MOV_dest(c, t, qir_uniform_f(c, 1.0))->cond = QPU_COND_ZC;
        qir_MOV_dest(c, t, qir_uniform_f(c, -1.0))->cond = QPU_COND_NS;
        return qir_MOV(c, t);
}

/**
 * Emits VPM reads for one vertex attribute.
 *
 * The attribute's format comes from the VS compile key; its size is rounded
 * up to a whole number of 32-bit VPM words, and one MOV from the VPM is
 * emitted per word.
 */
static void
emit_vertex_input(struct vc4_compile *c, int attr)
{
        enum pipe_format format = c->vs_key->attr_formats[attr];
        uint32_t attr_size = util_format_get_blocksize(format);

        c->vattr_sizes[attr] = align(attr_size, 4);
        for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                c->inputs[attr * 4 + i] =
                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
                c->num_inputs++;
        }
}

/**
 * Sets up the four gl_FragCoord components for FS input slot "attr".
 *
 * X/Y come from the integer fragment coordinate registers (converted to
 * float), Z is the 24-bit fixed-point fragment Z rescaled to [0, 1], and W
 * is the reciprocal of the interpolation W register.
 */
static void
emit_fragcoord_input(struct vc4_compile *c, int attr)
{
        c->inputs[attr * 4 + 0] = qir_ITOF(c, qir_reg(QFILE_FRAG_X, 0));
        c->inputs[attr * 4 + 1] = qir_ITOF(c, qir_reg(QFILE_FRAG_Y, 0));
        c->inputs[attr * 4 + 2] =
                qir_FMUL(c,
                         qir_ITOF(c, qir_FRAG_Z(c)),
                         qir_uniform_f(c, 1.0 / 0xffffff));
        c->inputs[attr * 4 + 3] = qir_RCP(c, qir_FRAG_W(c));
}

/**
 * Emits the load of one component of a fragment shader input varying.
 *
 * Allocates the next entry in c->input_slots (growing the array by doubling,
 * minimum 4 entries), records which (slot, swizzle) this varying index
 * corresponds to, and emits the interpolation: the raw varying is multiplied
 * by W and the C coefficient is added (qir_VARY_ADD_C).
 */
static struct qreg
emit_fragment_varying(struct vc4_compile *c, gl_varying_slot slot,
                      uint8_t swizzle)
{
        uint32_t i = c->num_input_slots++;
        struct qreg vary = {
                QFILE_VARY,
                i
        };

        if (c->num_input_slots >= c->input_slots_array_size) {
                c->input_slots_array_size =
                        MAX2(4, c->input_slots_array_size * 2);

                c->input_slots = reralloc(c, c->input_slots,
                                          struct vc4_varying_slot,
                                          c->input_slots_array_size);
        }

        c->input_slots[i].slot = slot;
        c->input_slots[i].swizzle = swizzle;

        return qir_VARY_ADD_C(c, qir_FMUL(c, vary, qir_FRAG_W(c)));
}

/** Emits all four components of an FS input varying for slot "slot". */
static void
emit_fragment_input(struct vc4_compile *c, int attr, gl_varying_slot slot)
{
        for (int i = 0; i < 4; i++) {
                c->inputs[attr * 4 + i] =
                        emit_fragment_varying(c, slot, i);
                c->num_inputs++;
        }
}

/**
 * Records one output component at index decl_offset.
 *
 * Grows c->outputs as needed and keeps the parallel c->output_slots array
 * (mapping output index -> (slot, swizzle)) sized in lockstep with it.
 */
static void
add_output(struct vc4_compile *c,
           uint32_t decl_offset,
           uint8_t slot,
           uint8_t swizzle)
{
        uint32_t old_array_size = c->outputs_array_size;
        resize_qreg_array(c, &c->outputs, &c->outputs_array_size,
                          decl_offset + 1);

        if (old_array_size != c->outputs_array_size) {
                c->output_slots = reralloc(c,
                                           c->output_slots,
                                           struct vc4_varying_slot,
                                           c->outputs_array_size);
        }

        c->output_slots[decl_offset].slot = slot;
        c->output_slots[decl_offset].swizzle = swizzle;
}

/**
 * Returns true if this src is the single use of its SSA def: it is
 * SSA-backed, the def has no if-condition uses, and the def's use list
 * contains exactly this src's use link.
 */
static bool
ntq_src_is_only_ssa_def_user(nir_src *src)
{
        if (!src->is_ssa)
                return false;

        if (!list_is_empty(&src->ssa->if_uses))
                return false;

        return (src->ssa->uses.next == &src->use_link &&
                src->ssa->uses.next->next == &src->ssa->uses);
}

/**
 * In general, emits a nir_pack_unorm_4x8 as a series of MOVs with the pack
 * bit set.
 *
 * However, as an optimization, it tries to find the instructions generating
 * the sources to be packed and just emit the pack flag there, if possible.
 */
static void
ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
{
        struct qreg result = qir_get_temp(c);
        struct nir_alu_instr *vec4 = NULL;

        /* If packing from a vec4 op (as expected), identify it so that we can
         * peek back at what generated its sources.
         */
        if (instr->src[0].src.is_ssa &&
            instr->src[0].src.ssa->parent_instr->type == nir_instr_type_alu &&
            nir_instr_as_alu(instr->src[0].src.ssa->parent_instr)->op ==
            nir_op_vec4) {
                vec4 = nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
        }

        /* If the pack is replicating the same channel 4 times, use the 8888
         * pack flag.  This is common for blending using the alpha
         * channel.
         */
        if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
            instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
            instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
                struct qreg rep = ntq_get_src(c,
                                              instr->src[0].src,
                                              instr->src[0].swizzle[0]);
                ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
                return;
        }

        for (int i = 0; i < 4; i++) {
                int swiz = instr->src[0].swizzle[i];
                struct qreg src;
                if (vec4) {
                        src = ntq_get_src(c, vec4->src[swiz].src,
                                          vec4->src[swiz].swizzle[0]);
                } else {
                        src = ntq_get_src(c, instr->src[0].src, swiz);
                }

                /* If this channel's only producer is a multiply with no pack
                 * flag yet, fold the byte pack into that multiply instead of
                 * emitting a separate pack MOV: retarget its dst to our
                 * result register with the per-byte MUL pack mode.
                 */
                if (vec4 &&
                    ntq_src_is_only_ssa_def_user(&vec4->src[swiz].src) &&
                    src.file == QFILE_TEMP &&
                    c->defs[src.index] &&
                    qir_is_mul(c->defs[src.index]) &&
                    !c->defs[src.index]->dst.pack) {
                        struct qinst *rewrite = c->defs[src.index];
                        c->defs[src.index] = NULL;
                        rewrite->dst = result;
                        rewrite->dst.pack = QPU_PACK_MUL_8A + i;
                        continue;
                }

                qir_PACK_8_F(c, result, src, i);
        }

        ntq_store_dest(c, &instr->dest.dest, 0, qir_MOV(c, result));
}

/** Handles sign-extended bitfield extracts for 16 bits.
 *
 * Only the cases the hardware unpack modes can express are supported: the
 * asserts require a constant-uniform width of exactly 16 and a 16-bit
 * aligned constant offset.
 */
static struct qreg
ntq_emit_ibfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
              struct qreg bits)
{
        assert(bits.file == QFILE_UNIF &&
               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
               c->uniform_data[bits.index] == 16);

        assert(offset.file == QFILE_UNIF &&
               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
        int offset_bit = c->uniform_data[offset.index];
        assert(offset_bit % 16 == 0);

        return qir_UNPACK_16_I(c, base, offset_bit / 16);
}

/** Handles unsigned bitfield extracts for 8 bits.
 *
 * As with ntq_emit_ibfe: constant width of exactly 8, byte-aligned constant
 * offset.
 */
static struct qreg
ntq_emit_ubfe(struct vc4_compile *c, struct qreg base, struct qreg offset,
              struct qreg bits)
{
        assert(bits.file == QFILE_UNIF &&
               c->uniform_contents[bits.index] == QUNIFORM_CONSTANT &&
               c->uniform_data[bits.index] == 8);

        assert(offset.file == QFILE_UNIF &&
               c->uniform_contents[offset.index] == QUNIFORM_CONSTANT);
        int offset_bit = c->uniform_data[offset.index];
        assert(offset_bit % 8 == 0);

        return qir_UNPACK_8_I(c, base, offset_bit / 8);
}

/**
 * If compare_instr is a valid comparison instruction, emits the
 * compare_instr's comparison and returns the sel_instr's return value based
 * on the compare_instr's result.
 */
static bool
ntq_emit_comparison(struct vc4_compile *c, struct qreg *dest,
                    nir_alu_instr *compare_instr,
                    nir_alu_instr *sel_instr)
{
        enum qpu_cond cond;

        /* Map the NIR comparison op to the QPU condition that is true after
         * flags are set on (src0 - src1).
         */
        switch (compare_instr->op) {
        case nir_op_feq32:
        case nir_op_ieq32:
        case nir_op_seq:
                cond = QPU_COND_ZS;
                break;
        case nir_op_fneu32:
        case nir_op_ine32:
        case nir_op_sne:
                cond = QPU_COND_ZC;
                break;
        case nir_op_fge32:
        case nir_op_ige32:
        case nir_op_uge32:
        case nir_op_sge:
                cond = QPU_COND_NC;
                break;
        case nir_op_flt32:
        case nir_op_ilt32:
        case nir_op_slt:
                cond = QPU_COND_NS;
                break;
        default:
                /* Not a comparison we can fold; caller falls back. */
                return false;
        }

        struct qreg src0 = ntq_get_alu_src(c, compare_instr, 0);
        struct qreg src1 = ntq_get_alu_src(c, compare_instr, 1);

        /* Set flags with a float or integer subtract depending on the
         * comparison's input type.
         */
        unsigned unsized_type =
                nir_alu_type_get_base_type(nir_op_infos[compare_instr->op].input_types[0]);
        if (unsized_type == nir_type_float)
                qir_SF(c, qir_FSUB(c, src0, src1));
        else
                qir_SF(c, qir_SUB(c, src0, src1));

        switch (sel_instr->op) {
        case nir_op_seq:
        case nir_op_sne:
        case nir_op_sge:
        case nir_op_slt:
                /* Legacy SET ops produce 1.0/0.0 floats. */
                *dest = qir_SEL(c, cond,
                                qir_uniform_f(c, 1.0), qir_uniform_f(c, 0.0));
                break;

        case nir_op_b32csel:
                *dest = qir_SEL(c, cond,
                                ntq_get_alu_src(c, sel_instr, 1),
                                ntq_get_alu_src(c, sel_instr, 2));
                break;

        default:
                /* Boolean result: ~0 for true, 0 for false. */
                *dest = qir_SEL(c, cond,
                                qir_uniform_ui(c, ~0), qir_uniform_ui(c, 0));
                break;
        }

        /* Make the temporary for nir_store_dest(). */
        *dest = qir_MOV(c, *dest);

        return true;
}

/**
 * Attempts to fold a comparison generating a boolean result into the
 * condition code for selecting between two values, instead of comparing the
 * boolean result against 0 to generate the condition code.
 */
static struct qreg ntq_emit_bcsel(struct vc4_compile *c, nir_alu_instr *instr,
                                  struct qreg *src)
{
        if (!instr->src[0].src.is_ssa)
                goto out;
        if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu)
                goto out;
        nir_alu_instr *compare =
                nir_instr_as_alu(instr->src[0].src.ssa->parent_instr);
        if (!compare)
                goto out;

        struct qreg dest;
        if (ntq_emit_comparison(c, &dest, compare, instr))
                return dest;

out:
        /* Fallback: test the boolean condition itself (NS is true for the
         * all-ones boolean) and select between the two sources.
         */
        qir_SF(c, src[0]);
        return qir_MOV(c, qir_SEL(c, QPU_COND_NS, src[1], src[2]));
}

/**
 * Emits the X screen-space derivative using MUL-pipe SIMD rotations to read
 * the horizontally adjacent pixel within each 2x2 quad.
 */
static struct qreg
ntq_fddx(struct vc4_compile *c, struct qreg src)
{
        /* Make sure that we have a bare temp to use for MUL rotation, so it
         * can be allocated to an accumulator.
         */
        if (src.pack || src.file != QFILE_TEMP)
                src = qir_MOV(c, src);

        struct qreg from_left = qir_ROT_MUL(c, src, 1);
        struct qreg from_right = qir_ROT_MUL(c, src, 15);

        /* Distinguish left/right pixels of the quad. */
        qir_SF(c, qir_AND(c, qir_reg(QFILE_QPU_ELEMENT, 0),
                          qir_uniform_ui(c, 1)));

        return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
                                  qir_FSUB(c, from_right, src),
                                  qir_FSUB(c, src, from_left)));
}

/**
 * Emits the Y screen-space derivative; same scheme as ntq_fddx but rotating
 * by 2/14 to reach the vertically adjacent pixel of the quad.
 */
static struct qreg
ntq_fddy(struct vc4_compile *c, struct qreg src)
{
        if (src.pack || src.file != QFILE_TEMP)
                src = qir_MOV(c, src);

        struct qreg from_bottom = qir_ROT_MUL(c, src, 2);
        struct qreg from_top = qir_ROT_MUL(c, src, 14);

        /* Distinguish top/bottom pixels of the quad. */
        qir_SF(c, qir_AND(c,
                          qir_reg(QFILE_QPU_ELEMENT, 0),
                          qir_uniform_ui(c, 2)));

        return qir_MOV(c, qir_SEL(c, QPU_COND_ZS,
                                  qir_FSUB(c, from_top, src),
                                  qir_FSUB(c, src, from_bottom)));
}

/** Emits QIR for one (scalarized) NIR ALU instruction. */
static void
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
{
        /* This should always be lowered to ALU operations for VC4. */
        assert(!instr->dest.saturate);

        /* Vectors are special in that they have non-scalarized writemasks,
         * and just take the first swizzle channel for each argument in order
         * into each writemask channel.
         */
        if (instr->op == nir_op_vec2 ||
            instr->op == nir_op_vec3 ||
            instr->op == nir_op_vec4) {
                struct qreg srcs[4];
                /* Read all sources before storing: a dest component may
                 * alias a later source.
                 */
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        srcs[i] = ntq_get_src(c, instr->src[i].src,
                                              instr->src[i].swizzle[0]);
                for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
                        ntq_store_dest(c, &instr->dest.dest, i,
                                       qir_MOV(c, srcs[i]));
                return;
        }

        if (instr->op == nir_op_pack_unorm_4x8) {
                ntq_emit_pack_unorm_4x8(c, instr);
                return;
        }

        if (instr->op == nir_op_unpack_unorm_4x8) {
                struct qreg src = ntq_get_src(c, instr->src[0].src,
                                              instr->src[0].swizzle[0]);
                for (int i = 0; i < 4; i++) {
                        if (instr->dest.write_mask & (1 << i))
                                ntq_store_dest(c, &instr->dest.dest, i,
                                               qir_UNPACK_8_F(c, src, i));
                }
                return;
        }

        /* General case: We can just grab the one used channel per src.
         */
        struct qreg src[nir_op_infos[instr->op].num_inputs];
        for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
                src[i] = ntq_get_alu_src(c, instr, i);
        }

        struct qreg result;

        switch (instr->op) {
        case nir_op_mov:
                result = qir_MOV(c, src[0]);
                break;
        case nir_op_fmul:
                result = qir_FMUL(c, src[0], src[1]);
                break;
        case nir_op_fadd:
                result = qir_FADD(c, src[0], src[1]);
                break;
        case nir_op_fsub:
                result = qir_FSUB(c, src[0], src[1]);
                break;
        case nir_op_fmin:
                result = qir_FMIN(c, src[0], src[1]);
                break;
        case nir_op_fmax:
                result = qir_FMAX(c, src[0], src[1]);
                break;

        case nir_op_f2i32:
        case nir_op_f2u32:
                result = qir_FTOI(c, src[0]);
                break;
        case nir_op_i2f32:
        case nir_op_u2f32:
                result = qir_ITOF(c, src[0]);
                break;
        case nir_op_b2f32:
                /* Booleans are 0/~0 (see i2b32 below), so ANDing with the
                 * bit pattern of 1.0 yields 0.0 or 1.0.
                 */
                result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
                break;
        case nir_op_b2i32:
                result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
                break;
        case nir_op_i2b32:
        case nir_op_f2b32:
                qir_SF(c, src[0]);
                result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC,
                                            qir_uniform_ui(c, ~0),
                                            qir_uniform_ui(c, 0)));
                break;

        case nir_op_iadd:
                result = qir_ADD(c, src[0], src[1]);
                break;
        case nir_op_ushr:
                result = qir_SHR(c, src[0], src[1]);
                break;
        case nir_op_isub:
                result = qir_SUB(c, src[0], src[1]);
                break;
        case nir_op_ishr:
                result = qir_ASR(c, src[0], src[1]);
                break;
        case nir_op_ishl:
                result = qir_SHL(c, src[0], src[1]);
                break;
        case nir_op_imin:
                result = qir_MIN(c, src[0], src[1]);
                break;
        case nir_op_imax:
                result = qir_MAX(c, src[0], src[1]);
                break;
        case nir_op_iand:
                result = qir_AND(c, src[0], src[1]);
                break;
        case nir_op_ior:
                result = qir_OR(c, src[0], src[1]);
                break;
        case nir_op_ixor:
                result = qir_XOR(c, src[0], src[1]);
                break;
        case nir_op_inot:
                result = qir_NOT(c, src[0]);
                break;

        case nir_op_imul:
                result = ntq_umul(c, src[0], src[1]);
                break;

        case nir_op_seq:
        case nir_op_sne:
        case nir_op_sge:
        case nir_op_slt:
        case nir_op_feq32:
        case nir_op_fneu32:
        case nir_op_fge32:
        case nir_op_flt32:
        case nir_op_ieq32:
        case nir_op_ine32:
        case nir_op_ige32:
        case nir_op_uge32:
        case nir_op_ilt32:
                /* Here the comparison instruction is also the select
                 * consumer, so it both sets flags and produces the value.
                 */
                if (!ntq_emit_comparison(c, &result, instr, instr)) {
                        fprintf(stderr, "Bad comparison instruction\n");
                }
                break;

        case nir_op_b32csel:
                result = ntq_emit_bcsel(c, instr, src);
                break;
        case nir_op_fcsel:
                qir_SF(c, src[0]);
                result = qir_MOV(c, qir_SEL(c, QPU_COND_ZC, src[1], src[2]));
                break;

        case nir_op_frcp:
                result = ntq_rcp(c, src[0]);
                break;
        case nir_op_frsq:
                result = ntq_rsq(c, src[0]);
                break;
        case nir_op_fexp2:
                result = qir_EXP2(c, src[0]);
                break;
        case nir_op_flog2:
                result = qir_LOG2(c, src[0]);
                break;

        case nir_op_ftrunc:
                /* Round toward zero via the int round trip. */
                result = qir_ITOF(c, qir_FTOI(c, src[0]));
                break;
        case nir_op_fceil:
                result = ntq_fceil(c, src[0]);
                break;
        case nir_op_ffract:
                result = ntq_ffract(c, src[0]);
                break;
        case nir_op_ffloor:
                result = ntq_ffloor(c, src[0]);
                break;

        case nir_op_fsin:
                result = ntq_fsin(c, src[0]);
                break;
        case nir_op_fcos:
                result = ntq_fcos(c, src[0]);
                break;

        case nir_op_fsign:
                result = ntq_fsign(c, src[0]);
                break;

        case nir_op_fabs:
                result = qir_FMAXABS(c, src[0], src[0]);
                break;
        case nir_op_iabs:
                result = qir_MAX(c, src[0],
                                 qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
                break;

        case nir_op_ibitfield_extract:
                result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
                break;

        case nir_op_ubitfield_extract:
                result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
                break;

        case nir_op_usadd_4x8_vc4:
                result = qir_V8ADDS(c, src[0], src[1]);
                break;

        case nir_op_ussub_4x8_vc4:
                result = qir_V8SUBS(c, src[0], src[1]);
                break;

        case nir_op_umin_4x8_vc4:
                result = qir_V8MIN(c, src[0], src[1]);
                break;

        case nir_op_umax_4x8_vc4:
                result = qir_V8MAX(c, src[0], src[1]);
                break;

        case nir_op_umul_unorm_4x8_vc4:
                result = qir_V8MULD(c, src[0], src[1]);
                break;

        case nir_op_fddx:
        case nir_op_fddx_coarse:
        case nir_op_fddx_fine:
                result = ntq_fddx(c, src[0]);
                break;

        case nir_op_fddy:
        case nir_op_fddy_coarse:
        case nir_op_fddy_fine:
                result = ntq_fddy(c, src[0]);
                break;

        default:
                fprintf(stderr, "unknown NIR ALU inst: ");
                nir_print_instr(&instr->instr, stderr);
                fprintf(stderr, "\n");
                abort();
        }

        /* We have a scalar result, so the instruction should only have a
         * single channel written to.
         */
        assert(util_is_power_of_two_or_zero(instr->dest.write_mask));
        ntq_store_dest(c, &instr->dest.dest,
                       ffs(instr->dest.write_mask) - 1, result);
}

/**
 * Emits the end-of-fragment-shader TLB writes: stencil setup, sample mask,
 * Z, and color, each gated on the discard condition when the shader uses
 * discard.
 */
static void
emit_frag_end(struct vc4_compile *c)
{
        struct qreg color;
        if (c->output_color_index != -1) {
                color = c->outputs[c->output_color_index];
        } else {
                color = qir_uniform_ui(c, 0);
        }

        /* When discard is in use, flags are set on c->discard and the TLB
         * writes below only land for non-discarded (Z set) pixels.
         */
        uint32_t discard_cond = QPU_COND_ALWAYS;
        if (c->s->info.fs.uses_discard) {
                qir_SF(c, c->discard);
                discard_cond = QPU_COND_ZS;
        }

        if (c->fs_key->stencil_enabled) {
                qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
                             qir_uniform(c, QUNIFORM_STENCIL, 0));
                if (c->fs_key->stencil_twoside) {
                        qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
                                     qir_uniform(c, QUNIFORM_STENCIL, 1));
                }
                if (c->fs_key->stencil_full_writemasks) {
                        qir_MOV_dest(c, qir_reg(QFILE_TLB_STENCIL_SETUP, 0),
                                     qir_uniform(c, QUNIFORM_STENCIL, 2));
                }
        }

        if (c->output_sample_mask_index != -1) {
                qir_MS_MASK(c, c->outputs[c->output_sample_mask_index]);
        }

        if (c->fs_key->depth_enabled) {
                if (c->output_position_index != -1) {
                        /* Shader-written depth: scale the float Z to the
                         * 24-bit fixed-point TLB format.
                         */
                        qir_FTOI_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
                                      qir_FMUL(c,
                                               c->outputs[c->output_position_index],
                                               qir_uniform_f(c, 0xffffff)))->cond = discard_cond;
                } else {
                        qir_MOV_dest(c, qir_reg(QFILE_TLB_Z_WRITE, 0),
                                     qir_FRAG_Z(c))->cond = discard_cond;
                }
        }

        if (!c->msaa_per_sample_output) {
                qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE, 0),
                             color)->cond = discard_cond;
        } else {
                for (int i = 0; i < VC4_MAX_SAMPLES; i++) {
                        qir_MOV_dest(c, qir_reg(QFILE_TLB_COLOR_WRITE_MS, 0),
                                     c->sample_colors[i])->cond = discard_cond;
                }
        }
}

/**
 * Emits the viewport-scaled X/Y clip coordinates to the VPM, packed as two
 * 16-bit integers in one 32-bit VPM write (QPU_PACK_A_16A/16B).
 */
static void
emit_scaled_viewport_write(struct vc4_compile *c, struct qreg rcp_w)
{
        struct qreg packed = qir_get_temp(c);

        for (int i = 0; i < 2; i++) {
                struct qreg scale =
                        qir_uniform(c, QUNIFORM_VIEWPORT_X_SCALE + i, 0);

                struct qreg packed_chan = packed;
                packed_chan.pack = QPU_PACK_A_16A + i;

                /* x/y * viewport_scale * (1/w), converted to int into the
                 * 16-bit half of the packed register.
                 */
                qir_FTOI_dest(c, packed_chan,
                              qir_FMUL(c,
                                       qir_FMUL(c,
                                                c->outputs[c->output_position_index + i],
                                                scale),
                                       rcp_w));
        }

        qir_VPM_WRITE(c, packed);
}

/** Emits the viewport-transformed Z (z * zscale * 1/w + zoffset) to the VPM. */
static void
emit_zs_write(struct vc4_compile *c, struct qreg rcp_w)
{
        struct qreg zscale = qir_uniform(c, QUNIFORM_VIEWPORT_Z_SCALE, 0);
        struct qreg zoffset = qir_uniform(c, QUNIFORM_VIEWPORT_Z_OFFSET, 0);

        qir_VPM_WRITE(c, qir_FADD(c, qir_FMUL(c, qir_FMUL(c,
                                                          c->outputs[c->output_position_index + 2],
                                                          zscale),
                                              rcp_w),
                                  zoffset));
}

/** Emits the 1/Wc value to the VPM. */
static void
emit_rcp_wc_write(struct vc4_compile *c, struct qreg rcp_w)
{
        qir_VPM_WRITE(c, rcp_w);
}

/**
 * Emits the point size to the VPM, defaulting to 1.0 when the shader did
 * not write gl_PointSize.
 */
static void
emit_point_size_write(struct vc4_compile *c)
{
        struct qreg point_size;

        if (c->output_point_size_index != -1)
                point_size = c->outputs[c->output_point_size_index];
        else
                point_size = qir_uniform_f(c, 1.0);

        qir_VPM_WRITE(c, point_size);
}

/**
 * Emits a VPM read of the stub vertex attribute set up by vc4_draw.c.
 *
 * The simulator insists that there be at least one vertex attribute, so
 * vc4_draw.c will emit one if it wouldn't have otherwise.  The simulator also
 * insists that all vertex attributes loaded get read by the VS/CS, so we have
 * to consume it here.
 */
static void
emit_stub_vpm_read(struct vc4_compile *c)
{
        if (c->num_inputs)
                return;

        c->vattr_sizes[0] = 4;
        (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
        c->num_inputs++;
}

/**
 * Emits the end-of-vertex-shader VPM writes: position (scaled X/Y, Z, 1/W),
 * optional point size, then one varying per FS input, matched against the
 * VS outputs by (slot, swizzle).
 */
static void
emit_vert_end(struct vc4_compile *c,
              struct vc4_varying_slot *fs_inputs,
              uint32_t num_fs_inputs)
{
        struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);

        emit_stub_vpm_read(c);

        emit_scaled_viewport_write(c, rcp_w);
        emit_zs_write(c, rcp_w);
        emit_rcp_wc_write(c, rcp_w);
        if (c->vs_key->per_vertex_point_size)
                emit_point_size_write(c);

        /* Write the varyings in the order the FS will read them. */
        for (int i = 0; i < num_fs_inputs; i++) {
                struct vc4_varying_slot *input = &fs_inputs[i];
                int j;

                for (j = 0; j < c->num_outputs; j++) {
                        struct vc4_varying_slot *output =
                                &c->output_slots[j];

                        if (input->slot == output->slot &&
                            input->swizzle == output->swizzle) {
                                qir_VPM_WRITE(c, c->outputs[j]);
                                break;
                        }
                }
                /* Emit padding if we didn't find a declared VS output for
                 * this FS input.
                 */
                if (j == c->num_outputs)
                        qir_VPM_WRITE(c, qir_uniform_f(c, 0.0));
        }
}

/**
 * Emits the end of the coordinate shader: the raw 4-component position
 * followed by the same transformed position data as the VS.
 */
static void
emit_coord_end(struct vc4_compile *c)
{
        struct qreg rcp_w = ntq_rcp(c, c->outputs[c->output_position_index + 3]);

        emit_stub_vpm_read(c);

        for (int i = 0; i < 4; i++)
                qir_VPM_WRITE(c, c->outputs[c->output_position_index + i]);

        emit_scaled_viewport_write(c, rcp_w);
        emit_zs_write(c, rcp_w);
        emit_rcp_wc_write(c, rcp_w);
        if (c->vs_key->per_vertex_point_size)
                emit_point_size_write(c);
}

/** Runs the NIR optimization loop to a fixed point. */
static void
vc4_optimize_nir(struct nir_shader *s)
{
        bool progress;
        unsigned lower_flrp =
                (s->options->lower_flrp16 ? 16 : 0) |
                (s->options->lower_flrp32 ? 32 : 0) |
                (s->options->lower_flrp64 ? 64 : 0);

        do {
                progress = false;

                NIR_PASS_V(s, nir_lower_vars_to_ssa);
                NIR_PASS(progress, s, nir_lower_alu_to_scalar, NULL, NULL);
                NIR_PASS(progress, s, nir_lower_phis_to_scalar, false);
                NIR_PASS(progress, s, nir_copy_prop);
                NIR_PASS(progress, s, nir_opt_remove_phis);
                NIR_PASS(progress, s, nir_opt_dce);
                NIR_PASS(progress, s, nir_opt_dead_cf);
                NIR_PASS(progress, s, nir_opt_cse);
                NIR_PASS(progress, s, nir_opt_peephole_select, 8, true, true);
                NIR_PASS(progress, s, nir_opt_algebraic);
                NIR_PASS(progress, s, nir_opt_constant_folding);
                if (lower_flrp != 0) {
                        bool lower_flrp_progress = false;

                        NIR_PASS(lower_flrp_progress, s, nir_lower_flrp,
                                 lower_flrp,
                                 false /* always_precise */);
                        if (lower_flrp_progress) {
                                NIR_PASS(progress, s, nir_opt_constant_folding);
                                progress = true;
                        }

                        /* Nothing should rematerialize any flrps, so we only
                         * need to do this lowering once.
                         */
                        lower_flrp = 0;
                }

                NIR_PASS(progress, s, nir_opt_undef);
                NIR_PASS(progress, s, nir_opt_loop_unroll);
        } while (progress);
}

/** qsort comparator ordering nir_variable pointers by driver_location. */
static int
driver_location_compare(const void *in_a, const void *in_b)
{
        const nir_variable *const *a = in_a;
        const nir_variable *const *b = in_b;

        return (*a)->data.driver_location - (*b)->data.driver_location;
}

/**
 * Allocates c->inputs and emits the per-input setup (fragcoord, point
 * coord, varyings, or vertex attributes) for every shader input variable.
 */
static void
ntq_setup_inputs(struct vc4_compile *c)
{
        unsigned num_entries = 0;
        nir_foreach_shader_in_variable(var, c->s)
                num_entries++;

        nir_variable *vars[num_entries];

        unsigned i = 0;
        nir_foreach_shader_in_variable(var, c->s)
                vars[i++] = var;

        /* Sort the variables so that we emit the input setup in
         * driver_location order.  This is required for VPM reads, whose data
         * is fetched into the VPM in driver_location (TGSI register index)
         * order.
         */
        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);

        for (unsigned i = 0; i < num_entries; i++) {
                nir_variable *var = vars[i];
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                unsigned loc = var->data.driver_location;

                /* Only non-array inputs are supported here. */
                assert(array_len == 1);
                (void)array_len;
                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
                                  (loc + 1) * 4);

                if (c->stage == QSTAGE_FRAG) {
                        if (var->data.location == VARYING_SLOT_POS) {
                                emit_fragcoord_input(c, loc);
                        } else if (util_varying_is_point_coord(var->data.location,
                                                               c->fs_key->point_sprite_mask)) {
                                c->inputs[loc * 4 + 0] = c->point_x;
                                c->inputs[loc * 4 + 1] = c->point_y;
                        } else {
                                emit_fragment_input(c, loc, var->data.location);
                        }
                } else {
                        emit_vertex_input(c, loc);
                }
        }
}

/**
 * Registers all shader output variables (4 components each) in c->outputs
 * and remembers the indices of the special outputs (color, depth/position,
 * sample mask, point size) for the stage-end emit functions.
 */
static void
ntq_setup_outputs(struct vc4_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
                unsigned loc = var->data.driver_location * 4;

                assert(array_len == 1);
                (void)array_len;

                for (int i = 0; i < 4; i++)
                        add_output(c, loc + i, var->data.location, i);

                if (c->stage == QSTAGE_FRAG) {
                        switch (var->data.location) {
                        case FRAG_RESULT_COLOR:
                        case FRAG_RESULT_DATA0:
                                c->output_color_index = loc;
                                break;
                        case FRAG_RESULT_DEPTH:
                                c->output_position_index = loc;
                                break;
                        case FRAG_RESULT_SAMPLE_MASK:
                                c->output_sample_mask_index = loc;
                                break;
                        }
                } else {
                        switch (var->data.location) {
                        case VARYING_SLOT_POS:
                                c->output_position_index = loc;
                                break;
                        case VARYING_SLOT_PSIZ:
                                c->output_point_size_index = loc;
                                break;
                        }
                }
        }
}

/**
 * Sets up the mapping from nir_register to struct qreg *.
 *
 * Each nir_register gets a struct qreg per 32-bit component being stored.
 */
static void
ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
{
        foreach_list_typed(nir_register, nir_reg, node, list) {
                unsigned array_len = MAX2(nir_reg->num_array_elems, 1);
                struct qreg *qregs = ralloc_array(c->def_ht, struct qreg,
                                                  array_len *
                                                  nir_reg->num_components);

                _mesa_hash_table_insert(c->def_ht, nir_reg, qregs);

                for (int i = 0; i < array_len * nir_reg->num_components; i++)
                        qregs[i] = qir_get_temp(c);
        }
}

/** Materializes a NIR constant as uniform reads, one per component. */
static void
ntq_emit_load_const(struct vc4_compile *c, nir_load_const_instr *instr)
{
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = qir_uniform_ui(c, instr->value[i].u32);

        _mesa_hash_table_insert(c->def_ht, &instr->def, qregs);
}

static void
ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
{
        struct qreg *qregs = ntq_init_ssa_def(c, &instr->def);

        /* QIR needs there to be *some* value, so pick 0 (same as for
         * ntq_setup_registers().
         */
        for (int i = 0; i < instr->def.num_components; i++)
                qregs[i] = qir_uniform_ui(c, 0);
}

/**
 * Emits a read of the framebuffer color for one sample (used for blending
 * in the shader), caching the TLB reads in c->color_reads.
 */
static void
ntq_emit_color_read(struct vc4_compile *c, nir_intrinsic_instr *instr)
{
        assert(nir_src_as_uint(instr->src[0]) == 0);

        /* Reads of the per-sample color need to be done in
         * order.
         */
        int sample_index = (nir_intrinsic_base(instr) -
                            VC4_NIR_TLB_COLOR_READ_INPUT);
        for (int i = 0; i <= sample_index; i++) {
                if (c->color_reads[i].file == QFILE_NULL) {
                        c->color_reads[i] =
                                qir_TLB_COLOR_READ(c);
                }
        }
        ntq_store_dest(c, &instr->dest, 0,
                       qir_MOV(c, c->color_reads[sample_index]));
}

static void
ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
{
        assert(instr->num_components == 1);
        assert(nir_src_is_const(instr->src[0]) &&
               "vc4 doesn't support indirect inputs");

        if (c->stage == QSTAGE_FRAG &&
            nir_intrinsic_base(instr) >= VC4_NIR_TLB_COLOR_READ_INPUT) {
                ntq_emit_color_read(c, instr);
                return;
        }

17259f464c52Smaya uint32_t offset = nir_intrinsic_base(instr) + 17269f464c52Smaya nir_src_as_uint(instr->src[0]); 172701e04c3fSmrg int comp = nir_intrinsic_component(instr); 172801e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 172901e04c3fSmrg qir_MOV(c, c->inputs[offset * 4 + comp])); 173001e04c3fSmrg} 173101e04c3fSmrg 173201e04c3fSmrgstatic void 173301e04c3fSmrgntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr) 173401e04c3fSmrg{ 173501e04c3fSmrg unsigned offset; 173601e04c3fSmrg 173701e04c3fSmrg switch (instr->intrinsic) { 173801e04c3fSmrg case nir_intrinsic_load_uniform: 173901e04c3fSmrg assert(instr->num_components == 1); 17409f464c52Smaya if (nir_src_is_const(instr->src[0])) { 17419f464c52Smaya offset = nir_intrinsic_base(instr) + 17429f464c52Smaya nir_src_as_uint(instr->src[0]); 174301e04c3fSmrg assert(offset % 4 == 0); 174401e04c3fSmrg /* We need dwords */ 174501e04c3fSmrg offset = offset / 4; 174601e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 174701e04c3fSmrg qir_uniform(c, QUNIFORM_UNIFORM, 174801e04c3fSmrg offset)); 174901e04c3fSmrg } else { 175001e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 175101e04c3fSmrg indirect_uniform_load(c, instr)); 175201e04c3fSmrg } 175301e04c3fSmrg break; 175401e04c3fSmrg 175501e04c3fSmrg case nir_intrinsic_load_ubo: 175601e04c3fSmrg assert(instr->num_components == 1); 175701e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, vc4_ubo_load(c, instr)); 175801e04c3fSmrg break; 175901e04c3fSmrg 176001e04c3fSmrg case nir_intrinsic_load_user_clip_plane: 17617ec681f3Smrg for (int i = 0; i < nir_intrinsic_dest_components(instr); i++) { 176201e04c3fSmrg ntq_store_dest(c, &instr->dest, i, 176301e04c3fSmrg qir_uniform(c, QUNIFORM_USER_CLIP_PLANE, 176401e04c3fSmrg nir_intrinsic_ucp_id(instr) * 176501e04c3fSmrg 4 + i)); 176601e04c3fSmrg } 176701e04c3fSmrg break; 176801e04c3fSmrg 176901e04c3fSmrg case nir_intrinsic_load_blend_const_color_r_float: 177001e04c3fSmrg case nir_intrinsic_load_blend_const_color_g_float: 
177101e04c3fSmrg case nir_intrinsic_load_blend_const_color_b_float: 177201e04c3fSmrg case nir_intrinsic_load_blend_const_color_a_float: 177301e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 177401e04c3fSmrg qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_X + 177501e04c3fSmrg (instr->intrinsic - 177601e04c3fSmrg nir_intrinsic_load_blend_const_color_r_float), 177701e04c3fSmrg 0)); 177801e04c3fSmrg break; 177901e04c3fSmrg 178001e04c3fSmrg case nir_intrinsic_load_blend_const_color_rgba8888_unorm: 178101e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 178201e04c3fSmrg qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_RGBA, 178301e04c3fSmrg 0)); 178401e04c3fSmrg break; 178501e04c3fSmrg 178601e04c3fSmrg case nir_intrinsic_load_blend_const_color_aaaa8888_unorm: 178701e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 178801e04c3fSmrg qir_uniform(c, QUNIFORM_BLEND_CONST_COLOR_AAAA, 178901e04c3fSmrg 0)); 179001e04c3fSmrg break; 179101e04c3fSmrg 179201e04c3fSmrg case nir_intrinsic_load_sample_mask_in: 179301e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 179401e04c3fSmrg qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0)); 179501e04c3fSmrg break; 179601e04c3fSmrg 179701e04c3fSmrg case nir_intrinsic_load_front_face: 179801e04c3fSmrg /* The register contains 0 (front) or 1 (back), and we need to 179901e04c3fSmrg * turn it into a NIR bool where true means front. 
180001e04c3fSmrg */ 180101e04c3fSmrg ntq_store_dest(c, &instr->dest, 0, 180201e04c3fSmrg qir_ADD(c, 180301e04c3fSmrg qir_uniform_ui(c, -1), 180401e04c3fSmrg qir_reg(QFILE_FRAG_REV_FLAG, 0))); 180501e04c3fSmrg break; 180601e04c3fSmrg 180701e04c3fSmrg case nir_intrinsic_load_input: 180801e04c3fSmrg ntq_emit_load_input(c, instr); 180901e04c3fSmrg break; 181001e04c3fSmrg 181101e04c3fSmrg case nir_intrinsic_store_output: 18129f464c52Smaya assert(nir_src_is_const(instr->src[1]) && 18139f464c52Smaya "vc4 doesn't support indirect outputs"); 18149f464c52Smaya offset = nir_intrinsic_base(instr) + 18159f464c52Smaya nir_src_as_uint(instr->src[1]); 181601e04c3fSmrg 181701e04c3fSmrg /* MSAA color outputs are the only case where we have an 181801e04c3fSmrg * output that's not lowered to being a store of a single 32 181901e04c3fSmrg * bit value. 182001e04c3fSmrg */ 182101e04c3fSmrg if (c->stage == QSTAGE_FRAG && instr->num_components == 4) { 182201e04c3fSmrg assert(offset == c->output_color_index); 182301e04c3fSmrg for (int i = 0; i < 4; i++) { 182401e04c3fSmrg c->sample_colors[i] = 182501e04c3fSmrg qir_MOV(c, ntq_get_src(c, instr->src[0], 182601e04c3fSmrg i)); 182701e04c3fSmrg } 182801e04c3fSmrg } else { 182901e04c3fSmrg offset = offset * 4 + nir_intrinsic_component(instr); 183001e04c3fSmrg assert(instr->num_components == 1); 183101e04c3fSmrg c->outputs[offset] = 183201e04c3fSmrg qir_MOV(c, ntq_get_src(c, instr->src[0], 0)); 183301e04c3fSmrg c->num_outputs = MAX2(c->num_outputs, offset + 1); 183401e04c3fSmrg } 183501e04c3fSmrg break; 183601e04c3fSmrg 183701e04c3fSmrg case nir_intrinsic_discard: 183801e04c3fSmrg if (c->execute.file != QFILE_NULL) { 183901e04c3fSmrg qir_SF(c, c->execute); 184001e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->discard, 184101e04c3fSmrg qir_uniform_ui(c, ~0)); 184201e04c3fSmrg } else { 184301e04c3fSmrg qir_MOV_dest(c, c->discard, qir_uniform_ui(c, ~0)); 184401e04c3fSmrg } 184501e04c3fSmrg break; 184601e04c3fSmrg 184701e04c3fSmrg case 
nir_intrinsic_discard_if: { 184801e04c3fSmrg /* true (~0) if we're discarding */ 184901e04c3fSmrg struct qreg cond = ntq_get_src(c, instr->src[0], 0); 185001e04c3fSmrg 185101e04c3fSmrg if (c->execute.file != QFILE_NULL) { 185201e04c3fSmrg /* execute == 0 means the channel is active. Invert 185301e04c3fSmrg * the condition so that we can use zero as "executing 185401e04c3fSmrg * and discarding." 185501e04c3fSmrg */ 185601e04c3fSmrg qir_SF(c, qir_AND(c, c->execute, qir_NOT(c, cond))); 185701e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->discard, cond); 185801e04c3fSmrg } else { 185901e04c3fSmrg qir_OR_dest(c, c->discard, c->discard, 186001e04c3fSmrg ntq_get_src(c, instr->src[0], 0)); 186101e04c3fSmrg } 186201e04c3fSmrg 186301e04c3fSmrg break; 186401e04c3fSmrg } 186501e04c3fSmrg 18667ec681f3Smrg case nir_intrinsic_load_texture_rect_scaling: { 18677ec681f3Smrg assert(nir_src_is_const(instr->src[0])); 18687ec681f3Smrg int sampler = nir_src_as_int(instr->src[0]); 18697ec681f3Smrg 18707ec681f3Smrg ntq_store_dest(c, &instr->dest, 0, 18717ec681f3Smrg qir_uniform(c, QUNIFORM_TEXRECT_SCALE_X, sampler)); 18727ec681f3Smrg ntq_store_dest(c, &instr->dest, 1, 18737ec681f3Smrg qir_uniform(c, QUNIFORM_TEXRECT_SCALE_Y, sampler)); 18747ec681f3Smrg break; 18757ec681f3Smrg } 18767ec681f3Smrg 187701e04c3fSmrg default: 187801e04c3fSmrg fprintf(stderr, "Unknown intrinsic: "); 187901e04c3fSmrg nir_print_instr(&instr->instr, stderr); 188001e04c3fSmrg fprintf(stderr, "\n"); 188101e04c3fSmrg break; 188201e04c3fSmrg } 188301e04c3fSmrg} 188401e04c3fSmrg 188501e04c3fSmrg/* Clears (activates) the execute flags for any channels whose jump target 188601e04c3fSmrg * matches this block. 
188701e04c3fSmrg */ 188801e04c3fSmrgstatic void 188901e04c3fSmrgntq_activate_execute_for_block(struct vc4_compile *c) 189001e04c3fSmrg{ 189101e04c3fSmrg qir_SF(c, qir_SUB(c, 189201e04c3fSmrg c->execute, 189301e04c3fSmrg qir_uniform_ui(c, c->cur_block->index))); 189401e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->execute, qir_uniform_ui(c, 0)); 189501e04c3fSmrg} 189601e04c3fSmrg 189701e04c3fSmrgstatic void 189801e04c3fSmrgntq_emit_if(struct vc4_compile *c, nir_if *if_stmt) 189901e04c3fSmrg{ 190001e04c3fSmrg if (!c->vc4->screen->has_control_flow) { 190101e04c3fSmrg fprintf(stderr, 190201e04c3fSmrg "IF statement support requires updated kernel.\n"); 190301e04c3fSmrg return; 190401e04c3fSmrg } 190501e04c3fSmrg 190601e04c3fSmrg nir_block *nir_else_block = nir_if_first_else_block(if_stmt); 190701e04c3fSmrg bool empty_else_block = 190801e04c3fSmrg (nir_else_block == nir_if_last_else_block(if_stmt) && 190901e04c3fSmrg exec_list_is_empty(&nir_else_block->instr_list)); 191001e04c3fSmrg 191101e04c3fSmrg struct qblock *then_block = qir_new_block(c); 191201e04c3fSmrg struct qblock *after_block = qir_new_block(c); 191301e04c3fSmrg struct qblock *else_block; 191401e04c3fSmrg if (empty_else_block) 191501e04c3fSmrg else_block = after_block; 191601e04c3fSmrg else 191701e04c3fSmrg else_block = qir_new_block(c); 191801e04c3fSmrg 191901e04c3fSmrg bool was_top_level = false; 192001e04c3fSmrg if (c->execute.file == QFILE_NULL) { 192101e04c3fSmrg c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 192201e04c3fSmrg was_top_level = true; 192301e04c3fSmrg } 192401e04c3fSmrg 192501e04c3fSmrg /* Set ZS for executing (execute == 0) and jumping (if->condition == 192601e04c3fSmrg * 0) channels, and then update execute flags for those to point to 192701e04c3fSmrg * the ELSE block. 
192801e04c3fSmrg */ 192901e04c3fSmrg qir_SF(c, qir_OR(c, 193001e04c3fSmrg c->execute, 193101e04c3fSmrg ntq_get_src(c, if_stmt->condition, 0))); 193201e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->execute, 193301e04c3fSmrg qir_uniform_ui(c, else_block->index)); 193401e04c3fSmrg 193501e04c3fSmrg /* Jump to ELSE if nothing is active for THEN, otherwise fall 193601e04c3fSmrg * through. 193701e04c3fSmrg */ 193801e04c3fSmrg qir_SF(c, c->execute); 193901e04c3fSmrg qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZC); 194001e04c3fSmrg qir_link_blocks(c->cur_block, else_block); 194101e04c3fSmrg qir_link_blocks(c->cur_block, then_block); 194201e04c3fSmrg 194301e04c3fSmrg /* Process the THEN block. */ 194401e04c3fSmrg qir_set_emit_block(c, then_block); 194501e04c3fSmrg ntq_emit_cf_list(c, &if_stmt->then_list); 194601e04c3fSmrg 194701e04c3fSmrg if (!empty_else_block) { 194801e04c3fSmrg /* Handle the end of the THEN block. First, all currently 194901e04c3fSmrg * active channels update their execute flags to point to 195001e04c3fSmrg * ENDIF 195101e04c3fSmrg */ 195201e04c3fSmrg qir_SF(c, c->execute); 195301e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->execute, 195401e04c3fSmrg qir_uniform_ui(c, after_block->index)); 195501e04c3fSmrg 195601e04c3fSmrg /* If everything points at ENDIF, then jump there immediately. 
*/ 195701e04c3fSmrg qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, after_block->index))); 195801e04c3fSmrg qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 195901e04c3fSmrg qir_link_blocks(c->cur_block, after_block); 196001e04c3fSmrg qir_link_blocks(c->cur_block, else_block); 196101e04c3fSmrg 196201e04c3fSmrg qir_set_emit_block(c, else_block); 196301e04c3fSmrg ntq_activate_execute_for_block(c); 196401e04c3fSmrg ntq_emit_cf_list(c, &if_stmt->else_list); 196501e04c3fSmrg } 196601e04c3fSmrg 196701e04c3fSmrg qir_link_blocks(c->cur_block, after_block); 196801e04c3fSmrg 196901e04c3fSmrg qir_set_emit_block(c, after_block); 197001e04c3fSmrg if (was_top_level) { 197101e04c3fSmrg c->execute = c->undef; 197201e04c3fSmrg c->last_top_block = c->cur_block; 197301e04c3fSmrg } else { 197401e04c3fSmrg ntq_activate_execute_for_block(c); 197501e04c3fSmrg } 197601e04c3fSmrg} 197701e04c3fSmrg 197801e04c3fSmrgstatic void 197901e04c3fSmrgntq_emit_jump(struct vc4_compile *c, nir_jump_instr *jump) 198001e04c3fSmrg{ 198101e04c3fSmrg struct qblock *jump_block; 198201e04c3fSmrg switch (jump->type) { 198301e04c3fSmrg case nir_jump_break: 198401e04c3fSmrg jump_block = c->loop_break_block; 198501e04c3fSmrg break; 198601e04c3fSmrg case nir_jump_continue: 198701e04c3fSmrg jump_block = c->loop_cont_block; 198801e04c3fSmrg break; 198901e04c3fSmrg default: 199001e04c3fSmrg unreachable("Unsupported jump type\n"); 199101e04c3fSmrg } 199201e04c3fSmrg 199301e04c3fSmrg qir_SF(c, c->execute); 199401e04c3fSmrg qir_MOV_cond(c, QPU_COND_ZS, c->execute, 199501e04c3fSmrg qir_uniform_ui(c, jump_block->index)); 199601e04c3fSmrg 199701e04c3fSmrg /* Jump to the destination block if everyone has taken the jump. 
*/ 199801e04c3fSmrg qir_SF(c, qir_SUB(c, c->execute, qir_uniform_ui(c, jump_block->index))); 199901e04c3fSmrg qir_BRANCH(c, QPU_COND_BRANCH_ALL_ZS); 200001e04c3fSmrg struct qblock *new_block = qir_new_block(c); 200101e04c3fSmrg qir_link_blocks(c->cur_block, jump_block); 200201e04c3fSmrg qir_link_blocks(c->cur_block, new_block); 200301e04c3fSmrg qir_set_emit_block(c, new_block); 200401e04c3fSmrg} 200501e04c3fSmrg 200601e04c3fSmrgstatic void 200701e04c3fSmrgntq_emit_instr(struct vc4_compile *c, nir_instr *instr) 200801e04c3fSmrg{ 200901e04c3fSmrg switch (instr->type) { 201001e04c3fSmrg case nir_instr_type_alu: 201101e04c3fSmrg ntq_emit_alu(c, nir_instr_as_alu(instr)); 201201e04c3fSmrg break; 201301e04c3fSmrg 201401e04c3fSmrg case nir_instr_type_intrinsic: 201501e04c3fSmrg ntq_emit_intrinsic(c, nir_instr_as_intrinsic(instr)); 201601e04c3fSmrg break; 201701e04c3fSmrg 201801e04c3fSmrg case nir_instr_type_load_const: 201901e04c3fSmrg ntq_emit_load_const(c, nir_instr_as_load_const(instr)); 202001e04c3fSmrg break; 202101e04c3fSmrg 202201e04c3fSmrg case nir_instr_type_ssa_undef: 202301e04c3fSmrg ntq_emit_ssa_undef(c, nir_instr_as_ssa_undef(instr)); 202401e04c3fSmrg break; 202501e04c3fSmrg 202601e04c3fSmrg case nir_instr_type_tex: 202701e04c3fSmrg ntq_emit_tex(c, nir_instr_as_tex(instr)); 202801e04c3fSmrg break; 2029af69d88dSmrg 203001e04c3fSmrg case nir_instr_type_jump: 203101e04c3fSmrg ntq_emit_jump(c, nir_instr_as_jump(instr)); 203201e04c3fSmrg break; 2033af69d88dSmrg 203401e04c3fSmrg default: 203501e04c3fSmrg fprintf(stderr, "Unknown NIR instr type: "); 203601e04c3fSmrg nir_print_instr(instr, stderr); 203701e04c3fSmrg fprintf(stderr, "\n"); 203801e04c3fSmrg abort(); 2039af69d88dSmrg } 2040af69d88dSmrg} 2041af69d88dSmrg 2042af69d88dSmrgstatic void 204301e04c3fSmrgntq_emit_block(struct vc4_compile *c, nir_block *block) 2044af69d88dSmrg{ 204501e04c3fSmrg nir_foreach_instr(instr, block) { 204601e04c3fSmrg ntq_emit_instr(c, instr); 2047af69d88dSmrg } 2048af69d88dSmrg} 
2049af69d88dSmrg 205001e04c3fSmrgstatic void ntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list); 205101e04c3fSmrg 2052af69d88dSmrgstatic void 205301e04c3fSmrgntq_emit_loop(struct vc4_compile *c, nir_loop *loop) 2054af69d88dSmrg{ 205501e04c3fSmrg if (!c->vc4->screen->has_control_flow) { 205601e04c3fSmrg fprintf(stderr, 205701e04c3fSmrg "loop support requires updated kernel.\n"); 205801e04c3fSmrg ntq_emit_cf_list(c, &loop->body); 205901e04c3fSmrg return; 206001e04c3fSmrg } 206101e04c3fSmrg 206201e04c3fSmrg bool was_top_level = false; 206301e04c3fSmrg if (c->execute.file == QFILE_NULL) { 206401e04c3fSmrg c->execute = qir_MOV(c, qir_uniform_ui(c, 0)); 206501e04c3fSmrg was_top_level = true; 206601e04c3fSmrg } 2067af69d88dSmrg 206801e04c3fSmrg struct qblock *save_loop_cont_block = c->loop_cont_block; 206901e04c3fSmrg struct qblock *save_loop_break_block = c->loop_break_block; 2070af69d88dSmrg 207101e04c3fSmrg c->loop_cont_block = qir_new_block(c); 207201e04c3fSmrg c->loop_break_block = qir_new_block(c); 207301e04c3fSmrg 207401e04c3fSmrg qir_link_blocks(c->cur_block, c->loop_cont_block); 207501e04c3fSmrg qir_set_emit_block(c, c->loop_cont_block); 207601e04c3fSmrg ntq_activate_execute_for_block(c); 207701e04c3fSmrg 207801e04c3fSmrg ntq_emit_cf_list(c, &loop->body); 207901e04c3fSmrg 208001e04c3fSmrg /* If anything had explicitly continued, or is here at the end of the 208101e04c3fSmrg * loop, then we need to loop again. SF updates are masked by the 208201e04c3fSmrg * instruction's condition, so we can do the OR of the two conditions 208301e04c3fSmrg * within SF. 
208401e04c3fSmrg */ 208501e04c3fSmrg qir_SF(c, c->execute); 208601e04c3fSmrg struct qinst *cont_check = 208701e04c3fSmrg qir_SUB_dest(c, 208801e04c3fSmrg c->undef, 208901e04c3fSmrg c->execute, 209001e04c3fSmrg qir_uniform_ui(c, c->loop_cont_block->index)); 209101e04c3fSmrg cont_check->cond = QPU_COND_ZC; 209201e04c3fSmrg cont_check->sf = true; 209301e04c3fSmrg 209401e04c3fSmrg qir_BRANCH(c, QPU_COND_BRANCH_ANY_ZS); 209501e04c3fSmrg qir_link_blocks(c->cur_block, c->loop_cont_block); 209601e04c3fSmrg qir_link_blocks(c->cur_block, c->loop_break_block); 209701e04c3fSmrg 209801e04c3fSmrg qir_set_emit_block(c, c->loop_break_block); 209901e04c3fSmrg if (was_top_level) { 210001e04c3fSmrg c->execute = c->undef; 210101e04c3fSmrg c->last_top_block = c->cur_block; 210201e04c3fSmrg } else { 210301e04c3fSmrg ntq_activate_execute_for_block(c); 210401e04c3fSmrg } 210501e04c3fSmrg 210601e04c3fSmrg c->loop_break_block = save_loop_break_block; 210701e04c3fSmrg c->loop_cont_block = save_loop_cont_block; 2108af69d88dSmrg} 2109af69d88dSmrg 2110af69d88dSmrgstatic void 211101e04c3fSmrgntq_emit_function(struct vc4_compile *c, nir_function_impl *func) 2112af69d88dSmrg{ 211301e04c3fSmrg fprintf(stderr, "FUNCTIONS not handled.\n"); 211401e04c3fSmrg abort(); 2115af69d88dSmrg} 2116af69d88dSmrg 2117af69d88dSmrgstatic void 211801e04c3fSmrgntq_emit_cf_list(struct vc4_compile *c, struct exec_list *list) 2119af69d88dSmrg{ 212001e04c3fSmrg foreach_list_typed(nir_cf_node, node, node, list) { 212101e04c3fSmrg switch (node->type) { 212201e04c3fSmrg case nir_cf_node_block: 212301e04c3fSmrg ntq_emit_block(c, nir_cf_node_as_block(node)); 212401e04c3fSmrg break; 2125af69d88dSmrg 212601e04c3fSmrg case nir_cf_node_if: 212701e04c3fSmrg ntq_emit_if(c, nir_cf_node_as_if(node)); 212801e04c3fSmrg break; 212901e04c3fSmrg 213001e04c3fSmrg case nir_cf_node_loop: 213101e04c3fSmrg ntq_emit_loop(c, nir_cf_node_as_loop(node)); 213201e04c3fSmrg break; 2133af69d88dSmrg 213401e04c3fSmrg case nir_cf_node_function: 
213501e04c3fSmrg ntq_emit_function(c, nir_cf_node_as_function(node)); 213601e04c3fSmrg break; 2137af69d88dSmrg 213801e04c3fSmrg default: 213901e04c3fSmrg fprintf(stderr, "Unknown NIR node type\n"); 214001e04c3fSmrg abort(); 214101e04c3fSmrg } 2142af69d88dSmrg } 2143af69d88dSmrg} 2144af69d88dSmrg 2145af69d88dSmrgstatic void 214601e04c3fSmrgntq_emit_impl(struct vc4_compile *c, nir_function_impl *impl) 2147af69d88dSmrg{ 214801e04c3fSmrg ntq_setup_registers(c, &impl->registers); 214901e04c3fSmrg ntq_emit_cf_list(c, &impl->body); 2150af69d88dSmrg} 2151af69d88dSmrg 215201e04c3fSmrgstatic void 215301e04c3fSmrgnir_to_qir(struct vc4_compile *c) 2154af69d88dSmrg{ 215501e04c3fSmrg if (c->stage == QSTAGE_FRAG && c->s->info.fs.uses_discard) 215601e04c3fSmrg c->discard = qir_MOV(c, qir_uniform_ui(c, 0)); 215701e04c3fSmrg 215801e04c3fSmrg ntq_setup_inputs(c); 215901e04c3fSmrg ntq_setup_outputs(c); 216001e04c3fSmrg 216101e04c3fSmrg /* Find the main function and emit the body. */ 216201e04c3fSmrg nir_foreach_function(function, c->s) { 216301e04c3fSmrg assert(strcmp(function->name, "main") == 0); 216401e04c3fSmrg assert(function->impl); 216501e04c3fSmrg ntq_emit_impl(c, function->impl); 216601e04c3fSmrg } 216701e04c3fSmrg} 2168af69d88dSmrg 216901e04c3fSmrgstatic const nir_shader_compiler_options nir_options = { 217001e04c3fSmrg .lower_all_io_to_temps = true, 217101e04c3fSmrg .lower_extract_byte = true, 217201e04c3fSmrg .lower_extract_word = true, 21737ec681f3Smrg .lower_insert_byte = true, 21747ec681f3Smrg .lower_insert_word = true, 217501e04c3fSmrg .lower_fdiv = true, 21767ec681f3Smrg .lower_ffma16 = true, 21777ec681f3Smrg .lower_ffma32 = true, 21787ec681f3Smrg .lower_ffma64 = true, 217901e04c3fSmrg .lower_flrp32 = true, 21807ec681f3Smrg .lower_fmod = true, 218101e04c3fSmrg .lower_fpow = true, 218201e04c3fSmrg .lower_fsat = true, 218301e04c3fSmrg .lower_fsqrt = true, 218401e04c3fSmrg .lower_ldexp = true, 21857ec681f3Smrg .lower_fneg = true, 21867ec681f3Smrg .lower_ineg = true, 
21877ec681f3Smrg .lower_rotate = true, 21887ec681f3Smrg .lower_to_scalar = true, 21897ec681f3Smrg .lower_umax = true, 21907ec681f3Smrg .lower_umin = true, 21917ec681f3Smrg .lower_isign = true, 21927ec681f3Smrg .has_fsub = true, 21937ec681f3Smrg .has_isub = true, 219401e04c3fSmrg .max_unroll_iterations = 32, 21957ec681f3Smrg .force_indirect_unrolling = (nir_var_shader_in | nir_var_shader_out | nir_var_function_temp), 219601e04c3fSmrg}; 2197af69d88dSmrg 219801e04c3fSmrgconst void * 219901e04c3fSmrgvc4_screen_get_compiler_options(struct pipe_screen *pscreen, 220001e04c3fSmrg enum pipe_shader_ir ir, 220101e04c3fSmrg enum pipe_shader_type shader) 220201e04c3fSmrg{ 220301e04c3fSmrg return &nir_options; 220401e04c3fSmrg} 2205af69d88dSmrg 220601e04c3fSmrgstatic int 220701e04c3fSmrgcount_nir_instrs(nir_shader *nir) 220801e04c3fSmrg{ 220901e04c3fSmrg int count = 0; 221001e04c3fSmrg nir_foreach_function(function, nir) { 221101e04c3fSmrg if (!function->impl) 221201e04c3fSmrg continue; 221301e04c3fSmrg nir_foreach_block(block, function->impl) { 221401e04c3fSmrg nir_foreach_instr(instr, block) 221501e04c3fSmrg count++; 221601e04c3fSmrg } 221701e04c3fSmrg } 221801e04c3fSmrg return count; 221901e04c3fSmrg} 2220af69d88dSmrg 222101e04c3fSmrgstatic struct vc4_compile * 222201e04c3fSmrgvc4_shader_ntq(struct vc4_context *vc4, enum qstage stage, 222301e04c3fSmrg struct vc4_key *key, bool fs_threaded) 222401e04c3fSmrg{ 222501e04c3fSmrg struct vc4_compile *c = qir_compile_init(); 2226af69d88dSmrg 222701e04c3fSmrg c->vc4 = vc4; 222801e04c3fSmrg c->stage = stage; 222901e04c3fSmrg c->shader_state = &key->shader_state->base; 223001e04c3fSmrg c->program_id = key->shader_state->program_id; 223101e04c3fSmrg c->variant_id = 223201e04c3fSmrg p_atomic_inc_return(&key->shader_state->compiled_variant_count); 223301e04c3fSmrg c->fs_threaded = fs_threaded; 2234af69d88dSmrg 223501e04c3fSmrg c->key = key; 2236af69d88dSmrg switch (stage) { 2237af69d88dSmrg case QSTAGE_FRAG: 223801e04c3fSmrg c->fs_key = 
(struct vc4_fs_key *)key; 223901e04c3fSmrg if (c->fs_key->is_points) { 224001e04c3fSmrg c->point_x = emit_fragment_varying(c, ~0, 0); 224101e04c3fSmrg c->point_y = emit_fragment_varying(c, ~0, 0); 224201e04c3fSmrg } else if (c->fs_key->is_lines) { 224301e04c3fSmrg c->line_x = emit_fragment_varying(c, ~0, 0); 2244af69d88dSmrg } 2245af69d88dSmrg break; 2246af69d88dSmrg case QSTAGE_VERT: 224701e04c3fSmrg c->vs_key = (struct vc4_vs_key *)key; 2248af69d88dSmrg break; 2249af69d88dSmrg case QSTAGE_COORD: 225001e04c3fSmrg c->vs_key = (struct vc4_vs_key *)key; 2251af69d88dSmrg break; 2252af69d88dSmrg } 2253af69d88dSmrg 225401e04c3fSmrg c->s = nir_shader_clone(c, key->shader_state->base.ir.nir); 2255af69d88dSmrg 225601e04c3fSmrg if (stage == QSTAGE_FRAG) { 225701e04c3fSmrg NIR_PASS_V(c->s, vc4_nir_lower_blend, c); 225801e04c3fSmrg } 2259af69d88dSmrg 226001e04c3fSmrg struct nir_lower_tex_options tex_options = { 226101e04c3fSmrg .lower_txp = ~0, 226201e04c3fSmrg 226301e04c3fSmrg /* Apply swizzles to all samplers. */ 226401e04c3fSmrg .swizzle_result = ~0, 226501e04c3fSmrg }; 226601e04c3fSmrg 226701e04c3fSmrg /* Lower the format swizzle and ARB_texture_swizzle-style swizzle. 226801e04c3fSmrg * The format swizzling applies before sRGB decode, and 226901e04c3fSmrg * ARB_texture_swizzle is the last thing before returning the sample. 
227001e04c3fSmrg */ 227101e04c3fSmrg for (int i = 0; i < ARRAY_SIZE(key->tex); i++) { 227201e04c3fSmrg enum pipe_format format = c->key->tex[i].format; 227301e04c3fSmrg 227401e04c3fSmrg if (!format) 227501e04c3fSmrg continue; 227601e04c3fSmrg 227701e04c3fSmrg const uint8_t *format_swizzle = vc4_get_format_swizzle(format); 227801e04c3fSmrg 227901e04c3fSmrg for (int j = 0; j < 4; j++) { 228001e04c3fSmrg uint8_t arb_swiz = c->key->tex[i].swizzle[j]; 228101e04c3fSmrg 228201e04c3fSmrg if (arb_swiz <= 3) { 228301e04c3fSmrg tex_options.swizzles[i][j] = 228401e04c3fSmrg format_swizzle[arb_swiz]; 228501e04c3fSmrg } else { 228601e04c3fSmrg tex_options.swizzles[i][j] = arb_swiz; 228701e04c3fSmrg } 228801e04c3fSmrg } 228901e04c3fSmrg 229001e04c3fSmrg if (util_format_is_srgb(format)) 229101e04c3fSmrg tex_options.lower_srgb |= (1 << i); 229201e04c3fSmrg } 229301e04c3fSmrg 229401e04c3fSmrg NIR_PASS_V(c->s, nir_lower_tex, &tex_options); 229501e04c3fSmrg 229601e04c3fSmrg if (c->key->ucp_enables) { 229701e04c3fSmrg if (stage == QSTAGE_FRAG) { 22987ec681f3Smrg NIR_PASS_V(c->s, nir_lower_clip_fs, 22997ec681f3Smrg c->key->ucp_enables, false); 230001e04c3fSmrg } else { 23019f464c52Smaya NIR_PASS_V(c->s, nir_lower_clip_vs, 23027ec681f3Smrg c->key->ucp_enables, false, false, NULL); 230301e04c3fSmrg NIR_PASS_V(c->s, nir_lower_io_to_scalar, 230401e04c3fSmrg nir_var_shader_out); 2305af69d88dSmrg } 2306af69d88dSmrg } 2307af69d88dSmrg 230801e04c3fSmrg /* FS input scalarizing must happen after nir_lower_two_sided_color, 230901e04c3fSmrg * which only handles a vec4 at a time. Similarly, VS output 231001e04c3fSmrg * scalarizing must happen after nir_lower_clip_vs. 
231101e04c3fSmrg */ 231201e04c3fSmrg if (c->stage == QSTAGE_FRAG) 231301e04c3fSmrg NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in); 231401e04c3fSmrg else 231501e04c3fSmrg NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out); 231601e04c3fSmrg 231701e04c3fSmrg NIR_PASS_V(c->s, vc4_nir_lower_io, c); 231801e04c3fSmrg NIR_PASS_V(c->s, vc4_nir_lower_txf_ms, c); 23197ec681f3Smrg nir_lower_idiv_options idiv_options = { 23207ec681f3Smrg .imprecise_32bit_lowering = true, 23217ec681f3Smrg .allow_fp16 = true, 23227ec681f3Smrg }; 23237ec681f3Smrg NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options); 232401e04c3fSmrg 232501e04c3fSmrg vc4_optimize_nir(c->s); 232601e04c3fSmrg 23277ec681f3Smrg /* Do late algebraic optimization to turn add(a, neg(b)) back into 23287ec681f3Smrg * subs, then the mandatory cleanup after algebraic. Note that it may 23297ec681f3Smrg * produce fnegs, and if so then we need to keep running to squash 23307ec681f3Smrg * fneg(fneg(a)). 23317ec681f3Smrg */ 23327ec681f3Smrg bool more_late_algebraic = true; 23337ec681f3Smrg while (more_late_algebraic) { 23347ec681f3Smrg more_late_algebraic = false; 23357ec681f3Smrg NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late); 23367ec681f3Smrg NIR_PASS_V(c->s, nir_opt_constant_folding); 23377ec681f3Smrg NIR_PASS_V(c->s, nir_copy_prop); 23387ec681f3Smrg NIR_PASS_V(c->s, nir_opt_dce); 23397ec681f3Smrg NIR_PASS_V(c->s, nir_opt_cse); 23407ec681f3Smrg } 23417ec681f3Smrg 23429f464c52Smaya NIR_PASS_V(c->s, nir_lower_bool_to_int32); 23439f464c52Smaya 234401e04c3fSmrg NIR_PASS_V(c->s, nir_convert_from_ssa, true); 234501e04c3fSmrg 234601e04c3fSmrg if (vc4_debug & VC4_DEBUG_SHADERDB) { 234701e04c3fSmrg fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d NIR instructions\n", 234801e04c3fSmrg qir_get_stage_name(c->stage), 234901e04c3fSmrg c->program_id, c->variant_id, 235001e04c3fSmrg count_nir_instrs(c->s)); 235101e04c3fSmrg } 235201e04c3fSmrg 235301e04c3fSmrg if (vc4_debug & VC4_DEBUG_NIR) { 235401e04c3fSmrg 
fprintf(stderr, "%s prog %d/%d NIR:\n", 235501e04c3fSmrg qir_get_stage_name(c->stage), 235601e04c3fSmrg c->program_id, c->variant_id); 235701e04c3fSmrg nir_print_shader(c->s, stderr); 235801e04c3fSmrg } 235901e04c3fSmrg 236001e04c3fSmrg nir_to_qir(c); 236101e04c3fSmrg 2362af69d88dSmrg switch (stage) { 2363af69d88dSmrg case QSTAGE_FRAG: 236401e04c3fSmrg /* FS threading requires that the thread execute 236501e04c3fSmrg * QPU_SIG_LAST_THREAD_SWITCH exactly once before terminating 236601e04c3fSmrg * (with no other THRSW afterwards, obviously). If we didn't 236701e04c3fSmrg * fetch a texture at a top level block, this wouldn't be 236801e04c3fSmrg * true. 236901e04c3fSmrg */ 237001e04c3fSmrg if (c->fs_threaded && !c->last_thrsw_at_top_level) { 237101e04c3fSmrg c->failed = true; 237201e04c3fSmrg return c; 237301e04c3fSmrg } 237401e04c3fSmrg 237501e04c3fSmrg emit_frag_end(c); 2376af69d88dSmrg break; 2377af69d88dSmrg case QSTAGE_VERT: 237801e04c3fSmrg emit_vert_end(c, 237901e04c3fSmrg c->vs_key->fs_inputs->input_slots, 238001e04c3fSmrg c->vs_key->fs_inputs->num_inputs); 2381af69d88dSmrg break; 2382af69d88dSmrg case QSTAGE_COORD: 238301e04c3fSmrg emit_coord_end(c); 2384af69d88dSmrg break; 2385af69d88dSmrg } 2386af69d88dSmrg 238701e04c3fSmrg if (vc4_debug & VC4_DEBUG_QIR) { 238801e04c3fSmrg fprintf(stderr, "%s prog %d/%d pre-opt QIR:\n", 238901e04c3fSmrg qir_get_stage_name(c->stage), 239001e04c3fSmrg c->program_id, c->variant_id); 239101e04c3fSmrg qir_dump(c); 239201e04c3fSmrg fprintf(stderr, "\n"); 239301e04c3fSmrg } 2394af69d88dSmrg 2395af69d88dSmrg qir_optimize(c); 239601e04c3fSmrg qir_lower_uniforms(c); 239701e04c3fSmrg 239801e04c3fSmrg qir_schedule_instructions(c); 239901e04c3fSmrg qir_emit_uniform_stream_resets(c); 2400af69d88dSmrg 2401af69d88dSmrg if (vc4_debug & VC4_DEBUG_QIR) { 240201e04c3fSmrg fprintf(stderr, "%s prog %d/%d QIR:\n", 240301e04c3fSmrg qir_get_stage_name(c->stage), 240401e04c3fSmrg c->program_id, c->variant_id); 2405af69d88dSmrg qir_dump(c); 
240601e04c3fSmrg fprintf(stderr, "\n"); 2407af69d88dSmrg } 240801e04c3fSmrg 240901e04c3fSmrg qir_reorder_uniforms(c); 241001e04c3fSmrg vc4_generate_code(vc4, c); 2411af69d88dSmrg 2412af69d88dSmrg if (vc4_debug & VC4_DEBUG_SHADERDB) { 241301e04c3fSmrg fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d instructions\n", 241401e04c3fSmrg qir_get_stage_name(c->stage), 241501e04c3fSmrg c->program_id, c->variant_id, 241601e04c3fSmrg c->qpu_inst_count); 241701e04c3fSmrg fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d uniforms\n", 241801e04c3fSmrg qir_get_stage_name(c->stage), 241901e04c3fSmrg c->program_id, c->variant_id, 242001e04c3fSmrg c->num_uniforms); 2421af69d88dSmrg } 2422af69d88dSmrg 242301e04c3fSmrg ralloc_free(c->s); 242401e04c3fSmrg 242501e04c3fSmrg return c; 2426af69d88dSmrg} 2427af69d88dSmrg 2428af69d88dSmrgstatic void * 2429af69d88dSmrgvc4_shader_state_create(struct pipe_context *pctx, 2430af69d88dSmrg const struct pipe_shader_state *cso) 2431af69d88dSmrg{ 243201e04c3fSmrg struct vc4_context *vc4 = vc4_context(pctx); 243301e04c3fSmrg struct vc4_uncompiled_shader *so = CALLOC_STRUCT(vc4_uncompiled_shader); 2434af69d88dSmrg if (!so) 2435af69d88dSmrg return NULL; 2436af69d88dSmrg 243701e04c3fSmrg so->program_id = vc4->next_uncompiled_program_id++; 243801e04c3fSmrg 243901e04c3fSmrg nir_shader *s; 244001e04c3fSmrg 244101e04c3fSmrg if (cso->type == PIPE_SHADER_IR_NIR) { 244201e04c3fSmrg /* The backend takes ownership of the NIR shader on state 244301e04c3fSmrg * creation. 
244401e04c3fSmrg */ 244501e04c3fSmrg s = cso->ir.nir; 244601e04c3fSmrg } else { 244701e04c3fSmrg assert(cso->type == PIPE_SHADER_IR_TGSI); 244801e04c3fSmrg 244901e04c3fSmrg if (vc4_debug & VC4_DEBUG_TGSI) { 245001e04c3fSmrg fprintf(stderr, "prog %d TGSI:\n", 245101e04c3fSmrg so->program_id); 245201e04c3fSmrg tgsi_dump(cso->tokens, 0); 245301e04c3fSmrg fprintf(stderr, "\n"); 245401e04c3fSmrg } 24557ec681f3Smrg s = tgsi_to_nir(cso->tokens, pctx->screen, false); 245601e04c3fSmrg } 245701e04c3fSmrg 24587ec681f3Smrg if (s->info.stage == MESA_SHADER_VERTEX) 24597ec681f3Smrg NIR_PASS_V(s, nir_lower_point_size, 1.0f, 0.0f); 24607ec681f3Smrg 24617ec681f3Smrg NIR_PASS_V(s, nir_lower_io, 24627ec681f3Smrg nir_var_shader_in | nir_var_shader_out | nir_var_uniform, 24637ec681f3Smrg type_size, (nir_lower_io_options)0); 246401e04c3fSmrg 246501e04c3fSmrg NIR_PASS_V(s, nir_lower_regs_to_ssa); 246601e04c3fSmrg NIR_PASS_V(s, nir_normalize_cubemap_coords); 246701e04c3fSmrg 246801e04c3fSmrg NIR_PASS_V(s, nir_lower_load_const_to_scalar); 246901e04c3fSmrg 247001e04c3fSmrg vc4_optimize_nir(s); 247101e04c3fSmrg 24727ec681f3Smrg NIR_PASS_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); 247301e04c3fSmrg 247401e04c3fSmrg /* Garbage collect dead instructions */ 247501e04c3fSmrg nir_sweep(s); 247601e04c3fSmrg 247701e04c3fSmrg so->base.type = PIPE_SHADER_IR_NIR; 247801e04c3fSmrg so->base.ir.nir = s; 247901e04c3fSmrg 248001e04c3fSmrg if (vc4_debug & VC4_DEBUG_NIR) { 248101e04c3fSmrg fprintf(stderr, "%s prog %d NIR:\n", 248201e04c3fSmrg gl_shader_stage_name(s->info.stage), 248301e04c3fSmrg so->program_id); 248401e04c3fSmrg nir_print_shader(s, stderr); 248501e04c3fSmrg fprintf(stderr, "\n"); 248601e04c3fSmrg } 2487af69d88dSmrg 2488af69d88dSmrg return so; 2489af69d88dSmrg} 2490af69d88dSmrg 2491af69d88dSmrgstatic void 2492af69d88dSmrgcopy_uniform_state_to_shader(struct vc4_compiled_shader *shader, 249301e04c3fSmrg struct vc4_compile *c) 2494af69d88dSmrg{ 249501e04c3fSmrg int count = 
c->num_uniforms; 249601e04c3fSmrg struct vc4_shader_uniform_info *uinfo = &shader->uniforms; 2497af69d88dSmrg 2498af69d88dSmrg uinfo->count = count; 249901e04c3fSmrg uinfo->data = ralloc_array(shader, uint32_t, count); 250001e04c3fSmrg memcpy(uinfo->data, c->uniform_data, 2501af69d88dSmrg count * sizeof(*uinfo->data)); 250201e04c3fSmrg uinfo->contents = ralloc_array(shader, enum quniform_contents, count); 250301e04c3fSmrg memcpy(uinfo->contents, c->uniform_contents, 2504af69d88dSmrg count * sizeof(*uinfo->contents)); 250501e04c3fSmrg uinfo->num_texture_samples = c->num_texture_samples; 250601e04c3fSmrg 250701e04c3fSmrg vc4_set_shader_uniform_dirty_flags(shader); 2508af69d88dSmrg} 2509af69d88dSmrg 2510af69d88dSmrgstatic void 251101e04c3fSmrgvc4_setup_compiled_fs_inputs(struct vc4_context *vc4, struct vc4_compile *c, 251201e04c3fSmrg struct vc4_compiled_shader *shader) 2513af69d88dSmrg{ 251401e04c3fSmrg struct vc4_fs_inputs inputs; 251501e04c3fSmrg 251601e04c3fSmrg memset(&inputs, 0, sizeof(inputs)); 251701e04c3fSmrg inputs.input_slots = ralloc_array(shader, 251801e04c3fSmrg struct vc4_varying_slot, 251901e04c3fSmrg c->num_input_slots); 252001e04c3fSmrg 252101e04c3fSmrg bool input_live[c->num_input_slots]; 252201e04c3fSmrg 252301e04c3fSmrg memset(input_live, 0, sizeof(input_live)); 252401e04c3fSmrg qir_for_each_inst_inorder(inst, c) { 252501e04c3fSmrg for (int i = 0; i < qir_get_nsrc(inst); i++) { 252601e04c3fSmrg if (inst->src[i].file == QFILE_VARY) 252701e04c3fSmrg input_live[inst->src[i].index] = true; 252801e04c3fSmrg } 252901e04c3fSmrg } 253001e04c3fSmrg 253101e04c3fSmrg for (int i = 0; i < c->num_input_slots; i++) { 253201e04c3fSmrg struct vc4_varying_slot *slot = &c->input_slots[i]; 253301e04c3fSmrg 253401e04c3fSmrg if (!input_live[i]) 253501e04c3fSmrg continue; 253601e04c3fSmrg 253701e04c3fSmrg /* Skip non-VS-output inputs. 
*/ 253801e04c3fSmrg if (slot->slot == (uint8_t)~0) 253901e04c3fSmrg continue; 254001e04c3fSmrg 254101e04c3fSmrg if (slot->slot == VARYING_SLOT_COL0 || 254201e04c3fSmrg slot->slot == VARYING_SLOT_COL1 || 254301e04c3fSmrg slot->slot == VARYING_SLOT_BFC0 || 254401e04c3fSmrg slot->slot == VARYING_SLOT_BFC1) { 254501e04c3fSmrg shader->color_inputs |= (1 << inputs.num_inputs); 254601e04c3fSmrg } 2547af69d88dSmrg 254801e04c3fSmrg inputs.input_slots[inputs.num_inputs] = *slot; 254901e04c3fSmrg inputs.num_inputs++; 255001e04c3fSmrg } 255101e04c3fSmrg shader->num_inputs = inputs.num_inputs; 255201e04c3fSmrg 255301e04c3fSmrg /* Add our set of inputs to the set of all inputs seen. This way, we 255401e04c3fSmrg * can have a single pointer that identifies an FS inputs set, 255501e04c3fSmrg * allowing VS to avoid recompiling when the FS is recompiled (or a 255601e04c3fSmrg * new one is bound using separate shader objects) but the inputs 255701e04c3fSmrg * don't change. 255801e04c3fSmrg */ 255901e04c3fSmrg struct set_entry *entry = _mesa_set_search(vc4->fs_inputs_set, &inputs); 256001e04c3fSmrg if (entry) { 256101e04c3fSmrg shader->fs_inputs = entry->key; 256201e04c3fSmrg ralloc_free(inputs.input_slots); 256301e04c3fSmrg } else { 256401e04c3fSmrg struct vc4_fs_inputs *alloc_inputs; 256501e04c3fSmrg 256601e04c3fSmrg alloc_inputs = rzalloc(vc4->fs_inputs_set, struct vc4_fs_inputs); 256701e04c3fSmrg memcpy(alloc_inputs, &inputs, sizeof(inputs)); 256801e04c3fSmrg ralloc_steal(alloc_inputs, inputs.input_slots); 256901e04c3fSmrg _mesa_set_add(vc4->fs_inputs_set, alloc_inputs); 257001e04c3fSmrg 257101e04c3fSmrg shader->fs_inputs = alloc_inputs; 257201e04c3fSmrg } 2573af69d88dSmrg} 2574af69d88dSmrg 257501e04c3fSmrgstatic struct vc4_compiled_shader * 257601e04c3fSmrgvc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage, 257701e04c3fSmrg struct vc4_key *key) 2578af69d88dSmrg{ 257901e04c3fSmrg struct hash_table *ht; 258001e04c3fSmrg uint32_t key_size; 258101e04c3fSmrg bool 
try_threading; 258201e04c3fSmrg 258301e04c3fSmrg if (stage == QSTAGE_FRAG) { 258401e04c3fSmrg ht = vc4->fs_cache; 258501e04c3fSmrg key_size = sizeof(struct vc4_fs_key); 258601e04c3fSmrg try_threading = vc4->screen->has_threaded_fs; 258701e04c3fSmrg } else { 258801e04c3fSmrg ht = vc4->vs_cache; 258901e04c3fSmrg key_size = sizeof(struct vc4_vs_key); 259001e04c3fSmrg try_threading = false; 259101e04c3fSmrg } 259201e04c3fSmrg 259301e04c3fSmrg struct vc4_compiled_shader *shader; 259401e04c3fSmrg struct hash_entry *entry = _mesa_hash_table_search(ht, key); 259501e04c3fSmrg if (entry) 259601e04c3fSmrg return entry->data; 259701e04c3fSmrg 259801e04c3fSmrg struct vc4_compile *c = vc4_shader_ntq(vc4, stage, key, try_threading); 259901e04c3fSmrg /* If the FS failed to compile threaded, fall back to single threaded. */ 260001e04c3fSmrg if (try_threading && c->failed) { 260101e04c3fSmrg qir_compile_destroy(c); 260201e04c3fSmrg c = vc4_shader_ntq(vc4, stage, key, false); 260301e04c3fSmrg } 260401e04c3fSmrg 260501e04c3fSmrg shader = rzalloc(NULL, struct vc4_compiled_shader); 260601e04c3fSmrg 260701e04c3fSmrg shader->program_id = vc4->next_compiled_program_id++; 260801e04c3fSmrg if (stage == QSTAGE_FRAG) { 260901e04c3fSmrg vc4_setup_compiled_fs_inputs(vc4, c, shader); 261001e04c3fSmrg 261101e04c3fSmrg /* Note: the temporary clone in c->s has been freed. 
*/ 261201e04c3fSmrg nir_shader *orig_shader = key->shader_state->base.ir.nir; 261301e04c3fSmrg if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH)) 261401e04c3fSmrg shader->disable_early_z = true; 261501e04c3fSmrg } else { 261601e04c3fSmrg shader->num_inputs = c->num_inputs; 261701e04c3fSmrg 261801e04c3fSmrg shader->vattr_offsets[0] = 0; 261901e04c3fSmrg for (int i = 0; i < 8; i++) { 262001e04c3fSmrg shader->vattr_offsets[i + 1] = 262101e04c3fSmrg shader->vattr_offsets[i] + c->vattr_sizes[i]; 262201e04c3fSmrg 262301e04c3fSmrg if (c->vattr_sizes[i]) 262401e04c3fSmrg shader->vattrs_live |= (1 << i); 262501e04c3fSmrg } 262601e04c3fSmrg } 262701e04c3fSmrg 262801e04c3fSmrg shader->failed = c->failed; 262901e04c3fSmrg if (c->failed) { 263001e04c3fSmrg shader->failed = true; 263101e04c3fSmrg } else { 263201e04c3fSmrg copy_uniform_state_to_shader(shader, c); 263301e04c3fSmrg shader->bo = vc4_bo_alloc_shader(vc4->screen, c->qpu_insts, 263401e04c3fSmrg c->qpu_inst_count * 263501e04c3fSmrg sizeof(uint64_t)); 263601e04c3fSmrg } 2637af69d88dSmrg 263801e04c3fSmrg shader->fs_threaded = c->fs_threaded; 263901e04c3fSmrg 264001e04c3fSmrg if ((vc4_debug & VC4_DEBUG_SHADERDB) && stage == QSTAGE_FRAG) { 264101e04c3fSmrg fprintf(stderr, "SHADER-DB: %s prog %d/%d: %d FS threads\n", 264201e04c3fSmrg qir_get_stage_name(c->stage), 264301e04c3fSmrg c->program_id, c->variant_id, 264401e04c3fSmrg 1 + shader->fs_threaded); 264501e04c3fSmrg } 2646af69d88dSmrg 264701e04c3fSmrg qir_compile_destroy(c); 2648af69d88dSmrg 264901e04c3fSmrg struct vc4_key *dup_key; 265001e04c3fSmrg dup_key = rzalloc_size(shader, key_size); /* TODO: don't use rzalloc */ 265101e04c3fSmrg memcpy(dup_key, key, key_size); 265201e04c3fSmrg _mesa_hash_table_insert(ht, dup_key, shader); 2653af69d88dSmrg 265401e04c3fSmrg return shader; 2655af69d88dSmrg} 2656af69d88dSmrg 2657af69d88dSmrgstatic void 265801e04c3fSmrgvc4_setup_shared_key(struct vc4_context *vc4, struct vc4_key *key, 265901e04c3fSmrg struct 
vc4_texture_stateobj *texstate) 2660af69d88dSmrg{ 2661af69d88dSmrg for (int i = 0; i < texstate->num_textures; i++) { 2662af69d88dSmrg struct pipe_sampler_view *sampler = texstate->textures[i]; 266301e04c3fSmrg struct vc4_sampler_view *vc4_sampler = vc4_sampler_view(sampler); 266401e04c3fSmrg struct pipe_sampler_state *sampler_state = 266501e04c3fSmrg texstate->samplers[i]; 266601e04c3fSmrg 266701e04c3fSmrg if (!sampler) 266801e04c3fSmrg continue; 266901e04c3fSmrg 267001e04c3fSmrg key->tex[i].format = sampler->format; 267101e04c3fSmrg key->tex[i].swizzle[0] = sampler->swizzle_r; 267201e04c3fSmrg key->tex[i].swizzle[1] = sampler->swizzle_g; 267301e04c3fSmrg key->tex[i].swizzle[2] = sampler->swizzle_b; 267401e04c3fSmrg key->tex[i].swizzle[3] = sampler->swizzle_a; 267501e04c3fSmrg 267601e04c3fSmrg if (sampler->texture->nr_samples > 1) { 267701e04c3fSmrg key->tex[i].msaa_width = sampler->texture->width0; 267801e04c3fSmrg key->tex[i].msaa_height = sampler->texture->height0; 267901e04c3fSmrg } else if (sampler){ 268001e04c3fSmrg key->tex[i].compare_mode = sampler_state->compare_mode; 268101e04c3fSmrg key->tex[i].compare_func = sampler_state->compare_func; 268201e04c3fSmrg key->tex[i].wrap_s = sampler_state->wrap_s; 268301e04c3fSmrg key->tex[i].wrap_t = sampler_state->wrap_t; 268401e04c3fSmrg key->tex[i].force_first_level = 268501e04c3fSmrg vc4_sampler->force_first_level; 2686af69d88dSmrg } 2687af69d88dSmrg } 268801e04c3fSmrg 268901e04c3fSmrg key->ucp_enables = vc4->rasterizer->base.clip_plane_enable; 2690af69d88dSmrg} 2691af69d88dSmrg 2692af69d88dSmrgstatic void 2693af69d88dSmrgvc4_update_compiled_fs(struct vc4_context *vc4, uint8_t prim_mode) 2694af69d88dSmrg{ 269501e04c3fSmrg struct vc4_job *job = vc4->job; 2696af69d88dSmrg struct vc4_fs_key local_key; 2697af69d88dSmrg struct vc4_fs_key *key = &local_key; 2698af69d88dSmrg 269901e04c3fSmrg if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 270001e04c3fSmrg VC4_DIRTY_BLEND | 270101e04c3fSmrg VC4_DIRTY_FRAMEBUFFER | 
270201e04c3fSmrg VC4_DIRTY_ZSA | 270301e04c3fSmrg VC4_DIRTY_RASTERIZER | 270401e04c3fSmrg VC4_DIRTY_SAMPLE_MASK | 270501e04c3fSmrg VC4_DIRTY_FRAGTEX | 270601e04c3fSmrg VC4_DIRTY_UNCOMPILED_FS | 270701e04c3fSmrg VC4_DIRTY_UBO_1_SIZE))) { 270801e04c3fSmrg return; 270901e04c3fSmrg } 271001e04c3fSmrg 2711af69d88dSmrg memset(key, 0, sizeof(*key)); 271201e04c3fSmrg vc4_setup_shared_key(vc4, &key->base, &vc4->fragtex); 2713af69d88dSmrg key->base.shader_state = vc4->prog.bind_fs; 2714af69d88dSmrg key->is_points = (prim_mode == PIPE_PRIM_POINTS); 2715af69d88dSmrg key->is_lines = (prim_mode >= PIPE_PRIM_LINES && 2716af69d88dSmrg prim_mode <= PIPE_PRIM_LINE_STRIP); 2717af69d88dSmrg key->blend = vc4->blend->rt[0]; 271801e04c3fSmrg if (vc4->blend->logicop_enable) { 271901e04c3fSmrg key->logicop_func = vc4->blend->logicop_func; 272001e04c3fSmrg } else { 272101e04c3fSmrg key->logicop_func = PIPE_LOGICOP_COPY; 272201e04c3fSmrg } 272301e04c3fSmrg if (job->msaa) { 272401e04c3fSmrg key->msaa = vc4->rasterizer->base.multisample; 272501e04c3fSmrg key->sample_coverage = (vc4->sample_mask != (1 << VC4_MAX_SAMPLES) - 1); 272601e04c3fSmrg key->sample_alpha_to_coverage = vc4->blend->alpha_to_coverage; 272701e04c3fSmrg key->sample_alpha_to_one = vc4->blend->alpha_to_one; 272801e04c3fSmrg } 2729af69d88dSmrg 2730af69d88dSmrg if (vc4->framebuffer.cbufs[0]) 2731af69d88dSmrg key->color_format = vc4->framebuffer.cbufs[0]->format; 2732af69d88dSmrg 273301e04c3fSmrg key->stencil_enabled = vc4->zsa->stencil_uniforms[0] != 0; 273401e04c3fSmrg key->stencil_twoside = vc4->zsa->stencil_uniforms[1] != 0; 273501e04c3fSmrg key->stencil_full_writemasks = vc4->zsa->stencil_uniforms[2] != 0; 27367ec681f3Smrg key->depth_enabled = (vc4->zsa->base.depth_enabled || 273701e04c3fSmrg key->stencil_enabled); 273801e04c3fSmrg 273901e04c3fSmrg if (key->is_points) { 274001e04c3fSmrg key->point_sprite_mask = 274101e04c3fSmrg vc4->rasterizer->base.sprite_coord_enable; 274201e04c3fSmrg key->point_coord_upper_left = 
274301e04c3fSmrg (vc4->rasterizer->base.sprite_coord_mode == 274401e04c3fSmrg PIPE_SPRITE_COORD_UPPER_LEFT); 274501e04c3fSmrg } 274601e04c3fSmrg 274701e04c3fSmrg key->ubo_1_size = vc4->constbuf[PIPE_SHADER_FRAGMENT].cb[1].buffer_size; 2748af69d88dSmrg 274901e04c3fSmrg struct vc4_compiled_shader *old_fs = vc4->prog.fs; 275001e04c3fSmrg vc4->prog.fs = vc4_get_compiled_shader(vc4, QSTAGE_FRAG, &key->base); 275101e04c3fSmrg if (vc4->prog.fs == old_fs) 2752af69d88dSmrg return; 2753af69d88dSmrg 275401e04c3fSmrg vc4->dirty |= VC4_DIRTY_COMPILED_FS; 2755af69d88dSmrg 275601e04c3fSmrg if (vc4->rasterizer->base.flatshade && 275701e04c3fSmrg (!old_fs || vc4->prog.fs->color_inputs != old_fs->color_inputs)) { 275801e04c3fSmrg vc4->dirty |= VC4_DIRTY_FLAT_SHADE_FLAGS; 275901e04c3fSmrg } 2760af69d88dSmrg 276101e04c3fSmrg if (!old_fs || vc4->prog.fs->fs_inputs != old_fs->fs_inputs) 276201e04c3fSmrg vc4->dirty |= VC4_DIRTY_FS_INPUTS; 2763af69d88dSmrg} 2764af69d88dSmrg 2765af69d88dSmrgstatic void 276601e04c3fSmrgvc4_update_compiled_vs(struct vc4_context *vc4, uint8_t prim_mode) 2767af69d88dSmrg{ 2768af69d88dSmrg struct vc4_vs_key local_key; 2769af69d88dSmrg struct vc4_vs_key *key = &local_key; 2770af69d88dSmrg 277101e04c3fSmrg if (!(vc4->dirty & (VC4_DIRTY_PRIM_MODE | 277201e04c3fSmrg VC4_DIRTY_RASTERIZER | 277301e04c3fSmrg VC4_DIRTY_VERTTEX | 277401e04c3fSmrg VC4_DIRTY_VTXSTATE | 277501e04c3fSmrg VC4_DIRTY_UNCOMPILED_VS | 277601e04c3fSmrg VC4_DIRTY_FS_INPUTS))) { 277701e04c3fSmrg return; 277801e04c3fSmrg } 277901e04c3fSmrg 2780af69d88dSmrg memset(key, 0, sizeof(*key)); 278101e04c3fSmrg vc4_setup_shared_key(vc4, &key->base, &vc4->verttex); 2782af69d88dSmrg key->base.shader_state = vc4->prog.bind_vs; 278301e04c3fSmrg key->fs_inputs = vc4->prog.fs->fs_inputs; 2784af69d88dSmrg 2785af69d88dSmrg for (int i = 0; i < ARRAY_SIZE(key->attr_formats); i++) 2786af69d88dSmrg key->attr_formats[i] = vc4->vtx->pipe[i].src_format; 2787af69d88dSmrg 278801e04c3fSmrg key->per_vertex_point_size = 
278901e04c3fSmrg (prim_mode == PIPE_PRIM_POINTS && 279001e04c3fSmrg vc4->rasterizer->base.point_size_per_vertex); 2791af69d88dSmrg 279201e04c3fSmrg struct vc4_compiled_shader *vs = 279301e04c3fSmrg vc4_get_compiled_shader(vc4, QSTAGE_VERT, &key->base); 279401e04c3fSmrg if (vs != vc4->prog.vs) { 279501e04c3fSmrg vc4->prog.vs = vs; 279601e04c3fSmrg vc4->dirty |= VC4_DIRTY_COMPILED_VS; 279701e04c3fSmrg } 2798af69d88dSmrg 279901e04c3fSmrg key->is_coord = true; 280001e04c3fSmrg /* Coord shaders don't care what the FS inputs are. */ 280101e04c3fSmrg key->fs_inputs = NULL; 280201e04c3fSmrg struct vc4_compiled_shader *cs = 280301e04c3fSmrg vc4_get_compiled_shader(vc4, QSTAGE_COORD, &key->base); 280401e04c3fSmrg if (cs != vc4->prog.cs) { 280501e04c3fSmrg vc4->prog.cs = cs; 280601e04c3fSmrg vc4->dirty |= VC4_DIRTY_COMPILED_CS; 280701e04c3fSmrg } 2808af69d88dSmrg} 2809af69d88dSmrg 281001e04c3fSmrgbool 2811af69d88dSmrgvc4_update_compiled_shaders(struct vc4_context *vc4, uint8_t prim_mode) 2812af69d88dSmrg{ 2813af69d88dSmrg vc4_update_compiled_fs(vc4, prim_mode); 281401e04c3fSmrg vc4_update_compiled_vs(vc4, prim_mode); 281501e04c3fSmrg 281601e04c3fSmrg return !(vc4->prog.cs->failed || 281701e04c3fSmrg vc4->prog.vs->failed || 281801e04c3fSmrg vc4->prog.fs->failed); 2819af69d88dSmrg} 2820af69d88dSmrg 282101e04c3fSmrgstatic uint32_t 282201e04c3fSmrgfs_cache_hash(const void *key) 2823af69d88dSmrg{ 282401e04c3fSmrg return _mesa_hash_data(key, sizeof(struct vc4_fs_key)); 2825af69d88dSmrg} 2826af69d88dSmrg 282701e04c3fSmrgstatic uint32_t 282801e04c3fSmrgvs_cache_hash(const void *key) 2829af69d88dSmrg{ 283001e04c3fSmrg return _mesa_hash_data(key, sizeof(struct vc4_vs_key)); 2831af69d88dSmrg} 2832af69d88dSmrg 283301e04c3fSmrgstatic bool 283401e04c3fSmrgfs_cache_compare(const void *key1, const void *key2) 2835af69d88dSmrg{ 283601e04c3fSmrg return memcmp(key1, key2, sizeof(struct vc4_fs_key)) == 0; 2837af69d88dSmrg} 2838af69d88dSmrg 283901e04c3fSmrgstatic bool 
284001e04c3fSmrgvs_cache_compare(const void *key1, const void *key2) 2841af69d88dSmrg{ 284201e04c3fSmrg return memcmp(key1, key2, sizeof(struct vc4_vs_key)) == 0; 2843af69d88dSmrg} 2844af69d88dSmrg 284501e04c3fSmrgstatic uint32_t 284601e04c3fSmrgfs_inputs_hash(const void *key) 2847af69d88dSmrg{ 284801e04c3fSmrg const struct vc4_fs_inputs *inputs = key; 2849af69d88dSmrg 285001e04c3fSmrg return _mesa_hash_data(inputs->input_slots, 285101e04c3fSmrg sizeof(*inputs->input_slots) * 285201e04c3fSmrg inputs->num_inputs); 2853af69d88dSmrg} 2854af69d88dSmrg 285501e04c3fSmrgstatic bool 285601e04c3fSmrgfs_inputs_compare(const void *key1, const void *key2) 2857af69d88dSmrg{ 285801e04c3fSmrg const struct vc4_fs_inputs *inputs1 = key1; 285901e04c3fSmrg const struct vc4_fs_inputs *inputs2 = key2; 286001e04c3fSmrg 286101e04c3fSmrg return (inputs1->num_inputs == inputs2->num_inputs && 286201e04c3fSmrg memcmp(inputs1->input_slots, 286301e04c3fSmrg inputs2->input_slots, 286401e04c3fSmrg sizeof(*inputs1->input_slots) * 286501e04c3fSmrg inputs1->num_inputs) == 0); 2866af69d88dSmrg} 2867af69d88dSmrg 2868af69d88dSmrgstatic void 286901e04c3fSmrgdelete_from_cache_if_matches(struct hash_table *ht, 287001e04c3fSmrg struct vc4_compiled_shader **last_compile, 287101e04c3fSmrg struct hash_entry *entry, 287201e04c3fSmrg struct vc4_uncompiled_shader *so) 2873af69d88dSmrg{ 287401e04c3fSmrg const struct vc4_key *key = entry->key; 2875af69d88dSmrg 287601e04c3fSmrg if (key->shader_state == so) { 287701e04c3fSmrg struct vc4_compiled_shader *shader = entry->data; 287801e04c3fSmrg _mesa_hash_table_remove(ht, entry); 287901e04c3fSmrg vc4_bo_unreference(&shader->bo); 2880af69d88dSmrg 288101e04c3fSmrg if (shader == *last_compile) 288201e04c3fSmrg *last_compile = NULL; 2883af69d88dSmrg 288401e04c3fSmrg ralloc_free(shader); 2885af69d88dSmrg } 2886af69d88dSmrg} 2887af69d88dSmrg 2888af69d88dSmrgstatic void 288901e04c3fSmrgvc4_shader_state_delete(struct pipe_context *pctx, void *hwcso) 2890af69d88dSmrg{ 
289101e04c3fSmrg struct vc4_context *vc4 = vc4_context(pctx); 289201e04c3fSmrg struct vc4_uncompiled_shader *so = hwcso; 2893af69d88dSmrg 289401e04c3fSmrg hash_table_foreach(vc4->fs_cache, entry) { 289501e04c3fSmrg delete_from_cache_if_matches(vc4->fs_cache, &vc4->prog.fs, 289601e04c3fSmrg entry, so); 2897af69d88dSmrg } 289801e04c3fSmrg hash_table_foreach(vc4->vs_cache, entry) { 289901e04c3fSmrg delete_from_cache_if_matches(vc4->vs_cache, &vc4->prog.vs, 290001e04c3fSmrg entry, so); 290101e04c3fSmrg } 290201e04c3fSmrg 290301e04c3fSmrg ralloc_free(so->base.ir.nir); 290401e04c3fSmrg free(so); 2905af69d88dSmrg} 2906af69d88dSmrg 2907af69d88dSmrgstatic void 2908af69d88dSmrgvc4_fp_state_bind(struct pipe_context *pctx, void *hwcso) 2909af69d88dSmrg{ 2910af69d88dSmrg struct vc4_context *vc4 = vc4_context(pctx); 2911af69d88dSmrg vc4->prog.bind_fs = hwcso; 291201e04c3fSmrg vc4->dirty |= VC4_DIRTY_UNCOMPILED_FS; 2913af69d88dSmrg} 2914af69d88dSmrg 2915af69d88dSmrgstatic void 2916af69d88dSmrgvc4_vp_state_bind(struct pipe_context *pctx, void *hwcso) 2917af69d88dSmrg{ 2918af69d88dSmrg struct vc4_context *vc4 = vc4_context(pctx); 2919af69d88dSmrg vc4->prog.bind_vs = hwcso; 292001e04c3fSmrg vc4->dirty |= VC4_DIRTY_UNCOMPILED_VS; 2921af69d88dSmrg} 2922af69d88dSmrg 2923af69d88dSmrgvoid 2924af69d88dSmrgvc4_program_init(struct pipe_context *pctx) 2925af69d88dSmrg{ 2926af69d88dSmrg struct vc4_context *vc4 = vc4_context(pctx); 2927af69d88dSmrg 2928af69d88dSmrg pctx->create_vs_state = vc4_shader_state_create; 2929af69d88dSmrg pctx->delete_vs_state = vc4_shader_state_delete; 2930af69d88dSmrg 2931af69d88dSmrg pctx->create_fs_state = vc4_shader_state_create; 2932af69d88dSmrg pctx->delete_fs_state = vc4_shader_state_delete; 2933af69d88dSmrg 2934af69d88dSmrg pctx->bind_fs_state = vc4_fp_state_bind; 2935af69d88dSmrg pctx->bind_vs_state = vc4_vp_state_bind; 2936af69d88dSmrg 293701e04c3fSmrg vc4->fs_cache = _mesa_hash_table_create(pctx, fs_cache_hash, 293801e04c3fSmrg fs_cache_compare); 
293901e04c3fSmrg vc4->vs_cache = _mesa_hash_table_create(pctx, vs_cache_hash, 294001e04c3fSmrg vs_cache_compare); 294101e04c3fSmrg vc4->fs_inputs_set = _mesa_set_create(pctx, fs_inputs_hash, 294201e04c3fSmrg fs_inputs_compare); 294301e04c3fSmrg} 294401e04c3fSmrg 294501e04c3fSmrgvoid 294601e04c3fSmrgvc4_program_fini(struct pipe_context *pctx) 294701e04c3fSmrg{ 294801e04c3fSmrg struct vc4_context *vc4 = vc4_context(pctx); 294901e04c3fSmrg 295001e04c3fSmrg hash_table_foreach(vc4->fs_cache, entry) { 295101e04c3fSmrg struct vc4_compiled_shader *shader = entry->data; 295201e04c3fSmrg vc4_bo_unreference(&shader->bo); 295301e04c3fSmrg ralloc_free(shader); 295401e04c3fSmrg _mesa_hash_table_remove(vc4->fs_cache, entry); 295501e04c3fSmrg } 295601e04c3fSmrg 295701e04c3fSmrg hash_table_foreach(vc4->vs_cache, entry) { 295801e04c3fSmrg struct vc4_compiled_shader *shader = entry->data; 295901e04c3fSmrg vc4_bo_unreference(&shader->bo); 296001e04c3fSmrg ralloc_free(shader); 296101e04c3fSmrg _mesa_hash_table_remove(vc4->vs_cache, entry); 296201e04c3fSmrg } 2963af69d88dSmrg} 2964