/*
 * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include <stdarg.h>

#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_math.h"

#include "ir3_compiler.h"
#include "ir3_image.h"
#include "ir3_shader.h"
#include "ir3_nir.h"

#include "instr-a3xx.h"
#include "ir3.h"
#include "ir3_context.h"


static struct ir3_instruction *
create_indirect_load(struct ir3_context *ctx, unsigned arrsz, int n,
		struct ir3_instruction *address, struct ir3_instruction *collect)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *mov;
	struct ir3_register *src;

	mov = ir3_instr_create(block, OPC_MOV);
	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;
	ir3_reg_create(mov, 0, 0);
	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
	src->instr = collect;
	src->size = arrsz;
	src->array.offset = n;

	ir3_instr_set_address(mov, address);

	return mov;
}

static struct ir3_instruction *
create_input_compmask(struct ir3_context *ctx, unsigned n, unsigned compmask)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(ctx->in_block, OPC_META_INPUT);
	in->inout.block = ctx->in_block;
	ir3_reg_create(in, n, 0);

	in->regs[0]->wrmask = compmask;

	return in;
}

static struct ir3_instruction *
create_input(struct ir3_context *ctx, unsigned n)
{
	return create_input_compmask(ctx, n, 0x1);
}

static struct ir3_instruction *
create_frag_input(struct ir3_context *ctx, bool use_ldlv, unsigned n)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *instr;
	/* packed inloc is fixed up later: */
	struct ir3_instruction *inloc = create_immed(block, n);

	if (use_ldlv) {
		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
		instr->cat6.type = TYPE_U32;
		instr->cat6.iim_val = 1;
	} else {
		instr = ir3_BARY_F(block, inloc, 0, ctx->ij_pixel, 0);
		instr->regs[2]->wrmask = 0x3;
	}

	return instr;
}

static struct ir3_instruction *
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
{
	/* first four vec4 sysval's reserved for UBOs: */
	/* NOTE: dp is in scalar, but there can be >4 dp components: */
	unsigned n = ctx->so->constbase.driver_param;
	unsigned r = regid(n + dp / 4, dp % 4);
	return create_uniform(ctx->block, r);
}

/*
 * Adreno uses uint rather than having a dedicated bool type,
 * which (potentially) requires some conversion, in particular
 * when using the output of a bool instr as an int input, or
 * vice versa.
 *
 *         | Adreno  |  NIR  |
 *  -------+---------+-------+-
 *   true  |    1    |  ~0   |
 *   false |    0    |   0   |
 *
 * To convert from an adreno bool (uint) to nir, use:
 *
 *    absneg.s dst, (neg)src
 *
 * To convert back in the other direction:
 *
 *    absneg.s dst, (abs)src
 *
 * The CP step can clean up the absneg.s that cancel each other
 * out, and with a slight bit of extra cleverness (to recognize
 * the instructions which produce either a 0 or 1) can eliminate
 * the absneg.s's completely when an instruction that wants
 * 0/1 consumes the result.  For example, when a nir 'bcsel'
 * consumes the result of 'feq'.  So we should be able to get by
 * without a boolean resolve step, and without incurring any
 * extra penalty in instruction count.
 */

/* NIR bool -> native (adreno): */
static struct ir3_instruction *
ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
{
	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
}

/* native (adreno) -> NIR bool: */
static struct ir3_instruction *
ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
{
	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
}
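
/* A rough worked example of the two conversions above (values shown
 * as signed 32b, illustrative only):
 *
 *    NIR true (~0 == -1)  --ir3_b2n-->  absneg.s (abs): |-1| = 1
 *    adreno true (1)      --ir3_n2b-->  absneg.s (neg): -(1) = -1 (~0)
 *    false (0)            --either-->   0 (unchanged)
 */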

/*
 * alu/sfu instructions:
 */

static struct ir3_instruction *
create_cov(struct ir3_context *ctx, struct ir3_instruction *src,
		unsigned src_bitsize, nir_op op)
{
	type_t src_type, dst_type;

	switch (op) {
	case nir_op_f2f32:
	case nir_op_f2f16_rtne:
	case nir_op_f2f16_rtz:
	case nir_op_f2f16:
	case nir_op_f2i32:
	case nir_op_f2i16:
	case nir_op_f2i8:
	case nir_op_f2u32:
	case nir_op_f2u16:
	case nir_op_f2u8:
		switch (src_bitsize) {
		case 32:
			src_type = TYPE_F32;
			break;
		case 16:
			src_type = TYPE_F16;
			break;
		default:
			ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
		}
		break;

	case nir_op_i2f32:
	case nir_op_i2f16:
	case nir_op_i2i32:
	case nir_op_i2i16:
	case nir_op_i2i8:
		switch (src_bitsize) {
		case 32:
			src_type = TYPE_S32;
			break;
		case 16:
			src_type = TYPE_S16;
			break;
		case 8:
			src_type = TYPE_S8;
			break;
		default:
			ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
		}
		break;

	case nir_op_u2f32:
	case nir_op_u2f16:
	case nir_op_u2u32:
	case nir_op_u2u16:
	case nir_op_u2u8:
		switch (src_bitsize) {
		case 32:
			src_type = TYPE_U32;
			break;
		case 16:
			src_type = TYPE_U16;
			break;
		case 8:
			src_type = TYPE_U8;
			break;
		default:
			ir3_context_error(ctx, "invalid src bit size: %u", src_bitsize);
		}
		break;

	default:
		ir3_context_error(ctx, "invalid conversion op: %u", op);
	}

	switch (op) {
	case nir_op_f2f32:
	case nir_op_i2f32:
	case nir_op_u2f32:
		dst_type = TYPE_F32;
		break;

	case nir_op_f2f16_rtne:
	case nir_op_f2f16_rtz:
	case nir_op_f2f16:
		/* TODO how to handle rounding mode? */
	case nir_op_i2f16:
	case nir_op_u2f16:
		dst_type = TYPE_F16;
		break;

	case nir_op_f2i32:
	case nir_op_i2i32:
		dst_type = TYPE_S32;
		break;

	case nir_op_f2i16:
	case nir_op_i2i16:
		dst_type = TYPE_S16;
		break;

	case nir_op_f2i8:
	case nir_op_i2i8:
		dst_type = TYPE_S8;
		break;

	case nir_op_f2u32:
	case nir_op_u2u32:
		dst_type = TYPE_U32;
		break;

	case nir_op_f2u16:
	case nir_op_u2u16:
		dst_type = TYPE_U16;
		break;

	case nir_op_f2u8:
	case nir_op_u2u8:
		dst_type = TYPE_U8;
		break;

	default:
		ir3_context_error(ctx, "invalid conversion op: %u", op);
	}

	return ir3_COV(ctx->block, src, src_type, dst_type);
}
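
/* For instance (illustrative, exact disasm syntax may differ): a
 * nir_op_f2u16 of a 32b source resolves to src_type=TYPE_F32 and
 * dst_type=TYPE_U16, i.e. a single "cov.f32u16 dst, src" instruction.
 */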

static void
emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
{
	const nir_op_info *info = &nir_op_infos[alu->op];
	struct ir3_instruction **dst, *src[info->num_inputs];
	unsigned bs[info->num_inputs];     /* bit size */
	struct ir3_block *b = ctx->block;
	unsigned dst_sz, wrmask;

	if (alu->dest.dest.is_ssa) {
		dst_sz = alu->dest.dest.ssa.num_components;
		wrmask = (1 << dst_sz) - 1;
	} else {
		dst_sz = alu->dest.dest.reg.reg->num_components;
		wrmask = alu->dest.write_mask;
	}

	dst = ir3_get_dst(ctx, &alu->dest.dest, dst_sz);

	/* Vectors are special in that they have non-scalarized writemasks,
	 * and just take the first swizzle channel for each argument in
	 * order into each writemask channel.
	 */
	if ((alu->op == nir_op_vec2) ||
			(alu->op == nir_op_vec3) ||
			(alu->op == nir_op_vec4)) {

		for (int i = 0; i < info->num_inputs; i++) {
			nir_alu_src *asrc = &alu->src[i];

			compile_assert(ctx, !asrc->abs);
			compile_assert(ctx, !asrc->negate);

			src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[0]];
			if (!src[i])
				src[i] = create_immed(ctx->block, 0);
			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
		}

		ir3_put_dst(ctx, &alu->dest.dest);
		return;
	}
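
	/* E.g. (illustrative) a "vec4 dst, a.y, b.x, c.x, d.z" becomes four
	 * scalar movs, one per writemask channel, each taking only the
	 * first swizzle component of the corresponding argument:
	 *
	 *    mov dst.x, a.y
	 *    mov dst.y, b.x
	 *    mov dst.z, c.x
	 *    mov dst.w, d.z
	 */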

	/* We also get mov's with more than one component, so handle
	 * those specially:
	 */
	if ((alu->op == nir_op_imov) || (alu->op == nir_op_fmov)) {
		type_t type = (alu->op == nir_op_imov) ? TYPE_U32 : TYPE_F32;
		nir_alu_src *asrc = &alu->src[0];
		struct ir3_instruction *const *src0 = ir3_get_src(ctx, &asrc->src);

		for (unsigned i = 0; i < dst_sz; i++) {
			if (wrmask & (1 << i)) {
				dst[i] = ir3_MOV(b, src0[asrc->swizzle[i]], type);
			} else {
				dst[i] = NULL;
			}
		}

		ir3_put_dst(ctx, &alu->dest.dest);
		return;
	}

	/* General case: We can just grab the one used channel per src. */
	for (int i = 0; i < info->num_inputs; i++) {
		unsigned chan = ffs(alu->dest.write_mask) - 1;
		nir_alu_src *asrc = &alu->src[i];

		compile_assert(ctx, !asrc->abs);
		compile_assert(ctx, !asrc->negate);

		src[i] = ir3_get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
		bs[i] = nir_src_bit_size(asrc->src);

		compile_assert(ctx, src[i]);
	}

	switch (alu->op) {
	case nir_op_f2f32:
	case nir_op_f2f16_rtne:
	case nir_op_f2f16_rtz:
	case nir_op_f2f16:
	case nir_op_f2i32:
	case nir_op_f2i16:
	case nir_op_f2i8:
	case nir_op_f2u32:
	case nir_op_f2u16:
	case nir_op_f2u8:
	case nir_op_i2f32:
	case nir_op_i2f16:
	case nir_op_i2i32:
	case nir_op_i2i16:
	case nir_op_i2i8:
	case nir_op_u2f32:
	case nir_op_u2f16:
	case nir_op_u2u32:
	case nir_op_u2u16:
	case nir_op_u2u8:
		dst[0] = create_cov(ctx, src[0], bs[0], alu->op);
		break;
	case nir_op_f2b32:
		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
		dst[0]->cat2.condition = IR3_COND_NE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_b2f16:
	case nir_op_b2f32:
		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
		break;
	case nir_op_b2i8:
	case nir_op_b2i16:
	case nir_op_b2i32:
		dst[0] = ir3_b2n(b, src[0]);
		break;
	case nir_op_i2b32:
		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
		dst[0]->cat2.condition = IR3_COND_NE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;

	case nir_op_fneg:
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
		break;
	case nir_op_fabs:
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
		break;
	case nir_op_fmax:
		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
		break;
	case nir_op_fmin:
		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
		break;
	case nir_op_fsat:
		/* if there is just a single use of the src, and it supports
		 * (sat) bit, we can just fold the (sat) flag back to the
		 * src instruction and create a mov.  This is easier for cp
		 * to eliminate.
		 *
		 * TODO probably opc_cat==4 is ok too
		 */
		if (alu->src[0].src.is_ssa &&
				(list_length(&alu->src[0].src.ssa->uses) == 1) &&
				((opc_cat(src[0]->opc) == 2) || (opc_cat(src[0]->opc) == 3))) {
			src[0]->flags |= IR3_INSTR_SAT;
			dst[0] = ir3_MOV(b, src[0], TYPE_U32);
		} else {
			/* otherwise generate a max.f that saturates.. blob does
			 * similar (generating a cat2 mov using max.f)
			 */
			dst[0] = ir3_MAX_F(b, src[0], 0, src[0], 0);
			dst[0]->flags |= IR3_INSTR_SAT;
		}
		break;
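
	/* Sketch of the single-use fold above (illustrative): for
	 * "x = fsat(a + b)" where the add has no other use,
	 *
	 *    add.f r0, a, b  ;  max.f(sat) r1, r0, r0
	 *
	 * collapses to
	 *
	 *    (sat)add.f r0, a, b  ;  mov r1, r0
	 *
	 * and cp can usually eliminate the mov as well.
	 */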
	case nir_op_fmul:
		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
		break;
	case nir_op_fadd:
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
		break;
	case nir_op_fsub:
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
		break;
	case nir_op_ffma:
		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
		break;
	case nir_op_fddx:
		dst[0] = ir3_DSX(b, src[0], 0);
		dst[0]->cat5.type = TYPE_F32;
		break;
	case nir_op_fddy:
		dst[0] = ir3_DSY(b, src[0], 0);
		dst[0]->cat5.type = TYPE_F32;
		break;
	case nir_op_flt32:
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_LT;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_fge32:
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_GE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_feq32:
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_EQ;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_fne32:
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_NE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_fceil:
		dst[0] = ir3_CEIL_F(b, src[0], 0);
		break;
	case nir_op_ffloor:
		dst[0] = ir3_FLOOR_F(b, src[0], 0);
		break;
	case nir_op_ftrunc:
		dst[0] = ir3_TRUNC_F(b, src[0], 0);
		break;
	case nir_op_fround_even:
		dst[0] = ir3_RNDNE_F(b, src[0], 0);
		break;
	case nir_op_fsign:
		dst[0] = ir3_SIGN_F(b, src[0], 0);
		break;

	case nir_op_fsin:
		dst[0] = ir3_SIN(b, src[0], 0);
		break;
	case nir_op_fcos:
		dst[0] = ir3_COS(b, src[0], 0);
		break;
	case nir_op_frsq:
		dst[0] = ir3_RSQ(b, src[0], 0);
		break;
	case nir_op_frcp:
		dst[0] = ir3_RCP(b, src[0], 0);
		break;
	case nir_op_flog2:
		dst[0] = ir3_LOG2(b, src[0], 0);
		break;
	case nir_op_fexp2:
		dst[0] = ir3_EXP2(b, src[0], 0);
		break;
	case nir_op_fsqrt:
		dst[0] = ir3_SQRT(b, src[0], 0);
		break;

	case nir_op_iabs:
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
		break;
	case nir_op_iadd:
		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
		break;
	case nir_op_iand:
		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_imax:
		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
		break;
	case nir_op_umax:
		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
		break;
	case nir_op_imin:
		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
		break;
	case nir_op_umin:
		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
		break;
	case nir_op_imul:
		/*
		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
		 */
		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
				ir3_MADSH_M16(b, src[0], 0, src[1], 0,
					ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
		break;
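
	/* Quick sanity check of the identity above (illustrative only):
	 * with a = 0x00010002 (ah=1, al=2) and b = 0x00030004 (bh=3, bl=4):
	 *
	 *    al*bl         = 0x00000008
	 *    (ah*bl) << 16 = 0x00040000
	 *    (al*bh) << 16 = 0x00060000
	 *    sum           = 0x000a0008
	 *
	 * which matches the full product 0x3000a0008 truncated to 32b.
	 */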
	case nir_op_ineg:
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
		break;
	case nir_op_inot:
		dst[0] = ir3_NOT_B(b, src[0], 0);
		break;
	case nir_op_ior:
		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_ishl:
		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_ishr:
		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_isub:
		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
		break;
	case nir_op_ixor:
		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_ushr:
		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
		break;
	case nir_op_ilt32:
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_LT;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_ige32:
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_GE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_ieq32:
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_EQ;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_ine32:
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_NE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_ult32:
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_LT;
		dst[0] = ir3_n2b(b, dst[0]);
		break;
	case nir_op_uge32:
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
		dst[0]->cat2.condition = IR3_COND_GE;
		dst[0] = ir3_n2b(b, dst[0]);
		break;

	case nir_op_b32csel: {
		struct ir3_instruction *cond = ir3_b2n(b, src[0]);
		compile_assert(ctx, bs[1] == bs[2]);
		/* the boolean condition is 32b even if src[1] and src[2] are
		 * half-precision, but sel.b16 wants all three src's to be the
		 * same type.
		 */
		if (bs[1] < 32)
			cond = ir3_COV(b, cond, TYPE_U32, TYPE_U16);
		dst[0] = ir3_SEL_B32(b, src[1], 0, cond, 0, src[2], 0);
		break;
	}
	case nir_op_bit_count: {
		// TODO, we need to do this 16b at a time on a5xx+a6xx.. need to
		// double check on earlier gen's.  Once half-precision support is
		// in place, this should probably move to a NIR lowering pass:
		struct ir3_instruction *hi, *lo;

		hi = ir3_COV(b, ir3_SHR_B(b, src[0], 0, create_immed(b, 16), 0),
				TYPE_U32, TYPE_U16);
		lo = ir3_COV(b, src[0], TYPE_U32, TYPE_U16);

		hi = ir3_CBITS_B(b, hi, 0);
		lo = ir3_CBITS_B(b, lo, 0);

		// TODO maybe the builders should default to making dst half-precision
		// if the src's were half precision, to make this less awkward.. otoh
		// we should probably just do this lowering in NIR.
		hi->regs[0]->flags |= IR3_REG_HALF;
		lo->regs[0]->flags |= IR3_REG_HALF;

		dst[0] = ir3_ADD_S(b, hi, 0, lo, 0);
		dst[0]->regs[0]->flags |= IR3_REG_HALF;
		dst[0] = ir3_COV(b, dst[0], TYPE_U16, TYPE_U32);
		break;
	}
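
	/* Worked example for the bit_count lowering above (illustrative):
	 * src = 0xffff0001 splits into hi = 0xffff (cbits -> 16) and
	 * lo = 0x0001 (cbits -> 1), and the half-precision add yields
	 * popcount(0xffff0001) = 17.
	 */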
	case nir_op_ifind_msb: {
		struct ir3_instruction *cmp;
		dst[0] = ir3_CLZ_S(b, src[0], 0);
		cmp = ir3_CMPS_S(b, dst[0], 0, create_immed(b, 0), 0);
		cmp->cat2.condition = IR3_COND_GE;
		dst[0] = ir3_SEL_B32(b,
				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
				cmp, 0, dst[0], 0);
		break;
	}
	case nir_op_ufind_msb:
		dst[0] = ir3_CLZ_B(b, src[0], 0);
		dst[0] = ir3_SEL_B32(b,
				ir3_SUB_U(b, create_immed(b, 31), 0, dst[0], 0), 0,
				src[0], 0, dst[0], 0);
		break;
	case nir_op_find_lsb:
		dst[0] = ir3_BFREV_B(b, src[0], 0);
		dst[0] = ir3_CLZ_B(b, dst[0], 0);
		break;
	case nir_op_bitfield_reverse:
		dst[0] = ir3_BFREV_B(b, src[0], 0);
		break;

	default:
		ir3_context_error(ctx, "Unhandled ALU op: %s\n",
				nir_op_infos[alu->op].name);
		break;
	}

	ir3_put_dst(ctx, &alu->dest.dest);
}

/* handles direct/indirect UBO reads: */
static void
emit_intrinsic_load_ubo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *base_lo, *base_hi, *addr, *src0, *src1;
	/* UBO addresses are the first driver params, but subtract 2 here to
	 * account for nir_lower_uniforms_to_ubo rebasing the UBOs such that UBO 0
	 * is the uniforms: */
	unsigned ubo = regid(ctx->so->constbase.ubo, 0) - 2;
	const unsigned ptrsz = ir3_pointer_size(ctx);

	int off = 0;

	/* First src is ubo index, which could either be an immed or not: */
	src0 = ir3_get_src(ctx, &intr->src[0])[0];
	if (is_same_type_mov(src0) &&
			(src0->regs[1]->flags & IR3_REG_IMMED)) {
		base_lo = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz));
		base_hi = create_uniform(b, ubo + (src0->regs[1]->iim_val * ptrsz) + 1);
	} else {
		base_lo = create_uniform_indirect(b, ubo, ir3_get_addr(ctx, src0, ptrsz));
		base_hi = create_uniform_indirect(b, ubo + 1, ir3_get_addr(ctx, src0, ptrsz));
	}

	/* note: on 32bit GPUs base_hi is ignored and DCE'd */
	addr = base_lo;

	if (nir_src_is_const(intr->src[1])) {
		off += nir_src_as_uint(intr->src[1]);
	} else {
		/* For load_ubo_indirect, second src is indirect offset: */
		src1 = ir3_get_src(ctx, &intr->src[1])[0];

		/* and add offset to addr: */
		addr = ir3_ADD_S(b, addr, 0, src1, 0);
	}

	/* if offset is too large to encode in the ldg, split it out: */
	if ((off + (intr->num_components * 4)) > 1024) {
		/* split out the minimal amount to improve the odds that
		 * cp can fit the immediate in the add.s instruction:
		 */
		unsigned off2 = off + (intr->num_components * 4) - 1024;
		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
		off -= off2;
	}

	if (ptrsz == 2) {
		struct ir3_instruction *carry;

		/* handle 32b rollover, ie:
		 *   if (addr < base_lo)
		 *      base_hi++
		 */
		carry = ir3_CMPS_U(b, addr, 0, base_lo, 0);
		carry->cat2.condition = IR3_COND_LT;
		base_hi = ir3_ADD_S(b, base_hi, 0, carry, 0);

		addr = ir3_create_collect(ctx, (struct ir3_instruction*[]){ addr, base_hi }, 2);
	}

	for (int i = 0; i < intr->num_components; i++) {
		struct ir3_instruction *load =
				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
		load->cat6.type = TYPE_U32;
		load->cat6.src_offset = off + i * 4;    /* byte offset */
		dst[i] = load;
	}
}
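
/* Worked example of the offset split above (illustrative): a vec4 load
 * with off = 1040 gives 1040 + 16 > 1024, so off2 = 32 gets folded into
 * the address (add.s addr, addr, 32) leaving off = 1008; the four
 * per-component byte offsets 1008..1020 then all fit in the ldg
 * encoding.
 */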

/* src[] = { block_index } */
static void
emit_intrinsic_ssbo_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	/* SSBO size stored as a const starting at ssbo_sizes: */
	unsigned blk_idx = nir_src_as_uint(intr->src[0]);
	unsigned idx = regid(ctx->so->constbase.ssbo_sizes, 0) +
		ctx->so->const_layout.ssbo_size.off[blk_idx];

	debug_assert(ctx->so->const_layout.ssbo_size.mask & (1 << blk_idx));

	dst[0] = create_uniform(ctx->block, idx);
}

/* src[] = { offset }. const_index[] = { base } */
static void
emit_intrinsic_load_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *ldl, *offset;
	unsigned base;

	offset = ir3_get_src(ctx, &intr->src[0])[0];
	base   = nir_intrinsic_base(intr);

	ldl = ir3_LDL(b, offset, 0, create_immed(b, intr->num_components), 0);
	ldl->cat6.src_offset = base;
	ldl->cat6.type = utype_dst(intr->dest);
	ldl->regs[0]->wrmask = MASK(intr->num_components);

	ldl->barrier_class = IR3_BARRIER_SHARED_R;
	ldl->barrier_conflict = IR3_BARRIER_SHARED_W;

	ir3_split_dest(b, dst, ldl, 0, intr->num_components);
}

/* src[] = { value, offset }. const_index[] = { base, write_mask } */
static void
emit_intrinsic_store_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *stl, *offset;
	struct ir3_instruction * const *value;
	unsigned base, wrmask;

	value  = ir3_get_src(ctx, &intr->src[0]);
	offset = ir3_get_src(ctx, &intr->src[1])[0];

	base   = nir_intrinsic_base(intr);
	wrmask = nir_intrinsic_write_mask(intr);

	/* Combine groups of consecutive enabled channels in one write
	 * message. We use ffs to find the first enabled channel and then ffs on
	 * the bit-inverse, down-shifted writemask to determine the length of
	 * the block of enabled bits.
	 *
	 * (trick stolen from i965's fs_visitor::nir_emit_cs_intrinsic())
	 */
	while (wrmask) {
		unsigned first_component = ffs(wrmask) - 1;
		unsigned length = ffs(~(wrmask >> first_component)) - 1;

		stl = ir3_STL(b, offset, 0,
			ir3_create_collect(ctx, &value[first_component], length), 0,
			create_immed(b, length), 0);
		stl->cat6.dst_offset = first_component + base;
		stl->cat6.type = utype_src(intr->src[0]);
		stl->barrier_class = IR3_BARRIER_SHARED_W;
		stl->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

		array_insert(b, b->keeps, stl);

		/* Clear the bits in the writemask that we just wrote, then try
		 * again to see if more channels are left.
		 */
		wrmask &= (15 << (first_component + length));
	}
}
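
/* Worked example of the writemask walk above (illustrative): for
 * wrmask = 0b1011, the first pass finds first_component = 0 and
 * length = 2 (one stl covering comps 0-1), the mask update leaves
 * 0b1000, and a second pass emits a single-component stl for comp 3.
 */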

/*
 * CS shared variable atomic intrinsics
 *
 * All of the shared variable atomic memory operations read a value from
 * memory, compute a new value using one of the operations below, write the
 * new value to memory, and return the original value read.
 *
 * All operations take 2 sources except CompSwap that takes 3. These
 * sources represent:
 *
 * 0: The offset into the shared variable storage region that the atomic
 *    operation will operate on.
 * 1: The data parameter to the atomic function (i.e. the value to add
 *    in shared_atomic_add, etc).
 * 2: For CompSwap only: the second data parameter.
 */
static struct ir3_instruction *
emit_intrinsic_atomic_shared(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *atomic, *src0, *src1;
	type_t type = TYPE_U32;

	src0 = ir3_get_src(ctx, &intr->src[0])[0]; /* offset */
	src1 = ir3_get_src(ctx, &intr->src[1])[0]; /* value */

	switch (intr->intrinsic) {
	case nir_intrinsic_shared_atomic_add:
		atomic = ir3_ATOMIC_ADD(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_imin:
		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
		type = TYPE_S32;
		break;
	case nir_intrinsic_shared_atomic_umin:
		atomic = ir3_ATOMIC_MIN(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_imax:
		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
		type = TYPE_S32;
		break;
	case nir_intrinsic_shared_atomic_umax:
		atomic = ir3_ATOMIC_MAX(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_and:
		atomic = ir3_ATOMIC_AND(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_or:
		atomic = ir3_ATOMIC_OR(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_xor:
		atomic = ir3_ATOMIC_XOR(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_exchange:
		atomic = ir3_ATOMIC_XCHG(b, src0, 0, src1, 0);
		break;
	case nir_intrinsic_shared_atomic_comp_swap:
		/* for cmpxchg, src1 is [ui]vec2(data, compare): */
		src1 = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			ir3_get_src(ctx, &intr->src[2])[0],
			src1,
		}, 2);
		atomic = ir3_ATOMIC_CMPXCHG(b, src0, 0, src1, 0);
		break;
	default:
		unreachable("boo");
	}

	atomic->cat6.iim_val = 1;
	atomic->cat6.d = 1;
	atomic->cat6.type = type;
	atomic->barrier_class = IR3_BARRIER_SHARED_W;
	atomic->barrier_conflict = IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W;

	/* even if nothing consumes the result, we can't DCE the instruction: */
	array_insert(b, b->keeps, atomic);

	return atomic;
}

/* TODO handle actual indirect/dynamic case.. which is going to be weird
 * to handle with the image_mapping table..
 */
static struct ir3_instruction *
get_image_samp_tex_src(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	unsigned slot = ir3_get_image_slot(nir_src_as_deref(intr->src[0]));
	unsigned tex_idx = ir3_image_to_tex(&ctx->so->image_mapping, slot);
	struct ir3_instruction *texture, *sampler;

	texture = create_immed_typed(ctx->block, tex_idx, TYPE_U16);
	sampler = create_immed_typed(ctx->block, tex_idx, TYPE_U16);

	return ir3_create_collect(ctx, (struct ir3_instruction*[]){
		sampler,
		texture,
	}, 2);
}

/* src[] = { deref, coord, sample_index }. const_index[] = {} */
static void
emit_intrinsic_load_image(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
	struct ir3_instruction *samp_tex = get_image_samp_tex_src(ctx, intr);
	struct ir3_instruction *sam;
	struct ir3_instruction * const *src0 = ir3_get_src(ctx, &intr->src[1]);
	struct ir3_instruction *coords[4];
	unsigned flags, ncoords = ir3_get_image_coords(var, &flags);
	type_t type = ir3_get_image_type(var);

	/* hmm, this seems a bit odd, but it is what blob does and (at least
	 * a5xx) just faults on bogus addresses otherwise:
	 */
	if (flags & IR3_INSTR_3D) {
		flags &= ~IR3_INSTR_3D;
		flags |= IR3_INSTR_A;
	}

	for (unsigned i = 0; i < ncoords; i++)
		coords[i] = src0[i];

	if (ncoords == 1)
		coords[ncoords++] = create_immed(b, 0);

	sam = ir3_SAM(b, OPC_ISAM, type, 0b1111, flags,
			samp_tex, ir3_create_collect(ctx, coords, ncoords), NULL);

	sam->barrier_class = IR3_BARRIER_IMAGE_R;
	sam->barrier_conflict = IR3_BARRIER_IMAGE_W;

	ir3_split_dest(b, dst, sam, 0, 4);
}

static void
emit_intrinsic_image_size(struct ir3_context *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	const nir_variable *var = nir_intrinsic_get_var(intr, 0);
	struct ir3_instruction *samp_tex = get_image_samp_tex_src(ctx, intr);
	struct ir3_instruction *sam, *lod;
	unsigned flags, ncoords = ir3_get_image_coords(var, &flags);

	lod = create_immed(b, 0);
	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
			samp_tex, lod, NULL);

	/* Array size actually ends up in .w rather than .z. This doesn't
	 * matter for miplevel 0, but for higher mips the value in z is
	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
	 * returned, which means that we have to add 1 to it for arrays for
	 * a3xx.
	 *
	 * Note: use a temporary dst and then copy, since the size of the dst
	 * array that is passed in is based on nir's understanding of the
	 * result size, not the hardware's.
	 */
	struct ir3_instruction *tmp[4];

	ir3_split_dest(b, tmp, sam, 0, 4);

	/* get_size instruction returns size in bytes instead of texels
	 * for imageBuffer, so we need to divide it by the pixel size
	 * of the image format.
	 *
	 * TODO: This is at least true on a5xx. Check other gens.
	 */
	enum glsl_sampler_dim dim =
		glsl_get_sampler_dim(glsl_without_array(var->type));
	if (dim == GLSL_SAMPLER_DIM_BUF) {
		/* Since all the possible values the divisor can take are
		 * power-of-two (4, 8, or 16), the division is implemented
		 * as a shift-right.
		 * During shader setup, the log2 of the image format's
		 * bytes-per-pixel should have been emitted in 2nd slot of
		 * image_dims. See ir3_shader::emit_image_dims().
		 */
		unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
			ctx->so->const_layout.image_dims.off[var->data.driver_location];
		struct ir3_instruction *aux = create_uniform(b, cb + 1);

		tmp[0] = ir3_SHR_B(b, tmp[0], 0, aux, 0);
	}

	for (unsigned i = 0; i < ncoords; i++)
		dst[i] = tmp[i];

	if (flags & IR3_INSTR_A) {
		if (ctx->compiler->levels_add_one) {
			dst[ncoords-1] = ir3_ADD_U(b, tmp[3], 0, create_immed(b, 1), 0);
		} else {
			dst[ncoords-1] = ir3_MOV(b, tmp[3], TYPE_U32);
		}
	}
}
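
/* Worked example of the imageBuffer divide above (illustrative): for
 * an rgba32f buffer image (16 bytes per texel) the const contains
 * log2(16) = 4, so a getsize result of 4096 bytes becomes
 * 4096 >> 4 = 256 texels.
 */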

static void
emit_intrinsic_barrier(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *barrier;

	switch (intr->intrinsic) {
	case nir_intrinsic_barrier:
		barrier = ir3_BAR(b);
		barrier->cat7.g = true;
		barrier->cat7.l = true;
		barrier->flags = IR3_INSTR_SS | IR3_INSTR_SY;
		barrier->barrier_class = IR3_BARRIER_EVERYTHING;
		break;
	case nir_intrinsic_memory_barrier:
		barrier = ir3_FENCE(b);
		barrier->cat7.g = true;
		barrier->cat7.r = true;
		barrier->cat7.w = true;
		barrier->barrier_class = IR3_BARRIER_IMAGE_W |
			IR3_BARRIER_BUFFER_W;
		barrier->barrier_conflict =
			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
		break;
	case nir_intrinsic_memory_barrier_atomic_counter:
	case nir_intrinsic_memory_barrier_buffer:
		barrier = ir3_FENCE(b);
		barrier->cat7.g = true;
		barrier->cat7.r = true;
		barrier->cat7.w = true;
		barrier->barrier_class = IR3_BARRIER_BUFFER_W;
		barrier->barrier_conflict = IR3_BARRIER_BUFFER_R |
			IR3_BARRIER_BUFFER_W;
		break;
	case nir_intrinsic_memory_barrier_image:
		// TODO double check if this should have .g set
		barrier = ir3_FENCE(b);
		barrier->cat7.g = true;
		barrier->cat7.r = true;
		barrier->cat7.w = true;
		barrier->barrier_class = IR3_BARRIER_IMAGE_W;
		barrier->barrier_conflict = IR3_BARRIER_IMAGE_R |
			IR3_BARRIER_IMAGE_W;
		break;
	case nir_intrinsic_memory_barrier_shared:
		barrier = ir3_FENCE(b);
		barrier->cat7.g = true;
		barrier->cat7.l = true;
		barrier->cat7.r = true;
		barrier->cat7.w = true;
		barrier->barrier_class = IR3_BARRIER_SHARED_W;
		barrier->barrier_conflict = IR3_BARRIER_SHARED_R |
			IR3_BARRIER_SHARED_W;
		break;
	case nir_intrinsic_group_memory_barrier:
		barrier = ir3_FENCE(b);
		barrier->cat7.g = true;
		barrier->cat7.l = true;
		barrier->cat7.r = true;
		barrier->cat7.w = true;
		barrier->barrier_class = IR3_BARRIER_SHARED_W |
			IR3_BARRIER_IMAGE_W |
			IR3_BARRIER_BUFFER_W;
		barrier->barrier_conflict =
			IR3_BARRIER_SHARED_R | IR3_BARRIER_SHARED_W |
			IR3_BARRIER_IMAGE_R | IR3_BARRIER_IMAGE_W |
			IR3_BARRIER_BUFFER_R | IR3_BARRIER_BUFFER_W;
		break;
	default:
		unreachable("boo");
	}

	/* make sure barrier doesn't get DCE'd */
	array_insert(b, b->keeps, barrier);
}

static void add_sysval_input_compmask(struct ir3_context *ctx,
		gl_system_value slot, unsigned compmask,
		struct ir3_instruction *instr)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned r = regid(so->inputs_count, 0);
	unsigned n = so->inputs_count++;

	so->inputs[n].sysval = true;
	so->inputs[n].slot = slot;
	so->inputs[n].compmask = compmask;
	so->inputs[n].regid = r;
	so->inputs[n].interpolate = INTERP_MODE_FLAT;
	so->total_in++;

	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
	ctx->ir->inputs[r] = instr;
}

static void add_sysval_input(struct ir3_context *ctx, gl_system_value slot,
		struct ir3_instruction *instr)
{
	add_sysval_input_compmask(ctx, slot, 0x1, instr);
}

static struct ir3_instruction *
get_barycentric_centroid(struct ir3_context *ctx)
{
	if (!ctx->ij_centroid) {
		struct ir3_instruction *xy[2];
		struct ir3_instruction *ij;

		ij = create_input_compmask(ctx, 0, 0x3);
		ir3_split_dest(ctx->block, xy, ij, 0, 2);

		ctx->ij_centroid = ir3_create_collect(ctx, xy, 2);

		add_sysval_input_compmask(ctx,
				SYSTEM_VALUE_BARYCENTRIC_CENTROID,
				0x3, ij);
	}

	return ctx->ij_centroid;
}

static struct ir3_instruction *
get_barycentric_sample(struct ir3_context *ctx)
{
	if (!ctx->ij_sample) {
		struct ir3_instruction *xy[2];
		struct ir3_instruction *ij;

		ij = create_input_compmask(ctx, 0, 0x3);
		ir3_split_dest(ctx->block, xy, ij, 0, 2);

		ctx->ij_sample = ir3_create_collect(ctx, xy, 2);

		add_sysval_input_compmask(ctx,
				SYSTEM_VALUE_BARYCENTRIC_SAMPLE,
				0x3, ij);
	}

	return ctx->ij_sample;
}

static struct ir3_instruction *
get_barycentric_pixel(struct ir3_context *ctx)
{
	/* TODO when tgsi_to_nir supports "new-style" FS inputs switch
	 * this to create ij_pixel only on demand:
	 */
	return ctx->ij_pixel;
}

static struct ir3_instruction *
get_frag_coord(struct ir3_context *ctx)
{
	if (!ctx->frag_coord) {
		struct ir3_block *b = ctx->block;
		struct ir3_instruction *xyzw[4];
		struct ir3_instruction *hw_frag_coord;

		hw_frag_coord = create_input_compmask(ctx, 0, 0xf);
		ir3_split_dest(ctx->block, xyzw, hw_frag_coord, 0, 4);

		/* for frag_coord.xy, we get unsigned values.. we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 *
		 *    sub.s tmp, src, 8
		 *    shr.b tmp, tmp, 4
		 *    mov.u32f32 dst, tmp
		 *
		 */
		for (int i = 0; i < 2; i++) {
			xyzw[i] = ir3_SUB_S(b, xyzw[i], 0,
					create_immed(b, 8), 0);
			xyzw[i] = ir3_SHR_B(b, xyzw[i], 0,
					create_immed(b, 4), 0);
			xyzw[i] = ir3_COV(b, xyzw[i], TYPE_U32, TYPE_F32);
		}

		ctx->frag_coord = ir3_create_collect(ctx, xyzw, 4);

		add_sysval_input_compmask(ctx,
				SYSTEM_VALUE_FRAG_COORD,
				0xf, hw_frag_coord);

		ctx->so->frag_coord = true;
	}

	return ctx->frag_coord;
}
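
/* Worked example for the fixed-point conversion above (illustrative,
 * assuming the hw delivers x/y in unsigned 12.4 fixed point with a
 * half-pixel offset): a fragment at window x = 10 arrives as
 * 10*16 + 8 = 168, and (168 - 8) >> 4 = 10 recovers the integer pixel
 * coordinate before the conversion to float.
 */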

static void
emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
{
	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
	struct ir3_instruction **dst;
	struct ir3_instruction * const *src;
	struct ir3_block *b = ctx->block;
	int idx, comp;

	if (info->has_dest) {
		unsigned n = nir_intrinsic_dest_components(intr);
		dst = ir3_get_dst(ctx, &intr->dest, n);
	} else {
		dst = NULL;
	}

	switch (intr->intrinsic) {
	case nir_intrinsic_load_uniform:
		idx = nir_intrinsic_base(intr);
		if (nir_src_is_const(intr->src[0])) {
			idx += nir_src_as_uint(intr->src[0]);
			for (int i = 0; i < intr->num_components; i++) {
				dst[i] = create_uniform(b, idx + i);
			}
		} else {
			src = ir3_get_src(ctx, &intr->src[0]);
			for (int i = 0; i < intr->num_components; i++) {
				dst[i] = create_uniform_indirect(b, idx + i,
						ir3_get_addr(ctx, src[0], 1));
			}
			/* NOTE: if relative addressing is used, we set
			 * constlen in the compiler (to worst-case value)
			 * since we don't know in the assembler what the max
			 * addr reg value can be:
			 */
			ctx->so->constlen = ctx->s->num_uniforms;
		}
		break;
	case nir_intrinsic_load_ubo:
		emit_intrinsic_load_ubo(ctx, intr, dst);
		break;
	case nir_intrinsic_load_frag_coord:
		ir3_split_dest(b, dst, get_frag_coord(ctx), 0, 4);
		break;
	case nir_intrinsic_load_sample_pos_from_id: {
		/* NOTE: blob seems to always use TYPE_F16 and then cov.f16f32,
		 * but that doesn't seem necessary.
		 */
		struct ir3_instruction *offset =
			ir3_RGETPOS(b, ir3_get_src(ctx, &intr->src[0])[0], 0);
		offset->regs[0]->wrmask = 0x3;
		offset->cat5.type = TYPE_F32;

		ir3_split_dest(b, dst, offset, 0, 2);

		break;
	}
	case nir_intrinsic_load_size_ir3:
		if (!ctx->ij_size) {
			ctx->ij_size = create_input(ctx, 0);

			add_sysval_input(ctx, SYSTEM_VALUE_BARYCENTRIC_SIZE,
					ctx->ij_size);
		}
		dst[0] = ctx->ij_size;
		break;
	case nir_intrinsic_load_barycentric_centroid:
		ir3_split_dest(b, dst, get_barycentric_centroid(ctx), 0, 2);
		break;
	case nir_intrinsic_load_barycentric_sample:
		if (ctx->so->key.msaa) {
			ir3_split_dest(b, dst, get_barycentric_sample(ctx), 0, 2);
		} else {
			ir3_split_dest(b, dst, get_barycentric_pixel(ctx), 0, 2);
		}
		break;
	case nir_intrinsic_load_barycentric_pixel:
		ir3_split_dest(b, dst, get_barycentric_pixel(ctx), 0, 2);
		break;
	case nir_intrinsic_load_interpolated_input:
		idx = nir_intrinsic_base(intr);
		comp = nir_intrinsic_component(intr);
		src = ir3_get_src(ctx, &intr->src[0]);
		if (nir_src_is_const(intr->src[1])) {
			struct ir3_instruction *coord = ir3_create_collect(ctx, src, 2);
			idx += nir_src_as_uint(intr->src[1]);
			for (int i = 0; i < intr->num_components; i++) {
				unsigned inloc = idx * 4 + i + comp;
				if (ctx->so->inputs[idx].bary &&
						!ctx->so->inputs[idx].use_ldlv) {
					dst[i] = ir3_BARY_F(b, create_immed(b, inloc), 0, coord, 0);
				} else {
					/* for non-varyings use the pre-setup input, since
					 * that is easier than mapping things back to a
					 * nir_variable to figure out what it is.
					 */
					dst[i] = ctx->ir->inputs[inloc];
				}
			}
		} else {
			ir3_context_error(ctx, "unhandled");
		}
		break;
	case nir_intrinsic_load_input:
		idx = nir_intrinsic_base(intr);
		comp = nir_intrinsic_component(intr);
		if (nir_src_is_const(intr->src[0])) {
			idx += nir_src_as_uint(intr->src[0]);
			for (int i = 0; i < intr->num_components; i++) {
				unsigned n = idx * 4 + i + comp;
				dst[i] = ctx->ir->inputs[n];
				compile_assert(ctx, ctx->ir->inputs[n]);
			}
		} else {
			src = ir3_get_src(ctx, &intr->src[0]);
			struct ir3_instruction *collect =
					ir3_create_collect(ctx, ctx->ir->inputs, ctx->ir->ninputs);
			struct ir3_instruction *addr = ir3_get_addr(ctx, src[0], 4);
			for (int i = 0; i < intr->num_components; i++) {
				unsigned n = idx * 4 + i + comp;
				dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
						n, addr, collect);
			}
		}
		break;
	/* All SSBO intrinsics should have been lowered by 'lower_io_offsets'
	 * pass and replaced by an ir3-specific version that adds the
	 * dword-offset in the last source.
	 */
	case nir_intrinsic_load_ssbo_ir3:
		ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
		break;
	case nir_intrinsic_store_ssbo_ir3:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
		break;
	case nir_intrinsic_get_buffer_size:
		emit_intrinsic_ssbo_size(ctx, intr, dst);
		break;
	case nir_intrinsic_ssbo_atomic_add_ir3:
	case nir_intrinsic_ssbo_atomic_imin_ir3:
	case nir_intrinsic_ssbo_atomic_umin_ir3:
	case nir_intrinsic_ssbo_atomic_imax_ir3:
	case nir_intrinsic_ssbo_atomic_umax_ir3:
	case nir_intrinsic_ssbo_atomic_and_ir3:
	case nir_intrinsic_ssbo_atomic_or_ir3:
	case nir_intrinsic_ssbo_atomic_xor_ir3:
	case nir_intrinsic_ssbo_atomic_exchange_ir3:
	case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
		break;
	case nir_intrinsic_load_shared:
		emit_intrinsic_load_shared(ctx, intr, dst);
		break;
	case nir_intrinsic_store_shared:
		emit_intrinsic_store_shared(ctx, intr);
		break;
	/* All SSBO intrinsics should have been lowered by the 'lower_io_offsets'
	 * pass and replaced by an ir3-specific version that adds the
	 * dword-offset in the last source.
	 */
	case nir_intrinsic_load_ssbo_ir3:
		ctx->funcs->emit_intrinsic_load_ssbo(ctx, intr, dst);
		break;
	case nir_intrinsic_store_ssbo_ir3:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr);
		break;
	case nir_intrinsic_get_buffer_size:
		emit_intrinsic_ssbo_size(ctx, intr, dst);
		break;
	case nir_intrinsic_ssbo_atomic_add_ir3:
	case nir_intrinsic_ssbo_atomic_imin_ir3:
	case nir_intrinsic_ssbo_atomic_umin_ir3:
	case nir_intrinsic_ssbo_atomic_imax_ir3:
	case nir_intrinsic_ssbo_atomic_umax_ir3:
	case nir_intrinsic_ssbo_atomic_and_ir3:
	case nir_intrinsic_ssbo_atomic_or_ir3:
	case nir_intrinsic_ssbo_atomic_xor_ir3:
	case nir_intrinsic_ssbo_atomic_exchange_ir3:
	case nir_intrinsic_ssbo_atomic_comp_swap_ir3:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		dst[0] = ctx->funcs->emit_intrinsic_atomic_ssbo(ctx, intr);
		break;
	case nir_intrinsic_load_shared:
		emit_intrinsic_load_shared(ctx, intr, dst);
		break;
	case nir_intrinsic_store_shared:
		emit_intrinsic_store_shared(ctx, intr);
		break;
	case nir_intrinsic_shared_atomic_add:
	case nir_intrinsic_shared_atomic_imin:
	case nir_intrinsic_shared_atomic_umin:
	case nir_intrinsic_shared_atomic_imax:
	case nir_intrinsic_shared_atomic_umax:
	case nir_intrinsic_shared_atomic_and:
	case nir_intrinsic_shared_atomic_or:
	case nir_intrinsic_shared_atomic_xor:
	case nir_intrinsic_shared_atomic_exchange:
	case nir_intrinsic_shared_atomic_comp_swap:
		dst[0] = emit_intrinsic_atomic_shared(ctx, intr);
		break;
	case nir_intrinsic_image_deref_load:
		emit_intrinsic_load_image(ctx, intr, dst);
		break;
	case nir_intrinsic_image_deref_store:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		ctx->funcs->emit_intrinsic_store_image(ctx, intr);
		break;
	case nir_intrinsic_image_deref_size:
		emit_intrinsic_image_size(ctx, intr, dst);
		break;
	case nir_intrinsic_image_deref_atomic_add:
	case nir_intrinsic_image_deref_atomic_min:
	case nir_intrinsic_image_deref_atomic_max:
	case nir_intrinsic_image_deref_atomic_and:
	case nir_intrinsic_image_deref_atomic_or:
	case nir_intrinsic_image_deref_atomic_xor:
	case nir_intrinsic_image_deref_atomic_exchange:
	case nir_intrinsic_image_deref_atomic_comp_swap:
		if ((ctx->so->type == MESA_SHADER_FRAGMENT) &&
				!ctx->s->info.fs.early_fragment_tests)
			ctx->so->no_earlyz = true;
		dst[0] = ctx->funcs->emit_intrinsic_atomic_image(ctx, intr);
		break;
	case nir_intrinsic_barrier:
	case nir_intrinsic_memory_barrier:
	case nir_intrinsic_group_memory_barrier:
	case nir_intrinsic_memory_barrier_atomic_counter:
	case nir_intrinsic_memory_barrier_buffer:
	case nir_intrinsic_memory_barrier_image:
	case nir_intrinsic_memory_barrier_shared:
		emit_intrinsic_barrier(ctx, intr);
		/* note that blk ptr no longer valid, make that obvious: */
		b = NULL;
		break;
	case nir_intrinsic_store_output:
		idx = nir_intrinsic_base(intr);
		comp = nir_intrinsic_component(intr);
		compile_assert(ctx, nir_src_is_const(intr->src[1]));
		idx += nir_src_as_uint(intr->src[1]);

		src = ir3_get_src(ctx, &intr->src[0]);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i + comp;
			ctx->ir->outputs[n] = src[i];
		}
		break;
	case nir_intrinsic_load_base_vertex:
	case nir_intrinsic_load_first_vertex:
		if (!ctx->basevertex) {
			ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
			add_sysval_input(ctx, SYSTEM_VALUE_FIRST_VERTEX, ctx->basevertex);
		}
		dst[0] = ctx->basevertex;
		break;
	case nir_intrinsic_load_vertex_id_zero_base:
	case nir_intrinsic_load_vertex_id:
		if (!ctx->vertex_id) {
			gl_system_value sv = (intr->intrinsic == nir_intrinsic_load_vertex_id) ?
				SYSTEM_VALUE_VERTEX_ID : SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
			ctx->vertex_id = create_input(ctx, 0);
			add_sysval_input(ctx, sv, ctx->vertex_id);
		}
		dst[0] = ctx->vertex_id;
		break;
	case nir_intrinsic_load_instance_id:
		if (!ctx->instance_id) {
			ctx->instance_id = create_input(ctx, 0);
			add_sysval_input(ctx, SYSTEM_VALUE_INSTANCE_ID,
					ctx->instance_id);
		}
		dst[0] = ctx->instance_id;
		break;
	case nir_intrinsic_load_sample_id:
		ctx->so->per_samp = true;
		/* fall-thru */
	case nir_intrinsic_load_sample_id_no_per_sample:
		if (!ctx->samp_id) {
			ctx->samp_id = create_input(ctx, 0);
			ctx->samp_id->regs[0]->flags |= IR3_REG_HALF;
			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_ID,
					ctx->samp_id);
		}
		dst[0] = ir3_COV(b, ctx->samp_id, TYPE_U16, TYPE_U32);
		break;
	case nir_intrinsic_load_sample_mask_in:
		if (!ctx->samp_mask_in) {
			ctx->samp_mask_in = create_input(ctx, 0);
			add_sysval_input(ctx, SYSTEM_VALUE_SAMPLE_MASK_IN,
					ctx->samp_mask_in);
		}
		dst[0] = ctx->samp_mask_in;
		break;
	case nir_intrinsic_load_user_clip_plane:
		idx = nir_intrinsic_ucp_id(intr);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i;
			dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
		}
		break;
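	/* The clip-plane driver-params are consecutive scalars starting at
	 * IR3_DP_UCP0_X, so component i of plane idx is scalar
	 * IR3_DP_UCP0_X + idx*4 + i; e.g. the .z of plane 1 is at offset 6.
	 */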
	case nir_intrinsic_load_front_face:
		if (!ctx->frag_face) {
			ctx->so->frag_face = true;
			ctx->frag_face = create_input(ctx, 0);
			add_sysval_input(ctx, SYSTEM_VALUE_FRONT_FACE, ctx->frag_face);
			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
		}
		/* for fragface, we get -1 for back and 0 for front. However this is
		 * the inverse of what nir expects (where ~0 is true).
		 */
		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
		dst[0] = ir3_NOT_B(b, dst[0], 0);
		break;
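	/* e.g. a front-facing fragment reads 0 from the hw, which the
	 * cov.s16s32 + not.b pair above turns into ~0 (NIR true); a
	 * back-facing one reads -1, which becomes ~(-1) == 0 (false).
	 */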
	case nir_intrinsic_load_local_invocation_id:
		if (!ctx->local_invocation_id) {
			ctx->local_invocation_id = create_input_compmask(ctx, 0, 0x7);
			add_sysval_input_compmask(ctx, SYSTEM_VALUE_LOCAL_INVOCATION_ID,
					0x7, ctx->local_invocation_id);
		}
		ir3_split_dest(b, dst, ctx->local_invocation_id, 0, 3);
		break;
	case nir_intrinsic_load_work_group_id:
		if (!ctx->work_group_id) {
			ctx->work_group_id = create_input_compmask(ctx, 0, 0x7);
			add_sysval_input_compmask(ctx, SYSTEM_VALUE_WORK_GROUP_ID,
					0x7, ctx->work_group_id);
			ctx->work_group_id->regs[0]->flags |= IR3_REG_HIGH;
		}
		ir3_split_dest(b, dst, ctx->work_group_id, 0, 3);
		break;
	case nir_intrinsic_load_num_work_groups:
		for (int i = 0; i < intr->num_components; i++) {
			dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
		}
		break;
	case nir_intrinsic_load_local_group_size:
		for (int i = 0; i < intr->num_components; i++) {
			dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
		}
		break;
	case nir_intrinsic_discard_if:
	case nir_intrinsic_discard: {
		struct ir3_instruction *cond, *kill;

		if (intr->intrinsic == nir_intrinsic_discard_if) {
			/* conditional discard: */
			src = ir3_get_src(ctx, &intr->src[0]);
			cond = ir3_b2n(b, src[0]);
		} else {
			/* unconditional discard: */
			cond = create_immed(b, 1);
		}

		/* NOTE: only cmps.*.* can write p0.x: */
		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
		cond->cat2.condition = IR3_COND_NE;

		/* condition always goes in predicate register: */
		cond->regs[0]->num = regid(REG_P0, 0);

		kill = ir3_KILL(b, cond, 0);
		array_insert(ctx->ir, ctx->ir->predicates, kill);

		array_insert(b, b->keeps, kill);
		ctx->so->no_earlyz = true;

		break;
	}
	default:
		ir3_context_error(ctx, "Unhandled intrinsic type: %s\n",
				nir_intrinsic_infos[intr->intrinsic].name);
		break;
	}

	if (info->has_dest)
		ir3_put_dst(ctx, &intr->dest);
}

static void
emit_load_const(struct ir3_context *ctx, nir_load_const_instr *instr)
{
	struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &instr->def,
			instr->def.num_components);
	type_t type = (instr->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;

	for (int i = 0; i < instr->def.num_components; i++)
		dst[i] = create_immed_typed(ctx->block, instr->value[i].u32, type);
}

static void
emit_undef(struct ir3_context *ctx, nir_ssa_undef_instr *undef)
{
	struct ir3_instruction **dst = ir3_get_dst_ssa(ctx, &undef->def,
			undef->def.num_components);
	type_t type = (undef->def.bit_size < 32) ? TYPE_U16 : TYPE_U32;

	/* backend doesn't want undefined instructions, so just plug
	 * in 0.0..
	 */
	for (int i = 0; i < undef->def.num_components; i++)
		dst[i] = create_immed_typed(ctx->block, fui(0.0), type);
}

/*
 * texture fetch/sample instructions:
 */

static void
tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
{
	unsigned coords, flags = 0;

	/* note: would use tex->coord_components.. except txs.. also,
	 * since array index goes after shadow ref, we don't want to
	 * count it:
	 */
	switch (tex->sampler_dim) {
	case GLSL_SAMPLER_DIM_1D:
	case GLSL_SAMPLER_DIM_BUF:
		coords = 1;
		break;
	case GLSL_SAMPLER_DIM_2D:
	case GLSL_SAMPLER_DIM_RECT:
	case GLSL_SAMPLER_DIM_EXTERNAL:
	case GLSL_SAMPLER_DIM_MS:
		coords = 2;
		break;
	case GLSL_SAMPLER_DIM_3D:
	case GLSL_SAMPLER_DIM_CUBE:
		coords = 3;
		flags |= IR3_INSTR_3D;
		break;
	default:
		unreachable("bad sampler_dim");
	}

	if (tex->is_shadow && tex->op != nir_texop_lod)
		flags |= IR3_INSTR_S;

	if (tex->is_array && tex->op != nir_texop_lod)
		flags |= IR3_INSTR_A;

	*flagsp = flags;
	*coordsp = coords;
}

/* Gets the sampler/texture idx as a hvec2, which could either be dynamic
 * or immediate (in which case it will get lowered later to a non-.s2en
 * version of the tex instruction which encodes tex/samp as immediates):
 */
static struct ir3_instruction *
get_tex_samp_tex_src(struct ir3_context *ctx, nir_tex_instr *tex)
{
	int texture_idx = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset);
	int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset);
	struct ir3_instruction *texture, *sampler;

	if (texture_idx >= 0) {
		texture = ir3_get_src(ctx, &tex->src[texture_idx].src)[0];
		texture = ir3_COV(ctx->block, texture, TYPE_U32, TYPE_U16);
	} else {
		/* TODO what to do for dynamic case? I guess we only need the
		 * max index for astc srgb workaround so maybe not a problem
		 * to worry about if we don't enable indirect samplers for
		 * a4xx?
		 */
		ctx->max_texture_index = MAX2(ctx->max_texture_index, tex->texture_index);
		texture = create_immed_typed(ctx->block, tex->texture_index, TYPE_U16);
	}

	if (sampler_idx >= 0) {
		sampler = ir3_get_src(ctx, &tex->src[sampler_idx].src)[0];
		sampler = ir3_COV(ctx->block, sampler, TYPE_U32, TYPE_U16);
	} else {
		sampler = create_immed_typed(ctx->block, tex->sampler_index, TYPE_U16);
	}

	return ir3_create_collect(ctx, (struct ir3_instruction*[]){
		sampler,
		texture,
	}, 2);
}
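
/* Note the collect order above: sampler idx in the first (half)
 * component, texture idx in the second; e.g. with no dynamic offsets,
 * sampler_index=2 and texture_index=5 end up as the immediate u16
 * pair (2, 5).
 */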

static void
emit_tex(struct ir3_context *ctx, nir_tex_instr *tex)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
	struct ir3_instruction * const *coord, * const *off, * const *ddx, * const *ddy;
	struct ir3_instruction *lod, *compare, *proj, *sample_index;
	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
	unsigned i, coords, flags, ncomp;
	unsigned nsrc0 = 0, nsrc1 = 0;
	type_t type;
	opc_t opc = 0;

	ncomp = nir_dest_num_components(tex->dest);

	coord = off = ddx = ddy = NULL;
	lod = proj = compare = sample_index = NULL;

	dst = ir3_get_dst(ctx, &tex->dest, ncomp);

	for (unsigned i = 0; i < tex->num_srcs; i++) {
		switch (tex->src[i].src_type) {
		case nir_tex_src_coord:
			coord = ir3_get_src(ctx, &tex->src[i].src);
			break;
		case nir_tex_src_bias:
			lod = ir3_get_src(ctx, &tex->src[i].src)[0];
			has_bias = true;
			break;
		case nir_tex_src_lod:
			lod = ir3_get_src(ctx, &tex->src[i].src)[0];
			has_lod = true;
			break;
		case nir_tex_src_comparator: /* shadow comparator */
			compare = ir3_get_src(ctx, &tex->src[i].src)[0];
			break;
		case nir_tex_src_projector:
			proj = ir3_get_src(ctx, &tex->src[i].src)[0];
			has_proj = true;
			break;
		case nir_tex_src_offset:
			off = ir3_get_src(ctx, &tex->src[i].src);
			has_off = true;
			break;
		case nir_tex_src_ddx:
			ddx = ir3_get_src(ctx, &tex->src[i].src);
			break;
		case nir_tex_src_ddy:
			ddy = ir3_get_src(ctx, &tex->src[i].src);
			break;
		case nir_tex_src_ms_index:
			sample_index = ir3_get_src(ctx, &tex->src[i].src)[0];
			break;
		case nir_tex_src_texture_offset:
		case nir_tex_src_sampler_offset:
			/* handled in get_tex_samp_tex_src() */
			break;
		default:
			ir3_context_error(ctx, "Unhandled NIR tex src type: %d\n",
					tex->src[i].src_type);
			return;
		}
	}

	switch (tex->op) {
	case nir_texop_tex: opc = has_lod ? OPC_SAML : OPC_SAM; break;
	case nir_texop_txb: opc = OPC_SAMB; break;
	case nir_texop_txl: opc = OPC_SAML; break;
	case nir_texop_txd: opc = OPC_SAMGQ; break;
	case nir_texop_txf: opc = OPC_ISAML; break;
	case nir_texop_lod: opc = OPC_GETLOD; break;
	case nir_texop_tg4:
		/* NOTE: a4xx might need to emulate gather w/ txf (this is
		 * what blob does, seems gather is broken?), and a3xx did
		 * not support it (but probably could also emulate).
		 */
		switch (tex->component) {
		case 0: opc = OPC_GATHER4R; break;
		case 1: opc = OPC_GATHER4G; break;
		case 2: opc = OPC_GATHER4B; break;
		case 3: opc = OPC_GATHER4A; break;
		}
		break;
	case nir_texop_txf_ms_fb:
	case nir_texop_txf_ms: opc = OPC_ISAMM; break;
	default:
		ir3_context_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
		return;
	}

	tex_info(tex, &flags, &coords);

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - shadow reference
	 *  - array index
	 *  - projection w
	 *  - starting at offset 4, dpdx.xy, dpdy.xy
	 *
	 * bias/lod go into the second arg
	 */

	/* insert tex coords: */
	for (i = 0; i < coords; i++)
		src0[i] = coord[i];

	nsrc0 = i;

	/* scale up integer coords for TXF based on the LOD */
	if (ctx->compiler->unminify_coords && (opc == OPC_ISAML)) {
		assert(has_lod);
		for (i = 0; i < coords; i++)
			src0[i] = ir3_SHL_B(b, src0[i], 0, lod, 0);
	}

	if (coords == 1) {
		/* hw doesn't do 1d, so we treat it as 2d with
		 * height of 1, and patch up the y coord.
		 */
		if (is_isam(opc)) {
			src0[nsrc0++] = create_immed(b, 0);
		} else {
			src0[nsrc0++] = create_immed(b, fui(0.5));
		}
	}

	if (tex->is_shadow && tex->op != nir_texop_lod)
		src0[nsrc0++] = compare;

	if (tex->is_array && tex->op != nir_texop_lod) {
		struct ir3_instruction *idx = coord[coords];

		/* the array coord for cube arrays needs 0.5 added to it */
		if (ctx->compiler->array_index_add_half && !is_isam(opc))
			idx = ir3_ADD_F(b, idx, 0, create_immed(b, fui(0.5)), 0);

		src0[nsrc0++] = idx;
	}

	if (has_proj) {
		src0[nsrc0++] = proj;
		flags |= IR3_INSTR_P;
	}

	/* pad to 4, then ddx/ddy: */
	if (tex->op == nir_texop_txd) {
		while (nsrc0 < 4)
			src0[nsrc0++] = create_immed(b, fui(0.0));
		for (i = 0; i < coords; i++)
			src0[nsrc0++] = ddx[i];
		if (coords < 2)
			src0[nsrc0++] = create_immed(b, fui(0.0));
		for (i = 0; i < coords; i++)
			src0[nsrc0++] = ddy[i];
		if (coords < 2)
			src0[nsrc0++] = create_immed(b, fui(0.0));
	}

	/* NOTE a3xx (and possibly a4xx?) might be different, using isaml
	 * with scaled x coord according to requested sample:
	 */
	if (opc == OPC_ISAMM) {
		if (ctx->compiler->txf_ms_with_isaml) {
			/* the samples are laid out in x dimension as
			 *     0 1 2 3
			 * x_ms = (x << ms) + sample_index;
			 */
			struct ir3_instruction *ms;
			ms = create_immed(b, (ctx->samples >> (2 * tex->texture_index)) & 3);

			src0[0] = ir3_SHL_B(b, src0[0], 0, ms, 0);
			src0[0] = ir3_ADD_U(b, src0[0], 0, sample_index, 0);

			opc = OPC_ISAML;
		} else {
			src0[nsrc0++] = sample_index;
		}
	}
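	/* e.g. with 4x MSAA, ms == 2 (log2 of the sample count), so sample 3
	 * of texel x=5 is fetched from x_ms = (5 << 2) + 3 = 23 in the
	 * expanded single-sample layout.
	 */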

	/*
	 * second argument (if applicable):
	 *  - offsets
	 *  - lod
	 *  - bias
	 */
	if (has_off | has_lod | has_bias) {
		if (has_off) {
			unsigned off_coords = coords;
			if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
				off_coords--;
			for (i = 0; i < off_coords; i++)
				src1[nsrc1++] = off[i];
			if (off_coords < 2)
				src1[nsrc1++] = create_immed(b, fui(0.0));
			flags |= IR3_INSTR_O;
		}

		if (has_lod | has_bias)
			src1[nsrc1++] = lod;
	}

	switch (tex->dest_type) {
	case nir_type_invalid:
	case nir_type_float:
		type = TYPE_F32;
		break;
	case nir_type_int:
		type = TYPE_S32;
		break;
	case nir_type_uint:
	case nir_type_bool:
		type = TYPE_U32;
		break;
	default:
		unreachable("bad dest_type");
	}

	if (opc == OPC_GETLOD)
		type = TYPE_U32;

	struct ir3_instruction *samp_tex;

	if (tex->op == nir_texop_txf_ms_fb) {
		/* only expect a single txf_ms_fb per shader: */
		compile_assert(ctx, !ctx->so->fb_read);
		compile_assert(ctx, ctx->so->type == MESA_SHADER_FRAGMENT);

		ctx->so->fb_read = true;
		samp_tex = ir3_create_collect(ctx, (struct ir3_instruction*[]){
			create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
			create_immed_typed(ctx->block, ctx->so->num_samp, TYPE_U16),
		}, 2);

		ctx->so->num_samp++;
	} else {
		samp_tex = get_tex_samp_tex_src(ctx, tex);
	}

	struct ir3_instruction *col0 = ir3_create_collect(ctx, src0, nsrc0);
	struct ir3_instruction *col1 = ir3_create_collect(ctx, src1, nsrc1);

	sam = ir3_SAM(b, opc, type, MASK(ncomp), flags,
			samp_tex, col0, col1);

	if ((ctx->astc_srgb & (1 << tex->texture_index)) && !nir_tex_instr_is_query(tex)) {
		/* only need first 3 components: */
		sam->regs[0]->wrmask = 0x7;
		ir3_split_dest(b, dst, sam, 0, 3);

		/* we need to sample the alpha separately with a non-ASTC
		 * texture state:
		 */
		sam = ir3_SAM(b, opc, type, 0b1000, flags,
				samp_tex, col0, col1);

		array_insert(ctx->ir, ctx->ir->astc_srgb, sam);

		/* fixup .w component: */
		ir3_split_dest(b, &dst[3], sam, 3, 1);
	} else {
		/* normal (non-workaround) case: */
		ir3_split_dest(b, dst, sam, 0, ncomp);
	}

	/* GETLOD returns results in 4.8 fixed point */
	if (opc == OPC_GETLOD) {
		struct ir3_instruction *factor = create_immed(b, fui(1.0 / 256));

		compile_assert(ctx, tex->dest_type == nir_type_float);
		for (i = 0; i < 2; i++) {
			dst[i] = ir3_MUL_F(b, ir3_COV(b, dst[i], TYPE_U32, TYPE_F32), 0,
					factor, 0);
		}
	}
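	/* i.e. 8 fractional bits: a raw getlod result of 0x180 (384) becomes
	 * 384 * (1.0 / 256) = 1.5 after the cov.u32f32 + mul.f above.
	 */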

	ir3_put_dst(ctx, &tex->dest);
}

static void
emit_tex_query_levels(struct ir3_context *ctx, nir_tex_instr *tex)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction **dst, *sam;

	dst = ir3_get_dst(ctx, &tex->dest, 1);

	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, 0b0100, 0,
			get_tex_samp_tex_src(ctx, tex), NULL, NULL);

	/* even though there is only one component, since it ends
	 * up in .z rather than .x, we need a split_dest()
	 */
	ir3_split_dest(b, dst, sam, 0, 3);

	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
	 * the value in TEX_CONST_0 is zero-based.
	 */
	if (ctx->compiler->levels_add_one)
		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);

	ir3_put_dst(ctx, &tex->dest);
}

static void
emit_tex_txs(struct ir3_context *ctx, nir_tex_instr *tex)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction **dst, *sam;
	struct ir3_instruction *lod;
	unsigned flags, coords;

	tex_info(tex, &flags, &coords);

	/* Actually we want the number of dimensions, not coordinates. This
	 * distinction only matters for cubes.
	 */
	if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
		coords = 2;

	dst = ir3_get_dst(ctx, &tex->dest, 4);

	compile_assert(ctx, tex->num_srcs == 1);
	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);

	lod = ir3_get_src(ctx, &tex->src[0].src)[0];

	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, 0b1111, flags,
			get_tex_samp_tex_src(ctx, tex), lod, NULL);

	ir3_split_dest(b, dst, sam, 0, 4);

	/* Array size actually ends up in .w rather than .z. This doesn't
	 * matter for miplevel 0, but for higher mips the value in z is
	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
	 * returned, which means that we have to add 1 to it for arrays.
	 */
	if (tex->is_array) {
		if (ctx->compiler->levels_add_one) {
			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
		} else {
			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
		}
	}

	ir3_put_dst(ctx, &tex->dest);
}

static void
emit_jump(struct ir3_context *ctx, nir_jump_instr *jump)
{
	switch (jump->type) {
	case nir_jump_break:
	case nir_jump_continue:
	case nir_jump_return:
		/* I *think* we can simply just ignore this, and use the
		 * successor block link to figure out where we need to
		 * jump to for break/continue
		 */
		break;
	default:
		ir3_context_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
		break;
	}
}

static void
emit_instr(struct ir3_context *ctx, nir_instr *instr)
{
	switch (instr->type) {
	case nir_instr_type_alu:
		emit_alu(ctx, nir_instr_as_alu(instr));
		break;
	case nir_instr_type_deref:
		/* ignored, handled as part of the intrinsic they are src to */
		break;
	case nir_instr_type_intrinsic:
		emit_intrinsic(ctx, nir_instr_as_intrinsic(instr));
		break;
	case nir_instr_type_load_const:
		emit_load_const(ctx, nir_instr_as_load_const(instr));
		break;
	case nir_instr_type_ssa_undef:
		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
		break;
	case nir_instr_type_tex: {
		nir_tex_instr *tex = nir_instr_as_tex(instr);
		/* a couple of tex instructions get special-cased:
		 */
		switch (tex->op) {
		case nir_texop_txs:
			emit_tex_txs(ctx, tex);
			break;
		case nir_texop_query_levels:
			emit_tex_query_levels(ctx, tex);
			break;
		default:
			emit_tex(ctx, tex);
			break;
		}
		break;
	}
	case nir_instr_type_jump:
		emit_jump(ctx, nir_instr_as_jump(instr));
		break;
	case nir_instr_type_phi:
		/* we have converted phi webs to regs in NIR by now */
		ir3_context_error(ctx, "Unexpected NIR instruction type: %d\n", instr->type);
		break;
	case nir_instr_type_call:
	case nir_instr_type_parallel_copy:
		ir3_context_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
		break;
	}
}

static struct ir3_block *
get_block(struct ir3_context *ctx, const nir_block *nblock)
{
	struct ir3_block *block;
	struct hash_entry *hentry;
	unsigned i;

	hentry = _mesa_hash_table_search(ctx->block_ht, nblock);
	if (hentry)
		return hentry->data;

	block = ir3_block_create(ctx->ir);
	block->nblock = nblock;
	_mesa_hash_table_insert(ctx->block_ht, nblock, block);

	block->predecessors_count = nblock->predecessors->entries;
	block->predecessors = ralloc_array_size(block,
			sizeof(block->predecessors[0]), block->predecessors_count);
	i = 0;
	set_foreach(nblock->predecessors, sentry) {
		block->predecessors[i++] = get_block(ctx, sentry->key);
	}

	return block;
}

static void
emit_block(struct ir3_context *ctx, nir_block *nblock)
{
	struct ir3_block *block = get_block(ctx, nblock);

	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
		if (nblock->successors[i]) {
			block->successors[i] =
				get_block(ctx, nblock->successors[i]);
		}
	}

	ctx->block = block;
	list_addtail(&block->node, &ctx->ir->block_list);

	/* re-emit addr register in each block if needed: */
	for (int i = 0; i < ARRAY_SIZE(ctx->addr_ht); i++) {
		_mesa_hash_table_destroy(ctx->addr_ht[i], NULL);
		ctx->addr_ht[i] = NULL;
	}

	nir_foreach_instr(instr, nblock) {
		ctx->cur_instr = instr;
		emit_instr(ctx, instr);
		ctx->cur_instr = NULL;
		if (ctx->error)
			return;
	}
}

static void emit_cf_list(struct ir3_context *ctx, struct exec_list *list);

static void
emit_if(struct ir3_context *ctx, nir_if *nif)
{
	struct ir3_instruction *condition = ir3_get_src(ctx, &nif->condition)[0];

	ctx->block->condition =
		ir3_get_predicate(ctx, ir3_b2n(condition->block, condition));

	emit_cf_list(ctx, &nif->then_list);
	emit_cf_list(ctx, &nif->else_list);
}

static void
emit_loop(struct ir3_context *ctx, nir_loop *nloop)
{
	emit_cf_list(ctx, &nloop->body);
}

static void
stack_push(struct ir3_context *ctx)
{
	ctx->stack++;
	ctx->max_stack = MAX2(ctx->max_stack, ctx->stack);
}

static void
stack_pop(struct ir3_context *ctx)
{
	compile_assert(ctx, ctx->stack > 0);
	ctx->stack--;
}

static void
emit_cf_list(struct ir3_context *ctx, struct exec_list *list)
{
	foreach_list_typed(nir_cf_node, node, node, list) {
		switch (node->type) {
		case nir_cf_node_block:
			emit_block(ctx, nir_cf_node_as_block(node));
			break;
		case nir_cf_node_if:
			stack_push(ctx);
			emit_if(ctx, nir_cf_node_as_if(node));
			stack_pop(ctx);
			break;
		case nir_cf_node_loop:
			stack_push(ctx);
			emit_loop(ctx, nir_cf_node_as_loop(node));
			stack_pop(ctx);
			break;
		case nir_cf_node_function:
			ir3_context_error(ctx, "TODO\n");
			break;
		}
	}
}

/* emit stream-out code. At this point, the current block is the original
 * (nir) end block, and nir ensures that all flow control paths terminate
 * into the end block. We re-purpose the original end block to generate
 * the 'if (vtxcnt < maxvtxcnt)' condition, then append the conditional
 * block holding stream-out write instructions, followed by the new end
 * block:
 *
 *   blockOrigEnd {
 *      p0.x = (vtxcnt < maxvtxcnt)
 *      // succs: blockStreamOut, blockNewEnd
 *   }
 *   blockStreamOut {
 *      ... stream-out instructions ...
 *      // succs: blockNewEnd
 *   }
 *   blockNewEnd {
 *   }
 */
static void
emit_stream_out(struct ir3_context *ctx)
{
	struct ir3_shader_variant *v = ctx->so;
	struct ir3 *ir = ctx->ir;
	struct ir3_stream_output_info *strmout =
			&ctx->so->shader->stream_output;
	struct ir3_block *orig_end_block, *stream_out_block, *new_end_block;
	struct ir3_instruction *vtxcnt, *maxvtxcnt, *cond;
	struct ir3_instruction *bases[IR3_MAX_SO_BUFFERS];

	/* create vtxcnt input in input block at top of shader,
	 * so that it is seen as live over the entire duration
	 * of the shader:
	 */
	vtxcnt = create_input(ctx, 0);
	add_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, vtxcnt);

	maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);

	/* at this point, we are at the original 'end' block,
	 * re-purpose this block to stream-out condition, then
	 * append stream-out block and new-end block
	 */
	orig_end_block = ctx->block;

// TODO these blocks need to update predecessors..
// maybe w/ store_global intrinsic, we could do this
// stuff in nir->nir pass

	stream_out_block = ir3_block_create(ir);
	list_addtail(&stream_out_block->node, &ir->block_list);

	new_end_block = ir3_block_create(ir);
	list_addtail(&new_end_block->node, &ir->block_list);

	orig_end_block->successors[0] = stream_out_block;
	orig_end_block->successors[1] = new_end_block;
	stream_out_block->successors[0] = new_end_block;

	/* setup 'if (vtxcnt < maxvtxcnt)' condition: */
	cond = ir3_CMPS_S(ctx->block, vtxcnt, 0, maxvtxcnt, 0);
	cond->regs[0]->num = regid(REG_P0, 0);
	cond->cat2.condition = IR3_COND_LT;

	/* condition goes on previous block to the conditional,
	 * since it is used to pick which of the two successor
	 * paths to take:
	 */
	orig_end_block->condition = cond;

	/* switch to stream_out_block to generate the stream-out
	 * instructions:
	 */
	ctx->block = stream_out_block;

	/* Calculate base addresses based on vtxcnt. Instructions
	 * generated for bases not used in following loop will be
	 * stripped out in the backend.
	 */
	for (unsigned i = 0; i < IR3_MAX_SO_BUFFERS; i++) {
		unsigned stride = strmout->stride[i];
		struct ir3_instruction *base, *off;

		base = create_uniform(ctx->block, regid(v->constbase.tfbo, i));

		/* 24-bit should be enough: */
		off = ir3_MUL_U(ctx->block, vtxcnt, 0,
				create_immed(ctx->block, stride * 4), 0);

		bases[i] = ir3_ADD_S(ctx->block, off, 0, base, 0);
	}
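	/* i.e. the write address for buffer i advances by stride[i] dwords
	 * per vertex, so vertex n of a buffer with stride 4 presumably
	 * starts at base + n * 16 bytes (hence the stride * 4 scale above).
	 */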

	/* Generate the per-output store instructions: */
	for (unsigned i = 0; i < strmout->num_outputs; i++) {
		for (unsigned j = 0; j < strmout->output[i].num_components; j++) {
			unsigned c = j + strmout->output[i].start_component;
			struct ir3_instruction *base, *out, *stg;

			base = bases[strmout->output[i].output_buffer];
			out = ctx->ir->outputs[regid(strmout->output[i].register_index, c)];

			stg = ir3_STG(ctx->block, base, 0, out, 0,
					create_immed(ctx->block, 1), 0);
			stg->cat6.type = TYPE_U32;
			stg->cat6.dst_offset = (strmout->output[i].dst_offset + j) * 4;

			array_insert(ctx->block, ctx->block->keeps, stg);
		}
	}

	/* and finally switch to the new_end_block: */
	ctx->block = new_end_block;
}

static void
emit_function(struct ir3_context *ctx, nir_function_impl *impl)
{
	nir_metadata_require(impl, nir_metadata_block_index);

	compile_assert(ctx, ctx->stack == 0);

	emit_cf_list(ctx, &impl->body);
	emit_block(ctx, impl->end_block);

	compile_assert(ctx, ctx->stack == 0);

	/* at this point, we should have a single empty block,
	 * into which we emit the 'end' instruction.
	 */
	compile_assert(ctx, list_empty(&ctx->block->instr_list));

	/* If stream-out (aka transform-feedback) enabled, emit the
	 * stream-out instructions, followed by a new empty block (into
	 * which the 'end' instruction lands).
	 *
	 * NOTE: it is done in this order, rather than inserting before
	 * we emit end_block, because NIR guarantees that all blocks
	 * flow into end_block, and that end_block has no successors.
	 * So by re-purposing end_block as the first block of stream-
	 * out, we guarantee that all exit paths flow into the stream-
	 * out instructions.
	 */
	if ((ctx->compiler->gpu_id < 500) &&
			(ctx->so->shader->stream_output.num_outputs > 0) &&
			!ctx->so->binning_pass) {
		debug_assert(ctx->so->type == MESA_SHADER_VERTEX);
		emit_stream_out(ctx);
	}

	ir3_END(ctx->block);
}

static void
setup_input(struct ir3_context *ctx, nir_variable *in)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned ncomp = glsl_get_components(in->type);
	unsigned n = in->data.driver_location;
	unsigned frac = in->data.location_frac;
	unsigned slot = in->data.location;

	/* skip unread inputs: we could end up with (for example) unsplit
	 * matrix/etc inputs in the case they are not read, so just silently
	 * skip these.
	 */
	if (ncomp > 4)
		return;

	so->inputs[n].slot = slot;
	so->inputs[n].compmask = (1 << (ncomp + frac)) - 1;
	so->inputs_count = MAX2(so->inputs_count, n + 1);
	so->inputs[n].interpolate = in->data.interpolation;
	so->inputs[n].ncomp = ncomp;

	if (ctx->so->type == MESA_SHADER_FRAGMENT) {

		/* if any varyings have 'sample' qualifier, that triggers us
		 * to run in per-sample mode:
		 */
		so->per_samp |= in->data.sample;

		for (int i = 0; i < ncomp; i++) {
			struct ir3_instruction *instr = NULL;
			unsigned idx = (n * 4) + i + frac;

			if (slot == VARYING_SLOT_POS) {
				ir3_context_error(ctx, "fragcoord should be a sysval!\n");
			} else if (slot == VARYING_SLOT_PNTC) {
				/* see for example st_nir_fixup_varying_slots().. this is
				 * maybe a bit mesa/st specific. But we need things to line
				 * up for this in fdN_program:
				 *    unsigned texmask = 1 << (slot - VARYING_SLOT_VAR0);
				 *    if (emit->sprite_coord_enable & texmask) {
				 *       ...
				 *    }
				 */
				so->inputs[n].slot = VARYING_SLOT_VAR8;
				so->inputs[n].bary = true;
				instr = create_frag_input(ctx, false, idx);
			} else {
				/* detect the special case for front/back colors where
				 * we need to do flat vs smooth shading depending on
				 * rast state:
				 */
				if (in->data.interpolation == INTERP_MODE_NONE) {
					switch (slot) {
					case VARYING_SLOT_COL0:
					case VARYING_SLOT_COL1:
					case VARYING_SLOT_BFC0:
					case VARYING_SLOT_BFC1:
						so->inputs[n].rasterflat = true;
						break;
					default:
						break;
					}
				}

				if (ctx->compiler->flat_bypass) {
					if ((so->inputs[n].interpolate == INTERP_MODE_FLAT) ||
							(so->inputs[n].rasterflat && ctx->so->key.rasterflat))
						so->inputs[n].use_ldlv = true;
				}

				so->inputs[n].bary = true;

				instr = create_frag_input(ctx, so->inputs[n].use_ldlv, idx);
			}

			compile_assert(ctx, idx < ctx->ir->ninputs);

			ctx->ir->inputs[idx] = instr;
		}
	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
		for (int i = 0; i < ncomp; i++) {
			unsigned idx = (n * 4) + i + frac;
			compile_assert(ctx, idx < ctx->ir->ninputs);
			ctx->ir->inputs[idx] = create_input(ctx, idx);
		}
	} else {
		ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
	}

	if (so->inputs[n].bary || (ctx->so->type == MESA_SHADER_VERTEX)) {
		so->total_in += ncomp;
	}
}

/* Initially we assign non-packed inloc's for varyings, as we don't really
 * know up-front which components will be unused. After all the compilation
 * stages we scan the shader to see which components are actually used, and
 * re-pack the inlocs to eliminate unneeded varyings.
 */
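/* For example, if only .x of input 0 and .xy of input 1 survive
 * compilation, the re-pack assigns input 0 inloc 0 and input 1
 * inlocs 1..2, instead of the sparse pre-packing slots 0 and 4..5.
 */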
static void
pack_inlocs(struct ir3_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	uint8_t used_components[so->inputs_count];

	memset(used_components, 0, sizeof(used_components));

	/*
	 * First Step: scan shader to find which bary.f/ldlv remain:
	 */

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if (is_input(instr)) {
				unsigned inloc = instr->regs[1]->iim_val;
				unsigned i = inloc / 4;
				unsigned j = inloc % 4;

				compile_assert(ctx, instr->regs[1]->flags & IR3_REG_IMMED);
				compile_assert(ctx, i < so->inputs_count);

				used_components[i] |= 1 << j;
			}
		}
	}

	/*
	 * Second Step: reassign varying inloc/slots:
	 */

	unsigned actual_in = 0;
	unsigned inloc = 0;

	for (unsigned i = 0; i < so->inputs_count; i++) {
		unsigned compmask = 0, maxcomp = 0;

		so->inputs[i].ncomp = 0;
		so->inputs[i].inloc = inloc;
		so->inputs[i].bary = false;

		for (unsigned j = 0; j < 4; j++) {
			if (!(used_components[i] & (1 << j)))
				continue;

			compmask |= (1 << j);
			actual_in++;
			so->inputs[i].ncomp++;
			maxcomp = j + 1;

			/* at this point, since used_components[i] mask is only
			 * considering varyings (ie. not sysvals) we know this
			 * is a varying:
			 */
			so->inputs[i].bary = true;
		}

		if (so->inputs[i].bary) {
			so->varying_in++;
			so->inputs[i].compmask = (1 << maxcomp) - 1;
			inloc += maxcomp;
		}
	}

	/*
	 * Third Step: reassign packed inloc's:
	 */

	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
			if (is_input(instr)) {
				unsigned inloc = instr->regs[1]->iim_val;
				unsigned i = inloc / 4;
				unsigned j = inloc % 4;

				instr->regs[1]->iim_val = so->inputs[i].inloc + j;
			}
		}
	}
}

static void
setup_output(struct ir3_context *ctx, nir_variable *out)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned ncomp = glsl_get_components(out->type);
	unsigned n = out->data.driver_location;
	unsigned frac = out->data.location_frac;
	unsigned slot = out->data.location;
	unsigned comp = 0;

	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
		switch (slot) {
		case FRAG_RESULT_DEPTH:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case FRAG_RESULT_COLOR:
			so->color0_mrt = 1;
			break;
		case FRAG_RESULT_SAMPLE_MASK:
			so->writes_smask = true;
			break;
		default:
			if (slot >= FRAG_RESULT_DATA0)
				break;
			ir3_context_error(ctx, "unknown FS output name: %s\n",
					gl_frag_result_name(slot));
		}
	} else if (ctx->so->type == MESA_SHADER_VERTEX) {
		switch (slot) {
		case VARYING_SLOT_POS:
			so->writes_pos = true;
			break;
		case VARYING_SLOT_PSIZ:
			so->writes_psize = true;
			break;
		case VARYING_SLOT_COL0:
		case VARYING_SLOT_COL1:
		case VARYING_SLOT_BFC0:
		case VARYING_SLOT_BFC1:
		case VARYING_SLOT_FOGC:
		case VARYING_SLOT_CLIP_DIST0:
		case VARYING_SLOT_CLIP_DIST1:
		case VARYING_SLOT_CLIP_VERTEX:
			break;
		default:
			if (slot >= VARYING_SLOT_VAR0)
				break;
			if ((VARYING_SLOT_TEX0 <= slot) && (slot <= VARYING_SLOT_TEX7))
				break;
			ir3_context_error(ctx, "unknown VS output name: %s\n",
					gl_varying_slot_name(slot));
		}
	} else {
		ir3_context_error(ctx, "unknown shader type: %d\n", ctx->so->type);
	}

	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

	so->outputs[n].slot = slot;
	so->outputs[n].regid = regid(n, comp);
	so->outputs_count = MAX2(so->outputs_count, n + 1);

	for (int i = 0; i < ncomp; i++) {
		unsigned idx = (n * 4) + i + frac;
		compile_assert(ctx, idx < ctx->ir->noutputs);
		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
	}

	/* if varying packing doesn't happen, we could end up in a situation
	 * with "holes" in the output, and since the per-generation code that
	 * sets up varying linkage registers doesn't expect to have more than
	 * one varying per vec4 slot, pad the holes.
	 *
	 * Note that this should probably generate a performance warning of
	 * some sort.
	 */
	for (int i = 0; i < frac; i++) {
		unsigned idx = (n * 4) + i;
		if (!ctx->ir->outputs[idx]) {
			ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
		}
	}
}

static int
max_drvloc(struct exec_list *vars)
{
	int drvloc = -1;
	nir_foreach_variable(var, vars) {
		drvloc = MAX2(drvloc, (int)var->data.driver_location);
	}
	return drvloc;
}

static const unsigned max_sysvals[] = {
	[MESA_SHADER_FRAGMENT] = 24,  // TODO
	[MESA_SHADER_VERTEX] = 16,
	[MESA_SHADER_COMPUTE] = 16,   // TODO how many do we actually need?
	[MESA_SHADER_KERNEL] = 16,    // TODO how many do we actually need?
};

static void
emit_instructions(struct ir3_context *ctx)
{
	unsigned ninputs, noutputs;
	nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->s);

	ninputs = (max_drvloc(&ctx->s->inputs) + 1) * 4;
	noutputs = (max_drvloc(&ctx->s->outputs) + 1) * 4;

	/* we need to leave room for sysvals:
	 */
	ninputs += max_sysvals[ctx->so->type];

	ctx->ir = ir3_create(ctx->compiler, ctx->so->type, ninputs, noutputs);

	/* Create inputs in first block: */
	ctx->block = get_block(ctx, nir_start_block(fxn));
	ctx->in_block = ctx->block;
	list_addtail(&ctx->block->node, &ctx->ir->block_list);

	ninputs -= max_sysvals[ctx->so->type];

	/* for fragment shader, the vcoord input register is used as the
	 * base for bary.f varying fetch instrs:
	 *
	 * TODO defer creating ctx->ij_pixel and corresponding sysvals
	 * until emit_intrinsic when we know they are actually needed.
	 * For now, we defer creating ctx->ij_centroid, etc, since we
	 * only need ij_pixel for "old style" varying inputs (ie.
	 * tgsi_to_nir)
	 */
	struct ir3_instruction *vcoord = NULL;
	if (ctx->so->type == MESA_SHADER_FRAGMENT) {
		struct ir3_instruction *xy[2];

		vcoord = create_input_compmask(ctx, 0, 0x3);
		ir3_split_dest(ctx->block, xy, vcoord, 0, 2);

		ctx->ij_pixel = ir3_create_collect(ctx, xy, 2);
	}

	/* Setup inputs: */
	nir_foreach_variable(var, &ctx->s->inputs) {
		setup_input(ctx, var);
	}

	/* Defer add_sysval_input() stuff until after setup_inputs(),
	 * because sysvals need to be appended after varyings:
	 */
	if (vcoord) {
		add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PIXEL,
				0x3, vcoord);
	}

	/* Setup outputs: */
	nir_foreach_variable(var, &ctx->s->outputs) {
		setup_output(ctx, var);
	}

	/* Find # of samplers: */
	nir_foreach_variable(var, &ctx->s->uniforms) {
		ctx->so->num_samp += glsl_type_get_sampler_count(var->type);
		/* just assume that we'll be reading from images.. if it
		 * is write-only we don't have to count it, but not sure
		 * if there is a good way to know?
		 */
		ctx->so->num_samp += glsl_type_get_image_count(var->type);
	}

	/* NOTE: need to do something more clever when we support >1 fxn */
	nir_foreach_register(reg, &fxn->registers) {
		ir3_declare_array(ctx, reg);
	}

	/* And emit the body: */
	ctx->impl = fxn;
	emit_function(ctx, fxn);
}
2672b8e80941Smrg * tgsi_to_nir) 2673b8e80941Smrg */ 2674b8e80941Smrg struct ir3_instruction *vcoord = NULL; 2675b8e80941Smrg if (ctx->so->type == MESA_SHADER_FRAGMENT) { 2676b8e80941Smrg struct ir3_instruction *xy[2]; 2677b8e80941Smrg 2678b8e80941Smrg vcoord = create_input_compmask(ctx, 0, 0x3); 2679b8e80941Smrg ir3_split_dest(ctx->block, xy, vcoord, 0, 2); 2680b8e80941Smrg 2681b8e80941Smrg ctx->ij_pixel = ir3_create_collect(ctx, xy, 2); 2682b8e80941Smrg } 2683b8e80941Smrg 2684b8e80941Smrg /* Setup inputs: */ 2685b8e80941Smrg nir_foreach_variable(var, &ctx->s->inputs) { 2686b8e80941Smrg setup_input(ctx, var); 2687b8e80941Smrg } 2688b8e80941Smrg 2689b8e80941Smrg /* Defer add_sysval_input() stuff until after setup_inputs(), 2690b8e80941Smrg * because sysvals need to be appended after varyings: 2691b8e80941Smrg */ 2692b8e80941Smrg if (vcoord) { 2693b8e80941Smrg add_sysval_input_compmask(ctx, SYSTEM_VALUE_BARYCENTRIC_PIXEL, 2694b8e80941Smrg 0x3, vcoord); 2695b8e80941Smrg } 2696b8e80941Smrg 2697b8e80941Smrg /* Setup outputs: */ 2698b8e80941Smrg nir_foreach_variable(var, &ctx->s->outputs) { 2699b8e80941Smrg setup_output(ctx, var); 2700b8e80941Smrg } 2701b8e80941Smrg 2702b8e80941Smrg /* Find # of samplers: */ 2703b8e80941Smrg nir_foreach_variable(var, &ctx->s->uniforms) { 2704b8e80941Smrg ctx->so->num_samp += glsl_type_get_sampler_count(var->type); 2705b8e80941Smrg /* just assume that we'll be reading from images.. if it 2706b8e80941Smrg * is write-only we don't have to count it, but not sure 2707b8e80941Smrg * if there is a good way to know? 2708b8e80941Smrg */ 2709b8e80941Smrg ctx->so->num_samp += glsl_type_get_image_count(var->type); 2710b8e80941Smrg } 2711b8e80941Smrg 2712b8e80941Smrg /* NOTE: need to do something more clever when we support >1 fxn */ 2713b8e80941Smrg nir_foreach_register(reg, &fxn->registers) { 2714b8e80941Smrg ir3_declare_array(ctx, reg); 2715b8e80941Smrg } 2716b8e80941Smrg /* And emit the body: */ 2717b8e80941Smrg ctx->impl = fxn; 2718b8e80941Smrg emit_function(ctx, fxn); 2719b8e80941Smrg} 2720b8e80941Smrg 2721b8e80941Smrg/* from NIR perspective, we actually have varying inputs. But the varying 2722b8e80941Smrg * inputs, from an IR standpoint, are just bary.f/ldlv instructions. The 2723b8e80941Smrg * only actual inputs are the sysvals. 2724b8e80941Smrg */ 2725b8e80941Smrgstatic void 2726b8e80941Smrgfixup_frag_inputs(struct ir3_context *ctx) 2727b8e80941Smrg{ 2728b8e80941Smrg struct ir3_shader_variant *so = ctx->so; 2729b8e80941Smrg struct ir3 *ir = ctx->ir; 2730b8e80941Smrg unsigned i = 0; 2731b8e80941Smrg 2732b8e80941Smrg /* sysvals should appear at the end of the inputs, drop everything else: */ 2733b8e80941Smrg while ((i < so->inputs_count) && !so->inputs[i].sysval) 2734b8e80941Smrg i++; 2735b8e80941Smrg 2736b8e80941Smrg /* at IR level, inputs are always blocks of 4 scalars: */ 2737b8e80941Smrg i *= 4; 2738b8e80941Smrg 2739b8e80941Smrg ir->inputs = &ir->inputs[i]; 2740b8e80941Smrg ir->ninputs -= i; 2741b8e80941Smrg} 2742b8e80941Smrg 2743b8e80941Smrg/* Fixup tex sampler state for astc/srgb workaround instructions. We 2744b8e80941Smrg * need to assign the tex state indexes for these after we know the 2745b8e80941Smrg * max tex index. 2746b8e80941Smrg */ 2747b8e80941Smrgstatic void 2748b8e80941Smrgfixup_astc_srgb(struct ir3_context *ctx) 2749b8e80941Smrg{ 2750b8e80941Smrg struct ir3_shader_variant *so = ctx->so; 2751b8e80941Smrg /* indexed by original tex idx, value is newly assigned alpha sampler 2752b8e80941Smrg * state tex idx. 
/* Fixup tex sampler state for astc/srgb workaround instructions.  We
 * need to assign the tex state indexes for these after we know the
 * max tex index.
 */
static void
fixup_astc_srgb(struct ir3_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	/* indexed by original tex idx, value is newly assigned alpha sampler
	 * state tex idx.  Zero is invalid since there is at least one sampler
	 * if we get here.
	 */
	unsigned alt_tex_state[16] = {0};
	unsigned tex_idx = ctx->max_texture_index + 1;
	unsigned idx = 0;

	so->astc_srgb.base = tex_idx;

	for (unsigned i = 0; i < ctx->ir->astc_srgb_count; i++) {
		struct ir3_instruction *sam = ctx->ir->astc_srgb[i];

		compile_assert(ctx, sam->cat5.tex < ARRAY_SIZE(alt_tex_state));

		if (alt_tex_state[sam->cat5.tex] == 0) {
			/* assign new alternate/alpha tex state slot: */
			alt_tex_state[sam->cat5.tex] = tex_idx++;
			so->astc_srgb.orig_idx[idx++] = sam->cat5.tex;
			so->astc_srgb.count++;
		}

		sam->cat5.tex = alt_tex_state[sam->cat5.tex];
	}
}

static void
fixup_binning_pass(struct ir3_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3 *ir = ctx->ir;
	unsigned i, j;

	for (i = 0, j = 0; i < so->outputs_count; i++) {
		unsigned slot = so->outputs[i].slot;

		/* throw away everything but first position/psize */
		if ((slot == VARYING_SLOT_POS) || (slot == VARYING_SLOT_PSIZ)) {
			if (i != j) {
				so->outputs[j] = so->outputs[i];
				ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
				ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
				ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
				ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
			}
			j++;
		}
	}
	so->outputs_count = j;
	ir->noutputs = j * 4;
}
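
/* Worked example (illustrative, not from the original code): a VS writing
 * POS, PSIZ, VAR0 and VAR1 enters fixup_binning_pass() with
 * outputs_count == 4; only POS and PSIZ survive, so outputs_count becomes
 * 2 and ir->noutputs becomes 8.
 */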
int
ir3_compile_shader_nir(struct ir3_compiler *compiler,
		struct ir3_shader_variant *so)
{
	struct ir3_context *ctx;
	struct ir3 *ir;
	struct ir3_instruction **inputs;
	unsigned i;
	int ret = 0, max_bary;

	assert(!so->ir);

	ctx = ir3_context_init(compiler, so);
	if (!ctx) {
		DBG("INIT failed!");
		ret = -1;
		goto out;
	}

	emit_instructions(ctx);

	if (ctx->error) {
		DBG("EMIT failed!");
		ret = -1;
		goto out;
	}

	ir = so->ir = ctx->ir;

	/* keep track of the inputs from the TGSI perspective: */
	inputs = ir->inputs;

	/* but fixup the actual inputs for the frag shader: */
	if (so->type == MESA_SHADER_FRAGMENT)
		fixup_frag_inputs(ctx);

	/* at this point, for the binning pass, throw away unneeded outputs: */
	if (so->binning_pass && (ctx->compiler->gpu_id < 600))
		fixup_binning_pass(ctx);

	/* if we want half-precision outputs, mark the output registers
	 * as half:
	 */
	if (so->key.half_precision) {
		for (i = 0; i < ir->noutputs; i++) {
			struct ir3_instruction *out = ir->outputs[i];

			if (!out)
				continue;

			/* if the frag shader writes z, that needs to be full precision: */
			if (so->outputs[i/4].slot == FRAG_RESULT_DEPTH)
				continue;

			out->regs[0]->flags |= IR3_REG_HALF;

			/* the output could be a fanout (ie. texture fetch output),
			 * in which case we need to propagate the half-reg flag
			 * up to the definer so that RA sees it:
			 */
			if (out->opc == OPC_META_FO) {
				out = out->regs[1]->instr;
				out->regs[0]->flags |= IR3_REG_HALF;
			}

			if (out->opc == OPC_MOV) {
				out->cat1.dst_type = half_type(out->cat1.dst_type);
			}
		}
	}

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("BEFORE CP:\n");
		ir3_print(ir);
	}

	ir3_cp(ir, so);

	/* at this point, for the binning pass, throw away unneeded outputs:
	 * Note that for a6xx and later, we do this after ir3_cp to ensure
	 * that the uniform/constant layout for BS and VS matches, so that
	 * we can re-use the same VS_CONST state group.
	 */
	if (so->binning_pass && (ctx->compiler->gpu_id >= 600))
		fixup_binning_pass(ctx);
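
	/* NOTE (assumption about the debug plumbing, not stated in this file):
	 * the IR3_DBG_OPTMSGS dumps above and below are typically enabled via
	 * the IR3_SHADER_DEBUG environment variable, e.g.:
	 *
	 *   IR3_SHADER_DEBUG=optmsgs ./app
	 *
	 * which prints the IR before/after each major pass in this function.
	 */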
	/* Insert a mov when the same instruction is referenced by multiple
	 * output slots, so that each output gets a distinct instruction, e.g.
	 * dEQP-GLES31.functional.shaders.opaque_type_indexing.sampler.const_expression.vertex.sampler2dshadow
	 */
	for (int i = ir->noutputs - 1; i >= 0; i--) {
		if (!ir->outputs[i])
			continue;
		for (unsigned j = 0; j < i; j++) {
			if (ir->outputs[i] == ir->outputs[j]) {
				ir->outputs[i] =
					ir3_MOV(ir->outputs[i]->block, ir->outputs[i], TYPE_F32);
			}
		}
	}

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("BEFORE GROUPING:\n");
		ir3_print(ir);
	}

	ir3_sched_add_deps(ir);

	/* Group left/right neighbors, inserting mov's where needed to
	 * solve conflicts:
	 */
	ir3_group(ir);

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("AFTER GROUPING:\n");
		ir3_print(ir);
	}

	ir3_depth(ir);

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("AFTER DEPTH:\n");
		ir3_print(ir);
	}

	/* do Sethi–Ullman numbering before scheduling: */
	ir3_sun(ir);

	ret = ir3_sched(ir);
	if (ret) {
		DBG("SCHED failed!");
		goto out;
	}

	if (compiler->gpu_id >= 600) {
		ir3_a6xx_fixup_atomic_dests(ir, so);
	}

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("AFTER SCHED:\n");
		ir3_print(ir);
	}

	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
	if (ret) {
		DBG("RA failed!");
		goto out;
	}

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("AFTER RA:\n");
		ir3_print(ir);
	}

	if (so->type == MESA_SHADER_FRAGMENT)
		pack_inlocs(ctx);
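
	/* Sketch of the regid packing assumed below (consistent with how
	 * regid() is used elsewhere in ir3): regid(r, c) names component c of
	 * register r, so an output whose first written component lands in
	 * r2.z records regid(2, 2), and regid(63, 0) serves as the
	 * "unused/not assigned" sentinel for inputs.
	 */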
	/* fixup input/outputs: */
	for (i = 0; i < so->outputs_count; i++) {
		/* sometimes we get outputs that don't write the .x coord, like:
		 *
		 *   decl_var shader_out INTERP_MODE_NONE float Color (VARYING_SLOT_VAR9.z, 1, 0)
		 *
		 * Presumably the result of varying packing and then eliminating
		 * some unneeded varyings?  Just skip ahead to the first valid
		 * component of the output.
		 */
		for (unsigned j = 0; j < 4; j++) {
			struct ir3_instruction *instr = ir->outputs[(i*4) + j];
			if (instr) {
				so->outputs[i].regid = instr->regs[0]->num;
				so->outputs[i].half  = !!(instr->regs[0]->flags & IR3_REG_HALF);
				break;
			}
		}
	}

	/* Note that some or all channels of an input may be unused: */
	for (i = 0; i < so->inputs_count; i++) {
		unsigned j, reg = regid(63, 0);
		bool half = false;
		for (j = 0; j < 4; j++) {
			struct ir3_instruction *in = inputs[(i*4) + j];

			if (in && !(in->flags & IR3_INSTR_UNUSED)) {
				reg = in->regs[0]->num - j;
				if (half) {
					compile_assert(ctx, in->regs[0]->flags & IR3_REG_HALF);
				} else {
					half = !!(in->regs[0]->flags & IR3_REG_HALF);
				}
			}
		}
		so->inputs[i].regid = reg;
		so->inputs[i].half = half;
	}

	if (ctx->astc_srgb)
		fixup_astc_srgb(ctx);

	/* We need to do legalize after the frag shader's "bary.f" offsets
	 * (inloc) have been assigned:
	 */
	ir3_legalize(ir, &so->has_ssbo, &so->need_pixlod, &max_bary);

	if (ir3_shader_debug & IR3_DBG_OPTMSGS) {
		printf("AFTER LEGALIZE:\n");
		ir3_print(ir);
	}

	so->branchstack = ctx->max_stack;

	/* Note that actual_in counts inputs that are not bary.f'd for FS: */
	if (so->type == MESA_SHADER_FRAGMENT)
		so->total_in = max_bary + 1;

	so->max_sun = ir->max_sun;

out:
	if (ret) {
		if (so->ir)
			ir3_destroy(so->ir);
		so->ir = NULL;
	}
	ir3_context_free(ctx);

	return ret;
}