1/* 2 * Copyright (C) 2018 Jonathan Marek <jonathan@marek.ca> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 * SOFTWARE. 22 * 23 * Authors: 24 * Jonathan Marek <jonathan@marek.ca> 25 */ 26 27#include "ir2_private.h" 28 29#include "fd2_program.h" 30#include "freedreno_util.h" 31 32static const nir_shader_compiler_options options = { 33 .lower_fpow = true, 34 .lower_flrp32 = true, 35 .lower_fmod = true, 36 .lower_fdiv = true, 37 .lower_fceil = true, 38 .fuse_ffma16 = true, 39 .fuse_ffma32 = true, 40 .fuse_ffma64 = true, 41 /* .fdot_replicates = true, it is replicated, but it makes things worse */ 42 .lower_all_io_to_temps = true, 43 .vertex_id_zero_based = true, /* its not implemented anyway */ 44 .lower_bitops = true, 45 .lower_rotate = true, 46 .lower_vector_cmp = true, 47 .lower_fdph = true, 48 .has_fsub = true, 49 .has_isub = true, 50 .lower_insert_byte = true, 51 .lower_insert_word = true, 52 .force_indirect_unrolling = nir_var_all, 53}; 54 55const nir_shader_compiler_options * 56ir2_get_compiler_options(void) 57{ 58 return &options; 59} 60 61#define OPT(nir, pass, ...) \ 62 ({ \ 63 bool this_progress = false; \ 64 NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__); \ 65 this_progress; \ 66 }) 67#define OPT_V(nir, pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__) 68 69static void 70ir2_optimize_loop(nir_shader *s) 71{ 72 bool progress; 73 do { 74 progress = false; 75 76 OPT_V(s, nir_lower_vars_to_ssa); 77 progress |= OPT(s, nir_opt_copy_prop_vars); 78 progress |= OPT(s, nir_copy_prop); 79 progress |= OPT(s, nir_opt_dce); 80 progress |= OPT(s, nir_opt_cse); 81 /* progress |= OPT(s, nir_opt_gcm, true); */ 82 progress |= OPT(s, nir_opt_peephole_select, UINT_MAX, true, true); 83 progress |= OPT(s, nir_opt_intrinsics); 84 progress |= OPT(s, nir_opt_algebraic); 85 progress |= OPT(s, nir_opt_constant_folding); 86 progress |= OPT(s, nir_opt_dead_cf); 87 if (OPT(s, nir_opt_trivial_continues)) { 88 progress |= true; 89 /* If nir_opt_trivial_continues makes progress, then we need to clean 90 * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll 91 * to make progress. 92 */ 93 OPT(s, nir_copy_prop); 94 OPT(s, nir_opt_dce); 95 } 96 progress |= OPT(s, nir_opt_loop_unroll); 97 progress |= OPT(s, nir_opt_if, false); 98 progress |= OPT(s, nir_opt_remove_phis); 99 progress |= OPT(s, nir_opt_undef); 100 101 } while (progress); 102} 103 104/* trig workarounds is the same as ir3.. but we don't want to include ir3 */ 105bool ir3_nir_apply_trig_workarounds(nir_shader *shader); 106 107int 108ir2_optimize_nir(nir_shader *s, bool lower) 109{ 110 struct nir_lower_tex_options tex_options = { 111 .lower_txp = ~0u, 112 .lower_rect = 0, 113 }; 114 115 if (FD_DBG(DISASM)) { 116 debug_printf("----------------------\n"); 117 nir_print_shader(s, stdout); 118 debug_printf("----------------------\n"); 119 } 120 121 OPT_V(s, nir_lower_regs_to_ssa); 122 OPT_V(s, nir_lower_vars_to_ssa); 123 OPT_V(s, nir_lower_indirect_derefs, nir_var_shader_in | nir_var_shader_out, 124 UINT32_MAX); 125 126 if (lower) { 127 OPT_V(s, ir3_nir_apply_trig_workarounds); 128 OPT_V(s, nir_lower_tex, &tex_options); 129 } 130 131 ir2_optimize_loop(s); 132 133 OPT_V(s, nir_remove_dead_variables, nir_var_function_temp, NULL); 134 OPT_V(s, nir_opt_sink, nir_move_const_undef); 135 136 /* TODO we dont want to get shaders writing to depth for depth textures */ 137 if (s->info.stage == MESA_SHADER_FRAGMENT) { 138 nir_foreach_shader_out_variable (var, s) { 139 if (var->data.location == FRAG_RESULT_DEPTH) 140 return -1; 141 } 142 } 143 144 return 0; 145} 146 147static struct ir2_src 148load_const(struct ir2_context *ctx, float *value_f, unsigned ncomp) 149{ 150 struct fd2_shader_stateobj *so = ctx->so; 151 unsigned imm_ncomp, swiz, idx, i, j; 152 uint32_t *value = (uint32_t *)value_f; 153 154 /* try to merge with existing immediate (TODO: try with neg) */ 155 for (idx = 0; idx < so->num_immediates; idx++) { 156 swiz = 0; 157 imm_ncomp = so->immediates[idx].ncomp; 158 for (i = 0; i < ncomp; i++) { 159 for (j = 0; j < imm_ncomp; j++) { 160 if (value[i] == so->immediates[idx].val[j]) 161 break; 162 } 163 if (j == imm_ncomp) { 164 if (j == 4) 165 break; 166 so->immediates[idx].val[imm_ncomp++] = value[i]; 167 } 168 swiz |= swiz_set(j, i); 169 } 170 /* matched all components */ 171 if (i == ncomp) 172 break; 173 } 174 175 /* need to allocate new immediate */ 176 if (idx == so->num_immediates) { 177 swiz = 0; 178 imm_ncomp = 0; 179 for (i = 0; i < ncomp; i++) { 180 for (j = 0; j < imm_ncomp; j++) { 181 if (value[i] == ctx->so->immediates[idx].val[j]) 182 break; 183 } 184 if (j == imm_ncomp) { 185 so->immediates[idx].val[imm_ncomp++] = value[i]; 186 } 187 swiz |= swiz_set(j, i); 188 } 189 so->num_immediates++; 190 } 191 so->immediates[idx].ncomp = imm_ncomp; 192 193 if (ncomp == 1) 194 swiz = swiz_merge(swiz, IR2_SWIZZLE_XXXX); 195 196 return ir2_src(so->first_immediate + idx, swiz, IR2_SRC_CONST); 197} 198 199struct ir2_src 200ir2_zero(struct ir2_context *ctx) 201{ 202 return load_const(ctx, (float[]){0.0f}, 1); 203} 204 205static void 206update_range(struct ir2_context *ctx, struct ir2_reg *reg) 207{ 208 if (!reg->initialized) { 209 reg->initialized = true; 210 reg->loop_depth = ctx->loop_depth; 211 } 212 213 if (ctx->loop_depth > reg->loop_depth) { 214 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth + 1]; 215 } else { 216 reg->loop_depth = ctx->loop_depth; 217 reg->block_idx_free = -1; 218 } 219 220 /* for regs we want to free at the end of the loop in any case 221 * XXX dont do this for ssa 222 */ 223 if (reg->loop_depth) 224 reg->block_idx_free = ctx->loop_last_block[reg->loop_depth]; 225} 226 227static struct ir2_src 228make_src(struct ir2_context *ctx, nir_src src) 229{ 230 struct ir2_src res = {}; 231 struct ir2_reg *reg; 232 233 nir_const_value *const_value = nir_src_as_const_value(src); 234 235 if (const_value) { 236 assert(src.is_ssa); 237 float c[src.ssa->num_components]; 238 nir_const_value_to_array(c, const_value, src.ssa->num_components, f32); 239 return load_const(ctx, c, src.ssa->num_components); 240 } 241 242 if (!src.is_ssa) { 243 res.num = src.reg.reg->index; 244 res.type = IR2_SRC_REG; 245 reg = &ctx->reg[res.num]; 246 } else { 247 assert(ctx->ssa_map[src.ssa->index] >= 0); 248 res.num = ctx->ssa_map[src.ssa->index]; 249 res.type = IR2_SRC_SSA; 250 reg = &ctx->instr[res.num].ssa; 251 } 252 253 update_range(ctx, reg); 254 return res; 255} 256 257static void 258set_index(struct ir2_context *ctx, nir_dest *dst, struct ir2_instr *instr) 259{ 260 struct ir2_reg *reg = &instr->ssa; 261 262 if (dst->is_ssa) { 263 ctx->ssa_map[dst->ssa.index] = instr->idx; 264 } else { 265 assert(instr->is_ssa); 266 reg = &ctx->reg[dst->reg.reg->index]; 267 268 instr->is_ssa = false; 269 instr->reg = reg; 270 } 271 update_range(ctx, reg); 272} 273 274static struct ir2_instr * 275ir2_instr_create(struct ir2_context *ctx, int type) 276{ 277 struct ir2_instr *instr; 278 279 instr = &ctx->instr[ctx->instr_count++]; 280 instr->idx = ctx->instr_count - 1; 281 instr->type = type; 282 instr->block_idx = ctx->block_idx; 283 instr->pred = ctx->pred; 284 instr->is_ssa = true; 285 return instr; 286} 287 288static struct ir2_instr * 289instr_create_alu(struct ir2_context *ctx, nir_op opcode, unsigned ncomp) 290{ 291 /* emit_alu will fixup instrs that don't map directly */ 292 static const struct ir2_opc { 293 int8_t scalar, vector; 294 } nir_ir2_opc[nir_num_opcodes + 1] = { 295 [0 ... nir_num_opcodes - 1] = {-1, -1}, 296 297 [nir_op_mov] = {MAXs, MAXv}, 298 [nir_op_fneg] = {MAXs, MAXv}, 299 [nir_op_fabs] = {MAXs, MAXv}, 300 [nir_op_fsat] = {MAXs, MAXv}, 301 [nir_op_fsign] = {-1, CNDGTEv}, 302 [nir_op_fadd] = {ADDs, ADDv}, 303 [nir_op_fsub] = {ADDs, ADDv}, 304 [nir_op_fmul] = {MULs, MULv}, 305 [nir_op_ffma] = {-1, MULADDv}, 306 [nir_op_fmax] = {MAXs, MAXv}, 307 [nir_op_fmin] = {MINs, MINv}, 308 [nir_op_ffloor] = {FLOORs, FLOORv}, 309 [nir_op_ffract] = {FRACs, FRACv}, 310 [nir_op_ftrunc] = {TRUNCs, TRUNCv}, 311 [nir_op_fdot2] = {-1, DOT2ADDv}, 312 [nir_op_fdot3] = {-1, DOT3v}, 313 [nir_op_fdot4] = {-1, DOT4v}, 314 [nir_op_sge] = {-1, SETGTEv}, 315 [nir_op_slt] = {-1, SETGTv}, 316 [nir_op_sne] = {-1, SETNEv}, 317 [nir_op_seq] = {-1, SETEv}, 318 [nir_op_fcsel] = {-1, CNDEv}, 319 [nir_op_frsq] = {RECIPSQ_IEEE, -1}, 320 [nir_op_frcp] = {RECIP_IEEE, -1}, 321 [nir_op_flog2] = {LOG_IEEE, -1}, 322 [nir_op_fexp2] = {EXP_IEEE, -1}, 323 [nir_op_fsqrt] = {SQRT_IEEE, -1}, 324 [nir_op_fcos] = {COS, -1}, 325 [nir_op_fsin] = {SIN, -1}, 326 /* no fsat, fneg, fabs since source mods deal with those */ 327 328 /* so we can use this function with non-nir op */ 329#define ir2_op_cube nir_num_opcodes 330 [ir2_op_cube] = {-1, CUBEv}, 331 }; 332 333 struct ir2_opc op = nir_ir2_opc[opcode]; 334 assert(op.vector >= 0 || op.scalar >= 0); 335 336 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_ALU); 337 instr->alu.vector_opc = op.vector; 338 instr->alu.scalar_opc = op.scalar; 339 instr->alu.export = -1; 340 instr->alu.write_mask = (1 << ncomp) - 1; 341 instr->src_count = 342 opcode == ir2_op_cube ? 2 : nir_op_infos[opcode].num_inputs; 343 instr->ssa.ncomp = ncomp; 344 return instr; 345} 346 347static struct ir2_instr * 348instr_create_alu_reg(struct ir2_context *ctx, nir_op opcode, uint8_t write_mask, 349 struct ir2_instr *share_reg) 350{ 351 struct ir2_instr *instr; 352 struct ir2_reg *reg; 353 354 reg = share_reg ? share_reg->reg : &ctx->reg[ctx->reg_count++]; 355 reg->ncomp = MAX2(reg->ncomp, util_logbase2(write_mask) + 1); 356 357 instr = instr_create_alu(ctx, opcode, util_bitcount(write_mask)); 358 instr->alu.write_mask = write_mask; 359 instr->reg = reg; 360 instr->is_ssa = false; 361 return instr; 362} 363 364static struct ir2_instr * 365instr_create_alu_dest(struct ir2_context *ctx, nir_op opcode, nir_dest *dst) 366{ 367 struct ir2_instr *instr; 368 instr = instr_create_alu(ctx, opcode, nir_dest_num_components(*dst)); 369 set_index(ctx, dst, instr); 370 return instr; 371} 372 373static struct ir2_instr * 374ir2_instr_create_fetch(struct ir2_context *ctx, nir_dest *dst, 375 instr_fetch_opc_t opc) 376{ 377 struct ir2_instr *instr = ir2_instr_create(ctx, IR2_FETCH); 378 instr->fetch.opc = opc; 379 instr->src_count = 1; 380 instr->ssa.ncomp = nir_dest_num_components(*dst); 381 set_index(ctx, dst, instr); 382 return instr; 383} 384 385static struct ir2_src 386make_src_noconst(struct ir2_context *ctx, nir_src src) 387{ 388 struct ir2_instr *instr; 389 390 if (nir_src_as_const_value(src)) { 391 assert(src.is_ssa); 392 instr = instr_create_alu(ctx, nir_op_mov, src.ssa->num_components); 393 instr->src[0] = make_src(ctx, src); 394 return ir2_src(instr->idx, 0, IR2_SRC_SSA); 395 } 396 397 return make_src(ctx, src); 398} 399 400static void 401emit_alu(struct ir2_context *ctx, nir_alu_instr *alu) 402{ 403 const nir_op_info *info = &nir_op_infos[alu->op]; 404 nir_dest *dst = &alu->dest.dest; 405 struct ir2_instr *instr; 406 struct ir2_src tmp; 407 unsigned ncomp; 408 409 /* get the number of dst components */ 410 if (dst->is_ssa) { 411 ncomp = dst->ssa.num_components; 412 } else { 413 ncomp = 0; 414 for (int i = 0; i < 4; i++) 415 ncomp += !!(alu->dest.write_mask & 1 << i); 416 } 417 418 instr = instr_create_alu(ctx, alu->op, ncomp); 419 set_index(ctx, dst, instr); 420 instr->alu.saturate = alu->dest.saturate; 421 instr->alu.write_mask = alu->dest.write_mask; 422 423 for (int i = 0; i < info->num_inputs; i++) { 424 nir_alu_src *src = &alu->src[i]; 425 426 /* compress swizzle with writemask when applicable */ 427 unsigned swiz = 0, j = 0; 428 for (int i = 0; i < 4; i++) { 429 if (!(alu->dest.write_mask & 1 << i) && !info->output_size) 430 continue; 431 swiz |= swiz_set(src->swizzle[i], j++); 432 } 433 434 instr->src[i] = make_src(ctx, src->src); 435 instr->src[i].swizzle = swiz_merge(instr->src[i].swizzle, swiz); 436 instr->src[i].negate = src->negate; 437 instr->src[i].abs = src->abs; 438 } 439 440 /* workarounds for NIR ops that don't map directly to a2xx ops */ 441 switch (alu->op) { 442 case nir_op_fneg: 443 instr->src[0].negate = 1; 444 break; 445 case nir_op_fabs: 446 instr->src[0].abs = 1; 447 break; 448 case nir_op_fsat: 449 instr->alu.saturate = 1; 450 break; 451 case nir_op_slt: 452 tmp = instr->src[0]; 453 instr->src[0] = instr->src[1]; 454 instr->src[1] = tmp; 455 break; 456 case nir_op_fcsel: 457 tmp = instr->src[1]; 458 instr->src[1] = instr->src[2]; 459 instr->src[2] = tmp; 460 break; 461 case nir_op_fsub: 462 instr->src[1].negate = !instr->src[1].negate; 463 break; 464 case nir_op_fdot2: 465 instr->src_count = 3; 466 instr->src[2] = ir2_zero(ctx); 467 break; 468 case nir_op_fsign: { 469 /* we need an extra instruction to deal with the zero case */ 470 struct ir2_instr *tmp; 471 472 /* tmp = x == 0 ? 0 : 1 */ 473 tmp = instr_create_alu(ctx, nir_op_fcsel, ncomp); 474 tmp->src[0] = instr->src[0]; 475 tmp->src[1] = ir2_zero(ctx); 476 tmp->src[2] = load_const(ctx, (float[]){1.0f}, 1); 477 478 /* result = x >= 0 ? tmp : -tmp */ 479 instr->src[1] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); 480 instr->src[2] = instr->src[1]; 481 instr->src[2].negate = true; 482 instr->src_count = 3; 483 } break; 484 default: 485 break; 486 } 487} 488 489static void 490load_input(struct ir2_context *ctx, nir_dest *dst, unsigned idx) 491{ 492 struct ir2_instr *instr; 493 int slot = -1; 494 495 if (ctx->so->type == MESA_SHADER_VERTEX) { 496 instr = ir2_instr_create_fetch(ctx, dst, 0); 497 instr->src[0] = ir2_src(0, 0, IR2_SRC_INPUT); 498 instr->fetch.vtx.const_idx = 20 + (idx / 3); 499 instr->fetch.vtx.const_idx_sel = idx % 3; 500 return; 501 } 502 503 /* get slot from idx */ 504 nir_foreach_shader_in_variable (var, ctx->nir) { 505 if (var->data.driver_location == idx) { 506 slot = var->data.location; 507 break; 508 } 509 } 510 assert(slot >= 0); 511 512 switch (slot) { 513 case VARYING_SLOT_POS: 514 /* need to extract xy with abs and add tile offset on a20x 515 * zw from fragcoord input (w inverted in fragment shader) 516 * TODO: only components that are required by fragment shader 517 */ 518 instr = instr_create_alu_reg( 519 ctx, ctx->so->is_a20x ? nir_op_fadd : nir_op_mov, 3, NULL); 520 instr->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); 521 instr->src[0].abs = true; 522 /* on a20x, C64 contains the tile offset */ 523 instr->src[1] = ir2_src(64, 0, IR2_SRC_CONST); 524 525 instr = instr_create_alu_reg(ctx, nir_op_mov, 4, instr); 526 instr->src[0] = ir2_src(ctx->f->fragcoord, 0, IR2_SRC_INPUT); 527 528 instr = instr_create_alu_reg(ctx, nir_op_frcp, 8, instr); 529 instr->src[0] = ir2_src(ctx->f->fragcoord, IR2_SWIZZLE_Y, IR2_SRC_INPUT); 530 531 unsigned reg_idx = instr->reg - ctx->reg; /* XXX */ 532 instr = instr_create_alu_dest(ctx, nir_op_mov, dst); 533 instr->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); 534 break; 535 default: 536 instr = instr_create_alu_dest(ctx, nir_op_mov, dst); 537 instr->src[0] = ir2_src(idx, 0, IR2_SRC_INPUT); 538 break; 539 } 540} 541 542static unsigned 543output_slot(struct ir2_context *ctx, nir_intrinsic_instr *intr) 544{ 545 int slot = -1; 546 unsigned idx = nir_intrinsic_base(intr); 547 nir_foreach_shader_out_variable (var, ctx->nir) { 548 if (var->data.driver_location == idx) { 549 slot = var->data.location; 550 break; 551 } 552 } 553 assert(slot != -1); 554 return slot; 555} 556 557static void 558store_output(struct ir2_context *ctx, nir_src src, unsigned slot, 559 unsigned ncomp) 560{ 561 struct ir2_instr *instr; 562 unsigned idx = 0; 563 564 if (ctx->so->type == MESA_SHADER_VERTEX) { 565 switch (slot) { 566 case VARYING_SLOT_POS: 567 ctx->position = make_src(ctx, src); 568 idx = 62; 569 break; 570 case VARYING_SLOT_PSIZ: 571 ctx->so->writes_psize = true; 572 idx = 63; 573 break; 574 default: 575 /* find matching slot from fragment shader input */ 576 for (idx = 0; idx < ctx->f->inputs_count; idx++) 577 if (ctx->f->inputs[idx].slot == slot) 578 break; 579 if (idx == ctx->f->inputs_count) 580 return; 581 } 582 } else if (slot != FRAG_RESULT_COLOR && slot != FRAG_RESULT_DATA0) { 583 /* only color output is implemented */ 584 return; 585 } 586 587 instr = instr_create_alu(ctx, nir_op_mov, ncomp); 588 instr->src[0] = make_src(ctx, src); 589 instr->alu.export = idx; 590} 591 592static void 593emit_intrinsic(struct ir2_context *ctx, nir_intrinsic_instr *intr) 594{ 595 struct ir2_instr *instr; 596 ASSERTED nir_const_value *const_offset; 597 unsigned idx; 598 599 switch (intr->intrinsic) { 600 case nir_intrinsic_load_input: 601 load_input(ctx, &intr->dest, nir_intrinsic_base(intr)); 602 break; 603 case nir_intrinsic_store_output: 604 store_output(ctx, intr->src[0], output_slot(ctx, intr), 605 intr->num_components); 606 break; 607 case nir_intrinsic_load_uniform: 608 const_offset = nir_src_as_const_value(intr->src[0]); 609 assert(const_offset); /* TODO can be false in ES2? */ 610 idx = nir_intrinsic_base(intr); 611 idx += (uint32_t)const_offset[0].f32; 612 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); 613 instr->src[0] = ir2_src(idx, 0, IR2_SRC_CONST); 614 break; 615 case nir_intrinsic_discard: 616 case nir_intrinsic_discard_if: 617 instr = ir2_instr_create(ctx, IR2_ALU); 618 instr->alu.vector_opc = VECTOR_NONE; 619 if (intr->intrinsic == nir_intrinsic_discard_if) { 620 instr->alu.scalar_opc = KILLNEs; 621 instr->src[0] = make_src(ctx, intr->src[0]); 622 } else { 623 instr->alu.scalar_opc = KILLEs; 624 instr->src[0] = ir2_zero(ctx); 625 } 626 instr->alu.export = -1; 627 instr->src_count = 1; 628 ctx->so->has_kill = true; 629 break; 630 case nir_intrinsic_load_front_face: 631 /* gl_FrontFacing is in the sign of param.x 632 * rcp required because otherwise we can't differentiate -0.0 and +0.0 633 */ 634 ctx->so->need_param = true; 635 636 struct ir2_instr *tmp = instr_create_alu(ctx, nir_op_frcp, 1); 637 tmp->src[0] = ir2_src(ctx->f->inputs_count, 0, IR2_SRC_INPUT); 638 639 instr = instr_create_alu_dest(ctx, nir_op_sge, &intr->dest); 640 instr->src[0] = ir2_src(tmp->idx, 0, IR2_SRC_SSA); 641 instr->src[1] = ir2_zero(ctx); 642 break; 643 case nir_intrinsic_load_point_coord: 644 /* param.zw (note: abs might be needed like fragcoord in param.xy?) */ 645 ctx->so->need_param = true; 646 647 instr = instr_create_alu_dest(ctx, nir_op_mov, &intr->dest); 648 instr->src[0] = 649 ir2_src(ctx->f->inputs_count, IR2_SWIZZLE_ZW, IR2_SRC_INPUT); 650 break; 651 default: 652 compile_error(ctx, "unimplemented intr %d\n", intr->intrinsic); 653 break; 654 } 655} 656 657static void 658emit_tex(struct ir2_context *ctx, nir_tex_instr *tex) 659{ 660 bool is_rect = false, is_cube = false; 661 struct ir2_instr *instr; 662 nir_src *coord, *lod_bias; 663 664 coord = lod_bias = NULL; 665 666 for (unsigned i = 0; i < tex->num_srcs; i++) { 667 switch (tex->src[i].src_type) { 668 case nir_tex_src_coord: 669 coord = &tex->src[i].src; 670 break; 671 case nir_tex_src_bias: 672 case nir_tex_src_lod: 673 assert(!lod_bias); 674 lod_bias = &tex->src[i].src; 675 break; 676 default: 677 compile_error(ctx, "Unhandled NIR tex src type: %d\n", 678 tex->src[i].src_type); 679 return; 680 } 681 } 682 683 switch (tex->op) { 684 case nir_texop_tex: 685 case nir_texop_txb: 686 case nir_texop_txl: 687 break; 688 default: 689 compile_error(ctx, "unimplemented texop %d\n", tex->op); 690 return; 691 } 692 693 switch (tex->sampler_dim) { 694 case GLSL_SAMPLER_DIM_2D: 695 case GLSL_SAMPLER_DIM_EXTERNAL: 696 break; 697 case GLSL_SAMPLER_DIM_RECT: 698 is_rect = true; 699 break; 700 case GLSL_SAMPLER_DIM_CUBE: 701 is_cube = true; 702 break; 703 default: 704 compile_error(ctx, "unimplemented sampler %d\n", tex->sampler_dim); 705 return; 706 } 707 708 struct ir2_src src_coord = make_src_noconst(ctx, *coord); 709 710 /* for cube maps 711 * tmp = cube(coord) 712 * tmp.xy = tmp.xy / |tmp.z| + 1.5 713 * coord = tmp.xyw 714 */ 715 if (is_cube) { 716 struct ir2_instr *rcp, *coord_xy; 717 unsigned reg_idx; 718 719 instr = instr_create_alu_reg(ctx, ir2_op_cube, 15, NULL); 720 instr->src[0] = src_coord; 721 instr->src[0].swizzle = IR2_SWIZZLE_ZZXY; 722 instr->src[1] = src_coord; 723 instr->src[1].swizzle = IR2_SWIZZLE_YXZZ; 724 725 reg_idx = instr->reg - ctx->reg; /* hacky */ 726 727 rcp = instr_create_alu(ctx, nir_op_frcp, 1); 728 rcp->src[0] = ir2_src(reg_idx, IR2_SWIZZLE_Z, IR2_SRC_REG); 729 rcp->src[0].abs = true; 730 731 coord_xy = instr_create_alu_reg(ctx, nir_op_ffma, 3, instr); 732 coord_xy->src[0] = ir2_src(reg_idx, 0, IR2_SRC_REG); 733 coord_xy->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); 734 coord_xy->src[2] = load_const(ctx, (float[]){1.5f}, 1); 735 736 src_coord = ir2_src(reg_idx, 0, IR2_SRC_REG); 737 /* TODO: lod/bias transformed by src_coord.z ? */ 738 } 739 740 instr = ir2_instr_create_fetch(ctx, &tex->dest, TEX_FETCH); 741 instr->src[0] = src_coord; 742 instr->src[0].swizzle = is_cube ? IR2_SWIZZLE_YXW : 0; 743 instr->fetch.tex.is_cube = is_cube; 744 instr->fetch.tex.is_rect = is_rect; 745 instr->fetch.tex.samp_id = tex->sampler_index; 746 747 /* for lod/bias, we insert an extra src for the backend to deal with */ 748 if (lod_bias) { 749 instr->src[1] = make_src_noconst(ctx, *lod_bias); 750 /* backend will use 2-3 components so apply swizzle */ 751 swiz_merge_p(&instr->src[1].swizzle, IR2_SWIZZLE_XXXX); 752 instr->src_count = 2; 753 } 754} 755 756static void 757setup_input(struct ir2_context *ctx, nir_variable *in) 758{ 759 struct fd2_shader_stateobj *so = ctx->so; 760 ASSERTED unsigned array_len = MAX2(glsl_get_length(in->type), 1); 761 unsigned n = in->data.driver_location; 762 unsigned slot = in->data.location; 763 764 assert(array_len == 1); 765 766 /* handle later */ 767 if (ctx->so->type == MESA_SHADER_VERTEX) 768 return; 769 770 if (ctx->so->type != MESA_SHADER_FRAGMENT) 771 compile_error(ctx, "unknown shader type: %d\n", ctx->so->type); 772 773 n = ctx->f->inputs_count++; 774 775 /* half of fragcoord from param reg, half from a varying */ 776 if (slot == VARYING_SLOT_POS) { 777 ctx->f->fragcoord = n; 778 so->need_param = true; 779 } 780 781 ctx->f->inputs[n].slot = slot; 782 ctx->f->inputs[n].ncomp = glsl_get_components(in->type); 783 784 /* in->data.interpolation? 785 * opengl ES 2.0 can't do flat mode, but we still get it from GALLIUM_HUD 786 */ 787} 788 789static void 790emit_undef(struct ir2_context *ctx, nir_ssa_undef_instr *undef) 791{ 792 /* TODO we don't want to emit anything for undefs */ 793 794 struct ir2_instr *instr; 795 796 instr = instr_create_alu_dest( 797 ctx, nir_op_mov, &(nir_dest){.ssa = undef->def, .is_ssa = true}); 798 instr->src[0] = ir2_src(0, 0, IR2_SRC_CONST); 799} 800 801static void 802emit_instr(struct ir2_context *ctx, nir_instr *instr) 803{ 804 switch (instr->type) { 805 case nir_instr_type_alu: 806 emit_alu(ctx, nir_instr_as_alu(instr)); 807 break; 808 case nir_instr_type_deref: 809 /* ignored, handled as part of the intrinsic they are src to */ 810 break; 811 case nir_instr_type_intrinsic: 812 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); 813 break; 814 case nir_instr_type_load_const: 815 /* dealt with when using nir_src */ 816 break; 817 case nir_instr_type_tex: 818 emit_tex(ctx, nir_instr_as_tex(instr)); 819 break; 820 case nir_instr_type_jump: 821 ctx->block_has_jump[ctx->block_idx] = true; 822 break; 823 case nir_instr_type_ssa_undef: 824 emit_undef(ctx, nir_instr_as_ssa_undef(instr)); 825 break; 826 default: 827 break; 828 } 829} 830 831/* fragcoord.zw and a20x hw binning outputs */ 832static void 833extra_position_exports(struct ir2_context *ctx, bool binning) 834{ 835 struct ir2_instr *instr, *rcp, *sc, *wincoord, *off; 836 837 if (ctx->f->fragcoord < 0 && !binning) 838 return; 839 840 instr = instr_create_alu(ctx, nir_op_fmax, 1); 841 instr->src[0] = ctx->position; 842 instr->src[0].swizzle = IR2_SWIZZLE_W; 843 instr->src[1] = ir2_zero(ctx); 844 845 rcp = instr_create_alu(ctx, nir_op_frcp, 1); 846 rcp->src[0] = ir2_src(instr->idx, 0, IR2_SRC_SSA); 847 848 sc = instr_create_alu(ctx, nir_op_fmul, 4); 849 sc->src[0] = ctx->position; 850 sc->src[1] = ir2_src(rcp->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); 851 852 wincoord = instr_create_alu(ctx, nir_op_ffma, 4); 853 wincoord->src[0] = ir2_src(66, 0, IR2_SRC_CONST); 854 wincoord->src[1] = ir2_src(sc->idx, 0, IR2_SRC_SSA); 855 wincoord->src[2] = ir2_src(65, 0, IR2_SRC_CONST); 856 857 /* fragcoord z/w */ 858 if (ctx->f->fragcoord >= 0 && !binning) { 859 instr = instr_create_alu(ctx, nir_op_mov, 1); 860 instr->src[0] = ir2_src(wincoord->idx, IR2_SWIZZLE_Z, IR2_SRC_SSA); 861 instr->alu.export = ctx->f->fragcoord; 862 863 instr = instr_create_alu(ctx, nir_op_mov, 1); 864 instr->src[0] = ctx->position; 865 instr->src[0].swizzle = IR2_SWIZZLE_W; 866 instr->alu.export = ctx->f->fragcoord; 867 instr->alu.write_mask = 2; 868 } 869 870 if (!binning) 871 return; 872 873 off = instr_create_alu(ctx, nir_op_fadd, 1); 874 off->src[0] = ir2_src(64, 0, IR2_SRC_CONST); 875 off->src[1] = ir2_src(2, 0, IR2_SRC_INPUT); 876 877 /* 8 max set in freedreno_screen.. unneeded instrs patched out */ 878 for (int i = 0; i < 8; i++) { 879 instr = instr_create_alu(ctx, nir_op_ffma, 4); 880 instr->src[0] = ir2_src(1, IR2_SWIZZLE_WYWW, IR2_SRC_CONST); 881 instr->src[1] = ir2_src(off->idx, IR2_SWIZZLE_XXXX, IR2_SRC_SSA); 882 instr->src[2] = ir2_src(3 + i, 0, IR2_SRC_CONST); 883 instr->alu.export = 32; 884 885 instr = instr_create_alu(ctx, nir_op_ffma, 4); 886 instr->src[0] = ir2_src(68 + i * 2, 0, IR2_SRC_CONST); 887 instr->src[1] = ir2_src(wincoord->idx, 0, IR2_SRC_SSA); 888 instr->src[2] = ir2_src(67 + i * 2, 0, IR2_SRC_CONST); 889 instr->alu.export = 33; 890 } 891} 892 893static bool emit_cf_list(struct ir2_context *ctx, struct exec_list *list); 894 895static bool 896emit_block(struct ir2_context *ctx, nir_block *block) 897{ 898 struct ir2_instr *instr; 899 nir_block *succs = block->successors[0]; 900 901 ctx->block_idx = block->index; 902 903 nir_foreach_instr (instr, block) 904 emit_instr(ctx, instr); 905 906 if (!succs || !succs->index) 907 return false; 908 909 /* we want to be smart and always jump and have the backend cleanup 910 * but we are not, so there are two cases where jump is needed: 911 * loops (succs index lower) 912 * jumps (jump instruction seen in block) 913 */ 914 if (succs->index > block->index && !ctx->block_has_jump[block->index]) 915 return false; 916 917 assert(block->successors[1] == NULL); 918 919 instr = ir2_instr_create(ctx, IR2_CF); 920 instr->cf.block_idx = succs->index; 921 /* XXX can't jump to a block with different predicate */ 922 return true; 923} 924 925static void 926emit_if(struct ir2_context *ctx, nir_if *nif) 927{ 928 unsigned pred = ctx->pred, pred_idx = ctx->pred_idx; 929 struct ir2_instr *instr; 930 931 /* XXX: blob seems to always use same register for condition */ 932 933 instr = ir2_instr_create(ctx, IR2_ALU); 934 instr->src[0] = make_src(ctx, nif->condition); 935 instr->src_count = 1; 936 instr->ssa.ncomp = 1; 937 instr->alu.vector_opc = VECTOR_NONE; 938 instr->alu.scalar_opc = SCALAR_NONE; 939 instr->alu.export = -1; 940 instr->alu.write_mask = 1; 941 instr->pred = 0; 942 943 /* if nested, use PRED_SETNE_PUSHv */ 944 if (pred) { 945 instr->alu.vector_opc = PRED_SETNE_PUSHv; 946 instr->src[1] = instr->src[0]; 947 instr->src[0] = ir2_src(pred_idx, 0, IR2_SRC_SSA); 948 instr->src[0].swizzle = IR2_SWIZZLE_XXXX; 949 instr->src[1].swizzle = IR2_SWIZZLE_XXXX; 950 instr->src_count = 2; 951 } else { 952 instr->alu.scalar_opc = PRED_SETNEs; 953 } 954 955 ctx->pred_idx = instr->idx; 956 ctx->pred = 3; 957 958 emit_cf_list(ctx, &nif->then_list); 959 960 /* TODO: if these is no else branch we don't need this 961 * and if the else branch is simple, can just flip ctx->pred instead 962 */ 963 instr = ir2_instr_create(ctx, IR2_ALU); 964 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); 965 instr->src_count = 1; 966 instr->ssa.ncomp = 1; 967 instr->alu.vector_opc = VECTOR_NONE; 968 instr->alu.scalar_opc = PRED_SET_INVs; 969 instr->alu.export = -1; 970 instr->alu.write_mask = 1; 971 instr->pred = 0; 972 ctx->pred_idx = instr->idx; 973 974 emit_cf_list(ctx, &nif->else_list); 975 976 /* restore predicate for nested predicates */ 977 if (pred) { 978 instr = ir2_instr_create(ctx, IR2_ALU); 979 instr->src[0] = ir2_src(ctx->pred_idx, 0, IR2_SRC_SSA); 980 instr->src_count = 1; 981 instr->ssa.ncomp = 1; 982 instr->alu.vector_opc = VECTOR_NONE; 983 instr->alu.scalar_opc = PRED_SET_POPs; 984 instr->alu.export = -1; 985 instr->alu.write_mask = 1; 986 instr->pred = 0; 987 ctx->pred_idx = instr->idx; 988 } 989 990 /* restore ctx->pred */ 991 ctx->pred = pred; 992} 993 994/* get the highest block idx in the loop, so we know when 995 * we can free registers that are allocated outside the loop 996 */ 997static unsigned 998loop_last_block(struct exec_list *list) 999{ 1000 nir_cf_node *node = 1001 exec_node_data(nir_cf_node, exec_list_get_tail(list), node); 1002 switch (node->type) { 1003 case nir_cf_node_block: 1004 return nir_cf_node_as_block(node)->index; 1005 case nir_cf_node_if: 1006 assert(0); /* XXX could this ever happen? */ 1007 return 0; 1008 case nir_cf_node_loop: 1009 return loop_last_block(&nir_cf_node_as_loop(node)->body); 1010 default: 1011 compile_error(ctx, "Not supported\n"); 1012 return 0; 1013 } 1014} 1015 1016static void 1017emit_loop(struct ir2_context *ctx, nir_loop *nloop) 1018{ 1019 ctx->loop_last_block[++ctx->loop_depth] = loop_last_block(&nloop->body); 1020 emit_cf_list(ctx, &nloop->body); 1021 ctx->loop_depth--; 1022} 1023 1024static bool 1025emit_cf_list(struct ir2_context *ctx, struct exec_list *list) 1026{ 1027 bool ret = false; 1028 foreach_list_typed (nir_cf_node, node, node, list) { 1029 ret = false; 1030 switch (node->type) { 1031 case nir_cf_node_block: 1032 ret = emit_block(ctx, nir_cf_node_as_block(node)); 1033 break; 1034 case nir_cf_node_if: 1035 emit_if(ctx, nir_cf_node_as_if(node)); 1036 break; 1037 case nir_cf_node_loop: 1038 emit_loop(ctx, nir_cf_node_as_loop(node)); 1039 break; 1040 case nir_cf_node_function: 1041 compile_error(ctx, "Not supported\n"); 1042 break; 1043 } 1044 } 1045 return ret; 1046} 1047 1048static void 1049cleanup_binning(struct ir2_context *ctx) 1050{ 1051 assert(ctx->so->type == MESA_SHADER_VERTEX); 1052 1053 /* kill non-position outputs for binning variant */ 1054 nir_foreach_block (block, nir_shader_get_entrypoint(ctx->nir)) { 1055 nir_foreach_instr_safe (instr, block) { 1056 if (instr->type != nir_instr_type_intrinsic) 1057 continue; 1058 1059 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1060 if (intr->intrinsic != nir_intrinsic_store_output) 1061 continue; 1062 1063 if (output_slot(ctx, intr) != VARYING_SLOT_POS) 1064 nir_instr_remove(instr); 1065 } 1066 } 1067 1068 ir2_optimize_nir(ctx->nir, false); 1069} 1070 1071static bool 1072ir2_alu_to_scalar_filter_cb(const nir_instr *instr, const void *data) 1073{ 1074 if (instr->type != nir_instr_type_alu) 1075 return false; 1076 1077 nir_alu_instr *alu = nir_instr_as_alu(instr); 1078 switch (alu->op) { 1079 case nir_op_frsq: 1080 case nir_op_frcp: 1081 case nir_op_flog2: 1082 case nir_op_fexp2: 1083 case nir_op_fsqrt: 1084 case nir_op_fcos: 1085 case nir_op_fsin: 1086 return true; 1087 default: 1088 break; 1089 } 1090 1091 return false; 1092} 1093 1094void 1095ir2_nir_compile(struct ir2_context *ctx, bool binning) 1096{ 1097 struct fd2_shader_stateobj *so = ctx->so; 1098 1099 memset(ctx->ssa_map, 0xff, sizeof(ctx->ssa_map)); 1100 1101 ctx->nir = nir_shader_clone(NULL, so->nir); 1102 1103 if (binning) 1104 cleanup_binning(ctx); 1105 1106 OPT_V(ctx->nir, nir_copy_prop); 1107 OPT_V(ctx->nir, nir_opt_dce); 1108 OPT_V(ctx->nir, nir_opt_move, nir_move_comparisons); 1109 1110 OPT_V(ctx->nir, nir_lower_int_to_float); 1111 OPT_V(ctx->nir, nir_lower_bool_to_float); 1112 while (OPT(ctx->nir, nir_opt_algebraic)) 1113 ; 1114 OPT_V(ctx->nir, nir_opt_algebraic_late); 1115 OPT_V(ctx->nir, nir_lower_to_source_mods, nir_lower_all_source_mods); 1116 1117 OPT_V(ctx->nir, nir_lower_alu_to_scalar, ir2_alu_to_scalar_filter_cb, NULL); 1118 1119 OPT_V(ctx->nir, nir_lower_locals_to_regs); 1120 1121 OPT_V(ctx->nir, nir_convert_from_ssa, true); 1122 1123 OPT_V(ctx->nir, nir_move_vec_src_uses_to_dest); 1124 OPT_V(ctx->nir, nir_lower_vec_to_movs, NULL, NULL); 1125 1126 OPT_V(ctx->nir, nir_opt_dce); 1127 1128 nir_sweep(ctx->nir); 1129 1130 if (FD_DBG(DISASM)) { 1131 debug_printf("----------------------\n"); 1132 nir_print_shader(ctx->nir, stdout); 1133 debug_printf("----------------------\n"); 1134 } 1135 1136 /* fd2_shader_stateobj init */ 1137 if (so->type == MESA_SHADER_FRAGMENT) { 1138 ctx->f->fragcoord = -1; 1139 ctx->f->inputs_count = 0; 1140 memset(ctx->f->inputs, 0, sizeof(ctx->f->inputs)); 1141 } 1142 1143 /* Setup inputs: */ 1144 nir_foreach_shader_in_variable (in, ctx->nir) 1145 setup_input(ctx, in); 1146 1147 if (so->type == MESA_SHADER_FRAGMENT) { 1148 unsigned idx; 1149 for (idx = 0; idx < ctx->f->inputs_count; idx++) { 1150 ctx->input[idx].ncomp = ctx->f->inputs[idx].ncomp; 1151 update_range(ctx, &ctx->input[idx]); 1152 } 1153 /* assume we have param input and kill it later if not */ 1154 ctx->input[idx].ncomp = 4; 1155 update_range(ctx, &ctx->input[idx]); 1156 } else { 1157 ctx->input[0].ncomp = 1; 1158 ctx->input[2].ncomp = 1; 1159 update_range(ctx, &ctx->input[0]); 1160 update_range(ctx, &ctx->input[2]); 1161 } 1162 1163 /* And emit the body: */ 1164 nir_function_impl *fxn = nir_shader_get_entrypoint(ctx->nir); 1165 1166 nir_foreach_register (reg, &fxn->registers) { 1167 ctx->reg[reg->index].ncomp = reg->num_components; 1168 ctx->reg_count = MAX2(ctx->reg_count, reg->index + 1); 1169 } 1170 1171 nir_metadata_require(fxn, nir_metadata_block_index); 1172 emit_cf_list(ctx, &fxn->body); 1173 /* TODO emit_block(ctx, fxn->end_block); */ 1174 1175 if (so->type == MESA_SHADER_VERTEX) 1176 extra_position_exports(ctx, binning); 1177 1178 ralloc_free(ctx->nir); 1179 1180 /* kill unused param input */ 1181 if (so->type == MESA_SHADER_FRAGMENT && !so->need_param) 1182 ctx->input[ctx->f->inputs_count].initialized = false; 1183} 1184