midgard_compile.c revision 7ec681f3
/*
 * Copyright (C) 2018-2019 Alyssa Rosenzweig <alyssa@rosenzweig.io>
 * Copyright (C) 2019-2020 Collabora, Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <err.h>

#include "main/mtypes.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/half_float.h"
#include "util/u_math.h"
#include "util/u_debug.h"
#include "util/u_dynarray.h"
#include "util/list.h"

#include "midgard.h"
#include "midgard_nir.h"
#include "midgard_compile.h"
#include "midgard_ops.h"
#include "helpers.h"
#include "compiler.h"
#include "midgard_quirks.h"
#include "panfrost-quirks.h"
#include "panfrost/util/pan_lower_framebuffer.h"

#include "disassemble.h"

static const struct debug_named_value midgard_debug_options[] = {
        {"msgs",     MIDGARD_DBG_MSGS,     "Print debug messages"},
        {"shaders",  MIDGARD_DBG_SHADERS,  "Dump shaders in NIR and MIR"},
        {"shaderdb", MIDGARD_DBG_SHADERDB, "Prints shader-db statistics"},
        {"inorder",  MIDGARD_DBG_INORDER,  "Disables out-of-order scheduling"},
        {"verbose",  MIDGARD_DBG_VERBOSE,  "Dump shaders verbosely"},
        {"internal", MIDGARD_DBG_INTERNAL, "Dump internal shaders"},
        DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(midgard_debug, "MIDGARD_MESA_DEBUG", midgard_debug_options, 0)

int midgard_debug = 0;

#define DBG(fmt, ...)
\ 72 do { if (midgard_debug & MIDGARD_DBG_MSGS) \ 73 fprintf(stderr, "%s:%d: "fmt, \ 74 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 75static midgard_block * 76create_empty_block(compiler_context *ctx) 77{ 78 midgard_block *blk = rzalloc(ctx, midgard_block); 79 80 blk->base.predecessors = _mesa_set_create(blk, 81 _mesa_hash_pointer, 82 _mesa_key_pointer_equal); 83 84 blk->base.name = ctx->block_source_count++; 85 86 return blk; 87} 88 89static void 90schedule_barrier(compiler_context *ctx) 91{ 92 midgard_block *temp = ctx->after_block; 93 ctx->after_block = create_empty_block(ctx); 94 ctx->block_count++; 95 list_addtail(&ctx->after_block->base.link, &ctx->blocks); 96 list_inithead(&ctx->after_block->base.instructions); 97 pan_block_add_successor(&ctx->current_block->base, &ctx->after_block->base); 98 ctx->current_block = ctx->after_block; 99 ctx->after_block = temp; 100} 101 102/* Helpers to generate midgard_instruction's using macro magic, since every 103 * driver seems to do it that way */ 104 105#define EMIT(op, ...) emit_mir_instruction(ctx, v_##op(__VA_ARGS__)); 106 107#define M_LOAD_STORE(name, store, T) \ 108 static midgard_instruction m_##name(unsigned ssa, unsigned address) { \ 109 midgard_instruction i = { \ 110 .type = TAG_LOAD_STORE_4, \ 111 .mask = 0xF, \ 112 .dest = ~0, \ 113 .src = { ~0, ~0, ~0, ~0 }, \ 114 .swizzle = SWIZZLE_IDENTITY_4, \ 115 .op = midgard_op_##name, \ 116 .load_store = { \ 117 .signed_offset = address \ 118 } \ 119 }; \ 120 \ 121 if (store) { \ 122 i.src[0] = ssa; \ 123 i.src_types[0] = T; \ 124 i.dest_type = T; \ 125 } else { \ 126 i.dest = ssa; \ 127 i.dest_type = T; \ 128 } \ 129 return i; \ 130 } 131 132#define M_LOAD(name, T) M_LOAD_STORE(name, false, T) 133#define M_STORE(name, T) M_LOAD_STORE(name, true, T) 134 135M_LOAD(ld_attr_32, nir_type_uint32); 136M_LOAD(ld_vary_32, nir_type_uint32); 137M_LOAD(ld_ubo_32, nir_type_uint32); 138M_LOAD(ld_ubo_64, nir_type_uint32); 139M_LOAD(ld_ubo_128, nir_type_uint32); 140M_LOAD(ld_32, nir_type_uint32); 141M_LOAD(ld_64, nir_type_uint32); 142M_LOAD(ld_128, nir_type_uint32); 143M_STORE(st_32, nir_type_uint32); 144M_STORE(st_64, nir_type_uint32); 145M_STORE(st_128, nir_type_uint32); 146M_LOAD(ld_tilebuffer_raw, nir_type_uint32); 147M_LOAD(ld_tilebuffer_16f, nir_type_float16); 148M_LOAD(ld_tilebuffer_32f, nir_type_float32); 149M_STORE(st_vary_32, nir_type_uint32); 150M_LOAD(ld_cubemap_coords, nir_type_uint32); 151M_LOAD(ldst_mov, nir_type_uint32); 152M_LOAD(ld_image_32f, nir_type_float32); 153M_LOAD(ld_image_16f, nir_type_float16); 154M_LOAD(ld_image_32u, nir_type_uint32); 155M_LOAD(ld_image_32i, nir_type_int32); 156M_STORE(st_image_32f, nir_type_float32); 157M_STORE(st_image_16f, nir_type_float16); 158M_STORE(st_image_32u, nir_type_uint32); 159M_STORE(st_image_32i, nir_type_int32); 160M_LOAD(lea_image, nir_type_uint64); 161 162#define M_IMAGE(op) \ 163static midgard_instruction \ 164op ## _image(nir_alu_type type, unsigned val, unsigned address) \ 165{ \ 166 switch (type) { \ 167 case nir_type_float32: \ 168 return m_ ## op ## _image_32f(val, address); \ 169 case nir_type_float16: \ 170 return m_ ## op ## _image_16f(val, address); \ 171 case nir_type_uint32: \ 172 return m_ ## op ## _image_32u(val, address); \ 173 case nir_type_int32: \ 174 return m_ ## op ## _image_32i(val, address); \ 175 default: \ 176 unreachable("Invalid image type"); \ 177 } \ 178} 179 180M_IMAGE(ld); 181M_IMAGE(st); 182 183static midgard_instruction 184v_branch(bool conditional, bool invert) 185{ 186 midgard_instruction ins = { 187 
.type = TAG_ALU_4, 188 .unit = ALU_ENAB_BRANCH, 189 .compact_branch = true, 190 .branch = { 191 .conditional = conditional, 192 .invert_conditional = invert 193 }, 194 .dest = ~0, 195 .src = { ~0, ~0, ~0, ~0 }, 196 }; 197 198 return ins; 199} 200 201static void 202attach_constants(compiler_context *ctx, midgard_instruction *ins, void *constants, int name) 203{ 204 ins->has_constants = true; 205 memcpy(&ins->constants, constants, 16); 206} 207 208static int 209glsl_type_size(const struct glsl_type *type, bool bindless) 210{ 211 return glsl_count_attribute_slots(type, false); 212} 213 214/* Lower fdot2 to a vector multiplication followed by channel addition */ 215static bool 216midgard_nir_lower_fdot2_instr(nir_builder *b, nir_instr *instr, void *data) 217{ 218 if (instr->type != nir_instr_type_alu) 219 return false; 220 221 nir_alu_instr *alu = nir_instr_as_alu(instr); 222 if (alu->op != nir_op_fdot2) 223 return false; 224 225 b->cursor = nir_before_instr(&alu->instr); 226 227 nir_ssa_def *src0 = nir_ssa_for_alu_src(b, alu, 0); 228 nir_ssa_def *src1 = nir_ssa_for_alu_src(b, alu, 1); 229 230 nir_ssa_def *product = nir_fmul(b, src0, src1); 231 232 nir_ssa_def *sum = nir_fadd(b, 233 nir_channel(b, product, 0), 234 nir_channel(b, product, 1)); 235 236 /* Replace the fdot2 with this sum */ 237 nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa, sum); 238 239 return true; 240} 241 242static bool 243midgard_nir_lower_fdot2(nir_shader *shader) 244{ 245 return nir_shader_instructions_pass(shader, 246 midgard_nir_lower_fdot2_instr, 247 nir_metadata_block_index | nir_metadata_dominance, 248 NULL); 249} 250 251static bool 252mdg_is_64(const nir_instr *instr, const void *_unused) 253{ 254 const nir_alu_instr *alu = nir_instr_as_alu(instr); 255 256 if (nir_dest_bit_size(alu->dest.dest) == 64) 257 return true; 258 259 switch (alu->op) { 260 case nir_op_umul_high: 261 case nir_op_imul_high: 262 return true; 263 default: 264 return false; 265 } 266} 267 268/* Only vectorize int64 up to vec2 */ 269static bool 270midgard_vectorize_filter(const nir_instr *instr, void *data) 271{ 272 if (instr->type != nir_instr_type_alu) 273 return true; 274 275 const nir_alu_instr *alu = nir_instr_as_alu(instr); 276 277 unsigned num_components = alu->dest.dest.ssa.num_components; 278 279 int src_bit_size = nir_src_bit_size(alu->src[0].src); 280 int dst_bit_size = nir_dest_bit_size(alu->dest.dest); 281 282 if (src_bit_size == 64 || dst_bit_size == 64) { 283 if (num_components > 1) 284 return false; 285 } 286 287 return true; 288} 289 290 291/* Flushes undefined values to zero */ 292 293static void 294optimise_nir(nir_shader *nir, unsigned quirks, bool is_blend) 295{ 296 bool progress; 297 unsigned lower_flrp = 298 (nir->options->lower_flrp16 ? 16 : 0) | 299 (nir->options->lower_flrp32 ? 32 : 0) | 300 (nir->options->lower_flrp64 ? 64 : 0); 301 302 NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 303 nir_lower_idiv_options idiv_options = { 304 .imprecise_32bit_lowering = true, 305 .allow_fp16 = true, 306 }; 307 NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); 308 309 nir_lower_tex_options lower_tex_options = { 310 .lower_txs_lod = true, 311 .lower_txp = ~0, 312 .lower_tg4_broadcom_swizzle = true, 313 /* TODO: we have native gradient.. */ 314 .lower_txd = true, 315 }; 316 317 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 318 319 /* Must lower fdot2 after tex is lowered */ 320 NIR_PASS(progress, nir, midgard_nir_lower_fdot2); 321 322 /* T720 is broken. 
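         * When the MIDGARD_BROKEN_LOD quirk is set, the midgard_nir_lod_errata
         * pass just below is run to work around the broken texture LOD handling.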
*/ 323 324 if (quirks & MIDGARD_BROKEN_LOD) 325 NIR_PASS_V(nir, midgard_nir_lod_errata); 326 327 /* Midgard image ops coordinates are 16-bit instead of 32-bit */ 328 NIR_PASS(progress, nir, midgard_nir_lower_image_bitsize); 329 NIR_PASS(progress, nir, midgard_nir_lower_helper_writes); 330 NIR_PASS(progress, nir, pan_lower_helper_invocation); 331 NIR_PASS(progress, nir, pan_lower_sample_pos); 332 333 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_early); 334 335 do { 336 progress = false; 337 338 NIR_PASS(progress, nir, nir_lower_var_copies); 339 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 340 341 NIR_PASS(progress, nir, nir_copy_prop); 342 NIR_PASS(progress, nir, nir_opt_remove_phis); 343 NIR_PASS(progress, nir, nir_opt_dce); 344 NIR_PASS(progress, nir, nir_opt_dead_cf); 345 NIR_PASS(progress, nir, nir_opt_cse); 346 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 347 NIR_PASS(progress, nir, nir_opt_algebraic); 348 NIR_PASS(progress, nir, nir_opt_constant_folding); 349 350 if (lower_flrp != 0) { 351 bool lower_flrp_progress = false; 352 NIR_PASS(lower_flrp_progress, 353 nir, 354 nir_lower_flrp, 355 lower_flrp, 356 false /* always_precise */); 357 if (lower_flrp_progress) { 358 NIR_PASS(progress, nir, 359 nir_opt_constant_folding); 360 progress = true; 361 } 362 363 /* Nothing should rematerialize any flrps, so we only 364 * need to do this lowering once. 365 */ 366 lower_flrp = 0; 367 } 368 369 NIR_PASS(progress, nir, nir_opt_undef); 370 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 371 372 NIR_PASS(progress, nir, nir_opt_loop_unroll); 373 374 NIR_PASS(progress, nir, nir_opt_vectorize, 375 midgard_vectorize_filter, NULL); 376 } while (progress); 377 378 NIR_PASS_V(nir, nir_lower_alu_to_scalar, mdg_is_64, NULL); 379 380 /* Run after opts so it can hit more */ 381 if (!is_blend) 382 NIR_PASS(progress, nir, nir_fuse_io_16); 383 384 /* Must be run at the end to prevent creation of fsin/fcos ops */ 385 NIR_PASS(progress, nir, midgard_nir_scale_trig); 386 387 do { 388 progress = false; 389 390 NIR_PASS(progress, nir, nir_opt_dce); 391 NIR_PASS(progress, nir, nir_opt_algebraic); 392 NIR_PASS(progress, nir, nir_opt_constant_folding); 393 NIR_PASS(progress, nir, nir_copy_prop); 394 } while (progress); 395 396 NIR_PASS(progress, nir, nir_opt_algebraic_late); 397 NIR_PASS(progress, nir, nir_opt_algebraic_distribute_src_mods); 398 399 /* We implement booleans as 32-bit 0/~0 */ 400 NIR_PASS(progress, nir, nir_lower_bool_to_int32); 401 402 /* Now that booleans are lowered, we can run out late opts */ 403 NIR_PASS(progress, nir, midgard_nir_lower_algebraic_late); 404 NIR_PASS(progress, nir, midgard_nir_cancel_inot); 405 406 NIR_PASS(progress, nir, nir_copy_prop); 407 NIR_PASS(progress, nir, nir_opt_dce); 408 409 /* Backend scheduler is purely local, so do some global optimizations 410 * to reduce register pressure. 
*/ 411 nir_move_options move_all = 412 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 413 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 414 415 NIR_PASS_V(nir, nir_opt_sink, move_all); 416 NIR_PASS_V(nir, nir_opt_move, move_all); 417 418 /* Take us out of SSA */ 419 NIR_PASS(progress, nir, nir_lower_locals_to_regs); 420 NIR_PASS(progress, nir, nir_convert_from_ssa, true); 421 422 /* We are a vector architecture; write combine where possible */ 423 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); 424 NIR_PASS(progress, nir, nir_lower_vec_to_movs, NULL, NULL); 425 426 NIR_PASS(progress, nir, nir_opt_dce); 427} 428 429/* Do not actually emit a load; instead, cache the constant for inlining */ 430 431static void 432emit_load_const(compiler_context *ctx, nir_load_const_instr *instr) 433{ 434 nir_ssa_def def = instr->def; 435 436 midgard_constants *consts = rzalloc(ctx, midgard_constants); 437 438 assert(instr->def.num_components * instr->def.bit_size <= sizeof(*consts) * 8); 439 440#define RAW_CONST_COPY(bits) \ 441 nir_const_value_to_array(consts->u##bits, instr->value, \ 442 instr->def.num_components, u##bits) 443 444 switch (instr->def.bit_size) { 445 case 64: 446 RAW_CONST_COPY(64); 447 break; 448 case 32: 449 RAW_CONST_COPY(32); 450 break; 451 case 16: 452 RAW_CONST_COPY(16); 453 break; 454 case 8: 455 RAW_CONST_COPY(8); 456 break; 457 default: 458 unreachable("Invalid bit_size for load_const instruction\n"); 459 } 460 461 /* Shifted for SSA, +1 for off-by-one */ 462 _mesa_hash_table_u64_insert(ctx->ssa_constants, (def.index << 1) + 1, consts); 463} 464 465/* Normally constants are embedded implicitly, but for I/O and such we have to 466 * explicitly emit a move with the constant source */ 467 468static void 469emit_explicit_constant(compiler_context *ctx, unsigned node, unsigned to) 470{ 471 void *constant_value = _mesa_hash_table_u64_search(ctx->ssa_constants, node + 1); 472 473 if (constant_value) { 474 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), to); 475 attach_constants(ctx, &ins, constant_value, node + 1); 476 emit_mir_instruction(ctx, ins); 477 } 478} 479 480static bool 481nir_is_non_scalar_swizzle(nir_alu_src *src, unsigned nr_components) 482{ 483 unsigned comp = src->swizzle[0]; 484 485 for (unsigned c = 1; c < nr_components; ++c) { 486 if (src->swizzle[c] != comp) 487 return true; 488 } 489 490 return false; 491} 492 493#define ATOMIC_CASE_IMPL(ctx, instr, nir, op, is_shared) \ 494 case nir_intrinsic_##nir: \ 495 emit_atomic(ctx, instr, is_shared, midgard_op_##op, ~0); \ 496 break; 497 498#define ATOMIC_CASE(ctx, instr, nir, op) \ 499 ATOMIC_CASE_IMPL(ctx, instr, shared_atomic_##nir, atomic_##op, true); \ 500 ATOMIC_CASE_IMPL(ctx, instr, global_atomic_##nir, atomic_##op, false); 501 502#define IMAGE_ATOMIC_CASE(ctx, instr, nir, op) \ 503 case nir_intrinsic_image_atomic_##nir: { \ 504 midgard_instruction ins = emit_image_op(ctx, instr, true); \ 505 emit_atomic(ctx, instr, false, midgard_op_atomic_##op, ins.dest); \ 506 break; \ 507 } 508 509#define ALU_CASE(nir, _op) \ 510 case nir_op_##nir: \ 511 op = midgard_alu_op_##_op; \ 512 assert(src_bitsize == dst_bitsize); \ 513 break; 514 515#define ALU_CASE_RTZ(nir, _op) \ 516 case nir_op_##nir: \ 517 op = midgard_alu_op_##_op; \ 518 roundmode = MIDGARD_RTZ; \ 519 break; 520 521#define ALU_CHECK_CMP() \ 522 assert(src_bitsize == 16 || src_bitsize == 32 || src_bitsize == 64); \ 523 assert(dst_bitsize == 16 || dst_bitsize == 32); \ 524 525#define ALU_CASE_BCAST(nir, _op, 
count) \
        case nir_op_##nir: \
                op = midgard_alu_op_##_op; \
                broadcast_swizzle = count; \
                ALU_CHECK_CMP(); \
                break;

#define ALU_CASE_CMP(nir, _op) \
        case nir_op_##nir: \
                op = midgard_alu_op_##_op; \
                ALU_CHECK_CMP(); \
                break;

/* Compare mir_lower_invert */
static bool
nir_accepts_inot(nir_op op, unsigned src)
{
        switch (op) {
        case nir_op_ior:
        case nir_op_iand: /* TODO: b2f16 */
        case nir_op_ixor:
                return true;
        case nir_op_b32csel:
                /* Only the condition */
                return (src == 0);
        default:
                return false;
        }
}

static bool
mir_accept_dest_mod(compiler_context *ctx, nir_dest **dest, nir_op op)
{
        if (pan_has_dest_mod(dest, op)) {
                assert((*dest)->is_ssa);
                BITSET_SET(ctx->already_emitted, (*dest)->ssa.index);
                return true;
        }

        return false;
}

/* Look for floating point mods. We have the mods clamp_m1_1, clamp_0_1,
 * and clamp_0_inf. We also have the relations (note 3 * 2 = 6 cases):
 *
 * clamp_0_1(clamp_0_inf(x))  = clamp_0_1(x)
 * clamp_0_1(clamp_m1_1(x))   = clamp_0_1(x)
 * clamp_0_inf(clamp_0_1(x))  = clamp_0_1(x)
 * clamp_0_inf(clamp_m1_1(x)) = clamp_0_1(x)
 * clamp_m1_1(clamp_0_1(x))   = clamp_0_1(x)
 * clamp_m1_1(clamp_0_inf(x)) = clamp_0_1(x)
 *
 * So by cases any composition of output modifiers is equivalent to
 * clamp_0_1 alone.
 */
static unsigned
mir_determine_float_outmod(compiler_context *ctx, nir_dest **dest, unsigned prior_outmod)
{
        bool clamp_0_inf = mir_accept_dest_mod(ctx, dest, nir_op_fclamp_pos_mali);
        bool clamp_0_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat);
        bool clamp_m1_1 = mir_accept_dest_mod(ctx, dest, nir_op_fsat_signed_mali);
        bool prior = (prior_outmod != midgard_outmod_none);
        int count = (int) prior + (int) clamp_0_inf + (int) clamp_0_1 + (int) clamp_m1_1;

        return ((count > 1) || clamp_0_1) ? midgard_outmod_clamp_0_1 :
               clamp_0_inf ? midgard_outmod_clamp_0_inf :
               clamp_m1_1 ? midgard_outmod_clamp_m1_1 :
               prior_outmod;
}

static void
mir_copy_src(midgard_instruction *ins, nir_alu_instr *instr, unsigned i, unsigned to, bool *abs, bool *neg, bool *not, enum midgard_roundmode *roundmode, bool is_int, unsigned bcast_count)
{
        nir_alu_src src = instr->src[i];

        if (!is_int) {
                if (pan_has_source_mod(&src, nir_op_fneg))
                        *neg = !(*neg);

                if (pan_has_source_mod(&src, nir_op_fabs))
                        *abs = true;
        }

        if (nir_accepts_inot(instr->op, i) && pan_has_source_mod(&src, nir_op_inot))
                *not = true;

        if (roundmode) {
                if (pan_has_source_mod(&src, nir_op_fround_even))
                        *roundmode = MIDGARD_RTE;

                if (pan_has_source_mod(&src, nir_op_ftrunc))
                        *roundmode = MIDGARD_RTZ;

                if (pan_has_source_mod(&src, nir_op_ffloor))
                        *roundmode = MIDGARD_RTN;

                if (pan_has_source_mod(&src, nir_op_fceil))
                        *roundmode = MIDGARD_RTP;
        }

        unsigned bits = nir_src_bit_size(src.src);

        ins->src[to] = nir_src_index(NULL, &src.src);
        ins->src_types[to] = nir_op_infos[instr->op].input_types[i] | bits;

        for (unsigned c = 0; c < NIR_MAX_VEC_COMPONENTS; ++c) {
                ins->swizzle[to][c] = src.swizzle[
                        (!bcast_count || c < bcast_count) ? c :
                        (bcast_count - 1)];
        }
}

/* Midgard features both fcsel and icsel, depending on whether you want int or
 * float modifiers.
 * NIR's csel is typeless, so we want a heuristic to guess if
 * we should emit an int or float csel depending on what modifiers could be
 * placed. In the absence of modifiers, this is probably arbitrary. */

static bool
mir_is_bcsel_float(nir_alu_instr *instr)
{
        nir_op intmods[] = {
                nir_op_i2i8, nir_op_i2i16,
                nir_op_i2i32, nir_op_i2i64
        };

        nir_op floatmods[] = {
                nir_op_fabs, nir_op_fneg,
                nir_op_f2f16, nir_op_f2f32,
                nir_op_f2f64
        };

        nir_op floatdestmods[] = {
                nir_op_fsat, nir_op_fsat_signed_mali, nir_op_fclamp_pos_mali,
                nir_op_f2f16, nir_op_f2f32
        };

        signed score = 0;

        for (unsigned i = 1; i < 3; ++i) {
                nir_alu_src s = instr->src[i];
                for (unsigned q = 0; q < ARRAY_SIZE(intmods); ++q) {
                        if (pan_has_source_mod(&s, intmods[q]))
                                score--;
                }
        }

        for (unsigned i = 1; i < 3; ++i) {
                nir_alu_src s = instr->src[i];
                for (unsigned q = 0; q < ARRAY_SIZE(floatmods); ++q) {
                        if (pan_has_source_mod(&s, floatmods[q]))
                                score++;
                }
        }

        for (unsigned q = 0; q < ARRAY_SIZE(floatdestmods); ++q) {
                nir_dest *dest = &instr->dest.dest;
                if (pan_has_dest_mod(&dest, floatdestmods[q]))
                        score++;
        }

        return (score > 0);
}

static void
emit_alu(compiler_context *ctx, nir_alu_instr *instr)
{
        nir_dest *dest = &instr->dest.dest;

        if (dest->is_ssa && BITSET_TEST(ctx->already_emitted, dest->ssa.index))
                return;

        /* Derivatives end up emitted on the texture pipe, not the ALUs. This
         * is handled elsewhere */

        if (instr->op == nir_op_fddx || instr->op == nir_op_fddy) {
                midgard_emit_derivatives(ctx, instr);
                return;
        }

        bool is_ssa = dest->is_ssa;

        unsigned nr_components = nir_dest_num_components(*dest);
        unsigned nr_inputs = nir_op_infos[instr->op].num_inputs;
        unsigned op = 0;

        /* Number of components valid to check for the instruction (the rest
         * will be forced to the last), or 0 to use as-is. Relevant as
         * ball-type instructions have a channel count in NIR but are all vec4
         * in Midgard */

        unsigned broadcast_swizzle = 0;

        /* Should we swap arguments?
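         * (For example, fge/fge32/ige32/uge32 are lowered below to fle/ile/ule
         * with flip_src12 set, so the two sources are swapped when copied into
         * the MIR instruction.)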
*/ 718 bool flip_src12 = false; 719 720 ASSERTED unsigned src_bitsize = nir_src_bit_size(instr->src[0].src); 721 ASSERTED unsigned dst_bitsize = nir_dest_bit_size(*dest); 722 723 enum midgard_roundmode roundmode = MIDGARD_RTE; 724 725 switch (instr->op) { 726 ALU_CASE(fadd, fadd); 727 ALU_CASE(fmul, fmul); 728 ALU_CASE(fmin, fmin); 729 ALU_CASE(fmax, fmax); 730 ALU_CASE(imin, imin); 731 ALU_CASE(imax, imax); 732 ALU_CASE(umin, umin); 733 ALU_CASE(umax, umax); 734 ALU_CASE(ffloor, ffloor); 735 ALU_CASE(fround_even, froundeven); 736 ALU_CASE(ftrunc, ftrunc); 737 ALU_CASE(fceil, fceil); 738 ALU_CASE(fdot3, fdot3); 739 ALU_CASE(fdot4, fdot4); 740 ALU_CASE(iadd, iadd); 741 ALU_CASE(isub, isub); 742 ALU_CASE(iadd_sat, iaddsat); 743 ALU_CASE(isub_sat, isubsat); 744 ALU_CASE(uadd_sat, uaddsat); 745 ALU_CASE(usub_sat, usubsat); 746 ALU_CASE(imul, imul); 747 ALU_CASE(imul_high, imul); 748 ALU_CASE(umul_high, imul); 749 ALU_CASE(uclz, iclz); 750 751 /* Zero shoved as second-arg */ 752 ALU_CASE(iabs, iabsdiff); 753 754 ALU_CASE(uabs_isub, iabsdiff); 755 ALU_CASE(uabs_usub, uabsdiff); 756 757 ALU_CASE(mov, imov); 758 759 ALU_CASE_CMP(feq32, feq); 760 ALU_CASE_CMP(fneu32, fne); 761 ALU_CASE_CMP(flt32, flt); 762 ALU_CASE_CMP(ieq32, ieq); 763 ALU_CASE_CMP(ine32, ine); 764 ALU_CASE_CMP(ilt32, ilt); 765 ALU_CASE_CMP(ult32, ult); 766 767 /* We don't have a native b2f32 instruction. Instead, like many 768 * GPUs, we exploit booleans as 0/~0 for false/true, and 769 * correspondingly AND 770 * by 1.0 to do the type conversion. For the moment, prime us 771 * to emit: 772 * 773 * iand [whatever], #0 774 * 775 * At the end of emit_alu (as MIR), we'll fix-up the constant 776 */ 777 778 ALU_CASE_CMP(b2f32, iand); 779 ALU_CASE_CMP(b2f16, iand); 780 ALU_CASE_CMP(b2i32, iand); 781 782 /* Likewise, we don't have a dedicated f2b32 instruction, but 783 * we can do a "not equal to 0.0" test. 
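         * (f2b32 therefore maps to fne and i2b32 to ine; the zero to compare
         * against is the constant 0 plonked into the second source later in
         * emit_alu.)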
*/ 784 785 ALU_CASE_CMP(f2b32, fne); 786 ALU_CASE_CMP(i2b32, ine); 787 788 ALU_CASE(frcp, frcp); 789 ALU_CASE(frsq, frsqrt); 790 ALU_CASE(fsqrt, fsqrt); 791 ALU_CASE(fexp2, fexp2); 792 ALU_CASE(flog2, flog2); 793 794 ALU_CASE_RTZ(f2i64, f2i_rte); 795 ALU_CASE_RTZ(f2u64, f2u_rte); 796 ALU_CASE_RTZ(i2f64, i2f_rte); 797 ALU_CASE_RTZ(u2f64, u2f_rte); 798 799 ALU_CASE_RTZ(f2i32, f2i_rte); 800 ALU_CASE_RTZ(f2u32, f2u_rte); 801 ALU_CASE_RTZ(i2f32, i2f_rte); 802 ALU_CASE_RTZ(u2f32, u2f_rte); 803 804 ALU_CASE_RTZ(f2i8, f2i_rte); 805 ALU_CASE_RTZ(f2u8, f2u_rte); 806 807 ALU_CASE_RTZ(f2i16, f2i_rte); 808 ALU_CASE_RTZ(f2u16, f2u_rte); 809 ALU_CASE_RTZ(i2f16, i2f_rte); 810 ALU_CASE_RTZ(u2f16, u2f_rte); 811 812 ALU_CASE(fsin, fsinpi); 813 ALU_CASE(fcos, fcospi); 814 815 /* We'll get 0 in the second arg, so: 816 * ~a = ~(a | 0) = nor(a, 0) */ 817 ALU_CASE(inot, inor); 818 ALU_CASE(iand, iand); 819 ALU_CASE(ior, ior); 820 ALU_CASE(ixor, ixor); 821 ALU_CASE(ishl, ishl); 822 ALU_CASE(ishr, iasr); 823 ALU_CASE(ushr, ilsr); 824 825 ALU_CASE_BCAST(b32all_fequal2, fball_eq, 2); 826 ALU_CASE_BCAST(b32all_fequal3, fball_eq, 3); 827 ALU_CASE_CMP(b32all_fequal4, fball_eq); 828 829 ALU_CASE_BCAST(b32any_fnequal2, fbany_neq, 2); 830 ALU_CASE_BCAST(b32any_fnequal3, fbany_neq, 3); 831 ALU_CASE_CMP(b32any_fnequal4, fbany_neq); 832 833 ALU_CASE_BCAST(b32all_iequal2, iball_eq, 2); 834 ALU_CASE_BCAST(b32all_iequal3, iball_eq, 3); 835 ALU_CASE_CMP(b32all_iequal4, iball_eq); 836 837 ALU_CASE_BCAST(b32any_inequal2, ibany_neq, 2); 838 ALU_CASE_BCAST(b32any_inequal3, ibany_neq, 3); 839 ALU_CASE_CMP(b32any_inequal4, ibany_neq); 840 841 /* Source mods will be shoved in later */ 842 ALU_CASE(fabs, fmov); 843 ALU_CASE(fneg, fmov); 844 ALU_CASE(fsat, fmov); 845 ALU_CASE(fsat_signed_mali, fmov); 846 ALU_CASE(fclamp_pos_mali, fmov); 847 848 /* For size conversion, we use a move. Ideally though we would squash 849 * these ops together; maybe that has to happen after in NIR as part of 850 * propagation...? An earlier algebraic pass ensured we step down by 851 * only / exactly one size. If stepping down, we use a dest override to 852 * reduce the size; if stepping up, we use a larger-sized move with a 853 * half source and a sign/zero-extension modifier */ 854 855 case nir_op_i2i8: 856 case nir_op_i2i16: 857 case nir_op_i2i32: 858 case nir_op_i2i64: 859 case nir_op_u2u8: 860 case nir_op_u2u16: 861 case nir_op_u2u32: 862 case nir_op_u2u64: 863 case nir_op_f2f16: 864 case nir_op_f2f32: 865 case nir_op_f2f64: { 866 if (instr->op == nir_op_f2f16 || instr->op == nir_op_f2f32 || 867 instr->op == nir_op_f2f64) 868 op = midgard_alu_op_fmov; 869 else 870 op = midgard_alu_op_imov; 871 872 break; 873 } 874 875 /* For greater-or-equal, we lower to less-or-equal and flip the 876 * arguments */ 877 878 case nir_op_fge: 879 case nir_op_fge32: 880 case nir_op_ige32: 881 case nir_op_uge32: { 882 op = 883 instr->op == nir_op_fge ? midgard_alu_op_fle : 884 instr->op == nir_op_fge32 ? midgard_alu_op_fle : 885 instr->op == nir_op_ige32 ? midgard_alu_op_ile : 886 instr->op == nir_op_uge32 ? midgard_alu_op_ule : 887 0; 888 889 flip_src12 = true; 890 ALU_CHECK_CMP(); 891 break; 892 } 893 894 case nir_op_b32csel: { 895 bool mixed = nir_is_non_scalar_swizzle(&instr->src[0], nr_components); 896 bool is_float = mir_is_bcsel_float(instr); 897 op = is_float ? 898 (mixed ? midgard_alu_op_fcsel_v : midgard_alu_op_fcsel) : 899 (mixed ? 
midgard_alu_op_icsel_v : midgard_alu_op_icsel); 900 901 break; 902 } 903 904 case nir_op_unpack_32_2x16: 905 case nir_op_unpack_32_4x8: 906 case nir_op_pack_32_2x16: 907 case nir_op_pack_32_4x8: { 908 op = midgard_alu_op_imov; 909 break; 910 } 911 912 default: 913 DBG("Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 914 assert(0); 915 return; 916 } 917 918 /* Promote imov to fmov if it might help inline a constant */ 919 if (op == midgard_alu_op_imov && nir_src_is_const(instr->src[0].src) 920 && nir_src_bit_size(instr->src[0].src) == 32 921 && nir_is_same_comp_swizzle(instr->src[0].swizzle, 922 nir_src_num_components(instr->src[0].src))) { 923 op = midgard_alu_op_fmov; 924 } 925 926 /* Midgard can perform certain modifiers on output of an ALU op */ 927 928 unsigned outmod = 0; 929 bool is_int = midgard_is_integer_op(op); 930 931 if (instr->op == nir_op_umul_high || instr->op == nir_op_imul_high) { 932 outmod = midgard_outmod_keephi; 933 } else if (midgard_is_integer_out_op(op)) { 934 outmod = midgard_outmod_keeplo; 935 } else if (instr->op == nir_op_fsat) { 936 outmod = midgard_outmod_clamp_0_1; 937 } else if (instr->op == nir_op_fsat_signed_mali) { 938 outmod = midgard_outmod_clamp_m1_1; 939 } else if (instr->op == nir_op_fclamp_pos_mali) { 940 outmod = midgard_outmod_clamp_0_inf; 941 } 942 943 /* Fetch unit, quirks, etc information */ 944 unsigned opcode_props = alu_opcode_props[op].props; 945 bool quirk_flipped_r24 = opcode_props & QUIRK_FLIPPED_R24; 946 947 if (!midgard_is_integer_out_op(op)) { 948 outmod = mir_determine_float_outmod(ctx, &dest, outmod); 949 } 950 951 midgard_instruction ins = { 952 .type = TAG_ALU_4, 953 .dest = nir_dest_index(dest), 954 .dest_type = nir_op_infos[instr->op].output_type 955 | nir_dest_bit_size(*dest), 956 .roundmode = roundmode, 957 }; 958 959 enum midgard_roundmode *roundptr = (opcode_props & MIDGARD_ROUNDS) ? 960 &ins.roundmode : NULL; 961 962 for (unsigned i = nr_inputs; i < ARRAY_SIZE(ins.src); ++i) 963 ins.src[i] = ~0; 964 965 if (quirk_flipped_r24) { 966 ins.src[0] = ~0; 967 mir_copy_src(&ins, instr, 0, 1, &ins.src_abs[1], &ins.src_neg[1], &ins.src_invert[1], roundptr, is_int, broadcast_swizzle); 968 } else { 969 for (unsigned i = 0; i < nr_inputs; ++i) { 970 unsigned to = i; 971 972 if (instr->op == nir_op_b32csel) { 973 /* The condition is the first argument; move 974 * the other arguments up one to be a binary 975 * instruction for Midgard with the condition 976 * last */ 977 978 if (i == 0) 979 to = 2; 980 else if (flip_src12) 981 to = 2 - i; 982 else 983 to = i - 1; 984 } else if (flip_src12) { 985 to = 1 - to; 986 } 987 988 mir_copy_src(&ins, instr, i, to, &ins.src_abs[to], &ins.src_neg[to], &ins.src_invert[to], roundptr, is_int, broadcast_swizzle); 989 990 /* (!c) ? a : b = c ? b : a */ 991 if (instr->op == nir_op_b32csel && ins.src_invert[2]) { 992 ins.src_invert[2] = false; 993 flip_src12 ^= true; 994 } 995 } 996 } 997 998 if (instr->op == nir_op_fneg || instr->op == nir_op_fabs) { 999 /* Lowered to move */ 1000 if (instr->op == nir_op_fneg) 1001 ins.src_neg[1] ^= true; 1002 1003 if (instr->op == nir_op_fabs) 1004 ins.src_abs[1] = true; 1005 } 1006 1007 ins.mask = mask_of(nr_components); 1008 1009 /* Apply writemask if non-SSA, keeping in mind that we can't write to 1010 * components that don't exist. 
Note modifier => SSA => !reg => no 1011 * writemask, so we don't have to worry about writemasks here.*/ 1012 1013 if (!is_ssa) 1014 ins.mask &= instr->dest.write_mask; 1015 1016 ins.op = op; 1017 ins.outmod = outmod; 1018 1019 /* Late fixup for emulated instructions */ 1020 1021 if (instr->op == nir_op_b2f32 || instr->op == nir_op_b2i32) { 1022 /* Presently, our second argument is an inline #0 constant. 1023 * Switch over to an embedded 1.0 constant (that can't fit 1024 * inline, since we're 32-bit, not 16-bit like the inline 1025 * constants) */ 1026 1027 ins.has_inline_constant = false; 1028 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1029 ins.src_types[1] = nir_type_float32; 1030 ins.has_constants = true; 1031 1032 if (instr->op == nir_op_b2f32) 1033 ins.constants.f32[0] = 1.0f; 1034 else 1035 ins.constants.i32[0] = 1; 1036 1037 for (unsigned c = 0; c < 16; ++c) 1038 ins.swizzle[1][c] = 0; 1039 } else if (instr->op == nir_op_b2f16) { 1040 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1041 ins.src_types[1] = nir_type_float16; 1042 ins.has_constants = true; 1043 ins.constants.i16[0] = _mesa_float_to_half(1.0); 1044 1045 for (unsigned c = 0; c < 16; ++c) 1046 ins.swizzle[1][c] = 0; 1047 } else if (nr_inputs == 1 && !quirk_flipped_r24) { 1048 /* Lots of instructions need a 0 plonked in */ 1049 ins.has_inline_constant = false; 1050 ins.src[1] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); 1051 ins.src_types[1] = ins.src_types[0]; 1052 ins.has_constants = true; 1053 ins.constants.u32[0] = 0; 1054 1055 for (unsigned c = 0; c < 16; ++c) 1056 ins.swizzle[1][c] = 0; 1057 } else if (instr->op == nir_op_pack_32_2x16) { 1058 ins.dest_type = nir_type_uint16; 1059 ins.mask = mask_of(nr_components * 2); 1060 ins.is_pack = true; 1061 } else if (instr->op == nir_op_pack_32_4x8) { 1062 ins.dest_type = nir_type_uint8; 1063 ins.mask = mask_of(nr_components * 4); 1064 ins.is_pack = true; 1065 } else if (instr->op == nir_op_unpack_32_2x16) { 1066 ins.dest_type = nir_type_uint32; 1067 ins.mask = mask_of(nr_components >> 1); 1068 ins.is_pack = true; 1069 } else if (instr->op == nir_op_unpack_32_4x8) { 1070 ins.dest_type = nir_type_uint32; 1071 ins.mask = mask_of(nr_components >> 2); 1072 ins.is_pack = true; 1073 } 1074 1075 if ((opcode_props & UNITS_ALL) == UNIT_VLUT) { 1076 /* To avoid duplicating the lookup tables (probably), true LUT 1077 * instructions can only operate as if they were scalars. Lower 1078 * them here by changing the component. 
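                 * Each split instruction below reads a single source component
                 * (broadcast across its swizzle) and writes the matching bit of
                 * the destination mask; splits that read the same source
                 * component are merged by ORing their masks.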
*/ 1079 1080 unsigned orig_mask = ins.mask; 1081 1082 unsigned swizzle_back[MIR_VEC_COMPONENTS]; 1083 memcpy(&swizzle_back, ins.swizzle[0], sizeof(swizzle_back)); 1084 1085 midgard_instruction ins_split[MIR_VEC_COMPONENTS]; 1086 unsigned ins_count = 0; 1087 1088 for (int i = 0; i < nr_components; ++i) { 1089 /* Mask the associated component, dropping the 1090 * instruction if needed */ 1091 1092 ins.mask = 1 << i; 1093 ins.mask &= orig_mask; 1094 1095 for (unsigned j = 0; j < ins_count; ++j) { 1096 if (swizzle_back[i] == ins_split[j].swizzle[0][0]) { 1097 ins_split[j].mask |= ins.mask; 1098 ins.mask = 0; 1099 break; 1100 } 1101 } 1102 1103 if (!ins.mask) 1104 continue; 1105 1106 for (unsigned j = 0; j < MIR_VEC_COMPONENTS; ++j) 1107 ins.swizzle[0][j] = swizzle_back[i]; /* Pull from the correct component */ 1108 1109 ins_split[ins_count] = ins; 1110 1111 ++ins_count; 1112 } 1113 1114 for (unsigned i = 0; i < ins_count; ++i) { 1115 emit_mir_instruction(ctx, ins_split[i]); 1116 } 1117 } else { 1118 emit_mir_instruction(ctx, ins); 1119 } 1120} 1121 1122#undef ALU_CASE 1123 1124static void 1125mir_set_intr_mask(nir_instr *instr, midgard_instruction *ins, bool is_read) 1126{ 1127 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1128 unsigned nir_mask = 0; 1129 unsigned dsize = 0; 1130 1131 if (is_read) { 1132 nir_mask = mask_of(nir_intrinsic_dest_components(intr)); 1133 dsize = nir_dest_bit_size(intr->dest); 1134 } else { 1135 nir_mask = nir_intrinsic_write_mask(intr); 1136 dsize = 32; 1137 } 1138 1139 /* Once we have the NIR mask, we need to normalize to work in 32-bit space */ 1140 unsigned bytemask = pan_to_bytemask(dsize, nir_mask); 1141 ins->dest_type = nir_type_uint | dsize; 1142 mir_set_bytemask(ins, bytemask); 1143} 1144 1145/* Uniforms and UBOs use a shared code path, as uniforms are just (slightly 1146 * optimized) versions of UBO #0 */ 1147 1148static midgard_instruction * 1149emit_ubo_read( 1150 compiler_context *ctx, 1151 nir_instr *instr, 1152 unsigned dest, 1153 unsigned offset, 1154 nir_src *indirect_offset, 1155 unsigned indirect_shift, 1156 unsigned index, 1157 unsigned nr_comps) 1158{ 1159 midgard_instruction ins; 1160 1161 unsigned dest_size = (instr->type == nir_instr_type_intrinsic) ? 
1162 nir_dest_bit_size(nir_instr_as_intrinsic(instr)->dest) : 32; 1163 1164 unsigned bitsize = dest_size * nr_comps; 1165 1166 /* Pick the smallest intrinsic to avoid out-of-bounds reads */ 1167 if (bitsize <= 32) 1168 ins = m_ld_ubo_32(dest, 0); 1169 else if (bitsize <= 64) 1170 ins = m_ld_ubo_64(dest, 0); 1171 else if (bitsize <= 128) 1172 ins = m_ld_ubo_128(dest, 0); 1173 else 1174 unreachable("Invalid UBO read size"); 1175 1176 ins.constants.u32[0] = offset; 1177 1178 if (instr->type == nir_instr_type_intrinsic) 1179 mir_set_intr_mask(instr, &ins, true); 1180 1181 if (indirect_offset) { 1182 ins.src[2] = nir_src_index(ctx, indirect_offset); 1183 ins.src_types[2] = nir_type_uint32; 1184 ins.load_store.index_shift = indirect_shift; 1185 1186 /* X component for the whole swizzle to prevent register 1187 * pressure from ballooning from the extra components */ 1188 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[2]); ++i) 1189 ins.swizzle[2][i] = 0; 1190 } else { 1191 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1192 } 1193 1194 if (indirect_offset && indirect_offset->is_ssa && !indirect_shift) 1195 mir_set_ubo_offset(&ins, indirect_offset, offset); 1196 1197 midgard_pack_ubo_index_imm(&ins.load_store, index); 1198 1199 return emit_mir_instruction(ctx, ins); 1200} 1201 1202/* Globals are like UBOs if you squint. And shared memory is like globals if 1203 * you squint even harder */ 1204 1205static void 1206emit_global( 1207 compiler_context *ctx, 1208 nir_instr *instr, 1209 bool is_read, 1210 unsigned srcdest, 1211 nir_src *offset, 1212 unsigned seg) 1213{ 1214 midgard_instruction ins; 1215 1216 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1217 if (is_read) { 1218 unsigned bitsize = nir_dest_bit_size(intr->dest) * 1219 nir_dest_num_components(intr->dest); 1220 1221 if (bitsize <= 32) 1222 ins = m_ld_32(srcdest, 0); 1223 else if (bitsize <= 64) 1224 ins = m_ld_64(srcdest, 0); 1225 else if (bitsize <= 128) 1226 ins = m_ld_128(srcdest, 0); 1227 else 1228 unreachable("Invalid global read size"); 1229 } else { 1230 unsigned bitsize = nir_src_bit_size(intr->src[0]) * 1231 nir_src_num_components(intr->src[0]); 1232 1233 if (bitsize <= 32) 1234 ins = m_st_32(srcdest, 0); 1235 else if (bitsize <= 64) 1236 ins = m_st_64(srcdest, 0); 1237 else if (bitsize <= 128) 1238 ins = m_st_128(srcdest, 0); 1239 else 1240 unreachable("Invalid global store size"); 1241 } 1242 1243 mir_set_offset(ctx, &ins, offset, seg); 1244 mir_set_intr_mask(instr, &ins, is_read); 1245 1246 /* Set a valid swizzle for masked out components */ 1247 assert(ins.mask); 1248 unsigned first_component = __builtin_ffs(ins.mask) - 1; 1249 1250 for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i) { 1251 if (!(ins.mask & (1 << i))) 1252 ins.swizzle[0][i] = first_component; 1253 } 1254 1255 emit_mir_instruction(ctx, ins); 1256} 1257 1258/* If is_shared is off, the only other possible value are globals, since 1259 * SSBO's are being lowered to globals through a NIR pass. 1260 * `image_direct_address` should be ~0 when instr is not an image_atomic 1261 * and the destination register of a lea_image op when it is an image_atomic. */ 1262static void 1263emit_atomic( 1264 compiler_context *ctx, 1265 nir_intrinsic_instr *instr, 1266 bool is_shared, 1267 midgard_load_store_op op, 1268 unsigned image_direct_address) 1269{ 1270 nir_alu_type type = 1271 (op == midgard_op_atomic_imin || op == midgard_op_atomic_imax) ? 
1272 nir_type_int : nir_type_uint; 1273 1274 bool is_image = image_direct_address != ~0; 1275 1276 unsigned dest = nir_dest_index(&instr->dest); 1277 unsigned val_src = is_image ? 3 : 1; 1278 unsigned val = nir_src_index(ctx, &instr->src[val_src]); 1279 unsigned bitsize = nir_src_bit_size(instr->src[val_src]); 1280 emit_explicit_constant(ctx, val, val); 1281 1282 midgard_instruction ins = { 1283 .type = TAG_LOAD_STORE_4, 1284 .mask = 0xF, 1285 .dest = dest, 1286 .src = { ~0, ~0, ~0, val }, 1287 .src_types = { 0, 0, 0, type | bitsize }, 1288 .op = op 1289 }; 1290 1291 nir_src *src_offset = nir_get_io_offset_src(instr); 1292 1293 if (op == midgard_op_atomic_cmpxchg) { 1294 unsigned xchg_val_src = is_image ? 4 : 2; 1295 unsigned xchg_val = nir_src_index(ctx, &instr->src[xchg_val_src]); 1296 emit_explicit_constant(ctx, xchg_val, xchg_val); 1297 1298 ins.src[2] = val; 1299 ins.src_types[2] = type | bitsize; 1300 ins.src[3] = xchg_val; 1301 1302 if (is_shared) { 1303 ins.load_store.arg_reg = REGISTER_LDST_LOCAL_STORAGE_PTR; 1304 ins.load_store.arg_comp = COMPONENT_Z; 1305 ins.load_store.bitsize_toggle = true; 1306 } else { 1307 for(unsigned i = 0; i < 2; ++i) 1308 ins.swizzle[1][i] = i; 1309 1310 ins.src[1] = is_image ? image_direct_address : 1311 nir_src_index(ctx, src_offset); 1312 ins.src_types[1] = nir_type_uint64; 1313 } 1314 } else if (is_image) { 1315 for(unsigned i = 0; i < 2; ++i) 1316 ins.swizzle[2][i] = i; 1317 1318 ins.src[2] = image_direct_address; 1319 ins.src_types[2] = nir_type_uint64; 1320 1321 ins.load_store.arg_reg = REGISTER_LDST_ZERO; 1322 ins.load_store.bitsize_toggle = true; 1323 ins.load_store.index_format = midgard_index_address_u64; 1324 } else 1325 mir_set_offset(ctx, &ins, src_offset, is_shared ? LDST_SHARED : LDST_GLOBAL); 1326 1327 mir_set_intr_mask(&instr->instr, &ins, true); 1328 1329 emit_mir_instruction(ctx, ins); 1330} 1331 1332static void 1333emit_varying_read( 1334 compiler_context *ctx, 1335 unsigned dest, unsigned offset, 1336 unsigned nr_comp, unsigned component, 1337 nir_src *indirect_offset, nir_alu_type type, bool flat) 1338{ 1339 /* XXX: Half-floats? 
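         * (float16 varyings are handled: the type switch below selects
         * midgard_op_ld_vary_16 and the mask is padded to an even component
         * count so the result can be packed.)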
        */
        /* TODO: swizzle, mask */

        midgard_instruction ins = m_ld_vary_32(dest, PACK_LDST_ATTRIB_OFS(offset));
        ins.mask = mask_of(nr_comp);
        ins.dest_type = type;

        if (type == nir_type_float16) {
                /* Ensure we are aligned so we can pack it later */
                ins.mask = mask_of(ALIGN_POT(nr_comp, 2));
        }

        for (unsigned i = 0; i < ARRAY_SIZE(ins.swizzle[0]); ++i)
                ins.swizzle[0][i] = MIN2(i + component, COMPONENT_W);


        midgard_varying_params p = {
                .flat_shading = flat,
                .perspective_correction = 1,
                .interpolate_sample = true,
        };
        midgard_pack_varying_params(&ins.load_store, p);

        if (indirect_offset) {
                ins.src[2] = nir_src_index(ctx, indirect_offset);
                ins.src_types[2] = nir_type_uint32;
        } else
                ins.load_store.index_reg = REGISTER_LDST_ZERO;

        ins.load_store.arg_reg = REGISTER_LDST_ZERO;
        ins.load_store.index_format = midgard_index_address_u32;

        /* Use the type appropriate load */
        switch (type) {
        case nir_type_uint32:
        case nir_type_bool32:
                ins.op = midgard_op_ld_vary_32u;
                break;
        case nir_type_int32:
                ins.op = midgard_op_ld_vary_32i;
                break;
        case nir_type_float32:
                ins.op = midgard_op_ld_vary_32;
                break;
        case nir_type_float16:
                ins.op = midgard_op_ld_vary_16;
                break;
        default:
                unreachable("Attempted to load unknown type");
                break;
        }

        emit_mir_instruction(ctx, ins);
}


/* If `is_atomic` is true, we emit a `lea_image`, since Midgard does not have
 * special image_atomic opcodes. The caller can then use that address to emit a
 * normal atomic opcode. */
static midgard_instruction
emit_image_op(compiler_context *ctx, nir_intrinsic_instr *instr, bool is_atomic)
{
        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        unsigned nr_attr = ctx->stage == MESA_SHADER_VERTEX ?
1402 util_bitcount64(ctx->nir->info.inputs_read) : 0; 1403 unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 1404 bool is_array = nir_intrinsic_image_array(instr); 1405 bool is_store = instr->intrinsic == nir_intrinsic_image_store; 1406 1407 /* TODO: MSAA */ 1408 assert(dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 1409 1410 unsigned coord_reg = nir_src_index(ctx, &instr->src[1]); 1411 emit_explicit_constant(ctx, coord_reg, coord_reg); 1412 1413 nir_src *index = &instr->src[0]; 1414 bool is_direct = nir_src_is_const(*index); 1415 1416 /* For image opcodes, address is used as an index into the attribute descriptor */ 1417 unsigned address = nr_attr; 1418 if (is_direct) 1419 address += nir_src_as_uint(*index); 1420 1421 midgard_instruction ins; 1422 if (is_store) { /* emit st_image_* */ 1423 unsigned val = nir_src_index(ctx, &instr->src[3]); 1424 emit_explicit_constant(ctx, val, val); 1425 1426 nir_alu_type type = nir_intrinsic_src_type(instr); 1427 ins = st_image(type, val, PACK_LDST_ATTRIB_OFS(address)); 1428 nir_alu_type base_type = nir_alu_type_get_base_type(type); 1429 ins.src_types[0] = base_type | nir_src_bit_size(instr->src[3]); 1430 } else if (is_atomic) { /* emit lea_image */ 1431 unsigned dest = make_compiler_temp_reg(ctx); 1432 ins = m_lea_image(dest, PACK_LDST_ATTRIB_OFS(address)); 1433 ins.mask = mask_of(2); /* 64-bit memory address */ 1434 } else { /* emit ld_image_* */ 1435 nir_alu_type type = nir_intrinsic_dest_type(instr); 1436 ins = ld_image(type, nir_dest_index(&instr->dest), PACK_LDST_ATTRIB_OFS(address)); 1437 ins.mask = mask_of(nir_intrinsic_dest_components(instr)); 1438 ins.dest_type = type; 1439 } 1440 1441 /* Coord reg */ 1442 ins.src[1] = coord_reg; 1443 ins.src_types[1] = nir_type_uint16; 1444 if (nr_dim == 3 || is_array) { 1445 ins.load_store.bitsize_toggle = true; 1446 } 1447 1448 /* Image index reg */ 1449 if (!is_direct) { 1450 ins.src[2] = nir_src_index(ctx, index); 1451 ins.src_types[2] = nir_type_uint32; 1452 } else 1453 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1454 1455 emit_mir_instruction(ctx, ins); 1456 1457 return ins; 1458} 1459 1460static void 1461emit_attr_read( 1462 compiler_context *ctx, 1463 unsigned dest, unsigned offset, 1464 unsigned nr_comp, nir_alu_type t) 1465{ 1466 midgard_instruction ins = m_ld_attr_32(dest, PACK_LDST_ATTRIB_OFS(offset)); 1467 ins.load_store.arg_reg = REGISTER_LDST_ZERO; 1468 ins.load_store.index_reg = REGISTER_LDST_ZERO; 1469 ins.mask = mask_of(nr_comp); 1470 1471 /* Use the type appropriate load */ 1472 switch (t) { 1473 case nir_type_uint: 1474 case nir_type_bool: 1475 ins.op = midgard_op_ld_attr_32u; 1476 break; 1477 case nir_type_int: 1478 ins.op = midgard_op_ld_attr_32i; 1479 break; 1480 case nir_type_float: 1481 ins.op = midgard_op_ld_attr_32; 1482 break; 1483 default: 1484 unreachable("Attempted to load unknown type"); 1485 break; 1486 } 1487 1488 emit_mir_instruction(ctx, ins); 1489} 1490 1491static void 1492emit_sysval_read(compiler_context *ctx, nir_instr *instr, 1493 unsigned nr_components, unsigned offset) 1494{ 1495 nir_dest nir_dest; 1496 1497 /* Figure out which uniform this is */ 1498 unsigned sysval_ubo = 1499 MAX2(ctx->inputs->sysval_ubo, ctx->nir->info.num_ubos); 1500 int sysval = panfrost_sysval_for_instr(instr, &nir_dest); 1501 unsigned dest = nir_dest_index(&nir_dest); 1502 unsigned uniform = 1503 pan_lookup_sysval(ctx->sysval_to_id, &ctx->info->sysvals, sysval); 1504 1505 /* Emit the read itself -- this is never indirect */ 1506 midgard_instruction *ins = 1507 
                emit_ubo_read(ctx, instr, dest, (uniform * 16) + offset, NULL, 0,
                              sysval_ubo, nr_components);

        ins->mask = mask_of(nr_components);
}

static unsigned
compute_builtin_arg(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_load_workgroup_id:
                return REGISTER_LDST_GROUP_ID;
        case nir_intrinsic_load_local_invocation_id:
                return REGISTER_LDST_LOCAL_THREAD_ID;
        case nir_intrinsic_load_global_invocation_id:
        case nir_intrinsic_load_global_invocation_id_zero_base:
                return REGISTER_LDST_GLOBAL_THREAD_ID;
        default:
                unreachable("Invalid compute parameter loaded");
        }
}

static void
emit_fragment_store(compiler_context *ctx, unsigned src, unsigned src_z, unsigned src_s,
                    enum midgard_rt_id rt, unsigned sample_iter)
{
        assert(rt < ARRAY_SIZE(ctx->writeout_branch));
        assert(sample_iter < ARRAY_SIZE(ctx->writeout_branch[0]));

        midgard_instruction *br = ctx->writeout_branch[rt][sample_iter];

        assert(!br);

        emit_explicit_constant(ctx, src, src);

        struct midgard_instruction ins =
                v_branch(false, false);

        bool depth_only = (rt == MIDGARD_ZS_RT);

        ins.writeout = depth_only ? 0 : PAN_WRITEOUT_C;

        /* Add dependencies */
        ins.src[0] = src;
        ins.src_types[0] = nir_type_uint32;

        if (depth_only)
                ins.constants.u32[0] = 0xFF;
        else
                ins.constants.u32[0] = ((rt - MIDGARD_COLOR_RT0) << 8) | sample_iter;

        for (int i = 0; i < 4; ++i)
                ins.swizzle[0][i] = i;

        if (~src_z) {
                emit_explicit_constant(ctx, src_z, src_z);
                ins.src[2] = src_z;
                ins.src_types[2] = nir_type_uint32;
                ins.writeout |= PAN_WRITEOUT_Z;
        }
        if (~src_s) {
                emit_explicit_constant(ctx, src_s, src_s);
                ins.src[3] = src_s;
                ins.src_types[3] = nir_type_uint32;
                ins.writeout |= PAN_WRITEOUT_S;
        }

        /* Emit the branch */
        br = emit_mir_instruction(ctx, ins);
        schedule_barrier(ctx);
        ctx->writeout_branch[rt][sample_iter] = br;

        /* Push our current location = current block count - 1 = where we'll
         * jump to.
Maybe a bit too clever for my own good */ 1581 1582 br->branch.target_block = ctx->block_count - 1; 1583} 1584 1585static void 1586emit_compute_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) 1587{ 1588 unsigned reg = nir_dest_index(&instr->dest); 1589 midgard_instruction ins = m_ldst_mov(reg, 0); 1590 ins.mask = mask_of(3); 1591 ins.swizzle[0][3] = COMPONENT_X; /* xyzx */ 1592 ins.load_store.arg_reg = compute_builtin_arg(instr->intrinsic); 1593 emit_mir_instruction(ctx, ins); 1594} 1595 1596static unsigned 1597vertex_builtin_arg(nir_intrinsic_op op) 1598{ 1599 switch (op) { 1600 case nir_intrinsic_load_vertex_id_zero_base: 1601 return PAN_VERTEX_ID; 1602 case nir_intrinsic_load_instance_id: 1603 return PAN_INSTANCE_ID; 1604 default: 1605 unreachable("Invalid vertex builtin"); 1606 } 1607} 1608 1609static void 1610emit_vertex_builtin(compiler_context *ctx, nir_intrinsic_instr *instr) 1611{ 1612 unsigned reg = nir_dest_index(&instr->dest); 1613 emit_attr_read(ctx, reg, vertex_builtin_arg(instr->intrinsic), 1, nir_type_int); 1614} 1615 1616static void 1617emit_special(compiler_context *ctx, nir_intrinsic_instr *instr, unsigned idx) 1618{ 1619 unsigned reg = nir_dest_index(&instr->dest); 1620 1621 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); 1622 ld.op = midgard_op_ld_special_32u; 1623 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(idx); 1624 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1625 1626 for (int i = 0; i < 4; ++i) 1627 ld.swizzle[0][i] = COMPONENT_X; 1628 1629 emit_mir_instruction(ctx, ld); 1630} 1631 1632static void 1633emit_control_barrier(compiler_context *ctx) 1634{ 1635 midgard_instruction ins = { 1636 .type = TAG_TEXTURE_4, 1637 .dest = ~0, 1638 .src = { ~0, ~0, ~0, ~0 }, 1639 .op = midgard_tex_op_barrier, 1640 }; 1641 1642 emit_mir_instruction(ctx, ins); 1643} 1644 1645static unsigned 1646mir_get_branch_cond(nir_src *src, bool *invert) 1647{ 1648 /* Wrap it. 
No swizzle since it's a scalar */ 1649 1650 nir_alu_src alu = { 1651 .src = *src 1652 }; 1653 1654 *invert = pan_has_source_mod(&alu, nir_op_inot); 1655 return nir_src_index(NULL, &alu.src); 1656} 1657 1658static uint8_t 1659output_load_rt_addr(compiler_context *ctx, nir_intrinsic_instr *instr) 1660{ 1661 if (ctx->inputs->is_blend) 1662 return MIDGARD_COLOR_RT0 + ctx->inputs->blend.rt; 1663 1664 const nir_variable *var; 1665 var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, nir_intrinsic_base(instr)); 1666 assert(var); 1667 1668 unsigned loc = var->data.location; 1669 1670 if (loc >= FRAG_RESULT_DATA0) 1671 return loc - FRAG_RESULT_DATA0; 1672 1673 if (loc == FRAG_RESULT_DEPTH) 1674 return 0x1F; 1675 if (loc == FRAG_RESULT_STENCIL) 1676 return 0x1E; 1677 1678 unreachable("Invalid RT to load from"); 1679} 1680 1681static void 1682emit_intrinsic(compiler_context *ctx, nir_intrinsic_instr *instr) 1683{ 1684 unsigned offset = 0, reg; 1685 1686 switch (instr->intrinsic) { 1687 case nir_intrinsic_discard_if: 1688 case nir_intrinsic_discard: { 1689 bool conditional = instr->intrinsic == nir_intrinsic_discard_if; 1690 struct midgard_instruction discard = v_branch(conditional, false); 1691 discard.branch.target_type = TARGET_DISCARD; 1692 1693 if (conditional) { 1694 discard.src[0] = mir_get_branch_cond(&instr->src[0], 1695 &discard.branch.invert_conditional); 1696 discard.src_types[0] = nir_type_uint32; 1697 } 1698 1699 emit_mir_instruction(ctx, discard); 1700 schedule_barrier(ctx); 1701 1702 break; 1703 } 1704 1705 case nir_intrinsic_image_load: 1706 case nir_intrinsic_image_store: 1707 emit_image_op(ctx, instr, false); 1708 break; 1709 1710 case nir_intrinsic_image_size: { 1711 unsigned nr_comp = nir_intrinsic_dest_components(instr); 1712 emit_sysval_read(ctx, &instr->instr, nr_comp, 0); 1713 break; 1714 } 1715 1716 case nir_intrinsic_load_ubo: 1717 case nir_intrinsic_load_global: 1718 case nir_intrinsic_load_global_constant: 1719 case nir_intrinsic_load_shared: 1720 case nir_intrinsic_load_scratch: 1721 case nir_intrinsic_load_input: 1722 case nir_intrinsic_load_kernel_input: 1723 case nir_intrinsic_load_interpolated_input: { 1724 bool is_ubo = instr->intrinsic == nir_intrinsic_load_ubo; 1725 bool is_global = instr->intrinsic == nir_intrinsic_load_global || 1726 instr->intrinsic == nir_intrinsic_load_global_constant; 1727 bool is_shared = instr->intrinsic == nir_intrinsic_load_shared; 1728 bool is_scratch = instr->intrinsic == nir_intrinsic_load_scratch; 1729 bool is_flat = instr->intrinsic == nir_intrinsic_load_input; 1730 bool is_kernel = instr->intrinsic == nir_intrinsic_load_kernel_input; 1731 bool is_interp = instr->intrinsic == nir_intrinsic_load_interpolated_input; 1732 1733 /* Get the base type of the intrinsic */ 1734 /* TODO: Infer type? Does it matter? */ 1735 nir_alu_type t = 1736 (is_interp) ? nir_type_float : 1737 (is_flat) ? nir_intrinsic_dest_type(instr) : 1738 nir_type_uint; 1739 1740 t = nir_alu_type_get_base_type(t); 1741 1742 if (!(is_ubo || is_global || is_scratch)) { 1743 offset = nir_intrinsic_base(instr); 1744 } 1745 1746 unsigned nr_comp = nir_intrinsic_dest_components(instr); 1747 1748 nir_src *src_offset = nir_get_io_offset_src(instr); 1749 1750 bool direct = nir_src_is_const(*src_offset); 1751 nir_src *indirect_offset = direct ? NULL : src_offset; 1752 1753 if (direct) 1754 offset += nir_src_as_uint(*src_offset); 1755 1756 /* We may need to apply a fractional offset */ 1757 int component = (is_flat || is_interp) ? 
1758 nir_intrinsic_component(instr) : 0; 1759 reg = nir_dest_index(&instr->dest); 1760 1761 if (is_kernel) { 1762 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, 0, nr_comp); 1763 } else if (is_ubo) { 1764 nir_src index = instr->src[0]; 1765 1766 /* TODO: Is indirect block number possible? */ 1767 assert(nir_src_is_const(index)); 1768 1769 uint32_t uindex = nir_src_as_uint(index); 1770 emit_ubo_read(ctx, &instr->instr, reg, offset, indirect_offset, 0, uindex, nr_comp); 1771 } else if (is_global || is_shared || is_scratch) { 1772 unsigned seg = is_global ? LDST_GLOBAL : (is_shared ? LDST_SHARED : LDST_SCRATCH); 1773 emit_global(ctx, &instr->instr, true, reg, src_offset, seg); 1774 } else if (ctx->stage == MESA_SHADER_FRAGMENT && !ctx->inputs->is_blend) { 1775 emit_varying_read(ctx, reg, offset, nr_comp, component, indirect_offset, t | nir_dest_bit_size(instr->dest), is_flat); 1776 } else if (ctx->inputs->is_blend) { 1777 /* ctx->blend_input will be precoloured to r0/r2, where 1778 * the input is preloaded */ 1779 1780 unsigned *input = offset ? &ctx->blend_src1 : &ctx->blend_input; 1781 1782 if (*input == ~0) 1783 *input = reg; 1784 else 1785 emit_mir_instruction(ctx, v_mov(*input, reg)); 1786 } else if (ctx->stage == MESA_SHADER_VERTEX) { 1787 emit_attr_read(ctx, reg, offset, nr_comp, t); 1788 } else { 1789 DBG("Unknown load\n"); 1790 assert(0); 1791 } 1792 1793 break; 1794 } 1795 1796 /* Handled together with load_interpolated_input */ 1797 case nir_intrinsic_load_barycentric_pixel: 1798 case nir_intrinsic_load_barycentric_centroid: 1799 case nir_intrinsic_load_barycentric_sample: 1800 break; 1801 1802 /* Reads 128-bit value raw off the tilebuffer during blending, tasty */ 1803 1804 case nir_intrinsic_load_raw_output_pan: { 1805 reg = nir_dest_index(&instr->dest); 1806 1807 /* T720 and below use different blend opcodes with slightly 1808 * different semantics than T760 and up */ 1809 1810 midgard_instruction ld = m_ld_tilebuffer_raw(reg, 0); 1811 1812 unsigned target = output_load_rt_addr(ctx, instr); 1813 ld.load_store.index_comp = target & 0x3; 1814 ld.load_store.index_reg = target >> 2; 1815 1816 if (nir_src_is_const(instr->src[0])) { 1817 unsigned sample = nir_src_as_uint(instr->src[0]); 1818 ld.load_store.arg_comp = sample & 0x3; 1819 ld.load_store.arg_reg = sample >> 2; 1820 } else { 1821 /* Enable sample index via register. 
*/ 1822 ld.load_store.signed_offset |= 1; 1823 ld.src[1] = nir_src_index(ctx, &instr->src[0]); 1824 ld.src_types[1] = nir_type_int32; 1825 } 1826 1827 if (ctx->quirks & MIDGARD_OLD_BLEND) { 1828 ld.op = midgard_op_ld_special_32u; 1829 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(16); 1830 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1831 } 1832 1833 emit_mir_instruction(ctx, ld); 1834 break; 1835 } 1836 1837 case nir_intrinsic_load_output: { 1838 reg = nir_dest_index(&instr->dest); 1839 1840 unsigned bits = nir_dest_bit_size(instr->dest); 1841 1842 midgard_instruction ld; 1843 if (bits == 16) 1844 ld = m_ld_tilebuffer_16f(reg, 0); 1845 else 1846 ld = m_ld_tilebuffer_32f(reg, 0); 1847 1848 unsigned index = output_load_rt_addr(ctx, instr); 1849 ld.load_store.index_comp = index & 0x3; 1850 ld.load_store.index_reg = index >> 2; 1851 1852 for (unsigned c = 4; c < 16; ++c) 1853 ld.swizzle[0][c] = 0; 1854 1855 if (ctx->quirks & MIDGARD_OLD_BLEND) { 1856 if (bits == 16) 1857 ld.op = midgard_op_ld_special_16f; 1858 else 1859 ld.op = midgard_op_ld_special_32f; 1860 ld.load_store.signed_offset = PACK_LDST_SELECTOR_OFS(1); 1861 ld.load_store.index_reg = REGISTER_LDST_ZERO; 1862 } 1863 1864 emit_mir_instruction(ctx, ld); 1865 break; 1866 } 1867 1868 case nir_intrinsic_store_output: 1869 case nir_intrinsic_store_combined_output_pan: 1870 assert(nir_src_is_const(instr->src[1]) && "no indirect outputs"); 1871 1872 offset = nir_intrinsic_base(instr) + nir_src_as_uint(instr->src[1]); 1873 1874 reg = nir_src_index(ctx, &instr->src[0]); 1875 1876 if (ctx->stage == MESA_SHADER_FRAGMENT) { 1877 bool combined = instr->intrinsic == 1878 nir_intrinsic_store_combined_output_pan; 1879 1880 const nir_variable *var; 1881 var = nir_find_variable_with_driver_location(ctx->nir, nir_var_shader_out, 1882 nir_intrinsic_base(instr)); 1883 assert(var); 1884 1885 /* Dual-source blend writeout is done by leaving the 1886 * value in r2 for the blend shader to use. */ 1887 if (var->data.index) { 1888 if (instr->src[0].is_ssa) { 1889 emit_explicit_constant(ctx, reg, reg); 1890 1891 unsigned out = make_compiler_temp(ctx); 1892 1893 midgard_instruction ins = v_mov(reg, out); 1894 emit_mir_instruction(ctx, ins); 1895 1896 ctx->blend_src1 = out; 1897 } else { 1898 ctx->blend_src1 = reg; 1899 } 1900 1901 break; 1902 } 1903 1904 enum midgard_rt_id rt; 1905 if (var->data.location >= FRAG_RESULT_DATA0) 1906 rt = MIDGARD_COLOR_RT0 + var->data.location - 1907 FRAG_RESULT_DATA0; 1908 else if (combined) 1909 rt = MIDGARD_ZS_RT; 1910 else 1911 unreachable("bad rt"); 1912 1913 unsigned reg_z = ~0, reg_s = ~0; 1914 if (combined) { 1915 unsigned writeout = nir_intrinsic_component(instr); 1916 if (writeout & PAN_WRITEOUT_Z) 1917 reg_z = nir_src_index(ctx, &instr->src[2]); 1918 if (writeout & PAN_WRITEOUT_S) 1919 reg_s = nir_src_index(ctx, &instr->src[3]); 1920 } 1921 1922 emit_fragment_store(ctx, reg, reg_z, reg_s, rt, 0); 1923 } else if (ctx->stage == MESA_SHADER_VERTEX) { 1924 assert(instr->intrinsic == nir_intrinsic_store_output); 1925 1926 /* We should have been vectorized, though we don't 1927 * currently check that st_vary is emitted only once 1928 * per slot (this is relevant, since there's not a mask 1929 * parameter available on the store [set to 0 by the 1930 * blob]). We do respect the component by adjusting the 1931 * swizzle. If this is a constant source, we'll need to 1932 * emit that explicitly. 
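                 * (The swizzle loop below maps destination components
                 * [dst_component, dst_component + nr_comp) onto consecutive
                 * source components starting at X; components outside that
                 * range repeat the nearest in-range source component.)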
*/ 1933 1934 emit_explicit_constant(ctx, reg, reg); 1935 1936 unsigned dst_component = nir_intrinsic_component(instr); 1937 unsigned nr_comp = nir_src_num_components(instr->src[0]); 1938 1939 midgard_instruction st = m_st_vary_32(reg, PACK_LDST_ATTRIB_OFS(offset)); 1940 st.load_store.arg_reg = REGISTER_LDST_ZERO; 1941 st.load_store.index_format = midgard_index_address_u32; 1942 st.load_store.index_reg = REGISTER_LDST_ZERO; 1943 1944 switch (nir_alu_type_get_base_type(nir_intrinsic_src_type(instr))) { 1945 case nir_type_uint: 1946 case nir_type_bool: 1947 st.op = midgard_op_st_vary_32u; 1948 break; 1949 case nir_type_int: 1950 st.op = midgard_op_st_vary_32i; 1951 break; 1952 case nir_type_float: 1953 st.op = midgard_op_st_vary_32; 1954 break; 1955 default: 1956 unreachable("Attempted to store unknown type"); 1957 break; 1958 } 1959 1960 /* nir_intrinsic_component(store_intr) encodes the 1961 * destination component start. Source component offset 1962 * adjustment is taken care of in 1963 * install_registers_instr(), when offset_swizzle() is 1964 * called. 1965 */ 1966 unsigned src_component = COMPONENT_X; 1967 1968 assert(nr_comp > 0); 1969 for (unsigned i = 0; i < ARRAY_SIZE(st.swizzle); ++i) { 1970 st.swizzle[0][i] = src_component; 1971 if (i >= dst_component && i < dst_component + nr_comp - 1) 1972 src_component++; 1973 } 1974 1975 emit_mir_instruction(ctx, st); 1976 } else { 1977 DBG("Unknown store\n"); 1978 assert(0); 1979 } 1980 1981 break; 1982 1983 /* Special case of store_output for lowered blend shaders */ 1984 case nir_intrinsic_store_raw_output_pan: 1985 assert (ctx->stage == MESA_SHADER_FRAGMENT); 1986 reg = nir_src_index(ctx, &instr->src[0]); 1987 for (unsigned s = 0; s < ctx->blend_sample_iterations; s++) 1988 emit_fragment_store(ctx, reg, ~0, ~0, 1989 ctx->inputs->blend.rt + MIDGARD_COLOR_RT0, 1990 s); 1991 break; 1992 1993 case nir_intrinsic_store_global: 1994 case nir_intrinsic_store_shared: 1995 case nir_intrinsic_store_scratch: 1996 reg = nir_src_index(ctx, &instr->src[0]); 1997 emit_explicit_constant(ctx, reg, reg); 1998 1999 unsigned seg; 2000 if (instr->intrinsic == nir_intrinsic_store_global) 2001 seg = LDST_GLOBAL; 2002 else if (instr->intrinsic == nir_intrinsic_store_shared) 2003 seg = LDST_SHARED; 2004 else 2005 seg = LDST_SCRATCH; 2006 2007 emit_global(ctx, &instr->instr, false, reg, &instr->src[1], seg); 2008 break; 2009 2010 case nir_intrinsic_load_first_vertex: 2011 case nir_intrinsic_load_ssbo_address: 2012 case nir_intrinsic_load_work_dim: 2013 emit_sysval_read(ctx, &instr->instr, 1, 0); 2014 break; 2015 2016 case nir_intrinsic_load_base_vertex: 2017 emit_sysval_read(ctx, &instr->instr, 1, 4); 2018 break; 2019 2020 case nir_intrinsic_load_base_instance: 2021 emit_sysval_read(ctx, &instr->instr, 1, 8); 2022 break; 2023 2024 case nir_intrinsic_load_sample_positions_pan: 2025 emit_sysval_read(ctx, &instr->instr, 2, 0); 2026 break; 2027 2028 case nir_intrinsic_get_ssbo_size: 2029 emit_sysval_read(ctx, &instr->instr, 1, 8); 2030 break; 2031 2032 case nir_intrinsic_load_viewport_scale: 2033 case nir_intrinsic_load_viewport_offset: 2034 case nir_intrinsic_load_num_workgroups: 2035 case nir_intrinsic_load_sampler_lod_parameters_pan: 2036 case nir_intrinsic_load_workgroup_size: 2037 emit_sysval_read(ctx, &instr->instr, 3, 0); 2038 break; 2039 2040 case nir_intrinsic_load_blend_const_color_rgba: 2041 emit_sysval_read(ctx, &instr->instr, 4, 0); 2042 break; 2043 2044 case nir_intrinsic_load_workgroup_id: 2045 case nir_intrinsic_load_local_invocation_id: 2046 case 
nir_intrinsic_load_global_invocation_id: 2047 case nir_intrinsic_load_global_invocation_id_zero_base: 2048 emit_compute_builtin(ctx, instr); 2049 break; 2050 2051 case nir_intrinsic_load_vertex_id_zero_base: 2052 case nir_intrinsic_load_instance_id: 2053 emit_vertex_builtin(ctx, instr); 2054 break; 2055 2056 case nir_intrinsic_load_sample_mask_in: 2057 emit_special(ctx, instr, 96); 2058 break; 2059 2060 case nir_intrinsic_load_sample_id: 2061 emit_special(ctx, instr, 97); 2062 break; 2063 2064 /* Midgard doesn't seem to want special handling */ 2065 case nir_intrinsic_memory_barrier: 2066 case nir_intrinsic_memory_barrier_buffer: 2067 case nir_intrinsic_memory_barrier_image: 2068 case nir_intrinsic_memory_barrier_shared: 2069 case nir_intrinsic_group_memory_barrier: 2070 break; 2071 2072 case nir_intrinsic_control_barrier: 2073 schedule_barrier(ctx); 2074 emit_control_barrier(ctx); 2075 schedule_barrier(ctx); 2076 break; 2077 2078 ATOMIC_CASE(ctx, instr, add, add); 2079 ATOMIC_CASE(ctx, instr, and, and); 2080 ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); 2081 ATOMIC_CASE(ctx, instr, exchange, xchg); 2082 ATOMIC_CASE(ctx, instr, imax, imax); 2083 ATOMIC_CASE(ctx, instr, imin, imin); 2084 ATOMIC_CASE(ctx, instr, or, or); 2085 ATOMIC_CASE(ctx, instr, umax, umax); 2086 ATOMIC_CASE(ctx, instr, umin, umin); 2087 ATOMIC_CASE(ctx, instr, xor, xor); 2088 2089 IMAGE_ATOMIC_CASE(ctx, instr, add, add); 2090 IMAGE_ATOMIC_CASE(ctx, instr, and, and); 2091 IMAGE_ATOMIC_CASE(ctx, instr, comp_swap, cmpxchg); 2092 IMAGE_ATOMIC_CASE(ctx, instr, exchange, xchg); 2093 IMAGE_ATOMIC_CASE(ctx, instr, imax, imax); 2094 IMAGE_ATOMIC_CASE(ctx, instr, imin, imin); 2095 IMAGE_ATOMIC_CASE(ctx, instr, or, or); 2096 IMAGE_ATOMIC_CASE(ctx, instr, umax, umax); 2097 IMAGE_ATOMIC_CASE(ctx, instr, umin, umin); 2098 IMAGE_ATOMIC_CASE(ctx, instr, xor, xor); 2099 2100 default: 2101 fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); 2102 assert(0); 2103 break; 2104 } 2105} 2106 2107/* Returns dimension with 0 special casing cubemaps */ 2108static unsigned 2109midgard_tex_format(enum glsl_sampler_dim dim) 2110{ 2111 switch (dim) { 2112 case GLSL_SAMPLER_DIM_1D: 2113 case GLSL_SAMPLER_DIM_BUF: 2114 return 1; 2115 2116 case GLSL_SAMPLER_DIM_2D: 2117 case GLSL_SAMPLER_DIM_MS: 2118 case GLSL_SAMPLER_DIM_EXTERNAL: 2119 case GLSL_SAMPLER_DIM_RECT: 2120 return 2; 2121 2122 case GLSL_SAMPLER_DIM_3D: 2123 return 3; 2124 2125 case GLSL_SAMPLER_DIM_CUBE: 2126 return 0; 2127 2128 default: 2129 DBG("Unknown sampler dim type\n"); 2130 assert(0); 2131 return 0; 2132 } 2133} 2134 2135/* Tries to attach an explicit LOD or bias as a constant. 
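 * The float is split into an integer part and a non-negative fraction, e.g. an LOD of -0.25 becomes lod_int = -1 and lod_frac = 0.75.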
Returns whether this 2136 * was successful */ 2137 2138static bool 2139pan_attach_constant_bias( 2140 compiler_context *ctx, 2141 nir_src lod, 2142 midgard_texture_word *word) 2143{ 2144 /* To attach as constant, it has to *be* constant */ 2145 2146 if (!nir_src_is_const(lod)) 2147 return false; 2148 2149 float f = nir_src_as_float(lod); 2150 2151 /* Break into fixed-point */ 2152 signed lod_int = f; 2153 float lod_frac = f - lod_int; 2154 2155 /* Carry over negative fractions */ 2156 if (lod_frac < 0.0) { 2157 lod_int--; 2158 lod_frac += 1.0; 2159 } 2160 2161 /* Encode */ 2162 word->bias = float_to_ubyte(lod_frac); 2163 word->bias_int = lod_int; 2164 2165 return true; 2166} 2167 2168static enum mali_texture_mode 2169mdg_texture_mode(nir_tex_instr *instr) 2170{ 2171 if (instr->op == nir_texop_tg4 && instr->is_shadow) 2172 return TEXTURE_GATHER_SHADOW; 2173 else if (instr->op == nir_texop_tg4) 2174 return TEXTURE_GATHER_X + instr->component; 2175 else if (instr->is_shadow) 2176 return TEXTURE_SHADOW; 2177 else 2178 return TEXTURE_NORMAL; 2179} 2180 2181static void 2182set_tex_coord(compiler_context *ctx, nir_tex_instr *instr, 2183 midgard_instruction *ins) 2184{ 2185 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 2186 2187 assert(coord_idx >= 0); 2188 2189 int comparator_idx = nir_tex_instr_src_index(instr, nir_tex_src_comparator); 2190 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); 2191 assert(comparator_idx < 0 || ms_idx < 0); 2192 int ms_or_comparator_idx = ms_idx >= 0 ? ms_idx : comparator_idx; 2193 2194 unsigned coords = nir_src_index(ctx, &instr->src[coord_idx].src); 2195 2196 emit_explicit_constant(ctx, coords, coords); 2197 2198 ins->src_types[1] = nir_tex_instr_src_type(instr, coord_idx) | 2199 nir_src_bit_size(instr->src[coord_idx].src); 2200 2201 unsigned nr_comps = instr->coord_components; 2202 unsigned written_mask = 0, write_mask = 0; 2203 2204 /* Initialize all components to coord.x which is expected to always be 2205 * present. Swizzle is updated below based on the texture dimension 2206 * and extra attributes that are packed in the coordinate argument. 2207 */ 2208 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) 2209 ins->swizzle[1][c] = COMPONENT_X; 2210 2211 /* Shadow ref value is part of the coordinates if there's no comparator 2212 * source, in that case it's always placed in the last component. 2213 * Midgard wants the ref value in coord.z. 2214 */ 2215 if (instr->is_shadow && comparator_idx < 0) { 2216 ins->swizzle[1][COMPONENT_Z] = --nr_comps; 2217 write_mask |= 1 << COMPONENT_Z; 2218 } 2219 2220 /* The array index is the last component if there's no shadow ref value 2221 * or second last if there's one. We already decremented the number of 2222 * components to account for the shadow ref value above. 2223 * Midgard wants the array index in coord.w. 
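 * e.g. for a 2D array texture, an (x, y, layer) coordinate from NIR ends up laid out as (x, y, -, layer).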
2224 */ 2225 if (instr->is_array) { 2226 ins->swizzle[1][COMPONENT_W] = --nr_comps; 2227 write_mask |= 1 << COMPONENT_W; 2228 } 2229 2230 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 2231 /* texelFetch is undefined on samplerCube */ 2232 assert(ins->op != midgard_tex_op_fetch); 2233 2234 ins->src[1] = make_compiler_temp_reg(ctx); 2235 2236 /* For cubemaps, we use a special ld/st op to select the face 2237 * and copy the xy into the texture register 2238 */ 2239 midgard_instruction ld = m_ld_cubemap_coords(ins->src[1], 0); 2240 ld.src[1] = coords; 2241 ld.src_types[1] = ins->src_types[1]; 2242 ld.mask = 0x3; /* xy */ 2243 ld.load_store.bitsize_toggle = true; 2244 ld.swizzle[1][3] = COMPONENT_X; 2245 emit_mir_instruction(ctx, ld); 2246 2247 /* We packed the cube coordinates (X,Y,Z) into (X,Y); update the 2248 * written mask accordingly and decrement the number of 2249 * components 2250 */ 2251 nr_comps--; 2252 written_mask |= 3; 2253 } 2254 2255 /* Now flag tex coord components that have not been written yet */ 2256 write_mask |= mask_of(nr_comps) & ~written_mask; 2257 for (unsigned c = 0; c < nr_comps; c++) 2258 ins->swizzle[1][c] = c; 2259 2260 /* Sample index and shadow ref are expected in coord.z */ 2261 if (ms_or_comparator_idx >= 0) { 2262 assert(!((write_mask | written_mask) & (1 << COMPONENT_Z))); 2263 2264 unsigned sample_or_ref = 2265 nir_src_index(ctx, &instr->src[ms_or_comparator_idx].src); 2266 2267 emit_explicit_constant(ctx, sample_or_ref, sample_or_ref); 2268 2269 if (ins->src[1] == ~0) 2270 ins->src[1] = make_compiler_temp_reg(ctx); 2271 2272 midgard_instruction mov = v_mov(sample_or_ref, ins->src[1]); 2273 2274 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) 2275 mov.swizzle[1][c] = COMPONENT_X; 2276 2277 mov.mask = 1 << COMPONENT_Z; 2278 written_mask |= 1 << COMPONENT_Z; 2279 ins->swizzle[1][COMPONENT_Z] = COMPONENT_Z; 2280 emit_mir_instruction(ctx, mov); 2281 } 2282 2283 /* Texelfetch coordinates use all four elements (xyz/index) regardless 2284 * of texture dimensionality, which means it's necessary to zero the 2285 * unused components to keep everything happy.
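 * e.g. a plain 2D texelFetch only writes coord.xy, so .zw get zeroed by the constant move emitted further down.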
2286 */ 2287 if (ins->op == midgard_tex_op_fetch && 2288 (written_mask | write_mask) != 0xF) { 2289 if (ins->src[1] == ~0) 2290 ins->src[1] = make_compiler_temp_reg(ctx); 2291 2292 /* mov index.zw, #0, or generalized */ 2293 midgard_instruction mov = 2294 v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), ins->src[1]); 2295 mov.has_constants = true; 2296 mov.mask = (written_mask | write_mask) ^ 0xF; 2297 emit_mir_instruction(ctx, mov); 2298 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { 2299 if (mov.mask & (1 << c)) 2300 ins->swizzle[1][c] = c; 2301 } 2302 } 2303 2304 if (ins->src[1] == ~0) { 2305 /* No temporary reg created, use the src coords directly */ 2306 ins->src[1] = coords; 2307 } else if (write_mask) { 2308 /* Move the remaining coordinates to the temporary reg */ 2309 midgard_instruction mov = v_mov(coords, ins->src[1]); 2310 2311 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; c++) { 2312 if ((1 << c) & write_mask) { 2313 mov.swizzle[1][c] = ins->swizzle[1][c]; 2314 ins->swizzle[1][c] = c; 2315 } else { 2316 mov.swizzle[1][c] = COMPONENT_X; 2317 } 2318 } 2319 2320 mov.mask = write_mask; 2321 emit_mir_instruction(ctx, mov); 2322 } 2323} 2324 2325static void 2326emit_texop_native(compiler_context *ctx, nir_tex_instr *instr, 2327 unsigned midgard_texop) 2328{ 2329 /* TODO */ 2330 //assert (!instr->sampler); 2331 2332 nir_dest *dest = &instr->dest; 2333 2334 int texture_index = instr->texture_index; 2335 int sampler_index = instr->sampler_index; 2336 2337 nir_alu_type dest_base = nir_alu_type_get_base_type(instr->dest_type); 2338 2339 /* texture instructions support float outmods */ 2340 unsigned outmod = midgard_outmod_none; 2341 if (dest_base == nir_type_float) { 2342 outmod = mir_determine_float_outmod(ctx, &dest, 0); 2343 } 2344 2345 midgard_instruction ins = { 2346 .type = TAG_TEXTURE_4, 2347 .mask = 0xF, 2348 .dest = nir_dest_index(dest), 2349 .src = { ~0, ~0, ~0, ~0 }, 2350 .dest_type = instr->dest_type, 2351 .swizzle = SWIZZLE_IDENTITY_4, 2352 .outmod = outmod, 2353 .op = midgard_texop, 2354 .texture = { 2355 .format = midgard_tex_format(instr->sampler_dim), 2356 .texture_handle = texture_index, 2357 .sampler_handle = sampler_index, 2358 .mode = mdg_texture_mode(instr) 2359 } 2360 }; 2361 2362 if (instr->is_shadow && !instr->is_new_style_shadow && instr->op != nir_texop_tg4) 2363 for (int i = 0; i < 4; ++i) 2364 ins.swizzle[0][i] = COMPONENT_X; 2365 2366 for (unsigned i = 0; i < instr->num_srcs; ++i) { 2367 int index = nir_src_index(ctx, &instr->src[i].src); 2368 unsigned sz = nir_src_bit_size(instr->src[i].src); 2369 nir_alu_type T = nir_tex_instr_src_type(instr, i) | sz; 2370 2371 switch (instr->src[i].src_type) { 2372 case nir_tex_src_coord: 2373 set_tex_coord(ctx, instr, &ins); 2374 break; 2375 2376 case nir_tex_src_bias: 2377 case nir_tex_src_lod: { 2378 /* Try as a constant if we can */ 2379 2380 bool is_txf = midgard_texop == midgard_tex_op_fetch; 2381 if (!is_txf && pan_attach_constant_bias(ctx, instr->src[i].src, &ins.texture)) 2382 break; 2383 2384 ins.texture.lod_register = true; 2385 ins.src[2] = index; 2386 ins.src_types[2] = T; 2387 2388 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) 2389 ins.swizzle[2][c] = COMPONENT_X; 2390 2391 emit_explicit_constant(ctx, index, index); 2392 2393 break; 2394 }; 2395 2396 case nir_tex_src_offset: { 2397 ins.texture.offset_register = true; 2398 ins.src[3] = index; 2399 ins.src_types[3] = T; 2400 2401 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) 2402 ins.swizzle[3][c] = (c > COMPONENT_Z) ? 
0 : c; 2403 2404 emit_explicit_constant(ctx, index, index); 2405 break; 2406 }; 2407 2408 case nir_tex_src_comparator: 2409 case nir_tex_src_ms_index: 2410 /* Nothing to do, handled in set_tex_coord() */ 2411 break; 2412 2413 default: { 2414 fprintf(stderr, "Unknown texture source type: %d\n", instr->src[i].src_type); 2415 assert(0); 2416 } 2417 } 2418 } 2419 2420 emit_mir_instruction(ctx, ins); 2421} 2422 2423static void 2424emit_tex(compiler_context *ctx, nir_tex_instr *instr) 2425{ 2426 switch (instr->op) { 2427 case nir_texop_tex: 2428 case nir_texop_txb: 2429 emit_texop_native(ctx, instr, midgard_tex_op_normal); 2430 break; 2431 case nir_texop_txl: 2432 case nir_texop_tg4: 2433 emit_texop_native(ctx, instr, midgard_tex_op_gradient); 2434 break; 2435 case nir_texop_txf: 2436 case nir_texop_txf_ms: 2437 emit_texop_native(ctx, instr, midgard_tex_op_fetch); 2438 break; 2439 case nir_texop_txs: 2440 emit_sysval_read(ctx, &instr->instr, 4, 0); 2441 break; 2442 default: { 2443 fprintf(stderr, "Unhandled texture op: %d\n", instr->op); 2444 assert(0); 2445 } 2446 } 2447} 2448 2449static void 2450emit_jump(compiler_context *ctx, nir_jump_instr *instr) 2451{ 2452 switch (instr->type) { 2453 case nir_jump_break: { 2454 /* Emit a branch out of the loop */ 2455 struct midgard_instruction br = v_branch(false, false); 2456 br.branch.target_type = TARGET_BREAK; 2457 br.branch.target_break = ctx->current_loop_depth; 2458 emit_mir_instruction(ctx, br); 2459 break; 2460 } 2461 2462 default: 2463 DBG("Unknown jump type %d\n", instr->type); 2464 break; 2465 } 2466} 2467 2468static void 2469emit_instr(compiler_context *ctx, struct nir_instr *instr) 2470{ 2471 switch (instr->type) { 2472 case nir_instr_type_load_const: 2473 emit_load_const(ctx, nir_instr_as_load_const(instr)); 2474 break; 2475 2476 case nir_instr_type_intrinsic: 2477 emit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); 2478 break; 2479 2480 case nir_instr_type_alu: 2481 emit_alu(ctx, nir_instr_as_alu(instr)); 2482 break; 2483 2484 case nir_instr_type_tex: 2485 emit_tex(ctx, nir_instr_as_tex(instr)); 2486 break; 2487 2488 case nir_instr_type_jump: 2489 emit_jump(ctx, nir_instr_as_jump(instr)); 2490 break; 2491 2492 case nir_instr_type_ssa_undef: 2493 /* Spurious */ 2494 break; 2495 2496 default: 2497 DBG("Unhandled instruction type\n"); 2498 break; 2499 } 2500} 2501 2502 2503/* ALU instructions can inline or embed constants, which decreases register 2504 * pressure and saves space. */ 2505 2506#define CONDITIONAL_ATTACH(idx) { \ 2507 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[idx] + 1); \ 2508\ 2509 if (entry) { \ 2510 attach_constants(ctx, alu, entry, alu->src[idx] + 1); \ 2511 alu->src[idx] = SSA_FIXED_REGISTER(REGISTER_CONSTANT); \ 2512 } \ 2513} 2514 2515static void 2516inline_alu_constants(compiler_context *ctx, midgard_block *block) 2517{ 2518 mir_foreach_instr_in_block(block, alu) { 2519 /* Other instructions cannot inline constants */ 2520 if (alu->type != TAG_ALU_4) continue; 2521 if (alu->compact_branch) continue; 2522 2523 /* If there is already a constant here, we can do nothing */ 2524 if (alu->has_constants) continue; 2525 2526 CONDITIONAL_ATTACH(0); 2527 2528 if (!alu->has_constants) { 2529 CONDITIONAL_ATTACH(1) 2530 } else if (!alu->inline_constant) { 2531 /* Corner case: _two_ vec4 constants, for instance with a 2532 * csel. For this case, we can only use a constant 2533 * register for one, we'll have to emit a move for the 2534 * other. 
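 * e.g. a csel with two distinct vec4 constants: one stays embedded in the csel while the other is first copied out of the constant register into a scratch temporary.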
*/ 2535 2536 void *entry = _mesa_hash_table_u64_search(ctx->ssa_constants, alu->src[1] + 1); 2537 unsigned scratch = make_compiler_temp(ctx); 2538 2539 if (entry) { 2540 midgard_instruction ins = v_mov(SSA_FIXED_REGISTER(REGISTER_CONSTANT), scratch); 2541 attach_constants(ctx, &ins, entry, alu->src[1] + 1); 2542 2543 /* Set the source */ 2544 alu->src[1] = scratch; 2545 2546 /* Inject us -before- the last instruction which set r31 */ 2547 mir_insert_instruction_before(ctx, mir_prev_op(alu), ins); 2548 } 2549 } 2550 } 2551} 2552 2553unsigned 2554max_bitsize_for_alu(midgard_instruction *ins) 2555{ 2556 unsigned max_bitsize = 0; 2557 for (int i = 0; i < MIR_SRC_COUNT; i++) { 2558 if (ins->src[i] == ~0) continue; 2559 unsigned src_bitsize = nir_alu_type_get_type_size(ins->src_types[i]); 2560 max_bitsize = MAX2(src_bitsize, max_bitsize); 2561 } 2562 unsigned dst_bitsize = nir_alu_type_get_type_size(ins->dest_type); 2563 max_bitsize = MAX2(dst_bitsize, max_bitsize); 2564 2565 /* We don't have fp16 LUTs, so we'll want to emit code like: 2566 * 2567 * vlut.fsinr hr0, hr0 2568 * 2569 * where both input and output are 16-bit but the operation is carried 2570 * out in 32-bit 2571 */ 2572 2573 switch (ins->op) { 2574 case midgard_alu_op_fsqrt: 2575 case midgard_alu_op_frcp: 2576 case midgard_alu_op_frsqrt: 2577 case midgard_alu_op_fsinpi: 2578 case midgard_alu_op_fcospi: 2579 case midgard_alu_op_fexp2: 2580 case midgard_alu_op_flog2: 2581 max_bitsize = MAX2(max_bitsize, 32); 2582 break; 2583 2584 default: 2585 break; 2586 } 2587 2588 /* High implies computing at a higher bitsize, e.g umul_high of 32-bit 2589 * requires computing at 64-bit */ 2590 if (midgard_is_integer_out_op(ins->op) && ins->outmod == midgard_outmod_keephi) { 2591 max_bitsize *= 2; 2592 assert(max_bitsize <= 64); 2593 } 2594 2595 return max_bitsize; 2596} 2597 2598midgard_reg_mode 2599reg_mode_for_bitsize(unsigned bitsize) 2600{ 2601 switch (bitsize) { 2602 /* use 16 pipe for 8 since we don't support vec16 yet */ 2603 case 8: 2604 case 16: 2605 return midgard_reg_mode_16; 2606 case 32: 2607 return midgard_reg_mode_32; 2608 case 64: 2609 return midgard_reg_mode_64; 2610 default: 2611 unreachable("invalid bit size"); 2612 } 2613} 2614 2615/* Midgard supports two types of constants, embedded constants (128-bit) and 2616 * inline constants (16-bit). Sometimes, especially with scalar ops, embedded 2617 * constants can be demoted to inline constants, for space savings and 2618 * sometimes a performance boost */ 2619 2620static void 2621embedded_to_inline_constant(compiler_context *ctx, midgard_block *block) 2622{ 2623 mir_foreach_instr_in_block(block, ins) { 2624 if (!ins->has_constants) continue; 2625 if (ins->has_inline_constant) continue; 2626 2627 unsigned max_bitsize = max_bitsize_for_alu(ins); 2628 2629 /* We can inline 32-bit (sometimes) or 16-bit (usually) */ 2630 bool is_16 = max_bitsize == 16; 2631 bool is_32 = max_bitsize == 32; 2632 2633 if (!(is_16 || is_32)) 2634 continue; 2635 2636 /* src1 cannot be an inline constant due to encoding 2637 * restrictions. So, if possible we try to flip the arguments 2638 * in that case */ 2639 2640 int op = ins->op; 2641 2642 if (ins->src[0] == SSA_FIXED_REGISTER(REGISTER_CONSTANT) && 2643 alu_opcode_props[op].props & OP_COMMUTES) { 2644 mir_flip(ins); 2645 } 2646 2647 if (ins->src[1] == SSA_FIXED_REGISTER(REGISTER_CONSTANT)) { 2648 /* Component is from the swizzle. 
Take a nonzero component */ 2649 assert(ins->mask); 2650 unsigned first_comp = ffs(ins->mask) - 1; 2651 unsigned component = ins->swizzle[1][first_comp]; 2652 2653 /* Scale constant appropriately, if we can legally */ 2654 int16_t scaled_constant = 0; 2655 2656 if (is_16) { 2657 scaled_constant = ins->constants.u16[component]; 2658 } else if (midgard_is_integer_op(op)) { 2659 scaled_constant = ins->constants.u32[component]; 2660 2661 /* Constant overflow after resize */ 2662 if (scaled_constant != ins->constants.u32[component]) 2663 continue; 2664 } else { 2665 float original = ins->constants.f32[component]; 2666 scaled_constant = _mesa_float_to_half(original); 2667 2668 /* Check for loss of precision. If this is 2669 * mediump, we don't care, but for a highp 2670 * shader, we need to pay attention. NIR 2671 * doesn't yet tell us which mode we're in! 2672 * Practically this prevents most constants 2673 * from being inlined, sadly. */ 2674 2675 float fp32 = _mesa_half_to_float(scaled_constant); 2676 2677 if (fp32 != original) 2678 continue; 2679 } 2680 2681 /* Should've been const folded */ 2682 if (ins->src_abs[1] || ins->src_neg[1]) 2683 continue; 2684 2685 /* Make sure that the constant is not itself a vector 2686 * by checking if all accessed values are the same. */ 2687 2688 const midgard_constants *cons = &ins->constants; 2689 uint32_t value = is_16 ? cons->u16[component] : cons->u32[component]; 2690 2691 bool is_vector = false; 2692 unsigned mask = effective_writemask(ins->op, ins->mask); 2693 2694 for (unsigned c = 0; c < MIR_VEC_COMPONENTS; ++c) { 2695 /* We only care if this component is actually used */ 2696 if (!(mask & (1 << c))) 2697 continue; 2698 2699 uint32_t test = is_16 ? 2700 cons->u16[ins->swizzle[1][c]] : 2701 cons->u32[ins->swizzle[1][c]]; 2702 2703 if (test != value) { 2704 is_vector = true; 2705 break; 2706 } 2707 } 2708 2709 if (is_vector) 2710 continue; 2711 2712 /* Get rid of the embedded constant */ 2713 ins->has_constants = false; 2714 ins->src[1] = ~0; 2715 ins->has_inline_constant = true; 2716 ins->inline_constant = scaled_constant; 2717 } 2718 } 2719} 2720 2721/* Dead code elimination for branches at the end of a block - only one branch 2722 * per block is legal semantically */ 2723 2724static void 2725midgard_cull_dead_branch(compiler_context *ctx, midgard_block *block) 2726{ 2727 bool branched = false; 2728 2729 mir_foreach_instr_in_block_safe(block, ins) { 2730 if (!midgard_is_branch_unit(ins->unit)) continue; 2731 2732 if (branched) 2733 mir_remove_instruction(ins); 2734 2735 branched = true; 2736 } 2737} 2738 2739/* We want to force the invert on AND/OR to the second slot to legalize into 2740 * iandnot/iornot. The relevant patterns are for AND (and OR respectively) 2741 * 2742 * ~a & #b = ~a & ~(#~b) 2743 * ~a & b = b & ~a 2744 */ 2745 2746static void 2747midgard_legalize_invert(compiler_context *ctx, midgard_block *block) 2748{ 2749 mir_foreach_instr_in_block(block, ins) { 2750 if (ins->type != TAG_ALU_4) continue; 2751 2752 if (ins->op != midgard_alu_op_iand && 2753 ins->op != midgard_alu_op_ior) continue; 2754 2755 if (ins->src_invert[1] || !ins->src_invert[0]) continue; 2756 2757 if (ins->has_inline_constant) { 2758 /* ~(#~a) = ~(~#a) = a, so valid, and forces both 2759 * inverts on */ 2760 ins->inline_constant = ~ins->inline_constant; 2761 ins->src_invert[1] = true; 2762 } else { 2763 /* Flip to the right invert order. Note 2764 * has_inline_constant false by assumption on the 2765 * branch, so flipping makes sense. 
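 * i.e. ~a & b is rewritten as b & ~a, which can then be encoded as iandnot.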
*/ 2766 mir_flip(ins); 2767 } 2768 } 2769} 2770 2771static unsigned 2772emit_fragment_epilogue(compiler_context *ctx, unsigned rt, unsigned sample_iter) 2773{ 2774 /* Loop to ourselves */ 2775 midgard_instruction *br = ctx->writeout_branch[rt][sample_iter]; 2776 struct midgard_instruction ins = v_branch(false, false); 2777 ins.writeout = br->writeout; 2778 ins.branch.target_block = ctx->block_count - 1; 2779 ins.constants.u32[0] = br->constants.u32[0]; 2780 memcpy(&ins.src_types, &br->src_types, sizeof(ins.src_types)); 2781 emit_mir_instruction(ctx, ins); 2782 2783 ctx->current_block->epilogue = true; 2784 schedule_barrier(ctx); 2785 return ins.branch.target_block; 2786} 2787 2788static midgard_block * 2789emit_block_init(compiler_context *ctx) 2790{ 2791 midgard_block *this_block = ctx->after_block; 2792 ctx->after_block = NULL; 2793 2794 if (!this_block) 2795 this_block = create_empty_block(ctx); 2796 2797 list_addtail(&this_block->base.link, &ctx->blocks); 2798 2799 this_block->scheduled = false; 2800 ++ctx->block_count; 2801 2802 /* Set up current block */ 2803 list_inithead(&this_block->base.instructions); 2804 ctx->current_block = this_block; 2805 2806 return this_block; 2807} 2808 2809static midgard_block * 2810emit_block(compiler_context *ctx, nir_block *block) 2811{ 2812 midgard_block *this_block = emit_block_init(ctx); 2813 2814 nir_foreach_instr(instr, block) { 2815 emit_instr(ctx, instr); 2816 ++ctx->instruction_count; 2817 } 2818 2819 return this_block; 2820} 2821 2822static midgard_block *emit_cf_list(struct compiler_context *ctx, struct exec_list *list); 2823 2824static void 2825emit_if(struct compiler_context *ctx, nir_if *nif) 2826{ 2827 midgard_block *before_block = ctx->current_block; 2828 2829 /* Speculatively emit the branch, but we can't fill it in until later */ 2830 bool inv = false; 2831 EMIT(branch, true, true); 2832 midgard_instruction *then_branch = mir_last_in_block(ctx->current_block); 2833 then_branch->src[0] = mir_get_branch_cond(&nif->condition, &inv); 2834 then_branch->src_types[0] = nir_type_uint32; 2835 then_branch->branch.invert_conditional = !inv; 2836 2837 /* Emit the two subblocks. 
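 * (then and else); their branch targets are patched afterwards, once the block indices are known.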
*/ 2838 midgard_block *then_block = emit_cf_list(ctx, &nif->then_list); 2839 midgard_block *end_then_block = ctx->current_block; 2840 2841 /* Emit a jump from the end of the then block to the end of the else */ 2842 EMIT(branch, false, false); 2843 midgard_instruction *then_exit = mir_last_in_block(ctx->current_block); 2844 2845 /* Emit second block, and check if it's empty */ 2846 2847 int else_idx = ctx->block_count; 2848 int count_in = ctx->instruction_count; 2849 midgard_block *else_block = emit_cf_list(ctx, &nif->else_list); 2850 midgard_block *end_else_block = ctx->current_block; 2851 int after_else_idx = ctx->block_count; 2852 2853 /* Now that we have the subblocks emitted, fix up the branches */ 2854 2855 assert(then_block); 2856 assert(else_block); 2857 2858 if (ctx->instruction_count == count_in) { 2859 /* The else block is empty, so don't emit an exit jump */ 2860 mir_remove_instruction(then_exit); 2861 then_branch->branch.target_block = after_else_idx; 2862 } else { 2863 then_branch->branch.target_block = else_idx; 2864 then_exit->branch.target_block = after_else_idx; 2865 } 2866 2867 /* Wire up the successors */ 2868 2869 ctx->after_block = create_empty_block(ctx); 2870 2871 pan_block_add_successor(&before_block->base, &then_block->base); 2872 pan_block_add_successor(&before_block->base, &else_block->base); 2873 2874 pan_block_add_successor(&end_then_block->base, &ctx->after_block->base); 2875 pan_block_add_successor(&end_else_block->base, &ctx->after_block->base); 2876} 2877 2878static void 2879emit_loop(struct compiler_context *ctx, nir_loop *nloop) 2880{ 2881 /* Remember where we are */ 2882 midgard_block *start_block = ctx->current_block; 2883 2884 /* Allocate a loop number, growing the current inner loop depth */ 2885 int loop_idx = ++ctx->current_loop_depth; 2886 2887 /* Get index from before the body so we can loop back later */ 2888 int start_idx = ctx->block_count; 2889 2890 /* Emit the body itself */ 2891 midgard_block *loop_block = emit_cf_list(ctx, &nloop->body); 2892 2893 /* Branch back to loop back */ 2894 struct midgard_instruction br_back = v_branch(false, false); 2895 br_back.branch.target_block = start_idx; 2896 emit_mir_instruction(ctx, br_back); 2897 2898 /* Mark down that branch in the graph. */ 2899 pan_block_add_successor(&start_block->base, &loop_block->base); 2900 pan_block_add_successor(&ctx->current_block->base, &loop_block->base); 2901 2902 /* Find the index of the block about to follow us (note: we don't add 2903 * one; blocks are 0-indexed so we get a fencepost problem) */ 2904 int break_block_idx = ctx->block_count; 2905 2906 /* Fix up the break statements we emitted to point to the right place, 2907 * now that we can allocate a block number for them */ 2908 ctx->after_block = create_empty_block(ctx); 2909 2910 mir_foreach_block_from(ctx, start_block, _block) { 2911 mir_foreach_instr_in_block(((midgard_block *) _block), ins) { 2912 if (ins->type != TAG_ALU_4) continue; 2913 if (!ins->compact_branch) continue; 2914 2915 /* We found a branch -- check the type to see if we need to do anything */ 2916 if (ins->branch.target_type != TARGET_BREAK) continue; 2917 2918 /* It's a break! Check if it's our break */ 2919 if (ins->branch.target_break != loop_idx) continue; 2920 2921 /* Okay, cool, we're breaking out of this loop. 
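 * (its target_break matched our loop index above).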
2922 * Rewrite from a break to a goto */ 2923 2924 ins->branch.target_type = TARGET_GOTO; 2925 ins->branch.target_block = break_block_idx; 2926 2927 pan_block_add_successor(_block, &ctx->after_block->base); 2928 } 2929 } 2930 2931 /* Now that we've finished emitting the loop, free up the depth again 2932 * so we play nice with recursion amid nested loops */ 2933 --ctx->current_loop_depth; 2934 2935 /* Dump loop stats */ 2936 ++ctx->loop_count; 2937} 2938 2939static midgard_block * 2940emit_cf_list(struct compiler_context *ctx, struct exec_list *list) 2941{ 2942 midgard_block *start_block = NULL; 2943 2944 foreach_list_typed(nir_cf_node, node, node, list) { 2945 switch (node->type) { 2946 case nir_cf_node_block: { 2947 midgard_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 2948 2949 if (!start_block) 2950 start_block = block; 2951 2952 break; 2953 } 2954 2955 case nir_cf_node_if: 2956 emit_if(ctx, nir_cf_node_as_if(node)); 2957 break; 2958 2959 case nir_cf_node_loop: 2960 emit_loop(ctx, nir_cf_node_as_loop(node)); 2961 break; 2962 2963 case nir_cf_node_function: 2964 assert(0); 2965 break; 2966 } 2967 } 2968 2969 return start_block; 2970} 2971 2972/* Due to lookahead, we need to report the first tag executed in the command 2973 * stream and in branch targets. An initial block might be empty, so iterate 2974 * until we find one that 'works' */ 2975 2976unsigned 2977midgard_get_first_tag_from_block(compiler_context *ctx, unsigned block_idx) 2978{ 2979 midgard_block *initial_block = mir_get_block(ctx, block_idx); 2980 2981 mir_foreach_block_from(ctx, initial_block, _v) { 2982 midgard_block *v = (midgard_block *) _v; 2983 if (v->quadword_count) { 2984 midgard_bundle *initial_bundle = 2985 util_dynarray_element(&v->bundles, midgard_bundle, 0); 2986 2987 return initial_bundle->tag; 2988 } 2989 } 2990 2991 /* Default to a tag 1 which will break from the shader, in case we jump 2992 * to the exit block (i.e. 
`return` in a compute shader) */ 2993 2994 return 1; 2995} 2996 2997/* For each fragment writeout instruction, generate a writeout loop to 2998 * associate with it */ 2999 3000static void 3001mir_add_writeout_loops(compiler_context *ctx) 3002{ 3003 for (unsigned rt = 0; rt < ARRAY_SIZE(ctx->writeout_branch); ++rt) { 3004 for (unsigned s = 0; s < MIDGARD_MAX_SAMPLE_ITER; ++s) { 3005 midgard_instruction *br = ctx->writeout_branch[rt][s]; 3006 if (!br) continue; 3007 3008 unsigned popped = br->branch.target_block; 3009 pan_block_add_successor(&(mir_get_block(ctx, popped - 1)->base), 3010 &ctx->current_block->base); 3011 br->branch.target_block = emit_fragment_epilogue(ctx, rt, s); 3012 br->branch.target_type = TARGET_GOTO; 3013 3014 /* If we have more RTs, we'll need to restore back after our 3015 * loop terminates */ 3016 midgard_instruction *next_br = NULL; 3017 3018 if ((s + 1) < MIDGARD_MAX_SAMPLE_ITER) 3019 next_br = ctx->writeout_branch[rt][s + 1]; 3020 3021 if (!next_br && (rt + 1) < ARRAY_SIZE(ctx->writeout_branch)) 3022 next_br = ctx->writeout_branch[rt + 1][0]; 3023 3024 if (next_br) { 3025 midgard_instruction uncond = v_branch(false, false); 3026 uncond.branch.target_block = popped; 3027 uncond.branch.target_type = TARGET_GOTO; 3028 emit_mir_instruction(ctx, uncond); 3029 pan_block_add_successor(&ctx->current_block->base, 3030 &(mir_get_block(ctx, popped)->base)); 3031 schedule_barrier(ctx); 3032 } else { 3033 /* We're last, so we can terminate here */ 3034 br->last_writeout = true; 3035 } 3036 } 3037 } 3038} 3039 3040void 3041midgard_compile_shader_nir(nir_shader *nir, 3042 const struct panfrost_compile_inputs *inputs, 3043 struct util_dynarray *binary, 3044 struct pan_shader_info *info) 3045{ 3046 midgard_debug = debug_get_option_midgard_debug(); 3047 3048 /* TODO: Bound against what? 
*/ 3049 compiler_context *ctx = rzalloc(NULL, compiler_context); 3050 ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx); 3051 3052 ctx->inputs = inputs; 3053 ctx->nir = nir; 3054 ctx->info = info; 3055 ctx->stage = nir->info.stage; 3056 3057 if (inputs->is_blend) { 3058 unsigned nr_samples = MAX2(inputs->blend.nr_samples, 1); 3059 const struct util_format_description *desc = 3060 util_format_description(inputs->rt_formats[inputs->blend.rt]); 3061 3062 /* We have to split writeout in 128 bit chunks */ 3063 ctx->blend_sample_iterations = 3064 DIV_ROUND_UP(desc->block.bits * nr_samples, 128); 3065 } 3066 ctx->blend_input = ~0; 3067 ctx->blend_src1 = ~0; 3068 ctx->quirks = midgard_get_quirks(inputs->gpu_id); 3069 3070 /* Initialize at a global (not block) level hash tables */ 3071 3072 ctx->ssa_constants = _mesa_hash_table_u64_create(ctx); 3073 3074 /* Lower gl_Position pre-optimisation, but after lowering vars to ssa 3075 * (so we don't accidentally duplicate the epilogue since mesa/st has 3076 * messed with our I/O quite a bit already) */ 3077 3078 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3079 3080 if (ctx->stage == MESA_SHADER_VERTEX) { 3081 NIR_PASS_V(nir, nir_lower_viewport_transform); 3082 NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0); 3083 } 3084 3085 NIR_PASS_V(nir, nir_lower_var_copies); 3086 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3087 NIR_PASS_V(nir, nir_split_var_copies); 3088 NIR_PASS_V(nir, nir_lower_var_copies); 3089 NIR_PASS_V(nir, nir_lower_global_vars_to_local); 3090 NIR_PASS_V(nir, nir_lower_var_copies); 3091 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 3092 3093 unsigned pan_quirks = panfrost_get_quirks(inputs->gpu_id, 0); 3094 NIR_PASS_V(nir, pan_lower_framebuffer, 3095 inputs->rt_formats, inputs->raw_fmt_mask, 3096 inputs->is_blend, pan_quirks); 3097 3098 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 3099 glsl_type_size, 0); 3100 NIR_PASS_V(nir, nir_lower_ssbo); 3101 NIR_PASS_V(nir, pan_nir_lower_zs_store); 3102 3103 NIR_PASS_V(nir, pan_nir_lower_64bit_intrin); 3104 3105 /* Optimisation passes */ 3106 3107 optimise_nir(nir, ctx->quirks, inputs->is_blend); 3108 3109 NIR_PASS_V(nir, pan_nir_reorder_writeout); 3110 3111 if ((midgard_debug & MIDGARD_DBG_SHADERS) && 3112 ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) { 3113 nir_print_shader(nir, stdout); 3114 } 3115 3116 info->tls_size = nir->scratch_size; 3117 3118 nir_foreach_function(func, nir) { 3119 if (!func->impl) 3120 continue; 3121 3122 list_inithead(&ctx->blocks); 3123 ctx->block_count = 0; 3124 ctx->func = func; 3125 ctx->already_emitted = calloc(BITSET_WORDS(func->impl->ssa_alloc), sizeof(BITSET_WORD)); 3126 3127 if (nir->info.outputs_read && !inputs->is_blend) { 3128 emit_block_init(ctx); 3129 3130 struct midgard_instruction wait = v_branch(false, false); 3131 wait.branch.target_type = TARGET_TILEBUF_WAIT; 3132 3133 emit_mir_instruction(ctx, wait); 3134 3135 ++ctx->instruction_count; 3136 } 3137 3138 emit_cf_list(ctx, &func->impl->body); 3139 free(ctx->already_emitted); 3140 break; /* TODO: Multi-function shaders */ 3141 } 3142 3143 /* Per-block lowering before opts */ 3144 3145 mir_foreach_block(ctx, _block) { 3146 midgard_block *block = (midgard_block *) _block; 3147 inline_alu_constants(ctx, block); 3148 embedded_to_inline_constant(ctx, block); 3149 } 3150 /* MIR-level optimizations */ 3151 3152 bool progress = false; 3153 3154 do { 3155 progress = false; 3156 progress |= midgard_opt_dead_code_eliminate(ctx); 3157 3158 mir_foreach_block(ctx, _block) { 3159 
midgard_block *block = (midgard_block *) _block; 3160 progress |= midgard_opt_copy_prop(ctx, block); 3161 progress |= midgard_opt_combine_projection(ctx, block); 3162 progress |= midgard_opt_varying_projection(ctx, block); 3163 } 3164 } while (progress); 3165 3166 mir_foreach_block(ctx, _block) { 3167 midgard_block *block = (midgard_block *) _block; 3168 midgard_lower_derivatives(ctx, block); 3169 midgard_legalize_invert(ctx, block); 3170 midgard_cull_dead_branch(ctx, block); 3171 } 3172 3173 if (ctx->stage == MESA_SHADER_FRAGMENT) 3174 mir_add_writeout_loops(ctx); 3175 3176 /* Analyze now that the code is known but before scheduling creates 3177 * pipeline registers which are harder to track */ 3178 mir_analyze_helper_requirements(ctx); 3179 3180 /* Schedule! */ 3181 midgard_schedule_program(ctx); 3182 mir_ra(ctx); 3183 3184 /* Analyze after scheduling since this is order-dependent */ 3185 mir_analyze_helper_terminate(ctx); 3186 3187 /* Emit flat binary from the instruction arrays. Iterate each block in 3188 * sequence. Save instruction boundaries such that lookahead tags can 3189 * be assigned easily */ 3190 3191 /* Cache _all_ bundles in source order for lookahead across failed branches */ 3192 3193 int bundle_count = 0; 3194 mir_foreach_block(ctx, _block) { 3195 midgard_block *block = (midgard_block *) _block; 3196 bundle_count += block->bundles.size / sizeof(midgard_bundle); 3197 } 3198 midgard_bundle **source_order_bundles = malloc(sizeof(midgard_bundle *) * bundle_count); 3199 int bundle_idx = 0; 3200 mir_foreach_block(ctx, _block) { 3201 midgard_block *block = (midgard_block *) _block; 3202 util_dynarray_foreach(&block->bundles, midgard_bundle, bundle) { 3203 source_order_bundles[bundle_idx++] = bundle; 3204 } 3205 } 3206 3207 int current_bundle = 0; 3208 3209 /* Midgard prefetches instruction types, so during emission we 3210 * need to look ahead. Unless this is the last instruction, in 3211 * which case we return 1. */ 3212 3213 mir_foreach_block(ctx, _block) { 3214 midgard_block *block = (midgard_block *) _block; 3215 mir_foreach_bundle_in_block(block, bundle) { 3216 int lookahead = 1; 3217 3218 if (!bundle->last_writeout && (current_bundle + 1 < bundle_count)) 3219 lookahead = source_order_bundles[current_bundle + 1]->tag; 3220 3221 emit_binary_bundle(ctx, block, bundle, binary, lookahead); 3222 ++current_bundle; 3223 } 3224 3225 /* TODO: Free deeper */ 3226 //util_dynarray_fini(&block->instructions); 3227 } 3228 3229 free(source_order_bundles); 3230 3231 /* Report the very first tag executed */ 3232 info->midgard.first_tag = midgard_get_first_tag_from_block(ctx, 0); 3233 3234 info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos); 3235 3236 if ((midgard_debug & MIDGARD_DBG_SHADERS) && 3237 ((midgard_debug & MIDGARD_DBG_INTERNAL) || !nir->info.internal)) { 3238 disassemble_midgard(stdout, binary->data, 3239 binary->size, inputs->gpu_id, 3240 midgard_debug & MIDGARD_DBG_VERBOSE); 3241 fflush(stdout); 3242 } 3243 3244 /* A shader ending on a 16MB boundary causes INSTR_INVALID_PC faults; 3245 * work around this by adding some padding to the end of the shader. (The 3246 * kernel makes sure shader BOs can't cross 16MB boundaries.)
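 * Sixteen bytes of zeroes (one quadword) are appended below whenever the binary is non-empty.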
*/ 3247 if (binary->size) 3248 memset(util_dynarray_grow(binary, uint8_t, 16), 0, 16); 3249 3250 if ((midgard_debug & MIDGARD_DBG_SHADERDB || inputs->shaderdb) && 3251 !nir->info.internal) { 3252 unsigned nr_bundles = 0, nr_ins = 0; 3253 3254 /* Count instructions and bundles */ 3255 3256 mir_foreach_block(ctx, _block) { 3257 midgard_block *block = (midgard_block *) _block; 3258 nr_bundles += util_dynarray_num_elements( 3259 &block->bundles, midgard_bundle); 3260 3261 mir_foreach_bundle_in_block(block, bun) 3262 nr_ins += bun->instruction_count; 3263 } 3264 3265 /* Calculate thread count. The hardware imposes cutoffs by work 3266 * register count: at most 4 registers runs 4 threads, at most 8 runs 2, anything more runs a single thread */ 3267 3268 unsigned nr_registers = info->work_reg_count; 3269 3270 unsigned nr_threads = 3271 (nr_registers <= 4) ? 4 : 3272 (nr_registers <= 8) ? 2 : 3273 1; 3274 3275 /* Dump stats */ 3276 3277 fprintf(stderr, "%s - %s shader: " 3278 "%u inst, %u bundles, %u quadwords, " 3279 "%u registers, %u threads, %u loops, " 3280 "%u:%u spills:fills\n", 3281 ctx->nir->info.label ?: "", 3282 ctx->inputs->is_blend ? "PAN_SHADER_BLEND" : 3283 gl_shader_stage_name(ctx->stage), 3284 nr_ins, nr_bundles, ctx->quadword_count, 3285 nr_registers, nr_threads, 3286 ctx->loop_count, 3287 ctx->spills, ctx->fills); 3288 } 3289 3290 _mesa_hash_table_u64_destroy(ctx->ssa_constants); 3291 _mesa_hash_table_u64_destroy(ctx->sysval_to_id); 3292 3293 ralloc_free(ctx); 3294} 3295