/*
 * Copyright (C) 2020 Collabora Ltd.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors (Collabora):
 *      Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
 */

#include "main/mtypes.h"
#include "compiler/glsl/glsl_to_nir.h"
#include "compiler/nir_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_debug.h"

#include "disassemble.h"
#include "bifrost_compile.h"
#include "compiler.h"
#include "bi_quirks.h"
#include "bi_builder.h"
#include "bifrost_nir.h"

static const struct debug_named_value bifrost_debug_options[] = {
        {"msgs",       BIFROST_DBG_MSGS,       "Print debug messages"},
        {"shaders",    BIFROST_DBG_SHADERS,    "Dump shaders in NIR and MIR"},
        {"shaderdb",   BIFROST_DBG_SHADERDB,   "Print statistics"},
        {"verbose",    BIFROST_DBG_VERBOSE,    "Disassemble verbosely"},
        {"internal",   BIFROST_DBG_INTERNAL,   "Dump even internal shaders"},
        {"nosched",    BIFROST_DBG_NOSCHED,    "Force trivial bundling"},
        {"inorder",    BIFROST_DBG_INORDER,    "Force in-order bundling"},
        {"novalidate", BIFROST_DBG_NOVALIDATE, "Skip IR validation"},
        {"noopt",      BIFROST_DBG_NOOPT,      "Skip optimization passes"},
        DEBUG_NAMED_VALUE_END
};

DEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0)

/* How many bytes are prefetched by the Bifrost shader core. From the final
 * clause of the shader, this range must be valid instructions or zero. */
#define BIFROST_SHADER_PREFETCH 128

int bifrost_debug = 0;

#define DBG(fmt, ...) \
        do { if (bifrost_debug & BIFROST_DBG_MSGS) \
                fprintf(stderr, "%s:%d: "fmt, \
                        __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0)

static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list);

static void
bi_block_add_successor(bi_block *block, bi_block *successor)
{
        assert(block != NULL && successor != NULL);

        /* Cull impossible edges */
        if (block->unconditional_jumps)
                return;

        for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) {
                if (block->successors[i]) {
                        if (block->successors[i] == successor)
                                return;
                        else
                                continue;
                }

                block->successors[i] = successor;
                _mesa_set_add(successor->predecessors, block);
                return;
        }

        unreachable("Too many successors");
}

static void
bi_emit_jump(bi_builder *b, nir_jump_instr *instr)
{
        bi_instr *branch = bi_jump(b, bi_zero());

        switch (instr->type) {
        case nir_jump_break:
                branch->branch_target = b->shader->break_block;
                break;
        case nir_jump_continue:
                branch->branch_target = b->shader->continue_block;
                break;
        default:
                unreachable("Unhandled jump type");
        }

        bi_block_add_successor(b->shader->current_block, branch->branch_target);
        b->shader->current_block->unconditional_jumps = true;
}

static bi_index
bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr)
{
        switch (intr->intrinsic) {
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
                return bi_register(61);

        /* Need to put the sample ID in the top 16-bits */
        case nir_intrinsic_load_barycentric_at_sample:
                return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false),
                                bi_half(bi_src_index(&intr->src[0]), false));

        /* Interpret as 8:8 signed fixed point positions in pixels along X and
         * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0)
         * is the center of the pixel so we first fixup and then convert. For
         * fp16 input:
         *
         * f2i16(((x, y) + (0.5, 0.5)) * 2**8) =
         * f2i16((256 * (x, y)) + (128, 128)) =
         * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128))
         *
         * For fp32 input, that lacks enough precision for MSAA 16x, but the
         * idea is the same.
         * FIXME: still doesn't pass
         */
        case nir_intrinsic_load_barycentric_at_offset: {
                bi_index offset = bi_src_index(&intr->src[0]);
                bi_index f16 = bi_null();
                unsigned sz = nir_src_bit_size(intr->src[0]);

                if (sz == 16) {
                        f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0),
                                        bi_imm_f16(128.0), BI_ROUND_NONE);
                } else {
                        assert(sz == 32);
                        bi_index f[2];
                        for (unsigned i = 0; i < 2; ++i) {
                                f[i] = bi_fadd_rscale_f32(b,
                                                bi_word(offset, i),
                                                bi_imm_f32(0.5), bi_imm_u32(8),
                                                BI_ROUND_NONE, BI_SPECIAL_NONE);
                        }

                        f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE);
                }

                return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ);
        }

        case nir_intrinsic_load_barycentric_pixel:
        default:
                return bi_dontcare();
        }
}

static enum bi_sample
bi_interp_for_intrinsic(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_load_barycentric_centroid:
                return BI_SAMPLE_CENTROID;
        case nir_intrinsic_load_barycentric_sample:
        case nir_intrinsic_load_barycentric_at_sample:
                return BI_SAMPLE_SAMPLE;
        case nir_intrinsic_load_barycentric_at_offset:
                return BI_SAMPLE_EXPLICIT;
        case nir_intrinsic_load_barycentric_pixel:
        default:
                return BI_SAMPLE_CENTER;
        }
}

/* auto, 64-bit omitted */
static enum bi_register_format
bi_reg_fmt_for_nir(nir_alu_type T)
{
        switch (T) {
        case nir_type_float16: return BI_REGISTER_FORMAT_F16;
        case nir_type_float32: return BI_REGISTER_FORMAT_F32;
        case nir_type_int16:   return BI_REGISTER_FORMAT_S16;
        case nir_type_uint16:  return BI_REGISTER_FORMAT_U16;
        case nir_type_int32:   return BI_REGISTER_FORMAT_S32;
        case nir_type_uint32:  return BI_REGISTER_FORMAT_U32;
        default: unreachable("Invalid type for register format");
        }
}

/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the
 * immediate to be used (which applies even if _IMM can't be used) */

static bool
bi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max)
{
        nir_src *offset = nir_get_io_offset_src(instr);

        if (!nir_src_is_const(*offset))
                return false;

        *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset);
        return (*immediate) < max;
}

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize);

/* Bifrost's load instructions lack a component offset despite operating in
 * terms of vec4 slots. Usually I/O vectorization avoids nonzero components,
 * but they may be unavoidable with separate shaders in use. To solve this, we
 * lower to a larger load and an explicit copy of the desired components.
 */

static void
bi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp)
{
        unsigned component = nir_intrinsic_component(instr);

        if (component == 0)
                return;

        bi_index srcs[] = { tmp, tmp, tmp, tmp };
        unsigned channels[] = { component, component + 1, component + 2 };

        bi_make_vec_to(b,
                        bi_dest_index(&instr->dest),
                        srcs, channels, instr->num_components,
                        nir_dest_bit_size(instr->dest));
}

static void
bi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_alu_type T = nir_intrinsic_dest_type(instr);
        enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        unsigned imm_index = 0;
        unsigned base = nir_intrinsic_base(instr);
        bool constant = nir_src_is_const(*offset);
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        if (immediate) {
                bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62),
                                regfmt, vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(&instr->src[0]);

                if (constant)
                        idx = bi_imm_u32(imm_index);
                else if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                bi_ld_attr_to(b, dest, bi_register(61), bi_register(62),
                                idx, regfmt, vecsize);
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum bi_sample sample = BI_SAMPLE_CENTER;
        enum bi_update update = BI_UPDATE_STORE;
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;
        bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input;
        bi_index src0 = bi_null();

        unsigned component = nir_intrinsic_component(instr);
        enum bi_vecsize vecsize = (instr->num_components + component - 1);
        bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader);

        unsigned sz = nir_dest_bit_size(instr->dest);

        if (smooth) {
                nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);
                assert(parent);

                sample = bi_interp_for_intrinsic(parent->intrinsic);
                src0 = bi_varying_src0_for_barycentric(b, parent);

                assert(sz == 16 || sz == 32);
                regfmt = (sz == 16) ?
                        BI_REGISTER_FORMAT_F16 : BI_REGISTER_FORMAT_F32;
        } else {
                assert(sz == 32);
                regfmt = BI_REGISTER_FORMAT_U32;
        }

        nir_src *offset = nir_get_io_offset_src(instr);
        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 20);

        if (immediate && smooth) {
                bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update,
                                vecsize, imm_index);
        } else if (immediate && !smooth) {
                bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt,
                                vecsize, imm_index);
        } else {
                bi_index idx = bi_src_index(offset);
                unsigned base = nir_intrinsic_base(instr);

                if (base != 0)
                        idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false);

                if (smooth) {
                        bi_ld_var_to(b, dest, src0, idx, regfmt, sample,
                                        update, vecsize);
                } else {
                        bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE,
                                        regfmt, vecsize);
                }
        }

        bi_copy_component(b, instr, dest);
}

static void
bi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src,
                unsigned *channel, unsigned count)
{
        for (unsigned i = 0; i < count; i += 2) {
                bool next = (i + 1) < count;

                unsigned chan = channel ? channel[i] : 0;
                unsigned nextc = next && channel ? channel[i + 1] : 0;

                bi_index w0 = bi_word(src[i], chan >> 1);
                bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero();

                bi_index h0 = bi_half(w0, chan & 1);
                bi_index h1 = bi_half(w1, nextc & 1);

                bi_index to = bi_word(dst, i >> 1);

                if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1))
                        bi_mov_i32_to(b, to, w0);
                else if (bi_is_word_equiv(w0, w1))
                        bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1));
                else
                        bi_mkvec_v2i16_to(b, to, h0, h1);
        }
}

static void
bi_make_vec_to(bi_builder *b, bi_index final_dst,
                bi_index *src,
                unsigned *channel,
                unsigned count,
                unsigned bitsize)
{
        /* If we read our own output, we need a temporary move to allow for
         * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit
         * vectors */
        bool reads_self = false;

        for (unsigned i = 0; i < count; ++i)
                reads_self |= bi_is_equiv(final_dst, src[i]);

        /* SSA can't read itself */
        assert(!reads_self || final_dst.reg);

        bi_index dst = reads_self ? bi_temp(b->shader) : final_dst;

        if (bitsize == 32) {
                for (unsigned i = 0; i < count; ++i) {
                        bi_mov_i32_to(b, bi_word(dst, i),
                                        bi_word(src[i], channel ? channel[i] : 0));
                }
        } else if (bitsize == 16) {
                bi_make_vec16_to(b, dst, src, channel, count);
        } else if (bitsize == 8 && count == 1) {
                bi_swz_v4i8_to(b, dst, bi_byte(
                                bi_word(src[0], channel[0] >> 2),
                                channel[0] & 3));
        } else {
                unreachable("8-bit mkvec not yet supported");
        }

        /* Emit an explicit copy if needed */
        if (!bi_is_equiv(dst, final_dst)) {
                unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ?
                        1 : 0;
                unsigned vec = (1 << shift);

                for (unsigned i = 0; i < count; i += vec) {
                        bi_mov_i32_to(b, bi_word(final_dst, i >> shift),
                                        bi_word(dst, i >> shift));
                }
        }
}

static bi_instr *
bi_load_sysval_to(bi_builder *b, bi_index dest, int sysval,
                unsigned nr_components, unsigned offset)
{
        unsigned sysval_ubo =
                MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos);
        unsigned uniform =
                pan_lookup_sysval(b->shader->sysval_to_id,
                                &b->shader->info->sysvals,
                                sysval);
        unsigned idx = (uniform * 16) + offset;

        return bi_load_to(b, nr_components * 32, dest,
                        bi_imm_u32(idx),
                        bi_imm_u32(sysval_ubo), BI_SEG_UBO);
}

static void
bi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr,
                unsigned nr_components, unsigned offset)
{
        bi_load_sysval_to(b, bi_dest_index(&intr->dest),
                        panfrost_sysval_for_instr(&intr->instr, NULL),
                        nr_components, offset);
}

static bi_index
bi_load_sysval(bi_builder *b, int sysval,
                unsigned nr_components, unsigned offset)
{
        bi_index tmp = bi_temp(b->shader);
        bi_load_sysval_to(b, tmp, sysval, nr_components, offset);
        return tmp;
}

static void
bi_load_sample_id_to(bi_builder *b, bi_index dst)
{
        /* r61[16:23] contains the sampleID, mask it out. Upper bits
         * seem to read garbage (despite being architecturally defined
         * as zero), so use a 5-bit mask instead of 8-bits */

        bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f),
                        bi_imm_u8(16));
}

static bi_index
bi_load_sample_id(bi_builder *b)
{
        bi_index sample_id = bi_temp(b->shader);
        bi_load_sample_id_to(b, sample_id);
        return sample_id;
}

static bi_index
bi_pixel_indices(bi_builder *b, unsigned rt)
{
        /* We want to load the current pixel. */
        struct bifrost_pixel_indices pix = {
                .y = BIFROST_CURRENT_PIXEL,
                .rt = rt
        };

        uint32_t indices_u32 = 0;
        memcpy(&indices_u32, &pix, sizeof(indices_u32));
        bi_index indices = bi_imm_u32(indices_u32);

        /* Sample index above is left as zero. For multisampling, we need to
         * fill in the actual sample ID in the lower byte */

        if (b->shader->inputs->blend.nr_samples > 1)
                indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false);

        return indices;
}

static void
bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr)
{
        ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr);

        /* Source color is passed through r0-r3, or r4-r7 for the second
         * source when dual-source blending. TODO: Precolour instead */
        bi_index srcs[] = {
                bi_register(0), bi_register(1), bi_register(2), bi_register(3)
        };
        bi_index srcs2[] = {
                bi_register(4), bi_register(5), bi_register(6), bi_register(7)
        };

        bool second_source = (sem.location == VARYING_SLOT_VAR0);

        bi_make_vec_to(b, bi_dest_index(&instr->dest),
                        second_source ? srcs2 : srcs,
                        NULL, 4, 32);
}

static void
bi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt)
{
        /* Reads 2 or 4 staging registers to cover the input */
        unsigned size = nir_alu_type_get_type_size(T);
        unsigned sr_count = (size <= 16) ?
                2 : 4;
        const struct panfrost_compile_inputs *inputs = b->shader->inputs;
        uint64_t blend_desc = inputs->blend.bifrost_blend_desc;

        if (inputs->is_blend && inputs->blend.nr_samples > 1) {
                /* Conversion descriptor comes from the compile inputs, pixel
                 * indices derived at run time based on sample ID */
                bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60),
                                bi_imm_u32(blend_desc >> 32), BI_VECSIZE_V4);
        } else if (b->shader->inputs->is_blend) {
                /* Blend descriptor comes from the compile inputs */
                /* Put the result in r0 */
                bi_blend_to(b, bi_register(0), rgba,
                                bi_register(60),
                                bi_imm_u32(blend_desc & 0xffffffff),
                                bi_imm_u32(blend_desc >> 32), sr_count);
        } else {
                /* Blend descriptor comes from the FAU RAM. By convention, the
                 * return address is stored in r48 and will be used by the
                 * blend shader to jump back to the fragment shader after */
                bi_blend_to(b, bi_register(48), rgba,
                                bi_register(60),
                                bi_fau(BIR_FAU_BLEND_0 + rt, false),
                                bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count);
        }

        assert(rt < 8);
        b->shader->info->bifrost.blend[rt].type = T;
}

/* Blend shaders do not need to run ATEST since they are dependent on a
 * fragment shader that runs it. Blit shaders may not need to run ATEST, since
 * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and
 * there are no writes to the coverage mask. The latter two are satisfied for
 * all blit shaders, so we just care about early-z, which blit shaders force
 * iff they do not write depth or stencil */

static bool
bi_skip_atest(bi_context *ctx, bool emit_zs)
{
        return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend;
}

static void
bi_emit_atest(bi_builder *b, bi_index alpha)
{
        bi_index coverage = bi_register(60);
        bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha);
        b->shader->emitted_atest = true;

        /* Pseudo-source to encode in the tuple */
        atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false);
}

static void
bi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr)
{
        bool combined = instr->intrinsic ==
                nir_intrinsic_store_combined_output_pan;

        unsigned writeout = combined ? nir_intrinsic_component(instr) :
                PAN_WRITEOUT_C;

        bool emit_blend = writeout & (PAN_WRITEOUT_C);
        bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S);

        const nir_variable *var =
                nir_find_variable_with_driver_location(b->shader->nir,
                                nir_var_shader_out, nir_intrinsic_base(instr));
        assert(var);

        unsigned loc = var->data.location;
        bi_index src0 = bi_src_index(&instr->src[0]);

        /* By ISA convention, the coverage mask is stored in R60. The store
         * itself will be handled by a subsequent ATEST instruction */
        if (loc == FRAG_RESULT_SAMPLE_MASK) {
                bi_index orig = bi_register(60);
                bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0);
                bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0));
                bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO);
                return;
        }

        /* Dual-source blending is implemented by putting the color in
         * registers r4-r7.
         */
        if (var->data.index) {
                unsigned count = nir_src_num_components(instr->src[0]);

                for (unsigned i = 0; i < count; ++i)
                        bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i));

                b->shader->info->bifrost.blend_src1_type =
                        nir_intrinsic_src_type(instr);

                return;
        }

        /* Emit ATEST if we have to, note ATEST requires a floating-point alpha
         * value, but render target #0 might not be floating point. However the
         * alpha value is only used for alpha-to-coverage, a stage which is
         * skipped for pure integer framebuffers, so the issue is moot. */

        if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) {
                nir_alu_type T = nir_intrinsic_src_type(instr);

                bi_index rgba = bi_src_index(&instr->src[0]);
                bi_index alpha =
                        (T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) :
                        (T == nir_type_float32) ? bi_word(rgba, 3) :
                        bi_dontcare();

                /* Don't read out-of-bounds */
                if (nir_src_num_components(instr->src[0]) < 4)
                        alpha = bi_imm_f32(1.0);

                bi_emit_atest(b, alpha);
        }

        if (emit_zs) {
                bi_index z = { 0 }, s = { 0 };

                if (writeout & PAN_WRITEOUT_Z)
                        z = bi_src_index(&instr->src[2]);

                if (writeout & PAN_WRITEOUT_S)
                        s = bi_src_index(&instr->src[3]);

                bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60),
                                writeout & PAN_WRITEOUT_S,
                                writeout & PAN_WRITEOUT_Z);
        }

        if (emit_blend) {
                assert(loc >= FRAG_RESULT_DATA0);

                unsigned rt = (loc - FRAG_RESULT_DATA0);
                bi_index color = bi_src_index(&instr->src[0]);

                /* Explicit copy since BLEND inputs are precoloured to R0-R3,
                 * TODO: maybe schedule around this or implement in RA as a
                 * spill */
                bool has_mrt = false;

                nir_foreach_shader_out_variable(var, b->shader->nir)
                        has_mrt |= (var->data.location > FRAG_RESULT_DATA0);

                if (has_mrt) {
                        bi_index srcs[4] = { color, color, color, color };
                        unsigned channels[4] = { 0, 1, 2, 3 };
                        color = bi_temp(b->shader);
                        bi_make_vec_to(b, color, srcs, channels,
                                        nir_src_num_components(instr->src[0]),
                                        nir_alu_type_get_type_size(nir_intrinsic_src_type(instr)));
                }

                bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt);
        }

        if (b->shader->inputs->is_blend) {
                /* Jump back to the fragment shader, return address is stored
                 * in r48 (see above).
                 */
                bi_jump(b, bi_register(48));
        }
}

static void
bi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr)
{
        /* In principle we can do better for 16-bit. At the moment we require
         * 32-bit to permit the use of .auto, in order to force .u32 for flat
         * varyings, to handle internal TGSI shaders that set flat in the VS
         * but smooth in the FS */

        ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr);
        assert(nir_alu_type_get_type_size(T) == 32);
        enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO;

        unsigned imm_index = 0;
        bool immediate = bi_is_intr_immediate(instr, &imm_index, 16);

        bi_index address;
        if (immediate) {
                address = bi_lea_attr_imm(b,
                                bi_register(61), bi_register(62),
                                regfmt, imm_index);
        } else {
                bi_index idx =
                        bi_iadd_u32(b,
                                        bi_src_index(nir_get_io_offset_src(instr)),
                                        bi_imm_u32(nir_intrinsic_base(instr)),
                                        false);
                address = bi_lea_attr(b,
                                bi_register(61), bi_register(62),
                                idx, regfmt);
        }

        /* Only look at the total components needed.
         * In effect, we fill in all the intermediate "holes" in the write
         * mask, since we can't mask off stores. Since
         * nir_lower_io_to_temporaries ensures each varying is written at most
         * once, anything that's masked out is undefined, so it doesn't matter
         * what we write there. So we may as well do the simplest thing
         * possible. */
        unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr));
        assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0));

        bi_st_cvt(b, bi_src_index(&instr->src[0]), address,
                        bi_word(address, 1), bi_word(address, 2),
                        regfmt, nr - 1);
}

static void
bi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_src *offset = nir_get_io_offset_src(instr);

        bool offset_is_const = nir_src_is_const(*offset);
        bi_index dyn_offset = bi_src_index(offset);
        uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0;
        bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);

        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
                        bi_dest_index(&instr->dest), offset_is_const ?
                        bi_imm_u32(const_offset) : dyn_offset,
                        kernel_input ? bi_zero() : bi_src_index(&instr->src[0]),
                        BI_SEG_UBO);
}

static bi_index
bi_addr_high(nir_src *src)
{
        return (nir_src_bit_size(*src) == 64) ?
                bi_word(bi_src_index(src), 1) : bi_zero();
}

static void
bi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
        bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest),
                        bi_dest_index(&instr->dest),
                        bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]),
                        seg);
}

static void
bi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg)
{
        /* Require contiguous masks, guaranteed by nir_lower_wrmasks */
        assert(nir_intrinsic_write_mask(instr) ==
                        BITFIELD_MASK(instr->num_components));

        bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]),
                        bi_src_index(&instr->src[0]),
                        bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]),
                        seg);
}

/* Exchanges the staging register with memory */

static void
bi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg)
{
        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

        unsigned sz = nir_src_bit_size(*arg);
        assert(sz == 32 || sz == 64);

        bi_index data = bi_src_index(arg);

        bi_index data_words[] = {
                bi_word(data, 0),
                bi_word(data, 1),
        };

        bi_index inout = bi_temp_reg(b->shader);
        bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32);

        bi_axchg_to(b, sz, inout, inout,
                        bi_word(addr, 0),
                        (seg == BI_SEG_NONE) ?
                                bi_word(addr, 1) : bi_zero(),
                        seg);

        bi_index inout_words[] = {
                bi_word(inout, 0),
                bi_word(inout, 1),
        };

        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}

/* Exchanges the second staging register with memory if comparison with first
 * staging register passes */

static void
bi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg)
{
        assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS);

        /* hardware is swapped from NIR */
        bi_index src0 = bi_src_index(arg_2);
        bi_index src1 = bi_src_index(arg_1);

        unsigned sz = nir_src_bit_size(*arg_1);
        assert(sz == 32 || sz == 64);

        bi_index data_words[] = {
                bi_word(src0, 0),
                sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1),

                /* 64-bit */
                bi_word(src1, 0),
                bi_word(src1, 1),
        };

        bi_index inout = bi_temp_reg(b->shader);
        bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32);

        bi_acmpxchg_to(b, sz, inout, inout,
                        bi_word(addr, 0),
                        (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(),
                        seg);

        bi_index inout_words[] = {
                bi_word(inout, 0),
                bi_word(inout, 1),
        };

        bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32);
}

/* Extracts an atomic opcode */

static enum bi_atom_opc
bi_atom_opc_for_nir(nir_intrinsic_op op)
{
        switch (op) {
        case nir_intrinsic_global_atomic_add:
        case nir_intrinsic_shared_atomic_add:
        case nir_intrinsic_image_atomic_add:
                return BI_ATOM_OPC_AADD;

        case nir_intrinsic_global_atomic_imin:
        case nir_intrinsic_shared_atomic_imin:
        case nir_intrinsic_image_atomic_imin:
                return BI_ATOM_OPC_ASMIN;

        case nir_intrinsic_global_atomic_umin:
        case nir_intrinsic_shared_atomic_umin:
        case nir_intrinsic_image_atomic_umin:
                return BI_ATOM_OPC_AUMIN;

        case nir_intrinsic_global_atomic_imax:
        case nir_intrinsic_shared_atomic_imax:
        case nir_intrinsic_image_atomic_imax:
                return BI_ATOM_OPC_ASMAX;

        case nir_intrinsic_global_atomic_umax:
        case nir_intrinsic_shared_atomic_umax:
        case nir_intrinsic_image_atomic_umax:
                return BI_ATOM_OPC_AUMAX;

        case nir_intrinsic_global_atomic_and:
        case nir_intrinsic_shared_atomic_and:
        case nir_intrinsic_image_atomic_and:
                return BI_ATOM_OPC_AAND;

        case nir_intrinsic_global_atomic_or:
        case nir_intrinsic_shared_atomic_or:
        case nir_intrinsic_image_atomic_or:
                return BI_ATOM_OPC_AOR;

        case nir_intrinsic_global_atomic_xor:
        case nir_intrinsic_shared_atomic_xor:
        case nir_intrinsic_image_atomic_xor:
                return BI_ATOM_OPC_AXOR;

        default:
                unreachable("Unexpected computational atomic");
        }
}

/* Optimized unary atomics are available with an implied #1 argument */

static bool
bi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out)
{
        /* Check we have a compatible constant */
        if (arg.type != BI_INDEX_CONSTANT)
                return false;

        if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD)))
                return false;

        /* Check for a compatible operation */
        switch (op) {
        case BI_ATOM_OPC_AADD:
                *out = (arg.value == 1) ?
                        BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC;
                return true;
        case BI_ATOM_OPC_ASMAX:
                *out = BI_ATOM_OPC_ASMAX1;
                return true;
        case BI_ATOM_OPC_AUMAX:
                *out = BI_ATOM_OPC_AUMAX1;
                return true;
        case BI_ATOM_OPC_AOR:
                *out = BI_ATOM_OPC_AOR1;
                return true;
        default:
                return false;
        }
}

/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */

static bi_index
bi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx,
                unsigned coord_comps, bool is_array)
{
        assert(coord_comps > 0 && coord_comps <= 3);

        if (src_idx == 0) {
                if (coord_comps == 1 || (coord_comps == 2 && is_array))
                        return bi_word(coord, 0);
                else
                        return bi_mkvec_v2i16(b,
                                        bi_half(bi_word(coord, 0), false),
                                        bi_half(bi_word(coord, 1), false));
        } else {
                if (coord_comps == 3)
                        return bi_word(coord, 2);
                else if (coord_comps == 2 && is_array)
                        return bi_word(coord, 1);
                else
                        return bi_zero();
        }
}

static bi_index
bi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr)
{
        nir_src src = instr->src[0];
        bi_index index = bi_src_index(&src);
        bi_context *ctx = b->shader;

        /* Images come after vertex attributes, so handle an explicit offset */
        unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ?
                util_bitcount64(ctx->nir->info.inputs_read) : 0;

        if (offset == 0)
                return index;
        else if (nir_src_is_const(src))
                return bi_imm_u32(nir_src_as_uint(src) + offset);
        else
                return bi_iadd_u32(b, index, bi_imm_u32(offset), false);
}

static void
bi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);
        bool array = nir_intrinsic_image_array(instr);
        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);

        bi_index coords = bi_src_index(&instr->src[1]);
        /* TODO: MSAA */
        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");

        bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest),
                        bi_emit_image_coord(b, coords, 0, coord_comps, array),
                        bi_emit_image_coord(b, coords, 1, coord_comps, array),
                        bi_emit_image_index(b, instr),
                        bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)),
                        instr->num_components - 1);
}

static bi_index
bi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr)
{
        enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
        bool array = nir_intrinsic_image_array(instr);
        ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim);
        unsigned coord_comps = nir_image_intrinsic_coord_components(instr);

        /* TODO: MSAA */
        assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported");

        enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ?
                bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) :
                BI_REGISTER_FORMAT_AUTO;

        bi_index coords = bi_src_index(&instr->src[1]);
        bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array);
        bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array);

        bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw,
                        bi_emit_image_index(b, instr), type);

        /* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI
         * has all images in the primary attribute table */
        I->table = BI_TABLE_ATTRIBUTE_1;

        return I->dest[0];
}

static void
bi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index addr = bi_emit_lea_image(b, instr);

        bi_st_cvt(b, bi_src_index(&instr->src[3]),
                        addr, bi_word(addr, 1), bi_word(addr, 2),
                        bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)),
                        instr->num_components - 1);
}

static void
bi_emit_atomic_i32_to(bi_builder *b, bi_index dst,
                bi_index addr, bi_index arg, nir_intrinsic_op intrinsic)
{
        /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't
         * take any vector but can still output in RETURN mode */
        bi_index sr = bi_temp_reg(b->shader);

        enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic);
        enum bi_atom_opc post_opc = opc;

        /* Generate either ATOM_C or ATOM_C1 as required */
        if (bi_promote_atom_c1(opc, arg, &opc)) {
                bi_patom_c1_i32_to(b, sr, bi_word(addr, 0),
                                bi_word(addr, 1), opc, 2);
        } else {
                bi_mov_i32_to(b, sr, arg);
                bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0),
                                bi_word(addr, 1), opc, 2);
        }

        /* Post-process it */
        bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc);
}

/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5
 * gl_FragCoord.z = ld_vary(fragz)
 * gl_FragCoord.w = ld_vary(fragw)
 */

static void
bi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index src[4] = {};

        for (unsigned i = 0; i < 2; ++i) {
                src[i] = bi_fadd_f32(b,
                                bi_u16_to_f32(b, bi_half(bi_register(59), i)),
                                bi_imm_f32(0.5f), BI_ROUND_NONE);
        }

        for (unsigned i = 0; i < 2; ++i) {
                src[2 + i] = bi_ld_var_special(b, bi_zero(),
                                BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER,
                                BI_UPDATE_CLOBBER,
                                (i == 0) ? BI_VARYING_NAME_FRAG_Z :
                                        BI_VARYING_NAME_FRAG_W,
                                BI_VECSIZE_NONE);
        }

        bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32);
}

static void
bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
{
        unsigned rt = b->shader->inputs->blend.rt;
        unsigned size = nir_dest_bit_size(instr->dest);

        /* Get the render target */
        if (!b->shader->inputs->is_blend) {
                const nir_variable *var =
                        nir_find_variable_with_driver_location(b->shader->nir,
                                        nir_var_shader_out, nir_intrinsic_base(instr));
                unsigned loc = var->data.location;
                assert(loc >= FRAG_RESULT_DATA0);
                rt = (loc - FRAG_RESULT_DATA0);
        }

        bi_index desc = b->shader->inputs->is_blend ?
                bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) :
                b->shader->inputs->bifrost.static_rt_conv ?
                bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) :
                bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0);

        bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_pixel_indices(b, rt),
                        bi_register(60), desc, (instr->num_components - 1));
}

static void
bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
{
        bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
                bi_dest_index(&instr->dest) : bi_null();
        gl_shader_stage stage = b->shader->stage;

        switch (instr->intrinsic) {
        case nir_intrinsic_load_barycentric_pixel:
        case nir_intrinsic_load_barycentric_centroid:
        case nir_intrinsic_load_barycentric_sample:
        case nir_intrinsic_load_barycentric_at_sample:
        case nir_intrinsic_load_barycentric_at_offset:
                /* handled later via load_vary */
                break;
        case nir_intrinsic_load_interpolated_input:
        case nir_intrinsic_load_input:
                if (b->shader->inputs->is_blend)
                        bi_emit_load_blend_input(b, instr);
                else if (stage == MESA_SHADER_FRAGMENT)
                        bi_emit_load_vary(b, instr);
                else if (stage == MESA_SHADER_VERTEX)
                        bi_emit_load_attr(b, instr);
                else
                        unreachable("Unsupported shader stage");
                break;

        case nir_intrinsic_store_output:
                if (stage == MESA_SHADER_FRAGMENT)
                        bi_emit_fragment_out(b, instr);
                else if (stage == MESA_SHADER_VERTEX)
                        bi_emit_store_vary(b, instr);
                else
                        unreachable("Unsupported shader stage");
                break;

        case nir_intrinsic_store_combined_output_pan:
                assert(stage == MESA_SHADER_FRAGMENT);
                bi_emit_fragment_out(b, instr);
                break;

        case nir_intrinsic_load_ubo:
        case nir_intrinsic_load_kernel_input:
                bi_emit_load_ubo(b, instr);
                break;

        case nir_intrinsic_load_global:
        case nir_intrinsic_load_global_constant:
                bi_emit_load(b, instr, BI_SEG_NONE);
                break;

        case nir_intrinsic_store_global:
                bi_emit_store(b, instr, BI_SEG_NONE);
                break;

        case nir_intrinsic_load_scratch:
                bi_emit_load(b, instr, BI_SEG_TL);
                break;

        case nir_intrinsic_store_scratch:
                bi_emit_store(b, instr, BI_SEG_TL);
                break;

        case nir_intrinsic_load_shared:
                bi_emit_load(b, instr, BI_SEG_WLS);
                break;

        case nir_intrinsic_store_shared:
                bi_emit_store(b, instr, BI_SEG_WLS);
                break;

        /* Blob doesn't seem to do anything for memory barriers, note +BARRIER
         * is illegal in fragment shaders */
        case nir_intrinsic_memory_barrier:
        case nir_intrinsic_memory_barrier_buffer:
        case nir_intrinsic_memory_barrier_image:
        case nir_intrinsic_memory_barrier_shared:
        case nir_intrinsic_group_memory_barrier:
                break;

        case nir_intrinsic_control_barrier:
                assert(b->shader->stage != MESA_SHADER_FRAGMENT);
                bi_barrier(b);
                break;

        case nir_intrinsic_shared_atomic_add:
        case nir_intrinsic_shared_atomic_imin:
        case nir_intrinsic_shared_atomic_umin:
        case nir_intrinsic_shared_atomic_imax:
        case nir_intrinsic_shared_atomic_umax:
        case nir_intrinsic_shared_atomic_and:
        case nir_intrinsic_shared_atomic_or:
        case nir_intrinsic_shared_atomic_xor: {
                assert(nir_src_bit_size(instr->src[1]) == 32);

                bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]),
                                bi_zero(), false, BI_SEG_WLS);

                bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]),
                                instr->intrinsic);
                break;
        }

        case nir_intrinsic_image_atomic_add:
        case nir_intrinsic_image_atomic_imin:
        case nir_intrinsic_image_atomic_umin:
        case nir_intrinsic_image_atomic_imax:
        case nir_intrinsic_image_atomic_umax:
        case nir_intrinsic_image_atomic_and:
        case nir_intrinsic_image_atomic_or:
        case nir_intrinsic_image_atomic_xor:
                assert(nir_src_bit_size(instr->src[3]) == 32);

                bi_emit_atomic_i32_to(b, dst,
                                bi_emit_lea_image(b, instr),
                                bi_src_index(&instr->src[3]),
                                instr->intrinsic);
                break;

        case nir_intrinsic_global_atomic_add:
        case nir_intrinsic_global_atomic_imin:
        case nir_intrinsic_global_atomic_umin:
        case nir_intrinsic_global_atomic_imax:
        case nir_intrinsic_global_atomic_umax:
        case nir_intrinsic_global_atomic_and:
        case nir_intrinsic_global_atomic_or:
        case nir_intrinsic_global_atomic_xor:
                assert(nir_src_bit_size(instr->src[1]) == 32);

                bi_emit_atomic_i32_to(b, dst,
                                bi_src_index(&instr->src[0]),
                                bi_src_index(&instr->src[1]),
                                instr->intrinsic);
                break;

        case nir_intrinsic_image_load:
                bi_emit_image_load(b, instr);
                break;

        case nir_intrinsic_image_store:
                bi_emit_image_store(b, instr);
                break;

        case nir_intrinsic_global_atomic_exchange:
                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
                                &instr->src[1], BI_SEG_NONE);
                break;

        case nir_intrinsic_image_atomic_exchange:
                bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr),
                                &instr->src[3], BI_SEG_NONE);
                break;

        case nir_intrinsic_shared_atomic_exchange:
                bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]),
                                &instr->src[1], BI_SEG_WLS);
                break;

        case nir_intrinsic_global_atomic_comp_swap:
                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
                                &instr->src[1], &instr->src[2], BI_SEG_NONE);
                break;

        case nir_intrinsic_image_atomic_comp_swap:
                bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr),
                                &instr->src[3], &instr->src[4], BI_SEG_NONE);
                break;

        case nir_intrinsic_shared_atomic_comp_swap:
                bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]),
                                &instr->src[1], &instr->src[2], BI_SEG_WLS);
                break;

        case nir_intrinsic_load_frag_coord:
                bi_emit_load_frag_coord(b, instr);
                break;

        case nir_intrinsic_load_output:
                bi_emit_ld_tile(b, instr);
                break;

        case nir_intrinsic_discard_if: {
                bi_index src = bi_src_index(&instr->src[0]);
                assert(nir_src_bit_size(instr->src[0]) == 1);
                bi_discard_b32(b, bi_half(src, false));
                break;
        }

        case nir_intrinsic_discard:
                bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ);
                break;

        case nir_intrinsic_load_ssbo_address:
                bi_load_sysval_nir(b, instr, 2, 0);
                break;

        case nir_intrinsic_load_work_dim:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_load_first_vertex:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_load_base_vertex:
                bi_load_sysval_nir(b, instr, 1, 4);
                break;

        case nir_intrinsic_load_base_instance:
                bi_load_sysval_nir(b, instr, 1, 8);
                break;

        case nir_intrinsic_load_draw_id:
                bi_load_sysval_nir(b, instr, 1, 0);
                break;

        case nir_intrinsic_get_ssbo_size:
                bi_load_sysval_nir(b, instr, 1, 8);
                break;

        case nir_intrinsic_load_viewport_scale:
        case nir_intrinsic_load_viewport_offset:
        case nir_intrinsic_load_num_workgroups:
        case nir_intrinsic_load_workgroup_size:
                bi_load_sysval_nir(b, instr, 3, 0);
                break;

        case nir_intrinsic_image_size:
                bi_load_sysval_nir(b, instr,
                                nir_dest_num_components(instr->dest), 0);
                break;

        case nir_intrinsic_load_blend_const_color_rgba:
                bi_load_sysval_nir(b, instr,
                                nir_dest_num_components(instr->dest), 0);
                break;

        case nir_intrinsic_load_sample_positions_pan:
                bi_mov_i32_to(b, bi_word(dst, 0),
                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false));
                bi_mov_i32_to(b, bi_word(dst, 1),
                                bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true));
                break;

        case nir_intrinsic_load_sample_mask_in:
                /* r61[0:15] contains the coverage bitmap */
                bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false));
                break;

        case nir_intrinsic_load_sample_id:
                bi_load_sample_id_to(b, dst);
                break;

        case nir_intrinsic_load_front_face:
                /* r58 == 0 means primitive is front facing */
                bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ,
                                BI_RESULT_TYPE_M1);
                break;

        case nir_intrinsic_load_point_coord:
                bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32,
                                BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER,
                                BI_VARYING_NAME_POINT, BI_VECSIZE_V2);
                break;

        case nir_intrinsic_load_vertex_id_zero_base:
                bi_mov_i32_to(b, dst, bi_register(61));
                break;

        case nir_intrinsic_load_instance_id:
                bi_mov_i32_to(b, dst, bi_register(62));
                break;

        case nir_intrinsic_load_subgroup_invocation:
                bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false));
                break;

        case nir_intrinsic_load_local_invocation_id:
                for (unsigned i = 0; i < 3; ++i)
                        bi_u16_to_u32_to(b, bi_word(dst, i),
                                        bi_half(bi_register(55 + i / 2), i % 2));
                break;

        case nir_intrinsic_load_workgroup_id:
                for (unsigned i = 0; i < 3; ++i)
                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i));
                break;

        case nir_intrinsic_load_global_invocation_id:
        case nir_intrinsic_load_global_invocation_id_zero_base:
                for (unsigned i = 0; i < 3; ++i)
                        bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i));
                break;

        case nir_intrinsic_shader_clock:
                bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER);
                break;

        default:
                fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
                assert(0);
        }
}

static void
bi_emit_load_const(bi_builder *b, nir_load_const_instr *instr)
{
        /* Make sure we've been lowered */
        assert(instr->def.num_components <= (32 / instr->def.bit_size));

        /* Accumulate all the channels of the constant, as if we did an
         * implicit SEL over them */
        uint32_t acc = 0;

        for (unsigned i = 0; i < instr->def.num_components; ++i) {
                uint32_t v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size);

                v = bi_extend_constant(v, instr->def.bit_size);
                acc |= (v << (i * instr->def.bit_size));
        }

        bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc));
}

static bi_index
bi_alu_src_index(nir_alu_src src, unsigned comps)
{
        /* we don't lower modifiers until the backend */
        assert(!(src.negate || src.abs));

        unsigned bitsize = nir_src_bit_size(src.src);

        /* TODO: Do we need to do something more clever with
         * 1-bit bools? */
        if (bitsize == 1)
                bitsize = 16;

        /* the bi_index carries the 32-bit (word) offset separate from the
         * subword swizzle, first handle the offset */

        unsigned offset = 0;

        assert(bitsize == 8 || bitsize == 16 || bitsize == 32);
        unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2;

        for (unsigned i = 0; i < comps; ++i) {
                unsigned new_offset = (src.swizzle[i] >> subword_shift);

                if (i > 0)
                        assert(offset == new_offset && "wrong vectorization");

                offset = new_offset;
        }

        bi_index idx = bi_word(bi_src_index(&src.src), offset);

        /* Compose the subword swizzle with existing (identity) swizzle */
        assert(idx.swizzle == BI_SWIZZLE_H01);

        /* Bigger vectors should have been lowered */
        assert(comps <= (1 << subword_shift));

        if (bitsize == 16) {
                unsigned c0 = src.swizzle[0] & 1;
                unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0;
                idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1);
        } else if (bitsize == 8) {
                /* 8-bit vectors not yet supported */
                assert(comps == 1 && "8-bit vectors not supported");
                assert(src.swizzle[0] < 4 && "8-bit vectors not supported");
                idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0];
        }

        return idx;
}

static enum bi_round
bi_nir_round(nir_op op)
{
        switch (op) {
        case nir_op_fround_even: return BI_ROUND_NONE;
        case nir_op_ftrunc: return BI_ROUND_RTZ;
        case nir_op_fceil: return BI_ROUND_RTP;
        case nir_op_ffloor: return BI_ROUND_RTN;
        default: unreachable("invalid nir round op");
        }
}

/* Convenience for lowered transcendentals */

static bi_index
bi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1)
{
        return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE);
}

/* Approximate with FRCP_APPROX.f32 and apply a single iteration of
 * Newton-Raphson to improve precision */

static void
bi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index x1 = bi_frcp_approx_f32(b, s0);
        bi_index m = bi_frexpm_f32(b, s0, false, false);
        bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false);
        bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0),
                        bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N);
        bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e,
                        BI_ROUND_NONE, BI_SPECIAL_NONE);
}

static void
bi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index x1 = bi_frsq_approx_f32(b, s0);
        bi_index m = bi_frexpm_f32(b, s0, false, true);
        bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true);
        bi_index t1 = bi_fmul_f32(b, x1, x1);
        bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0),
                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N);
        bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e,
                        BI_ROUND_NONE, BI_SPECIAL_N);
}

/* More complex transcendentals, see
 * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc
 * for documentation */

static void
bi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index t1 = bi_temp(b->shader);
        bi_instr *t1_instr = bi_fadd_f32_to(b, t1,
                        s0, bi_imm_u32(0x49400000), BI_ROUND_NONE);
        t1_instr->clamp = BI_CLAMP_CLAMP_0_INF;

        bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE);

        bi_instr *a2 = bi_fadd_f32_to(b,
                        bi_temp(b->shader),
                        s0, bi_neg(t2), BI_ROUND_NONE);
        a2->clamp = BI_CLAMP_CLAMP_M1_1;

        bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE);
        bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false);
        bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4));
        bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635),
                        bi_imm_u32(0x3e75fffa), BI_ROUND_NONE);
        bi_index p2 = bi_fma_f32(b, p1, a2->dest[0],
                        bi_imm_u32(0x3f317218), BI_ROUND_NONE);
        bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2);
        bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader),
                        p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE);
        x->clamp = BI_CLAMP_CLAMP_0_INF;

        bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0);
        max->sem = BI_SEM_NAN_PROPAGATE;
}

static void
bi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base)
{
        /* Scale by base, multiply by 2^24 and convert to integer to get an
         * 8:24 fixed-point input */
        bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(),
                        bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE);
        bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE);

        /* Compute the result for the fixed-point input, but pass along
         * the floating-point scale for correct NaN propagation */
        bi_fexp_f32_to(b, dst, fixed_pt, scale);
}

static void
bi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */
        bi_index a1 = bi_frexpm_f32(b, s0, true, false);
        bi_index ei = bi_frexpe_f32(b, s0, true, false);
        bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ);

        /* xt estimates -log(r1), a coarse approximation of log(a1) */
        bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE);
        bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE);

        /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) -
         * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1),
         * and then log(s0) = x1 + x2 */
        bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE);

        /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by
         * polynomial approximation around 1.
         * The series is expressed around 1, so set y = (a1 * r1) - 1.0 */
        bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE);

        /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate
         * log_e(1 + y) by the Taylor series (lower precision than the blob):
         * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */
        bi_index loge = bi_fmul_f32(b, y,
                bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE));

        bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0)));

        /* log(s0) = x1 + x2 */
        bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE);
}

static void
bi_flog2_32(bi_builder *b, bi_index dst, bi_index s0)
{
        bi_index frexp = bi_frexpe_f32(b, s0, true, false);
        bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ);
        bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0);
        bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi,
                        BI_ROUND_NONE);
}

static void
bi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
        bi_index log2_base = bi_null();

        if (base.type == BI_INDEX_CONSTANT) {
                log2_base = bi_imm_f32(log2f(uif(base.value)));
        } else {
                log2_base = bi_temp(b->shader);
                bi_lower_flog2_32(b, log2_base, base);
        }

        return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base));
}

static void
bi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp)
{
        bi_index log2_base = bi_null();

        if (base.type == BI_INDEX_CONSTANT) {
                log2_base = bi_imm_f32(log2f(uif(base.value)));
        } else {
                log2_base = bi_temp(b->shader);
                bi_flog2_32(b, log2_base, base);
        }

        return bi_fexp_32(b, dst, exp, log2_base);
}

/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as
 * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and
 * calculates the results. We use them to calculate sin/cos via a Taylor
 * approximation:
 *
 * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x)
 * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x)
 * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x)
 */

#define TWO_OVER_PI  bi_imm_f32(2.0f / 3.14159f)
#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0)
#define SINCOS_BIAS  bi_imm_u32(0x49400000)

static void
bi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos)
{
        /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */
        bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE);

        /* Approximate domain error (small) */
        bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS),
                                BI_ROUND_NONE),
                        MPI_OVER_TWO, s0, BI_ROUND_NONE);

        /* Lookup sin(x), cos(x) */
        bi_index sinx = bi_fsin_table_u6(b, x_u6, false);
        bi_index cosx = bi_fcos_table_u6(b, x_u6, false);

        /* e^2 / 2 */
        bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(),
                        bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE);

        /* (-e^2)/2 f''(x) */
        bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2),
                        cos ? cosx : sinx,
                        bi_negzero(), BI_ROUND_NONE);

        /* e f'(x) - (e^2/2) f''(x) */
        bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e,
                        cos ?
                                bi_neg(sinx) : cosx,
                        quadratic, BI_ROUND_NONE);
        I->clamp = BI_CLAMP_CLAMP_M1_1;

        /* f(x) + e f'(x) - (e^2/2) f''(x) */
        bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE);
}

/* The XOR lane op is useful for derivative calculation, but was added in v7.
 * Add a safe helper that will do the appropriate lowering on v6 */

static bi_index
bi_clper_xor(bi_builder *b, bi_index s0, bi_index s1)
{
        if (b->shader->arch >= 7) {
                return bi_clper_i32(b, s0, s1,
                                BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR,
                                BI_SUBGROUP_SUBGROUP4);
        }

        bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false);
        bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0));
        return bi_clper_v6_i32(b, s0, lane);
}

static bi_instr *
bi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op,
                bi_index dst, bi_index s0, bi_index s1, bi_index s2)
{
        /* Handle 1-bit bools as 0/~0 by default and let the optimizer deal
         * with the bit patterns later. 0/~0 has the nice property of being
         * independent of replicated vectorization. */
        if (sz == 1) sz = 16;
        bi_index f = bi_zero();
        bi_index t = bi_imm_u16(0xFFFF);

        switch (op) {
        case nir_op_feq:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
        case nir_op_flt:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_fge:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
        case nir_op_fneu:
                return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);

        case nir_op_ieq:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1);
        case nir_op_ine:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1);
        case nir_op_ilt:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_ige:
                return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);
        case nir_op_ult:
                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1);
        case nir_op_uge:
                return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1);

        case nir_op_iand:
                return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_ior:
                return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_ixor:
                return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0));
        case nir_op_inot:
                return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0));

        case nir_op_f2b1:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
        case nir_op_i2b1:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);
        case nir_op_b2b1:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ);

        case nir_op_bcsel:
                return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE);

        default:
                fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
                unreachable("Unhandled boolean ALU instruction");
        }
}

static void
bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
{
        bi_index dst = bi_dest_index(&instr->dest.dest);
        unsigned srcs = nir_op_infos[instr->op].num_inputs;
        unsigned sz = nir_dest_bit_size(instr->dest.dest);
        unsigned comps =
nir_dest_num_components(instr->dest.dest); 1784 unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; 1785 unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0; 1786 bool is_bool = (sz == 1); 1787 1788 /* TODO: Anything else? */ 1789 if (sz == 1) 1790 sz = 16; 1791 1792 /* Indicate scalarness */ 1793 if (sz == 16 && comps == 1) 1794 dst.swizzle = BI_SWIZZLE_H00; 1795 1796 if (!instr->dest.dest.is_ssa) { 1797 for (unsigned i = 0; i < comps; ++i) 1798 assert(instr->dest.write_mask); 1799 } 1800 1801 /* First, match against the various moves in NIR. These are 1802 * special-cased because they can operate on vectors even after 1803 * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the 1804 * instruction is no "bigger" than SIMD-within-a-register. These moves 1805 * are the exceptions that need to handle swizzles specially. */ 1806 1807 switch (instr->op) { 1808 case nir_op_pack_32_2x16: 1809 case nir_op_vec2: 1810 case nir_op_vec3: 1811 case nir_op_vec4: { 1812 bi_index unoffset_srcs[4] = { 1813 srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(), 1814 srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(), 1815 srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(), 1816 srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(), 1817 }; 1818 1819 unsigned channels[4] = { 1820 instr->src[0].swizzle[0], 1821 instr->src[1].swizzle[0], 1822 srcs > 2 ? instr->src[2].swizzle[0] : 0, 1823 srcs > 3 ? instr->src[3].swizzle[0] : 0, 1824 }; 1825 1826 bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); 1827 return; 1828 } 1829 1830 case nir_op_vec8: 1831 case nir_op_vec16: 1832 unreachable("should've been lowered"); 1833 1834 case nir_op_unpack_32_2x16: 1835 case nir_op_unpack_64_2x32_split_x: 1836 bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src)); 1837 return; 1838 1839 case nir_op_unpack_64_2x32_split_y: 1840 bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1)); 1841 return; 1842 1843 case nir_op_pack_64_2x32_split: 1844 bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src)); 1845 bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src)); 1846 return; 1847 1848 case nir_op_pack_64_2x32: 1849 bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0)); 1850 bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1)); 1851 return; 1852 1853 case nir_op_mov: { 1854 bi_index idx = bi_src_index(&instr->src[0].src); 1855 bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; 1856 1857 unsigned channels[4] = { 1858 comps > 0 ? instr->src[0].swizzle[0] : 0, 1859 comps > 1 ? instr->src[0].swizzle[1] : 0, 1860 comps > 2 ? instr->src[0].swizzle[2] : 0, 1861 comps > 3 ? instr->src[0].swizzle[3] : 0, 1862 }; 1863 1864 if (sz == 1) sz = 16; 1865 bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz); 1866 return; 1867 } 1868 1869 case nir_op_f2f16: 1870 assert(src_sz == 32); 1871 bi_index idx = bi_src_index(&instr->src[0].src); 1872 bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]); 1873 bi_index s1 = comps > 1 ? 
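/* Both halves are packed by the single V2F32_TO_V2F16 below: e.g. a vec2
 * source swizzled .yx feeds word 1 then word 0, while a scalar source is
 * simply duplicated into both halves. */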
1874 bi_word(idx, instr->src[0].swizzle[1]) : s0; 1875 1876 bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE); 1877 return; 1878 1879 /* Vectorized downcasts */ 1880 case nir_op_u2u16: 1881 case nir_op_i2i16: { 1882 if (!(src_sz == 32 && comps == 2)) 1883 break; 1884 1885 bi_index idx = bi_src_index(&instr->src[0].src); 1886 bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]); 1887 bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]); 1888 1889 bi_mkvec_v2i16_to(b, dst, 1890 bi_half(s0, false), bi_half(s1, false)); 1891 return; 1892 } 1893 1894 case nir_op_i2i8: 1895 case nir_op_u2u8: 1896 { 1897 /* Acts like an 8-bit swizzle */ 1898 bi_index idx = bi_src_index(&instr->src[0].src); 1899 unsigned factor = src_sz / 8; 1900 unsigned chan[4] = { 0 }; 1901 1902 for (unsigned i = 0; i < comps; ++i) 1903 chan[i] = instr->src[0].swizzle[i] * factor; 1904 1905 bi_make_vec_to(b, dst, &idx, chan, comps, 8); 1906 return; 1907 } 1908 1909 default: 1910 break; 1911 } 1912 1913 bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null(); 1914 bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null(); 1915 bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null(); 1916 1917 if (is_bool) { 1918 bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2); 1919 return; 1920 } 1921 1922 switch (instr->op) { 1923 case nir_op_ffma: 1924 bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE); 1925 break; 1926 1927 case nir_op_fmul: 1928 bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE); 1929 break; 1930 1931 case nir_op_fsub: 1932 s1 = bi_neg(s1); 1933 FALLTHROUGH; 1934 case nir_op_fadd: 1935 bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE); 1936 break; 1937 1938 case nir_op_fsat: { 1939 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 1940 I->clamp = BI_CLAMP_CLAMP_0_1; 1941 break; 1942 } 1943 1944 case nir_op_fsat_signed_mali: { 1945 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 1946 I->clamp = BI_CLAMP_CLAMP_M1_1; 1947 break; 1948 } 1949 1950 case nir_op_fclamp_pos_mali: { 1951 bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 1952 I->clamp = BI_CLAMP_CLAMP_0_INF; 1953 break; 1954 } 1955 1956 case nir_op_fneg: 1957 bi_fabsneg_to(b, sz, dst, bi_neg(s0)); 1958 break; 1959 1960 case nir_op_fabs: 1961 bi_fabsneg_to(b, sz, dst, bi_abs(s0)); 1962 break; 1963 1964 case nir_op_fsin: 1965 bi_lower_fsincos_32(b, dst, s0, false); 1966 break; 1967 1968 case nir_op_fcos: 1969 bi_lower_fsincos_32(b, dst, s0, true); 1970 break; 1971 1972 case nir_op_fexp2: 1973 assert(sz == 32); /* should've been lowered */ 1974 1975 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 1976 bi_lower_fexp2_32(b, dst, s0); 1977 else 1978 bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); 1979 1980 break; 1981 1982 case nir_op_flog2: 1983 assert(sz == 32); /* should've been lowered */ 1984 1985 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 1986 bi_lower_flog2_32(b, dst, s0); 1987 else 1988 bi_flog2_32(b, dst, s0); 1989 1990 break; 1991 1992 case nir_op_fpow: 1993 assert(sz == 32); /* should've been lowered */ 1994 1995 if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 1996 bi_lower_fpow_32(b, dst, s0, s1); 1997 else 1998 bi_fpow_32(b, dst, s0, s1); 1999 2000 break; 2001 2002 case nir_op_bcsel: 2003 if (src1_sz == 8) 2004 bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 2005 else 2006 bi_csel_to(b, nir_type_int, src1_sz, 2007 dst, s0, bi_zero(), s1, s2, BI_CMPF_NE); 2008 break; 2009 2010 case nir_op_ishl: 2011 bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); 2012 break; 2013 case 
nir_op_ushr: 2014 bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); 2015 break; 2016 2017 case nir_op_ishr: 2018 bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); 2019 break; 2020 2021 case nir_op_imin: 2022 case nir_op_umin: 2023 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 2024 s0, s1, s0, s1, BI_CMPF_LT); 2025 break; 2026 2027 case nir_op_imax: 2028 case nir_op_umax: 2029 bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 2030 s0, s1, s0, s1, BI_CMPF_GT); 2031 break; 2032 2033 case nir_op_fddx_must_abs_mali: 2034 case nir_op_fddy_must_abs_mali: { 2035 bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); 2036 bi_index adjacent = bi_clper_xor(b, s0, bit); 2037 bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE); 2038 break; 2039 } 2040 2041 case nir_op_fddx: 2042 case nir_op_fddy: { 2043 bi_index lane1 = bi_lshift_and_i32(b, 2044 bi_fau(BIR_FAU_LANE_ID, false), 2045 bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1), 2046 bi_imm_u8(0)); 2047 2048 bi_index lane2 = bi_iadd_u32(b, lane1, 2049 bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2), 2050 false); 2051 2052 bi_index left, right; 2053 2054 if (b->shader->quirks & BIFROST_LIMITED_CLPER) { 2055 left = bi_clper_v6_i32(b, s0, lane1); 2056 right = bi_clper_v6_i32(b, s0, lane2); 2057 } else { 2058 left = bi_clper_i32(b, s0, lane1, 2059 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 2060 BI_SUBGROUP_SUBGROUP4); 2061 2062 right = bi_clper_i32(b, s0, lane2, 2063 BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 2064 BI_SUBGROUP_SUBGROUP4); 2065 } 2066 2067 bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE); 2068 break; 2069 } 2070 2071 case nir_op_f2f32: 2072 bi_f16_to_f32_to(b, dst, s0); 2073 break; 2074 2075 case nir_op_f2i32: 2076 if (src_sz == 32) 2077 bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 2078 else 2079 bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 2080 break; 2081 2082 /* Note 32-bit sources => no vectorization, so 32-bit works */ 2083 case nir_op_f2u16: 2084 if (src_sz == 32) 2085 bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 2086 else 2087 bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ); 2088 break; 2089 2090 case nir_op_f2i16: 2091 if (src_sz == 32) 2092 bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 2093 else 2094 bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ); 2095 break; 2096 2097 case nir_op_f2u32: 2098 if (src_sz == 32) 2099 bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 2100 else 2101 bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 2102 break; 2103 2104 case nir_op_u2f16: 2105 if (src_sz == 32) 2106 bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ); 2107 else if (src_sz == 16) 2108 bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ); 2109 else if (src_sz == 8) 2110 bi_v2u8_to_v2f16_to(b, dst, s0); 2111 break; 2112 2113 case nir_op_u2f32: 2114 if (src_sz == 32) 2115 bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ); 2116 else if (src_sz == 16) 2117 bi_u16_to_f32_to(b, dst, s0); 2118 else 2119 bi_u8_to_f32_to(b, dst, s0); 2120 break; 2121 2122 case nir_op_i2f16: 2123 if (src_sz == 32) 2124 bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ); 2125 else if (src_sz == 16) 2126 bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ); 2127 else if (src_sz == 8) 2128 bi_v2s8_to_v2f16_to(b, dst, s0); 2129 break; 2130 2131 case nir_op_i2f32: 2132 if (src_sz == 32) 2133 bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ); 2134 else if (src_sz == 16) 2135 bi_s16_to_f32_to(b, dst, s0); 2136 else if (src_sz == 8) 2137 bi_s8_to_f32_to(b, dst, s0); 2138 break; 2139 2140 case nir_op_i2i32: 2141 if 
(src_sz == 16) 2142 bi_s16_to_s32_to(b, dst, s0); 2143 else 2144 bi_s8_to_s32_to(b, dst, s0); 2145 break; 2146 2147 case nir_op_u2u32: 2148 if (src_sz == 16) 2149 bi_u16_to_u32_to(b, dst, s0); 2150 else 2151 bi_u8_to_u32_to(b, dst, s0); 2152 break; 2153 2154 case nir_op_i2i16: 2155 assert(src_sz == 8 || src_sz == 32); 2156 2157 if (src_sz == 8) 2158 bi_v2s8_to_v2s16_to(b, dst, s0); 2159 else 2160 bi_mov_i32_to(b, dst, s0); 2161 break; 2162 2163 case nir_op_u2u16: 2164 assert(src_sz == 8 || src_sz == 32); 2165 2166 if (src_sz == 8) 2167 bi_v2u8_to_v2u16_to(b, dst, s0); 2168 else 2169 bi_mov_i32_to(b, dst, s0); 2170 break; 2171 2172 case nir_op_b2f16: 2173 case nir_op_b2f32: 2174 bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(), 2175 (sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0), 2176 (sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0), 2177 BI_CMPF_NE); 2178 break; 2179 2180 case nir_op_b2b32: 2181 bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(), 2182 bi_imm_u32(~0), bi_zero(), BI_CMPF_NE); 2183 break; 2184 2185 case nir_op_b2i8: 2186 case nir_op_b2i16: 2187 case nir_op_b2i32: 2188 bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0)); 2189 break; 2190 2191 case nir_op_fround_even: 2192 case nir_op_fceil: 2193 case nir_op_ffloor: 2194 case nir_op_ftrunc: 2195 bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); 2196 break; 2197 2198 case nir_op_fmin: 2199 bi_fmin_to(b, sz, dst, s0, s1); 2200 break; 2201 2202 case nir_op_fmax: 2203 bi_fmax_to(b, sz, dst, s0, s1); 2204 break; 2205 2206 case nir_op_iadd: 2207 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); 2208 break; 2209 2210 case nir_op_iadd_sat: 2211 bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); 2212 break; 2213 2214 case nir_op_uadd_sat: 2215 bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); 2216 break; 2217 2218 case nir_op_ihadd: 2219 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); 2220 break; 2221 2222 case nir_op_irhadd: 2223 bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); 2224 break; 2225 2226 case nir_op_ineg: 2227 bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); 2228 break; 2229 2230 case nir_op_isub: 2231 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); 2232 break; 2233 2234 case nir_op_isub_sat: 2235 bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); 2236 break; 2237 2238 case nir_op_usub_sat: 2239 bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); 2240 break; 2241 2242 case nir_op_imul: 2243 bi_imul_to(b, sz, dst, s0, s1); 2244 break; 2245 2246 case nir_op_iabs: 2247 bi_iabs_to(b, sz, dst, s0); 2248 break; 2249 2250 case nir_op_iand: 2251 bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2252 break; 2253 2254 case nir_op_ior: 2255 bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2256 break; 2257 2258 case nir_op_ixor: 2259 bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 2260 break; 2261 2262 case nir_op_inot: 2263 bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); 2264 break; 2265 2266 case nir_op_frsq: 2267 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2268 bi_lower_frsq_32(b, dst, s0); 2269 else 2270 bi_frsq_to(b, sz, dst, s0); 2271 break; 2272 2273 case nir_op_frcp: 2274 if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 2275 bi_lower_frcp_32(b, dst, s0); 2276 else 2277 bi_frcp_to(b, sz, dst, s0); 2278 break; 2279 2280 case nir_op_uclz: 2281 bi_clz_to(b, sz, dst, s0, false); 2282 break; 2283 2284 case nir_op_bit_count: 2285 bi_popcount_i32_to(b, dst, s0); 2286 break; 2287 2288 case 
nir_op_bitfield_reverse: 2289 bi_bitrev_i32_to(b, dst, s0); 2290 break; 2291 2292 case nir_op_ufind_msb: { 2293 bi_index clz = bi_clz(b, src_sz, s0, false); 2294 2295 if (sz == 8) 2296 clz = bi_byte(clz, 0); 2297 else if (sz == 16) 2298 clz = bi_half(clz, false); 2299 2300 bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); 2301 break; 2302 } 2303 2304 default: 2305 fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 2306 unreachable("Unknown ALU op"); 2307 } 2308} 2309 2310/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */ 2311static unsigned 2312bifrost_tex_format(enum glsl_sampler_dim dim) 2313{ 2314 switch (dim) { 2315 case GLSL_SAMPLER_DIM_1D: 2316 case GLSL_SAMPLER_DIM_BUF: 2317 return 1; 2318 2319 case GLSL_SAMPLER_DIM_2D: 2320 case GLSL_SAMPLER_DIM_MS: 2321 case GLSL_SAMPLER_DIM_EXTERNAL: 2322 case GLSL_SAMPLER_DIM_RECT: 2323 return 2; 2324 2325 case GLSL_SAMPLER_DIM_3D: 2326 return 3; 2327 2328 case GLSL_SAMPLER_DIM_CUBE: 2329 return 0; 2330 2331 default: 2332 DBG("Unknown sampler dim type\n"); 2333 assert(0); 2334 return 0; 2335 } 2336} 2337 2338static enum bifrost_texture_format_full 2339bi_texture_format(nir_alu_type T, enum bi_clamp clamp) 2340{ 2341 switch (T) { 2342 case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; 2343 case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; 2344 case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; 2345 case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; 2346 case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; 2347 case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; 2348 default: unreachable("Invalid type for texturing"); 2349 } 2350} 2351 2352/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */ 2353static bi_index 2354bi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) 2355{ 2356 /* For (u)int we can just passthrough */ 2357 nir_alu_type base = nir_alu_type_get_base_type(T); 2358 if (base == nir_type_int || base == nir_type_uint) 2359 return idx; 2360 2361 /* Otherwise we convert */ 2362 assert(T == nir_type_float32); 2363 2364 /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and 2365 * Texel Selection") defines the layer to be taken from clamp(RNE(r), 2366 * 0, dt - 1). So we use round RTE, clamping is handled at the data 2367 * structure level */ 2368 2369 return bi_f32_to_u32(b, idx, BI_ROUND_NONE); 2370} 2371 2372/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a 2373 * 16-bit 8:8 fixed-point format. We lower as: 2374 * 2375 * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF = 2376 * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0) 2377 */ 2378 2379static bi_index 2380bi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) 2381{ 2382 /* Precompute for constant LODs to avoid general constant folding */ 2383 if (lod.type == BI_INDEX_CONSTANT) { 2384 uint32_t raw = lod.value; 2385 float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); 2386 int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; 2387 return bi_imm_u32(s32 & 0xFFFF); 2388 } 2389 2390 /* Sort of arbitrary. Must be less than 128.0, greater than or equal to 2391 * the max LOD (16 since we cap at 2^16 texture dimensions), and 2392 * preferably small to minimize precision loss */ 2393 const float max_lod = 16.0; 2394 2395 bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader), 2396 fp16 ? 
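/* Worked example of the non-constant path (illustrative): for lod = 3.25, the
 * clamped FMA here computes 3.25 * (1/16) = 0.203 (inside [-1, 1], so the
 * clamp is a no-op), the second FMA rescales by 16 * 256 to 832.0, and
 * F32_TO_S32 + MKVEC leave 0x0340 in the low half, i.e. 3.25 in 8:8 fixed
 * point. A negative bias such as -0.5 ends up as 0xFF80, the two's-complement
 * pattern for -128. */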
bi_half(lod, false) : lod, 2397 bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE); 2398 2399 fsat->clamp = BI_CLAMP_CLAMP_M1_1; 2400 2401 bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), 2402 bi_negzero(), BI_ROUND_NONE); 2403 2404 return bi_mkvec_v2i16(b, 2405 bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false), 2406 bi_imm_u16(0)); 2407} 2408 2409/* FETCH takes a 32-bit staging register containing the LOD as an integer in 2410 * the bottom 16-bits and (if present) the cube face index in the top 16-bits. 2411 * TODO: Cube face. 2412 */ 2413 2414static bi_index 2415bi_emit_texc_lod_cube(bi_builder *b, bi_index lod) 2416{ 2417 return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); 2418} 2419 2420/* The hardware specifies texel offsets and multisample indices together as a 2421 * u8vec4 <offset, ms index>. By default all are zero, so if have either a 2422 * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with 2423 * the bits we need and return that to be passed as a staging register. Else we 2424 * return 0 to avoid allocating a data register when everything is zero. */ 2425 2426static bi_index 2427bi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) 2428{ 2429 bi_index dest = bi_zero(); 2430 2431 int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); 2432 if (offs_idx >= 0 && 2433 (!nir_src_is_const(instr->src[offs_idx].src) || 2434 nir_src_as_uint(instr->src[offs_idx].src) != 0)) { 2435 unsigned nr = nir_src_num_components(instr->src[offs_idx].src); 2436 bi_index idx = bi_src_index(&instr->src[offs_idx].src); 2437 dest = bi_mkvec_v4i8(b, 2438 (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0), 2439 (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0), 2440 (nr > 2) ? 
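/* Byte layout example (illustrative): a fetch with texel offset (1, -1) and
 * multisample index 2 packs as bytes <0x01, 0xFF, 0x00, 0x02>, i.e. the u32
 * 0x0200FF01: offsets in the low three bytes, the sample index OR'd into
 * bits 24-31 below. */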
bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0), 2441 bi_imm_u8(0)); 2442 } 2443 2444 int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); 2445 if (ms_idx >= 0 && 2446 (!nir_src_is_const(instr->src[ms_idx].src) || 2447 nir_src_as_uint(instr->src[ms_idx].src) != 0)) { 2448 dest = bi_lshift_or_i32(b, 2449 bi_src_index(&instr->src[ms_idx].src), dest, 2450 bi_imm_u8(24)); 2451 } 2452 2453 return dest; 2454} 2455 2456static void 2457bi_emit_cube_coord(bi_builder *b, bi_index coord, 2458 bi_index *face, bi_index *s, bi_index *t) 2459{ 2460 /* Compute max { |x|, |y|, |z| } */ 2461 bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader), 2462 bi_temp(b->shader), coord, 2463 bi_word(coord, 1), bi_word(coord, 2)); 2464 2465 /* Select coordinates */ 2466 2467 bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord, 2468 cubeface->dest[1]); 2469 2470 bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2), 2471 cubeface->dest[1]); 2472 2473 /* The OpenGL ES specification requires us to transform an input vector 2474 * (x, y, z) to the coordinate, given the selected S/T: 2475 * 2476 * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) 2477 * 2478 * We implement (s shown, t similar) in a form friendlier to FMA 2479 * instructions, and clamp coordinates at the end for correct 2480 * NaN/infinity handling: 2481 * 2482 * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) 2483 * 2484 * Take the reciprocal of max{x, y, z} 2485 */ 2486 2487 bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]); 2488 2489 /* Calculate 0.5 * (1.0 / max{x, y, z}) */ 2490 bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(), 2491 BI_ROUND_NONE); 2492 2493 /* Transform the coordinates */ 2494 *s = bi_temp(b->shader); 2495 *t = bi_temp(b->shader); 2496 2497 bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f), 2498 BI_ROUND_NONE); 2499 bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f), 2500 BI_ROUND_NONE); 2501 2502 S->clamp = BI_CLAMP_CLAMP_0_1; 2503 T->clamp = BI_CLAMP_CLAMP_0_1; 2504 2505 /* Face index at bit[29:31], matching the cube map descriptor */ 2506 *face = cubeface->dest[1]; 2507} 2508 2509/* Emits a cube map descriptor, returning lower 32-bits and putting upper 2510 * 32-bits in passed pointer t. The packing of the face with the S coordinate 2511 * exploits the redundancy of floating points with the range restriction of 2512 * CUBEFACE output. 2513 * 2514 * struct cube_map_descriptor { 2515 * float s : 29; 2516 * unsigned face : 3; 2517 * float t : 32; 2518 * } 2519 * 2520 * Since the cube face index is preshifted, this is easy to pack with a bitwise 2521 * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3 2522 * bits from face. 2523 */ 2524 2525static bi_index 2526bi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) 2527{ 2528 bi_index face, s; 2529 bi_emit_cube_coord(b, coord, &face, &s, t); 2530 bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); 2531 return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); 2532} 2533 2534/* Map to the main texture op used. Some of these (txd in particular) will 2535 * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in 2536 * sequence). We assume that lowering is handled elsewhere. 
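 *
 * For instance, a plain texture() call reaches us as nir_texop_tex and maps
 * to TEX with a computed LOD, texelFetch()/textureGather() arrive as
 * nir_texop_txf/nir_texop_tg4 and map to FETCH, and query ops such as
 * textureSize() (nir_texop_txs) are handled earlier as sysval loads and never
 * reach this switch.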
2537 */ 2538 2539static enum bifrost_tex_op 2540bi_tex_op(nir_texop op) 2541{ 2542 switch (op) { 2543 case nir_texop_tex: 2544 case nir_texop_txb: 2545 case nir_texop_txl: 2546 case nir_texop_txd: 2547 case nir_texop_tex_prefetch: 2548 return BIFROST_TEX_OP_TEX; 2549 case nir_texop_txf: 2550 case nir_texop_txf_ms: 2551 case nir_texop_txf_ms_fb: 2552 case nir_texop_tg4: 2553 return BIFROST_TEX_OP_FETCH; 2554 case nir_texop_txs: 2555 case nir_texop_lod: 2556 case nir_texop_query_levels: 2557 case nir_texop_texture_samples: 2558 case nir_texop_samples_identical: 2559 unreachable("should've been lowered"); 2560 default: 2561 unreachable("unsupported tex op"); 2562 } 2563} 2564 2565/* Data registers required by texturing in the order they appear. All are 2566 * optional, the texture operation descriptor determines which are present. 2567 * Note since 3D arrays are not permitted at an API level, Z_COORD and 2568 * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ 2569 2570enum bifrost_tex_dreg { 2571 BIFROST_TEX_DREG_Z_COORD = 0, 2572 BIFROST_TEX_DREG_Y_DELTAS = 1, 2573 BIFROST_TEX_DREG_LOD = 2, 2574 BIFROST_TEX_DREG_GRDESC_HI = 3, 2575 BIFROST_TEX_DREG_SHADOW = 4, 2576 BIFROST_TEX_DREG_ARRAY = 5, 2577 BIFROST_TEX_DREG_OFFSETMS = 6, 2578 BIFROST_TEX_DREG_SAMPLER = 7, 2579 BIFROST_TEX_DREG_TEXTURE = 8, 2580 BIFROST_TEX_DREG_COUNT, 2581}; 2582 2583static void 2584bi_emit_texc(bi_builder *b, nir_tex_instr *instr) 2585{ 2586 bool computed_lod = false; 2587 2588 struct bifrost_texture_operation desc = { 2589 .op = bi_tex_op(instr->op), 2590 .offset_or_bias_disable = false, /* TODO */ 2591 .shadow_or_clamp_disable = instr->is_shadow, 2592 .array = instr->is_array, 2593 .dimension = bifrost_tex_format(instr->sampler_dim), 2594 .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ 2595 .mask = 0xF, 2596 }; 2597 2598 switch (desc.op) { 2599 case BIFROST_TEX_OP_TEX: 2600 desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; 2601 computed_lod = true; 2602 break; 2603 case BIFROST_TEX_OP_FETCH: 2604 desc.lod_or_fetch = (enum bifrost_lod_mode) 2605 (instr->op == nir_texop_tg4 ? 
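/* The GATHER4_R/G/B/A fetch modes are consecutive, so adding the NIR gather
 * component selects the channel: e.g. textureGather(s, uv, 2) lands on
 * GATHER4_B. */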
2606 BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : 2607 BIFROST_TEXTURE_FETCH_TEXEL); 2608 break; 2609 default: 2610 unreachable("texture op unsupported"); 2611 } 2612 2613 /* 32-bit indices to be allocated as consecutive staging registers */ 2614 bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; 2615 bi_index cx = bi_null(), cy = bi_null(); 2616 2617 for (unsigned i = 0; i < instr->num_srcs; ++i) { 2618 bi_index index = bi_src_index(&instr->src[i].src); 2619 unsigned sz = nir_src_bit_size(instr->src[i].src); 2620 ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); 2621 nir_alu_type T = base | sz; 2622 2623 switch (instr->src[i].src_type) { 2624 case nir_tex_src_coord: 2625 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 2626 cx = bi_emit_texc_cube_coord(b, index, &cy); 2627 } else { 2628 unsigned components = nir_src_num_components(instr->src[i].src); 2629 2630 /* Copy XY (for 2D+) or XX (for 1D) */ 2631 cx = index; 2632 cy = bi_word(index, MIN2(1, components - 1)); 2633 2634 assert(components >= 1 && components <= 3); 2635 2636 if (components < 3) { 2637 /* nothing to do */ 2638 } else if (desc.array) { 2639 /* 2D array */ 2640 dregs[BIFROST_TEX_DREG_ARRAY] = 2641 bi_emit_texc_array_index(b, 2642 bi_word(index, 2), T); 2643 } else { 2644 /* 3D */ 2645 dregs[BIFROST_TEX_DREG_Z_COORD] = 2646 bi_word(index, 2); 2647 } 2648 } 2649 break; 2650 2651 case nir_tex_src_lod: 2652 if (desc.op == BIFROST_TEX_OP_TEX && 2653 nir_src_is_const(instr->src[i].src) && 2654 nir_src_as_uint(instr->src[i].src) == 0) { 2655 desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; 2656 } else if (desc.op == BIFROST_TEX_OP_TEX) { 2657 assert(base == nir_type_float); 2658 2659 assert(sz == 16 || sz == 32); 2660 dregs[BIFROST_TEX_DREG_LOD] = 2661 bi_emit_texc_lod_88(b, index, sz == 16); 2662 desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; 2663 } else { 2664 assert(desc.op == BIFROST_TEX_OP_FETCH); 2665 assert(base == nir_type_uint || base == nir_type_int); 2666 assert(sz == 16 || sz == 32); 2667 2668 dregs[BIFROST_TEX_DREG_LOD] = 2669 bi_emit_texc_lod_cube(b, index); 2670 } 2671 2672 break; 2673 2674 case nir_tex_src_bias: 2675 /* Upper 16-bits interpreted as a clamp, leave zero */ 2676 assert(desc.op == BIFROST_TEX_OP_TEX); 2677 assert(base == nir_type_float); 2678 assert(sz == 16 || sz == 32); 2679 dregs[BIFROST_TEX_DREG_LOD] = 2680 bi_emit_texc_lod_88(b, index, sz == 16); 2681 desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; 2682 computed_lod = true; 2683 break; 2684 2685 case nir_tex_src_ms_index: 2686 case nir_tex_src_offset: 2687 if (desc.offset_or_bias_disable) 2688 break; 2689 2690 dregs[BIFROST_TEX_DREG_OFFSETMS] = 2691 bi_emit_texc_offset_ms_index(b, instr); 2692 if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) 2693 desc.offset_or_bias_disable = true; 2694 break; 2695 2696 case nir_tex_src_comparator: 2697 dregs[BIFROST_TEX_DREG_SHADOW] = index; 2698 break; 2699 2700 case nir_tex_src_texture_offset: 2701 assert(instr->texture_index == 0); 2702 dregs[BIFROST_TEX_DREG_TEXTURE] = index; 2703 break; 2704 2705 case nir_tex_src_sampler_offset: 2706 assert(instr->sampler_index == 0); 2707 dregs[BIFROST_TEX_DREG_SAMPLER] = index; 2708 break; 2709 2710 default: 2711 unreachable("Unhandled src type in texc emit"); 2712 } 2713 } 2714 2715 if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { 2716 dregs[BIFROST_TEX_DREG_LOD] = 2717 bi_emit_texc_lod_cube(b, bi_zero()); 2718 } 2719 2720 /* Choose an index mode */ 2721 2722 bool direct_tex = 
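/* Index mode example (illustrative): a statically bound texture/sampler pair
 * with a sampler index below 16 uses the immediate encoding (e.g. texture 2 +
 * sampler 2 gives immediate_indices with index = 2), while a dynamic
 * nir_tex_src_texture_offset/nir_tex_src_sampler_offset leaves a staging
 * register in play and selects one of the register-based modes below. */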
bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); 2723 bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); 2724 bool direct = direct_tex && direct_samp; 2725 2726 desc.immediate_indices = direct && (instr->sampler_index < 16); 2727 2728 if (desc.immediate_indices) { 2729 desc.sampler_index_or_mode = instr->sampler_index; 2730 desc.index = instr->texture_index; 2731 } else { 2732 enum bifrost_index mode = 0; 2733 2734 if (direct && instr->sampler_index == instr->texture_index) { 2735 mode = BIFROST_INDEX_IMMEDIATE_SHARED; 2736 desc.index = instr->texture_index; 2737 } else if (direct) { 2738 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 2739 desc.index = instr->sampler_index; 2740 dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, 2741 bi_imm_u32(instr->texture_index)); 2742 } else if (direct_tex) { 2743 assert(!direct_samp); 2744 mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; 2745 desc.index = instr->texture_index; 2746 } else if (direct_samp) { 2747 assert(!direct_tex); 2748 mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 2749 desc.index = instr->sampler_index; 2750 } else { 2751 mode = BIFROST_INDEX_REGISTER; 2752 } 2753 2754 desc.sampler_index_or_mode = mode | (0x3 << 2); 2755 } 2756 2757 /* Allocate staging registers contiguously by compacting the array. 2758 * Index is not SSA (tied operands) */ 2759 2760 unsigned sr_count = 0; 2761 2762 for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { 2763 if (!bi_is_null(dregs[i])) 2764 dregs[sr_count++] = dregs[i]; 2765 } 2766 2767 bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null(); 2768 2769 if (sr_count) 2770 bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32); 2771 2772 uint32_t desc_u = 0; 2773 memcpy(&desc_u, &desc, sizeof(desc_u)); 2774 bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest), 2775 idx, cx, cy, bi_imm_u32(desc_u), !computed_lod, 2776 sr_count); 2777 2778 /* Explicit copy to facilitate tied operands */ 2779 if (sr_count) { 2780 bi_index srcs[4] = { idx, idx, idx, idx }; 2781 unsigned channels[4] = { 0, 1, 2, 3 }; 2782 bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32); 2783 } 2784} 2785 2786/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube 2787 * textures with sufficiently small immediate indices. Anything else 2788 * needs a complete texture op. 
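 * For instance, a plain texture(sampler2D, uv) call with a float result, no
 * offset/bias/shadow/array sources and texture/sampler indices below 8
 * passes bi_is_simple_tex() below and becomes a single TEXS_2D, while adding
 * a texel offset, a non-zero explicit LOD or a shadow comparator forces the
 * full TEXC path with its staging registers.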
*/ 2789 2790static void 2791bi_emit_texs(bi_builder *b, nir_tex_instr *instr) 2792{ 2793 int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 2794 assert(coord_idx >= 0); 2795 bi_index coords = bi_src_index(&instr->src[coord_idx].src); 2796 2797 if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 2798 bi_index face, s, t; 2799 bi_emit_cube_coord(b, coords, &face, &s, &t); 2800 2801 bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), 2802 bi_dest_index(&instr->dest), 2803 s, t, face, 2804 instr->sampler_index, instr->texture_index); 2805 } else { 2806 bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), 2807 bi_dest_index(&instr->dest), 2808 coords, bi_word(coords, 1), 2809 instr->op != nir_texop_tex, /* zero LOD */ 2810 instr->sampler_index, instr->texture_index); 2811 } 2812} 2813 2814static bool 2815bi_is_simple_tex(nir_tex_instr *instr) 2816{ 2817 if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) 2818 return false; 2819 2820 if (instr->dest_type != nir_type_float32 && 2821 instr->dest_type != nir_type_float16) 2822 return false; 2823 2824 if (instr->is_shadow || instr->is_array) 2825 return false; 2826 2827 switch (instr->sampler_dim) { 2828 case GLSL_SAMPLER_DIM_2D: 2829 case GLSL_SAMPLER_DIM_EXTERNAL: 2830 case GLSL_SAMPLER_DIM_RECT: 2831 break; 2832 2833 case GLSL_SAMPLER_DIM_CUBE: 2834 /* LOD can't be specified with TEXS_CUBE */ 2835 if (instr->op == nir_texop_txl) 2836 return false; 2837 break; 2838 2839 default: 2840 return false; 2841 } 2842 2843 for (unsigned i = 0; i < instr->num_srcs; ++i) { 2844 if (instr->src[i].src_type != nir_tex_src_lod && 2845 instr->src[i].src_type != nir_tex_src_coord) 2846 return false; 2847 } 2848 2849 /* Indices need to fit in provided bits */ 2850 unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 
2 : 3; 2851 if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) 2852 return false; 2853 2854 int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); 2855 if (lod_idx < 0) 2856 return true; 2857 2858 nir_src lod = instr->src[lod_idx].src; 2859 return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; 2860} 2861 2862static void 2863bi_emit_tex(bi_builder *b, nir_tex_instr *instr) 2864{ 2865 switch (instr->op) { 2866 case nir_texop_txs: 2867 bi_load_sysval_to(b, bi_dest_index(&instr->dest), 2868 panfrost_sysval_for_instr(&instr->instr, NULL), 2869 4, 0); 2870 return; 2871 case nir_texop_tex: 2872 case nir_texop_txl: 2873 case nir_texop_txb: 2874 case nir_texop_txf: 2875 case nir_texop_txf_ms: 2876 case nir_texop_tg4: 2877 break; 2878 default: 2879 unreachable("Invalid texture operation"); 2880 } 2881 2882 if (bi_is_simple_tex(instr)) 2883 bi_emit_texs(b, instr); 2884 else 2885 bi_emit_texc(b, instr); 2886} 2887 2888static void 2889bi_emit_instr(bi_builder *b, struct nir_instr *instr) 2890{ 2891 switch (instr->type) { 2892 case nir_instr_type_load_const: 2893 bi_emit_load_const(b, nir_instr_as_load_const(instr)); 2894 break; 2895 2896 case nir_instr_type_intrinsic: 2897 bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); 2898 break; 2899 2900 case nir_instr_type_alu: 2901 bi_emit_alu(b, nir_instr_as_alu(instr)); 2902 break; 2903 2904 case nir_instr_type_tex: 2905 bi_emit_tex(b, nir_instr_as_tex(instr)); 2906 break; 2907 2908 case nir_instr_type_jump: 2909 bi_emit_jump(b, nir_instr_as_jump(instr)); 2910 break; 2911 2912 default: 2913 unreachable("should've been lowered"); 2914 } 2915} 2916 2917static bi_block * 2918create_empty_block(bi_context *ctx) 2919{ 2920 bi_block *blk = rzalloc(ctx, bi_block); 2921 2922 blk->predecessors = _mesa_set_create(blk, 2923 _mesa_hash_pointer, 2924 _mesa_key_pointer_equal); 2925 2926 return blk; 2927} 2928 2929static bi_block * 2930emit_block(bi_context *ctx, nir_block *block) 2931{ 2932 if (ctx->after_block) { 2933 ctx->current_block = ctx->after_block; 2934 ctx->after_block = NULL; 2935 } else { 2936 ctx->current_block = create_empty_block(ctx); 2937 } 2938 2939 list_addtail(&ctx->current_block->link, &ctx->blocks); 2940 list_inithead(&ctx->current_block->instructions); 2941 2942 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 2943 2944 nir_foreach_instr(instr, block) { 2945 bi_emit_instr(&_b, instr); 2946 ++ctx->instruction_count; 2947 } 2948 2949 return ctx->current_block; 2950} 2951 2952static void 2953emit_if(bi_context *ctx, nir_if *nif) 2954{ 2955 bi_block *before_block = ctx->current_block; 2956 2957 /* Speculatively emit the branch, but we can't fill it in until later */ 2958 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 2959 bi_instr *then_branch = bi_branchz_i16(&_b, 2960 bi_half(bi_src_index(&nif->condition), false), 2961 bi_zero(), BI_CMPF_EQ); 2962 2963 /* Emit the two subblocks. 
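 * For a non-empty else list the resulting CFG is roughly:
 *
 *    before_block --(cond == 0)--> else blocks ... end_else_block
 *         | fallthrough                               | fallthrough
 *         v                                           v
 *    then blocks ... end_then_block ----(jump)----> after_block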
*/ 2964 bi_block *then_block = emit_cf_list(ctx, &nif->then_list); 2965 bi_block *end_then_block = ctx->current_block; 2966 2967 /* Emit second block, and check if it's empty */ 2968 2969 int count_in = ctx->instruction_count; 2970 bi_block *else_block = emit_cf_list(ctx, &nif->else_list); 2971 bi_block *end_else_block = ctx->current_block; 2972 ctx->after_block = create_empty_block(ctx); 2973 2974 /* Now that we have the subblocks emitted, fix up the branches */ 2975 2976 assert(then_block); 2977 assert(else_block); 2978 2979 if (ctx->instruction_count == count_in) { 2980 then_branch->branch_target = ctx->after_block; 2981 bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */ 2982 } else { 2983 then_branch->branch_target = else_block; 2984 2985 /* Emit a jump from the end of the then block to the end of the else */ 2986 _b.cursor = bi_after_block(end_then_block); 2987 bi_instr *then_exit = bi_jump(&_b, bi_zero()); 2988 then_exit->branch_target = ctx->after_block; 2989 2990 bi_block_add_successor(end_then_block, then_exit->branch_target); 2991 bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ 2992 } 2993 2994 bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ 2995 bi_block_add_successor(before_block, then_block); /* fallthrough */ 2996} 2997 2998static void 2999emit_loop(bi_context *ctx, nir_loop *nloop) 3000{ 3001 /* Remember where we are */ 3002 bi_block *start_block = ctx->current_block; 3003 3004 bi_block *saved_break = ctx->break_block; 3005 bi_block *saved_continue = ctx->continue_block; 3006 3007 ctx->continue_block = create_empty_block(ctx); 3008 ctx->break_block = create_empty_block(ctx); 3009 ctx->after_block = ctx->continue_block; 3010 3011 /* Emit the body itself */ 3012 emit_cf_list(ctx, &nloop->body); 3013 3014 /* Branch back to loop back */ 3015 bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 3016 bi_instr *I = bi_jump(&_b, bi_zero()); 3017 I->branch_target = ctx->continue_block; 3018 bi_block_add_successor(start_block, ctx->continue_block); 3019 bi_block_add_successor(ctx->current_block, ctx->continue_block); 3020 3021 ctx->after_block = ctx->break_block; 3022 3023 /* Pop off */ 3024 ctx->break_block = saved_break; 3025 ctx->continue_block = saved_continue; 3026 ++ctx->loop_count; 3027} 3028 3029static bi_block * 3030emit_cf_list(bi_context *ctx, struct exec_list *list) 3031{ 3032 bi_block *start_block = NULL; 3033 3034 foreach_list_typed(nir_cf_node, node, node, list) { 3035 switch (node->type) { 3036 case nir_cf_node_block: { 3037 bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 3038 3039 if (!start_block) 3040 start_block = block; 3041 3042 break; 3043 } 3044 3045 case nir_cf_node_if: 3046 emit_if(ctx, nir_cf_node_as_if(node)); 3047 break; 3048 3049 case nir_cf_node_loop: 3050 emit_loop(ctx, nir_cf_node_as_loop(node)); 3051 break; 3052 3053 default: 3054 unreachable("Unknown control flow"); 3055 } 3056 } 3057 3058 return start_block; 3059} 3060 3061/* shader-db stuff */ 3062 3063struct bi_stats { 3064 unsigned nr_clauses, nr_tuples, nr_ins; 3065 unsigned nr_arith, nr_texture, nr_varying, nr_ldst; 3066}; 3067 3068static void 3069bi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) 3070{ 3071 /* Count instructions */ 3072 stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 
1 : 0); 3073 3074 /* Non-message passing tuples are always arithmetic */ 3075 if (tuple->add != clause->message) { 3076 stats->nr_arith++; 3077 return; 3078 } 3079 3080 /* Message + FMA we'll count as arithmetic _and_ message */ 3081 if (tuple->fma) 3082 stats->nr_arith++; 3083 3084 switch (clause->message_type) { 3085 case BIFROST_MESSAGE_VARYING: 3086 /* Check components interpolated */ 3087 stats->nr_varying += (clause->message->vecsize + 1) * 3088 (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); 3089 break; 3090 3091 case BIFROST_MESSAGE_VARTEX: 3092 /* 2 coordinates, fp32 each */ 3093 stats->nr_varying += (2 * 2); 3094 FALLTHROUGH; 3095 case BIFROST_MESSAGE_TEX: 3096 stats->nr_texture++; 3097 break; 3098 3099 case BIFROST_MESSAGE_ATTRIBUTE: 3100 case BIFROST_MESSAGE_LOAD: 3101 case BIFROST_MESSAGE_STORE: 3102 case BIFROST_MESSAGE_ATOMIC: 3103 stats->nr_ldst++; 3104 break; 3105 3106 case BIFROST_MESSAGE_NONE: 3107 case BIFROST_MESSAGE_BARRIER: 3108 case BIFROST_MESSAGE_BLEND: 3109 case BIFROST_MESSAGE_TILE: 3110 case BIFROST_MESSAGE_Z_STENCIL: 3111 case BIFROST_MESSAGE_ATEST: 3112 case BIFROST_MESSAGE_JOB: 3113 case BIFROST_MESSAGE_64BIT: 3114 /* Nothing to do */ 3115 break; 3116 }; 3117 3118} 3119 3120static void 3121bi_print_stats(bi_context *ctx, unsigned size, FILE *fp) 3122{ 3123 struct bi_stats stats = { 0 }; 3124 3125 /* Count instructions, clauses, and tuples. Also attempt to construct 3126 * normalized execution engine cycle counts, using the following ratio: 3127 * 3128 * 24 arith tuples/cycle 3129 * 2 texture messages/cycle 3130 * 16 x 16-bit varying channels interpolated/cycle 3131 * 1 load store message/cycle 3132 * 3133 * These numbers seem to match Arm Mobile Studio's heuristic. The real 3134 * cycle counts are surely more complicated. 3135 */ 3136 3137 bi_foreach_block(ctx, block) { 3138 bi_foreach_clause_in_block(block, clause) { 3139 stats.nr_clauses++; 3140 stats.nr_tuples += clause->tuple_count; 3141 3142 for (unsigned i = 0; i < clause->tuple_count; ++i) 3143 bi_count_tuple_stats(clause, &clause->tuples[i], &stats); 3144 } 3145 } 3146 3147 float cycles_arith = ((float) stats.nr_arith) / 24.0; 3148 float cycles_texture = ((float) stats.nr_texture) / 2.0; 3149 float cycles_varying = ((float) stats.nr_varying) / 16.0; 3150 float cycles_ldst = ((float) stats.nr_ldst) / 1.0; 3151 3152 float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); 3153 float cycles_bound = MAX2(cycles_arith, cycles_message); 3154 3155 /* Thread count and register pressure are traded off only on v7 */ 3156 bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32); 3157 unsigned nr_threads = full_threads ? 2 : 1; 3158 3159 /* Dump stats */ 3160 3161 fprintf(stderr, "%s - %s shader: " 3162 "%u inst, %u tuples, %u clauses, " 3163 "%f cycles, %f arith, %f texture, %f vary, %f ldst, " 3164 "%u quadwords, %u threads, %u loops, " 3165 "%u:%u spills:fills\n", 3166 ctx->nir->info.label ?: "", 3167 ctx->inputs->is_blend ? "PAN_SHADER_BLEND" : 3168 gl_shader_stage_name(ctx->stage), 3169 stats.nr_ins, stats.nr_tuples, stats.nr_clauses, 3170 cycles_bound, cycles_arith, cycles_texture, 3171 cycles_varying, cycles_ldst, 3172 size / 16, nr_threads, 3173 ctx->loop_count, 3174 ctx->spills, ctx->fills); 3175} 3176 3177static int 3178glsl_type_size(const struct glsl_type *type, bool bindless) 3179{ 3180 return glsl_count_attribute_slots(type, false); 3181} 3182 3183/* Split stores to memory. 
We don't split stores to vertex outputs, since 3184 * nir_lower_io_to_temporaries will ensure there's only a single write. 3185 */ 3186 3187static bool 3188should_split_wrmask(const nir_instr *instr, UNUSED const void *data) 3189{ 3190 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3191 3192 switch (intr->intrinsic) { 3193 case nir_intrinsic_store_ssbo: 3194 case nir_intrinsic_store_shared: 3195 case nir_intrinsic_store_global: 3196 case nir_intrinsic_store_scratch: 3197 return true; 3198 default: 3199 return false; 3200 } 3201} 3202 3203/* Bifrost wants transcendentals as FP32 */ 3204 3205static unsigned 3206bi_lower_bit_size(const nir_instr *instr, UNUSED void *data) 3207{ 3208 if (instr->type != nir_instr_type_alu) 3209 return 0; 3210 3211 nir_alu_instr *alu = nir_instr_as_alu(instr); 3212 3213 switch (alu->op) { 3214 case nir_op_fexp2: 3215 case nir_op_flog2: 3216 case nir_op_fpow: 3217 case nir_op_fsin: 3218 case nir_op_fcos: 3219 return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32; 3220 default: 3221 return 0; 3222 } 3223} 3224 3225/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, 3226 * transcendentals are an exception. Also shifts because of lane size mismatch 3227 * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need 3228 * to be scalarized due to type size. */ 3229 3230static bool 3231bi_vectorize_filter(const nir_instr *instr, void *data) 3232{ 3233 /* Defaults work for everything else */ 3234 if (instr->type != nir_instr_type_alu) 3235 return true; 3236 3237 const nir_alu_instr *alu = nir_instr_as_alu(instr); 3238 3239 switch (alu->op) { 3240 case nir_op_frcp: 3241 case nir_op_frsq: 3242 case nir_op_ishl: 3243 case nir_op_ishr: 3244 case nir_op_ushr: 3245 case nir_op_f2i16: 3246 case nir_op_f2u16: 3247 case nir_op_i2f16: 3248 case nir_op_u2f16: 3249 return false; 3250 default: 3251 return true; 3252 } 3253} 3254 3255/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we 3256 * keep divergence info around after we consume it for indirect lowering, 3257 * nir_convert_from_ssa will regress code quality since it will avoid 3258 * coalescing divergent with non-divergent nodes. */ 3259 3260static bool 3261nir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data) 3262{ 3263 ssa->divergent = false; 3264 return true; 3265} 3266 3267static bool 3268nir_invalidate_divergence(struct nir_builder *b, nir_instr *instr, 3269 UNUSED void *data) 3270{ 3271 return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL); 3272} 3273 3274/* Ensure we write exactly 4 components */ 3275static nir_ssa_def * 3276bifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, 3277 unsigned channel, unsigned first, unsigned mask) 3278{ 3279 if (!(mask & BITFIELD_BIT(channel))) 3280 channel = first; 3281 3282 return nir_channel(b, in, channel); 3283} 3284 3285/* Lower fragment store_output instructions to always write 4 components, 3286 * matching the hardware semantic. This may require additional moves. Skipping 3287 * these moves is possible in theory, but invokes undefined behaviour in the 3288 * compiler. The DDK inserts these moves, so we will as well. 
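 *
 * For example, a store_output of a vec2 value v with write mask 0x3 is
 * rewritten below to store (v.x, v.y, v.x, v.x) with mask 0xF: lanes outside
 * the original mask simply replicate the first written component.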
*/ 3289 3290static bool 3291bifrost_nir_lower_blend_components(struct nir_builder *b, 3292 nir_instr *instr, void *data) 3293{ 3294 if (instr->type != nir_instr_type_intrinsic) 3295 return false; 3296 3297 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3298 3299 if (intr->intrinsic != nir_intrinsic_store_output) 3300 return false; 3301 3302 nir_ssa_def *in = intr->src[0].ssa; 3303 unsigned first = nir_intrinsic_component(intr); 3304 unsigned mask = nir_intrinsic_write_mask(intr); 3305 3306 assert(first == 0 && "shouldn't get nonzero components"); 3307 3308 /* Nothing to do */ 3309 if (mask == BITFIELD_MASK(4)) 3310 return false; 3311 3312 b->cursor = nir_before_instr(&intr->instr); 3313 3314 /* Replicate the first valid component instead */ 3315 nir_ssa_def *replicated = 3316 nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), 3317 bifrost_nir_valid_channel(b, in, 1, first, mask), 3318 bifrost_nir_valid_channel(b, in, 2, first, mask), 3319 bifrost_nir_valid_channel(b, in, 3, first, mask)); 3320 3321 /* Rewrite to use our replicated version */ 3322 nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); 3323 nir_intrinsic_set_component(intr, 0); 3324 nir_intrinsic_set_write_mask(intr, 0xF); 3325 intr->num_components = 4; 3326 3327 return true; 3328} 3329 3330static void 3331bi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) 3332{ 3333 bool progress; 3334 unsigned lower_flrp = 16 | 32 | 64; 3335 3336 NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 3337 3338 nir_lower_tex_options lower_tex_options = { 3339 .lower_txs_lod = true, 3340 .lower_txp = ~0, 3341 .lower_tg4_broadcom_swizzle = true, 3342 .lower_txd = true, 3343 }; 3344 3345 NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); 3346 NIR_PASS(progress, nir, pan_lower_helper_invocation); 3347 3348 NIR_PASS(progress, nir, nir_lower_int64); 3349 3350 nir_lower_idiv_options idiv_options = { 3351 .imprecise_32bit_lowering = true, 3352 .allow_fp16 = true, 3353 }; 3354 NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); 3355 3356 NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 3357 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL); 3358 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 3359 3360 do { 3361 progress = false; 3362 3363 NIR_PASS(progress, nir, nir_lower_var_copies); 3364 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 3365 NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); 3366 3367 NIR_PASS(progress, nir, nir_copy_prop); 3368 NIR_PASS(progress, nir, nir_opt_remove_phis); 3369 NIR_PASS(progress, nir, nir_opt_dce); 3370 NIR_PASS(progress, nir, nir_opt_dead_cf); 3371 NIR_PASS(progress, nir, nir_opt_cse); 3372 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 3373 NIR_PASS(progress, nir, nir_opt_algebraic); 3374 NIR_PASS(progress, nir, nir_opt_constant_folding); 3375 3376 NIR_PASS(progress, nir, nir_lower_alu); 3377 3378 if (lower_flrp != 0) { 3379 bool lower_flrp_progress = false; 3380 NIR_PASS(lower_flrp_progress, 3381 nir, 3382 nir_lower_flrp, 3383 lower_flrp, 3384 false /* always_precise */); 3385 if (lower_flrp_progress) { 3386 NIR_PASS(progress, nir, 3387 nir_opt_constant_folding); 3388 progress = true; 3389 } 3390 3391 /* Nothing should rematerialize any flrps, so we only 3392 * need to do this lowering once. 
3393 */ 3394 lower_flrp = 0; 3395 } 3396 3397 NIR_PASS(progress, nir, nir_opt_undef); 3398 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 3399 3400 NIR_PASS(progress, nir, nir_opt_loop_unroll); 3401 } while (progress); 3402 3403 /* TODO: Why is 64-bit getting rematerialized? 3404 * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ 3405 NIR_PASS(progress, nir, nir_lower_int64); 3406 3407 /* We need to cleanup after each iteration of late algebraic 3408 * optimizations, since otherwise NIR can produce weird edge cases 3409 * (like fneg of a constant) which we don't handle */ 3410 bool late_algebraic = true; 3411 while (late_algebraic) { 3412 late_algebraic = false; 3413 NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); 3414 NIR_PASS(progress, nir, nir_opt_constant_folding); 3415 NIR_PASS(progress, nir, nir_copy_prop); 3416 NIR_PASS(progress, nir, nir_opt_dce); 3417 NIR_PASS(progress, nir, nir_opt_cse); 3418 } 3419 3420 NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL); 3421 NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); 3422 NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 3423 NIR_PASS(progress, nir, nir_opt_dce); 3424 3425 /* Prepass to simplify instruction selection */ 3426 NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late); 3427 NIR_PASS(progress, nir, nir_opt_dce); 3428 3429 if (nir->info.stage == MESA_SHADER_FRAGMENT) { 3430 NIR_PASS_V(nir, nir_shader_instructions_pass, 3431 bifrost_nir_lower_blend_components, 3432 nir_metadata_block_index | nir_metadata_dominance, 3433 NULL); 3434 } 3435 3436 /* Backend scheduler is purely local, so do some global optimizations 3437 * to reduce register pressure. */ 3438 nir_move_options move_all = 3439 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 3440 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 3441 3442 NIR_PASS_V(nir, nir_opt_sink, move_all); 3443 NIR_PASS_V(nir, nir_opt_move, move_all); 3444 3445 /* We might lower attribute, varying, and image indirects. Use the 3446 * gathered info to skip the extra analysis in the happy path. 
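 * For example, a shader that indexes its inputs or outputs with a
 * non-constant value sets the corresponding *_indirectly bit and takes the
 * LCSSA + divergence analysis + bi_lower_divergent_indirects path below,
 * while shaders with only direct I/O and no images skip it entirely.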
*/ 3447 bool any_indirects = 3448 nir->info.inputs_read_indirectly || 3449 nir->info.outputs_accessed_indirectly || 3450 nir->info.patch_inputs_read_indirectly || 3451 nir->info.patch_outputs_accessed_indirectly || 3452 nir->info.images_used; 3453 3454 if (any_indirects) { 3455 nir_convert_to_lcssa(nir, true, true); 3456 NIR_PASS_V(nir, nir_divergence_analysis); 3457 NIR_PASS_V(nir, bi_lower_divergent_indirects, 3458 bifrost_lanes_per_warp(gpu_id)); 3459 NIR_PASS_V(nir, nir_shader_instructions_pass, 3460 nir_invalidate_divergence, nir_metadata_all, NULL); 3461 } 3462 3463 /* Take us out of SSA */ 3464 NIR_PASS(progress, nir, nir_lower_locals_to_regs); 3465 NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); 3466 NIR_PASS(progress, nir, nir_convert_from_ssa, true); 3467} 3468 3469/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the 3470 * same lowering here to zero-extend correctly */ 3471 3472static bool 3473bifrost_nir_lower_i8_fragout_impl(struct nir_builder *b, 3474 nir_intrinsic_instr *intr, UNUSED void *data) 3475{ 3476 if (nir_src_bit_size(intr->src[0]) != 8) 3477 return false; 3478 3479 nir_alu_type type = 3480 nir_alu_type_get_base_type(nir_intrinsic_src_type(intr)); 3481 3482 assert(type == nir_type_int || type == nir_type_uint); 3483 3484 b->cursor = nir_before_instr(&intr->instr); 3485 nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16); 3486 3487 nir_intrinsic_set_src_type(intr, type | 16); 3488 nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast); 3489 return true; 3490} 3491 3492static bool 3493bifrost_nir_lower_i8_fragin_impl(struct nir_builder *b, 3494 nir_intrinsic_instr *intr, UNUSED void *data) 3495{ 3496 if (nir_dest_bit_size(intr->dest) != 8) 3497 return false; 3498 3499 nir_alu_type type = 3500 nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr)); 3501 3502 assert(type == nir_type_int || type == nir_type_uint); 3503 3504 b->cursor = nir_before_instr(&intr->instr); 3505 nir_ssa_def *out = 3506 nir_load_output(b, intr->num_components, 16, intr->src[0].ssa, 3507 .base = nir_intrinsic_base(intr), 3508 .component = nir_intrinsic_component(intr), 3509 .dest_type = type | 16, 3510 .io_semantics = nir_intrinsic_io_semantics(intr)); 3511 3512 nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8); 3513 nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast); 3514 return true; 3515} 3516 3517static bool 3518bifrost_nir_lower_i8_frag(struct nir_builder *b, 3519 nir_instr *instr, UNUSED void *data) 3520{ 3521 if (instr->type != nir_instr_type_intrinsic) 3522 return false; 3523 3524 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3525 if (intr->intrinsic == nir_intrinsic_load_output) 3526 return bifrost_nir_lower_i8_fragin_impl(b, intr, data); 3527 else if (intr->intrinsic == nir_intrinsic_store_output) 3528 return bifrost_nir_lower_i8_fragout_impl(b, intr, data); 3529 else 3530 return false; 3531} 3532 3533static void 3534bi_opt_post_ra(bi_context *ctx) 3535{ 3536 bi_foreach_instr_global_safe(ctx, ins) { 3537 if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) 3538 bi_remove_instruction(ins); 3539 } 3540} 3541 3542static bool 3543bifrost_nir_lower_store_component(struct nir_builder *b, 3544 nir_instr *instr, void *data) 3545{ 3546 if (instr->type != nir_instr_type_intrinsic) 3547 return false; 3548 3549 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 3550 3551 if (intr->intrinsic != nir_intrinsic_store_output) 3552 return false; 3553 3554 struct hash_table_u64 *slots = 
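/* Merging sketch (illustrative): a store_output writing x at component 0 and
 * a later store_output writing z at component 2 of the same slot collapse
 * into one store of (x, undef, z) with write mask 0b101; the earlier
 * instruction is removed once its channels are folded into the later one. */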
data; 3555 unsigned component = nir_intrinsic_component(intr); 3556 nir_src *slot_src = nir_get_io_offset_src(intr); 3557 uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); 3558 3559 nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); 3560 unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); 3561 3562 nir_ssa_def *value = intr->src[0].ssa; 3563 b->cursor = nir_before_instr(&intr->instr); 3564 3565 nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); 3566 nir_ssa_def *channels[4] = { undef, undef, undef, undef }; 3567 3568 /* Copy old */ 3569 u_foreach_bit(i, mask) { 3570 assert(prev != NULL); 3571 nir_ssa_def *prev_ssa = prev->src[0].ssa; 3572 channels[i] = nir_channel(b, prev_ssa, i); 3573 } 3574 3575 /* Copy new */ 3576 unsigned new_mask = nir_intrinsic_write_mask(intr); 3577 mask |= (new_mask << component); 3578 3579 u_foreach_bit(i, new_mask) { 3580 assert(component + i < 4); 3581 channels[component + i] = nir_channel(b, value, i); 3582 } 3583 3584 intr->num_components = util_last_bit(mask); 3585 nir_instr_rewrite_src_ssa(instr, &intr->src[0], 3586 nir_vec(b, channels, intr->num_components)); 3587 3588 nir_intrinsic_set_component(intr, 0); 3589 nir_intrinsic_set_write_mask(intr, mask); 3590 3591 if (prev) { 3592 _mesa_hash_table_u64_remove(slots, slot); 3593 nir_instr_remove(&prev->instr); 3594 } 3595 3596 _mesa_hash_table_u64_insert(slots, slot, intr); 3597 return false; 3598} 3599 3600/* Dead code elimination for branches at the end of a block - only one branch 3601 * per block is legal semantically, but unreachable jumps can be generated. 3602 * Likewise we can generate jumps to the terminal block which need to be 3603 * lowered away to a jump to #0x0, which induces successful termination. */ 3604 3605static void 3606bi_lower_branch(bi_block *block) 3607{ 3608 bool branched = false; 3609 ASSERTED bool was_jump = false; 3610 3611 bi_foreach_instr_in_block_safe(block, ins) { 3612 if (!ins->branch_target) continue; 3613 3614 if (branched) { 3615 assert(was_jump && (ins->op == BI_OPCODE_JUMP)); 3616 bi_remove_instruction(ins); 3617 continue; 3618 } 3619 3620 branched = true; 3621 was_jump = ins->op == BI_OPCODE_JUMP; 3622 3623 if (bi_is_terminal_block(ins->branch_target)) 3624 ins->branch_target = NULL; 3625 } 3626} 3627 3628static void 3629bi_pack_clauses(bi_context *ctx, struct util_dynarray *binary) 3630{ 3631 unsigned final_clause = bi_pack(ctx, binary); 3632 3633 /* If we need to wait for ATEST or BLEND in the first clause, pass the 3634 * corresponding bits through to the renderer state descriptor */ 3635 bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); 3636 bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); 3637 3638 unsigned first_deps = first_clause ? 
        ctx->info->bifrost.wait_6 = (first_deps & (1 << 6));
        ctx->info->bifrost.wait_7 = (first_deps & (1 << 7));

        /* Pad the shader with enough zero bytes to trick the prefetcher,
         * unless we're compiling an empty shader (in which case we don't pad
         * so the size remains 0) */
        unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause;

        if (binary->size) {
                memset(util_dynarray_grow(binary, uint8_t, prefetch_size),
                       0, prefetch_size);
        }
}

void
bifrost_compile_shader_nir(nir_shader *nir,
                           const struct panfrost_compile_inputs *inputs,
                           struct util_dynarray *binary,
                           struct pan_shader_info *info)
{
        bifrost_debug = debug_get_option_bifrost_debug();

        bi_context *ctx = rzalloc(NULL, bi_context);
        ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx);

        ctx->inputs = inputs;
        ctx->nir = nir;
        ctx->info = info;
        ctx->stage = nir->info.stage;
        ctx->quirks = bifrost_get_quirks(inputs->gpu_id);
        ctx->arch = inputs->gpu_id >> 12;

        /* If nothing is pushed, all UBOs need to be uploaded */
        ctx->ubo_mask = ~0;

        list_inithead(&ctx->blocks);

        /* Lower gl_Position pre-optimisation, but after lowering vars to ssa
         * (so we don't accidentally duplicate the epilogue since mesa/st has
         * messed with our I/O quite a bit already) */

        NIR_PASS_V(nir, nir_lower_vars_to_ssa);

        if (ctx->stage == MESA_SHADER_VERTEX) {
                NIR_PASS_V(nir, nir_lower_viewport_transform);
                NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0);
        }

        /* Lower large arrays to scratch and small arrays to bcsel (TODO: tune
         * threshold, but not until addresses / csel is optimized better) */
        NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16,
                   glsl_get_natural_size_align_bytes);
        NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0);

        NIR_PASS_V(nir, nir_split_var_copies);
        NIR_PASS_V(nir, nir_lower_global_vars_to_local);
        NIR_PASS_V(nir, nir_lower_var_copies);
        NIR_PASS_V(nir, nir_lower_vars_to_ssa);
        NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   glsl_type_size, 0);

        if (ctx->stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out,
                           ~0, false);
        } else {
                struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx);
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                           bifrost_nir_lower_store_component,
                           nir_metadata_block_index |
                           nir_metadata_dominance, stores);
                _mesa_hash_table_u64_destroy(stores);
        }

        NIR_PASS_V(nir, nir_lower_ssbo);
        NIR_PASS_V(nir, pan_nir_lower_zs_store);
        NIR_PASS_V(nir, pan_lower_sample_pos);
        NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL);

        if (nir->info.stage == MESA_SHADER_FRAGMENT) {
                NIR_PASS_V(nir, nir_shader_instructions_pass,
                           bifrost_nir_lower_i8_frag,
                           nir_metadata_block_index | nir_metadata_dominance,
                           NULL);
        }

        bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend);

        NIR_PASS_V(nir, pan_nir_reorder_writeout);

        bool skip_internal = nir->info.internal;
        skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                nir_print_shader(nir, stdout);
        }

        info->tls_size = nir->scratch_size;

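        /* Translate each NIR function's control flow into BIR blocks. Only
         * the first implementation (the entry point) is emitted; see the
         * TODO on multi-function shaders below. */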
        nir_foreach_function(func, nir) {
                if (!func->impl)
                        continue;

                ctx->ssa_alloc += func->impl->ssa_alloc;
                ctx->reg_alloc += func->impl->reg_alloc;

                emit_cf_list(ctx, &func->impl->body);
                break; /* TODO: Multi-function shaders */
        }

        unsigned block_source_count = 0;

        bi_foreach_block(ctx, block) {
                /* Name blocks now that we're done emitting so the order is
                 * consistent */
                block->name = block_source_count++;
        }

        bi_validate(ctx, "NIR -> BIR");

        /* If the shader doesn't write any colour or depth outputs, it may
         * still need an ATEST at the very end! */
        bool need_dummy_atest =
                (ctx->stage == MESA_SHADER_FRAGMENT) &&
                !ctx->emitted_atest &&
                !bi_skip_atest(ctx, false);

        if (need_dummy_atest) {
                bi_block *end = list_last_entry(&ctx->blocks, bi_block, link);
                bi_builder b = bi_init_builder(ctx, bi_after_block(end));
                bi_emit_atest(&b, bi_zero());
        }

        bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT);

        /* Runs before constant folding */
        bi_lower_swizzle(ctx);
        bi_validate(ctx, "Early lowering");

        /* Runs before copy prop */
        if (optimize && !ctx->inputs->no_ubo_to_push) {
                bi_opt_push_ubo(ctx);
        }

        if (likely(optimize)) {
                bi_opt_copy_prop(ctx);
                bi_opt_constant_fold(ctx);
                bi_opt_copy_prop(ctx);
                bi_opt_mod_prop_forward(ctx);
                bi_opt_mod_prop_backward(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_opt_cse(ctx);
                bi_opt_dead_code_eliminate(ctx);
                bi_validate(ctx, "Optimization passes");
        }

        bi_foreach_instr_global(ctx, I) {
                bi_lower_opt_instruction(I);
        }

        bi_foreach_block(ctx, block) {
                bi_lower_branch(block);
        }

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);
        bi_lower_fau(ctx);

        /* Analyze before register allocation to avoid false dependencies. The
         * skip bit is a function of only the data flow graph and is invariant
         * under valid scheduling. */
        bi_analyze_helper_requirements(ctx);
        bi_validate(ctx, "Late lowering");

        bi_register_allocate(ctx);

        if (likely(optimize))
                bi_opt_post_ra(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        if (ctx->arch <= 8) {
                bi_schedule(ctx);
                bi_assign_scoreboard(ctx);
        }

        /* Analyze after scheduling since we depend on instruction order. */
        bi_analyze_helper_terminate(ctx);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal)
                bi_print_shader(ctx, stdout);

        if (ctx->arch <= 8) {
                bi_pack_clauses(ctx, binary);
        } else {
                /* TODO: pack flat */
        }

        info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos);

        if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) {
                disassemble_bifrost(stdout, binary->data, binary->size,
                                    bifrost_debug & BIFROST_DBG_VERBOSE);
                fflush(stdout);
        }

        if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) &&
            !skip_internal) {
                bi_print_stats(ctx, binary->size, stderr);
        }

        _mesa_hash_table_u64_destroy(ctx->sysval_to_id);
        ralloc_free(ctx);
}