1/* 2 * Copyright (C) 2021 Alyssa Rosenzweig <alyssa@rosenzweig.io> 3 * Copyright (C) 2020 Collabora Ltd. 4 * Copyright © 2016 Broadcom 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the "Software"), 8 * to deal in the Software without restriction, including without limitation 9 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 10 * and/or sell copies of the Software, and to permit persons to whom the 11 * Software is furnished to do so, subject to the following conditions: 12 * 13 * The above copyright notice and this permission notice (including the next 14 * paragraph) shall be included in all copies or substantial portions of the 15 * Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 23 * SOFTWARE. 
24 */ 25 26#include "main/mtypes.h" 27#include "compiler/nir_types.h" 28#include "compiler/nir/nir_builder.h" 29#include "util/u_debug.h" 30#include "util/fast_idiv_by_const.h" 31#include "agx_compile.h" 32#include "agx_compiler.h" 33#include "agx_builder.h" 34 35static const struct debug_named_value agx_debug_options[] = { 36 {"msgs", AGX_DBG_MSGS, "Print debug messages"}, 37 {"shaders", AGX_DBG_SHADERS, "Dump shaders in NIR and AIR"}, 38 {"shaderdb", AGX_DBG_SHADERDB, "Print statistics"}, 39 {"verbose", AGX_DBG_VERBOSE, "Disassemble verbosely"}, 40 {"internal", AGX_DBG_INTERNAL, "Dump even internal shaders"}, 41 DEBUG_NAMED_VALUE_END 42}; 43 44DEBUG_GET_ONCE_FLAGS_OPTION(agx_debug, "AGX_MESA_DEBUG", agx_debug_options, 0) 45 46int agx_debug = 0; 47 48#define DBG(fmt, ...) \ 49 do { if (agx_debug & AGX_DBG_MSGS) \ 50 fprintf(stderr, "%s:%d: "fmt, \ 51 __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 52 53static void 54agx_block_add_successor(agx_block *block, agx_block *successor) 55{ 56 assert(block != NULL && successor != NULL); 57 58 /* Cull impossible edges */ 59 if (block->unconditional_jumps) 60 return; 61 62 for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { 63 if (block->successors[i]) { 64 if (block->successors[i] == successor) 65 return; 66 else 67 continue; 68 } 69 70 block->successors[i] = successor; 71 _mesa_set_add(successor->predecessors, block); 72 return; 73 } 74 75 unreachable("Too many successors"); 76} 77 78static void 79agx_emit_load_const(agx_builder *b, nir_load_const_instr *instr) 80{ 81 /* Ensure we've been scalarized and bit size lowered */ 82 unsigned bit_size = instr->def.bit_size; 83 assert(instr->def.num_components == 1); 84 assert(bit_size == 1 || bit_size == 16 || bit_size == 32); 85 86 /* Emit move, later passes can inline/push if useful */ 87 agx_mov_imm_to(b, 88 agx_get_index(instr->def.index, agx_size_for_bits(bit_size)), 89 nir_const_value_as_uint(instr->value[0], bit_size)); 90} 91 92/* Emit code dividing P by Q 
*/ 93static agx_index 94agx_udiv_const(agx_builder *b, agx_index P, uint32_t Q) 95{ 96 /* P / 1 = P */ 97 if (Q == 1) { 98 return P; 99 } 100 101 /* P / UINT32_MAX = 0, unless P = UINT32_MAX when it's one */ 102 if (Q == UINT32_MAX) { 103 agx_index max = agx_mov_imm(b, 32, UINT32_MAX); 104 agx_index one = agx_mov_imm(b, 32, 1); 105 return agx_icmpsel(b, P, max, one, agx_zero(), AGX_ICOND_UEQ); 106 } 107 108 /* P / 2^N = P >> N */ 109 if (util_is_power_of_two_or_zero(Q)) { 110 return agx_ushr(b, P, agx_mov_imm(b, 32, util_logbase2(Q))); 111 } 112 113 /* Fall back on multiplication by a magic number */ 114 struct util_fast_udiv_info info = util_compute_fast_udiv_info(Q, 32, 32); 115 agx_index preshift = agx_mov_imm(b, 32, info.pre_shift); 116 agx_index increment = agx_mov_imm(b, 32, info.increment); 117 agx_index postshift = agx_mov_imm(b, 32, info.post_shift); 118 agx_index multiplier = agx_mov_imm(b, 32, info.multiplier); 119 agx_index multiplied = agx_temp(b->shader, AGX_SIZE_64); 120 agx_index n = P; 121 122 if (info.pre_shift != 0) n = agx_ushr(b, n, preshift); 123 if (info.increment != 0) n = agx_iadd(b, n, increment, 0); 124 125 /* 64-bit multiplication, zero extending 32-bit x 32-bit, get the top word */ 126 agx_imad_to(b, multiplied, agx_abs(n), agx_abs(multiplier), agx_zero(), 0); 127 n = agx_temp(b->shader, AGX_SIZE_32); 128 agx_p_extract_to(b, n, multiplied, 1); 129 130 if (info.post_shift != 0) n = agx_ushr(b, n, postshift); 131 132 return n; 133} 134 135/* AGX appears to lack support for vertex attributes. Lower to global loads. 
 */
static agx_instr *
agx_emit_load_attr(agx_builder *b, nir_intrinsic_instr *instr)
{
   nir_src *offset_src = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset_src) && "no attribute indirects");
   unsigned index = nir_intrinsic_base(instr) +
                    nir_src_as_uint(*offset_src);

   struct agx_shader_key *key = b->shader->key;
   struct agx_attribute attrib = key->vs.attributes[index];

   /* address = base + (stride * vertex_id) + src_offset */
   unsigned buf = attrib.buf;
   unsigned stride = key->vs.vbuf_strides[buf];
   unsigned shift = agx_format_shift(attrib.format);

   /* The stride is shifted out by the format's element size so the imad below
    * computes an element (not byte) offset into the VBO */
   agx_index shifted_stride = agx_mov_imm(b, 32, stride >> shift);
   agx_index src_offset = agx_mov_imm(b, 32, attrib.src_offset);

   /* Fixed registers for vertex/instance ID -- assumed ABI, TODO confirm */
   agx_index vertex_id = agx_register(10, AGX_SIZE_32);
   agx_index instance_id = agx_register(12, AGX_SIZE_32);

   /* A nonzero divisor requires dividing the instance ID. A zero divisor
    * specifies per-vertex data (indexed by vertex ID). */
   agx_index element_id = (attrib.divisor == 0) ? vertex_id :
                          agx_udiv_const(b, instance_id, attrib.divisor);

   agx_index offset = agx_imad(b, element_id, shifted_stride, src_offset, 0);

   /* Each VBO has a 64-bit = 4 x 16-bit address, lookup the base address as a sysval */
   unsigned num_vbos = key->vs.num_vbufs;
   unsigned base_length = (num_vbos * 4);
   agx_index base = agx_indexed_sysval(b->shader,
                                       AGX_PUSH_VBO_BASES, AGX_SIZE_64,
                                       buf * 4, base_length);

   /* Load the data */
   assert(instr->num_components <= 4);

   /* If the attribute has fewer components than the shader reads, load into a
    * temporary and pad out with (0, 0, 0, 1) afterwards */
   bool pad = ((attrib.nr_comps_minus_1 + 1) < instr->num_components);
   agx_index real_dest = agx_dest_index(&instr->dest);
   agx_index dest = pad ? agx_temp(b->shader, AGX_SIZE_32) : real_dest;

   agx_device_load_to(b, dest, base, offset, attrib.format,
                      BITFIELD_MASK(attrib.nr_comps_minus_1 + 1), 0);

   agx_wait(b, 0);

   if (pad) {
      agx_index one = agx_mov_imm(b, 32, fui(1.0));
      agx_index zero = agx_mov_imm(b, 32, 0);
      agx_index channels[4] = { zero, zero, zero, one };
      for (unsigned i = 0; i < (attrib.nr_comps_minus_1 + 1); ++i)
         channels[i] = agx_p_extract(b, dest, i);
      for (unsigned i = instr->num_components; i < 4; ++i)
         channels[i] = agx_null();
      agx_p_combine_to(b, real_dest, channels[0], channels[1], channels[2], channels[3]);
   }

   return NULL;
}

/* Load a flat-shaded fragment input one scalar channel at a time */
static agx_instr *
agx_emit_load_vary_flat(agx_builder *b, nir_intrinsic_instr *instr)
{
   unsigned components = instr->num_components;
   assert(components >= 1 && components <= 4);

   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "no indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_src_as_uint(*offset);

   agx_index chan[4] = { agx_null() };

   for (unsigned i = 0; i < components; ++i) {
      /* vec3 for each vertex, unknown what first 2 channels are for */
      agx_index values = agx_ld_vary_flat(b, agx_immediate(imm_index + i), 1);
      chan[i] = agx_p_extract(b, values, 2);
   }

   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
                           chan[0], chan[1], chan[2], chan[3]);
}

/* Load a perspective-interpolated fragment input (load_interpolated_input) */
static agx_instr *
agx_emit_load_vary(agx_builder *b, nir_intrinsic_instr *instr)
{
   ASSERTED unsigned components = instr->num_components;
   ASSERTED nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]);

   assert(components >= 1 && components <= 4);
   assert(parent);

   /* TODO: Interpolation modes */
   assert(parent->intrinsic == nir_intrinsic_load_barycentric_pixel);

   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "no indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_src_as_uint(*offset) * 4;

   return agx_ld_vary_to(b, agx_dest_index(&instr->dest),
                         agx_immediate(imm_index), components, true);
}

/* Store a scalar vertex-shader output (NIR has been scalarized beforehand) */
static agx_instr *
agx_emit_store_vary(agx_builder *b, nir_intrinsic_instr *instr)
{
   nir_src *offset = nir_get_io_offset_src(instr);
   assert(nir_src_is_const(*offset) && "todo: indirects");
   unsigned imm_index = b->shader->varyings[nir_intrinsic_base(instr)];
   imm_index += nir_intrinsic_component(instr);
   imm_index += nir_src_as_uint(*offset);

   /* nir_lower_io_to_scalar */
   assert(nir_intrinsic_write_mask(instr) == 0x1);

   return agx_st_vary(b,
                      agx_immediate(imm_index),
                      agx_src_index(&instr->src[0]));
}

/* Store a fragment output to the tilebuffer, emitting writeout control words
 * before the first store. The magic writeout constants were observed, not
 * derived -- see the TODO below. */
static agx_instr *
agx_emit_fragment_out(agx_builder *b, nir_intrinsic_instr *instr)
{
   const nir_variable *var =
      nir_find_variable_with_driver_location(b->shader->nir,
            nir_var_shader_out, nir_intrinsic_base(instr));
   assert(var);

   unsigned loc = var->data.location;
   assert(var->data.index == 0 && "todo: dual-source blending");
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
   unsigned rt = (loc - FRAG_RESULT_DATA0);

   /* TODO: Reverse-engineer interactions with MRT */
   if (b->shader->nir->info.internal) {
      /* clear */
   } else if (b->shader->did_writeout) {
      agx_writeout(b, 0x0004);
   } else {
      agx_writeout(b, 0xC200);
      agx_writeout(b, 0x000C);
   }

   b->shader->did_writeout = true;
   return agx_st_tile(b, agx_src_index(&instr->src[0]),
                      b->shader->key->fs.tib_formats[rt]);
}

/* Read back a fragment output from the tilebuffer (load_output), used for
 * framebuffer fetch / blending */
static agx_instr *
agx_emit_load_tile(agx_builder *b, nir_intrinsic_instr *instr)
{
   const nir_variable *var =
      nir_find_variable_with_driver_location(b->shader->nir,
            nir_var_shader_out, nir_intrinsic_base(instr));
   assert(var);

   unsigned
   loc = var->data.location;
   assert(var->data.index == 0 && "todo: dual-source blending");
   assert(loc == FRAG_RESULT_DATA0 && "todo: MRT");
   unsigned rt = (loc - FRAG_RESULT_DATA0);

   /* TODO: Reverse-engineer interactions with MRT */
   agx_writeout(b, 0xC200);
   agx_writeout(b, 0x0008);
   b->shader->did_writeout = true;
   b->shader->out->reads_tib = true;

   return agx_ld_tile_to(b, agx_dest_index(&instr->dest),
                         b->shader->key->fs.tib_formats[rt]);
}

/* Map a load/store bit width to the corresponding untyped AGX memory format */
static enum agx_format
agx_format_for_bits(unsigned bits)
{
   switch (bits) {
   case 8: return AGX_FORMAT_I8;
   case 16: return AGX_FORMAT_I16;
   case 32: return AGX_FORMAT_I32;
   default: unreachable("Invalid bit size for load/store");
   }
}

/* Lower a UBO (or OpenCL kernel input) load to a device load from the
 * per-block base address pushed as a sysval */
static agx_instr *
agx_emit_load_ubo(agx_builder *b, nir_intrinsic_instr *instr)
{
   bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input);
   nir_src *offset = nir_get_io_offset_src(instr);

   if (!kernel_input && !nir_src_is_const(instr->src[0]))
      unreachable("todo: indirect UBO access");

   /* Constant offsets for device_load are 16-bit */
   bool offset_is_const = nir_src_is_const(*offset);
   assert(offset_is_const && "todo: indirect UBO access");
   int32_t const_offset = offset_is_const ? nir_src_as_int(*offset) : 0;

   /* Offsets are shifted by the type size, so divide that out */
   unsigned bytes = nir_dest_bit_size(instr->dest) / 8;
   assert((const_offset & (bytes - 1)) == 0);
   const_offset = const_offset / bytes;
   /* Truncation check below decides immediate vs. moved 32-bit offset */
   int16_t const_as_16 = const_offset;

   /* UBO blocks are specified (kernel inputs are always 0) */
   uint32_t block = kernel_input ? 0 : nir_src_as_uint(instr->src[0]);

   /* Each UBO has a 64-bit = 4 x 16-bit address */
   unsigned num_ubos = b->shader->nir->info.num_ubos;
   unsigned base_length = (num_ubos * 4);
   unsigned index = block * 4; /* 16 bit units */

   /* Lookup the base address (TODO: indirection) */
   agx_index base = agx_indexed_sysval(b->shader,
                                       AGX_PUSH_UBO_BASES, AGX_SIZE_64,
                                       index, base_length);

   /* Load the data */
   assert(instr->num_components <= 4);

   agx_device_load_to(b, agx_dest_index(&instr->dest),
                      base,
                      (offset_is_const && (const_offset == const_as_16)) ?
                      agx_immediate(const_as_16) : agx_mov_imm(b, 32, const_offset),
                      agx_format_for_bits(nir_dest_bit_size(instr->dest)),
                      BITFIELD_MASK(instr->num_components), 0);

   return agx_wait(b, 0);
}

/* Build gl_FragCoord: xy from the thread position in grid (+0.5 for pixel
 * centers), zw loaded as varyings per the ABI */
static agx_instr *
agx_emit_load_frag_coord(agx_builder *b, nir_intrinsic_instr *instr)
{
   agx_index xy[2];

   for (unsigned i = 0; i < 2; ++i) {
      xy[i] = agx_fadd(b, agx_convert(b, agx_immediate(AGX_CONVERT_U32_TO_F),
               agx_get_sr(b, 32, AGX_SR_THREAD_POSITION_IN_GRID_X + i),
               AGX_ROUND_RTE), agx_immediate_f(0.5f));
   }

   /* Ordering by the ABI */
   agx_index z = agx_ld_vary(b, agx_immediate(1), 1, false);
   agx_index w = agx_ld_vary(b, agx_immediate(0), 1, false);

   return agx_p_combine_to(b, agx_dest_index(&instr->dest),
                           xy[0], xy[1], z, w);
}

/* Move one channel (comp) of the pushed blend constant colour into dst */
static agx_instr *
agx_blend_const(agx_builder *b, agx_index dst, unsigned comp)
{
   agx_index val = agx_indexed_sysval(b->shader,
         AGX_PUSH_BLEND_CONST, AGX_SIZE_32, comp * 2, 4 * 2);

   return agx_mov_to(b, dst, val);
}

/* Central dispatch for NIR intrinsics, fanning out to the emitters above */
static agx_instr *
agx_emit_intrinsic(agx_builder *b, nir_intrinsic_instr *instr)
{
   agx_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ?
      agx_dest_index(&instr->dest) : agx_null();
   gl_shader_stage stage = b->shader->stage;

   switch (instr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_centroid:
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
   case nir_intrinsic_load_barycentric_at_offset:
      /* handled later via load_vary */
      return NULL;
   case nir_intrinsic_load_interpolated_input:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_vary(b, instr);

   case nir_intrinsic_load_input:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_load_vary_flat(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_load_attr(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_store_output:
      if (stage == MESA_SHADER_FRAGMENT)
         return agx_emit_fragment_out(b, instr);
      else if (stage == MESA_SHADER_VERTEX)
         return agx_emit_store_vary(b, instr);
      else
         unreachable("Unsupported shader stage");

   case nir_intrinsic_load_output:
      assert(stage == MESA_SHADER_FRAGMENT);
      return agx_emit_load_tile(b, instr);

   case nir_intrinsic_load_ubo:
   case nir_intrinsic_load_kernel_input:
      return agx_emit_load_ubo(b, instr);

   case nir_intrinsic_load_frag_coord:
      return agx_emit_load_frag_coord(b, instr);

   case nir_intrinsic_load_back_face_agx:
      return agx_get_sr_to(b, dst, AGX_SR_BACKFACING);

   /* Vertex/instance ID live in fixed registers r10/r12; agx_abs marks the
    * source as unsigned-extended -- TODO confirm exact modifier semantics */
   case nir_intrinsic_load_vertex_id:
      return agx_mov_to(b, dst, agx_abs(agx_register(10, AGX_SIZE_32)));

   case nir_intrinsic_load_instance_id:
      return agx_mov_to(b, dst, agx_abs(agx_register(12, AGX_SIZE_32)));

   case nir_intrinsic_load_blend_const_color_r_float: return agx_blend_const(b, dst, 0);
   case nir_intrinsic_load_blend_const_color_g_float: return agx_blend_const(b, dst, 1);
   case nir_intrinsic_load_blend_const_color_b_float: return agx_blend_const(b, dst, 2);
   case nir_intrinsic_load_blend_const_color_a_float: return agx_blend_const(b, dst, 3);

   default:
      fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name);
      unreachable("Unhandled intrinsic");
   }
}

/* Resolve a NIR ALU source to a scalar agx_index, extracting the selected
 * swizzle channel when the source is a vector */
static agx_index
agx_alu_src_index(agx_builder *b, nir_alu_src src)
{
   /* Check well-formedness of the input NIR */
   ASSERTED unsigned bitsize = nir_src_bit_size(src.src);
   unsigned comps = nir_src_num_components(src.src);
   unsigned channel = src.swizzle[0];

   assert(bitsize == 1 || bitsize == 16 || bitsize == 32 || bitsize == 64);
   assert(!(src.negate || src.abs));
   assert(channel < comps);

   agx_index idx = agx_src_index(&src.src);

   /* We only deal with scalars, emit p_extract if needed */
   if (comps > 1)
      return agx_p_extract(b, idx, channel);
   else
      return idx;
}

static agx_instr *
agx_emit_alu_bool(agx_builder *b, nir_op op,
                  agx_index dst, agx_index s0, agx_index s1, agx_index s2)
{
   /* Handle 1-bit bools as zero/nonzero rather than specifically 0/1 or 0/~0.
    * This will give the optimizer flexibility.
    */
   agx_index f = agx_immediate(0);
   agx_index t = agx_immediate(0x1);

   switch (op) {
   /* Float comparisons select t/f directly; fneu swaps the select operands
    * of an EQ compare instead of using a NE condition */
   case nir_op_feq: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_EQ);
   case nir_op_flt: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_LT);
   case nir_op_fge: return agx_fcmpsel_to(b, dst, s0, s1, t, f, AGX_FCOND_GE);
   case nir_op_fneu: return agx_fcmpsel_to(b, dst, s0, s1, f, t, AGX_FCOND_EQ);

   case nir_op_ieq: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_UEQ);
   case nir_op_ine: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_UEQ);
   case nir_op_ilt: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_SLT);
   case nir_op_ige: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_SLT);
   case nir_op_ult: return agx_icmpsel_to(b, dst, s0, s1, t, f, AGX_ICOND_ULT);
   case nir_op_uge: return agx_icmpsel_to(b, dst, s0, s1, f, t, AGX_ICOND_ULT);

   case nir_op_mov: return agx_mov_to(b, dst, s0);
   case nir_op_iand: return agx_and_to(b, dst, s0, s1);
   case nir_op_ior: return agx_or_to(b, dst, s0, s1);
   case nir_op_ixor: return agx_xor_to(b, dst, s0, s1);
   case nir_op_inot: return agx_xor_to(b, dst, s0, t);

   case nir_op_f2b1: return agx_fcmpsel_to(b, dst, s0, f, f, t, AGX_FCOND_EQ);
   case nir_op_i2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);
   case nir_op_b2b1: return agx_icmpsel_to(b, dst, s0, f, f, t, AGX_ICOND_UEQ);

   case nir_op_bcsel:
      return agx_icmpsel_to(b, dst, s0, f, s2, s1, AGX_ICOND_UEQ);

   default:
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name);
      unreachable("Unhandled boolean ALU instruction");
   }
}

/* Lower a (scalarized) NIR ALU instruction to AIR */
static agx_instr *
agx_emit_alu(agx_builder *b, nir_alu_instr *instr)
{
   unsigned srcs = nir_op_infos[instr->op].num_inputs;
   unsigned sz = nir_dest_bit_size(instr->dest.dest);
   unsigned src_sz = srcs ? nir_src_bit_size(instr->src[0].src) : 0;
   ASSERTED unsigned comps = nir_dest_num_components(instr->dest.dest);

   assert(comps == 1 || nir_op_is_vec(instr->op));
   assert(sz == 1 || sz == 16 || sz == 32 || sz == 64);

   agx_index dst = agx_dest_index(&instr->dest.dest);
   agx_index s0 = srcs > 0 ? agx_alu_src_index(b, instr->src[0]) : agx_null();
   agx_index s1 = srcs > 1 ? agx_alu_src_index(b, instr->src[1]) : agx_null();
   agx_index s2 = srcs > 2 ? agx_alu_src_index(b, instr->src[2]) : agx_null();
   agx_index s3 = srcs > 3 ? agx_alu_src_index(b, instr->src[3]) : agx_null();

   /* 1-bit bools are a bit special, only handle with select ops */
   if (sz == 1)
      return agx_emit_alu_bool(b, instr->op, dst, s0, s1, s2);

#define UNOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0);
#define BINOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1);
#define TRIOP(nop, aop) \
   case nir_op_ ## nop: return agx_ ## aop ## _to(b, dst, s0, s1, s2);

   switch (instr->op) {
   BINOP(fadd, fadd);
   BINOP(fmul, fmul);
   TRIOP(ffma, fma);

   UNOP(f2f16, fmov);
   UNOP(f2f32, fmov);
   UNOP(fround_even, roundeven);
   UNOP(ftrunc, trunc);
   UNOP(ffloor, floor);
   UNOP(fceil, ceil);
   UNOP(frcp, rcp);
   UNOP(frsq, rsqrt);
   UNOP(flog2, log2);
   UNOP(fexp2, exp2);

   UNOP(fddx, dfdx);
   UNOP(fddx_coarse, dfdx);
   UNOP(fddx_fine, dfdx);

   UNOP(fddy, dfdy);
   UNOP(fddy_coarse, dfdy);
   UNOP(fddy_fine, dfdy);

   UNOP(mov, mov);
   UNOP(u2u16, mov);
   UNOP(u2u32, mov);
   UNOP(inot, not);
   BINOP(iand, and);
   BINOP(ior, or);
   BINOP(ixor, xor);

   /* sqrt(x) = x * rsqrt(x) */
   case nir_op_fsqrt: return agx_fmul_to(b, dst, s0, agx_srsqrt(b, s0));
   case nir_op_fsub: return agx_fadd_to(b, dst, s0, agx_neg(s1));
   case nir_op_fabs: return agx_fmov_to(b, dst, agx_abs(s0));
   case nir_op_fneg: return agx_fmov_to(b, dst, agx_neg(s0));

   /* min/max lower to compare-and-select */
   case nir_op_fmin: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_LTN);
   case nir_op_fmax: return agx_fcmpsel_to(b, dst, s0, s1, s0, s1, AGX_FCOND_GTN);
   case nir_op_imin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SLT);
   case nir_op_imax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_SGT);
   case nir_op_umin: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_ULT);
   case nir_op_umax: return agx_icmpsel_to(b, dst, s0, s1, s0, s1, AGX_ICOND_UGT);

   case nir_op_iadd: return agx_iadd_to(b, dst, s0, s1, 0);
   case nir_op_isub: return agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
   case nir_op_ineg: return agx_iadd_to(b, dst, agx_zero(), agx_neg(s0), 0);
   case nir_op_imul: return agx_imad_to(b, dst, s0, s1, agx_zero(), 0);

   /* ishl lowers to a bitfield insert into zero */
   case nir_op_ishl: return agx_bfi_to(b, dst, agx_zero(), s0, s1, 0);
   case nir_op_ushr: return agx_ushr_to(b, dst, s0, s1);
   case nir_op_ishr: return agx_asr_to(b, dst, s0, s1);

   case nir_op_bcsel:
      return agx_icmpsel_to(b, dst, s0, agx_zero(), s2, s1, AGX_ICOND_UEQ);

   case nir_op_b2i32:
   case nir_op_b2i16:
      return agx_icmpsel_to(b, dst, s0, agx_zero(), agx_zero(), agx_immediate(1), AGX_ICOND_UEQ);

   case nir_op_b2f16:
   case nir_op_b2f32:
   {
      /* At this point, boolean is just zero/nonzero, so compare with zero */
      agx_index one = (sz == 16) ?
         agx_mov_imm(b, 16, _mesa_float_to_half(1.0)) :
         agx_mov_imm(b, 32, fui(1.0));

      agx_index zero = agx_zero();

      return agx_fcmpsel_to(b, dst, s0, zero, zero, one, AGX_FCOND_EQ);
   }

   case nir_op_i2i32:
   {
      if (s0.size != AGX_SIZE_16)
         unreachable("todo: more conversions");

      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
   }

   case nir_op_i2i16:
   {
      if (s0.size != AGX_SIZE_32)
         unreachable("todo: more conversions");

      return agx_iadd_to(b, dst, s0, agx_zero(), 0);
   }

   /* Saturating arithmetic reuses iadd with the saturate flag set;
    * agx_abs on the unsigned variants marks sources as unsigned */
   case nir_op_iadd_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, s0, s1, 0);
      I->saturate = true;
      return I;
   }

   case nir_op_isub_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, s0, agx_neg(s1), 0);
      I->saturate = true;
      return I;
   }

   case nir_op_uadd_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_abs(s1), 0);
      I->saturate = true;
      return I;
   }

   case nir_op_usub_sat:
   {
      agx_instr *I = agx_iadd_to(b, dst, agx_abs(s0), agx_neg(agx_abs(s1)), 0);
      I->saturate = true;
      return I;
   }

   /* fsat(x) = saturating (x + -0.0) */
   case nir_op_fsat:
   {
      agx_instr *I = agx_fadd_to(b, dst, s0, agx_negzero());
      I->saturate = true;
      return I;
   }

   /* See the sine comment at the bottom of this file */
   case nir_op_fsin_agx:
   {
      agx_index fixup = agx_sin_pt_1(b, s0);
      agx_index sinc = agx_sin_pt_2(b, fixup);
      return agx_fmul_to(b, dst, sinc, fixup);
   }

   case nir_op_f2i16:
      return agx_convert_to(b, dst,
            agx_immediate(AGX_CONVERT_F_TO_S16), s0, AGX_ROUND_RTZ);

   case nir_op_f2i32:
      return agx_convert_to(b, dst,
            agx_immediate(AGX_CONVERT_F_TO_S32), s0, AGX_ROUND_RTZ);

   case nir_op_f2u16:
      return agx_convert_to(b, dst,
            agx_immediate(AGX_CONVERT_F_TO_U16), s0, AGX_ROUND_RTZ);

   case nir_op_f2u32:
      return agx_convert_to(b, dst,
            agx_immediate(AGX_CONVERT_F_TO_U32), s0, AGX_ROUND_RTZ);

   case nir_op_u2f16:
   case nir_op_u2f32:
   {
      if (src_sz == 64)
         unreachable("64-bit conversions unimplemented");

      enum agx_convert mode =
         (src_sz == 32) ? AGX_CONVERT_U32_TO_F :
         (src_sz == 16) ? AGX_CONVERT_U16_TO_F :
                          AGX_CONVERT_U8_TO_F;

      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
   }

   case nir_op_i2f16:
   case nir_op_i2f32:
   {
      if (src_sz == 64)
         unreachable("64-bit conversions unimplemented");

      enum agx_convert mode =
         (src_sz == 32) ? AGX_CONVERT_S32_TO_F :
         (src_sz == 16) ? AGX_CONVERT_S16_TO_F :
                          AGX_CONVERT_S8_TO_F;

      return agx_convert_to(b, dst, agx_immediate(mode), s0, AGX_ROUND_RTE);
   }

   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      return agx_p_combine_to(b, dst, s0, s1, s2, s3);

   case nir_op_vec8:
   case nir_op_vec16:
      unreachable("should've been lowered");

   default:
      fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name);
      unreachable("Unhandled ALU instruction");
   }
}

/* Map a GLSL sampler dimension (+ arrayness) to the AGX texture dimension */
static enum agx_dim
agx_tex_dim(enum glsl_sampler_dim dim, bool array)
{
   switch (dim) {
   case GLSL_SAMPLER_DIM_1D:
   case GLSL_SAMPLER_DIM_BUF:
      return array ? AGX_DIM_TEX_1D_ARRAY : AGX_DIM_TEX_1D;

   case GLSL_SAMPLER_DIM_2D:
   case GLSL_SAMPLER_DIM_RECT:
   case GLSL_SAMPLER_DIM_EXTERNAL:
      return array ? AGX_DIM_TEX_2D_ARRAY : AGX_DIM_TEX_2D;

   case GLSL_SAMPLER_DIM_MS:
      assert(!array && "multisampled arrays unsupported");
      return AGX_DIM_TEX_2D_MS;

   case GLSL_SAMPLER_DIM_3D:
      assert(!array && "3D arrays unsupported");
      return AGX_DIM_TEX_3D;

   case GLSL_SAMPLER_DIM_CUBE:
      return array ? AGX_DIM_TEX_CUBE_ARRAY : AGX_DIM_TEX_CUBE;

   default:
      unreachable("Invalid sampler dim\n");
   }
}

/* Lower a NIR texture instruction (only plain tex/txl so far) to an AGX
 * texture_sample followed by a wait for the result */
static void
agx_emit_tex(agx_builder *b, nir_tex_instr *instr)
{
   switch (instr->op) {
   case nir_texop_tex:
   case nir_texop_txl:
      break;
   default:
      unreachable("Unhandled texture op");
   }

   enum agx_lod_mode lod_mode = (instr->op == nir_texop_tex) ?
      AGX_LOD_MODE_AUTO_LOD : AGX_LOD_MODE_LOD_MIN;

   agx_index coords = agx_null(),
             texture = agx_immediate(instr->texture_index),
             sampler = agx_immediate(instr->sampler_index),
             lod = agx_immediate(0),
             offset = agx_null();

   for (unsigned i = 0; i < instr->num_srcs; ++i) {
      agx_index index = agx_src_index(&instr->src[i].src);

      switch (instr->src[i].src_type) {
      case nir_tex_src_coord:
         coords = index;
         break;

      case nir_tex_src_lod:
         lod = index;
         break;

      case nir_tex_src_bias:
      case nir_tex_src_ms_index:
      case nir_tex_src_offset:
      case nir_tex_src_comparator:
      case nir_tex_src_texture_offset:
      case nir_tex_src_sampler_offset:
      default:
         unreachable("todo");
      }
   }

   agx_texture_sample_to(b, agx_dest_index(&instr->dest),
                         coords, lod, texture, sampler, offset,
                         agx_tex_dim(instr->sampler_dim, instr->is_array),
                         lod_mode,
                         0xF, /* TODO: wrmask */
                         0);

   agx_wait(b, 0);
}

/* NIR loops are treated as a pair of AGX loops:
 *
 *    do {
 *       do {
 *          ...
 *       } while (0);
 *    } while (cond);
 *
 * By manipulating the nesting counter (r0l), we may break out of nested loops,
 * so under the model, both break and continue may be implemented as breaks,
 * where break breaks out of the outer loop (2 layers) and continue breaks out
 * of the inner loop (1 layer).
 *
 * After manipulating the nesting counter directly, pop_exec #0 must be used to
 * flush the update to the execution mask.
837 */ 838 839static void 840agx_emit_jump(agx_builder *b, nir_jump_instr *instr) 841{ 842 agx_context *ctx = b->shader; 843 assert (instr->type == nir_jump_break || instr->type == nir_jump_continue); 844 845 /* Break out of either one or two loops */ 846 unsigned nestings = b->shader->loop_nesting; 847 848 if (instr->type == nir_jump_continue) { 849 nestings += 1; 850 agx_block_add_successor(ctx->current_block, ctx->continue_block); 851 } else if (instr->type == nir_jump_break) { 852 nestings += 2; 853 agx_block_add_successor(ctx->current_block, ctx->break_block); 854 } 855 856 /* Update the counter and flush */ 857 agx_index r0l = agx_register(0, false); 858 agx_mov_to(b, r0l, agx_immediate(nestings)); 859 agx_pop_exec(b, 0); 860 861 ctx->current_block->unconditional_jumps = true; 862} 863 864static void 865agx_emit_instr(agx_builder *b, struct nir_instr *instr) 866{ 867 switch (instr->type) { 868 case nir_instr_type_load_const: 869 agx_emit_load_const(b, nir_instr_as_load_const(instr)); 870 break; 871 872 case nir_instr_type_intrinsic: 873 agx_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); 874 break; 875 876 case nir_instr_type_alu: 877 agx_emit_alu(b, nir_instr_as_alu(instr)); 878 break; 879 880 case nir_instr_type_tex: 881 agx_emit_tex(b, nir_instr_as_tex(instr)); 882 break; 883 884 case nir_instr_type_jump: 885 agx_emit_jump(b, nir_instr_as_jump(instr)); 886 break; 887 888 default: 889 unreachable("should've been lowered"); 890 } 891} 892 893static agx_block * 894agx_create_block(agx_context *ctx) 895{ 896 agx_block *blk = rzalloc(ctx, agx_block); 897 898 blk->predecessors = _mesa_set_create(blk, 899 _mesa_hash_pointer, _mesa_key_pointer_equal); 900 901 return blk; 902} 903 904static agx_block * 905emit_block(agx_context *ctx, nir_block *block) 906{ 907 if (ctx->after_block) { 908 ctx->current_block = ctx->after_block; 909 ctx->after_block = NULL; 910 } else { 911 ctx->current_block = agx_create_block(ctx); 912 } 913 914 agx_block *blk = 
ctx->current_block; 915 list_addtail(&blk->link, &ctx->blocks); 916 list_inithead(&blk->instructions); 917 918 agx_builder _b = agx_init_builder(ctx, agx_after_block(blk)); 919 920 nir_foreach_instr(instr, block) { 921 agx_emit_instr(&_b, instr); 922 } 923 924 return blk; 925} 926 927static agx_block * 928emit_cf_list(agx_context *ctx, struct exec_list *list); 929 930/* Emit if-else as 931 * 932 * if_icmp cond != 0 933 * ... 934 * else_icmp cond == 0 935 * ... 936 * pop_exec 937 * 938 * If the else is empty, we can omit the else_icmp. This is not usually 939 * optimal, but it's a start. 940 */ 941 942static void 943emit_if(agx_context *ctx, nir_if *nif) 944{ 945 nir_block *nir_else_block = nir_if_first_else_block(nif); 946 bool empty_else_block = 947 (nir_else_block == nir_if_last_else_block(nif) && 948 exec_list_is_empty(&nir_else_block->instr_list)); 949 950 agx_block *first_block = ctx->current_block; 951 agx_builder _b = agx_init_builder(ctx, agx_after_block(first_block)); 952 agx_index cond = agx_src_index(&nif->condition); 953 954 agx_if_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, true); 955 ctx->loop_nesting++; 956 957 /* Emit the two subblocks. 
*/ 958 agx_block *if_block = emit_cf_list(ctx, &nif->then_list); 959 agx_block *end_then = ctx->current_block; 960 961 if (!empty_else_block) { 962 _b.cursor = agx_after_block(ctx->current_block); 963 agx_else_icmp(&_b, cond, agx_zero(), 1, AGX_ICOND_UEQ, false); 964 } 965 966 agx_block *else_block = emit_cf_list(ctx, &nif->else_list); 967 agx_block *end_else = ctx->current_block; 968 969 ctx->after_block = agx_create_block(ctx); 970 971 agx_block_add_successor(first_block, if_block); 972 agx_block_add_successor(first_block, else_block); 973 agx_block_add_successor(end_then, ctx->after_block); 974 agx_block_add_successor(end_else, ctx->after_block); 975 976 _b.cursor = agx_after_block(ctx->current_block); 977 agx_pop_exec(&_b, 1); 978 ctx->loop_nesting--; 979} 980 981static void 982emit_loop(agx_context *ctx, nir_loop *nloop) 983{ 984 /* We only track nesting within the innermost loop, so reset */ 985 ctx->loop_nesting = 0; 986 987 agx_block *popped_break = ctx->break_block; 988 agx_block *popped_continue = ctx->continue_block; 989 990 ctx->break_block = agx_create_block(ctx); 991 ctx->continue_block = agx_create_block(ctx); 992 993 /* Make room for break/continue nesting (TODO: skip if no divergent CF) */ 994 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block)); 995 agx_push_exec(&_b, 2); 996 997 /* Fallthrough to body */ 998 agx_block_add_successor(ctx->current_block, ctx->continue_block); 999 1000 /* Emit the body */ 1001 ctx->after_block = ctx->continue_block; 1002 agx_block *start_block = emit_cf_list(ctx, &nloop->body); 1003 1004 /* Fix up the nesting counter via an always true while_icmp, and branch back 1005 * to start of loop if any lanes are active */ 1006 _b.cursor = agx_after_block(ctx->current_block); 1007 agx_while_icmp(&_b, agx_zero(), agx_zero(), 2, AGX_ICOND_UEQ, false); 1008 agx_jmp_exec_any(&_b, start_block); 1009 agx_pop_exec(&_b, 2); 1010 agx_block_add_successor(ctx->current_block, ctx->continue_block); 1011 1012 /* Pop 
off */ 1013 ctx->after_block = ctx->break_block; 1014 ctx->break_block = popped_break; 1015 ctx->continue_block = popped_continue; 1016 1017 /* Update shader-db stats */ 1018 ++ctx->loop_count; 1019 1020 /* All nested control flow must have finished */ 1021 assert(ctx->loop_nesting == 0); 1022} 1023 1024/* Before the first control flow structure, the nesting counter (r0l) needs to 1025 * be zeroed for correct operation. This only happens at most once, since by 1026 * definition this occurs at the end of the first block, which dominates the 1027 * rest of the program. */ 1028 1029static void 1030emit_first_cf(agx_context *ctx) 1031{ 1032 if (ctx->any_cf) 1033 return; 1034 1035 agx_builder _b = agx_init_builder(ctx, agx_after_block(ctx->current_block)); 1036 agx_index r0l = agx_register(0, false); 1037 1038 agx_mov_to(&_b, r0l, agx_immediate(0)); 1039 ctx->any_cf = true; 1040} 1041 1042static agx_block * 1043emit_cf_list(agx_context *ctx, struct exec_list *list) 1044{ 1045 agx_block *start_block = NULL; 1046 1047 foreach_list_typed(nir_cf_node, node, node, list) { 1048 switch (node->type) { 1049 case nir_cf_node_block: { 1050 agx_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 1051 1052 if (!start_block) 1053 start_block = block; 1054 1055 break; 1056 } 1057 1058 case nir_cf_node_if: 1059 emit_first_cf(ctx); 1060 emit_if(ctx, nir_cf_node_as_if(node)); 1061 break; 1062 1063 case nir_cf_node_loop: 1064 emit_first_cf(ctx); 1065 emit_loop(ctx, nir_cf_node_as_loop(node)); 1066 break; 1067 1068 default: 1069 unreachable("Unknown control flow"); 1070 } 1071 } 1072 1073 return start_block; 1074} 1075 1076static void 1077agx_set_st_vary_final(agx_context *ctx) 1078{ 1079 agx_foreach_instr_global_rev(ctx, I) { 1080 if (I->op == AGX_OPCODE_ST_VARY) { 1081 I->last = true; 1082 return; 1083 } 1084 } 1085} 1086 1087static void 1088agx_print_stats(agx_context *ctx, unsigned size, FILE *fp) 1089{ 1090 unsigned nr_ins = 0, nr_bytes = 0, nr_threads = 1; 1091 1092 /* TODO 
*/ 1093 fprintf(stderr, "%s shader: %u inst, %u bytes, %u threads, %u loops," 1094 "%u:%u spills:fills\n", 1095 ctx->nir->info.label ?: "", 1096 nr_ins, nr_bytes, nr_threads, ctx->loop_count, 1097 ctx->spills, ctx->fills); 1098} 1099 1100static int 1101glsl_type_size(const struct glsl_type *type, bool bindless) 1102{ 1103 return glsl_count_attribute_slots(type, false); 1104} 1105 1106static bool 1107agx_lower_sincos_filter(const nir_instr *instr, UNUSED const void *_) 1108{ 1109 if (instr->type != nir_instr_type_alu) 1110 return false; 1111 1112 nir_alu_instr *alu = nir_instr_as_alu(instr); 1113 return alu->op == nir_op_fsin || alu->op == nir_op_fcos; 1114} 1115 1116/* Sine and cosine are implemented via the sin_pt_1 and sin_pt_2 opcodes for 1117 * heavy lifting. sin_pt_2 implements sinc in the first quadrant, expressed in 1118 * turns (sin (tau x) / x), while sin_pt_1 implements a piecewise sign/offset 1119 * fixup to transform a quadrant angle [0, 4] to [-1, 1]. The NIR opcode 1120 * fsin_agx models the fixup, sinc, and multiply to obtain sine, so we just 1121 * need to change units from radians to quadrants modulo turns. Cosine is 1122 * implemented by shifting by one quadrant: cos(x) = sin(x + tau/4). 
1123 */ 1124 1125static nir_ssa_def * 1126agx_lower_sincos_impl(struct nir_builder *b, nir_instr *instr, UNUSED void *_) 1127{ 1128 nir_alu_instr *alu = nir_instr_as_alu(instr); 1129 nir_ssa_def *x = nir_mov_alu(b, alu->src[0], 1); 1130 nir_ssa_def *turns = nir_fmul_imm(b, x, M_1_PI * 0.5f); 1131 1132 if (alu->op == nir_op_fcos) 1133 turns = nir_fadd_imm(b, turns, 0.25f); 1134 1135 nir_ssa_def *quadrants = nir_fmul_imm(b, nir_ffract(b, turns), 4.0); 1136 return nir_fsin_agx(b, quadrants); 1137} 1138 1139static bool 1140agx_lower_sincos(nir_shader *shader) 1141{ 1142 return nir_shader_lower_instructions(shader, 1143 agx_lower_sincos_filter, agx_lower_sincos_impl, NULL); 1144} 1145 1146static bool 1147agx_lower_front_face(struct nir_builder *b, 1148 nir_instr *instr, UNUSED void *data) 1149{ 1150 if (instr->type != nir_instr_type_intrinsic) 1151 return false; 1152 1153 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1154 if (intr->intrinsic != nir_intrinsic_load_front_face) 1155 return false; 1156 1157 assert(intr->dest.is_ssa); 1158 nir_ssa_def *def = &intr->dest.ssa; 1159 assert(def->bit_size == 1); 1160 1161 b->cursor = nir_before_instr(&intr->instr); 1162 nir_ssa_def_rewrite_uses(def, nir_inot(b, nir_load_back_face_agx(b, 1))); 1163 return true; 1164} 1165 1166static bool 1167agx_lower_point_coord(struct nir_builder *b, 1168 nir_instr *instr, UNUSED void *data) 1169{ 1170 if (instr->type != nir_instr_type_intrinsic) 1171 return false; 1172 1173 nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1174 1175 if (intr->intrinsic != nir_intrinsic_load_deref) 1176 return false; 1177 1178 nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); 1179 nir_variable *var = nir_deref_instr_get_variable(deref); 1180 1181 if (var->data.mode != nir_var_shader_in) 1182 return false; 1183 1184 if (var->data.location != VARYING_SLOT_PNTC) 1185 return false; 1186 1187 assert(intr->dest.is_ssa); 1188 assert(intr->dest.ssa.num_components == 2); 1189 1190 b->cursor = 
nir_after_instr(&intr->instr); 1191 nir_ssa_def *def = nir_load_deref(b, deref); 1192 nir_ssa_def *y = nir_channel(b, def, 1); 1193 nir_ssa_def *flipped_y = nir_fadd_imm(b, nir_fneg(b, y), 1.0); 1194 nir_ssa_def *flipped = nir_vec2(b, nir_channel(b, def, 0), flipped_y); 1195 nir_ssa_def_rewrite_uses(&intr->dest.ssa, flipped); 1196 return true; 1197} 1198 1199static void 1200agx_optimize_nir(nir_shader *nir) 1201{ 1202 bool progress; 1203 1204 nir_lower_idiv_options idiv_options = { 1205 .imprecise_32bit_lowering = true, 1206 .allow_fp16 = true, 1207 }; 1208 1209 NIR_PASS_V(nir, nir_lower_regs_to_ssa); 1210 NIR_PASS_V(nir, nir_lower_int64); 1211 NIR_PASS_V(nir, nir_lower_idiv, &idiv_options); 1212 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); 1213 NIR_PASS_V(nir, nir_lower_load_const_to_scalar); 1214 NIR_PASS_V(nir, nir_lower_flrp, 16 | 32 | 64, false); 1215 NIR_PASS_V(nir, agx_lower_sincos); 1216 NIR_PASS_V(nir, nir_shader_instructions_pass, 1217 agx_lower_front_face, 1218 nir_metadata_block_index | nir_metadata_dominance, NULL); 1219 1220 do { 1221 progress = false; 1222 1223 NIR_PASS(progress, nir, nir_lower_var_copies); 1224 NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 1225 1226 NIR_PASS(progress, nir, nir_copy_prop); 1227 NIR_PASS(progress, nir, nir_opt_remove_phis); 1228 NIR_PASS(progress, nir, nir_opt_dce); 1229 NIR_PASS(progress, nir, nir_opt_dead_cf); 1230 NIR_PASS(progress, nir, nir_opt_cse); 1231 NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 1232 NIR_PASS(progress, nir, nir_opt_algebraic); 1233 NIR_PASS(progress, nir, nir_opt_constant_folding); 1234 1235 NIR_PASS(progress, nir, nir_opt_undef); 1236 NIR_PASS(progress, nir, nir_lower_undef_to_zero); 1237 1238 NIR_PASS(progress, nir, nir_opt_loop_unroll); 1239 } while (progress); 1240 1241 NIR_PASS_V(nir, nir_opt_algebraic_late); 1242 NIR_PASS_V(nir, nir_opt_constant_folding); 1243 NIR_PASS_V(nir, nir_copy_prop); 1244 NIR_PASS_V(nir, nir_opt_dce); 1245 NIR_PASS_V(nir, 
nir_opt_cse); 1246 NIR_PASS_V(nir, nir_lower_alu_to_scalar, NULL, NULL); 1247 NIR_PASS_V(nir, nir_lower_load_const_to_scalar); 1248 1249 /* Cleanup optimizations */ 1250 nir_move_options move_all = 1251 nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 1252 nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 1253 1254 NIR_PASS_V(nir, nir_opt_sink, move_all); 1255 NIR_PASS_V(nir, nir_opt_move, move_all); 1256 NIR_PASS_V(nir, nir_convert_from_ssa, true); 1257} 1258 1259/* ABI: position first, then user, then psiz */ 1260static void 1261agx_remap_varyings_vs(nir_shader *nir, struct agx_varyings *varyings, 1262 unsigned *remap) 1263{ 1264 unsigned base = 0; 1265 1266 nir_variable *pos = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_POS); 1267 if (pos) { 1268 assert(pos->data.driver_location < AGX_MAX_VARYINGS); 1269 remap[pos->data.driver_location] = base; 1270 base += 4; 1271 } 1272 1273 nir_foreach_shader_out_variable(var, nir) { 1274 unsigned loc = var->data.location; 1275 1276 if(loc == VARYING_SLOT_POS || loc == VARYING_SLOT_PSIZ) { 1277 continue; 1278 } 1279 1280 assert(var->data.driver_location < AGX_MAX_VARYINGS); 1281 remap[var->data.driver_location] = base; 1282 base += 4; 1283 } 1284 1285 nir_variable *psiz = nir_find_variable_with_location(nir, nir_var_shader_out, VARYING_SLOT_PSIZ); 1286 if (psiz) { 1287 assert(psiz->data.driver_location < AGX_MAX_VARYINGS); 1288 remap[psiz->data.driver_location] = base; 1289 base += 1; 1290 } 1291 1292 varyings->nr_slots = base; 1293} 1294 1295static void 1296agx_remap_varyings_fs(nir_shader *nir, struct agx_varyings *varyings, 1297 unsigned *remap) 1298{ 1299 struct agx_varying_packed *packed = varyings->packed; 1300 unsigned base = 0; 1301 1302 agx_pack(packed, VARYING, cfg) { 1303 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_W; 1304 cfg.components = 1; 1305 cfg.triangle_slot = cfg.point_slot = base; 1306 } 1307 1308 base++; 1309 packed++; 1310 1311 agx_pack(packed, VARYING, cfg) 
{ 1312 cfg.type = AGX_VARYING_TYPE_FRAGCOORD_Z; 1313 cfg.components = 1; 1314 cfg.triangle_slot = cfg.point_slot = base; 1315 } 1316 1317 base++; 1318 packed++; 1319 1320 unsigned comps[MAX_VARYING] = { 0 }; 1321 1322 nir_foreach_shader_in_variable(var, nir) { 1323 unsigned loc = var->data.driver_location; 1324 const struct glsl_type *column = 1325 glsl_without_array_or_matrix(var->type); 1326 unsigned chan = glsl_get_components(column); 1327 1328 /* If we have a fractional location added, we need to increase the size 1329 * so it will fit, i.e. a vec3 in YZW requires us to allocate a vec4. 1330 * We could do better but this is an edge case as it is, normally 1331 * packed varyings will be aligned. 1332 */ 1333 chan += var->data.location_frac; 1334 comps[loc] = MAX2(comps[loc], chan); 1335 } 1336 1337 nir_foreach_shader_in_variable(var, nir) { 1338 unsigned loc = var->data.driver_location; 1339 unsigned sz = glsl_count_attribute_slots(var->type, FALSE); 1340 unsigned channels = comps[loc]; 1341 1342 assert(var->data.driver_location <= AGX_MAX_VARYINGS); 1343 remap[var->data.driver_location] = base; 1344 1345 for (int c = 0; c < sz; ++c) { 1346 agx_pack(packed, VARYING, cfg) { 1347 cfg.type = (var->data.location == VARYING_SLOT_PNTC) ? 1348 AGX_VARYING_TYPE_POINT_COORDINATES : 1349 (var->data.interpolation == INTERP_MODE_FLAT) ? 
1350 AGX_VARYING_TYPE_FLAT_LAST : 1351 AGX_VARYING_TYPE_SMOOTH; 1352 1353 cfg.components = channels; 1354 cfg.triangle_slot = cfg.point_slot = base; 1355 } 1356 1357 base += channels; 1358 packed++; 1359 } 1360 } 1361 1362 varyings->nr_descs = (packed - varyings->packed); 1363 varyings->nr_slots = base; 1364} 1365 1366void 1367agx_compile_shader_nir(nir_shader *nir, 1368 struct agx_shader_key *key, 1369 struct util_dynarray *binary, 1370 struct agx_shader_info *out) 1371{ 1372 agx_debug = debug_get_option_agx_debug(); 1373 1374 agx_context *ctx = rzalloc(NULL, agx_context); 1375 ctx->nir = nir; 1376 ctx->out = out; 1377 ctx->key = key; 1378 ctx->stage = nir->info.stage; 1379 list_inithead(&ctx->blocks); 1380 1381 if (ctx->stage == MESA_SHADER_VERTEX) { 1382 out->writes_psiz = nir->info.outputs_written & 1383 BITFIELD_BIT(VARYING_SLOT_PSIZ); 1384 } 1385 1386 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 1387 1388 /* Lower large arrays to scratch and small arrays to csel */ 1389 NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16, 1390 glsl_get_natural_size_align_bytes); 1391 NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); 1392 1393 if (ctx->stage == MESA_SHADER_VERTEX) { 1394 /* Lower from OpenGL [-1, 1] to [0, 1] if half-z is not set */ 1395 if (!key->vs.clip_halfz) 1396 NIR_PASS_V(nir, nir_lower_clip_halfz); 1397 } else if (ctx->stage == MESA_SHADER_FRAGMENT) { 1398 /* Flip point coordinate since OpenGL and Metal disagree */ 1399 NIR_PASS_V(nir, nir_shader_instructions_pass, 1400 agx_lower_point_coord, 1401 nir_metadata_block_index | nir_metadata_dominance, NULL); 1402 } 1403 1404 NIR_PASS_V(nir, nir_split_var_copies); 1405 NIR_PASS_V(nir, nir_lower_global_vars_to_local); 1406 NIR_PASS_V(nir, nir_lower_var_copies); 1407 NIR_PASS_V(nir, nir_lower_vars_to_ssa); 1408 NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 1409 glsl_type_size, 0); 1410 if (ctx->stage == MESA_SHADER_FRAGMENT) { 1411 NIR_PASS_V(nir, 
nir_lower_mediump_io, 1412 nir_var_shader_in | nir_var_shader_out, ~0, false); 1413 } 1414 NIR_PASS_V(nir, nir_lower_ssbo); 1415 1416 /* Varying output is scalar, other I/O is vector */ 1417 if (ctx->stage == MESA_SHADER_VERTEX) { 1418 NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out); 1419 } 1420 1421 nir_lower_tex_options lower_tex_options = { 1422 .lower_txs_lod = true, 1423 .lower_txp = ~0, 1424 }; 1425 1426 nir_tex_src_type_constraints tex_constraints = { 1427 [nir_tex_src_lod] = { true, 16 } 1428 }; 1429 1430 NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); 1431 NIR_PASS_V(nir, nir_legalize_16bit_sampler_srcs, tex_constraints); 1432 1433 agx_optimize_nir(nir); 1434 1435 /* Must be last since NIR passes can remap driver_location freely */ 1436 if (ctx->stage == MESA_SHADER_VERTEX) { 1437 agx_remap_varyings_vs(nir, &out->varyings, ctx->varyings); 1438 } else if (ctx->stage == MESA_SHADER_FRAGMENT) { 1439 agx_remap_varyings_fs(nir, &out->varyings, ctx->varyings); 1440 } 1441 1442 bool skip_internal = nir->info.internal; 1443 skip_internal &= !(agx_debug & AGX_DBG_INTERNAL); 1444 1445 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) { 1446 nir_print_shader(nir, stdout); 1447 } 1448 1449 nir_foreach_function(func, nir) { 1450 if (!func->impl) 1451 continue; 1452 1453 /* TODO: Handle phi nodes instead of just convert_from_ssa and yolo'ing 1454 * the mapping of nir_register to hardware registers and guaranteeing bad 1455 * performance and breaking spilling... 
*/ 1456 ctx->nir_regalloc = rzalloc_array(ctx, unsigned, func->impl->reg_alloc); 1457 1458 /* Leave the last 4 registers for hacky p-copy lowering */ 1459 unsigned nir_regalloc = AGX_NUM_REGS - (4 * 2); 1460 1461 /* Assign backwards so we don't need to guess a size */ 1462 nir_foreach_register(reg, &func->impl->registers) { 1463 /* Ensure alignment */ 1464 if (reg->bit_size >= 32 && (nir_regalloc & 1)) 1465 nir_regalloc--; 1466 1467 unsigned size = DIV_ROUND_UP(reg->bit_size * reg->num_components, 16); 1468 nir_regalloc -= size; 1469 ctx->nir_regalloc[reg->index] = nir_regalloc; 1470 } 1471 1472 ctx->max_register = nir_regalloc; 1473 ctx->alloc += func->impl->ssa_alloc; 1474 emit_cf_list(ctx, &func->impl->body); 1475 break; /* TODO: Multi-function shaders */ 1476 } 1477 1478 /* TODO: Actual RA... this way passes don't need to deal nir_register */ 1479 agx_foreach_instr_global(ctx, I) { 1480 agx_foreach_dest(I, d) { 1481 if (I->dest[d].type == AGX_INDEX_NIR_REGISTER) { 1482 I->dest[d].type = AGX_INDEX_REGISTER; 1483 I->dest[d].value = ctx->nir_regalloc[I->dest[d].value]; 1484 } 1485 } 1486 1487 agx_foreach_src(I, s) { 1488 if (I->src[s].type == AGX_INDEX_NIR_REGISTER) { 1489 I->src[s].type = AGX_INDEX_REGISTER; 1490 I->src[s].value = ctx->nir_regalloc[I->src[s].value]; 1491 } 1492 } 1493 } 1494 1495 /* Terminate the shader after the exit block */ 1496 agx_block *last_block = list_last_entry(&ctx->blocks, agx_block, link); 1497 agx_builder _b = agx_init_builder(ctx, agx_after_block(last_block)); 1498 agx_stop(&_b); 1499 1500 /* Also add traps to match the blob, unsure what the function is */ 1501 for (unsigned i = 0; i < 8; ++i) 1502 agx_trap(&_b); 1503 1504 unsigned block_source_count = 0; 1505 1506 /* Name blocks now that we're done emitting so the order is consistent */ 1507 agx_foreach_block(ctx, block) 1508 block->name = block_source_count++; 1509 1510 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1511 agx_print_shader(ctx, stdout); 1512 1513 
agx_optimizer(ctx); 1514 agx_dce(ctx); 1515 1516 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1517 agx_print_shader(ctx, stdout); 1518 1519 agx_ra(ctx); 1520 1521 if (ctx->stage == MESA_SHADER_VERTEX) 1522 agx_set_st_vary_final(ctx); 1523 1524 if (agx_debug & AGX_DBG_SHADERS && !skip_internal) 1525 agx_print_shader(ctx, stdout); 1526 1527 agx_pack_binary(ctx, binary); 1528 1529 if ((agx_debug & AGX_DBG_SHADERDB) && !skip_internal) 1530 agx_print_stats(ctx, binary->size, stderr); 1531 1532 ralloc_free(ctx); 1533} 1534