1/* 2 * Copyright © 2018 Valve Corporation 3 * Copyright © 2018 Google 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 * 24 */ 25 26#include "aco_instruction_selection.h" 27 28#include "aco_builder.h" 29#include "aco_ir.h" 30 31#include "common/ac_exp_param.h" 32#include "common/sid.h" 33#include "vulkan/radv_descriptor_set.h" 34 35#include "util/fast_idiv_by_const.h" 36#include "util/memstream.h" 37 38#include <array> 39#include <functional> 40#include <map> 41#include <numeric> 42#include <stack> 43#include <utility> 44#include <vector> 45 46namespace aco { 47namespace { 48 49#define isel_err(...) _isel_err(ctx, __FILE__, __LINE__, __VA_ARGS__) 50 51static void 52_isel_err(isel_context* ctx, const char* file, unsigned line, const nir_instr* instr, 53 const char* msg) 54{ 55 char* out; 56 size_t outsize; 57 struct u_memstream mem; 58 u_memstream_open(&mem, &out, &outsize); 59 FILE* const memf = u_memstream_get(&mem); 60 61 fprintf(memf, "%s: ", msg); 62 nir_print_instr(instr, memf); 63 u_memstream_close(&mem); 64 65 _aco_err(ctx->program, file, line, out); 66 free(out); 67} 68 69struct if_context { 70 Temp cond; 71 72 bool divergent_old; 73 bool exec_potentially_empty_discard_old; 74 bool exec_potentially_empty_break_old; 75 uint16_t exec_potentially_empty_break_depth_old; 76 77 unsigned BB_if_idx; 78 unsigned invert_idx; 79 bool uniform_has_then_branch; 80 bool then_branch_divergent; 81 Block BB_invert; 82 Block BB_endif; 83}; 84 85struct loop_context { 86 Block loop_exit; 87 88 unsigned header_idx_old; 89 Block* exit_old; 90 bool divergent_cont_old; 91 bool divergent_branch_old; 92 bool divergent_if_old; 93}; 94 95static bool visit_cf_list(struct isel_context* ctx, struct exec_list* list); 96 97static void 98add_logical_edge(unsigned pred_idx, Block* succ) 99{ 100 succ->logical_preds.emplace_back(pred_idx); 101} 102 103static void 104add_linear_edge(unsigned pred_idx, Block* succ) 105{ 106 succ->linear_preds.emplace_back(pred_idx); 107} 108 109static void 110add_edge(unsigned pred_idx, Block* succ) 111{ 112 add_logical_edge(pred_idx, succ); 113 add_linear_edge(pred_idx, succ); 114} 115 116static void 117append_logical_start(Block* b) 118{ 119 Builder(NULL, b).pseudo(aco_opcode::p_logical_start); 120} 121 122static void 123append_logical_end(Block* b) 124{ 125 Builder(NULL, b).pseudo(aco_opcode::p_logical_end); 126} 127 128Temp 129get_ssa_temp(struct isel_context* ctx, nir_ssa_def* def) 130{ 131 uint32_t id = 
ctx->first_temp_id + def->index; 132 return Temp(id, ctx->program->temp_rc[id]); 133} 134 135Temp 136emit_mbcnt(isel_context* ctx, Temp dst, Operand mask = Operand(), Operand base = Operand::zero()) 137{ 138 Builder bld(ctx->program, ctx->block); 139 assert(mask.isUndefined() || mask.isTemp() || (mask.isFixed() && mask.physReg() == exec)); 140 assert(mask.isUndefined() || mask.bytes() == bld.lm.bytes()); 141 142 if (ctx->program->wave_size == 32) { 143 Operand mask_lo = mask.isUndefined() ? Operand::c32(-1u) : mask; 144 return bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, Definition(dst), mask_lo, base); 145 } 146 147 Operand mask_lo = Operand::c32(-1u); 148 Operand mask_hi = Operand::c32(-1u); 149 150 if (mask.isTemp()) { 151 RegClass rc = RegClass(mask.regClass().type(), 1); 152 Builder::Result mask_split = 153 bld.pseudo(aco_opcode::p_split_vector, bld.def(rc), bld.def(rc), mask); 154 mask_lo = Operand(mask_split.def(0).getTemp()); 155 mask_hi = Operand(mask_split.def(1).getTemp()); 156 } else if (mask.physReg() == exec) { 157 mask_lo = Operand(exec_lo, s1); 158 mask_hi = Operand(exec_hi, s1); 159 } 160 161 Temp mbcnt_lo = bld.vop3(aco_opcode::v_mbcnt_lo_u32_b32, bld.def(v1), mask_lo, base); 162 163 if (ctx->program->chip_class <= GFX7) 164 return bld.vop2(aco_opcode::v_mbcnt_hi_u32_b32, Definition(dst), mask_hi, mbcnt_lo); 165 else 166 return bld.vop3(aco_opcode::v_mbcnt_hi_u32_b32_e64, Definition(dst), mask_hi, mbcnt_lo); 167} 168 169Temp 170emit_wqm(Builder& bld, Temp src, Temp dst = Temp(0, s1), bool program_needs_wqm = false) 171{ 172 if (!dst.id()) 173 dst = bld.tmp(src.regClass()); 174 175 assert(src.size() == dst.size()); 176 177 if (bld.program->stage != fragment_fs) { 178 if (!dst.id()) 179 return src; 180 181 bld.copy(Definition(dst), src); 182 return dst; 183 } 184 185 bld.pseudo(aco_opcode::p_wqm, Definition(dst), src); 186 bld.program->needs_wqm |= program_needs_wqm; 187 return dst; 188} 189 190static Temp 191emit_bpermute(isel_context* ctx, Builder& bld, Temp index, Temp data) 192{ 193 if (index.regClass() == s1) 194 return bld.readlane(bld.def(s1), data, index); 195 196 if (ctx->options->chip_class <= GFX7) { 197 /* GFX6-7: there is no bpermute instruction */ 198 Operand index_op(index); 199 Operand input_data(data); 200 index_op.setLateKill(true); 201 input_data.setLateKill(true); 202 203 return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(bld.lm), bld.def(bld.lm, vcc), 204 index_op, input_data); 205 } else if (ctx->options->chip_class >= GFX10 && ctx->program->wave_size == 64) { 206 207 /* GFX10 wave64 mode: emulate full-wave bpermute */ 208 Temp index_is_lo = 209 bld.vopc(aco_opcode::v_cmp_ge_u32, bld.def(bld.lm), Operand::c32(31u), index); 210 Builder::Result index_is_lo_split = 211 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), index_is_lo); 212 Temp index_is_lo_n1 = bld.sop1(aco_opcode::s_not_b32, bld.def(s1), bld.def(s1, scc), 213 index_is_lo_split.def(1).getTemp()); 214 Operand same_half = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), 215 index_is_lo_split.def(0).getTemp(), index_is_lo_n1); 216 Operand index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); 217 Operand input_data(data); 218 219 index_x4.setLateKill(true); 220 input_data.setLateKill(true); 221 same_half.setLateKill(true); 222 223 /* We need one pair of shared VGPRs: 224 * Note, that these have twice the allocation granularity of normal VGPRs */ 225 ctx->program->config->num_shared_vgprs = 2 * ctx->program->dev.vgpr_alloc_granule; 226 227 
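      /* The p_bpermute pseudo is expanded after register allocation: on GFX10 with
       * wave64, ds_bpermute_b32 can only read lanes within the same 32-lane half, so
       * the lowering additionally exchanges data across halves through the shared
       * VGPRs reserved above and uses same_half to select the correct result per
       * lane. index_x4 is the byte-scaled lane index the permute instructions expect. */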
return bld.pseudo(aco_opcode::p_bpermute, bld.def(v1), bld.def(s2), bld.def(s1, scc), 228 index_x4, input_data, same_half); 229 } else { 230 /* GFX8-9 or GFX10 wave32: bpermute works normally */ 231 Temp index_x4 = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), index); 232 return bld.ds(aco_opcode::ds_bpermute_b32, bld.def(v1), index_x4, data); 233 } 234} 235 236static Temp 237emit_masked_swizzle(isel_context* ctx, Builder& bld, Temp src, unsigned mask) 238{ 239 if (ctx->options->chip_class >= GFX8) { 240 unsigned and_mask = mask & 0x1f; 241 unsigned or_mask = (mask >> 5) & 0x1f; 242 unsigned xor_mask = (mask >> 10) & 0x1f; 243 244 uint16_t dpp_ctrl = 0xffff; 245 246 // TODO: we could use DPP8 for some swizzles 247 if (and_mask == 0x1f && or_mask < 4 && xor_mask < 4) { 248 unsigned res[4] = {0, 1, 2, 3}; 249 for (unsigned i = 0; i < 4; i++) 250 res[i] = ((res[i] | or_mask) ^ xor_mask) & 0x3; 251 dpp_ctrl = dpp_quad_perm(res[0], res[1], res[2], res[3]); 252 } else if (and_mask == 0x1f && !or_mask && xor_mask == 8) { 253 dpp_ctrl = dpp_row_rr(8); 254 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0xf) { 255 dpp_ctrl = dpp_row_mirror; 256 } else if (and_mask == 0x1f && !or_mask && xor_mask == 0x7) { 257 dpp_ctrl = dpp_row_half_mirror; 258 } 259 260 if (dpp_ctrl != 0xffff) 261 return bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl); 262 } 263 264 return bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, mask, 0, false); 265} 266 267Temp 268as_vgpr(isel_context* ctx, Temp val) 269{ 270 if (val.type() == RegType::sgpr) { 271 Builder bld(ctx->program, ctx->block); 272 return bld.copy(bld.def(RegType::vgpr, val.size()), val); 273 } 274 assert(val.type() == RegType::vgpr); 275 return val; 276} 277 278// assumes a != 0xffffffff 279void 280emit_v_div_u32(isel_context* ctx, Temp dst, Temp a, uint32_t b) 281{ 282 assert(b != 0); 283 Builder bld(ctx->program, ctx->block); 284 285 if (util_is_power_of_two_or_zero(b)) { 286 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(util_logbase2(b)), a); 287 return; 288 } 289 290 util_fast_udiv_info info = util_compute_fast_udiv_info(b, 32, 32); 291 292 assert(info.multiplier <= 0xffffffff); 293 294 bool pre_shift = info.pre_shift != 0; 295 bool increment = info.increment != 0; 296 bool multiply = true; 297 bool post_shift = info.post_shift != 0; 298 299 if (!pre_shift && !increment && !multiply && !post_shift) { 300 bld.copy(Definition(dst), a); 301 return; 302 } 303 304 Temp pre_shift_dst = a; 305 if (pre_shift) { 306 pre_shift_dst = (increment || multiply || post_shift) ? bld.tmp(v1) : dst; 307 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(pre_shift_dst), Operand::c32(info.pre_shift), 308 a); 309 } 310 311 Temp increment_dst = pre_shift_dst; 312 if (increment) { 313 increment_dst = (post_shift || multiply) ? bld.tmp(v1) : dst; 314 bld.vadd32(Definition(increment_dst), Operand::c32(info.increment), pre_shift_dst); 315 } 316 317 Temp multiply_dst = increment_dst; 318 if (multiply) { 319 multiply_dst = post_shift ? 
bld.tmp(v1) : dst; 320 bld.vop3(aco_opcode::v_mul_hi_u32, Definition(multiply_dst), increment_dst, 321 bld.copy(bld.def(v1), Operand::c32(info.multiplier))); 322 } 323 324 if (post_shift) { 325 bld.vop2(aco_opcode::v_lshrrev_b32, Definition(dst), Operand::c32(info.post_shift), 326 multiply_dst); 327 } 328} 329 330void 331emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, Temp dst) 332{ 333 Builder bld(ctx->program, ctx->block); 334 bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::c32(idx)); 335} 336 337Temp 338emit_extract_vector(isel_context* ctx, Temp src, uint32_t idx, RegClass dst_rc) 339{ 340 /* no need to extract the whole vector */ 341 if (src.regClass() == dst_rc) { 342 assert(idx == 0); 343 return src; 344 } 345 346 assert(src.bytes() > (idx * dst_rc.bytes())); 347 Builder bld(ctx->program, ctx->block); 348 auto it = ctx->allocated_vec.find(src.id()); 349 if (it != ctx->allocated_vec.end() && dst_rc.bytes() == it->second[idx].regClass().bytes()) { 350 if (it->second[idx].regClass() == dst_rc) { 351 return it->second[idx]; 352 } else { 353 assert(!dst_rc.is_subdword()); 354 assert(dst_rc.type() == RegType::vgpr && it->second[idx].type() == RegType::sgpr); 355 return bld.copy(bld.def(dst_rc), it->second[idx]); 356 } 357 } 358 359 if (dst_rc.is_subdword()) 360 src = as_vgpr(ctx, src); 361 362 if (src.bytes() == dst_rc.bytes()) { 363 assert(idx == 0); 364 return bld.copy(bld.def(dst_rc), src); 365 } else { 366 Temp dst = bld.tmp(dst_rc); 367 emit_extract_vector(ctx, src, idx, dst); 368 return dst; 369 } 370} 371 372void 373emit_split_vector(isel_context* ctx, Temp vec_src, unsigned num_components) 374{ 375 if (num_components == 1) 376 return; 377 if (ctx->allocated_vec.find(vec_src.id()) != ctx->allocated_vec.end()) 378 return; 379 RegClass rc; 380 if (num_components > vec_src.size()) { 381 if (vec_src.type() == RegType::sgpr) { 382 /* should still help get_alu_src() */ 383 emit_split_vector(ctx, vec_src, vec_src.size()); 384 return; 385 } 386 /* sub-dword split */ 387 rc = RegClass(RegType::vgpr, vec_src.bytes() / num_components).as_subdword(); 388 } else { 389 rc = RegClass(vec_src.type(), vec_src.size() / num_components); 390 } 391 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>( 392 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_components)}; 393 split->operands[0] = Operand(vec_src); 394 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 395 for (unsigned i = 0; i < num_components; i++) { 396 elems[i] = ctx->program->allocateTmp(rc); 397 split->definitions[i] = Definition(elems[i]); 398 } 399 ctx->block->instructions.emplace_back(std::move(split)); 400 ctx->allocated_vec.emplace(vec_src.id(), elems); 401} 402 403/* This vector expansion uses a mask to determine which elements in the new vector 404 * come from the original vector. The other elements are undefined. 
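 * In this implementation the unused positions are simply written as zero constants,
 * so callers must not rely on their contents. For example, expanding a vec2 into a
 * vec4 with mask 0b0101 places the two source components at positions 0 and 2.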
*/ 405void 406expand_vector(isel_context* ctx, Temp vec_src, Temp dst, unsigned num_components, unsigned mask) 407{ 408 emit_split_vector(ctx, vec_src, util_bitcount(mask)); 409 410 if (vec_src == dst) 411 return; 412 413 Builder bld(ctx->program, ctx->block); 414 if (num_components == 1) { 415 if (dst.type() == RegType::sgpr) 416 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec_src); 417 else 418 bld.copy(Definition(dst), vec_src); 419 return; 420 } 421 422 unsigned component_size = dst.size() / num_components; 423 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 424 425 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 426 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 427 vec->definitions[0] = Definition(dst); 428 unsigned k = 0; 429 for (unsigned i = 0; i < num_components; i++) { 430 if (mask & (1 << i)) { 431 Temp src = 432 emit_extract_vector(ctx, vec_src, k++, RegClass(vec_src.type(), component_size)); 433 if (dst.type() == RegType::sgpr) 434 src = bld.as_uniform(src); 435 vec->operands[i] = Operand(src); 436 } else { 437 vec->operands[i] = Operand::zero(component_size == 2 ? 8 : 4); 438 } 439 elems[i] = vec->operands[i].getTemp(); 440 } 441 ctx->block->instructions.emplace_back(std::move(vec)); 442 ctx->allocated_vec.emplace(dst.id(), elems); 443} 444 445/* adjust misaligned small bit size loads */ 446void 447byte_align_scalar(isel_context* ctx, Temp vec, Operand offset, Temp dst) 448{ 449 Builder bld(ctx->program, ctx->block); 450 Operand shift; 451 Temp select = Temp(); 452 if (offset.isConstant()) { 453 assert(offset.constantValue() && offset.constantValue() < 4); 454 shift = Operand::c32(offset.constantValue() * 8); 455 } else { 456 /* bit_offset = 8 * (offset & 0x3) */ 457 Temp tmp = 458 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), offset, Operand::c32(3u)); 459 select = bld.tmp(s1); 460 shift = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.scc(Definition(select)), tmp, 461 Operand::c32(3u)); 462 } 463 464 if (vec.size() == 1) { 465 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), vec, shift); 466 } else if (vec.size() == 2) { 467 Temp tmp = dst.size() == 2 ? 
dst : bld.tmp(s2); 468 bld.sop2(aco_opcode::s_lshr_b64, Definition(tmp), bld.def(s1, scc), vec, shift); 469 if (tmp == dst) 470 emit_split_vector(ctx, dst, 2); 471 else 472 emit_extract_vector(ctx, tmp, 0, dst); 473 } else if (vec.size() == 3 || vec.size() == 4) { 474 Temp lo = bld.tmp(s2), hi; 475 if (vec.size() == 3) { 476 /* this can happen if we use VMEM for a uniform load */ 477 hi = bld.tmp(s1); 478 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); 479 } else { 480 hi = bld.tmp(s2); 481 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), vec); 482 hi = bld.pseudo(aco_opcode::p_extract_vector, bld.def(s1), hi, Operand::zero()); 483 } 484 if (select != Temp()) 485 hi = 486 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), hi, Operand::zero(), bld.scc(select)); 487 lo = bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), lo, shift); 488 Temp mid = bld.tmp(s1); 489 lo = bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), Definition(mid), lo); 490 hi = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), hi, shift); 491 mid = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), hi, mid); 492 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, mid); 493 emit_split_vector(ctx, dst, 2); 494 } 495} 496 497void 498byte_align_vector(isel_context* ctx, Temp vec, Operand offset, Temp dst, unsigned component_size) 499{ 500 Builder bld(ctx->program, ctx->block); 501 if (offset.isTemp()) { 502 Temp tmp[4] = {vec, vec, vec, vec}; 503 504 if (vec.size() == 4) { 505 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1), tmp[3] = bld.tmp(v1); 506 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), 507 Definition(tmp[2]), Definition(tmp[3]), vec); 508 } else if (vec.size() == 3) { 509 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = bld.tmp(v1); 510 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), 511 Definition(tmp[2]), vec); 512 } else if (vec.size() == 2) { 513 tmp[0] = bld.tmp(v1), tmp[1] = bld.tmp(v1), tmp[2] = tmp[1]; 514 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp[0]), Definition(tmp[1]), vec); 515 } 516 for (unsigned i = 0; i < dst.size(); i++) 517 tmp[i] = bld.vop3(aco_opcode::v_alignbyte_b32, bld.def(v1), tmp[i + 1], tmp[i], offset); 518 519 vec = tmp[0]; 520 if (dst.size() == 2) 521 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), tmp[0], tmp[1]); 522 523 offset = Operand::zero(); 524 } 525 526 unsigned num_components = vec.bytes() / component_size; 527 if (vec.regClass() == dst.regClass()) { 528 assert(offset.constantValue() == 0); 529 bld.copy(Definition(dst), vec); 530 emit_split_vector(ctx, dst, num_components); 531 return; 532 } 533 534 emit_split_vector(ctx, vec, num_components); 535 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 536 RegClass rc = RegClass(RegType::vgpr, component_size).as_subdword(); 537 538 assert(offset.constantValue() % component_size == 0); 539 unsigned skip = offset.constantValue() / component_size; 540 for (unsigned i = skip; i < num_components; i++) 541 elems[i - skip] = emit_extract_vector(ctx, vec, i, rc); 542 543 if (dst.type() == RegType::vgpr) { 544 /* if dst is vgpr - split the src and create a shrunk version according to the mask. 
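       * The components were extracted above starting at the byte offset, so this
       * just re-packs the first dst.bytes()/component_size of them into dst.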
*/ 545 num_components = dst.bytes() / component_size; 546 aco_ptr<Pseudo_instruction> create_vec{create_instruction<Pseudo_instruction>( 547 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 548 for (unsigned i = 0; i < num_components; i++) 549 create_vec->operands[i] = Operand(elems[i]); 550 create_vec->definitions[0] = Definition(dst); 551 bld.insert(std::move(create_vec)); 552 553 } else if (skip) { 554 /* if dst is sgpr - split the src, but move the original to sgpr. */ 555 vec = bld.pseudo(aco_opcode::p_as_uniform, bld.def(RegClass(RegType::sgpr, vec.size())), vec); 556 byte_align_scalar(ctx, vec, offset, dst); 557 } else { 558 assert(dst.size() == vec.size()); 559 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), vec); 560 } 561 562 ctx->allocated_vec.emplace(dst.id(), elems); 563} 564 565Temp 566bool_to_vector_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s2)) 567{ 568 Builder bld(ctx->program, ctx->block); 569 if (!dst.id()) 570 dst = bld.tmp(bld.lm); 571 572 assert(val.regClass() == s1); 573 assert(dst.regClass() == bld.lm); 574 575 return bld.sop2(Builder::s_cselect, Definition(dst), Operand::c32(-1), Operand::zero(), 576 bld.scc(val)); 577} 578 579Temp 580bool_to_scalar_condition(isel_context* ctx, Temp val, Temp dst = Temp(0, s1)) 581{ 582 Builder bld(ctx->program, ctx->block); 583 if (!dst.id()) 584 dst = bld.tmp(s1); 585 586 assert(val.regClass() == bld.lm); 587 assert(dst.regClass() == s1); 588 589 /* if we're currently in WQM mode, ensure that the source is also computed in WQM */ 590 Temp tmp = bld.tmp(s1); 591 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.scc(Definition(tmp)), val, Operand(exec, bld.lm)); 592 return emit_wqm(bld, tmp, dst); 593} 594 595/** 596 * Copies the first src_bits of the input to the output Temp. Input bits at positions larger than 597 * src_bits and dst_bits are truncated. 598 * 599 * Sign extension may be applied using the sign_extend parameter. The position of the input sign 600 * bit is indicated by src_bits in this case. 601 * 602 * If dst.bytes() is larger than dst_bits/8, the value of the upper bits is undefined. 603 */ 604Temp 605convert_int(isel_context* ctx, Builder& bld, Temp src, unsigned src_bits, unsigned dst_bits, 606 bool sign_extend, Temp dst = Temp()) 607{ 608 assert(!(sign_extend && dst_bits < src_bits) && 609 "Shrinking integers is not supported for signed inputs"); 610 611 if (!dst.id()) { 612 if (dst_bits % 32 == 0 || src.type() == RegType::sgpr) 613 dst = bld.tmp(src.type(), DIV_ROUND_UP(dst_bits, 32u)); 614 else 615 dst = bld.tmp(RegClass(RegType::vgpr, dst_bits / 8u).as_subdword()); 616 } 617 618 assert(src.type() == RegType::sgpr || src_bits == src.bytes() * 8); 619 assert(dst.type() == RegType::sgpr || dst_bits == dst.bytes() * 8); 620 621 if (dst.bytes() == src.bytes() && dst_bits < src_bits) { 622 /* Copy the raw value, leaving an undefined value in the upper bits for 623 * the caller to handle appropriately */ 624 return bld.copy(Definition(dst), src); 625 } else if (dst.bytes() < src.bytes()) { 626 return bld.pseudo(aco_opcode::p_extract_vector, Definition(dst), src, Operand::zero()); 627 } 628 629 Temp tmp = dst; 630 if (dst_bits == 64) 631 tmp = src_bits == 32 ? 
src : bld.tmp(src.type(), 1); 632 633 if (tmp == src) { 634 } else if (src.regClass() == s1) { 635 assert(src_bits < 32); 636 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), src, Operand::zero(), 637 Operand::c32(src_bits), Operand::c32((unsigned)sign_extend)); 638 } else { 639 assert(src_bits < 32); 640 bld.pseudo(aco_opcode::p_extract, Definition(tmp), src, Operand::zero(), Operand::c32(src_bits), 641 Operand::c32((unsigned)sign_extend)); 642 } 643 644 if (dst_bits == 64) { 645 if (sign_extend && dst.regClass() == s2) { 646 Temp high = 647 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(31u)); 648 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); 649 } else if (sign_extend && dst.regClass() == v2) { 650 Temp high = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), tmp); 651 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, high); 652 } else { 653 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero()); 654 } 655 } 656 657 return dst; 658} 659 660enum sgpr_extract_mode { 661 sgpr_extract_sext, 662 sgpr_extract_zext, 663 sgpr_extract_undef, 664}; 665 666Temp 667extract_8_16_bit_sgpr_element(isel_context* ctx, Temp dst, nir_alu_src* src, sgpr_extract_mode mode) 668{ 669 Temp vec = get_ssa_temp(ctx, src->src.ssa); 670 unsigned src_size = src->src.ssa->bit_size; 671 unsigned swizzle = src->swizzle[0]; 672 673 if (vec.size() > 1) { 674 assert(src_size == 16); 675 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); 676 swizzle = swizzle & 1; 677 } 678 679 Builder bld(ctx->program, ctx->block); 680 Temp tmp = dst.regClass() == s2 ? bld.tmp(s1) : dst; 681 682 if (mode == sgpr_extract_undef && swizzle == 0) 683 bld.copy(Definition(tmp), vec); 684 else 685 bld.pseudo(aco_opcode::p_extract, Definition(tmp), bld.def(s1, scc), Operand(vec), 686 Operand::c32(swizzle), Operand::c32(src_size), 687 Operand::c32((mode == sgpr_extract_sext))); 688 689 if (dst.regClass() == s2) 690 convert_int(ctx, bld, tmp, 32, 64, mode == sgpr_extract_sext, dst); 691 692 return dst; 693} 694 695Temp 696get_alu_src(struct isel_context* ctx, nir_alu_src src, unsigned size = 1) 697{ 698 if (src.src.ssa->num_components == 1 && size == 1) 699 return get_ssa_temp(ctx, src.src.ssa); 700 701 Temp vec = get_ssa_temp(ctx, src.src.ssa); 702 unsigned elem_size = src.src.ssa->bit_size / 8u; 703 bool identity_swizzle = true; 704 705 for (unsigned i = 0; identity_swizzle && i < size; i++) { 706 if (src.swizzle[i] != i) 707 identity_swizzle = false; 708 } 709 if (identity_swizzle) 710 return emit_extract_vector(ctx, vec, 0, RegClass::get(vec.type(), elem_size * size)); 711 712 assert(elem_size > 0); 713 assert(vec.bytes() % elem_size == 0); 714 715 if (elem_size < 4 && vec.type() == RegType::sgpr && size == 1) { 716 assert(src.src.ssa->bit_size == 8 || src.src.ssa->bit_size == 16); 717 return extract_8_16_bit_sgpr_element(ctx, ctx->program->allocateTmp(s1), &src, 718 sgpr_extract_undef); 719 } 720 721 bool as_uniform = elem_size < 4 && vec.type() == RegType::sgpr; 722 if (as_uniform) 723 vec = as_vgpr(ctx, vec); 724 725 RegClass elem_rc = elem_size < 4 ? 
RegClass(vec.type(), elem_size).as_subdword() 726 : RegClass(vec.type(), elem_size / 4); 727 if (size == 1) { 728 return emit_extract_vector(ctx, vec, src.swizzle[0], elem_rc); 729 } else { 730 assert(size <= 4); 731 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 732 aco_ptr<Pseudo_instruction> vec_instr{create_instruction<Pseudo_instruction>( 733 aco_opcode::p_create_vector, Format::PSEUDO, size, 1)}; 734 for (unsigned i = 0; i < size; ++i) { 735 elems[i] = emit_extract_vector(ctx, vec, src.swizzle[i], elem_rc); 736 vec_instr->operands[i] = Operand{elems[i]}; 737 } 738 Temp dst = ctx->program->allocateTmp(RegClass(vec.type(), elem_size * size / 4)); 739 vec_instr->definitions[0] = Definition(dst); 740 ctx->block->instructions.emplace_back(std::move(vec_instr)); 741 ctx->allocated_vec.emplace(dst.id(), elems); 742 return vec.type() == RegType::sgpr ? Builder(ctx->program, ctx->block).as_uniform(dst) : dst; 743 } 744} 745 746Temp 747get_alu_src_vop3p(struct isel_context* ctx, nir_alu_src src) 748{ 749 /* returns v2b or v1 for vop3p usage. 750 * The source expects exactly 2 16bit components 751 * which are within the same dword 752 */ 753 assert(src.src.ssa->bit_size == 16); 754 assert(src.swizzle[0] >> 1 == src.swizzle[1] >> 1); 755 756 Temp tmp = get_ssa_temp(ctx, src.src.ssa); 757 if (tmp.size() == 1) 758 return tmp; 759 760 /* the size is larger than 1 dword: check the swizzle */ 761 unsigned dword = src.swizzle[0] >> 1; 762 763 /* extract a full dword if possible */ 764 if (tmp.bytes() >= (dword + 1) * 4) { 765 return emit_extract_vector(ctx, tmp, dword, RegClass(tmp.type(), 1)); 766 } else { 767 /* This must be a swizzled access to %a.zz where %a is v6b */ 768 assert(((src.swizzle[0] | src.swizzle[1]) & 1) == 0); 769 assert(tmp.regClass() == v6b && dword == 1); 770 return emit_extract_vector(ctx, tmp, dword * 2, v2b); 771 } 772} 773 774uint32_t 775get_alu_src_ub(isel_context* ctx, nir_alu_instr* instr, int src_idx) 776{ 777 nir_ssa_scalar scalar = 778 nir_ssa_scalar{instr->src[src_idx].src.ssa, instr->src[src_idx].swizzle[0]}; 779 return nir_unsigned_upper_bound(ctx->shader, ctx->range_ht, scalar, &ctx->ub_config); 780} 781 782Temp 783convert_pointer_to_64_bit(isel_context* ctx, Temp ptr, bool non_uniform = false) 784{ 785 if (ptr.size() == 2) 786 return ptr; 787 Builder bld(ctx->program, ctx->block); 788 if (ptr.type() == RegType::vgpr && !non_uniform) 789 ptr = bld.as_uniform(ptr); 790 return bld.pseudo(aco_opcode::p_create_vector, bld.def(RegClass(ptr.type(), 2)), ptr, 791 Operand::c32((unsigned)ctx->options->address32_hi)); 792} 793 794void 795emit_sop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 796 bool writes_scc, uint8_t uses_ub = 0) 797{ 798 aco_ptr<SOP2_instruction> sop2{ 799 create_instruction<SOP2_instruction>(op, Format::SOP2, 2, writes_scc ? 
2 : 1)}; 800 sop2->operands[0] = Operand(get_alu_src(ctx, instr->src[0])); 801 sop2->operands[1] = Operand(get_alu_src(ctx, instr->src[1])); 802 sop2->definitions[0] = Definition(dst); 803 if (instr->no_unsigned_wrap) 804 sop2->definitions[0].setNUW(true); 805 if (writes_scc) 806 sop2->definitions[1] = Definition(ctx->program->allocateId(s1), scc, s1); 807 808 for (int i = 0; i < 2; i++) { 809 if (uses_ub & (1 << i)) { 810 uint32_t src_ub = get_alu_src_ub(ctx, instr, i); 811 if (src_ub <= 0xffff) 812 sop2->operands[i].set16bit(true); 813 else if (src_ub <= 0xffffff) 814 sop2->operands[i].set24bit(true); 815 } 816 } 817 818 ctx->block->instructions.emplace_back(std::move(sop2)); 819} 820 821void 822emit_vop2_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode opc, Temp dst, 823 bool commutative, bool swap_srcs = false, bool flush_denorms = false, 824 bool nuw = false, uint8_t uses_ub = 0) 825{ 826 Builder bld(ctx->program, ctx->block); 827 bld.is_precise = instr->exact; 828 829 Temp src0 = get_alu_src(ctx, instr->src[swap_srcs ? 1 : 0]); 830 Temp src1 = get_alu_src(ctx, instr->src[swap_srcs ? 0 : 1]); 831 if (src1.type() == RegType::sgpr) { 832 if (commutative && src0.type() == RegType::vgpr) { 833 Temp t = src0; 834 src0 = src1; 835 src1 = t; 836 } else { 837 src1 = as_vgpr(ctx, src1); 838 } 839 } 840 841 Operand op[2] = {Operand(src0), Operand(src1)}; 842 843 for (int i = 0; i < 2; i++) { 844 if (uses_ub & (1 << i)) { 845 uint32_t src_ub = get_alu_src_ub(ctx, instr, swap_srcs ? !i : i); 846 if (src_ub <= 0xffff) 847 op[i].set16bit(true); 848 else if (src_ub <= 0xffffff) 849 op[i].set24bit(true); 850 } 851 } 852 853 if (flush_denorms && ctx->program->chip_class < GFX9) { 854 assert(dst.size() == 1); 855 Temp tmp = bld.vop2(opc, bld.def(v1), op[0], op[1]); 856 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); 857 } else { 858 if (nuw) { 859 bld.nuw().vop2(opc, Definition(dst), op[0], op[1]); 860 } else { 861 bld.vop2(opc, Definition(dst), op[0], op[1]); 862 } 863 } 864} 865 866void 867emit_vop2_instruction_logic64(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 868{ 869 Builder bld(ctx->program, ctx->block); 870 bld.is_precise = instr->exact; 871 872 Temp src0 = get_alu_src(ctx, instr->src[0]); 873 Temp src1 = get_alu_src(ctx, instr->src[1]); 874 875 if (src1.type() == RegType::sgpr) { 876 assert(src0.type() == RegType::vgpr); 877 std::swap(src0, src1); 878 } 879 880 Temp src00 = bld.tmp(src0.type(), 1); 881 Temp src01 = bld.tmp(src0.type(), 1); 882 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 883 Temp src10 = bld.tmp(v1); 884 Temp src11 = bld.tmp(v1); 885 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 886 Temp lo = bld.vop2(op, bld.def(v1), src00, src10); 887 Temp hi = bld.vop2(op, bld.def(v1), src01, src11); 888 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 889} 890 891void 892emit_vop3a_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 893 bool flush_denorms = false, unsigned num_sources = 2, bool swap_srcs = false) 894{ 895 assert(num_sources == 2 || num_sources == 3); 896 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; 897 bool has_sgpr = false; 898 for (unsigned i = 0; i < num_sources; i++) { 899 src[i] = get_alu_src(ctx, instr->src[swap_srcs ? 
1 - i : i]); 900 if (has_sgpr) 901 src[i] = as_vgpr(ctx, src[i]); 902 else 903 has_sgpr = src[i].type() == RegType::sgpr; 904 } 905 906 Builder bld(ctx->program, ctx->block); 907 bld.is_precise = instr->exact; 908 if (flush_denorms && ctx->program->chip_class < GFX9) { 909 Temp tmp; 910 if (num_sources == 3) 911 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1], src[2]); 912 else 913 tmp = bld.vop3(op, bld.def(dst.regClass()), src[0], src[1]); 914 if (dst.size() == 1) 915 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0x3f800000u), tmp); 916 else 917 bld.vop3(aco_opcode::v_mul_f64, Definition(dst), Operand::c64(0x3FF0000000000000), tmp); 918 } else if (num_sources == 3) { 919 bld.vop3(op, Definition(dst), src[0], src[1], src[2]); 920 } else { 921 bld.vop3(op, Definition(dst), src[0], src[1]); 922 } 923} 924 925Builder::Result 926emit_vop3p_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, 927 bool swap_srcs = false) 928{ 929 Temp src0 = get_alu_src_vop3p(ctx, instr->src[swap_srcs]); 930 Temp src1 = get_alu_src_vop3p(ctx, instr->src[!swap_srcs]); 931 if (src0.type() == RegType::sgpr && src1.type() == RegType::sgpr) 932 src1 = as_vgpr(ctx, src1); 933 assert(instr->dest.dest.ssa.num_components == 2); 934 935 /* swizzle to opsel: all swizzles are either 0 (x) or 1 (y) */ 936 unsigned opsel_lo = 937 (instr->src[!swap_srcs].swizzle[0] & 1) << 1 | (instr->src[swap_srcs].swizzle[0] & 1); 938 unsigned opsel_hi = 939 (instr->src[!swap_srcs].swizzle[1] & 1) << 1 | (instr->src[swap_srcs].swizzle[1] & 1); 940 941 Builder bld(ctx->program, ctx->block); 942 bld.is_precise = instr->exact; 943 Builder::Result res = bld.vop3p(op, Definition(dst), src0, src1, opsel_lo, opsel_hi); 944 emit_split_vector(ctx, dst, 2); 945 return res; 946} 947 948void 949emit_idot_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst, bool clamp) 950{ 951 Temp src[3] = {Temp(0, v1), Temp(0, v1), Temp(0, v1)}; 952 bool has_sgpr = false; 953 for (unsigned i = 0; i < 3; i++) { 954 src[i] = get_alu_src(ctx, instr->src[i]); 955 if (has_sgpr) 956 src[i] = as_vgpr(ctx, src[i]); 957 else 958 has_sgpr = src[i].type() == RegType::sgpr; 959 } 960 961 Builder bld(ctx->program, ctx->block); 962 bld.is_precise = instr->exact; 963 bld.vop3p(op, Definition(dst), src[0], src[1], src[2], 0x0, 0x7).instr->vop3p().clamp = clamp; 964} 965 966void 967emit_vop1_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 968{ 969 Builder bld(ctx->program, ctx->block); 970 bld.is_precise = instr->exact; 971 if (dst.type() == RegType::sgpr) 972 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 973 bld.vop1(op, bld.def(RegType::vgpr, dst.size()), get_alu_src(ctx, instr->src[0]))); 974 else 975 bld.vop1(op, Definition(dst), get_alu_src(ctx, instr->src[0])); 976} 977 978void 979emit_vopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 980{ 981 Temp src0 = get_alu_src(ctx, instr->src[0]); 982 Temp src1 = get_alu_src(ctx, instr->src[1]); 983 assert(src0.size() == src1.size()); 984 985 aco_ptr<Instruction> vopc; 986 if (src1.type() == RegType::sgpr) { 987 if (src0.type() == RegType::vgpr) { 988 /* to swap the operands, we might also have to change the opcode */ 989 switch (op) { 990 case aco_opcode::v_cmp_lt_f16: op = aco_opcode::v_cmp_gt_f16; break; 991 case aco_opcode::v_cmp_ge_f16: op = aco_opcode::v_cmp_le_f16; break; 992 case aco_opcode::v_cmp_lt_i16: op = aco_opcode::v_cmp_gt_i16; break; 993 case aco_opcode::v_cmp_ge_i16: op = 
aco_opcode::v_cmp_le_i16; break; 994 case aco_opcode::v_cmp_lt_u16: op = aco_opcode::v_cmp_gt_u16; break; 995 case aco_opcode::v_cmp_ge_u16: op = aco_opcode::v_cmp_le_u16; break; 996 case aco_opcode::v_cmp_lt_f32: op = aco_opcode::v_cmp_gt_f32; break; 997 case aco_opcode::v_cmp_ge_f32: op = aco_opcode::v_cmp_le_f32; break; 998 case aco_opcode::v_cmp_lt_i32: op = aco_opcode::v_cmp_gt_i32; break; 999 case aco_opcode::v_cmp_ge_i32: op = aco_opcode::v_cmp_le_i32; break; 1000 case aco_opcode::v_cmp_lt_u32: op = aco_opcode::v_cmp_gt_u32; break; 1001 case aco_opcode::v_cmp_ge_u32: op = aco_opcode::v_cmp_le_u32; break; 1002 case aco_opcode::v_cmp_lt_f64: op = aco_opcode::v_cmp_gt_f64; break; 1003 case aco_opcode::v_cmp_ge_f64: op = aco_opcode::v_cmp_le_f64; break; 1004 case aco_opcode::v_cmp_lt_i64: op = aco_opcode::v_cmp_gt_i64; break; 1005 case aco_opcode::v_cmp_ge_i64: op = aco_opcode::v_cmp_le_i64; break; 1006 case aco_opcode::v_cmp_lt_u64: op = aco_opcode::v_cmp_gt_u64; break; 1007 case aco_opcode::v_cmp_ge_u64: op = aco_opcode::v_cmp_le_u64; break; 1008 default: /* eq and ne are commutative */ break; 1009 } 1010 Temp t = src0; 1011 src0 = src1; 1012 src1 = t; 1013 } else { 1014 src1 = as_vgpr(ctx, src1); 1015 } 1016 } 1017 1018 Builder bld(ctx->program, ctx->block); 1019 bld.vopc(op, bld.hint_vcc(Definition(dst)), src0, src1); 1020} 1021 1022void 1023emit_sopc_instruction(isel_context* ctx, nir_alu_instr* instr, aco_opcode op, Temp dst) 1024{ 1025 Temp src0 = get_alu_src(ctx, instr->src[0]); 1026 Temp src1 = get_alu_src(ctx, instr->src[1]); 1027 Builder bld(ctx->program, ctx->block); 1028 1029 assert(dst.regClass() == bld.lm); 1030 assert(src0.type() == RegType::sgpr); 1031 assert(src1.type() == RegType::sgpr); 1032 assert(src0.regClass() == src1.regClass()); 1033 1034 /* Emit the SALU comparison instruction */ 1035 Temp cmp = bld.sopc(op, bld.scc(bld.def(s1)), src0, src1); 1036 /* Turn the result into a per-lane bool */ 1037 bool_to_vector_condition(ctx, cmp, dst); 1038} 1039 1040void 1041emit_comparison(isel_context* ctx, nir_alu_instr* instr, Temp dst, aco_opcode v16_op, 1042 aco_opcode v32_op, aco_opcode v64_op, aco_opcode s32_op = aco_opcode::num_opcodes, 1043 aco_opcode s64_op = aco_opcode::num_opcodes) 1044{ 1045 aco_opcode s_op = instr->src[0].src.ssa->bit_size == 64 ? s64_op 1046 : instr->src[0].src.ssa->bit_size == 32 ? s32_op 1047 : aco_opcode::num_opcodes; 1048 aco_opcode v_op = instr->src[0].src.ssa->bit_size == 64 ? v64_op 1049 : instr->src[0].src.ssa->bit_size == 32 ? v32_op 1050 : v16_op; 1051 bool use_valu = s_op == aco_opcode::num_opcodes || nir_dest_is_divergent(instr->dest.dest) || 1052 get_ssa_temp(ctx, instr->src[0].src.ssa).type() == RegType::vgpr || 1053 get_ssa_temp(ctx, instr->src[1].src.ssa).type() == RegType::vgpr; 1054 aco_opcode op = use_valu ? 
v_op : s_op; 1055 assert(op != aco_opcode::num_opcodes); 1056 assert(dst.regClass() == ctx->program->lane_mask); 1057 1058 if (use_valu) 1059 emit_vopc_instruction(ctx, instr, op, dst); 1060 else 1061 emit_sopc_instruction(ctx, instr, op, dst); 1062} 1063 1064void 1065emit_boolean_logic(isel_context* ctx, nir_alu_instr* instr, Builder::WaveSpecificOpcode op, 1066 Temp dst) 1067{ 1068 Builder bld(ctx->program, ctx->block); 1069 Temp src0 = get_alu_src(ctx, instr->src[0]); 1070 Temp src1 = get_alu_src(ctx, instr->src[1]); 1071 1072 assert(dst.regClass() == bld.lm); 1073 assert(src0.regClass() == bld.lm); 1074 assert(src1.regClass() == bld.lm); 1075 1076 bld.sop2(op, Definition(dst), bld.def(s1, scc), src0, src1); 1077} 1078 1079void 1080emit_bcsel(isel_context* ctx, nir_alu_instr* instr, Temp dst) 1081{ 1082 Builder bld(ctx->program, ctx->block); 1083 Temp cond = get_alu_src(ctx, instr->src[0]); 1084 Temp then = get_alu_src(ctx, instr->src[1]); 1085 Temp els = get_alu_src(ctx, instr->src[2]); 1086 1087 assert(cond.regClass() == bld.lm); 1088 1089 if (dst.type() == RegType::vgpr) { 1090 aco_ptr<Instruction> bcsel; 1091 if (dst.size() == 1) { 1092 then = as_vgpr(ctx, then); 1093 els = as_vgpr(ctx, els); 1094 1095 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), els, then, cond); 1096 } else if (dst.size() == 2) { 1097 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); 1098 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), then); 1099 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); 1100 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), els); 1101 1102 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, cond); 1103 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, cond); 1104 1105 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1106 } else { 1107 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1108 } 1109 return; 1110 } 1111 1112 if (instr->dest.dest.ssa.bit_size == 1) { 1113 assert(dst.regClass() == bld.lm); 1114 assert(then.regClass() == bld.lm); 1115 assert(els.regClass() == bld.lm); 1116 } 1117 1118 if (!nir_src_is_divergent(instr->src[0].src)) { /* uniform condition and values in sgpr */ 1119 if (dst.regClass() == s1 || dst.regClass() == s2) { 1120 assert((then.regClass() == s1 || then.regClass() == s2) && 1121 els.regClass() == then.regClass()); 1122 assert(dst.size() == then.size()); 1123 aco_opcode op = 1124 dst.regClass() == s1 ? aco_opcode::s_cselect_b32 : aco_opcode::s_cselect_b64; 1125 bld.sop2(op, Definition(dst), then, els, bld.scc(bool_to_scalar_condition(ctx, cond))); 1126 } else { 1127 isel_err(&instr->instr, "Unimplemented uniform bcsel bit size"); 1128 } 1129 return; 1130 } 1131 1132 /* divergent boolean bcsel 1133 * this implements bcsel on bools: dst = s0 ? 
s1 : s2 1134 * are going to be: dst = (s0 & s1) | (~s0 & s2) */ 1135 assert(instr->dest.dest.ssa.bit_size == 1); 1136 1137 if (cond.id() != then.id()) 1138 then = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), cond, then); 1139 1140 if (cond.id() == els.id()) 1141 bld.copy(Definition(dst), then); 1142 else 1143 bld.sop2(Builder::s_or, Definition(dst), bld.def(s1, scc), then, 1144 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), els, cond)); 1145} 1146 1147void 1148emit_scaled_op(isel_context* ctx, Builder& bld, Definition dst, Temp val, aco_opcode op, 1149 uint32_t undo) 1150{ 1151 /* multiply by 16777216 to handle denormals */ 1152 Temp is_denormal = 1153 bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), as_vgpr(ctx, val), 1154 bld.copy(bld.def(v1), Operand::c32((1u << 7) | (1u << 4)))); 1155 Temp scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(0x4b800000u), val); 1156 scaled = bld.vop1(op, bld.def(v1), scaled); 1157 scaled = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::c32(undo), scaled); 1158 1159 Temp not_scaled = bld.vop1(op, bld.def(v1), val); 1160 1161 bld.vop2(aco_opcode::v_cndmask_b32, dst, not_scaled, scaled, is_denormal); 1162} 1163 1164void 1165emit_rcp(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1166{ 1167 if (ctx->block->fp_mode.denorm32 == 0) { 1168 bld.vop1(aco_opcode::v_rcp_f32, dst, val); 1169 return; 1170 } 1171 1172 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rcp_f32, 0x4b800000u); 1173} 1174 1175void 1176emit_rsq(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1177{ 1178 if (ctx->block->fp_mode.denorm32 == 0) { 1179 bld.vop1(aco_opcode::v_rsq_f32, dst, val); 1180 return; 1181 } 1182 1183 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_rsq_f32, 0x45800000u); 1184} 1185 1186void 1187emit_sqrt(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1188{ 1189 if (ctx->block->fp_mode.denorm32 == 0) { 1190 bld.vop1(aco_opcode::v_sqrt_f32, dst, val); 1191 return; 1192 } 1193 1194 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_sqrt_f32, 0x39800000u); 1195} 1196 1197void 1198emit_log2(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1199{ 1200 if (ctx->block->fp_mode.denorm32 == 0) { 1201 bld.vop1(aco_opcode::v_log_f32, dst, val); 1202 return; 1203 } 1204 1205 emit_scaled_op(ctx, bld, dst, val, aco_opcode::v_log_f32, 0xc1c00000u); 1206} 1207 1208Temp 1209emit_trunc_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1210{ 1211 if (ctx->options->chip_class >= GFX7) 1212 return bld.vop1(aco_opcode::v_trunc_f64, Definition(dst), val); 1213 1214 /* GFX6 doesn't support V_TRUNC_F64, lower it. */ 1215 /* TODO: create more efficient code! */ 1216 if (val.type() == RegType::sgpr) 1217 val = as_vgpr(ctx, val); 1218 1219 /* Split the input value. */ 1220 Temp val_lo = bld.tmp(v1), val_hi = bld.tmp(v1); 1221 bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val); 1222 1223 /* Extract the exponent and compute the unbiased value. */ 1224 Temp exponent = 1225 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), val_hi, Operand::c32(20u), Operand::c32(11u)); 1226 exponent = bld.vsub32(bld.def(v1), exponent, Operand::c32(1023u)); 1227 1228 /* Extract the fractional part. 
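    * The mantissa mask shifted right by the unbiased exponent covers exactly the
    * fractional bits; clearing those bits in the source produces the truncated
    * value for exponents in the range [0, 51] (smaller and larger exponents are
    * handled by the selects below).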
*/ 1229 Temp fract_mask = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), 1230 Operand::c32(0x000fffffu)); 1231 fract_mask = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), fract_mask, exponent); 1232 1233 Temp fract_mask_lo = bld.tmp(v1), fract_mask_hi = bld.tmp(v1); 1234 bld.pseudo(aco_opcode::p_split_vector, Definition(fract_mask_lo), Definition(fract_mask_hi), 1235 fract_mask); 1236 1237 Temp fract_lo = bld.tmp(v1), fract_hi = bld.tmp(v1); 1238 Temp tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_lo); 1239 fract_lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_lo, tmp); 1240 tmp = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), fract_mask_hi); 1241 fract_hi = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), val_hi, tmp); 1242 1243 /* Get the sign bit. */ 1244 Temp sign = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x80000000u), val_hi); 1245 1246 /* Decide the operation to apply depending on the unbiased exponent. */ 1247 Temp exp_lt0 = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), exponent, 1248 Operand::zero()); 1249 Temp dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_lo, 1250 bld.copy(bld.def(v1), Operand::zero()), exp_lt0); 1251 Temp dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), fract_hi, sign, exp_lt0); 1252 Temp exp_gt51 = bld.vopc_e64(aco_opcode::v_cmp_gt_i32, bld.def(s2), exponent, Operand::c32(51u)); 1253 dst_lo = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_lo, val_lo, exp_gt51); 1254 dst_hi = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), dst_hi, val_hi, exp_gt51); 1255 1256 return bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst_lo, dst_hi); 1257} 1258 1259Temp 1260emit_floor_f64(isel_context* ctx, Builder& bld, Definition dst, Temp val) 1261{ 1262 if (ctx->options->chip_class >= GFX7) 1263 return bld.vop1(aco_opcode::v_floor_f64, Definition(dst), val); 1264 1265 /* GFX6 doesn't support V_FLOOR_F64, lower it (note that it's actually 1266 * lowered at NIR level for precision reasons). 
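    * The code below computes floor(x) as x - min(fract(x), c), where c is the
    * largest double below 1.0, using v_add_f64 with a negated second source; for
    * NaN inputs the subtrahend is x itself so the NaN propagates to the result.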
*/ 1267 Temp src0 = as_vgpr(ctx, val); 1268 1269 Temp mask = bld.copy(bld.def(s1), Operand::c32(3u)); /* isnan */ 1270 Temp min_val = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::c32(-1u), 1271 Operand::c32(0x3fefffffu)); 1272 1273 Temp isnan = 1274 bld.vopc_e64(aco_opcode::v_cmp_class_f64, bld.hint_vcc(bld.def(bld.lm)), src0, mask); 1275 Temp fract = bld.vop1(aco_opcode::v_fract_f64, bld.def(v2), src0); 1276 Temp min = bld.vop3(aco_opcode::v_min_f64, bld.def(v2), fract, min_val); 1277 1278 Temp then_lo = bld.tmp(v1), then_hi = bld.tmp(v1); 1279 bld.pseudo(aco_opcode::p_split_vector, Definition(then_lo), Definition(then_hi), src0); 1280 Temp else_lo = bld.tmp(v1), else_hi = bld.tmp(v1); 1281 bld.pseudo(aco_opcode::p_split_vector, Definition(else_lo), Definition(else_hi), min); 1282 1283 Temp dst0 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_lo, then_lo, isnan); 1284 Temp dst1 = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), else_hi, then_hi, isnan); 1285 1286 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), dst0, dst1); 1287 1288 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src0, v); 1289 add->vop3().neg[1] = true; 1290 1291 return add->definitions[0].getTemp(); 1292} 1293 1294Temp 1295uadd32_sat(Builder& bld, Definition dst, Temp src0, Temp src1) 1296{ 1297 if (bld.program->chip_class < GFX8) { 1298 Builder::Result add = bld.vadd32(bld.def(v1), src0, src1, true); 1299 return bld.vop2_e64(aco_opcode::v_cndmask_b32, dst, add.def(0).getTemp(), Operand::c32(-1), 1300 add.def(1).getTemp()); 1301 } 1302 1303 Builder::Result add(NULL); 1304 if (bld.program->chip_class >= GFX9) { 1305 add = bld.vop2_e64(aco_opcode::v_add_u32, dst, src0, src1); 1306 } else { 1307 add = bld.vop2_e64(aco_opcode::v_add_co_u32, dst, bld.hint_vcc(bld.def(bld.lm)), src0, src1); 1308 } 1309 add.instr->vop3().clamp = 1; 1310 return dst.getTemp(); 1311} 1312 1313void 1314visit_alu_instr(isel_context* ctx, nir_alu_instr* instr) 1315{ 1316 if (!instr->dest.dest.is_ssa) { 1317 isel_err(&instr->instr, "nir alu dst not in ssa"); 1318 abort(); 1319 } 1320 Builder bld(ctx->program, ctx->block); 1321 bld.is_precise = instr->exact; 1322 Temp dst = get_ssa_temp(ctx, &instr->dest.dest.ssa); 1323 switch (instr->op) { 1324 case nir_op_vec2: 1325 case nir_op_vec3: 1326 case nir_op_vec4: 1327 case nir_op_vec5: { 1328 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 1329 unsigned num = instr->dest.dest.ssa.num_components; 1330 for (unsigned i = 0; i < num; ++i) 1331 elems[i] = get_alu_src(ctx, instr->src[i]); 1332 1333 if (instr->dest.dest.ssa.bit_size >= 32 || dst.type() == RegType::vgpr) { 1334 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 1335 aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.dest.ssa.num_components, 1)}; 1336 RegClass elem_rc = RegClass::get(RegType::vgpr, instr->dest.dest.ssa.bit_size / 8u); 1337 for (unsigned i = 0; i < num; ++i) { 1338 if (elems[i].type() == RegType::sgpr && elem_rc.is_subdword()) 1339 elems[i] = emit_extract_vector(ctx, elems[i], 0, elem_rc); 1340 vec->operands[i] = Operand{elems[i]}; 1341 } 1342 vec->definitions[0] = Definition(dst); 1343 ctx->block->instructions.emplace_back(std::move(vec)); 1344 ctx->allocated_vec.emplace(dst.id(), elems); 1345 } else { 1346 bool use_s_pack = ctx->program->chip_class >= GFX9; 1347 Temp mask = bld.copy(bld.def(s1), Operand::c32((1u << instr->dest.dest.ssa.bit_size) - 1)); 1348 1349 std::array<Temp, NIR_MAX_VEC_COMPONENTS> packed; 1350 uint32_t 
const_vals[NIR_MAX_VEC_COMPONENTS] = {}; 1351 for (unsigned i = 0; i < num; i++) { 1352 unsigned packed_size = use_s_pack ? 16 : 32; 1353 unsigned idx = i * instr->dest.dest.ssa.bit_size / packed_size; 1354 unsigned offset = i * instr->dest.dest.ssa.bit_size % packed_size; 1355 if (nir_src_is_const(instr->src[i].src)) { 1356 const_vals[idx] |= nir_src_as_uint(instr->src[i].src) << offset; 1357 continue; 1358 } 1359 1360 if (offset != packed_size - instr->dest.dest.ssa.bit_size) 1361 elems[i] = 1362 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), elems[i], mask); 1363 1364 if (offset) 1365 elems[i] = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), elems[i], 1366 Operand::c32(offset)); 1367 1368 if (packed[idx].id()) 1369 packed[idx] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), elems[i], 1370 packed[idx]); 1371 else 1372 packed[idx] = elems[i]; 1373 } 1374 1375 if (use_s_pack) { 1376 for (unsigned i = 0; i < dst.size(); i++) { 1377 bool same = !!packed[i * 2].id() == !!packed[i * 2 + 1].id(); 1378 1379 if (packed[i * 2].id() && packed[i * 2 + 1].id()) 1380 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], 1381 packed[i * 2 + 1]); 1382 else if (packed[i * 2 + 1].id()) 1383 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), 1384 Operand::c32(const_vals[i * 2]), packed[i * 2 + 1]); 1385 else if (packed[i * 2].id()) 1386 packed[i] = bld.sop2(aco_opcode::s_pack_ll_b32_b16, bld.def(s1), packed[i * 2], 1387 Operand::c32(const_vals[i * 2 + 1])); 1388 1389 if (same) 1390 const_vals[i] = const_vals[i * 2] | (const_vals[i * 2 + 1] << 16); 1391 else 1392 const_vals[i] = 0; 1393 } 1394 } 1395 1396 for (unsigned i = 0; i < dst.size(); i++) { 1397 if (const_vals[i] && packed[i].id()) 1398 packed[i] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 1399 Operand::c32(const_vals[i]), packed[i]); 1400 else if (!packed[i].id()) 1401 packed[i] = bld.copy(bld.def(s1), Operand::c32(const_vals[i])); 1402 } 1403 1404 if (dst.size() == 1) 1405 bld.copy(Definition(dst), packed[0]); 1406 else if (dst.size() == 2) 1407 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1]); 1408 else 1409 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), packed[0], packed[1], 1410 packed[2]); 1411 } 1412 break; 1413 } 1414 case nir_op_mov: { 1415 Temp src = get_alu_src(ctx, instr->src[0]); 1416 if (src.type() == RegType::vgpr && dst.type() == RegType::sgpr) { 1417 /* use size() instead of bytes() for 8/16-bit */ 1418 assert(src.size() == dst.size() && "wrong src or dst register class for nir_op_mov"); 1419 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), src); 1420 } else { 1421 assert(src.bytes() == dst.bytes() && "wrong src or dst register class for nir_op_mov"); 1422 bld.copy(Definition(dst), src); 1423 } 1424 break; 1425 } 1426 case nir_op_inot: { 1427 Temp src = get_alu_src(ctx, instr->src[0]); 1428 if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1429 emit_vop1_instruction(ctx, instr, aco_opcode::v_not_b32, dst); 1430 } else if (dst.regClass() == v2) { 1431 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 1432 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 1433 lo = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), lo); 1434 hi = bld.vop1(aco_opcode::v_not_b32, bld.def(v1), hi); 1435 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 1436 } else if (dst.type() == RegType::sgpr) { 1437 aco_opcode opcode = dst.size() == 1 ? 
aco_opcode::s_not_b32 : aco_opcode::s_not_b64; 1438 bld.sop1(opcode, Definition(dst), bld.def(s1, scc), src); 1439 } else { 1440 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1441 } 1442 break; 1443 } 1444 case nir_op_iabs: { 1445 Temp src = get_alu_src(ctx, instr->src[0]); 1446 if (dst.regClass() == s1) { 1447 bld.sop1(aco_opcode::s_abs_i32, Definition(dst), bld.def(s1, scc), src); 1448 } else if (dst.regClass() == v1) { 1449 bld.vop2(aco_opcode::v_max_i32, Definition(dst), src, 1450 bld.vsub32(bld.def(v1), Operand::zero(), src)); 1451 } else { 1452 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1453 } 1454 break; 1455 } 1456 case nir_op_isign: { 1457 Temp src = get_alu_src(ctx, instr->src[0]); 1458 if (dst.regClass() == s1) { 1459 Temp tmp = 1460 bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(-1)); 1461 bld.sop2(aco_opcode::s_min_i32, Definition(dst), bld.def(s1, scc), tmp, Operand::c32(1u)); 1462 } else if (dst.regClass() == s2) { 1463 Temp neg = 1464 bld.sop2(aco_opcode::s_ashr_i64, bld.def(s2), bld.def(s1, scc), src, Operand::c32(63u)); 1465 Temp neqz; 1466 if (ctx->program->chip_class >= GFX8) 1467 neqz = bld.sopc(aco_opcode::s_cmp_lg_u64, bld.def(s1, scc), src, Operand::zero()); 1468 else 1469 neqz = 1470 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), src, Operand::zero()) 1471 .def(1) 1472 .getTemp(); 1473 /* SCC gets zero-extended to 64 bit */ 1474 bld.sop2(aco_opcode::s_or_b64, Definition(dst), bld.def(s1, scc), neg, bld.scc(neqz)); 1475 } else if (dst.regClass() == v1) { 1476 bld.vop3(aco_opcode::v_med3_i32, Definition(dst), Operand::c32(-1), src, Operand::c32(1u)); 1477 } else if (dst.regClass() == v2) { 1478 Temp upper = emit_extract_vector(ctx, src, 1, v1); 1479 Temp neg = bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), upper); 1480 Temp gtz = 1481 bld.vopc(aco_opcode::v_cmp_ge_i64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src); 1482 Temp lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(1u), neg, gtz); 1483 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), neg, gtz); 1484 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 1485 } else { 1486 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1487 } 1488 break; 1489 } 1490 case nir_op_imax: { 1491 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1492 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_i16_e64, dst); 1493 } else if (dst.regClass() == v2b) { 1494 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i16, dst, true); 1495 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1496 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_i16, dst); 1497 } else if (dst.regClass() == v1) { 1498 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_i32, dst, true); 1499 } else if (dst.regClass() == s1) { 1500 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_i32, dst, true); 1501 } else { 1502 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1503 } 1504 break; 1505 } 1506 case nir_op_umax: { 1507 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1508 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_u16_e64, dst); 1509 } else if (dst.regClass() == v2b) { 1510 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u16, dst, true); 1511 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1512 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_u16, dst); 
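         /* The packed VOP3P form above computes both 16-bit halves of the dword in a
          * single instruction. */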
1513 } else if (dst.regClass() == v1) { 1514 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_u32, dst, true); 1515 } else if (dst.regClass() == s1) { 1516 emit_sop2_instruction(ctx, instr, aco_opcode::s_max_u32, dst, true); 1517 } else { 1518 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1519 } 1520 break; 1521 } 1522 case nir_op_imin: { 1523 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1524 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_i16_e64, dst); 1525 } else if (dst.regClass() == v2b) { 1526 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i16, dst, true); 1527 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1528 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_i16, dst); 1529 } else if (dst.regClass() == v1) { 1530 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_i32, dst, true); 1531 } else if (dst.regClass() == s1) { 1532 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_i32, dst, true); 1533 } else { 1534 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1535 } 1536 break; 1537 } 1538 case nir_op_umin: { 1539 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1540 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_u16_e64, dst); 1541 } else if (dst.regClass() == v2b) { 1542 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u16, dst, true); 1543 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1544 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_u16, dst); 1545 } else if (dst.regClass() == v1) { 1546 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_u32, dst, true); 1547 } else if (dst.regClass() == s1) { 1548 emit_sop2_instruction(ctx, instr, aco_opcode::s_min_u32, dst, true); 1549 } else { 1550 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1551 } 1552 break; 1553 } 1554 case nir_op_ior: { 1555 if (instr->dest.dest.ssa.bit_size == 1) { 1556 emit_boolean_logic(ctx, instr, Builder::s_or, dst); 1557 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1558 emit_vop2_instruction(ctx, instr, aco_opcode::v_or_b32, dst, true); 1559 } else if (dst.regClass() == v2) { 1560 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_or_b32, dst); 1561 } else if (dst.regClass() == s1) { 1562 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b32, dst, true); 1563 } else if (dst.regClass() == s2) { 1564 emit_sop2_instruction(ctx, instr, aco_opcode::s_or_b64, dst, true); 1565 } else { 1566 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1567 } 1568 break; 1569 } 1570 case nir_op_iand: { 1571 if (instr->dest.dest.ssa.bit_size == 1) { 1572 emit_boolean_logic(ctx, instr, Builder::s_and, dst); 1573 } else if (dst.regClass() == v1 || dst.regClass() == v2b || dst.regClass() == v1b) { 1574 emit_vop2_instruction(ctx, instr, aco_opcode::v_and_b32, dst, true); 1575 } else if (dst.regClass() == v2) { 1576 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_and_b32, dst); 1577 } else if (dst.regClass() == s1) { 1578 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b32, dst, true); 1579 } else if (dst.regClass() == s2) { 1580 emit_sop2_instruction(ctx, instr, aco_opcode::s_and_b64, dst, true); 1581 } else { 1582 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1583 } 1584 break; 1585 } 1586 case nir_op_ixor: { 1587 if (instr->dest.dest.ssa.bit_size == 1) { 1588 emit_boolean_logic(ctx, instr, Builder::s_xor, dst); 1589 } else if (dst.regClass() == v1 || dst.regClass() == v2b || 
dst.regClass() == v1b) { 1590 emit_vop2_instruction(ctx, instr, aco_opcode::v_xor_b32, dst, true); 1591 } else if (dst.regClass() == v2) { 1592 emit_vop2_instruction_logic64(ctx, instr, aco_opcode::v_xor_b32, dst); 1593 } else if (dst.regClass() == s1) { 1594 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b32, dst, true); 1595 } else if (dst.regClass() == s2) { 1596 emit_sop2_instruction(ctx, instr, aco_opcode::s_xor_b64, dst, true); 1597 } else { 1598 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1599 } 1600 break; 1601 } 1602 case nir_op_ushr: { 1603 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1604 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshrrev_b16_e64, dst, false, 2, true); 1605 } else if (dst.regClass() == v2b) { 1606 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b16, dst, false, true); 1607 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1608 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshrrev_b16, dst, true); 1609 } else if (dst.regClass() == v1) { 1610 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshrrev_b32, dst, false, true); 1611 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { 1612 bld.vop3(aco_opcode::v_lshrrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1613 get_alu_src(ctx, instr->src[0])); 1614 } else if (dst.regClass() == v2) { 1615 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshr_b64, dst); 1616 } else if (dst.regClass() == s2) { 1617 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b64, dst, true); 1618 } else if (dst.regClass() == s1) { 1619 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshr_b32, dst, true); 1620 } else { 1621 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1622 } 1623 break; 1624 } 1625 case nir_op_ishl: { 1626 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1627 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshlrev_b16_e64, dst, false, 2, true); 1628 } else if (dst.regClass() == v2b) { 1629 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b16, dst, false, true); 1630 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1631 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_lshlrev_b16, dst, true); 1632 } else if (dst.regClass() == v1) { 1633 emit_vop2_instruction(ctx, instr, aco_opcode::v_lshlrev_b32, dst, false, true, false, 1634 false, 2); 1635 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { 1636 bld.vop3(aco_opcode::v_lshlrev_b64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1637 get_alu_src(ctx, instr->src[0])); 1638 } else if (dst.regClass() == v2) { 1639 emit_vop3a_instruction(ctx, instr, aco_opcode::v_lshl_b64, dst); 1640 } else if (dst.regClass() == s1) { 1641 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b32, dst, true, 1); 1642 } else if (dst.regClass() == s2) { 1643 emit_sop2_instruction(ctx, instr, aco_opcode::s_lshl_b64, dst, true); 1644 } else { 1645 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1646 } 1647 break; 1648 } 1649 case nir_op_ishr: { 1650 if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10) { 1651 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashrrev_i16_e64, dst, false, 2, true); 1652 } else if (dst.regClass() == v2b) { 1653 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i16, dst, false, true); 1654 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1655 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_ashrrev_i16, dst, 
true); 1656 } else if (dst.regClass() == v1) { 1657 emit_vop2_instruction(ctx, instr, aco_opcode::v_ashrrev_i32, dst, false, true); 1658 } else if (dst.regClass() == v2 && ctx->program->chip_class >= GFX8) { 1659 bld.vop3(aco_opcode::v_ashrrev_i64, Definition(dst), get_alu_src(ctx, instr->src[1]), 1660 get_alu_src(ctx, instr->src[0])); 1661 } else if (dst.regClass() == v2) { 1662 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ashr_i64, dst); 1663 } else if (dst.regClass() == s1) { 1664 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i32, dst, true); 1665 } else if (dst.regClass() == s2) { 1666 emit_sop2_instruction(ctx, instr, aco_opcode::s_ashr_i64, dst, true); 1667 } else { 1668 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1669 } 1670 break; 1671 } 1672 case nir_op_find_lsb: { 1673 Temp src = get_alu_src(ctx, instr->src[0]); 1674 if (src.regClass() == s1) { 1675 bld.sop1(aco_opcode::s_ff1_i32_b32, Definition(dst), src); 1676 } else if (src.regClass() == v1) { 1677 emit_vop1_instruction(ctx, instr, aco_opcode::v_ffbl_b32, dst); 1678 } else if (src.regClass() == s2) { 1679 bld.sop1(aco_opcode::s_ff1_i32_b64, Definition(dst), src); 1680 } else { 1681 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1682 } 1683 break; 1684 } 1685 case nir_op_ufind_msb: 1686 case nir_op_ifind_msb: { 1687 Temp src = get_alu_src(ctx, instr->src[0]); 1688 if (src.regClass() == s1 || src.regClass() == s2) { 1689 aco_opcode op = src.regClass() == s2 1690 ? (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b64 1691 : aco_opcode::s_flbit_i32_i64) 1692 : (instr->op == nir_op_ufind_msb ? aco_opcode::s_flbit_i32_b32 1693 : aco_opcode::s_flbit_i32); 1694 Temp msb_rev = bld.sop1(op, bld.def(s1), src); 1695 1696 Builder::Result sub = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 1697 Operand::c32(src.size() * 32u - 1u), msb_rev); 1698 Temp msb = sub.def(0).getTemp(); 1699 Temp carry = sub.def(1).getTemp(); 1700 1701 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), msb, 1702 bld.scc(carry)); 1703 } else if (src.regClass() == v1) { 1704 aco_opcode op = 1705 instr->op == nir_op_ufind_msb ? aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; 1706 Temp msb_rev = bld.tmp(v1); 1707 emit_vop1_instruction(ctx, instr, op, msb_rev); 1708 Temp msb = bld.tmp(v1); 1709 Temp carry = 1710 bld.vsub32(Definition(msb), Operand::c32(31u), Operand(msb_rev), true).def(1).getTemp(); 1711 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry); 1712 } else if (src.regClass() == v2) { 1713 aco_opcode op = 1714 instr->op == nir_op_ufind_msb ? 
aco_opcode::v_ffbh_u32 : aco_opcode::v_ffbh_i32; 1715 1716 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 1717 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 1718 1719 lo = uadd32_sat(bld, bld.def(v1), bld.copy(bld.def(s1), Operand::c32(32u)), 1720 bld.vop1(op, bld.def(v1), lo)); 1721 hi = bld.vop1(op, bld.def(v1), hi); 1722 Temp found_hi = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::c32(-1), hi); 1723 1724 Temp msb_rev = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lo, hi, found_hi); 1725 1726 Temp msb = bld.tmp(v1); 1727 Temp carry = 1728 bld.vsub32(Definition(msb), Operand::c32(63u), Operand(msb_rev), true).def(1).getTemp(); 1729 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), msb, Operand::c32(-1), carry); 1730 } else { 1731 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1732 } 1733 break; 1734 } 1735 case nir_op_bitfield_reverse: { 1736 if (dst.regClass() == s1) { 1737 bld.sop1(aco_opcode::s_brev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); 1738 } else if (dst.regClass() == v1) { 1739 bld.vop1(aco_opcode::v_bfrev_b32, Definition(dst), get_alu_src(ctx, instr->src[0])); 1740 } else { 1741 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1742 } 1743 break; 1744 } 1745 case nir_op_iadd: { 1746 if (dst.regClass() == s1) { 1747 emit_sop2_instruction(ctx, instr, aco_opcode::s_add_u32, dst, true); 1748 break; 1749 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) { 1750 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_u16_e64, dst); 1751 break; 1752 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { 1753 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_u16, dst, true); 1754 break; 1755 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1756 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_u16, dst); 1757 break; 1758 } 1759 1760 Temp src0 = get_alu_src(ctx, instr->src[0]); 1761 Temp src1 = get_alu_src(ctx, instr->src[1]); 1762 if (dst.type() == RegType::vgpr && dst.bytes() <= 4) { 1763 bld.vadd32(Definition(dst), Operand(src0), Operand(src1)); 1764 break; 1765 } 1766 1767 assert(src0.size() == 2 && src1.size() == 2); 1768 Temp src00 = bld.tmp(src0.type(), 1); 1769 Temp src01 = bld.tmp(dst.type(), 1); 1770 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1771 Temp src10 = bld.tmp(src1.type(), 1); 1772 Temp src11 = bld.tmp(dst.type(), 1); 1773 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1774 1775 if (dst.regClass() == s2) { 1776 Temp carry = bld.tmp(s1); 1777 Temp dst0 = 1778 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); 1779 Temp dst1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), src01, src11, 1780 bld.scc(carry)); 1781 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1782 } else if (dst.regClass() == v2) { 1783 Temp dst0 = bld.tmp(v1); 1784 Temp carry = bld.vadd32(Definition(dst0), src00, src10, true).def(1).getTemp(); 1785 Temp dst1 = bld.vadd32(bld.def(v1), src01, src11, false, carry); 1786 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1787 } else { 1788 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1789 } 1790 break; 1791 } 1792 case nir_op_uadd_sat: { 1793 Temp src0 = get_alu_src(ctx, instr->src[0]); 1794 Temp src1 = get_alu_src(ctx, instr->src[1]); 1795 if (dst.regClass() == s1) { 1796 Temp tmp = bld.tmp(s1), carry = bld.tmp(s1); 1797 
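         /* There is no scalar saturating-add instruction, so lower it by hand:
          *    tmp = src0 + src1;                 // s_add_u32, carry -> SCC
          *    dst = carry ? 0xffffffff : tmp;    // s_cselect_b32
          */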
bld.sop2(aco_opcode::s_add_u32, Definition(tmp), bld.scc(Definition(carry)), src0, src1); 1798 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), Operand::c32(-1), tmp, 1799 bld.scc(carry)); 1800 } else if (dst.regClass() == v2b) { 1801 Instruction* add_instr; 1802 if (ctx->program->chip_class >= GFX10) { 1803 add_instr = bld.vop3(aco_opcode::v_add_u16_e64, Definition(dst), src0, src1).instr; 1804 } else { 1805 if (src1.type() == RegType::sgpr) 1806 std::swap(src0, src1); 1807 add_instr = 1808 bld.vop2_e64(aco_opcode::v_add_u16, Definition(dst), src0, as_vgpr(ctx, src1)).instr; 1809 } 1810 add_instr->vop3().clamp = 1; 1811 } else if (dst.regClass() == v1) { 1812 uadd32_sat(bld, Definition(dst), src0, src1); 1813 } else { 1814 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1815 } 1816 break; 1817 } 1818 case nir_op_iadd_sat: { 1819 Temp src0 = get_alu_src(ctx, instr->src[0]); 1820 Temp src1 = as_vgpr(ctx, get_alu_src(ctx, instr->src[1])); 1821 if (dst.regClass() == v2b) { 1822 Instruction* add_instr = 1823 bld.vop3(aco_opcode::v_add_i16, Definition(dst), src0, src1).instr; 1824 add_instr->vop3().clamp = 1; 1825 } else if (dst.regClass() == v1) { 1826 Instruction* add_instr = 1827 bld.vop3(aco_opcode::v_add_i32, Definition(dst), src0, src1).instr; 1828 add_instr->vop3().clamp = 1; 1829 } else { 1830 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1831 } 1832 break; 1833 } 1834 case nir_op_uadd_carry: { 1835 Temp src0 = get_alu_src(ctx, instr->src[0]); 1836 Temp src1 = get_alu_src(ctx, instr->src[1]); 1837 if (dst.regClass() == s1) { 1838 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); 1839 break; 1840 } 1841 if (dst.regClass() == v1) { 1842 Temp carry = bld.vadd32(bld.def(v1), src0, src1, true).def(1).getTemp(); 1843 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), 1844 carry); 1845 break; 1846 } 1847 1848 Temp src00 = bld.tmp(src0.type(), 1); 1849 Temp src01 = bld.tmp(dst.type(), 1); 1850 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1851 Temp src10 = bld.tmp(src1.type(), 1); 1852 Temp src11 = bld.tmp(dst.type(), 1); 1853 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1854 if (dst.regClass() == s2) { 1855 Temp carry = bld.tmp(s1); 1856 bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), src00, src10); 1857 carry = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, 1858 bld.scc(carry)) 1859 .def(1) 1860 .getTemp(); 1861 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); 1862 } else if (dst.regClass() == v2) { 1863 Temp carry = bld.vadd32(bld.def(v1), src00, src10, true).def(1).getTemp(); 1864 carry = bld.vadd32(bld.def(v1), src01, src11, true, carry).def(1).getTemp(); 1865 carry = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 1866 Operand::c32(1u), carry); 1867 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), carry, Operand::zero()); 1868 } else { 1869 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1870 } 1871 break; 1872 } 1873 case nir_op_isub: { 1874 if (dst.regClass() == s1) { 1875 emit_sop2_instruction(ctx, instr, aco_opcode::s_sub_i32, dst, true); 1876 break; 1877 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1878 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_sub_u16, dst); 1879 break; 1880 } 1881 1882 Temp src0 = get_alu_src(ctx, instr->src[0]); 
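      /* The remaining isub paths below operate on the raw sources: 32-bit VGPR
       * subtraction via vsub32, sub-dword subtraction (v_sub_u16/v_subrev_u16,
       * or vsub32 on GFX6/7), and 64-bit subtraction split into low/high halves
       * with the borrow chained through s_subb_u32 or a second vsub32. */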
1883 Temp src1 = get_alu_src(ctx, instr->src[1]); 1884 if (dst.regClass() == v1) { 1885 bld.vsub32(Definition(dst), src0, src1); 1886 break; 1887 } else if (dst.bytes() <= 2) { 1888 if (ctx->program->chip_class >= GFX10) 1889 bld.vop3(aco_opcode::v_sub_u16_e64, Definition(dst), src0, src1); 1890 else if (src1.type() == RegType::sgpr) 1891 bld.vop2(aco_opcode::v_subrev_u16, Definition(dst), src1, as_vgpr(ctx, src0)); 1892 else if (ctx->program->chip_class >= GFX8) 1893 bld.vop2(aco_opcode::v_sub_u16, Definition(dst), src0, as_vgpr(ctx, src1)); 1894 else 1895 bld.vsub32(Definition(dst), src0, src1); 1896 break; 1897 } 1898 1899 Temp src00 = bld.tmp(src0.type(), 1); 1900 Temp src01 = bld.tmp(dst.type(), 1); 1901 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1902 Temp src10 = bld.tmp(src1.type(), 1); 1903 Temp src11 = bld.tmp(dst.type(), 1); 1904 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1905 if (dst.regClass() == s2) { 1906 Temp borrow = bld.tmp(s1); 1907 Temp dst0 = 1908 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); 1909 Temp dst1 = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), src01, src11, 1910 bld.scc(borrow)); 1911 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 1912 } else if (dst.regClass() == v2) { 1913 Temp lower = bld.tmp(v1); 1914 Temp borrow = bld.vsub32(Definition(lower), src00, src10, true).def(1).getTemp(); 1915 Temp upper = bld.vsub32(bld.def(v1), src01, src11, false, borrow); 1916 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 1917 } else { 1918 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1919 } 1920 break; 1921 } 1922 case nir_op_usub_borrow: { 1923 Temp src0 = get_alu_src(ctx, instr->src[0]); 1924 Temp src1 = get_alu_src(ctx, instr->src[1]); 1925 if (dst.regClass() == s1) { 1926 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(dst)), src0, src1); 1927 break; 1928 } else if (dst.regClass() == v1) { 1929 Temp borrow = bld.vsub32(bld.def(v1), src0, src1, true).def(1).getTemp(); 1930 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), Operand::c32(1u), 1931 borrow); 1932 break; 1933 } 1934 1935 Temp src00 = bld.tmp(src0.type(), 1); 1936 Temp src01 = bld.tmp(dst.type(), 1); 1937 bld.pseudo(aco_opcode::p_split_vector, Definition(src00), Definition(src01), src0); 1938 Temp src10 = bld.tmp(src1.type(), 1); 1939 Temp src11 = bld.tmp(dst.type(), 1); 1940 bld.pseudo(aco_opcode::p_split_vector, Definition(src10), Definition(src11), src1); 1941 if (dst.regClass() == s2) { 1942 Temp borrow = bld.tmp(s1); 1943 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), src00, src10); 1944 borrow = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.scc(bld.def(s1)), src01, src11, 1945 bld.scc(borrow)) 1946 .def(1) 1947 .getTemp(); 1948 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); 1949 } else if (dst.regClass() == v2) { 1950 Temp borrow = bld.vsub32(bld.def(v1), src00, src10, true).def(1).getTemp(); 1951 borrow = bld.vsub32(bld.def(v1), src01, src11, true, Operand(borrow)).def(1).getTemp(); 1952 borrow = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 1953 Operand::c32(1u), borrow); 1954 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), borrow, Operand::zero()); 1955 } else { 1956 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1957 } 1958 break; 1959 } 1960 
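   /* Integer multiplication: when NIR's range analysis shows that both operands
    * fit in 24 bits, the multiply is emitted as v_mul_u32_u24, which is cheaper
    * than the full 32-bit v_mul_lo_u32 and usable as VOP2; multiplication by a
    * constant goes through v_mul_imm, which can lower the multiply into shifts
    * and adds. */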
case nir_op_imul: { 1961 if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) { 1962 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u16_e64, dst); 1963 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { 1964 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_lo_u16, dst, true); 1965 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 1966 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_lo_u16, dst); 1967 } else if (dst.type() == RegType::vgpr) { 1968 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); 1969 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); 1970 1971 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { 1972 bool nuw_16bit = src0_ub <= 0xffff && src1_ub <= 0xffff && src0_ub * src1_ub <= 0xffff; 1973 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_u32_u24, dst, 1974 true /* commutative */, false, false, nuw_16bit); 1975 } else if (nir_src_is_const(instr->src[0].src)) { 1976 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[1]), 1977 nir_src_as_uint(instr->src[0].src), false); 1978 } else if (nir_src_is_const(instr->src[1].src)) { 1979 bld.v_mul_imm(Definition(dst), get_alu_src(ctx, instr->src[0]), 1980 nir_src_as_uint(instr->src[1].src), false); 1981 } else { 1982 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_lo_u32, dst); 1983 } 1984 } else if (dst.regClass() == s1) { 1985 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_i32, dst, false); 1986 } else { 1987 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 1988 } 1989 break; 1990 } 1991 case nir_op_umul_high: { 1992 if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { 1993 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_u32, dst, false); 1994 } else if (dst.bytes() == 4) { 1995 uint32_t src0_ub = get_alu_src_ub(ctx, instr, 0); 1996 uint32_t src1_ub = get_alu_src_ub(ctx, instr, 1); 1997 1998 Temp tmp = dst.regClass() == s1 ? 
bld.tmp(v1) : dst; 1999 if (src0_ub <= 0xffffff && src1_ub <= 0xffffff) { 2000 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_hi_u32_u24, tmp, true); 2001 } else { 2002 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_u32, tmp); 2003 } 2004 2005 if (dst.regClass() == s1) 2006 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2007 } else { 2008 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2009 } 2010 break; 2011 } 2012 case nir_op_imul_high: { 2013 if (dst.regClass() == v1) { 2014 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_hi_i32, dst); 2015 } else if (dst.regClass() == s1 && ctx->options->chip_class >= GFX9) { 2016 emit_sop2_instruction(ctx, instr, aco_opcode::s_mul_hi_i32, dst, false); 2017 } else if (dst.regClass() == s1) { 2018 Temp tmp = bld.vop3(aco_opcode::v_mul_hi_i32, bld.def(v1), get_alu_src(ctx, instr->src[0]), 2019 as_vgpr(ctx, get_alu_src(ctx, instr->src[1]))); 2020 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2021 } else { 2022 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2023 } 2024 break; 2025 } 2026 case nir_op_fmul: { 2027 if (dst.regClass() == v2b) { 2028 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f16, dst, true); 2029 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2030 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_mul_f16, dst); 2031 } else if (dst.regClass() == v1) { 2032 emit_vop2_instruction(ctx, instr, aco_opcode::v_mul_f32, dst, true); 2033 } else if (dst.regClass() == v2) { 2034 emit_vop3a_instruction(ctx, instr, aco_opcode::v_mul_f64, dst); 2035 } else { 2036 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2037 } 2038 break; 2039 } 2040 case nir_op_fadd: { 2041 if (dst.regClass() == v2b) { 2042 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f16, dst, true); 2043 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2044 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); 2045 } else if (dst.regClass() == v1) { 2046 emit_vop2_instruction(ctx, instr, aco_opcode::v_add_f32, dst, true); 2047 } else if (dst.regClass() == v2) { 2048 emit_vop3a_instruction(ctx, instr, aco_opcode::v_add_f64, dst); 2049 } else { 2050 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2051 } 2052 break; 2053 } 2054 case nir_op_fsub: { 2055 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2056 Instruction* add = emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_add_f16, dst); 2057 VOP3P_instruction& sub = add->vop3p(); 2058 sub.neg_lo[1] = true; 2059 sub.neg_hi[1] = true; 2060 break; 2061 } 2062 2063 Temp src0 = get_alu_src(ctx, instr->src[0]); 2064 Temp src1 = get_alu_src(ctx, instr->src[1]); 2065 if (dst.regClass() == v2b) { 2066 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 2067 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f16, dst, false); 2068 else 2069 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f16, dst, true); 2070 } else if (dst.regClass() == v1) { 2071 if (src1.type() == RegType::vgpr || src0.type() != RegType::vgpr) 2072 emit_vop2_instruction(ctx, instr, aco_opcode::v_sub_f32, dst, false); 2073 else 2074 emit_vop2_instruction(ctx, instr, aco_opcode::v_subrev_f32, dst, true); 2075 } else if (dst.regClass() == v2) { 2076 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), as_vgpr(ctx, src0), 2077 as_vgpr(ctx, src1)); 2078 add->vop3().neg[1] = true; 2079 } else { 2080 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 
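         /* Note: the v2 path above has no v_sub_f64 to use, so it emits
          * v_add_f64 and negates the second source via the VOP3 neg modifier. */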
2081 } 2082 break; 2083 } 2084 case nir_op_fmax: { 2085 if (dst.regClass() == v2b) { 2086 // TODO: check fp_mode.must_flush_denorms16_64 2087 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f16, dst, true); 2088 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2089 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_max_f16, dst); 2090 } else if (dst.regClass() == v1) { 2091 emit_vop2_instruction(ctx, instr, aco_opcode::v_max_f32, dst, true, false, 2092 ctx->block->fp_mode.must_flush_denorms32); 2093 } else if (dst.regClass() == v2) { 2094 emit_vop3a_instruction(ctx, instr, aco_opcode::v_max_f64, dst, 2095 ctx->block->fp_mode.must_flush_denorms16_64); 2096 } else { 2097 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2098 } 2099 break; 2100 } 2101 case nir_op_fmin: { 2102 if (dst.regClass() == v2b) { 2103 // TODO: check fp_mode.must_flush_denorms16_64 2104 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f16, dst, true); 2105 } else if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2106 emit_vop3p_instruction(ctx, instr, aco_opcode::v_pk_min_f16, dst, true); 2107 } else if (dst.regClass() == v1) { 2108 emit_vop2_instruction(ctx, instr, aco_opcode::v_min_f32, dst, true, false, 2109 ctx->block->fp_mode.must_flush_denorms32); 2110 } else if (dst.regClass() == v2) { 2111 emit_vop3a_instruction(ctx, instr, aco_opcode::v_min_f64, dst, 2112 ctx->block->fp_mode.must_flush_denorms16_64); 2113 } else { 2114 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2115 } 2116 break; 2117 } 2118 case nir_op_sdot_4x8_iadd: { 2119 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, false); 2120 break; 2121 } 2122 case nir_op_sdot_4x8_iadd_sat: { 2123 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_i32_i8, dst, true); 2124 break; 2125 } 2126 case nir_op_udot_4x8_uadd: { 2127 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, false); 2128 break; 2129 } 2130 case nir_op_udot_4x8_uadd_sat: { 2131 emit_idot_instruction(ctx, instr, aco_opcode::v_dot4_u32_u8, dst, true); 2132 break; 2133 } 2134 case nir_op_sdot_2x16_iadd: { 2135 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, false); 2136 break; 2137 } 2138 case nir_op_sdot_2x16_iadd_sat: { 2139 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_i32_i16, dst, true); 2140 break; 2141 } 2142 case nir_op_udot_2x16_uadd: { 2143 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, false); 2144 break; 2145 } 2146 case nir_op_udot_2x16_uadd_sat: { 2147 emit_idot_instruction(ctx, instr, aco_opcode::v_dot2_u32_u16, dst, true); 2148 break; 2149 } 2150 case nir_op_cube_face_coord_amd: { 2151 Temp in = get_alu_src(ctx, instr->src[0], 3); 2152 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), 2153 emit_extract_vector(ctx, in, 2, v1)}; 2154 Temp ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), src[0], src[1], src[2]); 2155 ma = bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), ma); 2156 Temp sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), src[0], src[1], src[2]); 2157 Temp tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), src[0], src[1], src[2]); 2158 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/), 2159 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, ma)); 2160 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3f000000u /*0.5*/), 2161 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, ma)); 2162 bld.pseudo(aco_opcode::p_create_vector, 
Definition(dst), sc, tc); 2163 break; 2164 } 2165 case nir_op_cube_face_index_amd: { 2166 Temp in = get_alu_src(ctx, instr->src[0], 3); 2167 Temp src[3] = {emit_extract_vector(ctx, in, 0, v1), emit_extract_vector(ctx, in, 1, v1), 2168 emit_extract_vector(ctx, in, 2, v1)}; 2169 bld.vop3(aco_opcode::v_cubeid_f32, Definition(dst), src[0], src[1], src[2]); 2170 break; 2171 } 2172 case nir_op_bcsel: { 2173 emit_bcsel(ctx, instr, dst); 2174 break; 2175 } 2176 case nir_op_frsq: { 2177 if (dst.regClass() == v2b) { 2178 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f16, dst); 2179 } else if (dst.regClass() == v1) { 2180 Temp src = get_alu_src(ctx, instr->src[0]); 2181 emit_rsq(ctx, bld, Definition(dst), src); 2182 } else if (dst.regClass() == v2) { 2183 /* Lowered at NIR level for precision reasons. */ 2184 emit_vop1_instruction(ctx, instr, aco_opcode::v_rsq_f64, dst); 2185 } else { 2186 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2187 } 2188 break; 2189 } 2190 case nir_op_fneg: { 2191 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2192 Temp src = get_alu_src_vop3p(ctx, instr->src[0]); 2193 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0xBC00), 2194 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); 2195 emit_split_vector(ctx, dst, 2); 2196 break; 2197 } 2198 Temp src = get_alu_src(ctx, instr->src[0]); 2199 if (dst.regClass() == v2b) { 2200 bld.vop2(aco_opcode::v_mul_f16, Definition(dst), Operand::c16(0xbc00u), as_vgpr(ctx, src)); 2201 } else if (dst.regClass() == v1) { 2202 bld.vop2(aco_opcode::v_mul_f32, Definition(dst), Operand::c32(0xbf800000u), 2203 as_vgpr(ctx, src)); 2204 } else if (dst.regClass() == v2) { 2205 if (ctx->block->fp_mode.must_flush_denorms16_64) 2206 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000), 2207 as_vgpr(ctx, src)); 2208 Temp upper = bld.tmp(v1), lower = bld.tmp(v1); 2209 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2210 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), Operand::c32(0x80000000u), upper); 2211 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2212 } else { 2213 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2214 } 2215 break; 2216 } 2217 case nir_op_fabs: { 2218 Temp src = get_alu_src(ctx, instr->src[0]); 2219 if (dst.regClass() == v2b) { 2220 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f16, Definition(dst), 2221 Operand::c16(0x3c00), as_vgpr(ctx, src)) 2222 .instr; 2223 mul->vop3().abs[1] = true; 2224 } else if (dst.regClass() == v1) { 2225 Instruction* mul = bld.vop2_e64(aco_opcode::v_mul_f32, Definition(dst), 2226 Operand::c32(0x3f800000u), as_vgpr(ctx, src)) 2227 .instr; 2228 mul->vop3().abs[1] = true; 2229 } else if (dst.regClass() == v2) { 2230 if (ctx->block->fp_mode.must_flush_denorms16_64) 2231 src = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), Operand::c64(0x3FF0000000000000), 2232 as_vgpr(ctx, src)); 2233 Temp upper = bld.tmp(v1), lower = bld.tmp(v1); 2234 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2235 upper = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7FFFFFFFu), upper); 2236 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2237 } else { 2238 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2239 } 2240 break; 2241 } 2242 case nir_op_fsat: { 2243 if (dst.regClass() == v1 && instr->dest.dest.ssa.bit_size == 16) { 2244 Temp src = get_alu_src_vop3p(ctx, 
instr->src[0]); 2245 Instruction* vop3p = 2246 bld.vop3p(aco_opcode::v_pk_mul_f16, Definition(dst), src, Operand::c16(0x3C00), 2247 instr->src[0].swizzle[0] & 1, instr->src[0].swizzle[1] & 1); 2248 vop3p->vop3p().clamp = true; 2249 emit_split_vector(ctx, dst, 2); 2250 break; 2251 } 2252 Temp src = get_alu_src(ctx, instr->src[0]); 2253 if (dst.regClass() == v2b) { 2254 bld.vop3(aco_opcode::v_med3_f16, Definition(dst), Operand::c16(0u), Operand::c16(0x3c00), 2255 src); 2256 } else if (dst.regClass() == v1) { 2257 bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::zero(), 2258 Operand::c32(0x3f800000u), src); 2259 /* apparently, it is not necessary to flush denorms if this instruction is used with these 2260 * operands */ 2261 // TODO: confirm that this holds under any circumstances 2262 } else if (dst.regClass() == v2) { 2263 Instruction* add = bld.vop3(aco_opcode::v_add_f64, Definition(dst), src, Operand::zero()); 2264 add->vop3().clamp = true; 2265 } else { 2266 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2267 } 2268 break; 2269 } 2270 case nir_op_flog2: { 2271 if (dst.regClass() == v2b) { 2272 emit_vop1_instruction(ctx, instr, aco_opcode::v_log_f16, dst); 2273 } else if (dst.regClass() == v1) { 2274 Temp src = get_alu_src(ctx, instr->src[0]); 2275 emit_log2(ctx, bld, Definition(dst), src); 2276 } else { 2277 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2278 } 2279 break; 2280 } 2281 case nir_op_frcp: { 2282 if (dst.regClass() == v2b) { 2283 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f16, dst); 2284 } else if (dst.regClass() == v1) { 2285 Temp src = get_alu_src(ctx, instr->src[0]); 2286 emit_rcp(ctx, bld, Definition(dst), src); 2287 } else if (dst.regClass() == v2) { 2288 /* Lowered at NIR level for precision reasons. */ 2289 emit_vop1_instruction(ctx, instr, aco_opcode::v_rcp_f64, dst); 2290 } else { 2291 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2292 } 2293 break; 2294 } 2295 case nir_op_fexp2: { 2296 if (dst.regClass() == v2b) { 2297 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f16, dst); 2298 } else if (dst.regClass() == v1) { 2299 emit_vop1_instruction(ctx, instr, aco_opcode::v_exp_f32, dst); 2300 } else { 2301 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2302 } 2303 break; 2304 } 2305 case nir_op_fsqrt: { 2306 if (dst.regClass() == v2b) { 2307 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f16, dst); 2308 } else if (dst.regClass() == v1) { 2309 Temp src = get_alu_src(ctx, instr->src[0]); 2310 emit_sqrt(ctx, bld, Definition(dst), src); 2311 } else if (dst.regClass() == v2) { 2312 /* Lowered at NIR level for precision reasons. 
*/ 2313 emit_vop1_instruction(ctx, instr, aco_opcode::v_sqrt_f64, dst); 2314 } else { 2315 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2316 } 2317 break; 2318 } 2319 case nir_op_ffract: { 2320 if (dst.regClass() == v2b) { 2321 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f16, dst); 2322 } else if (dst.regClass() == v1) { 2323 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f32, dst); 2324 } else if (dst.regClass() == v2) { 2325 emit_vop1_instruction(ctx, instr, aco_opcode::v_fract_f64, dst); 2326 } else { 2327 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2328 } 2329 break; 2330 } 2331 case nir_op_ffloor: { 2332 if (dst.regClass() == v2b) { 2333 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f16, dst); 2334 } else if (dst.regClass() == v1) { 2335 emit_vop1_instruction(ctx, instr, aco_opcode::v_floor_f32, dst); 2336 } else if (dst.regClass() == v2) { 2337 Temp src = get_alu_src(ctx, instr->src[0]); 2338 emit_floor_f64(ctx, bld, Definition(dst), src); 2339 } else { 2340 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2341 } 2342 break; 2343 } 2344 case nir_op_fceil: { 2345 if (dst.regClass() == v2b) { 2346 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f16, dst); 2347 } else if (dst.regClass() == v1) { 2348 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f32, dst); 2349 } else if (dst.regClass() == v2) { 2350 if (ctx->options->chip_class >= GFX7) { 2351 emit_vop1_instruction(ctx, instr, aco_opcode::v_ceil_f64, dst); 2352 } else { 2353 /* GFX6 doesn't support V_CEIL_F64, lower it. */ 2354 /* trunc = trunc(src0) 2355 * if (src0 > 0.0 && src0 != trunc) 2356 * trunc += 1.0 2357 */ 2358 Temp src0 = get_alu_src(ctx, instr->src[0]); 2359 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src0); 2360 Temp tmp0 = 2361 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.def(bld.lm), src0, Operand::zero()); 2362 Temp tmp1 = 2363 bld.vopc(aco_opcode::v_cmp_lg_f64, bld.hint_vcc(bld.def(bld.lm)), src0, trunc); 2364 Temp cond = bld.sop2(aco_opcode::s_and_b64, bld.hint_vcc(bld.def(s2)), bld.def(s1, scc), 2365 tmp0, tmp1); 2366 Temp add = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 2367 bld.copy(bld.def(v1), Operand::zero()), 2368 bld.copy(bld.def(v1), Operand::c32(0x3ff00000u)), cond); 2369 add = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), 2370 bld.copy(bld.def(v1), Operand::zero()), add); 2371 bld.vop3(aco_opcode::v_add_f64, Definition(dst), trunc, add); 2372 } 2373 } else { 2374 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2375 } 2376 break; 2377 } 2378 case nir_op_ftrunc: { 2379 if (dst.regClass() == v2b) { 2380 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f16, dst); 2381 } else if (dst.regClass() == v1) { 2382 emit_vop1_instruction(ctx, instr, aco_opcode::v_trunc_f32, dst); 2383 } else if (dst.regClass() == v2) { 2384 Temp src = get_alu_src(ctx, instr->src[0]); 2385 emit_trunc_f64(ctx, bld, Definition(dst), src); 2386 } else { 2387 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2388 } 2389 break; 2390 } 2391 case nir_op_fround_even: { 2392 if (dst.regClass() == v2b) { 2393 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f16, dst); 2394 } else if (dst.regClass() == v1) { 2395 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f32, dst); 2396 } else if (dst.regClass() == v2) { 2397 if (ctx->options->chip_class >= GFX7) { 2398 emit_vop1_instruction(ctx, instr, aco_opcode::v_rndne_f64, dst); 2399 } else { 2400 /* GFX6 doesn't support V_RNDNE_F64, lower it. 
*/ 2401 Temp src0_lo = bld.tmp(v1), src0_hi = bld.tmp(v1); 2402 Temp src0 = get_alu_src(ctx, instr->src[0]); 2403 bld.pseudo(aco_opcode::p_split_vector, Definition(src0_lo), Definition(src0_hi), src0); 2404 2405 Temp bitmask = bld.sop1(aco_opcode::s_brev_b32, bld.def(s1), 2406 bld.copy(bld.def(s1), Operand::c32(-2u))); 2407 Temp bfi = 2408 bld.vop3(aco_opcode::v_bfi_b32, bld.def(v1), bitmask, 2409 bld.copy(bld.def(v1), Operand::c32(0x43300000u)), as_vgpr(ctx, src0_hi)); 2410 Temp tmp = 2411 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), src0, 2412 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); 2413 Instruction* sub = 2414 bld.vop3(aco_opcode::v_add_f64, bld.def(v2), tmp, 2415 bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), bfi)); 2416 sub->vop3().neg[1] = true; 2417 tmp = sub->definitions[0].getTemp(); 2418 2419 Temp v = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::c32(-1u), 2420 Operand::c32(0x432fffffu)); 2421 Instruction* vop3 = 2422 bld.vopc_e64(aco_opcode::v_cmp_gt_f64, bld.hint_vcc(bld.def(bld.lm)), src0, v); 2423 vop3->vop3().abs[0] = true; 2424 Temp cond = vop3->definitions[0].getTemp(); 2425 2426 Temp tmp_lo = bld.tmp(v1), tmp_hi = bld.tmp(v1); 2427 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp_lo), Definition(tmp_hi), tmp); 2428 Temp dst0 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_lo, 2429 as_vgpr(ctx, src0_lo), cond); 2430 Temp dst1 = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp_hi, 2431 as_vgpr(ctx, src0_hi), cond); 2432 2433 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), dst0, dst1); 2434 } 2435 } else { 2436 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2437 } 2438 break; 2439 } 2440 case nir_op_fsin: 2441 case nir_op_fcos: { 2442 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 2443 aco_ptr<Instruction> norm; 2444 if (dst.regClass() == v2b) { 2445 Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3118u)); 2446 Temp tmp = bld.vop2(aco_opcode::v_mul_f16, bld.def(v1), half_pi, src); 2447 aco_opcode opcode = 2448 instr->op == nir_op_fsin ? aco_opcode::v_sin_f16 : aco_opcode::v_cos_f16; 2449 bld.vop1(opcode, Definition(dst), tmp); 2450 } else if (dst.regClass() == v1) { 2451 Temp half_pi = bld.copy(bld.def(s1), Operand::c32(0x3e22f983u)); 2452 Temp tmp = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), half_pi, src); 2453 2454 /* before GFX9, v_sin_f32 and v_cos_f32 had a valid input domain of [-256, +256] */ 2455 if (ctx->options->chip_class < GFX9) 2456 tmp = bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), tmp); 2457 2458 aco_opcode opcode = 2459 instr->op == nir_op_fsin ? 
aco_opcode::v_sin_f32 : aco_opcode::v_cos_f32; 2460 bld.vop1(opcode, Definition(dst), tmp); 2461 } else { 2462 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2463 } 2464 break; 2465 } 2466 case nir_op_ldexp: { 2467 if (dst.regClass() == v2b) { 2468 emit_vop2_instruction(ctx, instr, aco_opcode::v_ldexp_f16, dst, false); 2469 } else if (dst.regClass() == v1) { 2470 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f32, dst); 2471 } else if (dst.regClass() == v2) { 2472 emit_vop3a_instruction(ctx, instr, aco_opcode::v_ldexp_f64, dst); 2473 } else { 2474 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2475 } 2476 break; 2477 } 2478 case nir_op_frexp_sig: { 2479 if (dst.regClass() == v2b) { 2480 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f16, dst); 2481 } else if (dst.regClass() == v1) { 2482 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f32, dst); 2483 } else if (dst.regClass() == v2) { 2484 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_mant_f64, dst); 2485 } else { 2486 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2487 } 2488 break; 2489 } 2490 case nir_op_frexp_exp: { 2491 if (instr->src[0].src.ssa->bit_size == 16) { 2492 Temp src = get_alu_src(ctx, instr->src[0]); 2493 Temp tmp = bld.vop1(aco_opcode::v_frexp_exp_i16_f16, bld.def(v1), src); 2494 tmp = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v1b), tmp, Operand::zero()); 2495 convert_int(ctx, bld, tmp, 8, 32, true, dst); 2496 } else if (instr->src[0].src.ssa->bit_size == 32) { 2497 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f32, dst); 2498 } else if (instr->src[0].src.ssa->bit_size == 64) { 2499 emit_vop1_instruction(ctx, instr, aco_opcode::v_frexp_exp_i32_f64, dst); 2500 } else { 2501 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2502 } 2503 break; 2504 } 2505 case nir_op_fsign: { 2506 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 2507 if (dst.regClass() == v2b) { 2508 assert(ctx->program->chip_class >= GFX9); 2509 /* replace negative zero with positive zero */ 2510 src = bld.vop2(aco_opcode::v_add_f16, bld.def(v2b), Operand::zero(), src); 2511 src = 2512 bld.vop3(aco_opcode::v_med3_i16, bld.def(v2b), Operand::c16(-1), src, Operand::c16(1u)); 2513 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); 2514 } else if (dst.regClass() == v1) { 2515 src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src); 2516 src = 2517 bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u)); 2518 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); 2519 } else if (dst.regClass() == v2) { 2520 Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.hint_vcc(bld.def(bld.lm)), 2521 Operand::zero(), src); 2522 Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); 2523 Temp upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, 2524 emit_extract_vector(ctx, src, 1, v1), cond); 2525 2526 cond = 2527 bld.vopc(aco_opcode::v_cmp_le_f64, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), src); 2528 tmp = bld.copy(bld.def(v1), Operand::c32(0xBFF00000u)); 2529 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), tmp, upper, cond); 2530 2531 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); 2532 } else { 2533 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2534 } 2535 break; 2536 } 2537 case nir_op_f2f16: 2538 case nir_op_f2f16_rtne: { 2539 Temp src = get_alu_src(ctx, instr->src[0]); 2540 if 
(instr->src[0].src.ssa->bit_size == 64) 2541 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); 2542 if (instr->op == nir_op_f2f16_rtne && ctx->block->fp_mode.round16_64 != fp_round_ne) 2543 /* We emit s_round_mode/s_setreg_imm32 in lower_to_hw_instr to 2544 * keep value numbering and the scheduler simpler. 2545 */ 2546 bld.vop1(aco_opcode::p_cvt_f16_f32_rtne, Definition(dst), src); 2547 else 2548 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2549 break; 2550 } 2551 case nir_op_f2f16_rtz: { 2552 Temp src = get_alu_src(ctx, instr->src[0]); 2553 if (instr->src[0].src.ssa->bit_size == 64) 2554 src = bld.vop1(aco_opcode::v_cvt_f32_f64, bld.def(v1), src); 2555 if (ctx->block->fp_mode.round16_64 == fp_round_tz) 2556 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2557 else if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9) 2558 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, Definition(dst), src, Operand::zero()); 2559 else 2560 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, Definition(dst), src, as_vgpr(ctx, src)); 2561 break; 2562 } 2563 case nir_op_f2f32: { 2564 if (instr->src[0].src.ssa->bit_size == 16) { 2565 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, dst); 2566 } else if (instr->src[0].src.ssa->bit_size == 64) { 2567 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f64, dst); 2568 } else { 2569 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2570 } 2571 break; 2572 } 2573 case nir_op_f2f64: { 2574 Temp src = get_alu_src(ctx, instr->src[0]); 2575 if (instr->src[0].src.ssa->bit_size == 16) 2576 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2577 bld.vop1(aco_opcode::v_cvt_f64_f32, Definition(dst), src); 2578 break; 2579 } 2580 case nir_op_i2f16: { 2581 assert(dst.regClass() == v2b); 2582 Temp src = get_alu_src(ctx, instr->src[0]); 2583 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2584 if (input_size <= 16) { 2585 /* Expand integer to the size expected by the uint→float converter used below */ 2586 unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32); 2587 if (input_size != target_size) { 2588 src = convert_int(ctx, bld, src, input_size, target_size, true); 2589 } 2590 } else if (input_size == 64) { 2591 /* Truncate down to 32 bits; if any of the upper bits are relevant, 2592 * the value does not fall into the single-precision float range 2593 * anyway. SPIR-V does not mandate any specific behavior for such 2594 * large inputs. 2595 */ 2596 src = convert_int(ctx, bld, src, 64, 32, false); 2597 } 2598 2599 if (ctx->program->chip_class >= GFX8 && input_size <= 16) { 2600 bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src); 2601 } else { 2602 /* Convert to f32 and then down to f16. This is needed to handle 2603 * inputs slightly outside the range [INT16_MIN, INT16_MAX], 2604 * which are representable via f16 but wouldn't be converted 2605 * correctly by v_cvt_f16_i16. 2606 * 2607 * This is also the fallback-path taken on GFX7 and earlier, which 2608 * do not support direct f16⟷i16 conversions. 
2609 */ 2610 src = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), src); 2611 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2612 } 2613 break; 2614 } 2615 case nir_op_i2f32: { 2616 assert(dst.size() == 1); 2617 Temp src = get_alu_src(ctx, instr->src[0]); 2618 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2619 if (input_size <= 32) { 2620 if (input_size <= 16) { 2621 /* Sign-extend to 32-bits */ 2622 src = convert_int(ctx, bld, src, input_size, 32, true); 2623 } 2624 bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src); 2625 } else { 2626 assert(input_size == 64); 2627 RegClass rc = RegClass(src.type(), 1); 2628 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2629 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2630 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2631 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); 2632 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2633 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper); 2634 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper); 2635 } 2636 2637 break; 2638 } 2639 case nir_op_i2f64: { 2640 if (instr->src[0].src.ssa->bit_size <= 32) { 2641 Temp src = get_alu_src(ctx, instr->src[0]); 2642 if (instr->src[0].src.ssa->bit_size <= 16) 2643 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, true); 2644 bld.vop1(aco_opcode::v_cvt_f64_i32, Definition(dst), src); 2645 } else if (instr->src[0].src.ssa->bit_size == 64) { 2646 Temp src = get_alu_src(ctx, instr->src[0]); 2647 RegClass rc = RegClass(src.type(), 1); 2648 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2649 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2650 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2651 upper = bld.vop1(aco_opcode::v_cvt_f64_i32, bld.def(v2), upper); 2652 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2653 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); 2654 2655 } else { 2656 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2657 } 2658 break; 2659 } 2660 case nir_op_u2f16: { 2661 assert(dst.regClass() == v2b); 2662 Temp src = get_alu_src(ctx, instr->src[0]); 2663 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2664 if (input_size <= 16) { 2665 /* Expand integer to the size expected by the uint→float converter used below */ 2666 unsigned target_size = (ctx->program->chip_class >= GFX8 ? 16 : 32); 2667 if (input_size != target_size) { 2668 src = convert_int(ctx, bld, src, input_size, target_size, false); 2669 } 2670 } else if (input_size == 64) { 2671 /* Truncate down to 32 bits; if any of the upper bits are non-zero, 2672 * the value does not fall into the single-precision float range 2673 * anyway. SPIR-V does not mandate any specific behavior for such 2674 * large inputs. 2675 */ 2676 src = convert_int(ctx, bld, src, 64, 32, false); 2677 } 2678 2679 if (ctx->program->chip_class >= GFX8) { 2680 /* float16 has a range of [0, 65519]. 
Converting from larger 2681 * inputs is UB, so we just need to consider the lower 16 bits */ 2682 bld.vop1(aco_opcode::v_cvt_f16_u16, Definition(dst), src); 2683 } else { 2684 /* GFX7 and earlier do not support direct f16⟷u16 conversions */ 2685 src = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), src); 2686 bld.vop1(aco_opcode::v_cvt_f16_f32, Definition(dst), src); 2687 } 2688 break; 2689 } 2690 case nir_op_u2f32: { 2691 assert(dst.size() == 1); 2692 Temp src = get_alu_src(ctx, instr->src[0]); 2693 const unsigned input_size = instr->src[0].src.ssa->bit_size; 2694 if (input_size == 8) { 2695 bld.vop1(aco_opcode::v_cvt_f32_ubyte0, Definition(dst), src); 2696 } else if (input_size <= 32) { 2697 if (input_size == 16) 2698 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); 2699 bld.vop1(aco_opcode::v_cvt_f32_u32, Definition(dst), src); 2700 } else { 2701 assert(input_size == 64); 2702 RegClass rc = RegClass(src.type(), 1); 2703 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2704 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2705 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2706 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); 2707 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2708 upper = bld.vop3(aco_opcode::v_add_f64, bld.def(v2), lower, upper); 2709 bld.vop1(aco_opcode::v_cvt_f32_f64, Definition(dst), upper); 2710 } 2711 break; 2712 } 2713 case nir_op_u2f64: { 2714 if (instr->src[0].src.ssa->bit_size <= 32) { 2715 Temp src = get_alu_src(ctx, instr->src[0]); 2716 if (instr->src[0].src.ssa->bit_size <= 16) 2717 src = convert_int(ctx, bld, src, instr->src[0].src.ssa->bit_size, 32, false); 2718 bld.vop1(aco_opcode::v_cvt_f64_u32, Definition(dst), src); 2719 } else if (instr->src[0].src.ssa->bit_size == 64) { 2720 Temp src = get_alu_src(ctx, instr->src[0]); 2721 RegClass rc = RegClass(src.type(), 1); 2722 Temp lower = bld.tmp(rc), upper = bld.tmp(rc); 2723 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), src); 2724 lower = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), lower); 2725 upper = bld.vop1(aco_opcode::v_cvt_f64_u32, bld.def(v2), upper); 2726 upper = bld.vop3(aco_opcode::v_ldexp_f64, bld.def(v2), upper, Operand::c32(32u)); 2727 bld.vop3(aco_opcode::v_add_f64, Definition(dst), lower, upper); 2728 } else { 2729 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2730 } 2731 break; 2732 } 2733 case nir_op_f2i8: 2734 case nir_op_f2i16: { 2735 if (instr->src[0].src.ssa->bit_size == 16) { 2736 if (ctx->program->chip_class >= GFX8) { 2737 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i16_f16, dst); 2738 } else { 2739 /* GFX7 and earlier do not support direct f16⟷i16 conversions */ 2740 Temp tmp = bld.tmp(v1); 2741 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); 2742 tmp = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp); 2743 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false, 2744 (dst.type() == RegType::sgpr) ? 
Temp() : dst); 2745 if (dst.type() == RegType::sgpr) { 2746 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2747 } 2748 } 2749 } else if (instr->src[0].src.ssa->bit_size == 32) { 2750 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); 2751 } else { 2752 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); 2753 } 2754 break; 2755 } 2756 case nir_op_f2u8: 2757 case nir_op_f2u16: { 2758 if (instr->src[0].src.ssa->bit_size == 16) { 2759 if (ctx->program->chip_class >= GFX8) { 2760 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u16_f16, dst); 2761 } else { 2762 /* GFX7 and earlier do not support direct f16⟷u16 conversions */ 2763 Temp tmp = bld.tmp(v1); 2764 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_f32_f16, tmp); 2765 tmp = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp); 2766 tmp = convert_int(ctx, bld, tmp, 32, instr->dest.dest.ssa.bit_size, false, 2767 (dst.type() == RegType::sgpr) ? Temp() : dst); 2768 if (dst.type() == RegType::sgpr) { 2769 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 2770 } 2771 } 2772 } else if (instr->src[0].src.ssa->bit_size == 32) { 2773 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); 2774 } else { 2775 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); 2776 } 2777 break; 2778 } 2779 case nir_op_f2i32: { 2780 Temp src = get_alu_src(ctx, instr->src[0]); 2781 if (instr->src[0].src.ssa->bit_size == 16) { 2782 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2783 if (dst.type() == RegType::vgpr) { 2784 bld.vop1(aco_opcode::v_cvt_i32_f32, Definition(dst), tmp); 2785 } else { 2786 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 2787 bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), tmp)); 2788 } 2789 } else if (instr->src[0].src.ssa->bit_size == 32) { 2790 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f32, dst); 2791 } else if (instr->src[0].src.ssa->bit_size == 64) { 2792 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_i32_f64, dst); 2793 } else { 2794 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2795 } 2796 break; 2797 } 2798 case nir_op_f2u32: { 2799 Temp src = get_alu_src(ctx, instr->src[0]); 2800 if (instr->src[0].src.ssa->bit_size == 16) { 2801 Temp tmp = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2802 if (dst.type() == RegType::vgpr) { 2803 bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(dst), tmp); 2804 } else { 2805 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), 2806 bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), tmp)); 2807 } 2808 } else if (instr->src[0].src.ssa->bit_size == 32) { 2809 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f32, dst); 2810 } else if (instr->src[0].src.ssa->bit_size == 64) { 2811 emit_vop1_instruction(ctx, instr, aco_opcode::v_cvt_u32_f64, dst); 2812 } else { 2813 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2814 } 2815 break; 2816 } 2817 case nir_op_f2i64: { 2818 Temp src = get_alu_src(ctx, instr->src[0]); 2819 if (instr->src[0].src.ssa->bit_size == 16) 2820 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2821 2822 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { 2823 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); 2824 exponent = bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::zero(), exponent, 2825 Operand::c32(64u)); 2826 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src); 2827 Temp sign = 
bld.vop2(aco_opcode::v_ashrrev_i32, bld.def(v1), Operand::c32(31u), src); 2828 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa); 2829 mantissa = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(7u), mantissa); 2830 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa); 2831 Temp new_exponent = bld.tmp(v1); 2832 Temp borrow = 2833 bld.vsub32(Definition(new_exponent), Operand::c32(63u), exponent, true).def(1).getTemp(); 2834 if (ctx->program->chip_class >= GFX8) 2835 mantissa = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), new_exponent, mantissa); 2836 else 2837 mantissa = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), mantissa, new_exponent); 2838 Temp saturate = bld.vop1(aco_opcode::v_bfrev_b32, bld.def(v1), Operand::c32(0xfffffffeu)); 2839 Temp lower = bld.tmp(v1), upper = bld.tmp(v1); 2840 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 2841 lower = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), lower, 2842 Operand::c32(0xffffffffu), borrow); 2843 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), upper, saturate, borrow); 2844 lower = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, lower); 2845 upper = bld.vop2(aco_opcode::v_xor_b32, bld.def(v1), sign, upper); 2846 Temp new_lower = bld.tmp(v1); 2847 borrow = bld.vsub32(Definition(new_lower), lower, sign, true).def(1).getTemp(); 2848 Temp new_upper = bld.vsub32(bld.def(v1), upper, sign, false, borrow); 2849 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), new_lower, new_upper); 2850 2851 } else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { 2852 if (src.type() == RegType::vgpr) 2853 src = bld.as_uniform(src); 2854 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 2855 Operand::c32(0x80017u)); 2856 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, 2857 Operand::c32(126u)); 2858 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(), 2859 exponent); 2860 exponent = bld.sop2(aco_opcode::s_min_i32, bld.def(s1), bld.def(s1, scc), 2861 Operand::c32(64u), exponent); 2862 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 2863 Operand::c32(0x7fffffu), src); 2864 Temp sign = 2865 bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(31u)); 2866 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 2867 Operand::c32(0x800000u), mantissa); 2868 mantissa = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), mantissa, 2869 Operand::c32(7u)); 2870 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa); 2871 exponent = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 2872 Operand::c32(63u), exponent); 2873 mantissa = 2874 bld.sop2(aco_opcode::s_lshr_b64, bld.def(s2), bld.def(s1, scc), mantissa, exponent); 2875 Temp cond = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), exponent, 2876 Operand::c32(0xffffffffu)); // exp >= 64 2877 Temp saturate = bld.sop1(aco_opcode::s_brev_b64, bld.def(s2), Operand::c32(0xfffffffeu)); 2878 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), saturate, mantissa, cond); 2879 Temp lower = bld.tmp(s1), upper = bld.tmp(s1); 2880 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 2881 lower = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, 
lower); 2882 upper = bld.sop2(aco_opcode::s_xor_b32, bld.def(s1), bld.def(s1, scc), sign, upper); 2883 Temp borrow = bld.tmp(s1); 2884 lower = 2885 bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.scc(Definition(borrow)), lower, sign); 2886 upper = bld.sop2(aco_opcode::s_subb_u32, bld.def(s1), bld.def(s1, scc), upper, sign, 2887 bld.scc(borrow)); 2888 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2889 2890 } else if (instr->src[0].src.ssa->bit_size == 64) { 2891 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 2892 Operand::c32(0x3df00000u)); 2893 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); 2894 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); 2895 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 2896 Operand::c32(0xc1f00000u)); 2897 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); 2898 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); 2899 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); 2900 Temp upper = bld.vop1(aco_opcode::v_cvt_i32_f64, bld.def(v1), floor); 2901 if (dst.type() == RegType::sgpr) { 2902 lower = bld.as_uniform(lower); 2903 upper = bld.as_uniform(upper); 2904 } 2905 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2906 2907 } else { 2908 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2909 } 2910 break; 2911 } 2912 case nir_op_f2u64: { 2913 Temp src = get_alu_src(ctx, instr->src[0]); 2914 if (instr->src[0].src.ssa->bit_size == 16) 2915 src = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), src); 2916 2917 if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::vgpr) { 2918 Temp exponent = bld.vop1(aco_opcode::v_frexp_exp_i32_f32, bld.def(v1), src); 2919 Temp exponent_in_range = bld.vopc(aco_opcode::v_cmp_ge_i32, bld.hint_vcc(bld.def(bld.lm)), 2920 Operand::c32(64u), exponent); 2921 exponent = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), Operand::zero(), exponent); 2922 Temp mantissa = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffu), src); 2923 mantissa = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(0x800000u), mantissa); 2924 Temp exponent_small = bld.vsub32(bld.def(v1), Operand::c32(24u), exponent); 2925 Temp small = bld.vop2(aco_opcode::v_lshrrev_b32, bld.def(v1), exponent_small, mantissa); 2926 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), Operand::zero(), mantissa); 2927 Temp new_exponent = bld.tmp(v1); 2928 Temp cond_small = 2929 bld.vsub32(Definition(new_exponent), exponent, Operand::c32(24u), true).def(1).getTemp(); 2930 if (ctx->program->chip_class >= GFX8) 2931 mantissa = bld.vop3(aco_opcode::v_lshlrev_b64, bld.def(v2), new_exponent, mantissa); 2932 else 2933 mantissa = bld.vop3(aco_opcode::v_lshl_b64, bld.def(v2), mantissa, new_exponent); 2934 Temp lower = bld.tmp(v1), upper = bld.tmp(v1); 2935 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 2936 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), lower, small, cond_small); 2937 upper = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), upper, Operand::zero(), 2938 cond_small); 2939 lower = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), lower, 2940 exponent_in_range); 2941 upper = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::c32(0xffffffffu), upper, 2942 exponent_in_range); 2943 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2944 2945 
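/* The SGPR path below follows the same algorithm with SALU instructions:
 * extract the exponent and mantissa with s_bfe, build a 64-bit mantissa,
 * shift it into place and saturate when the exponent is out of range. */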
} else if (instr->src[0].src.ssa->bit_size <= 32 && dst.type() == RegType::sgpr) { 2946 if (src.type() == RegType::vgpr) 2947 src = bld.as_uniform(src); 2948 Temp exponent = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), src, 2949 Operand::c32(0x80017u)); 2950 exponent = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), exponent, 2951 Operand::c32(126u)); 2952 exponent = bld.sop2(aco_opcode::s_max_i32, bld.def(s1), bld.def(s1, scc), Operand::zero(), 2953 exponent); 2954 Temp mantissa = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 2955 Operand::c32(0x7fffffu), src); 2956 mantissa = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), 2957 Operand::c32(0x800000u), mantissa); 2958 Temp exponent_small = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 2959 Operand::c32(24u), exponent); 2960 Temp small = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), mantissa, 2961 exponent_small); 2962 mantissa = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), mantissa); 2963 Temp exponent_large = bld.sop2(aco_opcode::s_sub_u32, bld.def(s1), bld.def(s1, scc), 2964 exponent, Operand::c32(24u)); 2965 mantissa = bld.sop2(aco_opcode::s_lshl_b64, bld.def(s2), bld.def(s1, scc), mantissa, 2966 exponent_large); 2967 Temp cond = 2968 bld.sopc(aco_opcode::s_cmp_ge_i32, bld.def(s1, scc), Operand::c32(64u), exponent); 2969 mantissa = bld.sop2(aco_opcode::s_cselect_b64, bld.def(s2), mantissa, 2970 Operand::c32(0xffffffffu), cond); 2971 Temp lower = bld.tmp(s1), upper = bld.tmp(s1); 2972 bld.pseudo(aco_opcode::p_split_vector, Definition(lower), Definition(upper), mantissa); 2973 Temp cond_small = 2974 bld.sopc(aco_opcode::s_cmp_le_i32, bld.def(s1, scc), exponent, Operand::c32(24u)); 2975 lower = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), small, lower, cond_small); 2976 upper = 2977 bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::zero(), upper, cond_small); 2978 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2979 2980 } else if (instr->src[0].src.ssa->bit_size == 64) { 2981 Temp vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 2982 Operand::c32(0x3df00000u)); 2983 Temp trunc = emit_trunc_f64(ctx, bld, bld.def(v2), src); 2984 Temp mul = bld.vop3(aco_opcode::v_mul_f64, bld.def(v2), trunc, vec); 2985 vec = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), Operand::zero(), 2986 Operand::c32(0xc1f00000u)); 2987 Temp floor = emit_floor_f64(ctx, bld, bld.def(v2), mul); 2988 Temp fma = bld.vop3(aco_opcode::v_fma_f64, bld.def(v2), floor, vec, trunc); 2989 Temp lower = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), fma); 2990 Temp upper = bld.vop1(aco_opcode::v_cvt_u32_f64, bld.def(v1), floor); 2991 if (dst.type() == RegType::sgpr) { 2992 lower = bld.as_uniform(lower); 2993 upper = bld.as_uniform(upper); 2994 } 2995 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lower, upper); 2996 2997 } else { 2998 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 2999 } 3000 break; 3001 } 3002 case nir_op_b2f16: { 3003 Temp src = get_alu_src(ctx, instr->src[0]); 3004 assert(src.regClass() == bld.lm); 3005 3006 if (dst.regClass() == s1) { 3007 src = bool_to_scalar_condition(ctx, src); 3008 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3c00u), src); 3009 } else if (dst.regClass() == v2b) { 3010 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3c00u)); 3011 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), one, 
src); 3012 } else { 3013 unreachable("Wrong destination register class for nir_op_b2f16."); 3014 } 3015 break; 3016 } 3017 case nir_op_b2f32: { 3018 Temp src = get_alu_src(ctx, instr->src[0]); 3019 assert(src.regClass() == bld.lm); 3020 3021 if (dst.regClass() == s1) { 3022 src = bool_to_scalar_condition(ctx, src); 3023 bld.sop2(aco_opcode::s_mul_i32, Definition(dst), Operand::c32(0x3f800000u), src); 3024 } else if (dst.regClass() == v1) { 3025 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), 3026 Operand::c32(0x3f800000u), src); 3027 } else { 3028 unreachable("Wrong destination register class for nir_op_b2f32."); 3029 } 3030 break; 3031 } 3032 case nir_op_b2f64: { 3033 Temp src = get_alu_src(ctx, instr->src[0]); 3034 assert(src.regClass() == bld.lm); 3035 3036 if (dst.regClass() == s2) { 3037 src = bool_to_scalar_condition(ctx, src); 3038 bld.sop2(aco_opcode::s_cselect_b64, Definition(dst), Operand::c32(0x3f800000u), 3039 Operand::zero(), bld.scc(src)); 3040 } else if (dst.regClass() == v2) { 3041 Temp one = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u)); 3042 Temp upper = 3043 bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), one, src); 3044 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), upper); 3045 } else { 3046 unreachable("Wrong destination register class for nir_op_b2f64."); 3047 } 3048 break; 3049 } 3050 case nir_op_i2i8: 3051 case nir_op_i2i16: 3052 case nir_op_i2i32: 3053 case nir_op_i2i64: { 3054 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { 3055 /* no need to do the extract in get_alu_src() */ 3056 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size 3057 ? sgpr_extract_sext 3058 : sgpr_extract_undef; 3059 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); 3060 } else { 3061 const unsigned input_bitsize = instr->src[0].src.ssa->bit_size; 3062 const unsigned output_bitsize = instr->dest.dest.ssa.bit_size; 3063 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), input_bitsize, output_bitsize, 3064 output_bitsize > input_bitsize, dst); 3065 } 3066 break; 3067 } 3068 case nir_op_u2u8: 3069 case nir_op_u2u16: 3070 case nir_op_u2u32: 3071 case nir_op_u2u64: { 3072 if (dst.type() == RegType::sgpr && instr->src[0].src.ssa->bit_size < 32) { 3073 /* no need to do the extract in get_alu_src() */ 3074 sgpr_extract_mode mode = instr->dest.dest.ssa.bit_size > instr->src[0].src.ssa->bit_size 3075 ? sgpr_extract_zext 3076 : sgpr_extract_undef; 3077 extract_8_16_bit_sgpr_element(ctx, dst, &instr->src[0], mode); 3078 } else { 3079 convert_int(ctx, bld, get_alu_src(ctx, instr->src[0]), instr->src[0].src.ssa->bit_size, 3080 instr->dest.dest.ssa.bit_size, false, dst); 3081 } 3082 break; 3083 } 3084 case nir_op_b2b32: 3085 case nir_op_b2i8: 3086 case nir_op_b2i16: 3087 case nir_op_b2i32: 3088 case nir_op_b2i64: { 3089 Temp src = get_alu_src(ctx, instr->src[0]); 3090 assert(src.regClass() == bld.lm); 3091 3092 Temp tmp = dst.bytes() == 8 ? 
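/* for 64-bit destinations, produce a 32-bit 0/1 first; it is zero-extended
 * with p_create_vector below */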
bld.tmp(RegClass::get(dst.type(), 4)) : dst; 3093 if (tmp.regClass() == s1) { 3094 bool_to_scalar_condition(ctx, src, tmp); 3095 } else if (tmp.type() == RegType::vgpr) { 3096 bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(tmp), Operand::zero(), Operand::c32(1u), 3097 src); 3098 } else { 3099 unreachable("Invalid register class for b2i32"); 3100 } 3101 3102 if (tmp != dst) 3103 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tmp, Operand::zero()); 3104 break; 3105 } 3106 case nir_op_b2b1: 3107 case nir_op_i2b1: { 3108 Temp src = get_alu_src(ctx, instr->src[0]); 3109 assert(dst.regClass() == bld.lm); 3110 3111 if (src.type() == RegType::vgpr) { 3112 assert(src.regClass() == v1 || src.regClass() == v2); 3113 assert(dst.regClass() == bld.lm); 3114 bld.vopc(src.size() == 2 ? aco_opcode::v_cmp_lg_u64 : aco_opcode::v_cmp_lg_u32, 3115 Definition(dst), Operand::zero(), src) 3116 .def(0) 3117 .setHint(vcc); 3118 } else { 3119 assert(src.regClass() == s1 || src.regClass() == s2); 3120 Temp tmp; 3121 if (src.regClass() == s2 && ctx->program->chip_class <= GFX7) { 3122 tmp = 3123 bld.sop2(aco_opcode::s_or_b64, bld.def(s2), bld.def(s1, scc), Operand::zero(), src) 3124 .def(1) 3125 .getTemp(); 3126 } else { 3127 tmp = bld.sopc(src.size() == 2 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::s_cmp_lg_u32, 3128 bld.scc(bld.def(s1)), Operand::zero(), src); 3129 } 3130 bool_to_vector_condition(ctx, tmp, dst); 3131 } 3132 break; 3133 } 3134 case nir_op_unpack_64_2x32: 3135 case nir_op_unpack_32_2x16: 3136 case nir_op_unpack_64_4x16: 3137 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3138 emit_split_vector(ctx, dst, instr->op == nir_op_unpack_64_4x16 ? 4 : 2); 3139 break; 3140 case nir_op_pack_64_2x32_split: { 3141 Temp src0 = get_alu_src(ctx, instr->src[0]); 3142 Temp src1 = get_alu_src(ctx, instr->src[1]); 3143 3144 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); 3145 break; 3146 } 3147 case nir_op_unpack_64_2x32_split_x: 3148 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), 3149 get_alu_src(ctx, instr->src[0])); 3150 break; 3151 case nir_op_unpack_64_2x32_split_y: 3152 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), 3153 get_alu_src(ctx, instr->src[0])); 3154 break; 3155 case nir_op_unpack_32_2x16_split_x: 3156 if (dst.type() == RegType::vgpr) { 3157 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), bld.def(dst.regClass()), 3158 get_alu_src(ctx, instr->src[0])); 3159 } else { 3160 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3161 } 3162 break; 3163 case nir_op_unpack_32_2x16_split_y: 3164 if (dst.type() == RegType::vgpr) { 3165 bld.pseudo(aco_opcode::p_split_vector, bld.def(dst.regClass()), Definition(dst), 3166 get_alu_src(ctx, instr->src[0])); 3167 } else { 3168 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), 3169 get_alu_src(ctx, instr->src[0]), Operand::c32(1u), Operand::c32(16u), 3170 Operand::zero()); 3171 } 3172 break; 3173 case nir_op_pack_32_2x16_split: { 3174 Temp src0 = get_alu_src(ctx, instr->src[0]); 3175 Temp src1 = get_alu_src(ctx, instr->src[1]); 3176 if (dst.regClass() == v1) { 3177 src0 = emit_extract_vector(ctx, src0, 0, v2b); 3178 src1 = emit_extract_vector(ctx, src1, 0, v2b); 3179 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); 3180 } else { 3181 src0 = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), src0, 3182 Operand::c32(0xFFFFu)); 3183 src1 = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, 
scc), src1, 3184 Operand::c32(16u)); 3185 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), src0, src1); 3186 } 3187 break; 3188 } 3189 case nir_op_pack_32_4x8: bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0], 4)); break; 3190 case nir_op_pack_half_2x16_split: { 3191 if (dst.regClass() == v1) { 3192 if (!ctx->block->fp_mode.care_about_round16_64 || 3193 ctx->block->fp_mode.round16_64 == fp_round_tz) { 3194 if (ctx->program->chip_class == GFX8 || ctx->program->chip_class == GFX9) 3195 emit_vop3a_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32_e64, dst); 3196 else 3197 emit_vop2_instruction(ctx, instr, aco_opcode::v_cvt_pkrtz_f16_f32, dst, false); 3198 } else { 3199 Temp src0 = 3200 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[0])); 3201 Temp src1 = 3202 bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), get_alu_src(ctx, instr->src[1])); 3203 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), src0, src1); 3204 } 3205 } else { 3206 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3207 } 3208 break; 3209 } 3210 case nir_op_unpack_half_2x16_split_x_flush_to_zero: 3211 case nir_op_unpack_half_2x16_split_x: { 3212 Temp src = get_alu_src(ctx, instr->src[0]); 3213 if (src.regClass() == v1) 3214 src = bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src); 3215 if (dst.regClass() == v1) { 3216 assert(ctx->block->fp_mode.must_flush_denorms16_64 == 3217 (instr->op == nir_op_unpack_half_2x16_split_x_flush_to_zero)); 3218 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); 3219 } else { 3220 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3221 } 3222 break; 3223 } 3224 case nir_op_unpack_half_2x16_split_y_flush_to_zero: 3225 case nir_op_unpack_half_2x16_split_y: { 3226 Temp src = get_alu_src(ctx, instr->src[0]); 3227 if (src.regClass() == s1) 3228 src = 3229 bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), src, Operand::c32(16u)); 3230 else 3231 src = 3232 bld.pseudo(aco_opcode::p_split_vector, bld.def(v2b), bld.def(v2b), src).def(1).getTemp(); 3233 if (dst.regClass() == v1) { 3234 assert(ctx->block->fp_mode.must_flush_denorms16_64 == 3235 (instr->op == nir_op_unpack_half_2x16_split_y_flush_to_zero)); 3236 bld.vop1(aco_opcode::v_cvt_f32_f16, Definition(dst), src); 3237 } else { 3238 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3239 } 3240 break; 3241 } 3242 case nir_op_sad_u8x4: { 3243 assert(dst.regClass() == v1); 3244 emit_vop3a_instruction(ctx, instr, aco_opcode::v_sad_u8, dst, false, 3u, false); 3245 break; 3246 } 3247 case nir_op_fquantize2f16: { 3248 Temp src = get_alu_src(ctx, instr->src[0]); 3249 Temp f16 = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v1), src); 3250 Temp f32, cmp_res; 3251 3252 if (ctx->program->chip_class >= GFX8) { 3253 Temp mask = bld.copy( 3254 bld.def(s1), Operand::c32(0x36Fu)); /* value is NOT negative/positive denormal value */ 3255 cmp_res = 3256 bld.vopc_e64(aco_opcode::v_cmp_class_f16, bld.hint_vcc(bld.def(bld.lm)), f16, mask); 3257 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); 3258 } else { 3259 /* 0x38800000 is smallest half float value (2^-14) in 32-bit float, 3260 * so compare the result and flush to 0 if it's smaller. 
3261 */ 3262 f32 = bld.vop1(aco_opcode::v_cvt_f32_f16, bld.def(v1), f16); 3263 Temp smallest = bld.copy(bld.def(s1), Operand::c32(0x38800000u)); 3264 Instruction* tmp0 = bld.vopc_e64(aco_opcode::v_cmp_lt_f32, bld.def(bld.lm), f32, smallest); 3265 tmp0->vop3().abs[0] = true; 3266 Temp tmp1 = 3267 bld.vopc(aco_opcode::v_cmp_lg_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), f32); 3268 cmp_res = bld.sop2(aco_opcode::s_nand_b64, bld.def(s2), bld.def(s1, scc), 3269 tmp0->definitions[0].getTemp(), tmp1); 3270 } 3271 3272 if (ctx->block->fp_mode.preserve_signed_zero_inf_nan32) { 3273 Temp copysign_0 = 3274 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), Operand::zero(), as_vgpr(ctx, src)); 3275 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), copysign_0, f32, cmp_res); 3276 } else { 3277 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), Operand::zero(), f32, cmp_res); 3278 } 3279 break; 3280 } 3281 case nir_op_bfm: { 3282 Temp bits = get_alu_src(ctx, instr->src[0]); 3283 Temp offset = get_alu_src(ctx, instr->src[1]); 3284 3285 if (dst.regClass() == s1) { 3286 bld.sop2(aco_opcode::s_bfm_b32, Definition(dst), bits, offset); 3287 } else if (dst.regClass() == v1) { 3288 bld.vop3(aco_opcode::v_bfm_b32, Definition(dst), bits, offset); 3289 } else { 3290 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3291 } 3292 break; 3293 } 3294 case nir_op_bitfield_select: { 3295 3296 /* dst = (insert & bitmask) | (base & ~bitmask) */ 3297 if (dst.regClass() == s1) { 3298 Temp bitmask = get_alu_src(ctx, instr->src[0]); 3299 Temp insert = get_alu_src(ctx, instr->src[1]); 3300 Temp base = get_alu_src(ctx, instr->src[2]); 3301 aco_ptr<Instruction> sop2; 3302 nir_const_value* const_bitmask = nir_src_as_const_value(instr->src[0].src); 3303 nir_const_value* const_insert = nir_src_as_const_value(instr->src[1].src); 3304 Operand lhs; 3305 if (const_insert && const_bitmask) { 3306 lhs = Operand::c32(const_insert->u32 & const_bitmask->u32); 3307 } else { 3308 insert = 3309 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), insert, bitmask); 3310 lhs = Operand(insert); 3311 } 3312 3313 Operand rhs; 3314 nir_const_value* const_base = nir_src_as_const_value(instr->src[2].src); 3315 if (const_base && const_bitmask) { 3316 rhs = Operand::c32(const_base->u32 & ~const_bitmask->u32); 3317 } else { 3318 base = bld.sop2(aco_opcode::s_andn2_b32, bld.def(s1), bld.def(s1, scc), base, bitmask); 3319 rhs = Operand(base); 3320 } 3321 3322 bld.sop2(aco_opcode::s_or_b32, Definition(dst), bld.def(s1, scc), rhs, lhs); 3323 3324 } else if (dst.regClass() == v1) { 3325 emit_vop3a_instruction(ctx, instr, aco_opcode::v_bfi_b32, dst, false, 3); 3326 } else { 3327 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3328 } 3329 break; 3330 } 3331 case nir_op_ubfe: 3332 case nir_op_ibfe: { 3333 if (dst.bytes() != 4) 3334 unreachable("Unsupported BFE bit size"); 3335 3336 if (dst.type() == RegType::sgpr) { 3337 Temp base = get_alu_src(ctx, instr->src[0]); 3338 3339 nir_const_value* const_offset = nir_src_as_const_value(instr->src[1].src); 3340 nir_const_value* const_bits = nir_src_as_const_value(instr->src[2].src); 3341 if (const_offset && const_bits) { 3342 uint32_t extract = (const_bits->u32 << 16) | (const_offset->u32 & 0x1f); 3343 aco_opcode opcode = 3344 instr->op == nir_op_ubfe ? 
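/* s_bfe takes the field offset in bits [4:0] and the field width in bits
 * [22:16] of its second source, which is how 'extract' is packed above */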
aco_opcode::s_bfe_u32 : aco_opcode::s_bfe_i32; 3345 bld.sop2(opcode, Definition(dst), bld.def(s1, scc), base, Operand::c32(extract)); 3346 break; 3347 } 3348 3349 Temp offset = get_alu_src(ctx, instr->src[1]); 3350 Temp bits = get_alu_src(ctx, instr->src[2]); 3351 if (instr->op == nir_op_ubfe) { 3352 Temp mask = bld.sop2(aco_opcode::s_bfm_b32, bld.def(s1), bits, offset); 3353 Temp masked = 3354 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), base, mask); 3355 bld.sop2(aco_opcode::s_lshr_b32, Definition(dst), bld.def(s1, scc), masked, offset); 3356 } else { 3357 Operand bits_op = const_bits ? Operand::c32(const_bits->u32 << 16) 3358 : bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), 3359 bld.def(s1, scc), bits, Operand::c32(16u)); 3360 Operand offset_op = const_offset 3361 ? Operand::c32(const_offset->u32 & 0x1fu) 3362 : bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 3363 offset, Operand::c32(0x1fu)); 3364 3365 Temp extract = 3366 bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), bits_op, offset_op); 3367 bld.sop2(aco_opcode::s_bfe_i32, Definition(dst), bld.def(s1, scc), base, extract); 3368 } 3369 3370 } else { 3371 aco_opcode opcode = 3372 instr->op == nir_op_ubfe ? aco_opcode::v_bfe_u32 : aco_opcode::v_bfe_i32; 3373 emit_vop3a_instruction(ctx, instr, opcode, dst, false, 3); 3374 } 3375 break; 3376 } 3377 case nir_op_extract_u8: 3378 case nir_op_extract_i8: 3379 case nir_op_extract_u16: 3380 case nir_op_extract_i16: { 3381 bool is_signed = instr->op == nir_op_extract_i16 || instr->op == nir_op_extract_i8; 3382 unsigned comp = instr->op == nir_op_extract_u8 || instr->op == nir_op_extract_i8 ? 4 : 2; 3383 uint32_t bits = comp == 4 ? 8 : 16; 3384 unsigned index = nir_src_as_uint(instr->src[1].src); 3385 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) { 3386 assert(index == 0); 3387 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3388 } else if (dst.regClass() == s1 && instr->dest.dest.ssa.bit_size == 16) { 3389 Temp vec = get_ssa_temp(ctx, instr->src[0].src.ssa); 3390 unsigned swizzle = instr->src[0].swizzle[0]; 3391 if (vec.size() > 1) { 3392 vec = emit_extract_vector(ctx, vec, swizzle / 2, s1); 3393 swizzle = swizzle & 1; 3394 } 3395 index += swizzle * instr->dest.dest.ssa.bit_size / bits; 3396 bld.pseudo(aco_opcode::p_extract, Definition(dst), bld.def(s1, scc), Operand(vec), 3397 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); 3398 } else { 3399 Temp src = get_alu_src(ctx, instr->src[0]); 3400 Definition def(dst); 3401 if (dst.bytes() == 8) { 3402 src = emit_extract_vector(ctx, src, index / comp, RegClass(src.type(), 1)); 3403 index %= comp; 3404 def = bld.def(src.type(), 1); 3405 } 3406 assert(def.bytes() <= 4); 3407 if (def.regClass() == s1) { 3408 bld.pseudo(aco_opcode::p_extract, def, bld.def(s1, scc), Operand(src), 3409 Operand::c32(index), Operand::c32(bits), Operand::c32(is_signed)); 3410 } else { 3411 src = emit_extract_vector(ctx, src, 0, def.regClass()); 3412 bld.pseudo(aco_opcode::p_extract, def, Operand(src), Operand::c32(index), 3413 Operand::c32(bits), Operand::c32(is_signed)); 3414 } 3415 if (dst.size() == 2) 3416 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), 3417 Operand::zero()); 3418 } 3419 break; 3420 } 3421 case nir_op_insert_u8: 3422 case nir_op_insert_u16: { 3423 unsigned comp = instr->op == nir_op_insert_u8 ? 4 : 2; 3424 uint32_t bits = comp == 4 ? 
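/* insert_u8 writes 8-bit fields (four per dword), insert_u16 writes
 * 16-bit fields (two per dword) */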
8 : 16; 3425 unsigned index = nir_src_as_uint(instr->src[1].src); 3426 if (bits >= instr->dest.dest.ssa.bit_size || index * bits >= instr->dest.dest.ssa.bit_size) { 3427 assert(index == 0); 3428 bld.copy(Definition(dst), get_alu_src(ctx, instr->src[0])); 3429 } else { 3430 Temp src = get_alu_src(ctx, instr->src[0]); 3431 Definition def(dst); 3432 bool swap = false; 3433 if (dst.bytes() == 8) { 3434 src = emit_extract_vector(ctx, src, 0u, RegClass(src.type(), 1)); 3435 swap = index >= comp; 3436 index %= comp; 3437 def = bld.def(src.type(), 1); 3438 } 3439 if (def.regClass() == s1) { 3440 bld.pseudo(aco_opcode::p_insert, def, bld.def(s1, scc), Operand(src), 3441 Operand::c32(index), Operand::c32(bits)); 3442 } else { 3443 src = emit_extract_vector(ctx, src, 0, def.regClass()); 3444 bld.pseudo(aco_opcode::p_insert, def, Operand(src), Operand::c32(index), 3445 Operand::c32(bits)); 3446 } 3447 if (dst.size() == 2 && swap) 3448 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand::zero(), 3449 def.getTemp()); 3450 else if (dst.size() == 2) 3451 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), def.getTemp(), 3452 Operand::zero()); 3453 } 3454 break; 3455 } 3456 case nir_op_bit_count: { 3457 Temp src = get_alu_src(ctx, instr->src[0]); 3458 if (src.regClass() == s1) { 3459 bld.sop1(aco_opcode::s_bcnt1_i32_b32, Definition(dst), bld.def(s1, scc), src); 3460 } else if (src.regClass() == v1) { 3461 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), src, Operand::zero()); 3462 } else if (src.regClass() == v2) { 3463 bld.vop3(aco_opcode::v_bcnt_u32_b32, Definition(dst), emit_extract_vector(ctx, src, 1, v1), 3464 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), 3465 emit_extract_vector(ctx, src, 0, v1), Operand::zero())); 3466 } else if (src.regClass() == s2) { 3467 bld.sop1(aco_opcode::s_bcnt1_i32_b64, Definition(dst), bld.def(s1, scc), src); 3468 } else { 3469 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 3470 } 3471 break; 3472 } 3473 case nir_op_flt: { 3474 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_f16, aco_opcode::v_cmp_lt_f32, 3475 aco_opcode::v_cmp_lt_f64); 3476 break; 3477 } 3478 case nir_op_fge: { 3479 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_f16, aco_opcode::v_cmp_ge_f32, 3480 aco_opcode::v_cmp_ge_f64); 3481 break; 3482 } 3483 case nir_op_feq: { 3484 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_eq_f16, aco_opcode::v_cmp_eq_f32, 3485 aco_opcode::v_cmp_eq_f64); 3486 break; 3487 } 3488 case nir_op_fneu: { 3489 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_neq_f16, aco_opcode::v_cmp_neq_f32, 3490 aco_opcode::v_cmp_neq_f64); 3491 break; 3492 } 3493 case nir_op_ilt: { 3494 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_i16, aco_opcode::v_cmp_lt_i32, 3495 aco_opcode::v_cmp_lt_i64, aco_opcode::s_cmp_lt_i32); 3496 break; 3497 } 3498 case nir_op_ige: { 3499 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_i16, aco_opcode::v_cmp_ge_i32, 3500 aco_opcode::v_cmp_ge_i64, aco_opcode::s_cmp_ge_i32); 3501 break; 3502 } 3503 case nir_op_ieq: { 3504 if (instr->src[0].src.ssa->bit_size == 1) 3505 emit_boolean_logic(ctx, instr, Builder::s_xnor, dst); 3506 else 3507 emit_comparison( 3508 ctx, instr, dst, aco_opcode::v_cmp_eq_i16, aco_opcode::v_cmp_eq_i32, 3509 aco_opcode::v_cmp_eq_i64, aco_opcode::s_cmp_eq_i32, 3510 ctx->program->chip_class >= GFX8 ? 
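/* a scalar 64-bit compare is only selected on GFX8+; aco_opcode::num_opcodes
 * acts as a 'not available' marker for older chips */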
aco_opcode::s_cmp_eq_u64 : aco_opcode::num_opcodes); 3511 break; 3512 } 3513 case nir_op_ine: { 3514 if (instr->src[0].src.ssa->bit_size == 1) 3515 emit_boolean_logic(ctx, instr, Builder::s_xor, dst); 3516 else 3517 emit_comparison( 3518 ctx, instr, dst, aco_opcode::v_cmp_lg_i16, aco_opcode::v_cmp_lg_i32, 3519 aco_opcode::v_cmp_lg_i64, aco_opcode::s_cmp_lg_i32, 3520 ctx->program->chip_class >= GFX8 ? aco_opcode::s_cmp_lg_u64 : aco_opcode::num_opcodes); 3521 break; 3522 } 3523 case nir_op_ult: { 3524 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_lt_u16, aco_opcode::v_cmp_lt_u32, 3525 aco_opcode::v_cmp_lt_u64, aco_opcode::s_cmp_lt_u32); 3526 break; 3527 } 3528 case nir_op_uge: { 3529 emit_comparison(ctx, instr, dst, aco_opcode::v_cmp_ge_u16, aco_opcode::v_cmp_ge_u32, 3530 aco_opcode::v_cmp_ge_u64, aco_opcode::s_cmp_ge_u32); 3531 break; 3532 } 3533 case nir_op_fddx: 3534 case nir_op_fddy: 3535 case nir_op_fddx_fine: 3536 case nir_op_fddy_fine: 3537 case nir_op_fddx_coarse: 3538 case nir_op_fddy_coarse: { 3539 if (!nir_src_is_divergent(instr->src[0].src)) { 3540 /* Source is the same in all lanes, so the derivative is zero. 3541 * This also avoids emitting invalid IR. 3542 */ 3543 bld.copy(Definition(dst), Operand::zero()); 3544 break; 3545 } 3546 3547 Temp src = as_vgpr(ctx, get_alu_src(ctx, instr->src[0])); 3548 uint16_t dpp_ctrl1, dpp_ctrl2; 3549 if (instr->op == nir_op_fddx_fine) { 3550 dpp_ctrl1 = dpp_quad_perm(0, 0, 2, 2); 3551 dpp_ctrl2 = dpp_quad_perm(1, 1, 3, 3); 3552 } else if (instr->op == nir_op_fddy_fine) { 3553 dpp_ctrl1 = dpp_quad_perm(0, 1, 0, 1); 3554 dpp_ctrl2 = dpp_quad_perm(2, 3, 2, 3); 3555 } else { 3556 dpp_ctrl1 = dpp_quad_perm(0, 0, 0, 0); 3557 if (instr->op == nir_op_fddx || instr->op == nir_op_fddx_coarse) 3558 dpp_ctrl2 = dpp_quad_perm(1, 1, 1, 1); 3559 else 3560 dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); 3561 } 3562 3563 Temp tmp; 3564 if (ctx->program->chip_class >= GFX8) { 3565 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), src, dpp_ctrl1); 3566 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), src, tl, dpp_ctrl2); 3567 } else { 3568 Temp tl = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl1); 3569 Temp tr = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), src, (1 << 15) | dpp_ctrl2); 3570 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), tr, tl); 3571 } 3572 emit_wqm(bld, tmp, dst, true); 3573 break; 3574 } 3575 default: isel_err(&instr->instr, "Unknown NIR ALU instr"); 3576 } 3577} 3578 3579void 3580visit_load_const(isel_context* ctx, nir_load_const_instr* instr) 3581{ 3582 Temp dst = get_ssa_temp(ctx, &instr->def); 3583 3584 // TODO: we really want to have the resulting type as this would allow for 64bit literals 3585 // which get truncated the lsb if double and msb if int 3586 // for now, we only use s_mov_b64 with 64bit inline constants 3587 assert(instr->def.num_components == 1 && "Vector load_const should be lowered to scalar."); 3588 assert(dst.type() == RegType::sgpr); 3589 3590 Builder bld(ctx->program, ctx->block); 3591 3592 if (instr->def.bit_size == 1) { 3593 assert(dst.regClass() == bld.lm); 3594 int val = instr->value[0].b ? -1 : 0; 3595 Operand op = bld.lm.size() == 1 ? 
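/* booleans are lane masks: one dword wide in wave32, two dwords in wave64 */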
Operand::c32(val) : Operand::c64(val); 3596 bld.copy(Definition(dst), op); 3597 } else if (instr->def.bit_size == 8) { 3598 bld.copy(Definition(dst), Operand::c32(instr->value[0].u8)); 3599 } else if (instr->def.bit_size == 16) { 3600 /* sign-extend to use s_movk_i32 instead of a literal */ 3601 bld.copy(Definition(dst), Operand::c32(instr->value[0].i16)); 3602 } else if (dst.size() == 1) { 3603 bld.copy(Definition(dst), Operand::c32(instr->value[0].u32)); 3604 } else { 3605 assert(dst.size() != 1); 3606 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 3607 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 3608 if (instr->def.bit_size == 64) 3609 for (unsigned i = 0; i < dst.size(); i++) 3610 vec->operands[i] = Operand::c32(instr->value[0].u64 >> i * 32); 3611 else { 3612 for (unsigned i = 0; i < dst.size(); i++) 3613 vec->operands[i] = Operand::c32(instr->value[i].u32); 3614 } 3615 vec->definitions[0] = Definition(dst); 3616 ctx->block->instructions.emplace_back(std::move(vec)); 3617 } 3618} 3619 3620uint32_t 3621widen_mask(uint32_t mask, unsigned multiplier) 3622{ 3623 uint32_t new_mask = 0; 3624 for (unsigned i = 0; i < 32 && (1u << i) <= mask; ++i) 3625 if (mask & (1u << i)) 3626 new_mask |= ((1u << multiplier) - 1u) << (i * multiplier); 3627 return new_mask; 3628} 3629 3630struct LoadEmitInfo { 3631 Operand offset; 3632 Temp dst; 3633 unsigned num_components; 3634 unsigned component_size; 3635 Temp resource = Temp(0, s1); 3636 unsigned component_stride = 0; 3637 unsigned const_offset = 0; 3638 unsigned align_mul = 0; 3639 unsigned align_offset = 0; 3640 3641 bool glc = false; 3642 bool slc = false; 3643 unsigned swizzle_component_size = 0; 3644 memory_sync_info sync; 3645 Temp soffset = Temp(0, s1); 3646}; 3647 3648struct EmitLoadParameters { 3649 using Callback = Temp (*)(Builder& bld, const LoadEmitInfo& info, Temp offset, 3650 unsigned bytes_needed, unsigned align, unsigned const_offset, 3651 Temp dst_hint); 3652 3653 Callback callback; 3654 bool byte_align_loads; 3655 bool supports_8bit_16bit_loads; 3656 unsigned max_const_offset_plus_one; 3657}; 3658 3659void 3660emit_load(isel_context* ctx, Builder& bld, const LoadEmitInfo& info, 3661 const EmitLoadParameters& params) 3662{ 3663 unsigned load_size = info.num_components * info.component_size; 3664 unsigned component_size = info.component_size; 3665 3666 unsigned num_vals = 0; 3667 Temp* const vals = (Temp*)alloca(info.dst.bytes() * sizeof(Temp)); 3668 3669 unsigned const_offset = info.const_offset; 3670 3671 const unsigned align_mul = info.align_mul ? info.align_mul : component_size; 3672 unsigned align_offset = (info.align_offset + const_offset) % align_mul; 3673 3674 unsigned bytes_read = 0; 3675 while (bytes_read < load_size) { 3676 unsigned bytes_needed = load_size - bytes_read; 3677 3678 /* add buffer for unaligned loads */ 3679 int byte_align = 0; 3680 if (params.byte_align_loads) { 3681 byte_align = align_mul % 4 == 0 ? align_offset % 4 : -1; 3682 } 3683 3684 if (byte_align) { 3685 if (bytes_needed > 2 || (bytes_needed == 2 && (align_mul % 2 || align_offset % 2)) || 3686 !params.supports_8bit_16bit_loads) { 3687 if (info.component_stride) { 3688 assert(params.supports_8bit_16bit_loads && "unimplemented"); 3689 bytes_needed = 2; 3690 byte_align = 0; 3691 } else { 3692 bytes_needed += byte_align == -1 ? 
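/* the access is not known to be dword-aligned at compile time, so read
 * enough extra bytes that a dword-aligned load still covers the request;
 * the result is shifted right after the load */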
4 - info.align_mul : byte_align; 3693 bytes_needed = align(bytes_needed, 4); 3694 } 3695 } else { 3696 byte_align = 0; 3697 } 3698 } 3699 3700 if (info.swizzle_component_size) 3701 bytes_needed = MIN2(bytes_needed, info.swizzle_component_size); 3702 if (info.component_stride) 3703 bytes_needed = MIN2(bytes_needed, info.component_size); 3704 3705 bool need_to_align_offset = byte_align && (align_mul % 4 || align_offset % 4); 3706 3707 /* reduce constant offset */ 3708 Operand offset = info.offset; 3709 unsigned reduced_const_offset = const_offset; 3710 bool remove_const_offset_completely = need_to_align_offset; 3711 if (const_offset && 3712 (remove_const_offset_completely || const_offset >= params.max_const_offset_plus_one)) { 3713 unsigned to_add = const_offset; 3714 if (remove_const_offset_completely) { 3715 reduced_const_offset = 0; 3716 } else { 3717 to_add = 3718 const_offset / params.max_const_offset_plus_one * params.max_const_offset_plus_one; 3719 reduced_const_offset %= params.max_const_offset_plus_one; 3720 } 3721 Temp offset_tmp = offset.isTemp() ? offset.getTemp() : Temp(); 3722 if (offset.isConstant()) { 3723 offset = Operand::c32(offset.constantValue() + to_add); 3724 } else if (offset_tmp.regClass() == s1) { 3725 offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), offset_tmp, 3726 Operand::c32(to_add)); 3727 } else if (offset_tmp.regClass() == v1) { 3728 offset = bld.vadd32(bld.def(v1), offset_tmp, Operand::c32(to_add)); 3729 } else { 3730 Temp lo = bld.tmp(offset_tmp.type(), 1); 3731 Temp hi = bld.tmp(offset_tmp.type(), 1); 3732 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); 3733 3734 if (offset_tmp.regClass() == s2) { 3735 Temp carry = bld.tmp(s1); 3736 lo = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), lo, 3737 Operand::c32(to_add)); 3738 hi = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), hi, carry); 3739 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(s2), lo, hi); 3740 } else { 3741 Temp new_lo = bld.tmp(v1); 3742 Temp carry = 3743 bld.vadd32(Definition(new_lo), lo, Operand::c32(to_add), true).def(1).getTemp(); 3744 hi = bld.vadd32(bld.def(v1), hi, Operand::zero(), false, carry); 3745 offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_lo, hi); 3746 } 3747 } 3748 } 3749 3750 /* align offset down if needed */ 3751 Operand aligned_offset = offset; 3752 unsigned align = align_offset ? 1 << (ffs(align_offset) - 1) : align_mul; 3753 if (need_to_align_offset) { 3754 align = 4; 3755 Temp offset_tmp = offset.isTemp() ? 
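/* align the offset down to a multiple of 4; the dropped low bits are
 * compensated for by shifting the loaded data right afterwards */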
offset.getTemp() : Temp(); 3756 if (offset.isConstant()) { 3757 aligned_offset = Operand::c32(offset.constantValue() & 0xfffffffcu); 3758 } else if (offset_tmp.regClass() == s1) { 3759 aligned_offset = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 3760 Operand::c32(0xfffffffcu), offset_tmp); 3761 } else if (offset_tmp.regClass() == s2) { 3762 aligned_offset = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), 3763 Operand::c64(0xfffffffffffffffcllu), offset_tmp); 3764 } else if (offset_tmp.regClass() == v1) { 3765 aligned_offset = 3766 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), offset_tmp); 3767 } else if (offset_tmp.regClass() == v2) { 3768 Temp hi = bld.tmp(v1), lo = bld.tmp(v1); 3769 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), offset_tmp); 3770 lo = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0xfffffffcu), lo); 3771 aligned_offset = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), lo, hi); 3772 } 3773 } 3774 Temp aligned_offset_tmp = 3775 aligned_offset.isTemp() ? aligned_offset.getTemp() : bld.copy(bld.def(s1), aligned_offset); 3776 3777 Temp val = params.callback(bld, info, aligned_offset_tmp, bytes_needed, align, 3778 reduced_const_offset, byte_align ? Temp() : info.dst); 3779 3780 /* the callback wrote directly to dst */ 3781 if (val == info.dst) { 3782 assert(num_vals == 0); 3783 emit_split_vector(ctx, info.dst, info.num_components); 3784 return; 3785 } 3786 3787 /* shift result right if needed */ 3788 if (params.byte_align_loads && info.component_size < 4) { 3789 Operand byte_align_off = Operand::c32(byte_align); 3790 if (byte_align == -1) { 3791 if (offset.isConstant()) 3792 byte_align_off = Operand::c32(offset.constantValue() % 4u); 3793 else if (offset.size() == 2) 3794 byte_align_off = Operand(emit_extract_vector(ctx, offset.getTemp(), 0, 3795 RegClass(offset.getTemp().type(), 1))); 3796 else 3797 byte_align_off = offset; 3798 } 3799 3800 assert(val.bytes() >= load_size && "unimplemented"); 3801 if (val.type() == RegType::sgpr) 3802 byte_align_scalar(ctx, val, byte_align_off, info.dst); 3803 else 3804 byte_align_vector(ctx, val, byte_align_off, info.dst, component_size); 3805 return; 3806 } 3807 3808 /* add result to list and advance */ 3809 if (info.component_stride) { 3810 assert(val.bytes() == info.component_size && "unimplemented"); 3811 const_offset += info.component_stride; 3812 align_offset = (align_offset + info.component_stride) % align_mul; 3813 } else { 3814 const_offset += val.bytes(); 3815 align_offset = (align_offset + val.bytes()) % align_mul; 3816 } 3817 bytes_read += val.bytes(); 3818 vals[num_vals++] = val; 3819 } 3820 3821 /* create array of components */ 3822 unsigned components_split = 0; 3823 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec; 3824 bool has_vgprs = false; 3825 for (unsigned i = 0; i < num_vals;) { 3826 Temp* const tmp = (Temp*)alloca(num_vals * sizeof(Temp)); 3827 unsigned num_tmps = 0; 3828 unsigned tmp_size = 0; 3829 RegType reg_type = RegType::sgpr; 3830 while ((!tmp_size || (tmp_size % component_size)) && i < num_vals) { 3831 if (vals[i].type() == RegType::vgpr) 3832 reg_type = RegType::vgpr; 3833 tmp_size += vals[i].bytes(); 3834 tmp[num_tmps++] = vals[i++]; 3835 } 3836 if (num_tmps > 1) { 3837 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 3838 aco_opcode::p_create_vector, Format::PSEUDO, num_tmps, 1)}; 3839 for (unsigned j = 0; j < num_tmps; j++) 3840 vec->operands[j] = Operand(tmp[j]); 3841 tmp[0] = 
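/* gather the partial results collected so far into a single temporary so it
 * can be split into component-sized pieces below */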
bld.tmp(RegClass::get(reg_type, tmp_size)); 3842 vec->definitions[0] = Definition(tmp[0]); 3843 bld.insert(std::move(vec)); 3844 } 3845 3846 if (tmp[0].bytes() % component_size) { 3847 /* trim tmp[0] */ 3848 assert(i == num_vals); 3849 RegClass new_rc = 3850 RegClass::get(reg_type, tmp[0].bytes() / component_size * component_size); 3851 tmp[0] = 3852 bld.pseudo(aco_opcode::p_extract_vector, bld.def(new_rc), tmp[0], Operand::zero()); 3853 } 3854 3855 RegClass elem_rc = RegClass::get(reg_type, component_size); 3856 3857 unsigned start = components_split; 3858 3859 if (tmp_size == elem_rc.bytes()) { 3860 allocated_vec[components_split++] = tmp[0]; 3861 } else { 3862 assert(tmp_size % elem_rc.bytes() == 0); 3863 aco_ptr<Pseudo_instruction> split{create_instruction<Pseudo_instruction>( 3864 aco_opcode::p_split_vector, Format::PSEUDO, 1, tmp_size / elem_rc.bytes())}; 3865 for (auto& def : split->definitions) { 3866 Temp component = bld.tmp(elem_rc); 3867 allocated_vec[components_split++] = component; 3868 def = Definition(component); 3869 } 3870 split->operands[0] = Operand(tmp[0]); 3871 bld.insert(std::move(split)); 3872 } 3873 3874 /* try to p_as_uniform early so we can create more optimizable code and 3875 * also update allocated_vec */ 3876 for (unsigned j = start; j < components_split; j++) { 3877 if (allocated_vec[j].bytes() % 4 == 0 && info.dst.type() == RegType::sgpr) 3878 allocated_vec[j] = bld.as_uniform(allocated_vec[j]); 3879 has_vgprs |= allocated_vec[j].type() == RegType::vgpr; 3880 } 3881 } 3882 3883 /* concatenate components and p_as_uniform() result if needed */ 3884 if (info.dst.type() == RegType::vgpr || !has_vgprs) 3885 ctx->allocated_vec.emplace(info.dst.id(), allocated_vec); 3886 3887 int padding_bytes = 3888 MAX2((int)info.dst.bytes() - int(allocated_vec[0].bytes() * info.num_components), 0); 3889 3890 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 3891 aco_opcode::p_create_vector, Format::PSEUDO, info.num_components + !!padding_bytes, 1)}; 3892 for (unsigned i = 0; i < info.num_components; i++) 3893 vec->operands[i] = Operand(allocated_vec[i]); 3894 if (padding_bytes) 3895 vec->operands[info.num_components] = Operand(RegClass::get(RegType::vgpr, padding_bytes)); 3896 if (info.dst.type() == RegType::sgpr && has_vgprs) { 3897 Temp tmp = bld.tmp(RegType::vgpr, info.dst.size()); 3898 vec->definitions[0] = Definition(tmp); 3899 bld.insert(std::move(vec)); 3900 bld.pseudo(aco_opcode::p_as_uniform, Definition(info.dst), tmp); 3901 } else { 3902 vec->definitions[0] = Definition(info.dst); 3903 bld.insert(std::move(vec)); 3904 } 3905} 3906 3907Operand 3908load_lds_size_m0(Builder& bld) 3909{ 3910 /* m0 does not need to be initialized on GFX9+ */ 3911 if (bld.program->chip_class >= GFX9) 3912 return Operand(s1); 3913 3914 return bld.m0((Temp)bld.copy(bld.def(s1, m0), Operand::c32(0xffffffffu))); 3915} 3916 3917Temp 3918lds_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 3919 unsigned align, unsigned const_offset, Temp dst_hint) 3920{ 3921 offset = offset.regClass() == s1 ? 
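/* DS instructions take the LDS address in a VGPR, so a uniform offset is
 * copied into one first */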
bld.copy(bld.def(v1), offset) : offset; 3922 3923 Operand m = load_lds_size_m0(bld); 3924 3925 bool large_ds_read = bld.program->chip_class >= GFX7; 3926 bool usable_read2 = bld.program->chip_class >= GFX7; 3927 3928 bool read2 = false; 3929 unsigned size = 0; 3930 aco_opcode op; 3931 if (bytes_needed >= 16 && align % 16 == 0 && large_ds_read) { 3932 size = 16; 3933 op = aco_opcode::ds_read_b128; 3934 } else if (bytes_needed >= 16 && align % 8 == 0 && const_offset % 8 == 0 && usable_read2) { 3935 size = 16; 3936 read2 = true; 3937 op = aco_opcode::ds_read2_b64; 3938 } else if (bytes_needed >= 12 && align % 16 == 0 && large_ds_read) { 3939 size = 12; 3940 op = aco_opcode::ds_read_b96; 3941 } else if (bytes_needed >= 8 && align % 8 == 0) { 3942 size = 8; 3943 op = aco_opcode::ds_read_b64; 3944 } else if (bytes_needed >= 8 && align % 4 == 0 && const_offset % 4 == 0 && usable_read2) { 3945 size = 8; 3946 read2 = true; 3947 op = aco_opcode::ds_read2_b32; 3948 } else if (bytes_needed >= 4 && align % 4 == 0) { 3949 size = 4; 3950 op = aco_opcode::ds_read_b32; 3951 } else if (bytes_needed >= 2 && align % 2 == 0) { 3952 size = 2; 3953 op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u16_d16 : aco_opcode::ds_read_u16; 3954 } else { 3955 size = 1; 3956 op = bld.program->chip_class >= GFX9 ? aco_opcode::ds_read_u8_d16 : aco_opcode::ds_read_u8; 3957 } 3958 3959 unsigned const_offset_unit = read2 ? size / 2u : 1u; 3960 unsigned const_offset_range = read2 ? 255 * const_offset_unit : 65536; 3961 3962 if (const_offset > (const_offset_range - const_offset_unit)) { 3963 unsigned excess = const_offset - (const_offset % const_offset_range); 3964 offset = bld.vadd32(bld.def(v1), offset, Operand::c32(excess)); 3965 const_offset -= excess; 3966 } 3967 3968 const_offset /= const_offset_unit; 3969 3970 RegClass rc = RegClass::get(RegType::vgpr, size); 3971 Temp val = rc == info.dst.regClass() && dst_hint.id() ? dst_hint : bld.tmp(rc); 3972 Instruction* instr; 3973 if (read2) 3974 instr = bld.ds(op, Definition(val), offset, m, const_offset, const_offset + 1); 3975 else 3976 instr = bld.ds(op, Definition(val), offset, m, const_offset); 3977 instr->ds().sync = info.sync; 3978 3979 if (m.isUndefined()) 3980 instr->operands.pop_back(); 3981 3982 return val; 3983} 3984 3985const EmitLoadParameters lds_load_params{lds_load_callback, false, true, UINT32_MAX}; 3986 3987Temp 3988smem_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 3989 unsigned align, unsigned const_offset, Temp dst_hint) 3990{ 3991 unsigned size = 0; 3992 aco_opcode op; 3993 if (bytes_needed <= 4) { 3994 size = 1; 3995 op = info.resource.id() ? aco_opcode::s_buffer_load_dword : aco_opcode::s_load_dword; 3996 } else if (bytes_needed <= 8) { 3997 size = 2; 3998 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx2 : aco_opcode::s_load_dwordx2; 3999 } else if (bytes_needed <= 16) { 4000 size = 4; 4001 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx4 : aco_opcode::s_load_dwordx4; 4002 } else if (bytes_needed <= 32) { 4003 size = 8; 4004 op = info.resource.id() ? aco_opcode::s_buffer_load_dwordx8 : aco_opcode::s_load_dwordx8; 4005 } else { 4006 size = 16; 4007 op = info.resource.id() ? 
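/* anything larger than 32 bytes uses the widest SMEM load; emit_load's loop
 * issues further loads if that is still not enough */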
aco_opcode::s_buffer_load_dwordx16 : aco_opcode::s_load_dwordx16; 4008 } 4009 aco_ptr<SMEM_instruction> load{create_instruction<SMEM_instruction>(op, Format::SMEM, 2, 1)}; 4010 if (info.resource.id()) { 4011 load->operands[0] = Operand(info.resource); 4012 load->operands[1] = Operand(offset); 4013 } else { 4014 load->operands[0] = Operand(offset); 4015 load->operands[1] = Operand::zero(); 4016 } 4017 RegClass rc(RegType::sgpr, size); 4018 Temp val = dst_hint.id() && dst_hint.regClass() == rc ? dst_hint : bld.tmp(rc); 4019 load->definitions[0] = Definition(val); 4020 load->glc = info.glc; 4021 load->dlc = info.glc && bld.program->chip_class >= GFX10; 4022 load->sync = info.sync; 4023 bld.insert(std::move(load)); 4024 return val; 4025} 4026 4027const EmitLoadParameters smem_load_params{smem_load_callback, true, false, 1024}; 4028 4029Temp 4030mubuf_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4031 unsigned align_, unsigned const_offset, Temp dst_hint) 4032{ 4033 Operand vaddr = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 4034 Operand soffset = offset.type() == RegType::sgpr ? Operand(offset) : Operand::c32(0); 4035 4036 if (info.soffset.id()) { 4037 if (soffset.isTemp()) 4038 vaddr = bld.copy(bld.def(v1), soffset); 4039 soffset = Operand(info.soffset); 4040 } 4041 4042 unsigned bytes_size = 0; 4043 aco_opcode op; 4044 if (bytes_needed == 1 || align_ % 2) { 4045 bytes_size = 1; 4046 op = aco_opcode::buffer_load_ubyte; 4047 } else if (bytes_needed == 2 || align_ % 4) { 4048 bytes_size = 2; 4049 op = aco_opcode::buffer_load_ushort; 4050 } else if (bytes_needed <= 4) { 4051 bytes_size = 4; 4052 op = aco_opcode::buffer_load_dword; 4053 } else if (bytes_needed <= 8) { 4054 bytes_size = 8; 4055 op = aco_opcode::buffer_load_dwordx2; 4056 } else if (bytes_needed <= 12 && bld.program->chip_class > GFX6) { 4057 bytes_size = 12; 4058 op = aco_opcode::buffer_load_dwordx3; 4059 } else { 4060 bytes_size = 16; 4061 op = aco_opcode::buffer_load_dwordx4; 4062 } 4063 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; 4064 mubuf->operands[0] = Operand(info.resource); 4065 mubuf->operands[1] = vaddr; 4066 mubuf->operands[2] = soffset; 4067 mubuf->offen = (offset.type() == RegType::vgpr); 4068 mubuf->glc = info.glc; 4069 mubuf->dlc = info.glc && bld.program->chip_class >= GFX10; 4070 mubuf->slc = info.slc; 4071 mubuf->sync = info.sync; 4072 mubuf->offset = const_offset; 4073 mubuf->swizzled = info.swizzle_component_size != 0; 4074 RegClass rc = RegClass::get(RegType::vgpr, bytes_size); 4075 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? 
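/* write straight into the destination when it has the matching register
 * class, avoiding an extra copy */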
dst_hint : bld.tmp(rc); 4076 mubuf->definitions[0] = Definition(val); 4077 bld.insert(std::move(mubuf)); 4078 4079 return val; 4080} 4081 4082const EmitLoadParameters mubuf_load_params{mubuf_load_callback, true, true, 4096}; 4083const EmitLoadParameters scratch_load_params{mubuf_load_callback, false, true, 4096}; 4084 4085Temp 4086get_gfx6_global_rsrc(Builder& bld, Temp addr) 4087{ 4088 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 4089 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 4090 4091 if (addr.type() == RegType::vgpr) 4092 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), Operand::zero(), Operand::zero(), 4093 Operand::c32(-1u), Operand::c32(rsrc_conf)); 4094 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), addr, Operand::c32(-1u), 4095 Operand::c32(rsrc_conf)); 4096} 4097 4098Temp 4099global_load_callback(Builder& bld, const LoadEmitInfo& info, Temp offset, unsigned bytes_needed, 4100 unsigned align_, unsigned const_offset, Temp dst_hint) 4101{ 4102 unsigned bytes_size = 0; 4103 bool use_mubuf = bld.program->chip_class == GFX6; 4104 bool global = bld.program->chip_class >= GFX9; 4105 aco_opcode op; 4106 if (bytes_needed == 1) { 4107 bytes_size = 1; 4108 op = use_mubuf ? aco_opcode::buffer_load_ubyte 4109 : global ? aco_opcode::global_load_ubyte 4110 : aco_opcode::flat_load_ubyte; 4111 } else if (bytes_needed == 2) { 4112 bytes_size = 2; 4113 op = use_mubuf ? aco_opcode::buffer_load_ushort 4114 : global ? aco_opcode::global_load_ushort 4115 : aco_opcode::flat_load_ushort; 4116 } else if (bytes_needed <= 4) { 4117 bytes_size = 4; 4118 op = use_mubuf ? aco_opcode::buffer_load_dword 4119 : global ? aco_opcode::global_load_dword 4120 : aco_opcode::flat_load_dword; 4121 } else if (bytes_needed <= 8) { 4122 bytes_size = 8; 4123 op = use_mubuf ? aco_opcode::buffer_load_dwordx2 4124 : global ? aco_opcode::global_load_dwordx2 4125 : aco_opcode::flat_load_dwordx2; 4126 } else if (bytes_needed <= 12 && !use_mubuf) { 4127 bytes_size = 12; 4128 op = global ? aco_opcode::global_load_dwordx3 : aco_opcode::flat_load_dwordx3; 4129 } else { 4130 bytes_size = 16; 4131 op = use_mubuf ? aco_opcode::buffer_load_dwordx4 4132 : global ? aco_opcode::global_load_dwordx4 4133 : aco_opcode::flat_load_dwordx4; 4134 } 4135 RegClass rc = RegClass::get(RegType::vgpr, align(bytes_size, 4)); 4136 Temp val = dst_hint.id() && rc == dst_hint.regClass() ? dst_hint : bld.tmp(rc); 4137 if (use_mubuf) { 4138 aco_ptr<MUBUF_instruction> mubuf{ 4139 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3, 1)}; 4140 mubuf->operands[0] = Operand(get_gfx6_global_rsrc(bld, offset)); 4141 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 4142 mubuf->operands[2] = Operand::zero(); 4143 mubuf->glc = info.glc; 4144 mubuf->dlc = false; 4145 mubuf->offset = 0; 4146 mubuf->addr64 = offset.type() == RegType::vgpr; 4147 mubuf->disable_wqm = false; 4148 mubuf->sync = info.sync; 4149 mubuf->definitions[0] = Definition(val); 4150 bld.insert(std::move(mubuf)); 4151 } else { 4152 offset = offset.regClass() == s2 ? bld.copy(bld.def(v2), offset) : offset; 4153 4154 aco_ptr<FLAT_instruction> flat{ 4155 create_instruction<FLAT_instruction>(op, global ? 
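/* GFX9+ provides dedicated GLOBAL instructions; older chips use FLAT
 * addressing for global memory */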
Format::GLOBAL : Format::FLAT, 2, 1)}; 4156 flat->operands[0] = Operand(offset); 4157 flat->operands[1] = Operand(s1); 4158 flat->glc = info.glc; 4159 flat->dlc = info.glc && bld.program->chip_class >= GFX10; 4160 flat->sync = info.sync; 4161 flat->offset = 0u; 4162 flat->definitions[0] = Definition(val); 4163 bld.insert(std::move(flat)); 4164 } 4165 4166 return val; 4167} 4168 4169const EmitLoadParameters global_load_params{global_load_callback, true, true, 1}; 4170 4171Temp 4172load_lds(isel_context* ctx, unsigned elem_size_bytes, unsigned num_components, Temp dst, 4173 Temp address, unsigned base_offset, unsigned align) 4174{ 4175 assert(util_is_power_of_two_nonzero(align)); 4176 4177 Builder bld(ctx->program, ctx->block); 4178 4179 LoadEmitInfo info = {Operand(as_vgpr(ctx, address)), dst, num_components, elem_size_bytes}; 4180 info.align_mul = align; 4181 info.align_offset = 0; 4182 info.sync = memory_sync_info(storage_shared); 4183 info.const_offset = base_offset; 4184 emit_load(ctx, bld, info, lds_load_params); 4185 4186 return dst; 4187} 4188 4189void 4190split_store_data(isel_context* ctx, RegType dst_type, unsigned count, Temp* dst, unsigned* bytes, 4191 Temp src) 4192{ 4193 if (!count) 4194 return; 4195 4196 Builder bld(ctx->program, ctx->block); 4197 4198 /* count == 1 fast path */ 4199 if (count == 1) { 4200 if (dst_type == RegType::sgpr) 4201 dst[0] = bld.as_uniform(src); 4202 else 4203 dst[0] = as_vgpr(ctx, src); 4204 return; 4205 } 4206 4207 /* elem_size_bytes is the greatest common divisor which is a power of 2 */ 4208 unsigned elem_size_bytes = 4209 1u << (ffs(std::accumulate(bytes, bytes + count, 8, std::bit_or<>{})) - 1); 4210 4211 ASSERTED bool is_subdword = elem_size_bytes < 4; 4212 assert(!is_subdword || dst_type == RegType::vgpr); 4213 4214 for (unsigned i = 0; i < count; i++) 4215 dst[i] = bld.tmp(RegClass::get(dst_type, bytes[i])); 4216 4217 std::vector<Temp> temps; 4218 /* use allocated_vec if possible */ 4219 auto it = ctx->allocated_vec.find(src.id()); 4220 if (it != ctx->allocated_vec.end()) { 4221 if (!it->second[0].id()) 4222 goto split; 4223 unsigned elem_size = it->second[0].bytes(); 4224 assert(src.bytes() % elem_size == 0); 4225 4226 for (unsigned i = 0; i < src.bytes() / elem_size; i++) { 4227 if (!it->second[i].id()) 4228 goto split; 4229 } 4230 if (elem_size_bytes % elem_size) 4231 goto split; 4232 4233 temps.insert(temps.end(), it->second.begin(), it->second.begin() + src.bytes() / elem_size); 4234 elem_size_bytes = elem_size; 4235 } 4236 4237split: 4238 /* split src if necessary */ 4239 if (temps.empty()) { 4240 if (is_subdword && src.type() == RegType::sgpr) 4241 src = as_vgpr(ctx, src); 4242 if (dst_type == RegType::sgpr) 4243 src = bld.as_uniform(src); 4244 4245 unsigned num_elems = src.bytes() / elem_size_bytes; 4246 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>( 4247 aco_opcode::p_split_vector, Format::PSEUDO, 1, num_elems)}; 4248 split->operands[0] = Operand(src); 4249 for (unsigned i = 0; i < num_elems; i++) { 4250 temps.emplace_back(bld.tmp(RegClass::get(dst_type, elem_size_bytes))); 4251 split->definitions[i] = Definition(temps.back()); 4252 } 4253 bld.insert(std::move(split)); 4254 } 4255 4256 unsigned idx = 0; 4257 for (unsigned i = 0; i < count; i++) { 4258 unsigned op_count = dst[i].bytes() / elem_size_bytes; 4259 if (op_count == 1) { 4260 if (dst_type == RegType::sgpr) 4261 dst[i] = bld.as_uniform(temps[idx++]); 4262 else 4263 dst[i] = as_vgpr(ctx, temps[idx++]); 4264 continue; 4265 } 4266 4267 aco_ptr<Instruction> 
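/* this destination covers several elements, so gather the element
 * temporaries back into a vector of the requested size */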
vec{create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, 4268 Format::PSEUDO, op_count, 1)}; 4269 for (unsigned j = 0; j < op_count; j++) { 4270 Temp tmp = temps[idx++]; 4271 if (dst_type == RegType::sgpr) 4272 tmp = bld.as_uniform(tmp); 4273 vec->operands[j] = Operand(tmp); 4274 } 4275 vec->definitions[0] = Definition(dst[i]); 4276 bld.insert(std::move(vec)); 4277 } 4278 return; 4279} 4280 4281bool 4282scan_write_mask(uint32_t mask, uint32_t todo_mask, int* start, int* count) 4283{ 4284 unsigned start_elem = ffs(todo_mask) - 1; 4285 bool skip = !(mask & (1 << start_elem)); 4286 if (skip) 4287 mask = ~mask & todo_mask; 4288 4289 mask &= todo_mask; 4290 4291 u_bit_scan_consecutive_range(&mask, start, count); 4292 4293 return !skip; 4294} 4295 4296void 4297advance_write_mask(uint32_t* todo_mask, int start, int count) 4298{ 4299 *todo_mask &= ~u_bit_consecutive(0, count) << start; 4300} 4301 4302void 4303store_lds(isel_context* ctx, unsigned elem_size_bytes, Temp data, uint32_t wrmask, Temp address, 4304 unsigned base_offset, unsigned align) 4305{ 4306 assert(util_is_power_of_two_nonzero(align)); 4307 assert(util_is_power_of_two_nonzero(elem_size_bytes) && elem_size_bytes <= 8); 4308 4309 Builder bld(ctx->program, ctx->block); 4310 bool large_ds_write = ctx->options->chip_class >= GFX7; 4311 bool usable_write2 = ctx->options->chip_class >= GFX7; 4312 4313 unsigned write_count = 0; 4314 Temp write_datas[32]; 4315 unsigned offsets[32]; 4316 unsigned bytes[32]; 4317 aco_opcode opcodes[32]; 4318 4319 wrmask = widen_mask(wrmask, elem_size_bytes); 4320 4321 uint32_t todo = u_bit_consecutive(0, data.bytes()); 4322 while (todo) { 4323 int offset, byte; 4324 if (!scan_write_mask(wrmask, todo, &offset, &byte)) { 4325 offsets[write_count] = offset; 4326 bytes[write_count] = byte; 4327 opcodes[write_count] = aco_opcode::num_opcodes; 4328 write_count++; 4329 advance_write_mask(&todo, offset, byte); 4330 continue; 4331 } 4332 4333 bool aligned2 = offset % 2 == 0 && align % 2 == 0; 4334 bool aligned4 = offset % 4 == 0 && align % 4 == 0; 4335 bool aligned8 = offset % 8 == 0 && align % 8 == 0; 4336 bool aligned16 = offset % 16 == 0 && align % 16 == 0; 4337 4338 // TODO: use ds_write_b8_d16_hi/ds_write_b16_d16_hi if beneficial 4339 aco_opcode op = aco_opcode::num_opcodes; 4340 if (byte >= 16 && aligned16 && large_ds_write) { 4341 op = aco_opcode::ds_write_b128; 4342 byte = 16; 4343 } else if (byte >= 12 && aligned16 && large_ds_write) { 4344 op = aco_opcode::ds_write_b96; 4345 byte = 12; 4346 } else if (byte >= 8 && aligned8) { 4347 op = aco_opcode::ds_write_b64; 4348 byte = 8; 4349 } else if (byte >= 4 && aligned4) { 4350 op = aco_opcode::ds_write_b32; 4351 byte = 4; 4352 } else if (byte >= 2 && aligned2) { 4353 op = aco_opcode::ds_write_b16; 4354 byte = 2; 4355 } else if (byte >= 1) { 4356 op = aco_opcode::ds_write_b8; 4357 byte = 1; 4358 } else { 4359 assert(false); 4360 } 4361 4362 offsets[write_count] = offset; 4363 bytes[write_count] = byte; 4364 opcodes[write_count] = op; 4365 write_count++; 4366 advance_write_mask(&todo, offset, byte); 4367 } 4368 4369 Operand m = load_lds_size_m0(bld); 4370 4371 split_store_data(ctx, RegType::vgpr, write_count, write_datas, bytes, data); 4372 4373 for (unsigned i = 0; i < write_count; i++) { 4374 aco_opcode op = opcodes[i]; 4375 if (op == aco_opcode::num_opcodes) 4376 continue; 4377 4378 Temp split_data = write_datas[i]; 4379 4380 unsigned second = write_count; 4381 if (usable_write2 && (op == aco_opcode::ds_write_b32 || op == aco_opcode::ds_write_b64)) 
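/* look for a later write of the same size whose offset is a multiple of the
 * element size away, so the two can be merged into a single ds_write2 */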
{ 4382 for (second = i + 1; second < write_count; second++) { 4383 if (opcodes[second] == op && (offsets[second] - offsets[i]) % split_data.bytes() == 0) { 4384 op = split_data.bytes() == 4 ? aco_opcode::ds_write2_b32 : aco_opcode::ds_write2_b64; 4385 opcodes[second] = aco_opcode::num_opcodes; 4386 break; 4387 } 4388 } 4389 } 4390 4391 bool write2 = op == aco_opcode::ds_write2_b32 || op == aco_opcode::ds_write2_b64; 4392 unsigned write2_off = (offsets[second] - offsets[i]) / split_data.bytes(); 4393 4394 unsigned inline_offset = base_offset + offsets[i]; 4395 unsigned max_offset = write2 ? (255 - write2_off) * split_data.bytes() : 65535; 4396 Temp address_offset = address; 4397 if (inline_offset > max_offset) { 4398 address_offset = bld.vadd32(bld.def(v1), Operand::c32(base_offset), address_offset); 4399 inline_offset = offsets[i]; 4400 } 4401 4402 /* offsets[i] shouldn't be large enough for this to happen */ 4403 assert(inline_offset <= max_offset); 4404 4405 Instruction* instr; 4406 if (write2) { 4407 Temp second_data = write_datas[second]; 4408 inline_offset /= split_data.bytes(); 4409 instr = bld.ds(op, address_offset, split_data, second_data, m, inline_offset, 4410 inline_offset + write2_off); 4411 } else { 4412 instr = bld.ds(op, address_offset, split_data, m, inline_offset); 4413 } 4414 instr->ds().sync = memory_sync_info(storage_shared); 4415 4416 if (m.isUndefined()) 4417 instr->operands.pop_back(); 4418 } 4419} 4420 4421aco_opcode 4422get_buffer_store_op(unsigned bytes) 4423{ 4424 switch (bytes) { 4425 case 1: return aco_opcode::buffer_store_byte; 4426 case 2: return aco_opcode::buffer_store_short; 4427 case 4: return aco_opcode::buffer_store_dword; 4428 case 8: return aco_opcode::buffer_store_dwordx2; 4429 case 12: return aco_opcode::buffer_store_dwordx3; 4430 case 16: return aco_opcode::buffer_store_dwordx4; 4431 } 4432 unreachable("Unexpected store size"); 4433 return aco_opcode::num_opcodes; 4434} 4435 4436void 4437split_buffer_store(isel_context* ctx, nir_intrinsic_instr* instr, bool smem, RegType dst_type, 4438 Temp data, unsigned writemask, int swizzle_element_size, unsigned* write_count, 4439 Temp* write_datas, unsigned* offsets) 4440{ 4441 unsigned write_count_with_skips = 0; 4442 bool skips[16]; 4443 unsigned bytes[16]; 4444 4445 /* determine how to split the data */ 4446 unsigned todo = u_bit_consecutive(0, data.bytes()); 4447 while (todo) { 4448 int offset, byte; 4449 skips[write_count_with_skips] = !scan_write_mask(writemask, todo, &offset, &byte); 4450 offsets[write_count_with_skips] = offset; 4451 if (skips[write_count_with_skips]) { 4452 bytes[write_count_with_skips] = byte; 4453 advance_write_mask(&todo, offset, byte); 4454 write_count_with_skips++; 4455 continue; 4456 } 4457 4458 /* only supported sizes are 1, 2, 4, 8, 12 and 16 bytes and can't be 4459 * larger than swizzle_element_size */ 4460 byte = MIN2(byte, swizzle_element_size); 4461 if (byte % 4) 4462 byte = byte > 4 ? byte & ~0x3 : MIN2(byte, 2); 4463 4464 /* SMEM and GFX6 VMEM can't emit 12-byte stores */ 4465 if ((ctx->program->chip_class == GFX6 || smem) && byte == 12) 4466 byte = 8; 4467 4468 /* dword or larger stores have to be dword-aligned */ 4469 unsigned align_mul = instr ? nir_intrinsic_align_mul(instr) : 4; 4470 unsigned align_offset = (instr ? nir_intrinsic_align_offset(instr) : 0) + offset; 4471 bool dword_aligned = align_offset % 4 == 0 && align_mul % 4 == 0; 4472 if (!dword_aligned) 4473 byte = MIN2(byte, (align_offset % 2 == 0 && align_mul % 2 == 0) ? 
2 : 1); 4474 4475 bytes[write_count_with_skips] = byte; 4476 advance_write_mask(&todo, offset, byte); 4477 write_count_with_skips++; 4478 } 4479 4480 /* actually split data */ 4481 split_store_data(ctx, dst_type, write_count_with_skips, write_datas, bytes, data); 4482 4483 /* remove skips */ 4484 for (unsigned i = 0; i < write_count_with_skips; i++) { 4485 if (skips[i]) 4486 continue; 4487 write_datas[*write_count] = write_datas[i]; 4488 offsets[*write_count] = offsets[i]; 4489 (*write_count)++; 4490 } 4491} 4492 4493Temp 4494create_vec_from_array(isel_context* ctx, Temp arr[], unsigned cnt, RegType reg_type, 4495 unsigned elem_size_bytes, unsigned split_cnt = 0u, Temp dst = Temp()) 4496{ 4497 Builder bld(ctx->program, ctx->block); 4498 unsigned dword_size = elem_size_bytes / 4; 4499 4500 if (!dst.id()) 4501 dst = bld.tmp(RegClass(reg_type, cnt * dword_size)); 4502 4503 std::array<Temp, NIR_MAX_VEC_COMPONENTS> allocated_vec; 4504 aco_ptr<Pseudo_instruction> instr{ 4505 create_instruction<Pseudo_instruction>(aco_opcode::p_create_vector, Format::PSEUDO, cnt, 1)}; 4506 instr->definitions[0] = Definition(dst); 4507 4508 for (unsigned i = 0; i < cnt; ++i) { 4509 if (arr[i].id()) { 4510 assert(arr[i].size() == dword_size); 4511 allocated_vec[i] = arr[i]; 4512 instr->operands[i] = Operand(arr[i]); 4513 } else { 4514 Temp zero = bld.copy(bld.def(RegClass(reg_type, dword_size)), 4515 Operand::zero(dword_size == 2 ? 8 : 4)); 4516 allocated_vec[i] = zero; 4517 instr->operands[i] = Operand(zero); 4518 } 4519 } 4520 4521 bld.insert(std::move(instr)); 4522 4523 if (split_cnt) 4524 emit_split_vector(ctx, dst, split_cnt); 4525 else 4526 ctx->allocated_vec.emplace(dst.id(), allocated_vec); /* emit_split_vector already does this */ 4527 4528 return dst; 4529} 4530 4531inline unsigned 4532resolve_excess_vmem_const_offset(Builder& bld, Temp& voffset, unsigned const_offset) 4533{ 4534 if (const_offset >= 4096) { 4535 unsigned excess_const_offset = const_offset / 4096u * 4096u; 4536 const_offset %= 4096u; 4537 4538 if (!voffset.id()) 4539 voffset = bld.copy(bld.def(v1), Operand::c32(excess_const_offset)); 4540 else if (unlikely(voffset.regClass() == s1)) 4541 voffset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), 4542 Operand::c32(excess_const_offset), Operand(voffset)); 4543 else if (likely(voffset.regClass() == v1)) 4544 voffset = bld.vadd32(bld.def(v1), Operand(voffset), Operand::c32(excess_const_offset)); 4545 else 4546 unreachable("Unsupported register class of voffset"); 4547 } 4548 4549 return const_offset; 4550} 4551 4552void 4553emit_single_mubuf_store(isel_context* ctx, Temp descriptor, Temp voffset, Temp soffset, Temp vdata, 4554 unsigned const_offset = 0u, memory_sync_info sync = memory_sync_info(), 4555 bool slc = false, bool swizzled = false) 4556{ 4557 assert(vdata.id()); 4558 assert(vdata.size() != 3 || ctx->program->chip_class != GFX6); 4559 assert(vdata.size() >= 1 && vdata.size() <= 4); 4560 4561 Builder bld(ctx->program, ctx->block); 4562 aco_opcode op = get_buffer_store_op(vdata.bytes()); 4563 const_offset = resolve_excess_vmem_const_offset(bld, voffset, const_offset); 4564 4565 Operand voffset_op = voffset.id() ? Operand(as_vgpr(ctx, voffset)) : Operand(v1); 4566 Operand soffset_op = soffset.id() ? 
Operand(soffset) : Operand::zero(); 4567 Builder::Result r = 4568 bld.mubuf(op, Operand(descriptor), voffset_op, soffset_op, Operand(vdata), const_offset, 4569 /* offen */ !voffset_op.isUndefined(), /* swizzled */ swizzled, 4570 /* idxen*/ false, /* addr64 */ false, /* disable_wqm */ false, /* glc */ true, 4571 /* dlc*/ false, /* slc */ slc); 4572 4573 r.instr->mubuf().sync = sync; 4574} 4575 4576void 4577store_vmem_mubuf(isel_context* ctx, Temp src, Temp descriptor, Temp voffset, Temp soffset, 4578 unsigned base_const_offset, unsigned elem_size_bytes, unsigned write_mask, 4579 bool allow_combining = true, memory_sync_info sync = memory_sync_info(), 4580 bool slc = false) 4581{ 4582 Builder bld(ctx->program, ctx->block); 4583 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); 4584 assert(write_mask); 4585 write_mask = widen_mask(write_mask, elem_size_bytes); 4586 4587 unsigned write_count = 0; 4588 Temp write_datas[32]; 4589 unsigned offsets[32]; 4590 split_buffer_store(ctx, NULL, false, RegType::vgpr, src, write_mask, allow_combining ? 16 : 4, 4591 &write_count, write_datas, offsets); 4592 4593 for (unsigned i = 0; i < write_count; i++) { 4594 unsigned const_offset = offsets[i] + base_const_offset; 4595 emit_single_mubuf_store(ctx, descriptor, voffset, soffset, write_datas[i], const_offset, sync, 4596 slc, !allow_combining); 4597 } 4598} 4599 4600void 4601load_vmem_mubuf(isel_context* ctx, Temp dst, Temp descriptor, Temp voffset, Temp soffset, 4602 unsigned base_const_offset, unsigned elem_size_bytes, unsigned num_components, 4603 unsigned stride = 0u, bool allow_combining = true, bool allow_reorder = true, 4604 bool slc = false) 4605{ 4606 assert(elem_size_bytes == 2 || elem_size_bytes == 4 || elem_size_bytes == 8); 4607 assert((num_components * elem_size_bytes) == dst.bytes()); 4608 assert(!!stride != allow_combining); 4609 4610 Builder bld(ctx->program, ctx->block); 4611 4612 LoadEmitInfo info = {Operand(voffset), dst, num_components, elem_size_bytes, descriptor}; 4613 info.component_stride = allow_combining ? 0 : stride; 4614 info.glc = true; 4615 info.slc = slc; 4616 info.swizzle_component_size = allow_combining ? 0 : 4; 4617 info.align_mul = MIN2(elem_size_bytes, 4); 4618 info.align_offset = 0; 4619 info.soffset = soffset; 4620 info.const_offset = base_const_offset; 4621 emit_load(ctx, bld, info, mubuf_load_params); 4622} 4623 4624Temp 4625wave_id_in_threadgroup(isel_context* ctx) 4626{ 4627 Builder bld(ctx->program, ctx->block); 4628 return bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 4629 get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(24u | (4u << 16))); 4630} 4631 4632Temp 4633thread_id_in_threadgroup(isel_context* ctx) 4634{ 4635 /* tid_in_tg = wave_id * wave_size + tid_in_wave */ 4636 4637 Builder bld(ctx->program, ctx->block); 4638 Temp tid_in_wave = emit_mbcnt(ctx, bld.tmp(v1)); 4639 4640 if (ctx->program->workgroup_size <= ctx->program->wave_size) 4641 return tid_in_wave; 4642 4643 Temp wave_id_in_tg = wave_id_in_threadgroup(ctx); 4644 Temp num_pre_threads = 4645 bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), wave_id_in_tg, 4646 Operand::c32(ctx->program->wave_size == 64 ? 
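/* log2(wave_size) */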
6u : 5u)); 4647 return bld.vadd32(bld.def(v1), Operand(num_pre_threads), Operand(tid_in_wave)); 4648} 4649 4650Temp 4651get_tess_rel_patch_id(isel_context* ctx) 4652{ 4653 Builder bld(ctx->program, ctx->block); 4654 4655 switch (ctx->shader->info.stage) { 4656 case MESA_SHADER_TESS_CTRL: 4657 return bld.pseudo(aco_opcode::p_extract, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), 4658 Operand::zero(), Operand::c32(8u), Operand::zero()); 4659 case MESA_SHADER_TESS_EVAL: return get_arg(ctx, ctx->args->ac.tes_rel_patch_id); 4660 default: unreachable("Unsupported stage in get_tess_rel_patch_id"); 4661 } 4662} 4663 4664bool 4665store_output_to_temps(isel_context* ctx, nir_intrinsic_instr* instr) 4666{ 4667 unsigned write_mask = nir_intrinsic_write_mask(instr); 4668 unsigned component = nir_intrinsic_component(instr); 4669 unsigned idx = nir_intrinsic_base(instr) * 4u + component; 4670 nir_src offset = *nir_get_io_offset_src(instr); 4671 4672 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 4673 return false; 4674 4675 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 4676 4677 if (instr->src[0].ssa->bit_size == 64) 4678 write_mask = widen_mask(write_mask, 2); 4679 4680 RegClass rc = instr->src[0].ssa->bit_size == 16 ? v2b : v1; 4681 4682 for (unsigned i = 0; i < 8; ++i) { 4683 if (write_mask & (1 << i)) { 4684 ctx->outputs.mask[idx / 4u] |= 1 << (idx % 4u); 4685 ctx->outputs.temps[idx] = emit_extract_vector(ctx, src, i, rc); 4686 } 4687 idx++; 4688 } 4689 4690 return true; 4691} 4692 4693bool 4694load_input_from_temps(isel_context* ctx, nir_intrinsic_instr* instr, Temp dst) 4695{ 4696 /* Only TCS per-vertex inputs are supported by this function. 4697 * Per-vertex inputs only match between the VS/TCS invocation id when the number of invocations 4698 * is the same. 
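 * When ctx->tcs_in_out_eq is set, the data for this invocation is still held
 * in ctx->inputs.temps, so the load can be satisfied from registers below
 * instead of going through LDS.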
4699 */ 4700 if (ctx->shader->info.stage != MESA_SHADER_TESS_CTRL || !ctx->tcs_in_out_eq) 4701 return false; 4702 4703 nir_src* off_src = nir_get_io_offset_src(instr); 4704 nir_src* vertex_index_src = nir_get_io_vertex_index_src(instr); 4705 nir_instr* vertex_index_instr = vertex_index_src->ssa->parent_instr; 4706 bool can_use_temps = 4707 nir_src_is_const(*off_src) && vertex_index_instr->type == nir_instr_type_intrinsic && 4708 nir_instr_as_intrinsic(vertex_index_instr)->intrinsic == nir_intrinsic_load_invocation_id; 4709 4710 if (!can_use_temps) 4711 return false; 4712 4713 unsigned idx = nir_intrinsic_base(instr) * 4u + nir_intrinsic_component(instr) + 4714 4 * nir_src_as_uint(*off_src); 4715 Temp* src = &ctx->inputs.temps[idx]; 4716 create_vec_from_array(ctx, src, dst.size(), dst.regClass().type(), 4u, 0, dst); 4717 4718 return true; 4719} 4720 4721static void export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos); 4722 4723void 4724visit_store_output(isel_context* ctx, nir_intrinsic_instr* instr) 4725{ 4726 if (ctx->stage == vertex_vs || ctx->stage == tess_eval_vs || ctx->stage == fragment_fs || 4727 ctx->stage == vertex_ngg || ctx->stage == tess_eval_ngg || 4728 (ctx->stage == vertex_tess_control_hs && ctx->shader->info.stage == MESA_SHADER_VERTEX) || 4729 ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { 4730 bool stored_to_temps = store_output_to_temps(ctx, instr); 4731 if (!stored_to_temps) { 4732 isel_err(instr->src[1].ssa->parent_instr, "Unimplemented output offset instruction"); 4733 abort(); 4734 } 4735 } else { 4736 unreachable("Shader stage not implemented"); 4737 } 4738 4739 /* For NGG VS and TES shaders the primitive ID is exported manually after the other exports so we 4740 * have to emit an exp here manually */ 4741 if (ctx->stage.hw == HWStage::NGG && 4742 (ctx->stage.has(SWStage::VS) || ctx->stage.has(SWStage::TES)) && 4743 nir_intrinsic_io_semantics(instr).location == VARYING_SLOT_PRIMITIVE_ID) 4744 export_vs_varying(ctx, VARYING_SLOT_PRIMITIVE_ID, false, NULL); 4745} 4746 4747void 4748emit_interp_instr(isel_context* ctx, unsigned idx, unsigned component, Temp src, Temp dst, 4749 Temp prim_mask) 4750{ 4751 Temp coord1 = emit_extract_vector(ctx, src, 0, v1); 4752 Temp coord2 = emit_extract_vector(ctx, src, 1, v1); 4753 4754 Builder bld(ctx->program, ctx->block); 4755 4756 if (dst.regClass() == v2b) { 4757 if (ctx->program->dev.has_16bank_lds) { 4758 assert(ctx->options->chip_class <= GFX8); 4759 Builder::Result interp_p1 = 4760 bld.vintrp(aco_opcode::v_interp_mov_f32, bld.def(v1), Operand::c32(2u) /* P0 */, 4761 bld.m0(prim_mask), idx, component); 4762 interp_p1 = bld.vintrp(aco_opcode::v_interp_p1lv_f16, bld.def(v2b), coord1, 4763 bld.m0(prim_mask), interp_p1, idx, component); 4764 bld.vintrp(aco_opcode::v_interp_p2_legacy_f16, Definition(dst), coord2, bld.m0(prim_mask), 4765 interp_p1, idx, component); 4766 } else { 4767 aco_opcode interp_p2_op = aco_opcode::v_interp_p2_f16; 4768 4769 if (ctx->options->chip_class == GFX8) 4770 interp_p2_op = aco_opcode::v_interp_p2_legacy_f16; 4771 4772 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1ll_f16, bld.def(v1), coord1, 4773 bld.m0(prim_mask), idx, component); 4774 bld.vintrp(interp_p2_op, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, idx, 4775 component); 4776 } 4777 } else { 4778 Builder::Result interp_p1 = bld.vintrp(aco_opcode::v_interp_p1_f32, bld.def(v1), coord1, 4779 bld.m0(prim_mask), idx, component); 4780 4781 if (ctx->program->dev.has_16bank_lds) 4782 
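      /* 16-bank LDS parts apparently don't allow v_interp_p1 to overwrite its
       * coordinate source; the late-kill below makes RA keep source and
       * destination in different VGPRs. */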
interp_p1.instr->operands[0].setLateKill(true); 4783 4784 bld.vintrp(aco_opcode::v_interp_p2_f32, Definition(dst), coord2, bld.m0(prim_mask), interp_p1, 4785 idx, component); 4786 } 4787} 4788 4789void 4790emit_load_frag_coord(isel_context* ctx, Temp dst, unsigned num_components) 4791{ 4792 Builder bld(ctx->program, ctx->block); 4793 4794 aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>( 4795 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)); 4796 for (unsigned i = 0; i < num_components; i++) { 4797 if (ctx->args->ac.frag_pos[i].used) 4798 vec->operands[i] = Operand(get_arg(ctx, ctx->args->ac.frag_pos[i])); 4799 else 4800 vec->operands[i] = Operand(v1); 4801 } 4802 if (G_0286CC_POS_W_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { 4803 assert(num_components == 4); 4804 vec->operands[3] = 4805 bld.vop1(aco_opcode::v_rcp_f32, bld.def(v1), get_arg(ctx, ctx->args->ac.frag_pos[3])); 4806 } 4807 4808 if (ctx->options->adjust_frag_coord_z && 4809 G_0286CC_POS_Z_FLOAT_ENA(ctx->program->config->spi_ps_input_ena)) { 4810 /* Adjust gl_FragCoord.z for VRS due to a hw bug on some GFX10.3 chips. */ 4811 Operand frag_z = vec->operands[2]; 4812 Temp adjusted_frag_z = bld.tmp(v1); 4813 Temp tmp; 4814 4815 /* dFdx fine */ 4816 Temp tl = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), frag_z, dpp_quad_perm(0, 0, 2, 2)); 4817 tmp = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), frag_z, tl, dpp_quad_perm(1, 1, 3, 3)); 4818 emit_wqm(bld, tmp, adjusted_frag_z, true); 4819 4820 /* adjusted_frag_z * 0.0625 + frag_z */ 4821 adjusted_frag_z = bld.vop3(aco_opcode::v_fma_f32, bld.def(v1), adjusted_frag_z, 4822 Operand::c32(0x3d800000u /* 0.0625 */), frag_z); 4823 4824 /* VRS Rate X = Ancillary[2:3] */ 4825 Temp x_rate = 4826 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 4827 Operand::c32(2u), Operand::c32(2u)); 4828 4829 /* xRate = xRate == 0x1 ? adjusted_frag_z : frag_z. */ 4830 Temp cond = 4831 bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate)); 4832 vec->operands[2] = 4833 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), frag_z, adjusted_frag_z, cond); 4834 } 4835 4836 for (Operand& op : vec->operands) 4837 op = op.isUndefined() ? Operand::zero() : op; 4838 4839 vec->definitions[0] = Definition(dst); 4840 ctx->block->instructions.emplace_back(std::move(vec)); 4841 emit_split_vector(ctx, dst, num_components); 4842 return; 4843} 4844 4845void 4846emit_load_frag_shading_rate(isel_context* ctx, Temp dst) 4847{ 4848 Builder bld(ctx->program, ctx->block); 4849 Temp cond; 4850 4851 /* VRS Rate X = Ancillary[2:3] 4852 * VRS Rate Y = Ancillary[4:5] 4853 */ 4854 Temp x_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 4855 Operand::c32(2u), Operand::c32(2u)); 4856 Temp y_rate = bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 4857 Operand::c32(4u), Operand::c32(2u)); 4858 4859 /* xRate = xRate == 0x1 ? Horizontal2Pixels : None. */ 4860 cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(x_rate)); 4861 x_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()), 4862 bld.copy(bld.def(v1), Operand::c32(4u)), cond); 4863 4864 /* yRate = yRate == 0x1 ? Vertical2Pixels : None. 
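 * The two masks are then OR'd into dst, i.e. 4 = 2x horizontal, 1 = 2x
 * vertical, 5 = 2x2, matching the gl_ShadingRateEXT packing.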
 */
   cond = bld.vopc(aco_opcode::v_cmp_eq_i32, bld.def(bld.lm), Operand::c32(1u), Operand(y_rate));
   y_rate = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), bld.copy(bld.def(v1), Operand::zero()),
                     bld.copy(bld.def(v1), Operand::c32(1u)), cond);

   bld.vop2(aco_opcode::v_or_b32, Definition(dst), Operand(x_rate), Operand(y_rate));
}

void
visit_load_interpolated_input(isel_context* ctx, nir_intrinsic_instr* instr)
{
   Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
   Temp coords = get_ssa_temp(ctx, instr->src[0].ssa);
   unsigned idx = nir_intrinsic_base(instr);
   unsigned component = nir_intrinsic_component(instr);
   Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask);

   assert(nir_src_is_const(instr->src[1]) && !nir_src_as_uint(instr->src[1]));

   if (instr->dest.ssa.num_components == 1) {
      emit_interp_instr(ctx, idx, component, coords, dst, prim_mask);
   } else {
      aco_ptr<Pseudo_instruction> vec(create_instruction<Pseudo_instruction>(
         aco_opcode::p_create_vector, Format::PSEUDO, instr->dest.ssa.num_components, 1));
      for (unsigned i = 0; i < instr->dest.ssa.num_components; i++) {
         Temp tmp = ctx->program->allocateTmp(instr->dest.ssa.bit_size == 16 ? v2b : v1);
         emit_interp_instr(ctx, idx, component + i, coords, tmp, prim_mask);
         vec->operands[i] = Operand(tmp);
      }
      vec->definitions[0] = Definition(dst);
      ctx->block->instructions.emplace_back(std::move(vec));
   }
}

bool
check_vertex_fetch_size(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset,
                        unsigned binding_align, unsigned channels)
{
   unsigned vertex_byte_size = vtx_info->chan_byte_size * channels;
   if (vtx_info->chan_byte_size != 4 && channels == 3)
      return false;

   /* Split typed vertex buffer loads on GFX6 and GFX10+ to avoid any
    * alignment issues that trigger memory violations and eventually a GPU
    * hang. This can happen if the stride (static or dynamic) is unaligned and
    * also if the VBO offset is aligned to a scalar (eg. stride is 8 and VBO
    * offset is 2 for R16G16B16A16_SNORM).
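    * (In that example vertex_byte_size is 2 * 4 = 8, so the offset-modulo
    * check below fails and the caller ends up splitting the fetch into
    * smaller loads.)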
4911 */ 4912 return (ctx->options->chip_class >= GFX7 && ctx->options->chip_class <= GFX9) || 4913 (offset % vertex_byte_size == 0 && MAX2(binding_align, 1) % vertex_byte_size == 0); 4914} 4915 4916uint8_t 4917get_fetch_data_format(isel_context* ctx, const ac_data_format_info* vtx_info, unsigned offset, 4918 unsigned* channels, unsigned max_channels, unsigned binding_align) 4919{ 4920 if (!vtx_info->chan_byte_size) { 4921 *channels = vtx_info->num_channels; 4922 return vtx_info->chan_format; 4923 } 4924 4925 unsigned num_channels = *channels; 4926 if (!check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, *channels)) { 4927 unsigned new_channels = num_channels + 1; 4928 /* first, assume more loads is worse and try using a larger data format */ 4929 while (new_channels <= max_channels && 4930 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) { 4931 new_channels++; 4932 } 4933 4934 if (new_channels > max_channels) { 4935 /* then try decreasing load size (at the cost of more loads) */ 4936 new_channels = *channels; 4937 while (new_channels > 1 && 4938 !check_vertex_fetch_size(ctx, vtx_info, offset, binding_align, new_channels)) 4939 new_channels--; 4940 } 4941 4942 if (new_channels < *channels) 4943 *channels = new_channels; 4944 num_channels = new_channels; 4945 } 4946 4947 switch (vtx_info->chan_format) { 4948 case V_008F0C_BUF_DATA_FORMAT_8: 4949 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_8, V_008F0C_BUF_DATA_FORMAT_8_8, 4950 V_008F0C_BUF_DATA_FORMAT_INVALID, 4951 V_008F0C_BUF_DATA_FORMAT_8_8_8_8}[num_channels - 1]; 4952 case V_008F0C_BUF_DATA_FORMAT_16: 4953 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_16, V_008F0C_BUF_DATA_FORMAT_16_16, 4954 V_008F0C_BUF_DATA_FORMAT_INVALID, 4955 V_008F0C_BUF_DATA_FORMAT_16_16_16_16}[num_channels - 1]; 4956 case V_008F0C_BUF_DATA_FORMAT_32: 4957 return std::array<uint8_t, 4>{V_008F0C_BUF_DATA_FORMAT_32, V_008F0C_BUF_DATA_FORMAT_32_32, 4958 V_008F0C_BUF_DATA_FORMAT_32_32_32, 4959 V_008F0C_BUF_DATA_FORMAT_32_32_32_32}[num_channels - 1]; 4960 } 4961 unreachable("shouldn't reach here"); 4962 return V_008F0C_BUF_DATA_FORMAT_INVALID; 4963} 4964 4965/* For 2_10_10_10 formats the alpha is handled as unsigned by pre-vega HW. 4966 * so we may need to fix it up. */ 4967Temp 4968adjust_vertex_fetch_alpha(isel_context* ctx, enum radv_vs_input_alpha_adjust adjustment, Temp alpha) 4969{ 4970 Builder bld(ctx->program, ctx->block); 4971 4972 if (adjustment == ALPHA_ADJUST_SSCALED) 4973 alpha = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), alpha); 4974 4975 /* For the integer-like cases, do a natural sign extension. 4976 * 4977 * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0 4978 * and happen to contain 0, 1, 2, 3 as the two LSBs of the 4979 * exponent. 4980 */ 4981 unsigned offset = adjustment == ALPHA_ADJUST_SNORM ? 23u : 0u; 4982 alpha = 4983 bld.vop3(aco_opcode::v_bfe_i32, bld.def(v1), alpha, Operand::c32(offset), Operand::c32(2u)); 4984 4985 /* Convert back to the right type. 
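    * For SNORM the sign-extended field can be -2 (bit pattern 0b10), which is
    * outside the SNORM range, so the v_max_f32 with -1.0 below clamps it.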
*/ 4986 if (adjustment == ALPHA_ADJUST_SNORM) { 4987 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); 4988 alpha = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::c32(0xbf800000u), alpha); 4989 } else if (adjustment == ALPHA_ADJUST_SSCALED) { 4990 alpha = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), alpha); 4991 } 4992 4993 return alpha; 4994} 4995 4996void 4997visit_load_input(isel_context* ctx, nir_intrinsic_instr* instr) 4998{ 4999 Builder bld(ctx->program, ctx->block); 5000 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5001 nir_src offset = *nir_get_io_offset_src(instr); 5002 5003 if (ctx->shader->info.stage == MESA_SHADER_VERTEX && ctx->args->shader_info->vs.dynamic_inputs) { 5004 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5005 isel_err(offset.ssa->parent_instr, 5006 "Unimplemented non-zero nir_intrinsic_load_input offset"); 5007 5008 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; 5009 unsigned component = nir_intrinsic_component(instr); 5010 unsigned bitsize = instr->dest.ssa.bit_size; 5011 unsigned num_components = instr->dest.ssa.num_components; 5012 5013 Temp input = get_arg(ctx, ctx->args->vs_inputs[location]); 5014 5015 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 5016 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5017 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5018 for (unsigned i = 0; i < num_components; i++) { 5019 elems[i] = emit_extract_vector(ctx, input, component + i, bitsize == 64 ? v2 : v1); 5020 if (bitsize == 16) { 5021 if (nir_alu_type_get_base_type(nir_intrinsic_dest_type(instr)) == nir_type_float) 5022 elems[i] = bld.vop1(aco_opcode::v_cvt_f16_f32, bld.def(v2b), elems[i]); 5023 else 5024 elems[i] = bld.pseudo(aco_opcode::p_extract_vector, bld.def(v2b), elems[i], 5025 Operand::c32(0u)); 5026 } 5027 vec->operands[i] = Operand(elems[i]); 5028 } 5029 vec->definitions[0] = Definition(dst); 5030 ctx->block->instructions.emplace_back(std::move(vec)); 5031 ctx->allocated_vec.emplace(dst.id(), elems); 5032 } else if (ctx->shader->info.stage == MESA_SHADER_VERTEX) { 5033 5034 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5035 isel_err(offset.ssa->parent_instr, 5036 "Unimplemented non-zero nir_intrinsic_load_input offset"); 5037 5038 Temp vertex_buffers = 5039 convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.vertex_buffers)); 5040 5041 unsigned location = nir_intrinsic_base(instr) - VERT_ATTRIB_GENERIC0; 5042 unsigned component = nir_intrinsic_component(instr); 5043 unsigned bitsize = instr->dest.ssa.bit_size; 5044 unsigned attrib_binding = ctx->options->key.vs.vertex_attribute_bindings[location]; 5045 uint32_t attrib_offset = ctx->options->key.vs.vertex_attribute_offsets[location]; 5046 uint32_t attrib_stride = ctx->options->key.vs.vertex_attribute_strides[location]; 5047 unsigned attrib_format = ctx->options->key.vs.vertex_attribute_formats[location]; 5048 unsigned binding_align = ctx->options->key.vs.vertex_binding_align[attrib_binding]; 5049 enum radv_vs_input_alpha_adjust alpha_adjust = 5050 ctx->options->key.vs.vertex_alpha_adjust[location]; 5051 5052 unsigned dfmt = attrib_format & 0xf; 5053 unsigned nfmt = (attrib_format >> 4) & 0x7; 5054 const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt); 5055 5056 unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa) << component; 5057 unsigned num_channels = MIN2(util_last_bit(mask), vtx_info->num_channels); 5058 bool post_shuffle = 
ctx->options->key.vs.vertex_post_shuffle & (1 << location); 5059 if (post_shuffle) 5060 num_channels = MAX2(num_channels, 3); 5061 5062 unsigned desc_index = 5063 ctx->program->info->vs.use_per_attribute_vb_descs ? location : attrib_binding; 5064 desc_index = util_bitcount(ctx->program->info->vs.vb_desc_usage_mask & 5065 u_bit_consecutive(0, desc_index)); 5066 Operand off = bld.copy(bld.def(s1), Operand::c32(desc_index * 16u)); 5067 Temp list = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), vertex_buffers, off); 5068 5069 Temp index; 5070 if (ctx->options->key.vs.instance_rate_inputs & (1u << location)) { 5071 uint32_t divisor = ctx->options->key.vs.instance_rate_divisors[location]; 5072 Temp start_instance = get_arg(ctx, ctx->args->ac.start_instance); 5073 if (divisor) { 5074 Temp instance_id = get_arg(ctx, ctx->args->ac.instance_id); 5075 if (divisor != 1) { 5076 Temp divided = bld.tmp(v1); 5077 emit_v_div_u32(ctx, divided, as_vgpr(ctx, instance_id), divisor); 5078 index = bld.vadd32(bld.def(v1), start_instance, divided); 5079 } else { 5080 index = bld.vadd32(bld.def(v1), start_instance, instance_id); 5081 } 5082 } else { 5083 index = bld.copy(bld.def(v1), start_instance); 5084 } 5085 } else { 5086 index = bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.base_vertex), 5087 get_arg(ctx, ctx->args->ac.vertex_id)); 5088 } 5089 5090 Temp* const channels = (Temp*)alloca(num_channels * sizeof(Temp)); 5091 unsigned channel_start = 0; 5092 bool direct_fetch = false; 5093 5094 /* skip unused channels at the start */ 5095 if (vtx_info->chan_byte_size && !post_shuffle) { 5096 channel_start = ffs(mask) - 1; 5097 for (unsigned i = 0; i < MIN2(channel_start, num_channels); i++) 5098 channels[i] = Temp(0, s1); 5099 } else if (vtx_info->chan_byte_size && post_shuffle && !(mask & 0x8)) { 5100 num_channels = 3 - (ffs(mask) - 1); 5101 } 5102 5103 /* load channels */ 5104 while (channel_start < num_channels) { 5105 unsigned fetch_component = num_channels - channel_start; 5106 unsigned fetch_offset = attrib_offset + channel_start * vtx_info->chan_byte_size; 5107 bool expanded = false; 5108 5109 /* use MUBUF when possible to avoid possible alignment issues */ 5110 /* TODO: we could use SDWA to unpack 8/16-bit attributes without extra instructions */ 5111 bool use_mubuf = 5112 (nfmt == V_008F0C_BUF_NUM_FORMAT_FLOAT || nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || 5113 nfmt == V_008F0C_BUF_NUM_FORMAT_SINT) && 5114 vtx_info->chan_byte_size == 4; 5115 unsigned fetch_dfmt = V_008F0C_BUF_DATA_FORMAT_INVALID; 5116 if (!use_mubuf) { 5117 fetch_dfmt = 5118 get_fetch_data_format(ctx, vtx_info, fetch_offset, &fetch_component, 5119 vtx_info->num_channels - channel_start, binding_align); 5120 } else { 5121 if (fetch_component == 3 && ctx->options->chip_class == GFX6) { 5122 /* GFX6 only supports loading vec3 with MTBUF, expand to vec4. 
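             * Setting 'expanded' keeps the direct-fetch path further down from
             * writing dst with the extra component; the result is assembled
             * per channel instead.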
*/ 5123 fetch_component = 4; 5124 expanded = true; 5125 } 5126 } 5127 5128 unsigned fetch_bytes = fetch_component * bitsize / 8; 5129 5130 Temp fetch_index = index; 5131 if (attrib_stride != 0 && fetch_offset > attrib_stride) { 5132 fetch_index = 5133 bld.vadd32(bld.def(v1), Operand::c32(fetch_offset / attrib_stride), fetch_index); 5134 fetch_offset = fetch_offset % attrib_stride; 5135 } 5136 5137 Operand soffset = Operand::zero(); 5138 if (fetch_offset >= 4096) { 5139 soffset = bld.copy(bld.def(s1), Operand::c32(fetch_offset / 4096 * 4096)); 5140 fetch_offset %= 4096; 5141 } 5142 5143 aco_opcode opcode; 5144 switch (fetch_bytes) { 5145 case 2: 5146 assert(!use_mubuf && bitsize == 16); 5147 opcode = aco_opcode::tbuffer_load_format_d16_x; 5148 break; 5149 case 4: 5150 if (bitsize == 16) { 5151 assert(!use_mubuf); 5152 opcode = aco_opcode::tbuffer_load_format_d16_xy; 5153 } else { 5154 opcode = 5155 use_mubuf ? aco_opcode::buffer_load_dword : aco_opcode::tbuffer_load_format_x; 5156 } 5157 break; 5158 case 6: 5159 assert(!use_mubuf && bitsize == 16); 5160 opcode = aco_opcode::tbuffer_load_format_d16_xyz; 5161 break; 5162 case 8: 5163 if (bitsize == 16) { 5164 assert(!use_mubuf); 5165 opcode = aco_opcode::tbuffer_load_format_d16_xyzw; 5166 } else { 5167 opcode = 5168 use_mubuf ? aco_opcode::buffer_load_dwordx2 : aco_opcode::tbuffer_load_format_xy; 5169 } 5170 break; 5171 case 12: 5172 assert(ctx->options->chip_class >= GFX7 || 5173 (!use_mubuf && ctx->options->chip_class == GFX6)); 5174 opcode = 5175 use_mubuf ? aco_opcode::buffer_load_dwordx3 : aco_opcode::tbuffer_load_format_xyz; 5176 break; 5177 case 16: 5178 opcode = 5179 use_mubuf ? aco_opcode::buffer_load_dwordx4 : aco_opcode::tbuffer_load_format_xyzw; 5180 break; 5181 default: unreachable("Unimplemented load_input vector size"); 5182 } 5183 5184 Temp fetch_dst; 5185 if (channel_start == 0 && fetch_bytes == dst.bytes() && !post_shuffle && !expanded && 5186 (alpha_adjust == ALPHA_ADJUST_NONE || num_channels <= 3)) { 5187 direct_fetch = true; 5188 fetch_dst = dst; 5189 } else { 5190 fetch_dst = bld.tmp(RegClass::get(RegType::vgpr, fetch_bytes)); 5191 } 5192 5193 if (use_mubuf) { 5194 Instruction* mubuf = bld.mubuf(opcode, Definition(fetch_dst), list, fetch_index, 5195 soffset, fetch_offset, false, false, true) 5196 .instr; 5197 mubuf->mubuf().vtx_binding = attrib_binding + 1; 5198 } else { 5199 Instruction* mtbuf = bld.mtbuf(opcode, Definition(fetch_dst), list, fetch_index, 5200 soffset, fetch_dfmt, nfmt, fetch_offset, false, true) 5201 .instr; 5202 mtbuf->mtbuf().vtx_binding = attrib_binding + 1; 5203 } 5204 5205 emit_split_vector(ctx, fetch_dst, fetch_dst.size()); 5206 5207 if (fetch_component == 1) { 5208 channels[channel_start] = fetch_dst; 5209 } else { 5210 for (unsigned i = 0; i < MIN2(fetch_component, num_channels - channel_start); i++) 5211 channels[channel_start + i] = 5212 emit_extract_vector(ctx, fetch_dst, i, bitsize == 16 ? v2b : v1); 5213 } 5214 5215 channel_start += fetch_component; 5216 } 5217 5218 if (!direct_fetch) { 5219 bool is_float = 5220 nfmt != V_008F0C_BUF_NUM_FORMAT_UINT && nfmt != V_008F0C_BUF_NUM_FORMAT_SINT; 5221 5222 static const unsigned swizzle_normal[4] = {0, 1, 2, 3}; 5223 static const unsigned swizzle_post_shuffle[4] = {2, 1, 0, 3}; 5224 const unsigned* swizzle = post_shuffle ? 
swizzle_post_shuffle : swizzle_normal; 5225 unsigned num_components = instr->dest.ssa.num_components; 5226 5227 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 5228 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5229 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5230 unsigned num_temp = 0; 5231 for (unsigned i = 0; i < num_components; i++) { 5232 unsigned idx = i + component; 5233 if (swizzle[idx] < num_channels && channels[swizzle[idx]].id()) { 5234 Temp channel = channels[swizzle[idx]]; 5235 if (idx == 3 && alpha_adjust != ALPHA_ADJUST_NONE) 5236 channel = adjust_vertex_fetch_alpha(ctx, alpha_adjust, channel); 5237 vec->operands[i] = Operand(channel); 5238 5239 num_temp++; 5240 elems[i] = channel; 5241 } else if (is_float && idx == 3) { 5242 vec->operands[i] = Operand::c32(0x3f800000u); 5243 } else if (!is_float && idx == 3) { 5244 vec->operands[i] = Operand::c32(1u); 5245 } else { 5246 vec->operands[i] = Operand::zero(); 5247 } 5248 } 5249 vec->definitions[0] = Definition(dst); 5250 ctx->block->instructions.emplace_back(std::move(vec)); 5251 emit_split_vector(ctx, dst, num_components); 5252 5253 if (num_temp == num_components) 5254 ctx->allocated_vec.emplace(dst.id(), elems); 5255 } 5256 } else if (ctx->shader->info.stage == MESA_SHADER_FRAGMENT) { 5257 if (!nir_src_is_const(offset) || nir_src_as_uint(offset)) 5258 isel_err(offset.ssa->parent_instr, 5259 "Unimplemented non-zero nir_intrinsic_load_input offset"); 5260 5261 Temp prim_mask = get_arg(ctx, ctx->args->ac.prim_mask); 5262 5263 unsigned idx = nir_intrinsic_base(instr); 5264 unsigned component = nir_intrinsic_component(instr); 5265 unsigned vertex_id = 2; /* P0 */ 5266 5267 if (instr->intrinsic == nir_intrinsic_load_input_vertex) { 5268 nir_const_value* src0 = nir_src_as_const_value(instr->src[0]); 5269 switch (src0->u32) { 5270 case 0: 5271 vertex_id = 2; /* P0 */ 5272 break; 5273 case 1: 5274 vertex_id = 0; /* P10 */ 5275 break; 5276 case 2: 5277 vertex_id = 1; /* P20 */ 5278 break; 5279 default: unreachable("invalid vertex index"); 5280 } 5281 } 5282 5283 if (instr->dest.ssa.num_components == 1 && 5284 instr->dest.ssa.bit_size != 64) { 5285 bld.vintrp(aco_opcode::v_interp_mov_f32, Definition(dst), Operand::c32(vertex_id), 5286 bld.m0(prim_mask), idx, component); 5287 } else { 5288 unsigned num_components = instr->dest.ssa.num_components; 5289 if (instr->dest.ssa.bit_size == 64) 5290 num_components *= 2; 5291 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 5292 aco_opcode::p_create_vector, Format::PSEUDO, num_components, 1)}; 5293 for (unsigned i = 0; i < num_components; i++) { 5294 unsigned chan_component = (component + i) % 4; 5295 unsigned chan_idx = idx + (component + i) / 4; 5296 vec->operands[i] = bld.vintrp( 5297 aco_opcode::v_interp_mov_f32, bld.def(instr->dest.ssa.bit_size == 16 ? 
v2b : v1), 5298 Operand::c32(vertex_id), bld.m0(prim_mask), chan_idx, chan_component); 5299 } 5300 vec->definitions[0] = Definition(dst); 5301 bld.insert(std::move(vec)); 5302 } 5303 } else { 5304 unreachable("Shader stage not implemented"); 5305 } 5306} 5307 5308void 5309visit_load_tcs_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) 5310{ 5311 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL); 5312 5313 Builder bld(ctx->program, ctx->block); 5314 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5315 5316 if (load_input_from_temps(ctx, instr, dst)) 5317 return; 5318 5319 unreachable("LDS-based TCS input should have been lowered in NIR."); 5320} 5321 5322void 5323visit_load_per_vertex_input(isel_context* ctx, nir_intrinsic_instr* instr) 5324{ 5325 switch (ctx->shader->info.stage) { 5326 case MESA_SHADER_TESS_CTRL: visit_load_tcs_per_vertex_input(ctx, instr); break; 5327 default: unreachable("Unimplemented shader stage"); 5328 } 5329} 5330 5331void 5332visit_load_tess_coord(isel_context* ctx, nir_intrinsic_instr* instr) 5333{ 5334 assert(ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); 5335 5336 Builder bld(ctx->program, ctx->block); 5337 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5338 5339 Operand tes_u(get_arg(ctx, ctx->args->ac.tes_u)); 5340 Operand tes_v(get_arg(ctx, ctx->args->ac.tes_v)); 5341 Operand tes_w = Operand::zero(); 5342 5343 if (ctx->shader->info.tess.primitive_mode == GL_TRIANGLES) { 5344 Temp tmp = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), tes_u, tes_v); 5345 tmp = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::c32(0x3f800000u /* 1.0f */), tmp); 5346 tes_w = Operand(tmp); 5347 } 5348 5349 Temp tess_coord = bld.pseudo(aco_opcode::p_create_vector, Definition(dst), tes_u, tes_v, tes_w); 5350 emit_split_vector(ctx, tess_coord, 3); 5351} 5352 5353Temp 5354load_desc_ptr(isel_context* ctx, unsigned desc_set) 5355{ 5356 const struct radv_userdata_locations *user_sgprs_locs = &ctx->program->info->user_sgprs_locs; 5357 5358 if (user_sgprs_locs->shader_data[AC_UD_INDIRECT_DESCRIPTOR_SETS].sgpr_idx != -1) { 5359 Builder bld(ctx->program, ctx->block); 5360 Temp ptr64 = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->descriptor_sets[0])); 5361 Operand off = bld.copy(bld.def(s1), Operand::c32(desc_set << 2)); 5362 return bld.smem(aco_opcode::s_load_dword, bld.def(s1), ptr64, off); //, false, false, false); 5363 } 5364 5365 return get_arg(ctx, ctx->args->descriptor_sets[desc_set]); 5366} 5367 5368void 5369visit_load_resource(isel_context* ctx, nir_intrinsic_instr* instr) 5370{ 5371 Builder bld(ctx->program, ctx->block); 5372 Temp index = get_ssa_temp(ctx, instr->src[0].ssa); 5373 if (!nir_dest_is_divergent(instr->dest)) 5374 index = bld.as_uniform(index); 5375 unsigned desc_set = nir_intrinsic_desc_set(instr); 5376 unsigned binding = nir_intrinsic_binding(instr); 5377 5378 Temp desc_ptr; 5379 radv_pipeline_layout* pipeline_layout = ctx->options->layout; 5380 radv_descriptor_set_layout* layout = pipeline_layout->set[desc_set].layout; 5381 unsigned offset = layout->binding[binding].offset; 5382 unsigned stride; 5383 if (layout->binding[binding].type == VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER_DYNAMIC || 5384 layout->binding[binding].type == VK_DESCRIPTOR_TYPE_STORAGE_BUFFER_DYNAMIC) { 5385 unsigned idx = pipeline_layout->set[desc_set].dynamic_offset_start + 5386 layout->binding[binding].dynamic_offset_offset; 5387 desc_ptr = get_arg(ctx, ctx->args->ac.push_constants); 5388 offset = pipeline_layout->push_constant_size + 16 * idx; 5389 stride = 16; 5390 } 
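   /* Non-dynamic descriptors are addressed relative to the descriptor-set
    * pointer, using the binding's descriptor size as the stride. */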
else { 5391 desc_ptr = load_desc_ptr(ctx, desc_set); 5392 stride = layout->binding[binding].size; 5393 } 5394 5395 if (nir_src_is_const(instr->src[0])) { 5396 index = 5397 bld.copy(bld.def(s1), Operand::c32((offset + nir_src_as_uint(instr->src[0]) * stride))); 5398 } else if (index.type() == RegType::vgpr) { 5399 if (stride != 1) { 5400 bool index24bit = layout->binding[binding].array_size <= 0x1000000; 5401 index = bld.v_mul_imm(bld.def(v1), index, stride, index24bit); 5402 } 5403 if (offset) 5404 index = bld.vadd32(bld.def(v1), Operand::c32(offset), index); 5405 } else { 5406 if (stride != 1) 5407 index = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index); 5408 if (offset) 5409 index = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), 5410 Operand::c32(offset), index); 5411 } 5412 5413 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5414 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5415 elems[0] = desc_ptr; 5416 elems[1] = index; 5417 ctx->allocated_vec.emplace(dst.id(), elems); 5418 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), desc_ptr, index, Operand::zero()); 5419} 5420 5421void 5422load_buffer(isel_context* ctx, unsigned num_components, unsigned component_size, Temp dst, 5423 Temp rsrc, Temp offset, unsigned align_mul, unsigned align_offset, bool glc = false, 5424 bool allow_smem = true, memory_sync_info sync = memory_sync_info()) 5425{ 5426 Builder bld(ctx->program, ctx->block); 5427 5428 bool use_smem = 5429 dst.type() != RegType::vgpr && (!glc || ctx->options->chip_class >= GFX8) && allow_smem; 5430 if (use_smem) 5431 offset = bld.as_uniform(offset); 5432 else { 5433 /* GFX6-7 are affected by a hw bug that prevents address clamping to 5434 * work correctly when the SGPR offset is used. 5435 */ 5436 if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8) 5437 offset = as_vgpr(ctx, offset); 5438 } 5439 5440 LoadEmitInfo info = {Operand(offset), dst, num_components, component_size, rsrc}; 5441 info.glc = glc; 5442 info.sync = sync; 5443 info.align_mul = align_mul; 5444 info.align_offset = align_offset; 5445 if (use_smem) 5446 emit_load(ctx, bld, info, smem_load_params); 5447 else 5448 emit_load(ctx, bld, info, mubuf_load_params); 5449} 5450 5451Temp 5452load_buffer_rsrc(isel_context* ctx, Temp rsrc) 5453{ 5454 Builder bld(ctx->program, ctx->block); 5455 Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)); 5456 Temp binding = bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); 5457 set_ptr = convert_pointer_to_64_bit(ctx, set_ptr); 5458 return bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), set_ptr, binding); 5459} 5460 5461bool 5462is_inline_ubo(isel_context* ctx, nir_src rsrc) 5463{ 5464 nir_binding binding = nir_chase_binding(rsrc); 5465 if (!binding.success) 5466 return false; 5467 5468 radv_descriptor_set_layout* layout = ctx->options->layout->set[binding.desc_set].layout; 5469 return layout->binding[binding.binding].type == VK_DESCRIPTOR_TYPE_INLINE_UNIFORM_BLOCK_EXT; 5470} 5471 5472void 5473visit_load_ubo(isel_context* ctx, nir_intrinsic_instr* instr) 5474{ 5475 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5476 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); 5477 5478 Builder bld(ctx->program, ctx->block); 5479 5480 if (is_inline_ubo(ctx, instr->src[0])) { 5481 Temp set_ptr = bld.as_uniform(emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1))); 5482 Temp binding_off = 5483 bld.as_uniform(emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1))); 
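      /* Inline uniform blocks store their data directly in the descriptor set,
       * so build a raw buffer descriptor over set_ptr + binding offset instead
       * of loading one from memory. */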
5484 rsrc = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), set_ptr, binding_off); 5485 5486 uint32_t desc_type = 5487 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5488 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 5489 if (ctx->options->chip_class >= GFX10) { 5490 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 5491 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 5492 } else { 5493 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5494 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 5495 } 5496 rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), rsrc, 5497 Operand::c32(S_008F04_BASE_ADDRESS_HI(ctx->options->address32_hi)), 5498 Operand::c32(0xFFFFFFFFu), Operand::c32(desc_type)); 5499 } else { 5500 rsrc = load_buffer_rsrc(ctx, rsrc); 5501 } 5502 unsigned size = instr->dest.ssa.bit_size / 8; 5503 load_buffer(ctx, instr->num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), 5504 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr)); 5505} 5506 5507void 5508visit_load_sbt_amd(isel_context* ctx, nir_intrinsic_instr* instr) 5509{ 5510 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5511 unsigned binding = nir_intrinsic_binding(instr); 5512 5513 Builder bld(ctx->program, ctx->block); 5514 Temp desc_base = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.sbt_descriptors)); 5515 Operand desc_off = bld.copy(bld.def(s1), Operand::c32(binding * 16u)); 5516 bld.smem(aco_opcode::s_load_dwordx4, Definition(dst), desc_base, desc_off); 5517} 5518 5519void 5520visit_load_push_constant(isel_context* ctx, nir_intrinsic_instr* instr) 5521{ 5522 Builder bld(ctx->program, ctx->block); 5523 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5524 unsigned offset = nir_intrinsic_base(instr); 5525 unsigned count = instr->dest.ssa.num_components; 5526 nir_const_value* index_cv = nir_src_as_const_value(instr->src[0]); 5527 5528 if (index_cv && instr->dest.ssa.bit_size == 32) { 5529 struct radv_userdata_info *loc = 5530 &ctx->args->shader_info->user_sgprs_locs.shader_data[AC_UD_INLINE_PUSH_CONSTANTS]; 5531 unsigned start = (offset + index_cv->u32) / 4u; 5532 unsigned num_inline_push_consts = loc->sgpr_idx != -1 ? 
loc->num_sgprs : 0; 5533 5534 start -= ctx->args->shader_info->min_push_constant_used / 4; 5535 if (start + count <= num_inline_push_consts) { 5536 std::array<Temp, NIR_MAX_VEC_COMPONENTS> elems; 5537 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 5538 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; 5539 for (unsigned i = 0; i < count; ++i) { 5540 elems[i] = get_arg(ctx, ctx->args->ac.inline_push_consts[start + i]); 5541 vec->operands[i] = Operand{elems[i]}; 5542 } 5543 vec->definitions[0] = Definition(dst); 5544 ctx->block->instructions.emplace_back(std::move(vec)); 5545 ctx->allocated_vec.emplace(dst.id(), elems); 5546 return; 5547 } 5548 } 5549 5550 Temp index = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); 5551 if (offset != 0) // TODO check if index != 0 as well 5552 index = bld.nuw().sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), 5553 Operand::c32(offset), index); 5554 Temp ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->ac.push_constants)); 5555 Temp vec = dst; 5556 bool trim = false; 5557 bool aligned = true; 5558 5559 if (instr->dest.ssa.bit_size == 8) { 5560 aligned = index_cv && (offset + index_cv->u32) % 4 == 0; 5561 bool fits_in_dword = count == 1 || (index_cv && ((offset + index_cv->u32) % 4 + count) <= 4); 5562 if (!aligned) 5563 vec = fits_in_dword ? bld.tmp(s1) : bld.tmp(s2); 5564 } else if (instr->dest.ssa.bit_size == 16) { 5565 aligned = index_cv && (offset + index_cv->u32) % 4 == 0; 5566 if (!aligned) 5567 vec = count == 4 ? bld.tmp(s4) : count > 1 ? bld.tmp(s2) : bld.tmp(s1); 5568 } 5569 5570 aco_opcode op; 5571 5572 switch (vec.size()) { 5573 case 1: op = aco_opcode::s_load_dword; break; 5574 case 2: op = aco_opcode::s_load_dwordx2; break; 5575 case 3: 5576 vec = bld.tmp(s4); 5577 trim = true; 5578 FALLTHROUGH; 5579 case 4: op = aco_opcode::s_load_dwordx4; break; 5580 case 6: 5581 vec = bld.tmp(s8); 5582 trim = true; 5583 FALLTHROUGH; 5584 case 8: op = aco_opcode::s_load_dwordx8; break; 5585 default: unreachable("unimplemented or forbidden load_push_constant."); 5586 } 5587 5588 bld.smem(op, Definition(vec), ptr, index).instr->smem().prevent_overflow = true; 5589 5590 if (!aligned) { 5591 Operand byte_offset = index_cv ? Operand::c32((offset + index_cv->u32) % 4) : Operand(index); 5592 byte_align_scalar(ctx, vec, byte_offset, dst); 5593 return; 5594 } 5595 5596 if (trim) { 5597 emit_split_vector(ctx, vec, 4); 5598 RegClass rc = dst.size() == 3 ? 
s1 : s2; 5599 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), emit_extract_vector(ctx, vec, 0, rc), 5600 emit_extract_vector(ctx, vec, 1, rc), emit_extract_vector(ctx, vec, 2, rc)); 5601 } 5602 emit_split_vector(ctx, dst, instr->dest.ssa.num_components); 5603} 5604 5605void 5606visit_load_constant(isel_context* ctx, nir_intrinsic_instr* instr) 5607{ 5608 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 5609 5610 Builder bld(ctx->program, ctx->block); 5611 5612 uint32_t desc_type = 5613 S_008F0C_DST_SEL_X(V_008F0C_SQ_SEL_X) | S_008F0C_DST_SEL_Y(V_008F0C_SQ_SEL_Y) | 5614 S_008F0C_DST_SEL_Z(V_008F0C_SQ_SEL_Z) | S_008F0C_DST_SEL_W(V_008F0C_SQ_SEL_W); 5615 if (ctx->options->chip_class >= GFX10) { 5616 desc_type |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 5617 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 5618 } else { 5619 desc_type |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 5620 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 5621 } 5622 5623 unsigned base = nir_intrinsic_base(instr); 5624 unsigned range = nir_intrinsic_range(instr); 5625 5626 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); 5627 if (base && offset.type() == RegType::sgpr) 5628 offset = bld.nuw().sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 5629 Operand::c32(base)); 5630 else if (base && offset.type() == RegType::vgpr) 5631 offset = bld.vadd32(bld.def(v1), Operand::c32(base), offset); 5632 5633 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), 5634 bld.pseudo(aco_opcode::p_constaddr, bld.def(s2), bld.def(s1, scc), 5635 Operand::c32(ctx->constant_data_offset)), 5636 Operand::c32(MIN2(base + range, ctx->shader->constant_data_size)), 5637 Operand::c32(desc_type)); 5638 unsigned size = instr->dest.ssa.bit_size / 8; 5639 // TODO: get alignment information for subdword constants 5640 load_buffer(ctx, instr->num_components, size, dst, rsrc, offset, size, 0); 5641} 5642 5643void 5644visit_discard_if(isel_context* ctx, nir_intrinsic_instr* instr) 5645{ 5646 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 5647 ctx->cf_info.exec_potentially_empty_discard = true; 5648 5649 ctx->program->needs_exact = true; 5650 5651 // TODO: optimize uniform conditions 5652 Builder bld(ctx->program, ctx->block); 5653 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 5654 assert(src.regClass() == bld.lm); 5655 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 5656 bld.pseudo(aco_opcode::p_discard_if, src); 5657 ctx->block->kind |= block_kind_uses_discard_if; 5658 return; 5659} 5660 5661void 5662visit_discard(isel_context* ctx, nir_intrinsic_instr* instr) 5663{ 5664 Builder bld(ctx->program, ctx->block); 5665 5666 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 5667 ctx->cf_info.exec_potentially_empty_discard = true; 5668 5669 bool divergent = 5670 ctx->cf_info.parent_if.is_divergent || ctx->cf_info.parent_loop.has_divergent_continue; 5671 5672 if (ctx->block->loop_nest_depth && (nir_instr_is_last(&instr->instr) && !divergent)) { 5673 /* we handle discards the same way as jump instructions */ 5674 append_logical_end(ctx->block); 5675 5676 /* in loops, discard behaves like break */ 5677 Block* linear_target = ctx->cf_info.parent_loop.exit; 5678 ctx->block->kind |= block_kind_discard; 5679 5680 /* uniform discard - loop ends here */ 5681 assert(nir_instr_is_last(&instr->instr)); 5682 ctx->block->kind |= block_kind_uniform; 5683 ctx->cf_info.has_branch = true; 5684 
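      /* No target is given to p_branch here; the linear edge added just below
       * is what identifies the loop exit as this block's successor. */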
bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 5685 add_linear_edge(ctx->block->index, linear_target); 5686 return; 5687 } 5688 5689 /* it can currently happen that NIR doesn't remove the unreachable code */ 5690 if (!nir_instr_is_last(&instr->instr)) { 5691 ctx->program->needs_exact = true; 5692 /* save exec somewhere temporarily so that it doesn't get 5693 * overwritten before the discard from outer exec masks */ 5694 Temp cond = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), 5695 Operand::c32(0xFFFFFFFF), Operand(exec, bld.lm)); 5696 bld.pseudo(aco_opcode::p_discard_if, cond); 5697 ctx->block->kind |= block_kind_uses_discard_if; 5698 return; 5699 } 5700 5701 /* This condition is incorrect for uniformly branched discards in a loop 5702 * predicated by a divergent condition, but the above code catches that case 5703 * and the discard would end up turning into a discard_if. 5704 * For example: 5705 * if (divergent) { 5706 * while (...) { 5707 * if (uniform) { 5708 * discard; 5709 * } 5710 * } 5711 * } 5712 */ 5713 if (!ctx->cf_info.parent_if.is_divergent) { 5714 /* program just ends here */ 5715 ctx->block->kind |= block_kind_uses_discard_if; 5716 bld.pseudo(aco_opcode::p_discard_if, Operand::c32(0xFFFFFFFFu)); 5717 // TODO: it will potentially be followed by a branch which is dead code to sanitize NIR phis 5718 } else { 5719 ctx->block->kind |= block_kind_discard; 5720 /* branch and linear edge is added by visit_if() */ 5721 } 5722} 5723 5724enum aco_descriptor_type { 5725 ACO_DESC_IMAGE, 5726 ACO_DESC_FMASK, 5727 ACO_DESC_SAMPLER, 5728 ACO_DESC_BUFFER, 5729 ACO_DESC_PLANE_0, 5730 ACO_DESC_PLANE_1, 5731 ACO_DESC_PLANE_2, 5732}; 5733 5734static bool 5735should_declare_array(isel_context* ctx, enum glsl_sampler_dim sampler_dim, bool is_array) 5736{ 5737 if (sampler_dim == GLSL_SAMPLER_DIM_BUF) 5738 return false; 5739 ac_image_dim dim = ac_get_sampler_dim(ctx->options->chip_class, sampler_dim, is_array); 5740 return dim == ac_image_cube || dim == ac_image_1darray || dim == ac_image_2darray || 5741 dim == ac_image_2darraymsaa; 5742} 5743 5744Temp 5745get_sampler_desc(isel_context* ctx, nir_deref_instr* deref_instr, 5746 enum aco_descriptor_type desc_type, const nir_tex_instr* tex_instr, bool write) 5747{ 5748 /* FIXME: we should lower the deref with some new nir_intrinsic_load_desc 5749 std::unordered_map<uint64_t, Temp>::iterator it = ctx->tex_desc.find((uint64_t) desc_type << 5750 32 | deref_instr->dest.ssa.index); if (it != ctx->tex_desc.end()) return it->second; 5751 */ 5752 Temp index = Temp(); 5753 bool index_set = false; 5754 unsigned constant_index = 0; 5755 unsigned descriptor_set; 5756 unsigned base_index; 5757 Builder bld(ctx->program, ctx->block); 5758 5759 if (!deref_instr) { 5760 assert(tex_instr); 5761 descriptor_set = 0; 5762 base_index = tex_instr->sampler_index; 5763 } else { 5764 while (deref_instr->deref_type != nir_deref_type_var) { 5765 unsigned array_size = glsl_get_aoa_size(deref_instr->type); 5766 if (!array_size) 5767 array_size = 1; 5768 5769 assert(deref_instr->deref_type == nir_deref_type_array); 5770 nir_const_value* const_value = nir_src_as_const_value(deref_instr->arr.index); 5771 if (const_value) { 5772 constant_index += array_size * const_value->u32; 5773 } else { 5774 Temp indirect = get_ssa_temp(ctx, deref_instr->arr.index.ssa); 5775 if (indirect.type() == RegType::vgpr) 5776 indirect = bld.as_uniform(indirect); 5777 5778 if (array_size != 1) 5779 indirect = 5780 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), 
Operand::c32(array_size), indirect); 5781 5782 if (!index_set) { 5783 index = indirect; 5784 index_set = true; 5785 } else { 5786 index = 5787 bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), index, indirect); 5788 } 5789 } 5790 5791 deref_instr = nir_src_as_deref(deref_instr->parent); 5792 } 5793 descriptor_set = deref_instr->var->data.descriptor_set; 5794 base_index = deref_instr->var->data.binding; 5795 } 5796 5797 Temp list = load_desc_ptr(ctx, descriptor_set); 5798 list = convert_pointer_to_64_bit(ctx, list); 5799 5800 struct radv_descriptor_set_layout* layout = ctx->options->layout->set[descriptor_set].layout; 5801 struct radv_descriptor_set_binding_layout* binding = layout->binding + base_index; 5802 unsigned offset = binding->offset; 5803 unsigned stride = binding->size; 5804 aco_opcode opcode; 5805 RegClass type; 5806 5807 assert(base_index < layout->binding_count); 5808 5809 switch (desc_type) { 5810 case ACO_DESC_IMAGE: 5811 type = s8; 5812 opcode = aco_opcode::s_load_dwordx8; 5813 break; 5814 case ACO_DESC_FMASK: 5815 type = s8; 5816 opcode = aco_opcode::s_load_dwordx8; 5817 offset += 32; 5818 break; 5819 case ACO_DESC_SAMPLER: 5820 type = s4; 5821 opcode = aco_opcode::s_load_dwordx4; 5822 if (binding->type == VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER) 5823 offset += radv_combined_image_descriptor_sampler_offset(binding); 5824 break; 5825 case ACO_DESC_BUFFER: 5826 type = s4; 5827 opcode = aco_opcode::s_load_dwordx4; 5828 break; 5829 case ACO_DESC_PLANE_0: 5830 case ACO_DESC_PLANE_1: 5831 type = s8; 5832 opcode = aco_opcode::s_load_dwordx8; 5833 offset += 32 * (desc_type - ACO_DESC_PLANE_0); 5834 break; 5835 case ACO_DESC_PLANE_2: 5836 type = s4; 5837 opcode = aco_opcode::s_load_dwordx4; 5838 offset += 64; 5839 break; 5840 default: unreachable("invalid desc_type\n"); 5841 } 5842 5843 offset += constant_index * stride; 5844 5845 if (desc_type == ACO_DESC_SAMPLER && binding->immutable_samplers_offset && 5846 (!index_set || binding->immutable_samplers_equal)) { 5847 if (binding->immutable_samplers_equal) 5848 constant_index = 0; 5849 5850 const uint32_t* samplers = radv_immutable_samplers(layout, binding); 5851 uint32_t dword0_mask = tex_instr->op == nir_texop_tg4 ? 
C_008F30_TRUNC_COORD : 0xffffffffu; 5852 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), 5853 Operand::c32(samplers[constant_index * 4 + 0] & dword0_mask), 5854 Operand::c32(samplers[constant_index * 4 + 1]), 5855 Operand::c32(samplers[constant_index * 4 + 2]), 5856 Operand::c32(samplers[constant_index * 4 + 3])); 5857 } 5858 5859 Operand off; 5860 if (!index_set) { 5861 off = bld.copy(bld.def(s1), Operand::c32(offset)); 5862 } else { 5863 off = Operand( 5864 (Temp)bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), Operand::c32(offset), 5865 bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(stride), index))); 5866 } 5867 5868 Temp res = bld.smem(opcode, bld.def(type), list, off); 5869 5870 if (desc_type == ACO_DESC_PLANE_2) { 5871 Temp components[8]; 5872 for (unsigned i = 0; i < 8; i++) 5873 components[i] = bld.tmp(s1); 5874 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), 5875 Definition(components[2]), Definition(components[3]), res); 5876 5877 Temp desc2 = get_sampler_desc(ctx, deref_instr, ACO_DESC_PLANE_1, tex_instr, write); 5878 bld.pseudo(aco_opcode::p_split_vector, bld.def(s1), bld.def(s1), bld.def(s1), bld.def(s1), 5879 Definition(components[4]), Definition(components[5]), Definition(components[6]), 5880 Definition(components[7]), desc2); 5881 5882 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], 5883 components[2], components[3], components[4], components[5], components[6], 5884 components[7]); 5885 } else if (desc_type == ACO_DESC_IMAGE && ctx->options->has_image_load_dcc_bug && !tex_instr && 5886 !write) { 5887 Temp components[8]; 5888 for (unsigned i = 0; i < 8; i++) 5889 components[i] = bld.tmp(s1); 5890 5891 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), 5892 Definition(components[2]), Definition(components[3]), Definition(components[4]), 5893 Definition(components[5]), Definition(components[6]), Definition(components[7]), 5894 res); 5895 5896 /* WRITE_COMPRESS_ENABLE must be 0 for all image loads to workaround a 5897 * hardware bug. 5898 */ 5899 components[6] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[6], 5900 bld.copy(bld.def(s1), Operand::c32(C_00A018_WRITE_COMPRESS_ENABLE))); 5901 5902 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), components[0], components[1], 5903 components[2], components[3], components[4], components[5], components[6], 5904 components[7]); 5905 } else if (desc_type == ACO_DESC_SAMPLER && tex_instr->op == nir_texop_tg4) { 5906 Temp components[4]; 5907 for (unsigned i = 0; i < 4; i++) 5908 components[i] = bld.tmp(s1); 5909 5910 bld.pseudo(aco_opcode::p_split_vector, Definition(components[0]), Definition(components[1]), 5911 Definition(components[2]), Definition(components[3]), res); 5912 5913 /* We want to always use the linear filtering truncation behaviour for 5914 * nir_texop_tg4, even if the sampler uses nearest/point filtering. 
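 * (The s_and_b32 below clears the TRUNC_COORD bit in dword0 of the loaded
 * sampler, matching the dword0_mask applied to immutable samplers above, so
 * that gather coordinate snapping follows the linear-filtering rules.)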
5915 */ 5916 components[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), components[0], 5917 Operand::c32(C_008F30_TRUNC_COORD)); 5918 5919 res = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), components[0], components[1], 5920 components[2], components[3]); 5921 } 5922 5923 return res; 5924} 5925 5926static int 5927image_type_to_components_count(enum glsl_sampler_dim dim, bool array) 5928{ 5929 switch (dim) { 5930 case GLSL_SAMPLER_DIM_BUF: return 1; 5931 case GLSL_SAMPLER_DIM_1D: return array ? 2 : 1; 5932 case GLSL_SAMPLER_DIM_2D: return array ? 3 : 2; 5933 case GLSL_SAMPLER_DIM_MS: return array ? 4 : 3; 5934 case GLSL_SAMPLER_DIM_3D: 5935 case GLSL_SAMPLER_DIM_CUBE: return 3; 5936 case GLSL_SAMPLER_DIM_RECT: 5937 case GLSL_SAMPLER_DIM_SUBPASS: return 2; 5938 case GLSL_SAMPLER_DIM_SUBPASS_MS: return 3; 5939 default: break; 5940 } 5941 return 0; 5942} 5943 5944static MIMG_instruction* 5945emit_mimg(Builder& bld, aco_opcode op, Definition dst, Temp rsrc, Operand samp, 5946 std::vector<Temp> coords, unsigned wqm_mask = 0, Operand vdata = Operand(v1)) 5947{ 5948 /* Limit NSA instructions to 3 dwords on GFX10 to avoid stability issues. */ 5949 unsigned max_nsa_size = bld.program->chip_class >= GFX10_3 ? 13 : 5; 5950 bool use_nsa = bld.program->chip_class >= GFX10 && coords.size() <= max_nsa_size; 5951 5952 if (!use_nsa) { 5953 Temp coord = coords[0]; 5954 if (coords.size() > 1) { 5955 coord = bld.tmp(RegType::vgpr, coords.size()); 5956 5957 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 5958 aco_opcode::p_create_vector, Format::PSEUDO, coords.size(), 1)}; 5959 for (unsigned i = 0; i < coords.size(); i++) 5960 vec->operands[i] = Operand(coords[i]); 5961 vec->definitions[0] = Definition(coord); 5962 bld.insert(std::move(vec)); 5963 } else if (coord.type() == RegType::sgpr) { 5964 coord = bld.copy(bld.def(v1), coord); 5965 } 5966 5967 if (wqm_mask) { 5968 /* We don't need the bias, sample index, compare value or offset to be 5969 * computed in WQM but if the p_create_vector copies the coordinates, then it 5970 * needs to be in WQM. 
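 * On this packed (non-NSA) path a single emit_wqm call on the combined vector
 * is therefore enough; the NSA path below instead applies it per coordinate,
 * only where wqm_mask selects.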
*/ 5971 coord = emit_wqm(bld, coord, bld.tmp(coord.regClass()), true); 5972 } 5973 5974 coords[0] = coord; 5975 coords.resize(1); 5976 } else { 5977 for (unsigned i = 0; i < coords.size(); i++) { 5978 if (wqm_mask & (1u << i)) 5979 coords[i] = emit_wqm(bld, coords[i], bld.tmp(coords[i].regClass()), true); 5980 } 5981 5982 for (Temp& coord : coords) { 5983 if (coord.type() == RegType::sgpr) 5984 coord = bld.copy(bld.def(v1), coord); 5985 } 5986 } 5987 5988 aco_ptr<MIMG_instruction> mimg{ 5989 create_instruction<MIMG_instruction>(op, Format::MIMG, 3 + coords.size(), dst.isTemp())}; 5990 if (dst.isTemp()) 5991 mimg->definitions[0] = dst; 5992 mimg->operands[0] = Operand(rsrc); 5993 mimg->operands[1] = samp; 5994 mimg->operands[2] = vdata; 5995 for (unsigned i = 0; i < coords.size(); i++) 5996 mimg->operands[3 + i] = Operand(coords[i]); 5997 5998 MIMG_instruction* res = mimg.get(); 5999 bld.insert(std::move(mimg)); 6000 return res; 6001} 6002 6003void 6004visit_bvh64_intersect_ray_amd(isel_context* ctx, nir_intrinsic_instr* instr) 6005{ 6006 Builder bld(ctx->program, ctx->block); 6007 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6008 Temp resource = get_ssa_temp(ctx, instr->src[0].ssa); 6009 Temp node = get_ssa_temp(ctx, instr->src[1].ssa); 6010 Temp tmax = get_ssa_temp(ctx, instr->src[2].ssa); 6011 Temp origin = get_ssa_temp(ctx, instr->src[3].ssa); 6012 Temp dir = get_ssa_temp(ctx, instr->src[4].ssa); 6013 Temp inv_dir = get_ssa_temp(ctx, instr->src[5].ssa); 6014 6015 std::vector<Temp> args; 6016 args.push_back(emit_extract_vector(ctx, node, 0, v1)); 6017 args.push_back(emit_extract_vector(ctx, node, 1, v1)); 6018 args.push_back(as_vgpr(ctx, tmax)); 6019 args.push_back(emit_extract_vector(ctx, origin, 0, v1)); 6020 args.push_back(emit_extract_vector(ctx, origin, 1, v1)); 6021 args.push_back(emit_extract_vector(ctx, origin, 2, v1)); 6022 args.push_back(emit_extract_vector(ctx, dir, 0, v1)); 6023 args.push_back(emit_extract_vector(ctx, dir, 1, v1)); 6024 args.push_back(emit_extract_vector(ctx, dir, 2, v1)); 6025 args.push_back(emit_extract_vector(ctx, inv_dir, 0, v1)); 6026 args.push_back(emit_extract_vector(ctx, inv_dir, 1, v1)); 6027 args.push_back(emit_extract_vector(ctx, inv_dir, 2, v1)); 6028 6029 MIMG_instruction* mimg = emit_mimg(bld, aco_opcode::image_bvh64_intersect_ray, Definition(dst), 6030 resource, Operand(s4), args); 6031 mimg->dim = ac_image_1d; 6032 mimg->dmask = 0xf; 6033 mimg->unrm = true; 6034 mimg->r128 = true; 6035} 6036 6037static std::vector<Temp> 6038get_image_coords(isel_context* ctx, const nir_intrinsic_instr* instr) 6039{ 6040 6041 Temp src0 = get_ssa_temp(ctx, instr->src[1].ssa); 6042 enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6043 bool is_array = nir_intrinsic_image_array(instr); 6044 ASSERTED bool add_frag_pos = 6045 (dim == GLSL_SAMPLER_DIM_SUBPASS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); 6046 assert(!add_frag_pos && "Input attachments should be lowered."); 6047 bool is_ms = (dim == GLSL_SAMPLER_DIM_MS || dim == GLSL_SAMPLER_DIM_SUBPASS_MS); 6048 bool gfx9_1d = ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D; 6049 int count = image_type_to_components_count(dim, is_array); 6050 std::vector<Temp> coords(count); 6051 Builder bld(ctx->program, ctx->block); 6052 6053 if (is_ms) 6054 coords[--count] = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[2].ssa), 0, v1); 6055 6056 if (gfx9_1d) { 6057 coords[0] = emit_extract_vector(ctx, src0, 0, v1); 6058 coords.resize(coords.size() + 1); 6059 coords[1] = bld.copy(bld.def(v1), 
Operand::zero()); 6060 if (is_array) 6061 coords[2] = emit_extract_vector(ctx, src0, 1, v1); 6062 } else { 6063 for (int i = 0; i < count; i++) 6064 coords[i] = emit_extract_vector(ctx, src0, i, v1); 6065 } 6066 6067 if (instr->intrinsic == nir_intrinsic_image_deref_load || 6068 instr->intrinsic == nir_intrinsic_image_deref_sparse_load || 6069 instr->intrinsic == nir_intrinsic_image_deref_store) { 6070 int lod_index = instr->intrinsic == nir_intrinsic_image_deref_store ? 4 : 3; 6071 bool level_zero = 6072 nir_src_is_const(instr->src[lod_index]) && nir_src_as_uint(instr->src[lod_index]) == 0; 6073 6074 if (!level_zero) 6075 coords.emplace_back(get_ssa_temp(ctx, instr->src[lod_index].ssa)); 6076 } 6077 6078 return coords; 6079} 6080 6081memory_sync_info 6082get_memory_sync_info(nir_intrinsic_instr* instr, storage_class storage, unsigned semantics) 6083{ 6084 /* atomicrmw might not have NIR_INTRINSIC_ACCESS and there's nothing interesting there anyway */ 6085 if (semantics & semantic_atomicrmw) 6086 return memory_sync_info(storage, semantics); 6087 6088 unsigned access = nir_intrinsic_access(instr); 6089 6090 if (access & ACCESS_VOLATILE) 6091 semantics |= semantic_volatile; 6092 if (access & ACCESS_CAN_REORDER) 6093 semantics |= semantic_can_reorder | semantic_private; 6094 6095 return memory_sync_info(storage, semantics); 6096} 6097 6098Operand 6099emit_tfe_init(Builder& bld, Temp dst) 6100{ 6101 Temp tmp = bld.tmp(dst.regClass()); 6102 6103 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 6104 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 6105 for (unsigned i = 0; i < dst.size(); i++) 6106 vec->operands[i] = Operand::zero(); 6107 vec->definitions[0] = Definition(tmp); 6108 /* Since this is fixed to an instruction's definition register, any CSE will 6109 * just create copies. Copying costs about the same as zero-initialization, 6110 * but these copies can break up clauses. 6111 */ 6112 vec->definitions[0].setNoCSE(true); 6113 bld.insert(std::move(vec)); 6114 6115 return Operand(tmp); 6116} 6117 6118void 6119visit_image_load(isel_context* ctx, nir_intrinsic_instr* instr) 6120{ 6121 Builder bld(ctx->program, ctx->block); 6122 const nir_variable* var = 6123 nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); 6124 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6125 bool is_array = nir_intrinsic_image_array(instr); 6126 bool is_sparse = instr->intrinsic == nir_intrinsic_image_deref_sparse_load; 6127 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6128 6129 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); 6130 unsigned access = var->data.access | nir_intrinsic_access(instr); 6131 6132 unsigned result_size = instr->dest.ssa.num_components - is_sparse; 6133 unsigned expand_mask = 6134 nir_ssa_def_components_read(&instr->dest.ssa) & u_bit_consecutive(0, result_size); 6135 expand_mask = MAX2(expand_mask, 1); /* this can be zero in the case of sparse image loads */ 6136 if (dim == GLSL_SAMPLER_DIM_BUF) 6137 expand_mask = (1u << util_last_bit(expand_mask)) - 1u; 6138 unsigned dmask = expand_mask; 6139 if (instr->dest.ssa.bit_size == 64) { 6140 expand_mask &= 0x9; 6141 /* only R64_UINT and R64_SINT supported. x is in xy of the result, w in zw */ 6142 dmask = ((expand_mask & 0x1) ? 0x3 : 0) | ((expand_mask & 0x8) ? 
0xc : 0); 6143 } 6144 if (is_sparse) 6145 expand_mask |= 1 << result_size; 6146 unsigned num_components = util_bitcount(dmask) + is_sparse; 6147 6148 Temp tmp; 6149 if (num_components == dst.size() && dst.type() == RegType::vgpr) 6150 tmp = dst; 6151 else 6152 tmp = ctx->program->allocateTmp(RegClass(RegType::vgpr, num_components)); 6153 6154 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6155 dim == GLSL_SAMPLER_DIM_BUF ? ACO_DESC_BUFFER : ACO_DESC_IMAGE, 6156 nullptr, false); 6157 6158 if (dim == GLSL_SAMPLER_DIM_BUF) { 6159 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6160 6161 aco_opcode opcode; 6162 switch (util_bitcount(dmask)) { 6163 case 1: opcode = aco_opcode::buffer_load_format_x; break; 6164 case 2: opcode = aco_opcode::buffer_load_format_xy; break; 6165 case 3: opcode = aco_opcode::buffer_load_format_xyz; break; 6166 case 4: opcode = aco_opcode::buffer_load_format_xyzw; break; 6167 default: unreachable(">4 channel buffer image load"); 6168 } 6169 aco_ptr<MUBUF_instruction> load{ 6170 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 3 + is_sparse, 1)}; 6171 load->operands[0] = Operand(resource); 6172 load->operands[1] = Operand(vindex); 6173 load->operands[2] = Operand::c32(0); 6174 load->definitions[0] = Definition(tmp); 6175 load->idxen = true; 6176 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); 6177 load->dlc = load->glc && ctx->options->chip_class >= GFX10; 6178 load->sync = sync; 6179 load->tfe = is_sparse; 6180 if (load->tfe) 6181 load->operands[3] = emit_tfe_init(bld, tmp); 6182 ctx->block->instructions.emplace_back(std::move(load)); 6183 } else { 6184 std::vector<Temp> coords = get_image_coords(ctx, instr); 6185 6186 bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; 6187 aco_opcode opcode = level_zero ? aco_opcode::image_load : aco_opcode::image_load_mip; 6188 6189 Operand vdata = is_sparse ? emit_tfe_init(bld, tmp) : Operand(v1); 6190 MIMG_instruction* load = 6191 emit_mimg(bld, opcode, Definition(tmp), resource, Operand(s4), coords, 0, vdata); 6192 load->glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT) ? 1 : 0; 6193 load->dlc = load->glc && ctx->options->chip_class >= GFX10; 6194 load->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); 6195 load->dmask = dmask; 6196 load->unrm = true; 6197 load->da = should_declare_array(ctx, dim, is_array); 6198 load->sync = sync; 6199 load->tfe = is_sparse; 6200 } 6201 6202 if (is_sparse && instr->dest.ssa.bit_size == 64) { 6203 /* The result components are 64-bit but the sparse residency code is 6204 * 32-bit. So add a zero to the end so expand_vector() works correctly. 
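 * E.g. a sparse 64-bit .x load produces tmp = {x_lo, x_hi, residency}; the
 * appended zero turns this into {x_lo, x_hi, residency, 0}, so the residency
 * dword can be expanded as if it were one more 64-bit component.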
6205 */ 6206 tmp = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, tmp.size() + 1), tmp, 6207 Operand::zero()); 6208 } 6209 6210 expand_vector(ctx, tmp, dst, instr->dest.ssa.num_components, expand_mask); 6211} 6212 6213void 6214visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) 6215{ 6216 const nir_variable* var = 6217 nir_deref_instr_get_variable(nir_instr_as_deref(instr->src[0].ssa->parent_instr)); 6218 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6219 bool is_array = nir_intrinsic_image_array(instr); 6220 Temp data = get_ssa_temp(ctx, instr->src[3].ssa); 6221 6222 /* only R64_UINT and R64_SINT supported */ 6223 if (instr->src[3].ssa->bit_size == 64 && data.bytes() > 8) 6224 data = emit_extract_vector(ctx, data, 0, RegClass(data.type(), 2)); 6225 data = as_vgpr(ctx, data); 6226 6227 memory_sync_info sync = get_memory_sync_info(instr, storage_image, 0); 6228 unsigned access = var->data.access | nir_intrinsic_access(instr); 6229 bool glc = ctx->options->chip_class == GFX6 || 6230 access & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE) 6231 ? 1 6232 : 0; 6233 6234 if (dim == GLSL_SAMPLER_DIM_BUF) { 6235 Temp rsrc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6236 ACO_DESC_BUFFER, nullptr, true); 6237 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6238 aco_opcode opcode; 6239 switch (data.size()) { 6240 case 1: opcode = aco_opcode::buffer_store_format_x; break; 6241 case 2: opcode = aco_opcode::buffer_store_format_xy; break; 6242 case 3: opcode = aco_opcode::buffer_store_format_xyz; break; 6243 case 4: opcode = aco_opcode::buffer_store_format_xyzw; break; 6244 default: unreachable(">4 channel buffer image store"); 6245 } 6246 aco_ptr<MUBUF_instruction> store{ 6247 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; 6248 store->operands[0] = Operand(rsrc); 6249 store->operands[1] = Operand(vindex); 6250 store->operands[2] = Operand::c32(0); 6251 store->operands[3] = Operand(data); 6252 store->idxen = true; 6253 store->glc = glc; 6254 store->dlc = false; 6255 store->disable_wqm = true; 6256 store->sync = sync; 6257 ctx->program->needs_exact = true; 6258 ctx->block->instructions.emplace_back(std::move(store)); 6259 return; 6260 } 6261 6262 assert(data.type() == RegType::vgpr); 6263 std::vector<Temp> coords = get_image_coords(ctx, instr); 6264 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6265 ACO_DESC_IMAGE, nullptr, true); 6266 6267 bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; 6268 aco_opcode opcode = level_zero ? 
aco_opcode::image_store : aco_opcode::image_store_mip; 6269 6270 Builder bld(ctx->program, ctx->block); 6271 MIMG_instruction* store = 6272 emit_mimg(bld, opcode, Definition(), resource, Operand(s4), coords, 0, Operand(data)); 6273 store->glc = glc; 6274 store->dlc = false; 6275 store->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); 6276 store->dmask = (1 << data.size()) - 1; 6277 store->unrm = true; 6278 store->da = should_declare_array(ctx, dim, is_array); 6279 store->disable_wqm = true; 6280 store->sync = sync; 6281 ctx->program->needs_exact = true; 6282 return; 6283} 6284 6285void 6286visit_image_atomic(isel_context* ctx, nir_intrinsic_instr* instr) 6287{ 6288 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 6289 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6290 bool is_array = nir_intrinsic_image_array(instr); 6291 Builder bld(ctx->program, ctx->block); 6292 6293 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[3].ssa)); 6294 bool is_64bit = data.bytes() == 8; 6295 assert((data.bytes() == 4 || data.bytes() == 8) && "only 32/64-bit image atomics implemented."); 6296 6297 if (instr->intrinsic == nir_intrinsic_image_deref_atomic_comp_swap) 6298 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(is_64bit ? v4 : v2), 6299 get_ssa_temp(ctx, instr->src[4].ssa), data); 6300 6301 aco_opcode buf_op, buf_op64, image_op; 6302 switch (instr->intrinsic) { 6303 case nir_intrinsic_image_deref_atomic_add: 6304 buf_op = aco_opcode::buffer_atomic_add; 6305 buf_op64 = aco_opcode::buffer_atomic_add_x2; 6306 image_op = aco_opcode::image_atomic_add; 6307 break; 6308 case nir_intrinsic_image_deref_atomic_umin: 6309 buf_op = aco_opcode::buffer_atomic_umin; 6310 buf_op64 = aco_opcode::buffer_atomic_umin_x2; 6311 image_op = aco_opcode::image_atomic_umin; 6312 break; 6313 case nir_intrinsic_image_deref_atomic_imin: 6314 buf_op = aco_opcode::buffer_atomic_smin; 6315 buf_op64 = aco_opcode::buffer_atomic_smin_x2; 6316 image_op = aco_opcode::image_atomic_smin; 6317 break; 6318 case nir_intrinsic_image_deref_atomic_umax: 6319 buf_op = aco_opcode::buffer_atomic_umax; 6320 buf_op64 = aco_opcode::buffer_atomic_umax_x2; 6321 image_op = aco_opcode::image_atomic_umax; 6322 break; 6323 case nir_intrinsic_image_deref_atomic_imax: 6324 buf_op = aco_opcode::buffer_atomic_smax; 6325 buf_op64 = aco_opcode::buffer_atomic_smax_x2; 6326 image_op = aco_opcode::image_atomic_smax; 6327 break; 6328 case nir_intrinsic_image_deref_atomic_and: 6329 buf_op = aco_opcode::buffer_atomic_and; 6330 buf_op64 = aco_opcode::buffer_atomic_and_x2; 6331 image_op = aco_opcode::image_atomic_and; 6332 break; 6333 case nir_intrinsic_image_deref_atomic_or: 6334 buf_op = aco_opcode::buffer_atomic_or; 6335 buf_op64 = aco_opcode::buffer_atomic_or_x2; 6336 image_op = aco_opcode::image_atomic_or; 6337 break; 6338 case nir_intrinsic_image_deref_atomic_xor: 6339 buf_op = aco_opcode::buffer_atomic_xor; 6340 buf_op64 = aco_opcode::buffer_atomic_xor_x2; 6341 image_op = aco_opcode::image_atomic_xor; 6342 break; 6343 case nir_intrinsic_image_deref_atomic_exchange: 6344 buf_op = aco_opcode::buffer_atomic_swap; 6345 buf_op64 = aco_opcode::buffer_atomic_swap_x2; 6346 image_op = aco_opcode::image_atomic_swap; 6347 break; 6348 case nir_intrinsic_image_deref_atomic_comp_swap: 6349 buf_op = aco_opcode::buffer_atomic_cmpswap; 6350 buf_op64 = aco_opcode::buffer_atomic_cmpswap_x2; 6351 image_op = aco_opcode::image_atomic_cmpswap; 6352 break; 6353 case nir_intrinsic_image_deref_atomic_fmin: 6354 buf_op = 
aco_opcode::buffer_atomic_fmin; 6355 buf_op64 = aco_opcode::buffer_atomic_fmin_x2; 6356 image_op = aco_opcode::image_atomic_fmin; 6357 break; 6358 case nir_intrinsic_image_deref_atomic_fmax: 6359 buf_op = aco_opcode::buffer_atomic_fmax; 6360 buf_op64 = aco_opcode::buffer_atomic_fmax_x2; 6361 image_op = aco_opcode::image_atomic_fmax; 6362 break; 6363 default: 6364 unreachable("visit_image_atomic should only be called with " 6365 "nir_intrinsic_image_deref_atomic_* instructions."); 6366 } 6367 6368 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6369 memory_sync_info sync = get_memory_sync_info(instr, storage_image, semantic_atomicrmw); 6370 6371 if (dim == GLSL_SAMPLER_DIM_BUF) { 6372 Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); 6373 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6374 ACO_DESC_BUFFER, nullptr, true); 6375 // assert(ctx->options->chip_class < GFX9 && "GFX9 stride size workaround not yet 6376 // implemented."); 6377 aco_ptr<MUBUF_instruction> mubuf{create_instruction<MUBUF_instruction>( 6378 is_64bit ? buf_op64 : buf_op, Format::MUBUF, 4, return_previous ? 1 : 0)}; 6379 mubuf->operands[0] = Operand(resource); 6380 mubuf->operands[1] = Operand(vindex); 6381 mubuf->operands[2] = Operand::c32(0); 6382 mubuf->operands[3] = Operand(data); 6383 if (return_previous) 6384 mubuf->definitions[0] = Definition(dst); 6385 mubuf->offset = 0; 6386 mubuf->idxen = true; 6387 mubuf->glc = return_previous; 6388 mubuf->dlc = false; /* Not needed for atomics */ 6389 mubuf->disable_wqm = true; 6390 mubuf->sync = sync; 6391 ctx->program->needs_exact = true; 6392 ctx->block->instructions.emplace_back(std::move(mubuf)); 6393 return; 6394 } 6395 6396 std::vector<Temp> coords = get_image_coords(ctx, instr); 6397 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6398 ACO_DESC_IMAGE, nullptr, true); 6399 Definition def = return_previous ? Definition(dst) : Definition(); 6400 MIMG_instruction* mimg = 6401 emit_mimg(bld, image_op, def, resource, Operand(s4), coords, 0, Operand(data)); 6402 mimg->glc = return_previous; 6403 mimg->dlc = false; /* Not needed for atomics */ 6404 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); 6405 mimg->dmask = (1 << data.size()) - 1; 6406 mimg->unrm = true; 6407 mimg->da = should_declare_array(ctx, dim, is_array); 6408 mimg->disable_wqm = true; 6409 mimg->sync = sync; 6410 ctx->program->needs_exact = true; 6411 return; 6412} 6413 6414void 6415get_buffer_size(isel_context* ctx, Temp desc, Temp dst) 6416{ 6417 if (ctx->options->chip_class == GFX8) { 6418 /* we only have to divide by 1, 2, 4, 8, 12 or 16 */ 6419 Builder bld(ctx->program, ctx->block); 6420 6421 Temp size = emit_extract_vector(ctx, desc, 2, s1); 6422 6423 Temp size_div3 = bld.vop3(aco_opcode::v_mul_hi_u32, bld.def(v1), 6424 bld.copy(bld.def(v1), Operand::c32(0xaaaaaaabu)), size); 6425 size_div3 = bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc), 6426 bld.as_uniform(size_div3), Operand::c32(1u)); 6427 6428 Temp stride = emit_extract_vector(ctx, desc, 1, s1); 6429 stride = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), stride, 6430 Operand::c32((5u << 16) | 16u)); 6431 6432 Temp is12 = bld.sopc(aco_opcode::s_cmp_eq_i32, bld.def(s1, scc), stride, Operand::c32(12u)); 6433 size = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), size_div3, size, bld.scc(is12)); 6434 6435 Temp shr_dst = dst.type() == RegType::vgpr ? 
bld.tmp(s1) : dst; 6436 bld.sop2(aco_opcode::s_lshr_b32, Definition(shr_dst), bld.def(s1, scc), size, 6437 bld.sop1(aco_opcode::s_ff1_i32_b32, bld.def(s1), stride)); 6438 if (dst.type() == RegType::vgpr) 6439 bld.copy(Definition(dst), shr_dst); 6440 6441 /* TODO: we can probably calculate this faster with v_skip when stride != 12 */ 6442 } else { 6443 emit_extract_vector(ctx, desc, 2, dst); 6444 } 6445} 6446 6447void 6448visit_image_size(isel_context* ctx, nir_intrinsic_instr* instr) 6449{ 6450 const enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 6451 bool is_array = nir_intrinsic_image_array(instr); 6452 Builder bld(ctx->program, ctx->block); 6453 6454 if (dim == GLSL_SAMPLER_DIM_BUF) { 6455 Temp desc = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6456 ACO_DESC_BUFFER, NULL, false); 6457 return get_buffer_size(ctx, desc, get_ssa_temp(ctx, &instr->dest.ssa)); 6458 } 6459 6460 /* LOD */ 6461 assert(nir_src_as_uint(instr->src[1]) == 0); 6462 std::vector<Temp> lod{bld.copy(bld.def(v1), Operand::zero())}; 6463 6464 /* Resource */ 6465 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6466 ACO_DESC_IMAGE, NULL, false); 6467 6468 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6469 6470 MIMG_instruction* mimg = 6471 emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(dst), resource, Operand(s4), lod); 6472 uint8_t& dmask = mimg->dmask; 6473 mimg->dim = ac_get_image_dim(ctx->options->chip_class, dim, is_array); 6474 mimg->dmask = (1 << instr->dest.ssa.num_components) - 1; 6475 mimg->da = is_array; 6476 6477 if (ctx->options->chip_class == GFX9 && dim == GLSL_SAMPLER_DIM_1D && is_array) { 6478 assert(instr->dest.ssa.num_components == 2); 6479 dmask = 0x5; 6480 } 6481 6482 emit_split_vector(ctx, dst, instr->dest.ssa.num_components); 6483} 6484 6485void 6486get_image_samples(isel_context* ctx, Definition dst, Temp resource) 6487{ 6488 Builder bld(ctx->program, ctx->block); 6489 6490 Temp dword3 = emit_extract_vector(ctx, resource, 3, s1); 6491 Temp samples_log2 = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, 6492 Operand::c32(16u | 4u << 16)); 6493 Temp samples = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), Operand::c32(1u), 6494 samples_log2); 6495 Temp type = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), dword3, 6496 Operand::c32(28u | 4u << 16 /* offset=28, width=4 */)); 6497 6498 Operand default_sample = Operand::c32(1u); 6499 if (ctx->options->robust_buffer_access) { 6500 /* Extract the second dword of the descriptor, if it's 6501 * all zero, then it's a null descriptor. 
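 * The comparison below yields 0 for a null descriptor, so querying the sample
 * count of a null image returns 0 instead of the non-MSAA default of 1.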
6502 */ 6503 Temp dword1 = emit_extract_vector(ctx, resource, 1, s1); 6504 Temp is_non_null_descriptor = 6505 bld.sopc(aco_opcode::s_cmp_gt_u32, bld.def(s1, scc), dword1, Operand::zero()); 6506 default_sample = Operand(is_non_null_descriptor); 6507 } 6508 6509 Temp is_msaa = bld.sopc(aco_opcode::s_cmp_ge_u32, bld.def(s1, scc), type, Operand::c32(14u)); 6510 bld.sop2(aco_opcode::s_cselect_b32, dst, samples, default_sample, bld.scc(is_msaa)); 6511} 6512 6513void 6514visit_image_samples(isel_context* ctx, nir_intrinsic_instr* instr) 6515{ 6516 Builder bld(ctx->program, ctx->block); 6517 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6518 Temp resource = get_sampler_desc(ctx, nir_instr_as_deref(instr->src[0].ssa->parent_instr), 6519 ACO_DESC_IMAGE, NULL, false); 6520 get_image_samples(ctx, Definition(dst), resource); 6521} 6522 6523void 6524visit_load_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6525{ 6526 Builder bld(ctx->program, ctx->block); 6527 unsigned num_components = instr->num_components; 6528 6529 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6530 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 6531 6532 unsigned access = nir_intrinsic_access(instr); 6533 bool glc = access & (ACCESS_VOLATILE | ACCESS_COHERENT); 6534 unsigned size = instr->dest.ssa.bit_size / 8; 6535 6536 bool allow_smem = access & ACCESS_CAN_REORDER; 6537 6538 load_buffer(ctx, num_components, size, dst, rsrc, get_ssa_temp(ctx, instr->src[1].ssa), 6539 nir_intrinsic_align_mul(instr), nir_intrinsic_align_offset(instr), glc, allow_smem, 6540 get_memory_sync_info(instr, storage_buffer, 0)); 6541} 6542 6543void 6544visit_store_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6545{ 6546 Builder bld(ctx->program, ctx->block); 6547 Temp data = get_ssa_temp(ctx, instr->src[0].ssa); 6548 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 6549 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); 6550 Temp offset = get_ssa_temp(ctx, instr->src[2].ssa); 6551 6552 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 6553 6554 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); 6555 bool glc = 6556 nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); 6557 6558 unsigned write_count = 0; 6559 Temp write_datas[32]; 6560 unsigned offsets[32]; 6561 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, 6562 write_datas, offsets); 6563 6564 /* GFX6-7 are affected by a hw bug that prevents address clamping to work 6565 * correctly when the SGPR offset is used. 6566 */ 6567 if (offset.type() == RegType::sgpr && ctx->options->chip_class < GFX8) 6568 offset = as_vgpr(ctx, offset); 6569 6570 for (unsigned i = 0; i < write_count; i++) { 6571 aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); 6572 6573 aco_ptr<MUBUF_instruction> store{ 6574 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)}; 6575 store->operands[0] = Operand(rsrc); 6576 store->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 6577 store->operands[2] = offset.type() == RegType::sgpr ? 
Operand(offset) : Operand::c32(0); 6578 store->operands[3] = Operand(write_datas[i]); 6579 store->offset = offsets[i]; 6580 store->offen = (offset.type() == RegType::vgpr); 6581 store->glc = glc; 6582 store->dlc = false; 6583 store->disable_wqm = true; 6584 store->sync = sync; 6585 ctx->program->needs_exact = true; 6586 ctx->block->instructions.emplace_back(std::move(store)); 6587 } 6588} 6589 6590void 6591visit_atomic_ssbo(isel_context* ctx, nir_intrinsic_instr* instr) 6592{ 6593 Builder bld(ctx->program, ctx->block); 6594 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 6595 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); 6596 6597 if (instr->intrinsic == nir_intrinsic_ssbo_atomic_comp_swap) 6598 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), 6599 get_ssa_temp(ctx, instr->src[3].ssa), data); 6600 6601 Temp offset = get_ssa_temp(ctx, instr->src[1].ssa); 6602 Temp rsrc = load_buffer_rsrc(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 6603 6604 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6605 6606 aco_opcode op32, op64; 6607 switch (instr->intrinsic) { 6608 case nir_intrinsic_ssbo_atomic_add: 6609 op32 = aco_opcode::buffer_atomic_add; 6610 op64 = aco_opcode::buffer_atomic_add_x2; 6611 break; 6612 case nir_intrinsic_ssbo_atomic_imin: 6613 op32 = aco_opcode::buffer_atomic_smin; 6614 op64 = aco_opcode::buffer_atomic_smin_x2; 6615 break; 6616 case nir_intrinsic_ssbo_atomic_umin: 6617 op32 = aco_opcode::buffer_atomic_umin; 6618 op64 = aco_opcode::buffer_atomic_umin_x2; 6619 break; 6620 case nir_intrinsic_ssbo_atomic_imax: 6621 op32 = aco_opcode::buffer_atomic_smax; 6622 op64 = aco_opcode::buffer_atomic_smax_x2; 6623 break; 6624 case nir_intrinsic_ssbo_atomic_umax: 6625 op32 = aco_opcode::buffer_atomic_umax; 6626 op64 = aco_opcode::buffer_atomic_umax_x2; 6627 break; 6628 case nir_intrinsic_ssbo_atomic_and: 6629 op32 = aco_opcode::buffer_atomic_and; 6630 op64 = aco_opcode::buffer_atomic_and_x2; 6631 break; 6632 case nir_intrinsic_ssbo_atomic_or: 6633 op32 = aco_opcode::buffer_atomic_or; 6634 op64 = aco_opcode::buffer_atomic_or_x2; 6635 break; 6636 case nir_intrinsic_ssbo_atomic_xor: 6637 op32 = aco_opcode::buffer_atomic_xor; 6638 op64 = aco_opcode::buffer_atomic_xor_x2; 6639 break; 6640 case nir_intrinsic_ssbo_atomic_exchange: 6641 op32 = aco_opcode::buffer_atomic_swap; 6642 op64 = aco_opcode::buffer_atomic_swap_x2; 6643 break; 6644 case nir_intrinsic_ssbo_atomic_comp_swap: 6645 op32 = aco_opcode::buffer_atomic_cmpswap; 6646 op64 = aco_opcode::buffer_atomic_cmpswap_x2; 6647 break; 6648 case nir_intrinsic_ssbo_atomic_fmin: 6649 op32 = aco_opcode::buffer_atomic_fmin; 6650 op64 = aco_opcode::buffer_atomic_fmin_x2; 6651 break; 6652 case nir_intrinsic_ssbo_atomic_fmax: 6653 op32 = aco_opcode::buffer_atomic_fmax; 6654 op64 = aco_opcode::buffer_atomic_fmax_x2; 6655 break; 6656 default: 6657 unreachable( 6658 "visit_atomic_ssbo should only be called with nir_intrinsic_ssbo_atomic_* instructions."); 6659 } 6660 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; 6661 aco_ptr<MUBUF_instruction> mubuf{ 6662 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; 6663 mubuf->operands[0] = Operand(rsrc); 6664 mubuf->operands[1] = offset.type() == RegType::vgpr ? Operand(offset) : Operand(v1); 6665 mubuf->operands[2] = offset.type() == RegType::sgpr ? 
Operand(offset) : Operand::c32(0); 6666 mubuf->operands[3] = Operand(data); 6667 if (return_previous) 6668 mubuf->definitions[0] = Definition(dst); 6669 mubuf->offset = 0; 6670 mubuf->offen = (offset.type() == RegType::vgpr); 6671 mubuf->glc = return_previous; 6672 mubuf->dlc = false; /* Not needed for atomics */ 6673 mubuf->disable_wqm = true; 6674 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); 6675 ctx->program->needs_exact = true; 6676 ctx->block->instructions.emplace_back(std::move(mubuf)); 6677} 6678 6679void 6680visit_get_ssbo_size(isel_context* ctx, nir_intrinsic_instr* instr) 6681{ 6682 6683 Temp rsrc = get_ssa_temp(ctx, instr->src[0].ssa); 6684 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6685 bool non_uniform = dst.type() == RegType::vgpr; 6686 6687 Builder bld(ctx->program, ctx->block); 6688 if (non_uniform) { 6689 Temp set_ptr = emit_extract_vector(ctx, rsrc, 0, RegClass(rsrc.type(), 1)); 6690 Temp binding = emit_extract_vector(ctx, rsrc, 1, RegClass(rsrc.type(), 1)); 6691 Temp index = bld.vadd32(bld.def(v1), set_ptr, binding); 6692 index = convert_pointer_to_64_bit(ctx, index, non_uniform); 6693 6694 LoadEmitInfo info = {Operand(index), dst, 1, 4}; 6695 info.align_mul = 4; 6696 info.const_offset = 8; 6697 emit_load(ctx, bld, info, global_load_params); 6698 } else { 6699 emit_extract_vector(ctx, load_buffer_rsrc(ctx, rsrc), 2, dst); 6700 } 6701} 6702 6703void 6704visit_load_global(isel_context* ctx, nir_intrinsic_instr* instr) 6705{ 6706 Builder bld(ctx->program, ctx->block); 6707 unsigned num_components = instr->num_components; 6708 unsigned component_size = instr->dest.ssa.bit_size / 8; 6709 6710 LoadEmitInfo info = {Operand(get_ssa_temp(ctx, instr->src[0].ssa)), 6711 get_ssa_temp(ctx, &instr->dest.ssa), num_components, component_size}; 6712 info.glc = nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT); 6713 info.align_mul = nir_intrinsic_align_mul(instr); 6714 info.align_offset = nir_intrinsic_align_offset(instr); 6715 info.sync = get_memory_sync_info(instr, storage_buffer, 0); 6716 /* VMEM stores don't update the SMEM cache and it's difficult to prove that 6717 * it's safe to use SMEM */ 6718 bool can_use_smem = nir_intrinsic_access(instr) & ACCESS_NON_WRITEABLE; 6719 if (info.dst.type() == RegType::vgpr || (info.glc && ctx->options->chip_class < GFX8) || 6720 !can_use_smem) { 6721 emit_load(ctx, bld, info, global_load_params); 6722 } else { 6723 info.offset = Operand(bld.as_uniform(info.offset)); 6724 emit_load(ctx, bld, info, smem_load_params); 6725 } 6726} 6727 6728void 6729visit_store_global(isel_context* ctx, nir_intrinsic_instr* instr) 6730{ 6731 Builder bld(ctx->program, ctx->block); 6732 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 6733 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); 6734 6735 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 6736 Temp addr = get_ssa_temp(ctx, instr->src[1].ssa); 6737 memory_sync_info sync = get_memory_sync_info(instr, storage_buffer, 0); 6738 bool glc = 6739 nir_intrinsic_access(instr) & (ACCESS_VOLATILE | ACCESS_COHERENT | ACCESS_NON_READABLE); 6740 6741 if (ctx->options->chip_class >= GFX7) 6742 addr = as_vgpr(ctx, addr); 6743 6744 unsigned write_count = 0; 6745 Temp write_datas[32]; 6746 unsigned offsets[32]; 6747 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, 16, &write_count, 6748 write_datas, offsets); 6749 6750 for (unsigned i = 0; i < write_count; i++) { 6751 if (ctx->options->chip_class 
>= GFX7) { 6752 unsigned offset = offsets[i]; 6753 Temp store_addr = addr; 6754 if (offset > 0 && ctx->options->chip_class < GFX9) { 6755 Temp addr0 = bld.tmp(v1), addr1 = bld.tmp(v1); 6756 Temp new_addr0 = bld.tmp(v1), new_addr1 = bld.tmp(v1); 6757 Temp carry = bld.tmp(bld.lm); 6758 bld.pseudo(aco_opcode::p_split_vector, Definition(addr0), Definition(addr1), addr); 6759 6760 bld.vop2(aco_opcode::v_add_co_u32, Definition(new_addr0), 6761 bld.hint_vcc(Definition(carry)), Operand::c32(offset), addr0); 6762 bld.vop2(aco_opcode::v_addc_co_u32, Definition(new_addr1), bld.def(bld.lm), 6763 Operand::zero(), addr1, carry) 6764 .def(1) 6765 .setHint(vcc); 6766 6767 store_addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), new_addr0, new_addr1); 6768 6769 offset = 0; 6770 } 6771 6772 bool global = ctx->options->chip_class >= GFX9; 6773 aco_opcode op; 6774 switch (write_datas[i].bytes()) { 6775 case 1: op = global ? aco_opcode::global_store_byte : aco_opcode::flat_store_byte; break; 6776 case 2: op = global ? aco_opcode::global_store_short : aco_opcode::flat_store_short; break; 6777 case 4: op = global ? aco_opcode::global_store_dword : aco_opcode::flat_store_dword; break; 6778 case 8: 6779 op = global ? aco_opcode::global_store_dwordx2 : aco_opcode::flat_store_dwordx2; 6780 break; 6781 case 12: 6782 op = global ? aco_opcode::global_store_dwordx3 : aco_opcode::flat_store_dwordx3; 6783 break; 6784 case 16: 6785 op = global ? aco_opcode::global_store_dwordx4 : aco_opcode::flat_store_dwordx4; 6786 break; 6787 default: unreachable("store_global not implemented for this size."); 6788 } 6789 6790 aco_ptr<FLAT_instruction> flat{ 6791 create_instruction<FLAT_instruction>(op, global ? Format::GLOBAL : Format::FLAT, 3, 0)}; 6792 flat->operands[0] = Operand(store_addr); 6793 flat->operands[1] = Operand(s1); 6794 flat->operands[2] = Operand(write_datas[i]); 6795 flat->glc = glc; 6796 flat->dlc = false; 6797 flat->offset = offset; 6798 flat->disable_wqm = true; 6799 flat->sync = sync; 6800 ctx->program->needs_exact = true; 6801 ctx->block->instructions.emplace_back(std::move(flat)); 6802 } else { 6803 assert(ctx->options->chip_class == GFX6); 6804 6805 aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); 6806 6807 Temp rsrc = get_gfx6_global_rsrc(bld, addr); 6808 6809 aco_ptr<MUBUF_instruction> mubuf{ 6810 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, 0)}; 6811 mubuf->operands[0] = Operand(rsrc); 6812 mubuf->operands[1] = addr.type() == RegType::vgpr ? 
Operand(addr) : Operand(v1); 6813 mubuf->operands[2] = Operand::zero(); 6814 mubuf->operands[3] = Operand(write_datas[i]); 6815 mubuf->glc = glc; 6816 mubuf->dlc = false; 6817 mubuf->offset = offsets[i]; 6818 mubuf->addr64 = addr.type() == RegType::vgpr; 6819 mubuf->disable_wqm = true; 6820 mubuf->sync = sync; 6821 ctx->program->needs_exact = true; 6822 ctx->block->instructions.emplace_back(std::move(mubuf)); 6823 } 6824 } 6825} 6826 6827void 6828visit_global_atomic(isel_context* ctx, nir_intrinsic_instr* instr) 6829{ 6830 Builder bld(ctx->program, ctx->block); 6831 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 6832 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); 6833 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 6834 6835 if (ctx->options->chip_class >= GFX7) 6836 addr = as_vgpr(ctx, addr); 6837 6838 if (instr->intrinsic == nir_intrinsic_global_atomic_comp_swap) 6839 data = bld.pseudo(aco_opcode::p_create_vector, bld.def(RegType::vgpr, data.size() * 2), 6840 get_ssa_temp(ctx, instr->src[2].ssa), data); 6841 6842 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 6843 6844 aco_opcode op32, op64; 6845 6846 if (ctx->options->chip_class >= GFX7) { 6847 bool global = ctx->options->chip_class >= GFX9; 6848 switch (instr->intrinsic) { 6849 case nir_intrinsic_global_atomic_add: 6850 op32 = global ? aco_opcode::global_atomic_add : aco_opcode::flat_atomic_add; 6851 op64 = global ? aco_opcode::global_atomic_add_x2 : aco_opcode::flat_atomic_add_x2; 6852 break; 6853 case nir_intrinsic_global_atomic_imin: 6854 op32 = global ? aco_opcode::global_atomic_smin : aco_opcode::flat_atomic_smin; 6855 op64 = global ? aco_opcode::global_atomic_smin_x2 : aco_opcode::flat_atomic_smin_x2; 6856 break; 6857 case nir_intrinsic_global_atomic_umin: 6858 op32 = global ? aco_opcode::global_atomic_umin : aco_opcode::flat_atomic_umin; 6859 op64 = global ? aco_opcode::global_atomic_umin_x2 : aco_opcode::flat_atomic_umin_x2; 6860 break; 6861 case nir_intrinsic_global_atomic_imax: 6862 op32 = global ? aco_opcode::global_atomic_smax : aco_opcode::flat_atomic_smax; 6863 op64 = global ? aco_opcode::global_atomic_smax_x2 : aco_opcode::flat_atomic_smax_x2; 6864 break; 6865 case nir_intrinsic_global_atomic_umax: 6866 op32 = global ? aco_opcode::global_atomic_umax : aco_opcode::flat_atomic_umax; 6867 op64 = global ? aco_opcode::global_atomic_umax_x2 : aco_opcode::flat_atomic_umax_x2; 6868 break; 6869 case nir_intrinsic_global_atomic_and: 6870 op32 = global ? aco_opcode::global_atomic_and : aco_opcode::flat_atomic_and; 6871 op64 = global ? aco_opcode::global_atomic_and_x2 : aco_opcode::flat_atomic_and_x2; 6872 break; 6873 case nir_intrinsic_global_atomic_or: 6874 op32 = global ? aco_opcode::global_atomic_or : aco_opcode::flat_atomic_or; 6875 op64 = global ? aco_opcode::global_atomic_or_x2 : aco_opcode::flat_atomic_or_x2; 6876 break; 6877 case nir_intrinsic_global_atomic_xor: 6878 op32 = global ? aco_opcode::global_atomic_xor : aco_opcode::flat_atomic_xor; 6879 op64 = global ? aco_opcode::global_atomic_xor_x2 : aco_opcode::flat_atomic_xor_x2; 6880 break; 6881 case nir_intrinsic_global_atomic_exchange: 6882 op32 = global ? aco_opcode::global_atomic_swap : aco_opcode::flat_atomic_swap; 6883 op64 = global ? aco_opcode::global_atomic_swap_x2 : aco_opcode::flat_atomic_swap_x2; 6884 break; 6885 case nir_intrinsic_global_atomic_comp_swap: 6886 op32 = global ? aco_opcode::global_atomic_cmpswap : aco_opcode::flat_atomic_cmpswap; 6887 op64 = global ? 
aco_opcode::global_atomic_cmpswap_x2 : aco_opcode::flat_atomic_cmpswap_x2; 6888 break; 6889 case nir_intrinsic_global_atomic_fmin: 6890 op32 = global ? aco_opcode::global_atomic_fmin : aco_opcode::flat_atomic_fmin; 6891 op64 = global ? aco_opcode::global_atomic_fmin_x2 : aco_opcode::flat_atomic_fmin_x2; 6892 break; 6893 case nir_intrinsic_global_atomic_fmax: 6894 op32 = global ? aco_opcode::global_atomic_fmax : aco_opcode::flat_atomic_fmax; 6895 op64 = global ? aco_opcode::global_atomic_fmax_x2 : aco_opcode::flat_atomic_fmax_x2; 6896 break; 6897 default: 6898 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " 6899 "instructions."); 6900 } 6901 6902 aco_opcode op = instr->dest.ssa.bit_size == 32 ? op32 : op64; 6903 aco_ptr<FLAT_instruction> flat{create_instruction<FLAT_instruction>( 6904 op, global ? Format::GLOBAL : Format::FLAT, 3, return_previous ? 1 : 0)}; 6905 flat->operands[0] = Operand(addr); 6906 flat->operands[1] = Operand(s1); 6907 flat->operands[2] = Operand(data); 6908 if (return_previous) 6909 flat->definitions[0] = Definition(dst); 6910 flat->glc = return_previous; 6911 flat->dlc = false; /* Not needed for atomics */ 6912 flat->offset = 0; 6913 flat->disable_wqm = true; 6914 flat->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); 6915 ctx->program->needs_exact = true; 6916 ctx->block->instructions.emplace_back(std::move(flat)); 6917 } else { 6918 assert(ctx->options->chip_class == GFX6); 6919 6920 switch (instr->intrinsic) { 6921 case nir_intrinsic_global_atomic_add: 6922 op32 = aco_opcode::buffer_atomic_add; 6923 op64 = aco_opcode::buffer_atomic_add_x2; 6924 break; 6925 case nir_intrinsic_global_atomic_imin: 6926 op32 = aco_opcode::buffer_atomic_smin; 6927 op64 = aco_opcode::buffer_atomic_smin_x2; 6928 break; 6929 case nir_intrinsic_global_atomic_umin: 6930 op32 = aco_opcode::buffer_atomic_umin; 6931 op64 = aco_opcode::buffer_atomic_umin_x2; 6932 break; 6933 case nir_intrinsic_global_atomic_imax: 6934 op32 = aco_opcode::buffer_atomic_smax; 6935 op64 = aco_opcode::buffer_atomic_smax_x2; 6936 break; 6937 case nir_intrinsic_global_atomic_umax: 6938 op32 = aco_opcode::buffer_atomic_umax; 6939 op64 = aco_opcode::buffer_atomic_umax_x2; 6940 break; 6941 case nir_intrinsic_global_atomic_and: 6942 op32 = aco_opcode::buffer_atomic_and; 6943 op64 = aco_opcode::buffer_atomic_and_x2; 6944 break; 6945 case nir_intrinsic_global_atomic_or: 6946 op32 = aco_opcode::buffer_atomic_or; 6947 op64 = aco_opcode::buffer_atomic_or_x2; 6948 break; 6949 case nir_intrinsic_global_atomic_xor: 6950 op32 = aco_opcode::buffer_atomic_xor; 6951 op64 = aco_opcode::buffer_atomic_xor_x2; 6952 break; 6953 case nir_intrinsic_global_atomic_exchange: 6954 op32 = aco_opcode::buffer_atomic_swap; 6955 op64 = aco_opcode::buffer_atomic_swap_x2; 6956 break; 6957 case nir_intrinsic_global_atomic_comp_swap: 6958 op32 = aco_opcode::buffer_atomic_cmpswap; 6959 op64 = aco_opcode::buffer_atomic_cmpswap_x2; 6960 break; 6961 case nir_intrinsic_global_atomic_fmin: 6962 op32 = aco_opcode::buffer_atomic_fmin; 6963 op64 = aco_opcode::buffer_atomic_fmin_x2; 6964 break; 6965 case nir_intrinsic_global_atomic_fmax: 6966 op32 = aco_opcode::buffer_atomic_fmax; 6967 op64 = aco_opcode::buffer_atomic_fmax_x2; 6968 break; 6969 default: 6970 unreachable("visit_atomic_global should only be called with nir_intrinsic_global_atomic_* " 6971 "instructions."); 6972 } 6973 6974 Temp rsrc = get_gfx6_global_rsrc(bld, addr); 6975 6976 aco_opcode op = instr->dest.ssa.bit_size == 32 ? 
op32 : op64; 6977 6978 aco_ptr<MUBUF_instruction> mubuf{ 6979 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 4, return_previous ? 1 : 0)}; 6980 mubuf->operands[0] = Operand(rsrc); 6981 mubuf->operands[1] = addr.type() == RegType::vgpr ? Operand(addr) : Operand(v1); 6982 mubuf->operands[2] = Operand::zero(); 6983 mubuf->operands[3] = Operand(data); 6984 if (return_previous) 6985 mubuf->definitions[0] = Definition(dst); 6986 mubuf->glc = return_previous; 6987 mubuf->dlc = false; 6988 mubuf->offset = 0; 6989 mubuf->addr64 = addr.type() == RegType::vgpr; 6990 mubuf->disable_wqm = true; 6991 mubuf->sync = get_memory_sync_info(instr, storage_buffer, semantic_atomicrmw); 6992 ctx->program->needs_exact = true; 6993 ctx->block->instructions.emplace_back(std::move(mubuf)); 6994 } 6995} 6996 6997void 6998visit_load_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) 6999{ 7000 Builder bld(ctx->program, ctx->block); 7001 7002 Temp dst = get_ssa_temp(ctx, &intrin->dest.ssa); 7003 Temp descriptor = bld.as_uniform(get_ssa_temp(ctx, intrin->src[0].ssa)); 7004 Temp v_offset = as_vgpr(ctx, get_ssa_temp(ctx, intrin->src[1].ssa)); 7005 Temp s_offset = bld.as_uniform(get_ssa_temp(ctx, intrin->src[2].ssa)); 7006 7007 bool swizzled = nir_intrinsic_is_swizzled(intrin); 7008 bool reorder = nir_intrinsic_can_reorder(intrin); 7009 bool slc = nir_intrinsic_slc_amd(intrin); 7010 7011 unsigned const_offset = nir_intrinsic_base(intrin); 7012 unsigned elem_size_bytes = intrin->dest.ssa.bit_size / 8u; 7013 unsigned num_components = intrin->dest.ssa.num_components; 7014 unsigned swizzle_element_size = swizzled ? (ctx->program->chip_class <= GFX8 ? 4 : 16) : 0; 7015 7016 load_vmem_mubuf(ctx, dst, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, 7017 num_components, swizzle_element_size, !swizzled, reorder, slc); 7018} 7019 7020void 7021visit_store_buffer(isel_context* ctx, nir_intrinsic_instr* intrin) 7022{ 7023 Temp store_src = get_ssa_temp(ctx, intrin->src[0].ssa); 7024 Temp descriptor = get_ssa_temp(ctx, intrin->src[1].ssa); 7025 Temp v_offset = get_ssa_temp(ctx, intrin->src[2].ssa); 7026 Temp s_offset = get_ssa_temp(ctx, intrin->src[3].ssa); 7027 7028 bool swizzled = nir_intrinsic_is_swizzled(intrin); 7029 bool slc = nir_intrinsic_slc_amd(intrin); 7030 7031 unsigned const_offset = nir_intrinsic_base(intrin); 7032 unsigned write_mask = nir_intrinsic_write_mask(intrin); 7033 unsigned elem_size_bytes = intrin->src[0].ssa->bit_size / 8u; 7034 7035 nir_variable_mode mem_mode = nir_intrinsic_memory_modes(intrin); 7036 memory_sync_info sync(mem_mode == nir_var_shader_out ? 
storage_vmem_output : storage_none); 7037 7038 store_vmem_mubuf(ctx, store_src, descriptor, v_offset, s_offset, const_offset, elem_size_bytes, 7039 write_mask, !swizzled, sync, slc); 7040} 7041 7042sync_scope 7043translate_nir_scope(nir_scope scope) 7044{ 7045 switch (scope) { 7046 case NIR_SCOPE_NONE: 7047 case NIR_SCOPE_INVOCATION: return scope_invocation; 7048 case NIR_SCOPE_SUBGROUP: return scope_subgroup; 7049 case NIR_SCOPE_WORKGROUP: return scope_workgroup; 7050 case NIR_SCOPE_QUEUE_FAMILY: return scope_queuefamily; 7051 case NIR_SCOPE_DEVICE: return scope_device; 7052 case NIR_SCOPE_SHADER_CALL: return scope_invocation; 7053 } 7054 unreachable("invalid scope"); 7055} 7056 7057void 7058emit_scoped_barrier(isel_context* ctx, nir_intrinsic_instr* instr) 7059{ 7060 Builder bld(ctx->program, ctx->block); 7061 7062 unsigned semantics = 0; 7063 unsigned storage = 0; 7064 sync_scope mem_scope = translate_nir_scope(nir_intrinsic_memory_scope(instr)); 7065 sync_scope exec_scope = translate_nir_scope(nir_intrinsic_execution_scope(instr)); 7066 7067 /* We use shared storage for the following: 7068 * - compute shaders expose it in their API 7069 * - when tessellation is used, TCS and VS I/O is lowered to shared memory 7070 * - when GS is used on GFX9+, VS->GS and TES->GS I/O is lowered to shared memory 7071 * - additionally, when NGG is used on GFX10+, shared memory is used for certain features 7072 */ 7073 bool shared_storage_used = ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::LS || 7074 ctx->stage.hw == HWStage::HS || 7075 (ctx->stage.hw == HWStage::GS && ctx->program->chip_class >= GFX9) || 7076 ctx->stage.hw == HWStage::NGG; 7077 7078 /* Workgroup barriers can hang merged shaders that can potentially have 0 threads in either half. 7079 * They are allowed in CS, TCS, and in any NGG shader. 7080 */ 7081 ASSERTED bool workgroup_scope_allowed = 7082 ctx->stage.hw == HWStage::CS || ctx->stage.hw == HWStage::HS || ctx->stage.hw == HWStage::NGG; 7083 7084 unsigned nir_storage = nir_intrinsic_memory_modes(instr); 7085 if (nir_storage & (nir_var_mem_ssbo | nir_var_mem_global)) 7086 storage |= storage_buffer | storage_image; // TODO: split this when NIR gets nir_var_mem_image 7087 if (shared_storage_used && (nir_storage & nir_var_mem_shared)) 7088 storage |= storage_shared; 7089 7090 unsigned nir_semantics = nir_intrinsic_memory_semantics(instr); 7091 if (nir_semantics & NIR_MEMORY_ACQUIRE) 7092 semantics |= semantic_acquire | semantic_release; 7093 if (nir_semantics & NIR_MEMORY_RELEASE) 7094 semantics |= semantic_acquire | semantic_release; 7095 7096 assert(!(nir_semantics & (NIR_MEMORY_MAKE_AVAILABLE | NIR_MEMORY_MAKE_VISIBLE))); 7097 assert(exec_scope != scope_workgroup || workgroup_scope_allowed); 7098 7099 bld.barrier(aco_opcode::p_barrier, 7100 memory_sync_info((storage_class)storage, (memory_semantics)semantics, mem_scope), 7101 exec_scope); 7102} 7103 7104void 7105visit_load_shared(isel_context* ctx, nir_intrinsic_instr* instr) 7106{ 7107 // TODO: implement sparse reads using ds_read2_b32 and nir_ssa_def_components_read() 7108 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7109 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 7110 Builder bld(ctx->program, ctx->block); 7111 7112 unsigned elem_size_bytes = instr->dest.ssa.bit_size / 8; 7113 unsigned num_components = instr->dest.ssa.num_components; 7114 unsigned align = nir_intrinsic_align_mul(instr) ? 
nir_intrinsic_align(instr) : elem_size_bytes; 7115 load_lds(ctx, elem_size_bytes, num_components, dst, address, nir_intrinsic_base(instr), align); 7116} 7117 7118void 7119visit_store_shared(isel_context* ctx, nir_intrinsic_instr* instr) 7120{ 7121 unsigned writemask = nir_intrinsic_write_mask(instr); 7122 Temp data = get_ssa_temp(ctx, instr->src[0].ssa); 7123 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 7124 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 7125 7126 unsigned align = nir_intrinsic_align_mul(instr) ? nir_intrinsic_align(instr) : elem_size_bytes; 7127 store_lds(ctx, elem_size_bytes, data, writemask, address, nir_intrinsic_base(instr), align); 7128} 7129 7130void 7131visit_shared_atomic(isel_context* ctx, nir_intrinsic_instr* instr) 7132{ 7133 unsigned offset = nir_intrinsic_base(instr); 7134 Builder bld(ctx->program, ctx->block); 7135 Operand m = load_lds_size_m0(bld); 7136 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 7137 Temp address = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 7138 7139 unsigned num_operands = 3; 7140 aco_opcode op32, op64, op32_rtn, op64_rtn; 7141 switch (instr->intrinsic) { 7142 case nir_intrinsic_shared_atomic_add: 7143 op32 = aco_opcode::ds_add_u32; 7144 op64 = aco_opcode::ds_add_u64; 7145 op32_rtn = aco_opcode::ds_add_rtn_u32; 7146 op64_rtn = aco_opcode::ds_add_rtn_u64; 7147 break; 7148 case nir_intrinsic_shared_atomic_imin: 7149 op32 = aco_opcode::ds_min_i32; 7150 op64 = aco_opcode::ds_min_i64; 7151 op32_rtn = aco_opcode::ds_min_rtn_i32; 7152 op64_rtn = aco_opcode::ds_min_rtn_i64; 7153 break; 7154 case nir_intrinsic_shared_atomic_umin: 7155 op32 = aco_opcode::ds_min_u32; 7156 op64 = aco_opcode::ds_min_u64; 7157 op32_rtn = aco_opcode::ds_min_rtn_u32; 7158 op64_rtn = aco_opcode::ds_min_rtn_u64; 7159 break; 7160 case nir_intrinsic_shared_atomic_imax: 7161 op32 = aco_opcode::ds_max_i32; 7162 op64 = aco_opcode::ds_max_i64; 7163 op32_rtn = aco_opcode::ds_max_rtn_i32; 7164 op64_rtn = aco_opcode::ds_max_rtn_i64; 7165 break; 7166 case nir_intrinsic_shared_atomic_umax: 7167 op32 = aco_opcode::ds_max_u32; 7168 op64 = aco_opcode::ds_max_u64; 7169 op32_rtn = aco_opcode::ds_max_rtn_u32; 7170 op64_rtn = aco_opcode::ds_max_rtn_u64; 7171 break; 7172 case nir_intrinsic_shared_atomic_and: 7173 op32 = aco_opcode::ds_and_b32; 7174 op64 = aco_opcode::ds_and_b64; 7175 op32_rtn = aco_opcode::ds_and_rtn_b32; 7176 op64_rtn = aco_opcode::ds_and_rtn_b64; 7177 break; 7178 case nir_intrinsic_shared_atomic_or: 7179 op32 = aco_opcode::ds_or_b32; 7180 op64 = aco_opcode::ds_or_b64; 7181 op32_rtn = aco_opcode::ds_or_rtn_b32; 7182 op64_rtn = aco_opcode::ds_or_rtn_b64; 7183 break; 7184 case nir_intrinsic_shared_atomic_xor: 7185 op32 = aco_opcode::ds_xor_b32; 7186 op64 = aco_opcode::ds_xor_b64; 7187 op32_rtn = aco_opcode::ds_xor_rtn_b32; 7188 op64_rtn = aco_opcode::ds_xor_rtn_b64; 7189 break; 7190 case nir_intrinsic_shared_atomic_exchange: 7191 op32 = aco_opcode::ds_write_b32; 7192 op64 = aco_opcode::ds_write_b64; 7193 op32_rtn = aco_opcode::ds_wrxchg_rtn_b32; 7194 op64_rtn = aco_opcode::ds_wrxchg_rtn_b64; 7195 break; 7196 case nir_intrinsic_shared_atomic_comp_swap: 7197 op32 = aco_opcode::ds_cmpst_b32; 7198 op64 = aco_opcode::ds_cmpst_b64; 7199 op32_rtn = aco_opcode::ds_cmpst_rtn_b32; 7200 op64_rtn = aco_opcode::ds_cmpst_rtn_b64; 7201 num_operands = 4; 7202 break; 7203 case nir_intrinsic_shared_atomic_fadd: 7204 op32 = aco_opcode::ds_add_f32; 7205 op32_rtn = aco_opcode::ds_add_rtn_f32; 7206 op64 = aco_opcode::num_opcodes; 7207 
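      /* the 64-bit LDS float-add variant is not handled here; num_opcodes
       * marks it as unsupported */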
op64_rtn = aco_opcode::num_opcodes; 7208 break; 7209 case nir_intrinsic_shared_atomic_fmin: 7210 op32 = aco_opcode::ds_min_f32; 7211 op32_rtn = aco_opcode::ds_min_rtn_f32; 7212 op64 = aco_opcode::ds_min_f64; 7213 op64_rtn = aco_opcode::ds_min_rtn_f64; 7214 break; 7215 case nir_intrinsic_shared_atomic_fmax: 7216 op32 = aco_opcode::ds_max_f32; 7217 op32_rtn = aco_opcode::ds_max_rtn_f32; 7218 op64 = aco_opcode::ds_max_f64; 7219 op64_rtn = aco_opcode::ds_max_rtn_f64; 7220 break; 7221 default: unreachable("Unhandled shared atomic intrinsic"); 7222 } 7223 7224 bool return_previous = !nir_ssa_def_is_unused(&instr->dest.ssa); 7225 7226 aco_opcode op; 7227 if (data.size() == 1) { 7228 assert(instr->dest.ssa.bit_size == 32); 7229 op = return_previous ? op32_rtn : op32; 7230 } else { 7231 assert(instr->dest.ssa.bit_size == 64); 7232 op = return_previous ? op64_rtn : op64; 7233 } 7234 7235 if (offset > 65535) { 7236 address = bld.vadd32(bld.def(v1), Operand::c32(offset), address); 7237 offset = 0; 7238 } 7239 7240 aco_ptr<DS_instruction> ds; 7241 ds.reset( 7242 create_instruction<DS_instruction>(op, Format::DS, num_operands, return_previous ? 1 : 0)); 7243 ds->operands[0] = Operand(address); 7244 ds->operands[1] = Operand(data); 7245 if (num_operands == 4) { 7246 Temp data2 = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)); 7247 ds->operands[2] = Operand(data2); 7248 } 7249 ds->operands[num_operands - 1] = m; 7250 ds->offset0 = offset; 7251 if (return_previous) 7252 ds->definitions[0] = Definition(get_ssa_temp(ctx, &instr->dest.ssa)); 7253 ds->sync = memory_sync_info(storage_shared, semantic_atomicrmw); 7254 7255 if (m.isUndefined()) 7256 ds->operands.pop_back(); 7257 7258 ctx->block->instructions.emplace_back(std::move(ds)); 7259} 7260 7261Temp 7262get_scratch_resource(isel_context* ctx) 7263{ 7264 Builder bld(ctx->program, ctx->block); 7265 Temp scratch_addr = ctx->program->private_segment_buffer; 7266 if (ctx->stage != compute_cs) 7267 scratch_addr = 7268 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), scratch_addr, Operand::zero()); 7269 7270 uint32_t rsrc_conf = 7271 S_008F0C_ADD_TID_ENABLE(1) | S_008F0C_INDEX_STRIDE(ctx->program->wave_size == 64 ? 3 : 2); 7272 7273 if (ctx->program->chip_class >= GFX10) { 7274 rsrc_conf |= S_008F0C_FORMAT(V_008F0C_GFX10_FORMAT_32_FLOAT) | 7275 S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); 7276 } else if (ctx->program->chip_class <= 7277 GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ 7278 rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 7279 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 7280 } 7281 7282 /* older generations need element size = 4 bytes. 
element size removed in GFX9 */ 7283 if (ctx->program->chip_class <= GFX8) 7284 rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); 7285 7286 return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), scratch_addr, Operand::c32(-1u), 7287 Operand::c32(rsrc_conf)); 7288} 7289 7290void 7291visit_load_scratch(isel_context* ctx, nir_intrinsic_instr* instr) 7292{ 7293 Builder bld(ctx->program, ctx->block); 7294 Temp rsrc = get_scratch_resource(ctx); 7295 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 7296 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7297 7298 LoadEmitInfo info = {Operand(offset), dst, instr->dest.ssa.num_components, 7299 instr->dest.ssa.bit_size / 8u, rsrc}; 7300 info.align_mul = nir_intrinsic_align_mul(instr); 7301 info.align_offset = nir_intrinsic_align_offset(instr); 7302 info.swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 0; 7303 info.sync = memory_sync_info(storage_scratch, semantic_private); 7304 info.soffset = ctx->program->scratch_offset; 7305 emit_load(ctx, bld, info, scratch_load_params); 7306} 7307 7308void 7309visit_store_scratch(isel_context* ctx, nir_intrinsic_instr* instr) 7310{ 7311 Builder bld(ctx->program, ctx->block); 7312 Temp rsrc = get_scratch_resource(ctx); 7313 Temp data = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 7314 Temp offset = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)); 7315 7316 unsigned elem_size_bytes = instr->src[0].ssa->bit_size / 8; 7317 unsigned writemask = widen_mask(nir_intrinsic_write_mask(instr), elem_size_bytes); 7318 7319 unsigned write_count = 0; 7320 Temp write_datas[32]; 7321 unsigned offsets[32]; 7322 unsigned swizzle_component_size = ctx->program->chip_class <= GFX8 ? 4 : 16; 7323 split_buffer_store(ctx, instr, false, RegType::vgpr, data, writemask, swizzle_component_size, 7324 &write_count, write_datas, offsets); 7325 7326 for (unsigned i = 0; i < write_count; i++) { 7327 aco_opcode op = get_buffer_store_op(write_datas[i].bytes()); 7328 Instruction* mubuf = bld.mubuf(op, rsrc, offset, ctx->program->scratch_offset, write_datas[i], 7329 offsets[i], true, true); 7330 mubuf->mubuf().sync = memory_sync_info(storage_scratch, semantic_private); 7331 } 7332} 7333 7334void 7335visit_load_sample_mask_in(isel_context* ctx, nir_intrinsic_instr* instr) 7336{ 7337 uint8_t log2_ps_iter_samples; 7338 if (ctx->program->info->ps.uses_sample_shading) { 7339 log2_ps_iter_samples = util_logbase2(ctx->options->key.ps.num_samples); 7340 } else { 7341 log2_ps_iter_samples = ctx->options->key.ps.log2_ps_iter_samples; 7342 } 7343 7344 Builder bld(ctx->program, ctx->block); 7345 7346 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7347 7348 if (log2_ps_iter_samples) { 7349 /* gl_SampleMaskIn[0] = (SampleCoverage & (1 << gl_SampleID)). 
*/ 7350 Temp sample_id = 7351 bld.vop3(aco_opcode::v_bfe_u32, bld.def(v1), get_arg(ctx, ctx->args->ac.ancillary), 7352 Operand::c32(8u), Operand::c32(4u)); 7353 Temp mask = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), sample_id, 7354 bld.copy(bld.def(v1), Operand::c32(1u))); 7355 bld.vop2(aco_opcode::v_and_b32, Definition(dst), mask, 7356 get_arg(ctx, ctx->args->ac.sample_coverage)); 7357 } else { 7358 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.sample_coverage)); 7359 } 7360} 7361 7362void 7363visit_emit_vertex_with_counter(isel_context* ctx, nir_intrinsic_instr* instr) 7364{ 7365 Builder bld(ctx->program, ctx->block); 7366 7367 unsigned stream = nir_intrinsic_stream_id(instr); 7368 Temp next_vertex = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 7369 next_vertex = bld.v_mul_imm(bld.def(v1), next_vertex, 4u); 7370 nir_const_value* next_vertex_cv = nir_src_as_const_value(instr->src[0]); 7371 7372 /* get GSVS ring */ 7373 Temp gsvs_ring = 7374 bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), ctx->program->private_segment_buffer, 7375 Operand::c32(RING_GSVS_GS * 16u)); 7376 7377 unsigned num_components = ctx->program->info->gs.num_stream_output_components[stream]; 7378 7379 unsigned stride = 4u * num_components * ctx->shader->info.gs.vertices_out; 7380 unsigned stream_offset = 0; 7381 for (unsigned i = 0; i < stream; i++) { 7382 unsigned prev_stride = 4u * ctx->program->info->gs.num_stream_output_components[i] * 7383 ctx->shader->info.gs.vertices_out; 7384 stream_offset += prev_stride * ctx->program->wave_size; 7385 } 7386 7387 /* Limit on the stride field for <= GFX7. */ 7388 assert(stride < (1 << 14)); 7389 7390 Temp gsvs_dwords[4]; 7391 for (unsigned i = 0; i < 4; i++) 7392 gsvs_dwords[i] = bld.tmp(s1); 7393 bld.pseudo(aco_opcode::p_split_vector, Definition(gsvs_dwords[0]), Definition(gsvs_dwords[1]), 7394 Definition(gsvs_dwords[2]), Definition(gsvs_dwords[3]), gsvs_ring); 7395 7396 if (stream_offset) { 7397 Temp stream_offset_tmp = bld.copy(bld.def(s1), Operand::c32(stream_offset)); 7398 7399 Temp carry = bld.tmp(s1); 7400 gsvs_dwords[0] = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.scc(Definition(carry)), 7401 gsvs_dwords[0], stream_offset_tmp); 7402 gsvs_dwords[1] = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), 7403 gsvs_dwords[1], Operand::zero(), bld.scc(carry)); 7404 } 7405 7406 gsvs_dwords[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), gsvs_dwords[1], 7407 Operand::c32(S_008F04_STRIDE(stride))); 7408 gsvs_dwords[2] = bld.copy(bld.def(s1), Operand::c32(ctx->program->wave_size)); 7409 7410 gsvs_ring = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), gsvs_dwords[0], gsvs_dwords[1], 7411 gsvs_dwords[2], gsvs_dwords[3]); 7412 7413 unsigned offset = 0; 7414 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; i++) { 7415 if (ctx->program->info->gs.output_streams[i] != stream) 7416 continue; 7417 7418 for (unsigned j = 0; j < 4; j++) { 7419 if (!(ctx->program->info->gs.output_usage_mask[i] & (1 << j))) 7420 continue; 7421 7422 if (ctx->outputs.mask[i] & (1 << j)) { 7423 Operand vaddr_offset = next_vertex_cv ? Operand(v1) : Operand(next_vertex); 7424 unsigned const_offset = (offset + (next_vertex_cv ? 
next_vertex_cv->u32 : 0u)) * 4u; 7425 if (const_offset >= 4096u) { 7426 if (vaddr_offset.isUndefined()) 7427 vaddr_offset = bld.copy(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u)); 7428 else 7429 vaddr_offset = bld.vadd32(bld.def(v1), Operand::c32(const_offset / 4096u * 4096u), 7430 vaddr_offset); 7431 const_offset %= 4096u; 7432 } 7433 7434 aco_ptr<MTBUF_instruction> mtbuf{create_instruction<MTBUF_instruction>( 7435 aco_opcode::tbuffer_store_format_x, Format::MTBUF, 4, 0)}; 7436 mtbuf->operands[0] = Operand(gsvs_ring); 7437 mtbuf->operands[1] = vaddr_offset; 7438 mtbuf->operands[2] = Operand(get_arg(ctx, ctx->args->ac.gs2vs_offset)); 7439 mtbuf->operands[3] = Operand(ctx->outputs.temps[i * 4u + j]); 7440 mtbuf->offen = !vaddr_offset.isUndefined(); 7441 mtbuf->dfmt = V_008F0C_BUF_DATA_FORMAT_32; 7442 mtbuf->nfmt = V_008F0C_BUF_NUM_FORMAT_UINT; 7443 mtbuf->offset = const_offset; 7444 mtbuf->glc = true; 7445 mtbuf->slc = true; 7446 mtbuf->sync = memory_sync_info(storage_vmem_output, semantic_can_reorder); 7447 bld.insert(std::move(mtbuf)); 7448 } 7449 7450 offset += ctx->shader->info.gs.vertices_out; 7451 } 7452 7453 /* outputs for the next vertex are undefined and keeping them around can 7454 * create invalid IR with control flow */ 7455 ctx->outputs.mask[i] = 0; 7456 } 7457 7458 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, sendmsg_gs(false, true, stream)); 7459} 7460 7461Temp 7462emit_boolean_reduce(isel_context* ctx, nir_op op, unsigned cluster_size, Temp src) 7463{ 7464 Builder bld(ctx->program, ctx->block); 7465 7466 if (cluster_size == 1) { 7467 return src; 7468 } 7469 if (op == nir_op_iand && cluster_size == 4) { 7470 /* subgroupClusteredAnd(val, 4) -> ~wqm(exec & ~val) */ 7471 Temp tmp = 7472 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src); 7473 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), 7474 bld.sop1(Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), tmp)); 7475 } else if (op == nir_op_ior && cluster_size == 4) { 7476 /* subgroupClusteredOr(val, 4) -> wqm(val & exec) */ 7477 return bld.sop1( 7478 Builder::s_wqm, bld.def(bld.lm), bld.def(s1, scc), 7479 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm))); 7480 } else if (op == nir_op_iand && cluster_size == ctx->program->wave_size) { 7481 /* subgroupAnd(val) -> (exec & ~val) == 0 */ 7482 Temp tmp = 7483 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) 7484 .def(1) 7485 .getTemp(); 7486 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); 7487 return bld.sop1(Builder::s_not, bld.def(bld.lm), bld.def(s1, scc), cond); 7488 } else if (op == nir_op_ior && cluster_size == ctx->program->wave_size) { 7489 /* subgroupOr(val) -> (val & exec) != 0 */ 7490 Temp tmp = 7491 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)) 7492 .def(1) 7493 .getTemp(); 7494 return bool_to_vector_condition(ctx, tmp); 7495 } else if (op == nir_op_ixor && cluster_size == ctx->program->wave_size) { 7496 /* subgroupXor(val) -> s_bcnt1_i32_b64(val & exec) & 1 */ 7497 Temp tmp = 7498 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 7499 tmp = bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), tmp); 7500 tmp = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), tmp, Operand::c32(1u)) 7501 .def(1) 7502 .getTemp(); 7503 return bool_to_vector_condition(ctx, tmp); 7504 } else { 7505 /* 
subgroupClustered{And,Or,Xor}(val, n):
7506 * lane_id = v_mbcnt_hi_u32_b32(-1, v_mbcnt_lo_u32_b32(-1, 0)) (just v_mbcnt_lo on wave32)
7507 * cluster_offset = ~(n - 1) & lane_id
 * cluster_mask = ((1 << n) - 1)
7508 * subgroupClusteredAnd():
7509 * return ((val | ~exec) >> cluster_offset) & cluster_mask == cluster_mask
7510 * subgroupClusteredOr():
7511 * return ((val & exec) >> cluster_offset) & cluster_mask != 0
7512 * subgroupClusteredXor():
7513 * return v_bcnt_u32_b32(((val & exec) >> cluster_offset) & cluster_mask, 0) & 1 != 0
7514 */
7515 Temp lane_id = emit_mbcnt(ctx, bld.tmp(v1));
7516 Temp cluster_offset = bld.vop2(aco_opcode::v_and_b32, bld.def(v1),
7517 Operand::c32(~uint32_t(cluster_size - 1)), lane_id);
7518
7519 Temp tmp;
7520 if (op == nir_op_iand)
7521 tmp = bld.sop2(Builder::s_orn2, bld.def(bld.lm), bld.def(s1, scc), src,
7522 Operand(exec, bld.lm));
7523 else
7524 tmp =
7525 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7526
7527 uint32_t cluster_mask = cluster_size == 32 ? -1 : (1u << cluster_size) - 1u;
7528
7529 if (ctx->program->chip_class <= GFX7)
7530 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), tmp, cluster_offset);
7531 else if (ctx->program->wave_size == 64)
7532 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), cluster_offset, tmp);
7533 else
7534 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), cluster_offset, tmp);
7535 tmp = emit_extract_vector(ctx, tmp, 0, v1);
7536 if (cluster_mask != 0xffffffff)
7537 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(cluster_mask), tmp);
7538
7539 if (op == nir_op_iand) {
7540 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::c32(cluster_mask),
7541 tmp);
7542 } else if (op == nir_op_ior) {
7543 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7544 } else if (op == nir_op_ixor) {
7545 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u),
7546 bld.vop3(aco_opcode::v_bcnt_u32_b32, bld.def(v1), tmp, Operand::zero()));
7547 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), tmp);
7548 }
7549 assert(false);
7550 return Temp();
7551 }
7552}
7553
7554Temp
7555emit_boolean_exclusive_scan(isel_context* ctx, nir_op op, Temp src)
7556{
7557 Builder bld(ctx->program, ctx->block);
7558 assert(src.regClass() == bld.lm);
7559
7560 /* subgroupExclusiveAnd(val) -> mbcnt(exec & ~val) == 0
7561 * subgroupExclusiveOr(val) -> mbcnt(val & exec) != 0
7562 * subgroupExclusiveXor(val) -> mbcnt(val & exec) & 1 != 0
7563 */
7564 Temp tmp;
7565 if (op == nir_op_iand)
7566 tmp =
7567 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src);
7568 else
7569 tmp = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm));
7570
7571 Temp mbcnt = emit_mbcnt(ctx, bld.tmp(v1), Operand(tmp));
7572
7573 if (op == nir_op_iand)
7574 return bld.vopc(aco_opcode::v_cmp_eq_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7575 else if (op == nir_op_ior)
7576 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(), mbcnt);
7577 else if (op == nir_op_ixor)
7578 return bld.vopc(aco_opcode::v_cmp_lg_u32, bld.hint_vcc(bld.lm), Operand::zero(),
7579 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), mbcnt));
7580
7581 assert(false);
7582 return Temp();
7583}
7584
7585Temp
7586emit_boolean_inclusive_scan(isel_context* ctx, nir_op op, Temp src)
7587{
7588 Builder bld(ctx->program, ctx->block);
7589
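/* Added illustrative example (not from the original source): with per-lane
 * input val = {0, 1, 0, 0}, subgroupExclusiveOr() yields {0, 0, 1, 1}; OR-ing
 * val back in gives the inclusive result {0, 1, 1, 1}, which is what the
 * identities below compute with one extra SALU instruction. */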
7590 /* subgroupInclusiveAnd(val) -> subgroupExclusiveAnd(val) && val 7591 * subgroupInclusiveOr(val) -> subgroupExclusiveOr(val) || val 7592 * subgroupInclusiveXor(val) -> subgroupExclusiveXor(val) ^^ val 7593 */ 7594 Temp tmp = emit_boolean_exclusive_scan(ctx, op, src); 7595 if (op == nir_op_iand) 7596 return bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), tmp, src); 7597 else if (op == nir_op_ior) 7598 return bld.sop2(Builder::s_or, bld.def(bld.lm), bld.def(s1, scc), tmp, src); 7599 else if (op == nir_op_ixor) 7600 return bld.sop2(Builder::s_xor, bld.def(bld.lm), bld.def(s1, scc), tmp, src); 7601 7602 assert(false); 7603 return Temp(); 7604} 7605 7606ReduceOp 7607get_reduce_op(nir_op op, unsigned bit_size) 7608{ 7609 switch (op) { 7610#define CASEI(name) \ 7611 case nir_op_##name: \ 7612 return (bit_size == 32) ? name##32 \ 7613 : (bit_size == 16) ? name##16 \ 7614 : (bit_size == 8) ? name##8 \ 7615 : name##64; 7616#define CASEF(name) \ 7617 case nir_op_##name: return (bit_size == 32) ? name##32 : (bit_size == 16) ? name##16 : name##64; 7618 CASEI(iadd) 7619 CASEI(imul) 7620 CASEI(imin) 7621 CASEI(umin) 7622 CASEI(imax) 7623 CASEI(umax) 7624 CASEI(iand) 7625 CASEI(ior) 7626 CASEI(ixor) 7627 CASEF(fadd) 7628 CASEF(fmul) 7629 CASEF(fmin) 7630 CASEF(fmax) 7631 default: unreachable("unknown reduction op"); 7632#undef CASEI 7633#undef CASEF 7634 } 7635} 7636 7637void 7638emit_uniform_subgroup(isel_context* ctx, nir_intrinsic_instr* instr, Temp src) 7639{ 7640 Builder bld(ctx->program, ctx->block); 7641 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); 7642 assert(dst.regClass().type() != RegType::vgpr); 7643 if (src.regClass().type() == RegType::vgpr) 7644 bld.pseudo(aco_opcode::p_as_uniform, dst, src); 7645 else 7646 bld.copy(dst, src); 7647} 7648 7649void 7650emit_addition_uniform_reduce(isel_context* ctx, nir_op op, Definition dst, nir_src src, Temp count) 7651{ 7652 Builder bld(ctx->program, ctx->block); 7653 Temp src_tmp = get_ssa_temp(ctx, src.ssa); 7654 7655 if (op == nir_op_fadd) { 7656 src_tmp = as_vgpr(ctx, src_tmp); 7657 Temp tmp = dst.regClass() == s1 ? 
bld.tmp(src_tmp.regClass()) : dst.getTemp(); 7658 7659 if (src.ssa->bit_size == 16) { 7660 count = bld.vop1(aco_opcode::v_cvt_f16_u16, bld.def(v2b), count); 7661 bld.vop2(aco_opcode::v_mul_f16, Definition(tmp), count, src_tmp); 7662 } else { 7663 assert(src.ssa->bit_size == 32); 7664 count = bld.vop1(aco_opcode::v_cvt_f32_u32, bld.def(v1), count); 7665 bld.vop2(aco_opcode::v_mul_f32, Definition(tmp), count, src_tmp); 7666 } 7667 7668 if (tmp != dst.getTemp()) 7669 bld.pseudo(aco_opcode::p_as_uniform, dst, tmp); 7670 7671 return; 7672 } 7673 7674 if (dst.regClass() == s1) 7675 src_tmp = bld.as_uniform(src_tmp); 7676 7677 if (op == nir_op_ixor && count.type() == RegType::sgpr) 7678 count = 7679 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), count, Operand::c32(1u)); 7680 else if (op == nir_op_ixor) 7681 count = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), count); 7682 7683 assert(dst.getTemp().type() == count.type()); 7684 7685 if (nir_src_is_const(src)) { 7686 if (nir_src_as_uint(src) == 1 && dst.bytes() <= 2) 7687 bld.pseudo(aco_opcode::p_extract_vector, dst, count, Operand::zero()); 7688 else if (nir_src_as_uint(src) == 1) 7689 bld.copy(dst, count); 7690 else if (nir_src_as_uint(src) == 0 && dst.bytes() <= 2) 7691 bld.vop1(aco_opcode::v_mov_b32, dst, Operand::zero()); /* RA will use SDWA if possible */ 7692 else if (nir_src_as_uint(src) == 0) 7693 bld.copy(dst, Operand::zero()); 7694 else if (count.type() == RegType::vgpr) 7695 bld.v_mul_imm(dst, count, nir_src_as_uint(src)); 7696 else 7697 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count); 7698 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX10) { 7699 bld.vop3(aco_opcode::v_mul_lo_u16_e64, dst, src_tmp, count); 7700 } else if (dst.bytes() <= 2 && ctx->program->chip_class >= GFX8) { 7701 bld.vop2(aco_opcode::v_mul_lo_u16, dst, src_tmp, count); 7702 } else if (dst.getTemp().type() == RegType::vgpr) { 7703 bld.vop3(aco_opcode::v_mul_lo_u32, dst, src_tmp, count); 7704 } else { 7705 bld.sop2(aco_opcode::s_mul_i32, dst, src_tmp, count); 7706 } 7707} 7708 7709bool 7710emit_uniform_reduce(isel_context* ctx, nir_intrinsic_instr* instr) 7711{ 7712 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); 7713 if (op == nir_op_imul || op == nir_op_fmul) 7714 return false; 7715 7716 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) { 7717 Builder bld(ctx->program, ctx->block); 7718 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); 7719 unsigned bit_size = instr->src[0].ssa->bit_size; 7720 if (bit_size > 32) 7721 return false; 7722 7723 Temp thread_count = 7724 bld.sop1(Builder::s_bcnt1_i32, bld.def(s1), bld.def(s1, scc), Operand(exec, bld.lm)); 7725 7726 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], thread_count); 7727 } else { 7728 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa)); 7729 } 7730 7731 return true; 7732} 7733 7734bool 7735emit_uniform_scan(isel_context* ctx, nir_intrinsic_instr* instr) 7736{ 7737 Builder bld(ctx->program, ctx->block); 7738 Definition dst(get_ssa_temp(ctx, &instr->dest.ssa)); 7739 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); 7740 bool inc = instr->intrinsic == nir_intrinsic_inclusive_scan; 7741 7742 if (op == nir_op_imul || op == nir_op_fmul) 7743 return false; 7744 7745 if (op == nir_op_iadd || op == nir_op_ixor || op == nir_op_fadd) { 7746 if (instr->src[0].ssa->bit_size > 32) 7747 return false; 7748 7749 Temp packed_tid; 7750 if (inc) 7751 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm), 
Operand::c32(1u)); 7752 else 7753 packed_tid = emit_mbcnt(ctx, bld.tmp(v1), Operand(exec, bld.lm)); 7754 7755 emit_addition_uniform_reduce(ctx, op, dst, instr->src[0], packed_tid); 7756 return true; 7757 } 7758 7759 assert(op == nir_op_imin || op == nir_op_umin || op == nir_op_imax || op == nir_op_umax || 7760 op == nir_op_iand || op == nir_op_ior || op == nir_op_fmin || op == nir_op_fmax); 7761 7762 if (inc) { 7763 emit_uniform_subgroup(ctx, instr, get_ssa_temp(ctx, instr->src[0].ssa)); 7764 return true; 7765 } 7766 7767 /* Copy the source and write the reduction operation identity to the first lane. */ 7768 Temp lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)); 7769 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 7770 ReduceOp reduce_op = get_reduce_op(op, instr->src[0].ssa->bit_size); 7771 if (dst.bytes() == 8) { 7772 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 7773 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 7774 uint32_t identity_lo = get_reduction_identity(reduce_op, 0); 7775 uint32_t identity_hi = get_reduction_identity(reduce_op, 1); 7776 7777 lo = 7778 bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_lo)), lane, lo); 7779 hi = 7780 bld.writelane(bld.def(v1), bld.copy(bld.hint_m0(s1), Operand::c32(identity_hi)), lane, hi); 7781 bld.pseudo(aco_opcode::p_create_vector, dst, lo, hi); 7782 } else { 7783 uint32_t identity = get_reduction_identity(reduce_op, 0); 7784 bld.writelane(dst, bld.copy(bld.hint_m0(s1), Operand::c32(identity)), lane, 7785 as_vgpr(ctx, src)); 7786 } 7787 7788 return true; 7789} 7790 7791Temp 7792emit_reduction_instr(isel_context* ctx, aco_opcode aco_op, ReduceOp op, unsigned cluster_size, 7793 Definition dst, Temp src) 7794{ 7795 assert(src.bytes() <= 8); 7796 assert(src.type() == RegType::vgpr); 7797 7798 Builder bld(ctx->program, ctx->block); 7799 7800 unsigned num_defs = 0; 7801 Definition defs[5]; 7802 defs[num_defs++] = dst; 7803 defs[num_defs++] = bld.def(bld.lm); /* used internally to save/restore exec */ 7804 7805 /* scalar identity temporary */ 7806 bool need_sitmp = (ctx->program->chip_class <= GFX7 || ctx->program->chip_class >= GFX10) && 7807 aco_op != aco_opcode::p_reduce; 7808 if (aco_op == aco_opcode::p_exclusive_scan) { 7809 need_sitmp |= (op == imin8 || op == imin16 || op == imin32 || op == imin64 || op == imax8 || 7810 op == imax16 || op == imax32 || op == imax64 || op == fmin16 || op == fmin32 || 7811 op == fmin64 || op == fmax16 || op == fmax32 || op == fmax64 || op == fmul16 || 7812 op == fmul64); 7813 } 7814 if (need_sitmp) 7815 defs[num_defs++] = bld.def(RegType::sgpr, dst.size()); 7816 7817 /* scc clobber */ 7818 defs[num_defs++] = bld.def(s1, scc); 7819 7820 /* vcc clobber */ 7821 bool clobber_vcc = false; 7822 if ((op == iadd32 || op == imul64) && ctx->program->chip_class < GFX9) 7823 clobber_vcc = true; 7824 if ((op == iadd8 || op == iadd16) && ctx->program->chip_class < GFX8) 7825 clobber_vcc = true; 7826 if (op == iadd64 || op == umin64 || op == umax64 || op == imin64 || op == imax64) 7827 clobber_vcc = true; 7828 7829 if (clobber_vcc) 7830 defs[num_defs++] = bld.def(bld.lm, vcc); 7831 7832 Pseudo_reduction_instruction* reduce = create_instruction<Pseudo_reduction_instruction>( 7833 aco_op, Format::PSEUDO_REDUCTION, 3, num_defs); 7834 reduce->operands[0] = Operand(src); 7835 /* setup_reduce_temp will update these undef operands if needed */ 7836 reduce->operands[1] = Operand(RegClass(RegType::vgpr, dst.size()).as_linear()); 7837 reduce->operands[2] = 
Operand(v1.as_linear()); 7838 std::copy(defs, defs + num_defs, reduce->definitions.begin()); 7839 7840 reduce->reduce_op = op; 7841 reduce->cluster_size = cluster_size; 7842 bld.insert(std::move(reduce)); 7843 7844 return dst.getTemp(); 7845} 7846 7847void 7848emit_interp_center(isel_context* ctx, Temp dst, Temp bary, Temp pos1, Temp pos2) 7849{ 7850 Builder bld(ctx->program, ctx->block); 7851 Temp p1 = emit_extract_vector(ctx, bary, 0, v1); 7852 Temp p2 = emit_extract_vector(ctx, bary, 1, v1); 7853 7854 Temp ddx_1, ddx_2, ddy_1, ddy_2; 7855 uint32_t dpp_ctrl0 = dpp_quad_perm(0, 0, 0, 0); 7856 uint32_t dpp_ctrl1 = dpp_quad_perm(1, 1, 1, 1); 7857 uint32_t dpp_ctrl2 = dpp_quad_perm(2, 2, 2, 2); 7858 7859 /* Build DD X/Y */ 7860 if (ctx->program->chip_class >= GFX8) { 7861 Temp tl_1 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p1, dpp_ctrl0); 7862 ddx_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl1); 7863 ddy_1 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p1, tl_1, dpp_ctrl2); 7864 Temp tl_2 = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), p2, dpp_ctrl0); 7865 ddx_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl1); 7866 ddy_2 = bld.vop2_dpp(aco_opcode::v_sub_f32, bld.def(v1), p2, tl_2, dpp_ctrl2); 7867 } else { 7868 Temp tl_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl0); 7869 ddx_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl1); 7870 ddx_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_1, tl_1); 7871 ddx_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p1, (1 << 15) | dpp_ctrl2); 7872 ddx_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddx_2, tl_1); 7873 Temp tl_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl0); 7874 ddy_1 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl1); 7875 ddy_1 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_1, tl_2); 7876 ddy_2 = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), p2, (1 << 15) | dpp_ctrl2); 7877 ddy_2 = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), ddy_2, tl_2); 7878 } 7879 7880 /* res_k = p_k + ddx_k * pos1 + ddy_k * pos2 */ 7881 aco_opcode mad = 7882 ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fma_f32 : aco_opcode::v_mad_f32; 7883 Temp tmp1 = bld.vop3(mad, bld.def(v1), ddx_1, pos1, p1); 7884 Temp tmp2 = bld.vop3(mad, bld.def(v1), ddx_2, pos1, p2); 7885 tmp1 = bld.vop3(mad, bld.def(v1), ddy_1, pos2, tmp1); 7886 tmp2 = bld.vop3(mad, bld.def(v1), ddy_2, pos2, tmp2); 7887 Temp wqm1 = bld.tmp(v1); 7888 emit_wqm(bld, tmp1, wqm1, true); 7889 Temp wqm2 = bld.tmp(v1); 7890 emit_wqm(bld, tmp2, wqm2, true); 7891 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), wqm1, wqm2); 7892 return; 7893} 7894 7895Temp merged_wave_info_to_mask(isel_context* ctx, unsigned i); 7896void ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt); 7897static void create_vs_exports(isel_context* ctx); 7898 7899Temp 7900get_interp_param(isel_context* ctx, nir_intrinsic_op intrin, 7901 enum glsl_interp_mode interp) 7902{ 7903 bool linear = interp == INTERP_MODE_NOPERSPECTIVE; 7904 if (intrin == nir_intrinsic_load_barycentric_pixel || 7905 intrin == nir_intrinsic_load_barycentric_at_sample || 7906 intrin == nir_intrinsic_load_barycentric_at_offset) { 7907 return get_arg(ctx, linear ? ctx->args->ac.linear_center : ctx->args->ac.persp_center); 7908 } else if (intrin == nir_intrinsic_load_barycentric_centroid) { 7909 return linear ? 
ctx->linear_centroid : ctx->persp_centroid; 7910 } else { 7911 assert(intrin == nir_intrinsic_load_barycentric_sample); 7912 return get_arg(ctx, linear ? ctx->args->ac.linear_sample : ctx->args->ac.persp_sample); 7913 } 7914} 7915 7916void 7917visit_intrinsic(isel_context* ctx, nir_intrinsic_instr* instr) 7918{ 7919 Builder bld(ctx->program, ctx->block); 7920 switch (instr->intrinsic) { 7921 case nir_intrinsic_load_barycentric_sample: 7922 case nir_intrinsic_load_barycentric_pixel: 7923 case nir_intrinsic_load_barycentric_centroid: { 7924 glsl_interp_mode mode = (glsl_interp_mode)nir_intrinsic_interp_mode(instr); 7925 Temp bary = get_interp_param(ctx, instr->intrinsic, mode); 7926 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7927 Temp p1 = emit_extract_vector(ctx, bary, 0, v1); 7928 Temp p2 = emit_extract_vector(ctx, bary, 1, v1); 7929 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2)); 7930 emit_split_vector(ctx, dst, 2); 7931 break; 7932 } 7933 case nir_intrinsic_load_barycentric_model: { 7934 Temp model = get_arg(ctx, ctx->args->ac.pull_model); 7935 7936 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 7937 Temp p1 = emit_extract_vector(ctx, model, 0, v1); 7938 Temp p2 = emit_extract_vector(ctx, model, 1, v1); 7939 Temp p3 = emit_extract_vector(ctx, model, 2, v1); 7940 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), Operand(p1), Operand(p2), 7941 Operand(p3)); 7942 emit_split_vector(ctx, dst, 3); 7943 break; 7944 } 7945 case nir_intrinsic_load_barycentric_at_sample: { 7946 uint32_t sample_pos_offset = RING_PS_SAMPLE_POSITIONS * 16; 7947 switch (ctx->options->key.ps.num_samples) { 7948 case 2: sample_pos_offset += 1 << 3; break; 7949 case 4: sample_pos_offset += 3 << 3; break; 7950 case 8: sample_pos_offset += 7 << 3; break; 7951 default: break; 7952 } 7953 Temp sample_pos; 7954 Temp addr = get_ssa_temp(ctx, instr->src[0].ssa); 7955 nir_const_value* const_addr = nir_src_as_const_value(instr->src[0]); 7956 Temp private_segment_buffer = ctx->program->private_segment_buffer; 7957 // TODO: bounds checking? 
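/* Added layout note (an assumption inferred from the offsets used below): the
 * sample position table is expected to live in the private segment at
 * RING_PS_SAMPLE_POSITIONS * 16, one 8-byte entry (two 32-bit floats) per
 * sample. E.g. with num_samples == 4 and constant sample index 2, the 8-byte
 * load reads from RING_PS_SAMPLE_POSITIONS * 16 + (3 << 3) + (2 << 3). */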
7958 if (addr.type() == RegType::sgpr) { 7959 Operand offset; 7960 if (const_addr) { 7961 sample_pos_offset += const_addr->u32 << 3; 7962 offset = Operand::c32(sample_pos_offset); 7963 } else if (ctx->options->chip_class >= GFX9) { 7964 offset = bld.sop2(aco_opcode::s_lshl3_add_u32, bld.def(s1), bld.def(s1, scc), addr, 7965 Operand::c32(sample_pos_offset)); 7966 } else { 7967 offset = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), addr, 7968 Operand::c32(3u)); 7969 offset = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), bld.def(s1, scc), offset, 7970 Operand::c32(sample_pos_offset)); 7971 } 7972 7973 Operand off = bld.copy(bld.def(s1), Operand(offset)); 7974 sample_pos = 7975 bld.smem(aco_opcode::s_load_dwordx2, bld.def(s2), private_segment_buffer, off); 7976 7977 } else if (ctx->options->chip_class >= GFX9) { 7978 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 7979 sample_pos = bld.global(aco_opcode::global_load_dwordx2, bld.def(v2), addr, 7980 private_segment_buffer, sample_pos_offset); 7981 } else if (ctx->options->chip_class >= GFX7) { 7982 /* addr += private_segment_buffer + sample_pos_offset */ 7983 Temp tmp0 = bld.tmp(s1); 7984 Temp tmp1 = bld.tmp(s1); 7985 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp0), Definition(tmp1), 7986 private_segment_buffer); 7987 Definition scc_tmp = bld.def(s1, scc); 7988 tmp0 = bld.sop2(aco_opcode::s_add_u32, bld.def(s1), scc_tmp, tmp0, 7989 Operand::c32(sample_pos_offset)); 7990 tmp1 = bld.sop2(aco_opcode::s_addc_u32, bld.def(s1), bld.def(s1, scc), tmp1, 7991 Operand::zero(), bld.scc(scc_tmp.getTemp())); 7992 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 7993 Temp pck0 = bld.tmp(v1); 7994 Temp carry = bld.vadd32(Definition(pck0), tmp0, addr, true).def(1).getTemp(); 7995 tmp1 = as_vgpr(ctx, tmp1); 7996 Temp pck1 = bld.vop2_e64(aco_opcode::v_addc_co_u32, bld.def(v1), 7997 bld.hint_vcc(bld.def(bld.lm)), tmp1, Operand::zero(), carry); 7998 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), pck0, pck1); 7999 8000 /* sample_pos = flat_load_dwordx2 addr */ 8001 sample_pos = bld.flat(aco_opcode::flat_load_dwordx2, bld.def(v2), addr, Operand(s1)); 8002 } else { 8003 assert(ctx->options->chip_class == GFX6); 8004 8005 uint32_t rsrc_conf = S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | 8006 S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); 8007 Temp rsrc = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, 8008 Operand::zero(), Operand::c32(rsrc_conf)); 8009 8010 addr = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(3u), addr); 8011 addr = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), addr, Operand::zero()); 8012 8013 sample_pos = bld.tmp(v2); 8014 8015 aco_ptr<MUBUF_instruction> load{create_instruction<MUBUF_instruction>( 8016 aco_opcode::buffer_load_dwordx2, Format::MUBUF, 3, 1)}; 8017 load->definitions[0] = Definition(sample_pos); 8018 load->operands[0] = Operand(rsrc); 8019 load->operands[1] = Operand(addr); 8020 load->operands[2] = Operand::zero(); 8021 load->offset = sample_pos_offset; 8022 load->offen = 0; 8023 load->addr64 = true; 8024 load->glc = false; 8025 load->dlc = false; 8026 load->disable_wqm = false; 8027 ctx->block->instructions.emplace_back(std::move(load)); 8028 } 8029 8030 /* sample_pos -= 0.5 */ 8031 Temp pos1 = bld.tmp(RegClass(sample_pos.type(), 1)); 8032 Temp pos2 = bld.tmp(RegClass(sample_pos.type(), 1)); 8033 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), 
sample_pos); 8034 pos1 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos1, Operand::c32(0x3f000000u)); 8035 pos2 = bld.vop2_e64(aco_opcode::v_sub_f32, bld.def(v1), pos2, Operand::c32(0x3f000000u)); 8036 8037 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); 8038 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); 8039 break; 8040 } 8041 case nir_intrinsic_load_barycentric_at_offset: { 8042 Temp offset = get_ssa_temp(ctx, instr->src[0].ssa); 8043 RegClass rc = RegClass(offset.type(), 1); 8044 Temp pos1 = bld.tmp(rc), pos2 = bld.tmp(rc); 8045 bld.pseudo(aco_opcode::p_split_vector, Definition(pos1), Definition(pos2), offset); 8046 Temp bary = get_interp_param(ctx, instr->intrinsic, (glsl_interp_mode)nir_intrinsic_interp_mode(instr)); 8047 emit_interp_center(ctx, get_ssa_temp(ctx, &instr->dest.ssa), bary, pos1, pos2); 8048 break; 8049 } 8050 case nir_intrinsic_load_front_face: { 8051 bld.vopc(aco_opcode::v_cmp_lg_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8052 Operand::zero(), get_arg(ctx, ctx->args->ac.front_face)) 8053 .def(0) 8054 .setHint(vcc); 8055 break; 8056 } 8057 case nir_intrinsic_load_view_index: { 8058 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8059 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.view_index))); 8060 break; 8061 } 8062 case nir_intrinsic_load_frag_coord: { 8063 emit_load_frag_coord(ctx, get_ssa_temp(ctx, &instr->dest.ssa), 4); 8064 break; 8065 } 8066 case nir_intrinsic_load_frag_shading_rate: 8067 emit_load_frag_shading_rate(ctx, get_ssa_temp(ctx, &instr->dest.ssa)); 8068 break; 8069 case nir_intrinsic_load_sample_pos: { 8070 Temp posx = get_arg(ctx, ctx->args->ac.frag_pos[0]); 8071 Temp posy = get_arg(ctx, ctx->args->ac.frag_pos[1]); 8072 bld.pseudo( 8073 aco_opcode::p_create_vector, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8074 posx.id() ? bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posx) : Operand::zero(), 8075 posy.id() ? 
bld.vop1(aco_opcode::v_fract_f32, bld.def(v1), posy) : Operand::zero()); 8076 break; 8077 } 8078 case nir_intrinsic_load_tess_coord: visit_load_tess_coord(ctx, instr); break; 8079 case nir_intrinsic_load_interpolated_input: visit_load_interpolated_input(ctx, instr); break; 8080 case nir_intrinsic_store_output: visit_store_output(ctx, instr); break; 8081 case nir_intrinsic_load_input: 8082 case nir_intrinsic_load_input_vertex: visit_load_input(ctx, instr); break; 8083 case nir_intrinsic_load_per_vertex_input: visit_load_per_vertex_input(ctx, instr); break; 8084 case nir_intrinsic_load_ubo: visit_load_ubo(ctx, instr); break; 8085 case nir_intrinsic_load_push_constant: visit_load_push_constant(ctx, instr); break; 8086 case nir_intrinsic_load_constant: visit_load_constant(ctx, instr); break; 8087 case nir_intrinsic_vulkan_resource_index: visit_load_resource(ctx, instr); break; 8088 case nir_intrinsic_terminate: 8089 case nir_intrinsic_discard: visit_discard(ctx, instr); break; 8090 case nir_intrinsic_terminate_if: 8091 case nir_intrinsic_discard_if: visit_discard_if(ctx, instr); break; 8092 case nir_intrinsic_load_shared: visit_load_shared(ctx, instr); break; 8093 case nir_intrinsic_store_shared: visit_store_shared(ctx, instr); break; 8094 case nir_intrinsic_shared_atomic_add: 8095 case nir_intrinsic_shared_atomic_imin: 8096 case nir_intrinsic_shared_atomic_umin: 8097 case nir_intrinsic_shared_atomic_imax: 8098 case nir_intrinsic_shared_atomic_umax: 8099 case nir_intrinsic_shared_atomic_and: 8100 case nir_intrinsic_shared_atomic_or: 8101 case nir_intrinsic_shared_atomic_xor: 8102 case nir_intrinsic_shared_atomic_exchange: 8103 case nir_intrinsic_shared_atomic_comp_swap: 8104 case nir_intrinsic_shared_atomic_fadd: 8105 case nir_intrinsic_shared_atomic_fmin: 8106 case nir_intrinsic_shared_atomic_fmax: visit_shared_atomic(ctx, instr); break; 8107 case nir_intrinsic_image_deref_load: 8108 case nir_intrinsic_image_deref_sparse_load: visit_image_load(ctx, instr); break; 8109 case nir_intrinsic_image_deref_store: visit_image_store(ctx, instr); break; 8110 case nir_intrinsic_image_deref_atomic_add: 8111 case nir_intrinsic_image_deref_atomic_umin: 8112 case nir_intrinsic_image_deref_atomic_imin: 8113 case nir_intrinsic_image_deref_atomic_umax: 8114 case nir_intrinsic_image_deref_atomic_imax: 8115 case nir_intrinsic_image_deref_atomic_and: 8116 case nir_intrinsic_image_deref_atomic_or: 8117 case nir_intrinsic_image_deref_atomic_xor: 8118 case nir_intrinsic_image_deref_atomic_exchange: 8119 case nir_intrinsic_image_deref_atomic_comp_swap: 8120 case nir_intrinsic_image_deref_atomic_fmin: 8121 case nir_intrinsic_image_deref_atomic_fmax: visit_image_atomic(ctx, instr); break; 8122 case nir_intrinsic_image_deref_size: visit_image_size(ctx, instr); break; 8123 case nir_intrinsic_image_deref_samples: visit_image_samples(ctx, instr); break; 8124 case nir_intrinsic_load_ssbo: visit_load_ssbo(ctx, instr); break; 8125 case nir_intrinsic_store_ssbo: visit_store_ssbo(ctx, instr); break; 8126 case nir_intrinsic_load_global_constant: 8127 case nir_intrinsic_load_global: visit_load_global(ctx, instr); break; 8128 case nir_intrinsic_load_buffer_amd: visit_load_buffer(ctx, instr); break; 8129 case nir_intrinsic_store_buffer_amd: visit_store_buffer(ctx, instr); break; 8130 case nir_intrinsic_store_global: visit_store_global(ctx, instr); break; 8131 case nir_intrinsic_global_atomic_add: 8132 case nir_intrinsic_global_atomic_imin: 8133 case nir_intrinsic_global_atomic_umin: 8134 case nir_intrinsic_global_atomic_imax: 8135 
case nir_intrinsic_global_atomic_umax: 8136 case nir_intrinsic_global_atomic_and: 8137 case nir_intrinsic_global_atomic_or: 8138 case nir_intrinsic_global_atomic_xor: 8139 case nir_intrinsic_global_atomic_exchange: 8140 case nir_intrinsic_global_atomic_comp_swap: 8141 case nir_intrinsic_global_atomic_fmin: 8142 case nir_intrinsic_global_atomic_fmax: visit_global_atomic(ctx, instr); break; 8143 case nir_intrinsic_ssbo_atomic_add: 8144 case nir_intrinsic_ssbo_atomic_imin: 8145 case nir_intrinsic_ssbo_atomic_umin: 8146 case nir_intrinsic_ssbo_atomic_imax: 8147 case nir_intrinsic_ssbo_atomic_umax: 8148 case nir_intrinsic_ssbo_atomic_and: 8149 case nir_intrinsic_ssbo_atomic_or: 8150 case nir_intrinsic_ssbo_atomic_xor: 8151 case nir_intrinsic_ssbo_atomic_exchange: 8152 case nir_intrinsic_ssbo_atomic_comp_swap: 8153 case nir_intrinsic_ssbo_atomic_fmin: 8154 case nir_intrinsic_ssbo_atomic_fmax: visit_atomic_ssbo(ctx, instr); break; 8155 case nir_intrinsic_load_scratch: visit_load_scratch(ctx, instr); break; 8156 case nir_intrinsic_store_scratch: visit_store_scratch(ctx, instr); break; 8157 case nir_intrinsic_get_ssbo_size: visit_get_ssbo_size(ctx, instr); break; 8158 case nir_intrinsic_scoped_barrier: emit_scoped_barrier(ctx, instr); break; 8159 case nir_intrinsic_load_num_workgroups: { 8160 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8161 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.num_work_groups))); 8162 emit_split_vector(ctx, dst, 3); 8163 break; 8164 } 8165 case nir_intrinsic_load_ray_launch_size: { 8166 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8167 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.ray_launch_size))); 8168 emit_split_vector(ctx, dst, 3); 8169 break; 8170 } 8171 case nir_intrinsic_load_local_invocation_id: { 8172 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8173 bld.copy(Definition(dst), Operand(get_arg(ctx, ctx->args->ac.local_invocation_ids))); 8174 emit_split_vector(ctx, dst, 3); 8175 break; 8176 } 8177 case nir_intrinsic_load_workgroup_id: { 8178 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8179 const struct ac_arg* args = ctx->args->ac.workgroup_ids; 8180 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), 8181 args[0].used ? Operand(get_arg(ctx, args[0])) : Operand::zero(), 8182 args[1].used ? Operand(get_arg(ctx, args[1])) : Operand::zero(), 8183 args[2].used ? Operand(get_arg(ctx, args[2])) : Operand::zero()); 8184 emit_split_vector(ctx, dst, 3); 8185 break; 8186 } 8187 case nir_intrinsic_load_local_invocation_index: { 8188 if (ctx->stage.hw == HWStage::LS || ctx->stage.hw == HWStage::HS) { 8189 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8190 get_arg(ctx, ctx->args->ac.vs_rel_patch_id)); 8191 break; 8192 } else if (ctx->stage.hw == HWStage::GS || ctx->stage.hw == HWStage::NGG) { 8193 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), thread_id_in_threadgroup(ctx)); 8194 break; 8195 } 8196 8197 Temp id = emit_mbcnt(ctx, bld.tmp(v1)); 8198 8199 /* The tg_size bits [6:11] contain the subgroup id, 8200 * we need this multiplied by the wave size, and then OR the thread id to it. 
8201 */ 8202 if (ctx->program->wave_size == 64) { 8203 /* After the s_and the bits are already multiplied by 64 (left shifted by 6) so we can just 8204 * feed that to v_or */ 8205 Temp tg_num = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 8206 Operand::c32(0xfc0u), get_arg(ctx, ctx->args->ac.tg_size)); 8207 bld.vop2(aco_opcode::v_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), tg_num, 8208 id); 8209 } else { 8210 /* Extract the bit field and multiply the result by 32 (left shift by 5), then do the OR */ 8211 Temp tg_num = 8212 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 8213 get_arg(ctx, ctx->args->ac.tg_size), Operand::c32(0x6u | (0x6u << 16))); 8214 bld.vop3(aco_opcode::v_lshl_or_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8215 tg_num, Operand::c32(0x5u), id); 8216 } 8217 break; 8218 } 8219 case nir_intrinsic_load_subgroup_id: { 8220 if (ctx->stage == compute_cs) { 8221 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8222 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.tg_size), 8223 Operand::c32(0x6u | (0x6u << 16))); 8224 } else if (ctx->stage.hw == HWStage::NGG) { 8225 /* Get the id of the current wave within the threadgroup (workgroup) */ 8226 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8227 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), 8228 Operand::c32(24u | (4u << 16))); 8229 } else { 8230 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::zero()); 8231 } 8232 break; 8233 } 8234 case nir_intrinsic_load_subgroup_invocation: { 8235 emit_mbcnt(ctx, get_ssa_temp(ctx, &instr->dest.ssa)); 8236 break; 8237 } 8238 case nir_intrinsic_load_num_subgroups: { 8239 if (ctx->stage == compute_cs) 8240 bld.sop2(aco_opcode::s_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8241 bld.def(s1, scc), Operand::c32(0x3fu), get_arg(ctx, ctx->args->ac.tg_size)); 8242 else if (ctx->stage.hw == HWStage::NGG) 8243 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8244 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.merged_wave_info), 8245 Operand::c32(28u | (4u << 16))); 8246 else 8247 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), Operand::c32(0x1u)); 8248 break; 8249 } 8250 case nir_intrinsic_ballot: { 8251 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8252 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8253 8254 if (instr->src[0].ssa->bit_size == 1) { 8255 assert(src.regClass() == bld.lm); 8256 } else if (instr->src[0].ssa->bit_size == 32 && src.regClass() == v1) { 8257 src = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src); 8258 } else if (instr->src[0].ssa->bit_size == 64 && src.regClass() == v2) { 8259 src = bld.vopc(aco_opcode::v_cmp_lg_u64, bld.def(bld.lm), Operand::zero(), src); 8260 } else { 8261 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8262 } 8263 8264 /* Make sure that all inactive lanes return zero. 
8265 * Value-numbering might remove the comparison above */ 8266 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 8267 if (dst.size() != bld.lm.size()) { 8268 /* Wave32 with ballot size set to 64 */ 8269 src = 8270 bld.pseudo(aco_opcode::p_create_vector, bld.def(dst.regClass()), src, Operand::zero()); 8271 } 8272 8273 emit_wqm(bld, src, dst); 8274 break; 8275 } 8276 case nir_intrinsic_shuffle: 8277 case nir_intrinsic_read_invocation: { 8278 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8279 if (!nir_src_is_divergent(instr->src[0])) { 8280 emit_uniform_subgroup(ctx, instr, src); 8281 } else { 8282 Temp tid = get_ssa_temp(ctx, instr->src[1].ssa); 8283 if (instr->intrinsic == nir_intrinsic_read_invocation || 8284 !nir_src_is_divergent(instr->src[1])) 8285 tid = bld.as_uniform(tid); 8286 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8287 8288 if (instr->dest.ssa.bit_size != 1) 8289 src = as_vgpr(ctx, src); 8290 8291 if (src.regClass() == v1b || src.regClass() == v2b) { 8292 Temp tmp = bld.tmp(v1); 8293 tmp = emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), tmp); 8294 if (dst.type() == RegType::vgpr) 8295 bld.pseudo(aco_opcode::p_split_vector, Definition(dst), 8296 bld.def(src.regClass() == v1b ? v3b : v2b), tmp); 8297 else 8298 bld.pseudo(aco_opcode::p_as_uniform, Definition(dst), tmp); 8299 } else if (src.regClass() == v1) { 8300 emit_wqm(bld, emit_bpermute(ctx, bld, tid, src), dst); 8301 } else if (src.regClass() == v2) { 8302 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8303 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8304 lo = emit_wqm(bld, emit_bpermute(ctx, bld, tid, lo)); 8305 hi = emit_wqm(bld, emit_bpermute(ctx, bld, tid, hi)); 8306 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8307 emit_split_vector(ctx, dst, 2); 8308 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == s1) { 8309 assert(src.regClass() == bld.lm); 8310 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, tid); 8311 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8312 } else if (instr->dest.ssa.bit_size == 1 && tid.regClass() == v1) { 8313 assert(src.regClass() == bld.lm); 8314 Temp tmp; 8315 if (ctx->program->chip_class <= GFX7) 8316 tmp = bld.vop3(aco_opcode::v_lshr_b64, bld.def(v2), src, tid); 8317 else if (ctx->program->wave_size == 64) 8318 tmp = bld.vop3(aco_opcode::v_lshrrev_b64, bld.def(v2), tid, src); 8319 else 8320 tmp = bld.vop2_e64(aco_opcode::v_lshrrev_b32, bld.def(v1), tid, src); 8321 tmp = emit_extract_vector(ctx, tmp, 0, v1); 8322 tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(1u), tmp); 8323 emit_wqm(bld, bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp), 8324 dst); 8325 } else { 8326 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8327 } 8328 } 8329 break; 8330 } 8331 case nir_intrinsic_load_sample_id: { 8332 bld.vop3(aco_opcode::v_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8333 get_arg(ctx, ctx->args->ac.ancillary), Operand::c32(8u), Operand::c32(4u)); 8334 break; 8335 } 8336 case nir_intrinsic_load_sample_mask_in: { 8337 visit_load_sample_mask_in(ctx, instr); 8338 break; 8339 } 8340 case nir_intrinsic_read_first_invocation: { 8341 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8342 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8343 if (src.regClass() == v1b || src.regClass() == v2b || src.regClass() == v1) { 8344 emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), src), dst); 8345 } 
else if (src.regClass() == v2) { 8346 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8347 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8348 lo = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), lo)); 8349 hi = emit_wqm(bld, bld.vop1(aco_opcode::v_readfirstlane_b32, bld.def(s1), hi)); 8350 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8351 emit_split_vector(ctx, dst, 2); 8352 } else if (instr->dest.ssa.bit_size == 1) { 8353 assert(src.regClass() == bld.lm); 8354 Temp tmp = bld.sopc(Builder::s_bitcmp1, bld.def(s1, scc), src, 8355 bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm))); 8356 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8357 } else { 8358 bld.copy(Definition(dst), src); 8359 } 8360 break; 8361 } 8362 case nir_intrinsic_vote_all: { 8363 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8364 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8365 assert(src.regClass() == bld.lm); 8366 assert(dst.regClass() == bld.lm); 8367 8368 Temp tmp = 8369 bld.sop2(Builder::s_andn2, bld.def(bld.lm), bld.def(s1, scc), Operand(exec, bld.lm), src) 8370 .def(1) 8371 .getTemp(); 8372 Temp cond = bool_to_vector_condition(ctx, emit_wqm(bld, tmp)); 8373 bld.sop1(Builder::s_not, Definition(dst), bld.def(s1, scc), cond); 8374 break; 8375 } 8376 case nir_intrinsic_vote_any: { 8377 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8378 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8379 assert(src.regClass() == bld.lm); 8380 assert(dst.regClass() == bld.lm); 8381 8382 Temp tmp = bool_to_scalar_condition(ctx, src); 8383 bool_to_vector_condition(ctx, emit_wqm(bld, tmp), dst); 8384 break; 8385 } 8386 case nir_intrinsic_reduce: 8387 case nir_intrinsic_inclusive_scan: 8388 case nir_intrinsic_exclusive_scan: { 8389 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8390 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8391 nir_op op = (nir_op)nir_intrinsic_reduction_op(instr); 8392 unsigned cluster_size = 8393 instr->intrinsic == nir_intrinsic_reduce ? nir_intrinsic_cluster_size(instr) : 0; 8394 cluster_size = util_next_power_of_two( 8395 MIN2(cluster_size ? 
cluster_size : ctx->program->wave_size, ctx->program->wave_size)); 8396 8397 if (!nir_src_is_divergent(instr->src[0]) && cluster_size == ctx->program->wave_size && 8398 instr->dest.ssa.bit_size != 1) { 8399 /* We use divergence analysis to assign the regclass, so check if it's 8400 * working as expected */ 8401 ASSERTED bool expected_divergent = instr->intrinsic == nir_intrinsic_exclusive_scan; 8402 if (instr->intrinsic == nir_intrinsic_inclusive_scan) 8403 expected_divergent = op == nir_op_iadd || op == nir_op_fadd || op == nir_op_ixor; 8404 assert(nir_dest_is_divergent(instr->dest) == expected_divergent); 8405 8406 if (instr->intrinsic == nir_intrinsic_reduce) { 8407 if (emit_uniform_reduce(ctx, instr)) 8408 break; 8409 } else if (emit_uniform_scan(ctx, instr)) { 8410 break; 8411 } 8412 } 8413 8414 if (instr->dest.ssa.bit_size == 1) { 8415 if (op == nir_op_imul || op == nir_op_umin || op == nir_op_imin) 8416 op = nir_op_iand; 8417 else if (op == nir_op_iadd) 8418 op = nir_op_ixor; 8419 else if (op == nir_op_umax || op == nir_op_imax) 8420 op = nir_op_ior; 8421 assert(op == nir_op_iand || op == nir_op_ior || op == nir_op_ixor); 8422 8423 switch (instr->intrinsic) { 8424 case nir_intrinsic_reduce: 8425 emit_wqm(bld, emit_boolean_reduce(ctx, op, cluster_size, src), dst); 8426 break; 8427 case nir_intrinsic_exclusive_scan: 8428 emit_wqm(bld, emit_boolean_exclusive_scan(ctx, op, src), dst); 8429 break; 8430 case nir_intrinsic_inclusive_scan: 8431 emit_wqm(bld, emit_boolean_inclusive_scan(ctx, op, src), dst); 8432 break; 8433 default: assert(false); 8434 } 8435 } else if (cluster_size == 1) { 8436 bld.copy(Definition(dst), src); 8437 } else { 8438 unsigned bit_size = instr->src[0].ssa->bit_size; 8439 8440 src = emit_extract_vector(ctx, src, 0, RegClass::get(RegType::vgpr, bit_size / 8)); 8441 8442 ReduceOp reduce_op = get_reduce_op(op, bit_size); 8443 8444 aco_opcode aco_op; 8445 switch (instr->intrinsic) { 8446 case nir_intrinsic_reduce: aco_op = aco_opcode::p_reduce; break; 8447 case nir_intrinsic_inclusive_scan: aco_op = aco_opcode::p_inclusive_scan; break; 8448 case nir_intrinsic_exclusive_scan: aco_op = aco_opcode::p_exclusive_scan; break; 8449 default: unreachable("unknown reduce intrinsic"); 8450 } 8451 8452 Temp tmp_dst = emit_reduction_instr(ctx, aco_op, reduce_op, cluster_size, 8453 bld.def(dst.regClass()), src); 8454 emit_wqm(bld, tmp_dst, dst); 8455 } 8456 break; 8457 } 8458 case nir_intrinsic_quad_broadcast: 8459 case nir_intrinsic_quad_swap_horizontal: 8460 case nir_intrinsic_quad_swap_vertical: 8461 case nir_intrinsic_quad_swap_diagonal: 8462 case nir_intrinsic_quad_swizzle_amd: { 8463 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8464 8465 if (!nir_dest_is_divergent(instr->dest)) { 8466 emit_uniform_subgroup(ctx, instr, src); 8467 break; 8468 } 8469 8470 /* Quad broadcast lane. */ 8471 unsigned lane = 0; 8472 /* Use VALU for the bool instructions that don't have a SALU-only special case. 
*/ 8473 bool bool_use_valu = instr->dest.ssa.bit_size == 1; 8474 8475 uint16_t dpp_ctrl = 0; 8476 8477 switch (instr->intrinsic) { 8478 case nir_intrinsic_quad_swap_horizontal: dpp_ctrl = dpp_quad_perm(1, 0, 3, 2); break; 8479 case nir_intrinsic_quad_swap_vertical: dpp_ctrl = dpp_quad_perm(2, 3, 0, 1); break; 8480 case nir_intrinsic_quad_swap_diagonal: dpp_ctrl = dpp_quad_perm(3, 2, 1, 0); break; 8481 case nir_intrinsic_quad_swizzle_amd: dpp_ctrl = nir_intrinsic_swizzle_mask(instr); break; 8482 case nir_intrinsic_quad_broadcast: 8483 lane = nir_src_as_const_value(instr->src[1])->u32; 8484 dpp_ctrl = dpp_quad_perm(lane, lane, lane, lane); 8485 bool_use_valu = false; 8486 break; 8487 default: break; 8488 } 8489 8490 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8491 Temp tmp(dst); 8492 8493 /* Setup source. */ 8494 if (bool_use_valu) 8495 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 8496 Operand::c32(-1), src); 8497 else if (instr->dest.ssa.bit_size != 1) 8498 src = as_vgpr(ctx, src); 8499 8500 /* Setup temporary destination. */ 8501 if (bool_use_valu) 8502 tmp = bld.tmp(v1); 8503 else if (ctx->program->stage == fragment_fs) 8504 tmp = bld.tmp(dst.regClass()); 8505 8506 if (instr->dest.ssa.bit_size == 1 && instr->intrinsic == nir_intrinsic_quad_broadcast) { 8507 /* Special case for quad broadcast using SALU only. */ 8508 assert(src.regClass() == bld.lm && tmp.regClass() == bld.lm); 8509 8510 uint32_t half_mask = 0x11111111u << lane; 8511 Operand mask_tmp = bld.lm.bytes() == 4 8512 ? Operand::c32(half_mask) 8513 : bld.pseudo(aco_opcode::p_create_vector, bld.def(bld.lm), 8514 Operand::c32(half_mask), Operand::c32(half_mask)); 8515 8516 src = 8517 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 8518 src = bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), mask_tmp, src); 8519 bld.sop1(Builder::s_wqm, Definition(tmp), src); 8520 } else if (instr->dest.ssa.bit_size <= 32 || bool_use_valu) { 8521 unsigned excess_bytes = bool_use_valu ? 0 : 4 - instr->dest.ssa.bit_size / 8; 8522 Definition def = excess_bytes ? bld.def(v1) : Definition(tmp); 8523 8524 if (ctx->program->chip_class >= GFX8) 8525 bld.vop1_dpp(aco_opcode::v_mov_b32, def, src, dpp_ctrl); 8526 else 8527 bld.ds(aco_opcode::ds_swizzle_b32, def, src, (1 << 15) | dpp_ctrl); 8528 8529 if (excess_bytes) 8530 bld.pseudo(aco_opcode::p_split_vector, Definition(tmp), 8531 bld.def(RegClass::get(tmp.type(), excess_bytes)), def.getTemp()); 8532 } else if (instr->dest.ssa.bit_size == 64) { 8533 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8534 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8535 8536 if (ctx->program->chip_class >= GFX8) { 8537 lo = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), lo, dpp_ctrl); 8538 hi = bld.vop1_dpp(aco_opcode::v_mov_b32, bld.def(v1), hi, dpp_ctrl); 8539 } else { 8540 lo = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), lo, (1 << 15) | dpp_ctrl); 8541 hi = bld.ds(aco_opcode::ds_swizzle_b32, bld.def(v1), hi, (1 << 15) | dpp_ctrl); 8542 } 8543 8544 bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), lo, hi); 8545 emit_split_vector(ctx, tmp, 2); 8546 } else { 8547 isel_err(&instr->instr, "Unimplemented NIR quad group instruction bit size."); 8548 } 8549 8550 if (tmp.id() != dst.id()) { 8551 if (bool_use_valu) 8552 tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), tmp); 8553 8554 /* Vulkan spec 9.25: Helper invocations must be active for quad group instructions. 
*/ 8555 emit_wqm(bld, tmp, dst, true); 8556 } 8557 8558 break; 8559 } 8560 case nir_intrinsic_masked_swizzle_amd: { 8561 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8562 if (!nir_dest_is_divergent(instr->dest)) { 8563 emit_uniform_subgroup(ctx, instr, src); 8564 break; 8565 } 8566 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8567 uint32_t mask = nir_intrinsic_swizzle_mask(instr); 8568 8569 if (instr->dest.ssa.bit_size != 1) 8570 src = as_vgpr(ctx, src); 8571 8572 if (instr->dest.ssa.bit_size == 1) { 8573 assert(src.regClass() == bld.lm); 8574 src = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), Operand::zero(), 8575 Operand::c32(-1), src); 8576 src = emit_masked_swizzle(ctx, bld, src, mask); 8577 Temp tmp = bld.vopc(aco_opcode::v_cmp_lg_u32, bld.def(bld.lm), Operand::zero(), src); 8578 emit_wqm(bld, tmp, dst); 8579 } else if (dst.regClass() == v1b) { 8580 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask)); 8581 emit_extract_vector(ctx, tmp, 0, dst); 8582 } else if (dst.regClass() == v2b) { 8583 Temp tmp = emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask)); 8584 emit_extract_vector(ctx, tmp, 0, dst); 8585 } else if (dst.regClass() == v1) { 8586 emit_wqm(bld, emit_masked_swizzle(ctx, bld, src, mask), dst); 8587 } else if (dst.regClass() == v2) { 8588 Temp lo = bld.tmp(v1), hi = bld.tmp(v1); 8589 bld.pseudo(aco_opcode::p_split_vector, Definition(lo), Definition(hi), src); 8590 lo = emit_wqm(bld, emit_masked_swizzle(ctx, bld, lo, mask)); 8591 hi = emit_wqm(bld, emit_masked_swizzle(ctx, bld, hi, mask)); 8592 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi); 8593 emit_split_vector(ctx, dst, 2); 8594 } else { 8595 isel_err(&instr->instr, "Unimplemented NIR instr bit size"); 8596 } 8597 break; 8598 } 8599 case nir_intrinsic_write_invocation_amd: { 8600 Temp src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[0].ssa)); 8601 Temp val = bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)); 8602 Temp lane = bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)); 8603 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8604 if (dst.regClass() == v1) { 8605 /* src2 is ignored for writelane. 
RA assigns the same reg for dst */
         emit_wqm(bld, bld.writelane(bld.def(v1), val, lane, src), dst);
      } else if (dst.regClass() == v2) {
         Temp src_lo = bld.tmp(v1), src_hi = bld.tmp(v1);
         Temp val_lo = bld.tmp(s1), val_hi = bld.tmp(s1);
         bld.pseudo(aco_opcode::p_split_vector, Definition(src_lo), Definition(src_hi), src);
         bld.pseudo(aco_opcode::p_split_vector, Definition(val_lo), Definition(val_hi), val);
         Temp lo = emit_wqm(bld, bld.writelane(bld.def(v1), val_lo, lane, src_lo));
         Temp hi = emit_wqm(bld, bld.writelane(bld.def(v1), val_hi, lane, src_hi));
         bld.pseudo(aco_opcode::p_create_vector, Definition(dst), lo, hi);
         emit_split_vector(ctx, dst, 2);
      } else {
         isel_err(&instr->instr, "Unimplemented NIR instr bit size");
      }
      break;
   }
   case nir_intrinsic_mbcnt_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp add_src = as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa));
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      /* Fit 64-bit mask for wave32 */
      src = emit_extract_vector(ctx, src, 0, RegClass(src.type(), bld.lm.size()));
      Temp wqm_tmp = emit_mbcnt(ctx, bld.tmp(v1), Operand(src), Operand(add_src));
      emit_wqm(bld, wqm_tmp, dst);
      break;
   }
   case nir_intrinsic_byte_permute_amd: {
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(dst.regClass() == v1);
      assert(ctx->program->chip_class >= GFX8);
      bld.vop3(aco_opcode::v_perm_b32, Definition(dst), get_ssa_temp(ctx, instr->src[0].ssa),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[1].ssa)),
               as_vgpr(ctx, get_ssa_temp(ctx, instr->src[2].ssa)));
      break;
   }
   case nir_intrinsic_lane_permute_16_amd: {
      Temp src = get_ssa_temp(ctx, instr->src[0].ssa);
      Temp dst = get_ssa_temp(ctx, &instr->dest.ssa);
      assert(ctx->program->chip_class >= GFX10);

      if (src.regClass() == s1) {
         bld.copy(Definition(dst), src);
      } else if (dst.regClass() == v1 && src.regClass() == v1) {
         bld.vop3(aco_opcode::v_permlane16_b32, Definition(dst), src,
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[1].ssa)),
                  bld.as_uniform(get_ssa_temp(ctx, instr->src[2].ssa)));
      } else {
         isel_err(&instr->instr, "Unimplemented lane_permute_16_amd");
      }
      break;
   }
   case nir_intrinsic_load_helper_invocation:
   case nir_intrinsic_is_helper_invocation: {
      /* load_helper() after demote() gets lowered to is_helper().
       * Otherwise, these two behave the same.
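       * (is_helper() reflects the current state, so lanes demoted at run time count as
       * helpers too; a load_helper() that runs before any demote can only observe the lanes
       * that started out as helpers. Both are implemented here with p_is_helper, which a
       * later pass resolves against the exact, non-helper lane mask; that is why
       * needs_exact is set below.)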
*/ 8660 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8661 bld.pseudo(aco_opcode::p_is_helper, Definition(dst), Operand(exec, bld.lm)); 8662 ctx->block->kind |= block_kind_needs_lowering; 8663 ctx->program->needs_exact = true; 8664 break; 8665 } 8666 case nir_intrinsic_demote: 8667 bld.pseudo(aco_opcode::p_demote_to_helper, Operand::c32(-1u)); 8668 8669 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 8670 ctx->cf_info.exec_potentially_empty_discard = true; 8671 ctx->block->kind |= block_kind_uses_demote; 8672 ctx->program->needs_exact = true; 8673 break; 8674 case nir_intrinsic_demote_if: { 8675 Temp src = get_ssa_temp(ctx, instr->src[0].ssa); 8676 assert(src.regClass() == bld.lm); 8677 Temp cond = 8678 bld.sop2(Builder::s_and, bld.def(bld.lm), bld.def(s1, scc), src, Operand(exec, bld.lm)); 8679 bld.pseudo(aco_opcode::p_demote_to_helper, cond); 8680 8681 if (ctx->block->loop_nest_depth || ctx->cf_info.parent_if.is_divergent) 8682 ctx->cf_info.exec_potentially_empty_discard = true; 8683 ctx->block->kind |= block_kind_uses_demote; 8684 ctx->program->needs_exact = true; 8685 break; 8686 } 8687 case nir_intrinsic_first_invocation: { 8688 emit_wqm(bld, bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm)), 8689 get_ssa_temp(ctx, &instr->dest.ssa)); 8690 break; 8691 } 8692 case nir_intrinsic_last_invocation: { 8693 Temp flbit = bld.sop1(Builder::s_flbit_i32, bld.def(s1), Operand(exec, bld.lm)); 8694 Temp last = bld.sop2(aco_opcode::s_sub_i32, bld.def(s1), bld.def(s1, scc), 8695 Operand::c32(ctx->program->wave_size - 1u), flbit); 8696 emit_wqm(bld, last, get_ssa_temp(ctx, &instr->dest.ssa)); 8697 break; 8698 } 8699 case nir_intrinsic_elect: { 8700 /* p_elect is lowered in aco_insert_exec_mask. 8701 * Use exec as an operand so value numbering and the pre-RA optimizer won't recognize 8702 * two p_elect with different exec masks as the same. 8703 */ 8704 Temp elected = bld.pseudo(aco_opcode::p_elect, bld.def(bld.lm), Operand(exec, bld.lm)); 8705 emit_wqm(bld, elected, get_ssa_temp(ctx, &instr->dest.ssa)); 8706 ctx->block->kind |= block_kind_needs_lowering; 8707 break; 8708 } 8709 case nir_intrinsic_shader_clock: { 8710 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8711 if (nir_intrinsic_memory_scope(instr) == NIR_SCOPE_SUBGROUP && 8712 ctx->options->chip_class >= GFX10_3) { 8713 /* "((size - 1) << 11) | register" (SHADER_CYCLES is encoded as register 29) */ 8714 Temp clock = bld.sopk(aco_opcode::s_getreg_b32, bld.def(s1), ((20 - 1) << 11) | 29); 8715 bld.pseudo(aco_opcode::p_create_vector, Definition(dst), clock, Operand::zero()); 8716 } else { 8717 aco_opcode opcode = nir_intrinsic_memory_scope(instr) == NIR_SCOPE_DEVICE 8718 ? 
aco_opcode::s_memrealtime 8719 : aco_opcode::s_memtime; 8720 bld.smem(opcode, Definition(dst), memory_sync_info(0, semantic_volatile)); 8721 } 8722 emit_split_vector(ctx, dst, 2); 8723 break; 8724 } 8725 case nir_intrinsic_load_vertex_id_zero_base: { 8726 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8727 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.vertex_id)); 8728 break; 8729 } 8730 case nir_intrinsic_load_first_vertex: { 8731 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8732 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.base_vertex)); 8733 break; 8734 } 8735 case nir_intrinsic_load_base_instance: { 8736 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8737 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.start_instance)); 8738 break; 8739 } 8740 case nir_intrinsic_load_instance_id: { 8741 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8742 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.instance_id)); 8743 break; 8744 } 8745 case nir_intrinsic_load_draw_id: { 8746 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8747 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.draw_id)); 8748 break; 8749 } 8750 case nir_intrinsic_load_invocation_id: { 8751 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8752 8753 if (ctx->shader->info.stage == MESA_SHADER_GEOMETRY) { 8754 if (ctx->options->chip_class >= GFX10) 8755 bld.vop2_e64(aco_opcode::v_and_b32, Definition(dst), Operand::c32(127u), 8756 get_arg(ctx, ctx->args->ac.gs_invocation_id)); 8757 else 8758 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_invocation_id)); 8759 } else if (ctx->shader->info.stage == MESA_SHADER_TESS_CTRL) { 8760 bld.vop3(aco_opcode::v_bfe_u32, Definition(dst), get_arg(ctx, ctx->args->ac.tcs_rel_ids), 8761 Operand::c32(8u), Operand::c32(5u)); 8762 } else { 8763 unreachable("Unsupported stage for load_invocation_id"); 8764 } 8765 8766 break; 8767 } 8768 case nir_intrinsic_load_primitive_id: { 8769 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8770 8771 switch (ctx->shader->info.stage) { 8772 case MESA_SHADER_GEOMETRY: 8773 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); 8774 break; 8775 case MESA_SHADER_TESS_CTRL: 8776 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tcs_patch_id)); 8777 break; 8778 case MESA_SHADER_TESS_EVAL: 8779 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.tes_patch_id)); 8780 break; 8781 default: 8782 if (ctx->stage.hw == HWStage::NGG && !ctx->stage.has(SWStage::GS)) { 8783 /* In case of NGG, the GS threads always have the primitive ID 8784 * even if there is no SW GS. 
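          * (With NGG, the VS or TES itself runs on the GS hardware stage, and the hardware
          * still passes the per-thread primitive ID in the gs_prim_id argument, so it can
          * be forwarded directly.)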
*/ 8785 bld.copy(Definition(dst), get_arg(ctx, ctx->args->ac.gs_prim_id)); 8786 break; 8787 } 8788 unreachable("Unimplemented shader stage for nir_intrinsic_load_primitive_id"); 8789 } 8790 8791 break; 8792 } 8793 case nir_intrinsic_load_patch_vertices_in: { 8794 assert(ctx->shader->info.stage == MESA_SHADER_TESS_CTRL || 8795 ctx->shader->info.stage == MESA_SHADER_TESS_EVAL); 8796 8797 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 8798 bld.copy(Definition(dst), Operand::c32(ctx->args->options->key.tcs.tess_input_vertices)); 8799 break; 8800 } 8801 case nir_intrinsic_emit_vertex_with_counter: { 8802 assert(ctx->stage.hw == HWStage::GS); 8803 visit_emit_vertex_with_counter(ctx, instr); 8804 break; 8805 } 8806 case nir_intrinsic_end_primitive_with_counter: { 8807 if (ctx->stage.hw != HWStage::NGG) { 8808 unsigned stream = nir_intrinsic_stream_id(instr); 8809 bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx->gs_wave_id), -1, 8810 sendmsg_gs(true, false, stream)); 8811 } 8812 break; 8813 } 8814 case nir_intrinsic_set_vertex_and_primitive_count: { 8815 assert(ctx->stage.hw == HWStage::GS); 8816 /* unused in the legacy pipeline, the HW keeps track of this for us */ 8817 break; 8818 } 8819 case nir_intrinsic_load_tess_rel_patch_id_amd: { 8820 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), get_tess_rel_patch_id(ctx)); 8821 break; 8822 } 8823 case nir_intrinsic_load_ring_tess_factors_amd: { 8824 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8825 ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_FACTOR * 16u)); 8826 break; 8827 } 8828 case nir_intrinsic_load_ring_tess_factors_offset_amd: { 8829 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8830 get_arg(ctx, ctx->args->ac.tcs_factor_offset)); 8831 break; 8832 } 8833 case nir_intrinsic_load_ring_tess_offchip_amd: { 8834 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8835 ctx->program->private_segment_buffer, Operand::c32(RING_HS_TESS_OFFCHIP * 16u)); 8836 break; 8837 } 8838 case nir_intrinsic_load_ring_tess_offchip_offset_amd: { 8839 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8840 get_arg(ctx, ctx->args->ac.tess_offchip_offset)); 8841 break; 8842 } 8843 case nir_intrinsic_load_ring_esgs_amd: { 8844 unsigned ring = ctx->stage.hw == HWStage::ES ? RING_ESGS_VS : RING_ESGS_GS; 8845 bld.smem(aco_opcode::s_load_dwordx4, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8846 ctx->program->private_segment_buffer, Operand::c32(ring * 16u)); 8847 break; 8848 } 8849 case nir_intrinsic_load_ring_es2gs_offset_amd: { 8850 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8851 get_arg(ctx, ctx->args->ac.es2gs_offset)); 8852 break; 8853 } 8854 case nir_intrinsic_load_gs_vertex_offset_amd: { 8855 /* GFX6-8 uses 6 separate args, while GFX9+ packs these into only 3 args. */ 8856 unsigned b = nir_intrinsic_base(instr); 8857 assert(b <= (ctx->program->chip_class >= GFX9 ? 2 : 5)); 8858 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8859 get_arg(ctx, ctx->args->ac.gs_vtx_offset[b])); 8860 break; 8861 } 8862 case nir_intrinsic_has_input_vertex_amd: 8863 case nir_intrinsic_has_input_primitive_amd: { 8864 assert(ctx->stage.hw == HWStage::NGG); 8865 unsigned i = instr->intrinsic == nir_intrinsic_has_input_vertex_amd ? 
0 : 1; 8866 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), merged_wave_info_to_mask(ctx, i)); 8867 break; 8868 } 8869 case nir_intrinsic_load_workgroup_num_input_vertices_amd: 8870 case nir_intrinsic_load_workgroup_num_input_primitives_amd: { 8871 assert(ctx->stage.hw == HWStage::NGG); 8872 unsigned pos = 8873 instr->intrinsic == nir_intrinsic_load_workgroup_num_input_vertices_amd ? 12 : 22; 8874 bld.sop2(aco_opcode::s_bfe_u32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8875 bld.def(s1, scc), get_arg(ctx, ctx->args->ac.gs_tg_info), 8876 Operand::c32(pos | (9u << 16u))); 8877 break; 8878 } 8879 case nir_intrinsic_load_initial_edgeflags_amd: { 8880 assert(ctx->stage.hw == HWStage::NGG); 8881 8882 Temp gs_invocation_id = get_arg(ctx, ctx->args->ac.gs_invocation_id); 8883 /* Get initial edgeflags for each vertex at bits 8, 9, 10 of gs_invocation_id. */ 8884 Temp flags = 8885 bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x700u), gs_invocation_id); 8886 /* Move the bits to their desired position: 8->9, 9->19, 10->29. */ 8887 flags = bld.vop2(aco_opcode::v_mul_u32_u24, bld.def(v1), Operand::c32(0x80402u), flags); 8888 /* Remove garbage bits that are a byproduct of the multiplication. */ 8889 bld.vop2(aco_opcode::v_and_b32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8890 Operand::c32(0x20080200), flags); 8891 break; 8892 } 8893 case nir_intrinsic_load_packed_passthrough_primitive_amd: { 8894 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8895 get_arg(ctx, ctx->args->ac.gs_vtx_offset[0])); 8896 break; 8897 } 8898 case nir_intrinsic_export_vertex_amd: { 8899 ctx->block->kind |= block_kind_export_end; 8900 create_vs_exports(ctx); 8901 break; 8902 } 8903 case nir_intrinsic_export_primitive_amd: { 8904 assert(ctx->stage.hw == HWStage::NGG); 8905 Temp prim_exp_arg = get_ssa_temp(ctx, instr->src[0].ssa); 8906 bld.exp(aco_opcode::exp, prim_exp_arg, Operand(v1), Operand(v1), Operand(v1), 8907 1 /* enabled mask */, V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, 8908 true /* done */, false /* valid mask */); 8909 break; 8910 } 8911 case nir_intrinsic_alloc_vertices_and_primitives_amd: { 8912 assert(ctx->stage.hw == HWStage::NGG); 8913 Temp num_vertices = get_ssa_temp(ctx, instr->src[0].ssa); 8914 Temp num_primitives = get_ssa_temp(ctx, instr->src[1].ssa); 8915 ngg_emit_sendmsg_gs_alloc_req(ctx, num_vertices, num_primitives); 8916 break; 8917 } 8918 case nir_intrinsic_gds_atomic_add_amd: { 8919 Temp store_val = get_ssa_temp(ctx, instr->src[0].ssa); 8920 Temp gds_addr = get_ssa_temp(ctx, instr->src[1].ssa); 8921 Temp m0_val = get_ssa_temp(ctx, instr->src[2].ssa); 8922 Operand m = bld.m0((Temp)bld.copy(bld.def(s1, m0), bld.as_uniform(m0_val))); 8923 bld.ds(aco_opcode::ds_add_u32, as_vgpr(ctx, gds_addr), as_vgpr(ctx, store_val), m, 0u, 0u, 8924 true); 8925 break; 8926 } 8927 case nir_intrinsic_load_shader_query_enabled_amd: { 8928 unsigned cmp_bit = 0; 8929 Temp shader_query_enabled = 8930 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), 8931 get_arg(ctx, ctx->args->ngg_gs_state), Operand::c32(cmp_bit)); 8932 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8933 bool_to_vector_condition(ctx, shader_query_enabled)); 8934 break; 8935 } 8936 case nir_intrinsic_load_cull_front_face_enabled_amd: 8937 case nir_intrinsic_load_cull_back_face_enabled_amd: 8938 case nir_intrinsic_load_cull_ccw_amd: 8939 case nir_intrinsic_load_cull_small_primitives_enabled_amd: { 8940 unsigned cmp_bit; 8941 if (instr->intrinsic == 
nir_intrinsic_load_cull_front_face_enabled_amd) 8942 cmp_bit = 0; 8943 else if (instr->intrinsic == nir_intrinsic_load_cull_back_face_enabled_amd) 8944 cmp_bit = 1; 8945 else if (instr->intrinsic == nir_intrinsic_load_cull_ccw_amd) 8946 cmp_bit = 2; 8947 else if (instr->intrinsic == nir_intrinsic_load_cull_small_primitives_enabled_amd) 8948 cmp_bit = 3; 8949 else 8950 unreachable("unimplemented culling intrinsic"); 8951 8952 Builder::Result enabled = 8953 bld.sopc(aco_opcode::s_bitcmp1_b32, bld.def(s1, scc), 8954 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(cmp_bit)); 8955 enabled.instr->definitions[0].setNoCSE(true); 8956 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8957 bool_to_vector_condition(ctx, enabled)); 8958 break; 8959 } 8960 case nir_intrinsic_load_sbt_amd: visit_load_sbt_amd(ctx, instr); break; 8961 case nir_intrinsic_bvh64_intersect_ray_amd: visit_bvh64_intersect_ray_amd(ctx, instr); break; 8962 case nir_intrinsic_load_cull_any_enabled_amd: { 8963 Builder::Result cull_any_enabled = 8964 bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), 8965 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(0xbu)); 8966 cull_any_enabled.instr->definitions[1].setNoCSE(true); 8967 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8968 bool_to_vector_condition(ctx, cull_any_enabled.def(1).getTemp())); 8969 break; 8970 } 8971 case nir_intrinsic_load_cull_small_prim_precision_amd: { 8972 /* Exponent is 8-bit signed int, move that into a signed 32-bit int. */ 8973 Temp exponent = bld.sop2(aco_opcode::s_ashr_i32, bld.def(s1), bld.def(s1, scc), 8974 get_arg(ctx, ctx->args->ngg_culling_settings), Operand::c32(24u)); 8975 /* small_prim_precision = 1.0 * 2^X */ 8976 bld.vop3(aco_opcode::v_ldexp_f32, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8977 Operand::c32(0x3f800000u), Operand(exponent)); 8978 break; 8979 } 8980 case nir_intrinsic_load_viewport_x_scale: { 8981 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8982 get_arg(ctx, ctx->args->ngg_viewport_scale[0])); 8983 break; 8984 } 8985 case nir_intrinsic_load_viewport_y_scale: { 8986 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8987 get_arg(ctx, ctx->args->ngg_viewport_scale[1])); 8988 break; 8989 } 8990 case nir_intrinsic_load_viewport_x_offset: { 8991 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8992 get_arg(ctx, ctx->args->ngg_viewport_translate[0])); 8993 break; 8994 } 8995 case nir_intrinsic_load_viewport_y_offset: { 8996 bld.copy(Definition(get_ssa_temp(ctx, &instr->dest.ssa)), 8997 get_arg(ctx, ctx->args->ngg_viewport_translate[1])); 8998 break; 8999 } 9000 case nir_intrinsic_overwrite_vs_arguments_amd: { 9001 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); 9002 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); 9003 break; 9004 } 9005 case nir_intrinsic_overwrite_tes_arguments_amd: { 9006 ctx->arg_temps[ctx->args->ac.tes_u.arg_index] = get_ssa_temp(ctx, instr->src[0].ssa); 9007 ctx->arg_temps[ctx->args->ac.tes_v.arg_index] = get_ssa_temp(ctx, instr->src[1].ssa); 9008 ctx->arg_temps[ctx->args->ac.tes_rel_patch_id.arg_index] = 9009 get_ssa_temp(ctx, instr->src[2].ssa); 9010 ctx->arg_temps[ctx->args->ac.tes_patch_id.arg_index] = get_ssa_temp(ctx, instr->src[3].ssa); 9011 break; 9012 } 9013 default: 9014 isel_err(&instr->instr, "Unimplemented intrinsic instr"); 9015 abort(); 9016 9017 break; 9018 } 9019} 9020 9021void 9022tex_fetch_ptrs(isel_context* ctx, 
nir_tex_instr* instr, Temp* res_ptr, Temp* samp_ptr, 9023 enum glsl_base_type* stype) 9024{ 9025 nir_deref_instr* texture_deref_instr = NULL; 9026 nir_deref_instr* sampler_deref_instr = NULL; 9027 int plane = -1; 9028 9029 for (unsigned i = 0; i < instr->num_srcs; i++) { 9030 switch (instr->src[i].src_type) { 9031 case nir_tex_src_texture_deref: 9032 texture_deref_instr = nir_src_as_deref(instr->src[i].src); 9033 break; 9034 case nir_tex_src_sampler_deref: 9035 sampler_deref_instr = nir_src_as_deref(instr->src[i].src); 9036 break; 9037 case nir_tex_src_plane: plane = nir_src_as_int(instr->src[i].src); break; 9038 default: break; 9039 } 9040 } 9041 9042 *stype = glsl_get_sampler_result_type(texture_deref_instr->type); 9043 9044 if (!sampler_deref_instr) 9045 sampler_deref_instr = texture_deref_instr; 9046 9047 if (plane >= 0) { 9048 assert(instr->sampler_dim != GLSL_SAMPLER_DIM_BUF); 9049 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, 9050 (aco_descriptor_type)(ACO_DESC_PLANE_0 + plane), instr, false); 9051 } else if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { 9052 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_BUFFER, instr, false); 9053 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { 9054 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_FMASK, instr, false); 9055 } else { 9056 *res_ptr = get_sampler_desc(ctx, texture_deref_instr, ACO_DESC_IMAGE, instr, false); 9057 } 9058 if (samp_ptr) { 9059 *samp_ptr = get_sampler_desc(ctx, sampler_deref_instr, ACO_DESC_SAMPLER, instr, false); 9060 9061 if (instr->sampler_dim < GLSL_SAMPLER_DIM_RECT && ctx->options->chip_class < GFX8) { 9062 /* fix sampler aniso on SI/CI: samp[0] = samp[0] & img[7] */ 9063 Builder bld(ctx->program, ctx->block); 9064 9065 /* to avoid unnecessary moves, we split and recombine sampler and image */ 9066 Temp img[8] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), 9067 bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; 9068 Temp samp[4] = {bld.tmp(s1), bld.tmp(s1), bld.tmp(s1), bld.tmp(s1)}; 9069 bld.pseudo(aco_opcode::p_split_vector, Definition(img[0]), Definition(img[1]), 9070 Definition(img[2]), Definition(img[3]), Definition(img[4]), Definition(img[5]), 9071 Definition(img[6]), Definition(img[7]), *res_ptr); 9072 bld.pseudo(aco_opcode::p_split_vector, Definition(samp[0]), Definition(samp[1]), 9073 Definition(samp[2]), Definition(samp[3]), *samp_ptr); 9074 9075 samp[0] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), samp[0], img[7]); 9076 *res_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s8), img[0], img[1], img[2], 9077 img[3], img[4], img[5], img[6], img[7]); 9078 *samp_ptr = bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), samp[0], samp[1], samp[2], 9079 samp[3]); 9080 } 9081 } 9082} 9083 9084void 9085build_cube_select(isel_context* ctx, Temp ma, Temp id, Temp deriv, Temp* out_ma, Temp* out_sc, 9086 Temp* out_tc) 9087{ 9088 Builder bld(ctx->program, ctx->block); 9089 9090 Temp deriv_x = emit_extract_vector(ctx, deriv, 0, v1); 9091 Temp deriv_y = emit_extract_vector(ctx, deriv, 1, v1); 9092 Temp deriv_z = emit_extract_vector(ctx, deriv, 2, v1); 9093 9094 Operand neg_one = Operand::c32(0xbf800000u); 9095 Operand one = Operand::c32(0x3f800000u); 9096 Operand two = Operand::c32(0x40000000u); 9097 Operand four = Operand::c32(0x40800000u); 9098 9099 Temp is_ma_positive = 9100 bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), Operand::zero(), ma); 9101 Temp sgn_ma = bld.vop2_e64(aco_opcode::v_cndmask_b32, bld.def(v1), 
neg_one, one, is_ma_positive);
   Temp neg_sgn_ma = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), Operand::zero(), sgn_ma);

   Temp is_ma_z = bld.vopc(aco_opcode::v_cmp_le_f32, bld.hint_vcc(bld.def(bld.lm)), four, id);
   Temp is_ma_y = bld.vopc(aco_opcode::v_cmp_le_f32, bld.def(bld.lm), two, id);
   is_ma_y = bld.sop2(Builder::s_andn2, bld.hint_vcc(bld.def(bld.lm)), bld.def(s1, scc),
                      is_ma_y, is_ma_z);
   Temp is_not_ma_x = bld.sop2(aco_opcode::s_or_b64, bld.hint_vcc(bld.def(bld.lm)),
                               bld.def(s1, scc), is_ma_z, is_ma_y);

   /* select sc */
   Temp tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_z, deriv_x, is_not_ma_x);
   Temp sgn = bld.vop2_e64(
      aco_opcode::v_cndmask_b32, bld.def(v1),
      bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_sgn_ma, sgn_ma, is_ma_z), one, is_ma_y);
   *out_sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);

   /* select tc */
   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_y, deriv_z, is_ma_y);
   sgn = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), neg_one, sgn_ma, is_ma_y);
   *out_tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tmp, sgn);

   /* select ma */
   tmp = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1),
                  bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), deriv_x, deriv_y, is_ma_y),
                  deriv_z, is_ma_z);
   tmp = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x7fffffffu), tmp);
   *out_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), two, tmp);
}

void
prepare_cube_coords(isel_context* ctx, std::vector<Temp>& coords, Temp* ddx, Temp* ddy,
                    bool is_deriv, bool is_array)
{
   Builder bld(ctx->program, ctx->block);
   Temp ma, tc, sc, id;
   aco_opcode madak =
      ctx->program->chip_class >= GFX10_3 ? aco_opcode::v_fmaak_f32 : aco_opcode::v_madak_f32;
   aco_opcode madmk =
      ctx->program->chip_class >= GFX10_3 ?
aco_opcode::v_fmamk_f32 : aco_opcode::v_madmk_f32; 9140 9141 if (is_array) { 9142 coords[3] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[3]); 9143 9144 /* see comment in ac_prepare_cube_coords() */ 9145 if (ctx->options->chip_class <= GFX8) 9146 coords[3] = bld.vop2(aco_opcode::v_max_f32, bld.def(v1), Operand::zero(), coords[3]); 9147 } 9148 9149 ma = bld.vop3(aco_opcode::v_cubema_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9150 9151 aco_ptr<VOP3_instruction> vop3a{ 9152 create_instruction<VOP3_instruction>(aco_opcode::v_rcp_f32, asVOP3(Format::VOP1), 1, 1)}; 9153 vop3a->operands[0] = Operand(ma); 9154 vop3a->abs[0] = true; 9155 Temp invma = bld.tmp(v1); 9156 vop3a->definitions[0] = Definition(invma); 9157 ctx->block->instructions.emplace_back(std::move(vop3a)); 9158 9159 sc = bld.vop3(aco_opcode::v_cubesc_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9160 if (!is_deriv) 9161 sc = bld.vop2(madak, bld.def(v1), sc, invma, Operand::c32(0x3fc00000u /*1.5*/)); 9162 9163 tc = bld.vop3(aco_opcode::v_cubetc_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9164 if (!is_deriv) 9165 tc = bld.vop2(madak, bld.def(v1), tc, invma, Operand::c32(0x3fc00000u /*1.5*/)); 9166 9167 id = bld.vop3(aco_opcode::v_cubeid_f32, bld.def(v1), coords[0], coords[1], coords[2]); 9168 9169 if (is_deriv) { 9170 sc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), sc, invma); 9171 tc = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), tc, invma); 9172 9173 for (unsigned i = 0; i < 2; i++) { 9174 /* see comment in ac_prepare_cube_coords() */ 9175 Temp deriv_ma; 9176 Temp deriv_sc, deriv_tc; 9177 build_cube_select(ctx, ma, id, i ? *ddy : *ddx, &deriv_ma, &deriv_sc, &deriv_tc); 9178 9179 deriv_ma = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, invma); 9180 9181 Temp x = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), 9182 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_sc, invma), 9183 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, sc)); 9184 Temp y = bld.vop2(aco_opcode::v_sub_f32, bld.def(v1), 9185 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_tc, invma), 9186 bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), deriv_ma, tc)); 9187 *(i ? ddy : ddx) = bld.pseudo(aco_opcode::p_create_vector, bld.def(v2), x, y); 9188 } 9189 9190 sc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), sc); 9191 tc = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::c32(0x3fc00000u /*1.5*/), tc); 9192 } 9193 9194 if (is_array) 9195 id = bld.vop2(madmk, bld.def(v1), coords[3], id, Operand::c32(0x41000000u /*8.0*/)); 9196 coords.resize(3); 9197 coords[0] = sc; 9198 coords[1] = tc; 9199 coords[2] = id; 9200} 9201 9202void 9203get_const_vec(nir_ssa_def* vec, nir_const_value* cv[4]) 9204{ 9205 if (vec->parent_instr->type != nir_instr_type_alu) 9206 return; 9207 nir_alu_instr* vec_instr = nir_instr_as_alu(vec->parent_instr); 9208 if (vec_instr->op != nir_op_vec(vec->num_components)) 9209 return; 9210 9211 for (unsigned i = 0; i < vec->num_components; i++) { 9212 cv[i] = 9213 vec_instr->src[i].swizzle[0] == 0 ? 
nir_src_as_const_value(vec_instr->src[i].src) : NULL; 9214 } 9215} 9216 9217void 9218visit_tex(isel_context* ctx, nir_tex_instr* instr) 9219{ 9220 assert(instr->op != nir_texop_txf_ms && instr->op != nir_texop_samples_identical); 9221 9222 Builder bld(ctx->program, ctx->block); 9223 bool has_bias = false, has_lod = false, level_zero = false, has_compare = false, 9224 has_offset = false, has_ddx = false, has_ddy = false, has_derivs = false, 9225 has_sample_index = false, has_clamped_lod = false; 9226 Temp resource, sampler, bias = Temp(), compare = Temp(), sample_index = Temp(), lod = Temp(), 9227 offset = Temp(), ddx = Temp(), ddy = Temp(), clamped_lod = Temp(); 9228 std::vector<Temp> coords; 9229 std::vector<Temp> derivs; 9230 nir_const_value* const_offset[4] = {NULL, NULL, NULL, NULL}; 9231 enum glsl_base_type stype; 9232 tex_fetch_ptrs(ctx, instr, &resource, &sampler, &stype); 9233 9234 bool tg4_integer_workarounds = ctx->options->chip_class <= GFX8 && instr->op == nir_texop_tg4 && 9235 (stype == GLSL_TYPE_UINT || stype == GLSL_TYPE_INT); 9236 bool tg4_integer_cube_workaround = 9237 tg4_integer_workarounds && instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE; 9238 9239 for (unsigned i = 0; i < instr->num_srcs; i++) { 9240 switch (instr->src[i].src_type) { 9241 case nir_tex_src_coord: { 9242 Temp coord = get_ssa_temp(ctx, instr->src[i].src.ssa); 9243 for (unsigned j = 0; j < coord.size(); j++) 9244 coords.emplace_back(emit_extract_vector(ctx, coord, j, v1)); 9245 break; 9246 } 9247 case nir_tex_src_bias: 9248 bias = get_ssa_temp(ctx, instr->src[i].src.ssa); 9249 has_bias = true; 9250 break; 9251 case nir_tex_src_lod: { 9252 if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) { 9253 level_zero = true; 9254 } else { 9255 lod = get_ssa_temp(ctx, instr->src[i].src.ssa); 9256 has_lod = true; 9257 } 9258 break; 9259 } 9260 case nir_tex_src_min_lod: 9261 clamped_lod = get_ssa_temp(ctx, instr->src[i].src.ssa); 9262 has_clamped_lod = true; 9263 break; 9264 case nir_tex_src_comparator: 9265 if (instr->is_shadow) { 9266 compare = get_ssa_temp(ctx, instr->src[i].src.ssa); 9267 has_compare = true; 9268 } 9269 break; 9270 case nir_tex_src_offset: 9271 offset = get_ssa_temp(ctx, instr->src[i].src.ssa); 9272 get_const_vec(instr->src[i].src.ssa, const_offset); 9273 has_offset = true; 9274 break; 9275 case nir_tex_src_ddx: 9276 ddx = get_ssa_temp(ctx, instr->src[i].src.ssa); 9277 has_ddx = true; 9278 break; 9279 case nir_tex_src_ddy: 9280 ddy = get_ssa_temp(ctx, instr->src[i].src.ssa); 9281 has_ddy = true; 9282 break; 9283 case nir_tex_src_ms_index: 9284 sample_index = get_ssa_temp(ctx, instr->src[i].src.ssa); 9285 has_sample_index = true; 9286 break; 9287 case nir_tex_src_texture_offset: 9288 case nir_tex_src_sampler_offset: 9289 default: break; 9290 } 9291 } 9292 9293 if (instr->op == nir_texop_txs && instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 9294 return get_buffer_size(ctx, resource, get_ssa_temp(ctx, &instr->dest.ssa)); 9295 9296 if (instr->op == nir_texop_texture_samples) { 9297 get_image_samples(ctx, Definition(get_ssa_temp(ctx, &instr->dest.ssa)), resource); 9298 return; 9299 } 9300 9301 if (has_offset && instr->op != nir_texop_txf) { 9302 aco_ptr<Instruction> tmp_instr; 9303 Temp acc, pack = Temp(); 9304 9305 uint32_t pack_const = 0; 9306 for (unsigned i = 0; i < offset.size(); i++) { 9307 if (!const_offset[i]) 9308 continue; 9309 pack_const |= (const_offset[i]->u32 & 0x3Fu) << (8u * i); 9310 } 9311 9312 if (offset.type() == RegType::sgpr) { 9313 for (unsigned i = 0; i 
< offset.size(); i++) {
            if (const_offset[i])
               continue;

            acc = emit_extract_vector(ctx, offset, i, s1);
            acc = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), acc,
                           Operand::c32(0x3Fu));

            if (i) {
               acc = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), acc,
                              Operand::c32(8u * i));
            }

            if (pack == Temp()) {
               pack = acc;
            } else {
               pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), pack, acc);
            }
         }

         if (pack_const && pack != Temp())
            pack = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc),
                            Operand::c32(pack_const), pack);
      } else {
         for (unsigned i = 0; i < offset.size(); i++) {
            if (const_offset[i])
               continue;

            acc = emit_extract_vector(ctx, offset, i, v1);
            acc = bld.vop2(aco_opcode::v_and_b32, bld.def(v1), Operand::c32(0x3Fu), acc);

            if (i) {
               acc = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(8u * i), acc);
            }

            if (pack == Temp()) {
               pack = acc;
            } else {
               pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), pack, acc);
            }
         }

         if (pack_const && pack != Temp())
            pack = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand::c32(pack_const), pack);
      }
      if (pack_const && pack == Temp())
         offset = bld.copy(bld.def(v1), Operand::c32(pack_const));
      else if (pack == Temp())
         has_offset = false;
      else
         offset = pack;
   }

   if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE && instr->coord_components)
      prepare_cube_coords(ctx, coords, &ddx, &ddy, instr->op == nir_texop_txd,
                          instr->is_array && instr->op != nir_texop_lod);

   /* pack derivatives */
   if (has_ddx || has_ddy) {
      if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D && ctx->options->chip_class == GFX9) {
         assert(has_ddx && has_ddy && ddx.size() == 1 && ddy.size() == 1);
         Temp zero = bld.copy(bld.def(v1), Operand::zero());
         derivs = {ddx, zero, ddy, zero};
      } else {
         for (unsigned i = 0; has_ddx && i < ddx.size(); i++)
            derivs.emplace_back(emit_extract_vector(ctx, ddx, i, v1));
         for (unsigned i = 0; has_ddy && i < ddy.size(); i++)
            derivs.emplace_back(emit_extract_vector(ctx, ddy, i, v1));
      }
      has_derivs = true;
   }

   if (instr->coord_components > 1 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
       instr->is_array && instr->op != nir_texop_txf)
      coords[1] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[1]);

   if (instr->coord_components > 2 &&
       (instr->sampler_dim == GLSL_SAMPLER_DIM_2D || instr->sampler_dim == GLSL_SAMPLER_DIM_MS ||
        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS ||
        instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS) &&
       instr->is_array && instr->op != nir_texop_txf && instr->op != nir_texop_fragment_fetch_amd &&
       instr->op != nir_texop_fragment_mask_fetch_amd)
      coords[2] = bld.vop1(aco_opcode::v_rndne_f32, bld.def(v1), coords[2]);

   if (ctx->options->chip_class == GFX9 && instr->sampler_dim == GLSL_SAMPLER_DIM_1D &&
       instr->op != nir_texop_lod && instr->coord_components) {
      assert(coords.size() > 0 && coords.size() < 3);

      coords.insert(std::next(coords.begin()),
                    bld.copy(bld.def(v1), instr->op == nir_texop_txf ?
Operand::c32(0) 9403 : Operand::c32(0x3f000000))); 9404 } 9405 9406 bool da = should_declare_array(ctx, instr->sampler_dim, instr->is_array); 9407 9408 if (has_offset && instr->op == nir_texop_txf) { 9409 for (unsigned i = 0; i < std::min(offset.size(), instr->coord_components); i++) { 9410 Temp off = emit_extract_vector(ctx, offset, i, v1); 9411 coords[i] = bld.vadd32(bld.def(v1), coords[i], off); 9412 } 9413 has_offset = false; 9414 } 9415 9416 /* Build tex instruction */ 9417 unsigned dmask = nir_ssa_def_components_read(&instr->dest.ssa) & 0xf; 9418 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) 9419 dmask = u_bit_consecutive(0, util_last_bit(dmask)); 9420 if (instr->is_sparse) 9421 dmask = MAX2(dmask, 1) | 0x10; 9422 unsigned dim = 9423 ctx->options->chip_class >= GFX10 && instr->sampler_dim != GLSL_SAMPLER_DIM_BUF 9424 ? ac_get_sampler_dim(ctx->options->chip_class, instr->sampler_dim, instr->is_array) 9425 : 0; 9426 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9427 Temp tmp_dst = dst; 9428 9429 /* gather4 selects the component by dmask and always returns vec4 (vec5 if sparse) */ 9430 if (instr->op == nir_texop_tg4) { 9431 assert(instr->dest.ssa.num_components == (4 + instr->is_sparse)); 9432 if (instr->is_shadow) 9433 dmask = 1; 9434 else 9435 dmask = 1 << instr->component; 9436 if (tg4_integer_cube_workaround || dst.type() == RegType::sgpr) 9437 tmp_dst = bld.tmp(instr->is_sparse ? v5 : v4); 9438 } else if (instr->op == nir_texop_fragment_mask_fetch_amd) { 9439 tmp_dst = bld.tmp(v1); 9440 } else if (util_bitcount(dmask) != instr->dest.ssa.num_components || 9441 dst.type() == RegType::sgpr) { 9442 tmp_dst = bld.tmp(RegClass(RegType::vgpr, util_bitcount(dmask))); 9443 } 9444 9445 if (instr->op == nir_texop_txs || instr->op == nir_texop_query_levels) { 9446 if (!has_lod) 9447 lod = bld.copy(bld.def(v1), Operand::zero()); 9448 9449 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(tmp_dst), 9450 resource, Operand(s4), std::vector<Temp>{lod}); 9451 if (ctx->options->chip_class == GFX9 && instr->op == nir_texop_txs && 9452 instr->sampler_dim == GLSL_SAMPLER_DIM_1D && instr->is_array) { 9453 tex->dmask = (dmask & 0x1) | ((dmask & 0x2) << 1); 9454 } else if (instr->op == nir_texop_query_levels) { 9455 tex->dmask = 1 << 3; 9456 } else { 9457 tex->dmask = dmask; 9458 } 9459 tex->da = da; 9460 tex->dim = dim; 9461 9462 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9463 return; 9464 } 9465 9466 Temp tg4_compare_cube_wa64 = Temp(); 9467 9468 if (tg4_integer_workarounds) { 9469 Temp tg4_lod = bld.copy(bld.def(v1), Operand::zero()); 9470 Temp size = bld.tmp(v2); 9471 MIMG_instruction* tex = emit_mimg(bld, aco_opcode::image_get_resinfo, Definition(size), 9472 resource, Operand(s4), std::vector<Temp>{tg4_lod}); 9473 tex->dim = dim; 9474 tex->dmask = 0x3; 9475 tex->da = da; 9476 emit_split_vector(ctx, size, size.size()); 9477 9478 Temp half_texel[2]; 9479 for (unsigned i = 0; i < 2; i++) { 9480 half_texel[i] = emit_extract_vector(ctx, size, i, v1); 9481 half_texel[i] = bld.vop1(aco_opcode::v_cvt_f32_i32, bld.def(v1), half_texel[i]); 9482 half_texel[i] = bld.vop1(aco_opcode::v_rcp_iflag_f32, bld.def(v1), half_texel[i]); 9483 half_texel[i] = bld.vop2(aco_opcode::v_mul_f32, bld.def(v1), 9484 Operand::c32(0xbf000000 /*-0.5*/), half_texel[i]); 9485 } 9486 9487 if (instr->sampler_dim == GLSL_SAMPLER_DIM_2D && !instr->is_array) { 9488 /* In vulkan, whether the sampler uses unnormalized 9489 * coordinates or not is a dynamic property of the 9490 * 
sampler. Hence, to figure out whether or not we 9491 * need to divide by the texture size, we need to test 9492 * the sampler at runtime. This tests the bit set by 9493 * radv_init_sampler(). 9494 */ 9495 unsigned bit_idx = ffs(S_008F30_FORCE_UNNORMALIZED(1)) - 1; 9496 Temp not_needed = 9497 bld.sopc(aco_opcode::s_bitcmp0_b32, bld.def(s1, scc), sampler, Operand::c32(bit_idx)); 9498 9499 not_needed = bool_to_vector_condition(ctx, not_needed); 9500 half_texel[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 9501 Operand::c32(0xbf000000 /*-0.5*/), half_texel[0], not_needed); 9502 half_texel[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 9503 Operand::c32(0xbf000000 /*-0.5*/), half_texel[1], not_needed); 9504 } 9505 9506 Temp new_coords[2] = {bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[0], half_texel[0]), 9507 bld.vop2(aco_opcode::v_add_f32, bld.def(v1), coords[1], half_texel[1])}; 9508 9509 if (tg4_integer_cube_workaround) { 9510 /* see comment in ac_nir_to_llvm.c's lower_gather4_integer() */ 9511 Temp* const desc = (Temp*)alloca(resource.size() * sizeof(Temp)); 9512 aco_ptr<Instruction> split{create_instruction<Pseudo_instruction>( 9513 aco_opcode::p_split_vector, Format::PSEUDO, 1, resource.size())}; 9514 split->operands[0] = Operand(resource); 9515 for (unsigned i = 0; i < resource.size(); i++) { 9516 desc[i] = bld.tmp(s1); 9517 split->definitions[i] = Definition(desc[i]); 9518 } 9519 ctx->block->instructions.emplace_back(std::move(split)); 9520 9521 Temp dfmt = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), desc[1], 9522 Operand::c32(20u | (6u << 16))); 9523 Temp compare_cube_wa = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), dfmt, 9524 Operand::c32(V_008F14_IMG_DATA_FORMAT_8_8_8_8)); 9525 9526 Temp nfmt; 9527 if (stype == GLSL_TYPE_UINT) { 9528 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), 9529 Operand::c32(V_008F14_IMG_NUM_FORMAT_USCALED), 9530 Operand::c32(V_008F14_IMG_NUM_FORMAT_UINT), bld.scc(compare_cube_wa)); 9531 } else { 9532 nfmt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), 9533 Operand::c32(V_008F14_IMG_NUM_FORMAT_SSCALED), 9534 Operand::c32(V_008F14_IMG_NUM_FORMAT_SINT), bld.scc(compare_cube_wa)); 9535 } 9536 tg4_compare_cube_wa64 = bld.tmp(bld.lm); 9537 bool_to_vector_condition(ctx, compare_cube_wa, tg4_compare_cube_wa64); 9538 9539 nfmt = bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), nfmt, 9540 Operand::c32(26u)); 9541 9542 desc[1] = bld.sop2(aco_opcode::s_and_b32, bld.def(s1), bld.def(s1, scc), desc[1], 9543 Operand::c32(C_008F14_NUM_FORMAT)); 9544 desc[1] = bld.sop2(aco_opcode::s_or_b32, bld.def(s1), bld.def(s1, scc), desc[1], nfmt); 9545 9546 aco_ptr<Instruction> vec{create_instruction<Pseudo_instruction>( 9547 aco_opcode::p_create_vector, Format::PSEUDO, resource.size(), 1)}; 9548 for (unsigned i = 0; i < resource.size(); i++) 9549 vec->operands[i] = Operand(desc[i]); 9550 resource = bld.tmp(resource.regClass()); 9551 vec->definitions[0] = Definition(resource); 9552 ctx->block->instructions.emplace_back(std::move(vec)); 9553 9554 new_coords[0] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[0], coords[0], 9555 tg4_compare_cube_wa64); 9556 new_coords[1] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), new_coords[1], coords[1], 9557 tg4_compare_cube_wa64); 9558 } 9559 coords[0] = new_coords[0]; 9560 coords[1] = new_coords[1]; 9561 } 9562 9563 if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) { 9564 // FIXME: if (ctx->abi->gfx9_stride_size_workaround) return 9565 // 
ac_build_buffer_load_format_gfx9_safe() 9566 9567 assert(coords.size() == 1); 9568 aco_opcode op; 9569 switch (util_last_bit(dmask & 0xf)) { 9570 case 1: op = aco_opcode::buffer_load_format_x; break; 9571 case 2: op = aco_opcode::buffer_load_format_xy; break; 9572 case 3: op = aco_opcode::buffer_load_format_xyz; break; 9573 case 4: op = aco_opcode::buffer_load_format_xyzw; break; 9574 default: unreachable("Tex instruction loads more than 4 components."); 9575 } 9576 9577 aco_ptr<MUBUF_instruction> mubuf{ 9578 create_instruction<MUBUF_instruction>(op, Format::MUBUF, 3 + instr->is_sparse, 1)}; 9579 mubuf->operands[0] = Operand(resource); 9580 mubuf->operands[1] = Operand(coords[0]); 9581 mubuf->operands[2] = Operand::c32(0); 9582 mubuf->definitions[0] = Definition(tmp_dst); 9583 mubuf->idxen = true; 9584 mubuf->tfe = instr->is_sparse; 9585 if (mubuf->tfe) 9586 mubuf->operands[3] = emit_tfe_init(bld, tmp_dst); 9587 ctx->block->instructions.emplace_back(std::move(mubuf)); 9588 9589 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9590 return; 9591 } 9592 9593 /* gather MIMG address components */ 9594 std::vector<Temp> args; 9595 unsigned wqm_mask = 0; 9596 if (has_offset) { 9597 wqm_mask |= u_bit_consecutive(args.size(), 1); 9598 args.emplace_back(offset); 9599 } 9600 if (has_bias) 9601 args.emplace_back(bias); 9602 if (has_compare) 9603 args.emplace_back(compare); 9604 if (has_derivs) 9605 args.insert(args.end(), derivs.begin(), derivs.end()); 9606 9607 wqm_mask |= u_bit_consecutive(args.size(), coords.size()); 9608 args.insert(args.end(), coords.begin(), coords.end()); 9609 9610 if (has_sample_index) 9611 args.emplace_back(sample_index); 9612 if (has_lod) 9613 args.emplace_back(lod); 9614 if (has_clamped_lod) 9615 args.emplace_back(clamped_lod); 9616 9617 if (instr->op == nir_texop_txf || instr->op == nir_texop_fragment_fetch_amd || 9618 instr->op == nir_texop_fragment_mask_fetch_amd) { 9619 aco_opcode op = level_zero || instr->sampler_dim == GLSL_SAMPLER_DIM_MS || 9620 instr->sampler_dim == GLSL_SAMPLER_DIM_SUBPASS_MS 9621 ? aco_opcode::image_load 9622 : aco_opcode::image_load_mip; 9623 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); 9624 MIMG_instruction* tex = 9625 emit_mimg(bld, op, Definition(tmp_dst), resource, Operand(s4), args, 0, vdata); 9626 if (instr->op == nir_texop_fragment_mask_fetch_amd) 9627 tex->dim = da ? ac_image_2darray : ac_image_2d; 9628 else 9629 tex->dim = dim; 9630 tex->dmask = dmask & 0xf; 9631 tex->unrm = true; 9632 tex->da = da; 9633 tex->tfe = instr->is_sparse; 9634 9635 if (instr->op == nir_texop_fragment_mask_fetch_amd) { 9636 /* Use 0x76543210 if the image doesn't have FMASK. 
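          * 0x76543210 is the identity FMASK value: nibble i holds i, so sample i maps to
          * fragment i, which matches an uncompressed (FMASK-less) MSAA surface. The check
          * below assumes that a missing FMASK is bound as a null descriptor, i.e. that
          * word 1 of the descriptor reads as zero.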
*/ 9637 assert(dmask == 1 && dst.bytes() == 4); 9638 assert(dst.id() != tmp_dst.id()); 9639 9640 if (dst.regClass() == s1) { 9641 Temp is_not_null = bld.sopc(aco_opcode::s_cmp_lg_u32, bld.def(s1, scc), Operand::zero(), 9642 emit_extract_vector(ctx, resource, 1, s1)); 9643 bld.sop2(aco_opcode::s_cselect_b32, Definition(dst), 9644 bld.as_uniform(tmp_dst), Operand::c32(0x76543210), 9645 bld.scc(is_not_null)); 9646 } else { 9647 Temp is_not_null = bld.tmp(bld.lm); 9648 bld.vopc_e64(aco_opcode::v_cmp_lg_u32, Definition(is_not_null), Operand::zero(), 9649 emit_extract_vector(ctx, resource, 1, s1)) 9650 .def(0) 9651 .setHint(vcc); 9652 bld.vop2(aco_opcode::v_cndmask_b32, Definition(dst), 9653 bld.copy(bld.def(v1), Operand::c32(0x76543210)), tmp_dst, is_not_null); 9654 } 9655 } else { 9656 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, dmask); 9657 } 9658 return; 9659 } 9660 9661 // TODO: would be better to do this by adding offsets, but needs the opcodes ordered. 9662 aco_opcode opcode = aco_opcode::image_sample; 9663 if (has_offset) { /* image_sample_*_o */ 9664 if (has_clamped_lod) { 9665 if (has_compare) { 9666 opcode = aco_opcode::image_sample_c_cl_o; 9667 if (has_derivs) 9668 opcode = aco_opcode::image_sample_c_d_cl_o; 9669 if (has_bias) 9670 opcode = aco_opcode::image_sample_c_b_cl_o; 9671 } else { 9672 opcode = aco_opcode::image_sample_cl_o; 9673 if (has_derivs) 9674 opcode = aco_opcode::image_sample_d_cl_o; 9675 if (has_bias) 9676 opcode = aco_opcode::image_sample_b_cl_o; 9677 } 9678 } else if (has_compare) { 9679 opcode = aco_opcode::image_sample_c_o; 9680 if (has_derivs) 9681 opcode = aco_opcode::image_sample_c_d_o; 9682 if (has_bias) 9683 opcode = aco_opcode::image_sample_c_b_o; 9684 if (level_zero) 9685 opcode = aco_opcode::image_sample_c_lz_o; 9686 if (has_lod) 9687 opcode = aco_opcode::image_sample_c_l_o; 9688 } else { 9689 opcode = aco_opcode::image_sample_o; 9690 if (has_derivs) 9691 opcode = aco_opcode::image_sample_d_o; 9692 if (has_bias) 9693 opcode = aco_opcode::image_sample_b_o; 9694 if (level_zero) 9695 opcode = aco_opcode::image_sample_lz_o; 9696 if (has_lod) 9697 opcode = aco_opcode::image_sample_l_o; 9698 } 9699 } else if (has_clamped_lod) { /* image_sample_*_cl */ 9700 if (has_compare) { 9701 opcode = aco_opcode::image_sample_c_cl; 9702 if (has_derivs) 9703 opcode = aco_opcode::image_sample_c_d_cl; 9704 if (has_bias) 9705 opcode = aco_opcode::image_sample_c_b_cl; 9706 } else { 9707 opcode = aco_opcode::image_sample_cl; 9708 if (has_derivs) 9709 opcode = aco_opcode::image_sample_d_cl; 9710 if (has_bias) 9711 opcode = aco_opcode::image_sample_b_cl; 9712 } 9713 } else { /* no offset */ 9714 if (has_compare) { 9715 opcode = aco_opcode::image_sample_c; 9716 if (has_derivs) 9717 opcode = aco_opcode::image_sample_c_d; 9718 if (has_bias) 9719 opcode = aco_opcode::image_sample_c_b; 9720 if (level_zero) 9721 opcode = aco_opcode::image_sample_c_lz; 9722 if (has_lod) 9723 opcode = aco_opcode::image_sample_c_l; 9724 } else { 9725 opcode = aco_opcode::image_sample; 9726 if (has_derivs) 9727 opcode = aco_opcode::image_sample_d; 9728 if (has_bias) 9729 opcode = aco_opcode::image_sample_b; 9730 if (level_zero) 9731 opcode = aco_opcode::image_sample_lz; 9732 if (has_lod) 9733 opcode = aco_opcode::image_sample_l; 9734 } 9735 } 9736 9737 if (instr->op == nir_texop_tg4) { 9738 if (has_offset) { /* image_gather4_*_o */ 9739 if (has_compare) { 9740 opcode = aco_opcode::image_gather4_c_lz_o; 9741 if (has_lod) 9742 opcode = aco_opcode::image_gather4_c_l_o; 9743 if (has_bias) 
9744 opcode = aco_opcode::image_gather4_c_b_o; 9745 } else { 9746 opcode = aco_opcode::image_gather4_lz_o; 9747 if (has_lod) 9748 opcode = aco_opcode::image_gather4_l_o; 9749 if (has_bias) 9750 opcode = aco_opcode::image_gather4_b_o; 9751 } 9752 } else { 9753 if (has_compare) { 9754 opcode = aco_opcode::image_gather4_c_lz; 9755 if (has_lod) 9756 opcode = aco_opcode::image_gather4_c_l; 9757 if (has_bias) 9758 opcode = aco_opcode::image_gather4_c_b; 9759 } else { 9760 opcode = aco_opcode::image_gather4_lz; 9761 if (has_lod) 9762 opcode = aco_opcode::image_gather4_l; 9763 if (has_bias) 9764 opcode = aco_opcode::image_gather4_b; 9765 } 9766 } 9767 } else if (instr->op == nir_texop_lod) { 9768 opcode = aco_opcode::image_get_lod; 9769 } 9770 9771 bool implicit_derivs = bld.program->stage == fragment_fs && !has_derivs && !has_lod && 9772 !level_zero && instr->sampler_dim != GLSL_SAMPLER_DIM_MS && 9773 instr->sampler_dim != GLSL_SAMPLER_DIM_SUBPASS_MS; 9774 9775 Operand vdata = instr->is_sparse ? emit_tfe_init(bld, tmp_dst) : Operand(v1); 9776 MIMG_instruction* tex = emit_mimg(bld, opcode, Definition(tmp_dst), resource, Operand(sampler), 9777 args, implicit_derivs ? wqm_mask : 0, vdata); 9778 tex->dim = dim; 9779 tex->dmask = dmask & 0xf; 9780 tex->da = da; 9781 tex->tfe = instr->is_sparse; 9782 9783 if (tg4_integer_cube_workaround) { 9784 assert(tmp_dst.id() != dst.id()); 9785 assert(tmp_dst.size() == dst.size()); 9786 9787 emit_split_vector(ctx, tmp_dst, tmp_dst.size()); 9788 Temp val[4]; 9789 for (unsigned i = 0; i < 4; i++) { 9790 val[i] = emit_extract_vector(ctx, tmp_dst, i, v1); 9791 Temp cvt_val; 9792 if (stype == GLSL_TYPE_UINT) 9793 cvt_val = bld.vop1(aco_opcode::v_cvt_u32_f32, bld.def(v1), val[i]); 9794 else 9795 cvt_val = bld.vop1(aco_opcode::v_cvt_i32_f32, bld.def(v1), val[i]); 9796 val[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), val[i], cvt_val, 9797 tg4_compare_cube_wa64); 9798 } 9799 9800 Temp tmp = dst.regClass() == tmp_dst.regClass() ? dst : bld.tmp(tmp_dst.regClass()); 9801 if (instr->is_sparse) 9802 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], 9803 val[3], emit_extract_vector(ctx, tmp_dst, 4, v1)); 9804 else 9805 tmp_dst = bld.pseudo(aco_opcode::p_create_vector, Definition(tmp), val[0], val[1], val[2], 9806 val[3]); 9807 } 9808 unsigned mask = instr->op == nir_texop_tg4 ? (instr->is_sparse ? 0x1F : 0xF) : dmask; 9809 expand_vector(ctx, tmp_dst, dst, instr->dest.ssa.num_components, mask); 9810} 9811 9812Operand 9813get_phi_operand(isel_context* ctx, nir_ssa_def* ssa, RegClass rc, bool logical) 9814{ 9815 Temp tmp = get_ssa_temp(ctx, ssa); 9816 if (ssa->parent_instr->type == nir_instr_type_ssa_undef) { 9817 return Operand(rc); 9818 } else if (logical && ssa->bit_size == 1 && 9819 ssa->parent_instr->type == nir_instr_type_load_const) { 9820 if (ctx->program->wave_size == 64) 9821 return Operand::c64(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? UINT64_MAX 9822 : 0u); 9823 else 9824 return Operand::c32(nir_instr_as_load_const(ssa->parent_instr)->value[0].b ? 
UINT32_MAX 9825 : 0u); 9826 } else { 9827 return Operand(tmp); 9828 } 9829} 9830 9831void 9832visit_phi(isel_context* ctx, nir_phi_instr* instr) 9833{ 9834 aco_ptr<Pseudo_instruction> phi; 9835 Temp dst = get_ssa_temp(ctx, &instr->dest.ssa); 9836 assert(instr->dest.ssa.bit_size != 1 || dst.regClass() == ctx->program->lane_mask); 9837 9838 bool logical = !dst.is_linear() || nir_dest_is_divergent(instr->dest); 9839 logical |= (ctx->block->kind & block_kind_merge) != 0; 9840 aco_opcode opcode = logical ? aco_opcode::p_phi : aco_opcode::p_linear_phi; 9841 9842 /* we want a sorted list of sources, since the predecessor list is also sorted */ 9843 std::map<unsigned, nir_ssa_def*> phi_src; 9844 nir_foreach_phi_src (src, instr) 9845 phi_src[src->pred->index] = src->src.ssa; 9846 9847 std::vector<unsigned>& preds = logical ? ctx->block->logical_preds : ctx->block->linear_preds; 9848 unsigned num_operands = 0; 9849 Operand* const operands = (Operand*)alloca( 9850 (std::max(exec_list_length(&instr->srcs), (unsigned)preds.size()) + 1) * sizeof(Operand)); 9851 unsigned num_defined = 0; 9852 unsigned cur_pred_idx = 0; 9853 for (std::pair<unsigned, nir_ssa_def*> src : phi_src) { 9854 if (cur_pred_idx < preds.size()) { 9855 /* handle missing preds (IF merges with discard/break) and extra preds 9856 * (loop exit with discard) */ 9857 unsigned block = ctx->cf_info.nir_to_aco[src.first]; 9858 unsigned skipped = 0; 9859 while (cur_pred_idx + skipped < preds.size() && preds[cur_pred_idx + skipped] != block) 9860 skipped++; 9861 if (cur_pred_idx + skipped < preds.size()) { 9862 for (unsigned i = 0; i < skipped; i++) 9863 operands[num_operands++] = Operand(dst.regClass()); 9864 cur_pred_idx += skipped; 9865 } else { 9866 continue; 9867 } 9868 } 9869 /* Handle missing predecessors at the end. This shouldn't happen with loop 9870 * headers and we can't ignore these sources for loop header phis. */ 9871 if (!(ctx->block->kind & block_kind_loop_header) && cur_pred_idx >= preds.size()) 9872 continue; 9873 cur_pred_idx++; 9874 Operand op = get_phi_operand(ctx, src.second, dst.regClass(), logical); 9875 operands[num_operands++] = op; 9876 num_defined += !op.isUndefined(); 9877 } 9878 /* handle block_kind_continue_or_break at loop exit blocks */ 9879 while (cur_pred_idx++ < preds.size()) 9880 operands[num_operands++] = Operand(dst.regClass()); 9881 9882 /* If the loop ends with a break, still add a linear continue edge in case 9883 * that break is divergent or continue_or_break is used. We'll either remove 9884 * this operand later in visit_loop() if it's not necessary or replace the 9885 * undef with something correct. 
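    * (The extra undef operand below keeps the phi's operand count in line with the linear
    * continue predecessor that the loop header ends up with in that case, even though NIR
    * itself only has the break edge.)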
*/ 9886 if (!logical && ctx->block->kind & block_kind_loop_header) { 9887 nir_loop* loop = nir_cf_node_as_loop(instr->instr.block->cf_node.parent); 9888 nir_block* last = nir_loop_last_block(loop); 9889 if (last->successors[0] != instr->instr.block) 9890 operands[num_operands++] = Operand(RegClass()); 9891 } 9892 9893 /* we can use a linear phi in some cases if one src is undef */ 9894 if (dst.is_linear() && ctx->block->kind & block_kind_merge && num_defined == 1) { 9895 phi.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, 9896 num_operands, 1)); 9897 9898 Block* linear_else = &ctx->program->blocks[ctx->block->linear_preds[1]]; 9899 Block* invert = &ctx->program->blocks[linear_else->linear_preds[0]]; 9900 assert(invert->kind & block_kind_invert); 9901 9902 unsigned then_block = invert->linear_preds[0]; 9903 9904 Block* insert_block = NULL; 9905 for (unsigned i = 0; i < num_operands; i++) { 9906 Operand op = operands[i]; 9907 if (op.isUndefined()) 9908 continue; 9909 insert_block = ctx->block->logical_preds[i] == then_block ? invert : ctx->block; 9910 phi->operands[0] = op; 9911 break; 9912 } 9913 assert(insert_block); /* should be handled by the "num_defined == 0" case above */ 9914 phi->operands[1] = Operand(dst.regClass()); 9915 phi->definitions[0] = Definition(dst); 9916 insert_block->instructions.emplace(insert_block->instructions.begin(), std::move(phi)); 9917 return; 9918 } 9919 9920 phi.reset(create_instruction<Pseudo_instruction>(opcode, Format::PSEUDO, num_operands, 1)); 9921 for (unsigned i = 0; i < num_operands; i++) 9922 phi->operands[i] = operands[i]; 9923 phi->definitions[0] = Definition(dst); 9924 ctx->block->instructions.emplace(ctx->block->instructions.begin(), std::move(phi)); 9925} 9926 9927void 9928visit_undef(isel_context* ctx, nir_ssa_undef_instr* instr) 9929{ 9930 Temp dst = get_ssa_temp(ctx, &instr->def); 9931 9932 assert(dst.type() == RegType::sgpr); 9933 9934 if (dst.size() == 1) { 9935 Builder(ctx->program, ctx->block).copy(Definition(dst), Operand::zero()); 9936 } else { 9937 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 9938 aco_opcode::p_create_vector, Format::PSEUDO, dst.size(), 1)}; 9939 for (unsigned i = 0; i < dst.size(); i++) 9940 vec->operands[i] = Operand::zero(); 9941 vec->definitions[0] = Definition(dst); 9942 ctx->block->instructions.emplace_back(std::move(vec)); 9943 } 9944} 9945 9946void 9947begin_loop(isel_context* ctx, loop_context* lc) 9948{ 9949 // TODO: we might want to wrap the loop around a branch if exec_potentially_empty=true 9950 append_logical_end(ctx->block); 9951 ctx->block->kind |= block_kind_loop_preheader | block_kind_uniform; 9952 Builder bld(ctx->program, ctx->block); 9953 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 9954 unsigned loop_preheader_idx = ctx->block->index; 9955 9956 lc->loop_exit.kind |= (block_kind_loop_exit | (ctx->block->kind & block_kind_top_level)); 9957 9958 ctx->program->next_loop_depth++; 9959 9960 Block* loop_header = ctx->program->create_and_insert_block(); 9961 loop_header->kind |= block_kind_loop_header; 9962 add_edge(loop_preheader_idx, loop_header); 9963 ctx->block = loop_header; 9964 9965 append_logical_start(ctx->block); 9966 9967 lc->header_idx_old = std::exchange(ctx->cf_info.parent_loop.header_idx, loop_header->index); 9968 lc->exit_old = std::exchange(ctx->cf_info.parent_loop.exit, &lc->loop_exit); 9969 lc->divergent_cont_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_continue, false); 9970 
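   /* std::exchange() stores the new value and returns the previous one, so each *_old field
    * in loop_context snapshots the enclosing control-flow state here and end_loop() restores
    * it once the loop body has been visited. Minimal sketch of the pattern (hypothetical
    * names, not part of this file):
    *    saved = std::exchange(state, initial_value_for_nested_region);
    *    ...visit the nested region...
    *    state = saved;
    */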
lc->divergent_branch_old = std::exchange(ctx->cf_info.parent_loop.has_divergent_branch, false); 9971 lc->divergent_if_old = std::exchange(ctx->cf_info.parent_if.is_divergent, false); 9972} 9973 9974void 9975end_loop(isel_context* ctx, loop_context* lc) 9976{ 9977 // TODO: what if a loop ends with a unconditional or uniformly branched continue 9978 // and this branch is never taken? 9979 if (!ctx->cf_info.has_branch) { 9980 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; 9981 Builder bld(ctx->program, ctx->block); 9982 append_logical_end(ctx->block); 9983 9984 if (ctx->cf_info.exec_potentially_empty_discard || 9985 ctx->cf_info.exec_potentially_empty_break) { 9986 /* Discards can result in code running with an empty exec mask. 9987 * This would result in divergent breaks not ever being taken. As a 9988 * workaround, break the loop when the loop mask is empty instead of 9989 * always continuing. */ 9990 ctx->block->kind |= (block_kind_continue_or_break | block_kind_uniform); 9991 unsigned block_idx = ctx->block->index; 9992 9993 /* create helper blocks to avoid critical edges */ 9994 Block* break_block = ctx->program->create_and_insert_block(); 9995 break_block->kind = block_kind_uniform; 9996 bld.reset(break_block); 9997 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 9998 add_linear_edge(block_idx, break_block); 9999 add_linear_edge(break_block->index, &lc->loop_exit); 10000 10001 Block* continue_block = ctx->program->create_and_insert_block(); 10002 continue_block->kind = block_kind_uniform; 10003 bld.reset(continue_block); 10004 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10005 add_linear_edge(block_idx, continue_block); 10006 add_linear_edge(continue_block->index, &ctx->program->blocks[loop_header_idx]); 10007 10008 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10009 add_logical_edge(block_idx, &ctx->program->blocks[loop_header_idx]); 10010 ctx->block = &ctx->program->blocks[block_idx]; 10011 } else { 10012 ctx->block->kind |= (block_kind_continue | block_kind_uniform); 10013 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10014 add_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); 10015 else 10016 add_linear_edge(ctx->block->index, &ctx->program->blocks[loop_header_idx]); 10017 } 10018 10019 bld.reset(ctx->block); 10020 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10021 } 10022 10023 ctx->cf_info.has_branch = false; 10024 ctx->program->next_loop_depth--; 10025 10026 // TODO: if the loop has not a single exit, we must add one °° 10027 /* emit loop successor block */ 10028 ctx->block = ctx->program->insert_block(std::move(lc->loop_exit)); 10029 append_logical_start(ctx->block); 10030 10031#if 0 10032 // TODO: check if it is beneficial to not branch on continues 10033 /* trim linear phis in loop header */ 10034 for (auto&& instr : loop_entry->instructions) { 10035 if (instr->opcode == aco_opcode::p_linear_phi) { 10036 aco_ptr<Pseudo_instruction> new_phi{create_instruction<Pseudo_instruction>(aco_opcode::p_linear_phi, Format::PSEUDO, loop_entry->linear_predecessors.size(), 1)}; 10037 new_phi->definitions[0] = instr->definitions[0]; 10038 for (unsigned i = 0; i < new_phi->operands.size(); i++) 10039 new_phi->operands[i] = instr->operands[i]; 10040 /* check that the remaining operands are all the same */ 10041 for (unsigned i = new_phi->operands.size(); i < instr->operands.size(); i++) 10042 assert(instr->operands[i].tempId() == instr->operands.back().tempId()); 10043 instr.swap(new_phi); 10044 } else if 
(instr->opcode == aco_opcode::p_phi) { 10045 continue; 10046 } else { 10047 break; 10048 } 10049 } 10050#endif 10051 10052 ctx->cf_info.parent_loop.header_idx = lc->header_idx_old; 10053 ctx->cf_info.parent_loop.exit = lc->exit_old; 10054 ctx->cf_info.parent_loop.has_divergent_continue = lc->divergent_cont_old; 10055 ctx->cf_info.parent_loop.has_divergent_branch = lc->divergent_branch_old; 10056 ctx->cf_info.parent_if.is_divergent = lc->divergent_if_old; 10057 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) 10058 ctx->cf_info.exec_potentially_empty_discard = false; 10059} 10060 10061void 10062emit_loop_jump(isel_context* ctx, bool is_break) 10063{ 10064 Builder bld(ctx->program, ctx->block); 10065 Block* logical_target; 10066 append_logical_end(ctx->block); 10067 unsigned idx = ctx->block->index; 10068 10069 if (is_break) { 10070 logical_target = ctx->cf_info.parent_loop.exit; 10071 add_logical_edge(idx, logical_target); 10072 ctx->block->kind |= block_kind_break; 10073 10074 if (!ctx->cf_info.parent_if.is_divergent && 10075 !ctx->cf_info.parent_loop.has_divergent_continue) { 10076 /* uniform break - directly jump out of the loop */ 10077 ctx->block->kind |= block_kind_uniform; 10078 ctx->cf_info.has_branch = true; 10079 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10080 add_linear_edge(idx, logical_target); 10081 return; 10082 } 10083 ctx->cf_info.parent_loop.has_divergent_branch = true; 10084 } else { 10085 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; 10086 add_logical_edge(idx, logical_target); 10087 ctx->block->kind |= block_kind_continue; 10088 10089 if (!ctx->cf_info.parent_if.is_divergent) { 10090 /* uniform continue - directly jump to the loop header */ 10091 ctx->block->kind |= block_kind_uniform; 10092 ctx->cf_info.has_branch = true; 10093 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10094 add_linear_edge(idx, logical_target); 10095 return; 10096 } 10097 10098 /* for potential uniform breaks after this continue, 10099 we must ensure that they are handled correctly */ 10100 ctx->cf_info.parent_loop.has_divergent_continue = true; 10101 ctx->cf_info.parent_loop.has_divergent_branch = true; 10102 } 10103 10104 if (ctx->cf_info.parent_if.is_divergent && !ctx->cf_info.exec_potentially_empty_break) { 10105 ctx->cf_info.exec_potentially_empty_break = true; 10106 ctx->cf_info.exec_potentially_empty_break_depth = ctx->block->loop_nest_depth; 10107 } 10108 10109 /* remove critical edges from linear CFG */ 10110 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10111 Block* break_block = ctx->program->create_and_insert_block(); 10112 break_block->kind |= block_kind_uniform; 10113 add_linear_edge(idx, break_block); 10114 /* the loop_header pointer might be invalidated by this point */ 10115 if (!is_break) 10116 logical_target = &ctx->program->blocks[ctx->cf_info.parent_loop.header_idx]; 10117 add_linear_edge(break_block->index, logical_target); 10118 bld.reset(break_block); 10119 bld.branch(aco_opcode::p_branch, bld.hint_vcc(bld.def(s2))); 10120 10121 Block* continue_block = ctx->program->create_and_insert_block(); 10122 add_linear_edge(idx, continue_block); 10123 append_logical_start(continue_block); 10124 ctx->block = continue_block; 10125} 10126 10127void 10128emit_loop_break(isel_context* ctx) 10129{ 10130 emit_loop_jump(ctx, true); 10131} 10132 10133void 10134emit_loop_continue(isel_context* ctx) 10135{ 10136 emit_loop_jump(ctx, false); 10137} 10138 10139void 10140visit_jump(isel_context* 
ctx, nir_jump_instr* instr)
{
   /* visit_block() would usually do this but divergent jumps update ctx->block */
   ctx->cf_info.nir_to_aco[instr->instr.block->index] = ctx->block->index;

   switch (instr->type) {
   case nir_jump_break: emit_loop_break(ctx); break;
   case nir_jump_continue: emit_loop_continue(ctx); break;
   default: isel_err(&instr->instr, "Unknown NIR jump instr"); abort();
   }
}

void
visit_block(isel_context* ctx, nir_block* block)
{
   nir_foreach_instr (instr, block) {
      switch (instr->type) {
      case nir_instr_type_alu: visit_alu_instr(ctx, nir_instr_as_alu(instr)); break;
      case nir_instr_type_load_const: visit_load_const(ctx, nir_instr_as_load_const(instr)); break;
      case nir_instr_type_intrinsic: visit_intrinsic(ctx, nir_instr_as_intrinsic(instr)); break;
      case nir_instr_type_tex: visit_tex(ctx, nir_instr_as_tex(instr)); break;
      case nir_instr_type_phi: visit_phi(ctx, nir_instr_as_phi(instr)); break;
      case nir_instr_type_ssa_undef: visit_undef(ctx, nir_instr_as_ssa_undef(instr)); break;
      case nir_instr_type_deref: break;
      case nir_instr_type_jump: visit_jump(ctx, nir_instr_as_jump(instr)); break;
      default: isel_err(instr, "Unknown NIR instr type");
      }
   }

   if (!ctx->cf_info.parent_loop.has_divergent_branch)
      ctx->cf_info.nir_to_aco[block->index] = ctx->block->index;
}

static Operand
create_continue_phis(isel_context* ctx, unsigned first, unsigned last,
                     aco_ptr<Instruction>& header_phi, Operand* vals)
{
   vals[0] = Operand(header_phi->definitions[0].getTemp());
   RegClass rc = vals[0].regClass();

   unsigned loop_nest_depth = ctx->program->blocks[first].loop_nest_depth;

   unsigned next_pred = 1;

   for (unsigned idx = first + 1; idx <= last; idx++) {
      Block& block = ctx->program->blocks[idx];
      if (block.loop_nest_depth != loop_nest_depth) {
         vals[idx - first] = vals[idx - 1 - first];
         continue;
      }

      if ((block.kind & block_kind_continue) && block.index != last) {
         vals[idx - first] = header_phi->operands[next_pred];
         next_pred++;
         continue;
      }

      bool all_same = true;
      for (unsigned i = 1; all_same && (i < block.linear_preds.size()); i++)
         all_same = vals[block.linear_preds[i] - first] == vals[block.linear_preds[0] - first];

      Operand val;
      if (all_same) {
         val = vals[block.linear_preds[0] - first];
      } else {
         aco_ptr<Instruction> phi(create_instruction<Pseudo_instruction>(
            aco_opcode::p_linear_phi, Format::PSEUDO, block.linear_preds.size(), 1));
         for (unsigned i = 0; i < block.linear_preds.size(); i++)
            phi->operands[i] = vals[block.linear_preds[i] - first];
         val = Operand(ctx->program->allocateTmp(rc));
         phi->definitions[0] = Definition(val.getTemp());
         block.instructions.emplace(block.instructions.begin(), std::move(phi));
      }
      vals[idx - first] = val;
   }

   return vals[last - first];
}

static void begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond);
static void begin_uniform_if_else(isel_context* ctx, if_context* ic);
static void end_uniform_if(isel_context* ctx, if_context* ic);

static void
visit_loop(isel_context* ctx, nir_loop* loop)
{
   loop_context lc;
   begin_loop(ctx, &lc);

   /* NIR seems to allow this, and even though
the loop exit has no predecessors, SSA defs from the 10230 * loop header are live. Handle this without complicating the ACO IR by creating a dummy break. 10231 */ 10232 if (nir_cf_node_cf_tree_next(&loop->cf_node)->predecessors->entries == 0) { 10233 Builder bld(ctx->program, ctx->block); 10234 Temp cond = bld.copy(bld.def(s1, scc), Operand::zero()); 10235 if_context ic; 10236 begin_uniform_if_then(ctx, &ic, cond); 10237 emit_loop_break(ctx); 10238 begin_uniform_if_else(ctx, &ic); 10239 end_uniform_if(ctx, &ic); 10240 } 10241 10242 bool unreachable = visit_cf_list(ctx, &loop->body); 10243 10244 unsigned loop_header_idx = ctx->cf_info.parent_loop.header_idx; 10245 10246 /* Fixup phis in loop header from unreachable blocks. 10247 * has_branch/has_divergent_branch also indicates if the loop ends with a 10248 * break/continue instruction, but we don't emit those if unreachable=true */ 10249 if (unreachable) { 10250 assert(ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch); 10251 bool linear = ctx->cf_info.has_branch; 10252 bool logical = ctx->cf_info.has_branch || ctx->cf_info.parent_loop.has_divergent_branch; 10253 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) { 10254 if ((logical && instr->opcode == aco_opcode::p_phi) || 10255 (linear && instr->opcode == aco_opcode::p_linear_phi)) { 10256 /* the last operand should be the one that needs to be removed */ 10257 instr->operands.pop_back(); 10258 } else if (!is_phi(instr)) { 10259 break; 10260 } 10261 } 10262 } 10263 10264 /* Fixup linear phis in loop header from expecting a continue. Both this fixup 10265 * and the previous one shouldn't both happen at once because a break in the 10266 * merge block would get CSE'd */ 10267 if (nir_loop_last_block(loop)->successors[0] != nir_loop_first_block(loop)) { 10268 unsigned num_vals = ctx->cf_info.has_branch ? 1 : (ctx->block->index - loop_header_idx + 1); 10269 Operand* const vals = (Operand*)alloca(num_vals * sizeof(Operand)); 10270 for (aco_ptr<Instruction>& instr : ctx->program->blocks[loop_header_idx].instructions) { 10271 if (instr->opcode == aco_opcode::p_linear_phi) { 10272 if (ctx->cf_info.has_branch) 10273 instr->operands.pop_back(); 10274 else 10275 instr->operands.back() = 10276 create_continue_phis(ctx, loop_header_idx, ctx->block->index, instr, vals); 10277 } else if (!is_phi(instr)) { 10278 break; 10279 } 10280 } 10281 } 10282 10283 end_loop(ctx, &lc); 10284} 10285 10286static void 10287begin_divergent_if_then(isel_context* ctx, if_context* ic, Temp cond) 10288{ 10289 ic->cond = cond; 10290 10291 append_logical_end(ctx->block); 10292 ctx->block->kind |= block_kind_branch; 10293 10294 /* branch to linear then block */ 10295 assert(cond.regClass() == ctx->program->lane_mask); 10296 aco_ptr<Pseudo_branch_instruction> branch; 10297 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_cbranch_z, 10298 Format::PSEUDO_BRANCH, 1, 1)); 10299 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10300 branch->definitions[0].setHint(vcc); 10301 branch->operands[0] = Operand(cond); 10302 ctx->block->instructions.push_back(std::move(branch)); 10303 10304 ic->BB_if_idx = ctx->block->index; 10305 ic->BB_invert = Block(); 10306 /* Invert blocks are intentionally not marked as top level because they 10307 * are not part of the logical cfg. 
*/ 10308 ic->BB_invert.kind |= block_kind_invert; 10309 ic->BB_endif = Block(); 10310 ic->BB_endif.kind |= (block_kind_merge | (ctx->block->kind & block_kind_top_level)); 10311 10312 ic->exec_potentially_empty_discard_old = ctx->cf_info.exec_potentially_empty_discard; 10313 ic->exec_potentially_empty_break_old = ctx->cf_info.exec_potentially_empty_break; 10314 ic->exec_potentially_empty_break_depth_old = ctx->cf_info.exec_potentially_empty_break_depth; 10315 ic->divergent_old = ctx->cf_info.parent_if.is_divergent; 10316 ctx->cf_info.parent_if.is_divergent = true; 10317 10318 /* divergent branches use cbranch_execz */ 10319 ctx->cf_info.exec_potentially_empty_discard = false; 10320 ctx->cf_info.exec_potentially_empty_break = false; 10321 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10322 10323 /** emit logical then block */ 10324 ctx->program->next_divergent_if_logical_depth++; 10325 Block* BB_then_logical = ctx->program->create_and_insert_block(); 10326 add_edge(ic->BB_if_idx, BB_then_logical); 10327 ctx->block = BB_then_logical; 10328 append_logical_start(BB_then_logical); 10329} 10330 10331static void 10332begin_divergent_if_else(isel_context* ctx, if_context* ic) 10333{ 10334 Block* BB_then_logical = ctx->block; 10335 append_logical_end(BB_then_logical); 10336 /* branch from logical then block to invert block */ 10337 aco_ptr<Pseudo_branch_instruction> branch; 10338 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10339 Format::PSEUDO_BRANCH, 0, 1)); 10340 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10341 branch->definitions[0].setHint(vcc); 10342 BB_then_logical->instructions.emplace_back(std::move(branch)); 10343 add_linear_edge(BB_then_logical->index, &ic->BB_invert); 10344 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10345 add_logical_edge(BB_then_logical->index, &ic->BB_endif); 10346 BB_then_logical->kind |= block_kind_uniform; 10347 assert(!ctx->cf_info.has_branch); 10348 ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch; 10349 ctx->cf_info.parent_loop.has_divergent_branch = false; 10350 ctx->program->next_divergent_if_logical_depth--; 10351 10352 /** emit linear then block */ 10353 Block* BB_then_linear = ctx->program->create_and_insert_block(); 10354 BB_then_linear->kind |= block_kind_uniform; 10355 add_linear_edge(ic->BB_if_idx, BB_then_linear); 10356 /* branch from linear then block to invert block */ 10357 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10358 Format::PSEUDO_BRANCH, 0, 1)); 10359 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10360 branch->definitions[0].setHint(vcc); 10361 BB_then_linear->instructions.emplace_back(std::move(branch)); 10362 add_linear_edge(BB_then_linear->index, &ic->BB_invert); 10363 10364 /** emit invert merge block */ 10365 ctx->block = ctx->program->insert_block(std::move(ic->BB_invert)); 10366 ic->invert_idx = ctx->block->index; 10367 10368 /* branch to linear else block (skip else) */ 10369 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10370 Format::PSEUDO_BRANCH, 0, 1)); 10371 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10372 branch->definitions[0].setHint(vcc); 10373 ctx->block->instructions.push_back(std::move(branch)); 10374 10375 ic->exec_potentially_empty_discard_old |= ctx->cf_info.exec_potentially_empty_discard; 10376 ic->exec_potentially_empty_break_old |= ctx->cf_info.exec_potentially_empty_break; 10377 
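   /* Note: the then-side's exec_potentially_empty state is accumulated into
    * the if_context here so that end_divergent_if() can merge it back in;
    * the live state is then reset below before visiting the else-side. */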
ic->exec_potentially_empty_break_depth_old = std::min( 10378 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); 10379 /* divergent branches use cbranch_execz */ 10380 ctx->cf_info.exec_potentially_empty_discard = false; 10381 ctx->cf_info.exec_potentially_empty_break = false; 10382 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10383 10384 /** emit logical else block */ 10385 ctx->program->next_divergent_if_logical_depth++; 10386 Block* BB_else_logical = ctx->program->create_and_insert_block(); 10387 add_logical_edge(ic->BB_if_idx, BB_else_logical); 10388 add_linear_edge(ic->invert_idx, BB_else_logical); 10389 ctx->block = BB_else_logical; 10390 append_logical_start(BB_else_logical); 10391} 10392 10393static void 10394end_divergent_if(isel_context* ctx, if_context* ic) 10395{ 10396 Block* BB_else_logical = ctx->block; 10397 append_logical_end(BB_else_logical); 10398 10399 /* branch from logical else block to endif block */ 10400 aco_ptr<Pseudo_branch_instruction> branch; 10401 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10402 Format::PSEUDO_BRANCH, 0, 1)); 10403 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10404 branch->definitions[0].setHint(vcc); 10405 BB_else_logical->instructions.emplace_back(std::move(branch)); 10406 add_linear_edge(BB_else_logical->index, &ic->BB_endif); 10407 if (!ctx->cf_info.parent_loop.has_divergent_branch) 10408 add_logical_edge(BB_else_logical->index, &ic->BB_endif); 10409 BB_else_logical->kind |= block_kind_uniform; 10410 ctx->program->next_divergent_if_logical_depth--; 10411 10412 assert(!ctx->cf_info.has_branch); 10413 ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent; 10414 10415 /** emit linear else block */ 10416 Block* BB_else_linear = ctx->program->create_and_insert_block(); 10417 BB_else_linear->kind |= block_kind_uniform; 10418 add_linear_edge(ic->invert_idx, BB_else_linear); 10419 10420 /* branch from linear else block to endif block */ 10421 branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch, 10422 Format::PSEUDO_BRANCH, 0, 1)); 10423 branch->definitions[0] = Definition(ctx->program->allocateTmp(s2)); 10424 branch->definitions[0].setHint(vcc); 10425 BB_else_linear->instructions.emplace_back(std::move(branch)); 10426 add_linear_edge(BB_else_linear->index, &ic->BB_endif); 10427 10428 /** emit endif merge block */ 10429 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); 10430 append_logical_start(ctx->block); 10431 10432 ctx->cf_info.parent_if.is_divergent = ic->divergent_old; 10433 ctx->cf_info.exec_potentially_empty_discard |= ic->exec_potentially_empty_discard_old; 10434 ctx->cf_info.exec_potentially_empty_break |= ic->exec_potentially_empty_break_old; 10435 ctx->cf_info.exec_potentially_empty_break_depth = std::min( 10436 ic->exec_potentially_empty_break_depth_old, ctx->cf_info.exec_potentially_empty_break_depth); 10437 if (ctx->block->loop_nest_depth == ctx->cf_info.exec_potentially_empty_break_depth && 10438 !ctx->cf_info.parent_if.is_divergent) { 10439 ctx->cf_info.exec_potentially_empty_break = false; 10440 ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX; 10441 } 10442 /* uniform control flow never has an empty exec-mask */ 10443 if (!ctx->block->loop_nest_depth && !ctx->cf_info.parent_if.is_divergent) { 10444 ctx->cf_info.exec_potentially_empty_discard = false; 10445 ctx->cf_info.exec_potentially_empty_break = false; 10446 
      ctx->cf_info.exec_potentially_empty_break_depth = UINT16_MAX;
   }
}

static void
begin_uniform_if_then(isel_context* ctx, if_context* ic, Temp cond)
{
   assert(cond.regClass() == s1);

   append_logical_end(ctx->block);
   ctx->block->kind |= block_kind_uniform;

   aco_ptr<Pseudo_branch_instruction> branch;
   aco_opcode branch_opcode = aco_opcode::p_cbranch_z;
   branch.reset(
      create_instruction<Pseudo_branch_instruction>(branch_opcode, Format::PSEUDO_BRANCH, 1, 1));
   branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
   branch->definitions[0].setHint(vcc);
   branch->operands[0] = Operand(cond);
   branch->operands[0].setFixed(scc);
   ctx->block->instructions.emplace_back(std::move(branch));

   ic->BB_if_idx = ctx->block->index;
   ic->BB_endif = Block();
   ic->BB_endif.kind |= ctx->block->kind & block_kind_top_level;

   ctx->cf_info.has_branch = false;
   ctx->cf_info.parent_loop.has_divergent_branch = false;

   /** emit then block */
   ctx->program->next_uniform_if_depth++;
   Block* BB_then = ctx->program->create_and_insert_block();
   add_edge(ic->BB_if_idx, BB_then);
   append_logical_start(BB_then);
   ctx->block = BB_then;
}

static void
begin_uniform_if_else(isel_context* ctx, if_context* ic)
{
   Block* BB_then = ctx->block;

   ic->uniform_has_then_branch = ctx->cf_info.has_branch;
   ic->then_branch_divergent = ctx->cf_info.parent_loop.has_divergent_branch;

   if (!ic->uniform_has_then_branch) {
      append_logical_end(BB_then);
      /* branch from then block to endif block */
      aco_ptr<Pseudo_branch_instruction> branch;
      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                                 Format::PSEUDO_BRANCH, 0, 1));
      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
      branch->definitions[0].setHint(vcc);
      BB_then->instructions.emplace_back(std::move(branch));
      add_linear_edge(BB_then->index, &ic->BB_endif);
      if (!ic->then_branch_divergent)
         add_logical_edge(BB_then->index, &ic->BB_endif);
      BB_then->kind |= block_kind_uniform;
   }

   ctx->cf_info.has_branch = false;
   ctx->cf_info.parent_loop.has_divergent_branch = false;

   /** emit else block */
   Block* BB_else = ctx->program->create_and_insert_block();
   add_edge(ic->BB_if_idx, BB_else);
   append_logical_start(BB_else);
   ctx->block = BB_else;
}

static void
end_uniform_if(isel_context* ctx, if_context* ic)
{
   Block* BB_else = ctx->block;

   if (!ctx->cf_info.has_branch) {
      append_logical_end(BB_else);
      /* branch from else block to endif block */
      aco_ptr<Pseudo_branch_instruction> branch;
      branch.reset(create_instruction<Pseudo_branch_instruction>(aco_opcode::p_branch,
                                                                 Format::PSEUDO_BRANCH, 0, 1));
      branch->definitions[0] = Definition(ctx->program->allocateTmp(s2));
      branch->definitions[0].setHint(vcc);
      BB_else->instructions.emplace_back(std::move(branch));
      add_linear_edge(BB_else->index, &ic->BB_endif);
      if (!ctx->cf_info.parent_loop.has_divergent_branch)
         add_logical_edge(BB_else->index, &ic->BB_endif);
      BB_else->kind |= block_kind_uniform;
   }

   ctx->cf_info.has_branch &= ic->uniform_has_then_branch;
   ctx->cf_info.parent_loop.has_divergent_branch &= ic->then_branch_divergent;
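   /* Note: after the two &= updates above, has_branch/has_divergent_branch
    * stay set only if both the then-side and the else-side ended in a jump,
    * i.e. only if no path actually falls through to the endif block. */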
10538 10539 /** emit endif merge block */ 10540 ctx->program->next_uniform_if_depth--; 10541 if (!ctx->cf_info.has_branch) { 10542 ctx->block = ctx->program->insert_block(std::move(ic->BB_endif)); 10543 append_logical_start(ctx->block); 10544 } 10545} 10546 10547static bool 10548visit_if(isel_context* ctx, nir_if* if_stmt) 10549{ 10550 Temp cond = get_ssa_temp(ctx, if_stmt->condition.ssa); 10551 Builder bld(ctx->program, ctx->block); 10552 aco_ptr<Pseudo_branch_instruction> branch; 10553 if_context ic; 10554 10555 if (!nir_src_is_divergent(if_stmt->condition)) { /* uniform condition */ 10556 /** 10557 * Uniform conditionals are represented in the following way*) : 10558 * 10559 * The linear and logical CFG: 10560 * BB_IF 10561 * / \ 10562 * BB_THEN (logical) BB_ELSE (logical) 10563 * \ / 10564 * BB_ENDIF 10565 * 10566 * *) Exceptions may be due to break and continue statements within loops 10567 * If a break/continue happens within uniform control flow, it branches 10568 * to the loop exit/entry block. Otherwise, it branches to the next 10569 * merge block. 10570 **/ 10571 10572 assert(cond.regClass() == ctx->program->lane_mask); 10573 cond = bool_to_scalar_condition(ctx, cond); 10574 10575 begin_uniform_if_then(ctx, &ic, cond); 10576 visit_cf_list(ctx, &if_stmt->then_list); 10577 10578 begin_uniform_if_else(ctx, &ic); 10579 visit_cf_list(ctx, &if_stmt->else_list); 10580 10581 end_uniform_if(ctx, &ic); 10582 } else { /* non-uniform condition */ 10583 /** 10584 * To maintain a logical and linear CFG without critical edges, 10585 * non-uniform conditionals are represented in the following way*) : 10586 * 10587 * The linear CFG: 10588 * BB_IF 10589 * / \ 10590 * BB_THEN (logical) BB_THEN (linear) 10591 * \ / 10592 * BB_INVERT (linear) 10593 * / \ 10594 * BB_ELSE (logical) BB_ELSE (linear) 10595 * \ / 10596 * BB_ENDIF 10597 * 10598 * The logical CFG: 10599 * BB_IF 10600 * / \ 10601 * BB_THEN (logical) BB_ELSE (logical) 10602 * \ / 10603 * BB_ENDIF 10604 * 10605 * *) Exceptions may be due to break and continue statements within loops 10606 **/ 10607 10608 begin_divergent_if_then(ctx, &ic, cond); 10609 visit_cf_list(ctx, &if_stmt->then_list); 10610 10611 begin_divergent_if_else(ctx, &ic); 10612 visit_cf_list(ctx, &if_stmt->else_list); 10613 10614 end_divergent_if(ctx, &ic); 10615 } 10616 10617 return !ctx->cf_info.has_branch && !ctx->block->logical_preds.empty(); 10618} 10619 10620static bool 10621visit_cf_list(isel_context* ctx, struct exec_list* list) 10622{ 10623 foreach_list_typed (nir_cf_node, node, node, list) { 10624 switch (node->type) { 10625 case nir_cf_node_block: visit_block(ctx, nir_cf_node_as_block(node)); break; 10626 case nir_cf_node_if: 10627 if (!visit_if(ctx, nir_cf_node_as_if(node))) 10628 return true; 10629 break; 10630 case nir_cf_node_loop: visit_loop(ctx, nir_cf_node_as_loop(node)); break; 10631 default: unreachable("unimplemented cf list type"); 10632 } 10633 } 10634 return false; 10635} 10636 10637static void 10638export_vs_varying(isel_context* ctx, int slot, bool is_pos, int* next_pos) 10639{ 10640 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); 10641 10642 int offset = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) 10643 ? 
ctx->program->info->tes.outinfo.vs_output_param_offset[slot] 10644 : ctx->program->info->vs.outinfo.vs_output_param_offset[slot]; 10645 unsigned mask = ctx->outputs.mask[slot]; 10646 if (!is_pos && !mask) 10647 return; 10648 if (!is_pos && offset == AC_EXP_PARAM_UNDEFINED) 10649 return; 10650 aco_ptr<Export_instruction> exp{ 10651 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; 10652 exp->enabled_mask = mask; 10653 for (unsigned i = 0; i < 4; ++i) { 10654 if (mask & (1 << i)) 10655 exp->operands[i] = Operand(ctx->outputs.temps[slot * 4u + i]); 10656 else 10657 exp->operands[i] = Operand(v1); 10658 } 10659 /* GFX10 (Navi1x) skip POS0 exports if EXEC=0 and DONE=0, causing a hang. 10660 * Setting valid_mask=1 prevents it and has no other effect. 10661 */ 10662 exp->valid_mask = ctx->options->chip_class == GFX10 && is_pos && *next_pos == 0; 10663 exp->done = false; 10664 exp->compressed = false; 10665 if (is_pos) 10666 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; 10667 else 10668 exp->dest = V_008DFC_SQ_EXP_PARAM + offset; 10669 ctx->block->instructions.emplace_back(std::move(exp)); 10670} 10671 10672static void 10673export_vs_psiz_layer_viewport_vrs(isel_context* ctx, int* next_pos) 10674{ 10675 aco_ptr<Export_instruction> exp{ 10676 create_instruction<Export_instruction>(aco_opcode::exp, Format::EXP, 4, 0)}; 10677 exp->enabled_mask = 0; 10678 for (unsigned i = 0; i < 4; ++i) 10679 exp->operands[i] = Operand(v1); 10680 if (ctx->outputs.mask[VARYING_SLOT_PSIZ]) { 10681 exp->operands[0] = Operand(ctx->outputs.temps[VARYING_SLOT_PSIZ * 4u]); 10682 exp->enabled_mask |= 0x1; 10683 } 10684 if (ctx->outputs.mask[VARYING_SLOT_LAYER]) { 10685 exp->operands[2] = Operand(ctx->outputs.temps[VARYING_SLOT_LAYER * 4u]); 10686 exp->enabled_mask |= 0x4; 10687 } 10688 if (ctx->outputs.mask[VARYING_SLOT_VIEWPORT]) { 10689 if (ctx->options->chip_class < GFX9) { 10690 exp->operands[3] = Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u]); 10691 exp->enabled_mask |= 0x8; 10692 } else { 10693 Builder bld(ctx->program, ctx->block); 10694 10695 Temp out = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), 10696 Operand(ctx->outputs.temps[VARYING_SLOT_VIEWPORT * 4u])); 10697 if (exp->operands[2].isTemp()) 10698 out = bld.vop2(aco_opcode::v_or_b32, bld.def(v1), Operand(out), exp->operands[2]); 10699 10700 exp->operands[2] = Operand(out); 10701 exp->enabled_mask |= 0x4; 10702 } 10703 } 10704 if (ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_SHADING_RATE]) { 10705 exp->operands[1] = Operand(ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_SHADING_RATE * 4u]); 10706 exp->enabled_mask |= 0x2; 10707 } else if (ctx->options->force_vrs_rates) { 10708 /* Bits [2:3] = VRS rate X 10709 * Bits [4:5] = VRS rate Y 10710 * 10711 * The range is [-2, 1]. Values: 10712 * 1: 2x coarser shading rate in that direction. 10713 * 0: normal shading rate 10714 * -1: 2x finer shading rate (sample shading, not directional) 10715 * -2: 4x finer shading rate (sample shading, not directional) 10716 * 10717 * Sample shading can't go above 8 samples, so both numbers can't be -2 10718 * at the same time. 10719 */ 10720 Builder bld(ctx->program, ctx->block); 10721 Temp rates = bld.copy(bld.def(v1), Operand::c32((unsigned)ctx->options->force_vrs_rates)); 10722 10723 /* If Pos.W != 1 (typical for non-GUI elements), use 2x2 coarse shading. 
*/ 10724 Temp cond = bld.vopc(aco_opcode::v_cmp_neq_f32, bld.def(bld.lm), Operand::c32(0x3f800000u), 10725 Operand(ctx->outputs.temps[VARYING_SLOT_POS + 3])); 10726 rates = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), 10727 bld.copy(bld.def(v1), Operand::zero()), rates, cond); 10728 10729 exp->operands[1] = Operand(rates); 10730 exp->enabled_mask |= 0x2; 10731 } 10732 10733 exp->valid_mask = ctx->options->chip_class == GFX10 && *next_pos == 0; 10734 exp->done = false; 10735 exp->compressed = false; 10736 exp->dest = V_008DFC_SQ_EXP_POS + (*next_pos)++; 10737 ctx->block->instructions.emplace_back(std::move(exp)); 10738} 10739 10740static void 10741create_vs_exports(isel_context* ctx) 10742{ 10743 assert(ctx->stage.hw == HWStage::VS || ctx->stage.hw == HWStage::NGG); 10744 10745 const radv_vs_output_info* outinfo = (ctx->stage.has(SWStage::TES) && !ctx->stage.has(SWStage::GS)) 10746 ? &ctx->program->info->tes.outinfo 10747 : &ctx->program->info->vs.outinfo; 10748 10749 ctx->block->kind |= block_kind_export_end; 10750 10751 if (outinfo->export_prim_id && ctx->stage.hw != HWStage::NGG) { 10752 ctx->outputs.mask[VARYING_SLOT_PRIMITIVE_ID] |= 0x1; 10753 if (ctx->stage.has(SWStage::TES)) 10754 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = 10755 get_arg(ctx, ctx->args->ac.tes_patch_id); 10756 else 10757 ctx->outputs.temps[VARYING_SLOT_PRIMITIVE_ID * 4u] = 10758 get_arg(ctx, ctx->args->ac.vs_prim_id); 10759 } 10760 10761 if (ctx->options->key.has_multiview_view_index) { 10762 ctx->outputs.mask[VARYING_SLOT_LAYER] |= 0x1; 10763 ctx->outputs.temps[VARYING_SLOT_LAYER * 4u] = 10764 as_vgpr(ctx, get_arg(ctx, ctx->args->ac.view_index)); 10765 } 10766 10767 /* Hardware requires position data to always be exported, even if the 10768 * application did not write gl_Position. 
10769 */ 10770 ctx->outputs.mask[VARYING_SLOT_POS] = 0xf; 10771 10772 /* the order these position exports are created is important */ 10773 int next_pos = 0; 10774 export_vs_varying(ctx, VARYING_SLOT_POS, true, &next_pos); 10775 10776 bool writes_primitive_shading_rate = 10777 outinfo->writes_primitive_shading_rate || ctx->options->force_vrs_rates; 10778 if (outinfo->writes_pointsize || outinfo->writes_layer || outinfo->writes_viewport_index || 10779 writes_primitive_shading_rate) { 10780 export_vs_psiz_layer_viewport_vrs(ctx, &next_pos); 10781 } 10782 if (ctx->num_clip_distances + ctx->num_cull_distances > 0) 10783 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, true, &next_pos); 10784 if (ctx->num_clip_distances + ctx->num_cull_distances > 4) 10785 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, true, &next_pos); 10786 10787 if (ctx->export_clip_dists) { 10788 if (ctx->num_clip_distances + ctx->num_cull_distances > 0) 10789 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST0, false, &next_pos); 10790 if (ctx->num_clip_distances + ctx->num_cull_distances > 4) 10791 export_vs_varying(ctx, VARYING_SLOT_CLIP_DIST1, false, &next_pos); 10792 } 10793 10794 for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) { 10795 if (i < VARYING_SLOT_VAR0 && i != VARYING_SLOT_LAYER && i != VARYING_SLOT_PRIMITIVE_ID && 10796 i != VARYING_SLOT_VIEWPORT) 10797 continue; 10798 10799 export_vs_varying(ctx, i, false, NULL); 10800 } 10801} 10802 10803static bool 10804export_fs_mrt_z(isel_context* ctx) 10805{ 10806 Builder bld(ctx->program, ctx->block); 10807 unsigned enabled_channels = 0; 10808 bool compr = false; 10809 Operand values[4]; 10810 10811 for (unsigned i = 0; i < 4; ++i) { 10812 values[i] = Operand(v1); 10813 } 10814 10815 /* Both stencil and sample mask only need 16-bits. */ 10816 if (!ctx->program->info->ps.writes_z && 10817 (ctx->program->info->ps.writes_stencil || ctx->program->info->ps.writes_sample_mask)) { 10818 compr = true; /* COMPR flag */ 10819 10820 if (ctx->program->info->ps.writes_stencil) { 10821 /* Stencil should be in X[23:16]. */ 10822 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); 10823 values[0] = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(16u), values[0]); 10824 enabled_channels |= 0x3; 10825 } 10826 10827 if (ctx->program->info->ps.writes_sample_mask) { 10828 /* SampleMask should be in Y[15:0]. */ 10829 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); 10830 enabled_channels |= 0xc; 10831 } 10832 } else { 10833 if (ctx->program->info->ps.writes_z) { 10834 values[0] = Operand(ctx->outputs.temps[FRAG_RESULT_DEPTH * 4u]); 10835 enabled_channels |= 0x1; 10836 } 10837 10838 if (ctx->program->info->ps.writes_stencil) { 10839 values[1] = Operand(ctx->outputs.temps[FRAG_RESULT_STENCIL * 4u]); 10840 enabled_channels |= 0x2; 10841 } 10842 10843 if (ctx->program->info->ps.writes_sample_mask) { 10844 values[2] = Operand(ctx->outputs.temps[FRAG_RESULT_SAMPLE_MASK * 4u]); 10845 enabled_channels |= 0x4; 10846 } 10847 } 10848 10849 /* GFX6 (except OLAND and HAINAN) has a bug that it only looks at the X 10850 * writemask component. 
10851 */ 10852 if (ctx->options->chip_class == GFX6 && ctx->options->family != CHIP_OLAND && 10853 ctx->options->family != CHIP_HAINAN) { 10854 enabled_channels |= 0x1; 10855 } 10856 10857 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, 10858 V_008DFC_SQ_EXP_MRTZ, compr); 10859 10860 return true; 10861} 10862 10863static bool 10864export_fs_mrt_color(isel_context* ctx, int slot) 10865{ 10866 Builder bld(ctx->program, ctx->block); 10867 unsigned write_mask = ctx->outputs.mask[slot]; 10868 Operand values[4]; 10869 10870 for (unsigned i = 0; i < 4; ++i) { 10871 if (write_mask & (1 << i)) { 10872 values[i] = Operand(ctx->outputs.temps[slot * 4u + i]); 10873 } else { 10874 values[i] = Operand(v1); 10875 } 10876 } 10877 10878 unsigned target, col_format; 10879 unsigned enabled_channels = 0; 10880 aco_opcode compr_op = (aco_opcode)0; 10881 bool compr = false; 10882 10883 slot -= FRAG_RESULT_DATA0; 10884 target = V_008DFC_SQ_EXP_MRT + slot; 10885 col_format = (ctx->options->key.ps.col_format >> (4 * slot)) & 0xf; 10886 10887 bool is_int8 = (ctx->options->key.ps.is_int8 >> slot) & 1; 10888 bool is_int10 = (ctx->options->key.ps.is_int10 >> slot) & 1; 10889 bool is_16bit = values[0].regClass() == v2b; 10890 10891 /* Replace NaN by zero (only 32-bit) to fix game bugs if requested. */ 10892 if (ctx->options->enable_mrt_output_nan_fixup && !is_16bit && 10893 (col_format == V_028714_SPI_SHADER_32_R || col_format == V_028714_SPI_SHADER_32_GR || 10894 col_format == V_028714_SPI_SHADER_32_AR || col_format == V_028714_SPI_SHADER_32_ABGR || 10895 col_format == V_028714_SPI_SHADER_FP16_ABGR)) { 10896 for (int i = 0; i < 4; i++) { 10897 if (!(write_mask & (1 << i))) 10898 continue; 10899 10900 Temp isnan = bld.vopc(aco_opcode::v_cmp_class_f32, bld.hint_vcc(bld.def(bld.lm)), 10901 values[i], bld.copy(bld.def(v1), Operand::c32(3u))); 10902 values[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), values[i], 10903 bld.copy(bld.def(v1), Operand::zero()), isnan); 10904 } 10905 } 10906 10907 switch (col_format) { 10908 case V_028714_SPI_SHADER_32_R: enabled_channels = 1; break; 10909 10910 case V_028714_SPI_SHADER_32_GR: enabled_channels = 0x3; break; 10911 10912 case V_028714_SPI_SHADER_32_AR: 10913 if (ctx->options->chip_class >= GFX10) { 10914 /* Special case: on GFX10, the outputs are different for 32_AR */ 10915 enabled_channels = 0x3; 10916 values[1] = values[3]; 10917 values[3] = Operand(v1); 10918 } else { 10919 enabled_channels = 0x9; 10920 } 10921 break; 10922 10923 case V_028714_SPI_SHADER_FP16_ABGR: 10924 for (int i = 0; i < 2; i++) { 10925 bool enabled = (write_mask >> (i * 2)) & 0x3; 10926 if (enabled) { 10927 enabled_channels |= 0x3 << (i * 2); 10928 if (is_16bit) { 10929 values[i] = 10930 bld.pseudo(aco_opcode::p_create_vector, bld.def(v1), 10931 values[i * 2].isUndefined() ? Operand(v2b) : values[i * 2], 10932 values[i * 2 + 1].isUndefined() ? Operand(v2b) : values[i * 2 + 1]); 10933 } else if (ctx->options->chip_class == GFX8 || ctx->options->chip_class == GFX9) { 10934 values[i] = 10935 bld.vop3(aco_opcode::v_cvt_pkrtz_f16_f32_e64, bld.def(v1), 10936 values[i * 2].isUndefined() ? Operand::zero() : values[i * 2], 10937 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]); 10938 } else { 10939 values[i] = 10940 bld.vop2(aco_opcode::v_cvt_pkrtz_f16_f32, bld.def(v1), 10941 values[i * 2].isUndefined() ? values[i * 2 + 1] : values[i * 2], 10942 values[i * 2 + 1].isUndefined() ? 
values[i * 2] : values[i * 2 + 1]); 10943 } 10944 } else { 10945 values[i] = Operand(v1); 10946 } 10947 } 10948 values[2] = Operand(v1); 10949 values[3] = Operand(v1); 10950 compr = true; 10951 break; 10952 10953 case V_028714_SPI_SHADER_UNORM16_ABGR: 10954 if (is_16bit && ctx->options->chip_class >= GFX9) { 10955 compr_op = aco_opcode::v_cvt_pknorm_u16_f16; 10956 } else { 10957 compr_op = aco_opcode::v_cvt_pknorm_u16_f32; 10958 } 10959 break; 10960 10961 case V_028714_SPI_SHADER_SNORM16_ABGR: 10962 if (is_16bit && ctx->options->chip_class >= GFX9) { 10963 compr_op = aco_opcode::v_cvt_pknorm_i16_f16; 10964 } else { 10965 compr_op = aco_opcode::v_cvt_pknorm_i16_f32; 10966 } 10967 break; 10968 10969 case V_028714_SPI_SHADER_UINT16_ABGR: { 10970 compr_op = aco_opcode::v_cvt_pk_u16_u32; 10971 if (is_int8 || is_int10) { 10972 /* clamp */ 10973 uint32_t max_rgb = is_int8 ? 255 : is_int10 ? 1023 : 0; 10974 Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb)); 10975 10976 for (unsigned i = 0; i < 4; i++) { 10977 if ((write_mask >> i) & 1) { 10978 values[i] = 10979 bld.vop2(aco_opcode::v_min_u32, bld.def(v1), 10980 i == 3 && is_int10 ? Operand::c32(3u) : Operand(max_rgb_val), values[i]); 10981 } 10982 } 10983 } else if (is_16bit) { 10984 for (unsigned i = 0; i < 4; i++) { 10985 if ((write_mask >> i) & 1) { 10986 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, false); 10987 values[i] = Operand(tmp); 10988 } 10989 } 10990 } 10991 break; 10992 } 10993 10994 case V_028714_SPI_SHADER_SINT16_ABGR: 10995 compr_op = aco_opcode::v_cvt_pk_i16_i32; 10996 if (is_int8 || is_int10) { 10997 /* clamp */ 10998 uint32_t max_rgb = is_int8 ? 127 : is_int10 ? 511 : 0; 10999 uint32_t min_rgb = is_int8 ? -128 : is_int10 ? -512 : 0; 11000 Temp max_rgb_val = bld.copy(bld.def(s1), Operand::c32(max_rgb)); 11001 Temp min_rgb_val = bld.copy(bld.def(s1), Operand::c32(min_rgb)); 11002 11003 for (unsigned i = 0; i < 4; i++) { 11004 if ((write_mask >> i) & 1) { 11005 values[i] = 11006 bld.vop2(aco_opcode::v_min_i32, bld.def(v1), 11007 i == 3 && is_int10 ? Operand::c32(1u) : Operand(max_rgb_val), values[i]); 11008 values[i] = bld.vop2(aco_opcode::v_max_i32, bld.def(v1), 11009 i == 3 && is_int10 ? Operand::c32(-2u) : Operand(min_rgb_val), 11010 values[i]); 11011 } 11012 } 11013 } else if (is_16bit) { 11014 for (unsigned i = 0; i < 4; i++) { 11015 if ((write_mask >> i) & 1) { 11016 Temp tmp = convert_int(ctx, bld, values[i].getTemp(), 16, 32, true); 11017 values[i] = Operand(tmp); 11018 } 11019 } 11020 } 11021 break; 11022 11023 case V_028714_SPI_SHADER_32_ABGR: enabled_channels = 0xF; break; 11024 11025 case V_028714_SPI_SHADER_ZERO: 11026 default: return false; 11027 } 11028 11029 if ((bool)compr_op) { 11030 for (int i = 0; i < 2; i++) { 11031 /* check if at least one of the values to be compressed is enabled */ 11032 bool enabled = (write_mask >> (i * 2)) & 0x3; 11033 if (enabled) { 11034 enabled_channels |= 0x3 << (i * 2); 11035 values[i] = bld.vop3( 11036 compr_op, bld.def(v1), values[i * 2].isUndefined() ? Operand::zero() : values[i * 2], 11037 values[i * 2 + 1].isUndefined() ? Operand::zero() : values[i * 2 + 1]); 11038 } else { 11039 values[i] = Operand(v1); 11040 } 11041 } 11042 values[2] = Operand(v1); 11043 values[3] = Operand(v1); 11044 compr = true; 11045 } else if (!compr) { 11046 for (int i = 0; i < 4; i++) 11047 values[i] = enabled_channels & (1 << i) ? 
values[i] : Operand(v1); 11048 } 11049 11050 bld.exp(aco_opcode::exp, values[0], values[1], values[2], values[3], enabled_channels, target, 11051 compr); 11052 return true; 11053} 11054 11055static void 11056create_fs_null_export(isel_context* ctx) 11057{ 11058 /* FS must always have exports. 11059 * So when there are none, we need to add a null export. 11060 */ 11061 11062 Builder bld(ctx->program, ctx->block); 11063 unsigned dest = V_008DFC_SQ_EXP_NULL; 11064 bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 11065 /* enabled_mask */ 0, dest, /* compr */ false, /* done */ true, /* vm */ true); 11066} 11067 11068static void 11069create_fs_exports(isel_context* ctx) 11070{ 11071 bool exported = false; 11072 11073 /* Export depth, stencil and sample mask. */ 11074 if (ctx->outputs.mask[FRAG_RESULT_DEPTH] || ctx->outputs.mask[FRAG_RESULT_STENCIL] || 11075 ctx->outputs.mask[FRAG_RESULT_SAMPLE_MASK]) 11076 exported |= export_fs_mrt_z(ctx); 11077 11078 /* Export all color render targets. */ 11079 for (unsigned i = FRAG_RESULT_DATA0; i < FRAG_RESULT_DATA7 + 1; ++i) 11080 if (ctx->outputs.mask[i]) 11081 exported |= export_fs_mrt_color(ctx, i); 11082 11083 if (!exported) 11084 create_fs_null_export(ctx); 11085 11086 ctx->block->kind |= block_kind_export_end; 11087} 11088 11089static void 11090create_workgroup_barrier(Builder& bld) 11091{ 11092 bld.barrier(aco_opcode::p_barrier, 11093 memory_sync_info(storage_shared, semantic_acqrel, scope_workgroup), scope_workgroup); 11094} 11095 11096static void 11097emit_stream_output(isel_context* ctx, Temp const* so_buffers, Temp const* so_write_offset, 11098 const struct radv_stream_output* output) 11099{ 11100 unsigned num_comps = util_bitcount(output->component_mask); 11101 unsigned writemask = (1 << num_comps) - 1; 11102 unsigned loc = output->location; 11103 unsigned buf = output->buffer; 11104 11105 assert(num_comps && num_comps <= 4); 11106 if (!num_comps || num_comps > 4) 11107 return; 11108 11109 unsigned first_comp = ffs(output->component_mask) - 1; 11110 11111 Temp out[4]; 11112 bool all_undef = true; 11113 assert(ctx->stage.hw == HWStage::VS); 11114 for (unsigned i = 0; i < num_comps; i++) { 11115 out[i] = ctx->outputs.temps[loc * 4 + first_comp + i]; 11116 all_undef = all_undef && !out[i].id(); 11117 } 11118 if (all_undef) 11119 return; 11120 11121 while (writemask) { 11122 int start, count; 11123 u_bit_scan_consecutive_range(&writemask, &start, &count); 11124 if (count == 3 && ctx->options->chip_class == GFX6) { 11125 /* GFX6 doesn't support storing vec3, split it. */ 11126 writemask |= 1u << (start + 2); 11127 count = 2; 11128 } 11129 11130 unsigned offset = output->offset + start * 4; 11131 11132 Temp write_data = ctx->program->allocateTmp(RegClass(RegType::vgpr, count)); 11133 aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( 11134 aco_opcode::p_create_vector, Format::PSEUDO, count, 1)}; 11135 for (int i = 0; i < count; ++i) 11136 vec->operands[i] = 11137 (ctx->outputs.mask[loc] & 1 << (start + first_comp + i)) ? 
Operand(out[start + i]) : Operand::zero(); 11138 vec->definitions[0] = Definition(write_data); 11139 ctx->block->instructions.emplace_back(std::move(vec)); 11140 11141 aco_opcode opcode; 11142 switch (count) { 11143 case 1: opcode = aco_opcode::buffer_store_dword; break; 11144 case 2: opcode = aco_opcode::buffer_store_dwordx2; break; 11145 case 3: opcode = aco_opcode::buffer_store_dwordx3; break; 11146 case 4: opcode = aco_opcode::buffer_store_dwordx4; break; 11147 default: unreachable("Unsupported dword count."); 11148 } 11149 11150 aco_ptr<MUBUF_instruction> store{ 11151 create_instruction<MUBUF_instruction>(opcode, Format::MUBUF, 4, 0)}; 11152 store->operands[0] = Operand(so_buffers[buf]); 11153 store->operands[1] = Operand(so_write_offset[buf]); 11154 store->operands[2] = Operand::c32(0); 11155 store->operands[3] = Operand(write_data); 11156 if (offset > 4095) { 11157 /* Don't think this can happen in RADV, but maybe GL? It's easy to do this anyway. */ 11158 Builder bld(ctx->program, ctx->block); 11159 store->operands[0] = 11160 bld.vadd32(bld.def(v1), Operand::c32(offset), Operand(so_write_offset[buf])); 11161 } else { 11162 store->offset = offset; 11163 } 11164 store->offen = true; 11165 store->glc = true; 11166 store->dlc = false; 11167 store->slc = true; 11168 ctx->block->instructions.emplace_back(std::move(store)); 11169 } 11170} 11171 11172static void 11173emit_streamout(isel_context* ctx, unsigned stream) 11174{ 11175 Builder bld(ctx->program, ctx->block); 11176 11177 Temp so_vtx_count = 11178 bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 11179 get_arg(ctx, ctx->args->ac.streamout_config), Operand::c32(0x70010u)); 11180 11181 Temp tid = emit_mbcnt(ctx, bld.tmp(v1)); 11182 11183 Temp can_emit = bld.vopc(aco_opcode::v_cmp_gt_i32, bld.def(bld.lm), so_vtx_count, tid); 11184 11185 if_context ic; 11186 begin_divergent_if_then(ctx, &ic, can_emit); 11187 11188 bld.reset(ctx->block); 11189 11190 Temp so_write_index = 11191 bld.vadd32(bld.def(v1), get_arg(ctx, ctx->args->ac.streamout_write_index), tid); 11192 11193 Temp so_buffers[4]; 11194 Temp so_write_offset[4]; 11195 Temp buf_ptr = convert_pointer_to_64_bit(ctx, get_arg(ctx, ctx->args->streamout_buffers)); 11196 11197 for (unsigned i = 0; i < 4; i++) { 11198 unsigned stride = ctx->program->info->so.strides[i]; 11199 if (!stride) 11200 continue; 11201 11202 so_buffers[i] = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4), buf_ptr, 11203 bld.copy(bld.def(s1), Operand::c32(i * 16u))); 11204 11205 if (stride == 1) { 11206 Temp offset = bld.sop2(aco_opcode::s_add_i32, bld.def(s1), bld.def(s1, scc), 11207 get_arg(ctx, ctx->args->ac.streamout_write_index), 11208 get_arg(ctx, ctx->args->ac.streamout_offset[i])); 11209 Temp new_offset = bld.vadd32(bld.def(v1), offset, tid); 11210 11211 so_write_offset[i] = 11212 bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u), new_offset); 11213 } else { 11214 Temp offset = bld.v_mul_imm(bld.def(v1), so_write_index, stride * 4u); 11215 Temp offset2 = bld.sop2(aco_opcode::s_mul_i32, bld.def(s1), Operand::c32(4u), 11216 get_arg(ctx, ctx->args->ac.streamout_offset[i])); 11217 so_write_offset[i] = bld.vadd32(bld.def(v1), offset, offset2); 11218 } 11219 } 11220 11221 for (unsigned i = 0; i < ctx->program->info->so.num_outputs; i++) { 11222 const struct radv_stream_output* output = &ctx->program->info->so.outputs[i]; 11223 if (stream != output->stream) 11224 continue; 11225 11226 emit_stream_output(ctx, so_buffers, so_write_offset, output); 11227 } 11228 11229 
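   /* This closes the divergent "can_emit" branch opened above with an empty
    * else-side: lanes whose index is not below so_vtx_count simply skip the
    * streamout stores. */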
begin_divergent_if_else(ctx, &ic); 11230 end_divergent_if(ctx, &ic); 11231} 11232 11233Pseudo_instruction* 11234add_startpgm(struct isel_context* ctx) 11235{ 11236 aco_ptr<Pseudo_instruction> startpgm{ 11237 create_instruction<Pseudo_instruction>(aco_opcode::p_startpgm, Format::PSEUDO, 0, ctx->args->ac.arg_count)}; 11238 for (unsigned i = 0, arg = 0; i < ctx->args->ac.arg_count; i++) { 11239 if (ctx->args->ac.args[i].skip) 11240 continue; 11241 11242 enum ac_arg_regfile file = ctx->args->ac.args[i].file; 11243 unsigned size = ctx->args->ac.args[i].size; 11244 unsigned reg = ctx->args->ac.args[i].offset; 11245 RegClass type = RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size); 11246 Temp dst = ctx->program->allocateTmp(type); 11247 ctx->arg_temps[i] = dst; 11248 startpgm->definitions[arg] = Definition(dst); 11249 startpgm->definitions[arg].setFixed(PhysReg{file == AC_ARG_SGPR ? reg : reg + 256}); 11250 arg++; 11251 } 11252 Pseudo_instruction* instr = startpgm.get(); 11253 ctx->block->instructions.push_back(std::move(startpgm)); 11254 11255 /* Stash these in the program so that they can be accessed later when 11256 * handling spilling. 11257 */ 11258 ctx->program->private_segment_buffer = get_arg(ctx, ctx->args->ring_offsets); 11259 ctx->program->scratch_offset = get_arg(ctx, ctx->args->ac.scratch_offset); 11260 11261 if (ctx->stage.has(SWStage::VS) && ctx->program->info->vs.dynamic_inputs) { 11262 unsigned num_attributes = util_last_bit(ctx->program->info->vs.vb_desc_usage_mask); 11263 for (unsigned i = 0; i < num_attributes; i++) { 11264 Definition def(get_arg(ctx, ctx->args->vs_inputs[i])); 11265 11266 unsigned idx = ctx->args->vs_inputs[i].arg_index; 11267 def.setFixed(PhysReg(256 + ctx->args->ac.args[idx].offset)); 11268 11269 ctx->program->vs_inputs.push_back(def); 11270 } 11271 } 11272 11273 return instr; 11274} 11275 11276void 11277fix_ls_vgpr_init_bug(isel_context* ctx, Pseudo_instruction* startpgm) 11278{ 11279 assert(ctx->shader->info.stage == MESA_SHADER_VERTEX); 11280 Builder bld(ctx->program, ctx->block); 11281 constexpr unsigned hs_idx = 1u; 11282 Builder::Result hs_thread_count = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc), 11283 get_arg(ctx, ctx->args->ac.merged_wave_info), 11284 Operand::c32((8u << 16) | (hs_idx * 8u))); 11285 Temp ls_has_nonzero_hs_threads = bool_to_vector_condition(ctx, hs_thread_count.def(1).getTemp()); 11286 11287 /* If there are no HS threads, SPI mistakenly loads the LS VGPRs starting at VGPR 0. 
*/ 11288 11289 Temp instance_id = 11290 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.vertex_id), 11291 get_arg(ctx, ctx->args->ac.instance_id), ls_has_nonzero_hs_threads); 11292 Temp vs_rel_patch_id = 11293 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_rel_ids), 11294 get_arg(ctx, ctx->args->ac.vs_rel_patch_id), ls_has_nonzero_hs_threads); 11295 Temp vertex_id = 11296 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), get_arg(ctx, ctx->args->ac.tcs_patch_id), 11297 get_arg(ctx, ctx->args->ac.vertex_id), ls_has_nonzero_hs_threads); 11298 11299 ctx->arg_temps[ctx->args->ac.instance_id.arg_index] = instance_id; 11300 ctx->arg_temps[ctx->args->ac.vs_rel_patch_id.arg_index] = vs_rel_patch_id; 11301 ctx->arg_temps[ctx->args->ac.vertex_id.arg_index] = vertex_id; 11302} 11303 11304void 11305split_arguments(isel_context* ctx, Pseudo_instruction* startpgm) 11306{ 11307 /* Split all arguments except for the first (ring_offsets) and the last 11308 * (exec) so that the dead channels don't stay live throughout the program. 11309 */ 11310 for (int i = 1; i < startpgm->definitions.size(); i++) { 11311 if (startpgm->definitions[i].regClass().size() > 1) { 11312 emit_split_vector(ctx, startpgm->definitions[i].getTemp(), 11313 startpgm->definitions[i].regClass().size()); 11314 } 11315 } 11316} 11317 11318void 11319handle_bc_optimize(isel_context* ctx) 11320{ 11321 /* needed when SPI_PS_IN_CONTROL.BC_OPTIMIZE_DISABLE is set to 0 */ 11322 Builder bld(ctx->program, ctx->block); 11323 uint32_t spi_ps_input_ena = ctx->program->config->spi_ps_input_ena; 11324 bool uses_center = 11325 G_0286CC_PERSP_CENTER_ENA(spi_ps_input_ena) || G_0286CC_LINEAR_CENTER_ENA(spi_ps_input_ena); 11326 bool uses_persp_centroid = G_0286CC_PERSP_CENTROID_ENA(spi_ps_input_ena); 11327 bool uses_linear_centroid = G_0286CC_LINEAR_CENTROID_ENA(spi_ps_input_ena); 11328 11329 if (uses_persp_centroid) 11330 ctx->persp_centroid = get_arg(ctx, ctx->args->ac.persp_centroid); 11331 if (uses_linear_centroid) 11332 ctx->linear_centroid = get_arg(ctx, ctx->args->ac.linear_centroid); 11333 11334 if (uses_center && (uses_persp_centroid || uses_linear_centroid)) { 11335 Temp sel = bld.vopc_e64(aco_opcode::v_cmp_lt_i32, bld.hint_vcc(bld.def(bld.lm)), 11336 get_arg(ctx, ctx->args->ac.prim_mask), Operand::zero()); 11337 11338 if (uses_persp_centroid) { 11339 Temp new_coord[2]; 11340 for (unsigned i = 0; i < 2; i++) { 11341 Temp persp_centroid = 11342 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_centroid), i, v1); 11343 Temp persp_center = 11344 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.persp_center), i, v1); 11345 new_coord[i] = 11346 bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), persp_centroid, persp_center, sel); 11347 } 11348 ctx->persp_centroid = bld.tmp(v2); 11349 bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->persp_centroid), 11350 Operand(new_coord[0]), Operand(new_coord[1])); 11351 emit_split_vector(ctx, ctx->persp_centroid, 2); 11352 } 11353 11354 if (uses_linear_centroid) { 11355 Temp new_coord[2]; 11356 for (unsigned i = 0; i < 2; i++) { 11357 Temp linear_centroid = 11358 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_centroid), i, v1); 11359 Temp linear_center = 11360 emit_extract_vector(ctx, get_arg(ctx, ctx->args->ac.linear_center), i, v1); 11361 new_coord[i] = bld.vop2(aco_opcode::v_cndmask_b32, bld.def(v1), linear_centroid, 11362 linear_center, sel); 11363 } 11364 ctx->linear_centroid = bld.tmp(v2); 11365 
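         /* The two selected components are packed into a v2 temporary below
          * and split again right after, mirroring the persp_centroid handling
          * above, so later users can read ctx->linear_centroid per component. */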
bld.pseudo(aco_opcode::p_create_vector, Definition(ctx->linear_centroid), 11366 Operand(new_coord[0]), Operand(new_coord[1])); 11367 emit_split_vector(ctx, ctx->linear_centroid, 2); 11368 } 11369 } 11370} 11371 11372void 11373setup_fp_mode(isel_context* ctx, nir_shader* shader) 11374{ 11375 Program* program = ctx->program; 11376 11377 unsigned float_controls = shader->info.float_controls_execution_mode; 11378 11379 program->next_fp_mode.preserve_signed_zero_inf_nan32 = 11380 float_controls & FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP32; 11381 program->next_fp_mode.preserve_signed_zero_inf_nan16_64 = 11382 float_controls & (FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP16 | 11383 FLOAT_CONTROLS_SIGNED_ZERO_INF_NAN_PRESERVE_FP64); 11384 11385 program->next_fp_mode.must_flush_denorms32 = 11386 float_controls & FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP32; 11387 program->next_fp_mode.must_flush_denorms16_64 = 11388 float_controls & 11389 (FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP16 | FLOAT_CONTROLS_DENORM_FLUSH_TO_ZERO_FP64); 11390 11391 program->next_fp_mode.care_about_round32 = 11392 float_controls & 11393 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32); 11394 11395 program->next_fp_mode.care_about_round16_64 = 11396 float_controls & 11397 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64 | 11398 FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64); 11399 11400 /* default to preserving fp16 and fp64 denorms, since it's free for fp64 and 11401 * the precision seems needed for Wolfenstein: Youngblood to render correctly */ 11402 if (program->next_fp_mode.must_flush_denorms16_64) 11403 program->next_fp_mode.denorm16_64 = 0; 11404 else 11405 program->next_fp_mode.denorm16_64 = fp_denorm_keep; 11406 11407 /* preserving fp32 denorms is expensive, so only do it if asked */ 11408 if (float_controls & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) 11409 program->next_fp_mode.denorm32 = fp_denorm_keep; 11410 else 11411 program->next_fp_mode.denorm32 = 0; 11412 11413 if (float_controls & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) 11414 program->next_fp_mode.round32 = fp_round_tz; 11415 else 11416 program->next_fp_mode.round32 = fp_round_ne; 11417 11418 if (float_controls & 11419 (FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 | FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64)) 11420 program->next_fp_mode.round16_64 = fp_round_tz; 11421 else 11422 program->next_fp_mode.round16_64 = fp_round_ne; 11423 11424 ctx->block->fp_mode = program->next_fp_mode; 11425} 11426 11427void 11428cleanup_cfg(Program* program) 11429{ 11430 /* create linear_succs/logical_succs */ 11431 for (Block& BB : program->blocks) { 11432 for (unsigned idx : BB.linear_preds) 11433 program->blocks[idx].linear_succs.emplace_back(BB.index); 11434 for (unsigned idx : BB.logical_preds) 11435 program->blocks[idx].logical_succs.emplace_back(BB.index); 11436 } 11437} 11438 11439Temp 11440lanecount_to_mask(isel_context* ctx, Temp count, bool allow64 = true) 11441{ 11442 assert(count.regClass() == s1); 11443 11444 Builder bld(ctx->program, ctx->block); 11445 Temp mask = bld.sop2(aco_opcode::s_bfm_b64, bld.def(s2), count, Operand::zero()); 11446 Temp cond; 11447 11448 if (ctx->program->wave_size == 64) { 11449 /* If we know that all 64 threads can't be active at a time, we just use the mask as-is */ 11450 if (!allow64) 11451 return mask; 11452 11453 /* Special case for 64 active invocations, because 64 doesn't work with s_bfm */ 11454 Temp active_64 = bld.sopc(aco_opcode::s_bitcmp1_b32, 
bld.def(s1, scc), count,
                                 Operand::c32(6u /* log2(64) */));
      cond =
         bld.sop2(Builder::s_cselect, bld.def(bld.lm), Operand::c32(-1u), mask, bld.scc(active_64));
   } else {
      /* We use s_bfm_b64 (not _b32) which works with 32, but we need to extract the lower half of
       * the register */
      cond = emit_extract_vector(ctx, mask, 0, bld.lm);
   }

   return cond;
}

Temp
merged_wave_info_to_mask(isel_context* ctx, unsigned i)
{
   Builder bld(ctx->program, ctx->block);

   /* lanecount_to_mask() only cares about s0.u[6:0] so we don't need either s_bfe nor s_and here */
   Temp count = i == 0
                   ? get_arg(ctx, ctx->args->ac.merged_wave_info)
                   : bld.sop2(aco_opcode::s_lshr_b32, bld.def(s1), bld.def(s1, scc),
                              get_arg(ctx, ctx->args->ac.merged_wave_info), Operand::c32(i * 8u));

   return lanecount_to_mask(ctx, count);
}

void
ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
{
   assert(vtx_cnt.id() && prm_cnt.id());

   Builder bld(ctx->program, ctx->block);
   Temp prm_cnt_0;

   if (ctx->program->chip_class == GFX10 &&
       (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
      /* Navi 1x workaround: check whether the workgroup has no output.
       * If so, change the number of exported vertices and primitives to 1.
       */
      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
                         bld.scc(prm_cnt_0));
      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
                         bld.scc(prm_cnt_0));
   }

   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
   Temp tmp =
      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);

   /* Request the SPI to allocate space for the primitives and vertices
    * that will be exported by the threadgroup.
    */
   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);

   if (prm_cnt_0.id()) {
      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
       * It can't have all-zero positions because that would render an undesired pixel with
       * conservative rasterization.
       */
      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));

      if_context ic_prim_0;
      begin_divergent_if_then(ctx, &ic_prim_0, cond);
      bld.reset(ctx->block);
      ctx->block->kind |= block_kind_export_end;

      /* Use zero: means that it's a triangle whose every vertex index is 0. */
      Temp zero = bld.copy(bld.def(v1), Operand::zero());
      /* Use NaN for the coordinates, so that the rasterizer always culls it.
void
ngg_emit_sendmsg_gs_alloc_req(isel_context* ctx, Temp vtx_cnt, Temp prm_cnt)
{
   assert(vtx_cnt.id() && prm_cnt.id());

   Builder bld(ctx->program, ctx->block);
   Temp prm_cnt_0;

   if (ctx->program->chip_class == GFX10 &&
       (ctx->stage.has(SWStage::GS) || ctx->program->info->has_ngg_culling)) {
      /* Navi 1x workaround: check whether the workgroup has no output.
       * If so, change the number of exported vertices and primitives to 1.
       */
      prm_cnt_0 = bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), prm_cnt, Operand::zero());
      prm_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), prm_cnt,
                         bld.scc(prm_cnt_0));
      vtx_cnt = bld.sop2(aco_opcode::s_cselect_b32, bld.def(s1), Operand::c32(1u), vtx_cnt,
                         bld.scc(prm_cnt_0));
   }

   /* Put the number of vertices and primitives into m0 for the GS_ALLOC_REQ */
   Temp tmp =
      bld.sop2(aco_opcode::s_lshl_b32, bld.def(s1), bld.def(s1, scc), prm_cnt, Operand::c32(12u));
   tmp = bld.sop2(aco_opcode::s_or_b32, bld.m0(bld.def(s1)), bld.def(s1, scc), tmp, vtx_cnt);

   /* Request the SPI to allocate space for the primitives and vertices
    * that will be exported by the threadgroup.
    */
   bld.sopp(aco_opcode::s_sendmsg, bld.m0(tmp), -1, sendmsg_gs_alloc_req);

   if (prm_cnt_0.id()) {
      /* Navi 1x workaround: export a triangle with NaN coordinates when NGG has no output.
       * It can't have all-zero positions because that would render an undesired pixel with
       * conservative rasterization.
       */
      Temp first_lane = bld.sop1(Builder::s_ff1_i32, bld.def(s1), Operand(exec, bld.lm));
      Temp cond = bld.sop2(Builder::s_lshl, bld.def(bld.lm), bld.def(s1, scc),
                           Operand::c32_or_c64(1u, ctx->program->wave_size == 64), first_lane);
      cond = bld.sop2(Builder::s_cselect, bld.def(bld.lm), cond,
                      Operand::zero(ctx->program->wave_size == 64 ? 8 : 4), bld.scc(prm_cnt_0));

      if_context ic_prim_0;
      begin_divergent_if_then(ctx, &ic_prim_0, cond);
      bld.reset(ctx->block);
      ctx->block->kind |= block_kind_export_end;

      /* Use zero: it means a triangle whose every vertex index is 0. */
      Temp zero = bld.copy(bld.def(v1), Operand::zero());
      /* Use NaN for the coordinates, so that the rasterizer always culls it. */
      Temp nan_coord = bld.copy(bld.def(v1), Operand::c32(-1u));

      bld.exp(aco_opcode::exp, zero, Operand(v1), Operand(v1), Operand(v1), 1 /* enabled mask */,
              V_008DFC_SQ_EXP_PRIM /* dest */, false /* compressed */, true /* done */,
              false /* valid mask */);
      bld.exp(aco_opcode::exp, nan_coord, nan_coord, nan_coord, nan_coord, 0xf /* enabled mask */,
              V_008DFC_SQ_EXP_POS /* dest */, false /* compressed */, true /* done */,
              true /* valid mask */);

      begin_divergent_if_else(ctx, &ic_prim_0);
      end_divergent_if(ctx, &ic_prim_0);
      bld.reset(ctx->block);
   }
}

} /* end namespace */

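/* Instruction selection entry point for regular (possibly merged) shaders.
 * The NIR shaders are selected back to back into one Program; for merged
 * stages, a stage is wrapped in a divergent if on the lane count taken from
 * merged_wave_info, so only the lanes that actually have work for that stage
 * execute it. */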
void
select_program(Program* program, unsigned shader_count, struct nir_shader* const* shaders,
               ac_shader_config* config, const struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, shader_count, shaders, config, args, false);
   if_context ic_merged_wave_info;
   bool ngg_gs = ctx.stage.hw == HWStage::NGG && ctx.stage.has(SWStage::GS);

   for (unsigned i = 0; i < shader_count; i++) {
      nir_shader* nir = shaders[i];
      init_context(&ctx, nir);

      setup_fp_mode(&ctx, nir);

      if (!i) {
         /* needs to be after init_context() for FS */
         Pseudo_instruction* startpgm = add_startpgm(&ctx);
         append_logical_start(ctx.block);

         if (unlikely(args->options->has_ls_vgpr_init_bug && ctx.stage == vertex_tess_control_hs))
            fix_ls_vgpr_init_bug(&ctx, startpgm);

         split_arguments(&ctx, startpgm);

         if (!args->shader_info->vs.has_prolog &&
             (program->stage.has(SWStage::VS) || program->stage.has(SWStage::TES))) {
            Builder(ctx.program, ctx.block).sopp(aco_opcode::s_setprio, -1u, 0x3u);
         }
      }

      /* In a merged VS+TCS HS, the VS implementation can be completely empty. */
      nir_function_impl* func = nir_shader_get_entrypoint(nir);
      bool empty_shader =
         nir_cf_list_is_empty_block(&func->body) &&
         ((nir->info.stage == MESA_SHADER_VERTEX &&
           (ctx.stage == vertex_tess_control_hs || ctx.stage == vertex_geometry_gs)) ||
          (nir->info.stage == MESA_SHADER_TESS_EVAL && ctx.stage == tess_eval_geometry_gs));

      bool check_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 0 : (shader_count >= 2 && !empty_shader && !(ngg_gs && i == 1));
      bool endif_merged_wave_info =
         ctx.tcs_in_out_eq ? i == 1 : (check_merged_wave_info && !(ngg_gs && i == 1));

      if (program->chip_class == GFX10 && program->stage.hw == HWStage::NGG &&
          program->stage.num_sw_stages() == 1) {
         /* Workaround for Navi1x HW bug to ensure that all NGG waves launch before
          * s_sendmsg(GS_ALLOC_REQ). */
         Builder(ctx.program, ctx.block).sopp(aco_opcode::s_barrier, -1u, 0u);
      }

      if (check_merged_wave_info) {
         Temp cond = merged_wave_info_to_mask(&ctx, i);
         begin_divergent_if_then(&ctx, &ic_merged_wave_info, cond);
      }

      if (i) {
         Builder bld(ctx.program, ctx.block);

         /* Skip s_barrier from TCS when VS outputs are not stored in the LDS. */
         bool tcs_skip_barrier = ctx.stage == vertex_tess_control_hs &&
                                 ctx.tcs_temp_only_inputs == nir->info.inputs_read;

         if (!ngg_gs && !tcs_skip_barrier)
            create_workgroup_barrier(bld);

         if (ctx.stage == vertex_geometry_gs || ctx.stage == tess_eval_geometry_gs) {
            ctx.gs_wave_id = bld.pseudo(aco_opcode::p_extract, bld.def(s1, m0), bld.def(s1, scc),
                                        get_arg(&ctx, args->ac.merged_wave_info), Operand::c32(2u),
                                        Operand::c32(8u), Operand::zero());
         }
      } else if (ctx.stage == geometry_gs)
         ctx.gs_wave_id = get_arg(&ctx, args->ac.gs_wave_id);

      if (ctx.stage == fragment_fs)
         handle_bc_optimize(&ctx);

      visit_cf_list(&ctx, &func->body);

      if (ctx.program->info->so.num_outputs && ctx.stage.hw == HWStage::VS)
         emit_streamout(&ctx, 0);

      if (ctx.stage.hw == HWStage::VS) {
         create_vs_exports(&ctx);
      } else if (nir->info.stage == MESA_SHADER_GEOMETRY && !ngg_gs) {
         Builder bld(ctx.program, ctx.block);
         bld.barrier(aco_opcode::p_barrier,
                     memory_sync_info(storage_vmem_output, semantic_release, scope_device));
         bld.sopp(aco_opcode::s_sendmsg, bld.m0(ctx.gs_wave_id), -1,
                  sendmsg_gs_done(false, false, 0));
      }

      if (ctx.stage == fragment_fs) {
         create_fs_exports(&ctx);
      }

      if (endif_merged_wave_info) {
         begin_divergent_if_else(&ctx, &ic_merged_wave_info);
         end_divergent_if(&ctx, &ic_merged_wave_info);
      }

      if (i == 0 && ctx.stage == vertex_tess_control_hs && ctx.tcs_in_out_eq) {
         /* Outputs of the previous stage are inputs to the next stage */
         ctx.inputs = ctx.outputs;
         ctx.outputs = shader_io_state();
      }

      cleanup_context(&ctx);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   Builder bld(ctx.program, ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

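/* Selects the GS copy shader, which runs on the hardware VS stage: it reads
 * the vertex data that the geometry shader stored to the GSVS ring and
 * re-emits it as position/parameter exports (plus streamout if enabled),
 * with one uniform branch per vertex stream. */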
void
select_gs_copy_shader(Program* program, struct nir_shader* gs_shader, ac_shader_config* config,
                      const struct radv_shader_args* args)
{
   isel_context ctx = setup_isel_context(program, 1, &gs_shader, config, args, true);

   ctx.block->fp_mode = program->next_fp_mode;

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   Temp gsvs_ring = bld.smem(aco_opcode::s_load_dwordx4, bld.def(s4),
                             program->private_segment_buffer, Operand::c32(RING_GSVS_VS * 16u));

   Operand stream_id = Operand::zero();
   if (args->shader_info->so.num_outputs)
      stream_id = bld.sop2(aco_opcode::s_bfe_u32, bld.def(s1), bld.def(s1, scc),
                           get_arg(&ctx, ctx.args->ac.streamout_config), Operand::c32(0x20018u));

   Temp vtx_offset = bld.vop2(aco_opcode::v_lshlrev_b32, bld.def(v1), Operand::c32(2u),
                              get_arg(&ctx, ctx.args->ac.vertex_id));

   std::stack<if_context, std::vector<if_context>> if_contexts;

   for (unsigned stream = 0; stream < 4; stream++) {
      if (stream_id.isConstant() && stream != stream_id.constantValue())
         continue;

      unsigned num_components = args->shader_info->gs.num_stream_output_components[stream];
      if (stream > 0 && (!num_components || !args->shader_info->so.num_outputs))
         continue;

      memset(ctx.outputs.mask, 0, sizeof(ctx.outputs.mask));

      if (!stream_id.isConstant()) {
         Temp cond =
            bld.sopc(aco_opcode::s_cmp_eq_u32, bld.def(s1, scc), stream_id, Operand::c32(stream));
         if_contexts.emplace();
         begin_uniform_if_then(&ctx, &if_contexts.top(), cond);
         bld.reset(ctx.block);
      }

      unsigned offset = 0;
      for (unsigned i = 0; i <= VARYING_SLOT_VAR31; ++i) {
         if (args->shader_info->gs.output_streams[i] != stream)
            continue;

         unsigned output_usage_mask = args->shader_info->gs.output_usage_mask[i];
         unsigned length = util_last_bit(output_usage_mask);
         for (unsigned j = 0; j < length; ++j) {
            if (!(output_usage_mask & (1 << j)))
               continue;

            Temp val = bld.tmp(v1);
            unsigned const_offset = offset * args->shader_info->gs.vertices_out * 16 * 4;
            load_vmem_mubuf(&ctx, val, gsvs_ring, vtx_offset, Temp(), const_offset, 4, 1, 0u, true,
                            true, true);

            ctx.outputs.mask[i] |= 1 << j;
            ctx.outputs.temps[i * 4u + j] = val;

            offset++;
         }
      }

      if (args->shader_info->so.num_outputs) {
         emit_streamout(&ctx, stream);
         bld.reset(ctx.block);
      }

      if (stream == 0) {
         create_vs_exports(&ctx);
      }

      if (!stream_id.isConstant()) {
         begin_uniform_if_else(&ctx, &if_contexts.top());
         bld.reset(ctx.block);
      }
   }

   while (!if_contexts.empty()) {
      end_uniform_if(&ctx, &if_contexts.top());
      if_contexts.pop();
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.reset(ctx.block);
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

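/* Builds a small trap handler (GFX8 only): it loads a buffer descriptor from
 * the TMA register and stores TTMP0-TTMP1 plus a few status/trap hardware
 * registers into that buffer before ending the program. */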
void
select_trap_handler_shader(Program* program, struct nir_shader* shader, ac_shader_config* config,
                           const struct radv_shader_args* args)
{
   assert(args->options->chip_class == GFX8);

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   isel_context ctx = {};
   ctx.program = program;
   ctx.args = args;
   ctx.options = args->options;
   ctx.stage = program->stage;

   ctx.block = ctx.program->create_and_insert_block();
   ctx.block->kind = block_kind_top_level;

   program->workgroup_size = 1; /* XXX */

   add_startpgm(&ctx);
   append_logical_start(ctx.block);

   Builder bld(ctx.program, ctx.block);

   /* Load the buffer descriptor from TMA. */
   bld.smem(aco_opcode::s_load_dwordx4, Definition(PhysReg{ttmp4}, s4), Operand(PhysReg{tma}, s2),
            Operand::zero());

   /* Store TTMP0-TTMP1. */
   bld.smem(aco_opcode::s_buffer_store_dwordx2, Operand(PhysReg{ttmp4}, s4), Operand::zero(),
            Operand(PhysReg{ttmp0}, s2), memory_sync_info(), true);

   uint32_t hw_regs_idx[] = {
      2, /* HW_REG_STATUS */
      3, /* HW_REG_TRAP_STS */
      4, /* HW_REG_HW_ID */
      7, /* HW_REG_IB_STS */
   };

   /* Store some hardware registers. */
   for (unsigned i = 0; i < ARRAY_SIZE(hw_regs_idx); i++) {
      /* "((size - 1) << 11) | register" */
      bld.sopk(aco_opcode::s_getreg_b32, Definition(PhysReg{ttmp8}, s1),
               ((20 - 1) << 11) | hw_regs_idx[i]);

      bld.smem(aco_opcode::s_buffer_store_dword, Operand(PhysReg{ttmp4}, s4),
               Operand::c32(8u + i * 4), Operand(PhysReg{ttmp8}, s1), memory_sync_info(), true);
   }

   program->config->float_mode = program->blocks[0].fp_mode.val;

   append_logical_end(ctx.block);
   ctx.block->kind |= block_kind_uniform;
   bld.sopp(aco_opcode::s_endpgm);

   cleanup_cfg(program);
}

Operand
get_arg_fixed(const struct radv_shader_args* args, struct ac_arg arg)
{
   assert(arg.used);

   enum ac_arg_regfile file = args->ac.args[arg.arg_index].file;
   unsigned size = args->ac.args[arg.arg_index].size;
   unsigned reg = args->ac.args[arg.arg_index].offset;

   return Operand(PhysReg(file == AC_ARG_SGPR ? reg : reg + 256),
                  RegClass(file == AC_ARG_SGPR ? RegType::sgpr : RegType::vgpr, size));
}

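/* Loads up to `max` vertex-buffer descriptors (4 dwords each) into
 * consecutive SGPRs starting at `dest`, using the widest s_load_dwordx* that
 * fits and grouping the loads into an s_clause on GFX10+. Returns how many
 * descriptors were actually loaded, limited by the available SGPRs. */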
unsigned
load_vb_descs(Builder& bld, PhysReg dest, Operand base, unsigned start, unsigned max)
{
   unsigned count = MIN2((bld.program->dev.sgpr_limit - dest.reg()) / 4u, max);

   unsigned num_loads = (count / 4u) + util_bitcount(count & 0x3);
   if (bld.program->chip_class >= GFX10 && num_loads > 1)
      bld.sopp(aco_opcode::s_clause, -1, num_loads - 1);

   for (unsigned i = 0; i < count;) {
      unsigned size = 1u << util_logbase2(MIN2(count - i, 4));

      if (size == 4)
         bld.smem(aco_opcode::s_load_dwordx16, Definition(dest, s16), base,
                  Operand::c32((start + i) * 16u));
      else if (size == 2)
         bld.smem(aco_opcode::s_load_dwordx8, Definition(dest, s8), base,
                  Operand::c32((start + i) * 16u));
      else
         bld.smem(aco_opcode::s_load_dwordx4, Definition(dest, s4), base,
                  Operand::c32((start + i) * 16u));

      dest = dest.advance(size * 16u);
      i += size;
   }

   return count;
}

Operand
calc_nontrivial_instance_id(Builder& bld, const struct radv_shader_args* args, unsigned index,
                            Operand instance_id, Operand start_instance, PhysReg tmp_sgpr,
                            PhysReg tmp_vgpr0, PhysReg tmp_vgpr1)
{
   bld.smem(aco_opcode::s_load_dwordx2, Definition(tmp_sgpr, s2),
            get_arg_fixed(args, args->prolog_inputs), Operand::c32(8u + index * 8u));

   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;
   bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(bld.program->chip_class));

   Definition fetch_index_def(tmp_vgpr0, v1);
   Operand fetch_index(tmp_vgpr0, v1);

   Operand div_info(tmp_sgpr, s1);
   if (bld.program->chip_class >= GFX8) {
      /* use SDWA */
      if (bld.program->chip_class < GFX9) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(tmp_vgpr1, v1), div_info);
         div_info = Operand(tmp_vgpr1, v1);
      }

      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id).instr;

      Instruction* instr;
      if (bld.program->chip_class >= GFX9)
         instr = bld.vop2_sdwa(aco_opcode::v_add_u32, fetch_index_def, div_info, fetch_index).instr;
      else
         instr = bld.vop2_sdwa(aco_opcode::v_add_co_u32, fetch_index_def, Definition(vcc, bld.lm),
                               div_info, fetch_index)
                    .instr;
      instr->sdwa().sel[0] = SubdwordSel::ubyte1;

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, Operand(tmp_sgpr.advance(4), s1),
               fetch_index);

      instr =
         bld.vop2_sdwa(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, fetch_index).instr;
      instr->sdwa().sel[0] = SubdwordSel::ubyte2;
   } else {
      Operand tmp_op(tmp_vgpr1, v1);
      Definition tmp_def(tmp_vgpr1, v1);

      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, div_info, instance_id);

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(8u), Operand::c32(8u));
      bld.vadd32(fetch_index_def, tmp_op, fetch_index, false, Operand(s2), true);

      bld.vop3(aco_opcode::v_mul_hi_u32, fetch_index_def, fetch_index,
               Operand(tmp_sgpr.advance(4), s1));

      bld.vop3(aco_opcode::v_bfe_u32, tmp_def, div_info, Operand::c32(16u), Operand::c32(8u));
      bld.vop2(aco_opcode::v_lshrrev_b32, fetch_index_def, tmp_op, fetch_index);
   }

   bld.vadd32(fetch_index_def, start_instance, fetch_index, false, Operand(s2), true);

   return fetch_index;
}

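/* Builds the vertex shader prolog: it loads the vertex buffer descriptors,
 * computes a vertex or instance index per attribute (including non-trivial
 * instance divisors), loads every attribute into the VGPRs that follow the
 * regular input VGPRs, applies the 2_10_10_10 alpha workaround where needed,
 * and finally jumps to the main shader with s_setpc_b64. */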
void
select_vs_prolog(Program* program, const struct radv_vs_prolog_key* key, ac_shader_config* config,
                 const struct radv_shader_args* args, unsigned* num_preserved_sgprs)
{
   assert(key->num_attributes > 0);

   /* This should be enough for any shader/stage. */
   unsigned max_user_sgprs = args->options->chip_class >= GFX9 ? 32 : 16;
   *num_preserved_sgprs = max_user_sgprs + 14;

   init_program(program, compute_cs, args->shader_info, args->options->chip_class,
                args->options->family, args->options->wgp_mode, config);

   Block* block = program->create_and_insert_block();
   block->kind = block_kind_top_level;

   program->workgroup_size = 64;
   calc_min_waves(program);

   Builder bld(program, block);

   block->instructions.reserve(16 + key->num_attributes * 4);

   bld.sopp(aco_opcode::s_setprio, -1u, 0x3u);

   uint32_t attrib_mask = BITFIELD_MASK(key->num_attributes);
   bool has_nontrivial_divisors = key->state->nontrivial_divisors & attrib_mask;

   wait_imm lgkm_imm;
   lgkm_imm.lgkm = 0;

   /* choose sgprs */
   PhysReg vertex_buffers(align(*num_preserved_sgprs, 2));
   PhysReg prolog_input = vertex_buffers.advance(8);
   PhysReg desc(
      align((has_nontrivial_divisors ? prolog_input : vertex_buffers).advance(8).reg(), 4));

   Operand start_instance = get_arg_fixed(args, args->ac.start_instance);
   Operand instance_id = get_arg_fixed(args, args->ac.instance_id);

   PhysReg attributes_start(256 + args->ac.num_vgprs_used);
   /* choose vgprs that won't be used for anything else until the last attribute load */
   PhysReg vertex_index(attributes_start.reg() + key->num_attributes * 4 - 1);
   PhysReg instance_index(attributes_start.reg() + key->num_attributes * 4 - 2);
   PhysReg start_instance_vgpr(attributes_start.reg() + key->num_attributes * 4 - 3);
   PhysReg nontrivial_tmp_vgpr0(attributes_start.reg() + key->num_attributes * 4 - 4);
   PhysReg nontrivial_tmp_vgpr1(attributes_start.reg() + key->num_attributes * 4);

   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers, s1),
            get_arg_fixed(args, args->ac.vertex_buffers));
   bld.sop1(aco_opcode::s_mov_b32, Definition(vertex_buffers.advance(4), s1),
            Operand::c32((unsigned)args->options->address32_hi));

   /* calculate vgpr requirements */
   unsigned num_vgprs = attributes_start.reg() - 256;
   num_vgprs += key->num_attributes * 4;
   if (has_nontrivial_divisors && program->chip_class <= GFX8)
      num_vgprs++; /* make space for nontrivial_tmp_vgpr1 */
   unsigned num_sgprs = 0;

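   /* Load the vertex-buffer descriptors in batches sized by the remaining SGPR
    * budget. While the first batch is in flight, set up exec (for merged/NGG
    * configurations) and the shared vertex/instance index VGPRs, then wait on
    * lgkmcnt(0) and issue the attribute loads for that batch. */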
   for (unsigned loc = 0; loc < key->num_attributes;) {
      unsigned num_descs =
         load_vb_descs(bld, desc, Operand(vertex_buffers, s2), loc, key->num_attributes - loc);
      num_sgprs = MAX2(num_sgprs, desc.advance(num_descs * 16u).reg());

      if (loc == 0) {
         /* perform setup while we load the descriptors */
         if (key->is_ngg || key->next_stage != MESA_SHADER_VERTEX) {
            Operand count = get_arg_fixed(args, args->ac.merged_wave_info);
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), count, Operand::c32(0u));
            if (program->wave_size == 64) {
               bld.sopc(aco_opcode::s_bitcmp1_b32, Definition(scc, s1), count,
                        Operand::c32(6u /* log2(64) */));
               bld.sop2(aco_opcode::s_cselect_b64, Definition(exec, s2), Operand::c64(UINT64_MAX),
                        Operand(exec, s2), Operand(scc, s1));
            }
         }

         bool needs_instance_index = false;
         bool needs_start_instance = false;
         u_foreach_bit(i, key->state->instance_rate_inputs & attrib_mask)
         {
            needs_instance_index |= key->state->divisors[i] == 1;
            needs_start_instance |= key->state->divisors[i] == 0;
         }
         bool needs_vertex_index = ~key->state->instance_rate_inputs & attrib_mask;
         if (needs_vertex_index)
            bld.vadd32(Definition(vertex_index, v1), get_arg_fixed(args, args->ac.base_vertex),
                       get_arg_fixed(args, args->ac.vertex_id), false, Operand(s2), true);
         if (needs_instance_index)
            bld.vadd32(Definition(instance_index, v1), start_instance, instance_id, false,
                       Operand(s2), true);
         if (needs_start_instance)
            bld.vop1(aco_opcode::v_mov_b32, Definition(start_instance_vgpr, v1), start_instance);
      }

      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));

      for (unsigned i = 0; i < num_descs; i++, loc++) {
         PhysReg dest(attributes_start.reg() + loc * 4u);

         /* calculate index */
         Operand fetch_index = Operand(vertex_index, v1);
         if (key->state->instance_rate_inputs & (1u << loc)) {
            uint32_t divisor = key->state->divisors[loc];
            if (divisor) {
               fetch_index = instance_id;
               if (key->state->nontrivial_divisors & (1u << loc)) {
                  unsigned index =
                     util_bitcount(key->state->nontrivial_divisors & BITFIELD_MASK(loc));
                  fetch_index = calc_nontrivial_instance_id(
                     bld, args, index, instance_id, start_instance, prolog_input,
                     nontrivial_tmp_vgpr0, nontrivial_tmp_vgpr1);
               } else {
                  fetch_index = Operand(instance_index, v1);
               }
            } else {
               fetch_index = Operand(start_instance_vgpr, v1);
            }
         }

         /* perform load */
         PhysReg cur_desc = desc.advance(i * 16);
         if ((key->misaligned_mask & (1u << loc))) {
            unsigned dfmt = key->state->formats[loc] & 0xf;
            unsigned nfmt = key->state->formats[loc] >> 4;
            const struct ac_data_format_info* vtx_info = ac_get_data_format_info(dfmt);
            for (unsigned j = 0; j < vtx_info->num_channels; j++) {
               bool post_shuffle = key->state->post_shuffle & (1u << loc);
               unsigned offset = vtx_info->chan_byte_size * (post_shuffle && j < 3 ? 2 - j : j);

               /* Use MUBUF to work around hangs for byte-aligned dword loads. The Vulkan spec
                * doesn't require this to work, but some GL CTS tests over Zink do this anyway.
                * MTBUF can hang, but MUBUF doesn't (probably gives garbage, but GL CTS doesn't
                * care).
                */
               if (vtx_info->chan_format == V_008F0C_BUF_DATA_FORMAT_32)
                  bld.mubuf(aco_opcode::buffer_load_dword, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u), offset, false,
                            false, true);
               else
                  bld.mtbuf(aco_opcode::tbuffer_load_format_x, Definition(dest.advance(j * 4u), v1),
                            Operand(cur_desc, s4), fetch_index, Operand::c32(0u),
                            vtx_info->chan_format, nfmt, offset, false, true);
            }
            uint32_t one =
               nfmt == V_008F0C_BUF_NUM_FORMAT_UINT || nfmt == V_008F0C_BUF_NUM_FORMAT_SINT
                  ? 1u
                  : 0x3f800000u;
            for (unsigned j = vtx_info->num_channels; j < 4; j++) {
               bld.vop1(aco_opcode::v_mov_b32, Definition(dest.advance(j * 4u), v1),
                        Operand::c32(j == 3 ? one : 0u));
            }
         } else {
            bld.mubuf(aco_opcode::buffer_load_format_xyzw, Definition(dest, v4),
                      Operand(cur_desc, s4), fetch_index, Operand::c32(0u), 0u, false, false, true);
         }
      }
   }

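   /* The alpha fixup below rewrites the loaded attribute values in place, so
    * wait for the buffer loads to complete first. */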
   if (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi) {
      wait_imm vm_imm;
      vm_imm.vm = 0;
      bld.sopp(aco_opcode::s_waitcnt, -1, vm_imm.pack(program->chip_class));
   }

   /* For 2_10_10_10 formats the alpha is handled as unsigned by pre-Vega HW,
    * so we may need to fix it up. */
   u_foreach_bit(loc, (key->state->alpha_adjust_lo | key->state->alpha_adjust_hi))
   {
      PhysReg alpha(attributes_start.reg() + loc * 4u + 3);

      unsigned alpha_adjust = (key->state->alpha_adjust_lo >> loc) & 0x1;
      alpha_adjust |= ((key->state->alpha_adjust_hi >> loc) & 0x1) << 1;

      if (alpha_adjust == ALPHA_ADJUST_SSCALED)
         bld.vop1(aco_opcode::v_cvt_u32_f32, Definition(alpha, v1), Operand(alpha, v1));

      /* For the integer-like cases, do a natural sign extension.
       *
       * For the SNORM case, the values are 0.0, 0.333, 0.666, 1.0
       * and happen to contain 0, 1, 2, 3 as the two LSBs of the
       * exponent.
       */
      unsigned offset = alpha_adjust == ALPHA_ADJUST_SNORM ? 23u : 0u;
      bld.vop3(aco_opcode::v_bfe_i32, Definition(alpha, v1), Operand(alpha, v1),
               Operand::c32(offset), Operand::c32(2u));

      /* Convert back to the right type. */
      if (alpha_adjust == ALPHA_ADJUST_SNORM) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
         bld.vop2(aco_opcode::v_max_f32, Definition(alpha, v1), Operand::c32(0xbf800000u),
                  Operand(alpha, v1));
      } else if (alpha_adjust == ALPHA_ADJUST_SSCALED) {
         bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(alpha, v1), Operand(alpha, v1));
      }
   }

   block->kind |= block_kind_uniform;

   /* continue on to the main shader */
   Operand continue_pc = get_arg_fixed(args, args->prolog_inputs);
   if (has_nontrivial_divisors) {
      bld.smem(aco_opcode::s_load_dwordx2, Definition(prolog_input, s2),
               get_arg_fixed(args, args->prolog_inputs), Operand::c32(0u));
      bld.sopp(aco_opcode::s_waitcnt, -1, lgkm_imm.pack(program->chip_class));
      continue_pc = Operand(prolog_input, s2);
   }

   bld.sop1(aco_opcode::s_setpc_b64, continue_pc);

   program->config->float_mode = program->blocks[0].fp_mode.val;
   /* addition on GFX6-8 requires a carry-out (we use VCC) */
   program->needs_vcc = program->chip_class <= GFX8;
   program->config->num_vgprs = get_vgpr_alloc(program, num_vgprs);
   program->config->num_sgprs = get_sgpr_alloc(program, num_sgprs);
}
} // namespace aco