/*
 * Copyright © 2018 Valve Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include "aco_builder.h"
#include "aco_ir.h"

#include "common/sid.h"

#include <map>
#include <vector>

namespace aco {

struct lower_context {
   Program* program;
   Block* block;
   std::vector<aco_ptr<Instruction>> instructions;
};

/* used by handle_operands() indirectly through Builder::copy */
uint8_t int8_mul_table[512] = {
   0, 20, 1, 1, 1, 2, 1, 3, 1, 4, 1, 5, 1, 6, 1, 7, 1, 8, 1, 9,
   1, 10, 1, 11, 1, 12, 1, 13, 1, 14, 1, 15, 1, 16, 1, 17, 1, 18, 1, 19,
   1, 20, 1, 21, 1, 22, 1, 23, 1, 24, 1, 25, 1, 26, 1, 27, 1, 28, 1, 29,
   1, 30, 1, 31, 1, 32, 1, 33, 1, 34, 1, 35, 1, 36, 1, 37, 1, 38, 1, 39,
   1, 40, 1, 41, 1, 42, 1, 43, 1, 44, 1, 45, 1, 46, 1, 47, 1, 48, 1, 49,
   1, 50, 1, 51, 1, 52, 1, 53, 1, 54, 1, 55, 1, 56, 1, 57, 1, 58, 1, 59,
   1, 60, 1, 61, 1, 62, 1, 63, 1, 64, 5, 13, 2, 33, 17, 19, 2, 34, 3, 23,
   2, 35, 11, 53, 2, 36, 7, 47, 2, 37, 3, 25, 2, 38, 7, 11, 2, 39, 53, 243,
   2, 40, 3, 27, 2, 41, 17, 35, 2, 42, 5, 17, 2, 43, 3, 29, 2, 44, 15, 23,
   2, 45, 7, 13, 2, 46, 3, 31, 2, 47, 5, 19, 2, 48, 19, 59, 2, 49, 3, 33,
   2, 50, 7, 51, 2, 51, 15, 41, 2, 52, 3, 35, 2, 53, 11, 33, 2, 54, 23, 27,
   2, 55, 3, 37, 2, 56, 9, 41, 2, 57, 5, 23, 2, 58, 3, 39, 2, 59, 7, 17,
   2, 60, 9, 241, 2, 61, 3, 41, 2, 62, 5, 25, 2, 63, 35, 245, 2, 64, 3, 43,
   5, 26, 9, 43, 3, 44, 7, 19, 10, 39, 3, 45, 4, 34, 11, 59, 3, 46, 9, 243,
   4, 35, 3, 47, 22, 53, 7, 57, 3, 48, 5, 29, 10, 245, 3, 49, 4, 37, 9, 45,
   3, 50, 7, 241, 4, 38, 3, 51, 7, 22, 5, 31, 3, 52, 7, 59, 7, 242, 3, 53,
   4, 40, 7, 23, 3, 54, 15, 45, 4, 41, 3, 55, 6, 241, 9, 47, 3, 56, 13, 13,
   5, 34, 3, 57, 4, 43, 11, 39, 3, 58, 5, 35, 4, 44, 3, 59, 6, 243, 7, 245,
   3, 60, 5, 241, 7, 26, 3, 61, 4, 46, 5, 37, 3, 62, 11, 17, 4, 47, 3, 63,
   5, 38, 5, 243, 3, 64, 7, 247, 9, 50, 5, 39, 4, 241, 33, 37, 6, 33, 13, 35,
   4, 242, 5, 245, 6, 247, 7, 29, 4, 51, 5, 41, 5, 246, 7, 249, 3, 240, 11, 19,
   5, 42, 3, 241, 4, 245, 25, 29, 3, 242, 5, 43, 4, 246, 3, 243, 17, 58, 17, 43,
   3, 244, 5, 249, 6, 37, 3, 245, 2, 240, 5, 45, 2, 241, 21, 23, 2, 242, 3, 247,
   2, 243, 5, 251, 2, 244, 29, 61, 2, 245, 3, 249, 2, 246, 17, 29, 2, 247, 9, 55,
   1, 240, 1, 241, 1, 242, 1, 243, 1, 244, 1, 245, 1, 246, 1, 247, 1, 248, 1, 249,
   1, 250, 1, 251, 1, 252, 1, 253, 1, 254, 1, 255};

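/* Maps a ReduceOp to the VALU opcode used for a single reduction step on the
 * given chip. aco_opcode::num_opcodes is returned when no such instruction
 * exists (64-bit integer ops), in which case the reduction is emulated. */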
aco_opcode
get_reduce_opcode(chip_class chip, ReduceOp op)
{
   /* Because some 16-bit instructions are already VOP3 on GFX10, we use the
    * 32-bit opcodes (VOP2), which allows us to remove the temporary VGPR and
    * to use DPP with the arithmetic instructions. This requires sign-extension.
    */
   switch (op) {
   case iadd8:
   case iadd16:
      if (chip >= GFX10) {
         return aco_opcode::v_add_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_add_u16;
      } else {
         return aco_opcode::v_add_co_u32;
      }
      break;
   case imul8:
   case imul16:
      if (chip >= GFX10) {
         return aco_opcode::v_mul_lo_u16_e64;
      } else if (chip >= GFX8) {
         return aco_opcode::v_mul_lo_u16;
      } else {
         return aco_opcode::v_mul_u32_u24;
      }
      break;
   case fadd16: return aco_opcode::v_add_f16;
   case fmul16: return aco_opcode::v_mul_f16;
   case imax8:
   case imax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_i16;
      } else {
         return aco_opcode::v_max_i32;
      }
      break;
   case imin8:
   case imin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_i32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_i16;
      } else {
         return aco_opcode::v_min_i32;
      }
      break;
   case umin8:
   case umin16:
      if (chip >= GFX10) {
         return aco_opcode::v_min_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_min_u16;
      } else {
         return aco_opcode::v_min_u32;
      }
      break;
   case umax8:
   case umax16:
      if (chip >= GFX10) {
         return aco_opcode::v_max_u32;
      } else if (chip >= GFX8) {
         return aco_opcode::v_max_u16;
      } else {
         return aco_opcode::v_max_u32;
      }
      break;
   case fmin16: return aco_opcode::v_min_f16;
   case fmax16: return aco_opcode::v_max_f16;
   case iadd32: return chip >= GFX9 ? aco_opcode::v_add_u32 : aco_opcode::v_add_co_u32;
   case imul32: return aco_opcode::v_mul_lo_u32;
   case fadd32: return aco_opcode::v_add_f32;
   case fmul32: return aco_opcode::v_mul_f32;
   case imax32: return aco_opcode::v_max_i32;
   case imin32: return aco_opcode::v_min_i32;
   case umin32: return aco_opcode::v_min_u32;
   case umax32: return aco_opcode::v_max_u32;
   case fmin32: return aco_opcode::v_min_f32;
   case fmax32: return aco_opcode::v_max_f32;
   case iand8:
   case iand16:
   case iand32: return aco_opcode::v_and_b32;
   case ixor8:
   case ixor16:
   case ixor32: return aco_opcode::v_xor_b32;
   case ior8:
   case ior16:
   case ior32: return aco_opcode::v_or_b32;
   case iadd64: return aco_opcode::num_opcodes;
   case imul64: return aco_opcode::num_opcodes;
   case fadd64: return aco_opcode::v_add_f64;
   case fmul64: return aco_opcode::v_mul_f64;
   case imin64: return aco_opcode::num_opcodes;
   case imax64: return aco_opcode::num_opcodes;
   case umin64: return aco_opcode::num_opcodes;
   case umax64: return aco_opcode::num_opcodes;
   case fmin64: return aco_opcode::v_min_f64;
   case fmax64: return aco_opcode::v_max_f64;
   case iand64: return aco_opcode::num_opcodes;
   case ior64: return aco_opcode::num_opcodes;
   case ixor64: return aco_opcode::num_opcodes;
   default: return aco_opcode::num_opcodes;
   }
}

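/* num_opcodes (returned for 64-bit integer reductions) counts as VOP3 here,
 * because those reductions are emulated with VOP3 instructions. */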
bool
is_vop3_reduce_opcode(aco_opcode opcode)
{
   /* 64-bit reductions are VOP3. */
   if (opcode == aco_opcode::num_opcodes)
      return true;

   return instr_info.format[(int)opcode] == Format::VOP3;
}

void
emit_vadd32(Builder& bld, Definition def, Operand src0, Operand src1)
{
   Instruction* instr = bld.vadd32(def, src0, src1, false, Operand(s2), true);
   if (instr->definitions.size() >= 2) {
      assert(instr->definitions[1].regClass() == bld.lm);
      instr->definitions[1].setFixed(vcc);
   }
}

void
emit_int64_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg,
                  PhysReg vtmp_reg, ReduceOp op, unsigned dpp_ctrl, unsigned row_mask,
                  unsigned bank_mask, bool bound_ctrl, Operand* identity = NULL)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   Definition vtmp_def[] = {Definition(vtmp_reg, v1), Definition(PhysReg{vtmp_reg + 1}, v1)};
   Operand src0[] = {Operand(src0_reg, v1), Operand(PhysReg{src0_reg + 1}, v1)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src1_64 = Operand(src1_reg, v2);
   Operand vtmp_op[] = {Operand(vtmp_reg, v1), Operand(PhysReg{vtmp_reg + 1}, v1)};
   Operand vtmp_op64 = Operand(vtmp_reg, v2);
   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         if (identity)
            bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), vtmp_op[0], src1[0]);
      } else {
         bld.vop2_dpp(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0],
                      dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      }
      bld.vop2_dpp(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
                   Operand(vcc, bld.lm), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   } else if (op == iand64) {
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_and_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ior64) {
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_or_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == ixor64) {
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop2_dpp(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      if (identity) {
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[1], identity[1]);
      }
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[1], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);

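      /* v_cndmask: dst = vcc ? src1 : vtmp, so the comparison above decides
       * which of the two values is kept as the minimum/maximum. */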
      bld.vopc(cmp, bld.def(bld.lm, vcc), vtmp_op64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], vtmp_op[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], vtmp_op[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      /* t4 = dpp(x_hi)
       * t1 = umul_lo(t4, y_lo)
       * t3 = dpp(x_lo)
       * t0 = umul_lo(t3, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(t3, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(t3, y_lo)
       * Requires that res_hi != src0[0] and res_hi != src1[0]
       * and that vtmp[0] != res_hi.
       */
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[1]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[1], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[1], vtmp_op[0], src1[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, vtmp_def[0], vtmp_op[0], src1[1]);
      emit_vadd32(bld, vtmp_def[1], vtmp_op[0], vtmp_op[1]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_hi_u32, vtmp_def[0], vtmp_op[0], src1[0]);
      emit_vadd32(bld, dst[1], vtmp_op[1], vtmp_op[0]);
      if (identity)
         bld.vop1(aco_opcode::v_mov_b32, vtmp_def[0], identity[0]);
      bld.vop1_dpp(aco_opcode::v_mov_b32, vtmp_def[0], src0[0], dpp_ctrl, row_mask, bank_mask,
                   bound_ctrl);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], vtmp_op[0], src1[0]);
   }
}

void
emit_int64_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
              ReduceOp op)
{
   Builder bld(ctx->program, &ctx->instructions);
   Definition dst[] = {Definition(dst_reg, v1), Definition(PhysReg{dst_reg + 1}, v1)};
   RegClass src0_rc = src0_reg.reg() >= 256 ? v1 : s1;
   Operand src0[] = {Operand(src0_reg, src0_rc), Operand(PhysReg{src0_reg + 1}, src0_rc)};
   Operand src1[] = {Operand(src1_reg, v1), Operand(PhysReg{src1_reg + 1}, v1)};
   Operand src0_64 = Operand(src0_reg, src0_reg.reg() >= 256 ? v2 : s2);
   Operand src1_64 = Operand(src1_reg, v2);

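   /* PhysReg indices >= 256 are VGPRs. If src0 lives in SGPRs, ops that need a
    * per-lane VGPR value copy it (or its high half) to vtmp first. */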
   if (src0_rc == s1 &&
       (op == imul64 || op == umin64 || op == umax64 || op == imin64 || op == imax64)) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), src0[0]);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0_reg = vtmp;
      src0[0] = Operand(vtmp, v1);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
      src0_64 = Operand(vtmp, v2);
   } else if (src0_rc == s1 && op == iadd64) {
      assert(vtmp.reg() != 0);
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), src0[1]);
      src0[1] = Operand(PhysReg{vtmp + 1}, v1);
   }

   if (op == iadd64) {
      if (ctx->program->chip_class >= GFX10) {
         bld.vop3(aco_opcode::v_add_co_u32_e64, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      } else {
         bld.vop2(aco_opcode::v_add_co_u32, dst[0], bld.def(bld.lm, vcc), src0[0], src1[0]);
      }
      bld.vop2(aco_opcode::v_addc_co_u32, dst[1], bld.def(bld.lm, vcc), src0[1], src1[1],
               Operand(vcc, bld.lm));
   } else if (op == iand64) {
      bld.vop2(aco_opcode::v_and_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_and_b32, dst[1], src0[1], src1[1]);
   } else if (op == ior64) {
      bld.vop2(aco_opcode::v_or_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_or_b32, dst[1], src0[1], src1[1]);
   } else if (op == ixor64) {
      bld.vop2(aco_opcode::v_xor_b32, dst[0], src0[0], src1[0]);
      bld.vop2(aco_opcode::v_xor_b32, dst[1], src0[1], src1[1]);
   } else if (op == umin64 || op == umax64 || op == imin64 || op == imax64) {
      aco_opcode cmp = aco_opcode::num_opcodes;
      switch (op) {
      case umin64: cmp = aco_opcode::v_cmp_gt_u64; break;
      case umax64: cmp = aco_opcode::v_cmp_lt_u64; break;
      case imin64: cmp = aco_opcode::v_cmp_gt_i64; break;
      case imax64: cmp = aco_opcode::v_cmp_lt_i64; break;
      default: break;
      }

      bld.vopc(cmp, bld.def(bld.lm, vcc), src0_64, src1_64);
      bld.vop2(aco_opcode::v_cndmask_b32, dst[0], src0[0], src1[0], Operand(vcc, bld.lm));
      bld.vop2(aco_opcode::v_cndmask_b32, dst[1], src0[1], src1[1], Operand(vcc, bld.lm));
   } else if (op == imul64) {
      if (src1_reg == dst_reg) {
         /* it's fine if src0==dst but not if src1==dst */
         std::swap(src0_reg, src1_reg);
         std::swap(src0[0], src1[0]);
         std::swap(src0[1], src1[1]);
         std::swap(src0_64, src1_64);
      }
      assert(!(src0_reg == src1_reg));
      /* t1 = umul_lo(x_hi, y_lo)
       * t0 = umul_lo(x_lo, y_hi)
       * t2 = iadd(t0, t1)
       * t5 = umul_hi(x_lo, y_lo)
       * res_hi = iadd(t2, t5)
       * res_lo = umul_lo(x_lo, y_lo)
       * assumes that it's ok to modify x_hi/y_hi, since we might not have vtmp
       */
      Definition tmp0_def(PhysReg{src0_reg + 1}, v1);
      Definition tmp1_def(PhysReg{src1_reg + 1}, v1);
      Operand tmp0_op = src0[1];
      Operand tmp1_op = src1[1];
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp0_def, src0[1], src1[0]);
      bld.vop3(aco_opcode::v_mul_lo_u32, tmp1_def, src0[0], src1[1]);
      emit_vadd32(bld, tmp0_def, tmp1_op, tmp0_op);
      bld.vop3(aco_opcode::v_mul_hi_u32, tmp1_def, src0[0], src1[0]);
      emit_vadd32(bld, dst[1], tmp0_op, tmp1_op);
      bld.vop3(aco_opcode::v_mul_lo_u32, dst[0], src0[0], src1[0]);
   }
}

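/* Emits a single reduction step: combines the DPP-permuted src0 with src1.
 * VOP3-only opcodes cannot encode DPP, so the permuted value goes through vtmp. */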
void
emit_dpp_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
            ReduceOp op, unsigned size, unsigned dpp_ctrl, unsigned row_mask, unsigned bank_mask,
            bool bound_ctrl, Operand* identity = NULL) /* for VOP3 with sparse writes */
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, rc);
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (!vop3) {
      if (opcode == aco_opcode::v_add_co_u32)
         bld.vop2_dpp(opcode, dst, bld.def(bld.lm, vcc), src0, src1, dpp_ctrl, row_mask, bank_mask,
                      bound_ctrl);
      else
         bld.vop2_dpp(opcode, dst, src0, src1, dpp_ctrl, row_mask, bank_mask, bound_ctrl);
      return;
   }

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_dpp_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op, dpp_ctrl, row_mask, bank_mask,
                        bound_ctrl, identity);
      return;
   }

   if (identity)
      bld.vop1(aco_opcode::v_mov_b32, Definition(vtmp, v1), identity[0]);
   if (identity && size >= 2)
      bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + 1}, v1), identity[1]);

   for (unsigned i = 0; i < size; i++)
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                   Operand(PhysReg{src0_reg + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);

   bld.vop3(opcode, dst, Operand(vtmp, rc), src1);
}

void
emit_op(lower_context* ctx, PhysReg dst_reg, PhysReg src0_reg, PhysReg src1_reg, PhysReg vtmp,
        ReduceOp op, unsigned size)
{
   Builder bld(ctx->program, &ctx->instructions);
   RegClass rc = RegClass(RegType::vgpr, size);
   Definition dst(dst_reg, rc);
   Operand src0(src0_reg, RegClass(src0_reg.reg() >= 256 ? RegType::vgpr : RegType::sgpr, size));
   Operand src1(src1_reg, rc);

   aco_opcode opcode = get_reduce_opcode(ctx->program->chip_class, op);
   bool vop3 = is_vop3_reduce_opcode(opcode);

   if (opcode == aco_opcode::num_opcodes) {
      emit_int64_op(ctx, dst_reg, src0_reg, src1_reg, vtmp, op);
      return;
   }

   if (vop3) {
      bld.vop3(opcode, dst, src0, src1);
   } else if (opcode == aco_opcode::v_add_co_u32) {
      bld.vop2(opcode, dst, bld.def(bld.lm, vcc), src0, src1);
   } else {
      bld.vop2(opcode, dst, src0, src1);
   }
}

void
emit_dpp_mov(lower_context* ctx, PhysReg dst, PhysReg src0, unsigned size, unsigned dpp_ctrl,
             unsigned row_mask, unsigned bank_mask, bool bound_ctrl)
{
   Builder bld(ctx->program, &ctx->instructions);
   for (unsigned i = 0; i < size; i++) {
      bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(PhysReg{dst + i}, v1),
                   Operand(PhysReg{src0 + i}, v1), dpp_ctrl, row_mask, bank_mask, bound_ctrl);
   }
}

void
emit_ds_swizzle(Builder bld, PhysReg dst, PhysReg src, unsigned size, unsigned ds_pattern)
{
   for (unsigned i = 0; i < size; i++) {
      bld.ds(aco_opcode::ds_swizzle_b32, Definition(PhysReg{dst + i}, v1),
             Operand(PhysReg{src + i}, v1), ds_pattern);
   }
}

void
emit_reduction(lower_context* ctx, aco_opcode op, ReduceOp reduce_op, unsigned cluster_size,
               PhysReg tmp, PhysReg stmp, PhysReg vtmp, PhysReg sitmp, Operand src, Definition dst)
{
   assert(cluster_size == ctx->program->wave_size || op == aco_opcode::p_reduce);
   assert(cluster_size <= ctx->program->wave_size);

   Builder bld(ctx->program, &ctx->instructions);

   Operand identity[2];
   identity[0] = Operand::c32(get_reduction_identity(reduce_op, 0));
   identity[1] = Operand::c32(get_reduction_identity(reduce_op, 1));
   Operand vcndmask_identity[2] = {identity[0], identity[1]};

   /* First, copy the source to tmp and set inactive lanes to the identity */
   bld.sop1(Builder::s_or_saveexec, Definition(stmp, bld.lm), Definition(scc, s1),
            Definition(exec, bld.lm), Operand::c64(UINT64_MAX), Operand(exec, bld.lm));

   for (unsigned i = 0; i < src.size(); i++) {
      /* p_exclusive_scan needs it to be an sgpr or inline constant for the v_writelane_b32
       * except on GFX10, where v_writelane_b32 can take a literal. */
      if (identity[i].isLiteral() && op == aco_opcode::p_exclusive_scan &&
          ctx->program->chip_class < GFX10) {
         bld.sop1(aco_opcode::s_mov_b32, Definition(PhysReg{sitmp + i}, s1), identity[i]);
         identity[i] = Operand(PhysReg{sitmp + i}, s1);

         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
      } else if (identity[i].isLiteral()) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{tmp + i}, v1), identity[i]);
         vcndmask_identity[i] = Operand(PhysReg{tmp + i}, v1);
      }
   }

   for (unsigned i = 0; i < src.size(); i++) {
      bld.vop2_e64(aco_opcode::v_cndmask_b32, Definition(PhysReg{tmp + i}, v1),
                   vcndmask_identity[i], Operand(PhysReg{src.physReg() + i}, v1),
                   Operand(stmp, bld.lm));
   }

   if (src.regClass() == v1b) {
      if (ctx->program->chip_class >= GFX8) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin8 || reduce_op == imax8;
         sdwa->sel[0] = SubdwordSel(1, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else {
         aco_opcode opcode;

         if (reduce_op == imin8 || reduce_op == imax8)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(8u));
      }
   } else if (src.regClass() == v2b) {
      if (ctx->program->chip_class >= GFX10 &&
          (reduce_op == iadd16 || reduce_op == imax16 || reduce_op == imin16 ||
           reduce_op == umin16 || reduce_op == umax16)) {
         aco_ptr<SDWA_instruction> sdwa{create_instruction<SDWA_instruction>(
            aco_opcode::v_mov_b32, asSDWA(Format::VOP1), 1, 1)};
         sdwa->operands[0] = Operand(PhysReg{tmp}, v1);
         sdwa->definitions[0] = Definition(PhysReg{tmp}, v1);
         bool sext = reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16;
         sdwa->sel[0] = SubdwordSel(2, 0, sext);
         sdwa->dst_sel = SubdwordSel::dword;
         bld.insert(std::move(sdwa));
      } else if (ctx->program->chip_class == GFX6 || ctx->program->chip_class == GFX7) {
         aco_opcode opcode;

         if (reduce_op == imin16 || reduce_op == imax16 || reduce_op == iadd16)
            opcode = aco_opcode::v_bfe_i32;
         else
            opcode = aco_opcode::v_bfe_u32;

         bld.vop3(opcode, Definition(PhysReg{tmp}, v1), Operand(PhysReg{tmp}, v1), Operand::zero(),
                  Operand::c32(16u));
      }
   }

   bool reduction_needs_last_op = false;
   switch (op) {
   case aco_opcode::p_reduce:
      if (cluster_size == 1)
         break;

      if (ctx->program->chip_class <= GFX7) {
         reduction_needs_last_op = true;
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(1, 0, 3, 2));
         if (cluster_size == 2)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(2, 3, 0, 1));
         if (cluster_size == 4)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x04));
         if (cluster_size == 8)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x08));
         if (cluster_size == 16)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         if (cluster_size == 32)
            break;
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         reduction_needs_last_op = false;
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(1, 0, 3, 2), 0xf,
                  0xf, false);
      if (cluster_size == 2)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_quad_perm(2, 3, 0, 1), 0xf,
                  0xf, false);
      if (cluster_size == 4)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_half_mirror, 0xf, 0xf,
                  false);
      if (cluster_size == 8)
         break;
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_mirror, 0xf, 0xf, false);
      if (cluster_size == 16)
         break;

      if (ctx->program->chip_class >= GFX10) {
         /* GFX10+ doesn't support row_bcast15 and row_bcast31 */
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1), Operand::zero(), Operand::zero());

         if (cluster_size == 32) {
            reduction_needs_last_op = true;
            break;
         }

         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{dst.physReg() + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
         // TODO: it would be more effective to do the last reduction step on SALU
         emit_op(ctx, tmp, dst.physReg(), tmp, vtmp, reduce_op, src.size());
         break;
      }

      if (cluster_size == 32) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1f, 0, 0x10));
         reduction_needs_last_op = true;
         break;
      }
      assert(cluster_size == 64);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                  false);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                  false);
      break;
   case aco_opcode::p_exclusive_scan:
      if (ctx->program->chip_class >= GFX10) { /* gfx10 doesn't support wf_sr1, so emulate it */
         /* shift rows right */
         emit_dpp_mov(ctx, vtmp, tmp, src.size(), dpp_row_sr(1), 0xf, 0xf, true);

         /* fill in the gaps in rows 1 and 3 */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10000u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand::c32(0x10000u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->vop3().opsel = 1; /* FI (Fetch Inactive) */
         }
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand::c64(UINT64_MAX));

         if (ctx->program->wave_size == 64) {
            /* fill in the gap in row 2 */
            for (unsigned i = 0; i < src.size(); i++) {
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
               bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                             Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            }
         }
         std::swap(tmp, vtmp);
      } else if (ctx->program->chip_class >= GFX8) {
         emit_dpp_mov(ctx, tmp, tmp, src.size(), dpp_wf_sr1, 0xf, 0xf, true);
      } else {
         // TODO: use LDS on CS with a single write and shifted read
         /* wavefront shift_right by 1 on SI/CI */
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), (1 << 15) | dpp_quad_perm(0, 0, 1, 2));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x07)); /* mirror(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x10101010u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x08)); /* swap(8) */
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0x01000100u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, tmp, tmp, src.size(),
                         ds_pattern_bitmode(0x1F, 0x00, 0x10)); /* swap(16) */
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(1u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(1u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++)
            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{vtmp + i}, v1),
                     Operand(PhysReg{tmp + i}, v1));

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         for (unsigned i = 0; i < src.size(); i++) {
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{vtmp + i}, v1));
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::zero());
            bld.writelane(Definition(PhysReg{vtmp + i}, v1), Operand(PhysReg{sitmp + i}, s1),
                          Operand::c32(32u), Operand(PhysReg{vtmp + i}, v1));
            identity[i] = Operand::zero(); /* prevent further uses of identity */
         }
         std::swap(tmp, vtmp);
      }

      for (unsigned i = 0; i < src.size(); i++) {
         if (!identity[i].isConstant() ||
             identity[i].constantValue()) { /* bound_ctrl should take care of this otherwise */
            if (ctx->program->chip_class < GFX10)
               assert((identity[i].isConstant() && !identity[i].isLiteral()) ||
                      identity[i].physReg() == PhysReg{sitmp + i});
            bld.writelane(Definition(PhysReg{tmp + i}, v1), identity[i], Operand::zero(),
                          Operand(PhysReg{tmp + i}, v1));
         }
      }
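      /* The exclusive scan is the inclusive scan of the values shifted right by
       * one lane (with the identity filled in), so continue with the code below. */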
      FALLTHROUGH;
   case aco_opcode::p_inclusive_scan:
      assert(cluster_size == ctx->program->wave_size);
      if (ctx->program->chip_class <= GFX7) {
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1e, 0x00, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xAAAAAAAAu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x1c, 0x01, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xCCCCCCCCu));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x18, 0x03, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xF0F0F0F0u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x10, 0x07, 0x00));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_lo, s1), Operand::c32(0xFF00FF00u));
         bld.sop1(aco_opcode::s_mov_b32, Definition(exec_hi, s1), Operand(exec_lo, s1));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand::c64(UINT64_MAX));
         emit_ds_swizzle(bld, vtmp, tmp, src.size(), ds_pattern_bitmode(0x00, 0x0f, 0x00));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         for (unsigned i = 0; i < src.size(); i++)
            bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                         Operand::c32(31u));
         bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                  Operand::c32(32u));
         emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         break;
      }

      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(1), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(2), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(4), 0xf, 0xf, false,
                  identity);
      emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_sr(8), 0xf, 0xf, false,
                  identity);
      if (ctx->program->chip_class >= GFX10) {
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_lo, s1), Operand::c32(16u),
                  Operand::c32(16u));
         bld.sop2(aco_opcode::s_bfm_b32, Definition(exec_hi, s1), Operand::c32(16u),
                  Operand::c32(16u));
         for (unsigned i = 0; i < src.size(); i++) {
            Instruction* perm =
               bld.vop3(aco_opcode::v_permlanex16_b32, Definition(PhysReg{vtmp + i}, v1),
                        Operand(PhysReg{tmp + i}, v1), Operand::c32(0xffffffffu),
                        Operand::c32(0xffffffffu))
                  .instr;
            perm->vop3().opsel = 1; /* FI (Fetch Inactive) */
         }
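         /* accumulate the lower half-row's result into lanes 16-31/48-63
          * (exec was restricted to those lanes by s_bfm above) */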
         emit_op(ctx, tmp, tmp, vtmp, PhysReg{0}, reduce_op, src.size());

         if (ctx->program->wave_size == 64) {
            bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u),
                     Operand::c32(32u));
            for (unsigned i = 0; i < src.size(); i++)
               bld.readlane(Definition(PhysReg{sitmp + i}, s1), Operand(PhysReg{tmp + i}, v1),
                            Operand::c32(31u));
            emit_op(ctx, tmp, sitmp, tmp, vtmp, reduce_op, src.size());
         }
      } else {
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast15, 0xa, 0xf,
                     false, identity);
         emit_dpp_op(ctx, tmp, tmp, tmp, vtmp, reduce_op, src.size(), dpp_row_bcast31, 0xc, 0xf,
                     false, identity);
      }
      break;
   default: unreachable("Invalid reduction mode");
   }

   if (op == aco_opcode::p_reduce) {
      if (reduction_needs_last_op && dst.regClass().type() == RegType::vgpr) {
         bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));
         emit_op(ctx, dst.physReg(), tmp, vtmp, PhysReg{0}, reduce_op, src.size());
         return;
      }

      if (reduction_needs_last_op)
         emit_op(ctx, tmp, vtmp, tmp, PhysReg{0}, reduce_op, src.size());
   }

   /* restore exec */
   bld.sop1(Builder::s_mov, Definition(exec, bld.lm), Operand(stmp, bld.lm));

   if (dst.regClass().type() == RegType::sgpr) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.readlane(Definition(PhysReg{dst.physReg() + k}, s1), Operand(PhysReg{tmp + k}, v1),
                      Operand::c32(ctx->program->wave_size - 1));
      }
   } else if (dst.physReg() != tmp) {
      for (unsigned k = 0; k < src.size(); k++) {
         bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{dst.physReg() + k}, v1),
                  Operand(PhysReg{tmp + k}, v1));
      }
   }
}

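/* Shared VGPRs are allocated right after the program's regular VGPRs
 * (PhysReg 256 is v0, hence the "+ 256" below). */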
void
emit_gfx10_wave64_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates proper bpermute on GFX10 in wave64 mode.
    *
    * This is necessary because on GFX10 the bpermute instruction only works
    * on half waves (you can think of it as having a cluster size of 32), so we
    * manually swap the data between the two halves using two shared VGPRs.
    */

   assert(program->chip_class >= GFX10);
   assert(program->wave_size == 64);

   unsigned shared_vgpr_reg_0 = align(program->config->num_vgprs, 4) + 256;
   Definition dst = instr->definitions[0];
   Definition tmp_exec = instr->definitions[1];
   Definition clobber_scc = instr->definitions[2];
   Operand index_x4 = instr->operands[0];
   Operand input_data = instr->operands[1];
   Operand same_half = instr->operands[2];

   assert(dst.regClass() == v1);
   assert(tmp_exec.regClass() == bld.lm);
   assert(clobber_scc.isFixed() && clobber_scc.physReg() == scc);
   assert(same_half.regClass() == bld.lm);
   assert(index_x4.regClass() == v1);
   assert(input_data.regClass().type() == RegType::vgpr);
   assert(input_data.bytes() <= 4);
   assert(dst.physReg() != index_x4.physReg());
   assert(dst.physReg() != input_data.physReg());
   assert(tmp_exec.physReg() != same_half.physReg());

   PhysReg shared_vgpr_lo(shared_vgpr_reg_0);
   PhysReg shared_vgpr_hi(shared_vgpr_reg_0 + 1);

   /* Permute the input within the same half-wave */
   bld.ds(aco_opcode::ds_bpermute_b32, dst, index_x4, input_data);

   /* HI: Copy data from high lanes 32-63 to shared vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, Definition(shared_vgpr_hi, v1), input_data,
                dpp_quad_perm(0, 1, 2, 3), 0xc, 0xf, false);
   /* Save EXEC */
   bld.sop1(aco_opcode::s_mov_b64, tmp_exec, Operand(exec, s2));
   /* Set EXEC to enable LO lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::zero());
   /* LO: Copy data from low lanes 0-31 to shared vgpr */
   bld.vop1(aco_opcode::v_mov_b32, Definition(shared_vgpr_lo, v1), input_data);
   /* LO: bpermute shared vgpr (high lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_hi, v1), index_x4,
          Operand(shared_vgpr_hi, v1));
   /* Set EXEC to enable HI lanes only */
   bld.sop2(aco_opcode::s_bfm_b64, Definition(exec, s2), Operand::c32(32u), Operand::c32(32u));
   /* HI: bpermute shared vgpr (low lanes' data) */
   bld.ds(aco_opcode::ds_bpermute_b32, Definition(shared_vgpr_lo, v1), index_x4,
          Operand(shared_vgpr_lo, v1));

   /* Only enable lanes which use the other half's data */
   bld.sop2(aco_opcode::s_andn2_b64, Definition(exec, s2), clobber_scc,
            Operand(tmp_exec.physReg(), s2), same_half);
   /* LO: Copy shared vgpr (high lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_hi, v1), dpp_quad_perm(0, 1, 2, 3),
                0x3, 0xf, false);
   /* HI: Copy shared vgpr (low lanes' bpermuted data) to output vgpr */
   bld.vop1_dpp(aco_opcode::v_mov_b32, dst, Operand(shared_vgpr_lo, v1), dpp_quad_perm(0, 1, 2, 3),
                0xc, 0xf, false);

   /* Restore saved EXEC */
   bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(tmp_exec.physReg(), s2));

   /* RA assumes that the result is always in the low part of the register, so we have to shift
    * if it's not there already */
   if (input_data.physReg().byte()) {
      unsigned right_shift = input_data.physReg().byte() * 8;
      bld.vop2(aco_opcode::v_lshrrev_b32, dst, Operand::c32(right_shift),
               Operand(dst.physReg(), v1));
   }
}

void
emit_gfx6_bpermute(Program* program, aco_ptr<Instruction>& instr, Builder& bld)
{
   /* Emulates bpermute using readlane instructions */

   Operand index = instr->operands[0];
   Operand input = instr->operands[1];
   Definition dst = instr->definitions[0];
   Definition temp_exec = instr->definitions[1];
   Definition clobber_vcc = instr->definitions[2];

   assert(dst.regClass() == v1);
   assert(temp_exec.regClass() == bld.lm);
   assert(clobber_vcc.regClass() == bld.lm);
   assert(clobber_vcc.physReg() == vcc);
   assert(index.regClass() == v1);
   assert(index.physReg() != dst.physReg());
   assert(input.regClass().type() == RegType::vgpr);
   assert(input.bytes() <= 4);
   assert(input.physReg() != dst.physReg());

   /* Save original EXEC */
   bld.sop1(aco_opcode::s_mov_b64, temp_exec, Operand(exec, s2));

   /* An "unrolled loop" that is executed for each lane.
    * This takes only a few instructions per lane, as opposed to a "real" loop
    * with branching, where the branch instruction alone would take 16+ cycles.
    */
   for (unsigned n = 0; n < program->wave_size; ++n) {
      /* Activate the lane which has N for its source index */
      bld.vopc(aco_opcode::v_cmpx_eq_u32, Definition(exec, bld.lm), clobber_vcc, Operand::c32(n),
               index);
      /* Read the data from lane N */
      bld.readlane(Definition(vcc, s1), input, Operand::c32(n));
      /* On the active lane, move the data we read from lane N to the destination VGPR */
      bld.vop1(aco_opcode::v_mov_b32, dst, Operand(vcc, s1));
      /* Restore original EXEC */
      bld.sop1(aco_opcode::s_mov_b64, Definition(exec, s2), Operand(temp_exec.physReg(), s2));
   }
}

struct copy_operation {
   Operand op;
   Definition def;
   unsigned bytes;
   union {
      uint8_t uses[8];
      uint64_t is_used = 0;
   };
};

void
split_copy(lower_context* ctx, unsigned offset, Definition* def, Operand* op,
           const copy_operation& src, bool ignore_uses, unsigned max_size)
{
   PhysReg def_reg = src.def.physReg();
   PhysReg op_reg = src.op.physReg();
   def_reg.reg_b += offset;
   op_reg.reg_b += offset;

   /* 64-bit VGPR copies (implemented with v_lshrrev_b64) are slow before GFX10 */
   if (ctx->program->chip_class < GFX10 && src.def.regClass().type() == RegType::vgpr)
      max_size = MIN2(max_size, 4);
   unsigned max_align = src.def.regClass().type() == RegType::vgpr ? 4 : 16;

   /* make sure the size is a power of two and reg % bytes == 0 */
   unsigned bytes = 1;
   for (; bytes <= max_size; bytes *= 2) {
      unsigned next = bytes * 2u;
      bool can_increase = def_reg.reg_b % MIN2(next, max_align) == 0 &&
                          offset + next <= src.bytes && next <= max_size;
      if (!src.op.isConstant() && can_increase)
         can_increase = op_reg.reg_b % MIN2(next, max_align) == 0;
      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
      if (!can_increase)
         break;
   }

   *def = Definition(src.def.tempId(), def_reg, src.def.regClass().resize(bytes));
   if (src.op.isConstant()) {
      assert(bytes >= 1 && bytes <= 8);
      uint64_t val = src.op.constantValue64() >> (offset * 8u);
      *op = Operand::get_const(ctx->program->chip_class, val, bytes);
   } else {
      RegClass op_cls = src.op.regClass().resize(bytes);
      *op = Operand(op_reg, op_cls);
      op->setTemp(Temp(src.op.tempId(), op_cls));
   }
}

uint32_t
get_intersection_mask(int a_start, int a_size, int b_start, int b_size)
{
   int intersection_start = MAX2(b_start - a_start, 0);
   int intersection_end = MAX2(b_start + b_size - a_start, 0);
   if (intersection_start >= a_size || intersection_end == 0)
      return 0;

   uint32_t mask = u_bit_consecutive(0, a_size);
   return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}

void
copy_constant(lower_context* ctx, Builder& bld, Definition dst, Operand op)
{
   assert(op.bytes() == dst.bytes());

   if (dst.bytes() == 4 && op.isLiteral()) {
      uint32_t imm = op.constantValue();
      if (dst.regClass() == s1 && (imm >= 0xffff8000 || imm <= 0x7fff)) {
         bld.sopk(aco_opcode::s_movk_i32, dst, imm & 0xFFFFu);
         return;
      } else if (util_bitreverse(imm) <= 64 || util_bitreverse(imm) >= 0xFFFFFFF0) {
         uint32_t rev = util_bitreverse(imm);
         if (dst.regClass() == s1)
            bld.sop1(aco_opcode::s_brev_b32, dst, Operand::c32(rev));
         else
            bld.vop1(aco_opcode::v_bfrev_b32, dst, Operand::c32(rev));
         return;
      } else if (dst.regClass() == s1 && imm != 0) {
         unsigned start = (ffs(imm) - 1) & 0x1f;
         unsigned size = util_bitcount(imm) & 0x1f;
         if ((((1u << size) - 1u) << start) == imm) {
            bld.sop2(aco_opcode::s_bfm_b32, dst, Operand::c32(size), Operand::c32(start));
            return;
         }
      }
   }

   if (op.bytes() == 4 && op.constantEquals(0x3e22f983) && ctx->program->chip_class >= GFX8)
      op.setFixed(PhysReg{248}); /* it can be an inline constant on GFX8+ */

   if (dst.regClass() == s1) {
      bld.sop1(aco_opcode::s_mov_b32, dst, op);
   } else if (dst.regClass() == s2) {
      /* s_ashr_i64 writes SCC, so we can't use it */
      assert(Operand::is_constant_representable(op.constantValue64(), 8, true, false));
      bld.sop1(aco_opcode::s_mov_b64, dst, op);
   } else if (dst.regClass() == v2) {
      if (Operand::is_constant_representable(op.constantValue64(), 8, true, false)) {
         bld.vop3(aco_opcode::v_lshrrev_b64, dst, Operand::zero(), op);
      } else {
         assert(Operand::is_constant_representable(op.constantValue64(), 8, false, true));
         bld.vop3(aco_opcode::v_ashrrev_i64, dst, Operand::zero(), op);
      }
   } else if (dst.regClass() == v1) {
      bld.vop1(aco_opcode::v_mov_b32, dst, op);
   } else {
      assert(dst.regClass() == v1b || dst.regClass() == v2b);

      if (dst.regClass() == v1b && ctx->program->chip_class >= GFX9) {
         uint8_t val = op.constantValue();
         Operand op32 = Operand::c32((uint32_t)val | (val & 0x80u ? 0xffffff00u : 0u));
         if (op32.isLiteral()) {
            uint32_t a = (uint32_t)int8_mul_table[val * 2];
            uint32_t b = (uint32_t)int8_mul_table[val * 2 + 1];
            bld.vop2_sdwa(aco_opcode::v_mul_u32_u24, dst,
                          Operand::c32(a | (a & 0x80u ? 0xffffff00u : 0x0u)),
                          Operand::c32(b | (b & 0x80u ? 0xffffff00u : 0x0u)));
         } else {
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op32);
         }
      } else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX9 && !op.isLiteral()) {
         if (op.constantValue() >= 0xfff0 || op.constantValue() <= 64) {
            /* use v_mov_b32 to avoid possible issues with denormal flushing or
             * NaN. v_add_f16 is still needed for float constants. */
            uint32_t val32 = (int32_t)(int16_t)op.constantValue();
            bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, Operand::c32(val32));
         } else {
            bld.vop2_sdwa(aco_opcode::v_add_f16, dst, op, Operand::zero());
         }
      } else if (dst.regClass() == v2b && ctx->program->chip_class >= GFX10 &&
                 (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in)) {
         if (dst.physReg().byte() == 2) {
            Operand def_lo(dst.physReg().advance(-2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, def_lo, op);
            instr->vop3().opsel = 0;
         } else {
            assert(dst.physReg().byte() == 0);
            Operand def_hi(dst.physReg().advance(2), v2b);
            Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, dst, op, def_hi);
            instr->vop3().opsel = 2;
         }
      } else {
         uint32_t offset = dst.physReg().byte() * 8u;
         uint32_t mask = ((1u << (dst.bytes() * 8)) - 1) << offset;
         uint32_t val = (op.constantValue() << offset) & mask;
         dst = Definition(PhysReg(dst.physReg().reg()), v1);
         Operand def_op(dst.physReg(), v1);
         if (val != mask)
            bld.vop2(aco_opcode::v_and_b32, dst, Operand::c32(~mask), def_op);
         if (val != 0)
            bld.vop2(aco_opcode::v_or_b32, dst, Operand::c32(val), def_op);
      }
   }
}

void
copy_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   for (unsigned i = 0; i < 2; i++) {
      if (def.size() == 2)
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      else
         bld.vop1(aco_opcode::v_mov_b32, def, op);

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}

void
swap_linear_vgpr(Builder& bld, Definition def, Operand op, bool preserve_scc, PhysReg scratch_sgpr)
{
   if (preserve_scc)
      bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand(scc, s1));

   Operand def_as_op = Operand(def.physReg(), def.regClass());
   Definition op_as_def = Definition(op.physReg(), op.regClass());

   for (unsigned i = 0; i < 2; i++) {
      if (bld.program->chip_class >= GFX9) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else {
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      }

      bld.sop1(Builder::s_not, Definition(exec, bld.lm), Definition(scc, s1),
               Operand(exec, bld.lm));
   }

   if (preserve_scc)
      bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(scratch_sgpr, s1),
               Operand::zero());
}

bool
do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool* preserve_scc,
        PhysReg scratch_sgpr)
{
   bool did_copy = false;
   for (unsigned offset = 0; offset < copy.bytes;) {
      if (copy.uses[offset]) {
         offset++;
         continue;
      }

      Definition def;
      Operand op;
      split_copy(ctx, offset, &def, &op, copy, false, 8);

      if (def.physReg() == scc) {
         bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand::zero());
         *preserve_scc = true;
      } else if (op.isConstant()) {
         copy_constant(ctx, bld, def, op);
      } else if (def.regClass().is_linear_vgpr()) {
         copy_linear_vgpr(bld, def, op, *preserve_scc, scratch_sgpr);
      } else if (def.regClass() == v1) {
         bld.vop1(aco_opcode::v_mov_b32, def, op);
      } else if (def.regClass() == v2) {
         bld.vop3(aco_opcode::v_lshrrev_b64, def, Operand::zero(), op);
      } else if (def.regClass() == s1) {
         bld.sop1(aco_opcode::s_mov_b32, def, op);
      } else if (def.regClass() == s2) {
         bld.sop1(aco_opcode::s_mov_b64, def, op);
      } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
         if (op.physReg().byte()) {
            assert(def.physReg().byte() == 0);
            bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand::c32(op.physReg().byte() * 8), op);
         } else if (def.physReg().byte()) {
            assert(op.physReg().byte() == 0);
            /* preserve the target's lower half */
            uint32_t bits = def.physReg().byte() * 8;
            PhysReg lo_reg = PhysReg(def.physReg().reg());
            Definition lo_half =
               Definition(lo_reg, RegClass::get(RegType::vgpr, def.physReg().byte()));
            Definition dst =
               Definition(lo_reg, RegClass::get(RegType::vgpr, lo_half.bytes() + op.bytes()));

            if (def.physReg().reg() == op.physReg().reg()) {
               bld.vop2(aco_opcode::v_and_b32, lo_half, Operand::c32((1 << bits) - 1u),
                        Operand(lo_reg, lo_half.regClass()));
               if (def.physReg().byte() == 1) {
                  bld.vop2(aco_opcode::v_mul_u32_u24, dst, Operand::c32((1 << bits) + 1u), op);
               } else if (def.physReg().byte() == 2) {
                  bld.vop2(aco_opcode::v_cvt_pk_u16_u32, dst, Operand(lo_reg, v2b), op);
               } else if (def.physReg().byte() == 3) {
                  bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1),
                           Operand::c32((1 << bits) + 1u));
                  bld.vop3(aco_opcode::v_mul_lo_u32, dst, Operand(scratch_sgpr, s1), op);
               }
            } else {
               lo_half.setFixed(lo_half.physReg().advance(4 - def.physReg().byte()));
               bld.vop2(aco_opcode::v_lshlrev_b32, lo_half, Operand::c32(32 - bits),
                        Operand(lo_reg, lo_half.regClass()));
               bld.vop3(aco_opcode::v_alignbyte_b32, dst, op,
                        Operand(lo_half.physReg(), lo_half.regClass()),
                        Operand::c32(4 - def.physReg().byte()));
            }
         } else {
            bld.vop1(aco_opcode::v_mov_b32, def, op);
         }
      } else if (def.regClass().is_subdword()) {
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def, op);
      } else {
         unreachable("unsupported copy");
      }

      did_copy = true;
      offset += def.bytes();
   }
   return did_copy;
}

void
do_swap(lower_context* ctx, Builder& bld, const copy_operation& copy, bool preserve_scc,
        Pseudo_instruction* pi)
{
   unsigned offset = 0;

   if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
       (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
      /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte
       * swap */
      PhysReg op = copy.op.physReg();
      PhysReg def = copy.def.physReg();
      op.reg_b &= ~0x3;
      def.reg_b &= ~0x3;

      copy_operation tmp;
      tmp.op = Operand(op, v1);
      tmp.def = Definition(def, v1);
      tmp.bytes = 4;
      memset(tmp.uses, 1, 4);
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
      tmp.op = Operand(op, v1b);
      tmp.def = Definition(def, v1b);
      tmp.bytes = 1;
      tmp.uses[0] = 1;
      do_swap(ctx, bld, tmp, preserve_scc, pi);

      offset = copy.bytes;
   }

   for (; offset < copy.bytes;) {
      Definition def;
      Operand op;
      unsigned max_size = copy.def.regClass().type() == RegType::vgpr ? 4 : 8;
      split_copy(ctx, offset, &def, &op, copy, true, max_size);

      assert(op.regClass() == def.regClass());
      Operand def_as_op = Operand(def.physReg(), def.regClass());
      Definition op_as_def = Definition(op.physReg(), op.regClass());
      if (def.regClass().is_linear_vgpr()) {
         swap_linear_vgpr(bld, def, op, preserve_scc, pi->scratch_sgpr);
      } else if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
      } else if (def.regClass() == v1) {
         assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      } else if (op.physReg() == scc || def.physReg() == scc) {
         /* we need to swap scc and another sgpr */
         assert(!preserve_scc);

         PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();

         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1),
                  Operand::zero());
         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
      } else if (def.regClass() == s1) {
         if (preserve_scc) {
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
            bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
         } else {
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
         }
      } else if (def.regClass() == s2) {
         if (preserve_scc)
            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
         bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
         if (preserve_scc)
            bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1),
                     Operand::zero());
      } else if (def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
         bld.vop3(aco_opcode::v_alignbyte_b32, Definition(def.physReg(), v1), def_as_op, op,
                  Operand::c32(2u));
      } else {
         assert(def.regClass().is_subdword());
         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
         bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
      }

      offset += def.bytes();
   }

   if (ctx->program->chip_class <= GFX7)
      return;

   /* fixup in case we swapped bytes we shouldn't have */
   copy_operation tmp_copy = copy;
   tmp_copy.op.setFixed(copy.def.physReg());
   tmp_copy.def.setFixed(copy.op.physReg());
   do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
}

void
do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
   if (lo.isConstant() && hi.isConstant()) {
      copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
      return;
   }

   bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
                       (ctx->program->chip_class >= GFX10 ||
                        (ctx->program->chip_class >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));

   if (can_use_pack) {
      Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
void
do_pack_2x16(lower_context* ctx, Builder& bld, Definition def, Operand lo, Operand hi)
{
   if (lo.isConstant() && hi.isConstant()) {
      copy_constant(ctx, bld, def, Operand::c32(lo.constantValue() | (hi.constantValue() << 16)));
      return;
   }

   bool can_use_pack = (ctx->block->fp_mode.denorm16_64 & fp_denorm_keep_in) &&
                       (ctx->program->chip_class >= GFX10 ||
                        (ctx->program->chip_class >= GFX9 && !lo.isLiteral() && !hi.isLiteral()));

   if (can_use_pack) {
      Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
      /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
      instr->vop3().opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
      return;
   }

   /* a single alignbyte can be sufficient: hi can be a 32-bit integer constant */
   if (lo.physReg().byte() == 2 && hi.physReg().byte() == 0 &&
       (!hi.isConstant() || !Operand::c32(hi.constantValue()).isLiteral() ||
        ctx->program->chip_class >= GFX10)) {
      bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
      return;
   }

   Definition def_lo = Definition(def.physReg(), v2b);
   Definition def_hi = Definition(def.physReg().advance(2), v2b);

   if (lo.isConstant()) {
      /* move hi and zero low bits */
      if (hi.physReg().byte() == 0)
         bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
      else
         bld.vop2(aco_opcode::v_and_b32, def_hi, Operand::c32(~0xFFFFu), hi);
      bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(lo.constantValue()),
               Operand(def.physReg(), v1));
      return;
   }
   if (hi.isConstant()) {
      /* move lo and zero high bits */
      if (lo.physReg().byte() == 2)
         bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
      else
         bld.vop2(aco_opcode::v_and_b32, def_lo, Operand::c32(0xFFFFu), lo);
      bld.vop2(aco_opcode::v_or_b32, def, Operand::c32(hi.constantValue() << 16u),
               Operand(def.physReg(), v1));
      return;
   }

   if (lo.physReg().reg() == def.physReg().reg()) {
      /* lo is in the high bits of def */
      assert(lo.physReg().byte() == 2);
      bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), lo);
      lo.setFixed(def.physReg());
   } else if (hi.physReg() == def.physReg()) {
      /* hi is in the low bits of def */
      assert(hi.physReg().byte() == 0);
      bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), hi);
      hi.setFixed(def.physReg().advance(2));
   } else if (ctx->program->chip_class >= GFX8) {
      /* either lo or hi can be placed with just a v_mov */
      assert(lo.physReg().byte() == 0 || hi.physReg().byte() == 2);
      Operand& op = lo.physReg().byte() == 0 ? lo : hi;
      PhysReg reg = def.physReg().advance(op.physReg().byte());
      bld.vop1(aco_opcode::v_mov_b32, Definition(reg, v2b), op);
      op.setFixed(reg);
   }

   if (ctx->program->chip_class >= GFX8) {
      /* either hi or lo is already placed correctly */
      if (lo.physReg().reg() == def.physReg().reg())
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def_hi, hi);
      else
         bld.vop1_sdwa(aco_opcode::v_mov_b32, def_lo, lo);
      return;
   }

   /* alignbyte needs the operands in the following way:
    * | xx hi | lo xx | >> 2 byte */
   if (lo.physReg().byte() != hi.physReg().byte()) {
      /* | xx lo | hi xx | => | lo hi | lo hi | */
      assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 2);
      bld.vop3(aco_opcode::v_alignbyte_b32, def, lo, hi, Operand::c32(2u));
      lo = Operand(def_hi.physReg(), v2b);
      hi = Operand(def_lo.physReg(), v2b);
   } else if (lo.physReg().byte() == 0) {
      /* | xx hi | xx lo | => | xx hi | lo 00 | */
      bld.vop2(aco_opcode::v_lshlrev_b32, def_hi, Operand::c32(16u), lo);
      lo = Operand(def_hi.physReg(), v2b);
   } else {
      /* | hi xx | lo xx | => | 00 hi | lo xx | */
      assert(hi.physReg().byte() == 2);
      bld.vop2(aco_opcode::v_lshrrev_b32, def_lo, Operand::c32(16u), hi);
      hi = Operand(def_lo.physReg(), v2b);
   }
   /* perform the alignbyte */
   bld.vop3(aco_opcode::v_alignbyte_b32, def, hi, lo, Operand::c32(2u));
}

void
try_coalesce_copies(lower_context* ctx, std::map<PhysReg, copy_operation>& copy_map,
                    copy_operation& copy)
{
   // TODO try more relaxed alignment for subdword copies
   unsigned next_def_align = util_next_power_of_two(copy.bytes + 1);
   unsigned next_op_align = next_def_align;
   if (copy.def.regClass().type() == RegType::vgpr)
      next_def_align = MIN2(next_def_align, 4);
   if (copy.op.regClass().type() == RegType::vgpr)
      next_op_align = MIN2(next_op_align, 4);

   if (copy.bytes >= 8 || copy.def.physReg().reg_b % next_def_align ||
       (!copy.op.isConstant() && copy.op.physReg().reg_b % next_op_align))
      return;

   auto other = copy_map.find(copy.def.physReg().advance(copy.bytes));
   if (other == copy_map.end() || copy.bytes + other->second.bytes > 8 ||
       copy.op.isConstant() != other->second.op.isConstant())
      return;

   /* don't create 64-bit copies before GFX10 */
   if (copy.bytes >= 4 && copy.def.regClass().type() == RegType::vgpr &&
       ctx->program->chip_class < GFX10)
      return;

   unsigned new_size = copy.bytes + other->second.bytes;
   if (copy.op.isConstant()) {
      uint64_t val =
         copy.op.constantValue64() | (other->second.op.constantValue64() << (copy.bytes * 8u));
      if (!util_is_power_of_two_or_zero(new_size))
         return;
      if (!Operand::is_constant_representable(val, new_size, true,
                                              copy.def.regClass().type() == RegType::vgpr))
         return;
      copy.op = Operand::get_const(ctx->program->chip_class, val, new_size);
   } else {
      if (other->second.op.physReg() != copy.op.physReg().advance(copy.bytes))
         return;
      copy.op = Operand(copy.op.physReg(), copy.op.regClass().resize(new_size));
   }

   copy.bytes = new_size;
   copy.def = Definition(copy.def.physReg(), copy.def.regClass().resize(copy.bytes));
   copy_map.erase(other);
}
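
/* Lowers a set of parallel copies (the "location transfer graph"): copies
 * whose destination bytes are not needed as a source by any other copy are
 * emitted first; whatever remains forms cycles, which are resolved with
 * register swaps below.
 */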
void
handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx,
                chip_class chip_class, Pseudo_instruction* pi)
{
   Builder bld(ctx->program, &ctx->instructions);
   unsigned num_instructions_before = ctx->instructions.size();
   aco_ptr<Instruction> mov;
   bool writes_scc = false;

   /* count the number of uses for each dst reg */
   for (auto it = copy_map.begin(); it != copy_map.end();) {

      if (it->second.def.physReg() == scc)
         writes_scc = true;

      assert(!pi->tmp_in_scc || !(it->second.def.physReg() == pi->scratch_sgpr));

      /* if src and dst reg are the same, remove operation */
      if (it->first == it->second.op.physReg()) {
         it = copy_map.erase(it);
         continue;
      }

      /* split large copies */
      if (it->second.bytes > 8) {
         assert(!it->second.op.isConstant());
         assert(!it->second.def.regClass().is_subdword());
         RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 2);
         Definition hi_def = Definition(PhysReg{it->first + 2}, rc);
         rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 2);
         Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 2}, rc);
         copy_operation copy = {hi_op, hi_def, it->second.bytes - 8};
         copy_map[hi_def.physReg()] = copy;
         assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
         it->second.op = Operand(it->second.op.physReg(),
                                 it->second.op.regClass().type() == RegType::sgpr ? s2 : v2);
         it->second.def = Definition(it->second.def.physReg(),
                                     it->second.def.regClass().type() == RegType::sgpr ? s2 : v2);
         it->second.bytes = 8;
      }

      try_coalesce_copies(ctx, copy_map, it->second);

      /* check if the definition reg is used by another copy operation */
      for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
         if (copy.second.op.isConstant())
            continue;
         for (uint16_t i = 0; i < it->second.bytes; i++) {
            /* distance might underflow */
            unsigned distance = it->first.reg_b + i - copy.second.op.physReg().reg_b;
            if (distance < copy.second.bytes)
               it->second.uses[i] += 1;
         }
      }

      ++it;
   }

   /* first, handle paths in the location transfer graph */
   bool preserve_scc = pi->tmp_in_scc && !writes_scc;
   bool skip_partial_copies = true;
   for (auto it = copy_map.begin();;) {
      if (copy_map.empty()) {
         ctx->program->statistics[statistic_copies] +=
            ctx->instructions.size() - num_instructions_before;
         return;
      }
      if (it == copy_map.end()) {
         if (!skip_partial_copies)
            break;
         skip_partial_copies = false;
         it = copy_map.begin();
      }

      /* check if we can pack one register at once */
      if (it->first.byte() == 0 && it->second.bytes == 2) {
         PhysReg reg_hi = it->first.advance(2);
         std::map<PhysReg, copy_operation>::iterator other = copy_map.find(reg_hi);
         if (other != copy_map.end() && other->second.bytes == 2) {
            /* check if the target register is otherwise unused */
            bool unused_lo = !it->second.is_used || (it->second.is_used == 0x0101 &&
                                                     other->second.op.physReg() == it->first);
            bool unused_hi = !other->second.is_used ||
                             (other->second.is_used == 0x0101 && it->second.op.physReg() == reg_hi);
            if (unused_lo && unused_hi) {
               Operand lo = it->second.op;
               Operand hi = other->second.op;
               do_pack_2x16(ctx, bld, Definition(it->first, v1), lo, hi);
               copy_map.erase(it);
               copy_map.erase(other);

               for (std::pair<const PhysReg, copy_operation>& other2 : copy_map) {
                  for (uint16_t i = 0; i < other2.second.bytes; i++) {
                     /* distance might underflow */
                     unsigned distance_lo = other2.first.reg_b + i - lo.physReg().reg_b;
                     unsigned distance_hi = other2.first.reg_b + i - hi.physReg().reg_b;
                     if (distance_lo < 2 || distance_hi < 2)
                        other2.second.uses[i] -= 1;
                  }
               }
               it = copy_map.begin();
               continue;
            }
         }
      }

      /* on GFX6/7, we need some small workarounds as there is no
       * SDWA instruction to do partial register writes */
      if (ctx->program->chip_class < GFX8 && it->second.bytes < 4) {
         if (it->first.byte() == 0 && it->second.op.physReg().byte() == 0 && !it->second.is_used &&
             pi->opcode == aco_opcode::p_split_vector) {
            /* Other operations might overwrite the high bits, so change all users
             * of the high bits to the new target where they are still available.
             * This mechanism depends on also emitting dead definitions. */
            PhysReg reg_hi = it->second.op.physReg().advance(it->second.bytes);
            while (reg_hi != PhysReg(it->second.op.physReg().reg() + 1)) {
               std::map<PhysReg, copy_operation>::iterator other = copy_map.begin();
               for (other = copy_map.begin(); other != copy_map.end(); other++) {
                  /* on GFX6/7, if the high bits are used as operand, they cannot be a target */
                  if (other->second.op.physReg() == reg_hi) {
                     other->second.op.setFixed(it->first.advance(reg_hi.byte()));
                     break; /* break because an operand can only be used once */
                  }
               }
               reg_hi = reg_hi.advance(it->second.bytes);
            }
         } else if (it->first.byte()) {
            assert(pi->opcode == aco_opcode::p_create_vector);
            /* on GFX6/7, if we target an upper half where the lower half hasn't yet been handled,
             * move to the target operand's high bits. This is safe to do as it cannot be an operand
             */
            PhysReg lo = PhysReg(it->first.reg());
            std::map<PhysReg, copy_operation>::iterator other = copy_map.find(lo);
            if (other != copy_map.end()) {
               assert(other->second.bytes == it->first.byte());
               PhysReg new_reg_hi = other->second.op.physReg().advance(it->first.byte());
               it->second.def = Definition(new_reg_hi, it->second.def.regClass());
               it->second.is_used = 0;
               other->second.bytes += it->second.bytes;
               other->second.def.setTemp(Temp(other->second.def.tempId(),
                                              RegClass::get(RegType::vgpr, other->second.bytes)));
               other->second.op.setTemp(Temp(other->second.op.tempId(),
                                             RegClass::get(RegType::vgpr, other->second.bytes)));
               /* if the new target's high bits are also a target, change uses */
               std::map<PhysReg, copy_operation>::iterator target = copy_map.find(new_reg_hi);
               if (target != copy_map.end()) {
                  for (unsigned i = 0; i < it->second.bytes; i++)
                     target->second.uses[i]++;
               }
            }
         }
      }

      /* find portions where the target reg is not used as operand for any other copy */
      if (it->second.is_used) {
         if (it->second.op.isConstant() || skip_partial_copies) {
            /* we have to skip constants until is_used=0.
             * we also skip partial copies at the beginning to help coalescing */
            ++it;
            continue;
         }

         unsigned has_zero_use_bytes = 0;
         for (unsigned i = 0; i < it->second.bytes; i++)
            has_zero_use_bytes |= (it->second.uses[i] == 0) << i;

         if (has_zero_use_bytes) {
            /* Skipping partial copying and doing a v_swap_b32 and then fixup
             * copies is usually beneficial for sub-dword copies, but if doing
             * a partial copy allows further copies, it should be done instead. */
            bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
            for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
               /* on GFX6/7, we can only do copies with full registers */
               if (partial_copy || ctx->program->chip_class <= GFX7)
                  break;
               for (uint16_t i = 0; i < copy.second.bytes; i++) {
                  /* distance might underflow */
                  unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
                  if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
                      !it->second.uses[distance])
                     partial_copy = true;
               }
            }

            if (!partial_copy) {
               ++it;
               continue;
            }
         } else {
            /* full target reg is used: register swapping needed */
            ++it;
            continue;
         }
      }

      bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
      skip_partial_copies = did_copy;
      std::pair<PhysReg, copy_operation> copy = *it;

      if (it->second.is_used == 0) {
         /* the target reg is not used as operand for any other copy, so we
          * copied to all of it */
         copy_map.erase(it);
         it = copy_map.begin();
      } else {
         /* we only performed some portions of this copy, so split it to only
          * leave the portions that still need to be done */
         copy_operation original = it->second; /* the map insertion below can overwrite this */
         copy_map.erase(it);
         for (unsigned offset = 0; offset < original.bytes;) {
            if (original.uses[offset] == 0) {
               offset++;
               continue;
            }
            Definition def;
            Operand op;
            split_copy(ctx, offset, &def, &op, original, false, 8);

            copy_operation new_copy = {op, def, def.bytes()};
            for (unsigned i = 0; i < new_copy.bytes; i++)
               new_copy.uses[i] = original.uses[i + offset];
            copy_map[def.physReg()] = new_copy;

            offset += def.bytes();
         }

         it = copy_map.begin();
      }

      /* Reduce the number of uses of the operand reg by one. Do this after
       * splitting the copy or removing it in case the copy writes to its own
       * operand (for example, v[7:8] = v[8:9]) */
      if (did_copy && !copy.second.op.isConstant()) {
         for (std::pair<const PhysReg, copy_operation>& other : copy_map) {
            for (uint16_t i = 0; i < other.second.bytes; i++) {
               /* distance might underflow */
               unsigned distance = other.first.reg_b + i - copy.second.op.physReg().reg_b;
               if (distance < copy.second.bytes && !copy.second.uses[distance])
                  other.second.uses[i] -= 1;
            }
         }
      }
   }

   /* all target regs are needed as operand somewhere, which means all entries are part of a cycle */
   unsigned largest = 0;
   for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
      largest = MAX2(largest, op.second.bytes);
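
   /* A minimal example of such a cycle is {v0 := v1, v1 := v0}, which can only
    * be resolved by swapping the two registers. Larger cycles are broken up one
    * swap at a time: after each swap, the remaining copies that read the swap's
    * destination are rewritten to read its operand instead. */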
   while (!copy_map.empty()) {

      /* Perform larger swaps first, because larger swaps can make other
       * swaps unnecessary. */
      auto it = copy_map.begin();
      for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
         if (it2->second.bytes > it->second.bytes) {
            it = it2;
            if (it->second.bytes == largest)
               break;
         }
      }

      /* should already be done */
      assert(!it->second.op.isConstant());

      assert(it->second.op.isFixed());
      assert(it->second.def.regClass() == it->second.op.regClass());

      if (it->first == it->second.op.physReg()) {
         copy_map.erase(it);
         continue;
      }

      if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
         assert(!(it->second.def.physReg() == pi->scratch_sgpr));

      /* to resolve the cycle, we have to swap the src reg with the dst reg */
      copy_operation swap = it->second;

      /* if this is self-intersecting, we have to split it because
       * self-intersecting swaps don't make sense */
      PhysReg src = swap.op.physReg(), dst = swap.def.physReg();
      if (abs((int)src.reg_b - (int)dst.reg_b) < (int)swap.bytes) {
         unsigned offset = abs((int)src.reg_b - (int)dst.reg_b);

         copy_operation remaining;
         src.reg_b += offset;
         dst.reg_b += offset;
         remaining.bytes = swap.bytes - offset;
         memcpy(remaining.uses, swap.uses + offset, remaining.bytes);
         remaining.op = Operand(src, swap.def.regClass().resize(remaining.bytes));
         remaining.def = Definition(dst, swap.def.regClass().resize(remaining.bytes));
         copy_map[dst] = remaining;

         memset(swap.uses + offset, 0, swap.bytes - offset);
         swap.bytes = offset;
      }

      /* GFX6-7 can only swap full registers */
      if (ctx->program->chip_class <= GFX7)
         swap.bytes = align(swap.bytes, 4);

      do_swap(ctx, bld, swap, preserve_scc, pi);

      /* remove from map */
      copy_map.erase(it);

      /* change the operand reg of the target's uses and split uses if needed */
      uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
      for (auto target = copy_map.begin(); target != copy_map.end(); ++target) {
         if (target->second.op.physReg() == swap.def.physReg() &&
             swap.bytes == target->second.bytes) {
            target->second.op.setFixed(swap.op.physReg());
            break;
         }

         uint32_t imask =
            get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
                                  target->second.op.physReg().reg_b, target->second.bytes);

         if (!imask)
            continue;

         int offset = (int)target->second.op.physReg().reg_b - (int)swap.def.physReg().reg_b;

         /* split and update the middle (the portion that reads the swap's
          * definition) to read the swap's operand instead */
         int target_op_end = target->second.op.physReg().reg_b + target->second.bytes;
         int swap_def_end = swap.def.physReg().reg_b + swap.bytes;
         int before_bytes = MAX2(-offset, 0);
         int after_bytes = MAX2(target_op_end - swap_def_end, 0);
         int middle_bytes = target->second.bytes - before_bytes - after_bytes;

         if (after_bytes) {
            unsigned after_offset = before_bytes + middle_bytes;
            assert(after_offset > 0);
            copy_operation copy;
            copy.bytes = after_bytes;
            memcpy(copy.uses, target->second.uses + after_offset, copy.bytes);
            RegClass rc = target->second.op.regClass().resize(after_bytes);
            copy.op = Operand(target->second.op.physReg().advance(after_offset), rc);
            copy.def = Definition(target->second.def.physReg().advance(after_offset), rc);
            copy_map[copy.def.physReg()] = copy;
         }

         if (middle_bytes) {
            copy_operation copy;
            copy.bytes = middle_bytes;
            memcpy(copy.uses, target->second.uses + before_bytes, copy.bytes);
            RegClass rc = target->second.op.regClass().resize(middle_bytes);
            copy.op = Operand(swap.op.physReg().advance(MAX2(offset, 0)), rc);
            copy.def = Definition(target->second.def.physReg().advance(before_bytes), rc);
            copy_map[copy.def.physReg()] = copy;
         }

         if (before_bytes) {
            copy_operation copy;
            target->second.bytes = before_bytes;
            RegClass rc = target->second.op.regClass().resize(before_bytes);
            target->second.op = Operand(target->second.op.physReg(), rc);
            target->second.def = Definition(target->second.def.physReg(), rc);
            memset(target->second.uses + target->second.bytes, 0, 8 - target->second.bytes);
         }

         /* break early since we know each byte of the swap's definition is used
          * at most once */
         bytes_left &= ~imask;
         if (!bytes_left)
            break;
      }
   }
   ctx->program->statistics[statistic_copies] += ctx->instructions.size() - num_instructions_before;
}

void
emit_set_mode(Builder& bld, float_mode new_mode, bool set_round, bool set_denorm)
{
   if (bld.program->chip_class >= GFX10) {
      if (set_round)
         bld.sopp(aco_opcode::s_round_mode, -1, new_mode.round);
      if (set_denorm)
         bld.sopp(aco_opcode::s_denorm_mode, -1, new_mode.denorm);
   } else if (set_round || set_denorm) {
      /* "((size - 1) << 11) | register" (MODE is encoded as register 1) */
      Instruction* instr =
         bld.sopk(aco_opcode::s_setreg_imm32_b32, Operand::c8(new_mode.val), (7 << 11) | 1).instr;
      /* has to be a literal */
      instr->operands[0].setFixed(PhysReg{255});
   }
}

void
emit_set_mode_from_block(Builder& bld, Program& program, Block* block, bool always_set)
{
   float_mode config_mode;
   config_mode.val = program.config->float_mode;

   bool set_round = always_set && block->fp_mode.round != config_mode.round;
   bool set_denorm = always_set && block->fp_mode.denorm != config_mode.denorm;
   if (block->kind & block_kind_top_level) {
      for (unsigned pred : block->linear_preds) {
         if (program.blocks[pred].fp_mode.round != block->fp_mode.round)
            set_round = true;
         if (program.blocks[pred].fp_mode.denorm != block->fp_mode.denorm)
            set_denorm = true;
      }
   }
   /* only allow changing modes at top-level blocks so this doesn't break
    * the "jump over empty blocks" optimization */
   assert((!set_round && !set_denorm) || (block->kind & block_kind_top_level));
   emit_set_mode(bld, block->fp_mode, set_round, set_denorm);
}
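
/* Entry point of this pass: walks all blocks and replaces the remaining pseudo
 * instructions (parallel copies, vector creation/splitting/extraction, spills,
 * reductions, pseudo branches, ...) with instructions the hardware supports.
 */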
void
lower_to_hw_instr(Program* program)
{
   Block* discard_block = NULL;

   for (int block_idx = program->blocks.size() - 1; block_idx >= 0; block_idx--) {
      Block* block = &program->blocks[block_idx];
      lower_context ctx;
      ctx.program = program;
      ctx.block = block;
      Builder bld(program, &ctx.instructions);

      emit_set_mode_from_block(bld, *program, block, (block_idx == 0));

      for (size_t instr_idx = 0; instr_idx < block->instructions.size(); instr_idx++) {
         aco_ptr<Instruction>& instr = block->instructions[instr_idx];
         aco_ptr<Instruction> mov;
         if (instr->isPseudo() && instr->opcode != aco_opcode::p_unit_test) {
            Pseudo_instruction* pi = &instr->pseudo();

            switch (instr->opcode) {
            case aco_opcode::p_extract_vector: {
               PhysReg reg = instr->operands[0].physReg();
               Definition& def = instr->definitions[0];
               reg.reg_b += instr->operands[1].constantValue() * def.bytes();

               if (reg == def.physReg())
                  break;

               RegClass op_rc = def.regClass().is_subdword()
                                   ? def.regClass()
                                   : RegClass(instr->operands[0].getTemp().type(), def.size());
               std::map<PhysReg, copy_operation> copy_operations;
               copy_operations[def.physReg()] = {Operand(reg, op_rc), def, def.bytes()};
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_create_vector: {
               std::map<PhysReg, copy_operation> copy_operations;
               PhysReg reg = instr->definitions[0].physReg();

               for (const Operand& op : instr->operands) {
                  if (op.isConstant()) {
                     const Definition def = Definition(
                        reg, instr->definitions[0].getTemp().regClass().resize(op.bytes()));
                     copy_operations[reg] = {op, def, op.bytes()};
                     reg.reg_b += op.bytes();
                     continue;
                  }
                  if (op.isUndefined()) {
                     // TODO: coalesce subdword copies if dst byte is 0
                     reg.reg_b += op.bytes();
                     continue;
                  }

                  RegClass rc_def =
                     op.regClass().is_subdword()
                        ? op.regClass()
                        : instr->definitions[0].getTemp().regClass().resize(op.bytes());
                  const Definition def = Definition(reg, rc_def);
                  copy_operations[def.physReg()] = {op, def, op.bytes()};
                  reg.reg_b += op.bytes();
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_split_vector: {
               std::map<PhysReg, copy_operation> copy_operations;
               PhysReg reg = instr->operands[0].physReg();

               for (const Definition& def : instr->definitions) {
                  RegClass rc_op = def.regClass().is_subdword()
                                      ? def.regClass()
                                      : instr->operands[0].getTemp().regClass().resize(def.bytes());
                  const Operand op = Operand(reg, rc_op);
                  copy_operations[def.physReg()] = {op, def, def.bytes()};
                  reg.reg_b += def.bytes();
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
            case aco_opcode::p_parallelcopy:
            case aco_opcode::p_wqm: {
               std::map<PhysReg, copy_operation> copy_operations;
               for (unsigned j = 0; j < instr->operands.size(); j++) {
                  assert(instr->definitions[j].bytes() == instr->operands[j].bytes());
                  copy_operations[instr->definitions[j].physReg()] = {
                     instr->operands[j], instr->definitions[j], instr->operands[j].bytes()};
               }
               handle_operands(copy_operations, &ctx, program->chip_class, pi);
               break;
            }
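            /* p_exit_early_if branches to a shared "discard" block that only
             * exports null and ends the program. The branch is skipped when
             * everything left before s_endpgm is harmless for a discarded wave
             * (null exports, exec-writing parallel copies, p_logical_end). */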
            case aco_opcode::p_exit_early_if: {
               /* don't bother with an early exit near the end of the program */
               if ((block->instructions.size() - 1 - instr_idx) <= 4 &&
                   block->instructions.back()->opcode == aco_opcode::s_endpgm) {
                  unsigned null_exp_dest =
                     (ctx.program->stage.hw == HWStage::FS) ? 9 /* NULL */ : V_008DFC_SQ_EXP_POS;
                  bool ignore_early_exit = true;

                  for (unsigned k = instr_idx + 1; k < block->instructions.size(); ++k) {
                     const aco_ptr<Instruction>& instr2 = block->instructions[k];
                     if (instr2->opcode == aco_opcode::s_endpgm ||
                         instr2->opcode == aco_opcode::p_logical_end)
                        continue;
                     else if (instr2->opcode == aco_opcode::exp &&
                              instr2->exp().dest == null_exp_dest)
                        continue;
                     else if (instr2->opcode == aco_opcode::p_parallelcopy &&
                              instr2->definitions[0].isFixed() &&
                              instr2->definitions[0].physReg() == exec)
                        continue;

                     ignore_early_exit = false;
                  }

                  if (ignore_early_exit)
                     break;
               }

               if (!discard_block) {
                  discard_block = program->create_and_insert_block();
                  block = &program->blocks[block_idx];

                  bld.reset(discard_block);
                  bld.exp(aco_opcode::exp, Operand(v1), Operand(v1), Operand(v1), Operand(v1), 0,
                          V_008DFC_SQ_EXP_NULL, false, true, true);
                  bld.sopp(aco_opcode::s_endpgm);

                  bld.reset(&ctx.instructions);
               }

               // TODO: exec can be zero here with block_kind_discard

               assert(instr->operands[0].physReg() == scc);
               bld.sopp(aco_opcode::s_cbranch_scc0, Definition(exec, s2), instr->operands[0],
                        discard_block->index);

               discard_block->linear_preds.push_back(block->index);
               block->linear_succs.push_back(discard_block->index);
               break;
            }
            case aco_opcode::p_spill: {
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->operands[2].size(); i++) {
                  Operand src =
                     instr->operands[2].isConstant()
                        ? Operand::c32(uint32_t(instr->operands[2].constantValue64() >> (32 * i)))
                        : Operand(PhysReg{instr->operands[2].physReg() + i}, s1);
                  bld.writelane(bld.def(v1, instr->operands[0].physReg()), src,
                                Operand::c32(instr->operands[1].constantValue() + i),
                                instr->operands[0]);
               }
               break;
            }
            case aco_opcode::p_reload: {
               assert(instr->operands[0].regClass() == v1.as_linear());
               for (unsigned i = 0; i < instr->definitions[0].size(); i++)
                  bld.readlane(bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                               instr->operands[0],
                               Operand::c32(instr->operands[1].constantValue() + i));
               break;
            }
            case aco_opcode::p_as_uniform: {
               if (instr->operands[0].isConstant() ||
                   instr->operands[0].regClass().type() == RegType::sgpr) {
                  std::map<PhysReg, copy_operation> copy_operations;
                  copy_operations[instr->definitions[0].physReg()] = {
                     instr->operands[0], instr->definitions[0], instr->definitions[0].bytes()};
                  handle_operands(copy_operations, &ctx, program->chip_class, pi);
               } else {
                  assert(instr->operands[0].regClass().type() == RegType::vgpr);
                  assert(instr->definitions[0].regClass().type() == RegType::sgpr);
                  assert(instr->operands[0].size() == instr->definitions[0].size());
                  for (unsigned i = 0; i < instr->definitions[0].size(); i++) {
                     bld.vop1(aco_opcode::v_readfirstlane_b32,
                              bld.def(s1, PhysReg{instr->definitions[0].physReg() + i}),
                              Operand(PhysReg{instr->operands[0].physReg() + i}, v1));
                  }
               }
               break;
            }
            case aco_opcode::p_bpermute: {
               if (ctx.program->chip_class <= GFX7)
                  emit_gfx6_bpermute(program, instr, bld);
               else if (ctx.program->chip_class >= GFX10 && ctx.program->wave_size == 64)
                  emit_gfx10_wave64_bpermute(program, instr, bld);
               else
                  unreachable("Current hardware supports ds_bpermute, don't emit p_bpermute.");
               break;
            }
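            /* p_constaddr materializes a PC-relative address: a getpc pseudo-op
             * reads the program counter and the offset is then added to the
             * 64-bit result (32-bit add followed by add-with-carry). */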
            case aco_opcode::p_constaddr: {
               unsigned id = instr->definitions[0].tempId();
               PhysReg reg = instr->definitions[0].physReg();
               bld.sop1(aco_opcode::p_constaddr_getpc, instr->definitions[0], Operand::c32(id));
               bld.sop2(aco_opcode::p_constaddr_addlo, Definition(reg, s1), bld.def(s1, scc),
                        Operand(reg, s1), Operand::c32(id));
               bld.sop2(aco_opcode::s_addc_u32, Definition(reg.advance(4), s1), bld.def(s1, scc),
                        Operand(reg.advance(4), s1), Operand::zero(), Operand(scc, s1));
               break;
            }
            case aco_opcode::p_extract: {
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               assert(instr->operands[3].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;
               bool signext = !instr->operands[3].constantEquals(0);

               if (dst.regClass() == s1) {
                  if (offset == (32 - bits)) {
                     bld.sop2(signext ? aco_opcode::s_ashr_i32 : aco_opcode::s_lshr_b32, dst,
                              bld.def(s1, scc), op, Operand::c32(offset));
                  } else if (offset == 0 && signext && (bits == 8 || bits == 16)) {
                     bld.sop1(bits == 8 ? aco_opcode::s_sext_i32_i8 : aco_opcode::s_sext_i32_i16,
                              dst, op);
                  } else {
                     bld.sop2(signext ? aco_opcode::s_bfe_i32 : aco_opcode::s_bfe_u32, dst,
                              bld.def(s1, scc), op, Operand::c32((bits << 16) | offset));
                  }
               } else if ((dst.regClass() == v1 && op.regClass() == v1) ||
                          ctx.program->chip_class <= GFX7) {
                  assert(op.physReg().byte() == 0 && dst.physReg().byte() == 0);
                  if (offset == (32 - bits) && op.regClass() != s1) {
                     bld.vop2(signext ? aco_opcode::v_ashrrev_i32 : aco_opcode::v_lshrrev_b32, dst,
                              Operand::c32(offset), op);
                  } else {
                     bld.vop3(signext ? aco_opcode::v_bfe_i32 : aco_opcode::v_bfe_u32, dst, op,
                              Operand::c32(offset), Operand::c32(bits));
                  }
               } else {
                  assert(dst.regClass() == v2b || dst.regClass() == v1b || op.regClass() == v2b ||
                         op.regClass() == v1b);
                  SDWA_instruction& sdwa =
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa();
                  sdwa.sel[0] = SubdwordSel(bits / 8, offset / 8, signext);
               }
               break;
            }
            case aco_opcode::p_insert: {
               assert(instr->operands[1].isConstant());
               assert(instr->operands[2].isConstant());
               if (instr->definitions[0].regClass() == s1)
                  assert(instr->definitions.size() >= 2 && instr->definitions[1].physReg() == scc);
               Definition dst = instr->definitions[0];
               Operand op = instr->operands[0];
               unsigned bits = instr->operands[2].constantValue();
               unsigned index = instr->operands[1].constantValue();
               unsigned offset = index * bits;

               if (dst.regClass() == s1) {
                  if (offset == (32 - bits)) {
                     bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc), op,
                              Operand::c32(offset));
                  } else if (offset == 0) {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
                              Operand::c32(bits << 16));
                  } else {
                     bld.sop2(aco_opcode::s_bfe_u32, dst, bld.def(s1, scc), op,
                              Operand::c32(bits << 16));
                     bld.sop2(aco_opcode::s_lshl_b32, dst, bld.def(s1, scc),
                              Operand(dst.physReg(), s1), Operand::c32(offset));
                  }
               } else if (dst.regClass() == v1 || ctx.program->chip_class <= GFX7) {
                  if (offset == (dst.bytes() * 8u - bits)) {
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op);
                  } else if (offset == 0) {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                  } else if (program->chip_class >= GFX9 ||
                             (op.regClass() != s1 && program->chip_class >= GFX8)) {
                     bld.vop1_sdwa(aco_opcode::v_mov_b32, dst, op).instr->sdwa().dst_sel =
                        SubdwordSel(bits / 8, offset / 8, false);
                  } else {
                     bld.vop3(aco_opcode::v_bfe_u32, dst, op, Operand::zero(), Operand::c32(bits));
                     bld.vop2(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset),
                              Operand(dst.physReg(), v1));
                  }
               } else {
                  assert(dst.regClass() == v2b);
                  bld.vop2_sdwa(aco_opcode::v_lshlrev_b32, dst, Operand::c32(offset), op)
                     .instr->sdwa()
                     .sel[1] = SubdwordSel::ubyte;
               }
               break;
            }
            default: break;
            }
         } else if (instr->isBranch()) {
            Pseudo_branch_instruction* branch = &instr->branch();
            uint32_t target = branch->target[0];

            /* check if all blocks from current to target are empty */
            /* In case there are <= 4 SALU or <= 2 VALU instructions, remove the branch */
            bool can_remove = block->index < target;
            unsigned num_scalar = 0;
            unsigned num_vector = 0;
            for (unsigned i = block->index + 1; can_remove && i < branch->target[0]; i++) {
               /* uniform branches must not be ignored if they
                * are about to jump over actual instructions */
               if (!program->blocks[i].instructions.empty() &&
                   (branch->opcode != aco_opcode::p_cbranch_z ||
                    branch->operands[0].physReg() != exec)) {
                  can_remove = false;
                  break;
               }

               for (aco_ptr<Instruction>& inst : program->blocks[i].instructions) {
                  if (inst->isSOPP()) {
                     can_remove = false;
                  } else if (inst->isSALU()) {
                     num_scalar++;
                  } else if (inst->isVALU()) {
                     num_vector++;
                  } else {
                     can_remove = false;
                  }

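                  /* the branch is only removed if falling through is cheap:
                   * VALU instructions count double against the small budget
                   * checked below */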
                  if (num_scalar + num_vector * 2 > 4)
                     can_remove = false;

                  if (!can_remove)
                     break;
               }
            }

            if (can_remove)
               continue;

            switch (instr->opcode) {
            case aco_opcode::p_branch:
               assert(block->linear_succs[0] == target);
               bld.sopp(aco_opcode::s_branch, branch->definitions[0], target);
               break;
            case aco_opcode::p_cbranch_nz:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execnz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccnz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc1, branch->definitions[0], target);
               }
               break;
            case aco_opcode::p_cbranch_z:
               assert(block->linear_succs[1] == target);
               if (branch->operands[0].physReg() == exec)
                  bld.sopp(aco_opcode::s_cbranch_execz, branch->definitions[0], target);
               else if (branch->operands[0].physReg() == vcc)
                  bld.sopp(aco_opcode::s_cbranch_vccz, branch->definitions[0], target);
               else {
                  assert(branch->operands[0].physReg() == scc);
                  bld.sopp(aco_opcode::s_cbranch_scc0, branch->definitions[0], target);
               }
               break;
            default: unreachable("Unknown Pseudo branch instruction!");
            }

         } else if (instr->isReduction()) {
            Pseudo_reduction_instruction& reduce = instr->reduction();
            emit_reduction(&ctx, reduce.opcode, reduce.reduce_op, reduce.cluster_size,
                           reduce.operands[1].physReg(),    // tmp
                           reduce.definitions[1].physReg(), // stmp
                           reduce.operands[2].physReg(),    // vtmp
                           reduce.definitions[2].physReg(), // sitmp
                           reduce.operands[0], reduce.definitions[0]);
         } else if (instr->isBarrier()) {
            Pseudo_barrier_instruction& barrier = instr->barrier();

            /* Anything larger than a workgroup isn't possible. Anything
             * smaller requires no instructions and this pseudo instruction
             * would only be included to control optimizations. */
            bool emit_s_barrier = barrier.exec_scope == scope_workgroup &&
                                  program->workgroup_size > program->wave_size;

            bld.insert(std::move(instr));
            if (emit_s_barrier)
               bld.sopp(aco_opcode::s_barrier);
         } else if (instr->opcode == aco_opcode::p_cvt_f16_f32_rtne) {
            float_mode new_mode = block->fp_mode;
            new_mode.round16_64 = fp_round_ne;
            bool set_round = new_mode.round != block->fp_mode.round;

            emit_set_mode(bld, new_mode, set_round, false);

            instr->opcode = aco_opcode::v_cvt_f16_f32;
            ctx.instructions.emplace_back(std::move(instr));

            emit_set_mode(bld, block->fp_mode, set_round, false);
         } else {
            ctx.instructions.emplace_back(std::move(instr));
         }
      }
      block->instructions.swap(ctx.instructions);
   }
}

} // namespace aco