1/* 2 * Copyright © 2018 Valve Corporation 3 * Copyright © 2018 Google 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 22 * IN THE SOFTWARE. 23 * 24 */ 25 26#include "aco_ir.h" 27 28#include "util/u_math.h" 29 30#include <set> 31#include <vector> 32 33namespace aco { 34RegisterDemand 35get_live_changes(aco_ptr<Instruction>& instr) 36{ 37 RegisterDemand changes; 38 for (const Definition& def : instr->definitions) { 39 if (!def.isTemp() || def.isKill()) 40 continue; 41 changes += def.getTemp(); 42 } 43 44 for (const Operand& op : instr->operands) { 45 if (!op.isTemp() || !op.isFirstKill()) 46 continue; 47 changes -= op.getTemp(); 48 } 49 50 return changes; 51} 52 53RegisterDemand 54get_temp_registers(aco_ptr<Instruction>& instr) 55{ 56 RegisterDemand temp_registers; 57 58 for (Definition def : instr->definitions) { 59 if (!def.isTemp()) 60 continue; 61 if (def.isKill()) 62 temp_registers += def.getTemp(); 63 } 64 65 for (Operand op : instr->operands) { 66 if (op.isTemp() && op.isLateKill() && op.isFirstKill()) 67 temp_registers += op.getTemp(); 68 } 69 70 return temp_registers; 71} 72 73RegisterDemand 74get_demand_before(RegisterDemand demand, aco_ptr<Instruction>& instr, 75 aco_ptr<Instruction>& instr_before) 76{ 77 demand -= get_live_changes(instr); 78 demand -= get_temp_registers(instr); 79 if (instr_before) 80 demand += get_temp_registers(instr_before); 81 return demand; 82} 83 84namespace { 85struct PhiInfo { 86 uint16_t logical_phi_sgpr_ops = 0; 87 uint16_t linear_phi_ops = 0; 88 uint16_t linear_phi_defs = 0; 89}; 90 91void 92process_live_temps_per_block(Program* program, live& lives, Block* block, unsigned& worklist, 93 std::vector<PhiInfo>& phi_info) 94{ 95 std::vector<RegisterDemand>& register_demand = lives.register_demand[block->index]; 96 RegisterDemand new_demand; 97 98 register_demand.resize(block->instructions.size()); 99 RegisterDemand block_register_demand; 100 IDSet live = lives.live_out[block->index]; 101 102 /* initialize register demand */ 103 for (unsigned t : live) 104 new_demand += Temp(t, program->temp_rc[t]); 105 new_demand.sgpr -= phi_info[block->index].logical_phi_sgpr_ops; 106 107 /* traverse the instructions backwards */ 108 int idx; 109 for (idx = block->instructions.size() - 1; idx >= 0; idx--) { 110 Instruction* insn = block->instructions[idx].get(); 111 if (is_phi(insn)) 112 break; 113 114 register_demand[idx] = RegisterDemand(new_demand.vgpr, new_demand.sgpr); 115 116 /* KILL */ 117 for (Definition& definition : insn->definitions) { 118 if (!definition.isTemp()) { 119 continue; 120 } 121 if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc) 122 program->needs_vcc = true; 123 124 const Temp temp = definition.getTemp(); 125 const size_t n = live.erase(temp.id()); 126 127 if (n) { 128 new_demand -= temp; 129 definition.setKill(false); 130 } else { 131 register_demand[idx] += temp; 132 definition.setKill(true); 133 } 134 } 135 136 /* GEN */ 137 if (insn->opcode == aco_opcode::p_logical_end) { 138 new_demand.sgpr += phi_info[block->index].logical_phi_sgpr_ops; 139 } else { 140 /* we need to do this in a separate loop because the next one can 141 * setKill() for several operands at once and we don't want to 142 * overwrite that in a later iteration */ 143 for (Operand& op : insn->operands) 144 op.setKill(false); 145 146 for (unsigned i = 0; i < insn->operands.size(); ++i) { 147 Operand& operand = insn->operands[i]; 148 if (!operand.isTemp()) 149 continue; 150 if (operand.isFixed() && operand.physReg() == vcc) 151 program->needs_vcc = true; 152 const Temp temp = operand.getTemp(); 153 const bool inserted = live.insert(temp.id()).second; 154 if (inserted) { 155 operand.setFirstKill(true); 156 for (unsigned j = i + 1; j < insn->operands.size(); ++j) { 157 if (insn->operands[j].isTemp() && 158 insn->operands[j].tempId() == operand.tempId()) { 159 insn->operands[j].setFirstKill(false); 160 insn->operands[j].setKill(true); 161 } 162 } 163 if (operand.isLateKill()) 164 register_demand[idx] += temp; 165 new_demand += temp; 166 } 167 } 168 } 169 170 block_register_demand.update(register_demand[idx]); 171 } 172 173 /* update block's register demand for a last time */ 174 block_register_demand.update(new_demand); 175 if (program->progress < CompilationProgress::after_ra) 176 block->register_demand = block_register_demand; 177 178 /* handle phi definitions */ 179 uint16_t linear_phi_defs = 0; 180 int phi_idx = idx; 181 while (phi_idx >= 0) { 182 register_demand[phi_idx] = new_demand; 183 Instruction* insn = block->instructions[phi_idx].get(); 184 185 assert(is_phi(insn) && insn->definitions.size() == 1); 186 if (!insn->definitions[0].isTemp()) { 187 assert(insn->definitions[0].isFixed() && insn->definitions[0].physReg() == exec); 188 phi_idx--; 189 continue; 190 } 191 Definition& definition = insn->definitions[0]; 192 if ((definition.isFixed() || definition.hasHint()) && definition.physReg() == vcc) 193 program->needs_vcc = true; 194 const Temp temp = definition.getTemp(); 195 const size_t n = live.erase(temp.id()); 196 197 if (n) 198 definition.setKill(false); 199 else 200 definition.setKill(true); 201 202 if (insn->opcode == aco_opcode::p_linear_phi) { 203 assert(definition.getTemp().type() == RegType::sgpr); 204 linear_phi_defs += definition.size(); 205 } 206 207 phi_idx--; 208 } 209 210 for (unsigned pred_idx : block->linear_preds) 211 phi_info[pred_idx].linear_phi_defs = linear_phi_defs; 212 213 /* now, we need to merge the live-ins into the live-out sets */ 214 for (unsigned t : live) { 215 RegClass rc = program->temp_rc[t]; 216 std::vector<unsigned>& preds = rc.is_linear() ? block->linear_preds : block->logical_preds; 217 218#ifndef NDEBUG 219 if (preds.empty()) 220 aco_err(program, "Temporary never defined or are defined after use: %%%d in BB%d", t, 221 block->index); 222#endif 223 224 for (unsigned pred_idx : preds) { 225 auto it = lives.live_out[pred_idx].insert(t); 226 if (it.second) 227 worklist = std::max(worklist, pred_idx + 1); 228 } 229 } 230 231 /* handle phi operands */ 232 phi_idx = idx; 233 while (phi_idx >= 0) { 234 Instruction* insn = block->instructions[phi_idx].get(); 235 assert(is_phi(insn)); 236 /* directly insert into the predecessors live-out set */ 237 std::vector<unsigned>& preds = 238 insn->opcode == aco_opcode::p_phi ? block->logical_preds : block->linear_preds; 239 for (unsigned i = 0; i < preds.size(); ++i) { 240 Operand& operand = insn->operands[i]; 241 if (!operand.isTemp()) 242 continue; 243 if (operand.isFixed() && operand.physReg() == vcc) 244 program->needs_vcc = true; 245 /* check if we changed an already processed block */ 246 const bool inserted = lives.live_out[preds[i]].insert(operand.tempId()).second; 247 if (inserted) { 248 worklist = std::max(worklist, preds[i] + 1); 249 if (insn->opcode == aco_opcode::p_phi && operand.getTemp().type() == RegType::sgpr) { 250 phi_info[preds[i]].logical_phi_sgpr_ops += operand.size(); 251 } else if (insn->opcode == aco_opcode::p_linear_phi) { 252 assert(operand.getTemp().type() == RegType::sgpr); 253 phi_info[preds[i]].linear_phi_ops += operand.size(); 254 } 255 } 256 257 /* set if the operand is killed by this (or another) phi instruction */ 258 operand.setKill(!live.count(operand.tempId())); 259 } 260 phi_idx--; 261 } 262 263 assert(block->index != 0 || (new_demand == RegisterDemand() && live.empty())); 264} 265 266unsigned 267calc_waves_per_workgroup(Program* program) 268{ 269 /* When workgroup size is not known, just go with wave_size */ 270 unsigned workgroup_size = 271 program->workgroup_size == UINT_MAX ? program->wave_size : program->workgroup_size; 272 273 return align(workgroup_size, program->wave_size) / program->wave_size; 274} 275} /* end namespace */ 276 277uint16_t 278get_extra_sgprs(Program* program) 279{ 280 if (program->chip_class >= GFX10) { 281 assert(!program->needs_flat_scr); 282 assert(!program->dev.xnack_enabled); 283 return 0; 284 } else if (program->chip_class >= GFX8) { 285 if (program->needs_flat_scr) 286 return 6; 287 else if (program->dev.xnack_enabled) 288 return 4; 289 else if (program->needs_vcc) 290 return 2; 291 else 292 return 0; 293 } else { 294 assert(!program->dev.xnack_enabled); 295 if (program->needs_flat_scr) 296 return 4; 297 else if (program->needs_vcc) 298 return 2; 299 else 300 return 0; 301 } 302} 303 304uint16_t 305get_sgpr_alloc(Program* program, uint16_t addressable_sgprs) 306{ 307 uint16_t sgprs = addressable_sgprs + get_extra_sgprs(program); 308 uint16_t granule = program->dev.sgpr_alloc_granule; 309 return ALIGN_NPOT(std::max(sgprs, granule), granule); 310} 311 312uint16_t 313get_vgpr_alloc(Program* program, uint16_t addressable_vgprs) 314{ 315 assert(addressable_vgprs <= program->dev.vgpr_limit); 316 uint16_t granule = program->dev.vgpr_alloc_granule; 317 return align(std::max(addressable_vgprs, granule), granule); 318} 319 320unsigned 321round_down(unsigned a, unsigned b) 322{ 323 return a - (a % b); 324} 325 326uint16_t 327get_addr_sgpr_from_waves(Program* program, uint16_t waves) 328{ 329 /* it's not possible to allocate more than 128 SGPRs */ 330 uint16_t sgprs = std::min(program->dev.physical_sgprs / waves, 128); 331 sgprs = round_down(sgprs, program->dev.sgpr_alloc_granule); 332 sgprs -= get_extra_sgprs(program); 333 return std::min(sgprs, program->dev.sgpr_limit); 334} 335 336uint16_t 337get_addr_vgpr_from_waves(Program* program, uint16_t waves) 338{ 339 uint16_t vgprs = program->dev.physical_vgprs / waves & ~(program->dev.vgpr_alloc_granule - 1); 340 vgprs -= program->config->num_shared_vgprs / 2; 341 return std::min(vgprs, program->dev.vgpr_limit); 342} 343 344void 345calc_min_waves(Program* program) 346{ 347 unsigned waves_per_workgroup = calc_waves_per_workgroup(program); 348 unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); 349 program->min_waves = DIV_ROUND_UP(waves_per_workgroup, simd_per_cu_wgp); 350} 351 352void 353update_vgpr_sgpr_demand(Program* program, const RegisterDemand new_demand) 354{ 355 unsigned max_waves_per_simd = program->dev.max_wave64_per_simd * (64 / program->wave_size); 356 unsigned simd_per_cu_wgp = program->dev.simd_per_cu * (program->wgp_mode ? 2 : 1); 357 unsigned lds_limit = program->wgp_mode ? program->dev.lds_limit * 2 : program->dev.lds_limit; 358 unsigned max_workgroups_per_cu_wgp = program->wgp_mode ? 32 : 16; 359 360 assert(program->min_waves >= 1); 361 uint16_t sgpr_limit = get_addr_sgpr_from_waves(program, program->min_waves); 362 uint16_t vgpr_limit = get_addr_vgpr_from_waves(program, program->min_waves); 363 364 /* this won't compile, register pressure reduction necessary */ 365 if (new_demand.vgpr > vgpr_limit || new_demand.sgpr > sgpr_limit) { 366 program->num_waves = 0; 367 program->max_reg_demand = new_demand; 368 } else { 369 program->num_waves = program->dev.physical_sgprs / get_sgpr_alloc(program, new_demand.sgpr); 370 uint16_t vgpr_demand = 371 get_vgpr_alloc(program, new_demand.vgpr) + program->config->num_shared_vgprs / 2; 372 program->num_waves = 373 std::min<uint16_t>(program->num_waves, program->dev.physical_vgprs / vgpr_demand); 374 program->max_waves = max_waves_per_simd; 375 376 /* adjust max_waves for workgroup and LDS limits */ 377 unsigned waves_per_workgroup = calc_waves_per_workgroup(program); 378 unsigned workgroups_per_cu_wgp = max_waves_per_simd * simd_per_cu_wgp / waves_per_workgroup; 379 380 unsigned lds_per_workgroup = 381 align(program->config->lds_size * program->dev.lds_encoding_granule, 382 program->dev.lds_alloc_granule); 383 384 if (program->stage == fragment_fs) { 385 /* PS inputs are moved from PC (parameter cache) to LDS before PS waves are launched. 386 * Each PS input occupies 3x vec4 of LDS space. See Figure 10.3 in GCN3 ISA manual. 387 * These limit occupancy the same way as other stages' LDS usage does. 388 */ 389 unsigned lds_bytes_per_interp = 3 * 16; 390 unsigned lds_param_bytes = lds_bytes_per_interp * program->info->ps.num_interp; 391 lds_per_workgroup += align(lds_param_bytes, program->dev.lds_alloc_granule); 392 } 393 394 if (lds_per_workgroup) 395 workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, lds_limit / lds_per_workgroup); 396 397 if (waves_per_workgroup > 1) 398 workgroups_per_cu_wgp = std::min(workgroups_per_cu_wgp, max_workgroups_per_cu_wgp); 399 400 /* in cases like waves_per_workgroup=3 or lds=65536 and 401 * waves_per_workgroup=1, we want the maximum possible number of waves per 402 * SIMD and not the minimum. so DIV_ROUND_UP is used */ 403 program->max_waves = std::min<uint16_t>( 404 program->max_waves, 405 DIV_ROUND_UP(workgroups_per_cu_wgp * waves_per_workgroup, simd_per_cu_wgp)); 406 407 /* incorporate max_waves and calculate max_reg_demand */ 408 program->num_waves = std::min<uint16_t>(program->num_waves, program->max_waves); 409 program->max_reg_demand.vgpr = get_addr_vgpr_from_waves(program, program->num_waves); 410 program->max_reg_demand.sgpr = get_addr_sgpr_from_waves(program, program->num_waves); 411 } 412} 413 414live 415live_var_analysis(Program* program) 416{ 417 live result; 418 result.live_out.resize(program->blocks.size()); 419 result.register_demand.resize(program->blocks.size()); 420 unsigned worklist = program->blocks.size(); 421 std::vector<PhiInfo> phi_info(program->blocks.size()); 422 RegisterDemand new_demand; 423 424 program->needs_vcc = false; 425 426 /* this implementation assumes that the block idx corresponds to the block's position in 427 * program->blocks vector */ 428 while (worklist) { 429 unsigned block_idx = --worklist; 430 process_live_temps_per_block(program, result, &program->blocks[block_idx], worklist, 431 phi_info); 432 new_demand.update(program->blocks[block_idx].register_demand); 433 } 434 435 /* Handle branches: we will insert copies created for linear phis just before the branch. */ 436 for (Block& block : program->blocks) { 437 result.register_demand[block.index].back().sgpr += phi_info[block.index].linear_phi_defs; 438 result.register_demand[block.index].back().sgpr -= phi_info[block.index].linear_phi_ops; 439 } 440 441 /* calculate the program's register demand and number of waves */ 442 if (program->progress < CompilationProgress::after_ra) 443 update_vgpr_sgpr_demand(program, new_demand); 444 445 return result; 446} 447 448} // namespace aco 449