/*
 * Copyright © 2020 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_vec4.h"
#include "brw_cfg.h"

using namespace brw;

namespace {
   /**
    * Enumeration representing the various asynchronous units that can run
    * computations in parallel on behalf of a shader thread.
    */
   enum unit {
      /** EU front-end. */
      unit_fe,
      /** EU FPU0 (Note that co-issue to FPU1 is currently not modeled here). */
      unit_fpu,
      /** Extended Math unit (AKA FPU1 on Gfx8-11, part of the EU on Gfx6+). */
      unit_em,
      /** Sampler shared function. */
      unit_sampler,
      /** Pixel Interpolator shared function. */
      unit_pi,
      /** Unified Return Buffer shared function. */
      unit_urb,
      /** Data Port Data Cache shared function. */
      unit_dp_dc,
      /** Data Port Render Cache shared function. */
      unit_dp_rc,
      /** Data Port Constant Cache shared function. */
      unit_dp_cc,
      /** Message Gateway shared function. */
      unit_gateway,
      /** Thread Spawner shared function. */
      unit_spawner,
      /* unit_vme, */
      /* unit_cre, */
      /** Number of asynchronous units currently tracked. */
      num_units,
      /** Dummy unit for instructions that don't consume runtime from the above. */
      unit_null = num_units
   };

   /**
    * Enumeration representing a computation result another computation can
    * potentially depend on.
    *
    * Each enumerator below is the base index of a contiguous range of IDs,
    * one per physical register (or token) of that kind; the range sizes are
    * the added constants visible in the next enumerator's initializer.
    */
   enum dependency_id {
      /* Register part of the GRF. */
      dependency_id_grf0 = 0,
      /* Register part of the MRF.  Only used on Gfx4-6. */
      dependency_id_mrf0 = dependency_id_grf0 + BRW_MAX_GRF,
      /* Address register part of the ARF. */
      dependency_id_addr0 = dependency_id_mrf0 + 24,
      /* Accumulator register part of the ARF. */
      dependency_id_accum0 = dependency_id_addr0 + 1,
      /* Flag register part of the ARF. */
      dependency_id_flag0 = dependency_id_accum0 + 12,
      /* SBID token write completion.  Only used on Gfx12+. */
      dependency_id_sbid_wr0 = dependency_id_flag0 + 8,
      /* SBID token read completion.  Only used on Gfx12+. */
      dependency_id_sbid_rd0 = dependency_id_sbid_wr0 + 16,
      /* Number of computation dependencies currently tracked. */
      num_dependency_ids = dependency_id_sbid_rd0 + 16
   };

   /**
    * State of our modeling of the program execution.
    */
   struct state {
      /* Arrays start zero-initialized; the utilization weight starts at 1. */
      state() : unit_ready(), dep_ready(), unit_busy(), weight(1.0) {}

      /**
       * Time at which a given unit will be ready to execute the next
       * computation, in clock units.
       */
      unsigned unit_ready[num_units];
      /**
       * Time at which an instruction dependent on a given dependency ID will
       * be ready to execute, in clock units.
       */
      unsigned dep_ready[num_dependency_ids];
      /**
       * Aggregated utilization of a given unit excluding idle cycles, in
       * clock units.
       */
      float unit_busy[num_units];
      /**
       * Factor of the overhead of a computation accounted for in the
       * aggregated utilization calculation.
       */
      float weight;
   };

   /**
    * Information derived from an IR instruction used to compute performance
    * estimates.  Allows the timing calculation to work on both FS and VEC4
    * instructions.
    */
   struct instruction_info {
      /** Build timing info from a scalar (FS) back-end instruction. */
      instruction_info(const intel_device_info *devinfo, const fs_inst *inst) :
         devinfo(devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0),
         sc(has_bank_conflict(devinfo, inst) ? sd : 0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* We typically want the maximum source size, except for split send
          * messages which require the total size.
          */
         if (inst->opcode == SHADER_OPCODE_SEND) {
            /* Sources 2 and 3 are the two payload halves of a split send. */
            ss = DIV_ROUND_UP(inst->size_read(2), REG_SIZE) +
                 DIV_ROUND_UP(inst->size_read(3), REG_SIZE);
         } else {
            for (unsigned i = 0; i < inst->sources; i++)
               ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
         }

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision (an 8-byte integer execution type),
          * preserving the signedness of the original type.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** Build timing info from a VEC4 back-end instruction. */
      instruction_info(const intel_device_info *devinfo,
                       const vec4_instruction *inst) :
         devinfo(devinfo), op(inst->opcode),
         td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
         tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
         desc(inst->desc), sfid(inst->sfid)
      {
         /* Compute the maximum source size. */
         for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
            ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

         /* Convert the execution size to GRF units. */
         sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);

         /* 32x32 integer multiplication has half the usual ALU throughput.
          * Treat it as double-precision (an 8-byte integer execution type),
          * preserving the signedness of the original type.
          */
         if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
             !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
             type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
            tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
      }

      /** Device information. */
      const struct intel_device_info *devinfo;
      /** Instruction opcode. */
      opcode op;
      /** Destination type. */
      brw_reg_type td;
      /** Destination size in GRF units. */
      unsigned sd;
      /** Execution type. */
      brw_reg_type tx;
      /** Execution size in GRF units. */
      unsigned sx;
      /** Source size in GRF units (max per-source, or total for split sends). */
      unsigned ss;
      /** Bank conflict penalty size in GRF units (equal to sd if non-zero). */
      unsigned sc;
      /** Send message descriptor. */
      uint32_t desc;
      /** Send message shared function ID. */
      uint8_t sfid;
   };

   /**
    * Timing information of an instruction used to estimate the performance of
    * the program.
    */
   struct perf_desc {
      perf_desc(unit u, int df, int db, int ls, int ld, int la, int lf) :
         u(u), df(df), db(db), ls(ls), ld(ld), la(la), lf(lf) {}

      /**
       * Back-end unit its runtime shall be accounted to, in addition to the
       * EU front-end which is always assumed to be involved.
       */
      unit u;
      /**
       * Overhead cycles from the time that the EU front-end starts executing
       * the instruction until it's ready to execute the next instruction.
       */
      int df;
      /**
       * Overhead cycles from the time that the back-end starts executing the
       * instruction until it's ready to execute the next instruction.
       */
      int db;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its sources have been read from the register file.
       */
      int ls;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its regular destination has been written to the
       * register file.
       */
      int ld;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its accumulator destination has been written to the
       * ARF file.
       *
       * Note that this is an approximation of the real behavior of
       * accumulating instructions in the hardware: Instead of modeling a pair
       * of back-to-back accumulating instructions as a first computation with
       * latency equal to ld followed by another computation with a
       * mid-pipeline stall (e.g. after the "M" part of a MAC instruction), we
       * model the stall as if it occurred at the top of the pipeline, with
       * the latency of the accumulator computation offset accordingly.
       */
      int la;
      /**
       * Latency cycles from the time that the back-end starts executing the
       * instruction until its flag destination has been written to the ARF
       * file.
       */
      int lf;
   };

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of parameters specifying a linear
    * approximation: Parameter X_Y specifies the derivative of timing X
    * relative to info field Y, while X_1 specifies the independent term of
    * the approximation of timing X.
    *
    * Note that the same l_ss slope contributes to both the source-read (ls)
    * and destination-write (ld) latencies, since the destination cannot be
    * written before the sources have been read.
    */
   perf_desc
   calculate_desc(const instruction_info &info, unit u,
                  int df_1, int df_sd, int df_sc,
                  int db_1, int db_sx,
                  int ls_1, int ld_1, int la_1, int lf_1,
                  int l_ss, int l_sd)
   {
      return perf_desc(u, df_1 + df_sd * int(info.sd) + df_sc * int(info.sc),
                       db_1 + db_sx * int(info.sx),
                       ls_1 + l_ss * int(info.ss),
                       ld_1 + l_ss * int(info.ss) + l_sd * int(info.sd),
                       la_1, lf_1);
   }

   /**
    * Compute the timing information of an instruction based on any relevant
    * information from the IR and a number of linear approximation parameters
    * hard-coded for each IR instruction.
    *
    * Most timing parameters are obtained from the multivariate linear
    * regression of a sample of empirical timings measured using the tm0
    * register (as can be done today by using the shader_time debugging
    * option).  The Gfx4-5 math timings are obtained from BSpec Volume 5c.3
    * "Shared Functions - Extended Math", Section 3.2 "Performance".
    * Parameters marked XXX shall be considered low-quality, they're possibly
    * high variance or completely guessed in cases where experimental data was
    * unavailable.
288 */ 289 const perf_desc 290 instruction_desc(const instruction_info &info) 291 { 292 const struct intel_device_info *devinfo = info.devinfo; 293 294 switch (info.op) { 295 case BRW_OPCODE_SYNC: 296 case BRW_OPCODE_SEL: 297 case BRW_OPCODE_NOT: 298 case BRW_OPCODE_AND: 299 case BRW_OPCODE_OR: 300 case BRW_OPCODE_XOR: 301 case BRW_OPCODE_SHR: 302 case BRW_OPCODE_SHL: 303 case BRW_OPCODE_DIM: 304 case BRW_OPCODE_ASR: 305 case BRW_OPCODE_CMPN: 306 case BRW_OPCODE_F16TO32: 307 case BRW_OPCODE_BFREV: 308 case BRW_OPCODE_BFI1: 309 case BRW_OPCODE_AVG: 310 case BRW_OPCODE_FRC: 311 case BRW_OPCODE_RNDU: 312 case BRW_OPCODE_RNDD: 313 case BRW_OPCODE_RNDE: 314 case BRW_OPCODE_RNDZ: 315 case BRW_OPCODE_MAC: 316 case BRW_OPCODE_MACH: 317 case BRW_OPCODE_LZD: 318 case BRW_OPCODE_FBH: 319 case BRW_OPCODE_FBL: 320 case BRW_OPCODE_CBIT: 321 case BRW_OPCODE_ADDC: 322 case BRW_OPCODE_ROR: 323 case BRW_OPCODE_ROL: 324 case BRW_OPCODE_SUBB: 325 case BRW_OPCODE_SAD2: 326 case BRW_OPCODE_SADA2: 327 case BRW_OPCODE_LINE: 328 case BRW_OPCODE_NOP: 329 case SHADER_OPCODE_CLUSTER_BROADCAST: 330 case SHADER_OPCODE_SCRATCH_HEADER: 331 case FS_OPCODE_DDX_COARSE: 332 case FS_OPCODE_DDX_FINE: 333 case FS_OPCODE_DDY_COARSE: 334 case FS_OPCODE_PIXEL_X: 335 case FS_OPCODE_PIXEL_Y: 336 case FS_OPCODE_SET_SAMPLE_ID: 337 case VEC4_OPCODE_MOV_BYTES: 338 case VEC4_OPCODE_UNPACK_UNIFORM: 339 case VEC4_OPCODE_DOUBLE_TO_F32: 340 case VEC4_OPCODE_DOUBLE_TO_D32: 341 case VEC4_OPCODE_DOUBLE_TO_U32: 342 case VEC4_OPCODE_TO_DOUBLE: 343 case VEC4_OPCODE_PICK_LOW_32BIT: 344 case VEC4_OPCODE_PICK_HIGH_32BIT: 345 case VEC4_OPCODE_SET_LOW_32BIT: 346 case VEC4_OPCODE_SET_HIGH_32BIT: 347 case VEC4_OPCODE_ZERO_OOB_PUSH_REGS: 348 case GS_OPCODE_SET_DWORD_2: 349 case GS_OPCODE_SET_WRITE_OFFSET: 350 case GS_OPCODE_SET_VERTEX_COUNT: 351 case GS_OPCODE_PREPARE_CHANNEL_MASKS: 352 case GS_OPCODE_SET_CHANNEL_MASKS: 353 case GS_OPCODE_GET_INSTANCE_ID: 354 case GS_OPCODE_SET_PRIMITIVE_ID: 355 case GS_OPCODE_SVB_SET_DST_INDEX: 
356 case TCS_OPCODE_SRC0_010_IS_ZERO: 357 case TCS_OPCODE_GET_PRIMITIVE_ID: 358 case TES_OPCODE_GET_PRIMITIVE_ID: 359 case SHADER_OPCODE_GET_DSS_ID: 360 if (devinfo->ver >= 11) { 361 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 362 0, 10, 6 /* XXX */, 14, 0, 0); 363 } else if (devinfo->ver >= 8) { 364 if (type_sz(info.tx) > 4) 365 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 366 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 367 else 368 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 369 0, 8, 4, 12, 0, 0); 370 } else if (devinfo->is_haswell) { 371 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 372 0, 10, 6 /* XXX */, 16, 0, 0); 373 } else { 374 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 375 0, 12, 8 /* XXX */, 18, 0, 0); 376 } 377 378 case BRW_OPCODE_MOV: 379 case BRW_OPCODE_CMP: 380 case BRW_OPCODE_ADD: 381 case BRW_OPCODE_ADD3: 382 case BRW_OPCODE_MUL: 383 case SHADER_OPCODE_MOV_RELOC_IMM: 384 case VEC4_OPCODE_MOV_FOR_SCRATCH: 385 if (devinfo->ver >= 11) { 386 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 387 0, 10, 6, 14, 0, 0); 388 } else if (devinfo->ver >= 8) { 389 if (type_sz(info.tx) > 4) 390 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 391 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 392 else 393 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 394 0, 8, 4, 12, 0, 0); 395 } else if (devinfo->is_haswell) { 396 if (info.tx == BRW_REGISTER_TYPE_F) 397 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 398 0, 12, 8 /* XXX */, 18, 0, 0); 399 else 400 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 401 0, 10, 6 /* XXX */, 16, 0, 0); 402 } else if (devinfo->ver >= 7) { 403 if (info.tx == BRW_REGISTER_TYPE_F) 404 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 405 0, 14, 10 /* XXX */, 20, 0, 0); 406 else 407 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 408 0, 12, 8 /* XXX */, 18, 0, 0); 409 } else { 410 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0, 411 0, 2 /* XXX */, 412 0, 12 /* XXX 
*/, 8 /* XXX */, 18 /* XXX */, 413 0, 0); 414 } 415 416 case BRW_OPCODE_BFE: 417 case BRW_OPCODE_BFI2: 418 case BRW_OPCODE_CSEL: 419 if (devinfo->ver >= 11) 420 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 421 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 422 else if (devinfo->ver >= 8) 423 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 424 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 425 else if (devinfo->is_haswell) 426 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 427 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 428 else if (devinfo->ver >= 7) 429 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 430 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 431 else 432 abort(); 433 434 case BRW_OPCODE_MAD: 435 if (devinfo->ver >= 11) { 436 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 437 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 438 } else if (devinfo->ver >= 8) { 439 if (type_sz(info.tx) > 4) 440 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4, 441 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 442 else 443 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 444 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 445 } else if (devinfo->is_haswell) { 446 if (info.tx == BRW_REGISTER_TYPE_F) 447 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 448 0, 12, 8 /* XXX */, 18, 0, 0); 449 else 450 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 451 0, 10, 6 /* XXX */, 16, 0, 0); 452 } else if (devinfo->ver >= 7) { 453 if (info.tx == BRW_REGISTER_TYPE_F) 454 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 455 0, 14, 10 /* XXX */, 20, 0, 0); 456 else 457 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 458 0, 12, 8 /* XXX */, 18, 0, 0); 459 } else if (devinfo->ver >= 6) { 460 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 1 /* XXX */, 461 0, 2 /* XXX */, 462 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 463 0, 0); 464 } else { 465 abort(); 466 } 467 468 case BRW_OPCODE_F32TO16: 469 if (devinfo->ver >= 11) 470 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 
471 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 472 else if (devinfo->ver >= 8) 473 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 474 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 475 else if (devinfo->is_haswell) 476 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 477 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 478 else if (devinfo->ver >= 7) 479 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 480 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 481 else 482 abort(); 483 484 case BRW_OPCODE_DP4: 485 case BRW_OPCODE_DPH: 486 case BRW_OPCODE_DP3: 487 case BRW_OPCODE_DP2: 488 if (devinfo->ver >= 8) 489 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 490 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 491 else if (devinfo->is_haswell) 492 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 493 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 494 else 495 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 496 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 497 498 case BRW_OPCODE_DP4A: 499 if (devinfo->ver >= 12) 500 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 501 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 502 else 503 abort(); 504 505 case SHADER_OPCODE_RCP: 506 case SHADER_OPCODE_RSQ: 507 case SHADER_OPCODE_SQRT: 508 case SHADER_OPCODE_EXP2: 509 case SHADER_OPCODE_LOG2: 510 case SHADER_OPCODE_SIN: 511 case SHADER_OPCODE_COS: 512 case SHADER_OPCODE_POW: 513 case SHADER_OPCODE_INT_QUOTIENT: 514 case SHADER_OPCODE_INT_REMAINDER: 515 if (devinfo->ver >= 6) { 516 switch (info.op) { 517 case SHADER_OPCODE_RCP: 518 case SHADER_OPCODE_RSQ: 519 case SHADER_OPCODE_SQRT: 520 case SHADER_OPCODE_EXP2: 521 case SHADER_OPCODE_LOG2: 522 case SHADER_OPCODE_SIN: 523 case SHADER_OPCODE_COS: 524 if (devinfo->ver >= 8) 525 return calculate_desc(info, unit_em, -2, 4, 0, 0, 4, 526 0, 16, 0, 0, 0, 0); 527 else if (devinfo->is_haswell) 528 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2, 529 0, 12, 0, 0, 0, 0); 530 else 531 return calculate_desc(info, unit_em, 0, 2, 0, 0, 2, 532 0, 14, 0, 0, 0, 0); 
533 534 case SHADER_OPCODE_POW: 535 if (devinfo->ver >= 8) 536 return calculate_desc(info, unit_em, -2, 4, 0, 0, 8, 537 0, 24, 0, 0, 0, 0); 538 else if (devinfo->is_haswell) 539 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4, 540 0, 20, 0, 0, 0, 0); 541 else 542 return calculate_desc(info, unit_em, 0, 2, 0, 0, 4, 543 0, 22, 0, 0, 0, 0); 544 545 case SHADER_OPCODE_INT_QUOTIENT: 546 case SHADER_OPCODE_INT_REMAINDER: 547 return calculate_desc(info, unit_em, 2, 0, 0, 26, 0, 548 0, 28 /* XXX */, 0, 0, 0, 0); 549 550 default: 551 abort(); 552 } 553 } else { 554 switch (info.op) { 555 case SHADER_OPCODE_RCP: 556 return calculate_desc(info, unit_em, 2, 0, 0, 0, 8, 557 0, 22, 0, 0, 0, 8); 558 559 case SHADER_OPCODE_RSQ: 560 return calculate_desc(info, unit_em, 2, 0, 0, 0, 16, 561 0, 44, 0, 0, 0, 8); 562 563 case SHADER_OPCODE_INT_QUOTIENT: 564 case SHADER_OPCODE_SQRT: 565 case SHADER_OPCODE_LOG2: 566 return calculate_desc(info, unit_em, 2, 0, 0, 0, 24, 567 0, 66, 0, 0, 0, 8); 568 569 case SHADER_OPCODE_INT_REMAINDER: 570 case SHADER_OPCODE_EXP2: 571 return calculate_desc(info, unit_em, 2, 0, 0, 0, 32, 572 0, 88, 0, 0, 0, 8); 573 574 case SHADER_OPCODE_SIN: 575 case SHADER_OPCODE_COS: 576 return calculate_desc(info, unit_em, 2, 0, 0, 0, 48, 577 0, 132, 0, 0, 0, 8); 578 579 case SHADER_OPCODE_POW: 580 return calculate_desc(info, unit_em, 2, 0, 0, 0, 64, 581 0, 176, 0, 0, 0, 8); 582 583 default: 584 abort(); 585 } 586 } 587 588 case BRW_OPCODE_DO: 589 if (devinfo->ver >= 6) 590 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0, 591 0, 0, 0, 0, 0, 0); 592 else 593 return calculate_desc(info, unit_null, 2 /* XXX */, 0, 0, 0, 0, 594 0, 0, 0, 0, 0, 0); 595 596 case BRW_OPCODE_IF: 597 case BRW_OPCODE_ELSE: 598 case BRW_OPCODE_ENDIF: 599 case BRW_OPCODE_WHILE: 600 case BRW_OPCODE_BREAK: 601 case BRW_OPCODE_CONTINUE: 602 case BRW_OPCODE_HALT: 603 if (devinfo->ver >= 8) 604 return calculate_desc(info, unit_null, 8, 0, 0, 0, 0, 605 0, 0, 0, 0, 0, 0); 606 else if 
(devinfo->is_haswell) 607 return calculate_desc(info, unit_null, 6, 0, 0, 0, 0, 608 0, 0, 0, 0, 0, 0); 609 else 610 return calculate_desc(info, unit_null, 2, 0, 0, 0, 0, 611 0, 0, 0, 0, 0, 0); 612 613 case FS_OPCODE_LINTERP: 614 if (devinfo->ver >= 8) 615 return calculate_desc(info, unit_fpu, 0, 4, 0, 0, 4, 616 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 617 else if (devinfo->is_haswell) 618 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 619 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 620 else 621 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 622 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 623 624 case BRW_OPCODE_LRP: 625 if (devinfo->ver >= 8) 626 return calculate_desc(info, unit_fpu, 0, 4, 1, 0, 4, 627 0, 12, 8 /* XXX */, 16 /* XXX */, 0, 0); 628 else if (devinfo->is_haswell) 629 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 630 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 631 else if (devinfo->ver >= 6) 632 return calculate_desc(info, unit_fpu, 0, 2, 1, 0, 2, 633 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 634 else 635 abort(); 636 637 case FS_OPCODE_PACK_HALF_2x16_SPLIT: 638 if (devinfo->ver >= 11) 639 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6, 640 0, 10 /* XXX */, 6 /* XXX */, 641 14 /* XXX */, 0, 0); 642 else if (devinfo->ver >= 8) 643 return calculate_desc(info, unit_fpu, 16, 6, 0, 0, 6, 644 0, 8 /* XXX */, 4 /* XXX */, 645 12 /* XXX */, 0, 0); 646 else if (devinfo->is_haswell) 647 return calculate_desc(info, unit_fpu, 20, 6, 0, 0, 6, 648 0, 10 /* XXX */, 6 /* XXX */, 649 16 /* XXX */, 0, 0); 650 else if (devinfo->ver >= 7) 651 return calculate_desc(info, unit_fpu, 24, 6, 0, 0, 6, 652 0, 12 /* XXX */, 8 /* XXX */, 653 18 /* XXX */, 0, 0); 654 else 655 abort(); 656 657 case SHADER_OPCODE_MOV_INDIRECT: 658 if (devinfo->ver >= 11) 659 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, 660 0, 10 /* XXX */, 6 /* XXX */, 661 14 /* XXX */, 0, 0); 662 else if (devinfo->ver >= 8) 663 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, 664 0, 8 
/* XXX */, 4 /* XXX */, 665 12 /* XXX */, 0, 0); 666 else if (devinfo->is_haswell) 667 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, 668 0, 10 /* XXX */, 6 /* XXX */, 669 16 /* XXX */, 0, 0); 670 else 671 return calculate_desc(info, unit_fpu, 34, 0, 0, 34, 0, 672 0, 12 /* XXX */, 8 /* XXX */, 673 18 /* XXX */, 0, 0); 674 675 case SHADER_OPCODE_BROADCAST: 676 if (devinfo->ver >= 11) 677 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 4, 0, 678 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 679 else if (devinfo->ver >= 8) 680 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0, 681 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 682 else if (devinfo->is_haswell) 683 return calculate_desc(info, unit_fpu, 18, 0, 0, 4, 0, 684 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 685 else if (devinfo->ver >= 7) 686 return calculate_desc(info, unit_fpu, 20, 0, 0, 4, 0, 687 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 688 else 689 abort(); 690 691 case SHADER_OPCODE_FIND_LIVE_CHANNEL: 692 if (devinfo->ver >= 11) 693 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0, 694 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 695 else if (devinfo->ver >= 8) 696 return calculate_desc(info, unit_fpu, 2, 0, 0, 2, 0, 697 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 698 else if (devinfo->is_haswell) 699 return calculate_desc(info, unit_fpu, 36, 0, 0, 6, 0, 700 0, 10, 6 /* XXX */, 16 /* XXX */, 0, 0); 701 else if (devinfo->ver >= 7) 702 return calculate_desc(info, unit_fpu, 40, 0, 0, 6, 0, 703 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 704 else 705 abort(); 706 707 case SHADER_OPCODE_RND_MODE: 708 case SHADER_OPCODE_FLOAT_CONTROL_MODE: 709 if (devinfo->ver >= 11) 710 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0, 711 4 /* XXX */, 0, 712 0, 0, 0, 0, 0, 0); 713 else if (devinfo->ver >= 8) 714 return calculate_desc(info, unit_fpu, 20 /* XXX */, 0, 0, 715 4 /* XXX */, 0, 716 0, 0, 0, 0, 0, 0); 717 else if (devinfo->is_haswell) 718 return calculate_desc(info, unit_fpu, 24 /* XXX */, 0, 0, 719 4 /* XXX 
*/, 0, 720 0, 0, 0, 0, 0, 0); 721 else if (devinfo->ver >= 6) 722 return calculate_desc(info, unit_fpu, 28 /* XXX */, 0, 0, 723 4 /* XXX */, 0, 724 0, 0, 0, 0, 0, 0); 725 else 726 abort(); 727 728 case SHADER_OPCODE_SHUFFLE: 729 if (devinfo->ver >= 11) 730 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0, 731 44 /* XXX */, 0, 732 0, 10 /* XXX */, 6 /* XXX */, 733 14 /* XXX */, 0, 0); 734 else if (devinfo->ver >= 8) 735 return calculate_desc(info, unit_fpu, 42 /* XXX */, 0, 0, 736 42 /* XXX */, 0, 737 0, 8 /* XXX */, 4 /* XXX */, 738 12 /* XXX */, 0, 0); 739 else if (devinfo->is_haswell) 740 return calculate_desc(info, unit_fpu, 0, 44 /* XXX */, 0, 741 0, 44 /* XXX */, 742 0, 10 /* XXX */, 6 /* XXX */, 743 16 /* XXX */, 0, 0); 744 else if (devinfo->ver >= 6) 745 return calculate_desc(info, unit_fpu, 0, 46 /* XXX */, 0, 746 0, 46 /* XXX */, 747 0, 12 /* XXX */, 8 /* XXX */, 748 18 /* XXX */, 0, 0); 749 else 750 abort(); 751 752 case SHADER_OPCODE_SEL_EXEC: 753 if (devinfo->ver >= 11) 754 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0, 755 0, 4 /* XXX */, 756 0, 10 /* XXX */, 6 /* XXX */, 757 14 /* XXX */, 0, 0); 758 else if (devinfo->ver >= 8) 759 return calculate_desc(info, unit_fpu, 8 /* XXX */, 4 /* XXX */, 0, 760 0, 4 /* XXX */, 761 0, 8 /* XXX */, 4 /* XXX */, 762 12 /* XXX */, 0, 0); 763 else if (devinfo->is_haswell) 764 return calculate_desc(info, unit_fpu, 10 /* XXX */, 4 /* XXX */, 0, 765 0, 4 /* XXX */, 766 0, 10 /* XXX */, 6 /* XXX */, 767 16 /* XXX */, 0, 0); 768 else 769 return calculate_desc(info, unit_fpu, 12 /* XXX */, 4 /* XXX */, 0, 770 0, 4 /* XXX */, 771 0, 12 /* XXX */, 8 /* XXX */, 772 18 /* XXX */, 0, 0); 773 774 case SHADER_OPCODE_QUAD_SWIZZLE: 775 if (devinfo->ver >= 11) 776 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, 777 0, 8 /* XXX */, 778 0, 10 /* XXX */, 6 /* XXX */, 779 14 /* XXX */, 0, 0); 780 else if (devinfo->ver >= 8) 781 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX 
*/, 0, 782 0, 8 /* XXX */, 783 0, 8 /* XXX */, 4 /* XXX */, 784 12 /* XXX */, 0, 0); 785 else if (devinfo->is_haswell) 786 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, 787 0, 8 /* XXX */, 788 0, 10 /* XXX */, 6 /* XXX */, 789 16 /* XXX */, 0, 0); 790 else 791 return calculate_desc(info, unit_fpu, 0 /* XXX */, 8 /* XXX */, 0, 792 0, 8 /* XXX */, 793 0, 12 /* XXX */, 8 /* XXX */, 794 18 /* XXX */, 0, 0); 795 796 case FS_OPCODE_DDY_FINE: 797 if (devinfo->ver >= 11) 798 return calculate_desc(info, unit_fpu, 0, 14, 0, 0, 4, 799 0, 10, 6 /* XXX */, 14 /* XXX */, 0, 0); 800 else if (devinfo->ver >= 8) 801 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 802 0, 8, 4 /* XXX */, 12 /* XXX */, 0, 0); 803 else if (devinfo->is_haswell) 804 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 805 0, 12, 8 /* XXX */, 18 /* XXX */, 0, 0); 806 else 807 return calculate_desc(info, unit_fpu, 0, 2, 0, 0, 2, 808 0, 14, 10 /* XXX */, 20 /* XXX */, 0, 0); 809 810 case FS_OPCODE_LOAD_LIVE_CHANNELS: 811 if (devinfo->ver >= 11) 812 return calculate_desc(info, unit_fpu, 2 /* XXX */, 0, 0, 813 2 /* XXX */, 0, 814 0, 0, 0, 10 /* XXX */, 0, 0); 815 else if (devinfo->ver >= 8) 816 return calculate_desc(info, unit_fpu, 0, 2 /* XXX */, 0, 817 0, 2 /* XXX */, 818 0, 0, 0, 8 /* XXX */, 0, 0); 819 else 820 abort(); 821 822 case VEC4_OPCODE_PACK_BYTES: 823 if (devinfo->ver >= 8) 824 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, 825 4 /* XXX */, 0, 826 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 827 0, 0); 828 else if (devinfo->is_haswell) 829 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, 830 4 /* XXX */, 0, 831 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 832 0, 0); 833 else 834 return calculate_desc(info, unit_fpu, 4 /* XXX */, 0, 0, 835 4 /* XXX */, 0, 836 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 837 0, 0); 838 839 case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: 840 case TCS_OPCODE_GET_INSTANCE_ID: 841 case TCS_OPCODE_SET_INPUT_URB_OFFSETS: 842 case 
TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: 843 case TES_OPCODE_CREATE_INPUT_READ_HEADER: 844 if (devinfo->ver >= 8) 845 return calculate_desc(info, unit_fpu, 22 /* XXX */, 0, 0, 846 6 /* XXX */, 0, 847 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 848 0, 0); 849 else if (devinfo->is_haswell) 850 return calculate_desc(info, unit_fpu, 26 /* XXX */, 0, 0, 851 6 /* XXX */, 0, 852 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 853 0, 0); 854 else 855 return calculate_desc(info, unit_fpu, 30 /* XXX */, 0, 0, 856 6 /* XXX */, 0, 857 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 858 0, 0); 859 860 case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: 861 case TCS_OPCODE_CREATE_BARRIER_HEADER: 862 if (devinfo->ver >= 8) 863 return calculate_desc(info, unit_fpu, 32 /* XXX */, 0, 0, 864 8 /* XXX */, 0, 865 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 866 0, 0); 867 else if (devinfo->is_haswell) 868 return calculate_desc(info, unit_fpu, 38 /* XXX */, 0, 0, 869 8 /* XXX */, 0, 870 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 871 0, 0); 872 else if (devinfo->ver >= 6) 873 return calculate_desc(info, unit_fpu, 44 /* XXX */, 0, 0, 874 8 /* XXX */, 0, 875 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 876 0, 0); 877 else 878 abort(); 879 880 case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: 881 if (devinfo->ver >= 8) 882 return calculate_desc(info, unit_fpu, 12 /* XXX */, 0, 0, 883 4 /* XXX */, 0, 884 0, 8 /* XXX */, 4 /* XXX */, 12 /* XXX */, 885 0, 0); 886 else if (devinfo->is_haswell) 887 return calculate_desc(info, unit_fpu, 14 /* XXX */, 0, 0, 888 4 /* XXX */, 0, 889 0, 10 /* XXX */, 6 /* XXX */, 16 /* XXX */, 890 0, 0); 891 else if (devinfo->ver >= 7) 892 return calculate_desc(info, unit_fpu, 16 /* XXX */, 0, 0, 893 4 /* XXX */, 0, 894 0, 12 /* XXX */, 8 /* XXX */, 18 /* XXX */, 895 0, 0); 896 else 897 abort(); 898 899 case SHADER_OPCODE_TEX: 900 case FS_OPCODE_TXB: 901 case SHADER_OPCODE_TXD: 902 case SHADER_OPCODE_TXF: 903 case SHADER_OPCODE_TXF_LZ: 904 case SHADER_OPCODE_TXL: 905 case SHADER_OPCODE_TXL_LZ: 906 case 
SHADER_OPCODE_TXF_CMS: 907 case SHADER_OPCODE_TXF_CMS_W: 908 case SHADER_OPCODE_TXF_UMS: 909 case SHADER_OPCODE_TXF_MCS: 910 case SHADER_OPCODE_TXS: 911 case SHADER_OPCODE_LOD: 912 case SHADER_OPCODE_GET_BUFFER_SIZE: 913 case SHADER_OPCODE_TG4: 914 case SHADER_OPCODE_TG4_OFFSET: 915 case SHADER_OPCODE_SAMPLEINFO: 916 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4: 917 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16 /* XXX */, 918 8 /* XXX */, 750 /* XXX */, 0, 0, 919 2 /* XXX */, 0); 920 921 case SHADER_OPCODE_URB_READ_SIMD8: 922 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT: 923 case SHADER_OPCODE_URB_WRITE_SIMD8: 924 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT: 925 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED: 926 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT: 927 case VEC4_OPCODE_URB_READ: 928 case VS_OPCODE_URB_WRITE: 929 case GS_OPCODE_URB_WRITE: 930 case GS_OPCODE_URB_WRITE_ALLOCATE: 931 case GS_OPCODE_THREAD_END: 932 case GS_OPCODE_FF_SYNC: 933 case TCS_OPCODE_URB_WRITE: 934 case TCS_OPCODE_RELEASE_INPUT: 935 case TCS_OPCODE_THREAD_END: 936 return calculate_desc(info, unit_urb, 2, 0, 0, 0, 6 /* XXX */, 937 32 /* XXX */, 200 /* XXX */, 0, 0, 0, 0); 938 939 case SHADER_OPCODE_MEMORY_FENCE: 940 case SHADER_OPCODE_INTERLOCK: 941 switch (info.sfid) { 942 case GFX6_SFID_DATAPORT_RENDER_CACHE: 943 if (devinfo->ver >= 7) 944 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 30 /* XXX */, 0, 945 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 946 else 947 abort(); 948 949 case BRW_SFID_URB: 950 case GFX7_SFID_DATAPORT_DATA_CACHE: 951 case GFX12_SFID_SLM: 952 case GFX12_SFID_TGM: 953 case GFX12_SFID_UGM: 954 case HSW_SFID_DATAPORT_DATA_CACHE_1: 955 if (devinfo->ver >= 7) 956 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 30 /* XXX */, 0, 957 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 958 else 959 abort(); 960 961 default: 962 abort(); 963 } 964 965 case SHADER_OPCODE_GFX4_SCRATCH_READ: 966 case SHADER_OPCODE_GFX4_SCRATCH_WRITE: 967 case 
SHADER_OPCODE_GFX7_SCRATCH_READ: 968 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 0, 8 /* XXX */, 969 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 970 971 case VEC4_OPCODE_UNTYPED_ATOMIC: 972 if (devinfo->ver >= 7) 973 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 974 30 /* XXX */, 400 /* XXX */, 975 10 /* XXX */, 100 /* XXX */, 0, 0, 976 0, 400 /* XXX */); 977 else 978 abort(); 979 980 case VEC4_OPCODE_UNTYPED_SURFACE_READ: 981 case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: 982 if (devinfo->ver >= 7) 983 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 984 0, 20 /* XXX */, 985 10 /* XXX */, 100 /* XXX */, 0, 0, 986 0, 0); 987 else 988 abort(); 989 990 case FS_OPCODE_FB_WRITE: 991 case FS_OPCODE_FB_READ: 992 case FS_OPCODE_REP_FB_WRITE: 993 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 0, 450 /* XXX */, 994 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 995 996 case GS_OPCODE_SVB_WRITE: 997 if (devinfo->ver >= 6) 998 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0, 999 0, 450 /* XXX */, 1000 10 /* XXX */, 300 /* XXX */, 0, 0, 1001 0, 0); 1002 else 1003 abort(); 1004 1005 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: 1006 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GFX7: 1007 return calculate_desc(info, unit_dp_cc, 2, 0, 0, 0, 16 /* XXX */, 1008 10 /* XXX */, 100 /* XXX */, 0, 0, 0, 0); 1009 1010 case VS_OPCODE_PULL_CONSTANT_LOAD: 1011 case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: 1012 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16, 1013 8, 750, 0, 0, 2, 0); 1014 1015 case FS_OPCODE_INTERPOLATE_AT_SAMPLE: 1016 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: 1017 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: 1018 if (devinfo->ver >= 7) 1019 return calculate_desc(info, unit_pi, 2, 0, 0, 14 /* XXX */, 0, 1020 0, 90 /* XXX */, 0, 0, 0, 0); 1021 else 1022 abort(); 1023 1024 case SHADER_OPCODE_BARRIER: 1025 if (devinfo->ver >= 7) 1026 return calculate_desc(info, unit_gateway, 90 /* XXX */, 0, 0, 1027 0 /* XXX */, 0, 1028 0, 0, 0, 0, 0, 0); 1029 else 1030 abort(); 1031 
1032 case CS_OPCODE_CS_TERMINATE: 1033 if (devinfo->ver >= 7) 1034 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0, 1035 10 /* XXX */, 0, 0, 0, 0, 0); 1036 else 1037 abort(); 1038 1039 case SHADER_OPCODE_SEND: 1040 switch (info.sfid) { 1041 case GFX6_SFID_DATAPORT_RENDER_CACHE: 1042 if (devinfo->ver >= 7) { 1043 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1044 case GFX7_DATAPORT_RC_TYPED_ATOMIC_OP: 1045 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 1046 30 /* XXX */, 450 /* XXX */, 1047 10 /* XXX */, 100 /* XXX */, 1048 0, 0, 0, 400 /* XXX */); 1049 default: 1050 return calculate_desc(info, unit_dp_rc, 2, 0, 0, 1051 0, 450 /* XXX */, 1052 10 /* XXX */, 300 /* XXX */, 0, 0, 1053 0, 0); 1054 } 1055 } else if (devinfo->ver >= 6) { 1056 return calculate_desc(info, unit_dp_rc, 2 /* XXX */, 0, 0, 1057 0, 450 /* XXX */, 1058 10 /* XXX */, 300 /* XXX */, 0, 0, 0, 0); 1059 } else { 1060 abort(); 1061 } 1062 case BRW_SFID_SAMPLER: { 1063 if (devinfo->ver >= 6) 1064 return calculate_desc(info, unit_sampler, 2, 0, 0, 0, 16, 1065 8, 750, 0, 0, 2, 0); 1066 else 1067 abort(); 1068 } 1069 case GFX7_SFID_DATAPORT_DATA_CACHE: 1070 case HSW_SFID_DATAPORT_DATA_CACHE_1: 1071 if (devinfo->verx10 >= 75) { 1072 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1073 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP: 1074 case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2: 1075 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2: 1076 case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP: 1077 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 1078 30 /* XXX */, 400 /* XXX */, 1079 10 /* XXX */, 100 /* XXX */, 0, 0, 1080 0, 400 /* XXX */); 1081 1082 default: 1083 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 1084 0, 20 /* XXX */, 1085 10 /* XXX */, 100 /* XXX */, 0, 0, 1086 0, 0); 1087 } 1088 } else if (devinfo->ver >= 7) { 1089 switch (brw_dp_desc_msg_type(devinfo, info.desc)) { 1090 case GFX7_DATAPORT_DC_UNTYPED_ATOMIC_OP: 1091 return calculate_desc(info, unit_dp_dc, 2, 
0, 0, 1092 30 /* XXX */, 400 /* XXX */, 1093 10 /* XXX */, 100 /* XXX */, 1094 0, 0, 0, 400 /* XXX */); 1095 default: 1096 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 1097 0, 20 /* XXX */, 1098 10 /* XXX */, 100 /* XXX */, 0, 0, 1099 0, 0); 1100 } 1101 } else { 1102 abort(); 1103 } 1104 1105 case GFX12_SFID_UGM: 1106 case GFX12_SFID_TGM: 1107 case GFX12_SFID_SLM: 1108 switch (lsc_msg_desc_opcode(devinfo, info.desc)) { 1109 case LSC_OP_LOAD: 1110 case LSC_OP_STORE: 1111 case LSC_OP_LOAD_CMASK: 1112 case LSC_OP_STORE_CMASK: 1113 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 1114 0, 20 /* XXX */, 1115 10 /* XXX */, 100 /* XXX */, 0, 0, 1116 0, 0); 1117 1118 case LSC_OP_FENCE: 1119 case LSC_OP_ATOMIC_INC: 1120 case LSC_OP_ATOMIC_DEC: 1121 case LSC_OP_ATOMIC_LOAD: 1122 case LSC_OP_ATOMIC_STORE: 1123 case LSC_OP_ATOMIC_ADD: 1124 case LSC_OP_ATOMIC_SUB: 1125 case LSC_OP_ATOMIC_MIN: 1126 case LSC_OP_ATOMIC_MAX: 1127 case LSC_OP_ATOMIC_UMIN: 1128 case LSC_OP_ATOMIC_UMAX: 1129 case LSC_OP_ATOMIC_CMPXCHG: 1130 case LSC_OP_ATOMIC_FADD: 1131 case LSC_OP_ATOMIC_FSUB: 1132 case LSC_OP_ATOMIC_FMIN: 1133 case LSC_OP_ATOMIC_FMAX: 1134 case LSC_OP_ATOMIC_FCMPXCHG: 1135 case LSC_OP_ATOMIC_AND: 1136 case LSC_OP_ATOMIC_OR: 1137 case LSC_OP_ATOMIC_XOR: 1138 return calculate_desc(info, unit_dp_dc, 2, 0, 0, 1139 30 /* XXX */, 400 /* XXX */, 1140 10 /* XXX */, 100 /* XXX */, 0, 0, 1141 0, 400 /* XXX */); 1142 default: 1143 abort(); 1144 } 1145 1146 case GEN_RT_SFID_BINDLESS_THREAD_DISPATCH: 1147 case GEN_RT_SFID_RAY_TRACE_ACCELERATOR: 1148 return calculate_desc(info, unit_spawner, 2, 0, 0, 0 /* XXX */, 0, 1149 10 /* XXX */, 0, 0, 0, 0, 0); 1150 1151 default: 1152 abort(); 1153 } 1154 1155 case SHADER_OPCODE_UNDEF: 1156 case SHADER_OPCODE_HALT_TARGET: 1157 case FS_OPCODE_SCHEDULING_FENCE: 1158 return calculate_desc(info, unit_null, 0, 0, 0, 0, 0, 1159 0, 0, 0, 0, 0, 0); 1160 1161 default: 1162 abort(); 1163 } 1164 } 1165 1166 /** 1167 * Model the performance behavior of a stall on 
the specified dependency 1168 * ID. 1169 */ 1170 void 1171 stall_on_dependency(state &st, dependency_id id) 1172 { 1173 if (id < ARRAY_SIZE(st.dep_ready)) 1174 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe], 1175 st.dep_ready[id]); 1176 } 1177 1178 /** 1179 * Model the performance behavior of the front-end and back-end while 1180 * executing an instruction with the specified timing information, assuming 1181 * all dependencies are already clear. 1182 */ 1183 void 1184 execute_instruction(state &st, const perf_desc &perf) 1185 { 1186 /* Compute the time at which the front-end will be ready to execute the 1187 * next instruction. 1188 */ 1189 st.unit_ready[unit_fe] += perf.df; 1190 1191 if (perf.u < num_units) { 1192 /* Wait for the back-end to be ready to execute this instruction. */ 1193 st.unit_ready[unit_fe] = MAX2(st.unit_ready[unit_fe], 1194 st.unit_ready[perf.u]); 1195 1196 /* Compute the time at which the back-end will be ready to execute 1197 * the next instruction, and update the back-end utilization. 1198 */ 1199 st.unit_ready[perf.u] = st.unit_ready[unit_fe] + perf.db; 1200 st.unit_busy[perf.u] += perf.db * st.weight; 1201 } 1202 } 1203 1204 /** 1205 * Model the performance behavior of a read dependency provided by an 1206 * instruction. 1207 */ 1208 void 1209 mark_read_dependency(state &st, const perf_desc &perf, dependency_id id) 1210 { 1211 if (id < ARRAY_SIZE(st.dep_ready)) 1212 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ls; 1213 } 1214 1215 /** 1216 * Model the performance behavior of a write dependency provided by an 1217 * instruction. 
1218 */ 1219 void 1220 mark_write_dependency(state &st, const perf_desc &perf, dependency_id id) 1221 { 1222 if (id >= dependency_id_accum0 && id < dependency_id_flag0) 1223 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.la; 1224 else if (id >= dependency_id_flag0 && id < dependency_id_sbid_wr0) 1225 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.lf; 1226 else if (id < ARRAY_SIZE(st.dep_ready)) 1227 st.dep_ready[id] = st.unit_ready[unit_fe] + perf.ld; 1228 } 1229 1230 /** 1231 * Return the dependency ID of a backend_reg, offset by \p delta GRFs. 1232 */ 1233 dependency_id 1234 reg_dependency_id(const intel_device_info *devinfo, const backend_reg &r, 1235 const int delta) 1236 { 1237 if (r.file == VGRF) { 1238 const unsigned i = r.nr + r.offset / REG_SIZE + delta; 1239 assert(i < dependency_id_mrf0 - dependency_id_grf0); 1240 return dependency_id(dependency_id_grf0 + i); 1241 1242 } else if (r.file == FIXED_GRF) { 1243 const unsigned i = r.nr + delta; 1244 assert(i < dependency_id_mrf0 - dependency_id_grf0); 1245 return dependency_id(dependency_id_grf0 + i); 1246 1247 } else if (r.file == MRF && devinfo->ver >= 7) { 1248 const unsigned i = GFX7_MRF_HACK_START + 1249 r.nr + r.offset / REG_SIZE + delta; 1250 assert(i < dependency_id_mrf0 - dependency_id_grf0); 1251 return dependency_id(dependency_id_grf0 + i); 1252 1253 } else if (r.file == MRF && devinfo->ver < 7) { 1254 const unsigned i = (r.nr & ~BRW_MRF_COMPR4) + 1255 r.offset / REG_SIZE + delta; 1256 assert(i < dependency_id_addr0 - dependency_id_mrf0); 1257 return dependency_id(dependency_id_mrf0 + i); 1258 1259 } else if (r.file == ARF && r.nr >= BRW_ARF_ADDRESS && 1260 r.nr < BRW_ARF_ACCUMULATOR) { 1261 assert(delta == 0); 1262 return dependency_id_addr0; 1263 1264 } else if (r.file == ARF && r.nr >= BRW_ARF_ACCUMULATOR && 1265 r.nr < BRW_ARF_FLAG) { 1266 const unsigned i = r.nr - BRW_ARF_ACCUMULATOR + delta; 1267 assert(i < dependency_id_flag0 - dependency_id_accum0); 1268 return 
dependency_id(dependency_id_accum0 + i); 1269 1270 } else { 1271 return num_dependency_ids; 1272 } 1273 } 1274 1275 /** 1276 * Return the dependency ID of flag register starting at offset \p i. 1277 */ 1278 dependency_id 1279 flag_dependency_id(unsigned i) 1280 { 1281 assert(i < dependency_id_sbid_wr0 - dependency_id_flag0); 1282 return dependency_id(dependency_id_flag0 + i); 1283 } 1284 1285 /** 1286 * Return the dependency ID corresponding to the SBID read completion 1287 * condition of a Gfx12+ SWSB. 1288 */ 1289 dependency_id 1290 tgl_swsb_rd_dependency_id(tgl_swsb swsb) 1291 { 1292 if (swsb.mode) { 1293 assert(swsb.sbid < num_dependency_ids - dependency_id_sbid_rd0); 1294 return dependency_id(dependency_id_sbid_rd0 + swsb.sbid); 1295 } else { 1296 return num_dependency_ids; 1297 } 1298 } 1299 1300 /** 1301 * Return the dependency ID corresponding to the SBID write completion 1302 * condition of a Gfx12+ SWSB. 1303 */ 1304 dependency_id 1305 tgl_swsb_wr_dependency_id(tgl_swsb swsb) 1306 { 1307 if (swsb.mode) { 1308 assert(swsb.sbid < dependency_id_sbid_rd0 - dependency_id_sbid_wr0); 1309 return dependency_id(dependency_id_sbid_wr0 + swsb.sbid); 1310 } else { 1311 return num_dependency_ids; 1312 } 1313 } 1314 1315 /** 1316 * Return the implicit accumulator register accessed by channel \p i of the 1317 * instruction. 1318 */ 1319 unsigned 1320 accum_reg_of_channel(const intel_device_info *devinfo, 1321 const backend_instruction *inst, 1322 brw_reg_type tx, unsigned i) 1323 { 1324 assert(inst->reads_accumulator_implicitly() || 1325 inst->writes_accumulator_implicitly(devinfo)); 1326 const unsigned offset = (inst->group + i) * type_sz(tx) * 1327 (devinfo->ver < 7 || brw_reg_type_is_floating_point(tx) ? 1 : 2); 1328 return offset / REG_SIZE % 2; 1329 } 1330 1331 /** 1332 * Model the performance behavior of an FS back-end instruction. 
1333 */ 1334 void 1335 issue_fs_inst(state &st, const intel_device_info *devinfo, 1336 const backend_instruction *be_inst) 1337 { 1338 const fs_inst *inst = static_cast<const fs_inst *>(be_inst); 1339 const instruction_info info(devinfo, inst); 1340 const perf_desc perf = instruction_desc(info); 1341 1342 /* Stall on any source dependencies. */ 1343 for (unsigned i = 0; i < inst->sources; i++) { 1344 for (unsigned j = 0; j < regs_read(inst, i); j++) 1345 stall_on_dependency( 1346 st, reg_dependency_id(devinfo, inst->src[i], j)); 1347 } 1348 1349 if (inst->reads_accumulator_implicitly()) { 1350 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1351 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1352 inst->exec_size - 1); j++) 1353 stall_on_dependency( 1354 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1355 } 1356 1357 if (is_send(inst) && inst->base_mrf != -1) { 1358 for (unsigned j = 0; j < inst->mlen; j++) 1359 stall_on_dependency( 1360 st, reg_dependency_id( 1361 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1362 } 1363 1364 if (const unsigned mask = inst->flags_read(devinfo)) { 1365 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1366 if (mask & (1 << i)) 1367 stall_on_dependency(st, flag_dependency_id(i)); 1368 } 1369 } 1370 1371 /* Stall on any write dependencies. 
*/ 1372 if (!inst->no_dd_check) { 1373 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1374 for (unsigned j = 0; j < regs_written(inst); j++) 1375 stall_on_dependency( 1376 st, reg_dependency_id(devinfo, inst->dst, j)); 1377 } 1378 1379 if (inst->writes_accumulator_implicitly(devinfo)) { 1380 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1381 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1382 inst->exec_size - 1); j++) 1383 stall_on_dependency( 1384 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1385 } 1386 1387 if (const unsigned mask = inst->flags_written(devinfo)) { 1388 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1389 if (mask & (1 << i)) 1390 stall_on_dependency(st, flag_dependency_id(i)); 1391 } 1392 } 1393 } 1394 1395 /* Stall on any SBID dependencies. */ 1396 if (inst->sched.mode & (TGL_SBID_SET | TGL_SBID_DST)) 1397 stall_on_dependency(st, tgl_swsb_wr_dependency_id(inst->sched)); 1398 else if (inst->sched.mode & TGL_SBID_SRC) 1399 stall_on_dependency(st, tgl_swsb_rd_dependency_id(inst->sched)); 1400 1401 /* Execute the instruction. */ 1402 execute_instruction(st, perf); 1403 1404 /* Mark any source dependencies. */ 1405 if (inst->is_send_from_grf()) { 1406 for (unsigned i = 0; i < inst->sources; i++) { 1407 if (inst->is_payload(i)) { 1408 for (unsigned j = 0; j < regs_read(inst, i); j++) 1409 mark_read_dependency( 1410 st, perf, reg_dependency_id(devinfo, inst->src[i], j)); 1411 } 1412 } 1413 } 1414 1415 if (is_send(inst) && inst->base_mrf != -1) { 1416 for (unsigned j = 0; j < inst->mlen; j++) 1417 mark_read_dependency(st, perf, 1418 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1419 } 1420 1421 /* Mark any destination dependencies. 
*/ 1422 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1423 for (unsigned j = 0; j < regs_written(inst); j++) { 1424 mark_write_dependency(st, perf, 1425 reg_dependency_id(devinfo, inst->dst, j)); 1426 } 1427 } 1428 1429 if (inst->writes_accumulator_implicitly(devinfo)) { 1430 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1431 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1432 inst->exec_size - 1); j++) 1433 mark_write_dependency(st, perf, 1434 reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1435 } 1436 1437 if (const unsigned mask = inst->flags_written(devinfo)) { 1438 for (unsigned i = 0; i < sizeof(mask) * CHAR_BIT; i++) { 1439 if (mask & (1 << i)) 1440 mark_write_dependency(st, perf, flag_dependency_id(i)); 1441 } 1442 } 1443 1444 /* Mark any SBID dependencies. */ 1445 if (inst->sched.mode & TGL_SBID_SET) { 1446 mark_read_dependency(st, perf, tgl_swsb_rd_dependency_id(inst->sched)); 1447 mark_write_dependency(st, perf, tgl_swsb_wr_dependency_id(inst->sched)); 1448 } 1449 } 1450 1451 /** 1452 * Model the performance behavior of a VEC4 back-end instruction. 1453 */ 1454 void 1455 issue_vec4_instruction(state &st, const intel_device_info *devinfo, 1456 const backend_instruction *be_inst) 1457 { 1458 const vec4_instruction *inst = 1459 static_cast<const vec4_instruction *>(be_inst); 1460 const instruction_info info(devinfo, inst); 1461 const perf_desc perf = instruction_desc(info); 1462 1463 /* Stall on any source dependencies. 
*/ 1464 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { 1465 for (unsigned j = 0; j < regs_read(inst, i); j++) 1466 stall_on_dependency( 1467 st, reg_dependency_id(devinfo, inst->src[i], j)); 1468 } 1469 1470 if (inst->reads_accumulator_implicitly()) { 1471 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1472 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1473 inst->exec_size - 1); j++) 1474 stall_on_dependency( 1475 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1476 } 1477 1478 if (inst->base_mrf != -1) { 1479 for (unsigned j = 0; j < inst->mlen; j++) 1480 stall_on_dependency( 1481 st, reg_dependency_id( 1482 devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1483 } 1484 1485 if (inst->reads_flag()) 1486 stall_on_dependency(st, dependency_id_flag0); 1487 1488 /* Stall on any write dependencies. */ 1489 if (!inst->no_dd_check) { 1490 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1491 for (unsigned j = 0; j < regs_written(inst); j++) 1492 stall_on_dependency( 1493 st, reg_dependency_id(devinfo, inst->dst, j)); 1494 } 1495 1496 if (inst->writes_accumulator_implicitly(devinfo)) { 1497 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1498 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1499 inst->exec_size - 1); j++) 1500 stall_on_dependency( 1501 st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1502 } 1503 1504 if (inst->writes_flag(devinfo)) 1505 stall_on_dependency(st, dependency_id_flag0); 1506 } 1507 1508 /* Execute the instruction. */ 1509 execute_instruction(st, perf); 1510 1511 /* Mark any source dependencies. 
*/ 1512 if (inst->is_send_from_grf()) { 1513 for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { 1514 for (unsigned j = 0; j < regs_read(inst, i); j++) 1515 mark_read_dependency( 1516 st, perf, reg_dependency_id(devinfo, inst->src[i], j)); 1517 } 1518 } 1519 1520 if (inst->base_mrf != -1) { 1521 for (unsigned j = 0; j < inst->mlen; j++) 1522 mark_read_dependency(st, perf, 1523 reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); 1524 } 1525 1526 /* Mark any destination dependencies. */ 1527 if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { 1528 for (unsigned j = 0; j < regs_written(inst); j++) { 1529 mark_write_dependency(st, perf, 1530 reg_dependency_id(devinfo, inst->dst, j)); 1531 } 1532 } 1533 1534 if (inst->writes_accumulator_implicitly(devinfo)) { 1535 for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); 1536 j <= accum_reg_of_channel(devinfo, inst, info.tx, 1537 inst->exec_size - 1); j++) 1538 mark_write_dependency(st, perf, 1539 reg_dependency_id(devinfo, brw_acc_reg(8), j)); 1540 } 1541 1542 if (inst->writes_flag(devinfo)) 1543 mark_write_dependency(st, perf, dependency_id_flag0); 1544 } 1545 1546 /** 1547 * Calculate the maximum possible throughput of the program compatible with 1548 * the cycle-count utilization estimated for each asynchronous unit, in 1549 * threads-per-cycle units. 1550 */ 1551 float 1552 calculate_thread_throughput(const state &st, float busy) 1553 { 1554 for (unsigned i = 0; i < num_units; i++) 1555 busy = MAX2(busy, st.unit_busy[i]); 1556 1557 return 1.0 / busy; 1558 } 1559 1560 /** 1561 * Estimate the performance of the specified shader. 
1562 */ 1563 void 1564 calculate_performance(performance &p, const backend_shader *s, 1565 void (*issue_instruction)( 1566 state &, const intel_device_info *, 1567 const backend_instruction *), 1568 unsigned dispatch_width) 1569 { 1570 /* XXX - Note that the previous version of this code used worst-case 1571 * scenario estimation of branching divergence for SIMD32 shaders, 1572 * but this heuristic was removed to improve performance in common 1573 * scenarios. Wider shader variants are less optimal when divergence 1574 * is high, e.g. when application renders complex scene on a small 1575 * surface. It is assumed that such renders are short, so their 1576 * time doesn't matter and when it comes to the overall performance, 1577 * they are dominated by more optimal larger renders. 1578 * 1579 * It's possible that we could do better with divergence analysis 1580 * by isolating branches which are 100% uniform. 1581 * 1582 * Plumbing the trip counts from NIR loop analysis would allow us 1583 * to do a better job regarding the loop weights. 1584 * 1585 * In the meantime use values that roughly match the control flow 1586 * weights used elsewhere in the compiler back-end. 1587 * 1588 * Note that we provide slightly more pessimistic weights on 1589 * Gfx12+ for SIMD32, since the effective warp size on that 1590 * platform is 2x the SIMD width due to EU fusion, which increases 1591 * the likelihood of divergent control flow in comparison to 1592 * previous generations, giving narrower SIMD modes a performance 1593 * advantage in several test-cases with non-uniform discard jumps. 1594 */ 1595 const float discard_weight = (dispatch_width > 16 || s->devinfo->ver < 12 ? 
1596 1.0 : 0.5); 1597 const float loop_weight = 10; 1598 unsigned halt_count = 0; 1599 unsigned elapsed = 0; 1600 state st; 1601 1602 foreach_block(block, s->cfg) { 1603 const unsigned elapsed0 = elapsed; 1604 1605 foreach_inst_in_block(backend_instruction, inst, block) { 1606 const unsigned clock0 = st.unit_ready[unit_fe]; 1607 1608 issue_instruction(st, s->devinfo, inst); 1609 1610 if (inst->opcode == SHADER_OPCODE_HALT_TARGET && halt_count) 1611 st.weight /= discard_weight; 1612 1613 elapsed += (st.unit_ready[unit_fe] - clock0) * st.weight; 1614 1615 if (inst->opcode == BRW_OPCODE_DO) 1616 st.weight *= loop_weight; 1617 else if (inst->opcode == BRW_OPCODE_WHILE) 1618 st.weight /= loop_weight; 1619 else if (inst->opcode == BRW_OPCODE_HALT && !halt_count++) 1620 st.weight *= discard_weight; 1621 } 1622 1623 p.block_latency[block->num] = elapsed - elapsed0; 1624 } 1625 1626 p.latency = elapsed; 1627 p.throughput = dispatch_width * calculate_thread_throughput(st, elapsed); 1628 } 1629} 1630 1631brw::performance::performance(const fs_visitor *v) : 1632 block_latency(new unsigned[v->cfg->num_blocks]) 1633{ 1634 calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); 1635} 1636 1637brw::performance::performance(const vec4_visitor *v) : 1638 block_latency(new unsigned[v->cfg->num_blocks]) 1639{ 1640 calculate_performance(*this, v, issue_vec4_instruction, 8); 1641} 1642 1643brw::performance::~performance() 1644{ 1645 delete[] block_latency; 1646} 1647