si_perfcounter.c revision 01e04c3f
1/* 2 * Copyright 2015 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25#include "si_build_pm4.h" 26#include "si_query.h" 27#include "util/u_memory.h" 28 29 30enum si_pc_reg_layout { 31 /* All secondary selector dwords follow as one block after the primary 32 * selector dwords for the counters that have secondary selectors. 33 */ 34 SI_PC_MULTI_BLOCK = 0, 35 36 /* Each secondary selector dword follows immediately afters the 37 * corresponding primary. 38 */ 39 SI_PC_MULTI_ALTERNATE = 1, 40 41 /* All secondary selector dwords follow as one block after all primary 42 * selector dwords. 43 */ 44 SI_PC_MULTI_TAIL = 2, 45 46 /* Free-form arrangement of selector registers. */ 47 SI_PC_MULTI_CUSTOM = 3, 48 49 SI_PC_MULTI_MASK = 3, 50 51 /* Registers are laid out in decreasing rather than increasing order. */ 52 SI_PC_REG_REVERSE = 4, 53 54 SI_PC_FAKE = 8, 55}; 56 57struct si_pc_block_base { 58 const char *name; 59 unsigned num_counters; 60 unsigned flags; 61 62 unsigned select_or; 63 unsigned select0; 64 unsigned counter0_lo; 65 unsigned *select; 66 unsigned *counters; 67 unsigned num_multi; 68 unsigned num_prelude; 69 unsigned layout; 70}; 71 72struct si_pc_block { 73 struct si_pc_block_base *b; 74 unsigned selectors; 75 unsigned instances; 76}; 77 78/* The order is chosen to be compatible with GPUPerfStudio's hardcoding of 79 * performance counter group IDs. 80 */ 81static const char * const si_pc_shader_type_suffixes[] = { 82 "", "_ES", "_GS", "_VS", "_PS", "_LS", "_HS", "_CS" 83}; 84 85static const unsigned si_pc_shader_type_bits[] = { 86 0x7f, 87 S_036780_ES_EN(1), 88 S_036780_GS_EN(1), 89 S_036780_VS_EN(1), 90 S_036780_PS_EN(1), 91 S_036780_LS_EN(1), 92 S_036780_HS_EN(1), 93 S_036780_CS_EN(1), 94}; 95 96static struct si_pc_block_base cik_CB = { 97 .name = "CB", 98 .num_counters = 4, 99 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, 100 101 .select0 = R_037000_CB_PERFCOUNTER_FILTER, 102 .counter0_lo = R_035018_CB_PERFCOUNTER0_LO, 103 .num_multi = 1, 104 .num_prelude = 1, 105 .layout = SI_PC_MULTI_ALTERNATE, 106}; 107 108static unsigned cik_CPC_select[] = { 109 R_036024_CPC_PERFCOUNTER0_SELECT, 110 R_036010_CPC_PERFCOUNTER0_SELECT1, 111 R_03600C_CPC_PERFCOUNTER1_SELECT, 112}; 113static struct si_pc_block_base cik_CPC = { 114 .name = "CPC", 115 .num_counters = 2, 116 117 .select = cik_CPC_select, 118 .counter0_lo = R_034018_CPC_PERFCOUNTER0_LO, 119 .num_multi = 1, 120 .layout = SI_PC_MULTI_CUSTOM | SI_PC_REG_REVERSE, 121}; 122 123static struct si_pc_block_base cik_CPF = { 124 .name = "CPF", 125 .num_counters = 2, 126 127 .select0 = R_03601C_CPF_PERFCOUNTER0_SELECT, 128 .counter0_lo = R_034028_CPF_PERFCOUNTER0_LO, 129 .num_multi = 1, 130 .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, 131}; 132 133static struct si_pc_block_base cik_CPG = { 134 .name = "CPG", 135 .num_counters = 2, 136 137 .select0 = R_036008_CPG_PERFCOUNTER0_SELECT, 138 .counter0_lo = R_034008_CPG_PERFCOUNTER0_LO, 139 .num_multi = 1, 140 .layout = SI_PC_MULTI_ALTERNATE | SI_PC_REG_REVERSE, 141}; 142 143static struct si_pc_block_base cik_DB = { 144 .name = "DB", 145 .num_counters = 4, 146 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS, 147 148 .select0 = R_037100_DB_PERFCOUNTER0_SELECT, 149 .counter0_lo = R_035100_DB_PERFCOUNTER0_LO, 150 .num_multi = 3, // really only 2, but there's a gap between registers 151 .layout = SI_PC_MULTI_ALTERNATE, 152}; 153 154static struct si_pc_block_base cik_GDS = { 155 .name = "GDS", 156 .num_counters = 4, 157 158 .select0 = R_036A00_GDS_PERFCOUNTER0_SELECT, 159 .counter0_lo = R_034A00_GDS_PERFCOUNTER0_LO, 160 .num_multi = 1, 161 .layout = SI_PC_MULTI_TAIL, 162}; 163 164static unsigned cik_GRBM_counters[] = { 165 R_034100_GRBM_PERFCOUNTER0_LO, 166 R_03410C_GRBM_PERFCOUNTER1_LO, 167}; 168static struct si_pc_block_base cik_GRBM = { 169 .name = "GRBM", 170 .num_counters = 2, 171 172 .select0 = R_036100_GRBM_PERFCOUNTER0_SELECT, 173 .counters = cik_GRBM_counters, 174}; 175 176static struct si_pc_block_base cik_GRBMSE = { 177 .name = "GRBMSE", 178 .num_counters = 4, 179 180 .select0 = R_036108_GRBM_SE0_PERFCOUNTER_SELECT, 181 .counter0_lo = R_034114_GRBM_SE0_PERFCOUNTER_LO, 182}; 183 184static struct si_pc_block_base cik_IA = { 185 .name = "IA", 186 .num_counters = 4, 187 188 .select0 = R_036210_IA_PERFCOUNTER0_SELECT, 189 .counter0_lo = R_034220_IA_PERFCOUNTER0_LO, 190 .num_multi = 1, 191 .layout = SI_PC_MULTI_TAIL, 192}; 193 194static struct si_pc_block_base cik_PA_SC = { 195 .name = "PA_SC", 196 .num_counters = 8, 197 .flags = SI_PC_BLOCK_SE, 198 199 .select0 = R_036500_PA_SC_PERFCOUNTER0_SELECT, 200 .counter0_lo = R_034500_PA_SC_PERFCOUNTER0_LO, 201 .num_multi = 1, 202 .layout = SI_PC_MULTI_ALTERNATE, 203}; 204 205/* According to docs, PA_SU counters are only 48 bits wide. */ 206static struct si_pc_block_base cik_PA_SU = { 207 .name = "PA_SU", 208 .num_counters = 4, 209 .flags = SI_PC_BLOCK_SE, 210 211 .select0 = R_036400_PA_SU_PERFCOUNTER0_SELECT, 212 .counter0_lo = R_034400_PA_SU_PERFCOUNTER0_LO, 213 .num_multi = 2, 214 .layout = SI_PC_MULTI_ALTERNATE, 215}; 216 217static struct si_pc_block_base cik_SPI = { 218 .name = "SPI", 219 .num_counters = 6, 220 .flags = SI_PC_BLOCK_SE, 221 222 .select0 = R_036600_SPI_PERFCOUNTER0_SELECT, 223 .counter0_lo = R_034604_SPI_PERFCOUNTER0_LO, 224 .num_multi = 4, 225 .layout = SI_PC_MULTI_BLOCK, 226}; 227 228static struct si_pc_block_base cik_SQ = { 229 .name = "SQ", 230 .num_counters = 16, 231 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_SHADER, 232 233 .select0 = R_036700_SQ_PERFCOUNTER0_SELECT, 234 .select_or = S_036700_SQC_BANK_MASK(15) | 235 S_036700_SQC_CLIENT_MASK(15) | 236 S_036700_SIMD_MASK(15), 237 .counter0_lo = R_034700_SQ_PERFCOUNTER0_LO, 238}; 239 240static struct si_pc_block_base cik_SX = { 241 .name = "SX", 242 .num_counters = 4, 243 .flags = SI_PC_BLOCK_SE, 244 245 .select0 = R_036900_SX_PERFCOUNTER0_SELECT, 246 .counter0_lo = R_034900_SX_PERFCOUNTER0_LO, 247 .num_multi = 2, 248 .layout = SI_PC_MULTI_TAIL, 249}; 250 251static struct si_pc_block_base cik_TA = { 252 .name = "TA", 253 .num_counters = 2, 254 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, 255 256 .select0 = R_036B00_TA_PERFCOUNTER0_SELECT, 257 .counter0_lo = R_034B00_TA_PERFCOUNTER0_LO, 258 .num_multi = 1, 259 .layout = SI_PC_MULTI_ALTERNATE, 260}; 261 262static struct si_pc_block_base cik_TD = { 263 .name = "TD", 264 .num_counters = 2, 265 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, 266 267 .select0 = R_036C00_TD_PERFCOUNTER0_SELECT, 268 .counter0_lo = R_034C00_TD_PERFCOUNTER0_LO, 269 .num_multi = 1, 270 .layout = SI_PC_MULTI_ALTERNATE, 271}; 272 273static struct si_pc_block_base cik_TCA = { 274 .name = "TCA", 275 .num_counters = 4, 276 .flags = SI_PC_BLOCK_INSTANCE_GROUPS, 277 278 .select0 = R_036E40_TCA_PERFCOUNTER0_SELECT, 279 .counter0_lo = R_034E40_TCA_PERFCOUNTER0_LO, 280 .num_multi = 2, 281 .layout = SI_PC_MULTI_ALTERNATE, 282}; 283 284static struct si_pc_block_base cik_TCC = { 285 .name = "TCC", 286 .num_counters = 4, 287 .flags = SI_PC_BLOCK_INSTANCE_GROUPS, 288 289 .select0 = R_036E00_TCC_PERFCOUNTER0_SELECT, 290 .counter0_lo = R_034E00_TCC_PERFCOUNTER0_LO, 291 .num_multi = 2, 292 .layout = SI_PC_MULTI_ALTERNATE, 293}; 294 295static struct si_pc_block_base cik_TCP = { 296 .name = "TCP", 297 .num_counters = 4, 298 .flags = SI_PC_BLOCK_SE | SI_PC_BLOCK_INSTANCE_GROUPS | SI_PC_BLOCK_SHADER_WINDOWED, 299 300 .select0 = R_036D00_TCP_PERFCOUNTER0_SELECT, 301 .counter0_lo = R_034D00_TCP_PERFCOUNTER0_LO, 302 .num_multi = 2, 303 .layout = SI_PC_MULTI_ALTERNATE, 304}; 305 306static struct si_pc_block_base cik_VGT = { 307 .name = "VGT", 308 .num_counters = 4, 309 .flags = SI_PC_BLOCK_SE, 310 311 .select0 = R_036230_VGT_PERFCOUNTER0_SELECT, 312 .counter0_lo = R_034240_VGT_PERFCOUNTER0_LO, 313 .num_multi = 1, 314 .layout = SI_PC_MULTI_TAIL, 315}; 316 317static struct si_pc_block_base cik_WD = { 318 .name = "WD", 319 .num_counters = 4, 320 321 .select0 = R_036200_WD_PERFCOUNTER0_SELECT, 322 .counter0_lo = R_034200_WD_PERFCOUNTER0_LO, 323}; 324 325static struct si_pc_block_base cik_MC = { 326 .name = "MC", 327 .num_counters = 4, 328 329 .layout = SI_PC_FAKE, 330}; 331 332static struct si_pc_block_base cik_SRBM = { 333 .name = "SRBM", 334 .num_counters = 2, 335 336 .layout = SI_PC_FAKE, 337}; 338 339/* Both the number of instances and selectors varies between chips of the same 340 * class. We only differentiate by class here and simply expose the maximum 341 * number over all chips in a class. 342 * 343 * Unfortunately, GPUPerfStudio uses the order of performance counter groups 344 * blindly once it believes it has identified the hardware, so the order of 345 * blocks here matters. 346 */ 347static struct si_pc_block groups_CIK[] = { 348 { &cik_CB, 226}, 349 { &cik_CPF, 17 }, 350 { &cik_DB, 257}, 351 { &cik_GRBM, 34 }, 352 { &cik_GRBMSE, 15 }, 353 { &cik_PA_SU, 153 }, 354 { &cik_PA_SC, 395 }, 355 { &cik_SPI, 186 }, 356 { &cik_SQ, 252 }, 357 { &cik_SX, 32 }, 358 { &cik_TA, 111, 11 }, 359 { &cik_TCA, 39, 2 }, 360 { &cik_TCC, 160}, 361 { &cik_TD, 55, 11 }, 362 { &cik_TCP, 154, 11 }, 363 { &cik_GDS, 121 }, 364 { &cik_VGT, 140 }, 365 { &cik_IA, 22 }, 366 { &cik_MC, 22 }, 367 { &cik_SRBM, 19 }, 368 { &cik_WD, 22 }, 369 { &cik_CPG, 46 }, 370 { &cik_CPC, 22 }, 371 372}; 373 374static struct si_pc_block groups_VI[] = { 375 { &cik_CB, 405}, 376 { &cik_CPF, 19 }, 377 { &cik_DB, 257}, 378 { &cik_GRBM, 34 }, 379 { &cik_GRBMSE, 15 }, 380 { &cik_PA_SU, 154 }, 381 { &cik_PA_SC, 397 }, 382 { &cik_SPI, 197 }, 383 { &cik_SQ, 273 }, 384 { &cik_SX, 34 }, 385 { &cik_TA, 119, 16 }, 386 { &cik_TCA, 35, 2 }, 387 { &cik_TCC, 192}, 388 { &cik_TD, 55, 16 }, 389 { &cik_TCP, 180, 16 }, 390 { &cik_GDS, 121 }, 391 { &cik_VGT, 147 }, 392 { &cik_IA, 24 }, 393 { &cik_MC, 22 }, 394 { &cik_SRBM, 27 }, 395 { &cik_WD, 37 }, 396 { &cik_CPG, 48 }, 397 { &cik_CPC, 24 }, 398 399}; 400 401static struct si_pc_block groups_gfx9[] = { 402 { &cik_CB, 438}, 403 { &cik_CPF, 32 }, 404 { &cik_DB, 328}, 405 { &cik_GRBM, 38 }, 406 { &cik_GRBMSE, 16 }, 407 { &cik_PA_SU, 292 }, 408 { &cik_PA_SC, 491 }, 409 { &cik_SPI, 196 }, 410 { &cik_SQ, 374 }, 411 { &cik_SX, 208 }, 412 { &cik_TA, 119, 16 }, 413 { &cik_TCA, 35, 2 }, 414 { &cik_TCC, 256}, 415 { &cik_TD, 57, 16 }, 416 { &cik_TCP, 85, 16 }, 417 { &cik_GDS, 121 }, 418 { &cik_VGT, 148 }, 419 { &cik_IA, 32 }, 420 { &cik_WD, 58 }, 421 { &cik_CPG, 59 }, 422 { &cik_CPC, 35 }, 423}; 424 425static void si_pc_emit_instance(struct si_context *sctx, 426 int se, int instance) 427{ 428 struct radeon_cmdbuf *cs = sctx->gfx_cs; 429 unsigned value = S_030800_SH_BROADCAST_WRITES(1); 430 431 if (se >= 0) { 432 value |= S_030800_SE_INDEX(se); 433 } else { 434 value |= S_030800_SE_BROADCAST_WRITES(1); 435 } 436 437 if (instance >= 0) { 438 value |= S_030800_INSTANCE_INDEX(instance); 439 } else { 440 value |= S_030800_INSTANCE_BROADCAST_WRITES(1); 441 } 442 443 radeon_set_uconfig_reg(cs, R_030800_GRBM_GFX_INDEX, value); 444} 445 446static void si_pc_emit_shaders(struct si_context *sctx, 447 unsigned shaders) 448{ 449 struct radeon_cmdbuf *cs = sctx->gfx_cs; 450 451 radeon_set_uconfig_reg_seq(cs, R_036780_SQ_PERFCOUNTER_CTRL, 2); 452 radeon_emit(cs, shaders & 0x7f); 453 radeon_emit(cs, 0xffffffff); 454} 455 456static void si_pc_emit_select(struct si_context *sctx, 457 struct si_perfcounter_block *group, 458 unsigned count, unsigned *selectors) 459{ 460 struct si_pc_block *sigroup = (struct si_pc_block *)group->data; 461 struct si_pc_block_base *regs = sigroup->b; 462 struct radeon_cmdbuf *cs = sctx->gfx_cs; 463 unsigned idx; 464 unsigned layout_multi = regs->layout & SI_PC_MULTI_MASK; 465 unsigned dw; 466 467 assert(count <= regs->num_counters); 468 469 if (regs->layout & SI_PC_FAKE) 470 return; 471 472 if (layout_multi == SI_PC_MULTI_BLOCK) { 473 assert(!(regs->layout & SI_PC_REG_REVERSE)); 474 475 dw = count + regs->num_prelude; 476 if (count >= regs->num_multi) 477 dw += regs->num_multi; 478 radeon_set_uconfig_reg_seq(cs, regs->select0, dw); 479 for (idx = 0; idx < regs->num_prelude; ++idx) 480 radeon_emit(cs, 0); 481 for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) 482 radeon_emit(cs, selectors[idx] | regs->select_or); 483 484 if (count < regs->num_multi) { 485 unsigned select1 = 486 regs->select0 + 4 * regs->num_multi; 487 radeon_set_uconfig_reg_seq(cs, select1, count); 488 } 489 490 for (idx = 0; idx < MIN2(count, regs->num_multi); ++idx) 491 radeon_emit(cs, 0); 492 493 if (count > regs->num_multi) { 494 for (idx = regs->num_multi; idx < count; ++idx) 495 radeon_emit(cs, selectors[idx] | regs->select_or); 496 } 497 } else if (layout_multi == SI_PC_MULTI_TAIL) { 498 unsigned select1, select1_count; 499 500 assert(!(regs->layout & SI_PC_REG_REVERSE)); 501 502 radeon_set_uconfig_reg_seq(cs, regs->select0, count + regs->num_prelude); 503 for (idx = 0; idx < regs->num_prelude; ++idx) 504 radeon_emit(cs, 0); 505 for (idx = 0; idx < count; ++idx) 506 radeon_emit(cs, selectors[idx] | regs->select_or); 507 508 select1 = regs->select0 + 4 * regs->num_counters; 509 select1_count = MIN2(count, regs->num_multi); 510 radeon_set_uconfig_reg_seq(cs, select1, select1_count); 511 for (idx = 0; idx < select1_count; ++idx) 512 radeon_emit(cs, 0); 513 } else if (layout_multi == SI_PC_MULTI_CUSTOM) { 514 unsigned *reg = regs->select; 515 for (idx = 0; idx < count; ++idx) { 516 radeon_set_uconfig_reg(cs, *reg++, selectors[idx] | regs->select_or); 517 if (idx < regs->num_multi) 518 radeon_set_uconfig_reg(cs, *reg++, 0); 519 } 520 } else { 521 assert(layout_multi == SI_PC_MULTI_ALTERNATE); 522 523 unsigned reg_base = regs->select0; 524 unsigned reg_count = count + MIN2(count, regs->num_multi); 525 reg_count += regs->num_prelude; 526 527 if (!(regs->layout & SI_PC_REG_REVERSE)) { 528 radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); 529 530 for (idx = 0; idx < regs->num_prelude; ++idx) 531 radeon_emit(cs, 0); 532 for (idx = 0; idx < count; ++idx) { 533 radeon_emit(cs, selectors[idx] | regs->select_or); 534 if (idx < regs->num_multi) 535 radeon_emit(cs, 0); 536 } 537 } else { 538 reg_base -= (reg_count - 1) * 4; 539 radeon_set_uconfig_reg_seq(cs, reg_base, reg_count); 540 541 for (idx = count; idx > 0; --idx) { 542 if (idx <= regs->num_multi) 543 radeon_emit(cs, 0); 544 radeon_emit(cs, selectors[idx - 1] | regs->select_or); 545 } 546 for (idx = 0; idx < regs->num_prelude; ++idx) 547 radeon_emit(cs, 0); 548 } 549 } 550} 551 552static void si_pc_emit_start(struct si_context *sctx, 553 struct r600_resource *buffer, uint64_t va) 554{ 555 struct radeon_cmdbuf *cs = sctx->gfx_cs; 556 557 radeon_add_to_buffer_list(sctx, sctx->gfx_cs, buffer, 558 RADEON_USAGE_WRITE, RADEON_PRIO_QUERY); 559 560 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 561 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | 562 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM)); 563 radeon_emit(cs, 1); /* immediate */ 564 radeon_emit(cs, 0); /* unused */ 565 radeon_emit(cs, va); 566 radeon_emit(cs, va >> 32); 567 568 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 569 S_036020_PERFMON_STATE(V_036020_DISABLE_AND_RESET)); 570 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 571 radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_START) | EVENT_INDEX(0)); 572 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 573 S_036020_PERFMON_STATE(V_036020_START_COUNTING)); 574} 575 576/* Note: The buffer was already added in si_pc_emit_start, so we don't have to 577 * do it again in here. */ 578static void si_pc_emit_stop(struct si_context *sctx, 579 struct r600_resource *buffer, uint64_t va) 580{ 581 struct radeon_cmdbuf *cs = sctx->gfx_cs; 582 583 si_cp_release_mem(sctx, V_028A90_BOTTOM_OF_PIPE_TS, 0, 584 EOP_DST_SEL_MEM, 585 EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, 586 EOP_DATA_SEL_VALUE_32BIT, 587 buffer, va, 0, SI_NOT_QUERY); 588 si_cp_wait_mem(sctx, va, 0, 0xffffffff, 0); 589 590 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 591 radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_SAMPLE) | EVENT_INDEX(0)); 592 radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0)); 593 radeon_emit(cs, EVENT_TYPE(V_028A90_PERFCOUNTER_STOP) | EVENT_INDEX(0)); 594 radeon_set_uconfig_reg(cs, R_036020_CP_PERFMON_CNTL, 595 S_036020_PERFMON_STATE(V_036020_STOP_COUNTING) | 596 S_036020_PERFMON_SAMPLE_ENABLE(1)); 597} 598 599static void si_pc_emit_read(struct si_context *sctx, 600 struct si_perfcounter_block *group, 601 unsigned count, unsigned *selectors, 602 struct r600_resource *buffer, uint64_t va) 603{ 604 struct si_pc_block *sigroup = (struct si_pc_block *)group->data; 605 struct si_pc_block_base *regs = sigroup->b; 606 struct radeon_cmdbuf *cs = sctx->gfx_cs; 607 unsigned idx; 608 unsigned reg = regs->counter0_lo; 609 unsigned reg_delta = 8; 610 611 if (!(regs->layout & SI_PC_FAKE)) { 612 if (regs->layout & SI_PC_REG_REVERSE) 613 reg_delta = -reg_delta; 614 615 for (idx = 0; idx < count; ++idx) { 616 if (regs->counters) 617 reg = regs->counters[idx]; 618 619 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 620 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_PERF) | 621 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) | 622 COPY_DATA_COUNT_SEL); /* 64 bits */ 623 radeon_emit(cs, reg >> 2); 624 radeon_emit(cs, 0); /* unused */ 625 radeon_emit(cs, va); 626 radeon_emit(cs, va >> 32); 627 va += sizeof(uint64_t); 628 reg += reg_delta; 629 } 630 } else { 631 for (idx = 0; idx < count; ++idx) { 632 radeon_emit(cs, PKT3(PKT3_COPY_DATA, 4, 0)); 633 radeon_emit(cs, COPY_DATA_SRC_SEL(COPY_DATA_IMM) | 634 COPY_DATA_DST_SEL(COPY_DATA_DST_MEM_GRBM) | 635 COPY_DATA_COUNT_SEL); 636 radeon_emit(cs, 0); /* immediate */ 637 radeon_emit(cs, 0); 638 radeon_emit(cs, va); 639 radeon_emit(cs, va >> 32); 640 va += sizeof(uint64_t); 641 } 642 } 643} 644 645static void si_pc_cleanup(struct si_screen *sscreen) 646{ 647 si_perfcounters_do_destroy(sscreen->perfcounters); 648 sscreen->perfcounters = NULL; 649} 650 651void si_init_perfcounters(struct si_screen *screen) 652{ 653 struct si_perfcounters *pc; 654 struct si_pc_block *blocks; 655 unsigned num_blocks; 656 unsigned i; 657 658 switch (screen->info.chip_class) { 659 case CIK: 660 blocks = groups_CIK; 661 num_blocks = ARRAY_SIZE(groups_CIK); 662 break; 663 case VI: 664 blocks = groups_VI; 665 num_blocks = ARRAY_SIZE(groups_VI); 666 break; 667 case GFX9: 668 blocks = groups_gfx9; 669 num_blocks = ARRAY_SIZE(groups_gfx9); 670 break; 671 case SI: 672 default: 673 return; /* not implemented */ 674 } 675 676 if (screen->info.max_sh_per_se != 1) { 677 /* This should not happen on non-SI chips. */ 678 fprintf(stderr, "si_init_perfcounters: max_sh_per_se = %d not " 679 "supported (inaccurate performance counters)\n", 680 screen->info.max_sh_per_se); 681 } 682 683 pc = CALLOC_STRUCT(si_perfcounters); 684 if (!pc) 685 return; 686 687 pc->num_stop_cs_dwords = 14 + si_cp_write_fence_dwords(screen); 688 pc->num_instance_cs_dwords = 3; 689 690 pc->num_shader_types = ARRAY_SIZE(si_pc_shader_type_bits); 691 pc->shader_type_suffixes = si_pc_shader_type_suffixes; 692 pc->shader_type_bits = si_pc_shader_type_bits; 693 694 pc->emit_instance = si_pc_emit_instance; 695 pc->emit_shaders = si_pc_emit_shaders; 696 pc->emit_select = si_pc_emit_select; 697 pc->emit_start = si_pc_emit_start; 698 pc->emit_stop = si_pc_emit_stop; 699 pc->emit_read = si_pc_emit_read; 700 pc->cleanup = si_pc_cleanup; 701 702 if (!si_perfcounters_init(pc, num_blocks)) 703 goto error; 704 705 for (i = 0; i < num_blocks; ++i) { 706 struct si_pc_block *block = &blocks[i]; 707 unsigned instances = block->instances; 708 709 if (!strcmp(block->b->name, "CB") || 710 !strcmp(block->b->name, "DB")) 711 instances = screen->info.max_se; 712 else if (!strcmp(block->b->name, "TCC")) 713 instances = screen->info.num_tcc_blocks; 714 else if (!strcmp(block->b->name, "IA")) 715 instances = MAX2(1, screen->info.max_se / 2); 716 717 si_perfcounters_add_block(screen, pc, 718 block->b->name, 719 block->b->flags, 720 block->b->num_counters, 721 block->selectors, 722 instances, 723 block); 724 } 725 726 screen->perfcounters = pc; 727 return; 728 729error: 730 si_perfcounters_do_destroy(pc); 731} 732