1/* 2 * Copyright 2015 Advanced Micro Devices, Inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 */ 24 25#include "ac_debug.h" 26#include "ac_rtld.h" 27#include "driver_ddebug/dd_util.h" 28#include "si_compute.h" 29#include "si_pipe.h" 30#include "sid.h" 31#include "sid_tables.h" 32#include "tgsi/tgsi_from_mesa.h" 33#include "util/u_dump.h" 34#include "util/u_log.h" 35#include "util/u_memory.h" 36#include "util/u_string.h" 37 38static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f); 39 40DEBUG_GET_ONCE_OPTION(replace_shaders, "RADEON_REPLACE_SHADERS", NULL) 41 42/** 43 * Store a linearized copy of all chunks of \p cs together with the buffer 44 * list in \p saved. 45 */ 46void si_save_cs(struct radeon_winsys *ws, struct radeon_cmdbuf *cs, struct radeon_saved_cs *saved, 47 bool get_buffer_list) 48{ 49 uint32_t *buf; 50 unsigned i; 51 52 /* Save the IB chunks. */ 53 saved->num_dw = cs->prev_dw + cs->current.cdw; 54 saved->ib = MALLOC(4 * saved->num_dw); 55 if (!saved->ib) 56 goto oom; 57 58 buf = saved->ib; 59 for (i = 0; i < cs->num_prev; ++i) { 60 memcpy(buf, cs->prev[i].buf, cs->prev[i].cdw * 4); 61 buf += cs->prev[i].cdw; 62 } 63 memcpy(buf, cs->current.buf, cs->current.cdw * 4); 64 65 if (!get_buffer_list) 66 return; 67 68 /* Save the buffer list. */ 69 saved->bo_count = ws->cs_get_buffer_list(cs, NULL); 70 saved->bo_list = CALLOC(saved->bo_count, sizeof(saved->bo_list[0])); 71 if (!saved->bo_list) { 72 FREE(saved->ib); 73 goto oom; 74 } 75 ws->cs_get_buffer_list(cs, saved->bo_list); 76 77 return; 78 79oom: 80 fprintf(stderr, "%s: out of memory\n", __func__); 81 memset(saved, 0, sizeof(*saved)); 82} 83 84void si_clear_saved_cs(struct radeon_saved_cs *saved) 85{ 86 FREE(saved->ib); 87 FREE(saved->bo_list); 88 89 memset(saved, 0, sizeof(*saved)); 90} 91 92void si_destroy_saved_cs(struct si_saved_cs *scs) 93{ 94 si_clear_saved_cs(&scs->gfx); 95 si_resource_reference(&scs->trace_buf, NULL); 96 free(scs); 97} 98 99static void si_dump_shader(struct si_screen *sscreen, struct si_shader *shader, FILE *f) 100{ 101 if (shader->shader_log) 102 fwrite(shader->shader_log, shader->shader_log_size, 1, f); 103 else 104 si_shader_dump(sscreen, shader, NULL, f, false); 105 106 if (shader->bo && sscreen->options.dump_shader_binary) { 107 unsigned size = shader->bo->b.b.width0; 108 fprintf(f, "BO: VA=%" PRIx64 " Size=%u\n", shader->bo->gpu_address, size); 109 110 const char *mapped = sscreen->ws->buffer_map(sscreen->ws, 111 shader->bo->buf, NULL, 112 PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ | RADEON_MAP_TEMPORARY); 113 114 for (unsigned i = 0; i < size; i += 4) { 115 fprintf(f, " %4x: %08x\n", i, *(uint32_t *)(mapped + i)); 116 } 117 118 sscreen->ws->buffer_unmap(sscreen->ws, shader->bo->buf); 119 120 fprintf(f, "\n"); 121 } 122} 123 124struct si_log_chunk_shader { 125 /* The shader destroy code assumes a current context for unlinking of 126 * PM4 packets etc. 127 * 128 * While we should be able to destroy shaders without a context, doing 129 * so would happen only very rarely and be therefore likely to fail 130 * just when you're trying to debug something. Let's just remember the 131 * current context in the chunk. 132 */ 133 struct si_context *ctx; 134 struct si_shader *shader; 135 136 /* For keep-alive reference counts */ 137 struct si_shader_selector *sel; 138 struct si_compute *program; 139}; 140 141static void si_log_chunk_shader_destroy(void *data) 142{ 143 struct si_log_chunk_shader *chunk = data; 144 si_shader_selector_reference(chunk->ctx, &chunk->sel, NULL); 145 si_compute_reference(&chunk->program, NULL); 146 FREE(chunk); 147} 148 149static void si_log_chunk_shader_print(void *data, FILE *f) 150{ 151 struct si_log_chunk_shader *chunk = data; 152 struct si_screen *sscreen = chunk->ctx->screen; 153 si_dump_shader(sscreen, chunk->shader, f); 154} 155 156static struct u_log_chunk_type si_log_chunk_type_shader = { 157 .destroy = si_log_chunk_shader_destroy, 158 .print = si_log_chunk_shader_print, 159}; 160 161static void si_dump_gfx_shader(struct si_context *ctx, const struct si_shader_ctx_state *state, 162 struct u_log_context *log) 163{ 164 struct si_shader *current = state->current; 165 166 if (!state->cso || !current) 167 return; 168 169 struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); 170 chunk->ctx = ctx; 171 chunk->shader = current; 172 si_shader_selector_reference(ctx, &chunk->sel, current->selector); 173 u_log_chunk(log, &si_log_chunk_type_shader, chunk); 174} 175 176static void si_dump_compute_shader(struct si_context *ctx, struct u_log_context *log) 177{ 178 const struct si_cs_shader_state *state = &ctx->cs_shader_state; 179 180 if (!state->program) 181 return; 182 183 struct si_log_chunk_shader *chunk = CALLOC_STRUCT(si_log_chunk_shader); 184 chunk->ctx = ctx; 185 chunk->shader = &state->program->shader; 186 si_compute_reference(&chunk->program, state->program); 187 u_log_chunk(log, &si_log_chunk_type_shader, chunk); 188} 189 190/** 191 * Shader compiles can be overridden with arbitrary ELF objects by setting 192 * the environment variable RADEON_REPLACE_SHADERS=num1:filename1[;num2:filename2] 193 * 194 * TODO: key this off some hash 195 */ 196bool si_replace_shader(unsigned num, struct si_shader_binary *binary) 197{ 198 const char *p = debug_get_option_replace_shaders(); 199 const char *semicolon; 200 char *copy = NULL; 201 FILE *f; 202 long filesize, nread; 203 bool replaced = false; 204 205 if (!p) 206 return false; 207 208 while (*p) { 209 unsigned long i; 210 char *endp; 211 i = strtoul(p, &endp, 0); 212 213 p = endp; 214 if (*p != ':') { 215 fprintf(stderr, "RADEON_REPLACE_SHADERS formatted badly.\n"); 216 exit(1); 217 } 218 ++p; 219 220 if (i == num) 221 break; 222 223 p = strchr(p, ';'); 224 if (!p) 225 return false; 226 ++p; 227 } 228 if (!*p) 229 return false; 230 231 semicolon = strchr(p, ';'); 232 if (semicolon) { 233 p = copy = strndup(p, semicolon - p); 234 if (!copy) { 235 fprintf(stderr, "out of memory\n"); 236 return false; 237 } 238 } 239 240 fprintf(stderr, "radeonsi: replace shader %u by %s\n", num, p); 241 242 f = fopen(p, "r"); 243 if (!f) { 244 perror("radeonsi: failed to open file"); 245 goto out_free; 246 } 247 248 if (fseek(f, 0, SEEK_END) != 0) 249 goto file_error; 250 251 filesize = ftell(f); 252 if (filesize < 0) 253 goto file_error; 254 255 if (fseek(f, 0, SEEK_SET) != 0) 256 goto file_error; 257 258 binary->elf_buffer = MALLOC(filesize); 259 if (!binary->elf_buffer) { 260 fprintf(stderr, "out of memory\n"); 261 goto out_close; 262 } 263 264 nread = fread((void *)binary->elf_buffer, 1, filesize, f); 265 if (nread != filesize) { 266 FREE((void *)binary->elf_buffer); 267 binary->elf_buffer = NULL; 268 goto file_error; 269 } 270 271 binary->elf_size = nread; 272 replaced = true; 273 274out_close: 275 fclose(f); 276out_free: 277 free(copy); 278 return replaced; 279 280file_error: 281 perror("radeonsi: reading shader"); 282 goto out_close; 283} 284 285/* Parsed IBs are difficult to read without colors. Use "less -R file" to 286 * read them, or use "aha -b -f file" to convert them to html. 287 */ 288#define COLOR_RESET "\033[0m" 289#define COLOR_RED "\033[31m" 290#define COLOR_GREEN "\033[1;32m" 291#define COLOR_YELLOW "\033[1;33m" 292#define COLOR_CYAN "\033[1;36m" 293 294static void si_dump_mmapped_reg(struct si_context *sctx, FILE *f, unsigned offset) 295{ 296 struct radeon_winsys *ws = sctx->ws; 297 uint32_t value; 298 299 if (ws->read_registers(ws, offset, 1, &value)) 300 ac_dump_reg(f, sctx->chip_class, offset, value, ~0); 301} 302 303static void si_dump_debug_registers(struct si_context *sctx, FILE *f) 304{ 305 if (!sctx->screen->info.has_read_registers_query) 306 return; 307 308 fprintf(f, "Memory-mapped registers:\n"); 309 si_dump_mmapped_reg(sctx, f, R_008010_GRBM_STATUS); 310 311 /* No other registers can be read on DRM < 3.1.0. */ 312 if (!sctx->screen->info.is_amdgpu || sctx->screen->info.drm_minor < 1) { 313 fprintf(f, "\n"); 314 return; 315 } 316 317 si_dump_mmapped_reg(sctx, f, R_008008_GRBM_STATUS2); 318 si_dump_mmapped_reg(sctx, f, R_008014_GRBM_STATUS_SE0); 319 si_dump_mmapped_reg(sctx, f, R_008018_GRBM_STATUS_SE1); 320 si_dump_mmapped_reg(sctx, f, R_008038_GRBM_STATUS_SE2); 321 si_dump_mmapped_reg(sctx, f, R_00803C_GRBM_STATUS_SE3); 322 si_dump_mmapped_reg(sctx, f, R_00D034_SDMA0_STATUS_REG); 323 si_dump_mmapped_reg(sctx, f, R_00D834_SDMA1_STATUS_REG); 324 if (sctx->chip_class <= GFX8) { 325 si_dump_mmapped_reg(sctx, f, R_000E50_SRBM_STATUS); 326 si_dump_mmapped_reg(sctx, f, R_000E4C_SRBM_STATUS2); 327 si_dump_mmapped_reg(sctx, f, R_000E54_SRBM_STATUS3); 328 } 329 si_dump_mmapped_reg(sctx, f, R_008680_CP_STAT); 330 si_dump_mmapped_reg(sctx, f, R_008674_CP_STALLED_STAT1); 331 si_dump_mmapped_reg(sctx, f, R_008678_CP_STALLED_STAT2); 332 si_dump_mmapped_reg(sctx, f, R_008670_CP_STALLED_STAT3); 333 si_dump_mmapped_reg(sctx, f, R_008210_CP_CPC_STATUS); 334 si_dump_mmapped_reg(sctx, f, R_008214_CP_CPC_BUSY_STAT); 335 si_dump_mmapped_reg(sctx, f, R_008218_CP_CPC_STALLED_STAT1); 336 si_dump_mmapped_reg(sctx, f, R_00821C_CP_CPF_STATUS); 337 si_dump_mmapped_reg(sctx, f, R_008220_CP_CPF_BUSY_STAT); 338 si_dump_mmapped_reg(sctx, f, R_008224_CP_CPF_STALLED_STAT1); 339 fprintf(f, "\n"); 340} 341 342struct si_log_chunk_cs { 343 struct si_context *ctx; 344 struct si_saved_cs *cs; 345 bool dump_bo_list; 346 unsigned gfx_begin, gfx_end; 347}; 348 349static void si_log_chunk_type_cs_destroy(void *data) 350{ 351 struct si_log_chunk_cs *chunk = data; 352 si_saved_cs_reference(&chunk->cs, NULL); 353 free(chunk); 354} 355 356static void si_parse_current_ib(FILE *f, struct radeon_cmdbuf *cs, unsigned begin, unsigned end, 357 int *last_trace_id, unsigned trace_id_count, const char *name, 358 enum chip_class chip_class) 359{ 360 unsigned orig_end = end; 361 362 assert(begin <= end); 363 364 fprintf(f, "------------------ %s begin (dw = %u) ------------------\n", name, begin); 365 366 for (unsigned prev_idx = 0; prev_idx < cs->num_prev; ++prev_idx) { 367 struct radeon_cmdbuf_chunk *chunk = &cs->prev[prev_idx]; 368 369 if (begin < chunk->cdw) { 370 ac_parse_ib_chunk(f, chunk->buf + begin, MIN2(end, chunk->cdw) - begin, last_trace_id, 371 trace_id_count, chip_class, NULL, NULL); 372 } 373 374 if (end <= chunk->cdw) 375 return; 376 377 if (begin < chunk->cdw) 378 fprintf(f, "\n---------- Next %s Chunk ----------\n\n", name); 379 380 begin -= MIN2(begin, chunk->cdw); 381 end -= chunk->cdw; 382 } 383 384 assert(end <= cs->current.cdw); 385 386 ac_parse_ib_chunk(f, cs->current.buf + begin, end - begin, last_trace_id, trace_id_count, 387 chip_class, NULL, NULL); 388 389 fprintf(f, "------------------- %s end (dw = %u) -------------------\n\n", name, orig_end); 390} 391 392void si_print_current_ib(struct si_context *sctx, FILE *f) 393{ 394 si_parse_current_ib(f, &sctx->gfx_cs, 0, sctx->gfx_cs.prev_dw + sctx->gfx_cs.current.cdw, 395 NULL, 0, "GFX", sctx->chip_class); 396} 397 398static void si_log_chunk_type_cs_print(void *data, FILE *f) 399{ 400 struct si_log_chunk_cs *chunk = data; 401 struct si_context *ctx = chunk->ctx; 402 struct si_saved_cs *scs = chunk->cs; 403 int last_trace_id = -1; 404 405 /* We are expecting that the ddebug pipe has already 406 * waited for the context, so this buffer should be idle. 407 * If the GPU is hung, there is no point in waiting for it. 408 */ 409 uint32_t *map = ctx->ws->buffer_map(ctx->ws, scs->trace_buf->buf, NULL, 410 PIPE_MAP_UNSYNCHRONIZED | PIPE_MAP_READ); 411 if (map) 412 last_trace_id = map[0]; 413 414 if (chunk->gfx_end != chunk->gfx_begin) { 415 if (chunk->gfx_begin == 0) { 416 if (ctx->cs_preamble_state) 417 ac_parse_ib(f, ctx->cs_preamble_state->pm4, ctx->cs_preamble_state->ndw, NULL, 0, 418 "IB2: Init config", ctx->chip_class, NULL, NULL); 419 420 if (ctx->cs_preamble_gs_rings) 421 ac_parse_ib(f, ctx->cs_preamble_gs_rings->pm4, ctx->cs_preamble_gs_rings->ndw, NULL, 0, 422 "IB2: Init GS rings", ctx->chip_class, NULL, NULL); 423 } 424 425 if (scs->flushed) { 426 ac_parse_ib(f, scs->gfx.ib + chunk->gfx_begin, chunk->gfx_end - chunk->gfx_begin, 427 &last_trace_id, map ? 1 : 0, "IB", ctx->chip_class, NULL, NULL); 428 } else { 429 si_parse_current_ib(f, &ctx->gfx_cs, chunk->gfx_begin, chunk->gfx_end, &last_trace_id, 430 map ? 1 : 0, "IB", ctx->chip_class); 431 } 432 } 433 434 if (chunk->dump_bo_list) { 435 fprintf(f, "Flushing. Time: "); 436 util_dump_ns(f, scs->time_flush); 437 fprintf(f, "\n\n"); 438 si_dump_bo_list(ctx, &scs->gfx, f); 439 } 440} 441 442static const struct u_log_chunk_type si_log_chunk_type_cs = { 443 .destroy = si_log_chunk_type_cs_destroy, 444 .print = si_log_chunk_type_cs_print, 445}; 446 447static void si_log_cs(struct si_context *ctx, struct u_log_context *log, bool dump_bo_list) 448{ 449 assert(ctx->current_saved_cs); 450 451 struct si_saved_cs *scs = ctx->current_saved_cs; 452 unsigned gfx_cur = ctx->gfx_cs.prev_dw + ctx->gfx_cs.current.cdw; 453 454 if (!dump_bo_list && gfx_cur == scs->gfx_last_dw) 455 return; 456 457 struct si_log_chunk_cs *chunk = calloc(1, sizeof(*chunk)); 458 459 chunk->ctx = ctx; 460 si_saved_cs_reference(&chunk->cs, scs); 461 chunk->dump_bo_list = dump_bo_list; 462 463 chunk->gfx_begin = scs->gfx_last_dw; 464 chunk->gfx_end = gfx_cur; 465 scs->gfx_last_dw = gfx_cur; 466 467 u_log_chunk(log, &si_log_chunk_type_cs, chunk); 468} 469 470void si_auto_log_cs(void *data, struct u_log_context *log) 471{ 472 struct si_context *ctx = (struct si_context *)data; 473 si_log_cs(ctx, log, false); 474} 475 476void si_log_hw_flush(struct si_context *sctx) 477{ 478 if (!sctx->log) 479 return; 480 481 si_log_cs(sctx, sctx->log, true); 482 483 if (&sctx->b == sctx->screen->aux_context) { 484 /* The aux context isn't captured by the ddebug wrapper, 485 * so we dump it on a flush-by-flush basis here. 486 */ 487 FILE *f = dd_get_debug_file(false); 488 if (!f) { 489 fprintf(stderr, "radeonsi: error opening aux context dump file.\n"); 490 } else { 491 dd_write_header(f, &sctx->screen->b, 0); 492 493 fprintf(f, "Aux context dump:\n\n"); 494 u_log_new_page_print(sctx->log, f); 495 496 fclose(f); 497 } 498 } 499} 500 501static const char *priority_to_string(enum radeon_bo_priority priority) 502{ 503#define ITEM(x) [RADEON_PRIO_##x] = #x 504 static const char *table[64] = { 505 ITEM(FENCE), 506 ITEM(TRACE), 507 ITEM(SO_FILLED_SIZE), 508 ITEM(QUERY), 509 ITEM(IB1), 510 ITEM(IB2), 511 ITEM(DRAW_INDIRECT), 512 ITEM(INDEX_BUFFER), 513 ITEM(CP_DMA), 514 ITEM(CONST_BUFFER), 515 ITEM(DESCRIPTORS), 516 ITEM(BORDER_COLORS), 517 ITEM(SAMPLER_BUFFER), 518 ITEM(VERTEX_BUFFER), 519 ITEM(SHADER_RW_BUFFER), 520 ITEM(COMPUTE_GLOBAL), 521 ITEM(SAMPLER_TEXTURE), 522 ITEM(SHADER_RW_IMAGE), 523 ITEM(SAMPLER_TEXTURE_MSAA), 524 ITEM(COLOR_BUFFER), 525 ITEM(DEPTH_BUFFER), 526 ITEM(COLOR_BUFFER_MSAA), 527 ITEM(DEPTH_BUFFER_MSAA), 528 ITEM(SEPARATE_META), 529 ITEM(SHADER_BINARY), 530 ITEM(SHADER_RINGS), 531 ITEM(SCRATCH_BUFFER), 532 }; 533#undef ITEM 534 535 assert(priority < ARRAY_SIZE(table)); 536 return table[priority]; 537} 538 539static int bo_list_compare_va(const struct radeon_bo_list_item *a, 540 const struct radeon_bo_list_item *b) 541{ 542 return a->vm_address < b->vm_address ? -1 : a->vm_address > b->vm_address ? 1 : 0; 543} 544 545static void si_dump_bo_list(struct si_context *sctx, const struct radeon_saved_cs *saved, FILE *f) 546{ 547 unsigned i, j; 548 549 if (!saved->bo_list) 550 return; 551 552 /* Sort the list according to VM adddresses first. */ 553 qsort(saved->bo_list, saved->bo_count, sizeof(saved->bo_list[0]), (void *)bo_list_compare_va); 554 555 fprintf(f, "Buffer list (in units of pages = 4kB):\n" COLOR_YELLOW 556 " Size VM start page " 557 "VM end page Usage" COLOR_RESET "\n"); 558 559 for (i = 0; i < saved->bo_count; i++) { 560 /* Note: Buffer sizes are expected to be aligned to 4k by the winsys. */ 561 const unsigned page_size = sctx->screen->info.gart_page_size; 562 uint64_t va = saved->bo_list[i].vm_address; 563 uint64_t size = saved->bo_list[i].bo_size; 564 bool hit = false; 565 566 /* If there's unused virtual memory between 2 buffers, print it. */ 567 if (i) { 568 uint64_t previous_va_end = 569 saved->bo_list[i - 1].vm_address + saved->bo_list[i - 1].bo_size; 570 571 if (va > previous_va_end) { 572 fprintf(f, " %10" PRIu64 " -- hole --\n", (va - previous_va_end) / page_size); 573 } 574 } 575 576 /* Print the buffer. */ 577 fprintf(f, " %10" PRIu64 " 0x%013" PRIX64 " 0x%013" PRIX64 " ", 578 size / page_size, va / page_size, (va + size) / page_size); 579 580 /* Print the usage. */ 581 for (j = 0; j < 32; j++) { 582 if (!(saved->bo_list[i].priority_usage & (1u << j))) 583 continue; 584 585 fprintf(f, "%s%s", !hit ? "" : ", ", priority_to_string(j)); 586 hit = true; 587 } 588 fprintf(f, "\n"); 589 } 590 fprintf(f, "\nNote: The holes represent memory not used by the IB.\n" 591 " Other buffers can still be allocated there.\n\n"); 592} 593 594static void si_dump_framebuffer(struct si_context *sctx, struct u_log_context *log) 595{ 596 struct pipe_framebuffer_state *state = &sctx->framebuffer.state; 597 struct si_texture *tex; 598 int i; 599 600 for (i = 0; i < state->nr_cbufs; i++) { 601 if (!state->cbufs[i]) 602 continue; 603 604 tex = (struct si_texture *)state->cbufs[i]->texture; 605 u_log_printf(log, COLOR_YELLOW "Color buffer %i:" COLOR_RESET "\n", i); 606 si_print_texture_info(sctx->screen, tex, log); 607 u_log_printf(log, "\n"); 608 } 609 610 if (state->zsbuf) { 611 tex = (struct si_texture *)state->zsbuf->texture; 612 u_log_printf(log, COLOR_YELLOW "Depth-stencil buffer:" COLOR_RESET "\n"); 613 si_print_texture_info(sctx->screen, tex, log); 614 u_log_printf(log, "\n"); 615 } 616} 617 618typedef unsigned (*slot_remap_func)(unsigned); 619 620struct si_log_chunk_desc_list { 621 /** Pointer to memory map of buffer where the list is uploader */ 622 uint32_t *gpu_list; 623 /** Reference of buffer where the list is uploaded, so that gpu_list 624 * is kept live. */ 625 struct si_resource *buf; 626 627 const char *shader_name; 628 const char *elem_name; 629 slot_remap_func slot_remap; 630 enum chip_class chip_class; 631 unsigned element_dw_size; 632 unsigned num_elements; 633 634 uint32_t list[0]; 635}; 636 637static void si_log_chunk_desc_list_destroy(void *data) 638{ 639 struct si_log_chunk_desc_list *chunk = data; 640 si_resource_reference(&chunk->buf, NULL); 641 FREE(chunk); 642} 643 644static void si_log_chunk_desc_list_print(void *data, FILE *f) 645{ 646 struct si_log_chunk_desc_list *chunk = data; 647 unsigned sq_img_rsrc_word0 = 648 chunk->chip_class >= GFX10 ? R_00A000_SQ_IMG_RSRC_WORD0 : R_008F10_SQ_IMG_RSRC_WORD0; 649 650 for (unsigned i = 0; i < chunk->num_elements; i++) { 651 unsigned cpu_dw_offset = i * chunk->element_dw_size; 652 unsigned gpu_dw_offset = chunk->slot_remap(i) * chunk->element_dw_size; 653 const char *list_note = chunk->gpu_list ? "GPU list" : "CPU list"; 654 uint32_t *cpu_list = chunk->list + cpu_dw_offset; 655 uint32_t *gpu_list = chunk->gpu_list ? chunk->gpu_list + gpu_dw_offset : cpu_list; 656 657 fprintf(f, COLOR_GREEN "%s%s slot %u (%s):" COLOR_RESET "\n", chunk->shader_name, 658 chunk->elem_name, i, list_note); 659 660 switch (chunk->element_dw_size) { 661 case 4: 662 for (unsigned j = 0; j < 4; j++) 663 ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[j], 664 0xffffffff); 665 break; 666 case 8: 667 for (unsigned j = 0; j < 8; j++) 668 ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); 669 670 fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); 671 for (unsigned j = 0; j < 4; j++) 672 ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], 673 0xffffffff); 674 break; 675 case 16: 676 for (unsigned j = 0; j < 8; j++) 677 ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[j], 0xffffffff); 678 679 fprintf(f, COLOR_CYAN " Buffer:" COLOR_RESET "\n"); 680 for (unsigned j = 0; j < 4; j++) 681 ac_dump_reg(f, chunk->chip_class, R_008F00_SQ_BUF_RSRC_WORD0 + j * 4, gpu_list[4 + j], 682 0xffffffff); 683 684 fprintf(f, COLOR_CYAN " FMASK:" COLOR_RESET "\n"); 685 for (unsigned j = 0; j < 8; j++) 686 ac_dump_reg(f, chunk->chip_class, sq_img_rsrc_word0 + j * 4, gpu_list[8 + j], 687 0xffffffff); 688 689 fprintf(f, COLOR_CYAN " Sampler state:" COLOR_RESET "\n"); 690 for (unsigned j = 0; j < 4; j++) 691 ac_dump_reg(f, chunk->chip_class, R_008F30_SQ_IMG_SAMP_WORD0 + j * 4, gpu_list[12 + j], 692 0xffffffff); 693 break; 694 } 695 696 if (memcmp(gpu_list, cpu_list, chunk->element_dw_size * 4) != 0) { 697 fprintf(f, COLOR_RED "!!!!! This slot was corrupted in GPU memory !!!!!" COLOR_RESET "\n"); 698 } 699 700 fprintf(f, "\n"); 701 } 702} 703 704static const struct u_log_chunk_type si_log_chunk_type_descriptor_list = { 705 .destroy = si_log_chunk_desc_list_destroy, 706 .print = si_log_chunk_desc_list_print, 707}; 708 709static void si_dump_descriptor_list(struct si_screen *screen, struct si_descriptors *desc, 710 const char *shader_name, const char *elem_name, 711 unsigned element_dw_size, unsigned num_elements, 712 slot_remap_func slot_remap, struct u_log_context *log) 713{ 714 if (!desc->list) 715 return; 716 717 /* In some cases, the caller doesn't know how many elements are really 718 * uploaded. Reduce num_elements to fit in the range of active slots. */ 719 unsigned active_range_dw_begin = desc->first_active_slot * desc->element_dw_size; 720 unsigned active_range_dw_end = 721 active_range_dw_begin + desc->num_active_slots * desc->element_dw_size; 722 723 while (num_elements > 0) { 724 int i = slot_remap(num_elements - 1); 725 unsigned dw_begin = i * element_dw_size; 726 unsigned dw_end = dw_begin + element_dw_size; 727 728 if (dw_begin >= active_range_dw_begin && dw_end <= active_range_dw_end) 729 break; 730 731 num_elements--; 732 } 733 734 struct si_log_chunk_desc_list *chunk = 735 CALLOC_VARIANT_LENGTH_STRUCT(si_log_chunk_desc_list, 4 * element_dw_size * num_elements); 736 chunk->shader_name = shader_name; 737 chunk->elem_name = elem_name; 738 chunk->element_dw_size = element_dw_size; 739 chunk->num_elements = num_elements; 740 chunk->slot_remap = slot_remap; 741 chunk->chip_class = screen->info.chip_class; 742 743 si_resource_reference(&chunk->buf, desc->buffer); 744 chunk->gpu_list = desc->gpu_list; 745 746 for (unsigned i = 0; i < num_elements; ++i) { 747 memcpy(&chunk->list[i * element_dw_size], &desc->list[slot_remap(i) * element_dw_size], 748 4 * element_dw_size); 749 } 750 751 u_log_chunk(log, &si_log_chunk_type_descriptor_list, chunk); 752} 753 754static unsigned si_identity(unsigned slot) 755{ 756 return slot; 757} 758 759static void si_dump_descriptors(struct si_context *sctx, gl_shader_stage stage, 760 const struct si_shader_info *info, struct u_log_context *log) 761{ 762 enum pipe_shader_type processor = pipe_shader_type_from_mesa(stage); 763 struct si_descriptors *descs = 764 &sctx->descriptors[SI_DESCS_FIRST_SHADER + processor * SI_NUM_SHADER_DESCS]; 765 static const char *shader_name[] = {"VS", "PS", "GS", "TCS", "TES", "CS"}; 766 const char *name = shader_name[processor]; 767 unsigned enabled_constbuf, enabled_shaderbuf, enabled_samplers; 768 unsigned enabled_images; 769 770 if (info) { 771 enabled_constbuf = u_bit_consecutive(0, info->base.num_ubos); 772 enabled_shaderbuf = u_bit_consecutive(0, info->base.num_ssbos); 773 enabled_samplers = info->base.textures_used[0]; 774 enabled_images = u_bit_consecutive(0, info->base.num_images); 775 } else { 776 enabled_constbuf = 777 sctx->const_and_shader_buffers[processor].enabled_mask >> SI_NUM_SHADER_BUFFERS; 778 enabled_shaderbuf = 0; 779 for (int i = 0; i < SI_NUM_SHADER_BUFFERS; i++) { 780 enabled_shaderbuf |= 781 (sctx->const_and_shader_buffers[processor].enabled_mask & 782 1llu << (SI_NUM_SHADER_BUFFERS - i - 1)) << i; 783 } 784 enabled_samplers = sctx->samplers[processor].enabled_mask; 785 enabled_images = sctx->images[processor].enabled_mask; 786 } 787 788 if (stage == MESA_SHADER_VERTEX && sctx->vb_descriptors_buffer && 789 sctx->vb_descriptors_gpu_list) { 790 assert(info); /* only CS may not have an info struct */ 791 struct si_descriptors desc = {}; 792 793 desc.buffer = sctx->vb_descriptors_buffer; 794 desc.list = sctx->vb_descriptors_gpu_list; 795 desc.gpu_list = sctx->vb_descriptors_gpu_list; 796 desc.element_dw_size = 4; 797 desc.num_active_slots = sctx->vertex_elements->vb_desc_list_alloc_size / 16; 798 799 si_dump_descriptor_list(sctx->screen, &desc, name, " - Vertex buffer", 4, info->num_inputs, 800 si_identity, log); 801 } 802 803 si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, 804 " - Constant buffer", 4, util_last_bit(enabled_constbuf), 805 si_get_constbuf_slot, log); 806 si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_CONST_AND_SHADER_BUFFERS], name, 807 " - Shader buffer", 4, util_last_bit(enabled_shaderbuf), 808 si_get_shaderbuf_slot, log); 809 si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, 810 " - Sampler", 16, util_last_bit(enabled_samplers), si_get_sampler_slot, 811 log); 812 si_dump_descriptor_list(sctx->screen, &descs[SI_SHADER_DESCS_SAMPLERS_AND_IMAGES], name, 813 " - Image", 8, util_last_bit(enabled_images), si_get_image_slot, log); 814} 815 816static void si_dump_gfx_descriptors(struct si_context *sctx, 817 const struct si_shader_ctx_state *state, 818 struct u_log_context *log) 819{ 820 if (!state->cso || !state->current) 821 return; 822 823 si_dump_descriptors(sctx, state->cso->info.stage, &state->cso->info, log); 824} 825 826static void si_dump_compute_descriptors(struct si_context *sctx, struct u_log_context *log) 827{ 828 if (!sctx->cs_shader_state.program) 829 return; 830 831 si_dump_descriptors(sctx, MESA_SHADER_COMPUTE, NULL, log); 832} 833 834struct si_shader_inst { 835 const char *text; /* start of disassembly for this instruction */ 836 unsigned textlen; 837 unsigned size; /* instruction size = 4 or 8 */ 838 uint64_t addr; /* instruction address */ 839}; 840 841/** 842 * Open the given \p binary as \p rtld_binary and split the contained 843 * disassembly string into instructions and add them to the array 844 * pointed to by \p instructions, which must be sufficiently large. 845 * 846 * Labels are considered to be part of the following instruction. 847 * 848 * The caller must keep \p rtld_binary alive as long as \p instructions are 849 * used and then close it afterwards. 850 */ 851static void si_add_split_disasm(struct si_screen *screen, struct ac_rtld_binary *rtld_binary, 852 struct si_shader_binary *binary, uint64_t *addr, unsigned *num, 853 struct si_shader_inst *instructions, 854 gl_shader_stage stage, unsigned wave_size) 855{ 856 if (!ac_rtld_open(rtld_binary, (struct ac_rtld_open_info){ 857 .info = &screen->info, 858 .shader_type = stage, 859 .wave_size = wave_size, 860 .num_parts = 1, 861 .elf_ptrs = &binary->elf_buffer, 862 .elf_sizes = &binary->elf_size})) 863 return; 864 865 const char *disasm; 866 size_t nbytes; 867 if (!ac_rtld_get_section_by_name(rtld_binary, ".AMDGPU.disasm", &disasm, &nbytes)) 868 return; 869 870 const char *end = disasm + nbytes; 871 while (disasm < end) { 872 const char *semicolon = memchr(disasm, ';', end - disasm); 873 if (!semicolon) 874 break; 875 876 struct si_shader_inst *inst = &instructions[(*num)++]; 877 const char *inst_end = memchr(semicolon + 1, '\n', end - semicolon - 1); 878 if (!inst_end) 879 inst_end = end; 880 881 inst->text = disasm; 882 inst->textlen = inst_end - disasm; 883 884 inst->addr = *addr; 885 /* More than 16 chars after ";" means the instruction is 8 bytes long. */ 886 inst->size = inst_end - semicolon > 16 ? 8 : 4; 887 *addr += inst->size; 888 889 if (inst_end == end) 890 break; 891 disasm = inst_end + 1; 892 } 893} 894 895/* If the shader is being executed, print its asm instructions, and annotate 896 * those that are being executed right now with information about waves that 897 * execute them. This is most useful during a GPU hang. 898 */ 899static void si_print_annotated_shader(struct si_shader *shader, struct ac_wave_info *waves, 900 unsigned num_waves, FILE *f) 901{ 902 if (!shader) 903 return; 904 905 struct si_screen *screen = shader->selector->screen; 906 gl_shader_stage stage = shader->selector->info.stage; 907 uint64_t start_addr = shader->bo->gpu_address; 908 uint64_t end_addr = start_addr + shader->bo->b.b.width0; 909 unsigned i; 910 911 /* See if any wave executes the shader. */ 912 for (i = 0; i < num_waves; i++) { 913 if (start_addr <= waves[i].pc && waves[i].pc <= end_addr) 914 break; 915 } 916 if (i == num_waves) 917 return; /* the shader is not being executed */ 918 919 /* Remember the first found wave. The waves are sorted according to PC. */ 920 waves = &waves[i]; 921 num_waves -= i; 922 923 /* Get the list of instructions. 924 * Buffer size / 4 is the upper bound of the instruction count. 925 */ 926 unsigned num_inst = 0; 927 uint64_t inst_addr = start_addr; 928 unsigned wave_size = si_get_shader_wave_size(shader); 929 struct ac_rtld_binary rtld_binaries[5] = {}; 930 struct si_shader_inst *instructions = 931 calloc(shader->bo->b.b.width0 / 4, sizeof(struct si_shader_inst)); 932 933 if (shader->prolog) { 934 si_add_split_disasm(screen, &rtld_binaries[0], &shader->prolog->binary, &inst_addr, &num_inst, 935 instructions, stage, wave_size); 936 } 937 if (shader->previous_stage) { 938 si_add_split_disasm(screen, &rtld_binaries[1], &shader->previous_stage->binary, &inst_addr, 939 &num_inst, instructions, stage, wave_size); 940 } 941 if (shader->prolog2) { 942 si_add_split_disasm(screen, &rtld_binaries[2], &shader->prolog2->binary, &inst_addr, 943 &num_inst, instructions, stage, wave_size); 944 } 945 si_add_split_disasm(screen, &rtld_binaries[3], &shader->binary, &inst_addr, &num_inst, 946 instructions, stage, wave_size); 947 if (shader->epilog) { 948 si_add_split_disasm(screen, &rtld_binaries[4], &shader->epilog->binary, &inst_addr, &num_inst, 949 instructions, stage, wave_size); 950 } 951 952 fprintf(f, COLOR_YELLOW "%s - annotated disassembly:" COLOR_RESET "\n", 953 si_get_shader_name(shader)); 954 955 /* Print instructions with annotations. */ 956 for (i = 0; i < num_inst; i++) { 957 struct si_shader_inst *inst = &instructions[i]; 958 959 fprintf(f, "%.*s [PC=0x%" PRIx64 ", size=%u]\n", inst->textlen, inst->text, inst->addr, 960 inst->size); 961 962 /* Print which waves execute the instruction right now. */ 963 while (num_waves && inst->addr == waves->pc) { 964 fprintf(f, 965 " " COLOR_GREEN "^ SE%u SH%u CU%u " 966 "SIMD%u WAVE%u EXEC=%016" PRIx64 " ", 967 waves->se, waves->sh, waves->cu, waves->simd, waves->wave, waves->exec); 968 969 if (inst->size == 4) { 970 fprintf(f, "INST32=%08X" COLOR_RESET "\n", waves->inst_dw0); 971 } else { 972 fprintf(f, "INST64=%08X %08X" COLOR_RESET "\n", waves->inst_dw0, waves->inst_dw1); 973 } 974 975 waves->matched = true; 976 waves = &waves[1]; 977 num_waves--; 978 } 979 } 980 981 fprintf(f, "\n\n"); 982 free(instructions); 983 for (unsigned i = 0; i < ARRAY_SIZE(rtld_binaries); ++i) 984 ac_rtld_close(&rtld_binaries[i]); 985} 986 987static void si_dump_annotated_shaders(struct si_context *sctx, FILE *f) 988{ 989 struct ac_wave_info waves[AC_MAX_WAVES_PER_CHIP]; 990 unsigned num_waves = ac_get_wave_info(sctx->chip_class, waves); 991 992 fprintf(f, COLOR_CYAN "The number of active waves = %u" COLOR_RESET "\n\n", num_waves); 993 994 si_print_annotated_shader(sctx->shader.vs.current, waves, num_waves, f); 995 si_print_annotated_shader(sctx->shader.tcs.current, waves, num_waves, f); 996 si_print_annotated_shader(sctx->shader.tes.current, waves, num_waves, f); 997 si_print_annotated_shader(sctx->shader.gs.current, waves, num_waves, f); 998 si_print_annotated_shader(sctx->shader.ps.current, waves, num_waves, f); 999 1000 /* Print waves executing shaders that are not currently bound. */ 1001 unsigned i; 1002 bool found = false; 1003 for (i = 0; i < num_waves; i++) { 1004 if (waves[i].matched) 1005 continue; 1006 1007 if (!found) { 1008 fprintf(f, COLOR_CYAN "Waves not executing currently-bound shaders:" COLOR_RESET "\n"); 1009 found = true; 1010 } 1011 fprintf(f, 1012 " SE%u SH%u CU%u SIMD%u WAVE%u EXEC=%016" PRIx64 " INST=%08X %08X PC=%" PRIx64 1013 "\n", 1014 waves[i].se, waves[i].sh, waves[i].cu, waves[i].simd, waves[i].wave, waves[i].exec, 1015 waves[i].inst_dw0, waves[i].inst_dw1, waves[i].pc); 1016 } 1017 if (found) 1018 fprintf(f, "\n\n"); 1019} 1020 1021static void si_dump_command(const char *title, const char *command, FILE *f) 1022{ 1023 char line[2000]; 1024 1025 FILE *p = popen(command, "r"); 1026 if (!p) 1027 return; 1028 1029 fprintf(f, COLOR_YELLOW "%s: " COLOR_RESET "\n", title); 1030 while (fgets(line, sizeof(line), p)) 1031 fputs(line, f); 1032 fprintf(f, "\n\n"); 1033 pclose(p); 1034} 1035 1036static void si_dump_debug_state(struct pipe_context *ctx, FILE *f, unsigned flags) 1037{ 1038 struct si_context *sctx = (struct si_context *)ctx; 1039 1040 if (sctx->log) 1041 u_log_flush(sctx->log); 1042 1043 if (flags & PIPE_DUMP_DEVICE_STATUS_REGISTERS) { 1044 si_dump_debug_registers(sctx, f); 1045 1046 si_dump_annotated_shaders(sctx, f); 1047 si_dump_command("Active waves (raw data)", "umr -O halt_waves -wa | column -t", f); 1048 si_dump_command("Wave information", "umr -O halt_waves,bits -wa", f); 1049 } 1050} 1051 1052void si_log_draw_state(struct si_context *sctx, struct u_log_context *log) 1053{ 1054 struct si_shader_ctx_state *tcs_shader; 1055 1056 if (!log) 1057 return; 1058 1059 tcs_shader = &sctx->shader.tcs; 1060 if (sctx->shader.tes.cso && !sctx->shader.tcs.cso) 1061 tcs_shader = &sctx->fixed_func_tcs_shader; 1062 1063 si_dump_framebuffer(sctx, log); 1064 1065 si_dump_gfx_shader(sctx, &sctx->shader.vs, log); 1066 si_dump_gfx_shader(sctx, tcs_shader, log); 1067 si_dump_gfx_shader(sctx, &sctx->shader.tes, log); 1068 si_dump_gfx_shader(sctx, &sctx->shader.gs, log); 1069 si_dump_gfx_shader(sctx, &sctx->shader.ps, log); 1070 1071 si_dump_descriptor_list(sctx->screen, &sctx->descriptors[SI_DESCS_INTERNAL], "", "RW buffers", 1072 4, sctx->descriptors[SI_DESCS_INTERNAL].num_active_slots, si_identity, 1073 log); 1074 si_dump_gfx_descriptors(sctx, &sctx->shader.vs, log); 1075 si_dump_gfx_descriptors(sctx, tcs_shader, log); 1076 si_dump_gfx_descriptors(sctx, &sctx->shader.tes, log); 1077 si_dump_gfx_descriptors(sctx, &sctx->shader.gs, log); 1078 si_dump_gfx_descriptors(sctx, &sctx->shader.ps, log); 1079} 1080 1081void si_log_compute_state(struct si_context *sctx, struct u_log_context *log) 1082{ 1083 if (!log) 1084 return; 1085 1086 si_dump_compute_shader(sctx, log); 1087 si_dump_compute_descriptors(sctx, log); 1088} 1089 1090void si_check_vm_faults(struct si_context *sctx, struct radeon_saved_cs *saved, enum ring_type ring) 1091{ 1092 struct pipe_screen *screen = sctx->b.screen; 1093 FILE *f; 1094 uint64_t addr; 1095 char cmd_line[4096]; 1096 1097 if (!ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, &addr)) 1098 return; 1099 1100 f = dd_get_debug_file(false); 1101 if (!f) 1102 return; 1103 1104 fprintf(f, "VM fault report.\n\n"); 1105 if (os_get_command_line(cmd_line, sizeof(cmd_line))) 1106 fprintf(f, "Command: %s\n", cmd_line); 1107 fprintf(f, "Driver vendor: %s\n", screen->get_vendor(screen)); 1108 fprintf(f, "Device vendor: %s\n", screen->get_device_vendor(screen)); 1109 fprintf(f, "Device name: %s\n\n", screen->get_name(screen)); 1110 fprintf(f, "Failing VM page: 0x%08" PRIx64 "\n\n", addr); 1111 1112 if (sctx->apitrace_call_number) 1113 fprintf(f, "Last apitrace call: %u\n\n", sctx->apitrace_call_number); 1114 1115 switch (ring) { 1116 case RING_GFX: { 1117 struct u_log_context log; 1118 u_log_context_init(&log); 1119 1120 si_log_draw_state(sctx, &log); 1121 si_log_compute_state(sctx, &log); 1122 si_log_cs(sctx, &log, true); 1123 1124 u_log_new_page_print(&log, f); 1125 u_log_context_destroy(&log); 1126 break; 1127 } 1128 1129 default: 1130 break; 1131 } 1132 1133 fclose(f); 1134 1135 fprintf(stderr, "Detected a VM fault, exiting...\n"); 1136 exit(0); 1137} 1138 1139void si_init_debug_functions(struct si_context *sctx) 1140{ 1141 sctx->b.dump_debug_state = si_dump_debug_state; 1142 1143 /* Set the initial dmesg timestamp for this context, so that 1144 * only new messages will be checked for VM faults. 1145 */ 1146 if (sctx->screen->debug_flags & DBG(CHECK_VM)) 1147 ac_vm_fault_occured(sctx->chip_class, &sctx->dmesg_timestamp, NULL); 1148} 1149