/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_screen.h"
#include "pipe/p_state.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"
#include "util/format/u_format.h"
#include "util/u_inlines.h"
#include "util/u_memory.h"
#include "util/u_string.h"

#include "nir/tgsi_to_nir.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_cache.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_nir.h"
#include "ir3/ir3_shader.h"

/**
 * The hardware cso for shader state
 *
 * Initially just a container for the ir3_shader, but this is where we'll
 * plumb in async compile.
 */
struct ir3_shader_state {
   struct ir3_shader *shader;

   /* Fence signalled when async compile is completed: */
   struct util_queue_fence ready;
};
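
/* Note: util_queue fences are initialized in the signalled state, and
 * util_queue_add_job() resets the fence until the job completes, so
 * waiting on 'ready' (see ir3_get_shader()) is harmless for shaders
 * whose initial variants were compiled synchronously.
 */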
/**
 * Should initial variants be compiled synchronously?
 *
 * The only case where pipe_debug_message() is used in the initial-variants
 * path is with FD_MESA_DEBUG=shaderdb.  So if either debug is disabled (ie.
 * debug.debug_message==NULL), or shaderdb stats are not enabled, we can
 * compile the initial shader variant asynchronously.
 */
static bool
initial_variants_synchronous(struct fd_context *ctx)
{
   return unlikely(ctx->debug.debug_message) || FD_DBG(SHADERDB) ||
          FD_DBG(SERIALC);
}

static void
dump_shader_info(struct ir3_shader_variant *v,
                 struct pipe_debug_callback *debug)
{
   if (!FD_DBG(SHADERDB))
      return;

   pipe_debug_message(
      debug, SHADER_INFO,
      "%s shader: %u inst, %u nops, %u non-nops, %u mov, %u cov, "
      "%u dwords, %u last-baryf, %u half, %u full, %u constlen, "
      "%u cat0, %u cat1, %u cat2, %u cat3, %u cat4, %u cat5, %u cat6, %u cat7, "
      "%u stp, %u ldp, %u sstall, %u (ss), %u (sy), %d waves, %d max_sun, "
      "%d loops\n",
      ir3_shader_stage(v), v->info.instrs_count, v->info.nops_count,
      v->info.instrs_count - v->info.nops_count, v->info.mov_count,
      v->info.cov_count, v->info.sizedwords, v->info.last_baryf,
      v->info.max_half_reg + 1, v->info.max_reg + 1, v->constlen,
      v->info.instrs_per_cat[0], v->info.instrs_per_cat[1],
      v->info.instrs_per_cat[2], v->info.instrs_per_cat[3],
      v->info.instrs_per_cat[4], v->info.instrs_per_cat[5],
      v->info.instrs_per_cat[6], v->info.instrs_per_cat[7],
      v->info.stp_count, v->info.ldp_count, v->info.sstall,
      v->info.ss, v->info.sy, v->info.max_waves, v->max_sun, v->loops);
}

static void
upload_shader_variant(struct ir3_shader_variant *v)
{
   struct shader_info *info = &v->shader->nir->info;
   struct ir3_compiler *compiler = v->shader->compiler;

   assert(!v->bo);

   v->bo =
      fd_bo_new(compiler->dev, v->info.size, 0,
                "%s:%s", ir3_shader_stage(v), info->name);

   /* Always include shaders in kernel crash dumps. */
   fd_bo_mark_for_dump(v->bo);

   memcpy(fd_bo_map(v->bo), v->bin, v->info.size);
}
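
/**
 * Get (or compile) the variant of 'shader' matching 'key'.  On a cache
 * miss this compiles the variant (and, if present, its binning-pass
 * counterpart), uploads it, and dumps shaderdb stats if enabled.
 */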
struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
                   bool binning_pass, struct pipe_debug_callback *debug)
{
   struct ir3_shader_variant *v;
   bool created = false;

   /* Some shader key values may not be used by a given ir3_shader (for
    * example, fragment shader saturates in the vertex shader), so clean
    * out those flags to avoid recompiling.
    */
   ir3_key_clear_unused(&key, shader);

   v = ir3_shader_get_variant(shader, &key, binning_pass, false, &created);

   if (created) {
      if (shader->initial_variants_done) {
         perf_debug_message(debug, SHADER_INFO,
                            "%s shader: recompiling at draw time: global "
                            "0x%08x, vfsamples %x/%x, astc %x/%x\n",
                            ir3_shader_stage(v), key.global, key.vsamples,
                            key.fsamples, key.vastc_srgb, key.fastc_srgb);
      }

      dump_shader_info(v, debug);
      upload_shader_variant(v);

      if (v->binning) {
         upload_shader_variant(v->binning);
         dump_shader_info(v->binning, debug);
      }
   }

   return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
                const struct pipe_stream_output_info *p)
{
   STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
   STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

   i->num_outputs = p->num_outputs;
   for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
      i->stride[n] = p->stride[n];

   for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
      i->output[n].register_index = p->output[n].register_index;
      i->output[n].start_component = p->output[n].start_component;
      i->output[n].num_components = p->output[n].num_components;
      i->output[n].output_buffer = p->output[n].output_buffer;
      i->output[n].dst_offset = p->output[n].dst_offset;
      i->output[n].stream = p->output[n].stream;
   }
}
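
/**
 * Compile the variants we expect to need at draw time: the default
 * variant, a safe-constlen fallback if the default exceeds the safe
 * constlen limit, and (for VS) the binning-pass equivalents of both.
 */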
static void
create_initial_variants(struct ir3_shader_state *hwcso,
                        struct pipe_debug_callback *debug)
{
   struct ir3_shader *shader = hwcso->shader;
   struct ir3_compiler *compiler = shader->compiler;
   nir_shader *nir = shader->nir;

   /* Compile standard variants immediately to try to avoid draw-time
    * stalls to run the compiler.
    */
   struct ir3_shader_key key = {
      .tessellation = IR3_TESS_NONE,
      .ucp_enables = MASK(nir->info.clip_distance_array_size),
      .msaa = true,
   };

   switch (nir->info.stage) {
   case MESA_SHADER_TESS_EVAL:
      key.tessellation = ir3_tess_mode(nir->info.tess.primitive_mode);
      break;

   case MESA_SHADER_TESS_CTRL:
      /* The primitive_mode field, while it exists for TCS, is not
       * populated (since separable shaders between TCS/TES are legal,
       * so TCS wouldn't have access to TES's declaration).  Make a
       * guess so that we shader-db something plausible for TCS.
       */
      if (nir->info.outputs_written & VARYING_BIT_TESS_LEVEL_INNER)
         key.tessellation = IR3_TESS_TRIANGLES;
      else
         key.tessellation = IR3_TESS_ISOLINES;
      break;

   case MESA_SHADER_GEOMETRY:
      key.has_gs = true;
      break;

   default:
      break;
   }

   key.safe_constlen = false;
   struct ir3_shader_variant *v = ir3_shader_variant(shader, key, false, debug);
   if (!v)
      return;

   if (v->constlen > compiler->max_const_safe) {
      key.safe_constlen = true;
      ir3_shader_variant(shader, key, false, debug);
   }

   /* For vertex shaders, also compile initial binning pass shader: */
   if (nir->info.stage == MESA_SHADER_VERTEX) {
      key.safe_constlen = false;
      v = ir3_shader_variant(shader, key, true, debug);
      if (!v)
         return;

      if (v->constlen > compiler->max_const_safe) {
         key.safe_constlen = true;
         ir3_shader_variant(shader, key, true, debug);
      }
   }

   shader->initial_variants_done = true;
}

static void
create_initial_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct pipe_debug_callback debug = {};

   create_initial_variants(hwcso, &debug);
}
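
/* Compute shaders only ever use a single variant (the all-zeros key),
 * so their async path is simpler than the graphics one above.
 */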
static void
create_initial_compute_variants_async(void *job, void *gdata, int thread_index)
{
   struct ir3_shader_state *hwcso = job;
   struct ir3_shader *shader = hwcso->shader;
   struct pipe_debug_callback debug = {};
   static struct ir3_shader_key key; /* static is implicitly zeroed */

   ir3_shader_variant(shader, key, false, &debug);
   shader->initial_variants_done = true;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
void *
ir3_shader_compute_state_create(struct pipe_context *pctx,
                                const struct pipe_compute_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);

   /* req_input_mem will only be non-zero for cl kernels (ie. clover).
    * This isn't a perfect test because I guess it is possible (but
    * uncommon) for none of the kernel parameters to be a global, but
    * ctx->set_global_bindings() can't fail, so this is the next best
    * place to fail if we need a newer version of kernel driver:
    */
   if ((cso->req_input_mem > 0) &&
       fd_device_version(ctx->dev) < FD_VERSION_BO_IOVA) {
      return NULL;
   }

   struct ir3_compiler *compiler = ctx->screen->compiler;
   nir_shader *nir;

   if (cso->ir_type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = (nir_shader *)cso->prog;
   } else {
      debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->prog, 0);
      }
      nir = tgsi_to_nir(cso->prog, pctx->screen, false);
   }

   struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir, 0, NULL);
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   util_queue_fence_init(&hwcso->ready);
   hwcso->shader = shader;

   /* Immediately compile a standard variant.  We have so few variants
    * in our shaders, that doing so almost eliminates draw-time
    * recompiles.  (This is also how we get data from shader-db's ./run)
    */
   if (initial_variants_synchronous(ctx)) {
      static struct ir3_shader_key key; /* static is implicitly zeroed */
      ir3_shader_variant(shader, key, false, &ctx->debug);
      shader->initial_variants_done = true;
   } else {
      struct fd_screen *screen = ctx->screen;
      util_queue_add_job(&screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_compute_variants_async, NULL, 0);
   }

   return hwcso;
}

void *
ir3_shader_state_create(struct pipe_context *pctx,
                        const struct pipe_shader_state *cso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct ir3_compiler *compiler = ctx->screen->compiler;
   struct ir3_shader_state *hwcso = calloc(1, sizeof(*hwcso));

   /*
    * Convert to nir (if necessary):
    */

   nir_shader *nir;
   if (cso->type == PIPE_SHADER_IR_NIR) {
      /* we take ownership of the reference: */
      nir = cso->ir.nir;
   } else {
      debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
      if (ir3_shader_debug & IR3_DBG_DISASM) {
         tgsi_dump(cso->tokens, 0);
      }
      nir = tgsi_to_nir(cso->tokens, pctx->screen, false);
   }

   /*
    * Create ir3_shader:
    *
    * This part is cheap, it doesn't compile initial variants
    */

   struct ir3_stream_output_info stream_output = {};
   copy_stream_out(&stream_output, &cso->stream_output);

   hwcso->shader = ir3_shader_from_nir(compiler, nir, 0, &stream_output);

   /*
    * Create initial variants to avoid draw-time stalls.  This is
    * normally done asynchronously, unless debug is enabled (which
    * will be the case for shader-db)
    */

   util_queue_fence_init(&hwcso->ready);

   if (initial_variants_synchronous(ctx)) {
      create_initial_variants(hwcso, &ctx->debug);
   } else {
      util_queue_add_job(&ctx->screen->compile_queue, hwcso, &hwcso->ready,
                         create_initial_variants_async, NULL, 0);
   }

   return hwcso;
}
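
/* Program state (the linked set of variants) is cached keyed on the
 * shader state objects, so any cached entries referencing this hwcso
 * need to be invalidated before the shader is destroyed.
 */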
void
ir3_shader_state_delete(struct pipe_context *pctx, void *_hwcso)
{
   struct fd_context *ctx = fd_context(pctx);
   struct fd_screen *screen = ctx->screen;
   struct ir3_shader_state *hwcso = _hwcso;
   struct ir3_shader *so = hwcso->shader;

   ir3_cache_invalidate(ctx->shader_cache, hwcso);

   /* util_queue_drop_job() guarantees that either:
    *  1) job did not execute
    *  2) job completed
    *
    * In either case the fence is signaled
    */
   util_queue_drop_job(&screen->compile_queue, &hwcso->ready);

   /* free the uploaded shaders, since this is handled outside of the
    * shared ir3 code (ie. not used by turnip):
    */
   for (struct ir3_shader_variant *v = so->variants; v; v = v->next) {
      fd_bo_del(v->bo);
      v->bo = NULL;

      if (v->binning && v->binning->bo) {
         fd_bo_del(v->binning->bo);
         v->binning->bo = NULL;
      }
   }

   ir3_shader_destroy(so);
   util_queue_fence_destroy(&hwcso->ready);
   free(hwcso);
}

struct ir3_shader *
ir3_get_shader(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;

   struct ir3_shader *shader = hwcso->shader;
   perf_time (1000, "waited for %s:%s:%s variants",
              _mesa_shader_stage_to_abbrev(shader->type),
              shader->nir->info.name,
              shader->nir->info.label) {
      /* wait for initial variants to compile: */
      util_queue_fence_wait(&hwcso->ready);
   }

   return shader;
}

struct shader_info *
ir3_get_shader_info(struct ir3_shader_state *hwcso)
{
   if (!hwcso)
      return NULL;
   return &hwcso->shader->nir->info;
}

/* fixup dirty shader state in case some "unrelated" (from the state-
 * tracker's perspective) state change causes us to switch to a
 * different variant.
 */
void
ir3_fixup_shader_state(struct pipe_context *pctx, struct ir3_shader_key *key)
{
   struct fd_context *ctx = fd_context(pctx);

   if (!ir3_shader_key_equal(ctx->last.key, key)) {
      if (ir3_shader_key_changes_fs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_FRAGMENT,
                                 FD_DIRTY_SHADER_PROG);
      }

      if (ir3_shader_key_changes_vs(ctx->last.key, key)) {
         fd_context_dirty_shader(ctx, PIPE_SHADER_VERTEX, FD_DIRTY_SHADER_PROG);
      }

      /* NOTE: currently only a6xx has gs/tess, but needs no
       * gs/tess specific lowering.
       */

      *ctx->last.key = *key;
   }
}

static char *
ir3_screen_finalize_nir(struct pipe_screen *pscreen, void *nir)
{
   struct fd_screen *screen = fd_screen(pscreen);

   ir3_nir_lower_io_to_temporaries(nir);
   ir3_finalize_nir(screen->compiler, nir);

   return NULL;
}

static void
ir3_set_max_shader_compiler_threads(struct pipe_screen *pscreen,
                                    unsigned max_threads)
{
   struct fd_screen *screen = fd_screen(pscreen);

   /* This function doesn't allow a greater number of threads than
    * the queue had at its creation.
    */
   util_queue_adjust_num_threads(&screen->compile_queue, max_threads);
}
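
/* Together with set_max_shader_compiler_threads() above, this is the
 * screen hook the GL frontend uses to implement
 * GL_KHR_parallel_shader_compile.
 */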
static bool
ir3_is_parallel_shader_compilation_finished(struct pipe_screen *pscreen,
                                            void *shader,
                                            enum pipe_shader_type shader_type)
{
   struct ir3_shader_state *hwcso = (struct ir3_shader_state *)shader;

   return util_queue_fence_is_signalled(&hwcso->ready);
}

void
ir3_prog_init(struct pipe_context *pctx)
{
   pctx->create_vs_state = ir3_shader_state_create;
   pctx->delete_vs_state = ir3_shader_state_delete;

   pctx->create_tcs_state = ir3_shader_state_create;
   pctx->delete_tcs_state = ir3_shader_state_delete;

   pctx->create_tes_state = ir3_shader_state_create;
   pctx->delete_tes_state = ir3_shader_state_delete;

   pctx->create_gs_state = ir3_shader_state_create;
   pctx->delete_gs_state = ir3_shader_state_delete;

   pctx->create_fs_state = ir3_shader_state_create;
   pctx->delete_fs_state = ir3_shader_state_delete;
}

void
ir3_screen_init(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id, false);

   /* TODO do we want to limit things to # of fast cores, or just limit
    * based on total # of both big and little cores.  The little cores
    * tend to be in-order and probably much slower for compiling than
    * big cores.  OTOH if they are sitting idle, maybe it is useful to
    * use them?
    */
   unsigned num_threads = sysconf(_SC_NPROCESSORS_ONLN) - 1;

   util_queue_init(&screen->compile_queue, "ir3q", 64, num_threads,
                   UTIL_QUEUE_INIT_RESIZE_IF_FULL |
                      UTIL_QUEUE_INIT_SET_FULL_THREAD_AFFINITY, NULL);

   pscreen->finalize_nir = ir3_screen_finalize_nir;
   pscreen->set_max_shader_compiler_threads =
      ir3_set_max_shader_compiler_threads;
   pscreen->is_parallel_shader_compilation_finished =
      ir3_is_parallel_shader_compilation_finished;
}

void
ir3_screen_fini(struct pipe_screen *pscreen)
{
   struct fd_screen *screen = fd_screen(pscreen);

   util_queue_destroy(&screen->compile_queue);
   ir3_compiler_destroy(screen->compiler);
   screen->compiler = NULL;
}

void
ir3_update_max_tf_vtx(struct fd_context *ctx,
                      const struct ir3_shader_variant *v)
{
   struct fd_streamout_stateobj *so = &ctx->streamout;
   struct ir3_stream_output_info *info = &v->shader->stream_output;
   uint32_t maxvtxcnt = 0x7fffffff;

   /* Return early so we don't fall through and clobber max_tf_vtx with
    * the 0x7fffffff default below:
    */
   if (v->shader->stream_output.num_outputs == 0) {
      ctx->streamout.max_tf_vtx = 0;
      return;
   }
   if (so->num_targets == 0) {
      ctx->streamout.max_tf_vtx = 0;
      return;
   }

   /* offset to write to is:
    *
    *   total_vtxcnt = vtxcnt + offsets[i]
    *   offset = total_vtxcnt * stride[i]
    *
    *   offset =   vtxcnt * stride[i]       ; calculated in shader
    *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
    *
    * assuming for each vtx, each target buffer will have data written
    * up to 'offset + stride[i]', that leaves maxvtxcnt as:
    *
    *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
    *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
    *
    * but shader is actually doing a less-than (rather than less-than-
    * equal) check, so we can drop the -stride[i].
    *
    * TODO is assumption about `offset + stride[i]` legit?
    */
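   /* For example: a 256 byte buffer with a stride of 4 dwords (16
    * bytes) gives maxvtxcnt = 256 / 16 = 16, and the last vertex
    * actually emitted (vtxcnt 15, due to the less-than check) writes
    * bytes [240, 256) of the buffer.
    */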
   for (unsigned i = 0; i < so->num_targets; i++) {
      struct pipe_stream_output_target *target = so->targets[i];
      unsigned stride = info->stride[i] * 4; /* convert dwords->bytes */
      if (target) {
         uint32_t max = target->buffer_size / stride;
         maxvtxcnt = MIN2(maxvtxcnt, max);
      }
   }

   ctx->streamout.max_tf_vtx = maxvtxcnt;
}