/*
 * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 * Authors:
 *    Rob Clark <robclark@freedesktop.org>
 */

#include "pipe/p_state.h"
#include "pipe/p_screen.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "util/u_format.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_parse.h"

#include "nir/tgsi_to_nir.h"

#include "freedreno_context.h"
#include "freedreno_util.h"

#include "ir3/ir3_shader.h"
#include "ir3/ir3_gallium.h"
#include "ir3/ir3_compiler.h"
#include "ir3/ir3_nir.h"

static void
dump_shader_info(struct ir3_shader_variant *v, struct pipe_debug_callback *debug)
{
    if (!unlikely(fd_mesa_debug & FD_DBG_SHADERDB))
        return;

    pipe_debug_message(debug, SHADER_INFO, "\n"
            "SHADER-DB: %s prog %d/%d: %u instructions, %u dwords\n"
            "SHADER-DB: %s prog %d/%d: %u half, %u full\n"
            "SHADER-DB: %s prog %d/%d: %u const, %u constlen\n"
            "SHADER-DB: %s prog %d/%d: %u (ss), %u (sy)\n"
            "SHADER-DB: %s prog %d/%d: max_sun=%u\n",
            ir3_shader_stage(v->shader),
            v->shader->id, v->id,
            v->info.instrs_count,
            v->info.sizedwords,
            ir3_shader_stage(v->shader),
            v->shader->id, v->id,
            v->info.max_half_reg + 1,
            v->info.max_reg + 1,
            ir3_shader_stage(v->shader),
            v->shader->id, v->id,
            v->info.max_const + 1,
            v->constlen,
            ir3_shader_stage(v->shader),
            v->shader->id, v->id,
            v->info.ss, v->info.sy,
            ir3_shader_stage(v->shader),
            v->shader->id, v->id,
            v->max_sun);
}

struct ir3_shader_variant *
ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key,
        bool binning_pass, struct pipe_debug_callback *debug)
{
    struct ir3_shader_variant *v;
    bool created = false;

    /* some shader key values only apply to vertex or frag shader,
     * so normalize the key to avoid constructing multiple identical
     * variants:
     */
    ir3_normalize_key(&key, shader->type);

    v = ir3_shader_get_variant(shader, &key, binning_pass, &created);

    if (created) {
        dump_shader_info(v, debug);
    }

    return v;
}

static void
copy_stream_out(struct ir3_stream_output_info *i,
        const struct pipe_stream_output_info *p)
{
    STATIC_ASSERT(ARRAY_SIZE(i->stride) == ARRAY_SIZE(p->stride));
    STATIC_ASSERT(ARRAY_SIZE(i->output) == ARRAY_SIZE(p->output));

    i->num_outputs = p->num_outputs;
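    /* note: stride[] is per output buffer, in dwords (emit_tfbos() and
     * max_tf_vtx() below multiply by 4 to convert to bytes):
     */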
    for (int n = 0; n < ARRAY_SIZE(i->stride); n++)
        i->stride[n] = p->stride[n];

    for (int n = 0; n < ARRAY_SIZE(i->output); n++) {
        i->output[n].register_index = p->output[n].register_index;
        i->output[n].start_component = p->output[n].start_component;
        i->output[n].num_components = p->output[n].num_components;
        i->output[n].output_buffer = p->output[n].output_buffer;
        i->output[n].dst_offset = p->output[n].dst_offset;
        i->output[n].stream = p->output[n].stream;
    }
}

struct ir3_shader *
ir3_shader_create(struct ir3_compiler *compiler,
        const struct pipe_shader_state *cso, gl_shader_stage type,
        struct pipe_debug_callback *debug,
        struct pipe_screen *screen)
{
    nir_shader *nir;
    if (cso->type == PIPE_SHADER_IR_NIR) {
        /* we take ownership of the reference: */
        nir = cso->ir.nir;
    } else {
        debug_assert(cso->type == PIPE_SHADER_IR_TGSI);
        if (ir3_shader_debug & IR3_DBG_DISASM) {
            tgsi_dump(cso->tokens, 0);
        }
        nir = ir3_tgsi_to_nir(compiler, cso->tokens, screen);
    }

    struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir);

    copy_stream_out(&shader->stream_output, &cso->stream_output);

    if (fd_mesa_debug & FD_DBG_SHADERDB) {
        /* if shader-db run, create a standard variant immediately
         * (as otherwise nothing will trigger the shader to be
         * actually compiled)
         */
        static struct ir3_shader_key key;
        memset(&key, 0, sizeof(key));
        ir3_shader_variant(shader, key, false, debug);
    }
    return shader;
}

/* a bit annoying that compute-shader and normal shader state objects
 * aren't a bit more aligned.
 */
struct ir3_shader *
ir3_shader_create_compute(struct ir3_compiler *compiler,
        const struct pipe_compute_state *cso,
        struct pipe_debug_callback *debug,
        struct pipe_screen *screen)
{
    nir_shader *nir;
    if (cso->ir_type == PIPE_SHADER_IR_NIR) {
        /* we take ownership of the reference: */
        nir = (nir_shader *)cso->prog;
    } else {
        debug_assert(cso->ir_type == PIPE_SHADER_IR_TGSI);
        if (ir3_shader_debug & IR3_DBG_DISASM) {
            tgsi_dump(cso->prog, 0);
        }
        nir = ir3_tgsi_to_nir(compiler, cso->prog, screen);
    }

    struct ir3_shader *shader = ir3_shader_from_nir(compiler, nir);

    return shader;
}

struct nir_shader *
ir3_tgsi_to_nir(struct ir3_compiler *compiler,
        const struct tgsi_token *tokens,
        struct pipe_screen *screen)
{
    if (!screen) {
        const nir_shader_compiler_options *options =
                ir3_get_compiler_options(compiler);
        return tgsi_to_nir_noscreen(tokens, options);
    }

    return tgsi_to_nir(tokens, screen);
}

/* This has to reach into the fd_context a bit more than the rest of
 * ir3, but it needs to be aligned with the compiler, so both agree
 * on which const regs hold what.  And the logic is identical between
 * a3xx/a4xx, the only difference is small details in the actual
 * CP_LOAD_STATE packets (which is handled inside the generation
 * specific ctx->emit_const(_bo)() fxns)
 */

#include "freedreno_resource.h"

static inline bool
is_stateobj(struct fd_ringbuffer *ring)
{
    /* XXX this is an ugly way to differentiate..
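     * (we assume FD_RINGBUFFER_STREAMING is only set on the streaming
     * stateobj rings, so the flag works as a proxy here)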
     */
    return !!(ring->flags & FD_RINGBUFFER_STREAMING);
}

static inline void
ring_wfi(struct fd_batch *batch, struct fd_ringbuffer *ring)
{
    /* when we emit const state via ring (IB2) we need a WFI, but when
     * it is emit'd via stateobj, we don't
     */
    if (is_stateobj(ring))
        return;

    fd_wfi(batch, ring);
}

static void
emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
    const unsigned index = 0;    /* user consts are index 0 */

    if (constbuf->enabled_mask & (1 << index)) {
        struct pipe_constant_buffer *cb = &constbuf->cb[index];
        /* size in dwords, aligned to vec4.  (This works at least
         * with mesa/st, which seems to align constant buffer to
         * 16 bytes)
         */
        unsigned size = align(cb->buffer_size, 16) / 4;

        /* in particular, with binning shader we may end up with
         * unused consts, ie. we could end up w/ constlen that is
         * smaller than first_driver_param.  In that case truncate
         * the user consts early to avoid HLSQ lockup caused by
         * writing too many consts
         */
        uint32_t max_const = MIN2(v->num_uniforms, v->constlen);

        /* and even if the start of the const buffer is before
         * first_immediate, the end may not be:
         */
        size = MIN2(size, 4 * max_const);

        if (size > 0) {
            ring_wfi(ctx->batch, ring);
            ctx->emit_const(ring, v->type, 0,
                    cb->buffer_offset, size,
                    cb->user_buffer, cb->buffer);
        }
    }

    struct ir3_ubo_analysis_state *state;
    state = &v->shader->ubo_state;

    for (uint32_t i = 1; i < ARRAY_SIZE(state->range); i++) {
        struct pipe_constant_buffer *cb = &constbuf->cb[i];

        if (state->range[i].start < state->range[i].end &&
                constbuf->enabled_mask & (1 << i)) {

            uint32_t size = state->range[i].end - state->range[i].start;
            uint32_t offset = cb->buffer_offset + state->range[i].start;
            debug_assert((state->range[i].offset % 16) == 0);
            debug_assert((size % 16) == 0);
            debug_assert((offset % 16) == 0);
            ctx->emit_const(ring, v->type, state->range[i].offset / 4,
                    offset, size / 4, cb->user_buffer, cb->buffer);
        }
    }
}

static void
emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf)
{
    uint32_t offset = v->constbase.ubo;
    if (v->constlen > offset) {
        uint32_t params = v->num_ubos;
        uint32_t offsets[params];
        struct pipe_resource *prscs[params];

        for (uint32_t i = 0; i < params; i++) {
            const uint32_t index = i + 1;    /* UBOs start at index 1 */
            struct pipe_constant_buffer *cb = &constbuf->cb[index];
            assert(!cb->user_buffer);

            if ((constbuf->enabled_mask & (1 << index)) && cb->buffer) {
                offsets[i] = cb->buffer_offset;
                prscs[i] = cb->buffer;
            } else {
                offsets[i] = 0;
                prscs[i] = NULL;
            }
        }

        ring_wfi(ctx->batch, ring);
        ctx->emit_const_bo(ring, v->type, false, offset * 4, params, prscs, offsets);
    }
}

static void
emit_ssbo_sizes(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring, struct fd_shaderbuf_stateobj *sb)
{
    uint32_t offset = v->constbase.ssbo_sizes;
    if (v->constlen > offset) {
        uint32_t sizes[align(v->const_layout.ssbo_size.count, 4)];
        unsigned mask = v->const_layout.ssbo_size.mask;

        while (mask) {
            unsigned index = u_bit_scan(&mask);
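            /* 'off' is the dword slot the compiler assigned to this
             * SSBO's size within the sizes[] block: */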
            unsigned off = v->const_layout.ssbo_size.off[index];
            sizes[off] = sb->sb[index].buffer_size;
        }

        ring_wfi(ctx->batch, ring);
        ctx->emit_const(ring, v->type, offset * 4,
                0, ARRAY_SIZE(sizes), sizes, NULL);
    }
}

static void
emit_image_dims(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring, struct fd_shaderimg_stateobj *si)
{
    uint32_t offset = v->constbase.image_dims;
    if (v->constlen > offset) {
        uint32_t dims[align(v->const_layout.image_dims.count, 4)];
        unsigned mask = v->const_layout.image_dims.mask;

        while (mask) {
            struct pipe_image_view *img;
            struct fd_resource *rsc;
            unsigned index = u_bit_scan(&mask);
            unsigned off = v->const_layout.image_dims.off[index];

            img = &si->si[index];
            rsc = fd_resource(img->resource);

            dims[off + 0] = util_format_get_blocksize(img->format);
            if (img->resource->target != PIPE_BUFFER) {
                unsigned lvl = img->u.tex.level;
                /* note for 2d/cube/etc images, even if re-interpreted
                 * as a different color format, the pixel size should
                 * be the same, so use original dimensions for y and z
                 * stride:
                 */
                dims[off + 1] = rsc->slices[lvl].pitch * rsc->cpp;
                /* see corresponding logic in fd_resource_offset(): */
                if (rsc->layer_first) {
                    dims[off + 2] = rsc->layer_size;
                } else {
                    dims[off + 2] = rsc->slices[lvl].size0;
                }
            } else {
                /* For buffer-backed images, the log2 of the format's
                 * bytes-per-pixel is placed on the 2nd slot.  This is useful
                 * when emitting image_size instructions, for which we need
                 * to divide by bpp for image buffers.  Since the bpp
                 * can only be power-of-two, the division is implemented
                 * as a SHR, and for that it is handy to have the log2 of
                 * bpp as a constant.  (log2 = first-set-bit - 1)
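                 *
                 * For example a 4 byte-per-pixel format gives
                 * ffs(4) - 1 == 2, ie. divide-by-4 becomes SHR by 2.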
                 */
                dims[off + 1] = ffs(dims[off + 0]) - 1;
            }
        }

        ring_wfi(ctx->batch, ring);
        ctx->emit_const(ring, v->type, offset * 4,
                0, ARRAY_SIZE(dims), dims, NULL);
    }
}

static void
emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring)
{
    int size = v->immediates_count;
    uint32_t base = v->constbase.immediate;

    /* truncate size to avoid writing constants that shader
     * does not use:
     */
    size = MIN2(size + base, v->constlen) - base;

    /* convert out of vec4: */
    base *= 4;
    size *= 4;

    if (size > 0) {
        ring_wfi(ctx->batch, ring);
        ctx->emit_const(ring, v->type, base,
                0, size, v->immediates[0].val, NULL);
    }
}

/* emit stream-out buffers: */
static void
emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v,
        struct fd_ringbuffer *ring)
{
    /* streamout addresses after driver-params: */
    uint32_t offset = v->constbase.tfbo;
    if (v->constlen > offset) {
        struct fd_streamout_stateobj *so = &ctx->streamout;
        struct ir3_stream_output_info *info = &v->shader->stream_output;
        uint32_t params = 4;
        uint32_t offsets[params];
        struct pipe_resource *prscs[params];

        for (uint32_t i = 0; i < params; i++) {
            struct pipe_stream_output_target *target = so->targets[i];

            if (target) {
                offsets[i] = (so->offsets[i] * info->stride[i] * 4) +
                        target->buffer_offset;
                prscs[i] = target->buffer;
            } else {
                offsets[i] = 0;
                prscs[i] = NULL;
            }
        }

        ring_wfi(ctx->batch, ring);
        ctx->emit_const_bo(ring, v->type, true, offset * 4, params, prscs, offsets);
    }
}

static uint32_t
max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v)
{
    struct fd_streamout_stateobj *so = &ctx->streamout;
    struct ir3_stream_output_info *info = &v->shader->stream_output;
    uint32_t maxvtxcnt = 0x7fffffff;

    if (ctx->screen->gpu_id >= 500)
        return 0;
    if (v->binning_pass)
        return 0;
    if (v->shader->stream_output.num_outputs == 0)
        return 0;
    if (so->num_targets == 0)
        return 0;

    /* offset to write to is:
     *
     *   total_vtxcnt = vtxcnt + offsets[i]
     *   offset = total_vtxcnt * stride[i]
     *
     *   offset =   vtxcnt * stride[i]       ; calculated in shader
     *            + offsets[i] * stride[i]   ; calculated at emit_tfbos()
     *
     * assuming for each vtx, each target buffer will have data written
     * up to 'offset + stride[i]', that leaves maxvtxcnt as:
     *
     *   buffer_size = (maxvtxcnt * stride[i]) + stride[i]
     *   maxvtxcnt   = (buffer_size - stride[i]) / stride[i]
     *
     * but shader is actually doing a less-than (rather than less-than-
     * equal) check, so we can drop the -stride[i].
     *
     * TODO is assumption about `offset + stride[i]` legit?
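     *
     * (as a rough example: with a stride[i] of 4 dwords (16 bytes) and a
     * 1024 byte buffer, maxvtxcnt comes out as 1024 / 16 = 64)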
     */
    for (unsigned i = 0; i < so->num_targets; i++) {
        struct pipe_stream_output_target *target = so->targets[i];
        unsigned stride = info->stride[i] * 4;    /* convert dwords->bytes */
        if (target) {
            uint32_t max = target->buffer_size / stride;
            maxvtxcnt = MIN2(maxvtxcnt, max);
        }
    }

    return maxvtxcnt;
}

static void
emit_common_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
        struct fd_context *ctx, enum pipe_shader_type t)
{
    enum fd_dirty_shader_state dirty = ctx->dirty_shader[t];

    /* When we use CP_SET_DRAW_STATE objects to emit constant state,
     * if we emit any of it we need to emit all.  This is because
     * we are using the same state-group-id each time for uniform
     * state, and if previous update is never evaluated (due to no
     * visible primitives in the current tile) then the new stateobj
     * completely replaces the old one.
     *
     * Possibly if we split up different parts of the const state to
     * different state-objects we could avoid this.
     */
    if (dirty && is_stateobj(ring))
        dirty = ~0;

    if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_CONST)) {
        struct fd_constbuf_stateobj *constbuf;
        bool shader_dirty;

        constbuf = &ctx->constbuf[t];
        shader_dirty = !!(dirty & FD_DIRTY_SHADER_PROG);

        emit_user_consts(ctx, v, ring, constbuf);
        emit_ubos(ctx, v, ring, constbuf);
        if (shader_dirty)
            emit_immediates(ctx, v, ring);
    }

    if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_SSBO)) {
        struct fd_shaderbuf_stateobj *sb = &ctx->shaderbuf[t];
        emit_ssbo_sizes(ctx, v, ring, sb);
    }

    if (dirty & (FD_DIRTY_SHADER_PROG | FD_DIRTY_SHADER_IMAGE)) {
        struct fd_shaderimg_stateobj *si = &ctx->shaderimg[t];
        emit_image_dims(ctx, v, ring, si);
    }
}

void
ir3_emit_vs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
        struct fd_context *ctx, const struct pipe_draw_info *info)
{
    debug_assert(v->type == MESA_SHADER_VERTEX);

    emit_common_consts(v, ring, ctx, PIPE_SHADER_VERTEX);

    /* emit driver params every time: */
    /* TODO skip emit if shader doesn't use driver params to avoid WFI.. */
    if (info) {
        uint32_t offset = v->constbase.driver_param;
        if (v->constlen > offset) {
            uint32_t vertex_params[IR3_DP_VS_COUNT] = {
                [IR3_DP_VTXID_BASE] = info->index_size ?
                        info->index_bias : info->start,
                [IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v),
            };
            /* if no user-clip-planes, we don't need to emit the
             * entire thing:
             */
            uint32_t vertex_params_size = 4;

            if (v->key.ucp_enables) {
                struct pipe_clip_state *ucp = &ctx->ucp;
                unsigned pos = IR3_DP_UCP0_X;
                for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) {
                    for (unsigned j = 0; j < 4; j++) {
                        vertex_params[pos] = fui(ucp->ucp[i][j]);
                        pos++;
                    }
                }
                vertex_params_size = ARRAY_SIZE(vertex_params);
            }

            ring_wfi(ctx->batch, ring);

            bool needs_vtxid_base =
                ir3_find_sysval_regid(v, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) != regid(63, 0);

            /* for indirect draw, we need to copy VTXID_BASE from
             * indirect-draw parameters buffer.. which is annoying
             * and means we can't easily emit these consts in cmd
             * stream so need to copy them to bo.
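             *
             * (the offsets below assume the standard indirect-draw
             * parameter layouts, ie. DrawElementsIndirectCommand is
             * { count, instanceCount, firstIndex, baseVertex, baseInstance }
             * and DrawArraysIndirectCommand is { count, instanceCount,
             * first, baseInstance })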
             */
            if (info->indirect && needs_vtxid_base) {
                struct pipe_draw_indirect_info *indirect = info->indirect;
                struct pipe_resource *vertex_params_rsc =
                        pipe_buffer_create(&ctx->screen->base,
                                PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM,
                                vertex_params_size * 4);
                unsigned src_off = info->indirect->offset;
                void *ptr;

                ptr = fd_bo_map(fd_resource(vertex_params_rsc)->bo);
                memcpy(ptr, vertex_params, vertex_params_size * 4);

                if (info->index_size) {
                    /* indexed draw, index_bias is 4th field: */
                    src_off += 3 * 4;
                } else {
                    /* non-indexed draw, start is 3rd field: */
                    src_off += 2 * 4;
                }

                /* copy index_bias or start from draw params: */
                ctx->mem_to_mem(ring, vertex_params_rsc, 0,
                        indirect->buffer, src_off, 1);

                ctx->emit_const(ring, MESA_SHADER_VERTEX, offset * 4, 0,
                        vertex_params_size, NULL, vertex_params_rsc);

                pipe_resource_reference(&vertex_params_rsc, NULL);
            } else {
                ctx->emit_const(ring, MESA_SHADER_VERTEX, offset * 4, 0,
                        vertex_params_size, vertex_params, NULL);
            }

            /* if needed, emit stream-out buffer addresses: */
            if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
                emit_tfbos(ctx, v, ring);
            }
        }
    }
}

void
ir3_emit_fs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
        struct fd_context *ctx)
{
    debug_assert(v->type == MESA_SHADER_FRAGMENT);

    emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT);
}

/* emit compute-shader consts: */
void
ir3_emit_cs_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
        struct fd_context *ctx, const struct pipe_grid_info *info)
{
    debug_assert(gl_shader_stage_is_compute(v->type));

    emit_common_consts(v, ring, ctx, PIPE_SHADER_COMPUTE);

    /* emit compute-shader driver-params: */
    uint32_t offset = v->constbase.driver_param;
    if (v->constlen > offset) {
        ring_wfi(ctx->batch, ring);

        if (info->indirect) {
            struct pipe_resource *indirect = NULL;
            unsigned indirect_offset;

            /* This is a bit awkward, but CP_LOAD_STATE.EXT_SRC_ADDR needs
             * to be aligned more strongly than 4 bytes.  So in this case
             * we need a temporary buffer to copy NumWorkGroups.xyz to.
             *
             * TODO if previous compute job is writing to info->indirect,
             * we might need a WFI.. but since we currently flush for each
             * compute job, we are probably ok for now.
             */
            if (info->indirect_offset & 0xf) {
                indirect = pipe_buffer_create(&ctx->screen->base,
                        PIPE_BIND_COMMAND_ARGS_BUFFER, PIPE_USAGE_STREAM,
                        0x1000);
                indirect_offset = 0;

                ctx->mem_to_mem(ring, indirect, 0, info->indirect,
                        info->indirect_offset, 3);
            } else {
                pipe_resource_reference(&indirect, info->indirect);
                indirect_offset = info->indirect_offset;
            }

            ctx->emit_const(ring, MESA_SHADER_COMPUTE, offset * 4,
                    indirect_offset, 4, NULL, indirect);

            pipe_resource_reference(&indirect, NULL);
        } else {
            uint32_t compute_params[IR3_DP_CS_COUNT] = {
                [IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
                [IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
                [IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
                [IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0],
                [IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1],
                [IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2],
            };

            ctx->emit_const(ring, MESA_SHADER_COMPUTE, offset * 4, 0,
                    ARRAY_SIZE(compute_params), compute_params, NULL);
        }
    }
}