/*
 * Copyright © 2014-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "util/u_blitter.h"
#include "util/u_draw.h"
#include "util/u_prim.h"
#include "util/format/u_format.h"
#include "util/u_pack_color.h"
#include "util/u_prim_restart.h"
#include "util/u_upload_mgr.h"

#include "v3d_context.h"
#include "v3d_resource.h"
#include "v3d_cl.h"
#include "broadcom/compiler/v3d_compiler.h"
#include "broadcom/common/v3d_macros.h"
#include "broadcom/common/v3d_util.h"
#include "broadcom/cle/v3dx_pack.h"

/**
 * Emits the start-of-job binning setup into the job's binning command list
 * (BCL): allocates the tile allocation and tile state (TSDA) buffers, emits
 * the tile binning mode configuration packets, and ends with the Start Tile
 * Binning item.  Called once per job, the first time the job is drawn to.
 */
static void
v3d_start_binning(struct v3d_context *v3d, struct v3d_job *job)
{
        assert(job->needs_flush);

        /* Get space to emit our BCL state, using a branch to jump to a new BO
         * if necessary.
         */
        v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);

        /* Record where the kernel should start executing this BCL. */
        job->submit.bcl_start = job->bcl.bo->offset;
        v3d_job_add_bo(job, job->bcl.bo);

        /* The PTB will request the tile alloc initial size per tile at start
         * of tile binning.  (64 bytes per tile per layer.)
         */
        uint32_t tile_alloc_size =
                MAX2(job->num_layers, 1) * job->draw_tiles_x * job->draw_tiles_y * 64;

        /* The PTB allocates in aligned 4k chunks after the initial setup. */
        tile_alloc_size = align(tile_alloc_size, 4096);

        /* Include the first two chunk allocations that the PTB does so that
         * we definitely clear the OOM condition before triggering one (the HW
         * won't trigger OOM during the first allocations).
         */
        tile_alloc_size += 8192;

        /* For performance, allocate some extra initial memory after the PTB's
         * minimal allocations, so that we hopefully don't have to block the
         * GPU on the kernel handling an OOM signal.
         */
        tile_alloc_size += 512 * 1024;

        job->tile_alloc = v3d_bo_alloc(v3d->screen, tile_alloc_size,
                                       "tile_alloc");
        /* Tile state (TSDA) entry size differs per HW generation. */
        uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
        job->tile_state = v3d_bo_alloc(v3d->screen,
                                       MAX2(job->num_layers, 1) *
                                       job->draw_tiles_y *
                                       job->draw_tiles_x *
                                       tsda_per_tile_size,
                                       "TSDA");

#if V3D_VERSION >= 41
        /* This must go before the binning mode configuration. It is
         * required for layered framebuffers to work.
         */
        if (job->num_layers > 0) {
                cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
                        config.number_of_layers = job->num_layers;
                }
        }
#endif

#if V3D_VERSION >= 40
        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
                config.width_in_pixels = job->draw_width;
                config.height_in_pixels = job->draw_height;
                /* Must be >= 1 per the packet spec. */
                config.number_of_render_targets =
                        MAX2(job->nr_cbufs, 1);

                config.multisample_mode_4x = job->msaa;

                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
        }
#else /* V3D_VERSION < 40 */
        /* "Binning mode lists start with a Tile Binning Mode Configuration
         * item (120)"
         *
         * Part1 signals the end of binning config setup.
         */
        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART2, config) {
                config.tile_allocation_memory_address =
                        cl_address(job->tile_alloc, 0);
                config.tile_allocation_memory_size = job->tile_alloc->size;
        }

        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART1, config) {
                config.tile_state_data_array_base_address =
                        cl_address(job->tile_state, 0);

                config.width_in_tiles = job->draw_tiles_x;
                config.height_in_tiles = job->draw_tiles_y;
                /* Must be >= 1 */
                config.number_of_render_targets =
                        MAX2(job->nr_cbufs, 1);

                config.multisample_mode_4x = job->msaa;

                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
        }
#endif /* V3D_VERSION < 40 */

        /* There's definitely nothing in the VCD cache we want. */
        cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);

        /* Disable any leftover OQ state from another job. */
        cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);

        /* "Binning mode lists must have a Start Tile Binning item (6) after
         * any prefix state data before the binning list proper starts."
         */
        cl_emit(&job->bcl, START_TILE_BINNING, bin);
}

/**
 * Does the initial binning command list setup for drawing to a given FBO.
 */
static void
v3d_start_draw(struct v3d_context *v3d)
{
        struct v3d_job *job = v3d->job;

        /* Binning setup is only emitted once per job. */
        if (job->needs_flush)
                return;

        job->needs_flush = true;
        job->draw_width = v3d->framebuffer.width;
        job->draw_height = v3d->framebuffer.height;
        job->num_layers = util_framebuffer_get_num_layers(&v3d->framebuffer);

        v3d_start_binning(v3d, job);
}

/**
 * Flushes any pending jobs writing to the resources that shader stage @s is
 * about to read from (textures, UBOs, vertex buffers), and jobs touching the
 * resources it may read or write (SSBOs, image views), so this draw observes
 * their results.
 */
static void
v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
                               enum pipe_shader_type s)
{
        struct v3d_context *v3d = v3d_context(pctx);

        /* Flush writes to textures we're sampling. */
        for (int i = 0; i < v3d->tex[s].num_textures; i++) {
                struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
                if (!pview)
                        continue;
                struct v3d_sampler_view *view = v3d_sampler_view(pview);

                /* If the view uses a shadow copy of the texture, refresh it
                 * first (except for the X32_S8X24 case, which is handled
                 * elsewhere — see v3d_update_shadow_texture for details).
                 */
                if (view->texture != view->base.texture &&
                    view->base.format != PIPE_FORMAT_X32_S8X24_UINT)
                        v3d_update_shadow_texture(pctx, &view->base);

                v3d_flush_jobs_writing_resource(v3d, view->texture,
                                                V3D_FLUSH_DEFAULT,
                                                s == PIPE_SHADER_COMPUTE);
        }

        /* Flush writes to UBOs. */
        u_foreach_bit(i, v3d->constbuf[s].enabled_mask) {
                struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
                if (cb->buffer) {
                        v3d_flush_jobs_writing_resource(v3d, cb->buffer,
                                                        V3D_FLUSH_DEFAULT,
                                                        s == PIPE_SHADER_COMPUTE);
                }
        }

        /* Flush reads/writes to our SSBOs */
        u_foreach_bit(i, v3d->ssbo[s].enabled_mask) {
                struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
                if (sb->buffer) {
                        v3d_flush_jobs_reading_resource(v3d, sb->buffer,
                                                        V3D_FLUSH_NOT_CURRENT_JOB,
                                                        s == PIPE_SHADER_COMPUTE);
                }
        }

        /* Flush reads/writes to our image views */
        u_foreach_bit(i, v3d->shaderimg[s].enabled_mask) {
                struct v3d_image_view *view = &v3d->shaderimg[s].si[i];

                v3d_flush_jobs_reading_resource(v3d, view->base.resource,
                                                V3D_FLUSH_NOT_CURRENT_JOB,
                                                s == PIPE_SHADER_COMPUTE);
        }

        /* Flush writes to our vertex buffers (i.e. from transform feedback) */
        if (s == PIPE_SHADER_VERTEX) {
                u_foreach_bit(i, v3d->vertexbuf.enabled_mask) {
                        struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];

                        v3d_flush_jobs_writing_resource(v3d, vb->buffer.resource,
                                                        V3D_FLUSH_DEFAULT,
                                                        false);
                }
        }
}

/**
 * Flushes any pending jobs that read from the resources this draw is about
 * to write through transform feedback.
 */
static void
v3d_predraw_check_outputs(struct pipe_context *pctx)
{
        struct v3d_context *v3d = v3d_context(pctx);

        /* Flush jobs reading from TF buffers that we are about to write. */
        if (v3d_transform_feedback_enabled(v3d)) {
                struct v3d_streamout_stateobj *so = &v3d->streamout;

                for (int i = 0; i < so->num_targets; i++) {
                        if (!so->targets[i])
                                continue;

                        const struct pipe_stream_output_target *target =
                                so->targets[i];
                        v3d_flush_jobs_reading_resource(v3d, target->buffer,
                                                        V3D_FLUSH_DEFAULT,
                                                        false);
                }
        }
}

/**
 * Checks if the state for the current draw reads a particular resource in
 * the given shader stage.
 */
static bool
v3d_state_reads_resource(struct v3d_context *v3d,
                         struct pipe_resource *prsc,
                         enum pipe_shader_type s)
{
        struct v3d_resource *rsc = v3d_resource(prsc);

        /* Vertex buffers */
        if (s == PIPE_SHADER_VERTEX) {
                u_foreach_bit(i, v3d->vertexbuf.enabled_mask) {
                        struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];
                        if (!vb->buffer.resource)
                                continue;

                        struct v3d_resource *vb_rsc =
                                v3d_resource(vb->buffer.resource);
                        /* Comparison is by backing BO, not pipe_resource, so
                         * aliased views of the same storage are caught too.
                         */
                        if (rsc->bo == vb_rsc->bo)
                                return true;
                }
        }

        /* Constant buffers */
        u_foreach_bit(i, v3d->constbuf[s].enabled_mask) {
                struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
                if (!cb->buffer)
                        continue;

                struct v3d_resource *cb_rsc = v3d_resource(cb->buffer);
                if (rsc->bo == cb_rsc->bo)
                        return true;
        }

        /* Shader storage buffers */
        u_foreach_bit(i, v3d->ssbo[s].enabled_mask) {
                struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
                if (!sb->buffer)
                        continue;

                struct v3d_resource *sb_rsc = v3d_resource(sb->buffer);
                if (rsc->bo == sb_rsc->bo)
                        return true;
        }

        /* Textures */
        for (int i = 0; i < v3d->tex[s].num_textures; i++) {
                struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
                if (!pview)
                        continue;

                struct v3d_sampler_view *view = v3d_sampler_view(pview);
                struct v3d_resource *v_rsc = v3d_resource(view->texture);
                if (rsc->bo == v_rsc->bo)
                        return true;
        }

        return false;
}

/**
 * Emits a flush of outstanding transform feedback data followed by a wait,
 * so later work in this job observes the TF writes.
 */
static void
v3d_emit_wait_for_tf(struct v3d_job *job)
{
        /* XXX: we might be able to skip this in some cases, for now we
         * always emit it.
         */
        cl_emit(&job->bcl, FLUSH_TRANSFORM_FEEDBACK_DATA, flush);

        cl_emit(&job->bcl, WAIT_FOR_TRANSFORM_FEEDBACK, wait) {
                /* XXX: Wait for all outstanding writes... maybe we can do
                 * better in some cases.
                 */
                wait.block_count = 255;
        }

        /* We have just flushed all our outstanding TF work in this job so make
         * sure we don't emit TF flushes for any of it again.
         */
        _mesa_set_clear(job->tf_write_prscs, NULL);
}

/**
 * Emits a TF flush+wait if any binning-stage shader of the current draw
 * reads a resource this job previously wrote through transform feedback.
 */
static void
v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job)
{
        if (!job->tf_enabled)
                return;

        set_foreach(job->tf_write_prscs, entry) {
                struct pipe_resource *prsc = (struct pipe_resource *)entry->key;
                for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) {
                        /* Fragment shaders can only start executing after all
                         * binning (and thus TF) is complete.
                         *
                         * XXX: For VS/GS/TES, if the binning shader does not
                         * read the resource then we could also avoid emitting
                         * the wait.
                         */
                        if (s == PIPE_SHADER_FRAGMENT)
                                continue;

                        if (v3d_state_reads_resource(v3d, prsc, s)) {
                                v3d_emit_wait_for_tf(job);
                                return;
                        }
                }
        }
}

#if V3D_VERSION >= 41
/**
 * Emits the geometry shader state record into the job's indirect CL,
 * covering both the binning- and render-mode GS variants.
 */
static void
v3d_emit_gs_state_record(struct v3d_job *job,
                         struct v3d_compiled_shader *gs_bin,
                         struct v3d_cl_reloc gs_bin_uniforms,
                         struct v3d_compiled_shader *gs,
                         struct v3d_cl_reloc gs_render_uniforms)
{
        cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
                shader.geometry_bin_mode_shader_code_address =
                        cl_address(v3d_resource(gs_bin->resource)->bo,
                                   gs_bin->offset);
                shader.geometry_bin_mode_shader_4_way_threadable =
                        gs_bin->prog_data.gs->base.threads == 4;
                shader.geometry_bin_mode_shader_start_in_final_thread_section =
                        gs_bin->prog_data.gs->base.single_seg;
                shader.geometry_bin_mode_shader_propagate_nans = true;
                shader.geometry_bin_mode_shader_uniforms_address =
                        gs_bin_uniforms;

                shader.geometry_render_mode_shader_code_address =
                        cl_address(v3d_resource(gs->resource)->bo, gs->offset);
                shader.geometry_render_mode_shader_4_way_threadable =
                        gs->prog_data.gs->base.threads == 4;
                shader.geometry_render_mode_shader_start_in_final_thread_section =
                        gs->prog_data.gs->base.single_seg;
                shader.geometry_render_mode_shader_propagate_nans = true;
                shader.geometry_render_mode_shader_uniforms_address =
                        gs_render_uniforms;
        }
}

/**
 * Translates a GS output primitive type (GL enum) to the hardware's
 * GEOMETRY_SHADER_* output format.  GSes can only output points, line
 * strips, or triangle strips.
 */
static uint8_t
v3d_gs_output_primitive(uint32_t prim_type)
{
        switch (prim_type) {
        case GL_POINTS:
                return GEOMETRY_SHADER_POINTS;
        case GL_LINE_STRIP:
                return GEOMETRY_SHADER_LINE_STRIP;
        case GL_TRIANGLE_STRIP:
                return GEOMETRY_SHADER_TRI_STRIP;
        default:
                unreachable("Unsupported primitive type");
        }
}

/**
 * Emits the tessellation/geometry common parameters into the indirect CL.
 */
static void
v3d_emit_tes_gs_common_params(struct v3d_job *job,
                              uint8_t gs_out_prim_type,
                              uint8_t gs_num_invocations)
{
        /* This, and v3d_emit_tes_gs_shader_params below, fill in default
         * values for tessellation fields even though we don't support
         * tessellation yet because our packing functions (and the simulator)
         * complain if we don't.
         */
        cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
                shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
                shader.tessellation_point_mode = false;
                shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
                shader.tessellation_clockwise = true;
                shader.tessellation_invocations = 1;

                shader.geometry_shader_output_format =
                        v3d_gs_output_primitive(gs_out_prim_type);
                /* The packet field is 5 bits wide. */
                shader.geometry_shader_instances = gs_num_invocations & 0x1F;
        }
}

/**
 * Translates a GS dispatch SIMD width to the hardware's VPM pack mode.
 */
static uint8_t
simd_width_to_gs_pack_mode(uint32_t width)
{
        switch (width) {
        case 16:
                return V3D_PACK_MODE_16_WAY;
        case 8:
                return V3D_PACK_MODE_8_WAY;
        case 4:
                return V3D_PACK_MODE_4_WAY;
        case 1:
                return V3D_PACK_MODE_1_WAY;
        default:
                unreachable("Invalid SIMD width");
        };
}

/**
 * Emits one set of tessellation/geometry shader params (used once for the
 * binning config and once for the rendering config).
 */
static void
v3d_emit_tes_gs_shader_params(struct v3d_job *job,
                              uint32_t gs_simd,
                              uint32_t gs_vpm_output_size,
                              uint32_t
                              gs_max_vpm_input_size_per_batch)
{
        cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
                /* TCS/TES fields are defaults; tessellation is not supported
                 * yet (see the comment in v3d_emit_tes_gs_common_params).
                 */
                shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
                shader.per_patch_data_column_depth = 1;
                shader.tcs_output_segment_size_in_sectors = 1;
                shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
                shader.tes_output_segment_size_in_sectors = 1;
                shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
                shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
                shader.gs_output_segment_pack_mode =
                        simd_width_to_gs_pack_mode(gs_simd);
                shader.tbg_max_patches_per_tcs_batch = 1;
                shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
                shader.tbg_min_tcs_output_segments_required_in_play = 1;
                shader.tbg_min_per_patch_data_segments_required_in_play = 1;
                shader.tpg_max_patches_per_tes_batch = 1;
                shader.tpg_max_vertex_segments_per_tes_batch = 0;
                shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
                shader.tpg_min_tes_output_segments_required_in_play = 1;
                shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
                        gs_max_vpm_input_size_per_batch;
                shader.gbg_min_gs_output_segments_required_in_play = 1;
        }
}
#endif

/**
 * Emits the GL shader state record (and GS/TES records when a GS is bound)
 * into the indirect CL, plus the attribute records and the BCL packet that
 * points the hardware at them.  Uploads per-stage uniforms first, since the
 * records reference their addresses.
 */
static void
v3d_emit_gl_shader_state(struct v3d_context *v3d,
                         const struct pipe_draw_info *info)
{
        struct v3d_job *job = v3d->job;
        /* V3D_DIRTY_VTXSTATE */
        struct v3d_vertex_stateobj *vtx = v3d->vtx;
        /* V3D_DIRTY_VTXBUF */
        struct v3d_vertexbuf_stateobj *vertexbuf = &v3d->vertexbuf;

        /* Upload the uniforms to the indirect CL first */
        struct v3d_cl_reloc fs_uniforms =
                v3d_write_uniforms(v3d, job, v3d->prog.fs,
                                   PIPE_SHADER_FRAGMENT);

        struct v3d_cl_reloc gs_uniforms = { NULL, 0 };
        struct v3d_cl_reloc gs_bin_uniforms = { NULL, 0 };
        if (v3d->prog.gs) {
                gs_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs,
                                                 PIPE_SHADER_GEOMETRY);
        }
        if (v3d->prog.gs_bin) {
                gs_bin_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs_bin,
                                                     PIPE_SHADER_GEOMETRY);
        }

        struct v3d_cl_reloc vs_uniforms =
                v3d_write_uniforms(v3d, job, v3d->prog.vs,
                                   PIPE_SHADER_VERTEX);
        /* "cs" here is the coordinate (binning-mode vertex) shader, not a
         * compute shader; it shares the VS uniform state.
         */
        struct v3d_cl_reloc cs_uniforms =
                v3d_write_uniforms(v3d, job, v3d->prog.cs,
                                   PIPE_SHADER_VERTEX);

        /* Update the cache dirty flag based on the shader progs data */
        job->tmu_dirty_rcl |= v3d->prog.cs->prog_data.vs->base.tmu_dirty_rcl;
        job->tmu_dirty_rcl |= v3d->prog.vs->prog_data.vs->base.tmu_dirty_rcl;
        if (v3d->prog.gs_bin) {
                job->tmu_dirty_rcl |=
                        v3d->prog.gs_bin->prog_data.gs->base.tmu_dirty_rcl;
        }
        if (v3d->prog.gs) {
                job->tmu_dirty_rcl |=
                        v3d->prog.gs->prog_data.gs->base.tmu_dirty_rcl;
        }
        job->tmu_dirty_rcl |= v3d->prog.fs->prog_data.fs->base.tmu_dirty_rcl;

        /* Count only the vertex elements with a backing buffer; elements
         * without one are skipped below.
         */
        uint32_t num_elements_to_emit = 0;
        for (int i = 0; i < vtx->num_elements; i++) {
                struct pipe_vertex_element *elem = &vtx->pipe[i];
                struct pipe_vertex_buffer *vb =
                        &vertexbuf->vb[elem->vertex_buffer_index];
                if (vb->buffer.resource)
                        num_elements_to_emit++;
        }

        uint32_t shader_state_record_length =
                cl_packet_length(GL_SHADER_STATE_RECORD);
#if V3D_VERSION >= 41
        if (v3d->prog.gs) {
                shader_state_record_length +=
                        cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
                        cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
                        2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
        }
#endif

        /* See GFXH-930 workaround below */
        uint32_t shader_rec_offset =
                v3d_cl_ensure_space(&job->indirect,
                                    shader_state_record_length +
                                    MAX2(num_elements_to_emit, 1) *
                                    cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
                                    32);

        /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
         * compile time, so that we mostly just have to OR the VS and FS
         * records together at draw time.
         */

        struct vpm_config vpm_cfg_bin, vpm_cfg;

        assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs);
        v3d_compute_vpm_config(&v3d->screen->devinfo,
                               v3d->prog.cs->prog_data.vs,
                               v3d->prog.vs->prog_data.vs,
                               v3d->prog.gs ? v3d->prog.gs_bin->prog_data.gs : NULL,
                               v3d->prog.gs ? v3d->prog.gs->prog_data.gs : NULL,
                               &vpm_cfg_bin,
                               &vpm_cfg);

        if (v3d->prog.gs) {
#if V3D_VERSION >= 41
                v3d_emit_gs_state_record(v3d->job,
                                         v3d->prog.gs_bin, gs_bin_uniforms,
                                         v3d->prog.gs, gs_uniforms);

                struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs;
                v3d_emit_tes_gs_common_params(v3d->job,
                                              gs->out_prim_type,
                                              gs->num_invocations);

                /* Bin Tes/Gs params */
                v3d_emit_tes_gs_shader_params(v3d->job,
                                              vpm_cfg_bin.gs_width,
                                              vpm_cfg_bin.Gd,
                                              vpm_cfg_bin.Gv);

                /* Render Tes/Gs params */
                v3d_emit_tes_gs_shader_params(v3d->job,
                                              vpm_cfg.gs_width,
                                              vpm_cfg.Gd,
                                              vpm_cfg.Gv);
#else
                unreachable("No GS support pre-4.1");
#endif
        }

        cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
                shader.enable_clipping = true;
                /* V3D_DIRTY_PRIM_MODE | V3D_DIRTY_RASTERIZER */
                shader.point_size_in_shaded_vertex_data =
                        (info->mode == PIPE_PRIM_POINTS &&
                         v3d->rasterizer->base.point_size_per_vertex);

                /* Must be set if the shader modifies Z, discards, or modifies
                 * the sample mask. For any of these cases, the fragment
                 * shader needs to write the Z value (even just discards).
                 */
                shader.fragment_shader_does_z_writes =
                        v3d->prog.fs->prog_data.fs->writes_z;
                /* Set if the EZ test must be disabled (due to shader side
                 * effects and the early_z flag not being present in the
                 * shader).
                 */
                shader.turn_off_early_z_test =
                        v3d->prog.fs->prog_data.fs->disable_ez;

                shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
                        v3d->prog.fs->prog_data.fs->uses_center_w;

#if V3D_VERSION >= 41
                shader.any_shader_reads_hardware_written_primitive_id =
                        (v3d->prog.gs && v3d->prog.gs->prog_data.gs->uses_pid) ||
                        v3d->prog.fs->prog_data.fs->uses_pid;
                shader.insert_primitive_id_as_first_varying_to_fragment_shader =
                        !v3d->prog.gs && v3d->prog.fs->prog_data.fs->uses_pid;
#endif

#if V3D_VERSION >= 40
                shader.do_scoreboard_wait_on_first_thread_switch =
                        v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw;
                shader.disable_implicit_point_line_varyings =
                        !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings;
#endif

                shader.number_of_varyings_in_fragment_shader =
                        v3d->prog.fs->prog_data.fs->num_inputs;

                shader.coordinate_shader_propagate_nans = true;
                shader.vertex_shader_propagate_nans = true;
                shader.fragment_shader_propagate_nans = true;

                shader.coordinate_shader_code_address =
                        cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
                                   v3d->prog.cs->offset);
                shader.vertex_shader_code_address =
                        cl_address(v3d_resource(v3d->prog.vs->resource)->bo,
                                   v3d->prog.vs->offset);
                shader.fragment_shader_code_address =
                        cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
                                   v3d->prog.fs->offset);

                /* XXX: Use combined input/output size flag in the common
                 * case.
                 */
                shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
                        v3d->prog.cs->prog_data.vs->separate_segments;
                shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
                        v3d->prog.vs->prog_data.vs->separate_segments;

                shader.coordinate_shader_input_vpm_segment_size =
                        v3d->prog.cs->prog_data.vs->separate_segments ?
                        v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
                shader.vertex_shader_input_vpm_segment_size =
                        v3d->prog.vs->prog_data.vs->separate_segments ?
                        v3d->prog.vs->prog_data.vs->vpm_input_size : 1;

                shader.coordinate_shader_output_vpm_segment_size =
                        v3d->prog.cs->prog_data.vs->vpm_output_size;
                shader.vertex_shader_output_vpm_segment_size =
                        v3d->prog.vs->prog_data.vs->vpm_output_size;

                shader.coordinate_shader_uniforms_address = cs_uniforms;
                shader.vertex_shader_uniforms_address = vs_uniforms;
                shader.fragment_shader_uniforms_address = fs_uniforms;

#if V3D_VERSION >= 41
                shader.min_coord_shader_input_segments_required_in_play =
                        vpm_cfg_bin.As;
                shader.min_vertex_shader_input_segments_required_in_play =
                        vpm_cfg.As;

                shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
                        vpm_cfg_bin.Ve;
                shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
                        vpm_cfg.Ve;

                shader.coordinate_shader_4_way_threadable =
                        v3d->prog.cs->prog_data.vs->base.threads == 4;
                shader.vertex_shader_4_way_threadable =
                        v3d->prog.vs->prog_data.vs->base.threads == 4;
                shader.fragment_shader_4_way_threadable =
                        v3d->prog.fs->prog_data.fs->base.threads == 4;

                shader.coordinate_shader_start_in_final_thread_section =
                        v3d->prog.cs->prog_data.vs->base.single_seg;
                shader.vertex_shader_start_in_final_thread_section =
                        v3d->prog.vs->prog_data.vs->base.single_seg;
                shader.fragment_shader_start_in_final_thread_section =
                        v3d->prog.fs->prog_data.fs->base.single_seg;
#else
                shader.coordinate_shader_4_way_threadable =
                        v3d->prog.cs->prog_data.vs->base.threads == 4;
                shader.coordinate_shader_2_way_threadable =
                        v3d->prog.cs->prog_data.vs->base.threads == 2;
                shader.vertex_shader_4_way_threadable =
                        v3d->prog.vs->prog_data.vs->base.threads == 4;
                shader.vertex_shader_2_way_threadable =
                        v3d->prog.vs->prog_data.vs->base.threads == 2;
                shader.fragment_shader_4_way_threadable =
                        v3d->prog.fs->prog_data.fs->base.threads == 4;
                shader.fragment_shader_2_way_threadable =
                        v3d->prog.fs->prog_data.fs->base.threads == 2;
#endif

                shader.vertex_id_read_by_coordinate_shader =
                        v3d->prog.cs->prog_data.vs->uses_vid;
                shader.instance_id_read_by_coordinate_shader =
                        v3d->prog.cs->prog_data.vs->uses_iid;
                shader.vertex_id_read_by_vertex_shader =
                        v3d->prog.vs->prog_data.vs->uses_vid;
                shader.instance_id_read_by_vertex_shader =
                        v3d->prog.vs->prog_data.vs->uses_iid;

                shader.address_of_default_attribute_values =
                        cl_address(v3d_resource(vtx->defaults)->bo,
                                   vtx->defaults_offset);
        }

        bool cs_loaded_any = false;
        for (int i = 0; i < vtx->num_elements; i++) {
                struct pipe_vertex_element *elem = &vtx->pipe[i];
                struct pipe_vertex_buffer *vb =
                        &vertexbuf->vb[elem->vertex_buffer_index];
                struct v3d_resource *rsc = v3d_resource(vb->buffer.resource);

                if (!rsc)
                        continue;

                const uint32_t size =
                        cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
                cl_emit_with_prepacked(&job->indirect,
                                       GL_SHADER_STATE_ATTRIBUTE_RECORD,
                                       &vtx->attrs[i * size], attr) {
                        attr.stride = vb->stride;
                        attr.address = cl_address(rsc->bo,
                                                  vb->buffer_offset +
                                                  elem->src_offset);
                        attr.number_of_values_read_by_coordinate_shader =
                                v3d->prog.cs->prog_data.vs->vattr_sizes[i];
                        attr.number_of_values_read_by_vertex_shader =
                                v3d->prog.vs->prog_data.vs->vattr_sizes[i];

                        /* GFXH-930: At least one attribute must be enabled
                         * and read by CS and VS. If we have attributes being
                         * consumed by the VS but not the CS, then set up a
                         * dummy load of the last attribute into the CS's VPM
                         * inputs. (Since CS is just dead-code-elimination
                         * compared to VS, we can't have CS loading but not
                         * VS).
                         */
                        if (v3d->prog.cs->prog_data.vs->vattr_sizes[i])
                                cs_loaded_any = true;
                        if (i == vtx->num_elements - 1 && !cs_loaded_any) {
                                attr.number_of_values_read_by_coordinate_shader = 1;
                        }
#if V3D_VERSION >= 41
                        attr.maximum_index = 0xffffff;
#endif
                }
                STATIC_ASSERT(sizeof(vtx->attrs) >= V3D_MAX_VS_INPUTS / 4 * size);
        }

        if (num_elements_to_emit == 0) {
                /* GFXH-930: At least one attribute must be enabled and read
                 * by CS and VS. If we have no attributes being consumed by
                 * the shader, set up a dummy to be loaded into the VPM.
                 */
                cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
                        /* Valid address of data whose value will be unused. */
                        attr.address = cl_address(job->indirect.bo, 0);

                        attr.type = ATTRIBUTE_FLOAT;
                        attr.stride = 0;
                        attr.vec_size = 1;

                        attr.number_of_values_read_by_coordinate_shader = 1;
                        attr.number_of_values_read_by_vertex_shader = 1;
                }
                num_elements_to_emit = 1;
        }

        cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
                vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc;
                vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc;
        }

#if V3D_VERSION >= 41
        if (v3d->prog.gs) {
                cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
                        state.address = cl_address(job->indirect.bo,
                                                   shader_rec_offset);
                        state.number_of_attribute_arrays = num_elements_to_emit;
                }
        } else {
                cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                        state.address = cl_address(job->indirect.bo,
                                                   shader_rec_offset);
                        state.number_of_attribute_arrays = num_elements_to_emit;
                }
        }
#else
        assert(!v3d->prog.gs);
        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
                state.address = cl_address(job->indirect.bo, shader_rec_offset);
                state.number_of_attribute_arrays = num_elements_to_emit;
        }
#endif

        /* The uniform streams were referenced by the emitted records above;
         * drop our local references now.
         */
        v3d_bo_unreference(&cs_uniforms.bo);
        v3d_bo_unreference(&vs_uniforms.bo);
        if (gs_uniforms.bo)
                v3d_bo_unreference(&gs_uniforms.bo);
        if (gs_bin_uniforms.bo)
                v3d_bo_unreference(&gs_bin_uniforms.bo);
        v3d_bo_unreference(&fs_uniforms.bo);
}

/**
 * Updates the number of primitives generated from the number of vertices
 * to draw. This only works when no GS is present, since otherwise the number
 * of primitives generated cannot be determined in advance and we need to
 * use the PRIMITIVE_COUNTS_FEEDBACK command instead, however, that requires
 * a sync wait for the draw to complete, so we only use that when GS is present.
 */
static void
v3d_update_primitives_generated_counter(struct v3d_context *v3d,
                                        const struct pipe_draw_info *info,
                                        const struct pipe_draw_start_count_bias *draw)
{
        assert(!v3d->prog.gs);

        if (!v3d->active_queries)
                return;

        uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
        v3d->prims_generated += prims;
}

/**
 * Folds the current Z/S state's early-Z classification into the job's EZ
 * state, disabling EZ for the rest of the job when directions conflict or
 * the FS writes Z.
 */
static void
v3d_update_job_ez(struct v3d_context *v3d, struct v3d_job *job)
{
        switch (v3d->zsa->ez_state) {
        case V3D_EZ_UNDECIDED:
                /* If the Z/S state didn't pick a direction but didn't
                 * disable, then go along with the current EZ state. This
                 * allows EZ optimization for Z func == EQUAL or NEVER.
                 */
                break;

        case V3D_EZ_LT_LE:
        case V3D_EZ_GT_GE:
                /* If the Z/S state picked a direction, then it needs to match
                 * the current direction if we've decided on one.
                 */
                if (job->ez_state == V3D_EZ_UNDECIDED)
                        job->ez_state = v3d->zsa->ez_state;
                else if (job->ez_state != v3d->zsa->ez_state)
                        job->ez_state = V3D_EZ_DISABLED;
                break;

        case V3D_EZ_DISABLED:
                /* If the current Z/S state disables EZ because of a bad Z
                 * func or stencil operation, then we can't do any more EZ in
                 * this frame.
                 */
                job->ez_state = V3D_EZ_DISABLED;
                break;
        }

        /* If the FS affects the Z of the pixels, then it may update against
         * the chosen EZ direction (though we could use
         * ARB_conservative_depth's hints to avoid this)
         */
        if (v3d->prog.fs->prog_data.fs->writes_z) {
                job->ez_state = V3D_EZ_DISABLED;
        }

        /* Remember the first non-disabled EZ decision made in this job (or
         * the state at the first draw), for the RCL setup.
         */
        if (job->first_ez_state == V3D_EZ_UNDECIDED &&
            (job->ez_state != V3D_EZ_DISABLED || job->draw_calls_queued == 0))
                job->first_ez_state = job->ez_state;
}

/**
 * Translates a gallium primitive type to the hardware primitive type.  The
 * basic types map 1:1; adjacency types start at 8 in the hardware encoding.
 */
static uint32_t
v3d_hw_prim_type(enum pipe_prim_type prim_type)
{
        switch (prim_type) {
        case PIPE_PRIM_POINTS:
        case PIPE_PRIM_LINES:
        case PIPE_PRIM_LINE_LOOP:
        case PIPE_PRIM_LINE_STRIP:
        case PIPE_PRIM_TRIANGLES:
        case PIPE_PRIM_TRIANGLE_STRIP:
        case PIPE_PRIM_TRIANGLE_FAN:
                return prim_type;

        case PIPE_PRIM_LINES_ADJACENCY:
        case PIPE_PRIM_LINE_STRIP_ADJACENCY:
        case PIPE_PRIM_TRIANGLES_ADJACENCY:
        case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
                return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY);

        default:
                unreachable("Unsupported primitive type");
        }
}

/**
 * Returns false (warning once per stage) if any bound shader stage failed
 * to compile, so the draw can be skipped instead of submitting garbage.
 */
static bool
v3d_check_compiled_shaders(struct v3d_context *v3d)
{
        /* One warn-once flag per MESA_SHADER_* graphics stage. */
        static bool warned[5] = { 0 };

        uint32_t failed_stage = MESA_SHADER_NONE;
        if (!v3d->prog.vs->resource || !v3d->prog.cs->resource) {
                failed_stage = MESA_SHADER_VERTEX;
        } else if ((v3d->prog.gs_bin && !v3d->prog.gs_bin->resource) ||
                   (v3d->prog.gs && !v3d->prog.gs->resource)) {
                failed_stage = MESA_SHADER_GEOMETRY;
        } else if (v3d->prog.fs && !v3d->prog.fs->resource) {
                failed_stage = MESA_SHADER_FRAGMENT;
        }

        if (likely(failed_stage == MESA_SHADER_NONE))
                return true;

        if (!warned[failed_stage]) {
                fprintf(stderr,
                        "%s shader failed to compile. Expect corruption.\n",
                        _mesa_shader_stage_to_string(failed_stage));
                warned[failed_stage] = true;
        }
        return false;
}

/* pipe_context::draw_vbo implementation. */
static void
v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
             unsigned drawid_offset,
             const struct pipe_draw_indirect_info *indirect,
             const struct pipe_draw_start_count_bias *draws,
             unsigned num_draws)
{
        /* We only handle a single draw here; split multi-draws up front. */
        if (num_draws > 1) {
                util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
                return;
        }

        if (!indirect && (!draws[0].count || !info->instance_count))
                return;

        struct v3d_context *v3d = v3d_context(pctx);

        if (!indirect &&
            !info->primitive_restart &&
            !u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count))
                return;

        /* Fall back for weird desktop GL primitive restart values. */
        if (info->primitive_restart &&
            info->index_size) {
                uint32_t mask = util_prim_restart_index_from_size(info->index_size);
                if (info->restart_index != mask) {
                        util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect, &draws[0]);
                        return;
                }
        }

        /* Before setting up the draw, flush anything writing to the resources
         * that we read from or reading from resources we write to.
         */
        for (int s = 0; s < PIPE_SHADER_COMPUTE; s++)
                v3d_predraw_check_stage_inputs(pctx, s);

        if (indirect && indirect->buffer) {
                v3d_flush_jobs_writing_resource(v3d, indirect->buffer,
                                                V3D_FLUSH_DEFAULT, false);
        }

        v3d_predraw_check_outputs(pctx);

        /* If transform feedback is active and we are switching primitive type
         * we need to submit the job before drawing and update the vertex count
         * written to TF based on the primitive type since we will need to
         * know the exact vertex count if the application decides to call
         * glDrawTransformFeedback() later.
992 */ 993 if (v3d->streamout.num_targets > 0 && 994 u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) { 995 v3d_update_primitive_counters(v3d); 996 } 997 998 struct v3d_job *job = v3d_get_job_for_fbo(v3d); 999 1000 /* If vertex texturing depends on the output of rendering, we need to 1001 * ensure that that rendering is complete before we run a coordinate 1002 * shader that depends on it. 1003 * 1004 * Given that doing that is unusual, for now we just block the binner 1005 * on the last submitted render, rather than tracking the last 1006 * rendering to each texture's BO. 1007 */ 1008 if (v3d->tex[PIPE_SHADER_VERTEX].num_textures || (indirect && indirect->buffer)) { 1009 perf_debug("Blocking binner on last render " 1010 "due to vertex texturing or indirect drawing.\n"); 1011 job->submit.in_sync_bcl = v3d->out_sync; 1012 } 1013 1014 /* We also need to ensure that compute is complete when render depends 1015 * on resources written by it. 1016 */ 1017 if (v3d->sync_on_last_compute_job) { 1018 job->submit.in_sync_bcl = v3d->out_sync; 1019 v3d->sync_on_last_compute_job = false; 1020 } 1021 1022 /* Mark SSBOs and images as being written. We don't actually know 1023 * which ones are read vs written, so just assume the worst. 1024 */ 1025 for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) { 1026 u_foreach_bit(i, v3d->ssbo[s].enabled_mask) { 1027 v3d_job_add_write_resource(job, 1028 v3d->ssbo[s].sb[i].buffer); 1029 job->tmu_dirty_rcl = true; 1030 } 1031 1032 u_foreach_bit(i, v3d->shaderimg[s].enabled_mask) { 1033 v3d_job_add_write_resource(job, 1034 v3d->shaderimg[s].si[i].base.resource); 1035 job->tmu_dirty_rcl = true; 1036 } 1037 } 1038 1039 /* Get space to emit our draw call into the BCL, using a branch to 1040 * jump to a new BO if necessary. 
1041 */ 1042 v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */); 1043 1044 if (v3d->prim_mode != info->mode) { 1045 v3d->prim_mode = info->mode; 1046 v3d->dirty |= V3D_DIRTY_PRIM_MODE; 1047 } 1048 1049 v3d_start_draw(v3d); 1050 v3d_update_compiled_shaders(v3d, info->mode); 1051 if (!v3d_check_compiled_shaders(v3d)) 1052 return; 1053 v3d_update_job_ez(v3d, job); 1054 1055 /* If this job was writing to transform feedback buffers before this 1056 * draw and we are reading from them here, then we need to wait for TF 1057 * to complete before we emit this draw. 1058 * 1059 * Notice this check needs to happen before we emit state for the 1060 * current draw call, where we update job->tf_enabled, so we can ensure 1061 * that we only check TF writes for prior draws. 1062 */ 1063 v3d_emit_wait_for_tf_if_needed(v3d, job); 1064 1065#if V3D_VERSION >= 41 1066 v3d41_emit_state(pctx); 1067#else 1068 v3d33_emit_state(pctx); 1069#endif 1070 1071 if (v3d->dirty & (V3D_DIRTY_VTXBUF | 1072 V3D_DIRTY_VTXSTATE | 1073 V3D_DIRTY_PRIM_MODE | 1074 V3D_DIRTY_RASTERIZER | 1075 V3D_DIRTY_COMPILED_CS | 1076 V3D_DIRTY_COMPILED_VS | 1077 V3D_DIRTY_COMPILED_GS_BIN | 1078 V3D_DIRTY_COMPILED_GS | 1079 V3D_DIRTY_COMPILED_FS | 1080 v3d->prog.cs->uniform_dirty_bits | 1081 v3d->prog.vs->uniform_dirty_bits | 1082 (v3d->prog.gs_bin ? 1083 v3d->prog.gs_bin->uniform_dirty_bits : 0) | 1084 (v3d->prog.gs ? 1085 v3d->prog.gs->uniform_dirty_bits : 0) | 1086 v3d->prog.fs->uniform_dirty_bits)) { 1087 v3d_emit_gl_shader_state(v3d, info); 1088 } 1089 1090 v3d->dirty = 0; 1091 1092 /* The Base Vertex/Base Instance packet sets those values to nonzero 1093 * for the next draw call only. 1094 */ 1095 if ((info->index_size && draws->index_bias) || info->start_instance) { 1096 cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) { 1097 base.base_instance = info->start_instance; 1098 base.base_vertex = info->index_size ? 
draws->index_bias : 0; 1099 } 1100 } 1101 1102 uint32_t prim_tf_enable = 0; 1103#if V3D_VERSION < 40 1104 /* V3D 3.x: The HW only processes transform feedback on primitives 1105 * with the flag set. 1106 */ 1107 if (v3d->streamout.num_targets) 1108 prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS); 1109#endif 1110 1111 if (!v3d->prog.gs) 1112 v3d_update_primitives_generated_counter(v3d, info, &draws[0]); 1113 1114 uint32_t hw_prim_type = v3d_hw_prim_type(info->mode); 1115 if (info->index_size) { 1116 uint32_t index_size = info->index_size; 1117 uint32_t offset = draws[0].start * index_size; 1118 struct pipe_resource *prsc; 1119 if (info->has_user_indices) { 1120 unsigned start_offset = draws[0].start * info->index_size; 1121 prsc = NULL; 1122 u_upload_data(v3d->uploader, start_offset, 1123 draws[0].count * info->index_size, 4, 1124 (char*)info->index.user + start_offset, 1125 &offset, &prsc); 1126 } else { 1127 prsc = info->index.resource; 1128 } 1129 struct v3d_resource *rsc = v3d_resource(prsc); 1130 1131#if V3D_VERSION >= 40 1132 cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) { 1133 ib.address = cl_address(rsc->bo, 0); 1134 ib.size = rsc->bo->size; 1135 } 1136#endif 1137 1138 if (indirect && indirect->buffer) { 1139 cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) { 1140 prim.index_type = ffs(info->index_size) - 1; 1141#if V3D_VERSION < 40 1142 prim.address_of_indices_list = 1143 cl_address(rsc->bo, offset); 1144#endif /* V3D_VERSION < 40 */ 1145 prim.mode = hw_prim_type | prim_tf_enable; 1146 prim.enable_primitive_restarts = info->primitive_restart; 1147 1148 prim.number_of_draw_indirect_indexed_records = indirect->draw_count; 1149 1150 prim.stride_in_multiples_of_4_bytes = indirect->stride >> 2; 1151 prim.address = cl_address(v3d_resource(indirect->buffer)->bo, 1152 indirect->offset); 1153 } 1154 } else if (info->instance_count > 1) { 1155 cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) { 1156 prim.index_type = ffs(info->index_size) - 1; 
1157#if V3D_VERSION >= 40 1158 prim.index_offset = offset; 1159#else /* V3D_VERSION < 40 */ 1160 prim.maximum_index = (1u << 31) - 1; /* XXX */ 1161 prim.address_of_indices_list = 1162 cl_address(rsc->bo, offset); 1163#endif /* V3D_VERSION < 40 */ 1164 prim.mode = hw_prim_type | prim_tf_enable; 1165 prim.enable_primitive_restarts = info->primitive_restart; 1166 1167 prim.number_of_instances = info->instance_count; 1168 prim.instance_length = draws[0].count; 1169 } 1170 } else { 1171 cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) { 1172 prim.index_type = ffs(info->index_size) - 1; 1173 prim.length = draws[0].count; 1174#if V3D_VERSION >= 40 1175 prim.index_offset = offset; 1176#else /* V3D_VERSION < 40 */ 1177 prim.maximum_index = (1u << 31) - 1; /* XXX */ 1178 prim.address_of_indices_list = 1179 cl_address(rsc->bo, offset); 1180#endif /* V3D_VERSION < 40 */ 1181 prim.mode = hw_prim_type | prim_tf_enable; 1182 prim.enable_primitive_restarts = info->primitive_restart; 1183 } 1184 } 1185 1186 if (info->has_user_indices) 1187 pipe_resource_reference(&prsc, NULL); 1188 } else { 1189 if (indirect && indirect->buffer) { 1190 cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) { 1191 prim.mode = hw_prim_type | prim_tf_enable; 1192 prim.number_of_draw_indirect_array_records = indirect->draw_count; 1193 1194 prim.stride_in_multiples_of_4_bytes = indirect->stride >> 2; 1195 prim.address = cl_address(v3d_resource(indirect->buffer)->bo, 1196 indirect->offset); 1197 } 1198 } else if (info->instance_count > 1) { 1199 struct pipe_stream_output_target *so = 1200 indirect && indirect->count_from_stream_output ? 1201 indirect->count_from_stream_output : NULL; 1202 uint32_t vert_count = so ? 
1203 v3d_stream_output_target_get_vertex_count(so) : 1204 draws[0].count; 1205 cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) { 1206 prim.mode = hw_prim_type | prim_tf_enable; 1207 prim.index_of_first_vertex = draws[0].start; 1208 prim.number_of_instances = info->instance_count; 1209 prim.instance_length = vert_count; 1210 } 1211 } else { 1212 struct pipe_stream_output_target *so = 1213 indirect && indirect->count_from_stream_output ? 1214 indirect->count_from_stream_output : NULL; 1215 uint32_t vert_count = so ? 1216 v3d_stream_output_target_get_vertex_count(so) : 1217 draws[0].count; 1218 cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) { 1219 prim.mode = hw_prim_type | prim_tf_enable; 1220 prim.length = vert_count; 1221 prim.index_of_first_vertex = draws[0].start; 1222 } 1223 } 1224 } 1225 1226 /* A flush is required in between a TF draw and any following TF specs 1227 * packet, or the GPU may hang. Just flush each time for now. 1228 */ 1229 if (v3d->streamout.num_targets) 1230 cl_emit(&job->bcl, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT, flush); 1231 1232 job->draw_calls_queued++; 1233 if (v3d->streamout.num_targets) 1234 job->tf_draw_calls_queued++; 1235 1236 /* Increment the TF offsets by how many verts we wrote. XXX: This 1237 * needs some clamping to the buffer size. 
1238 */ 1239 for (int i = 0; i < v3d->streamout.num_targets; i++) 1240 v3d->streamout.offsets[i] += draws[0].count; 1241 1242 if (v3d->zsa && job->zsbuf && v3d->zsa->base.depth_enabled) { 1243 struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture); 1244 v3d_job_add_bo(job, rsc->bo); 1245 1246 job->load |= PIPE_CLEAR_DEPTH & ~job->clear; 1247 if (v3d->zsa->base.depth_writemask) 1248 job->store |= PIPE_CLEAR_DEPTH; 1249 rsc->initialized_buffers = PIPE_CLEAR_DEPTH; 1250 } 1251 1252 if (v3d->zsa && job->zsbuf && v3d->zsa->base.stencil[0].enabled) { 1253 struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture); 1254 if (rsc->separate_stencil) 1255 rsc = rsc->separate_stencil; 1256 1257 v3d_job_add_bo(job, rsc->bo); 1258 1259 job->load |= PIPE_CLEAR_STENCIL & ~job->clear; 1260 if (v3d->zsa->base.stencil[0].writemask || 1261 v3d->zsa->base.stencil[1].writemask) { 1262 job->store |= PIPE_CLEAR_STENCIL; 1263 } 1264 rsc->initialized_buffers |= PIPE_CLEAR_STENCIL; 1265 } 1266 1267 for (int i = 0; i < job->nr_cbufs; i++) { 1268 uint32_t bit = PIPE_CLEAR_COLOR0 << i; 1269 int blend_rt = v3d->blend->base.independent_blend_enable ? i : 0; 1270 1271 if (job->store & bit || !job->cbufs[i]) 1272 continue; 1273 struct v3d_resource *rsc = v3d_resource(job->cbufs[i]->texture); 1274 1275 job->load |= bit & ~job->clear; 1276 if (v3d->blend->base.rt[blend_rt].colormask) 1277 job->store |= bit; 1278 v3d_job_add_bo(job, rsc->bo); 1279 } 1280 1281 if (job->referenced_size > 768 * 1024 * 1024) { 1282 perf_debug("Flushing job with %dkb to try to free up memory\n", 1283 job->referenced_size / 1024); 1284 v3d_flush(pctx); 1285 } 1286 1287 if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH)) 1288 v3d_flush(pctx); 1289} 1290 1291#if V3D_VERSION >= 41 1292#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16 1293#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0 1294/* Allow this dispatch to start while the last one is still running. 
*/ 1295#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26) 1296/* Maximum supergroup ID. 6 bits. */ 1297#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20 1298/* Batches per supergroup minus 1. 8 bits. */ 1299#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12 1300/* Workgroups per supergroup, 0 means 16 */ 1301#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8 1302#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0 1303 1304#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2) 1305#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1) 1306#define V3D_CSD_CFG5_THREADING (1 << 0) 1307 1308static void 1309v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info) 1310{ 1311 struct v3d_context *v3d = v3d_context(pctx); 1312 struct v3d_screen *screen = v3d->screen; 1313 1314 v3d_predraw_check_stage_inputs(pctx, PIPE_SHADER_COMPUTE); 1315 1316 v3d_update_compiled_cs(v3d); 1317 1318 if (!v3d->prog.compute->resource) { 1319 static bool warned = false; 1320 if (!warned) { 1321 fprintf(stderr, 1322 "Compute shader failed to compile. " 1323 "Expect corruption.\n"); 1324 warned = true; 1325 } 1326 return; 1327 } 1328 1329 /* Some of the units of scale: 1330 * 1331 * - Batches of 16 work items (shader invocations) that will be queued 1332 * to the run on a QPU at once. 1333 * 1334 * - Workgroups composed of work items based on the shader's layout 1335 * declaration. 1336 * 1337 * - Supergroups of 1-16 workgroups. There can only be 16 supergroups 1338 * running at a time on the core, so we want to keep them large to 1339 * keep the QPUs busy, but a whole supergroup will sync at a barrier 1340 * so we want to keep them small if one is present. 1341 */ 1342 struct drm_v3d_submit_csd submit = { 0 }; 1343 struct v3d_job *job = v3d_job_create(v3d); 1344 1345 /* Set up the actual number of workgroups, synchronously mapping the 1346 * indirect buffer if necessary to get the dimensions. 
1347 */ 1348 if (info->indirect) { 1349 struct pipe_transfer *transfer; 1350 uint32_t *map = pipe_buffer_map_range(pctx, info->indirect, 1351 info->indirect_offset, 1352 3 * sizeof(uint32_t), 1353 PIPE_MAP_READ, 1354 &transfer); 1355 memcpy(v3d->compute_num_workgroups, map, 3 * sizeof(uint32_t)); 1356 pipe_buffer_unmap(pctx, transfer); 1357 1358 if (v3d->compute_num_workgroups[0] == 0 || 1359 v3d->compute_num_workgroups[1] == 0 || 1360 v3d->compute_num_workgroups[2] == 0) { 1361 /* Nothing to dispatch, so skip the draw (CSD can't 1362 * handle 0 workgroups). 1363 */ 1364 return; 1365 } 1366 } else { 1367 v3d->compute_num_workgroups[0] = info->grid[0]; 1368 v3d->compute_num_workgroups[1] = info->grid[1]; 1369 v3d->compute_num_workgroups[2] = info->grid[2]; 1370 } 1371 1372 uint32_t num_wgs = 1; 1373 for (int i = 0; i < 3; i++) { 1374 num_wgs *= v3d->compute_num_workgroups[i]; 1375 submit.cfg[i] |= (v3d->compute_num_workgroups[i] << 1376 V3D_CSD_CFG012_WG_COUNT_SHIFT); 1377 } 1378 1379 uint32_t wg_size = info->block[0] * info->block[1] * info->block[2]; 1380 1381 struct v3d_compute_prog_data *compute = 1382 v3d->prog.compute->prog_data.compute; 1383 uint32_t wgs_per_sg = 1384 v3d_csd_choose_workgroups_per_supergroup( 1385 &v3d->screen->devinfo, 1386 compute->has_subgroups, 1387 compute->base.has_control_barrier, 1388 compute->base.threads, 1389 num_wgs, wg_size); 1390 1391 uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16); 1392 uint32_t whole_sgs = num_wgs / wgs_per_sg; 1393 uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg; 1394 uint32_t num_batches = batches_per_sg * whole_sgs + 1395 DIV_ROUND_UP(rem_wgs * wg_size, 16); 1396 1397 submit.cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT; 1398 submit.cfg[3] |= 1399 (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT; 1400 submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT; 1401 1402 1403 /* Number of batches the dispatch will invoke (minus 1). 
*/ 1404 submit.cfg[4] = num_batches - 1; 1405 1406 /* Make sure we didn't accidentally underflow. */ 1407 assert(submit.cfg[4] != ~0); 1408 1409 v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo); 1410 submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset + 1411 v3d->prog.compute->offset); 1412 submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS; 1413 if (v3d->prog.compute->prog_data.base->single_seg) 1414 submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG; 1415 if (v3d->prog.compute->prog_data.base->threads == 4) 1416 submit.cfg[5] |= V3D_CSD_CFG5_THREADING; 1417 1418 if (v3d->prog.compute->prog_data.compute->shared_size) { 1419 v3d->compute_shared_memory = 1420 v3d_bo_alloc(v3d->screen, 1421 v3d->prog.compute->prog_data.compute->shared_size * 1422 wgs_per_sg, 1423 "shared_vars"); 1424 } 1425 1426 struct v3d_cl_reloc uniforms = v3d_write_uniforms(v3d, job, 1427 v3d->prog.compute, 1428 PIPE_SHADER_COMPUTE); 1429 v3d_job_add_bo(job, uniforms.bo); 1430 submit.cfg[6] = uniforms.bo->offset + uniforms.offset; 1431 1432 /* Pull some job state that was stored in a SUBMIT_CL struct out to 1433 * our SUBMIT_CSD struct 1434 */ 1435 submit.bo_handles = job->submit.bo_handles; 1436 submit.bo_handle_count = job->submit.bo_handle_count; 1437 1438 /* Serialize this in the rest of our command stream. */ 1439 submit.in_sync = v3d->out_sync; 1440 submit.out_sync = v3d->out_sync; 1441 1442 if (v3d->active_perfmon) { 1443 assert(screen->has_perfmon); 1444 submit.perfmon_id = v3d->active_perfmon->kperfmon_id; 1445 } 1446 1447 v3d->last_perfmon = v3d->active_perfmon; 1448 1449 if (!(unlikely(V3D_DEBUG & V3D_DEBUG_NORAST))) { 1450 int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD, 1451 &submit); 1452 static bool warned = false; 1453 if (ret && !warned) { 1454 fprintf(stderr, "CSD submit call returned %s. 
" 1455 "Expect corruption.\n", strerror(errno)); 1456 warned = true; 1457 } else if (!ret) { 1458 if (v3d->active_perfmon) 1459 v3d->active_perfmon->job_submitted = true; 1460 } 1461 } 1462 1463 v3d_job_free(v3d, job); 1464 1465 /* Mark SSBOs as being written.. we don't actually know which ones are 1466 * read vs written, so just assume the worst 1467 */ 1468 u_foreach_bit(i, v3d->ssbo[PIPE_SHADER_COMPUTE].enabled_mask) { 1469 struct v3d_resource *rsc = v3d_resource( 1470 v3d->ssbo[PIPE_SHADER_COMPUTE].sb[i].buffer); 1471 rsc->writes++; 1472 rsc->compute_written = true; 1473 } 1474 1475 u_foreach_bit(i, v3d->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) { 1476 struct v3d_resource *rsc = v3d_resource( 1477 v3d->shaderimg[PIPE_SHADER_COMPUTE].si[i].base.resource); 1478 rsc->writes++; 1479 rsc->compute_written = true; 1480 } 1481 1482 v3d_bo_unreference(&uniforms.bo); 1483 v3d_bo_unreference(&v3d->compute_shared_memory); 1484} 1485#endif 1486 1487/** 1488 * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles. 1489 */ 1490static void 1491v3d_draw_clear(struct v3d_context *v3d, 1492 unsigned buffers, 1493 const union pipe_color_union *color, 1494 double depth, unsigned stencil) 1495{ 1496 static const union pipe_color_union dummy_color = {}; 1497 1498 /* The blitter util dereferences the color regardless, even though the 1499 * gallium clear API may not pass one in when only Z/S are cleared. 1500 */ 1501 if (!color) 1502 color = &dummy_color; 1503 1504 v3d_blitter_save(v3d); 1505 util_blitter_clear(v3d->blitter, 1506 v3d->framebuffer.width, 1507 v3d->framebuffer.height, 1508 util_framebuffer_get_num_layers(&v3d->framebuffer), 1509 buffers, color, depth, stencil, 1510 util_framebuffer_get_num_samples(&v3d->framebuffer) > 1); 1511} 1512 1513/** 1514 * Attempts to perform the GL clear by using the TLB's fast clear at the start 1515 * of the frame. 
1516 */ 1517static unsigned 1518v3d_tlb_clear(struct v3d_job *job, unsigned buffers, 1519 const union pipe_color_union *color, 1520 double depth, unsigned stencil) 1521{ 1522 struct v3d_context *v3d = job->v3d; 1523 1524 if (job->draw_calls_queued) { 1525 /* If anything in the CL has drawn using the buffer, then the 1526 * TLB clear we're trying to add now would happen before that 1527 * drawing. 1528 */ 1529 buffers &= ~(job->load | job->store); 1530 } 1531 1532 /* GFXH-1461: If we were to emit a load of just depth or just stencil, 1533 * then the clear for the other may get lost. We need to decide now 1534 * if it would be possible to need to emit a load of just one after 1535 * we've set up our TLB clears. 1536 */ 1537 if (buffers & PIPE_CLEAR_DEPTHSTENCIL && 1538 (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && 1539 job->zsbuf && 1540 util_format_is_depth_and_stencil(job->zsbuf->texture->format)) { 1541 buffers &= ~PIPE_CLEAR_DEPTHSTENCIL; 1542 } 1543 1544 for (int i = 0; i < job->nr_cbufs; i++) { 1545 uint32_t bit = PIPE_CLEAR_COLOR0 << i; 1546 if (!(buffers & bit)) 1547 continue; 1548 1549 struct pipe_surface *psurf = v3d->framebuffer.cbufs[i]; 1550 struct v3d_surface *surf = v3d_surface(psurf); 1551 struct v3d_resource *rsc = v3d_resource(psurf->texture); 1552 1553 union util_color uc; 1554 uint32_t internal_size = 4 << surf->internal_bpp; 1555 1556 static union pipe_color_union swapped_color; 1557 if (v3d->swap_color_rb & (1 << i)) { 1558 swapped_color.f[0] = color->f[2]; 1559 swapped_color.f[1] = color->f[1]; 1560 swapped_color.f[2] = color->f[0]; 1561 swapped_color.f[3] = color->f[3]; 1562 color = &swapped_color; 1563 } 1564 1565 switch (surf->internal_type) { 1566 case V3D_INTERNAL_TYPE_8: 1567 util_pack_color(color->f, PIPE_FORMAT_R8G8B8A8_UNORM, 1568 &uc); 1569 memcpy(job->clear_color[i], uc.ui, internal_size); 1570 break; 1571 case V3D_INTERNAL_TYPE_8I: 1572 case V3D_INTERNAL_TYPE_8UI: 1573 job->clear_color[i][0] = ((color->ui[0] & 
0xff) | 1574 (color->ui[1] & 0xff) << 8 | 1575 (color->ui[2] & 0xff) << 16 | 1576 (color->ui[3] & 0xff) << 24); 1577 break; 1578 case V3D_INTERNAL_TYPE_16F: 1579 util_pack_color(color->f, PIPE_FORMAT_R16G16B16A16_FLOAT, 1580 &uc); 1581 memcpy(job->clear_color[i], uc.ui, internal_size); 1582 break; 1583 case V3D_INTERNAL_TYPE_16I: 1584 case V3D_INTERNAL_TYPE_16UI: 1585 job->clear_color[i][0] = ((color->ui[0] & 0xffff) | 1586 color->ui[1] << 16); 1587 job->clear_color[i][1] = ((color->ui[2] & 0xffff) | 1588 color->ui[3] << 16); 1589 break; 1590 case V3D_INTERNAL_TYPE_32F: 1591 case V3D_INTERNAL_TYPE_32I: 1592 case V3D_INTERNAL_TYPE_32UI: 1593 memcpy(job->clear_color[i], color->ui, internal_size); 1594 break; 1595 } 1596 1597 rsc->initialized_buffers |= bit; 1598 } 1599 1600 unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL; 1601 if (zsclear) { 1602 struct v3d_resource *rsc = 1603 v3d_resource(v3d->framebuffer.zsbuf->texture); 1604 1605 if (zsclear & PIPE_CLEAR_DEPTH) 1606 job->clear_z = depth; 1607 if (zsclear & PIPE_CLEAR_STENCIL) 1608 job->clear_s = stencil; 1609 1610 rsc->initialized_buffers |= zsclear; 1611 } 1612 1613 job->draw_min_x = 0; 1614 job->draw_min_y = 0; 1615 job->draw_max_x = v3d->framebuffer.width; 1616 job->draw_max_y = v3d->framebuffer.height; 1617 job->clear |= buffers; 1618 job->store |= buffers; 1619 job->scissor.disabled = true; 1620 1621 v3d_start_draw(v3d); 1622 1623 return buffers; 1624} 1625 1626static void 1627v3d_clear(struct pipe_context *pctx, unsigned buffers, const struct pipe_scissor_state *scissor_state, 1628 const union pipe_color_union *color, double depth, unsigned stencil) 1629{ 1630 struct v3d_context *v3d = v3d_context(pctx); 1631 struct v3d_job *job = v3d_get_job_for_fbo(v3d); 1632 1633 buffers &= ~v3d_tlb_clear(job, buffers, color, depth, stencil); 1634 1635 if (buffers) 1636 v3d_draw_clear(v3d, buffers, color, depth, stencil); 1637} 1638 1639static void 1640v3d_clear_render_target(struct pipe_context *pctx, struct 
pipe_surface *ps, 1641 const union pipe_color_union *color, 1642 unsigned x, unsigned y, unsigned w, unsigned h, 1643 bool render_condition_enabled) 1644{ 1645 fprintf(stderr, "unimpl: clear RT\n"); 1646} 1647 1648static void 1649v3d_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps, 1650 unsigned buffers, double depth, unsigned stencil, 1651 unsigned x, unsigned y, unsigned w, unsigned h, 1652 bool render_condition_enabled) 1653{ 1654 fprintf(stderr, "unimpl: clear DS\n"); 1655} 1656 1657void 1658v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job) 1659{ 1660 v3d_start_binning(v3d, job); 1661} 1662 1663void 1664v3dX(draw_init)(struct pipe_context *pctx) 1665{ 1666 pctx->draw_vbo = v3d_draw_vbo; 1667 pctx->clear = v3d_clear; 1668 pctx->clear_render_target = v3d_clear_render_target; 1669 pctx->clear_depth_stencil = v3d_clear_depth_stencil; 1670#if V3D_VERSION >= 41 1671 if (v3d_context(pctx)->screen->has_csd) 1672 pctx->launch_grid = v3d_launch_grid; 1673#endif 1674} 1675