/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 * Copyright 2018 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 */

#include "si_pipe.h"

#include "util/os_time.h"
#include "util/u_upload_mgr.h"

/* Make sure the current GFX IB has enough room (both buffer-memory budget
 * and command dwords) for the next draw/dispatch; if not, flush it and
 * start a new one. Also resets the per-IB vram/gtt usage accumulators.
 */
void si_need_gfx_cs_space(struct si_context *ctx)
{
	struct radeon_cmdbuf *cs = ctx->gfx_cs;

	/* There is no need to flush the DMA IB here, because
	 * si_need_dma_space always flushes the GFX IB if there is
	 * a conflict, which means any unflushed DMA commands automatically
	 * precede the GFX IB (= they had no dependency on the GFX IB when
	 * they were submitted).
	 */

	/* There are two memory usage counters in the winsys for all buffers
	 * that have been added (cs_add_buffer) and two counters in the pipe
	 * driver for those that haven't been added yet.
	 */
	if (unlikely(!radeon_cs_memory_below_limit(ctx->screen, ctx->gfx_cs,
						   ctx->vram, ctx->gtt))) {
		/* Over the memory budget: reset the accumulators and flush. */
		ctx->gtt = 0;
		ctx->vram = 0;
		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
		return;
	}
	/* The counters are reset on every check, not only on flush — they
	 * track usage not yet handed to the winsys via cs_add_buffer. */
	ctx->gtt = 0;
	ctx->vram = 0;

	/* Not enough command-stream space left for the next draw -> flush. */
	unsigned need_dwords = si_get_minimum_num_gfx_cs_dwords(ctx);
	if (!ctx->ws->cs_check_space(cs, need_dwords))
		si_flush_gfx_cs(ctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW, NULL);
}

/* Drop the src/dst buffer references held by the pending SDMA upload list
 * and empty the list. Called after the uploads have been executed (or when
 * they are abandoned).
 */
void si_unref_sdma_uploads(struct si_context *sctx)
{
	for (unsigned i = 0; i < sctx->num_sdma_uploads; i++) {
		si_resource_reference(&sctx->sdma_uploads[i].dst, NULL);
		si_resource_reference(&sctx->sdma_uploads[i].src, NULL);
	}
	sctx->num_sdma_uploads = 0;
}

/* Submit the current GFX IB to the kernel and begin a new one.
 *
 * \param flags  RADEON_FLUSH_* / PIPE_FLUSH_* bits forwarded to the winsys.
 * \param fence  If non-NULL, receives a reference to the fence of the
 *               submitted IB. Must be NULL for internal (driver-initiated)
 *               flushes — see the assert below.
 *
 * No-op flushes (empty IB and idle GPU) are dropped. The function guards
 * against reentry via gfx_flush_in_progress because si_begin_new_gfx_cs and
 * the SDMA-upload copies below can themselves reach flushing paths.
 */
void si_flush_gfx_cs(struct si_context *ctx, unsigned flags,
		     struct pipe_fence_handle **fence)
{
	struct radeon_cmdbuf *cs = ctx->gfx_cs;
	struct radeon_winsys *ws = ctx->ws;
	unsigned wait_flags = 0;

	if (ctx->gfx_flush_in_progress)
		return;

	/* Decide which end-of-IB waits/invalidations are needed, depending on
	 * what the kernel does for us after IB submission. */
	if (!ctx->screen->info.kernel_flushes_tc_l2_after_ib) {
		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			      SI_CONTEXT_CS_PARTIAL_FLUSH |
			      SI_CONTEXT_INV_GLOBAL_L2;
	} else if (ctx->chip_class == SI) {
		/* The kernel flushes L2 before shaders are finished. */
		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			      SI_CONTEXT_CS_PARTIAL_FLUSH;
	} else if (!(flags & RADEON_FLUSH_START_NEXT_GFX_IB_NOW)) {
		wait_flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			      SI_CONTEXT_CS_PARTIAL_FLUSH;
	}

	/* Drop this flush if it's a no-op: nothing emitted since the last
	 * begin, and either no waits are required or the last IB already
	 * idled the GPU. */
	if (!radeon_emitted(cs, ctx->initial_gfx_cs_size) &&
	    (!wait_flags || !ctx->gfx_last_ib_is_busy))
		return;

	if (si_check_device_reset(ctx))
		return;

	/* CHECK_VM needs the fence waited on below, so the flush must be
	 * synchronous. */
	if (ctx->screen->debug_flags & DBG(CHECK_VM))
		flags &= ~PIPE_FLUSH_ASYNC;

	ctx->gfx_flush_in_progress = true;

	/* If the state tracker is flushing the GFX IB, si_flush_from_st is
	 * responsible for flushing the DMA IB and merging the fences from both.
	 * If the driver flushes the GFX IB internally, it should never ask
	 * for a fence handle.
	 */
	assert(!radeon_emitted(ctx->dma_cs, 0) || fence == NULL);

	/* Update the sdma_uploads list by flushing the uploader. */
	u_upload_unmap(ctx->b.const_uploader);

	/* Execute SDMA uploads. The in_progress flag prevents the copies from
	 * recursing into this flush path. */
	ctx->sdma_uploads_in_progress = true;
	for (unsigned i = 0; i < ctx->num_sdma_uploads; i++) {
		struct si_sdma_upload *up = &ctx->sdma_uploads[i];
		struct pipe_box box;

		/* SDMA transfers here require dword alignment. */
		assert(up->src_offset % 4 == 0 && up->dst_offset % 4 == 0 &&
		       up->size % 4 == 0);

		u_box_1d(up->src_offset, up->size, &box);
		ctx->dma_copy(&ctx->b, &up->dst->b.b, 0, up->dst_offset, 0, 0,
			      &up->src->b.b, 0, &box);
	}
	ctx->sdma_uploads_in_progress = false;
	si_unref_sdma_uploads(ctx);

	/* Flush SDMA (preamble IB). It must land before the GFX IB that
	 * consumes the uploaded data. */
	if (radeon_emitted(ctx->dma_cs, 0))
		si_flush_dma_cs(ctx, flags, NULL);

	if (ctx->has_graphics) {
		/* Suspend queries and streamout; si_begin_new_gfx_cs resumes
		 * them in the next IB. */
		if (!LIST_IS_EMPTY(&ctx->active_queries))
			si_suspend_queries(ctx);

		ctx->streamout.suspended = false;
		if (ctx->streamout.begin_emitted) {
			si_emit_streamout_end(ctx);
			ctx->streamout.suspended = true;
		}
	}

	/* Make sure CP DMA is idle at the end of IBs after L2 prefetches
	 * because the kernel doesn't wait for it. */
	if (ctx->chip_class >= CIK)
		si_cp_dma_wait_for_idle(ctx);

	/* Wait for draw calls to finish if needed. */
	if (wait_flags) {
		ctx->flags |= wait_flags;
		si_emit_cache_flush(ctx);
	}
	/* If we waited, the GPU is idle once this IB retires; otherwise the
	 * next IB may overlap with still-running work. */
	ctx->gfx_last_ib_is_busy = wait_flags == 0;

	if (ctx->current_saved_cs) {
		si_trace_emit(ctx);

		/* Save the IB for debug contexts. */
		si_save_cs(ws, cs, &ctx->current_saved_cs->gfx, true);
		ctx->current_saved_cs->flushed = true;
		ctx->current_saved_cs->time_flush = os_time_get_nano();

		si_log_hw_flush(ctx);
	}

	/* Flush the CS. */
	ws->cs_flush(cs, flags, &ctx->last_gfx_fence);
	if (fence)
		ws->fence_reference(fence, ctx->last_gfx_fence);

	ctx->num_gfx_cs_flushes++;

	/* Check VM faults if needed. */
	if (ctx->screen->debug_flags & DBG(CHECK_VM)) {
		/* Use conservative timeout 800ms, after which we won't wait any
		 * longer and assume the GPU is hung.
		 */
		ctx->ws->fence_wait(ctx->ws, ctx->last_gfx_fence, 800*1000*1000);

		si_check_vm_faults(ctx, &ctx->current_saved_cs->gfx, RING_GFX);
	}

	if (ctx->current_saved_cs)
		si_saved_cs_reference(&ctx->current_saved_cs, NULL);

	si_begin_new_gfx_cs(ctx);
	ctx->gfx_flush_in_progress = false;
}

/* Allocate the saved-CS record and the trace buffer used by debug contexts
 * (IB dumping / trace-id breadcrumbs). On allocation failure the context is
 * simply left without a current_saved_cs; callers check it for NULL.
 */
static void si_begin_gfx_cs_debug(struct si_context *ctx)
{
	static const uint32_t zeros[1];
	assert(!ctx->current_saved_cs);

	ctx->current_saved_cs = calloc(1, sizeof(*ctx->current_saved_cs));
	if (!ctx->current_saved_cs)
		return;

	pipe_reference_init(&ctx->current_saved_cs->reference, 1);

	/* 8-byte staging buffer that receives the trace id written by the GPU. */
	ctx->current_saved_cs->trace_buf = si_resource(
		pipe_buffer_create(ctx->b.screen, 0, PIPE_USAGE_STAGING, 8));
	if (!ctx->current_saved_cs->trace_buf) {
		free(ctx->current_saved_cs);
		ctx->current_saved_cs = NULL;
		return;
	}

	/* Zero the first dword so a never-written trace id reads as 0. */
	pipe_buffer_write_nooverlap(&ctx->b, &ctx->current_saved_cs->trace_buf->b.b,
				    0, sizeof(zeros), zeros);
	ctx->current_saved_cs->trace_id = 0;

	si_trace_emit(ctx);

	radeon_add_to_buffer_list(ctx, ctx->gfx_cs, ctx->current_saved_cs->trace_buf,
				  RADEON_USAGE_READWRITE, RADEON_PRIO_TRACE);
}

/* Initialize a freshly started GFX IB: invalidate caches, re-emit the init
 * state, mark all state atoms dirty, reset last_* draw-state caches, and
 * seed the tracked-register table with either CLEAR_STATE defaults or
 * "unknown" depending on hardware support.
 */
void si_begin_new_gfx_cs(struct si_context *ctx)
{
	if (ctx->is_debug)
		si_begin_gfx_cs_debug(ctx);

	/* Always invalidate caches at the beginning of IBs, because external
	 * users (e.g. BO evictions and SDMA/UVD/VCE IBs) can modify our
	 * buffers.
	 *
	 * Note that the cache flush done by the kernel at the end of GFX IBs
	 * isn't useful here, because that flush can finish after the following
	 * IB starts drawing.
	 *
	 * TODO: Do we also need to invalidate CB & DB caches?
	 */
	ctx->flags |= SI_CONTEXT_INV_ICACHE |
		      SI_CONTEXT_INV_SMEM_L1 |
		      SI_CONTEXT_INV_VMEM_L1 |
		      SI_CONTEXT_INV_GLOBAL_L2 |
		      SI_CONTEXT_START_PIPELINE_STATS;

	ctx->cs_shader_state.initialized = false;
	si_all_descriptors_begin_new_cs(ctx);

	/* Compute-only contexts are done here. */
	if (!ctx->has_graphics) {
		ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;
		return;
	}

	/* Set all valid groups as dirty so they get re-emitted on the
	 * next draw command.
	 */
	si_pm4_reset_emitted(ctx);

	/* The CS initialization should be emitted before everything else. */
	si_pm4_emit(ctx, ctx->init_config);
	if (ctx->init_config_gs_rings)
		si_pm4_emit(ctx, ctx->init_config_gs_rings);

	/* Schedule L2 prefetches for every shader stage that is bound. */
	if (ctx->queued.named.ls)
		ctx->prefetch_L2_mask |= SI_PREFETCH_LS;
	if (ctx->queued.named.hs)
		ctx->prefetch_L2_mask |= SI_PREFETCH_HS;
	if (ctx->queued.named.es)
		ctx->prefetch_L2_mask |= SI_PREFETCH_ES;
	if (ctx->queued.named.gs)
		ctx->prefetch_L2_mask |= SI_PREFETCH_GS;
	if (ctx->queued.named.vs)
		ctx->prefetch_L2_mask |= SI_PREFETCH_VS;
	if (ctx->queued.named.ps)
		ctx->prefetch_L2_mask |= SI_PREFETCH_PS;
	if (ctx->vb_descriptors_buffer && ctx->vertex_elements)
		ctx->prefetch_L2_mask |= SI_PREFETCH_VBO_DESCRIPTORS;

	/* CLEAR_STATE disables all colorbuffers, so only enable bound ones. */
	bool has_clear_state = ctx->screen->has_clear_state;
	if (has_clear_state) {
		ctx->framebuffer.dirty_cbufs =
			u_bit_consecutive(0, ctx->framebuffer.state.nr_cbufs);
		/* CLEAR_STATE disables the zbuffer, so only enable it if it's bound. */
		ctx->framebuffer.dirty_zsbuf = ctx->framebuffer.state.zsbuf != NULL;
	} else {
		/* Without CLEAR_STATE, assume everything needs re-emission. */
		ctx->framebuffer.dirty_cbufs = u_bit_consecutive(0, 8);
		ctx->framebuffer.dirty_zsbuf = true;
	}
	/* This should always be marked as dirty to set the framebuffer scissor
	 * at least. */
	si_mark_atom_dirty(ctx, &ctx->atoms.s.framebuffer);

	si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_regs);
	/* CLEAR_STATE sets zeros. */
	if (!has_clear_state || ctx->clip_state.any_nonzeros)
		si_mark_atom_dirty(ctx, &ctx->atoms.s.clip_state);
	ctx->sample_locs_num_samples = 0;
	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_sample_locs);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.msaa_config);
	/* CLEAR_STATE sets 0xffff. */
	if (!has_clear_state || ctx->sample_mask != 0xffff)
		si_mark_atom_dirty(ctx, &ctx->atoms.s.sample_mask);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.cb_render_state);
	/* CLEAR_STATE sets zeros. */
	if (!has_clear_state || ctx->blend_color.any_nonzeros)
		si_mark_atom_dirty(ctx, &ctx->atoms.s.blend_color);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.db_render_state);
	if (ctx->chip_class >= GFX9)
		si_mark_atom_dirty(ctx, &ctx->atoms.s.dpbb_state);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.stencil_ref);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.spi_map);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.streamout_enable);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.render_cond);
	/* CLEAR_STATE disables all window rectangles. */
	if (!has_clear_state || ctx->num_window_rectangles > 0)
		si_mark_atom_dirty(ctx, &ctx->atoms.s.window_rectangles);

	si_mark_atom_dirty(ctx, &ctx->atoms.s.guardband);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.scissors);
	si_mark_atom_dirty(ctx, &ctx->atoms.s.viewports);

	si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);
	if (ctx->scratch_buffer) {
		si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
	}

	/* Resume streamout that si_flush_gfx_cs suspended, appending to the
	 * existing buffer contents. */
	if (ctx->streamout.suspended) {
		ctx->streamout.append_bitmask = ctx->streamout.enabled_mask;
		si_streamout_buffers_dirty(ctx);
	}

	if (!LIST_IS_EMPTY(&ctx->active_queries))
		si_resume_queries(ctx);

	assert(!ctx->gfx_cs->prev_dw);
	/* Remember the IB size after initialization, so that si_flush_gfx_cs
	 * can detect a no-op flush (nothing emitted beyond this point). */
	ctx->initial_gfx_cs_size = ctx->gfx_cs->current.cdw;

	/* Invalidate various draw states so that they are emitted before
	 * the first draw call. */
	si_invalidate_draw_sh_constants(ctx);
	ctx->last_index_size = -1;
	ctx->last_primitive_restart_en = -1;
	ctx->last_restart_index = SI_RESTART_INDEX_UNKNOWN;
	ctx->last_prim = -1;
	ctx->last_multi_vgt_param = -1;
	ctx->last_rast_prim = -1;
	ctx->last_sc_line_stipple = ~0;
	ctx->last_vs_state = ~0;
	ctx->last_ls = NULL;
	ctx->last_tcs = NULL;
	ctx->last_tes_sh_base = -1;
	ctx->last_num_tcs_input_cp = -1;
	ctx->last_ls_hs_config = -1; /* impossible value */

	if (has_clear_state) {
		/* Seed the tracked-register table with the hardware defaults
		 * that CLEAR_STATE programs, so redundant SET_CONTEXT_REG
		 * packets can be skipped. */
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_COUNT_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_RENDER_OVERRIDE2] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_SHADER_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_CB_TARGET_MASK] = 0xffffffff;
		ctx->tracked_regs.reg_value[SI_TRACKED_CB_DCC_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SX_PS_DOWNCONVERT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_EPSILON] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SX_BLEND_OPT_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_LINE_CNTL] = 0x00001000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_AA_CONFIG] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_EQAA] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_MODE_CNTL_1] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_PRIM_FILTER_CNTL] = 0;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_SMALL_PRIM_FILTER_CNTL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VS_OUT_CNTL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_CLIP_CNTL] = 0x00090000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_BINNER_CNTL_0] = 0x00000003;
		ctx->tracked_regs.reg_value[SI_TRACKED_DB_DFSM_CONTROL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_CLIP_ADJ] = 0x3f800000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_VERT_DISC_ADJ] = 0x3f800000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_CLIP_ADJ] = 0x3f800000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_GB_HORZ_DISC_ADJ] = 0x3f800000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET] = 0;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SU_VTX_CNTL] = 0x00000005;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_SC_CLIPRECT_RULE] = 0xffff;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_ESGS_RING_ITEMSIZE] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_1] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_2] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_OFFSET_3] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_OUT_PRIM_TYPE] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GSVS_RING_ITEMSIZE] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_VERT_OUT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_1] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_2] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_VERT_ITEMSIZE_3] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_INSTANCE_CNT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_ONCHIP_CNTL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MAX_PRIMS_PER_SUBGROUP] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_GS_MODE] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_PRIMITIVEID_EN] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_REUSE_OFF] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_VS_OUT_CONFIG] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_POS_FORMAT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_PA_CL_VTE_CNTL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ENA] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_INPUT_ADDR] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_BARYC_CNTL] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_PS_IN_CONTROL] = 0x00000002;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_Z_FORMAT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_SPI_SHADER_COL_FORMAT] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_CB_SHADER_MASK] = 0xffffffff;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_TF_PARAM] = 0x00000000;
		ctx->tracked_regs.reg_value[SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL] = 0x0000001e; /* From VI */

		/* Set all saved registers state to saved. */
		ctx->tracked_regs.reg_saved = 0xffffffffffffffff;
	} else {
		/* Set all saved registers state to unknown. */
		ctx->tracked_regs.reg_saved = 0;
	}

	/* 0xffffffff is an impossible value for the SPI_PS_INPUT_CNTL_n
	 * registers, so the first real emit always updates them. */
	memset(ctx->tracked_regs.spi_ps_input_cntl, 0xff, sizeof(uint32_t) * 32);
}