/*
 * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *      Jerome Glisse
 */
#include "r600_pipe.h"
#include "r600d.h"
#include "util/u_memory.h"
#include <errno.h>
#include <unistd.h>


void r600_need_cs_space(struct r600_context *ctx, unsigned num_dw,
			boolean count_draw_in, unsigned num_atomics)
{
	/* Flush the DMA IB if it's not empty. */
	if (radeon_emitted(&ctx->b.dma.cs, 0))
		ctx->b.dma.flush(ctx, PIPE_FLUSH_ASYNC, NULL);

	if (!radeon_cs_memory_below_limit(ctx->b.screen, &ctx->b.gfx.cs,
					  ctx->b.vram, ctx->b.gtt)) {
		ctx->b.gtt = 0;
		ctx->b.vram = 0;
		ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
		return;
	}
	/* All will be accounted for once the relocations are emitted. */
	ctx->b.gtt = 0;
	ctx->b.vram = 0;

	/* Check available space in CS. */
	if (count_draw_in) {
		uint64_t mask;

		/* The number of dwords all the dirty states would take. */
		mask = ctx->dirty_atoms;
		while (mask != 0)
			num_dw += ctx->atoms[u_bit_scan64(&mask)]->num_dw;

		/* The upper-bound of how much space a draw command would take. */
		num_dw += R600_MAX_FLUSH_CS_DWORDS + R600_MAX_DRAW_CS_DWORDS;
	}

	/* Atomic counters: 8 dwords pre + 8 post per counter, plus 16 post if any counters are used. */
	num_dw += (num_atomics * 16) + (num_atomics ? 16 : 0);

	/* Count in r600_suspend_queries. */
	num_dw += ctx->b.num_cs_dw_queries_suspend;

	/* Count in streamout_end at the end of CS. */
	if (ctx->b.streamout.begin_emitted) {
		num_dw += ctx->b.streamout.num_dw_for_end;
	}

	/* SX_MISC */
	if (ctx->b.chip_class == R600) {
		num_dw += 3;
	}

	/* Count in framebuffer cache flushes at the end of CS. */
	num_dw += R600_MAX_FLUSH_CS_DWORDS;

	/* The fence at the end of CS. */
	num_dw += 10;

	/* Flush if there's not enough space. */
	if (!ctx->b.ws->cs_check_space(&ctx->b.gfx.cs, num_dw, false)) {
		ctx->b.gfx.flush(ctx, PIPE_FLUSH_ASYNC, NULL);
	}
}
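
/* Emit the cache-flush, partial-flush, and wait packets requested by the
 * flags accumulated in rctx->b.flags, then clear those flags. The exact
 * packets emitted differ per generation (r6xx vs. r7xx+ vs. Cayman+), as
 * noted in the comments below.
 */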
void r600_flush_emit(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;
	unsigned cp_coher_cntl = 0;
	unsigned wait_until = 0;

	if (!rctx->b.flags) {
		return;
	}

	/* Ensure coherency between streamout and shaders. */
	if (rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)
		rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER);

	if (rctx->b.flags & R600_CONTEXT_WAIT_3D_IDLE) {
		wait_until |= S_008040_WAIT_3D_IDLE(1);
	}
	if (rctx->b.flags & R600_CONTEXT_WAIT_CP_DMA_IDLE) {
		wait_until |= S_008040_WAIT_CP_DMA_IDLE(1);
	}

	if (wait_until) {
		/* Use of WAIT_UNTIL is deprecated on Cayman+ */
		if (rctx->b.family >= CHIP_CAYMAN) {
			/* emit a PS partial flush on Cayman/TN */
			rctx->b.flags |= R600_CONTEXT_PS_PARTIAL_FLUSH;
		}
	}

	/* Wait packets must be executed first, because SURFACE_SYNC doesn't
	 * wait for shaders if it's not flushing CB or DB.
	 */
	if (rctx->b.flags & R600_CONTEXT_PS_PARTIAL_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	}

	if (rctx->b.flags & R600_CONTEXT_CS_PARTIAL_FLUSH) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CS_PARTIAL_FLUSH) | EVENT_INDEX(4));
	}

	if (wait_until) {
		/* Use of WAIT_UNTIL is deprecated on Cayman+ */
		if (rctx->b.family < CHIP_CAYMAN) {
			/* wait for things to settle */
			radeon_set_config_reg(cs, R_008040_WAIT_UNTIL, wait_until);
		}
	}

	if (rctx->b.chip_class >= R700 &&
	    (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB_META)) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_CB_META) | EVENT_INDEX(0));
	}

	if (rctx->b.chip_class >= R700 &&
	    (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB_META)) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_FLUSH_AND_INV_DB_META) | EVENT_INDEX(0));

		/* Set FULL_CACHE_ENA for DB META flushes on r7xx and later.
		 *
		 * This hack predates use of FLUSH_AND_INV_DB_META, so it's
		 * unclear whether it's still needed or even whether it has
		 * any effect.
		 */
		cp_coher_cntl |= S_0085F0_FULL_CACHE_ENA(1);
	}

	if (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV ||
	    (rctx->b.chip_class == R600 && rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH)) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_EVENT) | EVENT_INDEX(0));
	}

	if (rctx->b.flags & R600_CONTEXT_INV_CONST_CACHE) {
		/* Direct constant addressing uses the shader cache.
		 * Indirect constant addressing uses the vertex cache. */
		cp_coher_cntl |= S_0085F0_SH_ACTION_ENA(1) |
				 (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
							 : S_0085F0_TC_ACTION_ENA(1));
	}
	if (rctx->b.flags & R600_CONTEXT_INV_VERTEX_CACHE) {
		cp_coher_cntl |= rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1)
							: S_0085F0_TC_ACTION_ENA(1);
	}
	if (rctx->b.flags & R600_CONTEXT_INV_TEX_CACHE) {
		/* Textures use the texture cache.
		 * Texture buffer objects use the vertex cache. */
		cp_coher_cntl |= S_0085F0_TC_ACTION_ENA(1) |
				 (rctx->has_vertex_cache ? S_0085F0_VC_ACTION_ENA(1) : 0);
	}

	/* Don't use the DB CP COHER logic on r6xx.
	 * There are hw bugs.
	 */
	if (rctx->b.chip_class >= R700 &&
	    (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_DB)) {
		cp_coher_cntl |= S_0085F0_DB_ACTION_ENA(1) |
				 S_0085F0_DB_DEST_BASE_ENA(1) |
				 S_0085F0_SMX_ACTION_ENA(1);
	}

	/* Don't use the CB CP COHER logic on r6xx.
	 * There are hw bugs.
	 */
	if (rctx->b.chip_class >= R700 &&
	    (rctx->b.flags & R600_CONTEXT_FLUSH_AND_INV_CB)) {
		cp_coher_cntl |= S_0085F0_CB_ACTION_ENA(1) |
				 S_0085F0_CB0_DEST_BASE_ENA(1) |
				 S_0085F0_CB1_DEST_BASE_ENA(1) |
				 S_0085F0_CB2_DEST_BASE_ENA(1) |
				 S_0085F0_CB3_DEST_BASE_ENA(1) |
				 S_0085F0_CB4_DEST_BASE_ENA(1) |
				 S_0085F0_CB5_DEST_BASE_ENA(1) |
				 S_0085F0_CB6_DEST_BASE_ENA(1) |
				 S_0085F0_CB7_DEST_BASE_ENA(1) |
				 S_0085F0_SMX_ACTION_ENA(1);
		if (rctx->b.chip_class >= EVERGREEN)
			cp_coher_cntl |= S_0085F0_CB8_DEST_BASE_ENA(1) |
					 S_0085F0_CB9_DEST_BASE_ENA(1) |
					 S_0085F0_CB10_DEST_BASE_ENA(1) |
					 S_0085F0_CB11_DEST_BASE_ENA(1);
	}

	if (rctx->b.chip_class >= R700 &&
	    rctx->b.flags & R600_CONTEXT_STREAMOUT_FLUSH) {
		cp_coher_cntl |= S_0085F0_SO0_DEST_BASE_ENA(1) |
				 S_0085F0_SO1_DEST_BASE_ENA(1) |
				 S_0085F0_SO2_DEST_BASE_ENA(1) |
				 S_0085F0_SO3_DEST_BASE_ENA(1) |
				 S_0085F0_SMX_ACTION_ENA(1);
	}

	/* Workaround for buggy flushing on some R6xx chipsets. */
	if ((rctx->b.flags & (R600_CONTEXT_FLUSH_AND_INV |
			      R600_CONTEXT_STREAMOUT_FLUSH)) &&
	    (rctx->b.family == CHIP_RV670 ||
	     rctx->b.family == CHIP_RS780 ||
	     rctx->b.family == CHIP_RS880)) {
		cp_coher_cntl |= S_0085F0_CB1_DEST_BASE_ENA(1) |
				 S_0085F0_DEST_BASE_0_ENA(1);
	}

	if (cp_coher_cntl) {
		radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
		radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
		radeon_emit(cs, 0xffffffff);    /* CP_COHER_SIZE */
		radeon_emit(cs, 0);             /* CP_COHER_BASE */
		radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
	}

	if (rctx->b.flags & R600_CONTEXT_START_PIPELINE_STATS) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_START) |
				EVENT_INDEX(0));
	} else if (rctx->b.flags & R600_CONTEXT_STOP_PIPELINE_STATS) {
		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_PIPELINESTAT_STOP) |
				EVENT_INDEX(0));
	}

	/* everything is properly flushed */
	rctx->b.flags = 0;
}
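
/* Flush the current graphics IB: emit the final framebuffer-cache flushes,
 * optionally save the IB and trace buffer for debug contexts, submit the CS
 * to the winsys, and begin a new CS.
 */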
void r600_context_gfx_flush(void *context, unsigned flags,
			    struct pipe_fence_handle **fence)
{
	struct r600_context *ctx = context;
	struct radeon_cmdbuf *cs = &ctx->b.gfx.cs;
	struct radeon_winsys *ws = ctx->b.ws;

	if (!radeon_emitted(cs, ctx->b.initial_gfx_cs_size))
		return;

	if (r600_check_device_reset(&ctx->b))
		return;

	r600_preflush_suspend_features(&ctx->b);

	/* flush the framebuffer cache */
	ctx->b.flags |= R600_CONTEXT_FLUSH_AND_INV |
			R600_CONTEXT_FLUSH_AND_INV_CB |
			R600_CONTEXT_FLUSH_AND_INV_DB |
			R600_CONTEXT_FLUSH_AND_INV_CB_META |
			R600_CONTEXT_FLUSH_AND_INV_DB_META |
			R600_CONTEXT_WAIT_3D_IDLE |
			R600_CONTEXT_WAIT_CP_DMA_IDLE;

	r600_flush_emit(ctx);

	if (ctx->trace_buf)
		eg_trace_emit(ctx);
	/* old kernels and userspace don't set SX_MISC, so we must reset it to 0 here */
	if (ctx->b.chip_class == R600) {
		radeon_set_context_reg(cs, R_028350_SX_MISC, 0);
	}

	if (ctx->is_debug) {
		/* Save the IB for debug contexts. */
		radeon_clear_saved_cs(&ctx->last_gfx);
		radeon_save_cs(ws, cs, &ctx->last_gfx, true);
		r600_resource_reference(&ctx->last_trace_buf, ctx->trace_buf);
		r600_resource_reference(&ctx->trace_buf, NULL);
	}

	/* Flush the CS. */
	ws->cs_flush(cs, flags, &ctx->b.last_gfx_fence);
	if (fence)
		ws->fence_reference(fence, ctx->b.last_gfx_fence);
	ctx->b.num_gfx_cs_flushes++;

	if (ctx->is_debug) {
		if (!ws->fence_wait(ws, ctx->b.last_gfx_fence, 10000000)) {
			const char *fname = getenv("R600_TRACE");
			if (!fname)
				exit(-1);
			FILE *fl = fopen(fname, "w+");
			if (fl) {
				eg_dump_debug_state(&ctx->b.b, fl, 0);
				fclose(fl);
			} else
				perror(fname);
			exit(-1);
		}
	}

	r600_begin_new_cs(ctx);
}
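
/* Start a new command stream: emit the start-of-CS command buffer and mark
 * all state atoms, shader resources, and draw state dirty so they are
 * re-emitted into the new IB.
 */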
void r600_begin_new_cs(struct r600_context *ctx)
{
	unsigned shader;

	if (ctx->is_debug) {
		uint32_t zero = 0;

		/* Create a buffer used for writing trace IDs and initialize it to 0. */
		assert(!ctx->trace_buf);
		ctx->trace_buf = (struct r600_resource*)
			pipe_buffer_create(ctx->b.b.screen, 0,
					   PIPE_USAGE_STAGING, 4);
		if (ctx->trace_buf)
			pipe_buffer_write_nooverlap(&ctx->b.b, &ctx->trace_buf->b.b,
						    0, sizeof(zero), &zero);
		ctx->trace_id = 0;
	}

	if (ctx->trace_buf)
		eg_trace_emit(ctx);

	ctx->b.flags = 0;
	ctx->b.gtt = 0;
	ctx->b.vram = 0;

	/* Begin a new CS. */
	r600_emit_command_buffer(&ctx->b.gfx.cs, &ctx->start_cs_cmd);

	/* Re-emit states. */
	r600_mark_atom_dirty(ctx, &ctx->alphatest_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->blend_color.atom);
	r600_mark_atom_dirty(ctx, &ctx->cb_misc_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->clip_misc_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->clip_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->db_misc_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->db_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->framebuffer.atom);
	if (ctx->b.chip_class >= EVERGREEN) {
		r600_mark_atom_dirty(ctx, &ctx->fragment_images.atom);
		r600_mark_atom_dirty(ctx, &ctx->fragment_buffers.atom);
		r600_mark_atom_dirty(ctx, &ctx->compute_images.atom);
		r600_mark_atom_dirty(ctx, &ctx->compute_buffers.atom);
	}
	r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_PS].atom);
	r600_mark_atom_dirty(ctx, &ctx->poly_offset_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->vgt_state.atom);
	r600_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
	ctx->b.scissors.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
	r600_mark_atom_dirty(ctx, &ctx->b.scissors.atom);
	ctx->b.viewports.dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
	ctx->b.viewports.depth_range_dirty_mask = (1 << R600_MAX_VIEWPORTS) - 1;
	r600_mark_atom_dirty(ctx, &ctx->b.viewports.atom);
	if (ctx->b.chip_class <= EVERGREEN) {
		r600_mark_atom_dirty(ctx, &ctx->config_state.atom);
	}
	r600_mark_atom_dirty(ctx, &ctx->stencil_ref.atom);
	r600_mark_atom_dirty(ctx, &ctx->vertex_fetch_shader.atom);
	r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_ES].atom);
	r600_mark_atom_dirty(ctx, &ctx->shader_stages.atom);
	if (ctx->gs_shader) {
		r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_GS].atom);
		r600_mark_atom_dirty(ctx, &ctx->gs_rings.atom);
	}
	if (ctx->tes_shader) {
		r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[EG_HW_STAGE_HS].atom);
		r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[EG_HW_STAGE_LS].atom);
	}
	r600_mark_atom_dirty(ctx, &ctx->hw_shader_stages[R600_HW_STAGE_VS].atom);
	r600_mark_atom_dirty(ctx, &ctx->b.streamout.enable_atom);
	r600_mark_atom_dirty(ctx, &ctx->b.render_cond_atom);

	if (ctx->blend_state.cso)
		r600_mark_atom_dirty(ctx, &ctx->blend_state.atom);
	if (ctx->dsa_state.cso)
		r600_mark_atom_dirty(ctx, &ctx->dsa_state.atom);
	if (ctx->rasterizer_state.cso)
		r600_mark_atom_dirty(ctx, &ctx->rasterizer_state.atom);

	if (ctx->b.chip_class <= R700) {
		r600_mark_atom_dirty(ctx, &ctx->seamless_cube_map.atom);
	}

	ctx->vertex_buffer_state.dirty_mask = ctx->vertex_buffer_state.enabled_mask;
	r600_vertex_buffers_dirty(ctx);

	/* Re-emit shader resources. */
	for (shader = 0; shader < PIPE_SHADER_TYPES; shader++) {
		struct r600_constbuf_state *constbuf = &ctx->constbuf_state[shader];
		struct r600_textures_info *samplers = &ctx->samplers[shader];

		constbuf->dirty_mask = constbuf->enabled_mask;
		samplers->views.dirty_mask = samplers->views.enabled_mask;
		samplers->states.dirty_mask = samplers->states.enabled_mask;

		r600_constant_buffers_dirty(ctx, constbuf);
		r600_sampler_views_dirty(ctx, &samplers->views);
		r600_sampler_states_dirty(ctx, &samplers->states);
	}

	for (shader = 0; shader < ARRAY_SIZE(ctx->scratch_buffers); shader++) {
		ctx->scratch_buffers[shader].dirty = true;
	}

	r600_postflush_resume_features(&ctx->b);

	/* Re-emit the draw state. */
	ctx->last_primitive_type = -1;
	ctx->last_start_instance = -1;
	ctx->last_rast_prim = -1;
	ctx->current_rast_prim = -1;

	assert(!ctx->b.gfx.cs.prev_dw);
	ctx->b.initial_gfx_cs_size = ctx->b.gfx.cs.current.cdw;
}
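
/* Make the PFP (prefetch parser) wait until the ME has processed all prior
 * packets. The native PFP_SYNC_ME packet is used on Evergreen+ with a new
 * enough kernel; otherwise it is emulated with MEM_WRITE in the ME and
 * WAIT_REG_MEM in the PFP.
 */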
void r600_emit_pfp_sync_me(struct r600_context *rctx)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;

	if (rctx->b.chip_class >= EVERGREEN &&
	    rctx->b.screen->info.drm_minor >= 46) {
		radeon_emit(cs, PKT3(PKT3_PFP_SYNC_ME, 0, 0));
		radeon_emit(cs, 0);
	} else {
		/* Emulate PFP_SYNC_ME by writing a value to memory in ME and
		 * waiting for it in PFP.
		 */
		struct r600_resource *buf = NULL;
		unsigned offset, reloc;
		uint64_t va;

		/* 16-byte address alignment is required by WAIT_REG_MEM. */
		u_suballocator_alloc(&rctx->b.allocator_zeroed_memory, 4, 16,
				     &offset, (struct pipe_resource**)&buf);
		if (!buf) {
			/* This is too heavyweight, but will work. */
			rctx->b.gfx.flush(rctx, PIPE_FLUSH_ASYNC, NULL);
			return;
		}

		reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, buf,
						  RADEON_USAGE_READWRITE,
						  RADEON_PRIO_FENCE);

		va = buf->gpu_address + offset;
		assert(va % 16 == 0);

		/* Write 1 to memory in ME. */
		radeon_emit(cs, PKT3(PKT3_MEM_WRITE, 3, 0));
		radeon_emit(cs, va);
		radeon_emit(cs, ((va >> 32) & 0xff) | MEM_WRITE_32_BITS);
		radeon_emit(cs, 1);
		radeon_emit(cs, 0);

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, reloc);

		/* Wait in PFP (PFP can only do GEQUAL against memory). */
		radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
		radeon_emit(cs, WAIT_REG_MEM_GEQUAL |
				WAIT_REG_MEM_MEMORY |
				WAIT_REG_MEM_PFP);
		radeon_emit(cs, va);
		radeon_emit(cs, va >> 32);
		radeon_emit(cs, 1);          /* reference value */
		radeon_emit(cs, 0xffffffff); /* mask */
		radeon_emit(cs, 4);          /* poll interval */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, reloc);

		r600_resource_reference(&buf, NULL);
	}
}

/* The max number of bytes to copy per packet. */
#define CP_DMA_MAX_BYTE_COUNT ((1 << 21) - 8)
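
/* Copy between buffers using CP DMA packets on the gfx ring. The copy is
 * split into chunks of at most CP_DMA_MAX_BYTE_COUNT bytes; CP_SYNC is set
 * only on the last packet, and a PFP_SYNC_ME follows so the PFP doesn't
 * fetch indices before the copy completes.
 */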
void r600_cp_dma_copy_buffer(struct r600_context *rctx,
			     struct pipe_resource *dst, uint64_t dst_offset,
			     struct pipe_resource *src, uint64_t src_offset,
			     unsigned size)
{
	struct radeon_cmdbuf *cs = &rctx->b.gfx.cs;

	assert(size);
	assert(rctx->screen->b.has_cp_dma);

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(dst, &r600_resource(dst)->valid_buffer_range, dst_offset,
		       dst_offset + size);

	dst_offset += r600_resource(dst)->gpu_address;
	src_offset += r600_resource(src)->gpu_address;

	/* Flush the caches where the resources are bound. */
	rctx->b.flags |= r600_get_flush_flags(R600_COHERENCY_SHADER) |
			 R600_CONTEXT_WAIT_3D_IDLE;

	/* There are differences between R700 and EG in CP DMA,
	 * but we only use the common bits here. */
	while (size) {
		unsigned sync = 0;
		unsigned byte_count = MIN2(size, CP_DMA_MAX_BYTE_COUNT);
		unsigned src_reloc, dst_reloc;

		r600_need_cs_space(rctx,
				   10 + (rctx->b.flags ? R600_MAX_FLUSH_CS_DWORDS : 0) +
				   3 + R600_MAX_PFP_SYNC_ME_DWORDS, FALSE, 0);

		/* Flush the caches for the first copy only. */
		if (rctx->b.flags) {
			r600_flush_emit(rctx);
		}

		/* Do the synchronization after the last copy, so that all data is written to memory. */
		if (size == byte_count) {
			sync = PKT3_CP_DMA_CP_SYNC;
		}

		/* This must be done after r600_need_cs_space. */
		src_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)src,
						      RADEON_USAGE_READ, RADEON_PRIO_CP_DMA);
		dst_reloc = radeon_add_to_buffer_list(&rctx->b, &rctx->b.gfx, (struct r600_resource*)dst,
						      RADEON_USAGE_WRITE, RADEON_PRIO_CP_DMA);

		radeon_emit(cs, PKT3(PKT3_CP_DMA, 4, 0));
		radeon_emit(cs, src_offset);                          /* SRC_ADDR_LO [31:0] */
		radeon_emit(cs, sync | ((src_offset >> 32) & 0xff));  /* CP_SYNC [31] | SRC_ADDR_HI [7:0] */
		radeon_emit(cs, dst_offset);                          /* DST_ADDR_LO [31:0] */
		radeon_emit(cs, (dst_offset >> 32) & 0xff);           /* DST_ADDR_HI [7:0] */
		radeon_emit(cs, byte_count);                          /* COMMAND [29:22] | BYTE_COUNT [20:0] */

		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, src_reloc);
		radeon_emit(cs, PKT3(PKT3_NOP, 0, 0));
		radeon_emit(cs, dst_reloc);

		size -= byte_count;
		src_offset += byte_count;
		dst_offset += byte_count;
	}

	/* CP_DMA_CP_SYNC doesn't wait for idle on R6xx, but this does. */
	if (rctx->b.chip_class == R600)
		radeon_set_config_reg(cs, R_008040_WAIT_UNTIL,
				      S_008040_WAIT_CP_DMA_IDLE(1));

	/* CP DMA is executed in ME, but index buffers are read by PFP.
	 * This ensures that ME (CP DMA) is idle before PFP starts fetching
	 * indices. If we wanted to execute CP DMA in PFP, this packet
	 * should precede it.
	 */
	r600_emit_pfp_sync_me(rctx);
}
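
/* Copy between buffers on the async DMA ring. Offsets and size are expected
 * to be dword-aligned; the copy is split into chunks of at most
 * R600_DMA_COPY_MAX_SIZE_DW dwords.
 */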
void r600_dma_copy_buffer(struct r600_context *rctx,
			  struct pipe_resource *dst,
			  struct pipe_resource *src,
			  uint64_t dst_offset,
			  uint64_t src_offset,
			  uint64_t size)
{
	struct radeon_cmdbuf *cs = &rctx->b.dma.cs;
	unsigned i, ncopy, csize;
	struct r600_resource *rdst = (struct r600_resource*)dst;
	struct r600_resource *rsrc = (struct r600_resource*)src;

	/* Mark the buffer range of destination as valid (initialized),
	 * so that transfer_map knows it should wait for the GPU when mapping
	 * that range. */
	util_range_add(&rdst->b.b, &rdst->valid_buffer_range, dst_offset,
		       dst_offset + size);

	size >>= 2; /* convert to dwords */
	ncopy = (size / R600_DMA_COPY_MAX_SIZE_DW) + !!(size % R600_DMA_COPY_MAX_SIZE_DW);

	r600_need_dma_space(&rctx->b, ncopy * 5, rdst, rsrc);
	for (i = 0; i < ncopy; i++) {
		csize = size < R600_DMA_COPY_MAX_SIZE_DW ? size : R600_DMA_COPY_MAX_SIZE_DW;
		/* Emit the relocs before writing to the CS so that the CS is
		 * always in a consistent state. */
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rsrc, RADEON_USAGE_READ, 0);
		radeon_add_to_buffer_list(&rctx->b, &rctx->b.dma, rdst, RADEON_USAGE_WRITE, 0);
		radeon_emit(cs, DMA_PACKET(DMA_PACKET_COPY, 0, 0, csize));
		radeon_emit(cs, dst_offset & 0xfffffffc);
		radeon_emit(cs, src_offset & 0xfffffffc);
		radeon_emit(cs, (dst_offset >> 32UL) & 0xff);
		radeon_emit(cs, (src_offset >> 32UL) & 0xff);
		dst_offset += csize << 2;
		src_offset += csize << 2;
		size -= csize;
	}
}