/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"
#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

static inline void si_so_target_reference(struct si_streamout_target **dst,
                                          struct pipe_stream_output_target *src)
{
   pipe_so_target_reference((struct pipe_stream_output_target **)dst, src);
}

static struct pipe_stream_output_target *si_create_so_target(struct pipe_context *ctx,
                                                             struct pipe_resource *buffer,
                                                             unsigned buffer_offset,
                                                             unsigned buffer_size)
{
   struct si_streamout_target *t;
   struct si_resource *buf = si_resource(buffer);

   t = CALLOC_STRUCT(si_streamout_target);
   if (!t) {
      return NULL;
   }

   t->b.reference.count = 1;
   t->b.context = ctx;
   pipe_resource_reference(&t->b.buffer, buffer);
   t->b.buffer_offset = buffer_offset;
   t->b.buffer_size = buffer_size;

   util_range_add(&buf->b.b, &buf->valid_buffer_range, buffer_offset, buffer_offset + buffer_size);
   return &t->b;
}

static void si_so_target_destroy(struct pipe_context *ctx, struct pipe_stream_output_target *target)
{
   struct si_streamout_target *t = (struct si_streamout_target *)target;
   pipe_resource_reference(&t->b.buffer, NULL);
   si_resource_reference(&t->buf_filled_size, NULL);
   FREE(t);
}

void si_streamout_buffers_dirty(struct si_context *sctx)
{
   if (!sctx->streamout.enabled_mask)
      return;

   si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
   si_set_streamout_enable(sctx, true);
}

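/* Gallium's set_stream_output_targets hook. "offsets" gives the start offset
 * within each buffer; an offset of (unsigned)-1 means "append", i.e. continue
 * writing at the buffer's current filled size instead of starting over
 * (tracked below in append_bitmask).
 */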
static void si_set_streamout_targets(struct pipe_context *ctx, unsigned num_targets,
                                     struct pipe_stream_output_target **targets,
                                     const unsigned *offsets)
{
   struct si_context *sctx = (struct si_context *)ctx;
   unsigned old_num_targets = sctx->streamout.num_targets;
   unsigned i;
   bool wait_now = false;

   /* We are going to unbind the buffers. Mark which caches need to be flushed. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
      /* Since streamout uses vector writes which go through TC L2
       * and most other clients can use TC L2 as well, we don't need
       * to flush it.
       *
       * The only cases which require flushing it are VGT DMA index
       * fetching (on <= GFX7) and indirect draw data, which are rare
       * cases. Thus, flag the TC L2 dirtiness in the resource and
       * handle it at draw call time.
       */
      for (i = 0; i < sctx->streamout.num_targets; i++)
         if (sctx->streamout.targets[i])
            si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

      /* Invalidate the scalar cache in case a streamout buffer is
       * going to be used as a constant buffer.
       *
       * Invalidate vL1, because streamout bypasses it (done by
       * setting GLC=1 in the store instruction), but vL1 in other
       * CUs can contain outdated data of streamout buffers.
       *
       * VS_PARTIAL_FLUSH is required if the buffers are going to be
       * used as an input immediately.
       */
      sctx->flags |= SI_CONTEXT_INV_SCACHE | SI_CONTEXT_INV_VCACHE;

      /* The BUFFER_FILLED_SIZE is written using a PS_DONE event. */
      if (sctx->screen->use_ngg_streamout) {
         sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;

         /* Wait now. This is needed to make sure that GDS is not
          * busy at the end of IBs.
          *
          * Also, the next streamout operation will overwrite GDS,
          * so we need to make sure that it's idle.
          */
         wait_now = true;
      } else {
         sctx->flags |= SI_CONTEXT_VS_PARTIAL_FLUSH | SI_CONTEXT_PFP_SYNC_ME;
      }
   }

   /* All readers of the streamout targets need to be finished before we can
    * start writing to the targets.
    */
   if (num_targets) {
      if (sctx->screen->use_ngg_streamout)
         si_allocate_gds(sctx);

      sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH | SI_CONTEXT_CS_PARTIAL_FLUSH |
                     SI_CONTEXT_PFP_SYNC_ME;
   }

   /* Streamout buffers must be bound in 2 places:
    * 1) in VGT by setting the VGT_STRMOUT registers
    * 2) as shader resources
    */

   /* Stop streamout. */
   if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
      si_emit_streamout_end(sctx);

   /* Set the new targets. */
   unsigned enabled_mask = 0, append_bitmask = 0;
   for (i = 0; i < num_targets; i++) {
      si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
      if (!targets[i])
         continue;

      si_context_add_resource_size(sctx, targets[i]->buffer);
      enabled_mask |= 1 << i;

      if (offsets[i] == ((unsigned)-1))
         append_bitmask |= 1 << i;

      /* Allocate space for the filled buffer size. */
      struct si_streamout_target *t = sctx->streamout.targets[i];
      if (!t->buf_filled_size) {
         unsigned buf_filled_size_size = sctx->screen->use_ngg_streamout ? 8 : 4;
         u_suballocator_alloc(&sctx->allocator_zeroed_memory, buf_filled_size_size, 4,
                              &t->buf_filled_size_offset,
                              (struct pipe_resource **)&t->buf_filled_size);
      }
   }

   for (; i < sctx->streamout.num_targets; i++)
      si_so_target_reference(&sctx->streamout.targets[i], NULL);

   sctx->streamout.enabled_mask = enabled_mask;
   sctx->streamout.num_targets = num_targets;
   sctx->streamout.append_bitmask = append_bitmask;

   /* Update dirty state bits. */
   if (num_targets) {
      si_streamout_buffers_dirty(sctx);
   } else {
      si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
      si_set_streamout_enable(sctx, false);
   }

   /* Set the shader resources. */
   for (i = 0; i < num_targets; i++) {
      if (targets[i]) {
         struct pipe_shader_buffer sbuf;
         sbuf.buffer = targets[i]->buffer;

         if (sctx->screen->use_ngg_streamout) {
            sbuf.buffer_offset = targets[i]->buffer_offset;
            sbuf.buffer_size = targets[i]->buffer_size;
         } else {
            sbuf.buffer_offset = 0;
            sbuf.buffer_size = targets[i]->buffer_offset + targets[i]->buffer_size;
         }

         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
         si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
      } else {
         si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
      }
   }
   for (; i < old_num_targets; i++)
      si_set_internal_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);

   if (wait_now)
      sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
}

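/* NGG streamout (gfx10) keeps the running buffer-filled sizes in GDS while
 * streamout is active. This atom initializes GDS offset 4*i (one dword per
 * target) with a CP DMA_DATA copy: either from the saved BUFFER_FILLED_SIZE
 * (append) or with zero. Only the last copy sets CP_SYNC and write-confirm,
 * so the CP waits for all of them once.
 */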
static void gfx10_emit_streamout_begin(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned last_target = 0;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (t[i])
         last_target = i;
   }

   radeon_begin(cs);

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = sctx->streamout.stride_in_dw[i];

      bool append = sctx->streamout.append_bitmask & (1 << i);
      uint64_t va = 0;

      if (append) {
         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);

         va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      }

      radeon_emit(PKT3(PKT3_DMA_DATA, 5, 0));
      radeon_emit(S_411_SRC_SEL(append ? V_411_SRC_ADDR_TC_L2 : V_411_DATA) |
                  S_411_DST_SEL(V_411_GDS) | S_411_CP_SYNC(i == last_target));
      radeon_emit(va);
      radeon_emit(va >> 32);
      radeon_emit(4 * i); /* destination in GDS */
      radeon_emit(0);
      radeon_emit(S_415_BYTE_COUNT_GFX9(4) | S_415_DISABLE_WR_CONFIRM_GFX9(i != last_target));
   }
   radeon_end();

   sctx->streamout.begin_emitted = true;
}

static void gfx10_emit_streamout_end(struct si_context *sctx)
{
   struct si_streamout_target **t = sctx->streamout.targets;

   for (unsigned i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

      si_cp_release_mem(sctx, &sctx->gfx_cs, V_028A90_PS_DONE, 0, EOP_DST_SEL_TC_L2,
                        EOP_INT_SEL_SEND_DATA_AFTER_WR_CONFIRM, EOP_DATA_SEL_GDS,
                        t[i]->buf_filled_size, va, EOP_DATA_GDS(i, 1), 0);

      t[i]->buf_filled_size_valid = true;
   }

   sctx->streamout.begin_emitted = false;
}

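/* Make VGT finish updating its streamout offsets before we read or overwrite
 * them: clear CP_STRMOUT_CNTL, emit an SO_VGTSTREAMOUT_FLUSH event, and use
 * WAIT_REG_MEM to block until the hardware sets OFFSET_UPDATE_DONE.
 */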
static void si_flush_vgt_streamout(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   unsigned reg_strmout_cntl;

   radeon_begin(cs);

   /* The register is at different places on different ASICs. */
   if (sctx->chip_class >= GFX7) {
      reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
      radeon_set_uconfig_reg(reg_strmout_cntl, 0);
   } else {
      reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
      radeon_set_config_reg(reg_strmout_cntl, 0);
   }

   radeon_emit(PKT3(PKT3_EVENT_WRITE, 0, 0));
   radeon_emit(EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

   radeon_emit(PKT3(PKT3_WAIT_REG_MEM, 5, 0));
   radeon_emit(WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
   radeon_emit(reg_strmout_cntl >> 2); /* register */
   radeon_emit(0);
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
   radeon_emit(S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
   radeon_emit(4); /* poll interval */
   radeon_end();
}

static void si_emit_streamout_begin(struct si_context *sctx)
{
   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
   unsigned i;

   si_flush_vgt_streamout(sctx);

   radeon_begin(cs);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      t[i]->stride_in_dw = stride_in_dw[i];

      /* AMD GCN binds streamout buffers as shader resources.
       * VGT only counts primitives and tells the shader
       * through SGPRs what to do. */
      radeon_set_context_reg_seq(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 2);
      radeon_emit((t[i]->b.buffer_offset + t[i]->b.buffer_size) >> 2); /* BUFFER_SIZE (in DW) */
      radeon_emit(stride_in_dw[i]);                                    /* VTX_STRIDE (in DW) */

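      /* STRMOUT_BUFFER_UPDATE either loads BUFFER_OFFSET from the saved
       * BUFFER_FILLED_SIZE in memory (append) or takes it directly from the
       * packet (in dwords, hence the >> 2 below).
       */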
      if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
         uint64_t va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;

         /* Append. */
         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
         radeon_emit(0);        /* unused */
         radeon_emit(0);        /* unused */
         radeon_emit(va);       /* src address lo */
         radeon_emit(va >> 32); /* src address hi */

         radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_READ,
                                   RADEON_PRIO_SO_FILLED_SIZE);
      } else {
         /* Start from the beginning. */
         radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
         radeon_emit(STRMOUT_SELECT_BUFFER(i) |
                     STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
         radeon_emit(0);                          /* unused */
         radeon_emit(0);                          /* unused */
         radeon_emit(t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
         radeon_emit(0);                          /* unused */
      }
   }
   radeon_end();

   sctx->streamout.begin_emitted = true;
}

void si_emit_streamout_end(struct si_context *sctx)
{
   if (sctx->screen->use_ngg_streamout) {
      gfx10_emit_streamout_end(sctx);
      return;
   }

   struct radeon_cmdbuf *cs = &sctx->gfx_cs;
   struct si_streamout_target **t = sctx->streamout.targets;
   unsigned i;
   uint64_t va;

   si_flush_vgt_streamout(sctx);

   radeon_begin(cs);

   for (i = 0; i < sctx->streamout.num_targets; i++) {
      if (!t[i])
         continue;

      va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
      radeon_emit(PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
      radeon_emit(STRMOUT_SELECT_BUFFER(i) | STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
                  STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
      radeon_emit(va);       /* dst address lo */
      radeon_emit(va >> 32); /* dst address hi */
      radeon_emit(0);        /* unused */
      radeon_emit(0);        /* unused */

      radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, t[i]->buf_filled_size, RADEON_USAGE_WRITE,
                                RADEON_PRIO_SO_FILLED_SIZE);

      /* Zero the buffer size. The counters (primitives generated,
       * primitives emitted) may be enabled even if there is no
       * buffer bound. This ensures that the primitives-emitted query
       * won't increment. */
      radeon_set_context_reg(R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16 * i, 0);

      t[i]->buf_filled_size_valid = true;
   }
   radeon_end_update_context_roll(sctx);

   sctx->streamout.begin_emitted = false;
}

/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

static void si_emit_streamout_enable(struct si_context *sctx)
{
   assert(!sctx->screen->use_ngg_streamout);

   radeon_begin(&sctx->gfx_cs);
   radeon_set_context_reg_seq(R_028B94_VGT_STRMOUT_CONFIG, 2);
   radeon_emit(S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
               S_028B94_RAST_STREAM(0) |
               S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
               S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
   radeon_emit(sctx->streamout.hw_enabled_mask & sctx->streamout.enabled_stream_buffers_mask);
   radeon_end();
}

static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
   bool old_strmout_en = si_get_strmout_en(sctx);
   unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

   sctx->streamout.streamout_enabled = enable;

   /* Replicate the 4-bit buffer mask into all 4 vertex-stream fields. */
   sctx->streamout.hw_enabled_mask =
      sctx->streamout.enabled_mask | (sctx->streamout.enabled_mask << 4) |
      (sctx->streamout.enabled_mask << 8) | (sctx->streamout.enabled_mask << 12);

   if (!sctx->screen->use_ngg_streamout &&
       ((old_strmout_en != si_get_strmout_en(sctx)) ||
        (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask)))
      si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}

void si_update_prims_generated_query_state(struct si_context *sctx, unsigned type, int diff)
{
   if (!sctx->screen->use_ngg_streamout && type == PIPE_QUERY_PRIMITIVES_GENERATED) {
      bool old_strmout_en = si_get_strmout_en(sctx);

      sctx->streamout.num_prims_gen_queries += diff;
      assert(sctx->streamout.num_prims_gen_queries >= 0);

      sctx->streamout.prims_gen_query_enabled = sctx->streamout.num_prims_gen_queries != 0;

      if (old_strmout_en != si_get_strmout_en(sctx))
         si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);

      if (si_update_ngg(sctx)) {
         si_shader_change_notify(sctx);
         sctx->do_update_shaders = true;
      }
   }
}

void si_init_streamout_functions(struct si_context *sctx)
{
   sctx->b.create_stream_output_target = si_create_so_target;
   sctx->b.stream_output_target_destroy = si_so_target_destroy;
   sctx->b.set_stream_output_targets = si_set_streamout_targets;

   if (sctx->screen->use_ngg_streamout) {
      sctx->atoms.s.streamout_begin.emit = gfx10_emit_streamout_begin;
   } else {
      sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
      sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
   }
}