/*
 * Copyright 2013 Advanced Micro Devices, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "si_build_pm4.h"

#include "util/u_memory.h"
#include "util/u_suballoc.h"

static void si_set_streamout_enable(struct si_context *sctx, bool enable);

/* Retarget *dst to src with proper reference counting (releases the old
 * target, if any). Thin typed wrapper around pipe_so_target_reference(). */
static inline void si_so_target_reference(struct si_streamout_target **dst,
					  struct pipe_stream_output_target *src)
{
	pipe_so_target_reference((struct pipe_stream_output_target**)dst, src);
}

/* pipe_context::create_stream_output_target implementation.
 *
 * Wraps "buffer" in a streamout target and suballocates a 4-byte,
 * zero-initialized GPU location (buf_filled_size) that the hardware will
 * write the "buffer filled size" counter into at streamout end.
 *
 * Returns NULL on CPU or suballocation failure (nothing leaked). */
static struct pipe_stream_output_target *
si_create_so_target(struct pipe_context *ctx,
		    struct pipe_resource *buffer,
		    unsigned buffer_offset,
		    unsigned buffer_size)
{
	struct si_context *sctx = (struct si_context *)ctx;
	struct si_streamout_target *t;
	struct si_resource *buf = si_resource(buffer);

	t = CALLOC_STRUCT(si_streamout_target);
	if (!t) {
		return NULL;
	}

	/* 4 bytes, 4-byte aligned, from zeroed memory so the initial filled
	 * size reads as 0. */
	u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
			     &t->buf_filled_size_offset,
			     (struct pipe_resource**)&t->buf_filled_size);
	if (!t->buf_filled_size) {
		FREE(t);
		return NULL;
	}

	t->b.reference.count = 1;
	t->b.context = ctx;
	pipe_resource_reference(&t->b.buffer, buffer);
	t->b.buffer_offset = buffer_offset;
	t->b.buffer_size = buffer_size;

	/* Streamout will write the bound range, so mark it valid up front. */
	util_range_add(&buf->valid_buffer_range, buffer_offset,
		       buffer_offset + buffer_size);
	return &t->b;
}

/* pipe_context::stream_output_target_destroy implementation: drop the buffer
 * and filled-size references, then free the wrapper. */
static void si_so_target_destroy(struct pipe_context *ctx,
				 struct pipe_stream_output_target *target)
{
	struct si_streamout_target *t = (struct si_streamout_target*)target;
	pipe_resource_reference(&t->b.buffer, NULL);
	si_resource_reference(&t->buf_filled_size, NULL);
	FREE(t);
}

/* Mark streamout state dirty so the begin packets are re-emitted on the next
 * draw. No-op when no targets are currently bound. */
void si_streamout_buffers_dirty(struct si_context *sctx)
{
	if (!sctx->streamout.enabled_mask)
		return;

	si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
	si_set_streamout_enable(sctx, true);
}

/* pipe_context::set_stream_output_targets implementation.
 *
 * Binds num_targets streamout targets (unbinding any previous ones beyond
 * that count). offsets[i] == (unsigned)-1 requests appending to the buffer's
 * previously recorded filled size instead of starting at the given offset. */
static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	unsigned old_num_targets = sctx->streamout.num_targets;
	unsigned i;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only cases which requires flushing it is VGT DMA index
		 * fetching (on <= CIK) and indirect draw data, which are rare
		 * cases. Thus, flag the TC L2 dirtiness in the resource and
		 * handle it at draw call time.
		 */
		for (i = 0; i < sctx->streamout.num_targets; i++)
			if (sctx->streamout.targets[i])
				si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate vL1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but vL1 in other
		 * CUs can contain outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
			       SI_CONTEXT_INV_VMEM_L1 |
			       SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* All readers of the streamout targets need to be finished before we can
	 * start writing to the targets.
	 */
	if (num_targets)
		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
			       SI_CONTEXT_CS_PARTIAL_FLUSH;

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Stop streamout. */
	if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
		si_emit_streamout_end(sctx);

	/* Set the new targets. */
	unsigned enabled_mask = 0, append_bitmask = 0;
	for (i = 0; i < num_targets; i++) {
		si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
		if (!targets[i])
			continue;

		si_context_add_resource_size(sctx, targets[i]->buffer);
		enabled_mask |= 1 << i;

		/* -1 means "append to the existing filled size". */
		if (offsets[i] == ((unsigned)-1))
			append_bitmask |= 1 << i;
	}

	/* Release targets beyond the new count. */
	for (; i < sctx->streamout.num_targets; i++)
		si_so_target_reference(&sctx->streamout.targets[i], NULL);

	sctx->streamout.enabled_mask = enabled_mask;
	sctx->streamout.num_targets = num_targets;
	sctx->streamout.append_bitmask = append_bitmask;

	/* Update dirty state bits. */
	if (num_targets) {
		si_streamout_buffers_dirty(sctx);
	} else {
		si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
		si_set_streamout_enable(sctx, false);
	}

	/* Set the shader resources.*/
	for (i = 0; i < num_targets; i++) {
		if (targets[i]) {
			struct pipe_shader_buffer sbuf;
			sbuf.buffer = targets[i]->buffer;
			sbuf.buffer_offset = 0;
			/* The shader sees the buffer from offset 0 so that the
			 * target's buffer_offset can be applied in the shader. */
			sbuf.buffer_size = targets[i]->buffer_offset +
					   targets[i]->buffer_size;
			si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
			si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
		} else {
			si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
		}
	}
	/* Unbind shader-resource slots that were used by the old targets. */
	for (; i < old_num_targets; i++)
		si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
}

/* Emit a VGT streamout flush and wait for OFFSET_UPDATE_DONE, so that the
 * streamout offsets in VGT are coherent before they are read or rewritten. */
static void si_flush_vgt_streamout(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned reg_strmout_cntl;

	/* The register is at different places on different ASICs. */
	if (sctx->chip_class >= CIK) {
		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
	} else {
		reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
		radeon_set_config_reg(cs, reg_strmout_cntl, 0);
	}

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
	radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
	radeon_emit(cs, 0);
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}

/* Emit the streamout-begin packets: per-target buffer size/stride registers
 * plus a STRMOUT_BUFFER_UPDATE that either restores the write offset from the
 * saved filled size (append) or starts at the target's buffer offset. */
static void si_emit_streamout_begin(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	struct si_streamout_target **t = sctx->streamout.targets;
	uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
	unsigned i;

	si_flush_vgt_streamout(sctx);

	for (i = 0; i < sctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		t[i]->stride_in_dw = stride_in_dw[i];

		/* SI binds streamout buffers as shader resources.
		 * VGT only counts primitives and tells the shader
		 * through SGPRs what to do. */
		radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
		radeon_emit(cs, (t[i]->b.buffer_offset +
				 t[i]->b.buffer_size) >> 2);	/* BUFFER_SIZE (in DW) */
		radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */

		if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
			uint64_t va = t[i]->buf_filled_size->gpu_address +
				      t[i]->buf_filled_size_offset;

			/* Append. */
			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
				    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, va); /* src address lo */
			radeon_emit(cs, va >> 32); /* src address hi */

			radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
						  t[i]->buf_filled_size,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SO_FILLED_SIZE);
		} else {
			/* Start from the beginning. */
			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
				    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
			radeon_emit(cs, 0); /* unused */
		}
	}

	sctx->streamout.begin_emitted = true;
}

/* Emit the streamout-end packets: store each target's final filled size to
 * its buf_filled_size location and zero the VGT buffer-size register. */
void si_emit_streamout_end(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	struct si_streamout_target **t = sctx->streamout.targets;
	unsigned i;
	uint64_t va;

	si_flush_vgt_streamout(sctx);

	for (i = 0; i < sctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
		radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
		radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
			    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
			    STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
		radeon_emit(cs, va);     /* dst address lo */
		radeon_emit(cs, va >> 32); /* dst address hi */
		radeon_emit(cs, 0); /* unused */
		radeon_emit(cs, 0); /* unused */

		radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
					  t[i]->buf_filled_size,
					  RADEON_USAGE_WRITE,
					  RADEON_PRIO_SO_FILLED_SIZE);

		/* Zero the buffer size. The counters (primitives generated,
		 * primitives emitted) may be enabled even if there is not
		 * buffer bound. This ensures that the primitives-emitted query
		 * won't increment. */
		radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
		sctx->context_roll = true;

		/* The saved filled size can now be used for appending. */
		t[i]->buf_filled_size_valid = true;
	}

	sctx->streamout.begin_emitted = false;
}

/* STREAMOUT CONFIG DERIVED STATE
 *
 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
 * The buffer mask is an independent state, so no writes occur if there
 * are no buffers bound.
 */

/* Emit VGT_STRMOUT_CONFIG / VGT_STRMOUT_BUFFER_CONFIG from the derived
 * enable state. */
static void si_emit_streamout_enable(struct si_context *sctx)
{
	radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
	radeon_emit(sctx->gfx_cs,
		    S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
		    S_028B94_RAST_STREAM(0) |
		    S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
		    S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
		    S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
	radeon_emit(sctx->gfx_cs,
		    sctx->streamout.hw_enabled_mask &
		    sctx->streamout.enabled_stream_buffers_mask);
}

/* Update the derived streamout-enable state and mark the enable atom dirty
 * if the effective enable bit or the hardware buffer mask changed. */
static void si_set_streamout_enable(struct si_context *sctx, bool enable)
{
	bool old_strmout_en = si_get_strmout_en(sctx);
	unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;

	sctx->streamout.streamout_enabled = enable;

	/* Replicate the 4-bit buffer mask into all 4 stream nibbles of
	 * VGT_STRMOUT_BUFFER_CONFIG. */
	sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
					  (sctx->streamout.enabled_mask << 4) |
					  (sctx->streamout.enabled_mask << 8) |
					  (sctx->streamout.enabled_mask << 12);

	if ((old_strmout_en != si_get_strmout_en(sctx)) ||
	    (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
		si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
}

/* Track how many PRIMITIVES_GENERATED queries are active (diff is +1/-1)
 * and re-emit the streamout enable state if the effective enable changed,
 * since streamout must be on for that query to count. */
void si_update_prims_generated_query_state(struct si_context *sctx,
					   unsigned type, int diff)
{
	if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
		bool old_strmout_en = si_get_strmout_en(sctx);

		sctx->streamout.num_prims_gen_queries += diff;
		assert(sctx->streamout.num_prims_gen_queries >= 0);

		sctx->streamout.prims_gen_query_enabled =
			sctx->streamout.num_prims_gen_queries != 0;

		if (old_strmout_en != si_get_strmout_en(sctx))
			si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
	}
}

/* Hook the streamout entry points into the pipe_context vtable and the
 * state-atom emit callbacks. */
void si_init_streamout_functions(struct si_context *sctx)
{
	sctx->b.create_stream_output_target = si_create_so_target;
	sctx->b.stream_output_target_destroy = si_so_target_destroy;
	sctx->b.set_stream_output_targets = si_set_streamout_targets;
	sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
	sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
}