1/*
2 * Copyright 2013 Advanced Micro Devices, Inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9 * and/or sell copies of the Software, and to permit persons to whom the
10 * Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
24
25#include "si_build_pm4.h"
26
27#include "util/u_memory.h"
28#include "util/u_suballoc.h"
29
30static void si_set_streamout_enable(struct si_context *sctx, bool enable);
31
/* Update *dst to point at src, adjusting reference counts accordingly.
 * Thin typed wrapper over pipe_so_target_reference for si_streamout_target. */
static inline void si_so_target_reference(struct si_streamout_target **dst,
					  struct pipe_stream_output_target *src)
{
	struct pipe_stream_output_target **base =
		(struct pipe_stream_output_target **)dst;

	pipe_so_target_reference(base, src);
}
37
38static struct pipe_stream_output_target *
39si_create_so_target(struct pipe_context *ctx,
40		    struct pipe_resource *buffer,
41		    unsigned buffer_offset,
42		    unsigned buffer_size)
43{
44	struct si_context *sctx = (struct si_context *)ctx;
45	struct si_streamout_target *t;
46	struct si_resource *buf = si_resource(buffer);
47
48	t = CALLOC_STRUCT(si_streamout_target);
49	if (!t) {
50		return NULL;
51	}
52
53	u_suballocator_alloc(sctx->allocator_zeroed_memory, 4, 4,
54			     &t->buf_filled_size_offset,
55			     (struct pipe_resource**)&t->buf_filled_size);
56	if (!t->buf_filled_size) {
57		FREE(t);
58		return NULL;
59	}
60
61	t->b.reference.count = 1;
62	t->b.context = ctx;
63	pipe_resource_reference(&t->b.buffer, buffer);
64	t->b.buffer_offset = buffer_offset;
65	t->b.buffer_size = buffer_size;
66
67	util_range_add(&buf->valid_buffer_range, buffer_offset,
68		       buffer_offset + buffer_size);
69	return &t->b;
70}
71
72static void si_so_target_destroy(struct pipe_context *ctx,
73				 struct pipe_stream_output_target *target)
74{
75	struct si_streamout_target *t = (struct si_streamout_target*)target;
76	pipe_resource_reference(&t->b.buffer, NULL);
77	si_resource_reference(&t->buf_filled_size, NULL);
78	FREE(t);
79}
80
81void si_streamout_buffers_dirty(struct si_context *sctx)
82{
83	if (!sctx->streamout.enabled_mask)
84		return;
85
86	si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_begin);
87	si_set_streamout_enable(sctx, true);
88}
89
/* Bind num_targets streamout targets (pipe_context::set_stream_output_targets).
 *
 * Sequence: flag cache flushes for the outgoing targets, stop any active
 * streamout, take references on the new targets, update the enable/append
 * masks, mark the begin/enable atoms dirty, and finally bind each target's
 * buffer as an internal RW shader buffer (streamout stores go through the
 * shader on this hardware, not through VGT).
 *
 * offsets[i] == -1 requests "append": resume from the saved filled size.
 */
static void si_set_streamout_targets(struct pipe_context *ctx,
				     unsigned num_targets,
				     struct pipe_stream_output_target **targets,
				     const unsigned *offsets)
{
	struct si_context *sctx = (struct si_context *)ctx;
	unsigned old_num_targets = sctx->streamout.num_targets;
	unsigned i;

	/* We are going to unbind the buffers. Mark which caches need to be flushed. */
	if (sctx->streamout.num_targets && sctx->streamout.begin_emitted) {
		/* Since streamout uses vector writes which go through TC L2
		 * and most other clients can use TC L2 as well, we don't need
		 * to flush it.
		 *
		 * The only cases which requires flushing it is VGT DMA index
		 * fetching (on <= CIK) and indirect draw data, which are rare
		 * cases. Thus, flag the TC L2 dirtiness in the resource and
		 * handle it at draw call time.
		 */
		for (i = 0; i < sctx->streamout.num_targets; i++)
			if (sctx->streamout.targets[i])
				si_resource(sctx->streamout.targets[i]->b.buffer)->TC_L2_dirty = true;

		/* Invalidate the scalar cache in case a streamout buffer is
		 * going to be used as a constant buffer.
		 *
		 * Invalidate vL1, because streamout bypasses it (done by
		 * setting GLC=1 in the store instruction), but vL1 in other
		 * CUs can contain outdated data of streamout buffers.
		 *
		 * VS_PARTIAL_FLUSH is required if the buffers are going to be
		 * used as an input immediately.
		 */
		sctx->flags |= SI_CONTEXT_INV_SMEM_L1 |
				 SI_CONTEXT_INV_VMEM_L1 |
				 SI_CONTEXT_VS_PARTIAL_FLUSH;
	}

	/* All readers of the streamout targets need to be finished before we can
	 * start writing to the targets.
	 */
	if (num_targets)
		sctx->flags |= SI_CONTEXT_PS_PARTIAL_FLUSH |
		                 SI_CONTEXT_CS_PARTIAL_FLUSH;

	/* Streamout buffers must be bound in 2 places:
	 * 1) in VGT by setting the VGT_STRMOUT registers
	 * 2) as shader resources
	 */

	/* Stop streamout. */
	if (sctx->streamout.num_targets && sctx->streamout.begin_emitted)
		si_emit_streamout_end(sctx);

	/* Set the new targets. */
	unsigned enabled_mask = 0, append_bitmask = 0;
	for (i = 0; i < num_targets; i++) {
		si_so_target_reference(&sctx->streamout.targets[i], targets[i]);
		if (!targets[i])
			continue;

		si_context_add_resource_size(sctx, targets[i]->buffer);
		enabled_mask |= 1 << i;

		/* offsets[i] == -1 means "append to the existing contents". */
		if (offsets[i] == ((unsigned)-1))
			append_bitmask |= 1 << i;
	}

	/* Release references on any targets beyond the new count. */
	for (; i < sctx->streamout.num_targets; i++)
		si_so_target_reference(&sctx->streamout.targets[i], NULL);

	sctx->streamout.enabled_mask = enabled_mask;
	sctx->streamout.num_targets = num_targets;
	sctx->streamout.append_bitmask = append_bitmask;

	/* Update dirty state bits. */
	if (num_targets) {
		si_streamout_buffers_dirty(sctx);
	} else {
		si_set_atom_dirty(sctx, &sctx->atoms.s.streamout_begin, false);
		si_set_streamout_enable(sctx, false);
	}

	/* Set the shader resources. */
	for (i = 0; i < num_targets; i++) {
		if (targets[i]) {
			struct pipe_shader_buffer sbuf;
			sbuf.buffer = targets[i]->buffer;
			sbuf.buffer_offset = 0;
			/* Bind from offset 0 so the shader's byte offset
			 * (buffer_offset + written size) indexes directly. */
			sbuf.buffer_size = targets[i]->buffer_offset +
					   targets[i]->buffer_size;
			si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, &sbuf);
			si_resource(targets[i]->buffer)->bind_history |= PIPE_BIND_STREAM_OUTPUT;
		} else {
			si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
		}
	}
	/* Unbind shader-buffer slots left over from the previous binding. */
	for (; i < old_num_targets; i++)
		si_set_rw_shader_buffer(sctx, SI_VS_STREAMOUT_BUF0 + i, NULL);
}
191
/* Flush VGT streamout: write CP_STRMOUT_CNTL = 0, kick the
 * SO_VGTSTREAMOUT_FLUSH event, then busy-wait (WAIT_REG_MEM) until the CP
 * sets OFFSET_UPDATE_DONE in that register, i.e. the buffer-offset update
 * has landed. Must precede reprogramming the VGT_STRMOUT registers. */
static void si_flush_vgt_streamout(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	unsigned reg_strmout_cntl;

	/* The register is at different places on different ASICs. */
	if (sctx->chip_class >= CIK) {
		reg_strmout_cntl = R_0300FC_CP_STRMOUT_CNTL;
		radeon_set_uconfig_reg(cs, reg_strmout_cntl, 0);
	} else {
		reg_strmout_cntl = R_0084FC_CP_STRMOUT_CNTL;
		radeon_set_config_reg(cs, reg_strmout_cntl, 0);
	}

	radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
	radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SO_VGTSTREAMOUT_FLUSH) | EVENT_INDEX(0));

	radeon_emit(cs, PKT3(PKT3_WAIT_REG_MEM, 5, 0));
	radeon_emit(cs, WAIT_REG_MEM_EQUAL); /* wait until the register is equal to the reference value */
	radeon_emit(cs, reg_strmout_cntl >> 2);  /* register */
	radeon_emit(cs, 0);
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* reference value */
	radeon_emit(cs, S_0084FC_OFFSET_UPDATE_DONE(1)); /* mask */
	radeon_emit(cs, 4); /* poll interval */
}
217
/* Emit the packets that (re)start streamout for all bound targets:
 * program each buffer's size and vertex stride into the VGT_STRMOUT
 * registers, then load the write offset either from the saved filled-size
 * dword ("append") or from the target's start offset. */
static void si_emit_streamout_begin(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	struct si_streamout_target **t = sctx->streamout.targets;
	uint16_t *stride_in_dw = sctx->streamout.stride_in_dw;
	unsigned i;

	/* VGT must be idle w.r.t. streamout before its registers change. */
	si_flush_vgt_streamout(sctx);

	for (i = 0; i < sctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		/* Remember the stride for si_emit_streamout_end / queries. */
		t[i]->stride_in_dw = stride_in_dw[i];

		/* SI binds streamout buffers as shader resources.
		 * VGT only counts primitives and tells the shader
		 * through SGPRs what to do. */
		radeon_set_context_reg_seq(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 2);
		radeon_emit(cs, (t[i]->b.buffer_offset +
				 t[i]->b.buffer_size) >> 2);	/* BUFFER_SIZE (in DW) */
		radeon_emit(cs, stride_in_dw[i]);		/* VTX_STRIDE (in DW) */

		if (sctx->streamout.append_bitmask & (1 << i) && t[i]->buf_filled_size_valid) {
			uint64_t va = t[i]->buf_filled_size->gpu_address +
				      t[i]->buf_filled_size_offset;

			/* Append. */
			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
				    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_MEM)); /* control */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, va); /* src address lo */
			radeon_emit(cs, va >> 32); /* src address hi */

			/* The CP reads the filled size; keep the BO resident. */
			radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
						  t[i]->buf_filled_size,
						  RADEON_USAGE_READ,
						  RADEON_PRIO_SO_FILLED_SIZE);
		} else {
			/* Start from the beginning. */
			radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
			radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
				    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_FROM_PACKET)); /* control */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, 0); /* unused */
			radeon_emit(cs, t[i]->b.buffer_offset >> 2); /* buffer offset in DW */
			radeon_emit(cs, 0); /* unused */
		}
	}

	sctx->streamout.begin_emitted = true;
}
272
/* Emit the packets that stop streamout: store each buffer's filled size
 * back to its suballocated dword (so a later begin can append), then zero
 * the VGT buffer size so disabled buffers stop counting. */
void si_emit_streamout_end(struct si_context *sctx)
{
	struct radeon_cmdbuf *cs = sctx->gfx_cs;
	struct si_streamout_target **t = sctx->streamout.targets;
	unsigned i;
	uint64_t va;

	/* Make sure all pending offset updates have completed first. */
	si_flush_vgt_streamout(sctx);

	for (i = 0; i < sctx->streamout.num_targets; i++) {
		if (!t[i])
			continue;

		/* Ask the CP to write BUFFER_FILLED_SIZE to the saved dword. */
		va = t[i]->buf_filled_size->gpu_address + t[i]->buf_filled_size_offset;
		radeon_emit(cs, PKT3(PKT3_STRMOUT_BUFFER_UPDATE, 4, 0));
		radeon_emit(cs, STRMOUT_SELECT_BUFFER(i) |
			    STRMOUT_OFFSET_SOURCE(STRMOUT_OFFSET_NONE) |
			    STRMOUT_STORE_BUFFER_FILLED_SIZE); /* control */
		radeon_emit(cs, va);     /* dst address lo */
		radeon_emit(cs, va >> 32); /* dst address hi */
		radeon_emit(cs, 0); /* unused */
		radeon_emit(cs, 0); /* unused */

		radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
					  t[i]->buf_filled_size,
					  RADEON_USAGE_WRITE,
					  RADEON_PRIO_SO_FILLED_SIZE);

		/* Zero the buffer size. The counters (primitives generated,
		 * primitives emitted) may be enabled even if there is not
		 * buffer bound. This ensures that the primitives-emitted query
		 * won't increment. */
		radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
		/* A context register was written outside an atom. */
		sctx->context_roll = true;

		/* The filled size can now be used to append. */
		t[i]->buf_filled_size_valid = true;
	}

	sctx->streamout.begin_emitted = false;
}
313
314/* STREAMOUT CONFIG DERIVED STATE
315 *
316 * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
317 * The buffer mask is an independent state, so no writes occur if there
318 * are no buffers bound.
319 */
320
321static void si_emit_streamout_enable(struct si_context *sctx)
322{
323	radeon_set_context_reg_seq(sctx->gfx_cs, R_028B94_VGT_STRMOUT_CONFIG, 2);
324	radeon_emit(sctx->gfx_cs,
325		    S_028B94_STREAMOUT_0_EN(si_get_strmout_en(sctx)) |
326		    S_028B94_RAST_STREAM(0) |
327		    S_028B94_STREAMOUT_1_EN(si_get_strmout_en(sctx)) |
328		    S_028B94_STREAMOUT_2_EN(si_get_strmout_en(sctx)) |
329		    S_028B94_STREAMOUT_3_EN(si_get_strmout_en(sctx)));
330	radeon_emit(sctx->gfx_cs,
331		    sctx->streamout.hw_enabled_mask &
332		    sctx->streamout.enabled_stream_buffers_mask);
333}
334
335static void si_set_streamout_enable(struct si_context *sctx, bool enable)
336{
337	bool old_strmout_en = si_get_strmout_en(sctx);
338	unsigned old_hw_enabled_mask = sctx->streamout.hw_enabled_mask;
339
340	sctx->streamout.streamout_enabled = enable;
341
342	sctx->streamout.hw_enabled_mask = sctx->streamout.enabled_mask |
343					  (sctx->streamout.enabled_mask << 4) |
344					  (sctx->streamout.enabled_mask << 8) |
345					  (sctx->streamout.enabled_mask << 12);
346
347	if ((old_strmout_en != si_get_strmout_en(sctx)) ||
348            (old_hw_enabled_mask != sctx->streamout.hw_enabled_mask))
349		si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
350}
351
352void si_update_prims_generated_query_state(struct si_context *sctx,
353					   unsigned type, int diff)
354{
355	if (type == PIPE_QUERY_PRIMITIVES_GENERATED) {
356		bool old_strmout_en = si_get_strmout_en(sctx);
357
358		sctx->streamout.num_prims_gen_queries += diff;
359		assert(sctx->streamout.num_prims_gen_queries >= 0);
360
361		sctx->streamout.prims_gen_query_enabled =
362			sctx->streamout.num_prims_gen_queries != 0;
363
364		if (old_strmout_en != si_get_strmout_en(sctx))
365			si_mark_atom_dirty(sctx, &sctx->atoms.s.streamout_enable);
366	}
367}
368
369void si_init_streamout_functions(struct si_context *sctx)
370{
371	sctx->b.create_stream_output_target = si_create_so_target;
372	sctx->b.stream_output_target_destroy = si_so_target_destroy;
373	sctx->b.set_stream_output_targets = si_set_streamout_targets;
374	sctx->atoms.s.streamout_begin.emit = si_emit_streamout_begin;
375	sctx->atoms.s.streamout_enable.emit = si_emit_streamout_enable;
376}
377