1848b8605Smrg/*
2848b8605Smrg * Copyright (c) 2014 Scott Mansell
3848b8605Smrg * Copyright © 2014 Broadcom
4848b8605Smrg *
5848b8605Smrg * Permission is hereby granted, free of charge, to any person obtaining a
6848b8605Smrg * copy of this software and associated documentation files (the "Software"),
7848b8605Smrg * to deal in the Software without restriction, including without limitation
8848b8605Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
9848b8605Smrg * and/or sell copies of the Software, and to permit persons to whom the
10848b8605Smrg * Software is furnished to do so, subject to the following conditions:
11848b8605Smrg *
12848b8605Smrg * The above copyright notice and this permission notice (including the next
13848b8605Smrg * paragraph) shall be included in all copies or substantial portions of the
14848b8605Smrg * Software.
15848b8605Smrg *
16848b8605Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17848b8605Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18848b8605Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19848b8605Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20848b8605Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21848b8605Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22848b8605Smrg * IN THE SOFTWARE.
23848b8605Smrg */
24848b8605Smrg
25b8e80941Smrg#include "util/u_blitter.h"
26b8e80941Smrg#include "util/u_prim.h"
27848b8605Smrg#include "util/u_format.h"
28848b8605Smrg#include "util/u_pack_color.h"
29b8e80941Smrg#include "util/u_upload_mgr.h"
30848b8605Smrg#include "indices/u_primconvert.h"
31848b8605Smrg
32848b8605Smrg#include "vc4_context.h"
33848b8605Smrg#include "vc4_resource.h"
34848b8605Smrg
35b8e80941Smrg#define VC4_HW_2116_COUNT		0x1ef0
36b8e80941Smrg
37b8e80941Smrgstatic void
38b8e80941Smrgvc4_get_draw_cl_space(struct vc4_job *job, int vert_count)
39b8e80941Smrg{
40b8e80941Smrg        /* The SW-5891 workaround may cause us to emit multiple shader recs
41b8e80941Smrg         * and draw packets.
42b8e80941Smrg         */
43b8e80941Smrg        int num_draws = DIV_ROUND_UP(vert_count, 65535 - 2) + 1;
44b8e80941Smrg
45b8e80941Smrg        /* Binner gets our packet state -- vc4_emit.c contents,
46b8e80941Smrg         * and the primitive itself.
47b8e80941Smrg         */
48b8e80941Smrg        cl_ensure_space(&job->bcl,
49b8e80941Smrg                        256 + (VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE +
50b8e80941Smrg                               VC4_PACKET_GL_SHADER_STATE_SIZE) * num_draws);
51b8e80941Smrg
52b8e80941Smrg        /* Nothing for rcl -- that's covered by vc4_context.c */
53b8e80941Smrg
54b8e80941Smrg        /* shader_rec gets up to 12 dwords of reloc handles plus a maximally
55b8e80941Smrg         * sized shader_rec (104 bytes base for 8 vattrs plus 32 bytes of
56b8e80941Smrg         * vattr stride).
57b8e80941Smrg         */
58b8e80941Smrg        cl_ensure_space(&job->shader_rec,
59b8e80941Smrg                        (12 * sizeof(uint32_t) + 104 + 8 * 32) * num_draws);
60b8e80941Smrg
61b8e80941Smrg        /* Uniforms are covered by vc4_write_uniforms(). */
62b8e80941Smrg
63b8e80941Smrg        /* There could be up to 16 textures per stage, plus misc other
64b8e80941Smrg         * pointers.
65b8e80941Smrg         */
66b8e80941Smrg        cl_ensure_space(&job->bo_handles, (2 * 16 + 20) * sizeof(uint32_t));
67b8e80941Smrg        cl_ensure_space(&job->bo_pointers,
68b8e80941Smrg                        (2 * 16 + 20) * sizeof(struct vc4_bo *));
69b8e80941Smrg}
70b8e80941Smrg
71848b8605Smrg/**
72848b8605Smrg * Does the initial bining command list setup for drawing to a given FBO.
73848b8605Smrg */
74848b8605Smrgstatic void
75848b8605Smrgvc4_start_draw(struct vc4_context *vc4)
76848b8605Smrg{
77b8e80941Smrg        struct vc4_job *job = vc4->job;
78b8e80941Smrg
79b8e80941Smrg        if (job->needs_flush)
80848b8605Smrg                return;
81848b8605Smrg
82b8e80941Smrg        vc4_get_draw_cl_space(job, 0);
83b8e80941Smrg
84b8e80941Smrg        cl_emit(&job->bcl, TILE_BINNING_MODE_CONFIGURATION, bin) {
85b8e80941Smrg                bin.width_in_tiles = job->draw_tiles_x;
86b8e80941Smrg                bin.height_in_tiles = job->draw_tiles_y;
87b8e80941Smrg                bin.multisample_mode_4x = job->msaa;
88b8e80941Smrg        }
89848b8605Smrg
90b8e80941Smrg        /* START_TILE_BINNING resets the statechange counters in the hardware,
91b8e80941Smrg         * which are what is used when a primitive is binned to a tile to
92b8e80941Smrg         * figure out what new state packets need to be written to that tile's
93b8e80941Smrg         * command list.
94848b8605Smrg         */
95b8e80941Smrg        cl_emit(&job->bcl, START_TILE_BINNING, start);
96b8e80941Smrg
97b8e80941Smrg        /* Reset the current compressed primitives format.  This gets modified
98b8e80941Smrg         * by VC4_PACKET_GL_INDEXED_PRIMITIVE and
99b8e80941Smrg         * VC4_PACKET_GL_ARRAY_PRIMITIVE, so it needs to be reset at the start
100b8e80941Smrg         * of every tile.
101b8e80941Smrg         */
102b8e80941Smrg        cl_emit(&job->bcl, PRIMITIVE_LIST_FORMAT, list) {
103b8e80941Smrg                list.data_type = _16_BIT_INDEX;
104b8e80941Smrg                list.primitive_type = TRIANGLES_LIST;
105848b8605Smrg        }
106b8e80941Smrg
107b8e80941Smrg        job->needs_flush = true;
108b8e80941Smrg        job->draw_width = vc4->framebuffer.width;
109b8e80941Smrg        job->draw_height = vc4->framebuffer.height;
110b8e80941Smrg}
111b8e80941Smrg
112b8e80941Smrgstatic void
113b8e80941Smrgvc4_predraw_check_textures(struct pipe_context *pctx,
114b8e80941Smrg                           struct vc4_texture_stateobj *stage_tex)
115b8e80941Smrg{
116b8e80941Smrg        struct vc4_context *vc4 = vc4_context(pctx);
117b8e80941Smrg
118b8e80941Smrg        for (int i = 0; i < stage_tex->num_textures; i++) {
119b8e80941Smrg                struct vc4_sampler_view *view =
120b8e80941Smrg                        vc4_sampler_view(stage_tex->textures[i]);
121b8e80941Smrg                if (!view)
122b8e80941Smrg                        continue;
123b8e80941Smrg
124b8e80941Smrg                if (view->texture != view->base.texture)
125b8e80941Smrg                        vc4_update_shadow_baselevel_texture(pctx, &view->base);
126b8e80941Smrg
127b8e80941Smrg                vc4_flush_jobs_writing_resource(vc4, view->texture);
128b8e80941Smrg        }
129b8e80941Smrg}
130b8e80941Smrg
131b8e80941Smrgstatic void
132b8e80941Smrgvc4_emit_gl_shader_state(struct vc4_context *vc4,
133b8e80941Smrg                         const struct pipe_draw_info *info,
134b8e80941Smrg                         uint32_t extra_index_bias)
135b8e80941Smrg{
136b8e80941Smrg        struct vc4_job *job = vc4->job;
137b8e80941Smrg        /* VC4_DIRTY_VTXSTATE */
138b8e80941Smrg        struct vc4_vertex_stateobj *vtx = vc4->vtx;
139b8e80941Smrg        /* VC4_DIRTY_VTXBUF */
140b8e80941Smrg        struct vc4_vertexbuf_stateobj *vertexbuf = &vc4->vertexbuf;
141b8e80941Smrg
142b8e80941Smrg        /* The simulator throws a fit if VS or CS don't read an attribute, so
143b8e80941Smrg         * we emit a dummy read.
144b8e80941Smrg         */
145b8e80941Smrg        uint32_t num_elements_emit = MAX2(vtx->num_elements, 1);
146b8e80941Smrg
147b8e80941Smrg        /* Emit the shader record. */
148b8e80941Smrg        cl_start_shader_reloc(&job->shader_rec, 3 + num_elements_emit);
149b8e80941Smrg
150b8e80941Smrg        cl_emit(&job->shader_rec, SHADER_RECORD, rec) {
151b8e80941Smrg                rec.enable_clipping = true;
152b8e80941Smrg
153b8e80941Smrg                /* VC4_DIRTY_COMPILED_FS */
154b8e80941Smrg                rec.fragment_shader_is_single_threaded =
155b8e80941Smrg                        !vc4->prog.fs->fs_threaded;
156b8e80941Smrg
157b8e80941Smrg                /* VC4_DIRTY_PRIM_MODE | VC4_DIRTY_RASTERIZER */
158b8e80941Smrg                rec.point_size_included_in_shaded_vertex_data =
159b8e80941Smrg                         (info->mode == PIPE_PRIM_POINTS &&
160b8e80941Smrg                          vc4->rasterizer->base.point_size_per_vertex);
161b8e80941Smrg
162b8e80941Smrg                /* VC4_DIRTY_COMPILED_FS */
163b8e80941Smrg                rec.fragment_shader_number_of_varyings =
164b8e80941Smrg                        vc4->prog.fs->num_inputs;
165b8e80941Smrg                rec.fragment_shader_code_address =
166b8e80941Smrg                        cl_address(vc4->prog.fs->bo, 0);
167b8e80941Smrg
168b8e80941Smrg                rec.coordinate_shader_attribute_array_select_bits =
169b8e80941Smrg                         vc4->prog.cs->vattrs_live;
170b8e80941Smrg                rec.coordinate_shader_total_attributes_size =
171b8e80941Smrg                         vc4->prog.cs->vattr_offsets[8];
172b8e80941Smrg                rec.coordinate_shader_code_address =
173b8e80941Smrg                        cl_address(vc4->prog.cs->bo, 0);
174b8e80941Smrg
175b8e80941Smrg                rec.vertex_shader_attribute_array_select_bits =
176b8e80941Smrg                         vc4->prog.vs->vattrs_live;
177b8e80941Smrg                rec.vertex_shader_total_attributes_size =
178b8e80941Smrg                         vc4->prog.vs->vattr_offsets[8];
179b8e80941Smrg                rec.vertex_shader_code_address =
180b8e80941Smrg                        cl_address(vc4->prog.vs->bo, 0);
181b8e80941Smrg        };
182b8e80941Smrg
183b8e80941Smrg        uint32_t max_index = 0xffff;
184b8e80941Smrg        for (int i = 0; i < vtx->num_elements; i++) {
185b8e80941Smrg                struct pipe_vertex_element *elem = &vtx->pipe[i];
186b8e80941Smrg                struct pipe_vertex_buffer *vb =
187b8e80941Smrg                        &vertexbuf->vb[elem->vertex_buffer_index];
188b8e80941Smrg                struct vc4_resource *rsc = vc4_resource(vb->buffer.resource);
189b8e80941Smrg                /* not vc4->dirty tracked: vc4->last_index_bias */
190b8e80941Smrg                uint32_t offset = (vb->buffer_offset +
191b8e80941Smrg                                   elem->src_offset +
192b8e80941Smrg                                   vb->stride * (info->index_bias +
193b8e80941Smrg                                                 extra_index_bias));
194b8e80941Smrg                uint32_t vb_size = rsc->bo->size - offset;
195b8e80941Smrg                uint32_t elem_size =
196b8e80941Smrg                        util_format_get_blocksize(elem->src_format);
197b8e80941Smrg
198b8e80941Smrg                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
199b8e80941Smrg                        attr.address = cl_address(rsc->bo, offset);
200b8e80941Smrg                        attr.number_of_bytes_minus_1 = elem_size - 1;
201b8e80941Smrg                        attr.stride = vb->stride;
202b8e80941Smrg                        attr.coordinate_shader_vpm_offset =
203b8e80941Smrg                                vc4->prog.cs->vattr_offsets[i];
204b8e80941Smrg                        attr.vertex_shader_vpm_offset =
205b8e80941Smrg                                vc4->prog.vs->vattr_offsets[i];
206b8e80941Smrg                }
207b8e80941Smrg
208b8e80941Smrg                if (vb->stride > 0) {
209b8e80941Smrg                        max_index = MIN2(max_index,
210b8e80941Smrg                                         (vb_size - elem_size) / vb->stride);
211b8e80941Smrg                }
212848b8605Smrg        }
213848b8605Smrg
214b8e80941Smrg        if (vtx->num_elements == 0) {
215b8e80941Smrg                assert(num_elements_emit == 1);
216b8e80941Smrg                struct vc4_bo *bo = vc4_bo_alloc(vc4->screen, 4096, "scratch VBO");
217b8e80941Smrg
218b8e80941Smrg                cl_emit(&job->shader_rec, ATTRIBUTE_RECORD, attr) {
219b8e80941Smrg                        attr.address = cl_address(bo, 0);
220b8e80941Smrg                        attr.number_of_bytes_minus_1 = 16 - 1;
221b8e80941Smrg                        attr.stride = 0;
222b8e80941Smrg                        attr.coordinate_shader_vpm_offset = 0;
223b8e80941Smrg                        attr.vertex_shader_vpm_offset = 0;
224b8e80941Smrg                }
225b8e80941Smrg
226b8e80941Smrg                vc4_bo_unreference(&bo);
227b8e80941Smrg        }
228b8e80941Smrg
229b8e80941Smrg        cl_emit(&job->bcl, GL_SHADER_STATE, shader_state) {
230b8e80941Smrg                /* Note that number of attributes == 0 in the packet means 8
231b8e80941Smrg                 * attributes.  This field also contains the offset into
232b8e80941Smrg                 * shader_rec.
233b8e80941Smrg                 */
234b8e80941Smrg                assert(vtx->num_elements <= 8);
235b8e80941Smrg                shader_state.number_of_attribute_arrays =
236b8e80941Smrg                        num_elements_emit & 0x7;
237b8e80941Smrg        }
238b8e80941Smrg
239b8e80941Smrg        vc4_write_uniforms(vc4, vc4->prog.fs,
240b8e80941Smrg                           &vc4->constbuf[PIPE_SHADER_FRAGMENT],
241b8e80941Smrg                           &vc4->fragtex);
242b8e80941Smrg        vc4_write_uniforms(vc4, vc4->prog.vs,
243b8e80941Smrg                           &vc4->constbuf[PIPE_SHADER_VERTEX],
244b8e80941Smrg                           &vc4->verttex);
245b8e80941Smrg        vc4_write_uniforms(vc4, vc4->prog.cs,
246b8e80941Smrg                           &vc4->constbuf[PIPE_SHADER_VERTEX],
247b8e80941Smrg                           &vc4->verttex);
248b8e80941Smrg
249b8e80941Smrg        vc4->last_index_bias = info->index_bias + extra_index_bias;
250b8e80941Smrg        vc4->max_index = max_index;
251b8e80941Smrg        job->shader_rec_count++;
252b8e80941Smrg}
253b8e80941Smrg
254b8e80941Smrg/**
255b8e80941Smrg * HW-2116 workaround: Flush the batch before triggering the hardware state
256b8e80941Smrg * counter wraparound behavior.
257b8e80941Smrg *
258b8e80941Smrg * State updates are tracked by a global counter which increments at the first
259b8e80941Smrg * state update after a draw or a START_BINNING.  Tiles can then have their
260b8e80941Smrg * state updated at draw time with a set of cheap checks for whether the
261b8e80941Smrg * state's copy of the global counter matches the global counter the last time
262b8e80941Smrg * that state was written to the tile.
263b8e80941Smrg *
264b8e80941Smrg * The state counters are relatively small and wrap around quickly, so you
265b8e80941Smrg * could get false negatives for needing to update a particular state in the
266b8e80941Smrg * tile.  To avoid this, the hardware attempts to write all of the state in
267b8e80941Smrg * the tile at wraparound time.  This apparently is broken, so we just flush
268b8e80941Smrg * everything before that behavior is triggered.  A batch flush is sufficient
269b8e80941Smrg * to get our current contents drawn and reset the counters to 0.
270b8e80941Smrg *
271b8e80941Smrg * Note that we can't just use VC4_PACKET_FLUSH_ALL, because that caps the
272b8e80941Smrg * tiles with VC4_PACKET_RETURN_FROM_LIST.
273b8e80941Smrg */
274b8e80941Smrgstatic void
275b8e80941Smrgvc4_hw_2116_workaround(struct pipe_context *pctx, int vert_count)
276b8e80941Smrg{
277b8e80941Smrg        struct vc4_context *vc4 = vc4_context(pctx);
278b8e80941Smrg        struct vc4_job *job = vc4_get_job_for_fbo(vc4);
279b8e80941Smrg
280b8e80941Smrg        if (job->draw_calls_queued + vert_count / 65535 >= VC4_HW_2116_COUNT) {
281b8e80941Smrg                perf_debug("Flushing batch due to HW-2116 workaround "
282b8e80941Smrg                           "(too many draw calls per scene\n");
283b8e80941Smrg                vc4_job_submit(vc4, job);
284b8e80941Smrg        }
285848b8605Smrg}
286848b8605Smrg
287848b8605Smrgstatic void
288848b8605Smrgvc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
289848b8605Smrg{
290848b8605Smrg        struct vc4_context *vc4 = vc4_context(pctx);
291b8e80941Smrg        struct pipe_draw_info local_info;
292b8e80941Smrg
293b8e80941Smrg	if (!info->count_from_stream_output && !info->indirect &&
294b8e80941Smrg	    !info->primitive_restart &&
295b8e80941Smrg	    !u_trim_pipe_prim(info->mode, (unsigned*)&info->count))
296b8e80941Smrg		return;
297848b8605Smrg
298848b8605Smrg        if (info->mode >= PIPE_PRIM_QUADS) {
299b8e80941Smrg                if (info->mode == PIPE_PRIM_QUADS &&
300b8e80941Smrg                    info->count == 4 &&
301b8e80941Smrg                    !vc4->rasterizer->base.flatshade) {
302b8e80941Smrg                        local_info = *info;
303b8e80941Smrg                        local_info.mode = PIPE_PRIM_TRIANGLE_FAN;
304b8e80941Smrg                        info = &local_info;
305b8e80941Smrg                } else {
306b8e80941Smrg                        util_primconvert_save_rasterizer_state(vc4->primconvert, &vc4->rasterizer->base);
307b8e80941Smrg                        util_primconvert_draw_vbo(vc4->primconvert, info);
308b8e80941Smrg                        perf_debug("Fallback conversion for %d %s vertices\n",
309b8e80941Smrg                                   info->count, u_prim_name(info->mode));
310b8e80941Smrg                        return;
311b8e80941Smrg                }
312b8e80941Smrg        }
313b8e80941Smrg
314b8e80941Smrg        /* Before setting up the draw, do any fixup blits necessary. */
315b8e80941Smrg        vc4_predraw_check_textures(pctx, &vc4->verttex);
316b8e80941Smrg        vc4_predraw_check_textures(pctx, &vc4->fragtex);
317b8e80941Smrg
318b8e80941Smrg        vc4_hw_2116_workaround(pctx, info->count);
319b8e80941Smrg
320b8e80941Smrg        struct vc4_job *job = vc4_get_job_for_fbo(vc4);
321b8e80941Smrg
322b8e80941Smrg        /* Make sure that the raster order flags haven't changed, which can
323b8e80941Smrg         * only be set at job granularity.
324b8e80941Smrg         */
325b8e80941Smrg        if (job->flags != vc4->rasterizer->tile_raster_order_flags) {
326b8e80941Smrg                vc4_job_submit(vc4, job);
327b8e80941Smrg                job = vc4_get_job_for_fbo(vc4);
328b8e80941Smrg        }
329b8e80941Smrg
330b8e80941Smrg        vc4_get_draw_cl_space(job, info->count);
331b8e80941Smrg
332b8e80941Smrg        if (vc4->prim_mode != info->mode) {
333b8e80941Smrg                vc4->prim_mode = info->mode;
334b8e80941Smrg                vc4->dirty |= VC4_DIRTY_PRIM_MODE;
335848b8605Smrg        }
336848b8605Smrg
337848b8605Smrg        vc4_start_draw(vc4);
338b8e80941Smrg        if (!vc4_update_compiled_shaders(vc4, info->mode)) {
339b8e80941Smrg                debug_warn_once("shader compile failed, skipping draw call.\n");
340b8e80941Smrg                return;
341b8e80941Smrg        }
342848b8605Smrg
343848b8605Smrg        vc4_emit_state(pctx);
344848b8605Smrg
345b8e80941Smrg        bool needs_drawarrays_shader_state = false;
346b8e80941Smrg
347b8e80941Smrg        if ((vc4->dirty & (VC4_DIRTY_VTXBUF |
348b8e80941Smrg                           VC4_DIRTY_VTXSTATE |
349b8e80941Smrg                           VC4_DIRTY_PRIM_MODE |
350b8e80941Smrg                           VC4_DIRTY_RASTERIZER |
351b8e80941Smrg                           VC4_DIRTY_COMPILED_CS |
352b8e80941Smrg                           VC4_DIRTY_COMPILED_VS |
353b8e80941Smrg                           VC4_DIRTY_COMPILED_FS |
354b8e80941Smrg                           vc4->prog.cs->uniform_dirty_bits |
355b8e80941Smrg                           vc4->prog.vs->uniform_dirty_bits |
356b8e80941Smrg                           vc4->prog.fs->uniform_dirty_bits)) ||
357b8e80941Smrg            vc4->last_index_bias != info->index_bias) {
358b8e80941Smrg                if (info->index_size)
359b8e80941Smrg                        vc4_emit_gl_shader_state(vc4, info, 0);
360b8e80941Smrg                else
361b8e80941Smrg                        needs_drawarrays_shader_state = true;
362b8e80941Smrg        }
363b8e80941Smrg
364b8e80941Smrg        vc4->dirty = 0;
365848b8605Smrg
366848b8605Smrg        /* Note that the primitive type fields match with OpenGL/gallium
367848b8605Smrg         * definitions, up to but not including QUADS.
368848b8605Smrg         */
369b8e80941Smrg        if (info->index_size) {
370b8e80941Smrg                uint32_t index_size = info->index_size;
371b8e80941Smrg                uint32_t offset = info->start * index_size;
372b8e80941Smrg                struct pipe_resource *prsc;
373b8e80941Smrg                if (info->index_size == 4) {
374b8e80941Smrg                        prsc = vc4_get_shadow_index_buffer(pctx, info,
375b8e80941Smrg                                                           offset,
376b8e80941Smrg                                                           info->count, &offset);
377b8e80941Smrg                        index_size = 2;
378b8e80941Smrg                } else {
379b8e80941Smrg                        if (info->has_user_indices) {
380b8e80941Smrg                                prsc = NULL;
381b8e80941Smrg                                u_upload_data(vc4->uploader, 0,
382b8e80941Smrg                                              info->count * index_size, 4,
383b8e80941Smrg                                              info->index.user,
384b8e80941Smrg                                              &offset, &prsc);
385b8e80941Smrg                        } else {
386b8e80941Smrg                                prsc = info->index.resource;
387b8e80941Smrg                        }
388b8e80941Smrg                }
389b8e80941Smrg                struct vc4_resource *rsc = vc4_resource(prsc);
390b8e80941Smrg
391b8e80941Smrg                struct vc4_cl_out *bcl = cl_start(&job->bcl);
392b8e80941Smrg
393b8e80941Smrg                /* The original design for the VC4 kernel UABI had multiple
394b8e80941Smrg                 * packets that used relocations in the BCL (some of which
395b8e80941Smrg                 * needed two BOs), but later modifications eliminated all but
396b8e80941Smrg                 * this one usage.  We have an arbitrary 32-bit offset value,
397b8e80941Smrg                 * and need to also supply an arbitrary 32-bit index buffer
398b8e80941Smrg                 * GEM handle, so we have this fake packet we emit in our BCL
399b8e80941Smrg                 * to be validated, which the kernel uses at validation time
400b8e80941Smrg                 * to perform the relocation in the IB packet (without
401b8e80941Smrg                 * emitting to the actual HW).
402b8e80941Smrg                 */
403b8e80941Smrg                uint32_t hindex = vc4_gem_hindex(job, rsc->bo);
404b8e80941Smrg                if (job->last_gem_handle_hindex != hindex) {
405b8e80941Smrg                        cl_u8(&bcl, VC4_PACKET_GEM_HANDLES);
406b8e80941Smrg                        cl_u32(&bcl, hindex);
407b8e80941Smrg                        cl_u32(&bcl, 0);
408b8e80941Smrg                        job->last_gem_handle_hindex = hindex;
409b8e80941Smrg                }
410b8e80941Smrg
411b8e80941Smrg                cl_u8(&bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
412b8e80941Smrg                cl_u8(&bcl,
413848b8605Smrg                      info->mode |
414b8e80941Smrg                      (index_size == 2 ?
415848b8605Smrg                       VC4_INDEX_BUFFER_U16:
416848b8605Smrg                       VC4_INDEX_BUFFER_U8));
417b8e80941Smrg                cl_u32(&bcl, info->count);
418b8e80941Smrg                cl_u32(&bcl, offset);
419b8e80941Smrg                cl_u32(&bcl, vc4->max_index);
420b8e80941Smrg
421b8e80941Smrg                cl_end(&job->bcl, bcl);
422b8e80941Smrg                job->draw_calls_queued++;
423b8e80941Smrg
424b8e80941Smrg                if (info->index_size == 4 || info->has_user_indices)
425b8e80941Smrg                        pipe_resource_reference(&prsc, NULL);
426848b8605Smrg        } else {
427b8e80941Smrg                uint32_t count = info->count;
428b8e80941Smrg                uint32_t start = info->start;
429b8e80941Smrg                uint32_t extra_index_bias = 0;
430b8e80941Smrg                static const uint32_t max_verts = 65535;
431b8e80941Smrg
432b8e80941Smrg                /* GFXH-515 / SW-5891: The binner emits 16 bit indices for
433b8e80941Smrg                 * drawarrays, which means that if start + count > 64k it
434b8e80941Smrg                 * would truncate the top bits.  Work around this by emitting
435b8e80941Smrg                 * a limited number of primitives at a time and reemitting the
436b8e80941Smrg                 * shader state pointing farther down the vertex attribute
437b8e80941Smrg                 * arrays.
438b8e80941Smrg                 *
439b8e80941Smrg                 * To do this properly for line loops or trifans, we'd need to
440b8e80941Smrg                 * make a new VB containing the first vertex plus whatever
441b8e80941Smrg                 * remainder.
442b8e80941Smrg                 */
443b8e80941Smrg                if (start + count > max_verts) {
444b8e80941Smrg                        extra_index_bias = start;
445b8e80941Smrg                        start = 0;
446b8e80941Smrg                        needs_drawarrays_shader_state = true;
447b8e80941Smrg                }
448b8e80941Smrg
449b8e80941Smrg                while (count) {
450b8e80941Smrg                        uint32_t this_count = count;
451b8e80941Smrg                        uint32_t step = count;
452b8e80941Smrg
453b8e80941Smrg                        if (needs_drawarrays_shader_state) {
454b8e80941Smrg                                vc4_emit_gl_shader_state(vc4, info,
455b8e80941Smrg                                                         extra_index_bias);
456b8e80941Smrg                        }
457b8e80941Smrg
458b8e80941Smrg                        if (count > max_verts) {
459b8e80941Smrg                                switch (info->mode) {
460b8e80941Smrg                                case PIPE_PRIM_POINTS:
461b8e80941Smrg                                        this_count = step = max_verts;
462b8e80941Smrg                                        break;
463b8e80941Smrg                                case PIPE_PRIM_LINES:
464b8e80941Smrg                                        this_count = step = max_verts - (max_verts % 2);
465b8e80941Smrg                                        break;
466b8e80941Smrg                                case PIPE_PRIM_LINE_STRIP:
467b8e80941Smrg                                        this_count = max_verts;
468b8e80941Smrg                                        step = max_verts - 1;
469b8e80941Smrg                                        break;
470b8e80941Smrg                                case PIPE_PRIM_LINE_LOOP:
471b8e80941Smrg                                        this_count = max_verts;
472b8e80941Smrg                                        step = max_verts - 1;
473b8e80941Smrg                                        debug_warn_once("unhandled line loop "
474b8e80941Smrg                                                        "looping behavior with "
475b8e80941Smrg                                                        ">65535 verts\n");
476b8e80941Smrg                                        break;
477b8e80941Smrg                                case PIPE_PRIM_TRIANGLES:
478b8e80941Smrg                                        this_count = step = max_verts - (max_verts % 3);
479b8e80941Smrg                                        break;
480b8e80941Smrg                                case PIPE_PRIM_TRIANGLE_STRIP:
481b8e80941Smrg                                        this_count = max_verts;
482b8e80941Smrg                                        step = max_verts - 2;
483b8e80941Smrg                                        break;
484b8e80941Smrg                                default:
485b8e80941Smrg                                        debug_warn_once("unhandled primitive "
486b8e80941Smrg                                                        "max vert count, truncating\n");
487b8e80941Smrg                                        this_count = step = max_verts;
488b8e80941Smrg                                }
489b8e80941Smrg                        }
490b8e80941Smrg
491b8e80941Smrg                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMITIVES, array) {
492b8e80941Smrg                                array.primitive_mode = info->mode;
493b8e80941Smrg                                array.length = this_count;
494b8e80941Smrg                                array.index_of_first_vertex = start;
495b8e80941Smrg                        }
496b8e80941Smrg                        job->draw_calls_queued++;
497b8e80941Smrg
498b8e80941Smrg                        count -= step;
499b8e80941Smrg                        extra_index_bias += start + step;
500b8e80941Smrg                        start = 0;
501b8e80941Smrg                        needs_drawarrays_shader_state = true;
502b8e80941Smrg                }
503848b8605Smrg        }
504848b8605Smrg
505b8e80941Smrg        /* We shouldn't have tripped the HW_2116 bug with the GFXH-515
506b8e80941Smrg         * workaround.
507b8e80941Smrg         */
508b8e80941Smrg        assert(job->draw_calls_queued <= VC4_HW_2116_COUNT);
509848b8605Smrg
510b8e80941Smrg        if (vc4->zsa && vc4->framebuffer.zsbuf) {
511b8e80941Smrg                struct vc4_resource *rsc =
512b8e80941Smrg                        vc4_resource(vc4->framebuffer.zsbuf->texture);
513848b8605Smrg
514b8e80941Smrg                if (vc4->zsa->base.depth.enabled) {
515b8e80941Smrg                        job->resolve |= PIPE_CLEAR_DEPTH;
516b8e80941Smrg                        rsc->initialized_buffers = PIPE_CLEAR_DEPTH;
517b8e80941Smrg                }
518848b8605Smrg
519b8e80941Smrg                if (vc4->zsa->base.stencil[0].enabled) {
520b8e80941Smrg                        job->resolve |= PIPE_CLEAR_STENCIL;
521b8e80941Smrg                        rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
522b8e80941Smrg                }
523848b8605Smrg        }
524848b8605Smrg
525b8e80941Smrg        job->resolve |= PIPE_CLEAR_COLOR0;
526b8e80941Smrg
527b8e80941Smrg        /* If we've used half of the presumably 256MB CMA area, flush the job
528b8e80941Smrg         * so that we don't accumulate a job that will end up not being
529b8e80941Smrg         * executable.
530b8e80941Smrg         */
531b8e80941Smrg        if (job->bo_space > 128 * 1024 * 1024)
532b8e80941Smrg                vc4_flush(pctx);
533b8e80941Smrg
534b8e80941Smrg        if (vc4_debug & VC4_DEBUG_ALWAYS_FLUSH)
535b8e80941Smrg                vc4_flush(pctx);
536848b8605Smrg}
537848b8605Smrg
538848b8605Smrgstatic uint32_t
539848b8605Smrgpack_rgba(enum pipe_format format, const float *rgba)
540848b8605Smrg{
541848b8605Smrg        union util_color uc;
542848b8605Smrg        util_pack_color(rgba, format, &uc);
543b8e80941Smrg        if (util_format_get_blocksize(format) == 2)
544b8e80941Smrg                return uc.us;
545b8e80941Smrg        else
546b8e80941Smrg                return uc.ui[0];
547848b8605Smrg}
548848b8605Smrg
549848b8605Smrgstatic void
550848b8605Smrgvc4_clear(struct pipe_context *pctx, unsigned buffers,
551848b8605Smrg          const union pipe_color_union *color, double depth, unsigned stencil)
552848b8605Smrg{
553848b8605Smrg        struct vc4_context *vc4 = vc4_context(pctx);
554b8e80941Smrg        struct vc4_job *job = vc4_get_job_for_fbo(vc4);
555b8e80941Smrg
556b8e80941Smrg        if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
557b8e80941Smrg                struct vc4_resource *rsc =
558b8e80941Smrg                        vc4_resource(vc4->framebuffer.zsbuf->texture);
559b8e80941Smrg                unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
560b8e80941Smrg
561b8e80941Smrg                /* Clearing ZS will clear both Z and stencil, so if we're
562b8e80941Smrg                 * trying to clear just one then we need to draw a quad to do
563b8e80941Smrg                 * it instead.  We need to do this before setting up
564b8e80941Smrg                 * tile-based clears in vc4->job, because the blitter may
565b8e80941Smrg                 * submit the current job.
566b8e80941Smrg                 */
567b8e80941Smrg                if ((zsclear == PIPE_CLEAR_DEPTH ||
568b8e80941Smrg                     zsclear == PIPE_CLEAR_STENCIL) &&
569b8e80941Smrg                    (rsc->initialized_buffers & ~(zsclear | job->cleared)) &&
570b8e80941Smrg                    util_format_is_depth_and_stencil(vc4->framebuffer.zsbuf->format)) {
571b8e80941Smrg                        static const union pipe_color_union dummy_color = {};
572b8e80941Smrg
573b8e80941Smrg                        perf_debug("Partial clear of Z+stencil buffer, "
574b8e80941Smrg                                   "drawing a quad instead of fast clearing\n");
575b8e80941Smrg                        vc4_blitter_save(vc4);
576b8e80941Smrg                        util_blitter_clear(vc4->blitter,
577b8e80941Smrg                                           vc4->framebuffer.width,
578b8e80941Smrg                                           vc4->framebuffer.height,
579b8e80941Smrg                                           1,
580b8e80941Smrg                                           zsclear,
581b8e80941Smrg                                           &dummy_color, depth, stencil);
582b8e80941Smrg                        buffers &= ~zsclear;
583b8e80941Smrg                        if (!buffers)
584b8e80941Smrg                                return;
585b8e80941Smrg                        job = vc4_get_job_for_fbo(vc4);
586b8e80941Smrg                }
587b8e80941Smrg        }
588848b8605Smrg
589848b8605Smrg        /* We can't flag new buffers for clearing once we've queued draws.  We
590848b8605Smrg         * could avoid this by using the 3d engine to clear.
591848b8605Smrg         */
592b8e80941Smrg        if (job->draw_calls_queued) {
593b8e80941Smrg                perf_debug("Flushing rendering to process new clear.\n");
594b8e80941Smrg                vc4_job_submit(vc4, job);
595b8e80941Smrg                job = vc4_get_job_for_fbo(vc4);
596b8e80941Smrg        }
597848b8605Smrg
598848b8605Smrg        if (buffers & PIPE_CLEAR_COLOR0) {
599b8e80941Smrg                struct vc4_resource *rsc =
600b8e80941Smrg                        vc4_resource(vc4->framebuffer.cbufs[0]->texture);
601b8e80941Smrg                uint32_t clear_color;
602b8e80941Smrg
603b8e80941Smrg                if (vc4_rt_format_is_565(vc4->framebuffer.cbufs[0]->format)) {
604b8e80941Smrg                        /* In 565 mode, the hardware will be packing our color
605b8e80941Smrg                         * for us.
606b8e80941Smrg                         */
607b8e80941Smrg                        clear_color = pack_rgba(PIPE_FORMAT_R8G8B8A8_UNORM,
608b8e80941Smrg                                                color->f);
609b8e80941Smrg                } else {
610b8e80941Smrg                        /* Otherwise, we need to do this packing because we
611b8e80941Smrg                         * support multiple swizzlings of RGBA8888.
612b8e80941Smrg                         */
613b8e80941Smrg                        clear_color =
614b8e80941Smrg                                pack_rgba(vc4->framebuffer.cbufs[0]->format,
615b8e80941Smrg                                          color->f);
616b8e80941Smrg                }
617b8e80941Smrg                job->clear_color[0] = job->clear_color[1] = clear_color;
618b8e80941Smrg                rsc->initialized_buffers |= (buffers & PIPE_CLEAR_COLOR0);
619848b8605Smrg        }
620848b8605Smrg
621b8e80941Smrg        if (buffers & PIPE_CLEAR_DEPTHSTENCIL) {
622b8e80941Smrg                struct vc4_resource *rsc =
623b8e80941Smrg                        vc4_resource(vc4->framebuffer.zsbuf->texture);
624b8e80941Smrg
625b8e80941Smrg                /* Though the depth buffer is stored with Z in the high 24,
626b8e80941Smrg                 * for this field we just need to store it in the low 24.
627b8e80941Smrg                 */
628b8e80941Smrg                if (buffers & PIPE_CLEAR_DEPTH) {
629b8e80941Smrg                        job->clear_depth = util_pack_z(PIPE_FORMAT_Z24X8_UNORM,
630b8e80941Smrg                                                       depth);
631b8e80941Smrg                }
632b8e80941Smrg                if (buffers & PIPE_CLEAR_STENCIL)
633b8e80941Smrg                        job->clear_stencil = stencil;
634b8e80941Smrg
635b8e80941Smrg                rsc->initialized_buffers |= (buffers & PIPE_CLEAR_DEPTHSTENCIL);
636b8e80941Smrg        }
637848b8605Smrg
638b8e80941Smrg        job->draw_min_x = 0;
639b8e80941Smrg        job->draw_min_y = 0;
640b8e80941Smrg        job->draw_max_x = vc4->framebuffer.width;
641b8e80941Smrg        job->draw_max_y = vc4->framebuffer.height;
642b8e80941Smrg        job->cleared |= buffers;
643b8e80941Smrg        job->resolve |= buffers;
644848b8605Smrg
645848b8605Smrg        vc4_start_draw(vc4);
646848b8605Smrg}
647848b8605Smrg
/* pipe_context::clear_render_target hook.
 *
 * Not implemented: every argument is ignored and a notice is written to
 * stderr.
 */
static void
vc4_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fputs("unimpl: clear RT\n", stderr);
}
656848b8605Smrg
/* pipe_context::clear_depth_stencil hook.
 *
 * Not implemented: every argument is ignored and a notice is written to
 * stderr.
 */
static void
vc4_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fputs("unimpl: clear DS\n", stderr);
}
665848b8605Smrg
666848b8605Smrgvoid
667848b8605Smrgvc4_draw_init(struct pipe_context *pctx)
668848b8605Smrg{
669848b8605Smrg        pctx->draw_vbo = vc4_draw_vbo;
670848b8605Smrg        pctx->clear = vc4_clear;
671848b8605Smrg        pctx->clear_render_target = vc4_clear_render_target;
672848b8605Smrg        pctx->clear_depth_stencil = vc4_clear_depth_stencil;
673848b8605Smrg}
674