1/*
2 * Copyright © 2014-2017 Broadcom
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24#include "util/u_blitter.h"
25#include "util/u_draw.h"
26#include "util/u_prim.h"
27#include "util/format/u_format.h"
28#include "util/u_pack_color.h"
29#include "util/u_prim_restart.h"
30#include "util/u_upload_mgr.h"
31
32#include "v3d_context.h"
33#include "v3d_resource.h"
34#include "v3d_cl.h"
35#include "broadcom/compiler/v3d_compiler.h"
36#include "broadcom/common/v3d_macros.h"
37#include "broadcom/common/v3d_util.h"
38#include "broadcom/cle/v3dx_pack.h"
39
40static void
41v3d_start_binning(struct v3d_context *v3d, struct v3d_job *job)
42{
43        assert(job->needs_flush);
44
45        /* Get space to emit our BCL state, using a branch to jump to a new BO
46         * if necessary.
47         */
48
49        v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
50
51        job->submit.bcl_start = job->bcl.bo->offset;
52        v3d_job_add_bo(job, job->bcl.bo);
53
54        /* The PTB will request the tile alloc initial size per tile at start
55         * of tile binning.
56         */
57        uint32_t tile_alloc_size =
58                MAX2(job->num_layers, 1) * job->draw_tiles_x * job->draw_tiles_y * 64;
59
60        /* The PTB allocates in aligned 4k chunks after the initial setup. */
61        tile_alloc_size = align(tile_alloc_size, 4096);
62
63        /* Include the first two chunk allocations that the PTB does so that
64         * we definitely clear the OOM condition before triggering one (the HW
65         * won't trigger OOM during the first allocations).
66         */
67        tile_alloc_size += 8192;
68
69        /* For performance, allocate some extra initial memory after the PTB's
70         * minimal allocations, so that we hopefully don't have to block the
71         * GPU on the kernel handling an OOM signal.
72         */
73        tile_alloc_size += 512 * 1024;
74
75        job->tile_alloc = v3d_bo_alloc(v3d->screen, tile_alloc_size,
76                                       "tile_alloc");
77        uint32_t tsda_per_tile_size = v3d->screen->devinfo.ver >= 40 ? 256 : 64;
78        job->tile_state = v3d_bo_alloc(v3d->screen,
79                                       MAX2(job->num_layers, 1) *
80                                       job->draw_tiles_y *
81                                       job->draw_tiles_x *
82                                       tsda_per_tile_size,
83                                       "TSDA");
84
85#if V3D_VERSION >= 41
86        /* This must go before the binning mode configuration. It is
87         * required for layered framebuffers to work.
88         */
89        if (job->num_layers > 0) {
90                cl_emit(&job->bcl, NUMBER_OF_LAYERS, config) {
91                        config.number_of_layers = job->num_layers;
92                }
93        }
94#endif
95
96#if V3D_VERSION >= 40
97        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG, config) {
98                config.width_in_pixels = job->draw_width;
99                config.height_in_pixels = job->draw_height;
100                config.number_of_render_targets =
101                        MAX2(job->nr_cbufs, 1);
102
103                config.multisample_mode_4x = job->msaa;
104
105                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
106        }
107#else /* V3D_VERSION < 40 */
108        /* "Binning mode lists start with a Tile Binning Mode Configuration
109         * item (120)"
110         *
111         * Part1 signals the end of binning config setup.
112         */
113        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART2, config) {
114                config.tile_allocation_memory_address =
115                        cl_address(job->tile_alloc, 0);
116                config.tile_allocation_memory_size = job->tile_alloc->size;
117        }
118
119        cl_emit(&job->bcl, TILE_BINNING_MODE_CFG_PART1, config) {
120                config.tile_state_data_array_base_address =
121                        cl_address(job->tile_state, 0);
122
123                config.width_in_tiles = job->draw_tiles_x;
124                config.height_in_tiles = job->draw_tiles_y;
125                /* Must be >= 1 */
126                config.number_of_render_targets =
127                        MAX2(job->nr_cbufs, 1);
128
129                config.multisample_mode_4x = job->msaa;
130
131                config.maximum_bpp_of_all_render_targets = job->internal_bpp;
132        }
133#endif /* V3D_VERSION < 40 */
134
135        /* There's definitely nothing in the VCD cache we want. */
136        cl_emit(&job->bcl, FLUSH_VCD_CACHE, bin);
137
138        /* Disable any leftover OQ state from another job. */
139        cl_emit(&job->bcl, OCCLUSION_QUERY_COUNTER, counter);
140
141        /* "Binning mode lists must have a Start Tile Binning item (6) after
142         *  any prefix state data before the binning list proper starts."
143         */
144        cl_emit(&job->bcl, START_TILE_BINNING, bin);
145}
146/**
147 * Does the initial bining command list setup for drawing to a given FBO.
148 */
149static void
150v3d_start_draw(struct v3d_context *v3d)
151{
152        struct v3d_job *job = v3d->job;
153
154        if (job->needs_flush)
155                return;
156
157        job->needs_flush = true;
158        job->draw_width = v3d->framebuffer.width;
159        job->draw_height = v3d->framebuffer.height;
160        job->num_layers = util_framebuffer_get_num_layers(&v3d->framebuffer);
161
162        v3d_start_binning(v3d, job);
163}
164
165static void
166v3d_predraw_check_stage_inputs(struct pipe_context *pctx,
167                               enum pipe_shader_type s)
168{
169        struct v3d_context *v3d = v3d_context(pctx);
170
171        /* Flush writes to textures we're sampling. */
172        for (int i = 0; i < v3d->tex[s].num_textures; i++) {
173                struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
174                if (!pview)
175                        continue;
176                struct v3d_sampler_view *view = v3d_sampler_view(pview);
177
178                if (view->texture != view->base.texture &&
179                    view->base.format != PIPE_FORMAT_X32_S8X24_UINT)
180                        v3d_update_shadow_texture(pctx, &view->base);
181
182                v3d_flush_jobs_writing_resource(v3d, view->texture,
183                                                V3D_FLUSH_DEFAULT,
184                                                s == PIPE_SHADER_COMPUTE);
185        }
186
187        /* Flush writes to UBOs. */
188        u_foreach_bit(i, v3d->constbuf[s].enabled_mask) {
189                struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
190                if (cb->buffer) {
191                        v3d_flush_jobs_writing_resource(v3d, cb->buffer,
192                                                        V3D_FLUSH_DEFAULT,
193                                                        s == PIPE_SHADER_COMPUTE);
194                }
195        }
196
197        /* Flush reads/writes to our SSBOs */
198        u_foreach_bit(i, v3d->ssbo[s].enabled_mask) {
199                struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
200                if (sb->buffer) {
201                        v3d_flush_jobs_reading_resource(v3d, sb->buffer,
202                                                        V3D_FLUSH_NOT_CURRENT_JOB,
203                                                        s == PIPE_SHADER_COMPUTE);
204                }
205        }
206
207        /* Flush reads/writes to our image views */
208        u_foreach_bit(i, v3d->shaderimg[s].enabled_mask) {
209                struct v3d_image_view *view = &v3d->shaderimg[s].si[i];
210
211                v3d_flush_jobs_reading_resource(v3d, view->base.resource,
212                                                V3D_FLUSH_NOT_CURRENT_JOB,
213                                                s == PIPE_SHADER_COMPUTE);
214        }
215
216        /* Flush writes to our vertex buffers (i.e. from transform feedback) */
217        if (s == PIPE_SHADER_VERTEX) {
218                u_foreach_bit(i, v3d->vertexbuf.enabled_mask) {
219                        struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];
220
221                        v3d_flush_jobs_writing_resource(v3d, vb->buffer.resource,
222                                                        V3D_FLUSH_DEFAULT,
223                                                        false);
224                }
225        }
226}
227
228static void
229v3d_predraw_check_outputs(struct pipe_context *pctx)
230{
231        struct v3d_context *v3d = v3d_context(pctx);
232
233        /* Flush jobs reading from TF buffers that we are about to write. */
234        if (v3d_transform_feedback_enabled(v3d)) {
235                struct v3d_streamout_stateobj *so = &v3d->streamout;
236
237                for (int i = 0; i < so->num_targets; i++) {
238                        if (!so->targets[i])
239                                continue;
240
241                        const struct pipe_stream_output_target *target =
242                                so->targets[i];
243                        v3d_flush_jobs_reading_resource(v3d, target->buffer,
244                                                        V3D_FLUSH_DEFAULT,
245                                                        false);
246                }
247        }
248}
249
250/**
251 * Checks if the state for the current draw reads a particular resource in
252 * in the given shader stage.
253 */
254static bool
255v3d_state_reads_resource(struct v3d_context *v3d,
256                         struct pipe_resource *prsc,
257                         enum pipe_shader_type s)
258{
259        struct v3d_resource *rsc = v3d_resource(prsc);
260
261        /* Vertex buffers */
262        if (s == PIPE_SHADER_VERTEX) {
263                u_foreach_bit(i, v3d->vertexbuf.enabled_mask) {
264                        struct pipe_vertex_buffer *vb = &v3d->vertexbuf.vb[i];
265                        if (!vb->buffer.resource)
266                                continue;
267
268                        struct v3d_resource *vb_rsc =
269                                v3d_resource(vb->buffer.resource);
270                        if (rsc->bo == vb_rsc->bo)
271                                return true;
272                }
273        }
274
275        /* Constant buffers */
276        u_foreach_bit(i, v3d->constbuf[s].enabled_mask) {
277                struct pipe_constant_buffer *cb = &v3d->constbuf[s].cb[i];
278                if (!cb->buffer)
279                        continue;
280
281                struct v3d_resource *cb_rsc = v3d_resource(cb->buffer);
282                if (rsc->bo == cb_rsc->bo)
283                        return true;
284        }
285
286        /* Shader storage buffers */
287        u_foreach_bit(i, v3d->ssbo[s].enabled_mask) {
288                struct pipe_shader_buffer *sb = &v3d->ssbo[s].sb[i];
289                if (!sb->buffer)
290                        continue;
291
292                struct v3d_resource *sb_rsc = v3d_resource(sb->buffer);
293                if (rsc->bo == sb_rsc->bo)
294                        return true;
295        }
296
297        /* Textures  */
298        for (int i = 0; i < v3d->tex[s].num_textures; i++) {
299                struct pipe_sampler_view *pview = v3d->tex[s].textures[i];
300                if (!pview)
301                        continue;
302
303                struct v3d_sampler_view *view = v3d_sampler_view(pview);
304                struct v3d_resource *v_rsc = v3d_resource(view->texture);
305                if (rsc->bo == v_rsc->bo)
306                        return true;
307        }
308
309        return false;
310}
311
312static void
313v3d_emit_wait_for_tf(struct v3d_job *job)
314{
315        /* XXX: we might be able to skip this in some cases, for now we
316         * always emit it.
317         */
318        cl_emit(&job->bcl, FLUSH_TRANSFORM_FEEDBACK_DATA, flush);
319
320        cl_emit(&job->bcl, WAIT_FOR_TRANSFORM_FEEDBACK, wait) {
321                /* XXX: Wait for all outstanding writes... maybe we can do
322                 * better in some cases.
323                 */
324                wait.block_count = 255;
325        }
326
327        /* We have just flushed all our outstanding TF work in this job so make
328         * sure we don't emit TF flushes again for any of it again.
329         */
330        _mesa_set_clear(job->tf_write_prscs, NULL);
331}
332
333static void
334v3d_emit_wait_for_tf_if_needed(struct v3d_context *v3d, struct v3d_job *job)
335{
336        if (!job->tf_enabled)
337            return;
338
339        set_foreach(job->tf_write_prscs, entry) {
340                struct pipe_resource *prsc = (struct pipe_resource *)entry->key;
341                for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) {
342                        /* Fragment shaders can only start executing after all
343                         * binning (and thus TF) is complete.
344                         *
345                         * XXX: For VS/GS/TES, if the binning shader does not
346                         * read the resource then we could also avoid emitting
347                         * the wait.
348                         */
349                        if (s == PIPE_SHADER_FRAGMENT)
350                            continue;
351
352                        if (v3d_state_reads_resource(v3d, prsc, s)) {
353                                v3d_emit_wait_for_tf(job);
354                                return;
355                        }
356                }
357        }
358}
359
360#if V3D_VERSION >= 41
361static void
362v3d_emit_gs_state_record(struct v3d_job *job,
363                         struct v3d_compiled_shader *gs_bin,
364                         struct v3d_cl_reloc gs_bin_uniforms,
365                         struct v3d_compiled_shader *gs,
366                         struct v3d_cl_reloc gs_render_uniforms)
367{
368        cl_emit(&job->indirect, GEOMETRY_SHADER_STATE_RECORD, shader) {
369                shader.geometry_bin_mode_shader_code_address =
370                        cl_address(v3d_resource(gs_bin->resource)->bo,
371                                   gs_bin->offset);
372                shader.geometry_bin_mode_shader_4_way_threadable =
373                        gs_bin->prog_data.gs->base.threads == 4;
374                shader.geometry_bin_mode_shader_start_in_final_thread_section =
375                        gs_bin->prog_data.gs->base.single_seg;
376                shader.geometry_bin_mode_shader_propagate_nans = true;
377                shader.geometry_bin_mode_shader_uniforms_address =
378                        gs_bin_uniforms;
379
380                shader.geometry_render_mode_shader_code_address =
381                        cl_address(v3d_resource(gs->resource)->bo, gs->offset);
382                shader.geometry_render_mode_shader_4_way_threadable =
383                        gs->prog_data.gs->base.threads == 4;
384                shader.geometry_render_mode_shader_start_in_final_thread_section =
385                        gs->prog_data.gs->base.single_seg;
386                shader.geometry_render_mode_shader_propagate_nans = true;
387                shader.geometry_render_mode_shader_uniforms_address =
388                        gs_render_uniforms;
389        }
390}
391
392static uint8_t
393v3d_gs_output_primitive(uint32_t prim_type)
394{
395    switch (prim_type) {
396    case GL_POINTS:
397        return GEOMETRY_SHADER_POINTS;
398    case GL_LINE_STRIP:
399        return GEOMETRY_SHADER_LINE_STRIP;
400    case GL_TRIANGLE_STRIP:
401        return GEOMETRY_SHADER_TRI_STRIP;
402    default:
403        unreachable("Unsupported primitive type");
404    }
405}
406
407static void
408v3d_emit_tes_gs_common_params(struct v3d_job *job,
409                              uint8_t gs_out_prim_type,
410                              uint8_t gs_num_invocations)
411{
412        /* This, and v3d_emit_tes_gs_shader_params below, fill in default
413         * values for tessellation fields even though we don't support
414         * tessellation yet because our packing functions (and the simulator)
415         * complain if we don't.
416         */
417        cl_emit(&job->indirect, TESSELLATION_GEOMETRY_COMMON_PARAMS, shader) {
418                shader.tessellation_type = TESSELLATION_TYPE_TRIANGLE;
419                shader.tessellation_point_mode = false;
420                shader.tessellation_edge_spacing = TESSELLATION_EDGE_SPACING_EVEN;
421                shader.tessellation_clockwise = true;
422                shader.tessellation_invocations = 1;
423
424                shader.geometry_shader_output_format =
425                        v3d_gs_output_primitive(gs_out_prim_type);
426                shader.geometry_shader_instances = gs_num_invocations & 0x1F;
427        }
428}
429
430static uint8_t
431simd_width_to_gs_pack_mode(uint32_t width)
432{
433    switch (width) {
434    case 16:
435        return V3D_PACK_MODE_16_WAY;
436    case 8:
437        return V3D_PACK_MODE_8_WAY;
438    case 4:
439        return V3D_PACK_MODE_4_WAY;
440    case 1:
441        return V3D_PACK_MODE_1_WAY;
442    default:
443        unreachable("Invalid SIMD width");
444    };
445}
446
447static void
448v3d_emit_tes_gs_shader_params(struct v3d_job *job,
449                              uint32_t gs_simd,
450                              uint32_t gs_vpm_output_size,
451                              uint32_t gs_max_vpm_input_size_per_batch)
452{
453        cl_emit(&job->indirect, TESSELLATION_GEOMETRY_SHADER_PARAMS, shader) {
454                shader.tcs_batch_flush_mode = V3D_TCS_FLUSH_MODE_FULLY_PACKED;
455                shader.per_patch_data_column_depth = 1;
456                shader.tcs_output_segment_size_in_sectors = 1;
457                shader.tcs_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
458                shader.tes_output_segment_size_in_sectors = 1;
459                shader.tes_output_segment_pack_mode = V3D_PACK_MODE_16_WAY;
460                shader.gs_output_segment_size_in_sectors = gs_vpm_output_size;
461                shader.gs_output_segment_pack_mode =
462                        simd_width_to_gs_pack_mode(gs_simd);
463                shader.tbg_max_patches_per_tcs_batch = 1;
464                shader.tbg_max_extra_vertex_segs_for_patches_after_first = 0;
465                shader.tbg_min_tcs_output_segments_required_in_play = 1;
466                shader.tbg_min_per_patch_data_segments_required_in_play = 1;
467                shader.tpg_max_patches_per_tes_batch = 1;
468                shader.tpg_max_vertex_segments_per_tes_batch = 0;
469                shader.tpg_max_tcs_output_segments_per_tes_batch = 1;
470                shader.tpg_min_tes_output_segments_required_in_play = 1;
471                shader.gbg_max_tes_output_vertex_segments_per_gs_batch =
472                        gs_max_vpm_input_size_per_batch;
473                shader.gbg_min_gs_output_segments_required_in_play = 1;
474        }
475}
476#endif
477
478static void
479v3d_emit_gl_shader_state(struct v3d_context *v3d,
480                         const struct pipe_draw_info *info)
481{
482        struct v3d_job *job = v3d->job;
483        /* V3D_DIRTY_VTXSTATE */
484        struct v3d_vertex_stateobj *vtx = v3d->vtx;
485        /* V3D_DIRTY_VTXBUF */
486        struct v3d_vertexbuf_stateobj *vertexbuf = &v3d->vertexbuf;
487
488        /* Upload the uniforms to the indirect CL first */
489        struct v3d_cl_reloc fs_uniforms =
490                v3d_write_uniforms(v3d, job, v3d->prog.fs,
491                                   PIPE_SHADER_FRAGMENT);
492
493        struct v3d_cl_reloc gs_uniforms = { NULL, 0 };
494        struct v3d_cl_reloc gs_bin_uniforms = { NULL, 0 };
495        if (v3d->prog.gs) {
496                gs_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs,
497                                                 PIPE_SHADER_GEOMETRY);
498        }
499        if (v3d->prog.gs_bin) {
500                gs_bin_uniforms = v3d_write_uniforms(v3d, job, v3d->prog.gs_bin,
501                                                     PIPE_SHADER_GEOMETRY);
502        }
503
504        struct v3d_cl_reloc vs_uniforms =
505                v3d_write_uniforms(v3d, job, v3d->prog.vs,
506                                   PIPE_SHADER_VERTEX);
507        struct v3d_cl_reloc cs_uniforms =
508                v3d_write_uniforms(v3d, job, v3d->prog.cs,
509                                   PIPE_SHADER_VERTEX);
510
511        /* Update the cache dirty flag based on the shader progs data */
512        job->tmu_dirty_rcl |= v3d->prog.cs->prog_data.vs->base.tmu_dirty_rcl;
513        job->tmu_dirty_rcl |= v3d->prog.vs->prog_data.vs->base.tmu_dirty_rcl;
514        if (v3d->prog.gs_bin) {
515                job->tmu_dirty_rcl |=
516                        v3d->prog.gs_bin->prog_data.gs->base.tmu_dirty_rcl;
517        }
518        if (v3d->prog.gs) {
519                job->tmu_dirty_rcl |=
520                        v3d->prog.gs->prog_data.gs->base.tmu_dirty_rcl;
521        }
522        job->tmu_dirty_rcl |= v3d->prog.fs->prog_data.fs->base.tmu_dirty_rcl;
523
524        uint32_t num_elements_to_emit = 0;
525        for (int i = 0; i < vtx->num_elements; i++) {
526                struct pipe_vertex_element *elem = &vtx->pipe[i];
527                struct pipe_vertex_buffer *vb =
528                        &vertexbuf->vb[elem->vertex_buffer_index];
529                if (vb->buffer.resource)
530                        num_elements_to_emit++;
531        }
532
533        uint32_t shader_state_record_length =
534                cl_packet_length(GL_SHADER_STATE_RECORD);
535#if V3D_VERSION >= 41
536        if (v3d->prog.gs) {
537                shader_state_record_length +=
538                        cl_packet_length(GEOMETRY_SHADER_STATE_RECORD) +
539                        cl_packet_length(TESSELLATION_GEOMETRY_COMMON_PARAMS) +
540                        2 * cl_packet_length(TESSELLATION_GEOMETRY_SHADER_PARAMS);
541        }
542#endif
543
544        /* See GFXH-930 workaround below */
545        uint32_t shader_rec_offset =
546                    v3d_cl_ensure_space(&job->indirect,
547                                    shader_state_record_length +
548                                    MAX2(num_elements_to_emit, 1) *
549                                    cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD),
550                                    32);
551
552        /* XXX perf: We should move most of the SHADER_STATE_RECORD setup to
553         * compile time, so that we mostly just have to OR the VS and FS
554         * records together at draw time.
555         */
556
557        struct vpm_config vpm_cfg_bin, vpm_cfg;
558
559        assert(v3d->screen->devinfo.ver >= 41 || !v3d->prog.gs);
560        v3d_compute_vpm_config(&v3d->screen->devinfo,
561                               v3d->prog.cs->prog_data.vs,
562                               v3d->prog.vs->prog_data.vs,
563                               v3d->prog.gs ? v3d->prog.gs_bin->prog_data.gs : NULL,
564                               v3d->prog.gs ? v3d->prog.gs->prog_data.gs : NULL,
565                               &vpm_cfg_bin,
566                               &vpm_cfg);
567
568        if (v3d->prog.gs) {
569#if V3D_VERSION >= 41
570                v3d_emit_gs_state_record(v3d->job,
571                                         v3d->prog.gs_bin, gs_bin_uniforms,
572                                         v3d->prog.gs, gs_uniforms);
573
574                struct v3d_gs_prog_data *gs = v3d->prog.gs->prog_data.gs;
575                v3d_emit_tes_gs_common_params(v3d->job,
576                                              gs->out_prim_type,
577                                              gs->num_invocations);
578
579                /* Bin Tes/Gs params */
580                v3d_emit_tes_gs_shader_params(v3d->job,
581                                              vpm_cfg_bin.gs_width,
582                                              vpm_cfg_bin.Gd,
583                                              vpm_cfg_bin.Gv);
584
585                /* Render Tes/Gs params */
586                v3d_emit_tes_gs_shader_params(v3d->job,
587                                              vpm_cfg.gs_width,
588                                              vpm_cfg.Gd,
589                                              vpm_cfg.Gv);
590#else
591                unreachable("No GS support pre-4.1");
592#endif
593        }
594
595        cl_emit(&job->indirect, GL_SHADER_STATE_RECORD, shader) {
596                shader.enable_clipping = true;
597                /* V3D_DIRTY_PRIM_MODE | V3D_DIRTY_RASTERIZER */
598                shader.point_size_in_shaded_vertex_data =
599                        (info->mode == PIPE_PRIM_POINTS &&
600                         v3d->rasterizer->base.point_size_per_vertex);
601
602                /* Must be set if the shader modifies Z, discards, or modifies
603                 * the sample mask.  For any of these cases, the fragment
604                 * shader needs to write the Z value (even just discards).
605                 */
606                shader.fragment_shader_does_z_writes =
607                        v3d->prog.fs->prog_data.fs->writes_z;
608                /* Set if the EZ test must be disabled (due to shader side
609                 * effects and the early_z flag not being present in the
610                 * shader).
611                 */
612                shader.turn_off_early_z_test =
613                        v3d->prog.fs->prog_data.fs->disable_ez;
614
615                shader.fragment_shader_uses_real_pixel_centre_w_in_addition_to_centroid_w2 =
616                        v3d->prog.fs->prog_data.fs->uses_center_w;
617
618#if V3D_VERSION >= 41
619                shader.any_shader_reads_hardware_written_primitive_id =
620                        (v3d->prog.gs && v3d->prog.gs->prog_data.gs->uses_pid) ||
621                        v3d->prog.fs->prog_data.fs->uses_pid;
622                shader.insert_primitive_id_as_first_varying_to_fragment_shader =
623                        !v3d->prog.gs && v3d->prog.fs->prog_data.fs->uses_pid;
624#endif
625
626#if V3D_VERSION >= 40
627               shader.do_scoreboard_wait_on_first_thread_switch =
628                        v3d->prog.fs->prog_data.fs->lock_scoreboard_on_first_thrsw;
629               shader.disable_implicit_point_line_varyings =
630                        !v3d->prog.fs->prog_data.fs->uses_implicit_point_line_varyings;
631#endif
632
633                shader.number_of_varyings_in_fragment_shader =
634                        v3d->prog.fs->prog_data.fs->num_inputs;
635
636                shader.coordinate_shader_propagate_nans = true;
637                shader.vertex_shader_propagate_nans = true;
638                shader.fragment_shader_propagate_nans = true;
639
640                shader.coordinate_shader_code_address =
641                        cl_address(v3d_resource(v3d->prog.cs->resource)->bo,
642                                   v3d->prog.cs->offset);
643                shader.vertex_shader_code_address =
644                        cl_address(v3d_resource(v3d->prog.vs->resource)->bo,
645                                   v3d->prog.vs->offset);
646                shader.fragment_shader_code_address =
647                        cl_address(v3d_resource(v3d->prog.fs->resource)->bo,
648                                   v3d->prog.fs->offset);
649
650                /* XXX: Use combined input/output size flag in the common
651                 * case.
652                 */
653                shader.coordinate_shader_has_separate_input_and_output_vpm_blocks =
654                        v3d->prog.cs->prog_data.vs->separate_segments;
655                shader.vertex_shader_has_separate_input_and_output_vpm_blocks =
656                        v3d->prog.vs->prog_data.vs->separate_segments;
657
658                shader.coordinate_shader_input_vpm_segment_size =
659                        v3d->prog.cs->prog_data.vs->separate_segments ?
660                        v3d->prog.cs->prog_data.vs->vpm_input_size : 1;
661                shader.vertex_shader_input_vpm_segment_size =
662                        v3d->prog.vs->prog_data.vs->separate_segments ?
663                        v3d->prog.vs->prog_data.vs->vpm_input_size : 1;
664
665                shader.coordinate_shader_output_vpm_segment_size =
666                        v3d->prog.cs->prog_data.vs->vpm_output_size;
667                shader.vertex_shader_output_vpm_segment_size =
668                        v3d->prog.vs->prog_data.vs->vpm_output_size;
669
670                shader.coordinate_shader_uniforms_address = cs_uniforms;
671                shader.vertex_shader_uniforms_address = vs_uniforms;
672                shader.fragment_shader_uniforms_address = fs_uniforms;
673
674#if V3D_VERSION >= 41
675                shader.min_coord_shader_input_segments_required_in_play =
676                        vpm_cfg_bin.As;
677                shader.min_vertex_shader_input_segments_required_in_play =
678                        vpm_cfg.As;
679
680                shader.min_coord_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
681                        vpm_cfg_bin.Ve;
682                shader.min_vertex_shader_output_segments_required_in_play_in_addition_to_vcm_cache_size =
683                        vpm_cfg.Ve;
684
685                shader.coordinate_shader_4_way_threadable =
686                        v3d->prog.cs->prog_data.vs->base.threads == 4;
687                shader.vertex_shader_4_way_threadable =
688                        v3d->prog.vs->prog_data.vs->base.threads == 4;
689                shader.fragment_shader_4_way_threadable =
690                        v3d->prog.fs->prog_data.fs->base.threads == 4;
691
692                shader.coordinate_shader_start_in_final_thread_section =
693                        v3d->prog.cs->prog_data.vs->base.single_seg;
694                shader.vertex_shader_start_in_final_thread_section =
695                        v3d->prog.vs->prog_data.vs->base.single_seg;
696                shader.fragment_shader_start_in_final_thread_section =
697                        v3d->prog.fs->prog_data.fs->base.single_seg;
698#else
699                shader.coordinate_shader_4_way_threadable =
700                        v3d->prog.cs->prog_data.vs->base.threads == 4;
701                shader.coordinate_shader_2_way_threadable =
702                        v3d->prog.cs->prog_data.vs->base.threads == 2;
703                shader.vertex_shader_4_way_threadable =
704                        v3d->prog.vs->prog_data.vs->base.threads == 4;
705                shader.vertex_shader_2_way_threadable =
706                        v3d->prog.vs->prog_data.vs->base.threads == 2;
707                shader.fragment_shader_4_way_threadable =
708                        v3d->prog.fs->prog_data.fs->base.threads == 4;
709                shader.fragment_shader_2_way_threadable =
710                        v3d->prog.fs->prog_data.fs->base.threads == 2;
711#endif
712
713                shader.vertex_id_read_by_coordinate_shader =
714                        v3d->prog.cs->prog_data.vs->uses_vid;
715                shader.instance_id_read_by_coordinate_shader =
716                        v3d->prog.cs->prog_data.vs->uses_iid;
717                shader.vertex_id_read_by_vertex_shader =
718                        v3d->prog.vs->prog_data.vs->uses_vid;
719                shader.instance_id_read_by_vertex_shader =
720                        v3d->prog.vs->prog_data.vs->uses_iid;
721
722                shader.address_of_default_attribute_values =
723                        cl_address(v3d_resource(vtx->defaults)->bo,
724                                   vtx->defaults_offset);
725        }
726
727        bool cs_loaded_any = false;
728        for (int i = 0; i < vtx->num_elements; i++) {
729                struct pipe_vertex_element *elem = &vtx->pipe[i];
730                struct pipe_vertex_buffer *vb =
731                        &vertexbuf->vb[elem->vertex_buffer_index];
732                struct v3d_resource *rsc = v3d_resource(vb->buffer.resource);
733
734                if (!rsc)
735                        continue;
736
737                const uint32_t size =
738                        cl_packet_length(GL_SHADER_STATE_ATTRIBUTE_RECORD);
739                cl_emit_with_prepacked(&job->indirect,
740                                       GL_SHADER_STATE_ATTRIBUTE_RECORD,
741                                       &vtx->attrs[i * size], attr) {
742                        attr.stride = vb->stride;
743                        attr.address = cl_address(rsc->bo,
744                                                  vb->buffer_offset +
745                                                  elem->src_offset);
746                        attr.number_of_values_read_by_coordinate_shader =
747                                v3d->prog.cs->prog_data.vs->vattr_sizes[i];
748                        attr.number_of_values_read_by_vertex_shader =
749                                v3d->prog.vs->prog_data.vs->vattr_sizes[i];
750
751                        /* GFXH-930: At least one attribute must be enabled
752                         * and read by CS and VS.  If we have attributes being
753                         * consumed by the VS but not the CS, then set up a
754                         * dummy load of the last attribute into the CS's VPM
755                         * inputs.  (Since CS is just dead-code-elimination
756                         * compared to VS, we can't have CS loading but not
757                         * VS).
758                         */
759                        if (v3d->prog.cs->prog_data.vs->vattr_sizes[i])
760                                cs_loaded_any = true;
761                        if (i == vtx->num_elements - 1 && !cs_loaded_any) {
762                                attr.number_of_values_read_by_coordinate_shader = 1;
763                        }
764#if V3D_VERSION >= 41
765                        attr.maximum_index = 0xffffff;
766#endif
767                }
768                STATIC_ASSERT(sizeof(vtx->attrs) >= V3D_MAX_VS_INPUTS / 4 * size);
769        }
770
771        if (num_elements_to_emit == 0) {
772                /* GFXH-930: At least one attribute must be enabled and read
773                 * by CS and VS.  If we have no attributes being consumed by
774                 * the shader, set up a dummy to be loaded into the VPM.
775                 */
776                cl_emit(&job->indirect, GL_SHADER_STATE_ATTRIBUTE_RECORD, attr) {
777                        /* Valid address of data whose value will be unused. */
778                        attr.address = cl_address(job->indirect.bo, 0);
779
780                        attr.type = ATTRIBUTE_FLOAT;
781                        attr.stride = 0;
782                        attr.vec_size = 1;
783
784                        attr.number_of_values_read_by_coordinate_shader = 1;
785                        attr.number_of_values_read_by_vertex_shader = 1;
786                }
787                num_elements_to_emit = 1;
788        }
789
790        cl_emit(&job->bcl, VCM_CACHE_SIZE, vcm) {
791                vcm.number_of_16_vertex_batches_for_binning = vpm_cfg_bin.Vc;
792                vcm.number_of_16_vertex_batches_for_rendering = vpm_cfg.Vc;
793        }
794
795#if V3D_VERSION >= 41
796        if (v3d->prog.gs) {
797                cl_emit(&job->bcl, GL_SHADER_STATE_INCLUDING_GS, state) {
798                        state.address = cl_address(job->indirect.bo,
799                                                   shader_rec_offset);
800                        state.number_of_attribute_arrays = num_elements_to_emit;
801                }
802        } else {
803                cl_emit(&job->bcl, GL_SHADER_STATE, state) {
804                        state.address = cl_address(job->indirect.bo,
805                                                   shader_rec_offset);
806                        state.number_of_attribute_arrays = num_elements_to_emit;
807                }
808        }
809#else
810        assert(!v3d->prog.gs);
811        cl_emit(&job->bcl, GL_SHADER_STATE, state) {
812                state.address = cl_address(job->indirect.bo, shader_rec_offset);
813                state.number_of_attribute_arrays = num_elements_to_emit;
814        }
815#endif
816
817        v3d_bo_unreference(&cs_uniforms.bo);
818        v3d_bo_unreference(&vs_uniforms.bo);
819        if (gs_uniforms.bo)
820                v3d_bo_unreference(&gs_uniforms.bo);
821        if (gs_bin_uniforms.bo)
822                v3d_bo_unreference(&gs_bin_uniforms.bo);
823        v3d_bo_unreference(&fs_uniforms.bo);
824}
825
826/**
827 * Updates the number of primitives generated from the number of vertices
828 * to draw. This only works when no GS is present, since otherwise the number
829 * of primitives generated cannot be determined in advance and we need to
830 * use the PRIMITIVE_COUNTS_FEEDBACK command instead, however, that requires
831 * a sync wait for the draw to complete, so we only use that when GS is present.
832 */
833static void
834v3d_update_primitives_generated_counter(struct v3d_context *v3d,
835                                        const struct pipe_draw_info *info,
836                                        const struct pipe_draw_start_count_bias *draw)
837{
838        assert(!v3d->prog.gs);
839
840        if (!v3d->active_queries)
841                return;
842
843        uint32_t prims = u_prims_for_vertices(info->mode, draw->count);
844        v3d->prims_generated += prims;
845}
846
847static void
848v3d_update_job_ez(struct v3d_context *v3d, struct v3d_job *job)
849{
850        switch (v3d->zsa->ez_state) {
851        case V3D_EZ_UNDECIDED:
852                /* If the Z/S state didn't pick a direction but didn't
853                 * disable, then go along with the current EZ state.  This
854                 * allows EZ optimization for Z func == EQUAL or NEVER.
855                 */
856                break;
857
858        case V3D_EZ_LT_LE:
859        case V3D_EZ_GT_GE:
860                /* If the Z/S state picked a direction, then it needs to match
861                 * the current direction if we've decided on one.
862                 */
863                if (job->ez_state == V3D_EZ_UNDECIDED)
864                        job->ez_state = v3d->zsa->ez_state;
865                else if (job->ez_state != v3d->zsa->ez_state)
866                        job->ez_state = V3D_EZ_DISABLED;
867                break;
868
869        case V3D_EZ_DISABLED:
870                /* If the current Z/S state disables EZ because of a bad Z
871                 * func or stencil operation, then we can't do any more EZ in
872                 * this frame.
873                 */
874                job->ez_state = V3D_EZ_DISABLED;
875                break;
876        }
877
878        /* If the FS affects the Z of the pixels, then it may update against
879         * the chosen EZ direction (though we could use
880         * ARB_conservative_depth's hints to avoid this)
881         */
882        if (v3d->prog.fs->prog_data.fs->writes_z) {
883                job->ez_state = V3D_EZ_DISABLED;
884        }
885
886        if (job->first_ez_state == V3D_EZ_UNDECIDED &&
887            (job->ez_state != V3D_EZ_DISABLED || job->draw_calls_queued == 0))
888                job->first_ez_state = job->ez_state;
889}
890
891static uint32_t
892v3d_hw_prim_type(enum pipe_prim_type prim_type)
893{
894        switch (prim_type) {
895        case PIPE_PRIM_POINTS:
896        case PIPE_PRIM_LINES:
897        case PIPE_PRIM_LINE_LOOP:
898        case PIPE_PRIM_LINE_STRIP:
899        case PIPE_PRIM_TRIANGLES:
900        case PIPE_PRIM_TRIANGLE_STRIP:
901        case PIPE_PRIM_TRIANGLE_FAN:
902                return prim_type;
903
904        case PIPE_PRIM_LINES_ADJACENCY:
905        case PIPE_PRIM_LINE_STRIP_ADJACENCY:
906        case PIPE_PRIM_TRIANGLES_ADJACENCY:
907        case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY:
908                return 8 + (prim_type - PIPE_PRIM_LINES_ADJACENCY);
909
910        default:
911                unreachable("Unsupported primitive type");
912        }
913}
914
915static bool
916v3d_check_compiled_shaders(struct v3d_context *v3d)
917{
918        static bool warned[5] = { 0 };
919
920        uint32_t failed_stage = MESA_SHADER_NONE;
921        if (!v3d->prog.vs->resource || !v3d->prog.cs->resource) {
922                failed_stage = MESA_SHADER_VERTEX;
923        } else if ((v3d->prog.gs_bin && !v3d->prog.gs_bin->resource) ||
924                   (v3d->prog.gs && !v3d->prog.gs->resource)) {
925                failed_stage = MESA_SHADER_GEOMETRY;
926        } else if (v3d->prog.fs && !v3d->prog.fs->resource) {
927                failed_stage = MESA_SHADER_FRAGMENT;
928        }
929
930        if (likely(failed_stage == MESA_SHADER_NONE))
931                return true;
932
933        if (!warned[failed_stage]) {
934                fprintf(stderr,
935                        "%s shader failed to compile. Expect corruption.\n",
936                        _mesa_shader_stage_to_string(failed_stage));
937                warned[failed_stage] = true;
938        }
939        return false;
940}
941
942static void
943v3d_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
944             unsigned drawid_offset,
945             const struct pipe_draw_indirect_info *indirect,
946             const struct pipe_draw_start_count_bias *draws,
947             unsigned num_draws)
948{
949        if (num_draws > 1) {
950                util_draw_multi(pctx, info, drawid_offset, indirect, draws, num_draws);
951                return;
952        }
953
954        if (!indirect && (!draws[0].count || !info->instance_count))
955           return;
956
957        struct v3d_context *v3d = v3d_context(pctx);
958
959        if (!indirect &&
960            !info->primitive_restart &&
961            !u_trim_pipe_prim(info->mode, (unsigned*)&draws[0].count))
962                return;
963
964        /* Fall back for weird desktop GL primitive restart values. */
965        if (info->primitive_restart &&
966            info->index_size) {
967                uint32_t mask = util_prim_restart_index_from_size(info->index_size);
968                if (info->restart_index != mask) {
969                        util_draw_vbo_without_prim_restart(pctx, info, drawid_offset, indirect, &draws[0]);
970                        return;
971                }
972        }
973
974        /* Before setting up the draw, flush anything writing to the resources
975         * that we read from or reading from resources we write to.
976         */
977        for (int s = 0; s < PIPE_SHADER_COMPUTE; s++)
978                v3d_predraw_check_stage_inputs(pctx, s);
979
980        if (indirect && indirect->buffer) {
981                v3d_flush_jobs_writing_resource(v3d, indirect->buffer,
982                                                V3D_FLUSH_DEFAULT, false);
983        }
984
985        v3d_predraw_check_outputs(pctx);
986
987        /* If transform feedback is active and we are switching primitive type
988         * we need to submit the job before drawing and update the vertex count
989         * written to TF based on the primitive type since we will need to
990         * know the exact vertex count if the application decides to call
991         * glDrawTransformFeedback() later.
992         */
993        if (v3d->streamout.num_targets > 0 &&
994            u_base_prim_type(info->mode) != u_base_prim_type(v3d->prim_mode)) {
995                v3d_update_primitive_counters(v3d);
996        }
997
998        struct v3d_job *job = v3d_get_job_for_fbo(v3d);
999
1000        /* If vertex texturing depends on the output of rendering, we need to
1001         * ensure that that rendering is complete before we run a coordinate
1002         * shader that depends on it.
1003         *
1004         * Given that doing that is unusual, for now we just block the binner
1005         * on the last submitted render, rather than tracking the last
1006         * rendering to each texture's BO.
1007         */
1008        if (v3d->tex[PIPE_SHADER_VERTEX].num_textures || (indirect && indirect->buffer)) {
1009                perf_debug("Blocking binner on last render "
1010                           "due to vertex texturing or indirect drawing.\n");
1011                job->submit.in_sync_bcl = v3d->out_sync;
1012        }
1013
1014        /* We also need to ensure that compute is complete when render depends
1015         * on resources written by it.
1016         */
1017        if (v3d->sync_on_last_compute_job) {
1018                job->submit.in_sync_bcl = v3d->out_sync;
1019                v3d->sync_on_last_compute_job = false;
1020        }
1021
1022        /* Mark SSBOs and images as being written.  We don't actually know
1023         * which ones are read vs written, so just assume the worst.
1024         */
1025        for (int s = 0; s < PIPE_SHADER_COMPUTE; s++) {
1026                u_foreach_bit(i, v3d->ssbo[s].enabled_mask) {
1027                        v3d_job_add_write_resource(job,
1028                                                   v3d->ssbo[s].sb[i].buffer);
1029                        job->tmu_dirty_rcl = true;
1030                }
1031
1032                u_foreach_bit(i, v3d->shaderimg[s].enabled_mask) {
1033                        v3d_job_add_write_resource(job,
1034                                                   v3d->shaderimg[s].si[i].base.resource);
1035                        job->tmu_dirty_rcl = true;
1036                }
1037        }
1038
1039        /* Get space to emit our draw call into the BCL, using a branch to
1040         * jump to a new BO if necessary.
1041         */
1042        v3d_cl_ensure_space_with_branch(&job->bcl, 256 /* XXX */);
1043
1044        if (v3d->prim_mode != info->mode) {
1045                v3d->prim_mode = info->mode;
1046                v3d->dirty |= V3D_DIRTY_PRIM_MODE;
1047        }
1048
1049        v3d_start_draw(v3d);
1050        v3d_update_compiled_shaders(v3d, info->mode);
1051        if (!v3d_check_compiled_shaders(v3d))
1052                return;
1053        v3d_update_job_ez(v3d, job);
1054
1055        /* If this job was writing to transform feedback buffers before this
1056         * draw and we are reading from them here, then we need to wait for TF
1057         * to complete before we emit this draw.
1058         *
1059         * Notice this check needs to happen before we emit state for the
1060         * current draw call, where we update job->tf_enabled, so we can ensure
1061         * that we only check TF writes for prior draws.
1062         */
1063        v3d_emit_wait_for_tf_if_needed(v3d, job);
1064
1065#if V3D_VERSION >= 41
1066        v3d41_emit_state(pctx);
1067#else
1068        v3d33_emit_state(pctx);
1069#endif
1070
1071        if (v3d->dirty & (V3D_DIRTY_VTXBUF |
1072                          V3D_DIRTY_VTXSTATE |
1073                          V3D_DIRTY_PRIM_MODE |
1074                          V3D_DIRTY_RASTERIZER |
1075                          V3D_DIRTY_COMPILED_CS |
1076                          V3D_DIRTY_COMPILED_VS |
1077                          V3D_DIRTY_COMPILED_GS_BIN |
1078                          V3D_DIRTY_COMPILED_GS |
1079                          V3D_DIRTY_COMPILED_FS |
1080                          v3d->prog.cs->uniform_dirty_bits |
1081                          v3d->prog.vs->uniform_dirty_bits |
1082                          (v3d->prog.gs_bin ?
1083                                    v3d->prog.gs_bin->uniform_dirty_bits : 0) |
1084                          (v3d->prog.gs ?
1085                                    v3d->prog.gs->uniform_dirty_bits : 0) |
1086                          v3d->prog.fs->uniform_dirty_bits)) {
1087                v3d_emit_gl_shader_state(v3d, info);
1088        }
1089
1090        v3d->dirty = 0;
1091
1092        /* The Base Vertex/Base Instance packet sets those values to nonzero
1093         * for the next draw call only.
1094         */
1095        if ((info->index_size && draws->index_bias) || info->start_instance) {
1096                cl_emit(&job->bcl, BASE_VERTEX_BASE_INSTANCE, base) {
1097                        base.base_instance = info->start_instance;
1098                        base.base_vertex = info->index_size ? draws->index_bias : 0;
1099                }
1100        }
1101
1102        uint32_t prim_tf_enable = 0;
1103#if V3D_VERSION < 40
1104        /* V3D 3.x: The HW only processes transform feedback on primitives
1105         * with the flag set.
1106         */
1107        if (v3d->streamout.num_targets)
1108                prim_tf_enable = (V3D_PRIM_POINTS_TF - V3D_PRIM_POINTS);
1109#endif
1110
1111        if (!v3d->prog.gs)
1112                v3d_update_primitives_generated_counter(v3d, info, &draws[0]);
1113
1114        uint32_t hw_prim_type = v3d_hw_prim_type(info->mode);
1115        if (info->index_size) {
1116                uint32_t index_size = info->index_size;
1117                uint32_t offset = draws[0].start * index_size;
1118                struct pipe_resource *prsc;
1119                if (info->has_user_indices) {
1120                        unsigned start_offset = draws[0].start * info->index_size;
1121                        prsc = NULL;
1122                        u_upload_data(v3d->uploader, start_offset,
1123                                      draws[0].count * info->index_size, 4,
1124                                      (char*)info->index.user + start_offset,
1125                                      &offset, &prsc);
1126                } else {
1127                        prsc = info->index.resource;
1128                }
1129                struct v3d_resource *rsc = v3d_resource(prsc);
1130
1131#if V3D_VERSION >= 40
1132                cl_emit(&job->bcl, INDEX_BUFFER_SETUP, ib) {
1133                        ib.address = cl_address(rsc->bo, 0);
1134                        ib.size = rsc->bo->size;
1135                }
1136#endif
1137
1138                if (indirect && indirect->buffer) {
1139                        cl_emit(&job->bcl, INDIRECT_INDEXED_INSTANCED_PRIM_LIST, prim) {
1140                                prim.index_type = ffs(info->index_size) - 1;
1141#if V3D_VERSION < 40
1142                                prim.address_of_indices_list =
1143                                        cl_address(rsc->bo, offset);
1144#endif /* V3D_VERSION < 40 */
1145                                prim.mode = hw_prim_type | prim_tf_enable;
1146                                prim.enable_primitive_restarts = info->primitive_restart;
1147
1148                                prim.number_of_draw_indirect_indexed_records = indirect->draw_count;
1149
1150                                prim.stride_in_multiples_of_4_bytes = indirect->stride >> 2;
1151                                prim.address = cl_address(v3d_resource(indirect->buffer)->bo,
1152                                                          indirect->offset);
1153                        }
1154                } else if (info->instance_count > 1) {
1155                        cl_emit(&job->bcl, INDEXED_INSTANCED_PRIM_LIST, prim) {
1156                                prim.index_type = ffs(info->index_size) - 1;
1157#if V3D_VERSION >= 40
1158                                prim.index_offset = offset;
1159#else /* V3D_VERSION < 40 */
1160                                prim.maximum_index = (1u << 31) - 1; /* XXX */
1161                                prim.address_of_indices_list =
1162                                        cl_address(rsc->bo, offset);
1163#endif /* V3D_VERSION < 40 */
1164                                prim.mode = hw_prim_type | prim_tf_enable;
1165                                prim.enable_primitive_restarts = info->primitive_restart;
1166
1167                                prim.number_of_instances = info->instance_count;
1168                                prim.instance_length = draws[0].count;
1169                        }
1170                } else {
1171                        cl_emit(&job->bcl, INDEXED_PRIM_LIST, prim) {
1172                                prim.index_type = ffs(info->index_size) - 1;
1173                                prim.length = draws[0].count;
1174#if V3D_VERSION >= 40
1175                                prim.index_offset = offset;
1176#else /* V3D_VERSION < 40 */
1177                                prim.maximum_index = (1u << 31) - 1; /* XXX */
1178                                prim.address_of_indices_list =
1179                                        cl_address(rsc->bo, offset);
1180#endif /* V3D_VERSION < 40 */
1181                                prim.mode = hw_prim_type | prim_tf_enable;
1182                                prim.enable_primitive_restarts = info->primitive_restart;
1183                        }
1184                }
1185
1186                if (info->has_user_indices)
1187                        pipe_resource_reference(&prsc, NULL);
1188        } else {
1189                if (indirect && indirect->buffer) {
1190                        cl_emit(&job->bcl, INDIRECT_VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
1191                                prim.mode = hw_prim_type | prim_tf_enable;
1192                                prim.number_of_draw_indirect_array_records = indirect->draw_count;
1193
1194                                prim.stride_in_multiples_of_4_bytes = indirect->stride >> 2;
1195                                prim.address = cl_address(v3d_resource(indirect->buffer)->bo,
1196                                                          indirect->offset);
1197                        }
1198                } else if (info->instance_count > 1) {
1199                        struct pipe_stream_output_target *so =
1200                                indirect && indirect->count_from_stream_output ?
1201                                        indirect->count_from_stream_output : NULL;
1202                        uint32_t vert_count = so ?
1203                                v3d_stream_output_target_get_vertex_count(so) :
1204                                draws[0].count;
1205                        cl_emit(&job->bcl, VERTEX_ARRAY_INSTANCED_PRIMS, prim) {
1206                                prim.mode = hw_prim_type | prim_tf_enable;
1207                                prim.index_of_first_vertex = draws[0].start;
1208                                prim.number_of_instances = info->instance_count;
1209                                prim.instance_length = vert_count;
1210                        }
1211                } else {
1212                        struct pipe_stream_output_target *so =
1213                                indirect && indirect->count_from_stream_output ?
1214                                        indirect->count_from_stream_output : NULL;
1215                        uint32_t vert_count = so ?
1216                                v3d_stream_output_target_get_vertex_count(so) :
1217                                draws[0].count;
1218                        cl_emit(&job->bcl, VERTEX_ARRAY_PRIMS, prim) {
1219                                prim.mode = hw_prim_type | prim_tf_enable;
1220                                prim.length = vert_count;
1221                                prim.index_of_first_vertex = draws[0].start;
1222                        }
1223                }
1224        }
1225
1226        /* A flush is required in between a TF draw and any following TF specs
1227         * packet, or the GPU may hang.  Just flush each time for now.
1228         */
1229        if (v3d->streamout.num_targets)
1230                cl_emit(&job->bcl, TRANSFORM_FEEDBACK_FLUSH_AND_COUNT, flush);
1231
1232        job->draw_calls_queued++;
1233        if (v3d->streamout.num_targets)
1234           job->tf_draw_calls_queued++;
1235
1236        /* Increment the TF offsets by how many verts we wrote.  XXX: This
1237         * needs some clamping to the buffer size.
1238         */
1239        for (int i = 0; i < v3d->streamout.num_targets; i++)
1240                v3d->streamout.offsets[i] += draws[0].count;
1241
1242        if (v3d->zsa && job->zsbuf && v3d->zsa->base.depth_enabled) {
1243                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
1244                v3d_job_add_bo(job, rsc->bo);
1245
1246                job->load |= PIPE_CLEAR_DEPTH & ~job->clear;
1247                if (v3d->zsa->base.depth_writemask)
1248                        job->store |= PIPE_CLEAR_DEPTH;
1249                rsc->initialized_buffers = PIPE_CLEAR_DEPTH;
1250        }
1251
1252        if (v3d->zsa && job->zsbuf && v3d->zsa->base.stencil[0].enabled) {
1253                struct v3d_resource *rsc = v3d_resource(job->zsbuf->texture);
1254                if (rsc->separate_stencil)
1255                        rsc = rsc->separate_stencil;
1256
1257                v3d_job_add_bo(job, rsc->bo);
1258
1259                job->load |= PIPE_CLEAR_STENCIL & ~job->clear;
1260                if (v3d->zsa->base.stencil[0].writemask ||
1261                    v3d->zsa->base.stencil[1].writemask) {
1262                        job->store |= PIPE_CLEAR_STENCIL;
1263                }
1264                rsc->initialized_buffers |= PIPE_CLEAR_STENCIL;
1265        }
1266
1267        for (int i = 0; i < job->nr_cbufs; i++) {
1268                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
1269                int blend_rt = v3d->blend->base.independent_blend_enable ? i : 0;
1270
1271                if (job->store & bit || !job->cbufs[i])
1272                        continue;
1273                struct v3d_resource *rsc = v3d_resource(job->cbufs[i]->texture);
1274
1275                job->load |= bit & ~job->clear;
1276                if (v3d->blend->base.rt[blend_rt].colormask)
1277                        job->store |= bit;
1278                v3d_job_add_bo(job, rsc->bo);
1279        }
1280
1281        if (job->referenced_size > 768 * 1024 * 1024) {
1282                perf_debug("Flushing job with %dkb to try to free up memory\n",
1283                        job->referenced_size / 1024);
1284                v3d_flush(pctx);
1285        }
1286
1287        if (unlikely(V3D_DEBUG & V3D_DEBUG_ALWAYS_FLUSH))
1288                v3d_flush(pctx);
1289}
1290
1291#if V3D_VERSION >= 41
/* Field layout of the submit.cfg[] words filled in by v3d_launch_grid(). */

/* cfg[0..2]: bit position of the workgroup count for that dimension. */
#define V3D_CSD_CFG012_WG_COUNT_SHIFT 16
/* NOTE(review): presumably the workgroup offset field of cfg[0..2]; it is
 * not referenced by the visible dispatch code -- confirm against the HW docs.
 */
#define V3D_CSD_CFG012_WG_OFFSET_SHIFT 0
/* Allow this dispatch to start while the last one is still running. */
#define V3D_CSD_CFG3_OVERLAP_WITH_PREV (1 << 26)
/* Maximum supergroup ID.  6 bits. */
#define V3D_CSD_CFG3_MAX_SG_ID_SHIFT 20
/* Batches per supergroup minus 1.  8 bits. */
#define V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT 12
/* Workgroups per supergroup, 0 means 16 */
#define V3D_CSD_CFG3_WGS_PER_SG_SHIFT 8
/* Workgroup size; the dispatch code masks the value to 8 bits. */
#define V3D_CSD_CFG3_WG_SIZE_SHIFT 0

/* cfg[5] flags, ORed into the shader code address.  The dispatch code always
 * sets PROPAGATE_NANS, sets SINGLE_SEG when the shader's prog_data has
 * single_seg set, and sets THREADING when the shader uses 4 threads.
 */
#define V3D_CSD_CFG5_PROPAGATE_NANS (1 << 2)
#define V3D_CSD_CFG5_SINGLE_SEG (1 << 1)
#define V3D_CSD_CFG5_THREADING (1 << 0)
1307
1308static void
1309v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
1310{
1311        struct v3d_context *v3d = v3d_context(pctx);
1312        struct v3d_screen *screen = v3d->screen;
1313
1314        v3d_predraw_check_stage_inputs(pctx, PIPE_SHADER_COMPUTE);
1315
1316        v3d_update_compiled_cs(v3d);
1317
1318        if (!v3d->prog.compute->resource) {
1319                static bool warned = false;
1320                if (!warned) {
1321                        fprintf(stderr,
1322                                "Compute shader failed to compile.  "
1323                                "Expect corruption.\n");
1324                        warned = true;
1325                }
1326                return;
1327        }
1328
1329        /* Some of the units of scale:
1330         *
1331         * - Batches of 16 work items (shader invocations) that will be queued
1332         *   to the run on a QPU at once.
1333         *
1334         * - Workgroups composed of work items based on the shader's layout
1335         *   declaration.
1336         *
1337         * - Supergroups of 1-16 workgroups.  There can only be 16 supergroups
1338         *   running at a time on the core, so we want to keep them large to
1339         *   keep the QPUs busy, but a whole supergroup will sync at a barrier
1340         *   so we want to keep them small if one is present.
1341         */
1342        struct drm_v3d_submit_csd submit = { 0 };
1343        struct v3d_job *job = v3d_job_create(v3d);
1344
1345        /* Set up the actual number of workgroups, synchronously mapping the
1346         * indirect buffer if necessary to get the dimensions.
1347         */
1348        if (info->indirect) {
1349                struct pipe_transfer *transfer;
1350                uint32_t *map = pipe_buffer_map_range(pctx, info->indirect,
1351                                                      info->indirect_offset,
1352                                                      3 * sizeof(uint32_t),
1353                                                      PIPE_MAP_READ,
1354                                                      &transfer);
1355                memcpy(v3d->compute_num_workgroups, map, 3 * sizeof(uint32_t));
1356                pipe_buffer_unmap(pctx, transfer);
1357
1358                if (v3d->compute_num_workgroups[0] == 0 ||
1359                    v3d->compute_num_workgroups[1] == 0 ||
1360                    v3d->compute_num_workgroups[2] == 0) {
1361                        /* Nothing to dispatch, so skip the draw (CSD can't
1362                         * handle 0 workgroups).
1363                         */
1364                        return;
1365                }
1366        } else {
1367                v3d->compute_num_workgroups[0] = info->grid[0];
1368                v3d->compute_num_workgroups[1] = info->grid[1];
1369                v3d->compute_num_workgroups[2] = info->grid[2];
1370        }
1371
1372        uint32_t num_wgs = 1;
1373        for (int i = 0; i < 3; i++) {
1374                num_wgs *= v3d->compute_num_workgroups[i];
1375                submit.cfg[i] |= (v3d->compute_num_workgroups[i] <<
1376                                  V3D_CSD_CFG012_WG_COUNT_SHIFT);
1377        }
1378
1379        uint32_t wg_size = info->block[0] * info->block[1] * info->block[2];
1380
1381        struct v3d_compute_prog_data *compute =
1382                v3d->prog.compute->prog_data.compute;
1383        uint32_t wgs_per_sg =
1384                v3d_csd_choose_workgroups_per_supergroup(
1385                        &v3d->screen->devinfo,
1386                        compute->has_subgroups,
1387                        compute->base.has_control_barrier,
1388                        compute->base.threads,
1389                        num_wgs, wg_size);
1390
1391        uint32_t batches_per_sg = DIV_ROUND_UP(wgs_per_sg * wg_size, 16);
1392        uint32_t whole_sgs = num_wgs / wgs_per_sg;
1393        uint32_t rem_wgs = num_wgs - whole_sgs * wgs_per_sg;
1394        uint32_t num_batches = batches_per_sg * whole_sgs +
1395                               DIV_ROUND_UP(rem_wgs * wg_size, 16);
1396
1397        submit.cfg[3] |= (wgs_per_sg & 0xf) << V3D_CSD_CFG3_WGS_PER_SG_SHIFT;
1398        submit.cfg[3] |=
1399                (batches_per_sg - 1) << V3D_CSD_CFG3_BATCHES_PER_SG_M1_SHIFT;
1400        submit.cfg[3] |= (wg_size & 0xff) << V3D_CSD_CFG3_WG_SIZE_SHIFT;
1401
1402
1403        /* Number of batches the dispatch will invoke (minus 1). */
1404        submit.cfg[4] = num_batches - 1;
1405
1406        /* Make sure we didn't accidentally underflow. */
1407        assert(submit.cfg[4] != ~0);
1408
1409        v3d_job_add_bo(job, v3d_resource(v3d->prog.compute->resource)->bo);
1410        submit.cfg[5] = (v3d_resource(v3d->prog.compute->resource)->bo->offset +
1411                         v3d->prog.compute->offset);
1412        submit.cfg[5] |= V3D_CSD_CFG5_PROPAGATE_NANS;
1413        if (v3d->prog.compute->prog_data.base->single_seg)
1414                submit.cfg[5] |= V3D_CSD_CFG5_SINGLE_SEG;
1415        if (v3d->prog.compute->prog_data.base->threads == 4)
1416                submit.cfg[5] |= V3D_CSD_CFG5_THREADING;
1417
1418        if (v3d->prog.compute->prog_data.compute->shared_size) {
1419                v3d->compute_shared_memory =
1420                        v3d_bo_alloc(v3d->screen,
1421                                     v3d->prog.compute->prog_data.compute->shared_size *
1422                                     wgs_per_sg,
1423                                     "shared_vars");
1424        }
1425
1426        struct v3d_cl_reloc uniforms = v3d_write_uniforms(v3d, job,
1427                                                          v3d->prog.compute,
1428                                                          PIPE_SHADER_COMPUTE);
1429        v3d_job_add_bo(job, uniforms.bo);
1430        submit.cfg[6] = uniforms.bo->offset + uniforms.offset;
1431
1432        /* Pull some job state that was stored in a SUBMIT_CL struct out to
1433         * our SUBMIT_CSD struct
1434         */
1435        submit.bo_handles = job->submit.bo_handles;
1436        submit.bo_handle_count = job->submit.bo_handle_count;
1437
1438        /* Serialize this in the rest of our command stream. */
1439        submit.in_sync = v3d->out_sync;
1440        submit.out_sync = v3d->out_sync;
1441
1442        if (v3d->active_perfmon) {
1443                assert(screen->has_perfmon);
1444                submit.perfmon_id = v3d->active_perfmon->kperfmon_id;
1445        }
1446
1447        v3d->last_perfmon = v3d->active_perfmon;
1448
1449        if (!(unlikely(V3D_DEBUG & V3D_DEBUG_NORAST))) {
1450                int ret = v3d_ioctl(screen->fd, DRM_IOCTL_V3D_SUBMIT_CSD,
1451                                    &submit);
1452                static bool warned = false;
1453                if (ret && !warned) {
1454                        fprintf(stderr, "CSD submit call returned %s.  "
1455                                "Expect corruption.\n", strerror(errno));
1456                        warned = true;
1457                } else if (!ret) {
1458                        if (v3d->active_perfmon)
1459                                v3d->active_perfmon->job_submitted = true;
1460                }
1461        }
1462
1463        v3d_job_free(v3d, job);
1464
1465        /* Mark SSBOs as being written.. we don't actually know which ones are
1466         * read vs written, so just assume the worst
1467         */
1468        u_foreach_bit(i, v3d->ssbo[PIPE_SHADER_COMPUTE].enabled_mask) {
1469                struct v3d_resource *rsc = v3d_resource(
1470                        v3d->ssbo[PIPE_SHADER_COMPUTE].sb[i].buffer);
1471                rsc->writes++;
1472                rsc->compute_written = true;
1473        }
1474
1475        u_foreach_bit(i, v3d->shaderimg[PIPE_SHADER_COMPUTE].enabled_mask) {
1476                struct v3d_resource *rsc = v3d_resource(
1477                        v3d->shaderimg[PIPE_SHADER_COMPUTE].si[i].base.resource);
1478                rsc->writes++;
1479                rsc->compute_written = true;
1480        }
1481
1482        v3d_bo_unreference(&uniforms.bo);
1483        v3d_bo_unreference(&v3d->compute_shared_memory);
1484}
1485#endif
1486
1487/**
1488 * Implements gallium's clear() hook (glClear()) by drawing a pair of triangles.
1489 */
1490static void
1491v3d_draw_clear(struct v3d_context *v3d,
1492               unsigned buffers,
1493               const union pipe_color_union *color,
1494               double depth, unsigned stencil)
1495{
1496        static const union pipe_color_union dummy_color = {};
1497
1498        /* The blitter util dereferences the color regardless, even though the
1499         * gallium clear API may not pass one in when only Z/S are cleared.
1500         */
1501        if (!color)
1502                color = &dummy_color;
1503
1504        v3d_blitter_save(v3d);
1505        util_blitter_clear(v3d->blitter,
1506                           v3d->framebuffer.width,
1507                           v3d->framebuffer.height,
1508                           util_framebuffer_get_num_layers(&v3d->framebuffer),
1509                           buffers, color, depth, stencil,
1510                           util_framebuffer_get_num_samples(&v3d->framebuffer) > 1);
1511}
1512
1513/**
1514 * Attempts to perform the GL clear by using the TLB's fast clear at the start
1515 * of the frame.
1516 */
1517static unsigned
1518v3d_tlb_clear(struct v3d_job *job, unsigned buffers,
1519              const union pipe_color_union *color,
1520              double depth, unsigned stencil)
1521{
1522        struct v3d_context *v3d = job->v3d;
1523
1524        if (job->draw_calls_queued) {
1525                /* If anything in the CL has drawn using the buffer, then the
1526                 * TLB clear we're trying to add now would happen before that
1527                 * drawing.
1528                 */
1529                buffers &= ~(job->load | job->store);
1530        }
1531
1532        /* GFXH-1461: If we were to emit a load of just depth or just stencil,
1533         * then the clear for the other may get lost.  We need to decide now
1534         * if it would be possible to need to emit a load of just one after
1535         * we've set up our TLB clears.
1536         */
1537        if (buffers & PIPE_CLEAR_DEPTHSTENCIL &&
1538            (buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL &&
1539            job->zsbuf &&
1540            util_format_is_depth_and_stencil(job->zsbuf->texture->format)) {
1541                buffers &= ~PIPE_CLEAR_DEPTHSTENCIL;
1542        }
1543
1544        for (int i = 0; i < job->nr_cbufs; i++) {
1545                uint32_t bit = PIPE_CLEAR_COLOR0 << i;
1546                if (!(buffers & bit))
1547                        continue;
1548
1549                struct pipe_surface *psurf = v3d->framebuffer.cbufs[i];
1550                struct v3d_surface *surf = v3d_surface(psurf);
1551                struct v3d_resource *rsc = v3d_resource(psurf->texture);
1552
1553                union util_color uc;
1554                uint32_t internal_size = 4 << surf->internal_bpp;
1555
1556                static union pipe_color_union swapped_color;
1557                if (v3d->swap_color_rb & (1 << i)) {
1558                        swapped_color.f[0] = color->f[2];
1559                        swapped_color.f[1] = color->f[1];
1560                        swapped_color.f[2] = color->f[0];
1561                        swapped_color.f[3] = color->f[3];
1562                        color = &swapped_color;
1563                }
1564
1565                switch (surf->internal_type) {
1566                case V3D_INTERNAL_TYPE_8:
1567                        util_pack_color(color->f, PIPE_FORMAT_R8G8B8A8_UNORM,
1568                                        &uc);
1569                        memcpy(job->clear_color[i], uc.ui, internal_size);
1570                        break;
1571                case V3D_INTERNAL_TYPE_8I:
1572                case V3D_INTERNAL_TYPE_8UI:
1573                        job->clear_color[i][0] = ((color->ui[0] & 0xff) |
1574                                                  (color->ui[1] & 0xff) << 8 |
1575                                                  (color->ui[2] & 0xff) << 16 |
1576                                                  (color->ui[3] & 0xff) << 24);
1577                        break;
1578                case V3D_INTERNAL_TYPE_16F:
1579                        util_pack_color(color->f, PIPE_FORMAT_R16G16B16A16_FLOAT,
1580                                        &uc);
1581                        memcpy(job->clear_color[i], uc.ui, internal_size);
1582                        break;
1583                case V3D_INTERNAL_TYPE_16I:
1584                case V3D_INTERNAL_TYPE_16UI:
1585                        job->clear_color[i][0] = ((color->ui[0] & 0xffff) |
1586                                                  color->ui[1] << 16);
1587                        job->clear_color[i][1] = ((color->ui[2] & 0xffff) |
1588                                                  color->ui[3] << 16);
1589                        break;
1590                case V3D_INTERNAL_TYPE_32F:
1591                case V3D_INTERNAL_TYPE_32I:
1592                case V3D_INTERNAL_TYPE_32UI:
1593                        memcpy(job->clear_color[i], color->ui, internal_size);
1594                        break;
1595                }
1596
1597                rsc->initialized_buffers |= bit;
1598        }
1599
1600        unsigned zsclear = buffers & PIPE_CLEAR_DEPTHSTENCIL;
1601        if (zsclear) {
1602                struct v3d_resource *rsc =
1603                        v3d_resource(v3d->framebuffer.zsbuf->texture);
1604
1605                if (zsclear & PIPE_CLEAR_DEPTH)
1606                        job->clear_z = depth;
1607                if (zsclear & PIPE_CLEAR_STENCIL)
1608                        job->clear_s = stencil;
1609
1610                rsc->initialized_buffers |= zsclear;
1611        }
1612
1613        job->draw_min_x = 0;
1614        job->draw_min_y = 0;
1615        job->draw_max_x = v3d->framebuffer.width;
1616        job->draw_max_y = v3d->framebuffer.height;
1617        job->clear |= buffers;
1618        job->store |= buffers;
1619        job->scissor.disabled = true;
1620
1621        v3d_start_draw(v3d);
1622
1623        return buffers;
1624}
1625
/**
 * clear() context hook: clears as many of the requested buffers as possible
 * through the TLB fast clear, then falls back to a draw-based clear for
 * whatever the TLB path did not take.
 *
 * The scissor_state argument is not consulted by this implementation.
 */
static void
v3d_clear(struct pipe_context *pctx, unsigned buffers,
          const struct pipe_scissor_state *scissor_state,
          const union pipe_color_union *color, double depth, unsigned stencil)
{
        struct v3d_context *v3d = v3d_context(pctx);
        struct v3d_job *job = v3d_get_job_for_fbo(v3d);

        unsigned tlb_cleared = v3d_tlb_clear(job, buffers, color,
                                             depth, stencil);
        unsigned remaining = buffers & ~tlb_cleared;

        /* Everything was handled by the TLB clear. */
        if (!remaining)
                return;

        v3d_draw_clear(v3d, remaining, color, depth, stencil);
}
1638
/**
 * Placeholder for the clear_render_target() context hook.  The operation is
 * not implemented: all arguments are ignored and a notice is written to
 * stderr.
 */
static void
v3d_clear_render_target(struct pipe_context *pctx, struct pipe_surface *ps,
                        const union pipe_color_union *color,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fputs("unimpl: clear RT\n", stderr);
}
1647
/**
 * Placeholder for the clear_depth_stencil() context hook.  The operation is
 * not implemented: all arguments are ignored and a notice is written to
 * stderr.
 */
static void
v3d_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *ps,
                        unsigned buffers, double depth, unsigned stencil,
                        unsigned x, unsigned y, unsigned w, unsigned h,
                        bool render_condition_enabled)
{
        fputs("unimpl: clear DS\n", stderr);
}
1656
1657void
1658v3dX(start_binning)(struct v3d_context *v3d, struct v3d_job *job)
1659{
1660        v3d_start_binning(v3d, job);
1661}
1662
1663void
1664v3dX(draw_init)(struct pipe_context *pctx)
1665{
1666        pctx->draw_vbo = v3d_draw_vbo;
1667        pctx->clear = v3d_clear;
1668        pctx->clear_render_target = v3d_clear_render_target;
1669        pctx->clear_depth_stencil = v3d_clear_depth_stencil;
1670#if V3D_VERSION >= 41
1671        if (v3d_context(pctx)->screen->has_csd)
1672                pctx->launch_grid = v3d_launch_grid;
1673#endif
1674}
1675