/*
 * Copyright © 2016-2017 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "broadcom/common/v3d_device_info.h"
#include "v3d_compiler.h"
#include "util/u_prim.h"
#include "compiler/nir/nir_schedule.h"
#include "compiler/nir/nir_builder.h"

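/**
 * Returns the number of sources read by the instruction.  Each QPU ALU
 * instruction slot packs both an add op and a mul op; when the add op is a
 * NOP, the instruction's sources belong to the mul op.
 */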
int
vir_get_nsrc(struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return 0;
        case V3D_QPU_INSTR_TYPE_ALU:
                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
                else
                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
        }

        return 0;
}

/**
 * Returns whether the instruction has any side effects that must be
 * preserved.
 */
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

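        /* These signals read from or advance hardware FIFOs and state
         * (e.g., ldtmu pops the TMU output FIFO, thrsw switches threads),
         * so they can never be eliminated.
         */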
        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.ldtlbu ||
            inst->qpu.sig.ldtlb ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        /* ldunifa works like ldunif: it reads an element and advances the
         * pointer, so each read has a side effect (we don't care about
         * ldunif because we reconstruct the uniform stream buffer after
         * compiling with the surviving uniforms), and allowing DCE to
         * remove one would break follow-up loads. We could fix this by
         * emitting a unifa for each ldunifa, but each unifa requires 3
         * delay slots before a ldunifa, so that would be quite expensive.
         */
        if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
                return true;

        return false;
}

bool
vir_is_raw_mov(struct qinst *inst)
{
        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
                return false;
        }

        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
                return false;
        }

        if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
            inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
                return false;
        }

        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
                return false;

        return true;
}

bool
vir_is_add(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
}

bool
vir_is_mul(struct qinst *inst)
{
        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
}

bool
vir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        if (inst->dst.file == QFILE_MAGIC)
                return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);

        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
                return true;
        }

        return false;
}

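/**
 * Returns whether the instruction clobbers accumulator r3: VPM reads land
 * there, and on V3D versions before 4.1 several load signals implicitly
 * write their result to it.
 */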
bool
vir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        for (int i = 0; i < vir_get_nsrc(inst); i++) {
                switch (inst->src[i].file) {
                case QFILE_VPM:
                        return true;
                default:
                        break;
                }
        }

        if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
                                  inst->qpu.sig.ldtlb ||
                                  inst->qpu.sig.ldtlbu ||
                                  inst->qpu.sig.ldvpm)) {
                return true;
        }

        return false;
}

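/**
 * Returns whether the instruction clobbers accumulator r4: the SFU
 * operations (recip, rsqrt, exp, log, sin) return their results there,
 * and on V3D versions before 4.1 so does the ldtmu signal.
 */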
bool
vir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
{
        switch (inst->dst.file) {
        case QFILE_MAGIC:
                switch (inst->dst.index) {
                case V3D_QPU_WADDR_RECIP:
                case V3D_QPU_WADDR_RSQRT:
                case V3D_QPU_WADDR_EXP:
                case V3D_QPU_WADDR_LOG:
                case V3D_QPU_WADDR_SIN:
                        return true;
                }
                break;
        default:
                break;
        }

        if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
                return true;

        return false;
}

void
vir_set_unpack(struct qinst *inst, int src,
               enum v3d_qpu_input_unpack unpack)
{
        assert(src == 0 || src == 1);

        if (vir_is_add(inst)) {
                if (src == 0)
                        inst->qpu.alu.add.a_unpack = unpack;
                else
                        inst->qpu.alu.add.b_unpack = unpack;
        } else {
                assert(vir_is_mul(inst));
                if (src == 0)
                        inst->qpu.alu.mul.a_unpack = unpack;
                else
                        inst->qpu.alu.mul.b_unpack = unpack;
        }
}

void
vir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
{
        if (vir_is_add(inst)) {
                inst->qpu.alu.add.output_pack = pack;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.alu.mul.output_pack = pack;
        }
}

void
vir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
{
        if (vir_is_add(inst)) {
                inst->qpu.flags.ac = cond;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mc = cond;
        }
}

enum v3d_qpu_cond
vir_get_cond(struct qinst *inst)
{
        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);

        if (vir_is_add(inst))
                return inst->qpu.flags.ac;
        else if (vir_is_mul(inst))
                return inst->qpu.flags.mc;
        else /* NOP */
                return V3D_QPU_COND_NONE;
}

void
vir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.apf = pf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.mpf = pf;
        }
}

void
vir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
{
        c->flags_temp = -1;
        if (vir_is_add(inst)) {
                inst->qpu.flags.auf = uf;
        } else {
                assert(vir_is_mul(inst));
                inst->qpu.flags.muf = uf;
        }
}

#if 0
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif

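/**
 * Allocates a new temporary register, growing the defs and spillable
 * arrays as needed (doubling, with a minimum of 16 entries).  New temps
 * start out marked as spillable.
 */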
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}

struct qinst *
vir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.add.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.alu.mul.op = op;

        inst->dst = dst;
        inst->src[0] = src0;
        inst->src[1] = src1;
        inst->uniform = ~0;

        return inst;
}

struct qinst *
vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        inst->dst = vir_nop_reg();
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);

        return inst;
}

static void
vir_emit(struct v3d_compile *c, struct qinst *inst)
{
        switch (c->cursor.mode) {
        case vir_cursor_add:
                list_add(&inst->link, c->cursor.link);
                break;
        case vir_cursor_addtail:
                list_addtail(&inst->link, c->cursor.link);
                break;
        }

        c->cursor = vir_after_inst(inst);
        c->live_intervals_valid = false;
}

/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}

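/*
 * Emits an instruction that doesn't produce a unique def: if it writes a
 * temp, the temp's def entry is cleared, since the temp can no longer be
 * treated as having a single def (e.g., conditional writes).
 */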
struct qinst *
vir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
{
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = NULL;

        vir_emit(c, inst);

        return inst;
}

struct qblock *
vir_new_block(struct v3d_compile *c)
{
        struct qblock *block = rzalloc(c, struct qblock);

        list_inithead(&block->instructions);

        block->predecessors = _mesa_set_create(block,
                                               _mesa_hash_pointer,
                                               _mesa_key_pointer_equal);

        block->index = c->next_block_index++;

        return block;
}

void
vir_set_emit_block(struct v3d_compile *c, struct qblock *block)
{
        c->cur_block = block;
        c->cursor = vir_after_block(block);
        list_addtail(&block->link, &c->blocks);
}

struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}

struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}

void
vir_link_blocks(struct qblock *predecessor, struct qblock *successor)
{
        _mesa_set_add(successor->predecessors, predecessor);
        if (predecessor->successors[0]) {
                assert(!predecessor->successors[1]);
                predecessor->successors[1] = successor;
        } else {
                predecessor->successors[0] = successor;
        }
}

const struct v3d_compiler *
v3d_compiler_init(const struct v3d_device_info *devinfo)
{
        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
        if (!compiler)
                return NULL;

        compiler->devinfo = devinfo;

        if (!vir_init_reg_sets(compiler)) {
                ralloc_free(compiler);
                return NULL;
        }

        return compiler;
}

void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}

static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t max_threads,
                 uint32_t min_threads_for_reg_alloc,
                 bool tmu_spilling_allowed,
                 bool disable_loop_unrolling,
                 bool disable_constant_ubo_load_sorting,
                 bool disable_tmu_pipelining,
                 bool fallback_scheduler)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->threads = max_threads;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;
        c->compilation_result = V3D_COMPILATION_SUCCEEDED;
        c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
        c->tmu_spilling_allowed = tmu_spilling_allowed;
        c->fallback_scheduler = fallback_scheduler;
        c->disable_tmu_pipelining = disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
        c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
                ? true : disable_loop_unrolling;

        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        c->output_position_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
        c->flags_temp = -1;

        return c;
}

static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
        return glsl_count_attribute_slots(type, false);
}

static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
        for (int i = 0; i < c->key->num_tex_used; i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
        }

        assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
        for (int i = 0; i < c->key->num_samplers_used; i++) {
                if (c->key->sampler[i].return_size == 16) {
                        tex_options.lower_tex_packing[i] =
                                nir_lower_tex_packing_16;
                }
        }

        /* CS textures may not have return_size reflecting the shadow state. */
        nir_foreach_uniform_variable(var, c->s) {
                const struct glsl_type *type = glsl_without_array(var->type);
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);

                if (!glsl_type_is_sampler(type) ||
                    !glsl_sampler_type_is_shadow(type))
                        continue;

                for (int i = 0; i < array_len; i++) {
                        tex_options.lower_tex_packing[var->data.binding + i] =
                                nir_lower_tex_packing_16;
                }
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
        NIR_PASS_V(c->s, nir_lower_system_values);
        NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);

        NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
                   nir_var_function_temp,
                   0,
                   glsl_get_natural_size_align_bytes);
        NIR_PASS_V(c->s, v3d_nir_lower_scratch);
}

static void
v3d_set_prog_data_uniforms(struct v3d_compile *c,
                           struct v3d_prog_data *prog_data)
{
        int count = c->num_uniforms;
        struct v3d_uniform_list *ulist = &prog_data->uniforms;

        ulist->count = count;
        ulist->data = ralloc_array(prog_data, uint32_t, count);
        memcpy(ulist->data, c->uniform_data,
               count * sizeof(*ulist->data));
        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
        memcpy(ulist->contents, c->uniform_contents,
               count * sizeof(*ulist->contents));
}

static void
v3d_vs_set_prog_data(struct v3d_compile *c,
                     struct v3d_vs_prog_data *prog_data)
{
        /* The vertex data gets format converted by the VPM so that
         * each attribute channel takes up a VPM column.  Precompute
         * the sizes for the shader record.
         */
        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
                prog_data->vpm_input_size += c->vattr_sizes[i];
        }

        memset(prog_data->driver_location_map, -1,
               sizeof(prog_data->driver_location_map));

        nir_foreach_shader_in_variable(var, c->s) {
                prog_data->driver_location_map[var->data.location] =
                        var->data.driver_location;
        }

        prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);

        prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
                                           SYSTEM_VALUE_BASE_INSTANCE);

        prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_ID) ||
                              BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_INSTANCE_INDEX);

        if (prog_data->uses_vid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_biid)
                prog_data->vpm_input_size++;
        if (prog_data->uses_iid)
                prog_data->vpm_input_size++;

        /* Input/output segment sizes are in sectors (8 rows of 32 bits per
         * channel).
         */
        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Set us up for shared input/output segments.  This is apparently
         * necessary for our VCM setup to avoid varying corruption.
         */
        prog_data->separate_segments = false;
        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
                                          prog_data->vpm_input_size);
        prog_data->vpm_input_size = 0;

        /* Compute VCM cache size.  We set up our program to take up less than
         * half of the VPM, so that any set of bin and render programs won't
         * run out of space.  We need space for at least one input segment,
         * and then allocate the rest to output segments (one for the current
         * program, the rest to VCM).  The valid range of the VCM cache size
         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
         * batches.
         */
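        /* Worked example with illustrative numbers: V3D_CHANNELS = 16, so a
         * sector is 16 lanes * 4 bytes * 8 rows = 512 bytes.  Assuming a
         * 16KB VPM, that is 32 sectors, so half_vpm = 16.  With the shared
         * segment setup above (vpm_input_size = 0) and a 2-sector output
         * segment, we get 16 / 2 = 8 output batches and a VCM cache size of
         * CLAMP(8 - 1, 2, 4) = 4.
         */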
        assert(c->devinfo->vpm_size);
        int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
        int half_vpm = vpm_size_in_sectors / 2;
        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
        assert(vpm_output_batches >= 2);
        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
}

static void
v3d_gs_set_prog_data(struct v3d_compile *c,
                     struct v3d_gs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
         * VPM output header automatically and the shader will overwrite
         * it after reading it if necessary, so it doesn't add to the VPM
         * size requirements.
         */
        prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
                                          SYSTEM_VALUE_PRIMITIVE_ID);

        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;

        /* Compute SIMD dispatch width and update VPM output size accordingly
         * to ensure we can fit our program in memory. Available widths are
         * 16, 8, 4, 1.
         *
         * Notice that at draw time we will have to consider VPM memory
         * requirements from other stages and choose a smaller dispatch
         * width if needed to fit the program in VPM memory.
         */
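        /* For example, a 40-sector output segment dispatches 4-wide:
         * 16-wide (40 > 16) halves to 8-wide with 20 sectors (still > 16),
         * which halves to 4-wide with 10 sectors.
         */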
        prog_data->simd_width = 16;
        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
               prog_data->simd_width == 2) {
                prog_data->simd_width >>= 1;
                prog_data->vpm_output_size =
                        align(prog_data->vpm_output_size, 2) / 2;
        }
        assert(prog_data->vpm_output_size <= 16);
        assert(prog_data->simd_width != 2);

        prog_data->out_prim_type = c->s->info.gs.output_primitive;
        prog_data->num_invocations = c->s->info.gs.invocations;

        prog_data->writes_psiz =
            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
}

static void
v3d_set_fs_prog_data_inputs(struct v3d_compile *c,
                            struct v3d_fs_prog_data *prog_data)
{
        prog_data->num_inputs = c->num_inputs;
        memcpy(prog_data->input_slots, c->input_slots,
               c->num_inputs * sizeof(*c->input_slots));

        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
                      (V3D_MAX_FS_INPUTS - 1) / 24);
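        /* The flags are packed 24 inputs per word: input i lands in bit
         * (i % 24) of word (i / 24).
         */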
        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
                if (BITSET_TEST(c->flat_shade_flags, i))
                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->noperspective_flags, i))
                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);

                if (BITSET_TEST(c->centroid_flags, i))
                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
        }
}

static void
v3d_fs_set_prog_data(struct v3d_compile *c,
                     struct v3d_fs_prog_data *prog_data)
{
        v3d_set_fs_prog_data_inputs(c, prog_data);
        prog_data->writes_z = c->writes_z;
        prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
        prog_data->uses_center_w = c->uses_center_w;
        prog_data->uses_implicit_point_line_varyings =
                c->uses_implicit_point_line_varyings;
        prog_data->lock_scoreboard_on_first_thrsw =
                c->lock_scoreboard_on_first_thrsw;
        prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
        prog_data->uses_pid = c->fs_uses_primitive_id;
}

static void
v3d_cs_set_prog_data(struct v3d_compile *c,
                     struct v3d_compute_prog_data *prog_data)
{
        prog_data->shared_size = c->s->info.shared_size;

        prog_data->local_size[0] = c->s->info.workgroup_size[0];
        prog_data->local_size[1] = c->s->info.workgroup_size[1];
        prog_data->local_size[2] = c->s->info.workgroup_size[2];

        prog_data->has_subgroups = c->has_subgroups;
}

static void
v3d_set_prog_data(struct v3d_compile *c,
                  struct v3d_prog_data *prog_data)
{
        prog_data->threads = c->threads;
        prog_data->single_seg = !c->last_thrsw;
        prog_data->spill_size = c->spill_size;
        prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
        prog_data->has_control_barrier = c->s->info.uses_control_barrier;

        v3d_set_prog_data_uniforms(c, prog_data);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
                break;
        case MESA_SHADER_COMPUTE:
                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
                break;
        default:
                unreachable("unsupported shader stage");
        }
}

static uint64_t *
v3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
{
        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);

        uint64_t *qpu_insts = malloc(*final_assembly_size);
        if (!qpu_insts)
                return NULL;

        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);

        vir_compile_destroy(c);

        return qpu_insts;
}

static void
v3d_nir_lower_vs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->vs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains and do a constant folding pass
         * on the code it generated.
         */
        NIR_PASS_V(c->s, nir_opt_dce);
        NIR_PASS_V(c->s, nir_opt_constant_folding);
}

static void
v3d_nir_lower_gs_early(struct v3d_compile *c)
{
        /* Split our I/O vars and dead code eliminate the unused
         * components.
         */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
                   nir_var_shader_in | nir_var_shader_out);
        uint64_t used_outputs[4] = {0};
        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
                used_outputs[comp] |= 1ull << slot;
        }
        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
        v3d_optimize_nir(c, c->s);
        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);

        /* This must go before nir_lower_io */
        if (c->gs_key->per_vertex_point_size)
                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);

        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
                   type_size_vec4,
                   (nir_lower_io_options)0);
        /* clean up nir_lower_io's deref_var remains */
        NIR_PASS_V(c->s, nir_opt_dce);
}

static void
v3d_fixup_fs_output_types(struct v3d_compile *c)
{
        nir_foreach_shader_out_variable(var, c->s) {
                uint32_t mask = 0;

                switch (var->data.location) {
                case FRAG_RESULT_COLOR:
                        mask = ~0;
                        break;
                case FRAG_RESULT_DATA0:
                case FRAG_RESULT_DATA1:
                case FRAG_RESULT_DATA2:
                case FRAG_RESULT_DATA3:
                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
                        break;
                }

                if (c->fs_key->int_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_INT,
                                                 glsl_get_components(var->type));
                } else if (c->fs_key->uint_color_rb & mask) {
                        var->type =
                                glsl_vector_type(GLSL_TYPE_UINT,
                                                 glsl_get_components(var->type));
                }
        }
}

static void
v3d_nir_lower_fs_early(struct v3d_compile *c)
{
        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
                v3d_fixup_fs_output_types(c);

        NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);

        if (c->fs_key->line_smoothing) {
                v3d_nir_lower_line_smooth(c->s);
                NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
                /* The lowering pass can introduce new sysval reads */
                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
        }
}

static void
v3d_nir_lower_gs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
                           false, NULL);
        }

        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_vs_late(struct v3d_compile *c)
{
        if (c->key->ucp_enables) {
                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
                           false, false, NULL);
                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
                           nir_var_shader_out);
        }

        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
}

static void
v3d_nir_lower_fs_late(struct v3d_compile *c)
{
        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
         * Vulkan allows it, in which case the SPIR-V compiler will declare
         * VARYING_SLOT_CLIP_DIST0 as a compact array variable. Pass true as
         * the last parameter to always operate with a compact array in both
         * OpenGL and Vulkan so we don't have to care about which API we
         * are using.
         */
        if (c->key->ucp_enables)
                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);

        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
}

static uint32_t
vir_get_max_temps(struct v3d_compile *c)
{
        int max_ip = 0;
        vir_for_each_inst_inorder(inst, c)
                max_ip++;

        uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);

        for (int t = 0; t < c->num_temps; t++) {
                for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
                                                i < max_ip); i++) {
                        if (i > max_ip)
                                break;
                        pressure[i]++;
                }
        }

        uint32_t max_temps = 0;
        for (int i = 0; i < max_ip; i++)
                max_temps = MAX2(max_temps, pressure[i]);

        ralloc_free(pressure);

        return max_temps;
}

enum v3d_dependency_class {
        V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
};

static bool
v3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
                            nir_schedule_dependency *dep,
                            void *user_data)
{
        struct v3d_compile *c = user_data;

        switch (intr->intrinsic) {
        case nir_intrinsic_store_output:
                /* Writing to location 0 overwrites the value passed in for
                 * gl_PrimitiveID on geometry shaders
                 */
                if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
                    nir_intrinsic_base(intr) != 0)
                        break;

                nir_const_value *const_value =
                        nir_src_as_const_value(intr->src[1]);

                if (const_value == NULL)
                        break;

                uint64_t offset =
                        nir_const_value_as_uint(*const_value,
                                                nir_src_bit_size(intr->src[1]));
                if (offset != 0)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
                return true;

        case nir_intrinsic_load_primitive_id:
                if (c->s->info.stage != MESA_SHADER_GEOMETRY)
                        break;

                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
                dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
                return true;

        default:
                break;
        }

        return false;
}

static bool
should_split_wrmask(const nir_instr *instr, const void *data)
{
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
        switch (intr->intrinsic) {
        case nir_intrinsic_store_ssbo:
        case nir_intrinsic_store_shared:
        case nir_intrinsic_store_global:
        case nir_intrinsic_store_scratch:
                return true;
        default:
                return false;
        }
}

static nir_intrinsic_instr *
nir_instr_as_constant_ubo_load(nir_instr *inst)
{
        if (inst->type != nir_instr_type_intrinsic)
                return NULL;

        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
        if (intr->intrinsic != nir_intrinsic_load_ubo)
                return NULL;

        assert(nir_src_is_const(intr->src[0]));
        if (!nir_src_is_const(intr->src[1]))
                return NULL;

        return intr;
}

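/*
 * Reorders constant UBO loads for the same UBO index as a reference load
 * so they end up sorted by offset around it, letting the backend skip
 * redundant unifa writes; e.g., constant loads at offsets 16, 0 and 8
 * would be rearranged into 0, 8, 16.
 */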
static bool
v3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
{
        bool progress = false;

        nir_instr *ref_inst = &ref->instr;
        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
        uint32_t ref_index = nir_src_as_uint(ref->src[0]);

        /* Go through all instructions after ref searching for constant UBO
         * loads for the same UBO index.
         */
        bool seq_break = false;
        nir_instr *inst = &ref->instr;
        nir_instr *next_inst = NULL;
        while (true) {
                inst = next_inst ? next_inst : nir_instr_next(inst);
                if (!inst)
                        break;

                next_inst = NULL;

                if (inst->type != nir_instr_type_intrinsic)
                        continue;

                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
                if (intr->intrinsic != nir_intrinsic_load_ubo)
                        continue;

                /* We only produce unifa sequences for non-divergent loads */
                if (nir_src_is_divergent(intr->src[1]))
                        continue;

                /* If any UBO load between the reference load and another
                 * constant load for the same index is non-constant or uses
                 * a different UBO index, it would break the unifa sequence.
                 * We flag that so we can then move all constant UBO loads
                 * for the reference index before it, not just the
                 * out-of-order ones, avoiding the break and reducing unifa
                 * writes.
                 */
                if (!nir_src_is_const(intr->src[1])) {
                        seq_break = true;
                        continue;
                }
                uint32_t offset = nir_src_as_uint(intr->src[1]);

                assert(nir_src_is_const(intr->src[0]));
                uint32_t index = nir_src_as_uint(intr->src[0]);
                if (index != ref_index) {
                        seq_break = true;
                        continue;
                }

                /* Only move loads with an offset that is close enough to the
                 * reference offset, since otherwise we would not be able to
                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
                 */
                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
                        continue;

                /* We will move this load if its offset is smaller than ref's
                 * (in which case we will move it before ref) or if the offset
                 * is larger than ref's but there are sequence breakers in
                 * between (in which case we will move it after ref and
                 * before the sequence breakers).
                 */
                if (!seq_break && offset >= ref_offset)
                        continue;

                /* Find where exactly we want to move this load:
                 *
                 * If we are moving it before ref, we want to check any other
                 * UBO loads we placed before ref and make sure we insert this
                 * one properly ordered with them. Likewise, if we are moving
                 * it after ref.
                 */
                nir_instr *pos = ref_inst;
                nir_instr *tmp = pos;
                do {
                        if (offset < ref_offset)
                                tmp = nir_instr_prev(tmp);
                        else
                                tmp = nir_instr_next(tmp);

                        if (!tmp || tmp == inst)
                                break;

                        /* Ignore non-unifa UBO loads */
                        if (tmp->type != nir_instr_type_intrinsic)
                                continue;

                        nir_intrinsic_instr *tmp_intr =
                                nir_instr_as_intrinsic(tmp);
                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
                                continue;

                        if (nir_src_is_divergent(tmp_intr->src[1]))
                                continue;

                        /* Stop if we find a unifa UBO load that breaks the
                         * sequence.
                         */
                        if (!nir_src_is_const(tmp_intr->src[1]))
                                break;

                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
                                break;

                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
                        if (offset < ref_offset) {
                                if (tmp_offset < offset ||
                                    tmp_offset >= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        } else {
                                if (tmp_offset > offset ||
                                    tmp_offset <= ref_offset) {
                                        break;
                                } else {
                                        pos = tmp;
                                }
                        }
                } while (true);

                /* We can't move the UBO load before the instruction that
                 * defines its constant offset. If that instruction is placed
                 * in between the new location (pos) and the current location
                 * of this load, we will have to move that instruction too.
                 *
                 * We don't care about the UBO index definition because that
                 * is optimized to be reused by all UBO loads for the same
                 * index and therefore is certain to be defined before the
                 * first UBO load that uses it.
                 */
                nir_instr *offset_inst = NULL;
                tmp = inst;
                while ((tmp = nir_instr_prev(tmp)) != NULL) {
                        if (pos == tmp) {
                                /* We reached the target location without
                                 * finding the instruction that defines the
                                 * offset, so that instruction must be before
                                 * the new position and we don't have to fix it.
                                 */
                                break;
                        }
                        if (intr->src[1].ssa->parent_instr == tmp) {
                                offset_inst = tmp;
                                break;
                        }
                }

                if (offset_inst) {
                        exec_node_remove(&offset_inst->node);
                        exec_node_insert_node_before(&pos->node,
                                                     &offset_inst->node);
                }

                /* Since we are moving the instruction before its current
                 * location, grab its successor before the move so that
                 * we can continue the next iteration of the main loop from
                 * that instruction.
                 */
                next_inst = nir_instr_next(inst);

                /* Move this load to the selected location */
                exec_node_remove(&inst->node);
                if (offset < ref_offset)
                        exec_node_insert_node_before(&pos->node, &inst->node);
                else
                        exec_node_insert_after(&pos->node, &inst->node);

                progress = true;
        }

        return progress;
}

static bool
v3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
                                      nir_block *block)
{
        bool progress = false;
        bool local_progress;
        do {
                local_progress = false;
                nir_foreach_instr_safe(inst, block) {
                        nir_intrinsic_instr *intr =
                                nir_instr_as_constant_ubo_load(inst);
                        if (intr) {
                                local_progress |=
                                        v3d_nir_sort_constant_ubo_load(block, intr);
                        }
                }
                progress |= local_progress;
        } while (local_progress);

        return progress;
}

/**
 * Sorts constant UBO loads in each block by offset to maximize chances of
 * skipping unifa writes when converting to VIR. This can increase register
 * pressure.
 */
static bool
v3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
{
        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_foreach_block(block, function->impl) {
                                c->sorted_any_ubo_loads |=
                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
                        }
                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }
        return c->sorted_any_ubo_loads;
}

static void
lower_load_num_subgroups(struct v3d_compile *c,
                         nir_builder *b,
                         nir_intrinsic_instr *intr)
{
        assert(c->s->info.stage == MESA_SHADER_COMPUTE);
        assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);

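        /* The subgroup count is the workgroup size divided by the 16-lane
         * SIMD width (V3D_CHANNELS), rounded up; e.g., an 8x8x1 workgroup
         * yields DIV_ROUND_UP(64, 16) = 4 subgroups.
         */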
        b->cursor = nir_after_instr(&intr->instr);
        uint32_t num_subgroups =
                DIV_ROUND_UP(c->s->info.workgroup_size[0] *
                             c->s->info.workgroup_size[1] *
                             c->s->info.workgroup_size[2], V3D_CHANNELS);
        nir_ssa_def *result = nir_imm_int(b, num_subgroups);
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
        nir_instr_remove(&intr->instr);
}

static bool
lower_subgroup_intrinsics(struct v3d_compile *c,
                          nir_block *block, nir_builder *b)
{
        bool progress = false;
        nir_foreach_instr_safe(inst, block) {
                if (inst->type != nir_instr_type_intrinsic)
                        continue;

                nir_intrinsic_instr *intr =
                        nir_instr_as_intrinsic(inst);
                if (!intr)
                        continue;

                switch (intr->intrinsic) {
                case nir_intrinsic_load_num_subgroups:
                        lower_load_num_subgroups(c, b, intr);
                        progress = true;
                        FALLTHROUGH;
                case nir_intrinsic_load_subgroup_id:
                case nir_intrinsic_load_subgroup_size:
                case nir_intrinsic_load_subgroup_invocation:
                case nir_intrinsic_elect:
                        c->has_subgroups = true;
                        break;
                default:
                        break;
                }
        }

        return progress;
}

static bool
v3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
{
        bool progress = false;
        nir_foreach_function(function, s) {
                if (function->impl) {
                        nir_builder b;
                        nir_builder_init(&b, function->impl);

                        nir_foreach_block(block, function->impl)
                                progress |= lower_subgroup_intrinsics(c, block, &b);

                        nir_metadata_preserve(function->impl,
                                              nir_metadata_block_index |
                                              nir_metadata_dominance);
                }
        }
        return progress;
}

static void
v3d_attempt_compile(struct v3d_compile *c)
{
        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                c->vs_key = (struct v3d_vs_key *) c->key;
                break;
        case MESA_SHADER_GEOMETRY:
                c->gs_key = (struct v3d_gs_key *) c->key;
                break;
        case MESA_SHADER_FRAGMENT:
                c->fs_key = (struct v3d_fs_key *) c->key;
                break;
        case MESA_SHADER_COMPUTE:
                break;
        default:
                unreachable("unsupported shader stage");
        }

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_early(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_early(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_early(c);
                break;
        default:
                break;
        }

        v3d_lower_nir(c);

        switch (c->s->info.stage) {
        case MESA_SHADER_VERTEX:
                v3d_nir_lower_vs_late(c);
                break;
        case MESA_SHADER_GEOMETRY:
                v3d_nir_lower_gs_late(c);
                break;
        case MESA_SHADER_FRAGMENT:
                v3d_nir_lower_fs_late(c);
                break;
        default:
                break;
        }

        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
        nir_lower_idiv_options idiv_options = {
                .imprecise_32bit_lowering = true,
                .allow_fp16 = true,
        };
        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);

        if (c->key->robust_buffer_access) {
           /* v3d_nir_lower_robust_buffer_access assumes constant buffer
            * indices on ubo/ssbo intrinsics, so run the copy propagation
            * and constant folding passes first to guarantee this. We also
            * want to run the lowering before v3d_optimize so that it can
            * clean up the redundant get_buffer_size calls the pass produces.
            */
           NIR_PASS_V(c->s, nir_copy_prop);
           NIR_PASS_V(c->s, nir_opt_constant_folding);
           NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
        }

        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);

        NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);

        v3d_optimize_nir(c, c->s);

        /* Do late algebraic optimization to turn add(a, neg(b)) back into
         * subs, then the mandatory cleanup after algebraic.  Note that it may
         * produce fnegs, and if so then we need to keep running to squash
         * fneg(fneg(a)).
         */
        bool more_late_algebraic = true;
        while (more_late_algebraic) {
                more_late_algebraic = false;
                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
                NIR_PASS_V(c->s, nir_opt_constant_folding);
                NIR_PASS_V(c->s, nir_copy_prop);
                NIR_PASS_V(c->s, nir_opt_dce);
                NIR_PASS_V(c->s, nir_opt_cse);
        }

        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
        nir_convert_to_lcssa(c->s, true, true);
        NIR_PASS_V(c->s, nir_divergence_analysis);
        NIR_PASS_V(c->s, nir_convert_from_ssa, true);

        struct nir_schedule_options schedule_options = {
                /* Schedule for about half our register space, to enable more
                 * shaders to hit 4 threads.
                 */
                .threshold = 24,

                /* Vertex shaders share the same memory for inputs and
                 * outputs; fragment and geometry shaders do not.
                 */
                .stages_with_shared_io_memory =
                (((1 << MESA_ALL_SHADER_STAGES) - 1) &
                 ~((1 << MESA_SHADER_FRAGMENT) |
                   (1 << MESA_SHADER_GEOMETRY))),

                .fallback = c->fallback_scheduler,

                .intrinsic_cb = v3d_intrinsic_dependency_cb,
                .intrinsic_cb_data = c,
        };
        NIR_PASS_V(c->s, nir_schedule, &schedule_options);

        if (!c->disable_constant_ubo_load_sorting)
                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);

        v3d_nir_to_vir(c);
}

uint32_t
v3d_prog_data_size(gl_shader_stage stage)
{
        static const int prog_data_size[] = {
                [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
                [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
                [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
                [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
        };

        assert(stage >= 0 &&
               stage < ARRAY_SIZE(prog_data_size) &&
               prog_data_size[stage]);

        return prog_data_size[stage];
}
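
/* Callers allocate the stage-specific prog data through this helper and
 * then cast to the right type, as v3d_compile() does below:
 *
 *    struct v3d_prog_data *prog_data =
 *            rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
 */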

int
v3d_shaderdb_dump(struct v3d_compile *c,
                  char **shaderdb_str)
{
        if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
                return -1;

        return asprintf(shaderdb_str,
                        "%s shader: %d inst, %d threads, %d loops, "
                        "%d uniforms, %d max-temps, %d:%d spills:fills, "
                        "%d sfu-stalls, %d inst-and-stalls, %d nops",
                        vir_get_stage_name(c),
                        c->qpu_inst_count,
                        c->threads,
                        c->loops,
                        c->num_uniforms,
                        vir_get_max_temps(c),
                        c->spills,
                        c->fills,
                        c->qpu_inst_stalled_count,
                        c->qpu_inst_count + c->qpu_inst_stalled_count,
                        c->nop_count);
}
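
/* With the format string above, a reported line looks like the following
 * (the numbers here are illustrative only):
 *
 *    MESA_SHADER_FRAGMENT shader: 58 inst, 4 threads, 0 loops, 12 uniforms,
 *    10 max-temps, 0:0 spills:fills, 1 sfu-stalls, 59 inst-and-stalls, 3 nops
 */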

/* This is a list of incremental changes to the compilation strategy
 * that will be used to try to compile the shader successfully. The
 * default strategy enables all optimizations, which gives the highest
 * register pressure but is expected to produce the best code. Subsequent
 * strategies incrementally disable specific optimizations that are known
 * to contribute to register pressure, so that we can compile the shader
 * successfully while still meeting thread count requirements.
 *
 * V3D 4.1+ has a minimum thread count of 2, but we can use 1 here to also
 * cover previous hardware (meaning that we are not limiting register
 * allocation to any particular thread count). This is fine because
 * v3d_nir_to_vir will cap this to the actual minimum.
 */
struct v3d_compiler_strategy {
        const char *name;
        uint32_t max_threads;
        uint32_t min_threads;
        bool disable_loop_unrolling;
        bool disable_ubo_load_sorting;
        bool disable_tmu_pipelining;
        bool tmu_spilling_allowed;
} static const strategies[] = {
  /*0*/ { "default",                        4, 4, false, false, false, false },
  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
  /*4*/ { "lower thread count",             2, 1, false, false, false, false },
  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, false },
  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, false },
  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true  },
  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true  }
};

/**
 * If a particular optimization didn't make any progress during a compile
 * attempt, disabling it alone won't allow us to compile the shader
 * successfully, since we'll end up with the same code. Detect these
 * scenarios so we can avoid wasting time with useless compiles. We should
 * also consider whether the strategy changes other aspects of the
 * compilation process (like spilling), and not skip it in that case.
 */
static bool
skip_compile_strategy(struct v3d_compile *c, uint32_t idx)
{
   /* We decide if we can skip a strategy based on the optimizations that
    * were active in the previous strategy, so we should only be calling this
    * for strategies after the first.
    */
   assert(idx > 0);

   /* Don't skip a strategy that changes spilling behavior */
   if (strategies[idx].tmu_spilling_allowed !=
       strategies[idx - 1].tmu_spilling_allowed) {
           return false;
   }

   switch (idx) {
   /* Loop unrolling: skip if we didn't unroll any loops */
   case 1:
   case 5:
           return !c->unrolled_any_loops;
   /* UBO load sorting: skip if we didn't sort any loads */
   case 2:
   case 6:
           return !c->sorted_any_ubo_loads;
   /* TMU pipelining: skip if we didn't pipeline any TMU ops */
   case 3:
   case 7:
           return !c->pipelined_any_tmu;
   /* Lower thread count: skip if we already tried fewer than 4 threads */
   case 4:
           return c->threads < 4;
   default:
           return false;
   }
}

uint64_t *
v3d_compile(const struct v3d_compiler *compiler,
            struct v3d_key *key,
            struct v3d_prog_data **out_prog_data,
            nir_shader *s,
            void (*debug_output)(const char *msg,
                                 void *debug_output_data),
            void *debug_output_data,
            int program_id, int variant_id,
            uint32_t *final_assembly_size)
{
        struct v3d_compile *c = NULL;
        for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
                /* Fallback strategy */
                if (i > 0) {
                        assert(c);
                        if (skip_compile_strategy(c, i))
                                continue;

                        char *debug_msg;
                        int ret = asprintf(&debug_msg,
                                           "Falling back to strategy '%s' for %s",
                                           strategies[i].name,
                                           vir_get_stage_name(c));

                        if (ret >= 0) {
                                if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
                                        fprintf(stderr, "%s\n", debug_msg);

                                c->debug_output(debug_msg, c->debug_output_data);
                                free(debug_msg);
                        }

                        vir_compile_destroy(c);
                }

                c = vir_compile_init(compiler, key, s,
                                     debug_output, debug_output_data,
                                     program_id, variant_id,
                                     strategies[i].max_threads,
                                     strategies[i].min_threads,
                                     strategies[i].tmu_spilling_allowed,
                                     strategies[i].disable_loop_unrolling,
                                     strategies[i].disable_ubo_load_sorting,
                                     strategies[i].disable_tmu_pipelining,
                                     i == ARRAY_SIZE(strategies) - 1);

                v3d_attempt_compile(c);

                if (i >= ARRAY_SIZE(strategies) - 1 ||
                    c->compilation_result !=
                    V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
                        break;
                }
        }

        if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
            c->compilation_result !=
            V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
            c->spills > 0) {
                char *debug_msg;
                int ret = asprintf(&debug_msg,
                                   "Compiled %s with %d spills and %d fills",
                                   vir_get_stage_name(c),
                                   c->spills, c->fills);

                /* Only use the message if asprintf succeeded: on failure
                 * the contents of debug_msg are undefined.
                 */
                if (ret >= 0) {
                        fprintf(stderr, "%s\n", debug_msg);
                        c->debug_output(debug_msg, c->debug_output_data);
                        free(debug_msg);
                }
        }

        if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
                fprintf(stderr, "Failed to compile %s with any strategy.\n",
                        vir_get_stage_name(c));
        }

        struct v3d_prog_data *prog_data;

        prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));

        v3d_set_prog_data(c, prog_data);

        *out_prog_data = prog_data;

        char *shaderdb;
        int ret = v3d_shaderdb_dump(c, &shaderdb);
        if (ret >= 0) {
                if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
                        fprintf(stderr, "SHADER-DB: %s\n", shaderdb);

                c->debug_output(shaderdb, c->debug_output_data);
                free(shaderdb);
        }

        return v3d_return_qpu_insts(c, final_assembly_size);
}

void
vir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
{
        if (qinst->dst.file == QFILE_TEMP)
                c->defs[qinst->dst.index] = NULL;

        assert(&qinst->link != c->cursor.link);

        list_del(&qinst->link);
        free(qinst);

        c->live_intervals_valid = false;
}

struct qreg
vir_follow_movs(struct v3d_compile *c, struct qreg reg)
{
        /* XXX
        int pack = reg.pack;

        while (reg.file == QFILE_TEMP &&
               c->defs[reg.index] &&
               (c->defs[reg.index]->op == QOP_MOV ||
                c->defs[reg.index]->op == QOP_FMOV) &&
               !c->defs[reg.index]->dst.pack &&
               !c->defs[reg.index]->src[0].pack) {
                reg = c->defs[reg.index]->src[0];
        }

        reg.pack = pack;
        */
        return reg;
}

void
vir_compile_destroy(struct v3d_compile *c)
{
        /* Defuse the assert that we aren't removing the cursor's instruction.
         */
        c->cursor.link = NULL;

        vir_for_each_block(block, c) {
                while (!list_is_empty(&block->instructions)) {
                        struct qinst *qinst =
                                list_first_entry(&block->instructions,
                                                 struct qinst, link);
                        vir_remove_instruction(c, qinst);
                }
        }

        ralloc_free(c);
}

uint32_t
vir_get_uniform_index(struct v3d_compile *c,
                      enum quniform_contents contents,
                      uint32_t data)
{
        for (int i = 0; i < c->num_uniforms; i++) {
                if (c->uniform_contents[i] == contents &&
                    c->uniform_data[i] == data) {
                        return i;
                }
        }

        uint32_t uniform = c->num_uniforms++;

        if (uniform >= c->uniform_array_size) {
                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
                                             c->uniform_array_size * 2);

                c->uniform_data = reralloc(c, c->uniform_data,
                                           uint32_t,
                                           c->uniform_array_size);
                c->uniform_contents = reralloc(c, c->uniform_contents,
                                               enum quniform_contents,
                                               c->uniform_array_size);
        }

        c->uniform_contents[uniform] = contents;
        c->uniform_data[uniform] = data;

        return uniform;
}

/* Looks back into the current block to find the ldunif that wrote the uniform
 * at the requested index. If it finds it, it returns true and writes the
 * destination register of the ldunif instruction to 'unif'.
 *
 * This can impact register pressure and end up leading to worse code, so we
 * limit the number of instructions we are willing to look back through to
 * strike a good balance.
 */
static bool
try_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
{
        uint32_t count = 20;
        struct qinst *prev_inst = NULL;
        assert(c->cur_block);

#ifdef DEBUG
        /* We can only reuse a uniform if it was emitted in the same block,
         * so callers must make sure the current instruction is being emitted
         * in the current block.
         */
        bool found = false;
        vir_for_each_inst(inst, c->cur_block) {
                if (&inst->link == c->cursor.link) {
                        found = true;
                        break;
                }
        }

        assert(found || &c->cur_block->instructions == c->cursor.link);
#endif

        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
                                     &c->cur_block->instructions, link) {
                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
                    inst->uniform == index) {
                        prev_inst = inst;
                        break;
                }

                if (--count == 0)
                        break;
        }

        if (!prev_inst)
                return false;

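        /* Only reuse the ldunif if its destination is not overwritten by
         * any later instruction in the block.
         */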
        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
                                 &c->cur_block->instructions, link) {
                if (inst->dst.file == prev_inst->dst.file &&
                    inst->dst.index == prev_inst->dst.index) {
                        return false;
                }
        }

        *unif = prev_inst->dst;
        return true;
}

struct qreg
vir_uniform(struct v3d_compile *c,
            enum quniform_contents contents,
            uint32_t data)
{
        const int num_uniforms = c->num_uniforms;
        const int index = vir_get_uniform_index(c, contents, data);

        /* If this is not the first time we see this uniform, try to reuse
         * the result of the last ldunif that loaded it.
         */
        const bool is_new_uniform = num_uniforms != c->num_uniforms;
        if (!is_new_uniform && !c->disable_ldunif_opt) {
                struct qreg ldunif_dst;
                if (try_opt_ldunif(c, index, &ldunif_dst))
                        return ldunif_dst;
        }

        struct qinst *inst = vir_NOP(c);
        inst->qpu.sig.ldunif = true;
        inst->uniform = index;
        inst->dst = vir_get_temp(c);
        c->defs[inst->dst.index] = inst;
        return inst->dst;
}
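
/* As a usage sketch, the vir_uniform_f() helper in v3d_compiler.h loads a
 * float constant through this path, roughly as:
 *
 *    struct qreg one = vir_uniform(c, QUNIFORM_CONSTANT, fui(1.0f));
 *
 * Repeated requests for the same uniform within a block can then be
 * satisfied by the ldunif-reuse path above instead of emitting a new
 * ldunif.
 */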

#define OPTPASS(func)                                                   \
        do {                                                            \
                bool stage_progress = func(c);                          \
                if (stage_progress) {                                   \
                        progress = true;                                \
                        if (print_opt_debug) {                          \
                                fprintf(stderr,                         \
                                        "VIR opt pass %2d: %s progress\n", \
                                        pass, #func);                   \
                        }                                               \
                        /*XXX vir_validate(c);*/                        \
                }                                                       \
        } while (0)

void
vir_optimize(struct v3d_compile *c)
{
        bool print_opt_debug = false;
        int pass = 1;

        while (true) {
                bool progress = false;

                OPTPASS(vir_opt_copy_propagate);
                OPTPASS(vir_opt_redundant_flags);
                OPTPASS(vir_opt_dead_code);
                OPTPASS(vir_opt_small_immediates);
                OPTPASS(vir_opt_constant_alu);

                if (!progress)
                        break;

                pass++;
        }
}

const char *
vir_get_stage_name(struct v3d_compile *c)
{
        if (c->vs_key && c->vs_key->is_coord)
                return "MESA_SHADER_VERTEX_BIN";
        else if (c->gs_key && c->gs_key->is_coord)
                return "MESA_SHADER_GEOMETRY_BIN";
        else
                return gl_shader_stage_name(c->s->info.stage);
}

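/* A VPM sector is V3D_CHANNELS (16) 32-bit words times 8, i.e.
 * 16 * 4 * 8 = 512 bytes, so a 16 KB VPM, for example, amounts to
 * 32 sectors.
 */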
static inline uint32_t
compute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
{
   assert(devinfo->vpm_size > 0);
   const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
   return devinfo->vpm_size / sector_size;
}

/* Computes various parameters affecting VPM memory configuration for programs
 * involving geometry shaders to ensure the program fits in memory and honors
 * requirements described in section "VPM usage" of the programming manual.
 */
static bool
compute_vpm_config_gs(struct v3d_device_info *devinfo,
                      struct v3d_vs_prog_data *vs,
                      struct v3d_gs_prog_data *gs,
                      struct vpm_config *vpm_cfg_out)
{
   const uint32_t A = vs->separate_segments ? 1 : 0;
   const uint32_t Ad = vs->vpm_input_size;
   const uint32_t Vd = vs->vpm_output_size;

   const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);

   /* Try to fit program into our VPM memory budget by adjusting
    * configurable parameters iteratively. We do this in two phases:
    * the first phase tries to fit the program into the total available
    * VPM memory. If we succeed at that, then the second phase attempts
    * to fit the program into half of that budget so we can run bin and
    * render programs in parallel.
    */
   struct vpm_config vpm_cfg[2];
   struct vpm_config *final_vpm_cfg = NULL;
   uint32_t phase = 0;

   vpm_cfg[phase].As = 1;
   vpm_cfg[phase].Gs = 1;
   vpm_cfg[phase].Gd = gs->vpm_output_size;
   vpm_cfg[phase].gs_width = gs->simd_width;

   /* While there is a requirement that Vc >= [Vn / 16], this is
    * always the case when tessellation is not present because in that
    * case Vn can be 6 at most (when the input primitive is triangles
    * with adjacency), so [6 / 16] = 1 and any Vc >= 1 satisfies it.
    *
    * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
    * and Broadcom has not found it worth it to increase it beyond this
    * in general. Increasing Vc also increases VPM memory pressure, which
    * can end up being detrimental for performance in some scenarios.
    */
   vpm_cfg[phase].Vc = 2;

   /* Gv instructs the hardware not to exceed the specified number of
    * vertex segments per GS batch. If adding a new primitive to a GS
    * batch would result in a range of more than Gv vertex segments
    * being referenced by the batch, then the hardware will flush the
    * batch and start a new one. This means that we can choose any value
    * we want; we just need to be aware that larger values improve GS
    * batch utilization at the expense of more VPM memory pressure
    * (which can affect other performance aspects, such as GS dispatch
    * width). We start with the largest value and reduce it if we find
    * that total memory pressure is too high.
    */
   vpm_cfg[phase].Gv = 3;
   do {
      /* When GS is present in the absence of TES, we need to satisfy
       * Ve >= Gv. We go with the smallest value of Ve to avoid
       * increasing memory pressure.
       */
      vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;

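      /* Total VPM demand in sectors: VS input segments (As of size Ad,
       * only counted when inputs and outputs use separate segments),
       * VS output segments for the vertex cache plus extras (Vc + Ve,
       * of size Vd), and GS output segments (Gs of size Gd).
       */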
      uint32_t vpm_sectors =
         A * vpm_cfg[phase].As * Ad +
         (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
         vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;

      /* Ideally we want to use no more than half of the available
       * memory so we can execute a bin and render program in parallel
       * without stalls. If we achieve that, we are done.
       */
      if (vpm_sectors <= vpm_size / 2) {
         final_vpm_cfg = &vpm_cfg[phase];
         break;
      }

      /* At the very least, we should not allocate more than the
       * total available VPM memory. If we have a configuration that
       * succeeds at this, we save it and continue to see if we can
       * also meet the half-memory-use criterion.
       */
      if (phase == 0 && vpm_sectors <= vpm_size) {
         vpm_cfg[1] = vpm_cfg[0];
         phase = 1;
      }

      /* Try lowering Gv */
      if (vpm_cfg[phase].Gv > 0) {
         vpm_cfg[phase].Gv--;
         continue;
      }

      /* Try lowering the GS dispatch width. A width of 2 is not valid
       * (the only valid widths are 16, 8, 4 and 1, as the asserts below
       * document), so if halving lands on 2 we halve again down to 1.
       */
      if (vpm_cfg[phase].gs_width > 1) {
         do {
            vpm_cfg[phase].gs_width >>= 1;
            vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
         } while (vpm_cfg[phase].gs_width == 2);

         /* Reset Gv to the maximum after dropping the dispatch width */
         vpm_cfg[phase].Gv = 3;
         continue;
      }

      /* We ran out of options to reduce memory pressure. If we
       * are at phase 1 we have at least a valid configuration, so we
       * use that.
       */
      if (phase == 1)
         final_vpm_cfg = &vpm_cfg[0];
      break;
   } while (true);

   if (!final_vpm_cfg)
      return false;

   assert(final_vpm_cfg->Gd <= 16);
   assert(final_vpm_cfg->Gv < 4);
   assert(final_vpm_cfg->Ve < 4);
   assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
   assert(final_vpm_cfg->gs_width == 1 ||
          final_vpm_cfg->gs_width == 4 ||
          final_vpm_cfg->gs_width == 8 ||
          final_vpm_cfg->gs_width == 16);

   *vpm_cfg_out = *final_vpm_cfg;
   return true;
}

bool
v3d_compute_vpm_config(struct v3d_device_info *devinfo,
                       struct v3d_vs_prog_data *vs_bin,
                       struct v3d_vs_prog_data *vs,
                       struct v3d_gs_prog_data *gs_bin,
                       struct v3d_gs_prog_data *gs,
                       struct vpm_config *vpm_cfg_bin,
                       struct vpm_config *vpm_cfg)
{
   assert(vs && vs_bin);
   assert((gs != NULL) == (gs_bin != NULL));

   if (!gs) {
      vpm_cfg_bin->As = 1;
      vpm_cfg_bin->Ve = 0;
      vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;

      vpm_cfg->As = 1;
      vpm_cfg->Ve = 0;
      vpm_cfg->Vc = vs->vcm_cache_size;
   } else {
      if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
         return false;

      if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
         return false;
   }

   return true;
}
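
/* Sketch of the expected call pattern when a program includes a geometry
 * shader (the driver-side variable names here are illustrative, not part
 * of this file):
 *
 *    struct vpm_config vpm_cfg_bin, vpm_cfg;
 *    if (!v3d_compute_vpm_config(devinfo, vs_bin_prog_data, vs_prog_data,
 *                                gs_bin_prog_data, gs_prog_data,
 *                                &vpm_cfg_bin, &vpm_cfg))
 *       fail("program does not fit in VPM memory");
 *
 * The two resulting configurations are then used to program the VPM state
 * for the bin and render variants, respectively.
 */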