vir.c revision 7ec681f3
101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2016-2017 Broadcom
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2101e04c3fSmrg * IN THE SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#include "broadcom/common/v3d_device_info.h"
2501e04c3fSmrg#include "v3d_compiler.h"
267ec681f3Smrg#include "util/u_prim.h"
277ec681f3Smrg#include "compiler/nir/nir_schedule.h"
287ec681f3Smrg#include "compiler/nir/nir_builder.h"
2901e04c3fSmrg
3001e04c3fSmrgint
31ed98bd31Smayavir_get_nsrc(struct qinst *inst)
3201e04c3fSmrg{
3301e04c3fSmrg        switch (inst->qpu.type) {
3401e04c3fSmrg        case V3D_QPU_INSTR_TYPE_BRANCH:
3501e04c3fSmrg                return 0;
3601e04c3fSmrg        case V3D_QPU_INSTR_TYPE_ALU:
3701e04c3fSmrg                if (inst->qpu.alu.add.op != V3D_QPU_A_NOP)
3801e04c3fSmrg                        return v3d_qpu_add_op_num_src(inst->qpu.alu.add.op);
3901e04c3fSmrg                else
4001e04c3fSmrg                        return v3d_qpu_mul_op_num_src(inst->qpu.alu.mul.op);
4101e04c3fSmrg        }
4201e04c3fSmrg
4301e04c3fSmrg        return 0;
4401e04c3fSmrg}
4501e04c3fSmrg
/**
 * Returns whether the instruction has any side effects that must be
 * preserved (i.e. whether DCE may remove it when its result is unused).
 */
bool
vir_has_side_effects(struct v3d_compile *c, struct qinst *inst)
{
        switch (inst->qpu.type) {
        case V3D_QPU_INSTR_TYPE_BRANCH:
                return true;
        case V3D_QPU_INSTR_TYPE_ALU:
                /* Add-ALU ops that update execution masks/flags or write
                 * VPM/TMU state.
                 */
                switch (inst->qpu.alu.add.op) {
                case V3D_QPU_A_SETREVF:
                case V3D_QPU_A_SETMSF:
                case V3D_QPU_A_VPMSETUP:
                case V3D_QPU_A_STVPMV:
                case V3D_QPU_A_STVPMD:
                case V3D_QPU_A_STVPMP:
                case V3D_QPU_A_VPMWT:
                case V3D_QPU_A_TMUWT:
                        return true;
                default:
                        break;
                }

                /* NOTE(review): MULTOP presumably updates internal multiply
                 * state consumed by a later instruction — treated as a side
                 * effect here; confirm against the QPU docs.
                 */
                switch (inst->qpu.alu.mul.op) {
                case V3D_QPU_M_MULTOP:
                        return true;
                default:
                        break;
                }
        }

        /* Signals that pop hardware FIFOs or affect thread switching must
         * also survive DCE.
         */
        if (inst->qpu.sig.ldtmu ||
            inst->qpu.sig.ldvary ||
            inst->qpu.sig.ldtlbu ||
            inst->qpu.sig.ldtlb ||
            inst->qpu.sig.wrtmuc ||
            inst->qpu.sig.thrsw) {
                return true;
        }

        /* ldunifa works like ldunif: it reads an element and advances the
         * pointer, so each read has a side effect (we don't care for ldunif
         * because we reconstruct the uniform stream buffer after compiling
         * with the surviving uniforms), so allowing DCE to remove
         * one would break follow-up loads. We could fix this by emitting a
         * unifa for each ldunifa, but each unifa requires 3 delay slots
         * before a ldunifa, so that would be quite expensive.
         */
        if (inst->qpu.sig.ldunifa || inst->qpu.sig.ldunifarf)
                return true;

        return false;
}
10101e04c3fSmrg
10201e04c3fSmrgbool
10301e04c3fSmrgvir_is_raw_mov(struct qinst *inst)
10401e04c3fSmrg{
10501e04c3fSmrg        if (inst->qpu.type != V3D_QPU_INSTR_TYPE_ALU ||
10601e04c3fSmrg            (inst->qpu.alu.mul.op != V3D_QPU_M_FMOV &&
10701e04c3fSmrg             inst->qpu.alu.mul.op != V3D_QPU_M_MOV)) {
10801e04c3fSmrg                return false;
10901e04c3fSmrg        }
11001e04c3fSmrg
11101e04c3fSmrg        if (inst->qpu.alu.add.output_pack != V3D_QPU_PACK_NONE ||
11201e04c3fSmrg            inst->qpu.alu.mul.output_pack != V3D_QPU_PACK_NONE) {
11301e04c3fSmrg                return false;
11401e04c3fSmrg        }
11501e04c3fSmrg
116ed98bd31Smaya        if (inst->qpu.alu.add.a_unpack != V3D_QPU_UNPACK_NONE ||
117ed98bd31Smaya            inst->qpu.alu.add.b_unpack != V3D_QPU_UNPACK_NONE ||
118ed98bd31Smaya            inst->qpu.alu.mul.a_unpack != V3D_QPU_UNPACK_NONE ||
119ed98bd31Smaya            inst->qpu.alu.mul.b_unpack != V3D_QPU_UNPACK_NONE) {
120ed98bd31Smaya                return false;
121ed98bd31Smaya        }
122ed98bd31Smaya
12301e04c3fSmrg        if (inst->qpu.flags.ac != V3D_QPU_COND_NONE ||
12401e04c3fSmrg            inst->qpu.flags.mc != V3D_QPU_COND_NONE)
12501e04c3fSmrg                return false;
12601e04c3fSmrg
12701e04c3fSmrg        return true;
12801e04c3fSmrg}
12901e04c3fSmrg
13001e04c3fSmrgbool
13101e04c3fSmrgvir_is_add(struct qinst *inst)
13201e04c3fSmrg{
13301e04c3fSmrg        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
13401e04c3fSmrg                inst->qpu.alu.add.op != V3D_QPU_A_NOP);
13501e04c3fSmrg}
13601e04c3fSmrg
13701e04c3fSmrgbool
13801e04c3fSmrgvir_is_mul(struct qinst *inst)
13901e04c3fSmrg{
14001e04c3fSmrg        return (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
14101e04c3fSmrg                inst->qpu.alu.mul.op != V3D_QPU_M_NOP);
14201e04c3fSmrg}
14301e04c3fSmrg
14401e04c3fSmrgbool
1457ec681f3Smrgvir_is_tex(const struct v3d_device_info *devinfo, struct qinst *inst)
14601e04c3fSmrg{
14701e04c3fSmrg        if (inst->dst.file == QFILE_MAGIC)
1487ec681f3Smrg                return v3d_qpu_magic_waddr_is_tmu(devinfo, inst->dst.index);
14901e04c3fSmrg
15001e04c3fSmrg        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU &&
15101e04c3fSmrg            inst->qpu.alu.add.op == V3D_QPU_A_TMUWT) {
15201e04c3fSmrg                return true;
15301e04c3fSmrg        }
15401e04c3fSmrg
15501e04c3fSmrg        return false;
15601e04c3fSmrg}
15701e04c3fSmrg
15801e04c3fSmrgbool
15901e04c3fSmrgvir_writes_r3(const struct v3d_device_info *devinfo, struct qinst *inst)
16001e04c3fSmrg{
16101e04c3fSmrg        for (int i = 0; i < vir_get_nsrc(inst); i++) {
16201e04c3fSmrg                switch (inst->src[i].file) {
16301e04c3fSmrg                case QFILE_VPM:
16401e04c3fSmrg                        return true;
16501e04c3fSmrg                default:
16601e04c3fSmrg                        break;
16701e04c3fSmrg                }
16801e04c3fSmrg        }
16901e04c3fSmrg
17001e04c3fSmrg        if (devinfo->ver < 41 && (inst->qpu.sig.ldvary ||
17101e04c3fSmrg                                  inst->qpu.sig.ldtlb ||
17201e04c3fSmrg                                  inst->qpu.sig.ldtlbu ||
17301e04c3fSmrg                                  inst->qpu.sig.ldvpm)) {
17401e04c3fSmrg                return true;
17501e04c3fSmrg        }
17601e04c3fSmrg
17701e04c3fSmrg        return false;
17801e04c3fSmrg}
17901e04c3fSmrg
18001e04c3fSmrgbool
18101e04c3fSmrgvir_writes_r4(const struct v3d_device_info *devinfo, struct qinst *inst)
18201e04c3fSmrg{
18301e04c3fSmrg        switch (inst->dst.file) {
18401e04c3fSmrg        case QFILE_MAGIC:
18501e04c3fSmrg                switch (inst->dst.index) {
18601e04c3fSmrg                case V3D_QPU_WADDR_RECIP:
18701e04c3fSmrg                case V3D_QPU_WADDR_RSQRT:
18801e04c3fSmrg                case V3D_QPU_WADDR_EXP:
18901e04c3fSmrg                case V3D_QPU_WADDR_LOG:
19001e04c3fSmrg                case V3D_QPU_WADDR_SIN:
19101e04c3fSmrg                        return true;
19201e04c3fSmrg                }
19301e04c3fSmrg                break;
19401e04c3fSmrg        default:
19501e04c3fSmrg                break;
19601e04c3fSmrg        }
19701e04c3fSmrg
19801e04c3fSmrg        if (devinfo->ver < 41 && inst->qpu.sig.ldtmu)
19901e04c3fSmrg                return true;
20001e04c3fSmrg
20101e04c3fSmrg        return false;
20201e04c3fSmrg}
20301e04c3fSmrg
20401e04c3fSmrgvoid
20501e04c3fSmrgvir_set_unpack(struct qinst *inst, int src,
20601e04c3fSmrg               enum v3d_qpu_input_unpack unpack)
20701e04c3fSmrg{
20801e04c3fSmrg        assert(src == 0 || src == 1);
20901e04c3fSmrg
21001e04c3fSmrg        if (vir_is_add(inst)) {
21101e04c3fSmrg                if (src == 0)
21201e04c3fSmrg                        inst->qpu.alu.add.a_unpack = unpack;
21301e04c3fSmrg                else
21401e04c3fSmrg                        inst->qpu.alu.add.b_unpack = unpack;
21501e04c3fSmrg        } else {
21601e04c3fSmrg                assert(vir_is_mul(inst));
21701e04c3fSmrg                if (src == 0)
21801e04c3fSmrg                        inst->qpu.alu.mul.a_unpack = unpack;
21901e04c3fSmrg                else
22001e04c3fSmrg                        inst->qpu.alu.mul.b_unpack = unpack;
22101e04c3fSmrg        }
22201e04c3fSmrg}
22301e04c3fSmrg
2247ec681f3Smrgvoid
2257ec681f3Smrgvir_set_pack(struct qinst *inst, enum v3d_qpu_output_pack pack)
2267ec681f3Smrg{
2277ec681f3Smrg        if (vir_is_add(inst)) {
2287ec681f3Smrg                inst->qpu.alu.add.output_pack = pack;
2297ec681f3Smrg        } else {
2307ec681f3Smrg                assert(vir_is_mul(inst));
2317ec681f3Smrg                inst->qpu.alu.mul.output_pack = pack;
2327ec681f3Smrg        }
2337ec681f3Smrg}
2347ec681f3Smrg
23501e04c3fSmrgvoid
23601e04c3fSmrgvir_set_cond(struct qinst *inst, enum v3d_qpu_cond cond)
23701e04c3fSmrg{
23801e04c3fSmrg        if (vir_is_add(inst)) {
23901e04c3fSmrg                inst->qpu.flags.ac = cond;
24001e04c3fSmrg        } else {
24101e04c3fSmrg                assert(vir_is_mul(inst));
24201e04c3fSmrg                inst->qpu.flags.mc = cond;
24301e04c3fSmrg        }
24401e04c3fSmrg}
24501e04c3fSmrg
2467ec681f3Smrgenum v3d_qpu_cond
2477ec681f3Smrgvir_get_cond(struct qinst *inst)
2487ec681f3Smrg{
2497ec681f3Smrg        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
2507ec681f3Smrg
2517ec681f3Smrg        if (vir_is_add(inst))
2527ec681f3Smrg                return inst->qpu.flags.ac;
2537ec681f3Smrg        else if (vir_is_mul(inst))
2547ec681f3Smrg                return inst->qpu.flags.mc;
2557ec681f3Smrg        else /* NOP */
2567ec681f3Smrg                return V3D_QPU_COND_NONE;
2577ec681f3Smrg}
2587ec681f3Smrg
25901e04c3fSmrgvoid
2607ec681f3Smrgvir_set_pf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_pf pf)
26101e04c3fSmrg{
2627ec681f3Smrg        c->flags_temp = -1;
26301e04c3fSmrg        if (vir_is_add(inst)) {
26401e04c3fSmrg                inst->qpu.flags.apf = pf;
26501e04c3fSmrg        } else {
26601e04c3fSmrg                assert(vir_is_mul(inst));
26701e04c3fSmrg                inst->qpu.flags.mpf = pf;
26801e04c3fSmrg        }
26901e04c3fSmrg}
27001e04c3fSmrg
271ed98bd31Smayavoid
2727ec681f3Smrgvir_set_uf(struct v3d_compile *c, struct qinst *inst, enum v3d_qpu_uf uf)
273ed98bd31Smaya{
2747ec681f3Smrg        c->flags_temp = -1;
275ed98bd31Smaya        if (vir_is_add(inst)) {
276ed98bd31Smaya                inst->qpu.flags.auf = uf;
277ed98bd31Smaya        } else {
278ed98bd31Smaya                assert(vir_is_mul(inst));
279ed98bd31Smaya                inst->qpu.flags.muf = uf;
280ed98bd31Smaya        }
281ed98bd31Smaya}
282ed98bd31Smaya
#if 0
/* Disabled code: returns a per-byte/halfword write mask for the dst pack
 * mode. NOTE(review): references QPU_PACK_* enums not used elsewhere in
 * this file — consider deleting rather than keeping under #if 0.
 */
uint8_t
vir_channels_written(struct qinst *inst)
{
        if (vir_is_mul(inst)) {
                switch (inst->dst.pack) {
                case QPU_PACK_MUL_NOP:
                case QPU_PACK_MUL_8888:
                        return 0xf;
                case QPU_PACK_MUL_8A:
                        return 0x1;
                case QPU_PACK_MUL_8B:
                        return 0x2;
                case QPU_PACK_MUL_8C:
                        return 0x4;
                case QPU_PACK_MUL_8D:
                        return 0x8;
                }
        } else {
                switch (inst->dst.pack) {
                case QPU_PACK_A_NOP:
                case QPU_PACK_A_8888:
                case QPU_PACK_A_8888_SAT:
                case QPU_PACK_A_32_SAT:
                        return 0xf;
                case QPU_PACK_A_8A:
                case QPU_PACK_A_8A_SAT:
                        return 0x1;
                case QPU_PACK_A_8B:
                case QPU_PACK_A_8B_SAT:
                        return 0x2;
                case QPU_PACK_A_8C:
                case QPU_PACK_A_8C_SAT:
                        return 0x4;
                case QPU_PACK_A_8D:
                case QPU_PACK_A_8D_SAT:
                        return 0x8;
                case QPU_PACK_A_16A:
                case QPU_PACK_A_16A_SAT:
                        return 0x3;
                case QPU_PACK_A_16B:
                case QPU_PACK_A_16B_SAT:
                        return 0xc;
                }
        }
        unreachable("Bad pack field");
}
#endif
33101e04c3fSmrg
/* Allocates a fresh temporary register, growing the per-temp bookkeeping
 * arrays as needed.
 */
struct qreg
vir_get_temp(struct v3d_compile *c)
{
        struct qreg reg;

        reg.file = QFILE_TEMP;
        reg.index = c->num_temps++;

        /* Grow defs[] and the spillable bitset by doubling (16-entry
         * minimum) whenever the temp count exceeds current capacity.
         */
        if (c->num_temps > c->defs_array_size) {
                uint32_t old_size = c->defs_array_size;
                c->defs_array_size = MAX2(old_size * 2, 16);

                c->defs = reralloc(c, c->defs, struct qinst *,
                                   c->defs_array_size);
                /* New entries start with no defining instruction. */
                memset(&c->defs[old_size], 0,
                       sizeof(c->defs[0]) * (c->defs_array_size - old_size));

                c->spillable = reralloc(c, c->spillable,
                                        BITSET_WORD,
                                        BITSET_WORDS(c->defs_array_size));
                /* New temps are marked spillable by default. */
                for (int i = old_size; i < c->defs_array_size; i++)
                        BITSET_SET(c->spillable, i);
        }

        return reg;
}
35801e04c3fSmrg
35901e04c3fSmrgstruct qinst *
36001e04c3fSmrgvir_add_inst(enum v3d_qpu_add_op op, struct qreg dst, struct qreg src0, struct qreg src1)
36101e04c3fSmrg{
36201e04c3fSmrg        struct qinst *inst = calloc(1, sizeof(*inst));
36301e04c3fSmrg
36401e04c3fSmrg        inst->qpu = v3d_qpu_nop();
36501e04c3fSmrg        inst->qpu.alu.add.op = op;
36601e04c3fSmrg
36701e04c3fSmrg        inst->dst = dst;
36801e04c3fSmrg        inst->src[0] = src0;
36901e04c3fSmrg        inst->src[1] = src1;
37001e04c3fSmrg        inst->uniform = ~0;
37101e04c3fSmrg
37201e04c3fSmrg        return inst;
37301e04c3fSmrg}
37401e04c3fSmrg
37501e04c3fSmrgstruct qinst *
37601e04c3fSmrgvir_mul_inst(enum v3d_qpu_mul_op op, struct qreg dst, struct qreg src0, struct qreg src1)
37701e04c3fSmrg{
37801e04c3fSmrg        struct qinst *inst = calloc(1, sizeof(*inst));
37901e04c3fSmrg
38001e04c3fSmrg        inst->qpu = v3d_qpu_nop();
38101e04c3fSmrg        inst->qpu.alu.mul.op = op;
38201e04c3fSmrg
38301e04c3fSmrg        inst->dst = dst;
38401e04c3fSmrg        inst->src[0] = src0;
38501e04c3fSmrg        inst->src[1] = src1;
38601e04c3fSmrg        inst->uniform = ~0;
38701e04c3fSmrg
38801e04c3fSmrg        return inst;
38901e04c3fSmrg}
39001e04c3fSmrg
/* Allocates a new conditional branch instruction with relative
 * destinations for both branch targets.
 */
struct qinst *
vir_branch_inst(struct v3d_compile *c, enum v3d_qpu_branch_cond cond)
{
        struct qinst *inst = calloc(1, sizeof(*inst));

        inst->qpu = v3d_qpu_nop();
        inst->qpu.type = V3D_QPU_INSTR_TYPE_BRANCH;
        inst->qpu.branch.cond = cond;
        inst->qpu.branch.msfign = V3D_QPU_MSFIGN_NONE;
        inst->qpu.branch.bdi = V3D_QPU_BRANCH_DEST_REL;
        inst->qpu.branch.ub = true;
        inst->qpu.branch.bdu = V3D_QPU_BRANCH_DEST_REL;

        /* Branches write no register; the offset comes from a constant
         * uniform (0 here — NOTE(review): presumably patched to the real
         * target later; confirm against the branch emission code).
         */
        inst->dst = vir_nop_reg();
        inst->uniform = vir_get_uniform_index(c, QUNIFORM_CONSTANT, 0);

        return inst;
}
40901e04c3fSmrg
41001e04c3fSmrgstatic void
41101e04c3fSmrgvir_emit(struct v3d_compile *c, struct qinst *inst)
41201e04c3fSmrg{
41301e04c3fSmrg        switch (c->cursor.mode) {
41401e04c3fSmrg        case vir_cursor_add:
41501e04c3fSmrg                list_add(&inst->link, c->cursor.link);
41601e04c3fSmrg                break;
41701e04c3fSmrg        case vir_cursor_addtail:
41801e04c3fSmrg                list_addtail(&inst->link, c->cursor.link);
41901e04c3fSmrg                break;
42001e04c3fSmrg        }
42101e04c3fSmrg
42201e04c3fSmrg        c->cursor = vir_after_inst(inst);
42301e04c3fSmrg        c->live_intervals_valid = false;
42401e04c3fSmrg}
42501e04c3fSmrg
/* Updates inst to write to a new temporary, emits it, and notes the def. */
struct qreg
vir_emit_def(struct v3d_compile *c, struct qinst *inst)
{
        assert(inst->dst.file == QFILE_NULL);

        /* If we're emitting an instruction that's a def, it had better be
         * writing a register.
         */
        if (inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU) {
                assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP ||
                       v3d_qpu_add_op_has_dst(inst->qpu.alu.add.op));
                assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP ||
                       v3d_qpu_mul_op_has_dst(inst->qpu.alu.mul.op));
        }

        inst->dst = vir_get_temp(c);

        /* Record this instruction as the single writer of the new temp so
         * later passes can look the def up by temp index.
         */
        if (inst->dst.file == QFILE_TEMP)
                c->defs[inst->dst.index] = inst;

        vir_emit(c, inst);

        return inst->dst;
}
45101e04c3fSmrg
45201e04c3fSmrgstruct qinst *
45301e04c3fSmrgvir_emit_nondef(struct v3d_compile *c, struct qinst *inst)
45401e04c3fSmrg{
45501e04c3fSmrg        if (inst->dst.file == QFILE_TEMP)
45601e04c3fSmrg                c->defs[inst->dst.index] = NULL;
45701e04c3fSmrg
45801e04c3fSmrg        vir_emit(c, inst);
45901e04c3fSmrg
46001e04c3fSmrg        return inst;
46101e04c3fSmrg}
46201e04c3fSmrg
46301e04c3fSmrgstruct qblock *
46401e04c3fSmrgvir_new_block(struct v3d_compile *c)
46501e04c3fSmrg{
46601e04c3fSmrg        struct qblock *block = rzalloc(c, struct qblock);
46701e04c3fSmrg
46801e04c3fSmrg        list_inithead(&block->instructions);
46901e04c3fSmrg
47001e04c3fSmrg        block->predecessors = _mesa_set_create(block,
47101e04c3fSmrg                                               _mesa_hash_pointer,
47201e04c3fSmrg                                               _mesa_key_pointer_equal);
47301e04c3fSmrg
47401e04c3fSmrg        block->index = c->next_block_index++;
47501e04c3fSmrg
47601e04c3fSmrg        return block;
47701e04c3fSmrg}
47801e04c3fSmrg
47901e04c3fSmrgvoid
48001e04c3fSmrgvir_set_emit_block(struct v3d_compile *c, struct qblock *block)
48101e04c3fSmrg{
48201e04c3fSmrg        c->cur_block = block;
48301e04c3fSmrg        c->cursor = vir_after_block(block);
48401e04c3fSmrg        list_addtail(&block->link, &c->blocks);
48501e04c3fSmrg}
48601e04c3fSmrg
/* Returns the first block in the program (the entry block). */
struct qblock *
vir_entry_block(struct v3d_compile *c)
{
        return list_first_entry(&c->blocks, struct qblock, link);
}
49201e04c3fSmrg
/* Returns the last block in the program (the exit block). */
struct qblock *
vir_exit_block(struct v3d_compile *c)
{
        return list_last_entry(&c->blocks, struct qblock, link);
}
49801e04c3fSmrg
49901e04c3fSmrgvoid
50001e04c3fSmrgvir_link_blocks(struct qblock *predecessor, struct qblock *successor)
50101e04c3fSmrg{
50201e04c3fSmrg        _mesa_set_add(successor->predecessors, predecessor);
50301e04c3fSmrg        if (predecessor->successors[0]) {
50401e04c3fSmrg                assert(!predecessor->successors[1]);
50501e04c3fSmrg                predecessor->successors[1] = successor;
50601e04c3fSmrg        } else {
50701e04c3fSmrg                predecessor->successors[0] = successor;
50801e04c3fSmrg        }
50901e04c3fSmrg}
51001e04c3fSmrg
51101e04c3fSmrgconst struct v3d_compiler *
51201e04c3fSmrgv3d_compiler_init(const struct v3d_device_info *devinfo)
51301e04c3fSmrg{
51401e04c3fSmrg        struct v3d_compiler *compiler = rzalloc(NULL, struct v3d_compiler);
51501e04c3fSmrg        if (!compiler)
51601e04c3fSmrg                return NULL;
51701e04c3fSmrg
51801e04c3fSmrg        compiler->devinfo = devinfo;
51901e04c3fSmrg
52001e04c3fSmrg        if (!vir_init_reg_sets(compiler)) {
52101e04c3fSmrg                ralloc_free(compiler);
52201e04c3fSmrg                return NULL;
52301e04c3fSmrg        }
52401e04c3fSmrg
52501e04c3fSmrg        return compiler;
52601e04c3fSmrg}
52701e04c3fSmrg
/* Frees a compiler context; the cast drops the const the public API adds. */
void
v3d_compiler_free(const struct v3d_compiler *compiler)
{
        ralloc_free((void *)compiler);
}
53301e04c3fSmrg
/* Creates a per-shader compile context: copies compile options, clones the
 * NIR shader so passes can mutate it, and sets up the initial basic block
 * and bookkeeping tables.
 */
static struct v3d_compile *
vir_compile_init(const struct v3d_compiler *compiler,
                 struct v3d_key *key,
                 nir_shader *s,
                 void (*debug_output)(const char *msg,
                                      void *debug_output_data),
                 void *debug_output_data,
                 int program_id, int variant_id,
                 uint32_t max_threads,
                 uint32_t min_threads_for_reg_alloc,
                 bool tmu_spilling_allowed,
                 bool disable_loop_unrolling,
                 bool disable_constant_ubo_load_sorting,
                 bool disable_tmu_pipelining,
                 bool fallback_scheduler)
{
        struct v3d_compile *c = rzalloc(NULL, struct v3d_compile);

        c->compiler = compiler;
        c->devinfo = compiler->devinfo;
        c->key = key;
        c->program_id = program_id;
        c->variant_id = variant_id;
        c->threads = max_threads;
        c->debug_output = debug_output;
        c->debug_output_data = debug_output_data;
        c->compilation_result = V3D_COMPILATION_SUCCEEDED;
        c->min_threads_for_reg_alloc = min_threads_for_reg_alloc;
        c->tmu_spilling_allowed = tmu_spilling_allowed;
        c->fallback_scheduler = fallback_scheduler;
        c->disable_tmu_pipelining = disable_tmu_pipelining;
        c->disable_constant_ubo_load_sorting = disable_constant_ubo_load_sorting;
        /* The debug environment flag overrides the caller's unrolling choice. */
        c->disable_loop_unrolling = V3D_DEBUG & V3D_DEBUG_NO_LOOP_UNROLL
                ? true : disable_loop_unrolling;

        /* Clone so lowering passes can mutate the shader freely; the clone
         * is ralloc-parented to c and freed with it.
         */
        s = nir_shader_clone(c, s);
        c->s = s;

        list_inithead(&c->blocks);
        vir_set_emit_block(c, vir_new_block(c));

        /* -1 means "not written by this shader". */
        c->output_position_index = -1;
        c->output_sample_mask_index = -1;

        c->def_ht = _mesa_hash_table_create(c, _mesa_hash_pointer,
                                            _mesa_key_pointer_equal);

        c->tmu.outstanding_regs = _mesa_pointer_set_create(c);
        c->flags_temp = -1;

        return c;
}
58601e04c3fSmrg
/* Type-size callback: returns the number of vec4 attribute slots the type
 * occupies. The `bindless` parameter is unused.
 */
static int
type_size_vec4(const struct glsl_type *type, bool bindless)
{
        return glsl_count_attribute_slots(type, false);
}
59201e04c3fSmrg
/* Runs the V3D-specific NIR lowering passes: texture swizzle/packing
 * lowering, system-value lowering, and scratch lowering for large
 * function-local variables.
 */
static void
v3d_lower_nir(struct v3d_compile *c)
{
        struct nir_lower_tex_options tex_options = {
                .lower_txd = true,
                .lower_tg4_broadcom_swizzle = true,

                .lower_rect = false, /* XXX: Use this on V3D 3.x */
                .lower_txp = ~0,
                /* Apply swizzles to all samplers. */
                .swizzle_result = ~0,
        };

        /* Lower the format swizzle and (for 32-bit returns)
         * ARB_texture_swizzle-style swizzle.
         */
        assert(c->key->num_tex_used <= ARRAY_SIZE(c->key->tex));
        for (int i = 0; i < c->key->num_tex_used; i++) {
                for (int j = 0; j < 4; j++)
                        tex_options.swizzles[i][j] = c->key->tex[i].swizzle[j];
        }

        /* Samplers with 16-bit returns need the packed-result lowering. */
        assert(c->key->num_samplers_used <= ARRAY_SIZE(c->key->sampler));
        for (int i = 0; i < c->key->num_samplers_used; i++) {
                if (c->key->sampler[i].return_size == 16) {
                        tex_options.lower_tex_packing[i] =
                                nir_lower_tex_packing_16;
                }
        }

        /* CS textures may not have return_size reflecting the shadow state. */
        nir_foreach_uniform_variable(var, c->s) {
                const struct glsl_type *type = glsl_without_array(var->type);
                unsigned array_len = MAX2(glsl_get_length(var->type), 1);

                if (!glsl_type_is_sampler(type) ||
                    !glsl_sampler_type_is_shadow(type))
                        continue;

                /* Force 16-bit packing for every element of the shadow
                 * sampler (array).
                 */
                for (int i = 0; i < array_len; i++) {
                        tex_options.lower_tex_packing[var->data.binding + i] =
                                nir_lower_tex_packing_16;
                }
        }

        NIR_PASS_V(c->s, nir_lower_tex, &tex_options);
        NIR_PASS_V(c->s, nir_lower_system_values);
        NIR_PASS_V(c->s, nir_lower_compute_system_values, NULL);

        /* Move function-temp variables to scratch, then run the V3D
         * scratch-access lowering.
         */
        NIR_PASS_V(c->s, nir_lower_vars_to_scratch,
                   nir_var_function_temp,
                   0,
                   glsl_get_natural_size_align_bytes);
        NIR_PASS_V(c->s, v3d_nir_lower_scratch);
}
64801e04c3fSmrg
64901e04c3fSmrgstatic void
65001e04c3fSmrgv3d_set_prog_data_uniforms(struct v3d_compile *c,
65101e04c3fSmrg                           struct v3d_prog_data *prog_data)
65201e04c3fSmrg{
65301e04c3fSmrg        int count = c->num_uniforms;
65401e04c3fSmrg        struct v3d_uniform_list *ulist = &prog_data->uniforms;
65501e04c3fSmrg
65601e04c3fSmrg        ulist->count = count;
65701e04c3fSmrg        ulist->data = ralloc_array(prog_data, uint32_t, count);
65801e04c3fSmrg        memcpy(ulist->data, c->uniform_data,
65901e04c3fSmrg               count * sizeof(*ulist->data));
66001e04c3fSmrg        ulist->contents = ralloc_array(prog_data, enum quniform_contents, count);
66101e04c3fSmrg        memcpy(ulist->contents, c->uniform_contents,
66201e04c3fSmrg               count * sizeof(*ulist->contents));
66301e04c3fSmrg}
66401e04c3fSmrg
66501e04c3fSmrgstatic void
666ed98bd31Smayav3d_vs_set_prog_data(struct v3d_compile *c,
667ed98bd31Smaya                     struct v3d_vs_prog_data *prog_data)
66801e04c3fSmrg{
66901e04c3fSmrg        /* The vertex data gets format converted by the VPM so that
67001e04c3fSmrg         * each attribute channel takes up a VPM column.  Precompute
67101e04c3fSmrg         * the sizes for the shader record.
67201e04c3fSmrg         */
67301e04c3fSmrg        for (int i = 0; i < ARRAY_SIZE(prog_data->vattr_sizes); i++) {
67401e04c3fSmrg                prog_data->vattr_sizes[i] = c->vattr_sizes[i];
67501e04c3fSmrg                prog_data->vpm_input_size += c->vattr_sizes[i];
67601e04c3fSmrg        }
67701e04c3fSmrg
6787ec681f3Smrg        memset(prog_data->driver_location_map, -1,
6797ec681f3Smrg               sizeof(prog_data->driver_location_map));
6807ec681f3Smrg
6817ec681f3Smrg        nir_foreach_shader_in_variable(var, c->s) {
6827ec681f3Smrg                prog_data->driver_location_map[var->data.location] =
6837ec681f3Smrg                        var->data.driver_location;
6847ec681f3Smrg        }
6857ec681f3Smrg
6867ec681f3Smrg        prog_data->uses_vid = BITSET_TEST(c->s->info.system_values_read,
6877ec681f3Smrg                                          SYSTEM_VALUE_VERTEX_ID) ||
6887ec681f3Smrg                              BITSET_TEST(c->s->info.system_values_read,
6897ec681f3Smrg                                          SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
6907ec681f3Smrg
6917ec681f3Smrg        prog_data->uses_biid = BITSET_TEST(c->s->info.system_values_read,
6927ec681f3Smrg                                           SYSTEM_VALUE_BASE_INSTANCE);
6937ec681f3Smrg
6947ec681f3Smrg        prog_data->uses_iid = BITSET_TEST(c->s->info.system_values_read,
6957ec681f3Smrg                                          SYSTEM_VALUE_INSTANCE_ID) ||
6967ec681f3Smrg                              BITSET_TEST(c->s->info.system_values_read,
6977ec681f3Smrg                                          SYSTEM_VALUE_INSTANCE_INDEX);
69801e04c3fSmrg
69901e04c3fSmrg        if (prog_data->uses_vid)
70001e04c3fSmrg                prog_data->vpm_input_size++;
7017ec681f3Smrg        if (prog_data->uses_biid)
7027ec681f3Smrg                prog_data->vpm_input_size++;
70301e04c3fSmrg        if (prog_data->uses_iid)
70401e04c3fSmrg                prog_data->vpm_input_size++;
70501e04c3fSmrg
70601e04c3fSmrg        /* Input/output segment size are in sectors (8 rows of 32 bits per
70701e04c3fSmrg         * channel).
70801e04c3fSmrg         */
70901e04c3fSmrg        prog_data->vpm_input_size = align(prog_data->vpm_input_size, 8) / 8;
710ed98bd31Smaya        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
711ed98bd31Smaya
712ed98bd31Smaya        /* Set us up for shared input/output segments.  This is apparently
713ed98bd31Smaya         * necessary for our VCM setup to avoid varying corruption.
714ed98bd31Smaya         */
715ed98bd31Smaya        prog_data->separate_segments = false;
716ed98bd31Smaya        prog_data->vpm_output_size = MAX2(prog_data->vpm_output_size,
717ed98bd31Smaya                                          prog_data->vpm_input_size);
718ed98bd31Smaya        prog_data->vpm_input_size = 0;
71901e04c3fSmrg
72001e04c3fSmrg        /* Compute VCM cache size.  We set up our program to take up less than
72101e04c3fSmrg         * half of the VPM, so that any set of bin and render programs won't
72201e04c3fSmrg         * run out of space.  We need space for at least one input segment,
72301e04c3fSmrg         * and then allocate the rest to output segments (one for the current
72401e04c3fSmrg         * program, the rest to VCM).  The valid range of the VCM cache size
72501e04c3fSmrg         * field is 1-4 16-vertex batches, but GFXH-1744 limits us to 2-4
72601e04c3fSmrg         * batches.
72701e04c3fSmrg         */
72801e04c3fSmrg        assert(c->devinfo->vpm_size);
729ed98bd31Smaya        int sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
73001e04c3fSmrg        int vpm_size_in_sectors = c->devinfo->vpm_size / sector_size;
73101e04c3fSmrg        int half_vpm = vpm_size_in_sectors / 2;
73201e04c3fSmrg        int vpm_output_sectors = half_vpm - prog_data->vpm_input_size;
73301e04c3fSmrg        int vpm_output_batches = vpm_output_sectors / prog_data->vpm_output_size;
73401e04c3fSmrg        assert(vpm_output_batches >= 2);
73501e04c3fSmrg        prog_data->vcm_cache_size = CLAMP(vpm_output_batches - 1, 2, 4);
73601e04c3fSmrg}
73701e04c3fSmrg
7387ec681f3Smrgstatic void
7397ec681f3Smrgv3d_gs_set_prog_data(struct v3d_compile *c,
7407ec681f3Smrg                     struct v3d_gs_prog_data *prog_data)
7417ec681f3Smrg{
7427ec681f3Smrg        prog_data->num_inputs = c->num_inputs;
7437ec681f3Smrg        memcpy(prog_data->input_slots, c->input_slots,
7447ec681f3Smrg               c->num_inputs * sizeof(*c->input_slots));
7457ec681f3Smrg
7467ec681f3Smrg        /* gl_PrimitiveIdIn is written by the GBG into the first word of the
7477ec681f3Smrg         * VPM output header automatically and the shader will overwrite
7487ec681f3Smrg         * it after reading it if necessary, so it doesn't add to the VPM
7497ec681f3Smrg         * size requirements.
7507ec681f3Smrg         */
7517ec681f3Smrg        prog_data->uses_pid = BITSET_TEST(c->s->info.system_values_read,
7527ec681f3Smrg                                          SYSTEM_VALUE_PRIMITIVE_ID);
7537ec681f3Smrg
7547ec681f3Smrg        /* Output segment size is in sectors (8 rows of 32 bits per channel) */
7557ec681f3Smrg        prog_data->vpm_output_size = align(c->vpm_output_size, 8) / 8;
7567ec681f3Smrg
7577ec681f3Smrg        /* Compute SIMD dispatch width and update VPM output size accordingly
7587ec681f3Smrg         * to ensure we can fit our program in memory. Available widths are
7597ec681f3Smrg         * 16, 8, 4, 1.
7607ec681f3Smrg         *
7617ec681f3Smrg         * Notice that at draw time we will have to consider VPM memory
7627ec681f3Smrg         * requirements from other stages and choose a smaller dispatch
7637ec681f3Smrg         * width if needed to fit the program in VPM memory.
7647ec681f3Smrg         */
7657ec681f3Smrg        prog_data->simd_width = 16;
7667ec681f3Smrg        while ((prog_data->simd_width > 1 && prog_data->vpm_output_size > 16) ||
7677ec681f3Smrg               prog_data->simd_width == 2) {
7687ec681f3Smrg                prog_data->simd_width >>= 1;
7697ec681f3Smrg                prog_data->vpm_output_size =
7707ec681f3Smrg                        align(prog_data->vpm_output_size, 2) / 2;
7717ec681f3Smrg        }
7727ec681f3Smrg        assert(prog_data->vpm_output_size <= 16);
7737ec681f3Smrg        assert(prog_data->simd_width != 2);
7747ec681f3Smrg
7757ec681f3Smrg        prog_data->out_prim_type = c->s->info.gs.output_primitive;
7767ec681f3Smrg        prog_data->num_invocations = c->s->info.gs.invocations;
7777ec681f3Smrg
7787ec681f3Smrg        prog_data->writes_psiz =
7797ec681f3Smrg            c->s->info.outputs_written & (1 << VARYING_SLOT_PSIZ);
7807ec681f3Smrg}
7817ec681f3Smrg
78201e04c3fSmrgstatic void
78301e04c3fSmrgv3d_set_fs_prog_data_inputs(struct v3d_compile *c,
78401e04c3fSmrg                            struct v3d_fs_prog_data *prog_data)
78501e04c3fSmrg{
786ed98bd31Smaya        prog_data->num_inputs = c->num_inputs;
78701e04c3fSmrg        memcpy(prog_data->input_slots, c->input_slots,
78801e04c3fSmrg               c->num_inputs * sizeof(*c->input_slots));
78901e04c3fSmrg
79001e04c3fSmrg        STATIC_ASSERT(ARRAY_SIZE(prog_data->flat_shade_flags) >
79101e04c3fSmrg                      (V3D_MAX_FS_INPUTS - 1) / 24);
79201e04c3fSmrg        for (int i = 0; i < V3D_MAX_FS_INPUTS; i++) {
79301e04c3fSmrg                if (BITSET_TEST(c->flat_shade_flags, i))
79401e04c3fSmrg                        prog_data->flat_shade_flags[i / 24] |= 1 << (i % 24);
79501e04c3fSmrg
79601e04c3fSmrg                if (BITSET_TEST(c->noperspective_flags, i))
79701e04c3fSmrg                        prog_data->noperspective_flags[i / 24] |= 1 << (i % 24);
79801e04c3fSmrg
79901e04c3fSmrg                if (BITSET_TEST(c->centroid_flags, i))
80001e04c3fSmrg                        prog_data->centroid_flags[i / 24] |= 1 << (i % 24);
80101e04c3fSmrg        }
80201e04c3fSmrg}
80301e04c3fSmrg
804ed98bd31Smayastatic void
805ed98bd31Smayav3d_fs_set_prog_data(struct v3d_compile *c,
806ed98bd31Smaya                     struct v3d_fs_prog_data *prog_data)
807ed98bd31Smaya{
808ed98bd31Smaya        v3d_set_fs_prog_data_inputs(c, prog_data);
809ed98bd31Smaya        prog_data->writes_z = c->writes_z;
810ed98bd31Smaya        prog_data->disable_ez = !c->s->info.fs.early_fragment_tests;
811ed98bd31Smaya        prog_data->uses_center_w = c->uses_center_w;
8127ec681f3Smrg        prog_data->uses_implicit_point_line_varyings =
8137ec681f3Smrg                c->uses_implicit_point_line_varyings;
8147ec681f3Smrg        prog_data->lock_scoreboard_on_first_thrsw =
8157ec681f3Smrg                c->lock_scoreboard_on_first_thrsw;
8167ec681f3Smrg        prog_data->force_per_sample_msaa = c->force_per_sample_msaa;
8177ec681f3Smrg        prog_data->uses_pid = c->fs_uses_primitive_id;
818ed98bd31Smaya}
819ed98bd31Smaya
820ed98bd31Smayastatic void
821ed98bd31Smayav3d_cs_set_prog_data(struct v3d_compile *c,
822ed98bd31Smaya                     struct v3d_compute_prog_data *prog_data)
823ed98bd31Smaya{
8247ec681f3Smrg        prog_data->shared_size = c->s->info.shared_size;
8257ec681f3Smrg
8267ec681f3Smrg        prog_data->local_size[0] = c->s->info.workgroup_size[0];
8277ec681f3Smrg        prog_data->local_size[1] = c->s->info.workgroup_size[1];
8287ec681f3Smrg        prog_data->local_size[2] = c->s->info.workgroup_size[2];
8297ec681f3Smrg
8307ec681f3Smrg        prog_data->has_subgroups = c->has_subgroups;
831ed98bd31Smaya}
832ed98bd31Smaya
833ed98bd31Smayastatic void
834ed98bd31Smayav3d_set_prog_data(struct v3d_compile *c,
835ed98bd31Smaya                  struct v3d_prog_data *prog_data)
836ed98bd31Smaya{
837ed98bd31Smaya        prog_data->threads = c->threads;
838ed98bd31Smaya        prog_data->single_seg = !c->last_thrsw;
839ed98bd31Smaya        prog_data->spill_size = c->spill_size;
8407ec681f3Smrg        prog_data->tmu_dirty_rcl = c->tmu_dirty_rcl;
8417ec681f3Smrg        prog_data->has_control_barrier = c->s->info.uses_control_barrier;
842ed98bd31Smaya
843ed98bd31Smaya        v3d_set_prog_data_uniforms(c, prog_data);
844ed98bd31Smaya
8457ec681f3Smrg        switch (c->s->info.stage) {
8467ec681f3Smrg        case MESA_SHADER_VERTEX:
847ed98bd31Smaya                v3d_vs_set_prog_data(c, (struct v3d_vs_prog_data *)prog_data);
8487ec681f3Smrg                break;
8497ec681f3Smrg        case MESA_SHADER_GEOMETRY:
8507ec681f3Smrg                v3d_gs_set_prog_data(c, (struct v3d_gs_prog_data *)prog_data);
8517ec681f3Smrg                break;
8527ec681f3Smrg        case MESA_SHADER_FRAGMENT:
853ed98bd31Smaya                v3d_fs_set_prog_data(c, (struct v3d_fs_prog_data *)prog_data);
8547ec681f3Smrg                break;
8557ec681f3Smrg        case MESA_SHADER_COMPUTE:
8567ec681f3Smrg                v3d_cs_set_prog_data(c, (struct v3d_compute_prog_data *)prog_data);
8577ec681f3Smrg                break;
8587ec681f3Smrg        default:
8597ec681f3Smrg                unreachable("unsupported shader stage");
860ed98bd31Smaya        }
861ed98bd31Smaya}
862ed98bd31Smaya
863ed98bd31Smayastatic uint64_t *
864ed98bd31Smayav3d_return_qpu_insts(struct v3d_compile *c, uint32_t *final_assembly_size)
865ed98bd31Smaya{
866ed98bd31Smaya        *final_assembly_size = c->qpu_inst_count * sizeof(uint64_t);
867ed98bd31Smaya
868ed98bd31Smaya        uint64_t *qpu_insts = malloc(*final_assembly_size);
869ed98bd31Smaya        if (!qpu_insts)
870ed98bd31Smaya                return NULL;
871ed98bd31Smaya
872ed98bd31Smaya        memcpy(qpu_insts, c->qpu_insts, *final_assembly_size);
873ed98bd31Smaya
874ed98bd31Smaya        vir_compile_destroy(c);
875ed98bd31Smaya
876ed98bd31Smaya        return qpu_insts;
877ed98bd31Smaya}
878ed98bd31Smaya
879ed98bd31Smayastatic void
880ed98bd31Smayav3d_nir_lower_vs_early(struct v3d_compile *c)
881ed98bd31Smaya{
882ed98bd31Smaya        /* Split our I/O vars and dead code eliminate the unused
883ed98bd31Smaya         * components.
884ed98bd31Smaya         */
885ed98bd31Smaya        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
886ed98bd31Smaya                   nir_var_shader_in | nir_var_shader_out);
887ed98bd31Smaya        uint64_t used_outputs[4] = {0};
8887ec681f3Smrg        for (int i = 0; i < c->vs_key->num_used_outputs; i++) {
8897ec681f3Smrg                int slot = v3d_slot_get_slot(c->vs_key->used_outputs[i]);
8907ec681f3Smrg                int comp = v3d_slot_get_component(c->vs_key->used_outputs[i]);
891ed98bd31Smaya                used_outputs[comp] |= 1ull << slot;
892ed98bd31Smaya        }
893ed98bd31Smaya        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
8947ec681f3Smrg                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
895ed98bd31Smaya        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
8967ec681f3Smrg        v3d_optimize_nir(c, c->s);
8977ec681f3Smrg        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
8987ec681f3Smrg
8997ec681f3Smrg        /* This must go before nir_lower_io */
9007ec681f3Smrg        if (c->vs_key->per_vertex_point_size)
9017ec681f3Smrg                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
9027ec681f3Smrg
903ed98bd31Smaya        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
904ed98bd31Smaya                   type_size_vec4,
905ed98bd31Smaya                   (nir_lower_io_options)0);
9067ec681f3Smrg        /* clean up nir_lower_io's deref_var remains and do a constant folding pass
9077ec681f3Smrg         * on the code it generated.
9087ec681f3Smrg         */
9097ec681f3Smrg        NIR_PASS_V(c->s, nir_opt_dce);
9107ec681f3Smrg        NIR_PASS_V(c->s, nir_opt_constant_folding);
9117ec681f3Smrg}
9127ec681f3Smrg
9137ec681f3Smrgstatic void
9147ec681f3Smrgv3d_nir_lower_gs_early(struct v3d_compile *c)
9157ec681f3Smrg{
9167ec681f3Smrg        /* Split our I/O vars and dead code eliminate the unused
9177ec681f3Smrg         * components.
9187ec681f3Smrg         */
9197ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_io_to_scalar_early,
9207ec681f3Smrg                   nir_var_shader_in | nir_var_shader_out);
9217ec681f3Smrg        uint64_t used_outputs[4] = {0};
9227ec681f3Smrg        for (int i = 0; i < c->gs_key->num_used_outputs; i++) {
9237ec681f3Smrg                int slot = v3d_slot_get_slot(c->gs_key->used_outputs[i]);
9247ec681f3Smrg                int comp = v3d_slot_get_component(c->gs_key->used_outputs[i]);
9257ec681f3Smrg                used_outputs[comp] |= 1ull << slot;
9267ec681f3Smrg        }
9277ec681f3Smrg        NIR_PASS_V(c->s, nir_remove_unused_io_vars,
9287ec681f3Smrg                   nir_var_shader_out, used_outputs, NULL); /* demotes to globals */
9297ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
9307ec681f3Smrg        v3d_optimize_nir(c, c->s);
9317ec681f3Smrg        NIR_PASS_V(c->s, nir_remove_dead_variables, nir_var_shader_in, NULL);
9327ec681f3Smrg
9337ec681f3Smrg        /* This must go before nir_lower_io */
9347ec681f3Smrg        if (c->gs_key->per_vertex_point_size)
9357ec681f3Smrg                NIR_PASS_V(c->s, nir_lower_point_size, 1.0f, 0.0f);
9367ec681f3Smrg
9377ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_io, nir_var_shader_in | nir_var_shader_out,
9387ec681f3Smrg                   type_size_vec4,
9397ec681f3Smrg                   (nir_lower_io_options)0);
9407ec681f3Smrg        /* clean up nir_lower_io's deref_var remains */
9417ec681f3Smrg        NIR_PASS_V(c->s, nir_opt_dce);
942ed98bd31Smaya}
943ed98bd31Smaya
94401e04c3fSmrgstatic void
94501e04c3fSmrgv3d_fixup_fs_output_types(struct v3d_compile *c)
94601e04c3fSmrg{
9477ec681f3Smrg        nir_foreach_shader_out_variable(var, c->s) {
94801e04c3fSmrg                uint32_t mask = 0;
94901e04c3fSmrg
95001e04c3fSmrg                switch (var->data.location) {
95101e04c3fSmrg                case FRAG_RESULT_COLOR:
95201e04c3fSmrg                        mask = ~0;
95301e04c3fSmrg                        break;
95401e04c3fSmrg                case FRAG_RESULT_DATA0:
95501e04c3fSmrg                case FRAG_RESULT_DATA1:
95601e04c3fSmrg                case FRAG_RESULT_DATA2:
95701e04c3fSmrg                case FRAG_RESULT_DATA3:
95801e04c3fSmrg                        mask = 1 << (var->data.location - FRAG_RESULT_DATA0);
95901e04c3fSmrg                        break;
96001e04c3fSmrg                }
96101e04c3fSmrg
96201e04c3fSmrg                if (c->fs_key->int_color_rb & mask) {
96301e04c3fSmrg                        var->type =
96401e04c3fSmrg                                glsl_vector_type(GLSL_TYPE_INT,
96501e04c3fSmrg                                                 glsl_get_components(var->type));
96601e04c3fSmrg                } else if (c->fs_key->uint_color_rb & mask) {
96701e04c3fSmrg                        var->type =
96801e04c3fSmrg                                glsl_vector_type(GLSL_TYPE_UINT,
96901e04c3fSmrg                                                 glsl_get_components(var->type));
97001e04c3fSmrg                }
97101e04c3fSmrg        }
97201e04c3fSmrg}
97301e04c3fSmrg
974ed98bd31Smayastatic void
975ed98bd31Smayav3d_nir_lower_fs_early(struct v3d_compile *c)
97601e04c3fSmrg{
977ed98bd31Smaya        if (c->fs_key->int_color_rb || c->fs_key->uint_color_rb)
978ed98bd31Smaya                v3d_fixup_fs_output_types(c);
97901e04c3fSmrg
9807ec681f3Smrg        NIR_PASS_V(c->s, v3d_nir_lower_logic_ops, c);
9817ec681f3Smrg
9827ec681f3Smrg        if (c->fs_key->line_smoothing) {
9837ec681f3Smrg                v3d_nir_lower_line_smooth(c->s);
9847ec681f3Smrg                NIR_PASS_V(c->s, nir_lower_global_vars_to_local);
9857ec681f3Smrg                /* The lowering pass can introduce new sysval reads */
9867ec681f3Smrg                nir_shader_gather_info(c->s, nir_shader_get_entrypoint(c->s));
987ed98bd31Smaya        }
988ed98bd31Smaya}
98901e04c3fSmrg
990ed98bd31Smayastatic void
9917ec681f3Smrgv3d_nir_lower_gs_late(struct v3d_compile *c)
992ed98bd31Smaya{
9937ec681f3Smrg        if (c->key->ucp_enables) {
9947ec681f3Smrg                NIR_PASS_V(c->s, nir_lower_clip_gs, c->key->ucp_enables,
9957ec681f3Smrg                           false, NULL);
9967ec681f3Smrg        }
9977ec681f3Smrg
9987ec681f3Smrg        /* Note: GS output scalarizing must happen after nir_lower_clip_gs. */
9997ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
10007ec681f3Smrg}
100101e04c3fSmrg
10027ec681f3Smrgstatic void
10037ec681f3Smrgv3d_nir_lower_vs_late(struct v3d_compile *c)
10047ec681f3Smrg{
1005ed98bd31Smaya        if (c->key->ucp_enables) {
1006ed98bd31Smaya                NIR_PASS_V(c->s, nir_lower_clip_vs, c->key->ucp_enables,
10077ec681f3Smrg                           false, false, NULL);
1008ed98bd31Smaya                NIR_PASS_V(c->s, nir_lower_io_to_scalar,
1009ed98bd31Smaya                           nir_var_shader_out);
1010ed98bd31Smaya        }
1011ed98bd31Smaya
1012ed98bd31Smaya        /* Note: VS output scalarizing must happen after nir_lower_clip_vs. */
1013ed98bd31Smaya        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_out);
1014ed98bd31Smaya}
101501e04c3fSmrg
1016ed98bd31Smayastatic void
1017ed98bd31Smayav3d_nir_lower_fs_late(struct v3d_compile *c)
1018ed98bd31Smaya{
10197ec681f3Smrg        /* In OpenGL the fragment shader can't read gl_ClipDistance[], but
10207ec681f3Smrg         * Vulkan allows it, in which case the SPIR-V compiler will declare
10217ec681f3Smrg         * VARING_SLOT_CLIP_DIST0 as compact array variable. Pass true as
10227ec681f3Smrg         * the last parameter to always operate with a compact array in both
10237ec681f3Smrg         * OpenGL and Vulkan so we do't have to care about the API we
10247ec681f3Smrg         * are using.
10257ec681f3Smrg         */
1026ed98bd31Smaya        if (c->key->ucp_enables)
10277ec681f3Smrg                NIR_PASS_V(c->s, nir_lower_clip_fs, c->key->ucp_enables, true);
102801e04c3fSmrg
102901e04c3fSmrg        NIR_PASS_V(c->s, nir_lower_io_to_scalar, nir_var_shader_in);
1030ed98bd31Smaya}
1031ed98bd31Smaya
1032ed98bd31Smayastatic uint32_t
1033ed98bd31Smayavir_get_max_temps(struct v3d_compile *c)
1034ed98bd31Smaya{
1035ed98bd31Smaya        int max_ip = 0;
1036ed98bd31Smaya        vir_for_each_inst_inorder(inst, c)
1037ed98bd31Smaya                max_ip++;
1038ed98bd31Smaya
1039ed98bd31Smaya        uint32_t *pressure = rzalloc_array(NULL, uint32_t, max_ip);
1040ed98bd31Smaya
1041ed98bd31Smaya        for (int t = 0; t < c->num_temps; t++) {
1042ed98bd31Smaya                for (int i = c->temp_start[t]; (i < c->temp_end[t] &&
1043ed98bd31Smaya                                                i < max_ip); i++) {
1044ed98bd31Smaya                        if (i > max_ip)
1045ed98bd31Smaya                                break;
1046ed98bd31Smaya                        pressure[i]++;
1047ed98bd31Smaya                }
1048ed98bd31Smaya        }
1049ed98bd31Smaya
1050ed98bd31Smaya        uint32_t max_temps = 0;
1051ed98bd31Smaya        for (int i = 0; i < max_ip; i++)
1052ed98bd31Smaya                max_temps = MAX2(max_temps, pressure[i]);
1053ed98bd31Smaya
1054ed98bd31Smaya        ralloc_free(pressure);
1055ed98bd31Smaya
1056ed98bd31Smaya        return max_temps;
1057ed98bd31Smaya}
1058ed98bd31Smaya
10597ec681f3Smrgenum v3d_dependency_class {
10607ec681f3Smrg        V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0
10617ec681f3Smrg};
10627ec681f3Smrg
10637ec681f3Smrgstatic bool
10647ec681f3Smrgv3d_intrinsic_dependency_cb(nir_intrinsic_instr *intr,
10657ec681f3Smrg                            nir_schedule_dependency *dep,
10667ec681f3Smrg                            void *user_data)
1067ed98bd31Smaya{
10687ec681f3Smrg        struct v3d_compile *c = user_data;
10697ec681f3Smrg
10707ec681f3Smrg        switch (intr->intrinsic) {
10717ec681f3Smrg        case nir_intrinsic_store_output:
10727ec681f3Smrg                /* Writing to location 0 overwrites the value passed in for
10737ec681f3Smrg                 * gl_PrimitiveID on geometry shaders
10747ec681f3Smrg                 */
10757ec681f3Smrg                if (c->s->info.stage != MESA_SHADER_GEOMETRY ||
10767ec681f3Smrg                    nir_intrinsic_base(intr) != 0)
10777ec681f3Smrg                        break;
10787ec681f3Smrg
10797ec681f3Smrg                nir_const_value *const_value =
10807ec681f3Smrg                        nir_src_as_const_value(intr->src[1]);
10817ec681f3Smrg
10827ec681f3Smrg                if (const_value == NULL)
10837ec681f3Smrg                        break;
10847ec681f3Smrg
10857ec681f3Smrg                uint64_t offset =
10867ec681f3Smrg                        nir_const_value_as_uint(*const_value,
10877ec681f3Smrg                                                nir_src_bit_size(intr->src[1]));
10887ec681f3Smrg                if (offset != 0)
10897ec681f3Smrg                        break;
10907ec681f3Smrg
10917ec681f3Smrg                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
10927ec681f3Smrg                dep->type = NIR_SCHEDULE_WRITE_DEPENDENCY;
10937ec681f3Smrg                return true;
10947ec681f3Smrg
10957ec681f3Smrg        case nir_intrinsic_load_primitive_id:
10967ec681f3Smrg                if (c->s->info.stage != MESA_SHADER_GEOMETRY)
10977ec681f3Smrg                        break;
10987ec681f3Smrg
10997ec681f3Smrg                dep->klass = V3D_DEPENDENCY_CLASS_GS_VPM_OUTPUT_0;
11007ec681f3Smrg                dep->type = NIR_SCHEDULE_READ_DEPENDENCY;
11017ec681f3Smrg                return true;
11027ec681f3Smrg
11037ec681f3Smrg        default:
11047ec681f3Smrg                break;
11057ec681f3Smrg        }
11067ec681f3Smrg
11077ec681f3Smrg        return false;
11087ec681f3Smrg}
11097ec681f3Smrg
11107ec681f3Smrgstatic bool
11117ec681f3Smrgshould_split_wrmask(const nir_instr *instr, const void *data)
11127ec681f3Smrg{
11137ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
11147ec681f3Smrg        switch (intr->intrinsic) {
11157ec681f3Smrg        case nir_intrinsic_store_ssbo:
11167ec681f3Smrg        case nir_intrinsic_store_shared:
11177ec681f3Smrg        case nir_intrinsic_store_global:
11187ec681f3Smrg        case nir_intrinsic_store_scratch:
11197ec681f3Smrg                return true;
11207ec681f3Smrg        default:
11217ec681f3Smrg                return false;
11227ec681f3Smrg        }
11237ec681f3Smrg}
11247ec681f3Smrg
11257ec681f3Smrgstatic nir_intrinsic_instr *
11267ec681f3Smrgnir_instr_as_constant_ubo_load(nir_instr *inst)
11277ec681f3Smrg{
11287ec681f3Smrg        if (inst->type != nir_instr_type_intrinsic)
11297ec681f3Smrg                return NULL;
11307ec681f3Smrg
11317ec681f3Smrg        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
11327ec681f3Smrg        if (intr->intrinsic != nir_intrinsic_load_ubo)
11337ec681f3Smrg                return NULL;
11347ec681f3Smrg
11357ec681f3Smrg        assert(nir_src_is_const(intr->src[0]));
11367ec681f3Smrg        if (!nir_src_is_const(intr->src[1]))
11377ec681f3Smrg                return NULL;
11387ec681f3Smrg
11397ec681f3Smrg        return intr;
11407ec681f3Smrg}
11417ec681f3Smrg
11427ec681f3Smrgstatic bool
11437ec681f3Smrgv3d_nir_sort_constant_ubo_load(nir_block *block, nir_intrinsic_instr *ref)
11447ec681f3Smrg{
11457ec681f3Smrg        bool progress = false;
11467ec681f3Smrg
11477ec681f3Smrg        nir_instr *ref_inst = &ref->instr;
11487ec681f3Smrg        uint32_t ref_offset = nir_src_as_uint(ref->src[1]);
11497ec681f3Smrg        uint32_t ref_index = nir_src_as_uint(ref->src[0]);
11507ec681f3Smrg
11517ec681f3Smrg        /* Go through all instructions after ref searching for constant UBO
11527ec681f3Smrg         * loads for the same UBO index.
11537ec681f3Smrg         */
11547ec681f3Smrg        bool seq_break = false;
11557ec681f3Smrg        nir_instr *inst = &ref->instr;
11567ec681f3Smrg        nir_instr *next_inst = NULL;
11577ec681f3Smrg        while (true) {
11587ec681f3Smrg                inst = next_inst ? next_inst : nir_instr_next(inst);
11597ec681f3Smrg                if (!inst)
11607ec681f3Smrg                        break;
11617ec681f3Smrg
11627ec681f3Smrg                next_inst = NULL;
11637ec681f3Smrg
11647ec681f3Smrg                if (inst->type != nir_instr_type_intrinsic)
11657ec681f3Smrg                        continue;
11667ec681f3Smrg
11677ec681f3Smrg                nir_intrinsic_instr *intr = nir_instr_as_intrinsic(inst);
11687ec681f3Smrg                if (intr->intrinsic != nir_intrinsic_load_ubo)
11697ec681f3Smrg                        continue;
11707ec681f3Smrg
11717ec681f3Smrg                /* We only produce unifa sequences for non-divergent loads */
11727ec681f3Smrg                if (nir_src_is_divergent(intr->src[1]))
11737ec681f3Smrg                        continue;
11747ec681f3Smrg
11757ec681f3Smrg                /* If there are any UBO loads that are not constant or that
11767ec681f3Smrg                 * use a different UBO index in between the reference load and
11777ec681f3Smrg                 * any other constant load for the same index, they would break
11787ec681f3Smrg                 * the unifa sequence. We will flag that so we can then move
11797ec681f3Smrg                 * all constant UBO loads for the reference index before these
11807ec681f3Smrg                 * and not just the ones that are not ordered to avoid breaking
11817ec681f3Smrg                 * the sequence and reduce unifa writes.
11827ec681f3Smrg                 */
11837ec681f3Smrg                if (!nir_src_is_const(intr->src[1])) {
11847ec681f3Smrg                        seq_break = true;
11857ec681f3Smrg                        continue;
11867ec681f3Smrg                }
11877ec681f3Smrg                uint32_t offset = nir_src_as_uint(intr->src[1]);
11887ec681f3Smrg
11897ec681f3Smrg                assert(nir_src_is_const(intr->src[0]));
11907ec681f3Smrg                uint32_t index = nir_src_as_uint(intr->src[0]);
11917ec681f3Smrg                if (index != ref_index) {
11927ec681f3Smrg                       seq_break = true;
11937ec681f3Smrg                       continue;
11947ec681f3Smrg                }
11957ec681f3Smrg
11967ec681f3Smrg                /* Only move loads with an offset that is close enough to the
11977ec681f3Smrg                 * reference offset, since otherwise we would not be able to
11987ec681f3Smrg                 * skip the unifa write for them. See ntq_emit_load_ubo_unifa.
11997ec681f3Smrg                 */
12007ec681f3Smrg                if (abs(ref_offset - offset) > MAX_UNIFA_SKIP_DISTANCE)
12017ec681f3Smrg                        continue;
12027ec681f3Smrg
12037ec681f3Smrg                /* We will move this load if its offset is smaller than ref's
12047ec681f3Smrg                 * (in which case we will move it before ref) or if the offset
12057ec681f3Smrg                 * is larger than ref's but there are sequence breakers in
12067ec681f3Smrg                 * in between (in which case we will move it after ref and
12077ec681f3Smrg                 * before the sequence breakers).
12087ec681f3Smrg                 */
12097ec681f3Smrg                if (!seq_break && offset >= ref_offset)
12107ec681f3Smrg                        continue;
12117ec681f3Smrg
12127ec681f3Smrg                /* Find where exactly we want to move this load:
12137ec681f3Smrg                 *
12147ec681f3Smrg                 * If we are moving it before ref, we want to check any other
12157ec681f3Smrg                 * UBO loads we placed before ref and make sure we insert this
12167ec681f3Smrg                 * one properly ordered with them. Likewise, if we are moving
12177ec681f3Smrg                 * it after ref.
12187ec681f3Smrg                 */
12197ec681f3Smrg                nir_instr *pos = ref_inst;
12207ec681f3Smrg                nir_instr *tmp = pos;
12217ec681f3Smrg                do {
12227ec681f3Smrg                        if (offset < ref_offset)
12237ec681f3Smrg                                tmp = nir_instr_prev(tmp);
12247ec681f3Smrg                        else
12257ec681f3Smrg                                tmp = nir_instr_next(tmp);
12267ec681f3Smrg
12277ec681f3Smrg                        if (!tmp || tmp == inst)
12287ec681f3Smrg                                break;
12297ec681f3Smrg
12307ec681f3Smrg                        /* Ignore non-unifa UBO loads */
12317ec681f3Smrg                        if (tmp->type != nir_instr_type_intrinsic)
12327ec681f3Smrg                                continue;
12337ec681f3Smrg
12347ec681f3Smrg                        nir_intrinsic_instr *tmp_intr =
12357ec681f3Smrg                                nir_instr_as_intrinsic(tmp);
12367ec681f3Smrg                        if (tmp_intr->intrinsic != nir_intrinsic_load_ubo)
12377ec681f3Smrg                                continue;
1238ed98bd31Smaya
12397ec681f3Smrg                        if (nir_src_is_divergent(tmp_intr->src[1]))
12407ec681f3Smrg                                continue;
12417ec681f3Smrg
12427ec681f3Smrg                        /* Stop if we find a unifa UBO load that breaks the
12437ec681f3Smrg                         * sequence.
12447ec681f3Smrg                         */
12457ec681f3Smrg                        if (!nir_src_is_const(tmp_intr->src[1]))
12467ec681f3Smrg                                break;
12477ec681f3Smrg
12487ec681f3Smrg                        if (nir_src_as_uint(tmp_intr->src[0]) != index)
12497ec681f3Smrg                                break;
12507ec681f3Smrg
12517ec681f3Smrg                        uint32_t tmp_offset = nir_src_as_uint(tmp_intr->src[1]);
12527ec681f3Smrg                        if (offset < ref_offset) {
12537ec681f3Smrg                                if (tmp_offset < offset ||
12547ec681f3Smrg                                    tmp_offset >= ref_offset) {
12557ec681f3Smrg                                        break;
12567ec681f3Smrg                                } else {
12577ec681f3Smrg                                        pos = tmp;
12587ec681f3Smrg                                }
12597ec681f3Smrg                        } else {
12607ec681f3Smrg                                if (tmp_offset > offset ||
12617ec681f3Smrg                                    tmp_offset <= ref_offset) {
12627ec681f3Smrg                                        break;
12637ec681f3Smrg                                } else {
12647ec681f3Smrg                                        pos = tmp;
12657ec681f3Smrg                                }
12667ec681f3Smrg                        }
12677ec681f3Smrg                } while (true);
12687ec681f3Smrg
12697ec681f3Smrg                /* We can't move the UBO load before the instruction that
12707ec681f3Smrg                 * defines its constant offset. If that instruction is placed
12717ec681f3Smrg                 * in between the new location (pos) and the current location
12727ec681f3Smrg                 * of this load, we will have to move that instruction too.
12737ec681f3Smrg                 *
12747ec681f3Smrg                 * We don't care about the UBO index definition because that
12757ec681f3Smrg                 * is optimized to be reused by all UBO loads for the same
12767ec681f3Smrg                 * index and therefore is certain to be defined before the
12777ec681f3Smrg                 * first UBO load that uses it.
12787ec681f3Smrg                 */
12797ec681f3Smrg                nir_instr *offset_inst = NULL;
12807ec681f3Smrg                tmp = inst;
12817ec681f3Smrg                while ((tmp = nir_instr_prev(tmp)) != NULL) {
12827ec681f3Smrg                        if (pos == tmp) {
12837ec681f3Smrg                                /* We reached the target location without
12847ec681f3Smrg                                 * finding the instruction that defines the
12857ec681f3Smrg                                 * offset, so that instruction must be before
12867ec681f3Smrg                                 * the new position and we don't have to fix it.
12877ec681f3Smrg                                 */
12887ec681f3Smrg                                break;
12897ec681f3Smrg                        }
12907ec681f3Smrg                        if (intr->src[1].ssa->parent_instr == tmp) {
12917ec681f3Smrg                                offset_inst = tmp;
12927ec681f3Smrg                                break;
12937ec681f3Smrg                        }
12947ec681f3Smrg                }
12957ec681f3Smrg
12967ec681f3Smrg                if (offset_inst) {
12977ec681f3Smrg                        exec_node_remove(&offset_inst->node);
12987ec681f3Smrg                        exec_node_insert_node_before(&pos->node,
12997ec681f3Smrg                                                     &offset_inst->node);
13007ec681f3Smrg                }
13017ec681f3Smrg
13027ec681f3Smrg                /* Since we are moving the instruction before its current
13037ec681f3Smrg                 * location, grab its successor before the move so that
13047ec681f3Smrg                 * we can continue the next iteration of the main loop from
13057ec681f3Smrg                 * that instruction.
13067ec681f3Smrg                 */
13077ec681f3Smrg                next_inst = nir_instr_next(inst);
13087ec681f3Smrg
13097ec681f3Smrg                /* Move this load to the selected location */
13107ec681f3Smrg                exec_node_remove(&inst->node);
13117ec681f3Smrg                if (offset < ref_offset)
13127ec681f3Smrg                        exec_node_insert_node_before(&pos->node, &inst->node);
13137ec681f3Smrg                else
13147ec681f3Smrg                        exec_node_insert_after(&pos->node, &inst->node);
13157ec681f3Smrg
13167ec681f3Smrg                progress = true;
13177ec681f3Smrg        }
13187ec681f3Smrg
13197ec681f3Smrg        return progress;
13207ec681f3Smrg}
13217ec681f3Smrg
13227ec681f3Smrgstatic bool
13237ec681f3Smrgv3d_nir_sort_constant_ubo_loads_block(struct v3d_compile *c,
13247ec681f3Smrg                                      nir_block *block)
13257ec681f3Smrg{
13267ec681f3Smrg        bool progress = false;
13277ec681f3Smrg        bool local_progress;
13287ec681f3Smrg        do {
13297ec681f3Smrg                local_progress = false;
13307ec681f3Smrg                nir_foreach_instr_safe(inst, block) {
13317ec681f3Smrg                        nir_intrinsic_instr *intr =
13327ec681f3Smrg                                nir_instr_as_constant_ubo_load(inst);
13337ec681f3Smrg                        if (intr) {
13347ec681f3Smrg                                local_progress |=
13357ec681f3Smrg                                        v3d_nir_sort_constant_ubo_load(block, intr);
13367ec681f3Smrg                        }
13377ec681f3Smrg                }
13387ec681f3Smrg                progress |= local_progress;
13397ec681f3Smrg        } while (local_progress);
13407ec681f3Smrg
13417ec681f3Smrg        return progress;
13427ec681f3Smrg}
13437ec681f3Smrg
13447ec681f3Smrg/**
13457ec681f3Smrg * Sorts constant UBO loads in each block by offset to maximize chances of
13467ec681f3Smrg * skipping unifa writes when converting to VIR. This can increase register
13477ec681f3Smrg * pressure.
13487ec681f3Smrg */
13497ec681f3Smrgstatic bool
13507ec681f3Smrgv3d_nir_sort_constant_ubo_loads(nir_shader *s, struct v3d_compile *c)
13517ec681f3Smrg{
13527ec681f3Smrg        nir_foreach_function(function, s) {
13537ec681f3Smrg                if (function->impl) {
13547ec681f3Smrg                        nir_foreach_block(block, function->impl) {
13557ec681f3Smrg                                c->sorted_any_ubo_loads |=
13567ec681f3Smrg                                        v3d_nir_sort_constant_ubo_loads_block(c, block);
13577ec681f3Smrg                        }
13587ec681f3Smrg                        nir_metadata_preserve(function->impl,
13597ec681f3Smrg                                              nir_metadata_block_index |
13607ec681f3Smrg                                              nir_metadata_dominance);
13617ec681f3Smrg                }
13627ec681f3Smrg        }
13637ec681f3Smrg        return c->sorted_any_ubo_loads;
13647ec681f3Smrg}
13657ec681f3Smrg
13667ec681f3Smrgstatic void
13677ec681f3Smrglower_load_num_subgroups(struct v3d_compile *c,
13687ec681f3Smrg                         nir_builder *b,
13697ec681f3Smrg                         nir_intrinsic_instr *intr)
13707ec681f3Smrg{
13717ec681f3Smrg        assert(c->s->info.stage == MESA_SHADER_COMPUTE);
13727ec681f3Smrg        assert(intr->intrinsic == nir_intrinsic_load_num_subgroups);
13737ec681f3Smrg
13747ec681f3Smrg        b->cursor = nir_after_instr(&intr->instr);
13757ec681f3Smrg        uint32_t num_subgroups =
13767ec681f3Smrg                DIV_ROUND_UP(c->s->info.workgroup_size[0] *
13777ec681f3Smrg                             c->s->info.workgroup_size[1] *
13787ec681f3Smrg                             c->s->info.workgroup_size[2], V3D_CHANNELS);
13797ec681f3Smrg        nir_ssa_def *result = nir_imm_int(b, num_subgroups);
13807ec681f3Smrg        nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
13817ec681f3Smrg        nir_instr_remove(&intr->instr);
13827ec681f3Smrg}
13837ec681f3Smrg
13847ec681f3Smrgstatic bool
13857ec681f3Smrglower_subgroup_intrinsics(struct v3d_compile *c,
13867ec681f3Smrg                          nir_block *block, nir_builder *b)
13877ec681f3Smrg{
13887ec681f3Smrg        bool progress = false;
13897ec681f3Smrg        nir_foreach_instr_safe(inst, block) {
13907ec681f3Smrg                if (inst->type != nir_instr_type_intrinsic)
13917ec681f3Smrg                        continue;;
13927ec681f3Smrg
13937ec681f3Smrg                nir_intrinsic_instr *intr =
13947ec681f3Smrg                        nir_instr_as_intrinsic(inst);
13957ec681f3Smrg                if (!intr)
13967ec681f3Smrg                        continue;
13977ec681f3Smrg
13987ec681f3Smrg                switch (intr->intrinsic) {
13997ec681f3Smrg                case nir_intrinsic_load_num_subgroups:
14007ec681f3Smrg                        lower_load_num_subgroups(c, b, intr);
14017ec681f3Smrg                        progress = true;
14027ec681f3Smrg                        FALLTHROUGH;
14037ec681f3Smrg                case nir_intrinsic_load_subgroup_id:
14047ec681f3Smrg                case nir_intrinsic_load_subgroup_size:
14057ec681f3Smrg                case nir_intrinsic_load_subgroup_invocation:
14067ec681f3Smrg                case nir_intrinsic_elect:
14077ec681f3Smrg                        c->has_subgroups = true;
14087ec681f3Smrg                        break;
14097ec681f3Smrg                default:
14107ec681f3Smrg                        break;
14117ec681f3Smrg                }
14127ec681f3Smrg        }
14137ec681f3Smrg
14147ec681f3Smrg        return progress;
14157ec681f3Smrg}
14167ec681f3Smrg
14177ec681f3Smrgstatic bool
14187ec681f3Smrgv3d_nir_lower_subgroup_intrinsics(nir_shader *s, struct v3d_compile *c)
14197ec681f3Smrg{
14207ec681f3Smrg        bool progress = false;
14217ec681f3Smrg        nir_foreach_function(function, s) {
14227ec681f3Smrg                if (function->impl) {
14237ec681f3Smrg                        nir_builder b;
14247ec681f3Smrg                        nir_builder_init(&b, function->impl);
14257ec681f3Smrg
14267ec681f3Smrg                        nir_foreach_block(block, function->impl)
14277ec681f3Smrg                                progress |= lower_subgroup_intrinsics(c, block, &b);
14287ec681f3Smrg
14297ec681f3Smrg                        nir_metadata_preserve(function->impl,
14307ec681f3Smrg                                              nir_metadata_block_index |
14317ec681f3Smrg                                              nir_metadata_dominance);
14327ec681f3Smrg                }
14337ec681f3Smrg        }
14347ec681f3Smrg        return progress;
14357ec681f3Smrg}
14367ec681f3Smrg
14377ec681f3Smrgstatic void
14387ec681f3Smrgv3d_attempt_compile(struct v3d_compile *c)
14397ec681f3Smrg{
1440ed98bd31Smaya        switch (c->s->info.stage) {
1441ed98bd31Smaya        case MESA_SHADER_VERTEX:
14427ec681f3Smrg                c->vs_key = (struct v3d_vs_key *) c->key;
14437ec681f3Smrg                break;
14447ec681f3Smrg        case MESA_SHADER_GEOMETRY:
14457ec681f3Smrg                c->gs_key = (struct v3d_gs_key *) c->key;
1446ed98bd31Smaya                break;
1447ed98bd31Smaya        case MESA_SHADER_FRAGMENT:
14487ec681f3Smrg                c->fs_key = (struct v3d_fs_key *) c->key;
1449ed98bd31Smaya                break;
1450ed98bd31Smaya        case MESA_SHADER_COMPUTE:
1451ed98bd31Smaya                break;
1452ed98bd31Smaya        default:
1453ed98bd31Smaya                unreachable("unsupported shader stage");
1454ed98bd31Smaya        }
1455ed98bd31Smaya
14567ec681f3Smrg        switch (c->s->info.stage) {
14577ec681f3Smrg        case MESA_SHADER_VERTEX:
1458ed98bd31Smaya                v3d_nir_lower_vs_early(c);
14597ec681f3Smrg                break;
14607ec681f3Smrg        case MESA_SHADER_GEOMETRY:
14617ec681f3Smrg                v3d_nir_lower_gs_early(c);
14627ec681f3Smrg                break;
14637ec681f3Smrg        case MESA_SHADER_FRAGMENT:
1464ed98bd31Smaya                v3d_nir_lower_fs_early(c);
14657ec681f3Smrg                break;
14667ec681f3Smrg        default:
14677ec681f3Smrg                break;
1468ed98bd31Smaya        }
1469ed98bd31Smaya
1470ed98bd31Smaya        v3d_lower_nir(c);
1471ed98bd31Smaya
14727ec681f3Smrg        switch (c->s->info.stage) {
14737ec681f3Smrg        case MESA_SHADER_VERTEX:
1474ed98bd31Smaya                v3d_nir_lower_vs_late(c);
14757ec681f3Smrg                break;
14767ec681f3Smrg        case MESA_SHADER_GEOMETRY:
14777ec681f3Smrg                v3d_nir_lower_gs_late(c);
14787ec681f3Smrg                break;
14797ec681f3Smrg        case MESA_SHADER_FRAGMENT:
1480ed98bd31Smaya                v3d_nir_lower_fs_late(c);
14817ec681f3Smrg                break;
14827ec681f3Smrg        default:
14837ec681f3Smrg                break;
1484ed98bd31Smaya        }
1485ed98bd31Smaya
1486ed98bd31Smaya        NIR_PASS_V(c->s, v3d_nir_lower_io, c);
1487ed98bd31Smaya        NIR_PASS_V(c->s, v3d_nir_lower_txf_ms, c);
1488ed98bd31Smaya        NIR_PASS_V(c->s, v3d_nir_lower_image_load_store);
14897ec681f3Smrg        nir_lower_idiv_options idiv_options = {
14907ec681f3Smrg                .imprecise_32bit_lowering = true,
14917ec681f3Smrg                .allow_fp16 = true,
14927ec681f3Smrg        };
14937ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_idiv, &idiv_options);
14947ec681f3Smrg
14957ec681f3Smrg        if (c->key->robust_buffer_access) {
14967ec681f3Smrg           /* v3d_nir_lower_robust_buffer_access assumes constant buffer
14977ec681f3Smrg            * indices on ubo/ssbo intrinsics so run copy propagation and
14987ec681f3Smrg            * constant folding passes before we run the lowering to warrant
14997ec681f3Smrg            * this. We also want to run the lowering before v3d_optimize to
15007ec681f3Smrg            * clean-up redundant get_buffer_size calls produced in the pass.
15017ec681f3Smrg            */
15027ec681f3Smrg           NIR_PASS_V(c->s, nir_copy_prop);
15037ec681f3Smrg           NIR_PASS_V(c->s, nir_opt_constant_folding);
15047ec681f3Smrg           NIR_PASS_V(c->s, v3d_nir_lower_robust_buffer_access, c);
15057ec681f3Smrg        }
15067ec681f3Smrg
15077ec681f3Smrg        NIR_PASS_V(c->s, nir_lower_wrmasks, should_split_wrmask, c->s);
15087ec681f3Smrg
15097ec681f3Smrg        NIR_PASS_V(c->s, v3d_nir_lower_subgroup_intrinsics, c);
15107ec681f3Smrg
15117ec681f3Smrg        v3d_optimize_nir(c, c->s);
15127ec681f3Smrg
15137ec681f3Smrg        /* Do late algebraic optimization to turn add(a, neg(b)) back into
15147ec681f3Smrg         * subs, then the mandatory cleanup after algebraic.  Note that it may
15157ec681f3Smrg         * produce fnegs, and if so then we need to keep running to squash
15167ec681f3Smrg         * fneg(fneg(a)).
15177ec681f3Smrg         */
15187ec681f3Smrg        bool more_late_algebraic = true;
15197ec681f3Smrg        while (more_late_algebraic) {
15207ec681f3Smrg                more_late_algebraic = false;
15217ec681f3Smrg                NIR_PASS(more_late_algebraic, c->s, nir_opt_algebraic_late);
15227ec681f3Smrg                NIR_PASS_V(c->s, nir_opt_constant_folding);
15237ec681f3Smrg                NIR_PASS_V(c->s, nir_copy_prop);
15247ec681f3Smrg                NIR_PASS_V(c->s, nir_opt_dce);
15257ec681f3Smrg                NIR_PASS_V(c->s, nir_opt_cse);
15267ec681f3Smrg        }
152701e04c3fSmrg
1528ed98bd31Smaya        NIR_PASS_V(c->s, nir_lower_bool_to_int32);
15297ec681f3Smrg        nir_convert_to_lcssa(c->s, true, true);
15307ec681f3Smrg        NIR_PASS_V(c->s, nir_divergence_analysis);
153101e04c3fSmrg        NIR_PASS_V(c->s, nir_convert_from_ssa, true);
153201e04c3fSmrg
15337ec681f3Smrg        struct nir_schedule_options schedule_options = {
15347ec681f3Smrg                /* Schedule for about half our register space, to enable more
15357ec681f3Smrg                 * shaders to hit 4 threads.
15367ec681f3Smrg                 */
15377ec681f3Smrg                .threshold = 24,
15387ec681f3Smrg
15397ec681f3Smrg                /* Vertex shaders share the same memory for inputs and outputs,
15407ec681f3Smrg                 * fragement and geometry shaders do not.
15417ec681f3Smrg                 */
15427ec681f3Smrg                .stages_with_shared_io_memory =
15437ec681f3Smrg                (((1 << MESA_ALL_SHADER_STAGES) - 1) &
15447ec681f3Smrg                 ~((1 << MESA_SHADER_FRAGMENT) |
15457ec681f3Smrg                   (1 << MESA_SHADER_GEOMETRY))),
15467ec681f3Smrg
15477ec681f3Smrg                .fallback = c->fallback_scheduler,
15487ec681f3Smrg
15497ec681f3Smrg                .intrinsic_cb = v3d_intrinsic_dependency_cb,
15507ec681f3Smrg                .intrinsic_cb_data = c,
15517ec681f3Smrg        };
15527ec681f3Smrg        NIR_PASS_V(c->s, nir_schedule, &schedule_options);
15537ec681f3Smrg
15547ec681f3Smrg        if (!c->disable_constant_ubo_load_sorting)
15557ec681f3Smrg                NIR_PASS_V(c->s, v3d_nir_sort_constant_ubo_loads, c);
15567ec681f3Smrg
155701e04c3fSmrg        v3d_nir_to_vir(c);
15587ec681f3Smrg}
15597ec681f3Smrg
15607ec681f3Smrguint32_t
15617ec681f3Smrgv3d_prog_data_size(gl_shader_stage stage)
15627ec681f3Smrg{
15637ec681f3Smrg        static const int prog_data_size[] = {
15647ec681f3Smrg                [MESA_SHADER_VERTEX] = sizeof(struct v3d_vs_prog_data),
15657ec681f3Smrg                [MESA_SHADER_GEOMETRY] = sizeof(struct v3d_gs_prog_data),
15667ec681f3Smrg                [MESA_SHADER_FRAGMENT] = sizeof(struct v3d_fs_prog_data),
15677ec681f3Smrg                [MESA_SHADER_COMPUTE] = sizeof(struct v3d_compute_prog_data),
15687ec681f3Smrg        };
15697ec681f3Smrg
15707ec681f3Smrg        assert(stage >= 0 &&
15717ec681f3Smrg               stage < ARRAY_SIZE(prog_data_size) &&
15727ec681f3Smrg               prog_data_size[stage]);
15737ec681f3Smrg
15747ec681f3Smrg        return prog_data_size[stage];
15757ec681f3Smrg}
15767ec681f3Smrg
15777ec681f3Smrgint v3d_shaderdb_dump(struct v3d_compile *c,
15787ec681f3Smrg		      char **shaderdb_str)
15797ec681f3Smrg{
15807ec681f3Smrg        if (c == NULL || c->compilation_result != V3D_COMPILATION_SUCCEEDED)
15817ec681f3Smrg                return -1;
15827ec681f3Smrg
15837ec681f3Smrg        return asprintf(shaderdb_str,
15847ec681f3Smrg                        "%s shader: %d inst, %d threads, %d loops, "
15857ec681f3Smrg                        "%d uniforms, %d max-temps, %d:%d spills:fills, "
15867ec681f3Smrg                        "%d sfu-stalls, %d inst-and-stalls, %d nops",
15877ec681f3Smrg                        vir_get_stage_name(c),
15887ec681f3Smrg                        c->qpu_inst_count,
15897ec681f3Smrg                        c->threads,
15907ec681f3Smrg                        c->loops,
15917ec681f3Smrg                        c->num_uniforms,
15927ec681f3Smrg                        vir_get_max_temps(c),
15937ec681f3Smrg                        c->spills,
15947ec681f3Smrg                        c->fills,
15957ec681f3Smrg                        c->qpu_inst_stalled_count,
15967ec681f3Smrg                        c->qpu_inst_count + c->qpu_inst_stalled_count,
15977ec681f3Smrg                        c->nop_count);
15987ec681f3Smrg}
15997ec681f3Smrg
16007ec681f3Smrg/* This is a list of incremental changes to the compilation strategy
16017ec681f3Smrg * that will be used to try to compile the shader successfully. The
16027ec681f3Smrg * default strategy is to enable all optimizations which will have
16037ec681f3Smrg * the highest register pressure but is expected to produce most
16047ec681f3Smrg * optimal code. Following strategies incrementally disable specific
16057ec681f3Smrg * optimizations that are known to contribute to register pressure
16067ec681f3Smrg * in order to be able to compile the shader successfully while meeting
16077ec681f3Smrg * thread count requirements.
16087ec681f3Smrg *
16097ec681f3Smrg * V3D 4.1+ has a min thread count of 2, but we can use 1 here to also
16107ec681f3Smrg * cover previous hardware as well (meaning that we are not limiting
16117ec681f3Smrg * register allocation to any particular thread count). This is fine
16127ec681f3Smrg * because v3d_nir_to_vir will cap this to the actual minimum.
16137ec681f3Smrg */
16147ec681f3Smrgstruct v3d_compiler_strategy {
16157ec681f3Smrg        const char *name;
16167ec681f3Smrg        uint32_t max_threads;
16177ec681f3Smrg        uint32_t min_threads;
16187ec681f3Smrg        bool disable_loop_unrolling;
16197ec681f3Smrg        bool disable_ubo_load_sorting;
16207ec681f3Smrg        bool disable_tmu_pipelining;
16217ec681f3Smrg        bool tmu_spilling_allowed;
16227ec681f3Smrg} static const strategies[] = {
16237ec681f3Smrg  /*0*/ { "default",                        4, 4, false, false, false, false },
16247ec681f3Smrg  /*1*/ { "disable loop unrolling",         4, 4, true,  false, false, false },
16257ec681f3Smrg  /*2*/ { "disable UBO load sorting",       4, 4, true,  true,  false, false },
16267ec681f3Smrg  /*3*/ { "disable TMU pipelining",         4, 4, true,  true,  true,  false },
16277ec681f3Smrg  /*4*/ { "lower thread count",             2, 1, false, false, false, false },
16287ec681f3Smrg  /*5*/ { "disable loop unrolling (ltc)",   2, 1, true,  false, false, false },
16297ec681f3Smrg  /*6*/ { "disable UBO load sorting (ltc)", 2, 1, true,  true,  false, false },
16307ec681f3Smrg  /*7*/ { "disable TMU pipelining (ltc)",   2, 1, true,  true,  true,  true  },
16317ec681f3Smrg  /*8*/ { "fallback scheduler",             2, 1, true,  true,  true,  true  }
16327ec681f3Smrg};
16337ec681f3Smrg
16347ec681f3Smrg/**
16357ec681f3Smrg * If a particular optimization didn't make any progress during a compile
16367ec681f3Smrg * attempt disabling it alone won't allow us to compile the shader successfuly,
16377ec681f3Smrg * since we'll end up with the same code. Detect these scenarios so we can
16387ec681f3Smrg * avoid wasting time with useless compiles. We should also consider if the
16397ec681f3Smrg * strategy changes other aspects of the compilation process though, like
16407ec681f3Smrg * spilling, and not skip it in that case.
16417ec681f3Smrg */
16427ec681f3Smrgstatic bool
16437ec681f3Smrgskip_compile_strategy(struct v3d_compile *c, uint32_t idx)
16447ec681f3Smrg{
16457ec681f3Smrg   /* We decide if we can skip a strategy based on the optimizations that
16467ec681f3Smrg    * were active in the previous strategy, so we should only be calling this
16477ec681f3Smrg    * for strategies after the first.
16487ec681f3Smrg    */
16497ec681f3Smrg   assert(idx > 0);
16507ec681f3Smrg
16517ec681f3Smrg   /* Don't skip a strategy that changes spilling behavior */
16527ec681f3Smrg   if (strategies[idx].tmu_spilling_allowed !=
16537ec681f3Smrg       strategies[idx - 1].tmu_spilling_allowed) {
16547ec681f3Smrg           return false;
16557ec681f3Smrg   }
16567ec681f3Smrg
16577ec681f3Smrg   switch (idx) {
16587ec681f3Smrg   /* Loop unrolling: skip if we didn't unroll any loops */
16597ec681f3Smrg   case 1:
16607ec681f3Smrg   case 5:
16617ec681f3Smrg           return !c->unrolled_any_loops;
16627ec681f3Smrg   /* UBO load sorting: skip if we didn't sort any loads */
16637ec681f3Smrg   case 2:
16647ec681f3Smrg   case 6:
16657ec681f3Smrg           return !c->sorted_any_ubo_loads;
16667ec681f3Smrg   /* TMU pipelining: skip if we didn't pipeline any TMU ops */
16677ec681f3Smrg   case 3:
16687ec681f3Smrg   case 7:
16697ec681f3Smrg           return !c->pipelined_any_tmu;
16707ec681f3Smrg   /* Lower thread count: skip if we already tried less that 4 threads */
16717ec681f3Smrg   case 4:
16727ec681f3Smrg          return c->threads < 4;
16737ec681f3Smrg   default:
16747ec681f3Smrg           return false;
16757ec681f3Smrg   };
16767ec681f3Smrg}
16777ec681f3Smrguint64_t *v3d_compile(const struct v3d_compiler *compiler,
16787ec681f3Smrg                      struct v3d_key *key,
16797ec681f3Smrg                      struct v3d_prog_data **out_prog_data,
16807ec681f3Smrg                      nir_shader *s,
16817ec681f3Smrg                      void (*debug_output)(const char *msg,
16827ec681f3Smrg                                           void *debug_output_data),
16837ec681f3Smrg                      void *debug_output_data,
16847ec681f3Smrg                      int program_id, int variant_id,
16857ec681f3Smrg                      uint32_t *final_assembly_size)
16867ec681f3Smrg{
16877ec681f3Smrg        struct v3d_compile *c = NULL;
16887ec681f3Smrg        for (int i = 0; i < ARRAY_SIZE(strategies); i++) {
16897ec681f3Smrg                /* Fallback strategy */
16907ec681f3Smrg                if (i > 0) {
16917ec681f3Smrg                        assert(c);
16927ec681f3Smrg                        if (skip_compile_strategy(c, i))
16937ec681f3Smrg                                continue;
16947ec681f3Smrg
16957ec681f3Smrg                        char *debug_msg;
16967ec681f3Smrg                        int ret = asprintf(&debug_msg,
16977ec681f3Smrg                                           "Falling back to strategy '%s' for %s",
16987ec681f3Smrg                                           strategies[i].name,
16997ec681f3Smrg                                           vir_get_stage_name(c));
17007ec681f3Smrg
17017ec681f3Smrg                        if (ret >= 0) {
17027ec681f3Smrg                                if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF))
17037ec681f3Smrg                                        fprintf(stderr, "%s\n", debug_msg);
17047ec681f3Smrg
17057ec681f3Smrg                                c->debug_output(debug_msg, c->debug_output_data);
17067ec681f3Smrg                                free(debug_msg);
17077ec681f3Smrg                        }
17087ec681f3Smrg
17097ec681f3Smrg                        vir_compile_destroy(c);
17107ec681f3Smrg                }
17117ec681f3Smrg
17127ec681f3Smrg                c = vir_compile_init(compiler, key, s,
17137ec681f3Smrg                                     debug_output, debug_output_data,
17147ec681f3Smrg                                     program_id, variant_id,
17157ec681f3Smrg                                     strategies[i].max_threads,
17167ec681f3Smrg                                     strategies[i].min_threads,
17177ec681f3Smrg                                     strategies[i].tmu_spilling_allowed,
17187ec681f3Smrg                                     strategies[i].disable_loop_unrolling,
17197ec681f3Smrg                                     strategies[i].disable_ubo_load_sorting,
17207ec681f3Smrg                                     strategies[i].disable_tmu_pipelining,
17217ec681f3Smrg                                     i == ARRAY_SIZE(strategies) - 1);
17227ec681f3Smrg
17237ec681f3Smrg                v3d_attempt_compile(c);
17247ec681f3Smrg
17257ec681f3Smrg                if (i >= ARRAY_SIZE(strategies) - 1 ||
17267ec681f3Smrg                    c->compilation_result !=
17277ec681f3Smrg                    V3D_COMPILATION_FAILED_REGISTER_ALLOCATION) {
17287ec681f3Smrg                        break;
17297ec681f3Smrg                }
17307ec681f3Smrg        }
17317ec681f3Smrg
17327ec681f3Smrg        if (unlikely(V3D_DEBUG & V3D_DEBUG_PERF) &&
17337ec681f3Smrg            c->compilation_result !=
17347ec681f3Smrg            V3D_COMPILATION_FAILED_REGISTER_ALLOCATION &&
17357ec681f3Smrg            c->spills > 0) {
17367ec681f3Smrg                char *debug_msg;
17377ec681f3Smrg                int ret = asprintf(&debug_msg,
17387ec681f3Smrg                                   "Compiled %s with %d spills and %d fills",
17397ec681f3Smrg                                   vir_get_stage_name(c),
17407ec681f3Smrg                                   c->spills, c->fills);
17417ec681f3Smrg                fprintf(stderr, "%s\n", debug_msg);
17427ec681f3Smrg
17437ec681f3Smrg                if (ret >= 0) {
17447ec681f3Smrg                        c->debug_output(debug_msg, c->debug_output_data);
17457ec681f3Smrg                        free(debug_msg);
17467ec681f3Smrg                }
17477ec681f3Smrg        }
17487ec681f3Smrg
17497ec681f3Smrg        if (c->compilation_result != V3D_COMPILATION_SUCCEEDED) {
17507ec681f3Smrg                fprintf(stderr, "Failed to compile %s with any strategy.\n",
17517ec681f3Smrg                        vir_get_stage_name(c));
17527ec681f3Smrg        }
17537ec681f3Smrg
17547ec681f3Smrg        struct v3d_prog_data *prog_data;
17557ec681f3Smrg
17567ec681f3Smrg        prog_data = rzalloc_size(NULL, v3d_prog_data_size(c->s->info.stage));
175701e04c3fSmrg
1758ed98bd31Smaya        v3d_set_prog_data(c, prog_data);
1759ed98bd31Smaya
1760ed98bd31Smaya        *out_prog_data = prog_data;
1761ed98bd31Smaya
1762ed98bd31Smaya        char *shaderdb;
17637ec681f3Smrg        int ret = v3d_shaderdb_dump(c, &shaderdb);
1764ed98bd31Smaya        if (ret >= 0) {
1765ed98bd31Smaya                if (V3D_DEBUG & V3D_DEBUG_SHADERDB)
1766ed98bd31Smaya                        fprintf(stderr, "SHADER-DB: %s\n", shaderdb);
1767ed98bd31Smaya
1768ed98bd31Smaya                c->debug_output(shaderdb, c->debug_output_data);
1769ed98bd31Smaya                free(shaderdb);
1770ed98bd31Smaya        }
177101e04c3fSmrg
1772ed98bd31Smaya       return v3d_return_qpu_insts(c, final_assembly_size);
177301e04c3fSmrg}
177401e04c3fSmrg
177501e04c3fSmrgvoid
177601e04c3fSmrgvir_remove_instruction(struct v3d_compile *c, struct qinst *qinst)
177701e04c3fSmrg{
177801e04c3fSmrg        if (qinst->dst.file == QFILE_TEMP)
177901e04c3fSmrg                c->defs[qinst->dst.index] = NULL;
178001e04c3fSmrg
178101e04c3fSmrg        assert(&qinst->link != c->cursor.link);
178201e04c3fSmrg
178301e04c3fSmrg        list_del(&qinst->link);
178401e04c3fSmrg        free(qinst);
178501e04c3fSmrg
178601e04c3fSmrg        c->live_intervals_valid = false;
178701e04c3fSmrg}
178801e04c3fSmrg
178901e04c3fSmrgstruct qreg
179001e04c3fSmrgvir_follow_movs(struct v3d_compile *c, struct qreg reg)
179101e04c3fSmrg{
179201e04c3fSmrg        /* XXX
179301e04c3fSmrg        int pack = reg.pack;
179401e04c3fSmrg
179501e04c3fSmrg        while (reg.file == QFILE_TEMP &&
179601e04c3fSmrg               c->defs[reg.index] &&
179701e04c3fSmrg               (c->defs[reg.index]->op == QOP_MOV ||
179801e04c3fSmrg                c->defs[reg.index]->op == QOP_FMOV) &&
179901e04c3fSmrg               !c->defs[reg.index]->dst.pack &&
180001e04c3fSmrg               !c->defs[reg.index]->src[0].pack) {
180101e04c3fSmrg                reg = c->defs[reg.index]->src[0];
180201e04c3fSmrg        }
180301e04c3fSmrg
180401e04c3fSmrg        reg.pack = pack;
180501e04c3fSmrg        */
180601e04c3fSmrg        return reg;
180701e04c3fSmrg}
180801e04c3fSmrg
180901e04c3fSmrgvoid
181001e04c3fSmrgvir_compile_destroy(struct v3d_compile *c)
181101e04c3fSmrg{
181201e04c3fSmrg        /* Defuse the assert that we aren't removing the cursor's instruction.
181301e04c3fSmrg         */
181401e04c3fSmrg        c->cursor.link = NULL;
181501e04c3fSmrg
181601e04c3fSmrg        vir_for_each_block(block, c) {
18177ec681f3Smrg                while (!list_is_empty(&block->instructions)) {
181801e04c3fSmrg                        struct qinst *qinst =
181901e04c3fSmrg                                list_first_entry(&block->instructions,
182001e04c3fSmrg                                                 struct qinst, link);
182101e04c3fSmrg                        vir_remove_instruction(c, qinst);
182201e04c3fSmrg                }
182301e04c3fSmrg        }
182401e04c3fSmrg
182501e04c3fSmrg        ralloc_free(c);
182601e04c3fSmrg}
182701e04c3fSmrg
1828ed98bd31Smayauint32_t
1829ed98bd31Smayavir_get_uniform_index(struct v3d_compile *c,
1830ed98bd31Smaya                      enum quniform_contents contents,
1831ed98bd31Smaya                      uint32_t data)
183201e04c3fSmrg{
183301e04c3fSmrg        for (int i = 0; i < c->num_uniforms; i++) {
183401e04c3fSmrg                if (c->uniform_contents[i] == contents &&
183501e04c3fSmrg                    c->uniform_data[i] == data) {
1836ed98bd31Smaya                        return i;
183701e04c3fSmrg                }
183801e04c3fSmrg        }
183901e04c3fSmrg
184001e04c3fSmrg        uint32_t uniform = c->num_uniforms++;
184101e04c3fSmrg
184201e04c3fSmrg        if (uniform >= c->uniform_array_size) {
184301e04c3fSmrg                c->uniform_array_size = MAX2(MAX2(16, uniform + 1),
184401e04c3fSmrg                                             c->uniform_array_size * 2);
184501e04c3fSmrg
184601e04c3fSmrg                c->uniform_data = reralloc(c, c->uniform_data,
184701e04c3fSmrg                                           uint32_t,
184801e04c3fSmrg                                           c->uniform_array_size);
184901e04c3fSmrg                c->uniform_contents = reralloc(c, c->uniform_contents,
185001e04c3fSmrg                                               enum quniform_contents,
185101e04c3fSmrg                                               c->uniform_array_size);
185201e04c3fSmrg        }
185301e04c3fSmrg
185401e04c3fSmrg        c->uniform_contents[uniform] = contents;
185501e04c3fSmrg        c->uniform_data[uniform] = data;
185601e04c3fSmrg
1857ed98bd31Smaya        return uniform;
185801e04c3fSmrg}
185901e04c3fSmrg
18607ec681f3Smrg/* Looks back into the current block to find the ldunif that wrote the uniform
18617ec681f3Smrg * at the requested index. If it finds it, it returns true and writes the
18627ec681f3Smrg * destination register of the ldunif instruction to 'unif'.
18637ec681f3Smrg *
18647ec681f3Smrg * This can impact register pressure and end up leading to worse code, so we
18657ec681f3Smrg * limit the number of instructions we are willing to look back through to
18667ec681f3Smrg * strike a good balance.
18677ec681f3Smrg */
18687ec681f3Smrgstatic bool
18697ec681f3Smrgtry_opt_ldunif(struct v3d_compile *c, uint32_t index, struct qreg *unif)
18707ec681f3Smrg{
18717ec681f3Smrg        uint32_t count = 20;
18727ec681f3Smrg        struct qinst *prev_inst = NULL;
18737ec681f3Smrg        assert(c->cur_block);
18747ec681f3Smrg
18757ec681f3Smrg#ifdef DEBUG
18767ec681f3Smrg        /* We can only reuse a uniform if it was emitted in the same block,
18777ec681f3Smrg         * so callers must make sure the current instruction is being emitted
18787ec681f3Smrg         * in the current block.
18797ec681f3Smrg         */
18807ec681f3Smrg        bool found = false;
18817ec681f3Smrg        vir_for_each_inst(inst, c->cur_block) {
18827ec681f3Smrg                if (&inst->link == c->cursor.link) {
18837ec681f3Smrg                        found = true;
18847ec681f3Smrg                        break;
18857ec681f3Smrg                }
18867ec681f3Smrg        }
18877ec681f3Smrg
18887ec681f3Smrg        assert(found || &c->cur_block->instructions == c->cursor.link);
18897ec681f3Smrg#endif
18907ec681f3Smrg
18917ec681f3Smrg        list_for_each_entry_from_rev(struct qinst, inst, c->cursor.link->prev,
18927ec681f3Smrg                                     &c->cur_block->instructions, link) {
18937ec681f3Smrg                if ((inst->qpu.sig.ldunif || inst->qpu.sig.ldunifrf) &&
18947ec681f3Smrg                    inst->uniform == index) {
18957ec681f3Smrg                        prev_inst = inst;
18967ec681f3Smrg                        break;
18977ec681f3Smrg                }
18987ec681f3Smrg
18997ec681f3Smrg                if (--count == 0)
19007ec681f3Smrg                        break;
19017ec681f3Smrg        }
19027ec681f3Smrg
19037ec681f3Smrg        if (!prev_inst)
19047ec681f3Smrg                return false;
19057ec681f3Smrg
19067ec681f3Smrg
19077ec681f3Smrg        list_for_each_entry_from(struct qinst, inst, prev_inst->link.next,
19087ec681f3Smrg                                 &c->cur_block->instructions, link) {
19097ec681f3Smrg                if (inst->dst.file == prev_inst->dst.file &&
19107ec681f3Smrg                    inst->dst.index == prev_inst->dst.index) {
19117ec681f3Smrg                        return false;
19127ec681f3Smrg                }
19137ec681f3Smrg        }
19147ec681f3Smrg
19157ec681f3Smrg        *unif = prev_inst->dst;
19167ec681f3Smrg        return true;
19177ec681f3Smrg}
19187ec681f3Smrg
1919ed98bd31Smayastruct qreg
1920ed98bd31Smayavir_uniform(struct v3d_compile *c,
1921ed98bd31Smaya            enum quniform_contents contents,
1922ed98bd31Smaya            uint32_t data)
192301e04c3fSmrg{
19247ec681f3Smrg        const int num_uniforms = c->num_uniforms;
19257ec681f3Smrg        const int index = vir_get_uniform_index(c, contents, data);
19267ec681f3Smrg
19277ec681f3Smrg        /* If this is not the first time we see this uniform try to reuse the
19287ec681f3Smrg         * result of the last ldunif that loaded it.
19297ec681f3Smrg         */
19307ec681f3Smrg        const bool is_new_uniform = num_uniforms != c->num_uniforms;
19317ec681f3Smrg        if (!is_new_uniform && !c->disable_ldunif_opt) {
19327ec681f3Smrg                struct qreg ldunif_dst;
19337ec681f3Smrg                if (try_opt_ldunif(c, index, &ldunif_dst))
19347ec681f3Smrg                        return ldunif_dst;
19357ec681f3Smrg        }
19367ec681f3Smrg
1937ed98bd31Smaya        struct qinst *inst = vir_NOP(c);
1938ed98bd31Smaya        inst->qpu.sig.ldunif = true;
19397ec681f3Smrg        inst->uniform = index;
1940ed98bd31Smaya        inst->dst = vir_get_temp(c);
1941ed98bd31Smaya        c->defs[inst->dst.index] = inst;
1942ed98bd31Smaya        return inst->dst;
194301e04c3fSmrg}
194401e04c3fSmrg
194501e04c3fSmrg#define OPTPASS(func)                                                   \
194601e04c3fSmrg        do {                                                            \
194701e04c3fSmrg                bool stage_progress = func(c);                          \
194801e04c3fSmrg                if (stage_progress) {                                   \
194901e04c3fSmrg                        progress = true;                                \
195001e04c3fSmrg                        if (print_opt_debug) {                          \
195101e04c3fSmrg                                fprintf(stderr,                         \
195201e04c3fSmrg                                        "VIR opt pass %2d: %s progress\n", \
195301e04c3fSmrg                                        pass, #func);                   \
195401e04c3fSmrg                        }                                               \
195501e04c3fSmrg                        /*XXX vir_validate(c);*/                        \
195601e04c3fSmrg                }                                                       \
195701e04c3fSmrg        } while (0)
195801e04c3fSmrg
195901e04c3fSmrgvoid
196001e04c3fSmrgvir_optimize(struct v3d_compile *c)
196101e04c3fSmrg{
196201e04c3fSmrg        bool print_opt_debug = false;
196301e04c3fSmrg        int pass = 1;
196401e04c3fSmrg
196501e04c3fSmrg        while (true) {
196601e04c3fSmrg                bool progress = false;
196701e04c3fSmrg
196801e04c3fSmrg                OPTPASS(vir_opt_copy_propagate);
1969ed98bd31Smaya                OPTPASS(vir_opt_redundant_flags);
197001e04c3fSmrg                OPTPASS(vir_opt_dead_code);
197101e04c3fSmrg                OPTPASS(vir_opt_small_immediates);
19727ec681f3Smrg                OPTPASS(vir_opt_constant_alu);
197301e04c3fSmrg
197401e04c3fSmrg                if (!progress)
197501e04c3fSmrg                        break;
197601e04c3fSmrg
197701e04c3fSmrg                pass++;
197801e04c3fSmrg        }
197901e04c3fSmrg}
198001e04c3fSmrg
198101e04c3fSmrgconst char *
198201e04c3fSmrgvir_get_stage_name(struct v3d_compile *c)
198301e04c3fSmrg{
198401e04c3fSmrg        if (c->vs_key && c->vs_key->is_coord)
19857ec681f3Smrg                return "MESA_SHADER_VERTEX_BIN";
19867ec681f3Smrg        else if (c->gs_key && c->gs_key->is_coord)
19877ec681f3Smrg                return "MESA_SHADER_GEOMETRY_BIN";
198801e04c3fSmrg        else
198901e04c3fSmrg                return gl_shader_stage_name(c->s->info.stage);
199001e04c3fSmrg}
19917ec681f3Smrg
19927ec681f3Smrgstatic inline uint32_t
19937ec681f3Smrgcompute_vpm_size_in_sectors(const struct v3d_device_info *devinfo)
19947ec681f3Smrg{
19957ec681f3Smrg   assert(devinfo->vpm_size > 0);
19967ec681f3Smrg   const uint32_t sector_size = V3D_CHANNELS * sizeof(uint32_t) * 8;
19977ec681f3Smrg   return devinfo->vpm_size / sector_size;
19987ec681f3Smrg}
19997ec681f3Smrg
20007ec681f3Smrg/* Computes various parameters affecting VPM memory configuration for programs
20017ec681f3Smrg * involving geometry shaders to ensure the program fits in memory and honors
20027ec681f3Smrg * requirements described in section "VPM usage" of the programming manual.
20037ec681f3Smrg */
20047ec681f3Smrgstatic bool
20057ec681f3Smrgcompute_vpm_config_gs(struct v3d_device_info *devinfo,
20067ec681f3Smrg                      struct v3d_vs_prog_data *vs,
20077ec681f3Smrg                      struct v3d_gs_prog_data *gs,
20087ec681f3Smrg                      struct vpm_config *vpm_cfg_out)
20097ec681f3Smrg{
20107ec681f3Smrg   const uint32_t A = vs->separate_segments ? 1 : 0;
20117ec681f3Smrg   const uint32_t Ad = vs->vpm_input_size;
20127ec681f3Smrg   const uint32_t Vd = vs->vpm_output_size;
20137ec681f3Smrg
20147ec681f3Smrg   const uint32_t vpm_size = compute_vpm_size_in_sectors(devinfo);
20157ec681f3Smrg
20167ec681f3Smrg   /* Try to fit program into our VPM memory budget by adjusting
20177ec681f3Smrg    * configurable parameters iteratively. We do this in two phases:
20187ec681f3Smrg    * the first phase tries to fit the program into the total available
20197ec681f3Smrg    * VPM memory. If we succeed at that, then the second phase attempts
20207ec681f3Smrg    * to fit the program into half of that budget so we can run bin and
20217ec681f3Smrg    * render programs in parallel.
20227ec681f3Smrg    */
20237ec681f3Smrg   struct vpm_config vpm_cfg[2];
20247ec681f3Smrg   struct vpm_config *final_vpm_cfg = NULL;
20257ec681f3Smrg   uint32_t phase = 0;
20267ec681f3Smrg
20277ec681f3Smrg   vpm_cfg[phase].As = 1;
20287ec681f3Smrg   vpm_cfg[phase].Gs = 1;
20297ec681f3Smrg   vpm_cfg[phase].Gd = gs->vpm_output_size;
20307ec681f3Smrg   vpm_cfg[phase].gs_width = gs->simd_width;
20317ec681f3Smrg
20327ec681f3Smrg   /* While there is a requirement that Vc >= [Vn / 16], this is
20337ec681f3Smrg    * always the case when tessellation is not present because in that
20347ec681f3Smrg    * case Vn can only be 6 at most (when input primitive is triangles
20357ec681f3Smrg    * with adjacency).
20367ec681f3Smrg    *
20377ec681f3Smrg    * We always choose Vc=2. We can't go lower than this due to GFXH-1744,
20387ec681f3Smrg    * and Broadcom has not found it worth it to increase it beyond this
20397ec681f3Smrg    * in general. Increasing Vc also increases VPM memory pressure which
20407ec681f3Smrg    * can turn up being detrimental for performance in some scenarios.
20417ec681f3Smrg    */
20427ec681f3Smrg   vpm_cfg[phase].Vc = 2;
20437ec681f3Smrg
20447ec681f3Smrg   /* Gv is a constraint on the hardware to not exceed the
20457ec681f3Smrg    * specified number of vertex segments per GS batch. If adding a
20467ec681f3Smrg    * new primitive to a GS batch would result in a range of more
20477ec681f3Smrg    * than Gv vertex segments being referenced by the batch, then
20487ec681f3Smrg    * the hardware will flush the batch and start a new one. This
20497ec681f3Smrg    * means that we can choose any value we want, we just need to
20507ec681f3Smrg    * be aware that larger values improve GS batch utilization
20517ec681f3Smrg    * at the expense of more VPM memory pressure (which can affect
20527ec681f3Smrg    * other performance aspects, such as GS dispatch width).
20537ec681f3Smrg    * We start with the largest value, and will reduce it if we
20547ec681f3Smrg    * find that total memory pressure is too high.
20557ec681f3Smrg    */
20567ec681f3Smrg   vpm_cfg[phase].Gv = 3;
20577ec681f3Smrg   do {
20587ec681f3Smrg      /* When GS is present in absence of TES, then we need to satisfy
20597ec681f3Smrg       * that Ve >= Gv. We go with the smallest value of Ve to avoid
20607ec681f3Smrg       * increasing memory pressure.
20617ec681f3Smrg       */
20627ec681f3Smrg      vpm_cfg[phase].Ve = vpm_cfg[phase].Gv;
20637ec681f3Smrg
20647ec681f3Smrg      uint32_t vpm_sectors =
20657ec681f3Smrg         A * vpm_cfg[phase].As * Ad +
20667ec681f3Smrg         (vpm_cfg[phase].Vc + vpm_cfg[phase].Ve) * Vd +
20677ec681f3Smrg         vpm_cfg[phase].Gs * vpm_cfg[phase].Gd;
20687ec681f3Smrg
20697ec681f3Smrg      /* Ideally we want to use no more than half of the available
20707ec681f3Smrg       * memory so we can execute a bin and render program in parallel
20717ec681f3Smrg       * without stalls. If we achieved that then we are done.
20727ec681f3Smrg       */
20737ec681f3Smrg      if (vpm_sectors <= vpm_size / 2) {
20747ec681f3Smrg         final_vpm_cfg = &vpm_cfg[phase];
20757ec681f3Smrg         break;
20767ec681f3Smrg      }
20777ec681f3Smrg
20787ec681f3Smrg      /* At the very least, we should not allocate more than the
20797ec681f3Smrg       * total available VPM memory. If we have a configuration that
20807ec681f3Smrg       * succeeds at this we save it and continue to see if we can
20817ec681f3Smrg       * meet the half-memory-use criteria too.
20827ec681f3Smrg       */
20837ec681f3Smrg      if (phase == 0 && vpm_sectors <= vpm_size) {
20847ec681f3Smrg         vpm_cfg[1] = vpm_cfg[0];
20857ec681f3Smrg         phase = 1;
20867ec681f3Smrg      }
20877ec681f3Smrg
20887ec681f3Smrg      /* Try lowering Gv */
20897ec681f3Smrg      if (vpm_cfg[phase].Gv > 0) {
20907ec681f3Smrg         vpm_cfg[phase].Gv--;
20917ec681f3Smrg         continue;
20927ec681f3Smrg      }
20937ec681f3Smrg
20947ec681f3Smrg      /* Try lowering GS dispatch width */
20957ec681f3Smrg      if (vpm_cfg[phase].gs_width > 1) {
20967ec681f3Smrg         do {
20977ec681f3Smrg            vpm_cfg[phase].gs_width >>= 1;
20987ec681f3Smrg            vpm_cfg[phase].Gd = align(vpm_cfg[phase].Gd, 2) / 2;
20997ec681f3Smrg         } while (vpm_cfg[phase].gs_width == 2);
21007ec681f3Smrg
21017ec681f3Smrg         /* Reset Gv to max after dropping dispatch width */
21027ec681f3Smrg         vpm_cfg[phase].Gv = 3;
21037ec681f3Smrg         continue;
21047ec681f3Smrg      }
21057ec681f3Smrg
21067ec681f3Smrg      /* We ran out of options to reduce memory pressure. If we
21077ec681f3Smrg       * are at phase 1 we have at least a valid configuration, so we
21087ec681f3Smrg       * we use that.
21097ec681f3Smrg       */
21107ec681f3Smrg      if (phase == 1)
21117ec681f3Smrg         final_vpm_cfg = &vpm_cfg[0];
21127ec681f3Smrg      break;
21137ec681f3Smrg   } while (true);
21147ec681f3Smrg
21157ec681f3Smrg   if (!final_vpm_cfg)
21167ec681f3Smrg      return false;
21177ec681f3Smrg
21187ec681f3Smrg   assert(final_vpm_cfg);
21197ec681f3Smrg   assert(final_vpm_cfg->Gd <= 16);
21207ec681f3Smrg   assert(final_vpm_cfg->Gv < 4);
21217ec681f3Smrg   assert(final_vpm_cfg->Ve < 4);
21227ec681f3Smrg   assert(final_vpm_cfg->Vc >= 2 && final_vpm_cfg->Vc <= 4);
21237ec681f3Smrg   assert(final_vpm_cfg->gs_width == 1 ||
21247ec681f3Smrg          final_vpm_cfg->gs_width == 4 ||
21257ec681f3Smrg          final_vpm_cfg->gs_width == 8 ||
21267ec681f3Smrg          final_vpm_cfg->gs_width == 16);
21277ec681f3Smrg
21287ec681f3Smrg   *vpm_cfg_out = *final_vpm_cfg;
21297ec681f3Smrg   return true;
21307ec681f3Smrg}
21317ec681f3Smrg
21327ec681f3Smrgbool
21337ec681f3Smrgv3d_compute_vpm_config(struct v3d_device_info *devinfo,
21347ec681f3Smrg                       struct v3d_vs_prog_data *vs_bin,
21357ec681f3Smrg                       struct v3d_vs_prog_data *vs,
21367ec681f3Smrg                       struct v3d_gs_prog_data *gs_bin,
21377ec681f3Smrg                       struct v3d_gs_prog_data *gs,
21387ec681f3Smrg                       struct vpm_config *vpm_cfg_bin,
21397ec681f3Smrg                       struct vpm_config *vpm_cfg)
21407ec681f3Smrg{
21417ec681f3Smrg   assert(vs && vs_bin);
21427ec681f3Smrg   assert((gs != NULL) == (gs_bin != NULL));
21437ec681f3Smrg
21447ec681f3Smrg   if (!gs) {
21457ec681f3Smrg      vpm_cfg_bin->As = 1;
21467ec681f3Smrg      vpm_cfg_bin->Ve = 0;
21477ec681f3Smrg      vpm_cfg_bin->Vc = vs_bin->vcm_cache_size;
21487ec681f3Smrg
21497ec681f3Smrg      vpm_cfg->As = 1;
21507ec681f3Smrg      vpm_cfg->Ve = 0;
21517ec681f3Smrg      vpm_cfg->Vc = vs->vcm_cache_size;
21527ec681f3Smrg   } else {
21537ec681f3Smrg      if (!compute_vpm_config_gs(devinfo, vs_bin, gs_bin, vpm_cfg_bin))
21547ec681f3Smrg         return false;
21557ec681f3Smrg
21567ec681f3Smrg      if (!compute_vpm_config_gs(devinfo, vs, gs, vpm_cfg))
21577ec681f3Smrg         return false;
21587ec681f3Smrg   }
21597ec681f3Smrg
21607ec681f3Smrg   return true;
21617ec681f3Smrg}
2162