17ec681f3Smrg/*
27ec681f3Smrg * Copyright (C) 2019-2021 Collabora, Ltd.
37ec681f3Smrg *
47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a
57ec681f3Smrg * copy of this software and associated documentation files (the "Software"),
67ec681f3Smrg * to deal in the Software without restriction, including without limitation
77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the
97ec681f3Smrg * Software is furnished to do so, subject to the following conditions:
107ec681f3Smrg *
117ec681f3Smrg * The above copyright notice and this permission notice (including the next
127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the
137ec681f3Smrg * Software.
147ec681f3Smrg *
157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
217ec681f3Smrg * SOFTWARE.
227ec681f3Smrg *
237ec681f3Smrg * Authors (Collabora):
247ec681f3Smrg *    Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com>
257ec681f3Smrg */
267ec681f3Smrg
277ec681f3Smrg#include "compiler.h"
287ec681f3Smrg
297ec681f3Smrg/* Bifrost texture operations have a `skip` bit, instructinh helper invocations
307ec681f3Smrg * to skip execution. Each clause has a `terminate_discarded_threads` bit,
317ec681f3Smrg * which will terminate helper invocations.
327ec681f3Smrg *
337ec681f3Smrg * The terminate bit should be set on the last clause requiring helper
347ec681f3Smrg * invocations. Without control flow, that's the last source-order instruction;
357ec681f3Smrg * with control flow, there may be multiple such instructions (with ifs) or no
367ec681f3Smrg * such instruction (with loops).
377ec681f3Smrg *
387ec681f3Smrg * The skip bit should be set unless the value of this instruction is required
397ec681f3Smrg * by a future instruction requiring helper invocations. Consider:
407ec681f3Smrg *
417ec681f3Smrg *      0 = texture ...
427ec681f3Smrg *      1 = fmul 0, #10
437ec681f3Smrg *      2 = dfdx 1
447ec681f3Smrg *      store 2
457ec681f3Smrg *
467ec681f3Smrg * Since the derivative calculation 2 requires helper invocations, the value 1
477ec681f3Smrg * must be calculated by helper invocations, and since it depends on 0, 0 must
487ec681f3Smrg * be calculated by helpers. Hence the texture op does NOT have the skip bit
497ec681f3Smrg * set, and the clause containing the derivative has the terminate bit set.
507ec681f3Smrg *
517ec681f3Smrg * Calculating the terminate bit occurs by forward dataflow analysis to
527ec681f3Smrg * determine which blocks require helper invocations. A block requires
537ec681f3Smrg * invocations in if any of its instructions use helper invocations, or if it
547ec681f3Smrg * depends on a block that requires invocation. With that analysis, the
557ec681f3Smrg * terminate bit is set on the last instruction using invocations within any
567ec681f3Smrg * block that does *not* require invocations out.
577ec681f3Smrg *
587ec681f3Smrg * Likewise, calculating the execute bit requires backward dataflow analysis
597ec681f3Smrg * with union as the join operation and the generating set being the union of
607ec681f3Smrg * sources of instructions writing executed values. The skip bit is the inverse
617ec681f3Smrg * of the execute bit.
627ec681f3Smrg */
637ec681f3Smrg
647ec681f3Smrgstatic bool
657ec681f3Smrgbi_has_skip_bit(enum bi_opcode op)
667ec681f3Smrg{
677ec681f3Smrg        switch (op) {
687ec681f3Smrg        case BI_OPCODE_TEXC:
697ec681f3Smrg        case BI_OPCODE_TEXS_2D_F16:
707ec681f3Smrg        case BI_OPCODE_TEXS_2D_F32:
717ec681f3Smrg        case BI_OPCODE_TEXS_CUBE_F16:
727ec681f3Smrg        case BI_OPCODE_TEXS_CUBE_F32:
737ec681f3Smrg        case BI_OPCODE_VAR_TEX_F16:
747ec681f3Smrg        case BI_OPCODE_VAR_TEX_F32:
757ec681f3Smrg                return true;
767ec681f3Smrg        default:
777ec681f3Smrg                return false;
787ec681f3Smrg        }
797ec681f3Smrg}
807ec681f3Smrg
817ec681f3Smrg/* Does a given instruction require helper threads to be active (because it
827ec681f3Smrg * reads from other subgroup lanes)? This only applies to fragment shaders.
837ec681f3Smrg * Other shader stages do not have a notion of helper threads. */
847ec681f3Smrg
857ec681f3Smrgstatic bool
867ec681f3Smrgbi_instr_uses_helpers(bi_instr *I)
877ec681f3Smrg{
887ec681f3Smrg        switch (I->op) {
897ec681f3Smrg        case BI_OPCODE_TEXC:
907ec681f3Smrg        case BI_OPCODE_TEXS_2D_F16:
917ec681f3Smrg        case BI_OPCODE_TEXS_2D_F32:
927ec681f3Smrg        case BI_OPCODE_TEXS_CUBE_F16:
937ec681f3Smrg        case BI_OPCODE_TEXS_CUBE_F32:
947ec681f3Smrg        case BI_OPCODE_VAR_TEX_F16:
957ec681f3Smrg        case BI_OPCODE_VAR_TEX_F32:
967ec681f3Smrg                return !I->lod_mode; /* set for zero, clear for computed */
977ec681f3Smrg        case BI_OPCODE_CLPER_I32:
987ec681f3Smrg        case BI_OPCODE_CLPER_V6_I32:
997ec681f3Smrg                /* Fragment shaders require helpers to implement derivatives.
1007ec681f3Smrg                 * Other shader stages don't have helpers at all */
1017ec681f3Smrg                return true;
1027ec681f3Smrg        default:
1037ec681f3Smrg                return false;
1047ec681f3Smrg        }
1057ec681f3Smrg}
1067ec681f3Smrg
1077ec681f3Smrg/* Does a block use helpers directly */
1087ec681f3Smrgstatic bool
1097ec681f3Smrgbi_block_uses_helpers(bi_block *block)
1107ec681f3Smrg{
1117ec681f3Smrg        bi_foreach_instr_in_block(block, I) {
1127ec681f3Smrg                if (bi_instr_uses_helpers(I))
1137ec681f3Smrg                        return true;
1147ec681f3Smrg        }
1157ec681f3Smrg
1167ec681f3Smrg        return false;
1177ec681f3Smrg}
1187ec681f3Smrg
1197ec681f3Smrgstatic bool
1207ec681f3Smrgbi_block_terminates_helpers(bi_block *block)
1217ec681f3Smrg{
1227ec681f3Smrg        /* Can't terminate if a successor needs helpers */
1237ec681f3Smrg        bi_foreach_successor(block, succ) {
1247ec681f3Smrg                if (succ->pass_flags & 1)
1257ec681f3Smrg                        return false;
1267ec681f3Smrg        }
1277ec681f3Smrg
1287ec681f3Smrg        /* Otherwise we terminate */
1297ec681f3Smrg        return true;
1307ec681f3Smrg}
1317ec681f3Smrg
1327ec681f3Smrgvoid
1337ec681f3Smrgbi_analyze_helper_terminate(bi_context *ctx)
1347ec681f3Smrg{
1357ec681f3Smrg        /* Other shader stages do not have a notion of helper threads, so we
1367ec681f3Smrg         * can skip the analysis. Don't run for blend shaders, either, since
1377ec681f3Smrg         * they run in the context of another shader that we don't see. */
1387ec681f3Smrg        if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend)
1397ec681f3Smrg                return;
1407ec681f3Smrg
1417ec681f3Smrg        /* Set blocks as directly requiring helpers, and if they do add them to
1427ec681f3Smrg         * the worklist to propagate to their predecessors */
1437ec681f3Smrg
1447ec681f3Smrg        struct set *worklist = _mesa_set_create(NULL,
1457ec681f3Smrg                        _mesa_hash_pointer,
1467ec681f3Smrg                        _mesa_key_pointer_equal);
1477ec681f3Smrg
1487ec681f3Smrg        struct set *visited = _mesa_set_create(NULL,
1497ec681f3Smrg                        _mesa_hash_pointer,
1507ec681f3Smrg                        _mesa_key_pointer_equal);
1517ec681f3Smrg
1527ec681f3Smrg        bi_foreach_block(ctx, block) {
1537ec681f3Smrg                block->pass_flags = bi_block_uses_helpers(block) ? 1 : 0;
1547ec681f3Smrg
1557ec681f3Smrg                if (block->pass_flags & 1)
1567ec681f3Smrg                        _mesa_set_add(worklist, block);
1577ec681f3Smrg        }
1587ec681f3Smrg
1597ec681f3Smrg        /* Next, propagate back. Since there are a finite number of blocks, the
1607ec681f3Smrg         * worklist (a subset of all the blocks) is finite. Since a block can
1617ec681f3Smrg         * only be added to the worklist if it is not on the visited list and
1627ec681f3Smrg         * the visited list - also a subset of the blocks - grows every
1637ec681f3Smrg         * iteration, the algorithm must terminate. */
1647ec681f3Smrg
1657ec681f3Smrg        struct set_entry *cur;
1667ec681f3Smrg
1677ec681f3Smrg        while((cur = _mesa_set_next_entry(worklist, NULL)) != NULL) {
1687ec681f3Smrg                /* Pop off a block requiring helpers */
1697ec681f3Smrg                bi_block *blk = (struct bi_block *) cur->key;
1707ec681f3Smrg                _mesa_set_remove(worklist, cur);
1717ec681f3Smrg
1727ec681f3Smrg                /* Its predecessors also require helpers */
1737ec681f3Smrg                bi_foreach_predecessor(blk, pred) {
1747ec681f3Smrg                        if (!_mesa_set_search(visited, pred)) {
1757ec681f3Smrg                                pred->pass_flags |= 1;
1767ec681f3Smrg                                _mesa_set_add(worklist, pred);
1777ec681f3Smrg                        }
1787ec681f3Smrg                }
1797ec681f3Smrg
1807ec681f3Smrg                _mesa_set_add(visited, blk);
1817ec681f3Smrg        }
1827ec681f3Smrg
1837ec681f3Smrg        _mesa_set_destroy(visited, NULL);
1847ec681f3Smrg        _mesa_set_destroy(worklist, NULL);
1857ec681f3Smrg
1867ec681f3Smrg        /* Finally, mark clauses requiring helpers */
1877ec681f3Smrg        bi_foreach_block(ctx, block) {
1887ec681f3Smrg                /* At the end, there are helpers iff we don't terminate */
1897ec681f3Smrg                bool helpers = !bi_block_terminates_helpers(block);
1907ec681f3Smrg
1917ec681f3Smrg                bi_foreach_clause_in_block_rev(block, clause) {
1927ec681f3Smrg                        bi_foreach_instr_in_clause_rev(block, clause, I) {
1937ec681f3Smrg                                helpers |= bi_instr_uses_helpers(I);
1947ec681f3Smrg                        }
1957ec681f3Smrg
1967ec681f3Smrg                        clause->td = !helpers;
1977ec681f3Smrg                }
1987ec681f3Smrg        }
1997ec681f3Smrg}
2007ec681f3Smrg
2017ec681f3Smrgstatic bool
2027ec681f3Smrgbi_helper_block_update(BITSET_WORD *deps, bi_block *block)
2037ec681f3Smrg{
2047ec681f3Smrg        bool progress = false;
2057ec681f3Smrg
2067ec681f3Smrg        bi_foreach_instr_in_block_rev(block, I) {
2077ec681f3Smrg                /* If our destination is required by helper invocation... */
2087ec681f3Smrg                if (I->dest[0].type != BI_INDEX_NORMAL)
2097ec681f3Smrg                        continue;
2107ec681f3Smrg
2117ec681f3Smrg                if (!BITSET_TEST(deps, bi_get_node(I->dest[0])))
2127ec681f3Smrg                        continue;
2137ec681f3Smrg
2147ec681f3Smrg                /* ...so are our sources */
2157ec681f3Smrg                bi_foreach_src(I, s) {
2167ec681f3Smrg                        if (I->src[s].type == BI_INDEX_NORMAL) {
2177ec681f3Smrg                                unsigned node = bi_get_node(I->src[s]);
2187ec681f3Smrg                                progress |= !BITSET_TEST(deps, node);
2197ec681f3Smrg                                BITSET_SET(deps, node);
2207ec681f3Smrg                        }
2217ec681f3Smrg                }
2227ec681f3Smrg        }
2237ec681f3Smrg
2247ec681f3Smrg        return progress;
2257ec681f3Smrg}
2267ec681f3Smrg
2277ec681f3Smrgvoid
2287ec681f3Smrgbi_analyze_helper_requirements(bi_context *ctx)
2297ec681f3Smrg{
2307ec681f3Smrg        unsigned temp_count = bi_max_temp(ctx);
2317ec681f3Smrg        BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), BITSET_WORDS(temp_count));
2327ec681f3Smrg
2337ec681f3Smrg        /* Initialize with the sources of instructions consuming
2347ec681f3Smrg         * derivatives */
2357ec681f3Smrg
2367ec681f3Smrg        bi_foreach_instr_global(ctx, I) {
2377ec681f3Smrg                if (I->dest[0].type != BI_INDEX_NORMAL) continue;
2387ec681f3Smrg                if (!bi_instr_uses_helpers(I)) continue;
2397ec681f3Smrg
2407ec681f3Smrg                bi_foreach_src(I, s) {
2417ec681f3Smrg                        if (I->src[s].type == BI_INDEX_NORMAL)
2427ec681f3Smrg                                BITSET_SET(deps, bi_get_node(I->src[s]));
2437ec681f3Smrg                }
2447ec681f3Smrg        }
2457ec681f3Smrg
2467ec681f3Smrg        /* Propagate that up */
2477ec681f3Smrg
2487ec681f3Smrg        struct set *work_list = _mesa_set_create(NULL,
2497ec681f3Smrg                        _mesa_hash_pointer,
2507ec681f3Smrg                        _mesa_key_pointer_equal);
2517ec681f3Smrg
2527ec681f3Smrg        struct set *visited = _mesa_set_create(NULL,
2537ec681f3Smrg                        _mesa_hash_pointer,
2547ec681f3Smrg                        _mesa_key_pointer_equal);
2557ec681f3Smrg
2567ec681f3Smrg        struct set_entry *cur = _mesa_set_add(work_list, pan_exit_block(&ctx->blocks));
2577ec681f3Smrg
2587ec681f3Smrg        do {
2597ec681f3Smrg                bi_block *blk = (struct bi_block *) cur->key;
2607ec681f3Smrg                _mesa_set_remove(work_list, cur);
2617ec681f3Smrg
2627ec681f3Smrg                bool progress = bi_helper_block_update(deps, blk);
2637ec681f3Smrg
2647ec681f3Smrg                if (progress || !_mesa_set_search(visited, blk)) {
2657ec681f3Smrg                        bi_foreach_predecessor(blk, pred)
2667ec681f3Smrg                                _mesa_set_add(work_list, pred);
2677ec681f3Smrg                }
2687ec681f3Smrg
2697ec681f3Smrg                _mesa_set_add(visited, blk);
2707ec681f3Smrg        } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL);
2717ec681f3Smrg
2727ec681f3Smrg        _mesa_set_destroy(visited, NULL);
2737ec681f3Smrg        _mesa_set_destroy(work_list, NULL);
2747ec681f3Smrg
2757ec681f3Smrg        /* Set the execute bits */
2767ec681f3Smrg
2777ec681f3Smrg        bi_foreach_instr_global(ctx, I) {
2787ec681f3Smrg                if (!bi_has_skip_bit(I->op)) continue;
2797ec681f3Smrg                if (I->dest[0].type != BI_INDEX_NORMAL) continue;
2807ec681f3Smrg
2817ec681f3Smrg                I->skip = !BITSET_TEST(deps, bi_get_node(I->dest[0]));
2827ec681f3Smrg        }
2837ec681f3Smrg
2847ec681f3Smrg        free(deps);
2857ec681f3Smrg}
286