17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2019-2021 Collabora, Ltd. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg * Authors (Collabora): 247ec681f3Smrg * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 257ec681f3Smrg */ 267ec681f3Smrg 277ec681f3Smrg#include "compiler.h" 287ec681f3Smrg 297ec681f3Smrg/* Bifrost texture operations have a `skip` bit, instructinh helper invocations 307ec681f3Smrg * to skip execution. Each clause has a `terminate_discarded_threads` bit, 317ec681f3Smrg * which will terminate helper invocations. 327ec681f3Smrg * 337ec681f3Smrg * The terminate bit should be set on the last clause requiring helper 347ec681f3Smrg * invocations. Without control flow, that's the last source-order instruction; 357ec681f3Smrg * with control flow, there may be multiple such instructions (with ifs) or no 367ec681f3Smrg * such instruction (with loops). 377ec681f3Smrg * 387ec681f3Smrg * The skip bit should be set unless the value of this instruction is required 397ec681f3Smrg * by a future instruction requiring helper invocations. Consider: 407ec681f3Smrg * 417ec681f3Smrg * 0 = texture ... 427ec681f3Smrg * 1 = fmul 0, #10 437ec681f3Smrg * 2 = dfdx 1 447ec681f3Smrg * store 2 457ec681f3Smrg * 467ec681f3Smrg * Since the derivative calculation 2 requires helper invocations, the value 1 477ec681f3Smrg * must be calculated by helper invocations, and since it depends on 0, 0 must 487ec681f3Smrg * be calculated by helpers. Hence the texture op does NOT have the skip bit 497ec681f3Smrg * set, and the clause containing the derivative has the terminate bit set. 507ec681f3Smrg * 517ec681f3Smrg * Calculating the terminate bit occurs by forward dataflow analysis to 527ec681f3Smrg * determine which blocks require helper invocations. A block requires 537ec681f3Smrg * invocations in if any of its instructions use helper invocations, or if it 547ec681f3Smrg * depends on a block that requires invocation. With that analysis, the 557ec681f3Smrg * terminate bit is set on the last instruction using invocations within any 567ec681f3Smrg * block that does *not* require invocations out. 577ec681f3Smrg * 587ec681f3Smrg * Likewise, calculating the execute bit requires backward dataflow analysis 597ec681f3Smrg * with union as the join operation and the generating set being the union of 607ec681f3Smrg * sources of instructions writing executed values. The skip bit is the inverse 617ec681f3Smrg * of the execute bit. 627ec681f3Smrg */ 637ec681f3Smrg 647ec681f3Smrgstatic bool 657ec681f3Smrgbi_has_skip_bit(enum bi_opcode op) 667ec681f3Smrg{ 677ec681f3Smrg switch (op) { 687ec681f3Smrg case BI_OPCODE_TEXC: 697ec681f3Smrg case BI_OPCODE_TEXS_2D_F16: 707ec681f3Smrg case BI_OPCODE_TEXS_2D_F32: 717ec681f3Smrg case BI_OPCODE_TEXS_CUBE_F16: 727ec681f3Smrg case BI_OPCODE_TEXS_CUBE_F32: 737ec681f3Smrg case BI_OPCODE_VAR_TEX_F16: 747ec681f3Smrg case BI_OPCODE_VAR_TEX_F32: 757ec681f3Smrg return true; 767ec681f3Smrg default: 777ec681f3Smrg return false; 787ec681f3Smrg } 797ec681f3Smrg} 807ec681f3Smrg 817ec681f3Smrg/* Does a given instruction require helper threads to be active (because it 827ec681f3Smrg * reads from other subgroup lanes)? This only applies to fragment shaders. 837ec681f3Smrg * Other shader stages do not have a notion of helper threads. */ 847ec681f3Smrg 857ec681f3Smrgstatic bool 867ec681f3Smrgbi_instr_uses_helpers(bi_instr *I) 877ec681f3Smrg{ 887ec681f3Smrg switch (I->op) { 897ec681f3Smrg case BI_OPCODE_TEXC: 907ec681f3Smrg case BI_OPCODE_TEXS_2D_F16: 917ec681f3Smrg case BI_OPCODE_TEXS_2D_F32: 927ec681f3Smrg case BI_OPCODE_TEXS_CUBE_F16: 937ec681f3Smrg case BI_OPCODE_TEXS_CUBE_F32: 947ec681f3Smrg case BI_OPCODE_VAR_TEX_F16: 957ec681f3Smrg case BI_OPCODE_VAR_TEX_F32: 967ec681f3Smrg return !I->lod_mode; /* set for zero, clear for computed */ 977ec681f3Smrg case BI_OPCODE_CLPER_I32: 987ec681f3Smrg case BI_OPCODE_CLPER_V6_I32: 997ec681f3Smrg /* Fragment shaders require helpers to implement derivatives. 1007ec681f3Smrg * Other shader stages don't have helpers at all */ 1017ec681f3Smrg return true; 1027ec681f3Smrg default: 1037ec681f3Smrg return false; 1047ec681f3Smrg } 1057ec681f3Smrg} 1067ec681f3Smrg 1077ec681f3Smrg/* Does a block use helpers directly */ 1087ec681f3Smrgstatic bool 1097ec681f3Smrgbi_block_uses_helpers(bi_block *block) 1107ec681f3Smrg{ 1117ec681f3Smrg bi_foreach_instr_in_block(block, I) { 1127ec681f3Smrg if (bi_instr_uses_helpers(I)) 1137ec681f3Smrg return true; 1147ec681f3Smrg } 1157ec681f3Smrg 1167ec681f3Smrg return false; 1177ec681f3Smrg} 1187ec681f3Smrg 1197ec681f3Smrgstatic bool 1207ec681f3Smrgbi_block_terminates_helpers(bi_block *block) 1217ec681f3Smrg{ 1227ec681f3Smrg /* Can't terminate if a successor needs helpers */ 1237ec681f3Smrg bi_foreach_successor(block, succ) { 1247ec681f3Smrg if (succ->pass_flags & 1) 1257ec681f3Smrg return false; 1267ec681f3Smrg } 1277ec681f3Smrg 1287ec681f3Smrg /* Otherwise we terminate */ 1297ec681f3Smrg return true; 1307ec681f3Smrg} 1317ec681f3Smrg 1327ec681f3Smrgvoid 1337ec681f3Smrgbi_analyze_helper_terminate(bi_context *ctx) 1347ec681f3Smrg{ 1357ec681f3Smrg /* Other shader stages do not have a notion of helper threads, so we 1367ec681f3Smrg * can skip the analysis. Don't run for blend shaders, either, since 1377ec681f3Smrg * they run in the context of another shader that we don't see. */ 1387ec681f3Smrg if (ctx->stage != MESA_SHADER_FRAGMENT || ctx->inputs->is_blend) 1397ec681f3Smrg return; 1407ec681f3Smrg 1417ec681f3Smrg /* Set blocks as directly requiring helpers, and if they do add them to 1427ec681f3Smrg * the worklist to propagate to their predecessors */ 1437ec681f3Smrg 1447ec681f3Smrg struct set *worklist = _mesa_set_create(NULL, 1457ec681f3Smrg _mesa_hash_pointer, 1467ec681f3Smrg _mesa_key_pointer_equal); 1477ec681f3Smrg 1487ec681f3Smrg struct set *visited = _mesa_set_create(NULL, 1497ec681f3Smrg _mesa_hash_pointer, 1507ec681f3Smrg _mesa_key_pointer_equal); 1517ec681f3Smrg 1527ec681f3Smrg bi_foreach_block(ctx, block) { 1537ec681f3Smrg block->pass_flags = bi_block_uses_helpers(block) ? 1 : 0; 1547ec681f3Smrg 1557ec681f3Smrg if (block->pass_flags & 1) 1567ec681f3Smrg _mesa_set_add(worklist, block); 1577ec681f3Smrg } 1587ec681f3Smrg 1597ec681f3Smrg /* Next, propagate back. Since there are a finite number of blocks, the 1607ec681f3Smrg * worklist (a subset of all the blocks) is finite. Since a block can 1617ec681f3Smrg * only be added to the worklist if it is not on the visited list and 1627ec681f3Smrg * the visited list - also a subset of the blocks - grows every 1637ec681f3Smrg * iteration, the algorithm must terminate. */ 1647ec681f3Smrg 1657ec681f3Smrg struct set_entry *cur; 1667ec681f3Smrg 1677ec681f3Smrg while((cur = _mesa_set_next_entry(worklist, NULL)) != NULL) { 1687ec681f3Smrg /* Pop off a block requiring helpers */ 1697ec681f3Smrg bi_block *blk = (struct bi_block *) cur->key; 1707ec681f3Smrg _mesa_set_remove(worklist, cur); 1717ec681f3Smrg 1727ec681f3Smrg /* Its predecessors also require helpers */ 1737ec681f3Smrg bi_foreach_predecessor(blk, pred) { 1747ec681f3Smrg if (!_mesa_set_search(visited, pred)) { 1757ec681f3Smrg pred->pass_flags |= 1; 1767ec681f3Smrg _mesa_set_add(worklist, pred); 1777ec681f3Smrg } 1787ec681f3Smrg } 1797ec681f3Smrg 1807ec681f3Smrg _mesa_set_add(visited, blk); 1817ec681f3Smrg } 1827ec681f3Smrg 1837ec681f3Smrg _mesa_set_destroy(visited, NULL); 1847ec681f3Smrg _mesa_set_destroy(worklist, NULL); 1857ec681f3Smrg 1867ec681f3Smrg /* Finally, mark clauses requiring helpers */ 1877ec681f3Smrg bi_foreach_block(ctx, block) { 1887ec681f3Smrg /* At the end, there are helpers iff we don't terminate */ 1897ec681f3Smrg bool helpers = !bi_block_terminates_helpers(block); 1907ec681f3Smrg 1917ec681f3Smrg bi_foreach_clause_in_block_rev(block, clause) { 1927ec681f3Smrg bi_foreach_instr_in_clause_rev(block, clause, I) { 1937ec681f3Smrg helpers |= bi_instr_uses_helpers(I); 1947ec681f3Smrg } 1957ec681f3Smrg 1967ec681f3Smrg clause->td = !helpers; 1977ec681f3Smrg } 1987ec681f3Smrg } 1997ec681f3Smrg} 2007ec681f3Smrg 2017ec681f3Smrgstatic bool 2027ec681f3Smrgbi_helper_block_update(BITSET_WORD *deps, bi_block *block) 2037ec681f3Smrg{ 2047ec681f3Smrg bool progress = false; 2057ec681f3Smrg 2067ec681f3Smrg bi_foreach_instr_in_block_rev(block, I) { 2077ec681f3Smrg /* If our destination is required by helper invocation... */ 2087ec681f3Smrg if (I->dest[0].type != BI_INDEX_NORMAL) 2097ec681f3Smrg continue; 2107ec681f3Smrg 2117ec681f3Smrg if (!BITSET_TEST(deps, bi_get_node(I->dest[0]))) 2127ec681f3Smrg continue; 2137ec681f3Smrg 2147ec681f3Smrg /* ...so are our sources */ 2157ec681f3Smrg bi_foreach_src(I, s) { 2167ec681f3Smrg if (I->src[s].type == BI_INDEX_NORMAL) { 2177ec681f3Smrg unsigned node = bi_get_node(I->src[s]); 2187ec681f3Smrg progress |= !BITSET_TEST(deps, node); 2197ec681f3Smrg BITSET_SET(deps, node); 2207ec681f3Smrg } 2217ec681f3Smrg } 2227ec681f3Smrg } 2237ec681f3Smrg 2247ec681f3Smrg return progress; 2257ec681f3Smrg} 2267ec681f3Smrg 2277ec681f3Smrgvoid 2287ec681f3Smrgbi_analyze_helper_requirements(bi_context *ctx) 2297ec681f3Smrg{ 2307ec681f3Smrg unsigned temp_count = bi_max_temp(ctx); 2317ec681f3Smrg BITSET_WORD *deps = calloc(sizeof(BITSET_WORD), BITSET_WORDS(temp_count)); 2327ec681f3Smrg 2337ec681f3Smrg /* Initialize with the sources of instructions consuming 2347ec681f3Smrg * derivatives */ 2357ec681f3Smrg 2367ec681f3Smrg bi_foreach_instr_global(ctx, I) { 2377ec681f3Smrg if (I->dest[0].type != BI_INDEX_NORMAL) continue; 2387ec681f3Smrg if (!bi_instr_uses_helpers(I)) continue; 2397ec681f3Smrg 2407ec681f3Smrg bi_foreach_src(I, s) { 2417ec681f3Smrg if (I->src[s].type == BI_INDEX_NORMAL) 2427ec681f3Smrg BITSET_SET(deps, bi_get_node(I->src[s])); 2437ec681f3Smrg } 2447ec681f3Smrg } 2457ec681f3Smrg 2467ec681f3Smrg /* Propagate that up */ 2477ec681f3Smrg 2487ec681f3Smrg struct set *work_list = _mesa_set_create(NULL, 2497ec681f3Smrg _mesa_hash_pointer, 2507ec681f3Smrg _mesa_key_pointer_equal); 2517ec681f3Smrg 2527ec681f3Smrg struct set *visited = _mesa_set_create(NULL, 2537ec681f3Smrg _mesa_hash_pointer, 2547ec681f3Smrg _mesa_key_pointer_equal); 2557ec681f3Smrg 2567ec681f3Smrg struct set_entry *cur = _mesa_set_add(work_list, pan_exit_block(&ctx->blocks)); 2577ec681f3Smrg 2587ec681f3Smrg do { 2597ec681f3Smrg bi_block *blk = (struct bi_block *) cur->key; 2607ec681f3Smrg _mesa_set_remove(work_list, cur); 2617ec681f3Smrg 2627ec681f3Smrg bool progress = bi_helper_block_update(deps, blk); 2637ec681f3Smrg 2647ec681f3Smrg if (progress || !_mesa_set_search(visited, blk)) { 2657ec681f3Smrg bi_foreach_predecessor(blk, pred) 2667ec681f3Smrg _mesa_set_add(work_list, pred); 2677ec681f3Smrg } 2687ec681f3Smrg 2697ec681f3Smrg _mesa_set_add(visited, blk); 2707ec681f3Smrg } while((cur = _mesa_set_next_entry(work_list, NULL)) != NULL); 2717ec681f3Smrg 2727ec681f3Smrg _mesa_set_destroy(visited, NULL); 2737ec681f3Smrg _mesa_set_destroy(work_list, NULL); 2747ec681f3Smrg 2757ec681f3Smrg /* Set the execute bits */ 2767ec681f3Smrg 2777ec681f3Smrg bi_foreach_instr_global(ctx, I) { 2787ec681f3Smrg if (!bi_has_skip_bit(I->op)) continue; 2797ec681f3Smrg if (I->dest[0].type != BI_INDEX_NORMAL) continue; 2807ec681f3Smrg 2817ec681f3Smrg I->skip = !BITSET_TEST(deps, bi_get_node(I->dest[0])); 2827ec681f3Smrg } 2837ec681f3Smrg 2847ec681f3Smrg free(deps); 2857ec681f3Smrg} 286