/*
 * Copyright © 2020 Google, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "nir.h" 257ec681f3Smrg#include "nir_builder.h" 267ec681f3Smrg 277ec681f3Smrg/* A pass to split intrinsics with discontinuous writemasks into ones 287ec681f3Smrg * with contiguous writemasks starting with .x, ie: 297ec681f3Smrg * 307ec681f3Smrg * vec4 32 ssa_76 = vec4 ssa_35, ssa_35, ssa_35, ssa_35 317ec681f3Smrg * intrinsic store_ssbo (ssa_76, ssa_105, ssa_106) (2, 0, 4, 0) // wrmask=y 327ec681f3Smrg * 337ec681f3Smrg * is turned into: 347ec681f3Smrg * 357ec681f3Smrg * vec4 32 ssa_76 = vec4 ssa_35, ssa_35, ssa_35, ssa_35 367ec681f3Smrg * vec1 32 ssa_107 = load_const (0x00000001) 377ec681f3Smrg * vec1 32 ssa_108 = iadd ssa_106, ssa_107 387ec681f3Smrg * vec1 32 ssa_109 = mov ssa_76.y 397ec681f3Smrg * intrinsic store_ssbo (ssa_109, ssa_105, ssa_108) (1, 0, 4, 0) // wrmask=x 407ec681f3Smrg * 417ec681f3Smrg * and likewise: 427ec681f3Smrg * 437ec681f3Smrg * vec4 32 ssa_76 = vec4 ssa_35, ssa_35, ssa_35, ssa_35 447ec681f3Smrg * intrinsic store_ssbo (ssa_76, ssa_105, ssa_106) (15, 0, 4, 0) // wrmask=xzw 457ec681f3Smrg * 467ec681f3Smrg * is split into: 477ec681f3Smrg * 487ec681f3Smrg * // .x component: 497ec681f3Smrg * vec4 32 ssa_76 = vec4 ssa_35, ssa_35, ssa_35, ssa_35 507ec681f3Smrg * vec1 32 ssa_107 = load_const (0x00000000) 517ec681f3Smrg * vec1 32 ssa_108 = iadd ssa_106, ssa_107 527ec681f3Smrg * vec1 32 ssa_109 = mov ssa_76.x 537ec681f3Smrg * intrinsic store_ssbo (ssa_109, ssa_105, ssa_108) (1, 0, 4, 0) // wrmask=x 547ec681f3Smrg * // .zw components: 557ec681f3Smrg * vec1 32 ssa_110 = load_const (0x00000002) 567ec681f3Smrg * vec1 32 ssa_111 = iadd ssa_106, ssa_110 577ec681f3Smrg * vec2 32 ssa_112 = mov ssa_76.zw 587ec681f3Smrg * intrinsic store_ssbo (ssa_112, ssa_105, ssa_111) (3, 0, 4, 0) // wrmask=xy 597ec681f3Smrg */ 607ec681f3Smrg 617ec681f3Smrgstatic int 627ec681f3Smrgvalue_src(nir_intrinsic_op intrinsic) 637ec681f3Smrg{ 647ec681f3Smrg switch (intrinsic) { 657ec681f3Smrg case nir_intrinsic_store_output: 
667ec681f3Smrg case nir_intrinsic_store_per_vertex_output: 677ec681f3Smrg case nir_intrinsic_store_ssbo: 687ec681f3Smrg case nir_intrinsic_store_shared: 697ec681f3Smrg case nir_intrinsic_store_global: 707ec681f3Smrg case nir_intrinsic_store_scratch: 717ec681f3Smrg return 0; 727ec681f3Smrg default: 737ec681f3Smrg return -1; 747ec681f3Smrg } 757ec681f3Smrg} 767ec681f3Smrg 777ec681f3Smrgstatic int 787ec681f3Smrgoffset_src(nir_intrinsic_op intrinsic) 797ec681f3Smrg{ 807ec681f3Smrg switch (intrinsic) { 817ec681f3Smrg case nir_intrinsic_store_output: 827ec681f3Smrg case nir_intrinsic_store_shared: 837ec681f3Smrg case nir_intrinsic_store_global: 847ec681f3Smrg case nir_intrinsic_store_scratch: 857ec681f3Smrg return 1; 867ec681f3Smrg case nir_intrinsic_store_per_vertex_output: 877ec681f3Smrg case nir_intrinsic_store_ssbo: 887ec681f3Smrg return 2; 897ec681f3Smrg default: 907ec681f3Smrg return -1; 917ec681f3Smrg } 927ec681f3Smrg} 937ec681f3Smrg 947ec681f3Smrgstatic void 957ec681f3Smrgsplit_wrmask(nir_builder *b, nir_intrinsic_instr *intr) 967ec681f3Smrg{ 977ec681f3Smrg const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic]; 987ec681f3Smrg 997ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 1007ec681f3Smrg 1017ec681f3Smrg assert(!info->has_dest); /* expecting only store intrinsics */ 1027ec681f3Smrg 1037ec681f3Smrg unsigned num_srcs = info->num_srcs; 1047ec681f3Smrg unsigned value_idx = value_src(intr->intrinsic); 1057ec681f3Smrg unsigned offset_idx = offset_src(intr->intrinsic); 1067ec681f3Smrg unsigned num_comp = nir_intrinsic_src_components(intr, value_idx); 1077ec681f3Smrg 1087ec681f3Smrg unsigned wrmask = nir_intrinsic_write_mask(intr); 1097ec681f3Smrg while (wrmask) { 1107ec681f3Smrg unsigned first_component = ffs(wrmask) - 1; 1117ec681f3Smrg unsigned length = ffs(~(wrmask >> first_component)) - 1; 1127ec681f3Smrg 1137ec681f3Smrg nir_ssa_def *value = nir_ssa_for_src(b, intr->src[value_idx], num_comp); 1147ec681f3Smrg nir_ssa_def *offset = 
nir_ssa_for_src(b, intr->src[offset_idx], 1); 1157ec681f3Smrg 1167ec681f3Smrg /* swizzle out the consecutive components that we'll store 1177ec681f3Smrg * in this iteration: 1187ec681f3Smrg */ 1197ec681f3Smrg unsigned cur_mask = (BITFIELD_MASK(length) << first_component); 1207ec681f3Smrg value = nir_channels(b, value, cur_mask); 1217ec681f3Smrg 1227ec681f3Smrg /* and create the replacement intrinsic: */ 1237ec681f3Smrg nir_intrinsic_instr *new_intr = 1247ec681f3Smrg nir_intrinsic_instr_create(b->shader, intr->intrinsic); 1257ec681f3Smrg 1267ec681f3Smrg nir_intrinsic_copy_const_indices(new_intr, intr); 1277ec681f3Smrg nir_intrinsic_set_write_mask(new_intr, BITFIELD_MASK(length)); 1287ec681f3Smrg 1297ec681f3Smrg const int offset_units = value->bit_size / 8; 1307ec681f3Smrg 1317ec681f3Smrg if (nir_intrinsic_has_align_mul(intr)) { 1327ec681f3Smrg assert(nir_intrinsic_has_align_offset(intr)); 1337ec681f3Smrg unsigned align_mul = nir_intrinsic_align_mul(intr); 1347ec681f3Smrg unsigned align_off = nir_intrinsic_align_offset(intr); 1357ec681f3Smrg 1367ec681f3Smrg align_off += offset_units * first_component; 1377ec681f3Smrg align_off = align_off % align_mul; 1387ec681f3Smrg 1397ec681f3Smrg nir_intrinsic_set_align(new_intr, align_mul, align_off); 1407ec681f3Smrg } 1417ec681f3Smrg 1427ec681f3Smrg /* if the instruction has a BASE, fold the offset adjustment 1437ec681f3Smrg * into that instead of adding alu instructions, otherwise add 1447ec681f3Smrg * instructions 1457ec681f3Smrg */ 1467ec681f3Smrg unsigned offset_adj = offset_units * first_component; 1477ec681f3Smrg if (nir_intrinsic_has_base(intr)) { 1487ec681f3Smrg nir_intrinsic_set_base(new_intr, 1497ec681f3Smrg nir_intrinsic_base(intr) + offset_adj); 1507ec681f3Smrg } else { 1517ec681f3Smrg offset = nir_iadd(b, offset, 1527ec681f3Smrg nir_imm_intN_t(b, offset_adj, offset->bit_size)); 1537ec681f3Smrg } 1547ec681f3Smrg 1557ec681f3Smrg new_intr->num_components = length; 1567ec681f3Smrg 1577ec681f3Smrg /* Copy the sources, 
replacing value/offset, and passing everything 1587ec681f3Smrg * else through to the new instrution: 1597ec681f3Smrg */ 1607ec681f3Smrg for (unsigned i = 0; i < num_srcs; i++) { 1617ec681f3Smrg if (i == value_idx) { 1627ec681f3Smrg new_intr->src[i] = nir_src_for_ssa(value); 1637ec681f3Smrg } else if (i == offset_idx) { 1647ec681f3Smrg new_intr->src[i] = nir_src_for_ssa(offset); 1657ec681f3Smrg } else { 1667ec681f3Smrg new_intr->src[i] = intr->src[i]; 1677ec681f3Smrg } 1687ec681f3Smrg } 1697ec681f3Smrg 1707ec681f3Smrg nir_builder_instr_insert(b, &new_intr->instr); 1717ec681f3Smrg 1727ec681f3Smrg /* Clear the bits in the writemask that we just wrote, then try 1737ec681f3Smrg * again to see if more channels are left. 1747ec681f3Smrg */ 1757ec681f3Smrg wrmask &= ~cur_mask; 1767ec681f3Smrg } 1777ec681f3Smrg 1787ec681f3Smrg /* Finally remove the original intrinsic. */ 1797ec681f3Smrg nir_instr_remove(&intr->instr); 1807ec681f3Smrg} 1817ec681f3Smrg 1827ec681f3Smrgstruct nir_lower_wrmasks_state { 1837ec681f3Smrg nir_instr_filter_cb cb; 1847ec681f3Smrg const void *data; 1857ec681f3Smrg}; 1867ec681f3Smrg 1877ec681f3Smrgstatic bool 1887ec681f3Smrgnir_lower_wrmasks_instr(nir_builder *b, nir_instr *instr, void *data) 1897ec681f3Smrg{ 1907ec681f3Smrg struct nir_lower_wrmasks_state *state = data; 1917ec681f3Smrg 1927ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 1937ec681f3Smrg return false; 1947ec681f3Smrg 1957ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 1967ec681f3Smrg 1977ec681f3Smrg /* if no wrmask, then skip it: */ 1987ec681f3Smrg if (!nir_intrinsic_has_write_mask(intr)) 1997ec681f3Smrg return false; 2007ec681f3Smrg 2017ec681f3Smrg /* if wrmask is already contiguous, then nothing to do: */ 2027ec681f3Smrg if (nir_intrinsic_write_mask(intr) == BITFIELD_MASK(intr->num_components)) 2037ec681f3Smrg return false; 2047ec681f3Smrg 2057ec681f3Smrg /* do we know how to lower this instruction? 
*/ 2067ec681f3Smrg if (value_src(intr->intrinsic) < 0) 2077ec681f3Smrg return false; 2087ec681f3Smrg 2097ec681f3Smrg assert(offset_src(intr->intrinsic) >= 0); 2107ec681f3Smrg 2117ec681f3Smrg /* does backend need us to lower this intrinsic? */ 2127ec681f3Smrg if (state->cb && !state->cb(instr, state->data)) 2137ec681f3Smrg return false; 2147ec681f3Smrg 2157ec681f3Smrg split_wrmask(b, intr); 2167ec681f3Smrg 2177ec681f3Smrg return true; 2187ec681f3Smrg} 2197ec681f3Smrg 2207ec681f3Smrgbool 2217ec681f3Smrgnir_lower_wrmasks(nir_shader *shader, nir_instr_filter_cb cb, const void *data) 2227ec681f3Smrg{ 2237ec681f3Smrg struct nir_lower_wrmasks_state state = { 2247ec681f3Smrg .cb = cb, 2257ec681f3Smrg .data = data, 2267ec681f3Smrg }; 2277ec681f3Smrg 2287ec681f3Smrg return nir_shader_instructions_pass(shader, 2297ec681f3Smrg nir_lower_wrmasks_instr, 2307ec681f3Smrg nir_metadata_block_index | 2317ec681f3Smrg nir_metadata_dominance, 2327ec681f3Smrg &state); 2337ec681f3Smrg} 234