17ec681f3Smrg/* 27ec681f3Smrg * Copyright © Microsoft Corporation 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 207ec681f3Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 217ec681f3Smrg * IN THE SOFTWARE. 
227ec681f3Smrg */ 237ec681f3Smrg 247ec681f3Smrg#include "dxil_nir.h" 257ec681f3Smrg 267ec681f3Smrg#include "nir_builder.h" 277ec681f3Smrg#include "nir_deref.h" 287ec681f3Smrg#include "nir_to_dxil.h" 297ec681f3Smrg#include "util/u_math.h" 307ec681f3Smrg 317ec681f3Smrgstatic void 327ec681f3Smrgcl_type_size_align(const struct glsl_type *type, unsigned *size, 337ec681f3Smrg unsigned *align) 347ec681f3Smrg{ 357ec681f3Smrg *size = glsl_get_cl_size(type); 367ec681f3Smrg *align = glsl_get_cl_alignment(type); 377ec681f3Smrg} 387ec681f3Smrg 397ec681f3Smrgstatic void 407ec681f3Smrgextract_comps_from_vec32(nir_builder *b, nir_ssa_def *vec32, 417ec681f3Smrg unsigned dst_bit_size, 427ec681f3Smrg nir_ssa_def **dst_comps, 437ec681f3Smrg unsigned num_dst_comps) 447ec681f3Smrg{ 457ec681f3Smrg unsigned step = DIV_ROUND_UP(dst_bit_size, 32); 467ec681f3Smrg unsigned comps_per32b = 32 / dst_bit_size; 477ec681f3Smrg nir_ssa_def *tmp; 487ec681f3Smrg 497ec681f3Smrg for (unsigned i = 0; i < vec32->num_components; i += step) { 507ec681f3Smrg switch (dst_bit_size) { 517ec681f3Smrg case 64: 527ec681f3Smrg tmp = nir_pack_64_2x32_split(b, nir_channel(b, vec32, i), 537ec681f3Smrg nir_channel(b, vec32, i + 1)); 547ec681f3Smrg dst_comps[i / 2] = tmp; 557ec681f3Smrg break; 567ec681f3Smrg case 32: 577ec681f3Smrg dst_comps[i] = nir_channel(b, vec32, i); 587ec681f3Smrg break; 597ec681f3Smrg case 16: 607ec681f3Smrg case 8: { 617ec681f3Smrg unsigned dst_offs = i * comps_per32b; 627ec681f3Smrg 637ec681f3Smrg tmp = nir_unpack_bits(b, nir_channel(b, vec32, i), dst_bit_size); 647ec681f3Smrg for (unsigned j = 0; j < comps_per32b && dst_offs + j < num_dst_comps; j++) 657ec681f3Smrg dst_comps[dst_offs + j] = nir_channel(b, tmp, j); 667ec681f3Smrg } 677ec681f3Smrg 687ec681f3Smrg break; 697ec681f3Smrg } 707ec681f3Smrg } 717ec681f3Smrg} 727ec681f3Smrg 737ec681f3Smrgstatic nir_ssa_def * 747ec681f3Smrgload_comps_to_vec32(nir_builder *b, unsigned src_bit_size, 757ec681f3Smrg nir_ssa_def **src_comps, unsigned 
num_src_comps) 767ec681f3Smrg{ 777ec681f3Smrg unsigned num_vec32comps = DIV_ROUND_UP(num_src_comps * src_bit_size, 32); 787ec681f3Smrg unsigned step = DIV_ROUND_UP(src_bit_size, 32); 797ec681f3Smrg unsigned comps_per32b = 32 / src_bit_size; 807ec681f3Smrg nir_ssa_def *vec32comps[4]; 817ec681f3Smrg 827ec681f3Smrg for (unsigned i = 0; i < num_vec32comps; i += step) { 837ec681f3Smrg switch (src_bit_size) { 847ec681f3Smrg case 64: 857ec681f3Smrg vec32comps[i] = nir_unpack_64_2x32_split_x(b, src_comps[i / 2]); 867ec681f3Smrg vec32comps[i + 1] = nir_unpack_64_2x32_split_y(b, src_comps[i / 2]); 877ec681f3Smrg break; 887ec681f3Smrg case 32: 897ec681f3Smrg vec32comps[i] = src_comps[i]; 907ec681f3Smrg break; 917ec681f3Smrg case 16: 927ec681f3Smrg case 8: { 937ec681f3Smrg unsigned src_offs = i * comps_per32b; 947ec681f3Smrg 957ec681f3Smrg vec32comps[i] = nir_u2u32(b, src_comps[src_offs]); 967ec681f3Smrg for (unsigned j = 1; j < comps_per32b && src_offs + j < num_src_comps; j++) { 977ec681f3Smrg nir_ssa_def *tmp = nir_ishl(b, nir_u2u32(b, src_comps[src_offs + j]), 987ec681f3Smrg nir_imm_int(b, j * src_bit_size)); 997ec681f3Smrg vec32comps[i] = nir_ior(b, vec32comps[i], tmp); 1007ec681f3Smrg } 1017ec681f3Smrg break; 1027ec681f3Smrg } 1037ec681f3Smrg } 1047ec681f3Smrg } 1057ec681f3Smrg 1067ec681f3Smrg return nir_vec(b, vec32comps, num_vec32comps); 1077ec681f3Smrg} 1087ec681f3Smrg 1097ec681f3Smrgstatic nir_ssa_def * 1107ec681f3Smrgbuild_load_ptr_dxil(nir_builder *b, nir_deref_instr *deref, nir_ssa_def *idx) 1117ec681f3Smrg{ 1127ec681f3Smrg return nir_load_ptr_dxil(b, 1, 32, &deref->dest.ssa, idx); 1137ec681f3Smrg} 1147ec681f3Smrg 1157ec681f3Smrgstatic bool 1167ec681f3Smrglower_load_deref(nir_builder *b, nir_intrinsic_instr *intr) 1177ec681f3Smrg{ 1187ec681f3Smrg assert(intr->dest.is_ssa); 1197ec681f3Smrg 1207ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 1217ec681f3Smrg 1227ec681f3Smrg nir_deref_instr *deref = nir_src_as_deref(intr->src[0]); 1237ec681f3Smrg if 
(!nir_deref_mode_is(deref, nir_var_shader_temp)) 1247ec681f3Smrg return false; 1257ec681f3Smrg nir_ssa_def *ptr = nir_u2u32(b, nir_build_deref_offset(b, deref, cl_type_size_align)); 1267ec681f3Smrg nir_ssa_def *offset = nir_iand(b, ptr, nir_inot(b, nir_imm_int(b, 3))); 1277ec681f3Smrg 1287ec681f3Smrg assert(intr->dest.is_ssa); 1297ec681f3Smrg unsigned num_components = nir_dest_num_components(intr->dest); 1307ec681f3Smrg unsigned bit_size = nir_dest_bit_size(intr->dest); 1317ec681f3Smrg unsigned load_size = MAX2(32, bit_size); 1327ec681f3Smrg unsigned num_bits = num_components * bit_size; 1337ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 1347ec681f3Smrg unsigned comp_idx = 0; 1357ec681f3Smrg 1367ec681f3Smrg nir_deref_path path; 1377ec681f3Smrg nir_deref_path_init(&path, deref, NULL); 1387ec681f3Smrg nir_ssa_def *base_idx = nir_ishr(b, offset, nir_imm_int(b, 2 /* log2(32 / 8) */)); 1397ec681f3Smrg 1407ec681f3Smrg /* Split loads into 32-bit chunks */ 1417ec681f3Smrg for (unsigned i = 0; i < num_bits; i += load_size) { 1427ec681f3Smrg unsigned subload_num_bits = MIN2(num_bits - i, load_size); 1437ec681f3Smrg nir_ssa_def *idx = nir_iadd(b, base_idx, nir_imm_int(b, i / 32)); 1447ec681f3Smrg nir_ssa_def *vec32 = build_load_ptr_dxil(b, path.path[0], idx); 1457ec681f3Smrg 1467ec681f3Smrg if (load_size == 64) { 1477ec681f3Smrg idx = nir_iadd(b, idx, nir_imm_int(b, 1)); 1487ec681f3Smrg vec32 = nir_vec2(b, vec32, 1497ec681f3Smrg build_load_ptr_dxil(b, path.path[0], idx)); 1507ec681f3Smrg } 1517ec681f3Smrg 1527ec681f3Smrg /* If we have 2 bytes or less to load we need to adjust the u32 value so 1537ec681f3Smrg * we can always extract the LSB. 
1547ec681f3Smrg */ 1557ec681f3Smrg if (subload_num_bits <= 16) { 1567ec681f3Smrg nir_ssa_def *shift = nir_imul(b, nir_iand(b, ptr, nir_imm_int(b, 3)), 1577ec681f3Smrg nir_imm_int(b, 8)); 1587ec681f3Smrg vec32 = nir_ushr(b, vec32, shift); 1597ec681f3Smrg } 1607ec681f3Smrg 1617ec681f3Smrg /* And now comes the pack/unpack step to match the original type. */ 1627ec681f3Smrg extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], 1637ec681f3Smrg subload_num_bits / bit_size); 1647ec681f3Smrg comp_idx += subload_num_bits / bit_size; 1657ec681f3Smrg } 1667ec681f3Smrg 1677ec681f3Smrg nir_deref_path_finish(&path); 1687ec681f3Smrg assert(comp_idx == num_components); 1697ec681f3Smrg nir_ssa_def *result = nir_vec(b, comps, num_components); 1707ec681f3Smrg nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 1717ec681f3Smrg nir_instr_remove(&intr->instr); 1727ec681f3Smrg return true; 1737ec681f3Smrg} 1747ec681f3Smrg 1757ec681f3Smrgstatic nir_ssa_def * 1767ec681f3Smrgubo_load_select_32b_comps(nir_builder *b, nir_ssa_def *vec32, 1777ec681f3Smrg nir_ssa_def *offset, unsigned num_bytes) 1787ec681f3Smrg{ 1797ec681f3Smrg assert(num_bytes == 16 || num_bytes == 12 || num_bytes == 8 || 1807ec681f3Smrg num_bytes == 4 || num_bytes == 3 || num_bytes == 2 || 1817ec681f3Smrg num_bytes == 1); 1827ec681f3Smrg assert(vec32->num_components == 4); 1837ec681f3Smrg 1847ec681f3Smrg /* 16 and 12 byte types are always aligned on 16 bytes. */ 1857ec681f3Smrg if (num_bytes > 8) 1867ec681f3Smrg return vec32; 1877ec681f3Smrg 1887ec681f3Smrg nir_ssa_def *comps[4]; 1897ec681f3Smrg nir_ssa_def *cond; 1907ec681f3Smrg 1917ec681f3Smrg for (unsigned i = 0; i < 4; i++) 1927ec681f3Smrg comps[i] = nir_channel(b, vec32, i); 1937ec681f3Smrg 1947ec681f3Smrg /* If we have 8bytes or less to load, select which half the vec4 should 1957ec681f3Smrg * be used. 
1967ec681f3Smrg */ 1977ec681f3Smrg cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x8)), 1987ec681f3Smrg nir_imm_int(b, 0)); 1997ec681f3Smrg 2007ec681f3Smrg comps[0] = nir_bcsel(b, cond, comps[2], comps[0]); 2017ec681f3Smrg comps[1] = nir_bcsel(b, cond, comps[3], comps[1]); 2027ec681f3Smrg 2037ec681f3Smrg /* Thanks to the CL alignment constraints, if we want 8 bytes we're done. */ 2047ec681f3Smrg if (num_bytes == 8) 2057ec681f3Smrg return nir_vec(b, comps, 2); 2067ec681f3Smrg 2077ec681f3Smrg /* 4 bytes or less needed, select which of the 32bit component should be 2087ec681f3Smrg * used and return it. The sub-32bit split is handled in 2097ec681f3Smrg * extract_comps_from_vec32(). 2107ec681f3Smrg */ 2117ec681f3Smrg cond = nir_ine(b, nir_iand(b, offset, nir_imm_int(b, 0x4)), 2127ec681f3Smrg nir_imm_int(b, 0)); 2137ec681f3Smrg return nir_bcsel(b, cond, comps[1], comps[0]); 2147ec681f3Smrg} 2157ec681f3Smrg 2167ec681f3Smrgnir_ssa_def * 2177ec681f3Smrgbuild_load_ubo_dxil(nir_builder *b, nir_ssa_def *buffer, 2187ec681f3Smrg nir_ssa_def *offset, unsigned num_components, 2197ec681f3Smrg unsigned bit_size) 2207ec681f3Smrg{ 2217ec681f3Smrg nir_ssa_def *idx = nir_ushr(b, offset, nir_imm_int(b, 4)); 2227ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 2237ec681f3Smrg unsigned num_bits = num_components * bit_size; 2247ec681f3Smrg unsigned comp_idx = 0; 2257ec681f3Smrg 2267ec681f3Smrg /* We need to split loads in 16byte chunks because that's the 2277ec681f3Smrg * granularity of cBufferLoadLegacy(). 2287ec681f3Smrg */ 2297ec681f3Smrg for (unsigned i = 0; i < num_bits; i += (16 * 8)) { 2307ec681f3Smrg /* For each 16byte chunk (or smaller) we generate a 32bit ubo vec 2317ec681f3Smrg * load. 
2327ec681f3Smrg */ 2337ec681f3Smrg unsigned subload_num_bits = MIN2(num_bits - i, 16 * 8); 2347ec681f3Smrg nir_ssa_def *vec32 = 2357ec681f3Smrg nir_load_ubo_dxil(b, 4, 32, buffer, nir_iadd(b, idx, nir_imm_int(b, i / (16 * 8)))); 2367ec681f3Smrg 2377ec681f3Smrg /* First re-arrange the vec32 to account for intra 16-byte offset. */ 2387ec681f3Smrg vec32 = ubo_load_select_32b_comps(b, vec32, offset, subload_num_bits / 8); 2397ec681f3Smrg 2407ec681f3Smrg /* If we have 2 bytes or less to load we need to adjust the u32 value so 2417ec681f3Smrg * we can always extract the LSB. 2427ec681f3Smrg */ 2437ec681f3Smrg if (subload_num_bits <= 16) { 2447ec681f3Smrg nir_ssa_def *shift = nir_imul(b, nir_iand(b, offset, 2457ec681f3Smrg nir_imm_int(b, 3)), 2467ec681f3Smrg nir_imm_int(b, 8)); 2477ec681f3Smrg vec32 = nir_ushr(b, vec32, shift); 2487ec681f3Smrg } 2497ec681f3Smrg 2507ec681f3Smrg /* And now comes the pack/unpack step to match the original type. */ 2517ec681f3Smrg extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], 2527ec681f3Smrg subload_num_bits / bit_size); 2537ec681f3Smrg comp_idx += subload_num_bits / bit_size; 2547ec681f3Smrg } 2557ec681f3Smrg 2567ec681f3Smrg assert(comp_idx == num_components); 2577ec681f3Smrg return nir_vec(b, comps, num_components); 2587ec681f3Smrg} 2597ec681f3Smrg 2607ec681f3Smrgstatic bool 2617ec681f3Smrglower_load_ssbo(nir_builder *b, nir_intrinsic_instr *intr) 2627ec681f3Smrg{ 2637ec681f3Smrg assert(intr->dest.is_ssa); 2647ec681f3Smrg assert(intr->src[0].is_ssa); 2657ec681f3Smrg assert(intr->src[1].is_ssa); 2667ec681f3Smrg 2677ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 2687ec681f3Smrg 2697ec681f3Smrg nir_ssa_def *buffer = intr->src[0].ssa; 2707ec681f3Smrg nir_ssa_def *offset = nir_iand(b, intr->src[1].ssa, nir_imm_int(b, ~3)); 2717ec681f3Smrg enum gl_access_qualifier access = nir_intrinsic_access(intr); 2727ec681f3Smrg unsigned bit_size = nir_dest_bit_size(intr->dest); 2737ec681f3Smrg unsigned num_components = 
nir_dest_num_components(intr->dest); 2747ec681f3Smrg unsigned num_bits = num_components * bit_size; 2757ec681f3Smrg 2767ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 2777ec681f3Smrg unsigned comp_idx = 0; 2787ec681f3Smrg 2797ec681f3Smrg /* We need to split loads in 16byte chunks because that's the optimal 2807ec681f3Smrg * granularity of bufferLoad(). Minimum alignment is 4byte, which saves 2817ec681f3Smrg * from us from extra complexity to extract >= 32 bit components. 2827ec681f3Smrg */ 2837ec681f3Smrg for (unsigned i = 0; i < num_bits; i += 4 * 32) { 2847ec681f3Smrg /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec 2857ec681f3Smrg * load. 2867ec681f3Smrg */ 2877ec681f3Smrg unsigned subload_num_bits = MIN2(num_bits - i, 4 * 32); 2887ec681f3Smrg 2897ec681f3Smrg /* The number of components to store depends on the number of bytes. */ 2907ec681f3Smrg nir_ssa_def *vec32 = 2917ec681f3Smrg nir_load_ssbo(b, DIV_ROUND_UP(subload_num_bits, 32), 32, 2927ec681f3Smrg buffer, nir_iadd(b, offset, nir_imm_int(b, i / 8)), 2937ec681f3Smrg .align_mul = 4, 2947ec681f3Smrg .align_offset = 0, 2957ec681f3Smrg .access = access); 2967ec681f3Smrg 2977ec681f3Smrg /* If we have 2 bytes or less to load we need to adjust the u32 value so 2987ec681f3Smrg * we can always extract the LSB. 2997ec681f3Smrg */ 3007ec681f3Smrg if (subload_num_bits <= 16) { 3017ec681f3Smrg nir_ssa_def *shift = nir_imul(b, nir_iand(b, intr->src[1].ssa, nir_imm_int(b, 3)), 3027ec681f3Smrg nir_imm_int(b, 8)); 3037ec681f3Smrg vec32 = nir_ushr(b, vec32, shift); 3047ec681f3Smrg } 3057ec681f3Smrg 3067ec681f3Smrg /* And now comes the pack/unpack step to match the original type. 
*/ 3077ec681f3Smrg extract_comps_from_vec32(b, vec32, bit_size, &comps[comp_idx], 3087ec681f3Smrg subload_num_bits / bit_size); 3097ec681f3Smrg comp_idx += subload_num_bits / bit_size; 3107ec681f3Smrg } 3117ec681f3Smrg 3127ec681f3Smrg assert(comp_idx == num_components); 3137ec681f3Smrg nir_ssa_def *result = nir_vec(b, comps, num_components); 3147ec681f3Smrg nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 3157ec681f3Smrg nir_instr_remove(&intr->instr); 3167ec681f3Smrg return true; 3177ec681f3Smrg} 3187ec681f3Smrg 3197ec681f3Smrgstatic bool 3207ec681f3Smrglower_store_ssbo(nir_builder *b, nir_intrinsic_instr *intr) 3217ec681f3Smrg{ 3227ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 3237ec681f3Smrg 3247ec681f3Smrg assert(intr->src[0].is_ssa); 3257ec681f3Smrg assert(intr->src[1].is_ssa); 3267ec681f3Smrg assert(intr->src[2].is_ssa); 3277ec681f3Smrg 3287ec681f3Smrg nir_ssa_def *val = intr->src[0].ssa; 3297ec681f3Smrg nir_ssa_def *buffer = intr->src[1].ssa; 3307ec681f3Smrg nir_ssa_def *offset = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, ~3)); 3317ec681f3Smrg 3327ec681f3Smrg unsigned bit_size = val->bit_size; 3337ec681f3Smrg unsigned num_components = val->num_components; 3347ec681f3Smrg unsigned num_bits = num_components * bit_size; 3357ec681f3Smrg 3367ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 3377ec681f3Smrg unsigned comp_idx = 0; 3387ec681f3Smrg 3397ec681f3Smrg for (unsigned i = 0; i < num_components; i++) 3407ec681f3Smrg comps[i] = nir_channel(b, val, i); 3417ec681f3Smrg 3427ec681f3Smrg /* We split stores in 16byte chunks because that's the optimal granularity 3437ec681f3Smrg * of bufferStore(). Minimum alignment is 4byte, which saves from us from 3447ec681f3Smrg * extra complexity to store >= 32 bit components. 3457ec681f3Smrg */ 3467ec681f3Smrg for (unsigned i = 0; i < num_bits; i += 4 * 32) { 3477ec681f3Smrg /* For each 16byte chunk (or smaller) we generate a 32bit ssbo vec 3487ec681f3Smrg * store. 
3497ec681f3Smrg */ 3507ec681f3Smrg unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); 3517ec681f3Smrg nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); 3527ec681f3Smrg nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], 3537ec681f3Smrg substore_num_bits / bit_size); 3547ec681f3Smrg nir_intrinsic_instr *store; 3557ec681f3Smrg 3567ec681f3Smrg if (substore_num_bits < 32) { 3577ec681f3Smrg nir_ssa_def *mask = nir_imm_int(b, (1 << substore_num_bits) - 1); 3587ec681f3Smrg 3597ec681f3Smrg /* If we have 16 bits or less to store we need to place them 3607ec681f3Smrg * correctly in the u32 component. Anything greater than 16 bits 3617ec681f3Smrg * (including uchar3) is naturally aligned on 32bits. 3627ec681f3Smrg */ 3637ec681f3Smrg if (substore_num_bits <= 16) { 3647ec681f3Smrg nir_ssa_def *pos = nir_iand(b, intr->src[2].ssa, nir_imm_int(b, 3)); 3657ec681f3Smrg nir_ssa_def *shift = nir_imul_imm(b, pos, 8); 3667ec681f3Smrg 3677ec681f3Smrg vec32 = nir_ishl(b, vec32, shift); 3687ec681f3Smrg mask = nir_ishl(b, mask, shift); 3697ec681f3Smrg } 3707ec681f3Smrg 3717ec681f3Smrg store = nir_intrinsic_instr_create(b->shader, 3727ec681f3Smrg nir_intrinsic_store_ssbo_masked_dxil); 3737ec681f3Smrg store->src[0] = nir_src_for_ssa(vec32); 3747ec681f3Smrg store->src[1] = nir_src_for_ssa(nir_inot(b, mask)); 3757ec681f3Smrg store->src[2] = nir_src_for_ssa(buffer); 3767ec681f3Smrg store->src[3] = nir_src_for_ssa(local_offset); 3777ec681f3Smrg } else { 3787ec681f3Smrg store = nir_intrinsic_instr_create(b->shader, 3797ec681f3Smrg nir_intrinsic_store_ssbo); 3807ec681f3Smrg store->src[0] = nir_src_for_ssa(vec32); 3817ec681f3Smrg store->src[1] = nir_src_for_ssa(buffer); 3827ec681f3Smrg store->src[2] = nir_src_for_ssa(local_offset); 3837ec681f3Smrg 3847ec681f3Smrg nir_intrinsic_set_align(store, 4, 0); 3857ec681f3Smrg } 3867ec681f3Smrg 3877ec681f3Smrg /* The number of components to store depends on the number of bits. 
*/ 3887ec681f3Smrg store->num_components = DIV_ROUND_UP(substore_num_bits, 32); 3897ec681f3Smrg nir_builder_instr_insert(b, &store->instr); 3907ec681f3Smrg comp_idx += substore_num_bits / bit_size; 3917ec681f3Smrg } 3927ec681f3Smrg 3937ec681f3Smrg nir_instr_remove(&intr->instr); 3947ec681f3Smrg return true; 3957ec681f3Smrg} 3967ec681f3Smrg 3977ec681f3Smrgstatic void 3987ec681f3Smrglower_load_vec32(nir_builder *b, nir_ssa_def *index, unsigned num_comps, nir_ssa_def **comps, nir_intrinsic_op op) 3997ec681f3Smrg{ 4007ec681f3Smrg for (unsigned i = 0; i < num_comps; i++) { 4017ec681f3Smrg nir_intrinsic_instr *load = 4027ec681f3Smrg nir_intrinsic_instr_create(b->shader, op); 4037ec681f3Smrg 4047ec681f3Smrg load->num_components = 1; 4057ec681f3Smrg load->src[0] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); 4067ec681f3Smrg nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL); 4077ec681f3Smrg nir_builder_instr_insert(b, &load->instr); 4087ec681f3Smrg comps[i] = &load->dest.ssa; 4097ec681f3Smrg } 4107ec681f3Smrg} 4117ec681f3Smrg 4127ec681f3Smrgstatic bool 4137ec681f3Smrglower_32b_offset_load(nir_builder *b, nir_intrinsic_instr *intr) 4147ec681f3Smrg{ 4157ec681f3Smrg assert(intr->dest.is_ssa); 4167ec681f3Smrg unsigned bit_size = nir_dest_bit_size(intr->dest); 4177ec681f3Smrg unsigned num_components = nir_dest_num_components(intr->dest); 4187ec681f3Smrg unsigned num_bits = num_components * bit_size; 4197ec681f3Smrg 4207ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 4217ec681f3Smrg nir_intrinsic_op op = intr->intrinsic; 4227ec681f3Smrg 4237ec681f3Smrg assert(intr->src[0].is_ssa); 4247ec681f3Smrg nir_ssa_def *offset = intr->src[0].ssa; 4257ec681f3Smrg if (op == nir_intrinsic_load_shared) { 4267ec681f3Smrg offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); 4277ec681f3Smrg op = nir_intrinsic_load_shared_dxil; 4287ec681f3Smrg } else { 4297ec681f3Smrg offset = nir_u2u32(b, offset); 4307ec681f3Smrg op = nir_intrinsic_load_scratch_dxil; 
4317ec681f3Smrg } 4327ec681f3Smrg nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2)); 4337ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 4347ec681f3Smrg nir_ssa_def *comps_32bit[NIR_MAX_VEC_COMPONENTS * 2]; 4357ec681f3Smrg 4367ec681f3Smrg /* We need to split loads in 32-bit accesses because the buffer 4377ec681f3Smrg * is an i32 array and DXIL does not support type casts. 4387ec681f3Smrg */ 4397ec681f3Smrg unsigned num_32bit_comps = DIV_ROUND_UP(num_bits, 32); 4407ec681f3Smrg lower_load_vec32(b, index, num_32bit_comps, comps_32bit, op); 4417ec681f3Smrg unsigned num_comps_per_pass = MIN2(num_32bit_comps, 4); 4427ec681f3Smrg 4437ec681f3Smrg for (unsigned i = 0; i < num_32bit_comps; i += num_comps_per_pass) { 4447ec681f3Smrg unsigned num_vec32_comps = MIN2(num_32bit_comps - i, 4); 4457ec681f3Smrg unsigned num_dest_comps = num_vec32_comps * 32 / bit_size; 4467ec681f3Smrg nir_ssa_def *vec32 = nir_vec(b, &comps_32bit[i], num_vec32_comps); 4477ec681f3Smrg 4487ec681f3Smrg /* If we have 16 bits or less to load we need to adjust the u32 value so 4497ec681f3Smrg * we can always extract the LSB. 4507ec681f3Smrg */ 4517ec681f3Smrg if (num_bits <= 16) { 4527ec681f3Smrg nir_ssa_def *shift = 4537ec681f3Smrg nir_imul(b, nir_iand(b, offset, nir_imm_int(b, 3)), 4547ec681f3Smrg nir_imm_int(b, 8)); 4557ec681f3Smrg vec32 = nir_ushr(b, vec32, shift); 4567ec681f3Smrg } 4577ec681f3Smrg 4587ec681f3Smrg /* And now comes the pack/unpack step to match the original type. 
*/ 4597ec681f3Smrg unsigned dest_index = i * 32 / bit_size; 4607ec681f3Smrg extract_comps_from_vec32(b, vec32, bit_size, &comps[dest_index], num_dest_comps); 4617ec681f3Smrg } 4627ec681f3Smrg 4637ec681f3Smrg nir_ssa_def *result = nir_vec(b, comps, num_components); 4647ec681f3Smrg nir_ssa_def_rewrite_uses(&intr->dest.ssa, result); 4657ec681f3Smrg nir_instr_remove(&intr->instr); 4667ec681f3Smrg 4677ec681f3Smrg return true; 4687ec681f3Smrg} 4697ec681f3Smrg 4707ec681f3Smrgstatic void 4717ec681f3Smrglower_store_vec32(nir_builder *b, nir_ssa_def *index, nir_ssa_def *vec32, nir_intrinsic_op op) 4727ec681f3Smrg{ 4737ec681f3Smrg 4747ec681f3Smrg for (unsigned i = 0; i < vec32->num_components; i++) { 4757ec681f3Smrg nir_intrinsic_instr *store = 4767ec681f3Smrg nir_intrinsic_instr_create(b->shader, op); 4777ec681f3Smrg 4787ec681f3Smrg store->src[0] = nir_src_for_ssa(nir_channel(b, vec32, i)); 4797ec681f3Smrg store->src[1] = nir_src_for_ssa(nir_iadd(b, index, nir_imm_int(b, i))); 4807ec681f3Smrg store->num_components = 1; 4817ec681f3Smrg nir_builder_instr_insert(b, &store->instr); 4827ec681f3Smrg } 4837ec681f3Smrg} 4847ec681f3Smrg 4857ec681f3Smrgstatic void 4867ec681f3Smrglower_masked_store_vec32(nir_builder *b, nir_ssa_def *offset, nir_ssa_def *index, 4877ec681f3Smrg nir_ssa_def *vec32, unsigned num_bits, nir_intrinsic_op op) 4887ec681f3Smrg{ 4897ec681f3Smrg nir_ssa_def *mask = nir_imm_int(b, (1 << num_bits) - 1); 4907ec681f3Smrg 4917ec681f3Smrg /* If we have 16 bits or less to store we need to place them correctly in 4927ec681f3Smrg * the u32 component. Anything greater than 16 bits (including uchar3) is 4937ec681f3Smrg * naturally aligned on 32bits. 
4947ec681f3Smrg */ 4957ec681f3Smrg if (num_bits <= 16) { 4967ec681f3Smrg nir_ssa_def *shift = 4977ec681f3Smrg nir_imul_imm(b, nir_iand(b, offset, nir_imm_int(b, 3)), 8); 4987ec681f3Smrg 4997ec681f3Smrg vec32 = nir_ishl(b, vec32, shift); 5007ec681f3Smrg mask = nir_ishl(b, mask, shift); 5017ec681f3Smrg } 5027ec681f3Smrg 5037ec681f3Smrg if (op == nir_intrinsic_store_shared_dxil) { 5047ec681f3Smrg /* Use the dedicated masked intrinsic */ 5057ec681f3Smrg nir_store_shared_masked_dxil(b, vec32, nir_inot(b, mask), index); 5067ec681f3Smrg } else { 5077ec681f3Smrg /* For scratch, since we don't need atomics, just generate the read-modify-write in NIR */ 5087ec681f3Smrg nir_ssa_def *load = nir_load_scratch_dxil(b, 1, 32, index); 5097ec681f3Smrg 5107ec681f3Smrg nir_ssa_def *new_val = nir_ior(b, vec32, 5117ec681f3Smrg nir_iand(b, 5127ec681f3Smrg nir_inot(b, mask), 5137ec681f3Smrg load)); 5147ec681f3Smrg 5157ec681f3Smrg lower_store_vec32(b, index, new_val, op); 5167ec681f3Smrg } 5177ec681f3Smrg} 5187ec681f3Smrg 5197ec681f3Smrgstatic bool 5207ec681f3Smrglower_32b_offset_store(nir_builder *b, nir_intrinsic_instr *intr) 5217ec681f3Smrg{ 5227ec681f3Smrg assert(intr->src[0].is_ssa); 5237ec681f3Smrg unsigned num_components = nir_src_num_components(intr->src[0]); 5247ec681f3Smrg unsigned bit_size = nir_src_bit_size(intr->src[0]); 5257ec681f3Smrg unsigned num_bits = num_components * bit_size; 5267ec681f3Smrg 5277ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 5287ec681f3Smrg nir_intrinsic_op op = intr->intrinsic; 5297ec681f3Smrg 5307ec681f3Smrg nir_ssa_def *offset = intr->src[1].ssa; 5317ec681f3Smrg if (op == nir_intrinsic_store_shared) { 5327ec681f3Smrg offset = nir_iadd(b, offset, nir_imm_int(b, nir_intrinsic_base(intr))); 5337ec681f3Smrg op = nir_intrinsic_store_shared_dxil; 5347ec681f3Smrg } else { 5357ec681f3Smrg offset = nir_u2u32(b, offset); 5367ec681f3Smrg op = nir_intrinsic_store_scratch_dxil; 5377ec681f3Smrg } 5387ec681f3Smrg nir_ssa_def *comps[NIR_MAX_VEC_COMPONENTS]; 
5397ec681f3Smrg 5407ec681f3Smrg unsigned comp_idx = 0; 5417ec681f3Smrg for (unsigned i = 0; i < num_components; i++) 5427ec681f3Smrg comps[i] = nir_channel(b, intr->src[0].ssa, i); 5437ec681f3Smrg 5447ec681f3Smrg for (unsigned i = 0; i < num_bits; i += 4 * 32) { 5457ec681f3Smrg /* For each 4byte chunk (or smaller) we generate a 32bit scalar store. 5467ec681f3Smrg */ 5477ec681f3Smrg unsigned substore_num_bits = MIN2(num_bits - i, 4 * 32); 5487ec681f3Smrg nir_ssa_def *local_offset = nir_iadd(b, offset, nir_imm_int(b, i / 8)); 5497ec681f3Smrg nir_ssa_def *vec32 = load_comps_to_vec32(b, bit_size, &comps[comp_idx], 5507ec681f3Smrg substore_num_bits / bit_size); 5517ec681f3Smrg nir_ssa_def *index = nir_ushr(b, local_offset, nir_imm_int(b, 2)); 5527ec681f3Smrg 5537ec681f3Smrg /* For anything less than 32bits we need to use the masked version of the 5547ec681f3Smrg * intrinsic to preserve data living in the same 32bit slot. 5557ec681f3Smrg */ 5567ec681f3Smrg if (num_bits < 32) { 5577ec681f3Smrg lower_masked_store_vec32(b, local_offset, index, vec32, num_bits, op); 5587ec681f3Smrg } else { 5597ec681f3Smrg lower_store_vec32(b, index, vec32, op); 5607ec681f3Smrg } 5617ec681f3Smrg 5627ec681f3Smrg comp_idx += substore_num_bits / bit_size; 5637ec681f3Smrg } 5647ec681f3Smrg 5657ec681f3Smrg nir_instr_remove(&intr->instr); 5667ec681f3Smrg 5677ec681f3Smrg return true; 5687ec681f3Smrg} 5697ec681f3Smrg 5707ec681f3Smrgstatic void 5717ec681f3Smrgubo_to_temp_patch_deref_mode(nir_deref_instr *deref) 5727ec681f3Smrg{ 5737ec681f3Smrg deref->modes = nir_var_shader_temp; 5747ec681f3Smrg nir_foreach_use(use_src, &deref->dest.ssa) { 5757ec681f3Smrg if (use_src->parent_instr->type != nir_instr_type_deref) 5767ec681f3Smrg continue; 5777ec681f3Smrg 5787ec681f3Smrg nir_deref_instr *parent = nir_instr_as_deref(use_src->parent_instr); 5797ec681f3Smrg ubo_to_temp_patch_deref_mode(parent); 5807ec681f3Smrg } 5817ec681f3Smrg} 5827ec681f3Smrg 5837ec681f3Smrgstatic void 
5847ec681f3Smrgubo_to_temp_update_entry(nir_deref_instr *deref, struct hash_entry *he) 5857ec681f3Smrg{ 5867ec681f3Smrg assert(nir_deref_mode_is(deref, nir_var_mem_constant)); 5877ec681f3Smrg assert(deref->dest.is_ssa); 5887ec681f3Smrg assert(he->data); 5897ec681f3Smrg 5907ec681f3Smrg nir_foreach_use(use_src, &deref->dest.ssa) { 5917ec681f3Smrg if (use_src->parent_instr->type == nir_instr_type_deref) { 5927ec681f3Smrg ubo_to_temp_update_entry(nir_instr_as_deref(use_src->parent_instr), he); 5937ec681f3Smrg } else if (use_src->parent_instr->type == nir_instr_type_intrinsic) { 5947ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(use_src->parent_instr); 5957ec681f3Smrg if (intr->intrinsic != nir_intrinsic_load_deref) 5967ec681f3Smrg he->data = NULL; 5977ec681f3Smrg } else { 5987ec681f3Smrg he->data = NULL; 5997ec681f3Smrg } 6007ec681f3Smrg 6017ec681f3Smrg if (!he->data) 6027ec681f3Smrg break; 6037ec681f3Smrg } 6047ec681f3Smrg} 6057ec681f3Smrg 6067ec681f3Smrgbool 6077ec681f3Smrgdxil_nir_lower_ubo_to_temp(nir_shader *nir) 6087ec681f3Smrg{ 6097ec681f3Smrg struct hash_table *ubo_to_temp = _mesa_pointer_hash_table_create(NULL); 6107ec681f3Smrg bool progress = false; 6117ec681f3Smrg 6127ec681f3Smrg /* First pass: collect all UBO accesses that could be turned into 6137ec681f3Smrg * shader temp accesses. 
 */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (!nir_deref_mode_is(deref, nir_var_mem_constant) ||
                deref->deref_type != nir_deref_type_var)
               continue;

            /* One entry per constant-memory variable.  The var itself is
             * used as the initial entry data; a NULL data pointer marks a
             * variable that must not be converted (checked just below).
             */
            struct hash_entry *he =
               _mesa_hash_table_search(ubo_to_temp, deref->var);

            if (!he)
               he = _mesa_hash_table_insert(ubo_to_temp, deref->var, deref->var);

            if (!he->data)
               continue;

            /* NOTE(review): ubo_to_temp_update_entry() is defined outside
             * this excerpt; it presumably validates the deref and may clear
             * he->data to veto the conversion — confirm in the full file.
             */
            ubo_to_temp_update_entry(deref, he);
         }
      }
   }

   /* Convert every surviving entry into a shader-temp variable. */
   hash_table_foreach(ubo_to_temp, he) {
      nir_variable *var = he->data;

      if (!var)
         continue;

      /* Change the variable mode. */
      var->data.mode = nir_var_shader_temp;

      /* Make sure the variable has a name.
       * DXIL variables must have names.
       */
      if (!var->name)
         var->name = ralloc_asprintf(nir, "global_%d", exec_list_length(&nir->variables));

      progress = true;
   }
   _mesa_hash_table_destroy(ubo_to_temp, NULL);

   /* Second pass: patch all derefs that were accessing the converted UBOs
    * variables.
    */
   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);
            if (nir_deref_mode_is(deref, nir_var_mem_constant) &&
                deref->deref_type == nir_deref_type_var &&
                deref->var->data.mode == nir_var_shader_temp)
               ubo_to_temp_patch_deref_mode(deref);
         }
      }
   }

   return progress;
}

/* Replace a load_ubo intrinsic with the DXIL-specific load sequence built
 * by build_load_ubo_dxil(), keeping the original component count and bit
 * size, then remove the original intrinsic.  Always reports progress.
 */
static bool
lower_load_ubo(nir_builder *b, nir_intrinsic_instr *intr)
{
   assert(intr->dest.is_ssa);
   assert(intr->src[0].is_ssa);
   assert(intr->src[1].is_ssa);

   b->cursor = nir_before_instr(&intr->instr);

   nir_ssa_def *result =
      build_load_ubo_dxil(b, intr->src[0].ssa, intr->src[1].ssa,
                          nir_dest_num_components(intr->dest),
                          nir_dest_bit_size(intr->dest));

   nir_ssa_def_rewrite_uses(&intr->dest.ssa, result);
   nir_instr_remove(&intr->instr);
   return true;
}

/* Lower load/store intrinsics (deref, shared, scratch, SSBO, UBO) in every
 * entrypoint to their DXIL-friendly forms.  Returns true if anything was
 * rewritten.
 */
bool
dxil_nir_lower_loads_stores_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {
            case nir_intrinsic_load_deref:
               progress |= lower_load_deref(&b, intr);
               break;
            case nir_intrinsic_load_shared:
            case nir_intrinsic_load_scratch:
               progress |= lower_32b_offset_load(&b, intr);
               break;
            case nir_intrinsic_load_ssbo:
               progress |= lower_load_ssbo(&b, intr);
               break;
            case nir_intrinsic_load_ubo:
               progress |= lower_load_ubo(&b, intr);
               break;
            case nir_intrinsic_store_shared:
            case nir_intrinsic_store_scratch:
               progress |= lower_32b_offset_store(&b, intr);
               break;
            case nir_intrinsic_store_ssbo:
               progress |= lower_store_ssbo(&b, intr);
               break;
            default:
               break;
            }
         }
      }
   }

   return progress;
}

/* Rewrite a shared-memory atomic into its *_dxil counterpart, which
 * addresses shared memory as an array of 32-bit words: the byte offset
 * (plus the intrinsic's base) is shifted right by 2 to form the word
 * index.
 */
static bool
lower_shared_atomic(nir_builder *b, nir_intrinsic_instr *intr,
                    nir_intrinsic_op dxil_op)
{
   b->cursor = nir_before_instr(&intr->instr);

   assert(intr->src[0].is_ssa);
   nir_ssa_def *offset =
      nir_iadd(b, intr->src[0].ssa, nir_imm_int(b, nir_intrinsic_base(intr)));
   nir_ssa_def *index = nir_ushr(b, offset, nir_imm_int(b, 2));

   nir_intrinsic_instr *atomic = nir_intrinsic_instr_create(b->shader, dxil_op);
   atomic->src[0] = nir_src_for_ssa(index);
   assert(intr->src[1].is_ssa);
   atomic->src[1] = nir_src_for_ssa(intr->src[1].ssa);
   /* comp_swap carries an extra operand: the value to swap in. */
   if (dxil_op == nir_intrinsic_shared_atomic_comp_swap_dxil) {
      assert(intr->src[2].is_ssa);
      atomic->src[2] = nir_src_for_ssa(intr->src[2].ssa);
   }
   atomic->num_components = 0;
   /* Every lowered shared atomic yields a single 32-bit result. */
   nir_ssa_dest_init(&atomic->instr, &atomic->dest, 1, 32, NULL);

   nir_builder_instr_insert(b, &atomic->instr);
   nir_ssa_def_rewrite_uses(&intr->dest.ssa, &atomic->dest.ssa);
   nir_instr_remove(&intr->instr);
   return true;
}

/* Map all nir_intrinsic_shared_atomic_* intrinsics in entrypoints to their
 * DXIL-specific *_dxil variants via lower_shared_atomic().  Returns true
 * if any atomic was rewritten.
 */
bool
dxil_nir_lower_atomics_to_dxil(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;
            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            switch (intr->intrinsic) {

#define ATOMIC(op) \
   case nir_intrinsic_shared_atomic_##op: \
      progress |= lower_shared_atomic(&b, intr, \
                                      nir_intrinsic_shared_atomic_##op##_dxil); \
      break

            ATOMIC(add);
            ATOMIC(imin);
            ATOMIC(umin);
            ATOMIC(imax);
            ATOMIC(umax);
            ATOMIC(and);
            ATOMIC(or);
            ATOMIC(xor);
            ATOMIC(exchange);
            ATOMIC(comp_swap);

#undef ATOMIC
            default:
               break;
            }
         }
      }
   }

   return progress;
}

/* Turn a deref_var of an SSBO variable into a deref_cast of an immediate
 * 64-bit pointer whose high 32 bits hold the variable's binding (the UAV
 * id).  deref_cast inputs are left untouched.
 */
static bool
lower_deref_ssbo(nir_builder *b, nir_deref_instr *deref)
{
   assert(nir_deref_mode_is(deref, nir_var_mem_ssbo));
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_cast);
   nir_variable *var = deref->var;

   b->cursor = nir_before_instr(&deref->instr);

   if (deref->deref_type == nir_deref_type_var) {
      /* We turn all deref_var into deref_cast and build a pointer value based on
       * the var binding which encodes the UAV id.
       */
      nir_ssa_def *ptr = nir_imm_int64(b, (uint64_t)var->data.binding << 32);
      nir_deref_instr *deref_cast =
         nir_build_deref_cast(b, ptr, nir_var_mem_ssbo, deref->type,
                              glsl_get_explicit_stride(var->type));
      nir_ssa_def_rewrite_uses(&deref->dest.ssa,
                               &deref_cast->dest.ssa);
      nir_instr_remove(&deref->instr);

      /* NOTE(review): dead store — 'deref' is never read again before the
       * return below.
       */
      deref = deref_cast;
      return true;
   }
   return false;
}

/* Run lower_deref_ssbo() on every SSBO var/cast deref of every entrypoint. */
bool
dxil_nir_lower_deref_ssbo(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_deref)
               continue;

            nir_deref_instr *deref = nir_instr_as_deref(instr);

            if (!nir_deref_mode_is(deref, nir_var_mem_ssbo) ||
                (deref->deref_type != nir_deref_type_var &&
                 deref->deref_type != nir_deref_type_cast))
               continue;

            progress |= lower_deref_ssbo(&b, deref);
         }
      }
   }

   return progress;
}

/* Replace ALU sources that are deref chains rooted at a deref_cast with
 * raw pointer arithmetic: root pointer + byte offset of the chain
 * (computed with OpenCL sizes/alignments via cl_type_size_align).
 */
static bool
lower_alu_deref_srcs(nir_builder *b, nir_alu_instr *alu)
{
   const nir_op_info *info = &nir_op_infos[alu->op];
   bool progress = false;

9017ec681f3Smrg b->cursor = nir_before_instr(&alu->instr); 9027ec681f3Smrg 9037ec681f3Smrg for (unsigned i = 0; i < info->num_inputs; i++) { 9047ec681f3Smrg nir_deref_instr *deref = nir_src_as_deref(alu->src[i].src); 9057ec681f3Smrg 9067ec681f3Smrg if (!deref) 9077ec681f3Smrg continue; 9087ec681f3Smrg 9097ec681f3Smrg nir_deref_path path; 9107ec681f3Smrg nir_deref_path_init(&path, deref, NULL); 9117ec681f3Smrg nir_deref_instr *root_deref = path.path[0]; 9127ec681f3Smrg nir_deref_path_finish(&path); 9137ec681f3Smrg 9147ec681f3Smrg if (root_deref->deref_type != nir_deref_type_cast) 9157ec681f3Smrg continue; 9167ec681f3Smrg 9177ec681f3Smrg nir_ssa_def *ptr = 9187ec681f3Smrg nir_iadd(b, root_deref->parent.ssa, 9197ec681f3Smrg nir_build_deref_offset(b, deref, cl_type_size_align)); 9207ec681f3Smrg nir_instr_rewrite_src(&alu->instr, &alu->src[i].src, nir_src_for_ssa(ptr)); 9217ec681f3Smrg progress = true; 9227ec681f3Smrg } 9237ec681f3Smrg 9247ec681f3Smrg return progress; 9257ec681f3Smrg} 9267ec681f3Smrg 9277ec681f3Smrgbool 9287ec681f3Smrgdxil_nir_opt_alu_deref_srcs(nir_shader *nir) 9297ec681f3Smrg{ 9307ec681f3Smrg bool progress = false; 9317ec681f3Smrg 9327ec681f3Smrg foreach_list_typed(nir_function, func, node, &nir->functions) { 9337ec681f3Smrg if (!func->is_entrypoint) 9347ec681f3Smrg continue; 9357ec681f3Smrg assert(func->impl); 9367ec681f3Smrg 9377ec681f3Smrg bool progress = false; 9387ec681f3Smrg nir_builder b; 9397ec681f3Smrg nir_builder_init(&b, func->impl); 9407ec681f3Smrg 9417ec681f3Smrg nir_foreach_block(block, func->impl) { 9427ec681f3Smrg nir_foreach_instr_safe(instr, block) { 9437ec681f3Smrg if (instr->type != nir_instr_type_alu) 9447ec681f3Smrg continue; 9457ec681f3Smrg 9467ec681f3Smrg nir_alu_instr *alu = nir_instr_as_alu(instr); 9477ec681f3Smrg progress |= lower_alu_deref_srcs(&b, alu); 9487ec681f3Smrg } 9497ec681f3Smrg } 9507ec681f3Smrg } 9517ec681f3Smrg 9527ec681f3Smrg return progress; 9537ec681f3Smrg} 9547ec681f3Smrg 9557ec681f3Smrgstatic nir_ssa_def * 
/* Load element 'index' through a ptr_as_array deref off 'parent' (which
 * must be a deref_cast).
 */
memcpy_load_deref_elem(nir_builder *b, nir_deref_instr *parent,
                       nir_ssa_def *index)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);

   return nir_load_deref(b, deref);
}

/* Store 'value' at element 'index' through a ptr_as_array deref off
 * 'parent' (which must be a deref_cast).
 */
static void
memcpy_store_deref_elem(nir_builder *b, nir_deref_instr *parent,
                        nir_ssa_def *index, nir_ssa_def *value)
{
   nir_deref_instr *deref;

   index = nir_i2i(b, index, nir_dest_bit_size(parent->dest));
   assert(parent->deref_type == nir_deref_type_cast);
   deref = nir_build_deref_ptr_as_array(b, parent, index);
   nir_store_deref(b, deref, value, 1);
}

/* Expand a memcpy_deref intrinsic into a byte-by-byte copy loop: both
 * derefs are cast to uint8_t, and a local "loop_index" counter drives a
 * NIR loop that loads and stores one byte per iteration.
 */
static bool
lower_memcpy_deref(nir_builder *b, nir_intrinsic_instr *intr)
{
   nir_deref_instr *dst_deref = nir_src_as_deref(intr->src[0]);
   nir_deref_instr *src_deref = nir_src_as_deref(intr->src[1]);
   assert(intr->src[2].is_ssa);
   nir_ssa_def *num_bytes = intr->src[2].ssa;

   assert(dst_deref && src_deref);

   b->cursor = nir_after_instr(&intr->instr);

   dst_deref = nir_build_deref_cast(b, &dst_deref->dest.ssa, dst_deref->modes,
                                    glsl_uint8_t_type(), 1);
   src_deref = nir_build_deref_cast(b, &src_deref->dest.ssa, src_deref->modes,
                                    glsl_uint8_t_type(), 1);

   /*
    * We want to avoid 64b instructions, so let's assume we'll always be
    * passed a value that fits in a 32b type and truncate the 64b value.
    */
   num_bytes = nir_u2u32(b, num_bytes);

   nir_variable *loop_index_var =
      nir_local_variable_create(b->impl, glsl_uint_type(), "loop_index");
   nir_deref_instr *loop_index_deref = nir_build_deref_var(b, loop_index_var);
   nir_store_deref(b, loop_index_deref, nir_imm_int(b, 0), 1);

   /* while (true) { if (i >= num_bytes) break; dst[i] = src[i]; i++; } */
   nir_loop *loop = nir_push_loop(b);
   nir_ssa_def *loop_index = nir_load_deref(b, loop_index_deref);
   nir_ssa_def *cmp = nir_ige(b, loop_index, num_bytes);
   nir_if *loop_check = nir_push_if(b, cmp);
   nir_jump(b, nir_jump_break);
   nir_pop_if(b, loop_check);
   nir_ssa_def *val = memcpy_load_deref_elem(b, src_deref, loop_index);
   memcpy_store_deref_elem(b, dst_deref, loop_index, val);
   nir_store_deref(b, loop_index_deref, nir_iadd_imm(b, loop_index, 1), 1);
   nir_pop_loop(b, loop);
   nir_instr_remove(&intr->instr);
   return true;
}

/* Expand every memcpy_deref intrinsic in entrypoints into a copy loop. */
bool
dxil_nir_lower_memcpy_deref(nir_shader *nir)
{
   bool progress = false;

   foreach_list_typed(nir_function, func, node, &nir->functions) {
      if (!func->is_entrypoint)
         continue;
      assert(func->impl);

      nir_builder b;
      nir_builder_init(&b, func->impl);

      nir_foreach_block(block, func->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

            if (intr->intrinsic == nir_intrinsic_memcpy_deref)
               progress |= lower_memcpy_deref(&b, intr);
         }
      }
   }

   return progress;
}

/* Replace a phi of the original bit size with one of 'new_bit_size': each
 * source is upcast (as uint) right after its producing instruction, the
 * widened phi replaces the original, and the result is downcast back after
 * the block's phis so existing uses keep their original size.
 */
static void
cast_phi(nir_builder *b, nir_phi_instr *phi, unsigned new_bit_size)
{
   nir_phi_instr *lowered = nir_phi_instr_create(b->shader);
   int num_components = 0;
   int old_bit_size = phi->dest.ssa.bit_size;

   nir_op upcast_op = nir_type_conversion_op(nir_type_uint | old_bit_size,
                                             nir_type_uint | new_bit_size,
                                             nir_rounding_mode_undef);
   nir_op downcast_op = nir_type_conversion_op(nir_type_uint | new_bit_size,
                                               nir_type_uint | old_bit_size,
                                               nir_rounding_mode_undef);

   nir_foreach_phi_src(src, phi) {
      assert(num_components == 0 || num_components == src->src.ssa->num_components);
      num_components = src->src.ssa->num_components;

      /* Casts must be emitted after the source's producer (and past any
       * phis, since instructions may not be inserted between phis).
       */
      b->cursor = nir_after_instr_and_phis(src->src.ssa->parent_instr);

      nir_ssa_def *cast = nir_build_alu(b, upcast_op, src->src.ssa, NULL, NULL, NULL);
      nir_phi_instr_add_src(lowered, src->pred, nir_src_for_ssa(cast));
   }

   nir_ssa_dest_init(&lowered->instr, &lowered->dest,
                     num_components, new_bit_size, NULL);

   b->cursor = nir_before_instr(&phi->instr);
   nir_builder_instr_insert(b, &lowered->instr);

   b->cursor = nir_after_phis(nir_cursor_current_block(b->cursor));
   nir_ssa_def *result = nir_build_alu(b, downcast_op, &lowered->dest.ssa, NULL, NULL, NULL);

   nir_ssa_def_rewrite_uses(&phi->dest.ssa, result);
   nir_instr_remove(&phi->instr);
}

/* Widen all phis narrower than min_bit_size (1-bit booleans excepted) in
 * one function.  Preserves block-index/dominance metadata on change.
 */
static bool
upcast_phi_impl(nir_function_impl *impl, unsigned min_bit_size)
{
   nir_builder b;
   nir_builder_init(&b, impl);
   bool progress = false;

   nir_foreach_block_reverse(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_phi)
            continue;

         nir_phi_instr *phi = nir_instr_as_phi(instr);
         assert(phi->dest.is_ssa);

         if (phi->dest.ssa.bit_size == 1 ||
             phi->dest.ssa.bit_size >= min_bit_size)
            continue;

         cast_phi(&b, phi, min_bit_size);
         progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_block_index |
                                  nir_metadata_dominance);
   } else {
      nir_metadata_preserve(impl, nir_metadata_all);
   }

   return progress;
}

/* Shader-level wrapper for upcast_phi_impl(). */
bool
dxil_nir_lower_upcast_phis(nir_shader *shader, unsigned min_bit_size)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= upcast_phi_impl(function->impl, min_bit_size);
   }

   return progress;
}

/* State shared by the clip/cull-distance split pass below. */
struct dxil_nir_split_clip_cull_distance_params {
   nir_variable *new_var;
   nir_shader *shader;
};

/* In GLSL and SPIR-V, clip and cull distance are arrays of floats (with a limit of 8).
 * In DXIL, clip and cull distances are up to 2 float4s combined.
 * Coming from GLSL, we can request this 2 float4 format, but coming from SPIR-V,
 * we can't, and have to accept a "compact" array of scalar floats.
 *
 * To help emitting a valid input signature for this case, split the variables so that they
 * match what we need to put in the signature (e.g. { float clip[4]; float clip1; float cull[3]; })
 */
static bool
dxil_nir_split_clip_cull_distance_instr(nir_builder *b,
                                        nir_instr *instr,
                                        void *cb_data)
{
   struct dxil_nir_split_clip_cull_distance_params *params = cb_data;
   nir_variable *new_var = params->new_var;

   if (instr->type != nir_instr_type_deref)
      return false;

   nir_deref_instr *deref = nir_instr_as_deref(instr);
   nir_variable *var = nir_deref_instr_get_variable(deref);
   if (!var ||
       var->data.location < VARYING_SLOT_CLIP_DIST0 ||
       var->data.location > VARYING_SLOT_CULL_DIST1 ||
       !var->data.compact)
      return false;

   /* The location should only be inside clip distance, because clip
    * and cull should've been merged by nir_lower_clip_cull_distance_arrays()
    */
   assert(var->data.location == VARYING_SLOT_CLIP_DIST0 ||
          var->data.location == VARYING_SLOT_CLIP_DIST1);

   /* The deref chain to the clip/cull variables should be simple, just the
    * var and an array with a constant index, otherwise more lowering/optimization
    * might be needed before this pass, e.g. copy prop, lower_io_to_temporaries,
    * split_var_copies, and/or lower_var_copies
    */
   assert(deref->deref_type == nir_deref_type_var ||
          deref->deref_type == nir_deref_type_array);

   b->cursor = nir_before_instr(instr);
   if (!new_var) {
      /* Update lengths for new and old vars */
      int old_length = glsl_array_size(var->type);
      int new_length = (old_length + var->data.location_frac) - 4;
      old_length -= new_length;

      /* The existing variable fits in the float4 */
      if (new_length <= 0)
         return false;

      /* Split: the old var keeps the part that fits in the first float4,
       * the clone takes the overflow at the next location, component 0.
       */
      new_var = nir_variable_clone(var, params->shader);
      nir_shader_add_variable(params->shader, new_var);
      assert(glsl_get_base_type(glsl_get_array_element(var->type)) == GLSL_TYPE_FLOAT);
      var->type = glsl_array_type(glsl_float_type(), old_length, 0);
      new_var->type = glsl_array_type(glsl_float_type(), new_length, 0);
      new_var->data.location++;
      new_var->data.location_frac = 0;
      params->new_var = new_var;
   }

   /* Update the type for derefs of the old var */
   if (deref->deref_type == nir_deref_type_var) {
      deref->type = var->type;
      return false;
   }

   nir_const_value *index = nir_src_as_const_value(deref->arr.index);
   assert(index);

   /* Treat this array as a vector starting at the component index in location_frac,
    * so if location_frac is 1 and index is 0, then it's accessing the 'y' component
    * of the vector. If index + location_frac is >= 4, there's no component there,
    * so we need to add a new variable and adjust the index.
    */
   unsigned total_index = index->u32 + var->data.location_frac;
   if (total_index < 4)
      return false;

   /* Redirect the access to the equivalent element of the new variable. */
   nir_deref_instr *new_var_deref = nir_build_deref_var(b, new_var);
   nir_deref_instr *new_array_deref = nir_build_deref_array(b, new_var_deref, nir_imm_int(b, total_index % 4));
   nir_ssa_def_rewrite_uses(&deref->dest.ssa, &new_array_deref->dest.ssa);
   return true;
}

/* Run the split pass; returns true if a second variable was created. */
bool
dxil_nir_split_clip_cull_distance(nir_shader *shader)
{
   struct dxil_nir_split_clip_cull_distance_params params = {
      .new_var = NULL,
      .shader = shader,
   };
   nir_shader_instructions_pass(shader,
                                dxil_nir_split_clip_cull_distance_instr,
                                nir_metadata_block_index |
                                nir_metadata_dominance |
                                nir_metadata_loop_analysis,
                                &params);
   return params.new_var != NULL;
}

/* For float ALU ops touching 64-bit values, insert the DXIL-specific
 * pack_double_2x32_dxil / unpack_double_2x32_dxil conversions around
 * sources and destination so the backend sees proper doubles rather than
 * raw 64-bit integers.
 */
static bool
dxil_nir_lower_double_math_instr(nir_builder *b,
                                 nir_instr *instr,
                                 UNUSED void *cb_data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *alu = nir_instr_as_alu(instr);

   /* TODO: See if we can apply this explicitly to packs/unpacks that are then
    * used as a double. As-is, if we had an app explicitly do a 64bit integer op,
    * then try to bitcast to double (not expressible in HLSL, but it is in other
    * source languages), this would unpack the integer and repack as a double, when
    * we probably want to just send the bitcast through to the backend.
    */

   b->cursor = nir_before_instr(&alu->instr);

   bool progress = false;
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; ++i) {
      if (nir_alu_type_get_base_type(nir_op_infos[alu->op].input_types[i]) == nir_type_float &&
          alu->src[i].src.ssa->bit_size == 64) {
         /* The source is scalarized via nir_channel, so the old swizzle
          * must be reset after rewriting.
          */
         nir_ssa_def *packed_double = nir_channel(b, alu->src[i].src.ssa, alu->src[i].swizzle[0]);
         nir_ssa_def *unpacked_double = nir_unpack_64_2x32(b, packed_double);
         nir_ssa_def *repacked_double = nir_pack_double_2x32_dxil(b, unpacked_double);
         nir_instr_rewrite_src_ssa(instr, &alu->src[i].src, repacked_double);
         memset(alu->src[i].swizzle, 0, ARRAY_SIZE(alu->src[i].swizzle));
         progress = true;
      }
   }

   if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_float &&
       alu->dest.dest.ssa.bit_size == 64) {
      b->cursor = nir_after_instr(&alu->instr);
      nir_ssa_def *packed_double = &alu->dest.dest.ssa;
      nir_ssa_def *unpacked_double = nir_unpack_double_2x32_dxil(b, packed_double);
      nir_ssa_def *repacked_double = nir_pack_64_2x32(b, unpacked_double);
      nir_ssa_def_rewrite_uses_after(packed_double, repacked_double, unpacked_double->parent_instr);
      progress = true;
   }

   return progress;
}

bool
dxil_nir_lower_double_math(nir_shader *shader)
{
   return nir_shader_instructions_pass(shader,
                                       dxil_nir_lower_double_math_instr,
                                       nir_metadata_block_index |
                                       nir_metadata_dominance |
                                       nir_metadata_loop_analysis,
                                       NULL);
}

/* Set of system values to be replaced by constant zero; used by the
 * filter/lower callbacks below.
 */
typedef struct {
   gl_system_value *values;
   uint32_t count;
} zero_system_values_state;

/* Match loads of any system value listed in the state, whether expressed
 * as a dedicated intrinsic or as a load_deref of a system-value variable.
 */
static bool
lower_system_value_to_zero_filter(const nir_instr* instr, const void* cb_state)
{
   if (instr->type != nir_instr_type_intrinsic) {
      return false;
   }

   nir_intrinsic_instr* intrin = nir_instr_as_intrinsic(instr);

   /* All the intrinsics we care about are loads */
   if (!nir_intrinsic_infos[intrin->intrinsic].has_dest)
      return false;

   assert(intrin->dest.is_ssa);

   zero_system_values_state* state = (zero_system_values_state*)cb_state;
   for (uint32_t i = 0; i < state->count; ++i) {
      gl_system_value value = state->values[i];
      nir_intrinsic_op value_op = nir_intrinsic_from_system_value(value);

      if (intrin->intrinsic == value_op) {
         return true;
      } else if (intrin->intrinsic == nir_intrinsic_load_deref) {
         nir_deref_instr* deref = nir_src_as_deref(intrin->src[0]);
         if (!nir_deref_mode_is(deref, nir_var_system_value))
            return false;

         nir_variable* var = deref->var;
         if (var->data.location == value) {
            return true;
         }
      }
   }

   return false;
}

/* Replacement for a matched system-value load: constant zero. */
static nir_ssa_def*
lower_system_value_to_zero_instr(nir_builder* b, nir_instr* instr, void* _state)
{
   return nir_imm_int(b, 0);
}

/* Replace loads of the given system values with the constant 0. */
bool
dxil_nir_lower_system_values_to_zero(nir_shader* shader,
                                     gl_system_value* system_values,
                                     uint32_t count)
{
   zero_system_values_state state = { system_values, count };
   return nir_shader_lower_instructions(shader,
                                        lower_system_value_to_zero_filter,
                                        lower_system_value_to_zero_instr,
                                        &state);
}

/* Map a (possibly arrayed) sampler type to the equivalent bare sampler
 * type, preserving shadow-ness and array dimensions.
 */
static const struct glsl_type *
get_bare_samplers_for_type(const struct glsl_type *type)
{
   if (glsl_type_is_sampler(type)) {
      if (glsl_sampler_type_is_shadow(type))
         return glsl_bare_shadow_sampler_type();
      else
         return glsl_bare_sampler_type();
   } else if (glsl_type_is_array(type)) {
      return glsl_array_type(
         get_bare_samplers_for_type(glsl_get_array_element(type)),
         glsl_get_length(type),
         0 /*explicit size*/);
   }
   assert(!"Unexpected type");
   return NULL;
}

/* For each texture instruction that needs a sampler, ensure a "bare"
 * sampler variable (GLSL_TYPE_VOID result type) exists for the binding,
 * cloning the typed sampler when necessary.  'data' is a hash_table_u64
 * mapping sampler index/binding -> bare variable.  When the sampler is
 * referenced through derefs, the deref chain is rebuilt to point at the
 * bare variable as well.
 */
static bool
redirect_sampler_derefs(struct nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_tex)
      return false;

   nir_tex_instr *tex = nir_instr_as_tex(instr);
   if (!nir_tex_instr_need_sampler(tex))
      return false;

   int sampler_idx = nir_tex_instr_src_index(tex, nir_tex_src_sampler_deref);
   if (sampler_idx == -1) {
      /* No derefs, must be using indices */
      nir_variable *bare_sampler = _mesa_hash_table_u64_search(data, tex->sampler_index);

      /* Already have a bare sampler here */
      if (bare_sampler)
         return false;

      nir_variable *typed_sampler = NULL;
      nir_foreach_variable_with_modes(var, b->shader, nir_var_uniform) {
         if (var->data.binding <= tex->sampler_index &&
             var->data.binding + glsl_type_get_sampler_count(var->type) > tex->sampler_index) {
            /* Already have a bare sampler for this binding, add it to the table */
            if (glsl_get_sampler_result_type(glsl_without_array(var->type)) == GLSL_TYPE_VOID) {
               _mesa_hash_table_u64_insert(data, tex->sampler_index, var);
               return false;
            }

            typed_sampler = var;
         }
      }

      /* Clone the typed sampler to a bare sampler and we're done */
      assert(typed_sampler);
      bare_sampler = nir_variable_clone(typed_sampler, b->shader);
      bare_sampler->type = get_bare_samplers_for_type(typed_sampler->type);
      nir_shader_add_variable(b->shader, bare_sampler);
      _mesa_hash_table_u64_insert(data, tex->sampler_index, bare_sampler);
      return true;
   }

   /* Using derefs, means we have to rewrite the deref chain in addition to cloning */
   nir_deref_instr *final_deref = nir_src_as_deref(tex->src[sampler_idx].src);
   nir_deref_path path;
   nir_deref_path_init(&path, final_deref, NULL);

   nir_deref_instr *old_tail = path.path[0];
   assert(old_tail->deref_type == nir_deref_type_var);
   nir_variable *old_var = old_tail->var;
   /* Root variable is already bare: nothing to do. */
   if (glsl_get_sampler_result_type(glsl_without_array(old_var->type)) == GLSL_TYPE_VOID) {
      nir_deref_path_finish(&path);
      return false;
   }

   nir_variable *new_var = _mesa_hash_table_u64_search(data, old_var->data.binding);
   if (!new_var) {
      new_var = nir_variable_clone(old_var, b->shader);
      new_var->type = get_bare_samplers_for_type(old_var->type);
      nir_shader_add_variable(b->shader, new_var);
      _mesa_hash_table_u64_insert(data, old_var->data.binding, new_var);
   }

   /* Mirror the original deref chain onto the bare variable. */
   b->cursor = nir_after_instr(&old_tail->instr);
   nir_deref_instr *new_tail = nir_build_deref_var(b, new_var);

   for (unsigned i = 1; path.path[i]; ++i) {
      b->cursor = nir_after_instr(&path.path[i]->instr);
      new_tail = nir_build_deref_follower(b, new_tail, path.path[i]);
   }

   nir_deref_path_finish(&path);
   nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[sampler_idx].src, &new_tail->dest.ssa);

   return true;
}

/* Ensure every sampler referenced by texture ops has a bare counterpart. */
bool
dxil_nir_create_bare_samplers(nir_shader *nir)
{
   struct hash_table_u64 *sampler_to_bare = _mesa_hash_table_u64_create(NULL);

   bool progress = nir_shader_instructions_pass(nir, redirect_sampler_derefs,
                                                nir_metadata_block_index | nir_metadata_dominance | nir_metadata_loop_analysis, sampler_to_bare);

_mesa_hash_table_u64_destroy(sampler_to_bare); 14627ec681f3Smrg return progress; 14637ec681f3Smrg} 14647ec681f3Smrg 14657ec681f3Smrg 14667ec681f3Smrgstatic bool 14677ec681f3Smrglower_bool_input_filter(const nir_instr *instr, 14687ec681f3Smrg UNUSED const void *_options) 14697ec681f3Smrg{ 14707ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 14717ec681f3Smrg return false; 14727ec681f3Smrg 14737ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 14747ec681f3Smrg if (intr->intrinsic == nir_intrinsic_load_front_face) 14757ec681f3Smrg return true; 14767ec681f3Smrg 14777ec681f3Smrg if (intr->intrinsic == nir_intrinsic_load_deref) { 14787ec681f3Smrg nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); 14797ec681f3Smrg nir_variable *var = nir_deref_instr_get_variable(deref); 14807ec681f3Smrg return var->data.mode == nir_var_shader_in && 14817ec681f3Smrg glsl_get_base_type(var->type) == GLSL_TYPE_BOOL; 14827ec681f3Smrg } 14837ec681f3Smrg 14847ec681f3Smrg return false; 14857ec681f3Smrg} 14867ec681f3Smrg 14877ec681f3Smrgstatic nir_ssa_def * 14887ec681f3Smrglower_bool_input_impl(nir_builder *b, nir_instr *instr, 14897ec681f3Smrg UNUSED void *_options) 14907ec681f3Smrg{ 14917ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 14927ec681f3Smrg 14937ec681f3Smrg if (intr->intrinsic == nir_intrinsic_load_deref) { 14947ec681f3Smrg nir_deref_instr *deref = nir_instr_as_deref(intr->src[0].ssa->parent_instr); 14957ec681f3Smrg nir_variable *var = nir_deref_instr_get_variable(deref); 14967ec681f3Smrg 14977ec681f3Smrg /* rewrite var->type */ 14987ec681f3Smrg var->type = glsl_vector_type(GLSL_TYPE_UINT, 14997ec681f3Smrg glsl_get_vector_elements(var->type)); 15007ec681f3Smrg deref->type = var->type; 15017ec681f3Smrg } 15027ec681f3Smrg 15037ec681f3Smrg intr->dest.ssa.bit_size = 32; 15047ec681f3Smrg return nir_i2b1(b, &intr->dest.ssa); 15057ec681f3Smrg} 15067ec681f3Smrg 15077ec681f3Smrgbool 
15087ec681f3Smrgdxil_nir_lower_bool_input(struct nir_shader *s) 15097ec681f3Smrg{ 15107ec681f3Smrg return nir_shader_lower_instructions(s, lower_bool_input_filter, 15117ec681f3Smrg lower_bool_input_impl, NULL); 15127ec681f3Smrg} 15137ec681f3Smrg 15147ec681f3Smrg/* Comparison function to sort io values so that first come normal varyings, 15157ec681f3Smrg * then system values, and then system generated values. 15167ec681f3Smrg */ 15177ec681f3Smrgstatic int 15187ec681f3Smrgvariable_location_cmp(const nir_variable* a, const nir_variable* b) 15197ec681f3Smrg{ 15207ec681f3Smrg // Sort by driver_location, location, then index 15217ec681f3Smrg return a->data.driver_location != b->data.driver_location ? 15227ec681f3Smrg a->data.driver_location - b->data.driver_location : 15237ec681f3Smrg a->data.location != b->data.location ? 15247ec681f3Smrg a->data.location - b->data.location : 15257ec681f3Smrg a->data.index - b->data.index; 15267ec681f3Smrg} 15277ec681f3Smrg 15287ec681f3Smrg/* Order varyings according to driver location */ 15297ec681f3Smrguint64_t 15307ec681f3Smrgdxil_sort_by_driver_location(nir_shader* s, nir_variable_mode modes) 15317ec681f3Smrg{ 15327ec681f3Smrg nir_sort_variables_with_modes(s, variable_location_cmp, modes); 15337ec681f3Smrg 15347ec681f3Smrg uint64_t result = 0; 15357ec681f3Smrg nir_foreach_variable_with_modes(var, s, modes) { 15367ec681f3Smrg result |= 1ull << var->data.location; 15377ec681f3Smrg } 15387ec681f3Smrg return result; 15397ec681f3Smrg} 15407ec681f3Smrg 15417ec681f3Smrg/* Sort PS outputs so that color outputs come first */ 15427ec681f3Smrgvoid 15437ec681f3Smrgdxil_sort_ps_outputs(nir_shader* s) 15447ec681f3Smrg{ 15457ec681f3Smrg nir_foreach_variable_with_modes_safe(var, s, nir_var_shader_out) { 15467ec681f3Smrg /* We use the driver_location here to avoid introducing a new 15477ec681f3Smrg * struct or member variable here. 
The true, updated driver location 15487ec681f3Smrg * will be written below, after sorting */ 15497ec681f3Smrg switch (var->data.location) { 15507ec681f3Smrg case FRAG_RESULT_DEPTH: 15517ec681f3Smrg var->data.driver_location = 1; 15527ec681f3Smrg break; 15537ec681f3Smrg case FRAG_RESULT_STENCIL: 15547ec681f3Smrg var->data.driver_location = 2; 15557ec681f3Smrg break; 15567ec681f3Smrg case FRAG_RESULT_SAMPLE_MASK: 15577ec681f3Smrg var->data.driver_location = 3; 15587ec681f3Smrg break; 15597ec681f3Smrg default: 15607ec681f3Smrg var->data.driver_location = 0; 15617ec681f3Smrg } 15627ec681f3Smrg } 15637ec681f3Smrg 15647ec681f3Smrg nir_sort_variables_with_modes(s, variable_location_cmp, 15657ec681f3Smrg nir_var_shader_out); 15667ec681f3Smrg 15677ec681f3Smrg unsigned driver_loc = 0; 15687ec681f3Smrg nir_foreach_variable_with_modes(var, s, nir_var_shader_out) { 15697ec681f3Smrg var->data.driver_location = driver_loc++; 15707ec681f3Smrg } 15717ec681f3Smrg} 15727ec681f3Smrg 15737ec681f3Smrg/* Order between stage values so that normal varyings come first, 15747ec681f3Smrg * then sysvalues and then system generated values. 15757ec681f3Smrg */ 15767ec681f3Smrguint64_t 15777ec681f3Smrgdxil_reassign_driver_locations(nir_shader* s, nir_variable_mode modes, 15787ec681f3Smrg uint64_t other_stage_mask) 15797ec681f3Smrg{ 15807ec681f3Smrg nir_foreach_variable_with_modes_safe(var, s, modes) { 15817ec681f3Smrg /* We use the driver_location here to avoid introducing a new 15827ec681f3Smrg * struct or member variable here. 
The true, updated driver location 15837ec681f3Smrg * will be written below, after sorting */ 15847ec681f3Smrg var->data.driver_location = nir_var_to_dxil_sysvalue_type(var, other_stage_mask); 15857ec681f3Smrg } 15867ec681f3Smrg 15877ec681f3Smrg nir_sort_variables_with_modes(s, variable_location_cmp, modes); 15887ec681f3Smrg 15897ec681f3Smrg uint64_t result = 0; 15907ec681f3Smrg unsigned driver_loc = 0; 15917ec681f3Smrg nir_foreach_variable_with_modes(var, s, modes) { 15927ec681f3Smrg result |= 1ull << var->data.location; 15937ec681f3Smrg var->data.driver_location = driver_loc++; 15947ec681f3Smrg } 15957ec681f3Smrg return result; 15967ec681f3Smrg} 1597