17ec681f3Smrg/* 27ec681f3Smrg * Copyright (C) 2020 Collabora Ltd. 37ec681f3Smrg * 47ec681f3Smrg * Permission is hereby granted, free of charge, to any person obtaining a 57ec681f3Smrg * copy of this software and associated documentation files (the "Software"), 67ec681f3Smrg * to deal in the Software without restriction, including without limitation 77ec681f3Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 87ec681f3Smrg * and/or sell copies of the Software, and to permit persons to whom the 97ec681f3Smrg * Software is furnished to do so, subject to the following conditions: 107ec681f3Smrg * 117ec681f3Smrg * The above copyright notice and this permission notice (including the next 127ec681f3Smrg * paragraph) shall be included in all copies or substantial portions of the 137ec681f3Smrg * Software. 147ec681f3Smrg * 157ec681f3Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 167ec681f3Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 177ec681f3Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 187ec681f3Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 197ec681f3Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 207ec681f3Smrg * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 217ec681f3Smrg * SOFTWARE. 227ec681f3Smrg * 237ec681f3Smrg * Authors (Collabora): 247ec681f3Smrg * Alyssa Rosenzweig <alyssa.rosenzweig@collabora.com> 257ec681f3Smrg */ 267ec681f3Smrg 277ec681f3Smrg#include "main/mtypes.h" 287ec681f3Smrg#include "compiler/glsl/glsl_to_nir.h" 297ec681f3Smrg#include "compiler/nir_types.h" 307ec681f3Smrg#include "compiler/nir/nir_builder.h" 317ec681f3Smrg#include "util/u_debug.h" 327ec681f3Smrg 337ec681f3Smrg#include "disassemble.h" 347ec681f3Smrg#include "bifrost_compile.h" 357ec681f3Smrg#include "compiler.h" 367ec681f3Smrg#include "bi_quirks.h" 377ec681f3Smrg#include "bi_builder.h" 387ec681f3Smrg#include "bifrost_nir.h" 397ec681f3Smrg 407ec681f3Smrgstatic const struct debug_named_value bifrost_debug_options[] = { 417ec681f3Smrg {"msgs", BIFROST_DBG_MSGS, "Print debug messages"}, 427ec681f3Smrg {"shaders", BIFROST_DBG_SHADERS, "Dump shaders in NIR and MIR"}, 437ec681f3Smrg {"shaderdb", BIFROST_DBG_SHADERDB, "Print statistics"}, 447ec681f3Smrg {"verbose", BIFROST_DBG_VERBOSE, "Disassemble verbosely"}, 457ec681f3Smrg {"internal", BIFROST_DBG_INTERNAL, "Dump even internal shaders"}, 467ec681f3Smrg {"nosched", BIFROST_DBG_NOSCHED, "Force trivial bundling"}, 477ec681f3Smrg {"inorder", BIFROST_DBG_INORDER, "Force in-order bundling"}, 487ec681f3Smrg {"novalidate",BIFROST_DBG_NOVALIDATE, "Skip IR validation"}, 497ec681f3Smrg {"noopt", BIFROST_DBG_NOOPT, "Skip optimization passes"}, 507ec681f3Smrg DEBUG_NAMED_VALUE_END 517ec681f3Smrg}; 527ec681f3Smrg 537ec681f3SmrgDEBUG_GET_ONCE_FLAGS_OPTION(bifrost_debug, "BIFROST_MESA_DEBUG", bifrost_debug_options, 0) 547ec681f3Smrg 557ec681f3Smrg/* How many bytes are prefetched by the Bifrost shader core. From the final 567ec681f3Smrg * clause of the shader, this range must be valid instructions or zero. */ 577ec681f3Smrg#define BIFROST_SHADER_PREFETCH 128 587ec681f3Smrg 597ec681f3Smrgint bifrost_debug = 0; 607ec681f3Smrg 617ec681f3Smrg#define DBG(fmt, ...) \ 627ec681f3Smrg do { if (bifrost_debug & BIFROST_DBG_MSGS) \ 637ec681f3Smrg fprintf(stderr, "%s:%d: "fmt, \ 647ec681f3Smrg __FUNCTION__, __LINE__, ##__VA_ARGS__); } while (0) 657ec681f3Smrg 667ec681f3Smrgstatic bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); 677ec681f3Smrg 687ec681f3Smrgstatic void 697ec681f3Smrgbi_block_add_successor(bi_block *block, bi_block *successor) 707ec681f3Smrg{ 717ec681f3Smrg assert(block != NULL && successor != NULL); 727ec681f3Smrg 737ec681f3Smrg /* Cull impossible edges */ 747ec681f3Smrg if (block->unconditional_jumps) 757ec681f3Smrg return; 767ec681f3Smrg 777ec681f3Smrg for (unsigned i = 0; i < ARRAY_SIZE(block->successors); ++i) { 787ec681f3Smrg if (block->successors[i]) { 797ec681f3Smrg if (block->successors[i] == successor) 807ec681f3Smrg return; 817ec681f3Smrg else 827ec681f3Smrg continue; 837ec681f3Smrg } 847ec681f3Smrg 857ec681f3Smrg block->successors[i] = successor; 867ec681f3Smrg _mesa_set_add(successor->predecessors, block); 877ec681f3Smrg return; 887ec681f3Smrg } 897ec681f3Smrg 907ec681f3Smrg unreachable("Too many successors"); 917ec681f3Smrg} 927ec681f3Smrg 937ec681f3Smrgstatic void 947ec681f3Smrgbi_emit_jump(bi_builder *b, nir_jump_instr *instr) 957ec681f3Smrg{ 967ec681f3Smrg bi_instr *branch = bi_jump(b, bi_zero()); 977ec681f3Smrg 987ec681f3Smrg switch (instr->type) { 997ec681f3Smrg case nir_jump_break: 1007ec681f3Smrg branch->branch_target = b->shader->break_block; 1017ec681f3Smrg break; 1027ec681f3Smrg case nir_jump_continue: 1037ec681f3Smrg branch->branch_target = b->shader->continue_block; 1047ec681f3Smrg break; 1057ec681f3Smrg default: 1067ec681f3Smrg unreachable("Unhandled jump type"); 1077ec681f3Smrg } 1087ec681f3Smrg 1097ec681f3Smrg bi_block_add_successor(b->shader->current_block, branch->branch_target); 1107ec681f3Smrg b->shader->current_block->unconditional_jumps = true; 1117ec681f3Smrg} 1127ec681f3Smrg 1137ec681f3Smrgstatic bi_index 1147ec681f3Smrgbi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) 1157ec681f3Smrg{ 1167ec681f3Smrg switch (intr->intrinsic) { 1177ec681f3Smrg case nir_intrinsic_load_barycentric_centroid: 1187ec681f3Smrg case nir_intrinsic_load_barycentric_sample: 1197ec681f3Smrg return bi_register(61); 1207ec681f3Smrg 1217ec681f3Smrg /* Need to put the sample ID in the top 16-bits */ 1227ec681f3Smrg case nir_intrinsic_load_barycentric_at_sample: 1237ec681f3Smrg return bi_mkvec_v2i16(b, bi_half(bi_dontcare(), false), 1247ec681f3Smrg bi_half(bi_src_index(&intr->src[0]), false)); 1257ec681f3Smrg 1267ec681f3Smrg /* Interpret as 8:8 signed fixed point positions in pixels along X and 1277ec681f3Smrg * Y axes respectively, relative to top-left of pixel. In NIR, (0, 0) 1287ec681f3Smrg * is the center of the pixel so we first fixup and then convert. For 1297ec681f3Smrg * fp16 input: 1307ec681f3Smrg * 1317ec681f3Smrg * f2i16(((x, y) + (0.5, 0.5)) * 2**8) = 1327ec681f3Smrg * f2i16((256 * (x, y)) + (128, 128)) = 1337ec681f3Smrg * V2F16_TO_V2S16(FMA.v2f16((x, y), #256, #128)) 1347ec681f3Smrg * 1357ec681f3Smrg * For fp32 input, that lacks enough precision for MSAA 16x, but the 1367ec681f3Smrg * idea is the same. FIXME: still doesn't pass 1377ec681f3Smrg */ 1387ec681f3Smrg case nir_intrinsic_load_barycentric_at_offset: { 1397ec681f3Smrg bi_index offset = bi_src_index(&intr->src[0]); 1407ec681f3Smrg bi_index f16 = bi_null(); 1417ec681f3Smrg unsigned sz = nir_src_bit_size(intr->src[0]); 1427ec681f3Smrg 1437ec681f3Smrg if (sz == 16) { 1447ec681f3Smrg f16 = bi_fma_v2f16(b, offset, bi_imm_f16(256.0), 1457ec681f3Smrg bi_imm_f16(128.0), BI_ROUND_NONE); 1467ec681f3Smrg } else { 1477ec681f3Smrg assert(sz == 32); 1487ec681f3Smrg bi_index f[2]; 1497ec681f3Smrg for (unsigned i = 0; i < 2; ++i) { 1507ec681f3Smrg f[i] = bi_fadd_rscale_f32(b, 1517ec681f3Smrg bi_word(offset, i), 1527ec681f3Smrg bi_imm_f32(0.5), bi_imm_u32(8), 1537ec681f3Smrg BI_ROUND_NONE, BI_SPECIAL_NONE); 1547ec681f3Smrg } 1557ec681f3Smrg 1567ec681f3Smrg f16 = bi_v2f32_to_v2f16(b, f[0], f[1], BI_ROUND_NONE); 1577ec681f3Smrg } 1587ec681f3Smrg 1597ec681f3Smrg return bi_v2f16_to_v2s16(b, f16, BI_ROUND_RTZ); 1607ec681f3Smrg } 1617ec681f3Smrg 1627ec681f3Smrg case nir_intrinsic_load_barycentric_pixel: 1637ec681f3Smrg default: 1647ec681f3Smrg return bi_dontcare(); 1657ec681f3Smrg } 1667ec681f3Smrg} 1677ec681f3Smrg 1687ec681f3Smrgstatic enum bi_sample 1697ec681f3Smrgbi_interp_for_intrinsic(nir_intrinsic_op op) 1707ec681f3Smrg{ 1717ec681f3Smrg switch (op) { 1727ec681f3Smrg case nir_intrinsic_load_barycentric_centroid: 1737ec681f3Smrg return BI_SAMPLE_CENTROID; 1747ec681f3Smrg case nir_intrinsic_load_barycentric_sample: 1757ec681f3Smrg case nir_intrinsic_load_barycentric_at_sample: 1767ec681f3Smrg return BI_SAMPLE_SAMPLE; 1777ec681f3Smrg case nir_intrinsic_load_barycentric_at_offset: 1787ec681f3Smrg return BI_SAMPLE_EXPLICIT; 1797ec681f3Smrg case nir_intrinsic_load_barycentric_pixel: 1807ec681f3Smrg default: 1817ec681f3Smrg return BI_SAMPLE_CENTER; 1827ec681f3Smrg } 1837ec681f3Smrg} 1847ec681f3Smrg 1857ec681f3Smrg/* auto, 64-bit omitted */ 1867ec681f3Smrgstatic enum bi_register_format 1877ec681f3Smrgbi_reg_fmt_for_nir(nir_alu_type T) 1887ec681f3Smrg{ 1897ec681f3Smrg switch (T) { 1907ec681f3Smrg case nir_type_float16: return BI_REGISTER_FORMAT_F16; 1917ec681f3Smrg case nir_type_float32: return BI_REGISTER_FORMAT_F32; 1927ec681f3Smrg case nir_type_int16: return BI_REGISTER_FORMAT_S16; 1937ec681f3Smrg case nir_type_uint16: return BI_REGISTER_FORMAT_U16; 1947ec681f3Smrg case nir_type_int32: return BI_REGISTER_FORMAT_S32; 1957ec681f3Smrg case nir_type_uint32: return BI_REGISTER_FORMAT_U32; 1967ec681f3Smrg default: unreachable("Invalid type for register format"); 1977ec681f3Smrg } 1987ec681f3Smrg} 1997ec681f3Smrg 2007ec681f3Smrg/* Checks if the _IMM variant of an intrinsic can be used, returning in imm the 2017ec681f3Smrg * immediate to be used (which applies even if _IMM can't be used) */ 2027ec681f3Smrg 2037ec681f3Smrgstatic bool 2047ec681f3Smrgbi_is_intr_immediate(nir_intrinsic_instr *instr, unsigned *immediate, unsigned max) 2057ec681f3Smrg{ 2067ec681f3Smrg nir_src *offset = nir_get_io_offset_src(instr); 2077ec681f3Smrg 2087ec681f3Smrg if (!nir_src_is_const(*offset)) 2097ec681f3Smrg return false; 2107ec681f3Smrg 2117ec681f3Smrg *immediate = nir_intrinsic_base(instr) + nir_src_as_uint(*offset); 2127ec681f3Smrg return (*immediate) < max; 2137ec681f3Smrg} 2147ec681f3Smrg 2157ec681f3Smrgstatic void 2167ec681f3Smrgbi_make_vec_to(bi_builder *b, bi_index final_dst, 2177ec681f3Smrg bi_index *src, 2187ec681f3Smrg unsigned *channel, 2197ec681f3Smrg unsigned count, 2207ec681f3Smrg unsigned bitsize); 2217ec681f3Smrg 2227ec681f3Smrg/* Bifrost's load instructions lack a component offset despite operating in 2237ec681f3Smrg * terms of vec4 slots. Usually I/O vectorization avoids nonzero components, 2247ec681f3Smrg * but they may be unavoidable with separate shaders in use. To solve this, we 2257ec681f3Smrg * lower to a larger load and an explicit copy of the desired components. */ 2267ec681f3Smrg 2277ec681f3Smrgstatic void 2287ec681f3Smrgbi_copy_component(bi_builder *b, nir_intrinsic_instr *instr, bi_index tmp) 2297ec681f3Smrg{ 2307ec681f3Smrg unsigned component = nir_intrinsic_component(instr); 2317ec681f3Smrg 2327ec681f3Smrg if (component == 0) 2337ec681f3Smrg return; 2347ec681f3Smrg 2357ec681f3Smrg bi_index srcs[] = { tmp, tmp, tmp, tmp }; 2367ec681f3Smrg unsigned channels[] = { component, component + 1, component + 2 }; 2377ec681f3Smrg 2387ec681f3Smrg bi_make_vec_to(b, 2397ec681f3Smrg bi_dest_index(&instr->dest), 2407ec681f3Smrg srcs, channels, instr->num_components, 2417ec681f3Smrg nir_dest_bit_size(instr->dest)); 2427ec681f3Smrg} 2437ec681f3Smrg 2447ec681f3Smrgstatic void 2457ec681f3Smrgbi_emit_load_attr(bi_builder *b, nir_intrinsic_instr *instr) 2467ec681f3Smrg{ 2477ec681f3Smrg nir_alu_type T = nir_intrinsic_dest_type(instr); 2487ec681f3Smrg enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); 2497ec681f3Smrg nir_src *offset = nir_get_io_offset_src(instr); 2507ec681f3Smrg unsigned component = nir_intrinsic_component(instr); 2517ec681f3Smrg enum bi_vecsize vecsize = (instr->num_components + component - 1); 2527ec681f3Smrg unsigned imm_index = 0; 2537ec681f3Smrg unsigned base = nir_intrinsic_base(instr); 2547ec681f3Smrg bool constant = nir_src_is_const(*offset); 2557ec681f3Smrg bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); 2567ec681f3Smrg bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); 2577ec681f3Smrg 2587ec681f3Smrg if (immediate) { 2597ec681f3Smrg bi_ld_attr_imm_to(b, dest, bi_register(61), bi_register(62), 2607ec681f3Smrg regfmt, vecsize, imm_index); 2617ec681f3Smrg } else { 2627ec681f3Smrg bi_index idx = bi_src_index(&instr->src[0]); 2637ec681f3Smrg 2647ec681f3Smrg if (constant) 2657ec681f3Smrg idx = bi_imm_u32(imm_index); 2667ec681f3Smrg else if (base != 0) 2677ec681f3Smrg idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); 2687ec681f3Smrg 2697ec681f3Smrg bi_ld_attr_to(b, dest, bi_register(61), bi_register(62), 2707ec681f3Smrg idx, regfmt, vecsize); 2717ec681f3Smrg } 2727ec681f3Smrg 2737ec681f3Smrg bi_copy_component(b, instr, dest); 2747ec681f3Smrg} 2757ec681f3Smrg 2767ec681f3Smrgstatic void 2777ec681f3Smrgbi_emit_load_vary(bi_builder *b, nir_intrinsic_instr *instr) 2787ec681f3Smrg{ 2797ec681f3Smrg enum bi_sample sample = BI_SAMPLE_CENTER; 2807ec681f3Smrg enum bi_update update = BI_UPDATE_STORE; 2817ec681f3Smrg enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; 2827ec681f3Smrg bool smooth = instr->intrinsic == nir_intrinsic_load_interpolated_input; 2837ec681f3Smrg bi_index src0 = bi_null(); 2847ec681f3Smrg 2857ec681f3Smrg unsigned component = nir_intrinsic_component(instr); 2867ec681f3Smrg enum bi_vecsize vecsize = (instr->num_components + component - 1); 2877ec681f3Smrg bi_index dest = (component == 0) ? bi_dest_index(&instr->dest) : bi_temp(b->shader); 2887ec681f3Smrg 2897ec681f3Smrg unsigned sz = nir_dest_bit_size(instr->dest); 2907ec681f3Smrg 2917ec681f3Smrg if (smooth) { 2927ec681f3Smrg nir_intrinsic_instr *parent = nir_src_as_intrinsic(instr->src[0]); 2937ec681f3Smrg assert(parent); 2947ec681f3Smrg 2957ec681f3Smrg sample = bi_interp_for_intrinsic(parent->intrinsic); 2967ec681f3Smrg src0 = bi_varying_src0_for_barycentric(b, parent); 2977ec681f3Smrg 2987ec681f3Smrg assert(sz == 16 || sz == 32); 2997ec681f3Smrg regfmt = (sz == 16) ? BI_REGISTER_FORMAT_F16 3007ec681f3Smrg : BI_REGISTER_FORMAT_F32; 3017ec681f3Smrg } else { 3027ec681f3Smrg assert(sz == 32); 3037ec681f3Smrg regfmt = BI_REGISTER_FORMAT_U32; 3047ec681f3Smrg } 3057ec681f3Smrg 3067ec681f3Smrg nir_src *offset = nir_get_io_offset_src(instr); 3077ec681f3Smrg unsigned imm_index = 0; 3087ec681f3Smrg bool immediate = bi_is_intr_immediate(instr, &imm_index, 20); 3097ec681f3Smrg 3107ec681f3Smrg if (immediate && smooth) { 3117ec681f3Smrg bi_ld_var_imm_to(b, dest, src0, regfmt, sample, update, 3127ec681f3Smrg vecsize, imm_index); 3137ec681f3Smrg } else if (immediate && !smooth) { 3147ec681f3Smrg bi_ld_var_flat_imm_to(b, dest, BI_FUNCTION_NONE, regfmt, 3157ec681f3Smrg vecsize, imm_index); 3167ec681f3Smrg } else { 3177ec681f3Smrg bi_index idx = bi_src_index(offset); 3187ec681f3Smrg unsigned base = nir_intrinsic_base(instr); 3197ec681f3Smrg 3207ec681f3Smrg if (base != 0) 3217ec681f3Smrg idx = bi_iadd_u32(b, idx, bi_imm_u32(base), false); 3227ec681f3Smrg 3237ec681f3Smrg if (smooth) { 3247ec681f3Smrg bi_ld_var_to(b, dest, src0, idx, regfmt, sample, 3257ec681f3Smrg update, vecsize); 3267ec681f3Smrg } else { 3277ec681f3Smrg bi_ld_var_flat_to(b, dest, idx, BI_FUNCTION_NONE, 3287ec681f3Smrg regfmt, vecsize); 3297ec681f3Smrg } 3307ec681f3Smrg } 3317ec681f3Smrg 3327ec681f3Smrg bi_copy_component(b, instr, dest); 3337ec681f3Smrg} 3347ec681f3Smrg 3357ec681f3Smrgstatic void 3367ec681f3Smrgbi_make_vec16_to(bi_builder *b, bi_index dst, bi_index *src, 3377ec681f3Smrg unsigned *channel, unsigned count) 3387ec681f3Smrg{ 3397ec681f3Smrg for (unsigned i = 0; i < count; i += 2) { 3407ec681f3Smrg bool next = (i + 1) < count; 3417ec681f3Smrg 3427ec681f3Smrg unsigned chan = channel ? channel[i] : 0; 3437ec681f3Smrg unsigned nextc = next && channel ? channel[i + 1] : 0; 3447ec681f3Smrg 3457ec681f3Smrg bi_index w0 = bi_word(src[i], chan >> 1); 3467ec681f3Smrg bi_index w1 = next ? bi_word(src[i + 1], nextc >> 1) : bi_zero(); 3477ec681f3Smrg 3487ec681f3Smrg bi_index h0 = bi_half(w0, chan & 1); 3497ec681f3Smrg bi_index h1 = bi_half(w1, nextc & 1); 3507ec681f3Smrg 3517ec681f3Smrg bi_index to = bi_word(dst, i >> 1); 3527ec681f3Smrg 3537ec681f3Smrg if (bi_is_word_equiv(w0, w1) && (chan & 1) == 0 && ((nextc & 1) == 1)) 3547ec681f3Smrg bi_mov_i32_to(b, to, w0); 3557ec681f3Smrg else if (bi_is_word_equiv(w0, w1)) 3567ec681f3Smrg bi_swz_v2i16_to(b, to, bi_swz_16(w0, chan & 1, nextc & 1)); 3577ec681f3Smrg else 3587ec681f3Smrg bi_mkvec_v2i16_to(b, to, h0, h1); 3597ec681f3Smrg } 3607ec681f3Smrg} 3617ec681f3Smrg 3627ec681f3Smrgstatic void 3637ec681f3Smrgbi_make_vec_to(bi_builder *b, bi_index final_dst, 3647ec681f3Smrg bi_index *src, 3657ec681f3Smrg unsigned *channel, 3667ec681f3Smrg unsigned count, 3677ec681f3Smrg unsigned bitsize) 3687ec681f3Smrg{ 3697ec681f3Smrg /* If we reads our own output, we need a temporary move to allow for 3707ec681f3Smrg * swapping. TODO: Could do a bit better for pairwise swaps of 16-bit 3717ec681f3Smrg * vectors */ 3727ec681f3Smrg bool reads_self = false; 3737ec681f3Smrg 3747ec681f3Smrg for (unsigned i = 0; i < count; ++i) 3757ec681f3Smrg reads_self |= bi_is_equiv(final_dst, src[i]); 3767ec681f3Smrg 3777ec681f3Smrg /* SSA can't read itself */ 3787ec681f3Smrg assert(!reads_self || final_dst.reg); 3797ec681f3Smrg 3807ec681f3Smrg bi_index dst = reads_self ? bi_temp(b->shader) : final_dst; 3817ec681f3Smrg 3827ec681f3Smrg if (bitsize == 32) { 3837ec681f3Smrg for (unsigned i = 0; i < count; ++i) { 3847ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, i), 3857ec681f3Smrg bi_word(src[i], channel ? channel[i] : 0)); 3867ec681f3Smrg } 3877ec681f3Smrg } else if (bitsize == 16) { 3887ec681f3Smrg bi_make_vec16_to(b, dst, src, channel, count); 3897ec681f3Smrg } else if (bitsize == 8 && count == 1) { 3907ec681f3Smrg bi_swz_v4i8_to(b, dst, bi_byte( 3917ec681f3Smrg bi_word(src[0], channel[0] >> 2), 3927ec681f3Smrg channel[0] & 3)); 3937ec681f3Smrg } else { 3947ec681f3Smrg unreachable("8-bit mkvec not yet supported"); 3957ec681f3Smrg } 3967ec681f3Smrg 3977ec681f3Smrg /* Emit an explicit copy if needed */ 3987ec681f3Smrg if (!bi_is_equiv(dst, final_dst)) { 3997ec681f3Smrg unsigned shift = (bitsize == 8) ? 2 : (bitsize == 16) ? 1 : 0; 4007ec681f3Smrg unsigned vec = (1 << shift); 4017ec681f3Smrg 4027ec681f3Smrg for (unsigned i = 0; i < count; i += vec) { 4037ec681f3Smrg bi_mov_i32_to(b, bi_word(final_dst, i >> shift), 4047ec681f3Smrg bi_word(dst, i >> shift)); 4057ec681f3Smrg } 4067ec681f3Smrg } 4077ec681f3Smrg} 4087ec681f3Smrg 4097ec681f3Smrgstatic bi_instr * 4107ec681f3Smrgbi_load_sysval_to(bi_builder *b, bi_index dest, int sysval, 4117ec681f3Smrg unsigned nr_components, unsigned offset) 4127ec681f3Smrg{ 4137ec681f3Smrg unsigned sysval_ubo = 4147ec681f3Smrg MAX2(b->shader->inputs->sysval_ubo, b->shader->nir->info.num_ubos); 4157ec681f3Smrg unsigned uniform = 4167ec681f3Smrg pan_lookup_sysval(b->shader->sysval_to_id, 4177ec681f3Smrg &b->shader->info->sysvals, 4187ec681f3Smrg sysval); 4197ec681f3Smrg unsigned idx = (uniform * 16) + offset; 4207ec681f3Smrg 4217ec681f3Smrg return bi_load_to(b, nr_components * 32, dest, 4227ec681f3Smrg bi_imm_u32(idx), 4237ec681f3Smrg bi_imm_u32(sysval_ubo), BI_SEG_UBO); 4247ec681f3Smrg} 4257ec681f3Smrg 4267ec681f3Smrgstatic void 4277ec681f3Smrgbi_load_sysval_nir(bi_builder *b, nir_intrinsic_instr *intr, 4287ec681f3Smrg unsigned nr_components, unsigned offset) 4297ec681f3Smrg{ 4307ec681f3Smrg bi_load_sysval_to(b, bi_dest_index(&intr->dest), 4317ec681f3Smrg panfrost_sysval_for_instr(&intr->instr, NULL), 4327ec681f3Smrg nr_components, offset); 4337ec681f3Smrg} 4347ec681f3Smrg 4357ec681f3Smrgstatic bi_index 4367ec681f3Smrgbi_load_sysval(bi_builder *b, int sysval, 4377ec681f3Smrg unsigned nr_components, unsigned offset) 4387ec681f3Smrg{ 4397ec681f3Smrg bi_index tmp = bi_temp(b->shader); 4407ec681f3Smrg bi_load_sysval_to(b, tmp, sysval, nr_components, offset); 4417ec681f3Smrg return tmp; 4427ec681f3Smrg} 4437ec681f3Smrg 4447ec681f3Smrgstatic void 4457ec681f3Smrgbi_load_sample_id_to(bi_builder *b, bi_index dst) 4467ec681f3Smrg{ 4477ec681f3Smrg /* r61[16:23] contains the sampleID, mask it out. Upper bits 4487ec681f3Smrg * seem to read garbage (despite being architecturally defined 4497ec681f3Smrg * as zero), so use a 5-bit mask instead of 8-bits */ 4507ec681f3Smrg 4517ec681f3Smrg bi_rshift_and_i32_to(b, dst, bi_register(61), bi_imm_u32(0x1f), 4527ec681f3Smrg bi_imm_u8(16)); 4537ec681f3Smrg} 4547ec681f3Smrg 4557ec681f3Smrgstatic bi_index 4567ec681f3Smrgbi_load_sample_id(bi_builder *b) 4577ec681f3Smrg{ 4587ec681f3Smrg bi_index sample_id = bi_temp(b->shader); 4597ec681f3Smrg bi_load_sample_id_to(b, sample_id); 4607ec681f3Smrg return sample_id; 4617ec681f3Smrg} 4627ec681f3Smrg 4637ec681f3Smrgstatic bi_index 4647ec681f3Smrgbi_pixel_indices(bi_builder *b, unsigned rt) 4657ec681f3Smrg{ 4667ec681f3Smrg /* We want to load the current pixel. */ 4677ec681f3Smrg struct bifrost_pixel_indices pix = { 4687ec681f3Smrg .y = BIFROST_CURRENT_PIXEL, 4697ec681f3Smrg .rt = rt 4707ec681f3Smrg }; 4717ec681f3Smrg 4727ec681f3Smrg uint32_t indices_u32 = 0; 4737ec681f3Smrg memcpy(&indices_u32, &pix, sizeof(indices_u32)); 4747ec681f3Smrg bi_index indices = bi_imm_u32(indices_u32); 4757ec681f3Smrg 4767ec681f3Smrg /* Sample index above is left as zero. For multisampling, we need to 4777ec681f3Smrg * fill in the actual sample ID in the lower byte */ 4787ec681f3Smrg 4797ec681f3Smrg if (b->shader->inputs->blend.nr_samples > 1) 4807ec681f3Smrg indices = bi_iadd_u32(b, indices, bi_load_sample_id(b), false); 4817ec681f3Smrg 4827ec681f3Smrg return indices; 4837ec681f3Smrg} 4847ec681f3Smrg 4857ec681f3Smrgstatic void 4867ec681f3Smrgbi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) 4877ec681f3Smrg{ 4887ec681f3Smrg ASSERTED nir_io_semantics sem = nir_intrinsic_io_semantics(instr); 4897ec681f3Smrg 4907ec681f3Smrg /* Source color is passed through r0-r3, or r4-r7 for the second 4917ec681f3Smrg * source when dual-source blending. TODO: Precolour instead */ 4927ec681f3Smrg bi_index srcs[] = { 4937ec681f3Smrg bi_register(0), bi_register(1), bi_register(2), bi_register(3) 4947ec681f3Smrg }; 4957ec681f3Smrg bi_index srcs2[] = { 4967ec681f3Smrg bi_register(4), bi_register(5), bi_register(6), bi_register(7) 4977ec681f3Smrg }; 4987ec681f3Smrg 4997ec681f3Smrg bool second_source = (sem.location == VARYING_SLOT_VAR0); 5007ec681f3Smrg 5017ec681f3Smrg bi_make_vec_to(b, bi_dest_index(&instr->dest), 5027ec681f3Smrg second_source ? srcs2 : srcs, 5037ec681f3Smrg NULL, 4, 32); 5047ec681f3Smrg} 5057ec681f3Smrg 5067ec681f3Smrgstatic void 5077ec681f3Smrgbi_emit_blend_op(bi_builder *b, bi_index rgba, nir_alu_type T, unsigned rt) 5087ec681f3Smrg{ 5097ec681f3Smrg /* Reads 2 or 4 staging registers to cover the input */ 5107ec681f3Smrg unsigned size = nir_alu_type_get_type_size(T); 5117ec681f3Smrg unsigned sr_count = (size <= 16) ? 2 : 4; 5127ec681f3Smrg const struct panfrost_compile_inputs *inputs = b->shader->inputs; 5137ec681f3Smrg uint64_t blend_desc = inputs->blend.bifrost_blend_desc; 5147ec681f3Smrg 5157ec681f3Smrg if (inputs->is_blend && inputs->blend.nr_samples > 1) { 5167ec681f3Smrg /* Conversion descriptor comes from the compile inputs, pixel 5177ec681f3Smrg * indices derived at run time based on sample ID */ 5187ec681f3Smrg bi_st_tile(b, rgba, bi_pixel_indices(b, rt), bi_register(60), 5197ec681f3Smrg bi_imm_u32(blend_desc >> 32), BI_VECSIZE_V4); 5207ec681f3Smrg } else if (b->shader->inputs->is_blend) { 5217ec681f3Smrg /* Blend descriptor comes from the compile inputs */ 5227ec681f3Smrg /* Put the result in r0 */ 5237ec681f3Smrg bi_blend_to(b, bi_register(0), rgba, 5247ec681f3Smrg bi_register(60), 5257ec681f3Smrg bi_imm_u32(blend_desc & 0xffffffff), 5267ec681f3Smrg bi_imm_u32(blend_desc >> 32), sr_count); 5277ec681f3Smrg } else { 5287ec681f3Smrg /* Blend descriptor comes from the FAU RAM. By convention, the 5297ec681f3Smrg * return address is stored in r48 and will be used by the 5307ec681f3Smrg * blend shader to jump back to the fragment shader after */ 5317ec681f3Smrg bi_blend_to(b, bi_register(48), rgba, 5327ec681f3Smrg bi_register(60), 5337ec681f3Smrg bi_fau(BIR_FAU_BLEND_0 + rt, false), 5347ec681f3Smrg bi_fau(BIR_FAU_BLEND_0 + rt, true), sr_count); 5357ec681f3Smrg } 5367ec681f3Smrg 5377ec681f3Smrg assert(rt < 8); 5387ec681f3Smrg b->shader->info->bifrost.blend[rt].type = T; 5397ec681f3Smrg} 5407ec681f3Smrg 5417ec681f3Smrg/* Blend shaders do not need to run ATEST since they are dependent on a 5427ec681f3Smrg * fragment shader that runs it. Blit shaders may not need to run ATEST, since 5437ec681f3Smrg * ATEST is not needed if early-z is forced, alpha-to-coverage is disabled, and 5447ec681f3Smrg * there are no writes to the coverage mask. The latter two are satisfied for 5457ec681f3Smrg * all blit shaders, so we just care about early-z, which blit shaders force 5467ec681f3Smrg * iff they do not write depth or stencil */ 5477ec681f3Smrg 5487ec681f3Smrgstatic bool 5497ec681f3Smrgbi_skip_atest(bi_context *ctx, bool emit_zs) 5507ec681f3Smrg{ 5517ec681f3Smrg return (ctx->inputs->is_blit && !emit_zs) || ctx->inputs->is_blend; 5527ec681f3Smrg} 5537ec681f3Smrg 5547ec681f3Smrgstatic void 5557ec681f3Smrgbi_emit_atest(bi_builder *b, bi_index alpha) 5567ec681f3Smrg{ 5577ec681f3Smrg bi_index coverage = bi_register(60); 5587ec681f3Smrg bi_instr *atest = bi_atest_to(b, coverage, coverage, alpha); 5597ec681f3Smrg b->shader->emitted_atest = true; 5607ec681f3Smrg 5617ec681f3Smrg /* Pseudo-source to encode in the tuple */ 5627ec681f3Smrg atest->src[2] = bi_fau(BIR_FAU_ATEST_PARAM, false); 5637ec681f3Smrg} 5647ec681f3Smrg 5657ec681f3Smrgstatic void 5667ec681f3Smrgbi_emit_fragment_out(bi_builder *b, nir_intrinsic_instr *instr) 5677ec681f3Smrg{ 5687ec681f3Smrg bool combined = instr->intrinsic == 5697ec681f3Smrg nir_intrinsic_store_combined_output_pan; 5707ec681f3Smrg 5717ec681f3Smrg unsigned writeout = combined ? nir_intrinsic_component(instr) : 5727ec681f3Smrg PAN_WRITEOUT_C; 5737ec681f3Smrg 5747ec681f3Smrg bool emit_blend = writeout & (PAN_WRITEOUT_C); 5757ec681f3Smrg bool emit_zs = writeout & (PAN_WRITEOUT_Z | PAN_WRITEOUT_S); 5767ec681f3Smrg 5777ec681f3Smrg const nir_variable *var = 5787ec681f3Smrg nir_find_variable_with_driver_location(b->shader->nir, 5797ec681f3Smrg nir_var_shader_out, nir_intrinsic_base(instr)); 5807ec681f3Smrg assert(var); 5817ec681f3Smrg 5827ec681f3Smrg unsigned loc = var->data.location; 5837ec681f3Smrg bi_index src0 = bi_src_index(&instr->src[0]); 5847ec681f3Smrg 5857ec681f3Smrg /* By ISA convention, the coverage mask is stored in R60. The store 5867ec681f3Smrg * itself will be handled by a subsequent ATEST instruction */ 5877ec681f3Smrg if (loc == FRAG_RESULT_SAMPLE_MASK) { 5887ec681f3Smrg bi_index orig = bi_register(60); 5897ec681f3Smrg bi_index msaa = bi_load_sysval(b, PAN_SYSVAL_MULTISAMPLED, 1, 0); 5907ec681f3Smrg bi_index new = bi_lshift_and_i32(b, orig, src0, bi_imm_u8(0)); 5917ec681f3Smrg bi_mux_i32_to(b, orig, orig, new, msaa, BI_MUX_INT_ZERO); 5927ec681f3Smrg return; 5937ec681f3Smrg } 5947ec681f3Smrg 5957ec681f3Smrg 5967ec681f3Smrg /* Dual-source blending is implemented by putting the color in 5977ec681f3Smrg * registers r4-r7. */ 5987ec681f3Smrg if (var->data.index) { 5997ec681f3Smrg unsigned count = nir_src_num_components(instr->src[0]); 6007ec681f3Smrg 6017ec681f3Smrg for (unsigned i = 0; i < count; ++i) 6027ec681f3Smrg bi_mov_i32_to(b, bi_register(4 + i), bi_word(src0, i)); 6037ec681f3Smrg 6047ec681f3Smrg b->shader->info->bifrost.blend_src1_type = 6057ec681f3Smrg nir_intrinsic_src_type(instr); 6067ec681f3Smrg 6077ec681f3Smrg return; 6087ec681f3Smrg } 6097ec681f3Smrg 6107ec681f3Smrg /* Emit ATEST if we have to, note ATEST requires a floating-point alpha 6117ec681f3Smrg * value, but render target #0 might not be floating point. However the 6127ec681f3Smrg * alpha value is only used for alpha-to-coverage, a stage which is 6137ec681f3Smrg * skipped for pure integer framebuffers, so the issue is moot. */ 6147ec681f3Smrg 6157ec681f3Smrg if (!b->shader->emitted_atest && !bi_skip_atest(b->shader, emit_zs)) { 6167ec681f3Smrg nir_alu_type T = nir_intrinsic_src_type(instr); 6177ec681f3Smrg 6187ec681f3Smrg bi_index rgba = bi_src_index(&instr->src[0]); 6197ec681f3Smrg bi_index alpha = 6207ec681f3Smrg (T == nir_type_float16) ? bi_half(bi_word(rgba, 1), true) : 6217ec681f3Smrg (T == nir_type_float32) ? bi_word(rgba, 3) : 6227ec681f3Smrg bi_dontcare(); 6237ec681f3Smrg 6247ec681f3Smrg /* Don't read out-of-bounds */ 6257ec681f3Smrg if (nir_src_num_components(instr->src[0]) < 4) 6267ec681f3Smrg alpha = bi_imm_f32(1.0); 6277ec681f3Smrg 6287ec681f3Smrg bi_emit_atest(b, alpha); 6297ec681f3Smrg } 6307ec681f3Smrg 6317ec681f3Smrg if (emit_zs) { 6327ec681f3Smrg bi_index z = { 0 }, s = { 0 }; 6337ec681f3Smrg 6347ec681f3Smrg if (writeout & PAN_WRITEOUT_Z) 6357ec681f3Smrg z = bi_src_index(&instr->src[2]); 6367ec681f3Smrg 6377ec681f3Smrg if (writeout & PAN_WRITEOUT_S) 6387ec681f3Smrg s = bi_src_index(&instr->src[3]); 6397ec681f3Smrg 6407ec681f3Smrg bi_zs_emit_to(b, bi_register(60), z, s, bi_register(60), 6417ec681f3Smrg writeout & PAN_WRITEOUT_S, 6427ec681f3Smrg writeout & PAN_WRITEOUT_Z); 6437ec681f3Smrg } 6447ec681f3Smrg 6457ec681f3Smrg if (emit_blend) { 6467ec681f3Smrg assert(loc >= FRAG_RESULT_DATA0); 6477ec681f3Smrg 6487ec681f3Smrg unsigned rt = (loc - FRAG_RESULT_DATA0); 6497ec681f3Smrg bi_index color = bi_src_index(&instr->src[0]); 6507ec681f3Smrg 6517ec681f3Smrg /* Explicit copy since BLEND inputs are precoloured to R0-R3, 6527ec681f3Smrg * TODO: maybe schedule around this or implement in RA as a 6537ec681f3Smrg * spill */ 6547ec681f3Smrg bool has_mrt = false; 6557ec681f3Smrg 6567ec681f3Smrg nir_foreach_shader_out_variable(var, b->shader->nir) 6577ec681f3Smrg has_mrt |= (var->data.location > FRAG_RESULT_DATA0); 6587ec681f3Smrg 6597ec681f3Smrg if (has_mrt) { 6607ec681f3Smrg bi_index srcs[4] = { color, color, color, color }; 6617ec681f3Smrg unsigned channels[4] = { 0, 1, 2, 3 }; 6627ec681f3Smrg color = bi_temp(b->shader); 6637ec681f3Smrg bi_make_vec_to(b, color, srcs, channels, 6647ec681f3Smrg nir_src_num_components(instr->src[0]), 6657ec681f3Smrg nir_alu_type_get_type_size(nir_intrinsic_src_type(instr))); 6667ec681f3Smrg } 6677ec681f3Smrg 6687ec681f3Smrg bi_emit_blend_op(b, color, nir_intrinsic_src_type(instr), rt); 6697ec681f3Smrg } 6707ec681f3Smrg 6717ec681f3Smrg if (b->shader->inputs->is_blend) { 6727ec681f3Smrg /* Jump back to the fragment shader, return address is stored 6737ec681f3Smrg * in r48 (see above). 6747ec681f3Smrg */ 6757ec681f3Smrg bi_jump(b, bi_register(48)); 6767ec681f3Smrg } 6777ec681f3Smrg} 6787ec681f3Smrg 6797ec681f3Smrgstatic void 6807ec681f3Smrgbi_emit_store_vary(bi_builder *b, nir_intrinsic_instr *instr) 6817ec681f3Smrg{ 6827ec681f3Smrg /* In principle we can do better for 16-bit. At the moment we require 6837ec681f3Smrg * 32-bit to permit the use of .auto, in order to force .u32 for flat 6847ec681f3Smrg * varyings, to handle internal TGSI shaders that set flat in the VS 6857ec681f3Smrg * but smooth in the FS */ 6867ec681f3Smrg 6877ec681f3Smrg ASSERTED nir_alu_type T = nir_intrinsic_src_type(instr); 6887ec681f3Smrg assert(nir_alu_type_get_type_size(T) == 32); 6897ec681f3Smrg enum bi_register_format regfmt = BI_REGISTER_FORMAT_AUTO; 6907ec681f3Smrg 6917ec681f3Smrg unsigned imm_index = 0; 6927ec681f3Smrg bool immediate = bi_is_intr_immediate(instr, &imm_index, 16); 6937ec681f3Smrg 6947ec681f3Smrg bi_index address; 6957ec681f3Smrg if (immediate) { 6967ec681f3Smrg address = bi_lea_attr_imm(b, 6977ec681f3Smrg bi_register(61), bi_register(62), 6987ec681f3Smrg regfmt, imm_index); 6997ec681f3Smrg } else { 7007ec681f3Smrg bi_index idx = 7017ec681f3Smrg bi_iadd_u32(b, 7027ec681f3Smrg bi_src_index(nir_get_io_offset_src(instr)), 7037ec681f3Smrg bi_imm_u32(nir_intrinsic_base(instr)), 7047ec681f3Smrg false); 7057ec681f3Smrg address = bi_lea_attr(b, 7067ec681f3Smrg bi_register(61), bi_register(62), 7077ec681f3Smrg idx, regfmt); 7087ec681f3Smrg } 7097ec681f3Smrg 7107ec681f3Smrg /* Only look at the total components needed. In effect, we fill in all 7117ec681f3Smrg * the intermediate "holes" in the write mask, since we can't mask off 7127ec681f3Smrg * stores. Since nir_lower_io_to_temporaries ensures each varying is 7137ec681f3Smrg * written at most once, anything that's masked out is undefined, so it 7147ec681f3Smrg * doesn't matter what we write there. So we may as well do the 7157ec681f3Smrg * simplest thing possible. */ 7167ec681f3Smrg unsigned nr = util_last_bit(nir_intrinsic_write_mask(instr)); 7177ec681f3Smrg assert(nr > 0 && nr <= nir_intrinsic_src_components(instr, 0)); 7187ec681f3Smrg 7197ec681f3Smrg bi_st_cvt(b, bi_src_index(&instr->src[0]), address, 7207ec681f3Smrg bi_word(address, 1), bi_word(address, 2), 7217ec681f3Smrg regfmt, nr - 1); 7227ec681f3Smrg} 7237ec681f3Smrg 7247ec681f3Smrgstatic void 7257ec681f3Smrgbi_emit_load_ubo(bi_builder *b, nir_intrinsic_instr *instr) 7267ec681f3Smrg{ 7277ec681f3Smrg nir_src *offset = nir_get_io_offset_src(instr); 7287ec681f3Smrg 7297ec681f3Smrg bool offset_is_const = nir_src_is_const(*offset); 7307ec681f3Smrg bi_index dyn_offset = bi_src_index(offset); 7317ec681f3Smrg uint32_t const_offset = offset_is_const ? nir_src_as_uint(*offset) : 0; 7327ec681f3Smrg bool kernel_input = (instr->intrinsic == nir_intrinsic_load_kernel_input); 7337ec681f3Smrg 7347ec681f3Smrg bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest), 7357ec681f3Smrg bi_dest_index(&instr->dest), offset_is_const ? 7367ec681f3Smrg bi_imm_u32(const_offset) : dyn_offset, 7377ec681f3Smrg kernel_input ? bi_zero() : bi_src_index(&instr->src[0]), 7387ec681f3Smrg BI_SEG_UBO); 7397ec681f3Smrg} 7407ec681f3Smrg 7417ec681f3Smrgstatic bi_index 7427ec681f3Smrgbi_addr_high(nir_src *src) 7437ec681f3Smrg{ 7447ec681f3Smrg return (nir_src_bit_size(*src) == 64) ? 7457ec681f3Smrg bi_word(bi_src_index(src), 1) : bi_zero(); 7467ec681f3Smrg} 7477ec681f3Smrg 7487ec681f3Smrgstatic void 7497ec681f3Smrgbi_emit_load(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) 7507ec681f3Smrg{ 7517ec681f3Smrg bi_load_to(b, instr->num_components * nir_dest_bit_size(instr->dest), 7527ec681f3Smrg bi_dest_index(&instr->dest), 7537ec681f3Smrg bi_src_index(&instr->src[0]), bi_addr_high(&instr->src[0]), 7547ec681f3Smrg seg); 7557ec681f3Smrg} 7567ec681f3Smrg 7577ec681f3Smrgstatic void 7587ec681f3Smrgbi_emit_store(bi_builder *b, nir_intrinsic_instr *instr, enum bi_seg seg) 7597ec681f3Smrg{ 7607ec681f3Smrg /* Require contiguous masks, gauranteed by nir_lower_wrmasks */ 7617ec681f3Smrg assert(nir_intrinsic_write_mask(instr) == 7627ec681f3Smrg BITFIELD_MASK(instr->num_components)); 7637ec681f3Smrg 7647ec681f3Smrg bi_store(b, instr->num_components * nir_src_bit_size(instr->src[0]), 7657ec681f3Smrg bi_src_index(&instr->src[0]), 7667ec681f3Smrg bi_src_index(&instr->src[1]), bi_addr_high(&instr->src[1]), 7677ec681f3Smrg seg); 7687ec681f3Smrg} 7697ec681f3Smrg 7707ec681f3Smrg/* Exchanges the staging register with memory */ 7717ec681f3Smrg 7727ec681f3Smrgstatic void 7737ec681f3Smrgbi_emit_axchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg, enum bi_seg seg) 7747ec681f3Smrg{ 7757ec681f3Smrg assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); 7767ec681f3Smrg 7777ec681f3Smrg unsigned sz = nir_src_bit_size(*arg); 7787ec681f3Smrg assert(sz == 32 || sz == 64); 7797ec681f3Smrg 7807ec681f3Smrg bi_index data = bi_src_index(arg); 7817ec681f3Smrg 7827ec681f3Smrg bi_index data_words[] = { 7837ec681f3Smrg bi_word(data, 0), 7847ec681f3Smrg bi_word(data, 1), 7857ec681f3Smrg }; 7867ec681f3Smrg 7877ec681f3Smrg bi_index inout = bi_temp_reg(b->shader); 7887ec681f3Smrg bi_make_vec_to(b, inout, data_words, NULL, sz / 32, 32); 7897ec681f3Smrg 7907ec681f3Smrg bi_axchg_to(b, sz, inout, inout, 7917ec681f3Smrg bi_word(addr, 0), 7927ec681f3Smrg (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(), 7937ec681f3Smrg seg); 7947ec681f3Smrg 7957ec681f3Smrg bi_index inout_words[] = { 7967ec681f3Smrg bi_word(inout, 0), 7977ec681f3Smrg bi_word(inout, 1), 7987ec681f3Smrg }; 7997ec681f3Smrg 8007ec681f3Smrg bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); 8017ec681f3Smrg} 8027ec681f3Smrg 8037ec681f3Smrg/* Exchanges the second staging register with memory if comparison with first 8047ec681f3Smrg * staging register passes */ 8057ec681f3Smrg 8067ec681f3Smrgstatic void 8077ec681f3Smrgbi_emit_acmpxchg_to(bi_builder *b, bi_index dst, bi_index addr, nir_src *arg_1, nir_src *arg_2, enum bi_seg seg) 8087ec681f3Smrg{ 8097ec681f3Smrg assert(seg == BI_SEG_NONE || seg == BI_SEG_WLS); 8107ec681f3Smrg 8117ec681f3Smrg /* hardware is swapped from NIR */ 8127ec681f3Smrg bi_index src0 = bi_src_index(arg_2); 8137ec681f3Smrg bi_index src1 = bi_src_index(arg_1); 8147ec681f3Smrg 8157ec681f3Smrg unsigned sz = nir_src_bit_size(*arg_1); 8167ec681f3Smrg assert(sz == 32 || sz == 64); 8177ec681f3Smrg 8187ec681f3Smrg bi_index data_words[] = { 8197ec681f3Smrg bi_word(src0, 0), 8207ec681f3Smrg sz == 32 ? bi_word(src1, 0) : bi_word(src0, 1), 8217ec681f3Smrg 8227ec681f3Smrg /* 64-bit */ 8237ec681f3Smrg bi_word(src1, 0), 8247ec681f3Smrg bi_word(src1, 1), 8257ec681f3Smrg }; 8267ec681f3Smrg 8277ec681f3Smrg bi_index inout = bi_temp_reg(b->shader); 8287ec681f3Smrg bi_make_vec_to(b, inout, data_words, NULL, 2 * (sz / 32), 32); 8297ec681f3Smrg 8307ec681f3Smrg bi_acmpxchg_to(b, sz, inout, inout, 8317ec681f3Smrg bi_word(addr, 0), 8327ec681f3Smrg (seg == BI_SEG_NONE) ? bi_word(addr, 1) : bi_zero(), 8337ec681f3Smrg seg); 8347ec681f3Smrg 8357ec681f3Smrg bi_index inout_words[] = { 8367ec681f3Smrg bi_word(inout, 0), 8377ec681f3Smrg bi_word(inout, 1), 8387ec681f3Smrg }; 8397ec681f3Smrg 8407ec681f3Smrg bi_make_vec_to(b, dst, inout_words, NULL, sz / 32, 32); 8417ec681f3Smrg} 8427ec681f3Smrg 8437ec681f3Smrg/* Extracts an atomic opcode */ 8447ec681f3Smrg 8457ec681f3Smrgstatic enum bi_atom_opc 8467ec681f3Smrgbi_atom_opc_for_nir(nir_intrinsic_op op) 8477ec681f3Smrg{ 8487ec681f3Smrg switch (op) { 8497ec681f3Smrg case nir_intrinsic_global_atomic_add: 8507ec681f3Smrg case nir_intrinsic_shared_atomic_add: 8517ec681f3Smrg case nir_intrinsic_image_atomic_add: 8527ec681f3Smrg return BI_ATOM_OPC_AADD; 8537ec681f3Smrg 8547ec681f3Smrg case nir_intrinsic_global_atomic_imin: 8557ec681f3Smrg case nir_intrinsic_shared_atomic_imin: 8567ec681f3Smrg case nir_intrinsic_image_atomic_imin: 8577ec681f3Smrg return BI_ATOM_OPC_ASMIN; 8587ec681f3Smrg 8597ec681f3Smrg case nir_intrinsic_global_atomic_umin: 8607ec681f3Smrg case nir_intrinsic_shared_atomic_umin: 8617ec681f3Smrg case nir_intrinsic_image_atomic_umin: 8627ec681f3Smrg return BI_ATOM_OPC_AUMIN; 8637ec681f3Smrg 8647ec681f3Smrg case nir_intrinsic_global_atomic_imax: 8657ec681f3Smrg case nir_intrinsic_shared_atomic_imax: 8667ec681f3Smrg case nir_intrinsic_image_atomic_imax: 8677ec681f3Smrg return BI_ATOM_OPC_ASMAX; 8687ec681f3Smrg 8697ec681f3Smrg case nir_intrinsic_global_atomic_umax: 8707ec681f3Smrg case nir_intrinsic_shared_atomic_umax: 8717ec681f3Smrg case nir_intrinsic_image_atomic_umax: 8727ec681f3Smrg return BI_ATOM_OPC_AUMAX; 8737ec681f3Smrg 8747ec681f3Smrg case nir_intrinsic_global_atomic_and: 8757ec681f3Smrg case nir_intrinsic_shared_atomic_and: 8767ec681f3Smrg case nir_intrinsic_image_atomic_and: 8777ec681f3Smrg return BI_ATOM_OPC_AAND; 8787ec681f3Smrg 8797ec681f3Smrg case nir_intrinsic_global_atomic_or: 8807ec681f3Smrg case nir_intrinsic_shared_atomic_or: 8817ec681f3Smrg case nir_intrinsic_image_atomic_or: 8827ec681f3Smrg return BI_ATOM_OPC_AOR; 8837ec681f3Smrg 8847ec681f3Smrg case nir_intrinsic_global_atomic_xor: 8857ec681f3Smrg case nir_intrinsic_shared_atomic_xor: 8867ec681f3Smrg case nir_intrinsic_image_atomic_xor: 8877ec681f3Smrg return BI_ATOM_OPC_AXOR; 8887ec681f3Smrg 8897ec681f3Smrg default: 8907ec681f3Smrg unreachable("Unexpected computational atomic"); 8917ec681f3Smrg } 8927ec681f3Smrg} 8937ec681f3Smrg 8947ec681f3Smrg/* Optimized unary atomics are available with an implied #1 argument */ 8957ec681f3Smrg 8967ec681f3Smrgstatic bool 8977ec681f3Smrgbi_promote_atom_c1(enum bi_atom_opc op, bi_index arg, enum bi_atom_opc *out) 8987ec681f3Smrg{ 8997ec681f3Smrg /* Check we have a compatible constant */ 9007ec681f3Smrg if (arg.type != BI_INDEX_CONSTANT) 9017ec681f3Smrg return false; 9027ec681f3Smrg 9037ec681f3Smrg if (!(arg.value == 1 || (arg.value == -1 && op == BI_ATOM_OPC_AADD))) 9047ec681f3Smrg return false; 9057ec681f3Smrg 9067ec681f3Smrg /* Check for a compatible operation */ 9077ec681f3Smrg switch (op) { 9087ec681f3Smrg case BI_ATOM_OPC_AADD: 9097ec681f3Smrg *out = (arg.value == 1) ? BI_ATOM_OPC_AINC : BI_ATOM_OPC_ADEC; 9107ec681f3Smrg return true; 9117ec681f3Smrg case BI_ATOM_OPC_ASMAX: 9127ec681f3Smrg *out = BI_ATOM_OPC_ASMAX1; 9137ec681f3Smrg return true; 9147ec681f3Smrg case BI_ATOM_OPC_AUMAX: 9157ec681f3Smrg *out = BI_ATOM_OPC_AUMAX1; 9167ec681f3Smrg return true; 9177ec681f3Smrg case BI_ATOM_OPC_AOR: 9187ec681f3Smrg *out = BI_ATOM_OPC_AOR1; 9197ec681f3Smrg return true; 9207ec681f3Smrg default: 9217ec681f3Smrg return false; 9227ec681f3Smrg } 9237ec681f3Smrg} 9247ec681f3Smrg 9257ec681f3Smrg/* Coordinates are 16-bit integers in Bifrost but 32-bit in NIR */ 9267ec681f3Smrg 9277ec681f3Smrgstatic bi_index 9287ec681f3Smrgbi_emit_image_coord(bi_builder *b, bi_index coord, unsigned src_idx, 9297ec681f3Smrg unsigned coord_comps, bool is_array) 9307ec681f3Smrg{ 9317ec681f3Smrg assert(coord_comps > 0 && coord_comps <= 3); 9327ec681f3Smrg 9337ec681f3Smrg if (src_idx == 0) { 9347ec681f3Smrg if (coord_comps == 1 || (coord_comps == 2 && is_array)) 9357ec681f3Smrg return bi_word(coord, 0); 9367ec681f3Smrg else 9377ec681f3Smrg return bi_mkvec_v2i16(b, 9387ec681f3Smrg bi_half(bi_word(coord, 0), false), 9397ec681f3Smrg bi_half(bi_word(coord, 1), false)); 9407ec681f3Smrg } else { 9417ec681f3Smrg if (coord_comps == 3) 9427ec681f3Smrg return bi_word(coord, 2); 9437ec681f3Smrg else if (coord_comps == 2 && is_array) 9447ec681f3Smrg return bi_word(coord, 1); 9457ec681f3Smrg else 9467ec681f3Smrg return bi_zero(); 9477ec681f3Smrg } 9487ec681f3Smrg} 9497ec681f3Smrg 9507ec681f3Smrgstatic bi_index 9517ec681f3Smrgbi_emit_image_index(bi_builder *b, nir_intrinsic_instr *instr) 9527ec681f3Smrg{ 9537ec681f3Smrg nir_src src = instr->src[0]; 9547ec681f3Smrg bi_index index = bi_src_index(&src); 9557ec681f3Smrg bi_context *ctx = b->shader; 9567ec681f3Smrg 9577ec681f3Smrg /* Images come after vertex attributes, so handle an explicit offset */ 9587ec681f3Smrg unsigned offset = (ctx->stage == MESA_SHADER_VERTEX) ? 9597ec681f3Smrg util_bitcount64(ctx->nir->info.inputs_read) : 0; 9607ec681f3Smrg 9617ec681f3Smrg if (offset == 0) 9627ec681f3Smrg return index; 9637ec681f3Smrg else if (nir_src_is_const(src)) 9647ec681f3Smrg return bi_imm_u32(nir_src_as_uint(src) + offset); 9657ec681f3Smrg else 9667ec681f3Smrg return bi_iadd_u32(b, index, bi_imm_u32(offset), false); 9677ec681f3Smrg} 9687ec681f3Smrg 9697ec681f3Smrgstatic void 9707ec681f3Smrgbi_emit_image_load(bi_builder *b, nir_intrinsic_instr *instr) 9717ec681f3Smrg{ 9727ec681f3Smrg enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 9737ec681f3Smrg unsigned coord_comps = nir_image_intrinsic_coord_components(instr); 9747ec681f3Smrg bool array = nir_intrinsic_image_array(instr); 9757ec681f3Smrg ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 9767ec681f3Smrg 9777ec681f3Smrg bi_index coords = bi_src_index(&instr->src[1]); 9787ec681f3Smrg /* TODO: MSAA */ 9797ec681f3Smrg assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 9807ec681f3Smrg 9817ec681f3Smrg bi_ld_attr_tex_to(b, bi_dest_index(&instr->dest), 9827ec681f3Smrg bi_emit_image_coord(b, coords, 0, coord_comps, array), 9837ec681f3Smrg bi_emit_image_coord(b, coords, 1, coord_comps, array), 9847ec681f3Smrg bi_emit_image_index(b, instr), 9857ec681f3Smrg bi_reg_fmt_for_nir(nir_intrinsic_dest_type(instr)), 9867ec681f3Smrg instr->num_components - 1); 9877ec681f3Smrg} 9887ec681f3Smrg 9897ec681f3Smrgstatic bi_index 9907ec681f3Smrgbi_emit_lea_image(bi_builder *b, nir_intrinsic_instr *instr) 9917ec681f3Smrg{ 9927ec681f3Smrg enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr); 9937ec681f3Smrg bool array = nir_intrinsic_image_array(instr); 9947ec681f3Smrg ASSERTED unsigned nr_dim = glsl_get_sampler_dim_coordinate_components(dim); 9957ec681f3Smrg unsigned coord_comps = nir_image_intrinsic_coord_components(instr); 9967ec681f3Smrg 9977ec681f3Smrg /* TODO: MSAA */ 9987ec681f3Smrg assert(nr_dim != GLSL_SAMPLER_DIM_MS && "MSAA'd images not supported"); 9997ec681f3Smrg 10007ec681f3Smrg enum bi_register_format type = (instr->intrinsic == nir_intrinsic_image_store) ? 10017ec681f3Smrg bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)) : 10027ec681f3Smrg BI_REGISTER_FORMAT_AUTO; 10037ec681f3Smrg 10047ec681f3Smrg bi_index coords = bi_src_index(&instr->src[1]); 10057ec681f3Smrg bi_index xy = bi_emit_image_coord(b, coords, 0, coord_comps, array); 10067ec681f3Smrg bi_index zw = bi_emit_image_coord(b, coords, 1, coord_comps, array); 10077ec681f3Smrg 10087ec681f3Smrg bi_instr *I = bi_lea_attr_tex_to(b, bi_temp(b->shader), xy, zw, 10097ec681f3Smrg bi_emit_image_index(b, instr), type); 10107ec681f3Smrg 10117ec681f3Smrg /* LEA_ATTR_TEX defaults to the secondary attribute table, but our ABI 10127ec681f3Smrg * has all images in the primary attribute table */ 10137ec681f3Smrg I->table = BI_TABLE_ATTRIBUTE_1; 10147ec681f3Smrg 10157ec681f3Smrg return I->dest[0]; 10167ec681f3Smrg} 10177ec681f3Smrg 10187ec681f3Smrgstatic void 10197ec681f3Smrgbi_emit_image_store(bi_builder *b, nir_intrinsic_instr *instr) 10207ec681f3Smrg{ 10217ec681f3Smrg bi_index addr = bi_emit_lea_image(b, instr); 10227ec681f3Smrg 10237ec681f3Smrg bi_st_cvt(b, bi_src_index(&instr->src[3]), 10247ec681f3Smrg addr, bi_word(addr, 1), bi_word(addr, 2), 10257ec681f3Smrg bi_reg_fmt_for_nir(nir_intrinsic_src_type(instr)), 10267ec681f3Smrg instr->num_components - 1); 10277ec681f3Smrg} 10287ec681f3Smrg 10297ec681f3Smrgstatic void 10307ec681f3Smrgbi_emit_atomic_i32_to(bi_builder *b, bi_index dst, 10317ec681f3Smrg bi_index addr, bi_index arg, nir_intrinsic_op intrinsic) 10327ec681f3Smrg{ 10337ec681f3Smrg /* ATOM_C.i32 takes a vector with {arg, coalesced}, ATOM_C1.i32 doesn't 10347ec681f3Smrg * take any vector but can still output in RETURN mode */ 10357ec681f3Smrg bi_index sr = bi_temp_reg(b->shader); 10367ec681f3Smrg 10377ec681f3Smrg enum bi_atom_opc opc = bi_atom_opc_for_nir(intrinsic); 10387ec681f3Smrg enum bi_atom_opc post_opc = opc; 10397ec681f3Smrg 10407ec681f3Smrg /* Generate either ATOM_C or ATOM_C1 as required */ 10417ec681f3Smrg if (bi_promote_atom_c1(opc, arg, &opc)) { 10427ec681f3Smrg bi_patom_c1_i32_to(b, sr, bi_word(addr, 0), 10437ec681f3Smrg bi_word(addr, 1), opc, 2); 10447ec681f3Smrg } else { 10457ec681f3Smrg bi_mov_i32_to(b, sr, arg); 10467ec681f3Smrg bi_patom_c_i32_to(b, sr, sr, bi_word(addr, 0), 10477ec681f3Smrg bi_word(addr, 1), opc, 2); 10487ec681f3Smrg } 10497ec681f3Smrg 10507ec681f3Smrg /* Post-process it */ 10517ec681f3Smrg bi_atom_post_i32_to(b, dst, bi_word(sr, 0), bi_word(sr, 1), post_opc); 10527ec681f3Smrg} 10537ec681f3Smrg 10547ec681f3Smrg/* gl_FragCoord.xy = u16_to_f32(R59.xy) + 0.5 10557ec681f3Smrg * gl_FragCoord.z = ld_vary(fragz) 10567ec681f3Smrg * gl_FragCoord.w = ld_vary(fragw) 10577ec681f3Smrg */ 10587ec681f3Smrg 10597ec681f3Smrgstatic void 10607ec681f3Smrgbi_emit_load_frag_coord(bi_builder *b, nir_intrinsic_instr *instr) 10617ec681f3Smrg{ 10627ec681f3Smrg bi_index src[4] = {}; 10637ec681f3Smrg 10647ec681f3Smrg for (unsigned i = 0; i < 2; ++i) { 10657ec681f3Smrg src[i] = bi_fadd_f32(b, 10667ec681f3Smrg bi_u16_to_f32(b, bi_half(bi_register(59), i)), 10677ec681f3Smrg bi_imm_f32(0.5f), BI_ROUND_NONE); 10687ec681f3Smrg } 10697ec681f3Smrg 10707ec681f3Smrg for (unsigned i = 0; i < 2; ++i) { 10717ec681f3Smrg src[2 + i] = bi_ld_var_special(b, bi_zero(), 10727ec681f3Smrg BI_REGISTER_FORMAT_F32, BI_SAMPLE_CENTER, 10737ec681f3Smrg BI_UPDATE_CLOBBER, 10747ec681f3Smrg (i == 0) ? BI_VARYING_NAME_FRAG_Z : 10757ec681f3Smrg BI_VARYING_NAME_FRAG_W, 10767ec681f3Smrg BI_VECSIZE_NONE); 10777ec681f3Smrg } 10787ec681f3Smrg 10797ec681f3Smrg bi_make_vec_to(b, bi_dest_index(&instr->dest), src, NULL, 4, 32); 10807ec681f3Smrg} 10817ec681f3Smrg 10827ec681f3Smrgstatic void 10837ec681f3Smrgbi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr) 10847ec681f3Smrg{ 10857ec681f3Smrg unsigned rt = b->shader->inputs->blend.rt; 10867ec681f3Smrg unsigned size = nir_dest_bit_size(instr->dest); 10877ec681f3Smrg 10887ec681f3Smrg /* Get the render target */ 10897ec681f3Smrg if (!b->shader->inputs->is_blend) { 10907ec681f3Smrg const nir_variable *var = 10917ec681f3Smrg nir_find_variable_with_driver_location(b->shader->nir, 10927ec681f3Smrg nir_var_shader_out, nir_intrinsic_base(instr)); 10937ec681f3Smrg unsigned loc = var->data.location; 10947ec681f3Smrg assert(loc >= FRAG_RESULT_DATA0); 10957ec681f3Smrg rt = (loc - FRAG_RESULT_DATA0); 10967ec681f3Smrg } 10977ec681f3Smrg 10987ec681f3Smrg bi_index desc = b->shader->inputs->is_blend ? 10997ec681f3Smrg bi_imm_u32(b->shader->inputs->blend.bifrost_blend_desc >> 32) : 11007ec681f3Smrg b->shader->inputs->bifrost.static_rt_conv ? 11017ec681f3Smrg bi_imm_u32(b->shader->inputs->bifrost.rt_conv[rt]) : 11027ec681f3Smrg bi_load_sysval(b, PAN_SYSVAL(RT_CONVERSION, rt | (size << 4)), 1, 0); 11037ec681f3Smrg 11047ec681f3Smrg bi_ld_tile_to(b, bi_dest_index(&instr->dest), bi_pixel_indices(b, rt), 11057ec681f3Smrg bi_register(60), desc, (instr->num_components - 1)); 11067ec681f3Smrg} 11077ec681f3Smrg 11087ec681f3Smrgstatic void 11097ec681f3Smrgbi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) 11107ec681f3Smrg{ 11117ec681f3Smrg bi_index dst = nir_intrinsic_infos[instr->intrinsic].has_dest ? 11127ec681f3Smrg bi_dest_index(&instr->dest) : bi_null(); 11137ec681f3Smrg gl_shader_stage stage = b->shader->stage; 11147ec681f3Smrg 11157ec681f3Smrg switch (instr->intrinsic) { 11167ec681f3Smrg case nir_intrinsic_load_barycentric_pixel: 11177ec681f3Smrg case nir_intrinsic_load_barycentric_centroid: 11187ec681f3Smrg case nir_intrinsic_load_barycentric_sample: 11197ec681f3Smrg case nir_intrinsic_load_barycentric_at_sample: 11207ec681f3Smrg case nir_intrinsic_load_barycentric_at_offset: 11217ec681f3Smrg /* handled later via load_vary */ 11227ec681f3Smrg break; 11237ec681f3Smrg case nir_intrinsic_load_interpolated_input: 11247ec681f3Smrg case nir_intrinsic_load_input: 11257ec681f3Smrg if (b->shader->inputs->is_blend) 11267ec681f3Smrg bi_emit_load_blend_input(b, instr); 11277ec681f3Smrg else if (stage == MESA_SHADER_FRAGMENT) 11287ec681f3Smrg bi_emit_load_vary(b, instr); 11297ec681f3Smrg else if (stage == MESA_SHADER_VERTEX) 11307ec681f3Smrg bi_emit_load_attr(b, instr); 11317ec681f3Smrg else 11327ec681f3Smrg unreachable("Unsupported shader stage"); 11337ec681f3Smrg break; 11347ec681f3Smrg 11357ec681f3Smrg case nir_intrinsic_store_output: 11367ec681f3Smrg if (stage == MESA_SHADER_FRAGMENT) 11377ec681f3Smrg bi_emit_fragment_out(b, instr); 11387ec681f3Smrg else if (stage == MESA_SHADER_VERTEX) 11397ec681f3Smrg bi_emit_store_vary(b, instr); 11407ec681f3Smrg else 11417ec681f3Smrg unreachable("Unsupported shader stage"); 11427ec681f3Smrg break; 11437ec681f3Smrg 11447ec681f3Smrg case nir_intrinsic_store_combined_output_pan: 11457ec681f3Smrg assert(stage == MESA_SHADER_FRAGMENT); 11467ec681f3Smrg bi_emit_fragment_out(b, instr); 11477ec681f3Smrg break; 11487ec681f3Smrg 11497ec681f3Smrg case nir_intrinsic_load_ubo: 11507ec681f3Smrg case nir_intrinsic_load_kernel_input: 11517ec681f3Smrg bi_emit_load_ubo(b, instr); 11527ec681f3Smrg break; 11537ec681f3Smrg 11547ec681f3Smrg case nir_intrinsic_load_global: 11557ec681f3Smrg case nir_intrinsic_load_global_constant: 11567ec681f3Smrg bi_emit_load(b, instr, BI_SEG_NONE); 11577ec681f3Smrg break; 11587ec681f3Smrg 11597ec681f3Smrg case nir_intrinsic_store_global: 11607ec681f3Smrg bi_emit_store(b, instr, BI_SEG_NONE); 11617ec681f3Smrg break; 11627ec681f3Smrg 11637ec681f3Smrg case nir_intrinsic_load_scratch: 11647ec681f3Smrg bi_emit_load(b, instr, BI_SEG_TL); 11657ec681f3Smrg break; 11667ec681f3Smrg 11677ec681f3Smrg case nir_intrinsic_store_scratch: 11687ec681f3Smrg bi_emit_store(b, instr, BI_SEG_TL); 11697ec681f3Smrg break; 11707ec681f3Smrg 11717ec681f3Smrg case nir_intrinsic_load_shared: 11727ec681f3Smrg bi_emit_load(b, instr, BI_SEG_WLS); 11737ec681f3Smrg break; 11747ec681f3Smrg 11757ec681f3Smrg case nir_intrinsic_store_shared: 11767ec681f3Smrg bi_emit_store(b, instr, BI_SEG_WLS); 11777ec681f3Smrg break; 11787ec681f3Smrg 11797ec681f3Smrg /* Blob doesn't seem to do anything for memory barriers, note +BARRIER 11807ec681f3Smrg * is illegal in fragment shaders */ 11817ec681f3Smrg case nir_intrinsic_memory_barrier: 11827ec681f3Smrg case nir_intrinsic_memory_barrier_buffer: 11837ec681f3Smrg case nir_intrinsic_memory_barrier_image: 11847ec681f3Smrg case nir_intrinsic_memory_barrier_shared: 11857ec681f3Smrg case nir_intrinsic_group_memory_barrier: 11867ec681f3Smrg break; 11877ec681f3Smrg 11887ec681f3Smrg case nir_intrinsic_control_barrier: 11897ec681f3Smrg assert(b->shader->stage != MESA_SHADER_FRAGMENT); 11907ec681f3Smrg bi_barrier(b); 11917ec681f3Smrg break; 11927ec681f3Smrg 11937ec681f3Smrg case nir_intrinsic_shared_atomic_add: 11947ec681f3Smrg case nir_intrinsic_shared_atomic_imin: 11957ec681f3Smrg case nir_intrinsic_shared_atomic_umin: 11967ec681f3Smrg case nir_intrinsic_shared_atomic_imax: 11977ec681f3Smrg case nir_intrinsic_shared_atomic_umax: 11987ec681f3Smrg case nir_intrinsic_shared_atomic_and: 11997ec681f3Smrg case nir_intrinsic_shared_atomic_or: 12007ec681f3Smrg case nir_intrinsic_shared_atomic_xor: { 12017ec681f3Smrg assert(nir_src_bit_size(instr->src[1]) == 32); 12027ec681f3Smrg 12037ec681f3Smrg bi_index addr = bi_seg_add_i64(b, bi_src_index(&instr->src[0]), 12047ec681f3Smrg bi_zero(), false, BI_SEG_WLS); 12057ec681f3Smrg 12067ec681f3Smrg bi_emit_atomic_i32_to(b, dst, addr, bi_src_index(&instr->src[1]), 12077ec681f3Smrg instr->intrinsic); 12087ec681f3Smrg break; 12097ec681f3Smrg } 12107ec681f3Smrg 12117ec681f3Smrg case nir_intrinsic_image_atomic_add: 12127ec681f3Smrg case nir_intrinsic_image_atomic_imin: 12137ec681f3Smrg case nir_intrinsic_image_atomic_umin: 12147ec681f3Smrg case nir_intrinsic_image_atomic_imax: 12157ec681f3Smrg case nir_intrinsic_image_atomic_umax: 12167ec681f3Smrg case nir_intrinsic_image_atomic_and: 12177ec681f3Smrg case nir_intrinsic_image_atomic_or: 12187ec681f3Smrg case nir_intrinsic_image_atomic_xor: 12197ec681f3Smrg assert(nir_src_bit_size(instr->src[3]) == 32); 12207ec681f3Smrg 12217ec681f3Smrg bi_emit_atomic_i32_to(b, dst, 12227ec681f3Smrg bi_emit_lea_image(b, instr), 12237ec681f3Smrg bi_src_index(&instr->src[3]), 12247ec681f3Smrg instr->intrinsic); 12257ec681f3Smrg break; 12267ec681f3Smrg 12277ec681f3Smrg case nir_intrinsic_global_atomic_add: 12287ec681f3Smrg case nir_intrinsic_global_atomic_imin: 12297ec681f3Smrg case nir_intrinsic_global_atomic_umin: 12307ec681f3Smrg case nir_intrinsic_global_atomic_imax: 12317ec681f3Smrg case nir_intrinsic_global_atomic_umax: 12327ec681f3Smrg case nir_intrinsic_global_atomic_and: 12337ec681f3Smrg case nir_intrinsic_global_atomic_or: 12347ec681f3Smrg case nir_intrinsic_global_atomic_xor: 12357ec681f3Smrg assert(nir_src_bit_size(instr->src[1]) == 32); 12367ec681f3Smrg 12377ec681f3Smrg bi_emit_atomic_i32_to(b, dst, 12387ec681f3Smrg bi_src_index(&instr->src[0]), 12397ec681f3Smrg bi_src_index(&instr->src[1]), 12407ec681f3Smrg instr->intrinsic); 12417ec681f3Smrg break; 12427ec681f3Smrg 12437ec681f3Smrg case nir_intrinsic_image_load: 12447ec681f3Smrg bi_emit_image_load(b, instr); 12457ec681f3Smrg break; 12467ec681f3Smrg 12477ec681f3Smrg case nir_intrinsic_image_store: 12487ec681f3Smrg bi_emit_image_store(b, instr); 12497ec681f3Smrg break; 12507ec681f3Smrg 12517ec681f3Smrg case nir_intrinsic_global_atomic_exchange: 12527ec681f3Smrg bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), 12537ec681f3Smrg &instr->src[1], BI_SEG_NONE); 12547ec681f3Smrg break; 12557ec681f3Smrg 12567ec681f3Smrg case nir_intrinsic_image_atomic_exchange: 12577ec681f3Smrg bi_emit_axchg_to(b, dst, bi_emit_lea_image(b, instr), 12587ec681f3Smrg &instr->src[3], BI_SEG_NONE); 12597ec681f3Smrg break; 12607ec681f3Smrg 12617ec681f3Smrg case nir_intrinsic_shared_atomic_exchange: 12627ec681f3Smrg bi_emit_axchg_to(b, dst, bi_src_index(&instr->src[0]), 12637ec681f3Smrg &instr->src[1], BI_SEG_WLS); 12647ec681f3Smrg break; 12657ec681f3Smrg 12667ec681f3Smrg case nir_intrinsic_global_atomic_comp_swap: 12677ec681f3Smrg bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), 12687ec681f3Smrg &instr->src[1], &instr->src[2], BI_SEG_NONE); 12697ec681f3Smrg break; 12707ec681f3Smrg 12717ec681f3Smrg case nir_intrinsic_image_atomic_comp_swap: 12727ec681f3Smrg bi_emit_acmpxchg_to(b, dst, bi_emit_lea_image(b, instr), 12737ec681f3Smrg &instr->src[3], &instr->src[4], BI_SEG_NONE); 12747ec681f3Smrg break; 12757ec681f3Smrg 12767ec681f3Smrg case nir_intrinsic_shared_atomic_comp_swap: 12777ec681f3Smrg bi_emit_acmpxchg_to(b, dst, bi_src_index(&instr->src[0]), 12787ec681f3Smrg &instr->src[1], &instr->src[2], BI_SEG_WLS); 12797ec681f3Smrg break; 12807ec681f3Smrg 12817ec681f3Smrg case nir_intrinsic_load_frag_coord: 12827ec681f3Smrg bi_emit_load_frag_coord(b, instr); 12837ec681f3Smrg break; 12847ec681f3Smrg 12857ec681f3Smrg case nir_intrinsic_load_output: 12867ec681f3Smrg bi_emit_ld_tile(b, instr); 12877ec681f3Smrg break; 12887ec681f3Smrg 12897ec681f3Smrg case nir_intrinsic_discard_if: { 12907ec681f3Smrg bi_index src = bi_src_index(&instr->src[0]); 12917ec681f3Smrg assert(nir_src_bit_size(instr->src[0]) == 1); 12927ec681f3Smrg bi_discard_b32(b, bi_half(src, false)); 12937ec681f3Smrg break; 12947ec681f3Smrg } 12957ec681f3Smrg 12967ec681f3Smrg case nir_intrinsic_discard: 12977ec681f3Smrg bi_discard_f32(b, bi_zero(), bi_zero(), BI_CMPF_EQ); 12987ec681f3Smrg break; 12997ec681f3Smrg 13007ec681f3Smrg case nir_intrinsic_load_ssbo_address: 13017ec681f3Smrg bi_load_sysval_nir(b, instr, 2, 0); 13027ec681f3Smrg break; 13037ec681f3Smrg 13047ec681f3Smrg case nir_intrinsic_load_work_dim: 13057ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 0); 13067ec681f3Smrg break; 13077ec681f3Smrg 13087ec681f3Smrg case nir_intrinsic_load_first_vertex: 13097ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 0); 13107ec681f3Smrg break; 13117ec681f3Smrg 13127ec681f3Smrg case nir_intrinsic_load_base_vertex: 13137ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 4); 13147ec681f3Smrg break; 13157ec681f3Smrg 13167ec681f3Smrg case nir_intrinsic_load_base_instance: 13177ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 8); 13187ec681f3Smrg break; 13197ec681f3Smrg 13207ec681f3Smrg case nir_intrinsic_load_draw_id: 13217ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 0); 13227ec681f3Smrg break; 13237ec681f3Smrg 13247ec681f3Smrg case nir_intrinsic_get_ssbo_size: 13257ec681f3Smrg bi_load_sysval_nir(b, instr, 1, 8); 13267ec681f3Smrg break; 13277ec681f3Smrg 13287ec681f3Smrg case nir_intrinsic_load_viewport_scale: 13297ec681f3Smrg case nir_intrinsic_load_viewport_offset: 13307ec681f3Smrg case nir_intrinsic_load_num_workgroups: 13317ec681f3Smrg case nir_intrinsic_load_workgroup_size: 13327ec681f3Smrg bi_load_sysval_nir(b, instr, 3, 0); 13337ec681f3Smrg break; 13347ec681f3Smrg 13357ec681f3Smrg case nir_intrinsic_image_size: 13367ec681f3Smrg bi_load_sysval_nir(b, instr, 13377ec681f3Smrg nir_dest_num_components(instr->dest), 0); 13387ec681f3Smrg break; 13397ec681f3Smrg 13407ec681f3Smrg case nir_intrinsic_load_blend_const_color_rgba: 13417ec681f3Smrg bi_load_sysval_nir(b, instr, 13427ec681f3Smrg nir_dest_num_components(instr->dest), 0); 13437ec681f3Smrg break; 13447ec681f3Smrg 13457ec681f3Smrg case nir_intrinsic_load_sample_positions_pan: 13467ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 0), 13477ec681f3Smrg bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, false)); 13487ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 1), 13497ec681f3Smrg bi_fau(BIR_FAU_SAMPLE_POS_ARRAY, true)); 13507ec681f3Smrg break; 13517ec681f3Smrg 13527ec681f3Smrg case nir_intrinsic_load_sample_mask_in: 13537ec681f3Smrg /* r61[0:15] contains the coverage bitmap */ 13547ec681f3Smrg bi_u16_to_u32_to(b, dst, bi_half(bi_register(61), false)); 13557ec681f3Smrg break; 13567ec681f3Smrg 13577ec681f3Smrg case nir_intrinsic_load_sample_id: 13587ec681f3Smrg bi_load_sample_id_to(b, dst); 13597ec681f3Smrg break; 13607ec681f3Smrg 13617ec681f3Smrg case nir_intrinsic_load_front_face: 13627ec681f3Smrg /* r58 == 0 means primitive is front facing */ 13637ec681f3Smrg bi_icmp_i32_to(b, dst, bi_register(58), bi_zero(), BI_CMPF_EQ, 13647ec681f3Smrg BI_RESULT_TYPE_M1); 13657ec681f3Smrg break; 13667ec681f3Smrg 13677ec681f3Smrg case nir_intrinsic_load_point_coord: 13687ec681f3Smrg bi_ld_var_special_to(b, dst, bi_zero(), BI_REGISTER_FORMAT_F32, 13697ec681f3Smrg BI_SAMPLE_CENTER, BI_UPDATE_CLOBBER, 13707ec681f3Smrg BI_VARYING_NAME_POINT, BI_VECSIZE_V2); 13717ec681f3Smrg break; 13727ec681f3Smrg 13737ec681f3Smrg case nir_intrinsic_load_vertex_id_zero_base: 13747ec681f3Smrg bi_mov_i32_to(b, dst, bi_register(61)); 13757ec681f3Smrg break; 13767ec681f3Smrg 13777ec681f3Smrg case nir_intrinsic_load_instance_id: 13787ec681f3Smrg bi_mov_i32_to(b, dst, bi_register(62)); 13797ec681f3Smrg break; 13807ec681f3Smrg 13817ec681f3Smrg case nir_intrinsic_load_subgroup_invocation: 13827ec681f3Smrg bi_mov_i32_to(b, dst, bi_fau(BIR_FAU_LANE_ID, false)); 13837ec681f3Smrg break; 13847ec681f3Smrg 13857ec681f3Smrg case nir_intrinsic_load_local_invocation_id: 13867ec681f3Smrg for (unsigned i = 0; i < 3; ++i) 13877ec681f3Smrg bi_u16_to_u32_to(b, bi_word(dst, i), 13887ec681f3Smrg bi_half(bi_register(55 + i / 2), i % 2)); 13897ec681f3Smrg break; 13907ec681f3Smrg 13917ec681f3Smrg case nir_intrinsic_load_workgroup_id: 13927ec681f3Smrg for (unsigned i = 0; i < 3; ++i) 13937ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, i), bi_register(57 + i)); 13947ec681f3Smrg break; 13957ec681f3Smrg 13967ec681f3Smrg case nir_intrinsic_load_global_invocation_id: 13977ec681f3Smrg case nir_intrinsic_load_global_invocation_id_zero_base: 13987ec681f3Smrg for (unsigned i = 0; i < 3; ++i) 13997ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, i), bi_register(60 + i)); 14007ec681f3Smrg break; 14017ec681f3Smrg 14027ec681f3Smrg case nir_intrinsic_shader_clock: 14037ec681f3Smrg bi_ld_gclk_u64_to(b, dst, BI_SOURCE_CYCLE_COUNTER); 14047ec681f3Smrg break; 14057ec681f3Smrg 14067ec681f3Smrg default: 14077ec681f3Smrg fprintf(stderr, "Unhandled intrinsic %s\n", nir_intrinsic_infos[instr->intrinsic].name); 14087ec681f3Smrg assert(0); 14097ec681f3Smrg } 14107ec681f3Smrg} 14117ec681f3Smrg 14127ec681f3Smrgstatic void 14137ec681f3Smrgbi_emit_load_const(bi_builder *b, nir_load_const_instr *instr) 14147ec681f3Smrg{ 14157ec681f3Smrg /* Make sure we've been lowered */ 14167ec681f3Smrg assert(instr->def.num_components <= (32 / instr->def.bit_size)); 14177ec681f3Smrg 14187ec681f3Smrg /* Accumulate all the channels of the constant, as if we did an 14197ec681f3Smrg * implicit SEL over them */ 14207ec681f3Smrg uint32_t acc = 0; 14217ec681f3Smrg 14227ec681f3Smrg for (unsigned i = 0; i < instr->def.num_components; ++i) { 14237ec681f3Smrg uint32_t v = nir_const_value_as_uint(instr->value[i], instr->def.bit_size); 14247ec681f3Smrg 14257ec681f3Smrg v = bi_extend_constant(v, instr->def.bit_size); 14267ec681f3Smrg acc |= (v << (i * instr->def.bit_size)); 14277ec681f3Smrg } 14287ec681f3Smrg 14297ec681f3Smrg bi_mov_i32_to(b, bi_get_index(instr->def.index, false, 0), bi_imm_u32(acc)); 14307ec681f3Smrg} 14317ec681f3Smrg 14327ec681f3Smrgstatic bi_index 14337ec681f3Smrgbi_alu_src_index(nir_alu_src src, unsigned comps) 14347ec681f3Smrg{ 14357ec681f3Smrg /* we don't lower modifiers until the backend */ 14367ec681f3Smrg assert(!(src.negate || src.abs)); 14377ec681f3Smrg 14387ec681f3Smrg unsigned bitsize = nir_src_bit_size(src.src); 14397ec681f3Smrg 14407ec681f3Smrg /* TODO: Do we need to do something more clever with 1-bit bools? */ 14417ec681f3Smrg if (bitsize == 1) 14427ec681f3Smrg bitsize = 16; 14437ec681f3Smrg 14447ec681f3Smrg /* the bi_index carries the 32-bit (word) offset separate from the 14457ec681f3Smrg * subword swizzle, first handle the offset */ 14467ec681f3Smrg 14477ec681f3Smrg unsigned offset = 0; 14487ec681f3Smrg 14497ec681f3Smrg assert(bitsize == 8 || bitsize == 16 || bitsize == 32); 14507ec681f3Smrg unsigned subword_shift = (bitsize == 32) ? 0 : (bitsize == 16) ? 1 : 2; 14517ec681f3Smrg 14527ec681f3Smrg for (unsigned i = 0; i < comps; ++i) { 14537ec681f3Smrg unsigned new_offset = (src.swizzle[i] >> subword_shift); 14547ec681f3Smrg 14557ec681f3Smrg if (i > 0) 14567ec681f3Smrg assert(offset == new_offset && "wrong vectorization"); 14577ec681f3Smrg 14587ec681f3Smrg offset = new_offset; 14597ec681f3Smrg } 14607ec681f3Smrg 14617ec681f3Smrg bi_index idx = bi_word(bi_src_index(&src.src), offset); 14627ec681f3Smrg 14637ec681f3Smrg /* Compose the subword swizzle with existing (identity) swizzle */ 14647ec681f3Smrg assert(idx.swizzle == BI_SWIZZLE_H01); 14657ec681f3Smrg 14667ec681f3Smrg /* Bigger vectors should have been lowered */ 14677ec681f3Smrg assert(comps <= (1 << subword_shift)); 14687ec681f3Smrg 14697ec681f3Smrg if (bitsize == 16) { 14707ec681f3Smrg unsigned c0 = src.swizzle[0] & 1; 14717ec681f3Smrg unsigned c1 = (comps > 1) ? src.swizzle[1] & 1 : c0; 14727ec681f3Smrg idx.swizzle = BI_SWIZZLE_H00 + c1 + (c0 << 1); 14737ec681f3Smrg } else if (bitsize == 8) { 14747ec681f3Smrg /* 8-bit vectors not yet supported */ 14757ec681f3Smrg assert(comps == 1 && "8-bit vectors not supported"); 14767ec681f3Smrg assert(src.swizzle[0] < 4 && "8-bit vectors not supported"); 14777ec681f3Smrg idx.swizzle = BI_SWIZZLE_B0000 + src.swizzle[0]; 14787ec681f3Smrg } 14797ec681f3Smrg 14807ec681f3Smrg return idx; 14817ec681f3Smrg} 14827ec681f3Smrg 14837ec681f3Smrgstatic enum bi_round 14847ec681f3Smrgbi_nir_round(nir_op op) 14857ec681f3Smrg{ 14867ec681f3Smrg switch (op) { 14877ec681f3Smrg case nir_op_fround_even: return BI_ROUND_NONE; 14887ec681f3Smrg case nir_op_ftrunc: return BI_ROUND_RTZ; 14897ec681f3Smrg case nir_op_fceil: return BI_ROUND_RTP; 14907ec681f3Smrg case nir_op_ffloor: return BI_ROUND_RTN; 14917ec681f3Smrg default: unreachable("invalid nir round op"); 14927ec681f3Smrg } 14937ec681f3Smrg} 14947ec681f3Smrg 14957ec681f3Smrg/* Convenience for lowered transcendentals */ 14967ec681f3Smrg 14977ec681f3Smrgstatic bi_index 14987ec681f3Smrgbi_fmul_f32(bi_builder *b, bi_index s0, bi_index s1) 14997ec681f3Smrg{ 15007ec681f3Smrg return bi_fma_f32(b, s0, s1, bi_imm_f32(-0.0f), BI_ROUND_NONE); 15017ec681f3Smrg} 15027ec681f3Smrg 15037ec681f3Smrg/* Approximate with FRCP_APPROX.f32 and apply a single iteration of 15047ec681f3Smrg * Newton-Raphson to improve precision */ 15057ec681f3Smrg 15067ec681f3Smrgstatic void 15077ec681f3Smrgbi_lower_frcp_32(bi_builder *b, bi_index dst, bi_index s0) 15087ec681f3Smrg{ 15097ec681f3Smrg bi_index x1 = bi_frcp_approx_f32(b, s0); 15107ec681f3Smrg bi_index m = bi_frexpm_f32(b, s0, false, false); 15117ec681f3Smrg bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, false); 15127ec681f3Smrg bi_index t1 = bi_fma_rscale_f32(b, m, bi_neg(x1), bi_imm_f32(1.0), 15137ec681f3Smrg bi_zero(), BI_ROUND_NONE, BI_SPECIAL_N); 15147ec681f3Smrg bi_fma_rscale_f32_to(b, dst, t1, x1, x1, e, 15157ec681f3Smrg BI_ROUND_NONE, BI_SPECIAL_NONE); 15167ec681f3Smrg} 15177ec681f3Smrg 15187ec681f3Smrgstatic void 15197ec681f3Smrgbi_lower_frsq_32(bi_builder *b, bi_index dst, bi_index s0) 15207ec681f3Smrg{ 15217ec681f3Smrg bi_index x1 = bi_frsq_approx_f32(b, s0); 15227ec681f3Smrg bi_index m = bi_frexpm_f32(b, s0, false, true); 15237ec681f3Smrg bi_index e = bi_frexpe_f32(b, bi_neg(s0), false, true); 15247ec681f3Smrg bi_index t1 = bi_fmul_f32(b, x1, x1); 15257ec681f3Smrg bi_index t2 = bi_fma_rscale_f32(b, m, bi_neg(t1), bi_imm_f32(1.0), 15267ec681f3Smrg bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_N); 15277ec681f3Smrg bi_fma_rscale_f32_to(b, dst, t2, x1, x1, e, 15287ec681f3Smrg BI_ROUND_NONE, BI_SPECIAL_N); 15297ec681f3Smrg} 15307ec681f3Smrg 15317ec681f3Smrg/* More complex transcendentals, see 15327ec681f3Smrg * https://gitlab.freedesktop.org/panfrost/mali-isa-docs/-/blob/master/Bifrost.adoc 15337ec681f3Smrg * for documentation */ 15347ec681f3Smrg 15357ec681f3Smrgstatic void 15367ec681f3Smrgbi_lower_fexp2_32(bi_builder *b, bi_index dst, bi_index s0) 15377ec681f3Smrg{ 15387ec681f3Smrg bi_index t1 = bi_temp(b->shader); 15397ec681f3Smrg bi_instr *t1_instr = bi_fadd_f32_to(b, t1, 15407ec681f3Smrg s0, bi_imm_u32(0x49400000), BI_ROUND_NONE); 15417ec681f3Smrg t1_instr->clamp = BI_CLAMP_CLAMP_0_INF; 15427ec681f3Smrg 15437ec681f3Smrg bi_index t2 = bi_fadd_f32(b, t1, bi_imm_u32(0xc9400000), BI_ROUND_NONE); 15447ec681f3Smrg 15457ec681f3Smrg bi_instr *a2 = bi_fadd_f32_to(b, bi_temp(b->shader), 15467ec681f3Smrg s0, bi_neg(t2), BI_ROUND_NONE); 15477ec681f3Smrg a2->clamp = BI_CLAMP_CLAMP_M1_1; 15487ec681f3Smrg 15497ec681f3Smrg bi_index a1t = bi_fexp_table_u4(b, t1, BI_ADJ_NONE); 15507ec681f3Smrg bi_index t3 = bi_isub_u32(b, t1, bi_imm_u32(0x49400000), false); 15517ec681f3Smrg bi_index a1i = bi_arshift_i32(b, t3, bi_null(), bi_imm_u8(4)); 15527ec681f3Smrg bi_index p1 = bi_fma_f32(b, a2->dest[0], bi_imm_u32(0x3d635635), 15537ec681f3Smrg bi_imm_u32(0x3e75fffa), BI_ROUND_NONE); 15547ec681f3Smrg bi_index p2 = bi_fma_f32(b, p1, a2->dest[0], 15557ec681f3Smrg bi_imm_u32(0x3f317218), BI_ROUND_NONE); 15567ec681f3Smrg bi_index p3 = bi_fmul_f32(b, a2->dest[0], p2); 15577ec681f3Smrg bi_instr *x = bi_fma_rscale_f32_to(b, bi_temp(b->shader), 15587ec681f3Smrg p3, a1t, a1t, a1i, BI_ROUND_NONE, BI_SPECIAL_NONE); 15597ec681f3Smrg x->clamp = BI_CLAMP_CLAMP_0_INF; 15607ec681f3Smrg 15617ec681f3Smrg bi_instr *max = bi_fmax_f32_to(b, dst, x->dest[0], s0); 15627ec681f3Smrg max->sem = BI_SEM_NAN_PROPAGATE; 15637ec681f3Smrg} 15647ec681f3Smrg 15657ec681f3Smrgstatic void 15667ec681f3Smrgbi_fexp_32(bi_builder *b, bi_index dst, bi_index s0, bi_index log2_base) 15677ec681f3Smrg{ 15687ec681f3Smrg /* Scale by base, Multiply by 2*24 and convert to integer to get a 8:24 15697ec681f3Smrg * fixed-point input */ 15707ec681f3Smrg bi_index scale = bi_fma_rscale_f32(b, s0, log2_base, bi_negzero(), 15717ec681f3Smrg bi_imm_u32(24), BI_ROUND_NONE, BI_SPECIAL_NONE); 15727ec681f3Smrg bi_index fixed_pt = bi_f32_to_s32(b, scale, BI_ROUND_NONE); 15737ec681f3Smrg 15747ec681f3Smrg /* Compute the result for the fixed-point input, but pass along 15757ec681f3Smrg * the floating-point scale for correct NaN propagation */ 15767ec681f3Smrg bi_fexp_f32_to(b, dst, fixed_pt, scale); 15777ec681f3Smrg} 15787ec681f3Smrg 15797ec681f3Smrgstatic void 15807ec681f3Smrgbi_lower_flog2_32(bi_builder *b, bi_index dst, bi_index s0) 15817ec681f3Smrg{ 15827ec681f3Smrg /* s0 = a1 * 2^e, with a1 in [0.75, 1.5) */ 15837ec681f3Smrg bi_index a1 = bi_frexpm_f32(b, s0, true, false); 15847ec681f3Smrg bi_index ei = bi_frexpe_f32(b, s0, true, false); 15857ec681f3Smrg bi_index ef = bi_s32_to_f32(b, ei, BI_ROUND_RTZ); 15867ec681f3Smrg 15877ec681f3Smrg /* xt estimates -log(r1), a coarse approximation of log(a1) */ 15887ec681f3Smrg bi_index r1 = bi_flog_table_f32(b, s0, BI_MODE_RED, BI_PRECISION_NONE); 15897ec681f3Smrg bi_index xt = bi_flog_table_f32(b, s0, BI_MODE_BASE2, BI_PRECISION_NONE); 15907ec681f3Smrg 15917ec681f3Smrg /* log(s0) = log(a1 * 2^e) = e + log(a1) = e + log(a1 * r1) - 15927ec681f3Smrg * log(r1), so let x1 = e - log(r1) ~= e + xt and x2 = log(a1 * r1), 15937ec681f3Smrg * and then log(s0) = x1 + x2 */ 15947ec681f3Smrg bi_index x1 = bi_fadd_f32(b, ef, xt, BI_ROUND_NONE); 15957ec681f3Smrg 15967ec681f3Smrg /* Since a1 * r1 is close to 1, x2 = log(a1 * r1) may be computed by 15977ec681f3Smrg * polynomial approximation around 1. The series is expressed around 15987ec681f3Smrg * 1, so set y = (a1 * r1) - 1.0 */ 15997ec681f3Smrg bi_index y = bi_fma_f32(b, a1, r1, bi_imm_f32(-1.0), BI_ROUND_NONE); 16007ec681f3Smrg 16017ec681f3Smrg /* x2 = log_2(1 + y) = log_e(1 + y) * (1/log_e(2)), so approximate 16027ec681f3Smrg * log_e(1 + y) by the Taylor series (lower precision than the blob): 16037ec681f3Smrg * y - y^2/2 + O(y^3) = y(1 - y/2) + O(y^3) */ 16047ec681f3Smrg bi_index loge = bi_fmul_f32(b, y, 16057ec681f3Smrg bi_fma_f32(b, y, bi_imm_f32(-0.5), bi_imm_f32(1.0), BI_ROUND_NONE)); 16067ec681f3Smrg 16077ec681f3Smrg bi_index x2 = bi_fmul_f32(b, loge, bi_imm_f32(1.0 / logf(2.0))); 16087ec681f3Smrg 16097ec681f3Smrg /* log(s0) = x1 + x2 */ 16107ec681f3Smrg bi_fadd_f32_to(b, dst, x1, x2, BI_ROUND_NONE); 16117ec681f3Smrg} 16127ec681f3Smrg 16137ec681f3Smrgstatic void 16147ec681f3Smrgbi_flog2_32(bi_builder *b, bi_index dst, bi_index s0) 16157ec681f3Smrg{ 16167ec681f3Smrg bi_index frexp = bi_frexpe_f32(b, s0, true, false); 16177ec681f3Smrg bi_index frexpi = bi_s32_to_f32(b, frexp, BI_ROUND_RTZ); 16187ec681f3Smrg bi_index add = bi_fadd_lscale_f32(b, bi_imm_f32(-1.0f), s0); 16197ec681f3Smrg bi_fma_f32_to(b, dst, bi_flogd_f32(b, s0), add, frexpi, 16207ec681f3Smrg BI_ROUND_NONE); 16217ec681f3Smrg} 16227ec681f3Smrg 16237ec681f3Smrgstatic void 16247ec681f3Smrgbi_lower_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) 16257ec681f3Smrg{ 16267ec681f3Smrg bi_index log2_base = bi_null(); 16277ec681f3Smrg 16287ec681f3Smrg if (base.type == BI_INDEX_CONSTANT) { 16297ec681f3Smrg log2_base = bi_imm_f32(log2f(uif(base.value))); 16307ec681f3Smrg } else { 16317ec681f3Smrg log2_base = bi_temp(b->shader); 16327ec681f3Smrg bi_lower_flog2_32(b, log2_base, base); 16337ec681f3Smrg } 16347ec681f3Smrg 16357ec681f3Smrg return bi_lower_fexp2_32(b, dst, bi_fmul_f32(b, exp, log2_base)); 16367ec681f3Smrg} 16377ec681f3Smrg 16387ec681f3Smrgstatic void 16397ec681f3Smrgbi_fpow_32(bi_builder *b, bi_index dst, bi_index base, bi_index exp) 16407ec681f3Smrg{ 16417ec681f3Smrg bi_index log2_base = bi_null(); 16427ec681f3Smrg 16437ec681f3Smrg if (base.type == BI_INDEX_CONSTANT) { 16447ec681f3Smrg log2_base = bi_imm_f32(log2f(uif(base.value))); 16457ec681f3Smrg } else { 16467ec681f3Smrg log2_base = bi_temp(b->shader); 16477ec681f3Smrg bi_flog2_32(b, log2_base, base); 16487ec681f3Smrg } 16497ec681f3Smrg 16507ec681f3Smrg return bi_fexp_32(b, dst, exp, log2_base); 16517ec681f3Smrg} 16527ec681f3Smrg 16537ec681f3Smrg/* Bifrost has extremely coarse tables for approximating sin/cos, accessible as 16547ec681f3Smrg * FSIN/COS_TABLE.u6, which multiplies the bottom 6-bits by pi/32 and 16557ec681f3Smrg * calculates the results. We use them to calculate sin/cos via a Taylor 16567ec681f3Smrg * approximation: 16577ec681f3Smrg * 16587ec681f3Smrg * f(x + e) = f(x) + e f'(x) + (e^2)/2 f''(x) 16597ec681f3Smrg * sin(x + e) = sin(x) + e cos(x) - (e^2)/2 sin(x) 16607ec681f3Smrg * cos(x + e) = cos(x) - e sin(x) - (e^2)/2 cos(x) 16617ec681f3Smrg */ 16627ec681f3Smrg 16637ec681f3Smrg#define TWO_OVER_PI bi_imm_f32(2.0f / 3.14159f) 16647ec681f3Smrg#define MPI_OVER_TWO bi_imm_f32(-3.14159f / 2.0) 16657ec681f3Smrg#define SINCOS_BIAS bi_imm_u32(0x49400000) 16667ec681f3Smrg 16677ec681f3Smrgstatic void 16687ec681f3Smrgbi_lower_fsincos_32(bi_builder *b, bi_index dst, bi_index s0, bool cos) 16697ec681f3Smrg{ 16707ec681f3Smrg /* bottom 6-bits of result times pi/32 approximately s0 mod 2pi */ 16717ec681f3Smrg bi_index x_u6 = bi_fma_f32(b, s0, TWO_OVER_PI, SINCOS_BIAS, BI_ROUND_NONE); 16727ec681f3Smrg 16737ec681f3Smrg /* Approximate domain error (small) */ 16747ec681f3Smrg bi_index e = bi_fma_f32(b, bi_fadd_f32(b, x_u6, bi_neg(SINCOS_BIAS), 16757ec681f3Smrg BI_ROUND_NONE), 16767ec681f3Smrg MPI_OVER_TWO, s0, BI_ROUND_NONE); 16777ec681f3Smrg 16787ec681f3Smrg /* Lookup sin(x), cos(x) */ 16797ec681f3Smrg bi_index sinx = bi_fsin_table_u6(b, x_u6, false); 16807ec681f3Smrg bi_index cosx = bi_fcos_table_u6(b, x_u6, false); 16817ec681f3Smrg 16827ec681f3Smrg /* e^2 / 2 */ 16837ec681f3Smrg bi_index e2_over_2 = bi_fma_rscale_f32(b, e, e, bi_negzero(), 16847ec681f3Smrg bi_imm_u32(-1), BI_ROUND_NONE, BI_SPECIAL_NONE); 16857ec681f3Smrg 16867ec681f3Smrg /* (-e^2)/2 f''(x) */ 16877ec681f3Smrg bi_index quadratic = bi_fma_f32(b, bi_neg(e2_over_2), 16887ec681f3Smrg cos ? cosx : sinx, 16897ec681f3Smrg bi_negzero(), BI_ROUND_NONE); 16907ec681f3Smrg 16917ec681f3Smrg /* e f'(x) - (e^2/2) f''(x) */ 16927ec681f3Smrg bi_instr *I = bi_fma_f32_to(b, bi_temp(b->shader), e, 16937ec681f3Smrg cos ? bi_neg(sinx) : cosx, 16947ec681f3Smrg quadratic, BI_ROUND_NONE); 16957ec681f3Smrg I->clamp = BI_CLAMP_CLAMP_M1_1; 16967ec681f3Smrg 16977ec681f3Smrg /* f(x) + e f'(x) - (e^2/2) f''(x) */ 16987ec681f3Smrg bi_fadd_f32_to(b, dst, I->dest[0], cos ? cosx : sinx, BI_ROUND_NONE); 16997ec681f3Smrg} 17007ec681f3Smrg 17017ec681f3Smrg/* The XOR lane op is useful for derivative calculation, but was added in v7. 17027ec681f3Smrg * Add a safe helper that will do the appropriate lowering on v6 */ 17037ec681f3Smrg 17047ec681f3Smrgstatic bi_index 17057ec681f3Smrgbi_clper_xor(bi_builder *b, bi_index s0, bi_index s1) 17067ec681f3Smrg{ 17077ec681f3Smrg if (b->shader->arch >= 7) { 17087ec681f3Smrg return bi_clper_i32(b, s0, s1, 17097ec681f3Smrg BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_XOR, 17107ec681f3Smrg BI_SUBGROUP_SUBGROUP4); 17117ec681f3Smrg } 17127ec681f3Smrg 17137ec681f3Smrg bi_index lane_id = bi_fau(BIR_FAU_LANE_ID, false); 17147ec681f3Smrg bi_index lane = bi_lshift_xor_i32(b, lane_id, s1, bi_imm_u8(0)); 17157ec681f3Smrg return bi_clper_v6_i32(b, s0, lane); 17167ec681f3Smrg} 17177ec681f3Smrg 17187ec681f3Smrgstatic bi_instr * 17197ec681f3Smrgbi_emit_alu_bool(bi_builder *b, unsigned sz, nir_op op, 17207ec681f3Smrg bi_index dst, bi_index s0, bi_index s1, bi_index s2) 17217ec681f3Smrg{ 17227ec681f3Smrg /* Handle 1-bit bools as 0/~0 by default and let the optimizer deal 17237ec681f3Smrg * with the bit patterns later. 0/~0 has the nice property of being 17247ec681f3Smrg * independent of replicated vectorization. */ 17257ec681f3Smrg if (sz == 1) sz = 16; 17267ec681f3Smrg bi_index f = bi_zero(); 17277ec681f3Smrg bi_index t = bi_imm_u16(0xFFFF); 17287ec681f3Smrg 17297ec681f3Smrg switch (op) { 17307ec681f3Smrg case nir_op_feq: 17317ec681f3Smrg return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1); 17327ec681f3Smrg case nir_op_flt: 17337ec681f3Smrg return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1); 17347ec681f3Smrg case nir_op_fge: 17357ec681f3Smrg return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1); 17367ec681f3Smrg case nir_op_fneu: 17377ec681f3Smrg return bi_fcmp_to(b, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1); 17387ec681f3Smrg 17397ec681f3Smrg case nir_op_ieq: 17407ec681f3Smrg return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_EQ, BI_RESULT_TYPE_M1); 17417ec681f3Smrg case nir_op_ine: 17427ec681f3Smrg return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_NE, BI_RESULT_TYPE_M1); 17437ec681f3Smrg case nir_op_ilt: 17447ec681f3Smrg return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1); 17457ec681f3Smrg case nir_op_ige: 17467ec681f3Smrg return bi_icmp_to(b, nir_type_int, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1); 17477ec681f3Smrg case nir_op_ult: 17487ec681f3Smrg return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_LT, BI_RESULT_TYPE_M1); 17497ec681f3Smrg case nir_op_uge: 17507ec681f3Smrg return bi_icmp_to(b, nir_type_uint, sz, dst, s0, s1, BI_CMPF_GE, BI_RESULT_TYPE_M1); 17517ec681f3Smrg 17527ec681f3Smrg case nir_op_iand: 17537ec681f3Smrg return bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 17547ec681f3Smrg case nir_op_ior: 17557ec681f3Smrg return bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 17567ec681f3Smrg case nir_op_ixor: 17577ec681f3Smrg return bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 17587ec681f3Smrg case nir_op_inot: 17597ec681f3Smrg return bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); 17607ec681f3Smrg 17617ec681f3Smrg case nir_op_f2b1: 17627ec681f3Smrg return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ); 17637ec681f3Smrg case nir_op_i2b1: 17647ec681f3Smrg return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ); 17657ec681f3Smrg case nir_op_b2b1: 17667ec681f3Smrg return bi_csel_to(b, nir_type_int, sz, dst, s0, f, f, t, BI_CMPF_EQ); 17677ec681f3Smrg 17687ec681f3Smrg case nir_op_bcsel: 17697ec681f3Smrg return bi_csel_to(b, nir_type_int, sz, dst, s0, f, s1, s2, BI_CMPF_NE); 17707ec681f3Smrg 17717ec681f3Smrg default: 17727ec681f3Smrg fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[op].name); 17737ec681f3Smrg unreachable("Unhandled boolean ALU instruction"); 17747ec681f3Smrg } 17757ec681f3Smrg} 17767ec681f3Smrg 17777ec681f3Smrgstatic void 17787ec681f3Smrgbi_emit_alu(bi_builder *b, nir_alu_instr *instr) 17797ec681f3Smrg{ 17807ec681f3Smrg bi_index dst = bi_dest_index(&instr->dest.dest); 17817ec681f3Smrg unsigned srcs = nir_op_infos[instr->op].num_inputs; 17827ec681f3Smrg unsigned sz = nir_dest_bit_size(instr->dest.dest); 17837ec681f3Smrg unsigned comps = nir_dest_num_components(instr->dest.dest); 17847ec681f3Smrg unsigned src_sz = srcs > 0 ? nir_src_bit_size(instr->src[0].src) : 0; 17857ec681f3Smrg unsigned src1_sz = srcs > 1 ? nir_src_bit_size(instr->src[1].src) : 0; 17867ec681f3Smrg bool is_bool = (sz == 1); 17877ec681f3Smrg 17887ec681f3Smrg /* TODO: Anything else? */ 17897ec681f3Smrg if (sz == 1) 17907ec681f3Smrg sz = 16; 17917ec681f3Smrg 17927ec681f3Smrg /* Indicate scalarness */ 17937ec681f3Smrg if (sz == 16 && comps == 1) 17947ec681f3Smrg dst.swizzle = BI_SWIZZLE_H00; 17957ec681f3Smrg 17967ec681f3Smrg if (!instr->dest.dest.is_ssa) { 17977ec681f3Smrg for (unsigned i = 0; i < comps; ++i) 17987ec681f3Smrg assert(instr->dest.write_mask); 17997ec681f3Smrg } 18007ec681f3Smrg 18017ec681f3Smrg /* First, match against the various moves in NIR. These are 18027ec681f3Smrg * special-cased because they can operate on vectors even after 18037ec681f3Smrg * lowering ALU to scalar. For Bifrost, bi_alu_src_index assumes the 18047ec681f3Smrg * instruction is no "bigger" than SIMD-within-a-register. These moves 18057ec681f3Smrg * are the exceptions that need to handle swizzles specially. */ 18067ec681f3Smrg 18077ec681f3Smrg switch (instr->op) { 18087ec681f3Smrg case nir_op_pack_32_2x16: 18097ec681f3Smrg case nir_op_vec2: 18107ec681f3Smrg case nir_op_vec3: 18117ec681f3Smrg case nir_op_vec4: { 18127ec681f3Smrg bi_index unoffset_srcs[4] = { 18137ec681f3Smrg srcs > 0 ? bi_src_index(&instr->src[0].src) : bi_null(), 18147ec681f3Smrg srcs > 1 ? bi_src_index(&instr->src[1].src) : bi_null(), 18157ec681f3Smrg srcs > 2 ? bi_src_index(&instr->src[2].src) : bi_null(), 18167ec681f3Smrg srcs > 3 ? bi_src_index(&instr->src[3].src) : bi_null(), 18177ec681f3Smrg }; 18187ec681f3Smrg 18197ec681f3Smrg unsigned channels[4] = { 18207ec681f3Smrg instr->src[0].swizzle[0], 18217ec681f3Smrg instr->src[1].swizzle[0], 18227ec681f3Smrg srcs > 2 ? instr->src[2].swizzle[0] : 0, 18237ec681f3Smrg srcs > 3 ? instr->src[3].swizzle[0] : 0, 18247ec681f3Smrg }; 18257ec681f3Smrg 18267ec681f3Smrg bi_make_vec_to(b, dst, unoffset_srcs, channels, srcs, sz); 18277ec681f3Smrg return; 18287ec681f3Smrg } 18297ec681f3Smrg 18307ec681f3Smrg case nir_op_vec8: 18317ec681f3Smrg case nir_op_vec16: 18327ec681f3Smrg unreachable("should've been lowered"); 18337ec681f3Smrg 18347ec681f3Smrg case nir_op_unpack_32_2x16: 18357ec681f3Smrg case nir_op_unpack_64_2x32_split_x: 18367ec681f3Smrg bi_mov_i32_to(b, dst, bi_src_index(&instr->src[0].src)); 18377ec681f3Smrg return; 18387ec681f3Smrg 18397ec681f3Smrg case nir_op_unpack_64_2x32_split_y: 18407ec681f3Smrg bi_mov_i32_to(b, dst, bi_word(bi_src_index(&instr->src[0].src), 1)); 18417ec681f3Smrg return; 18427ec681f3Smrg 18437ec681f3Smrg case nir_op_pack_64_2x32_split: 18447ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 0), bi_src_index(&instr->src[0].src)); 18457ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 1), bi_src_index(&instr->src[1].src)); 18467ec681f3Smrg return; 18477ec681f3Smrg 18487ec681f3Smrg case nir_op_pack_64_2x32: 18497ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 0), bi_word(bi_src_index(&instr->src[0].src), 0)); 18507ec681f3Smrg bi_mov_i32_to(b, bi_word(dst, 1), bi_word(bi_src_index(&instr->src[0].src), 1)); 18517ec681f3Smrg return; 18527ec681f3Smrg 18537ec681f3Smrg case nir_op_mov: { 18547ec681f3Smrg bi_index idx = bi_src_index(&instr->src[0].src); 18557ec681f3Smrg bi_index unoffset_srcs[4] = { idx, idx, idx, idx }; 18567ec681f3Smrg 18577ec681f3Smrg unsigned channels[4] = { 18587ec681f3Smrg comps > 0 ? instr->src[0].swizzle[0] : 0, 18597ec681f3Smrg comps > 1 ? instr->src[0].swizzle[1] : 0, 18607ec681f3Smrg comps > 2 ? instr->src[0].swizzle[2] : 0, 18617ec681f3Smrg comps > 3 ? instr->src[0].swizzle[3] : 0, 18627ec681f3Smrg }; 18637ec681f3Smrg 18647ec681f3Smrg if (sz == 1) sz = 16; 18657ec681f3Smrg bi_make_vec_to(b, dst, unoffset_srcs, channels, comps, sz); 18667ec681f3Smrg return; 18677ec681f3Smrg } 18687ec681f3Smrg 18697ec681f3Smrg case nir_op_f2f16: 18707ec681f3Smrg assert(src_sz == 32); 18717ec681f3Smrg bi_index idx = bi_src_index(&instr->src[0].src); 18727ec681f3Smrg bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]); 18737ec681f3Smrg bi_index s1 = comps > 1 ? 18747ec681f3Smrg bi_word(idx, instr->src[0].swizzle[1]) : s0; 18757ec681f3Smrg 18767ec681f3Smrg bi_v2f32_to_v2f16_to(b, dst, s0, s1, BI_ROUND_NONE); 18777ec681f3Smrg return; 18787ec681f3Smrg 18797ec681f3Smrg /* Vectorized downcasts */ 18807ec681f3Smrg case nir_op_u2u16: 18817ec681f3Smrg case nir_op_i2i16: { 18827ec681f3Smrg if (!(src_sz == 32 && comps == 2)) 18837ec681f3Smrg break; 18847ec681f3Smrg 18857ec681f3Smrg bi_index idx = bi_src_index(&instr->src[0].src); 18867ec681f3Smrg bi_index s0 = bi_word(idx, instr->src[0].swizzle[0]); 18877ec681f3Smrg bi_index s1 = bi_word(idx, instr->src[0].swizzle[1]); 18887ec681f3Smrg 18897ec681f3Smrg bi_mkvec_v2i16_to(b, dst, 18907ec681f3Smrg bi_half(s0, false), bi_half(s1, false)); 18917ec681f3Smrg return; 18927ec681f3Smrg } 18937ec681f3Smrg 18947ec681f3Smrg case nir_op_i2i8: 18957ec681f3Smrg case nir_op_u2u8: 18967ec681f3Smrg { 18977ec681f3Smrg /* Acts like an 8-bit swizzle */ 18987ec681f3Smrg bi_index idx = bi_src_index(&instr->src[0].src); 18997ec681f3Smrg unsigned factor = src_sz / 8; 19007ec681f3Smrg unsigned chan[4] = { 0 }; 19017ec681f3Smrg 19027ec681f3Smrg for (unsigned i = 0; i < comps; ++i) 19037ec681f3Smrg chan[i] = instr->src[0].swizzle[i] * factor; 19047ec681f3Smrg 19057ec681f3Smrg bi_make_vec_to(b, dst, &idx, chan, comps, 8); 19067ec681f3Smrg return; 19077ec681f3Smrg } 19087ec681f3Smrg 19097ec681f3Smrg default: 19107ec681f3Smrg break; 19117ec681f3Smrg } 19127ec681f3Smrg 19137ec681f3Smrg bi_index s0 = srcs > 0 ? bi_alu_src_index(instr->src[0], comps) : bi_null(); 19147ec681f3Smrg bi_index s1 = srcs > 1 ? bi_alu_src_index(instr->src[1], comps) : bi_null(); 19157ec681f3Smrg bi_index s2 = srcs > 2 ? bi_alu_src_index(instr->src[2], comps) : bi_null(); 19167ec681f3Smrg 19177ec681f3Smrg if (is_bool) { 19187ec681f3Smrg bi_emit_alu_bool(b, src_sz, instr->op, dst, s0, s1, s2); 19197ec681f3Smrg return; 19207ec681f3Smrg } 19217ec681f3Smrg 19227ec681f3Smrg switch (instr->op) { 19237ec681f3Smrg case nir_op_ffma: 19247ec681f3Smrg bi_fma_to(b, sz, dst, s0, s1, s2, BI_ROUND_NONE); 19257ec681f3Smrg break; 19267ec681f3Smrg 19277ec681f3Smrg case nir_op_fmul: 19287ec681f3Smrg bi_fma_to(b, sz, dst, s0, s1, bi_negzero(), BI_ROUND_NONE); 19297ec681f3Smrg break; 19307ec681f3Smrg 19317ec681f3Smrg case nir_op_fsub: 19327ec681f3Smrg s1 = bi_neg(s1); 19337ec681f3Smrg FALLTHROUGH; 19347ec681f3Smrg case nir_op_fadd: 19357ec681f3Smrg bi_fadd_to(b, sz, dst, s0, s1, BI_ROUND_NONE); 19367ec681f3Smrg break; 19377ec681f3Smrg 19387ec681f3Smrg case nir_op_fsat: { 19397ec681f3Smrg bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 19407ec681f3Smrg I->clamp = BI_CLAMP_CLAMP_0_1; 19417ec681f3Smrg break; 19427ec681f3Smrg } 19437ec681f3Smrg 19447ec681f3Smrg case nir_op_fsat_signed_mali: { 19457ec681f3Smrg bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 19467ec681f3Smrg I->clamp = BI_CLAMP_CLAMP_M1_1; 19477ec681f3Smrg break; 19487ec681f3Smrg } 19497ec681f3Smrg 19507ec681f3Smrg case nir_op_fclamp_pos_mali: { 19517ec681f3Smrg bi_instr *I = bi_fclamp_to(b, sz, dst, s0); 19527ec681f3Smrg I->clamp = BI_CLAMP_CLAMP_0_INF; 19537ec681f3Smrg break; 19547ec681f3Smrg } 19557ec681f3Smrg 19567ec681f3Smrg case nir_op_fneg: 19577ec681f3Smrg bi_fabsneg_to(b, sz, dst, bi_neg(s0)); 19587ec681f3Smrg break; 19597ec681f3Smrg 19607ec681f3Smrg case nir_op_fabs: 19617ec681f3Smrg bi_fabsneg_to(b, sz, dst, bi_abs(s0)); 19627ec681f3Smrg break; 19637ec681f3Smrg 19647ec681f3Smrg case nir_op_fsin: 19657ec681f3Smrg bi_lower_fsincos_32(b, dst, s0, false); 19667ec681f3Smrg break; 19677ec681f3Smrg 19687ec681f3Smrg case nir_op_fcos: 19697ec681f3Smrg bi_lower_fsincos_32(b, dst, s0, true); 19707ec681f3Smrg break; 19717ec681f3Smrg 19727ec681f3Smrg case nir_op_fexp2: 19737ec681f3Smrg assert(sz == 32); /* should've been lowered */ 19747ec681f3Smrg 19757ec681f3Smrg if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 19767ec681f3Smrg bi_lower_fexp2_32(b, dst, s0); 19777ec681f3Smrg else 19787ec681f3Smrg bi_fexp_32(b, dst, s0, bi_imm_f32(1.0f)); 19797ec681f3Smrg 19807ec681f3Smrg break; 19817ec681f3Smrg 19827ec681f3Smrg case nir_op_flog2: 19837ec681f3Smrg assert(sz == 32); /* should've been lowered */ 19847ec681f3Smrg 19857ec681f3Smrg if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 19867ec681f3Smrg bi_lower_flog2_32(b, dst, s0); 19877ec681f3Smrg else 19887ec681f3Smrg bi_flog2_32(b, dst, s0); 19897ec681f3Smrg 19907ec681f3Smrg break; 19917ec681f3Smrg 19927ec681f3Smrg case nir_op_fpow: 19937ec681f3Smrg assert(sz == 32); /* should've been lowered */ 19947ec681f3Smrg 19957ec681f3Smrg if (b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 19967ec681f3Smrg bi_lower_fpow_32(b, dst, s0, s1); 19977ec681f3Smrg else 19987ec681f3Smrg bi_fpow_32(b, dst, s0, s1); 19997ec681f3Smrg 20007ec681f3Smrg break; 20017ec681f3Smrg 20027ec681f3Smrg case nir_op_bcsel: 20037ec681f3Smrg if (src1_sz == 8) 20047ec681f3Smrg bi_mux_v4i8_to(b, dst, s2, s1, s0, BI_MUX_INT_ZERO); 20057ec681f3Smrg else 20067ec681f3Smrg bi_csel_to(b, nir_type_int, src1_sz, 20077ec681f3Smrg dst, s0, bi_zero(), s1, s2, BI_CMPF_NE); 20087ec681f3Smrg break; 20097ec681f3Smrg 20107ec681f3Smrg case nir_op_ishl: 20117ec681f3Smrg bi_lshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); 20127ec681f3Smrg break; 20137ec681f3Smrg case nir_op_ushr: 20147ec681f3Smrg bi_rshift_or_to(b, sz, dst, s0, bi_zero(), bi_byte(s1, 0)); 20157ec681f3Smrg break; 20167ec681f3Smrg 20177ec681f3Smrg case nir_op_ishr: 20187ec681f3Smrg bi_arshift_to(b, sz, dst, s0, bi_null(), bi_byte(s1, 0)); 20197ec681f3Smrg break; 20207ec681f3Smrg 20217ec681f3Smrg case nir_op_imin: 20227ec681f3Smrg case nir_op_umin: 20237ec681f3Smrg bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 20247ec681f3Smrg s0, s1, s0, s1, BI_CMPF_LT); 20257ec681f3Smrg break; 20267ec681f3Smrg 20277ec681f3Smrg case nir_op_imax: 20287ec681f3Smrg case nir_op_umax: 20297ec681f3Smrg bi_csel_to(b, nir_op_infos[instr->op].input_types[0], sz, dst, 20307ec681f3Smrg s0, s1, s0, s1, BI_CMPF_GT); 20317ec681f3Smrg break; 20327ec681f3Smrg 20337ec681f3Smrg case nir_op_fddx_must_abs_mali: 20347ec681f3Smrg case nir_op_fddy_must_abs_mali: { 20357ec681f3Smrg bi_index bit = bi_imm_u32(instr->op == nir_op_fddx_must_abs_mali ? 1 : 2); 20367ec681f3Smrg bi_index adjacent = bi_clper_xor(b, s0, bit); 20377ec681f3Smrg bi_fadd_to(b, sz, dst, adjacent, bi_neg(s0), BI_ROUND_NONE); 20387ec681f3Smrg break; 20397ec681f3Smrg } 20407ec681f3Smrg 20417ec681f3Smrg case nir_op_fddx: 20427ec681f3Smrg case nir_op_fddy: { 20437ec681f3Smrg bi_index lane1 = bi_lshift_and_i32(b, 20447ec681f3Smrg bi_fau(BIR_FAU_LANE_ID, false), 20457ec681f3Smrg bi_imm_u32(instr->op == nir_op_fddx ? 2 : 1), 20467ec681f3Smrg bi_imm_u8(0)); 20477ec681f3Smrg 20487ec681f3Smrg bi_index lane2 = bi_iadd_u32(b, lane1, 20497ec681f3Smrg bi_imm_u32(instr->op == nir_op_fddx ? 1 : 2), 20507ec681f3Smrg false); 20517ec681f3Smrg 20527ec681f3Smrg bi_index left, right; 20537ec681f3Smrg 20547ec681f3Smrg if (b->shader->quirks & BIFROST_LIMITED_CLPER) { 20557ec681f3Smrg left = bi_clper_v6_i32(b, s0, lane1); 20567ec681f3Smrg right = bi_clper_v6_i32(b, s0, lane2); 20577ec681f3Smrg } else { 20587ec681f3Smrg left = bi_clper_i32(b, s0, lane1, 20597ec681f3Smrg BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 20607ec681f3Smrg BI_SUBGROUP_SUBGROUP4); 20617ec681f3Smrg 20627ec681f3Smrg right = bi_clper_i32(b, s0, lane2, 20637ec681f3Smrg BI_INACTIVE_RESULT_ZERO, BI_LANE_OP_NONE, 20647ec681f3Smrg BI_SUBGROUP_SUBGROUP4); 20657ec681f3Smrg } 20667ec681f3Smrg 20677ec681f3Smrg bi_fadd_to(b, sz, dst, right, bi_neg(left), BI_ROUND_NONE); 20687ec681f3Smrg break; 20697ec681f3Smrg } 20707ec681f3Smrg 20717ec681f3Smrg case nir_op_f2f32: 20727ec681f3Smrg bi_f16_to_f32_to(b, dst, s0); 20737ec681f3Smrg break; 20747ec681f3Smrg 20757ec681f3Smrg case nir_op_f2i32: 20767ec681f3Smrg if (src_sz == 32) 20777ec681f3Smrg bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 20787ec681f3Smrg else 20797ec681f3Smrg bi_f16_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 20807ec681f3Smrg break; 20817ec681f3Smrg 20827ec681f3Smrg /* Note 32-bit sources => no vectorization, so 32-bit works */ 20837ec681f3Smrg case nir_op_f2u16: 20847ec681f3Smrg if (src_sz == 32) 20857ec681f3Smrg bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 20867ec681f3Smrg else 20877ec681f3Smrg bi_v2f16_to_v2u16_to(b, dst, s0, BI_ROUND_RTZ); 20887ec681f3Smrg break; 20897ec681f3Smrg 20907ec681f3Smrg case nir_op_f2i16: 20917ec681f3Smrg if (src_sz == 32) 20927ec681f3Smrg bi_f32_to_s32_to(b, dst, s0, BI_ROUND_RTZ); 20937ec681f3Smrg else 20947ec681f3Smrg bi_v2f16_to_v2s16_to(b, dst, s0, BI_ROUND_RTZ); 20957ec681f3Smrg break; 20967ec681f3Smrg 20977ec681f3Smrg case nir_op_f2u32: 20987ec681f3Smrg if (src_sz == 32) 20997ec681f3Smrg bi_f32_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 21007ec681f3Smrg else 21017ec681f3Smrg bi_f16_to_u32_to(b, dst, s0, BI_ROUND_RTZ); 21027ec681f3Smrg break; 21037ec681f3Smrg 21047ec681f3Smrg case nir_op_u2f16: 21057ec681f3Smrg if (src_sz == 32) 21067ec681f3Smrg bi_v2u16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ); 21077ec681f3Smrg else if (src_sz == 16) 21087ec681f3Smrg bi_v2u16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ); 21097ec681f3Smrg else if (src_sz == 8) 21107ec681f3Smrg bi_v2u8_to_v2f16_to(b, dst, s0); 21117ec681f3Smrg break; 21127ec681f3Smrg 21137ec681f3Smrg case nir_op_u2f32: 21147ec681f3Smrg if (src_sz == 32) 21157ec681f3Smrg bi_u32_to_f32_to(b, dst, s0, BI_ROUND_RTZ); 21167ec681f3Smrg else if (src_sz == 16) 21177ec681f3Smrg bi_u16_to_f32_to(b, dst, s0); 21187ec681f3Smrg else 21197ec681f3Smrg bi_u8_to_f32_to(b, dst, s0); 21207ec681f3Smrg break; 21217ec681f3Smrg 21227ec681f3Smrg case nir_op_i2f16: 21237ec681f3Smrg if (src_sz == 32) 21247ec681f3Smrg bi_v2s16_to_v2f16_to(b, dst, bi_half(s0, false), BI_ROUND_RTZ); 21257ec681f3Smrg else if (src_sz == 16) 21267ec681f3Smrg bi_v2s16_to_v2f16_to(b, dst, s0, BI_ROUND_RTZ); 21277ec681f3Smrg else if (src_sz == 8) 21287ec681f3Smrg bi_v2s8_to_v2f16_to(b, dst, s0); 21297ec681f3Smrg break; 21307ec681f3Smrg 21317ec681f3Smrg case nir_op_i2f32: 21327ec681f3Smrg if (src_sz == 32) 21337ec681f3Smrg bi_s32_to_f32_to(b, dst, s0, BI_ROUND_RTZ); 21347ec681f3Smrg else if (src_sz == 16) 21357ec681f3Smrg bi_s16_to_f32_to(b, dst, s0); 21367ec681f3Smrg else if (src_sz == 8) 21377ec681f3Smrg bi_s8_to_f32_to(b, dst, s0); 21387ec681f3Smrg break; 21397ec681f3Smrg 21407ec681f3Smrg case nir_op_i2i32: 21417ec681f3Smrg if (src_sz == 16) 21427ec681f3Smrg bi_s16_to_s32_to(b, dst, s0); 21437ec681f3Smrg else 21447ec681f3Smrg bi_s8_to_s32_to(b, dst, s0); 21457ec681f3Smrg break; 21467ec681f3Smrg 21477ec681f3Smrg case nir_op_u2u32: 21487ec681f3Smrg if (src_sz == 16) 21497ec681f3Smrg bi_u16_to_u32_to(b, dst, s0); 21507ec681f3Smrg else 21517ec681f3Smrg bi_u8_to_u32_to(b, dst, s0); 21527ec681f3Smrg break; 21537ec681f3Smrg 21547ec681f3Smrg case nir_op_i2i16: 21557ec681f3Smrg assert(src_sz == 8 || src_sz == 32); 21567ec681f3Smrg 21577ec681f3Smrg if (src_sz == 8) 21587ec681f3Smrg bi_v2s8_to_v2s16_to(b, dst, s0); 21597ec681f3Smrg else 21607ec681f3Smrg bi_mov_i32_to(b, dst, s0); 21617ec681f3Smrg break; 21627ec681f3Smrg 21637ec681f3Smrg case nir_op_u2u16: 21647ec681f3Smrg assert(src_sz == 8 || src_sz == 32); 21657ec681f3Smrg 21667ec681f3Smrg if (src_sz == 8) 21677ec681f3Smrg bi_v2u8_to_v2u16_to(b, dst, s0); 21687ec681f3Smrg else 21697ec681f3Smrg bi_mov_i32_to(b, dst, s0); 21707ec681f3Smrg break; 21717ec681f3Smrg 21727ec681f3Smrg case nir_op_b2f16: 21737ec681f3Smrg case nir_op_b2f32: 21747ec681f3Smrg bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(), 21757ec681f3Smrg (sz == 16) ? bi_imm_f16(1.0) : bi_imm_f32(1.0), 21767ec681f3Smrg (sz == 16) ? bi_imm_f16(0.0) : bi_imm_f32(0.0), 21777ec681f3Smrg BI_CMPF_NE); 21787ec681f3Smrg break; 21797ec681f3Smrg 21807ec681f3Smrg case nir_op_b2b32: 21817ec681f3Smrg bi_csel_to(b, nir_type_int, sz, dst, s0, bi_zero(), 21827ec681f3Smrg bi_imm_u32(~0), bi_zero(), BI_CMPF_NE); 21837ec681f3Smrg break; 21847ec681f3Smrg 21857ec681f3Smrg case nir_op_b2i8: 21867ec681f3Smrg case nir_op_b2i16: 21877ec681f3Smrg case nir_op_b2i32: 21887ec681f3Smrg bi_lshift_and_to(b, sz, dst, s0, bi_imm_uintN(1, sz), bi_imm_u8(0)); 21897ec681f3Smrg break; 21907ec681f3Smrg 21917ec681f3Smrg case nir_op_fround_even: 21927ec681f3Smrg case nir_op_fceil: 21937ec681f3Smrg case nir_op_ffloor: 21947ec681f3Smrg case nir_op_ftrunc: 21957ec681f3Smrg bi_fround_to(b, sz, dst, s0, bi_nir_round(instr->op)); 21967ec681f3Smrg break; 21977ec681f3Smrg 21987ec681f3Smrg case nir_op_fmin: 21997ec681f3Smrg bi_fmin_to(b, sz, dst, s0, s1); 22007ec681f3Smrg break; 22017ec681f3Smrg 22027ec681f3Smrg case nir_op_fmax: 22037ec681f3Smrg bi_fmax_to(b, sz, dst, s0, s1); 22047ec681f3Smrg break; 22057ec681f3Smrg 22067ec681f3Smrg case nir_op_iadd: 22077ec681f3Smrg bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, false); 22087ec681f3Smrg break; 22097ec681f3Smrg 22107ec681f3Smrg case nir_op_iadd_sat: 22117ec681f3Smrg bi_iadd_to(b, nir_type_int, sz, dst, s0, s1, true); 22127ec681f3Smrg break; 22137ec681f3Smrg 22147ec681f3Smrg case nir_op_uadd_sat: 22157ec681f3Smrg bi_iadd_to(b, nir_type_uint, sz, dst, s0, s1, true); 22167ec681f3Smrg break; 22177ec681f3Smrg 22187ec681f3Smrg case nir_op_ihadd: 22197ec681f3Smrg bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTN); 22207ec681f3Smrg break; 22217ec681f3Smrg 22227ec681f3Smrg case nir_op_irhadd: 22237ec681f3Smrg bi_hadd_to(b, nir_type_int, sz, dst, s0, s1, BI_ROUND_RTP); 22247ec681f3Smrg break; 22257ec681f3Smrg 22267ec681f3Smrg case nir_op_ineg: 22277ec681f3Smrg bi_isub_to(b, nir_type_int, sz, dst, bi_zero(), s0, false); 22287ec681f3Smrg break; 22297ec681f3Smrg 22307ec681f3Smrg case nir_op_isub: 22317ec681f3Smrg bi_isub_to(b, nir_type_int, sz, dst, s0, s1, false); 22327ec681f3Smrg break; 22337ec681f3Smrg 22347ec681f3Smrg case nir_op_isub_sat: 22357ec681f3Smrg bi_isub_to(b, nir_type_int, sz, dst, s0, s1, true); 22367ec681f3Smrg break; 22377ec681f3Smrg 22387ec681f3Smrg case nir_op_usub_sat: 22397ec681f3Smrg bi_isub_to(b, nir_type_uint, sz, dst, s0, s1, true); 22407ec681f3Smrg break; 22417ec681f3Smrg 22427ec681f3Smrg case nir_op_imul: 22437ec681f3Smrg bi_imul_to(b, sz, dst, s0, s1); 22447ec681f3Smrg break; 22457ec681f3Smrg 22467ec681f3Smrg case nir_op_iabs: 22477ec681f3Smrg bi_iabs_to(b, sz, dst, s0); 22487ec681f3Smrg break; 22497ec681f3Smrg 22507ec681f3Smrg case nir_op_iand: 22517ec681f3Smrg bi_lshift_and_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 22527ec681f3Smrg break; 22537ec681f3Smrg 22547ec681f3Smrg case nir_op_ior: 22557ec681f3Smrg bi_lshift_or_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 22567ec681f3Smrg break; 22577ec681f3Smrg 22587ec681f3Smrg case nir_op_ixor: 22597ec681f3Smrg bi_lshift_xor_to(b, sz, dst, s0, s1, bi_imm_u8(0)); 22607ec681f3Smrg break; 22617ec681f3Smrg 22627ec681f3Smrg case nir_op_inot: 22637ec681f3Smrg bi_lshift_or_to(b, sz, dst, bi_zero(), bi_not(s0), bi_imm_u8(0)); 22647ec681f3Smrg break; 22657ec681f3Smrg 22667ec681f3Smrg case nir_op_frsq: 22677ec681f3Smrg if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 22687ec681f3Smrg bi_lower_frsq_32(b, dst, s0); 22697ec681f3Smrg else 22707ec681f3Smrg bi_frsq_to(b, sz, dst, s0); 22717ec681f3Smrg break; 22727ec681f3Smrg 22737ec681f3Smrg case nir_op_frcp: 22747ec681f3Smrg if (sz == 32 && b->shader->quirks & BIFROST_NO_FP32_TRANSCENDENTALS) 22757ec681f3Smrg bi_lower_frcp_32(b, dst, s0); 22767ec681f3Smrg else 22777ec681f3Smrg bi_frcp_to(b, sz, dst, s0); 22787ec681f3Smrg break; 22797ec681f3Smrg 22807ec681f3Smrg case nir_op_uclz: 22817ec681f3Smrg bi_clz_to(b, sz, dst, s0, false); 22827ec681f3Smrg break; 22837ec681f3Smrg 22847ec681f3Smrg case nir_op_bit_count: 22857ec681f3Smrg bi_popcount_i32_to(b, dst, s0); 22867ec681f3Smrg break; 22877ec681f3Smrg 22887ec681f3Smrg case nir_op_bitfield_reverse: 22897ec681f3Smrg bi_bitrev_i32_to(b, dst, s0); 22907ec681f3Smrg break; 22917ec681f3Smrg 22927ec681f3Smrg case nir_op_ufind_msb: { 22937ec681f3Smrg bi_index clz = bi_clz(b, src_sz, s0, false); 22947ec681f3Smrg 22957ec681f3Smrg if (sz == 8) 22967ec681f3Smrg clz = bi_byte(clz, 0); 22977ec681f3Smrg else if (sz == 16) 22987ec681f3Smrg clz = bi_half(clz, false); 22997ec681f3Smrg 23007ec681f3Smrg bi_isub_u32_to(b, dst, bi_imm_u32(src_sz - 1), clz, false); 23017ec681f3Smrg break; 23027ec681f3Smrg } 23037ec681f3Smrg 23047ec681f3Smrg default: 23057ec681f3Smrg fprintf(stderr, "Unhandled ALU op %s\n", nir_op_infos[instr->op].name); 23067ec681f3Smrg unreachable("Unknown ALU op"); 23077ec681f3Smrg } 23087ec681f3Smrg} 23097ec681f3Smrg 23107ec681f3Smrg/* Returns dimension with 0 special casing cubemaps. Shamelessly copied from Midgard */ 23117ec681f3Smrgstatic unsigned 23127ec681f3Smrgbifrost_tex_format(enum glsl_sampler_dim dim) 23137ec681f3Smrg{ 23147ec681f3Smrg switch (dim) { 23157ec681f3Smrg case GLSL_SAMPLER_DIM_1D: 23167ec681f3Smrg case GLSL_SAMPLER_DIM_BUF: 23177ec681f3Smrg return 1; 23187ec681f3Smrg 23197ec681f3Smrg case GLSL_SAMPLER_DIM_2D: 23207ec681f3Smrg case GLSL_SAMPLER_DIM_MS: 23217ec681f3Smrg case GLSL_SAMPLER_DIM_EXTERNAL: 23227ec681f3Smrg case GLSL_SAMPLER_DIM_RECT: 23237ec681f3Smrg return 2; 23247ec681f3Smrg 23257ec681f3Smrg case GLSL_SAMPLER_DIM_3D: 23267ec681f3Smrg return 3; 23277ec681f3Smrg 23287ec681f3Smrg case GLSL_SAMPLER_DIM_CUBE: 23297ec681f3Smrg return 0; 23307ec681f3Smrg 23317ec681f3Smrg default: 23327ec681f3Smrg DBG("Unknown sampler dim type\n"); 23337ec681f3Smrg assert(0); 23347ec681f3Smrg return 0; 23357ec681f3Smrg } 23367ec681f3Smrg} 23377ec681f3Smrg 23387ec681f3Smrgstatic enum bifrost_texture_format_full 23397ec681f3Smrgbi_texture_format(nir_alu_type T, enum bi_clamp clamp) 23407ec681f3Smrg{ 23417ec681f3Smrg switch (T) { 23427ec681f3Smrg case nir_type_float16: return BIFROST_TEXTURE_FORMAT_F16 + clamp; 23437ec681f3Smrg case nir_type_float32: return BIFROST_TEXTURE_FORMAT_F32 + clamp; 23447ec681f3Smrg case nir_type_uint16: return BIFROST_TEXTURE_FORMAT_U16; 23457ec681f3Smrg case nir_type_int16: return BIFROST_TEXTURE_FORMAT_S16; 23467ec681f3Smrg case nir_type_uint32: return BIFROST_TEXTURE_FORMAT_U32; 23477ec681f3Smrg case nir_type_int32: return BIFROST_TEXTURE_FORMAT_S32; 23487ec681f3Smrg default: unreachable("Invalid type for texturing"); 23497ec681f3Smrg } 23507ec681f3Smrg} 23517ec681f3Smrg 23527ec681f3Smrg/* Array indices are specified as 32-bit uints, need to convert. In .z component from NIR */ 23537ec681f3Smrgstatic bi_index 23547ec681f3Smrgbi_emit_texc_array_index(bi_builder *b, bi_index idx, nir_alu_type T) 23557ec681f3Smrg{ 23567ec681f3Smrg /* For (u)int we can just passthrough */ 23577ec681f3Smrg nir_alu_type base = nir_alu_type_get_base_type(T); 23587ec681f3Smrg if (base == nir_type_int || base == nir_type_uint) 23597ec681f3Smrg return idx; 23607ec681f3Smrg 23617ec681f3Smrg /* Otherwise we convert */ 23627ec681f3Smrg assert(T == nir_type_float32); 23637ec681f3Smrg 23647ec681f3Smrg /* OpenGL ES 3.2 specification section 8.14.2 ("Coordinate Wrapping and 23657ec681f3Smrg * Texel Selection") defines the layer to be taken from clamp(RNE(r), 23667ec681f3Smrg * 0, dt - 1). So we use round RTE, clamping is handled at the data 23677ec681f3Smrg * structure level */ 23687ec681f3Smrg 23697ec681f3Smrg return bi_f32_to_u32(b, idx, BI_ROUND_NONE); 23707ec681f3Smrg} 23717ec681f3Smrg 23727ec681f3Smrg/* TEXC's explicit and bias LOD modes requires the LOD to be transformed to a 23737ec681f3Smrg * 16-bit 8:8 fixed-point format. We lower as: 23747ec681f3Smrg * 23757ec681f3Smrg * F32_TO_S32(clamp(x, -16.0, +16.0) * 256.0) & 0xFFFF = 23767ec681f3Smrg * MKVEC(F32_TO_S32(clamp(x * 1.0/16.0, -1.0, 1.0) * (16.0 * 256.0)), #0) 23777ec681f3Smrg */ 23787ec681f3Smrg 23797ec681f3Smrgstatic bi_index 23807ec681f3Smrgbi_emit_texc_lod_88(bi_builder *b, bi_index lod, bool fp16) 23817ec681f3Smrg{ 23827ec681f3Smrg /* Precompute for constant LODs to avoid general constant folding */ 23837ec681f3Smrg if (lod.type == BI_INDEX_CONSTANT) { 23847ec681f3Smrg uint32_t raw = lod.value; 23857ec681f3Smrg float x = fp16 ? _mesa_half_to_float(raw) : uif(raw); 23867ec681f3Smrg int32_t s32 = CLAMP(x, -16.0f, 16.0f) * 256.0f; 23877ec681f3Smrg return bi_imm_u32(s32 & 0xFFFF); 23887ec681f3Smrg } 23897ec681f3Smrg 23907ec681f3Smrg /* Sort of arbitrary. Must be less than 128.0, greater than or equal to 23917ec681f3Smrg * the max LOD (16 since we cap at 2^16 texture dimensions), and 23927ec681f3Smrg * preferably small to minimize precision loss */ 23937ec681f3Smrg const float max_lod = 16.0; 23947ec681f3Smrg 23957ec681f3Smrg bi_instr *fsat = bi_fma_f32_to(b, bi_temp(b->shader), 23967ec681f3Smrg fp16 ? bi_half(lod, false) : lod, 23977ec681f3Smrg bi_imm_f32(1.0f / max_lod), bi_negzero(), BI_ROUND_NONE); 23987ec681f3Smrg 23997ec681f3Smrg fsat->clamp = BI_CLAMP_CLAMP_M1_1; 24007ec681f3Smrg 24017ec681f3Smrg bi_index fmul = bi_fma_f32(b, fsat->dest[0], bi_imm_f32(max_lod * 256.0f), 24027ec681f3Smrg bi_negzero(), BI_ROUND_NONE); 24037ec681f3Smrg 24047ec681f3Smrg return bi_mkvec_v2i16(b, 24057ec681f3Smrg bi_half(bi_f32_to_s32(b, fmul, BI_ROUND_RTZ), false), 24067ec681f3Smrg bi_imm_u16(0)); 24077ec681f3Smrg} 24087ec681f3Smrg 24097ec681f3Smrg/* FETCH takes a 32-bit staging register containing the LOD as an integer in 24107ec681f3Smrg * the bottom 16-bits and (if present) the cube face index in the top 16-bits. 24117ec681f3Smrg * TODO: Cube face. 24127ec681f3Smrg */ 24137ec681f3Smrg 24147ec681f3Smrgstatic bi_index 24157ec681f3Smrgbi_emit_texc_lod_cube(bi_builder *b, bi_index lod) 24167ec681f3Smrg{ 24177ec681f3Smrg return bi_lshift_or_i32(b, lod, bi_zero(), bi_imm_u8(8)); 24187ec681f3Smrg} 24197ec681f3Smrg 24207ec681f3Smrg/* The hardware specifies texel offsets and multisample indices together as a 24217ec681f3Smrg * u8vec4 <offset, ms index>. By default all are zero, so if have either a 24227ec681f3Smrg * nonzero texel offset or a nonzero multisample index, we build a u8vec4 with 24237ec681f3Smrg * the bits we need and return that to be passed as a staging register. Else we 24247ec681f3Smrg * return 0 to avoid allocating a data register when everything is zero. */ 24257ec681f3Smrg 24267ec681f3Smrgstatic bi_index 24277ec681f3Smrgbi_emit_texc_offset_ms_index(bi_builder *b, nir_tex_instr *instr) 24287ec681f3Smrg{ 24297ec681f3Smrg bi_index dest = bi_zero(); 24307ec681f3Smrg 24317ec681f3Smrg int offs_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset); 24327ec681f3Smrg if (offs_idx >= 0 && 24337ec681f3Smrg (!nir_src_is_const(instr->src[offs_idx].src) || 24347ec681f3Smrg nir_src_as_uint(instr->src[offs_idx].src) != 0)) { 24357ec681f3Smrg unsigned nr = nir_src_num_components(instr->src[offs_idx].src); 24367ec681f3Smrg bi_index idx = bi_src_index(&instr->src[offs_idx].src); 24377ec681f3Smrg dest = bi_mkvec_v4i8(b, 24387ec681f3Smrg (nr > 0) ? bi_byte(bi_word(idx, 0), 0) : bi_imm_u8(0), 24397ec681f3Smrg (nr > 1) ? bi_byte(bi_word(idx, 1), 0) : bi_imm_u8(0), 24407ec681f3Smrg (nr > 2) ? bi_byte(bi_word(idx, 2), 0) : bi_imm_u8(0), 24417ec681f3Smrg bi_imm_u8(0)); 24427ec681f3Smrg } 24437ec681f3Smrg 24447ec681f3Smrg int ms_idx = nir_tex_instr_src_index(instr, nir_tex_src_ms_index); 24457ec681f3Smrg if (ms_idx >= 0 && 24467ec681f3Smrg (!nir_src_is_const(instr->src[ms_idx].src) || 24477ec681f3Smrg nir_src_as_uint(instr->src[ms_idx].src) != 0)) { 24487ec681f3Smrg dest = bi_lshift_or_i32(b, 24497ec681f3Smrg bi_src_index(&instr->src[ms_idx].src), dest, 24507ec681f3Smrg bi_imm_u8(24)); 24517ec681f3Smrg } 24527ec681f3Smrg 24537ec681f3Smrg return dest; 24547ec681f3Smrg} 24557ec681f3Smrg 24567ec681f3Smrgstatic void 24577ec681f3Smrgbi_emit_cube_coord(bi_builder *b, bi_index coord, 24587ec681f3Smrg bi_index *face, bi_index *s, bi_index *t) 24597ec681f3Smrg{ 24607ec681f3Smrg /* Compute max { |x|, |y|, |z| } */ 24617ec681f3Smrg bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader), 24627ec681f3Smrg bi_temp(b->shader), coord, 24637ec681f3Smrg bi_word(coord, 1), bi_word(coord, 2)); 24647ec681f3Smrg 24657ec681f3Smrg /* Select coordinates */ 24667ec681f3Smrg 24677ec681f3Smrg bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord, 24687ec681f3Smrg cubeface->dest[1]); 24697ec681f3Smrg 24707ec681f3Smrg bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2), 24717ec681f3Smrg cubeface->dest[1]); 24727ec681f3Smrg 24737ec681f3Smrg /* The OpenGL ES specification requires us to transform an input vector 24747ec681f3Smrg * (x, y, z) to the coordinate, given the selected S/T: 24757ec681f3Smrg * 24767ec681f3Smrg * (1/2 ((s / max{x,y,z}) + 1), 1/2 ((t / max{x, y, z}) + 1)) 24777ec681f3Smrg * 24787ec681f3Smrg * We implement (s shown, t similar) in a form friendlier to FMA 24797ec681f3Smrg * instructions, and clamp coordinates at the end for correct 24807ec681f3Smrg * NaN/infinity handling: 24817ec681f3Smrg * 24827ec681f3Smrg * fsat(s * (0.5 * (1 / max{x, y, z})) + 0.5) 24837ec681f3Smrg * 24847ec681f3Smrg * Take the reciprocal of max{x, y, z} 24857ec681f3Smrg */ 24867ec681f3Smrg 24877ec681f3Smrg bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]); 24887ec681f3Smrg 24897ec681f3Smrg /* Calculate 0.5 * (1.0 / max{x, y, z}) */ 24907ec681f3Smrg bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_negzero(), 24917ec681f3Smrg BI_ROUND_NONE); 24927ec681f3Smrg 24937ec681f3Smrg /* Transform the coordinates */ 24947ec681f3Smrg *s = bi_temp(b->shader); 24957ec681f3Smrg *t = bi_temp(b->shader); 24967ec681f3Smrg 24977ec681f3Smrg bi_instr *S = bi_fma_f32_to(b, *s, fma1, ssel, bi_imm_f32(0.5f), 24987ec681f3Smrg BI_ROUND_NONE); 24997ec681f3Smrg bi_instr *T = bi_fma_f32_to(b, *t, fma1, tsel, bi_imm_f32(0.5f), 25007ec681f3Smrg BI_ROUND_NONE); 25017ec681f3Smrg 25027ec681f3Smrg S->clamp = BI_CLAMP_CLAMP_0_1; 25037ec681f3Smrg T->clamp = BI_CLAMP_CLAMP_0_1; 25047ec681f3Smrg 25057ec681f3Smrg /* Face index at bit[29:31], matching the cube map descriptor */ 25067ec681f3Smrg *face = cubeface->dest[1]; 25077ec681f3Smrg} 25087ec681f3Smrg 25097ec681f3Smrg/* Emits a cube map descriptor, returning lower 32-bits and putting upper 25107ec681f3Smrg * 32-bits in passed pointer t. The packing of the face with the S coordinate 25117ec681f3Smrg * exploits the redundancy of floating points with the range restriction of 25127ec681f3Smrg * CUBEFACE output. 25137ec681f3Smrg * 25147ec681f3Smrg * struct cube_map_descriptor { 25157ec681f3Smrg * float s : 29; 25167ec681f3Smrg * unsigned face : 3; 25177ec681f3Smrg * float t : 32; 25187ec681f3Smrg * } 25197ec681f3Smrg * 25207ec681f3Smrg * Since the cube face index is preshifted, this is easy to pack with a bitwise 25217ec681f3Smrg * MUX.i32 and a fixed mask, selecting the lower bits 29 from s and the upper 3 25227ec681f3Smrg * bits from face. 25237ec681f3Smrg */ 25247ec681f3Smrg 25257ec681f3Smrgstatic bi_index 25267ec681f3Smrgbi_emit_texc_cube_coord(bi_builder *b, bi_index coord, bi_index *t) 25277ec681f3Smrg{ 25287ec681f3Smrg bi_index face, s; 25297ec681f3Smrg bi_emit_cube_coord(b, coord, &face, &s, t); 25307ec681f3Smrg bi_index mask = bi_imm_u32(BITFIELD_MASK(29)); 25317ec681f3Smrg return bi_mux_i32(b, s, face, mask, BI_MUX_BIT); 25327ec681f3Smrg} 25337ec681f3Smrg 25347ec681f3Smrg/* Map to the main texture op used. Some of these (txd in particular) will 25357ec681f3Smrg * lower to multiple texture ops with different opcodes (GRDESC_DER + TEX in 25367ec681f3Smrg * sequence). We assume that lowering is handled elsewhere. 25377ec681f3Smrg */ 25387ec681f3Smrg 25397ec681f3Smrgstatic enum bifrost_tex_op 25407ec681f3Smrgbi_tex_op(nir_texop op) 25417ec681f3Smrg{ 25427ec681f3Smrg switch (op) { 25437ec681f3Smrg case nir_texop_tex: 25447ec681f3Smrg case nir_texop_txb: 25457ec681f3Smrg case nir_texop_txl: 25467ec681f3Smrg case nir_texop_txd: 25477ec681f3Smrg case nir_texop_tex_prefetch: 25487ec681f3Smrg return BIFROST_TEX_OP_TEX; 25497ec681f3Smrg case nir_texop_txf: 25507ec681f3Smrg case nir_texop_txf_ms: 25517ec681f3Smrg case nir_texop_txf_ms_fb: 25527ec681f3Smrg case nir_texop_tg4: 25537ec681f3Smrg return BIFROST_TEX_OP_FETCH; 25547ec681f3Smrg case nir_texop_txs: 25557ec681f3Smrg case nir_texop_lod: 25567ec681f3Smrg case nir_texop_query_levels: 25577ec681f3Smrg case nir_texop_texture_samples: 25587ec681f3Smrg case nir_texop_samples_identical: 25597ec681f3Smrg unreachable("should've been lowered"); 25607ec681f3Smrg default: 25617ec681f3Smrg unreachable("unsupported tex op"); 25627ec681f3Smrg } 25637ec681f3Smrg} 25647ec681f3Smrg 25657ec681f3Smrg/* Data registers required by texturing in the order they appear. All are 25667ec681f3Smrg * optional, the texture operation descriptor determines which are present. 25677ec681f3Smrg * Note since 3D arrays are not permitted at an API level, Z_COORD and 25687ec681f3Smrg * ARRAY/SHADOW are exlusive, so TEXC in practice reads at most 8 registers */ 25697ec681f3Smrg 25707ec681f3Smrgenum bifrost_tex_dreg { 25717ec681f3Smrg BIFROST_TEX_DREG_Z_COORD = 0, 25727ec681f3Smrg BIFROST_TEX_DREG_Y_DELTAS = 1, 25737ec681f3Smrg BIFROST_TEX_DREG_LOD = 2, 25747ec681f3Smrg BIFROST_TEX_DREG_GRDESC_HI = 3, 25757ec681f3Smrg BIFROST_TEX_DREG_SHADOW = 4, 25767ec681f3Smrg BIFROST_TEX_DREG_ARRAY = 5, 25777ec681f3Smrg BIFROST_TEX_DREG_OFFSETMS = 6, 25787ec681f3Smrg BIFROST_TEX_DREG_SAMPLER = 7, 25797ec681f3Smrg BIFROST_TEX_DREG_TEXTURE = 8, 25807ec681f3Smrg BIFROST_TEX_DREG_COUNT, 25817ec681f3Smrg}; 25827ec681f3Smrg 25837ec681f3Smrgstatic void 25847ec681f3Smrgbi_emit_texc(bi_builder *b, nir_tex_instr *instr) 25857ec681f3Smrg{ 25867ec681f3Smrg bool computed_lod = false; 25877ec681f3Smrg 25887ec681f3Smrg struct bifrost_texture_operation desc = { 25897ec681f3Smrg .op = bi_tex_op(instr->op), 25907ec681f3Smrg .offset_or_bias_disable = false, /* TODO */ 25917ec681f3Smrg .shadow_or_clamp_disable = instr->is_shadow, 25927ec681f3Smrg .array = instr->is_array, 25937ec681f3Smrg .dimension = bifrost_tex_format(instr->sampler_dim), 25947ec681f3Smrg .format = bi_texture_format(instr->dest_type | nir_dest_bit_size(instr->dest), BI_CLAMP_NONE), /* TODO */ 25957ec681f3Smrg .mask = 0xF, 25967ec681f3Smrg }; 25977ec681f3Smrg 25987ec681f3Smrg switch (desc.op) { 25997ec681f3Smrg case BIFROST_TEX_OP_TEX: 26007ec681f3Smrg desc.lod_or_fetch = BIFROST_LOD_MODE_COMPUTE; 26017ec681f3Smrg computed_lod = true; 26027ec681f3Smrg break; 26037ec681f3Smrg case BIFROST_TEX_OP_FETCH: 26047ec681f3Smrg desc.lod_or_fetch = (enum bifrost_lod_mode) 26057ec681f3Smrg (instr->op == nir_texop_tg4 ? 26067ec681f3Smrg BIFROST_TEXTURE_FETCH_GATHER4_R + instr->component : 26077ec681f3Smrg BIFROST_TEXTURE_FETCH_TEXEL); 26087ec681f3Smrg break; 26097ec681f3Smrg default: 26107ec681f3Smrg unreachable("texture op unsupported"); 26117ec681f3Smrg } 26127ec681f3Smrg 26137ec681f3Smrg /* 32-bit indices to be allocated as consecutive staging registers */ 26147ec681f3Smrg bi_index dregs[BIFROST_TEX_DREG_COUNT] = { }; 26157ec681f3Smrg bi_index cx = bi_null(), cy = bi_null(); 26167ec681f3Smrg 26177ec681f3Smrg for (unsigned i = 0; i < instr->num_srcs; ++i) { 26187ec681f3Smrg bi_index index = bi_src_index(&instr->src[i].src); 26197ec681f3Smrg unsigned sz = nir_src_bit_size(instr->src[i].src); 26207ec681f3Smrg ASSERTED nir_alu_type base = nir_tex_instr_src_type(instr, i); 26217ec681f3Smrg nir_alu_type T = base | sz; 26227ec681f3Smrg 26237ec681f3Smrg switch (instr->src[i].src_type) { 26247ec681f3Smrg case nir_tex_src_coord: 26257ec681f3Smrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 26267ec681f3Smrg cx = bi_emit_texc_cube_coord(b, index, &cy); 26277ec681f3Smrg } else { 26287ec681f3Smrg unsigned components = nir_src_num_components(instr->src[i].src); 26297ec681f3Smrg 26307ec681f3Smrg /* Copy XY (for 2D+) or XX (for 1D) */ 26317ec681f3Smrg cx = index; 26327ec681f3Smrg cy = bi_word(index, MIN2(1, components - 1)); 26337ec681f3Smrg 26347ec681f3Smrg assert(components >= 1 && components <= 3); 26357ec681f3Smrg 26367ec681f3Smrg if (components < 3) { 26377ec681f3Smrg /* nothing to do */ 26387ec681f3Smrg } else if (desc.array) { 26397ec681f3Smrg /* 2D array */ 26407ec681f3Smrg dregs[BIFROST_TEX_DREG_ARRAY] = 26417ec681f3Smrg bi_emit_texc_array_index(b, 26427ec681f3Smrg bi_word(index, 2), T); 26437ec681f3Smrg } else { 26447ec681f3Smrg /* 3D */ 26457ec681f3Smrg dregs[BIFROST_TEX_DREG_Z_COORD] = 26467ec681f3Smrg bi_word(index, 2); 26477ec681f3Smrg } 26487ec681f3Smrg } 26497ec681f3Smrg break; 26507ec681f3Smrg 26517ec681f3Smrg case nir_tex_src_lod: 26527ec681f3Smrg if (desc.op == BIFROST_TEX_OP_TEX && 26537ec681f3Smrg nir_src_is_const(instr->src[i].src) && 26547ec681f3Smrg nir_src_as_uint(instr->src[i].src) == 0) { 26557ec681f3Smrg desc.lod_or_fetch = BIFROST_LOD_MODE_ZERO; 26567ec681f3Smrg } else if (desc.op == BIFROST_TEX_OP_TEX) { 26577ec681f3Smrg assert(base == nir_type_float); 26587ec681f3Smrg 26597ec681f3Smrg assert(sz == 16 || sz == 32); 26607ec681f3Smrg dregs[BIFROST_TEX_DREG_LOD] = 26617ec681f3Smrg bi_emit_texc_lod_88(b, index, sz == 16); 26627ec681f3Smrg desc.lod_or_fetch = BIFROST_LOD_MODE_EXPLICIT; 26637ec681f3Smrg } else { 26647ec681f3Smrg assert(desc.op == BIFROST_TEX_OP_FETCH); 26657ec681f3Smrg assert(base == nir_type_uint || base == nir_type_int); 26667ec681f3Smrg assert(sz == 16 || sz == 32); 26677ec681f3Smrg 26687ec681f3Smrg dregs[BIFROST_TEX_DREG_LOD] = 26697ec681f3Smrg bi_emit_texc_lod_cube(b, index); 26707ec681f3Smrg } 26717ec681f3Smrg 26727ec681f3Smrg break; 26737ec681f3Smrg 26747ec681f3Smrg case nir_tex_src_bias: 26757ec681f3Smrg /* Upper 16-bits interpreted as a clamp, leave zero */ 26767ec681f3Smrg assert(desc.op == BIFROST_TEX_OP_TEX); 26777ec681f3Smrg assert(base == nir_type_float); 26787ec681f3Smrg assert(sz == 16 || sz == 32); 26797ec681f3Smrg dregs[BIFROST_TEX_DREG_LOD] = 26807ec681f3Smrg bi_emit_texc_lod_88(b, index, sz == 16); 26817ec681f3Smrg desc.lod_or_fetch = BIFROST_LOD_MODE_BIAS; 26827ec681f3Smrg computed_lod = true; 26837ec681f3Smrg break; 26847ec681f3Smrg 26857ec681f3Smrg case nir_tex_src_ms_index: 26867ec681f3Smrg case nir_tex_src_offset: 26877ec681f3Smrg if (desc.offset_or_bias_disable) 26887ec681f3Smrg break; 26897ec681f3Smrg 26907ec681f3Smrg dregs[BIFROST_TEX_DREG_OFFSETMS] = 26917ec681f3Smrg bi_emit_texc_offset_ms_index(b, instr); 26927ec681f3Smrg if (!bi_is_equiv(dregs[BIFROST_TEX_DREG_OFFSETMS], bi_zero())) 26937ec681f3Smrg desc.offset_or_bias_disable = true; 26947ec681f3Smrg break; 26957ec681f3Smrg 26967ec681f3Smrg case nir_tex_src_comparator: 26977ec681f3Smrg dregs[BIFROST_TEX_DREG_SHADOW] = index; 26987ec681f3Smrg break; 26997ec681f3Smrg 27007ec681f3Smrg case nir_tex_src_texture_offset: 27017ec681f3Smrg assert(instr->texture_index == 0); 27027ec681f3Smrg dregs[BIFROST_TEX_DREG_TEXTURE] = index; 27037ec681f3Smrg break; 27047ec681f3Smrg 27057ec681f3Smrg case nir_tex_src_sampler_offset: 27067ec681f3Smrg assert(instr->sampler_index == 0); 27077ec681f3Smrg dregs[BIFROST_TEX_DREG_SAMPLER] = index; 27087ec681f3Smrg break; 27097ec681f3Smrg 27107ec681f3Smrg default: 27117ec681f3Smrg unreachable("Unhandled src type in texc emit"); 27127ec681f3Smrg } 27137ec681f3Smrg } 27147ec681f3Smrg 27157ec681f3Smrg if (desc.op == BIFROST_TEX_OP_FETCH && bi_is_null(dregs[BIFROST_TEX_DREG_LOD])) { 27167ec681f3Smrg dregs[BIFROST_TEX_DREG_LOD] = 27177ec681f3Smrg bi_emit_texc_lod_cube(b, bi_zero()); 27187ec681f3Smrg } 27197ec681f3Smrg 27207ec681f3Smrg /* Choose an index mode */ 27217ec681f3Smrg 27227ec681f3Smrg bool direct_tex = bi_is_null(dregs[BIFROST_TEX_DREG_TEXTURE]); 27237ec681f3Smrg bool direct_samp = bi_is_null(dregs[BIFROST_TEX_DREG_SAMPLER]); 27247ec681f3Smrg bool direct = direct_tex && direct_samp; 27257ec681f3Smrg 27267ec681f3Smrg desc.immediate_indices = direct && (instr->sampler_index < 16); 27277ec681f3Smrg 27287ec681f3Smrg if (desc.immediate_indices) { 27297ec681f3Smrg desc.sampler_index_or_mode = instr->sampler_index; 27307ec681f3Smrg desc.index = instr->texture_index; 27317ec681f3Smrg } else { 27327ec681f3Smrg enum bifrost_index mode = 0; 27337ec681f3Smrg 27347ec681f3Smrg if (direct && instr->sampler_index == instr->texture_index) { 27357ec681f3Smrg mode = BIFROST_INDEX_IMMEDIATE_SHARED; 27367ec681f3Smrg desc.index = instr->texture_index; 27377ec681f3Smrg } else if (direct) { 27387ec681f3Smrg mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 27397ec681f3Smrg desc.index = instr->sampler_index; 27407ec681f3Smrg dregs[BIFROST_TEX_DREG_TEXTURE] = bi_mov_i32(b, 27417ec681f3Smrg bi_imm_u32(instr->texture_index)); 27427ec681f3Smrg } else if (direct_tex) { 27437ec681f3Smrg assert(!direct_samp); 27447ec681f3Smrg mode = BIFROST_INDEX_IMMEDIATE_TEXTURE; 27457ec681f3Smrg desc.index = instr->texture_index; 27467ec681f3Smrg } else if (direct_samp) { 27477ec681f3Smrg assert(!direct_tex); 27487ec681f3Smrg mode = BIFROST_INDEX_IMMEDIATE_SAMPLER; 27497ec681f3Smrg desc.index = instr->sampler_index; 27507ec681f3Smrg } else { 27517ec681f3Smrg mode = BIFROST_INDEX_REGISTER; 27527ec681f3Smrg } 27537ec681f3Smrg 27547ec681f3Smrg desc.sampler_index_or_mode = mode | (0x3 << 2); 27557ec681f3Smrg } 27567ec681f3Smrg 27577ec681f3Smrg /* Allocate staging registers contiguously by compacting the array. 27587ec681f3Smrg * Index is not SSA (tied operands) */ 27597ec681f3Smrg 27607ec681f3Smrg unsigned sr_count = 0; 27617ec681f3Smrg 27627ec681f3Smrg for (unsigned i = 0; i < ARRAY_SIZE(dregs); ++i) { 27637ec681f3Smrg if (!bi_is_null(dregs[i])) 27647ec681f3Smrg dregs[sr_count++] = dregs[i]; 27657ec681f3Smrg } 27667ec681f3Smrg 27677ec681f3Smrg bi_index idx = sr_count ? bi_temp_reg(b->shader) : bi_null(); 27687ec681f3Smrg 27697ec681f3Smrg if (sr_count) 27707ec681f3Smrg bi_make_vec_to(b, idx, dregs, NULL, sr_count, 32); 27717ec681f3Smrg 27727ec681f3Smrg uint32_t desc_u = 0; 27737ec681f3Smrg memcpy(&desc_u, &desc, sizeof(desc_u)); 27747ec681f3Smrg bi_texc_to(b, sr_count ? idx : bi_dest_index(&instr->dest), 27757ec681f3Smrg idx, cx, cy, bi_imm_u32(desc_u), !computed_lod, 27767ec681f3Smrg sr_count); 27777ec681f3Smrg 27787ec681f3Smrg /* Explicit copy to facilitate tied operands */ 27797ec681f3Smrg if (sr_count) { 27807ec681f3Smrg bi_index srcs[4] = { idx, idx, idx, idx }; 27817ec681f3Smrg unsigned channels[4] = { 0, 1, 2, 3 }; 27827ec681f3Smrg bi_make_vec_to(b, bi_dest_index(&instr->dest), srcs, channels, 4, 32); 27837ec681f3Smrg } 27847ec681f3Smrg} 27857ec681f3Smrg 27867ec681f3Smrg/* Simple textures ops correspond to NIR tex or txl with LOD = 0 on 2D/cube 27877ec681f3Smrg * textures with sufficiently small immediate indices. Anything else 27887ec681f3Smrg * needs a complete texture op. */ 27897ec681f3Smrg 27907ec681f3Smrgstatic void 27917ec681f3Smrgbi_emit_texs(bi_builder *b, nir_tex_instr *instr) 27927ec681f3Smrg{ 27937ec681f3Smrg int coord_idx = nir_tex_instr_src_index(instr, nir_tex_src_coord); 27947ec681f3Smrg assert(coord_idx >= 0); 27957ec681f3Smrg bi_index coords = bi_src_index(&instr->src[coord_idx].src); 27967ec681f3Smrg 27977ec681f3Smrg if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE) { 27987ec681f3Smrg bi_index face, s, t; 27997ec681f3Smrg bi_emit_cube_coord(b, coords, &face, &s, &t); 28007ec681f3Smrg 28017ec681f3Smrg bi_texs_cube_to(b, nir_dest_bit_size(instr->dest), 28027ec681f3Smrg bi_dest_index(&instr->dest), 28037ec681f3Smrg s, t, face, 28047ec681f3Smrg instr->sampler_index, instr->texture_index); 28057ec681f3Smrg } else { 28067ec681f3Smrg bi_texs_2d_to(b, nir_dest_bit_size(instr->dest), 28077ec681f3Smrg bi_dest_index(&instr->dest), 28087ec681f3Smrg coords, bi_word(coords, 1), 28097ec681f3Smrg instr->op != nir_texop_tex, /* zero LOD */ 28107ec681f3Smrg instr->sampler_index, instr->texture_index); 28117ec681f3Smrg } 28127ec681f3Smrg} 28137ec681f3Smrg 28147ec681f3Smrgstatic bool 28157ec681f3Smrgbi_is_simple_tex(nir_tex_instr *instr) 28167ec681f3Smrg{ 28177ec681f3Smrg if (instr->op != nir_texop_tex && instr->op != nir_texop_txl) 28187ec681f3Smrg return false; 28197ec681f3Smrg 28207ec681f3Smrg if (instr->dest_type != nir_type_float32 && 28217ec681f3Smrg instr->dest_type != nir_type_float16) 28227ec681f3Smrg return false; 28237ec681f3Smrg 28247ec681f3Smrg if (instr->is_shadow || instr->is_array) 28257ec681f3Smrg return false; 28267ec681f3Smrg 28277ec681f3Smrg switch (instr->sampler_dim) { 28287ec681f3Smrg case GLSL_SAMPLER_DIM_2D: 28297ec681f3Smrg case GLSL_SAMPLER_DIM_EXTERNAL: 28307ec681f3Smrg case GLSL_SAMPLER_DIM_RECT: 28317ec681f3Smrg break; 28327ec681f3Smrg 28337ec681f3Smrg case GLSL_SAMPLER_DIM_CUBE: 28347ec681f3Smrg /* LOD can't be specified with TEXS_CUBE */ 28357ec681f3Smrg if (instr->op == nir_texop_txl) 28367ec681f3Smrg return false; 28377ec681f3Smrg break; 28387ec681f3Smrg 28397ec681f3Smrg default: 28407ec681f3Smrg return false; 28417ec681f3Smrg } 28427ec681f3Smrg 28437ec681f3Smrg for (unsigned i = 0; i < instr->num_srcs; ++i) { 28447ec681f3Smrg if (instr->src[i].src_type != nir_tex_src_lod && 28457ec681f3Smrg instr->src[i].src_type != nir_tex_src_coord) 28467ec681f3Smrg return false; 28477ec681f3Smrg } 28487ec681f3Smrg 28497ec681f3Smrg /* Indices need to fit in provided bits */ 28507ec681f3Smrg unsigned idx_bits = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE ? 2 : 3; 28517ec681f3Smrg if (MAX2(instr->sampler_index, instr->texture_index) >= (1 << idx_bits)) 28527ec681f3Smrg return false; 28537ec681f3Smrg 28547ec681f3Smrg int lod_idx = nir_tex_instr_src_index(instr, nir_tex_src_lod); 28557ec681f3Smrg if (lod_idx < 0) 28567ec681f3Smrg return true; 28577ec681f3Smrg 28587ec681f3Smrg nir_src lod = instr->src[lod_idx].src; 28597ec681f3Smrg return nir_src_is_const(lod) && nir_src_as_uint(lod) == 0; 28607ec681f3Smrg} 28617ec681f3Smrg 28627ec681f3Smrgstatic void 28637ec681f3Smrgbi_emit_tex(bi_builder *b, nir_tex_instr *instr) 28647ec681f3Smrg{ 28657ec681f3Smrg switch (instr->op) { 28667ec681f3Smrg case nir_texop_txs: 28677ec681f3Smrg bi_load_sysval_to(b, bi_dest_index(&instr->dest), 28687ec681f3Smrg panfrost_sysval_for_instr(&instr->instr, NULL), 28697ec681f3Smrg 4, 0); 28707ec681f3Smrg return; 28717ec681f3Smrg case nir_texop_tex: 28727ec681f3Smrg case nir_texop_txl: 28737ec681f3Smrg case nir_texop_txb: 28747ec681f3Smrg case nir_texop_txf: 28757ec681f3Smrg case nir_texop_txf_ms: 28767ec681f3Smrg case nir_texop_tg4: 28777ec681f3Smrg break; 28787ec681f3Smrg default: 28797ec681f3Smrg unreachable("Invalid texture operation"); 28807ec681f3Smrg } 28817ec681f3Smrg 28827ec681f3Smrg if (bi_is_simple_tex(instr)) 28837ec681f3Smrg bi_emit_texs(b, instr); 28847ec681f3Smrg else 28857ec681f3Smrg bi_emit_texc(b, instr); 28867ec681f3Smrg} 28877ec681f3Smrg 28887ec681f3Smrgstatic void 28897ec681f3Smrgbi_emit_instr(bi_builder *b, struct nir_instr *instr) 28907ec681f3Smrg{ 28917ec681f3Smrg switch (instr->type) { 28927ec681f3Smrg case nir_instr_type_load_const: 28937ec681f3Smrg bi_emit_load_const(b, nir_instr_as_load_const(instr)); 28947ec681f3Smrg break; 28957ec681f3Smrg 28967ec681f3Smrg case nir_instr_type_intrinsic: 28977ec681f3Smrg bi_emit_intrinsic(b, nir_instr_as_intrinsic(instr)); 28987ec681f3Smrg break; 28997ec681f3Smrg 29007ec681f3Smrg case nir_instr_type_alu: 29017ec681f3Smrg bi_emit_alu(b, nir_instr_as_alu(instr)); 29027ec681f3Smrg break; 29037ec681f3Smrg 29047ec681f3Smrg case nir_instr_type_tex: 29057ec681f3Smrg bi_emit_tex(b, nir_instr_as_tex(instr)); 29067ec681f3Smrg break; 29077ec681f3Smrg 29087ec681f3Smrg case nir_instr_type_jump: 29097ec681f3Smrg bi_emit_jump(b, nir_instr_as_jump(instr)); 29107ec681f3Smrg break; 29117ec681f3Smrg 29127ec681f3Smrg default: 29137ec681f3Smrg unreachable("should've been lowered"); 29147ec681f3Smrg } 29157ec681f3Smrg} 29167ec681f3Smrg 29177ec681f3Smrgstatic bi_block * 29187ec681f3Smrgcreate_empty_block(bi_context *ctx) 29197ec681f3Smrg{ 29207ec681f3Smrg bi_block *blk = rzalloc(ctx, bi_block); 29217ec681f3Smrg 29227ec681f3Smrg blk->predecessors = _mesa_set_create(blk, 29237ec681f3Smrg _mesa_hash_pointer, 29247ec681f3Smrg _mesa_key_pointer_equal); 29257ec681f3Smrg 29267ec681f3Smrg return blk; 29277ec681f3Smrg} 29287ec681f3Smrg 29297ec681f3Smrgstatic bi_block * 29307ec681f3Smrgemit_block(bi_context *ctx, nir_block *block) 29317ec681f3Smrg{ 29327ec681f3Smrg if (ctx->after_block) { 29337ec681f3Smrg ctx->current_block = ctx->after_block; 29347ec681f3Smrg ctx->after_block = NULL; 29357ec681f3Smrg } else { 29367ec681f3Smrg ctx->current_block = create_empty_block(ctx); 29377ec681f3Smrg } 29387ec681f3Smrg 29397ec681f3Smrg list_addtail(&ctx->current_block->link, &ctx->blocks); 29407ec681f3Smrg list_inithead(&ctx->current_block->instructions); 29417ec681f3Smrg 29427ec681f3Smrg bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 29437ec681f3Smrg 29447ec681f3Smrg nir_foreach_instr(instr, block) { 29457ec681f3Smrg bi_emit_instr(&_b, instr); 29467ec681f3Smrg ++ctx->instruction_count; 29477ec681f3Smrg } 29487ec681f3Smrg 29497ec681f3Smrg return ctx->current_block; 29507ec681f3Smrg} 29517ec681f3Smrg 29527ec681f3Smrgstatic void 29537ec681f3Smrgemit_if(bi_context *ctx, nir_if *nif) 29547ec681f3Smrg{ 29557ec681f3Smrg bi_block *before_block = ctx->current_block; 29567ec681f3Smrg 29577ec681f3Smrg /* Speculatively emit the branch, but we can't fill it in until later */ 29587ec681f3Smrg bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 29597ec681f3Smrg bi_instr *then_branch = bi_branchz_i16(&_b, 29607ec681f3Smrg bi_half(bi_src_index(&nif->condition), false), 29617ec681f3Smrg bi_zero(), BI_CMPF_EQ); 29627ec681f3Smrg 29637ec681f3Smrg /* Emit the two subblocks. */ 29647ec681f3Smrg bi_block *then_block = emit_cf_list(ctx, &nif->then_list); 29657ec681f3Smrg bi_block *end_then_block = ctx->current_block; 29667ec681f3Smrg 29677ec681f3Smrg /* Emit second block, and check if it's empty */ 29687ec681f3Smrg 29697ec681f3Smrg int count_in = ctx->instruction_count; 29707ec681f3Smrg bi_block *else_block = emit_cf_list(ctx, &nif->else_list); 29717ec681f3Smrg bi_block *end_else_block = ctx->current_block; 29727ec681f3Smrg ctx->after_block = create_empty_block(ctx); 29737ec681f3Smrg 29747ec681f3Smrg /* Now that we have the subblocks emitted, fix up the branches */ 29757ec681f3Smrg 29767ec681f3Smrg assert(then_block); 29777ec681f3Smrg assert(else_block); 29787ec681f3Smrg 29797ec681f3Smrg if (ctx->instruction_count == count_in) { 29807ec681f3Smrg then_branch->branch_target = ctx->after_block; 29817ec681f3Smrg bi_block_add_successor(end_then_block, ctx->after_block); /* fallthrough */ 29827ec681f3Smrg } else { 29837ec681f3Smrg then_branch->branch_target = else_block; 29847ec681f3Smrg 29857ec681f3Smrg /* Emit a jump from the end of the then block to the end of the else */ 29867ec681f3Smrg _b.cursor = bi_after_block(end_then_block); 29877ec681f3Smrg bi_instr *then_exit = bi_jump(&_b, bi_zero()); 29887ec681f3Smrg then_exit->branch_target = ctx->after_block; 29897ec681f3Smrg 29907ec681f3Smrg bi_block_add_successor(end_then_block, then_exit->branch_target); 29917ec681f3Smrg bi_block_add_successor(end_else_block, ctx->after_block); /* fallthrough */ 29927ec681f3Smrg } 29937ec681f3Smrg 29947ec681f3Smrg bi_block_add_successor(before_block, then_branch->branch_target); /* then_branch */ 29957ec681f3Smrg bi_block_add_successor(before_block, then_block); /* fallthrough */ 29967ec681f3Smrg} 29977ec681f3Smrg 29987ec681f3Smrgstatic void 29997ec681f3Smrgemit_loop(bi_context *ctx, nir_loop *nloop) 30007ec681f3Smrg{ 30017ec681f3Smrg /* Remember where we are */ 30027ec681f3Smrg bi_block *start_block = ctx->current_block; 30037ec681f3Smrg 30047ec681f3Smrg bi_block *saved_break = ctx->break_block; 30057ec681f3Smrg bi_block *saved_continue = ctx->continue_block; 30067ec681f3Smrg 30077ec681f3Smrg ctx->continue_block = create_empty_block(ctx); 30087ec681f3Smrg ctx->break_block = create_empty_block(ctx); 30097ec681f3Smrg ctx->after_block = ctx->continue_block; 30107ec681f3Smrg 30117ec681f3Smrg /* Emit the body itself */ 30127ec681f3Smrg emit_cf_list(ctx, &nloop->body); 30137ec681f3Smrg 30147ec681f3Smrg /* Branch back to loop back */ 30157ec681f3Smrg bi_builder _b = bi_init_builder(ctx, bi_after_block(ctx->current_block)); 30167ec681f3Smrg bi_instr *I = bi_jump(&_b, bi_zero()); 30177ec681f3Smrg I->branch_target = ctx->continue_block; 30187ec681f3Smrg bi_block_add_successor(start_block, ctx->continue_block); 30197ec681f3Smrg bi_block_add_successor(ctx->current_block, ctx->continue_block); 30207ec681f3Smrg 30217ec681f3Smrg ctx->after_block = ctx->break_block; 30227ec681f3Smrg 30237ec681f3Smrg /* Pop off */ 30247ec681f3Smrg ctx->break_block = saved_break; 30257ec681f3Smrg ctx->continue_block = saved_continue; 30267ec681f3Smrg ++ctx->loop_count; 30277ec681f3Smrg} 30287ec681f3Smrg 30297ec681f3Smrgstatic bi_block * 30307ec681f3Smrgemit_cf_list(bi_context *ctx, struct exec_list *list) 30317ec681f3Smrg{ 30327ec681f3Smrg bi_block *start_block = NULL; 30337ec681f3Smrg 30347ec681f3Smrg foreach_list_typed(nir_cf_node, node, node, list) { 30357ec681f3Smrg switch (node->type) { 30367ec681f3Smrg case nir_cf_node_block: { 30377ec681f3Smrg bi_block *block = emit_block(ctx, nir_cf_node_as_block(node)); 30387ec681f3Smrg 30397ec681f3Smrg if (!start_block) 30407ec681f3Smrg start_block = block; 30417ec681f3Smrg 30427ec681f3Smrg break; 30437ec681f3Smrg } 30447ec681f3Smrg 30457ec681f3Smrg case nir_cf_node_if: 30467ec681f3Smrg emit_if(ctx, nir_cf_node_as_if(node)); 30477ec681f3Smrg break; 30487ec681f3Smrg 30497ec681f3Smrg case nir_cf_node_loop: 30507ec681f3Smrg emit_loop(ctx, nir_cf_node_as_loop(node)); 30517ec681f3Smrg break; 30527ec681f3Smrg 30537ec681f3Smrg default: 30547ec681f3Smrg unreachable("Unknown control flow"); 30557ec681f3Smrg } 30567ec681f3Smrg } 30577ec681f3Smrg 30587ec681f3Smrg return start_block; 30597ec681f3Smrg} 30607ec681f3Smrg 30617ec681f3Smrg/* shader-db stuff */ 30627ec681f3Smrg 30637ec681f3Smrgstruct bi_stats { 30647ec681f3Smrg unsigned nr_clauses, nr_tuples, nr_ins; 30657ec681f3Smrg unsigned nr_arith, nr_texture, nr_varying, nr_ldst; 30667ec681f3Smrg}; 30677ec681f3Smrg 30687ec681f3Smrgstatic void 30697ec681f3Smrgbi_count_tuple_stats(bi_clause *clause, bi_tuple *tuple, struct bi_stats *stats) 30707ec681f3Smrg{ 30717ec681f3Smrg /* Count instructions */ 30727ec681f3Smrg stats->nr_ins += (tuple->fma ? 1 : 0) + (tuple->add ? 1 : 0); 30737ec681f3Smrg 30747ec681f3Smrg /* Non-message passing tuples are always arithmetic */ 30757ec681f3Smrg if (tuple->add != clause->message) { 30767ec681f3Smrg stats->nr_arith++; 30777ec681f3Smrg return; 30787ec681f3Smrg } 30797ec681f3Smrg 30807ec681f3Smrg /* Message + FMA we'll count as arithmetic _and_ message */ 30817ec681f3Smrg if (tuple->fma) 30827ec681f3Smrg stats->nr_arith++; 30837ec681f3Smrg 30847ec681f3Smrg switch (clause->message_type) { 30857ec681f3Smrg case BIFROST_MESSAGE_VARYING: 30867ec681f3Smrg /* Check components interpolated */ 30877ec681f3Smrg stats->nr_varying += (clause->message->vecsize + 1) * 30887ec681f3Smrg (bi_is_regfmt_16(clause->message->register_format) ? 1 : 2); 30897ec681f3Smrg break; 30907ec681f3Smrg 30917ec681f3Smrg case BIFROST_MESSAGE_VARTEX: 30927ec681f3Smrg /* 2 coordinates, fp32 each */ 30937ec681f3Smrg stats->nr_varying += (2 * 2); 30947ec681f3Smrg FALLTHROUGH; 30957ec681f3Smrg case BIFROST_MESSAGE_TEX: 30967ec681f3Smrg stats->nr_texture++; 30977ec681f3Smrg break; 30987ec681f3Smrg 30997ec681f3Smrg case BIFROST_MESSAGE_ATTRIBUTE: 31007ec681f3Smrg case BIFROST_MESSAGE_LOAD: 31017ec681f3Smrg case BIFROST_MESSAGE_STORE: 31027ec681f3Smrg case BIFROST_MESSAGE_ATOMIC: 31037ec681f3Smrg stats->nr_ldst++; 31047ec681f3Smrg break; 31057ec681f3Smrg 31067ec681f3Smrg case BIFROST_MESSAGE_NONE: 31077ec681f3Smrg case BIFROST_MESSAGE_BARRIER: 31087ec681f3Smrg case BIFROST_MESSAGE_BLEND: 31097ec681f3Smrg case BIFROST_MESSAGE_TILE: 31107ec681f3Smrg case BIFROST_MESSAGE_Z_STENCIL: 31117ec681f3Smrg case BIFROST_MESSAGE_ATEST: 31127ec681f3Smrg case BIFROST_MESSAGE_JOB: 31137ec681f3Smrg case BIFROST_MESSAGE_64BIT: 31147ec681f3Smrg /* Nothing to do */ 31157ec681f3Smrg break; 31167ec681f3Smrg }; 31177ec681f3Smrg 31187ec681f3Smrg} 31197ec681f3Smrg 31207ec681f3Smrgstatic void 31217ec681f3Smrgbi_print_stats(bi_context *ctx, unsigned size, FILE *fp) 31227ec681f3Smrg{ 31237ec681f3Smrg struct bi_stats stats = { 0 }; 31247ec681f3Smrg 31257ec681f3Smrg /* Count instructions, clauses, and tuples. Also attempt to construct 31267ec681f3Smrg * normalized execution engine cycle counts, using the following ratio: 31277ec681f3Smrg * 31287ec681f3Smrg * 24 arith tuples/cycle 31297ec681f3Smrg * 2 texture messages/cycle 31307ec681f3Smrg * 16 x 16-bit varying channels interpolated/cycle 31317ec681f3Smrg * 1 load store message/cycle 31327ec681f3Smrg * 31337ec681f3Smrg * These numbers seem to match Arm Mobile Studio's heuristic. The real 31347ec681f3Smrg * cycle counts are surely more complicated. 31357ec681f3Smrg */ 31367ec681f3Smrg 31377ec681f3Smrg bi_foreach_block(ctx, block) { 31387ec681f3Smrg bi_foreach_clause_in_block(block, clause) { 31397ec681f3Smrg stats.nr_clauses++; 31407ec681f3Smrg stats.nr_tuples += clause->tuple_count; 31417ec681f3Smrg 31427ec681f3Smrg for (unsigned i = 0; i < clause->tuple_count; ++i) 31437ec681f3Smrg bi_count_tuple_stats(clause, &clause->tuples[i], &stats); 31447ec681f3Smrg } 31457ec681f3Smrg } 31467ec681f3Smrg 31477ec681f3Smrg float cycles_arith = ((float) stats.nr_arith) / 24.0; 31487ec681f3Smrg float cycles_texture = ((float) stats.nr_texture) / 2.0; 31497ec681f3Smrg float cycles_varying = ((float) stats.nr_varying) / 16.0; 31507ec681f3Smrg float cycles_ldst = ((float) stats.nr_ldst) / 1.0; 31517ec681f3Smrg 31527ec681f3Smrg float cycles_message = MAX3(cycles_texture, cycles_varying, cycles_ldst); 31537ec681f3Smrg float cycles_bound = MAX2(cycles_arith, cycles_message); 31547ec681f3Smrg 31557ec681f3Smrg /* Thread count and register pressure are traded off only on v7 */ 31567ec681f3Smrg bool full_threads = (ctx->arch == 7 && ctx->info->work_reg_count <= 32); 31577ec681f3Smrg unsigned nr_threads = full_threads ? 2 : 1; 31587ec681f3Smrg 31597ec681f3Smrg /* Dump stats */ 31607ec681f3Smrg 31617ec681f3Smrg fprintf(stderr, "%s - %s shader: " 31627ec681f3Smrg "%u inst, %u tuples, %u clauses, " 31637ec681f3Smrg "%f cycles, %f arith, %f texture, %f vary, %f ldst, " 31647ec681f3Smrg "%u quadwords, %u threads, %u loops, " 31657ec681f3Smrg "%u:%u spills:fills\n", 31667ec681f3Smrg ctx->nir->info.label ?: "", 31677ec681f3Smrg ctx->inputs->is_blend ? "PAN_SHADER_BLEND" : 31687ec681f3Smrg gl_shader_stage_name(ctx->stage), 31697ec681f3Smrg stats.nr_ins, stats.nr_tuples, stats.nr_clauses, 31707ec681f3Smrg cycles_bound, cycles_arith, cycles_texture, 31717ec681f3Smrg cycles_varying, cycles_ldst, 31727ec681f3Smrg size / 16, nr_threads, 31737ec681f3Smrg ctx->loop_count, 31747ec681f3Smrg ctx->spills, ctx->fills); 31757ec681f3Smrg} 31767ec681f3Smrg 31777ec681f3Smrgstatic int 31787ec681f3Smrgglsl_type_size(const struct glsl_type *type, bool bindless) 31797ec681f3Smrg{ 31807ec681f3Smrg return glsl_count_attribute_slots(type, false); 31817ec681f3Smrg} 31827ec681f3Smrg 31837ec681f3Smrg/* Split stores to memory. We don't split stores to vertex outputs, since 31847ec681f3Smrg * nir_lower_io_to_temporaries will ensure there's only a single write. 31857ec681f3Smrg */ 31867ec681f3Smrg 31877ec681f3Smrgstatic bool 31887ec681f3Smrgshould_split_wrmask(const nir_instr *instr, UNUSED const void *data) 31897ec681f3Smrg{ 31907ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 31917ec681f3Smrg 31927ec681f3Smrg switch (intr->intrinsic) { 31937ec681f3Smrg case nir_intrinsic_store_ssbo: 31947ec681f3Smrg case nir_intrinsic_store_shared: 31957ec681f3Smrg case nir_intrinsic_store_global: 31967ec681f3Smrg case nir_intrinsic_store_scratch: 31977ec681f3Smrg return true; 31987ec681f3Smrg default: 31997ec681f3Smrg return false; 32007ec681f3Smrg } 32017ec681f3Smrg} 32027ec681f3Smrg 32037ec681f3Smrg/* Bifrost wants transcendentals as FP32 */ 32047ec681f3Smrg 32057ec681f3Smrgstatic unsigned 32067ec681f3Smrgbi_lower_bit_size(const nir_instr *instr, UNUSED void *data) 32077ec681f3Smrg{ 32087ec681f3Smrg if (instr->type != nir_instr_type_alu) 32097ec681f3Smrg return 0; 32107ec681f3Smrg 32117ec681f3Smrg nir_alu_instr *alu = nir_instr_as_alu(instr); 32127ec681f3Smrg 32137ec681f3Smrg switch (alu->op) { 32147ec681f3Smrg case nir_op_fexp2: 32157ec681f3Smrg case nir_op_flog2: 32167ec681f3Smrg case nir_op_fpow: 32177ec681f3Smrg case nir_op_fsin: 32187ec681f3Smrg case nir_op_fcos: 32197ec681f3Smrg return (nir_dest_bit_size(alu->dest.dest) == 32) ? 0 : 32; 32207ec681f3Smrg default: 32217ec681f3Smrg return 0; 32227ec681f3Smrg } 32237ec681f3Smrg} 32247ec681f3Smrg 32257ec681f3Smrg/* Although Bifrost generally supports packed 16-bit vec2 and 8-bit vec4, 32267ec681f3Smrg * transcendentals are an exception. Also shifts because of lane size mismatch 32277ec681f3Smrg * (8-bit in Bifrost, 32-bit in NIR TODO - workaround!). Some conversions need 32287ec681f3Smrg * to be scalarized due to type size. */ 32297ec681f3Smrg 32307ec681f3Smrgstatic bool 32317ec681f3Smrgbi_vectorize_filter(const nir_instr *instr, void *data) 32327ec681f3Smrg{ 32337ec681f3Smrg /* Defaults work for everything else */ 32347ec681f3Smrg if (instr->type != nir_instr_type_alu) 32357ec681f3Smrg return true; 32367ec681f3Smrg 32377ec681f3Smrg const nir_alu_instr *alu = nir_instr_as_alu(instr); 32387ec681f3Smrg 32397ec681f3Smrg switch (alu->op) { 32407ec681f3Smrg case nir_op_frcp: 32417ec681f3Smrg case nir_op_frsq: 32427ec681f3Smrg case nir_op_ishl: 32437ec681f3Smrg case nir_op_ishr: 32447ec681f3Smrg case nir_op_ushr: 32457ec681f3Smrg case nir_op_f2i16: 32467ec681f3Smrg case nir_op_f2u16: 32477ec681f3Smrg case nir_op_i2f16: 32487ec681f3Smrg case nir_op_u2f16: 32497ec681f3Smrg return false; 32507ec681f3Smrg default: 32517ec681f3Smrg return true; 32527ec681f3Smrg } 32537ec681f3Smrg} 32547ec681f3Smrg 32557ec681f3Smrg/* XXX: This is a kludge to workaround NIR's lack of divergence metadata. If we 32567ec681f3Smrg * keep divergence info around after we consume it for indirect lowering, 32577ec681f3Smrg * nir_convert_from_ssa will regress code quality since it will avoid 32587ec681f3Smrg * coalescing divergent with non-divergent nodes. */ 32597ec681f3Smrg 32607ec681f3Smrgstatic bool 32617ec681f3Smrgnir_invalidate_divergence_ssa(nir_ssa_def *ssa, UNUSED void *data) 32627ec681f3Smrg{ 32637ec681f3Smrg ssa->divergent = false; 32647ec681f3Smrg return true; 32657ec681f3Smrg} 32667ec681f3Smrg 32677ec681f3Smrgstatic bool 32687ec681f3Smrgnir_invalidate_divergence(struct nir_builder *b, nir_instr *instr, 32697ec681f3Smrg UNUSED void *data) 32707ec681f3Smrg{ 32717ec681f3Smrg return nir_foreach_ssa_def(instr, nir_invalidate_divergence_ssa, NULL); 32727ec681f3Smrg} 32737ec681f3Smrg 32747ec681f3Smrg/* Ensure we write exactly 4 components */ 32757ec681f3Smrgstatic nir_ssa_def * 32767ec681f3Smrgbifrost_nir_valid_channel(nir_builder *b, nir_ssa_def *in, 32777ec681f3Smrg unsigned channel, unsigned first, unsigned mask) 32787ec681f3Smrg{ 32797ec681f3Smrg if (!(mask & BITFIELD_BIT(channel))) 32807ec681f3Smrg channel = first; 32817ec681f3Smrg 32827ec681f3Smrg return nir_channel(b, in, channel); 32837ec681f3Smrg} 32847ec681f3Smrg 32857ec681f3Smrg/* Lower fragment store_output instructions to always write 4 components, 32867ec681f3Smrg * matching the hardware semantic. This may require additional moves. Skipping 32877ec681f3Smrg * these moves is possible in theory, but invokes undefined behaviour in the 32887ec681f3Smrg * compiler. The DDK inserts these moves, so we will as well. */ 32897ec681f3Smrg 32907ec681f3Smrgstatic bool 32917ec681f3Smrgbifrost_nir_lower_blend_components(struct nir_builder *b, 32927ec681f3Smrg nir_instr *instr, void *data) 32937ec681f3Smrg{ 32947ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 32957ec681f3Smrg return false; 32967ec681f3Smrg 32977ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 32987ec681f3Smrg 32997ec681f3Smrg if (intr->intrinsic != nir_intrinsic_store_output) 33007ec681f3Smrg return false; 33017ec681f3Smrg 33027ec681f3Smrg nir_ssa_def *in = intr->src[0].ssa; 33037ec681f3Smrg unsigned first = nir_intrinsic_component(intr); 33047ec681f3Smrg unsigned mask = nir_intrinsic_write_mask(intr); 33057ec681f3Smrg 33067ec681f3Smrg assert(first == 0 && "shouldn't get nonzero components"); 33077ec681f3Smrg 33087ec681f3Smrg /* Nothing to do */ 33097ec681f3Smrg if (mask == BITFIELD_MASK(4)) 33107ec681f3Smrg return false; 33117ec681f3Smrg 33127ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 33137ec681f3Smrg 33147ec681f3Smrg /* Replicate the first valid component instead */ 33157ec681f3Smrg nir_ssa_def *replicated = 33167ec681f3Smrg nir_vec4(b, bifrost_nir_valid_channel(b, in, 0, first, mask), 33177ec681f3Smrg bifrost_nir_valid_channel(b, in, 1, first, mask), 33187ec681f3Smrg bifrost_nir_valid_channel(b, in, 2, first, mask), 33197ec681f3Smrg bifrost_nir_valid_channel(b, in, 3, first, mask)); 33207ec681f3Smrg 33217ec681f3Smrg /* Rewrite to use our replicated version */ 33227ec681f3Smrg nir_instr_rewrite_src_ssa(instr, &intr->src[0], replicated); 33237ec681f3Smrg nir_intrinsic_set_component(intr, 0); 33247ec681f3Smrg nir_intrinsic_set_write_mask(intr, 0xF); 33257ec681f3Smrg intr->num_components = 4; 33267ec681f3Smrg 33277ec681f3Smrg return true; 33287ec681f3Smrg} 33297ec681f3Smrg 33307ec681f3Smrgstatic void 33317ec681f3Smrgbi_optimize_nir(nir_shader *nir, unsigned gpu_id, bool is_blend) 33327ec681f3Smrg{ 33337ec681f3Smrg bool progress; 33347ec681f3Smrg unsigned lower_flrp = 16 | 32 | 64; 33357ec681f3Smrg 33367ec681f3Smrg NIR_PASS(progress, nir, nir_lower_regs_to_ssa); 33377ec681f3Smrg 33387ec681f3Smrg nir_lower_tex_options lower_tex_options = { 33397ec681f3Smrg .lower_txs_lod = true, 33407ec681f3Smrg .lower_txp = ~0, 33417ec681f3Smrg .lower_tg4_broadcom_swizzle = true, 33427ec681f3Smrg .lower_txd = true, 33437ec681f3Smrg }; 33447ec681f3Smrg 33457ec681f3Smrg NIR_PASS(progress, nir, pan_nir_lower_64bit_intrin); 33467ec681f3Smrg NIR_PASS(progress, nir, pan_lower_helper_invocation); 33477ec681f3Smrg 33487ec681f3Smrg NIR_PASS(progress, nir, nir_lower_int64); 33497ec681f3Smrg 33507ec681f3Smrg nir_lower_idiv_options idiv_options = { 33517ec681f3Smrg .imprecise_32bit_lowering = true, 33527ec681f3Smrg .allow_fp16 = true, 33537ec681f3Smrg }; 33547ec681f3Smrg NIR_PASS(progress, nir, nir_lower_idiv, &idiv_options); 33557ec681f3Smrg 33567ec681f3Smrg NIR_PASS(progress, nir, nir_lower_tex, &lower_tex_options); 33577ec681f3Smrg NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL); 33587ec681f3Smrg NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 33597ec681f3Smrg 33607ec681f3Smrg do { 33617ec681f3Smrg progress = false; 33627ec681f3Smrg 33637ec681f3Smrg NIR_PASS(progress, nir, nir_lower_var_copies); 33647ec681f3Smrg NIR_PASS(progress, nir, nir_lower_vars_to_ssa); 33657ec681f3Smrg NIR_PASS(progress, nir, nir_lower_wrmasks, should_split_wrmask, NULL); 33667ec681f3Smrg 33677ec681f3Smrg NIR_PASS(progress, nir, nir_copy_prop); 33687ec681f3Smrg NIR_PASS(progress, nir, nir_opt_remove_phis); 33697ec681f3Smrg NIR_PASS(progress, nir, nir_opt_dce); 33707ec681f3Smrg NIR_PASS(progress, nir, nir_opt_dead_cf); 33717ec681f3Smrg NIR_PASS(progress, nir, nir_opt_cse); 33727ec681f3Smrg NIR_PASS(progress, nir, nir_opt_peephole_select, 64, false, true); 33737ec681f3Smrg NIR_PASS(progress, nir, nir_opt_algebraic); 33747ec681f3Smrg NIR_PASS(progress, nir, nir_opt_constant_folding); 33757ec681f3Smrg 33767ec681f3Smrg NIR_PASS(progress, nir, nir_lower_alu); 33777ec681f3Smrg 33787ec681f3Smrg if (lower_flrp != 0) { 33797ec681f3Smrg bool lower_flrp_progress = false; 33807ec681f3Smrg NIR_PASS(lower_flrp_progress, 33817ec681f3Smrg nir, 33827ec681f3Smrg nir_lower_flrp, 33837ec681f3Smrg lower_flrp, 33847ec681f3Smrg false /* always_precise */); 33857ec681f3Smrg if (lower_flrp_progress) { 33867ec681f3Smrg NIR_PASS(progress, nir, 33877ec681f3Smrg nir_opt_constant_folding); 33887ec681f3Smrg progress = true; 33897ec681f3Smrg } 33907ec681f3Smrg 33917ec681f3Smrg /* Nothing should rematerialize any flrps, so we only 33927ec681f3Smrg * need to do this lowering once. 33937ec681f3Smrg */ 33947ec681f3Smrg lower_flrp = 0; 33957ec681f3Smrg } 33967ec681f3Smrg 33977ec681f3Smrg NIR_PASS(progress, nir, nir_opt_undef); 33987ec681f3Smrg NIR_PASS(progress, nir, nir_lower_undef_to_zero); 33997ec681f3Smrg 34007ec681f3Smrg NIR_PASS(progress, nir, nir_opt_loop_unroll); 34017ec681f3Smrg } while (progress); 34027ec681f3Smrg 34037ec681f3Smrg /* TODO: Why is 64-bit getting rematerialized? 34047ec681f3Smrg * KHR-GLES31.core.shader_image_load_store.basic-allTargets-atomicFS */ 34057ec681f3Smrg NIR_PASS(progress, nir, nir_lower_int64); 34067ec681f3Smrg 34077ec681f3Smrg /* We need to cleanup after each iteration of late algebraic 34087ec681f3Smrg * optimizations, since otherwise NIR can produce weird edge cases 34097ec681f3Smrg * (like fneg of a constant) which we don't handle */ 34107ec681f3Smrg bool late_algebraic = true; 34117ec681f3Smrg while (late_algebraic) { 34127ec681f3Smrg late_algebraic = false; 34137ec681f3Smrg NIR_PASS(late_algebraic, nir, nir_opt_algebraic_late); 34147ec681f3Smrg NIR_PASS(progress, nir, nir_opt_constant_folding); 34157ec681f3Smrg NIR_PASS(progress, nir, nir_copy_prop); 34167ec681f3Smrg NIR_PASS(progress, nir, nir_opt_dce); 34177ec681f3Smrg NIR_PASS(progress, nir, nir_opt_cse); 34187ec681f3Smrg } 34197ec681f3Smrg 34207ec681f3Smrg NIR_PASS(progress, nir, nir_lower_alu_to_scalar, NULL, NULL); 34217ec681f3Smrg NIR_PASS(progress, nir, nir_opt_vectorize, bi_vectorize_filter, NULL); 34227ec681f3Smrg NIR_PASS(progress, nir, nir_lower_load_const_to_scalar); 34237ec681f3Smrg NIR_PASS(progress, nir, nir_opt_dce); 34247ec681f3Smrg 34257ec681f3Smrg /* Prepass to simplify instruction selection */ 34267ec681f3Smrg NIR_PASS(progress, nir, bifrost_nir_lower_algebraic_late); 34277ec681f3Smrg NIR_PASS(progress, nir, nir_opt_dce); 34287ec681f3Smrg 34297ec681f3Smrg if (nir->info.stage == MESA_SHADER_FRAGMENT) { 34307ec681f3Smrg NIR_PASS_V(nir, nir_shader_instructions_pass, 34317ec681f3Smrg bifrost_nir_lower_blend_components, 34327ec681f3Smrg nir_metadata_block_index | nir_metadata_dominance, 34337ec681f3Smrg NULL); 34347ec681f3Smrg } 34357ec681f3Smrg 34367ec681f3Smrg /* Backend scheduler is purely local, so do some global optimizations 34377ec681f3Smrg * to reduce register pressure. */ 34387ec681f3Smrg nir_move_options move_all = 34397ec681f3Smrg nir_move_const_undef | nir_move_load_ubo | nir_move_load_input | 34407ec681f3Smrg nir_move_comparisons | nir_move_copies | nir_move_load_ssbo; 34417ec681f3Smrg 34427ec681f3Smrg NIR_PASS_V(nir, nir_opt_sink, move_all); 34437ec681f3Smrg NIR_PASS_V(nir, nir_opt_move, move_all); 34447ec681f3Smrg 34457ec681f3Smrg /* We might lower attribute, varying, and image indirects. Use the 34467ec681f3Smrg * gathered info to skip the extra analysis in the happy path. */ 34477ec681f3Smrg bool any_indirects = 34487ec681f3Smrg nir->info.inputs_read_indirectly || 34497ec681f3Smrg nir->info.outputs_accessed_indirectly || 34507ec681f3Smrg nir->info.patch_inputs_read_indirectly || 34517ec681f3Smrg nir->info.patch_outputs_accessed_indirectly || 34527ec681f3Smrg nir->info.images_used; 34537ec681f3Smrg 34547ec681f3Smrg if (any_indirects) { 34557ec681f3Smrg nir_convert_to_lcssa(nir, true, true); 34567ec681f3Smrg NIR_PASS_V(nir, nir_divergence_analysis); 34577ec681f3Smrg NIR_PASS_V(nir, bi_lower_divergent_indirects, 34587ec681f3Smrg bifrost_lanes_per_warp(gpu_id)); 34597ec681f3Smrg NIR_PASS_V(nir, nir_shader_instructions_pass, 34607ec681f3Smrg nir_invalidate_divergence, nir_metadata_all, NULL); 34617ec681f3Smrg } 34627ec681f3Smrg 34637ec681f3Smrg /* Take us out of SSA */ 34647ec681f3Smrg NIR_PASS(progress, nir, nir_lower_locals_to_regs); 34657ec681f3Smrg NIR_PASS(progress, nir, nir_move_vec_src_uses_to_dest); 34667ec681f3Smrg NIR_PASS(progress, nir, nir_convert_from_ssa, true); 34677ec681f3Smrg} 34687ec681f3Smrg 34697ec681f3Smrg/* The cmdstream lowers 8-bit fragment output as 16-bit, so we need to do the 34707ec681f3Smrg * same lowering here to zero-extend correctly */ 34717ec681f3Smrg 34727ec681f3Smrgstatic bool 34737ec681f3Smrgbifrost_nir_lower_i8_fragout_impl(struct nir_builder *b, 34747ec681f3Smrg nir_intrinsic_instr *intr, UNUSED void *data) 34757ec681f3Smrg{ 34767ec681f3Smrg if (nir_src_bit_size(intr->src[0]) != 8) 34777ec681f3Smrg return false; 34787ec681f3Smrg 34797ec681f3Smrg nir_alu_type type = 34807ec681f3Smrg nir_alu_type_get_base_type(nir_intrinsic_src_type(intr)); 34817ec681f3Smrg 34827ec681f3Smrg assert(type == nir_type_int || type == nir_type_uint); 34837ec681f3Smrg 34847ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 34857ec681f3Smrg nir_ssa_def *cast = nir_convert_to_bit_size(b, intr->src[0].ssa, type, 16); 34867ec681f3Smrg 34877ec681f3Smrg nir_intrinsic_set_src_type(intr, type | 16); 34887ec681f3Smrg nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0], cast); 34897ec681f3Smrg return true; 34907ec681f3Smrg} 34917ec681f3Smrg 34927ec681f3Smrgstatic bool 34937ec681f3Smrgbifrost_nir_lower_i8_fragin_impl(struct nir_builder *b, 34947ec681f3Smrg nir_intrinsic_instr *intr, UNUSED void *data) 34957ec681f3Smrg{ 34967ec681f3Smrg if (nir_dest_bit_size(intr->dest) != 8) 34977ec681f3Smrg return false; 34987ec681f3Smrg 34997ec681f3Smrg nir_alu_type type = 35007ec681f3Smrg nir_alu_type_get_base_type(nir_intrinsic_dest_type(intr)); 35017ec681f3Smrg 35027ec681f3Smrg assert(type == nir_type_int || type == nir_type_uint); 35037ec681f3Smrg 35047ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 35057ec681f3Smrg nir_ssa_def *out = 35067ec681f3Smrg nir_load_output(b, intr->num_components, 16, intr->src[0].ssa, 35077ec681f3Smrg .base = nir_intrinsic_base(intr), 35087ec681f3Smrg .component = nir_intrinsic_component(intr), 35097ec681f3Smrg .dest_type = type | 16, 35107ec681f3Smrg .io_semantics = nir_intrinsic_io_semantics(intr)); 35117ec681f3Smrg 35127ec681f3Smrg nir_ssa_def *cast = nir_convert_to_bit_size(b, out, type, 8); 35137ec681f3Smrg nir_ssa_def_rewrite_uses(&intr->dest.ssa, cast); 35147ec681f3Smrg return true; 35157ec681f3Smrg} 35167ec681f3Smrg 35177ec681f3Smrgstatic bool 35187ec681f3Smrgbifrost_nir_lower_i8_frag(struct nir_builder *b, 35197ec681f3Smrg nir_instr *instr, UNUSED void *data) 35207ec681f3Smrg{ 35217ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 35227ec681f3Smrg return false; 35237ec681f3Smrg 35247ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 35257ec681f3Smrg if (intr->intrinsic == nir_intrinsic_load_output) 35267ec681f3Smrg return bifrost_nir_lower_i8_fragin_impl(b, intr, data); 35277ec681f3Smrg else if (intr->intrinsic == nir_intrinsic_store_output) 35287ec681f3Smrg return bifrost_nir_lower_i8_fragout_impl(b, intr, data); 35297ec681f3Smrg else 35307ec681f3Smrg return false; 35317ec681f3Smrg} 35327ec681f3Smrg 35337ec681f3Smrgstatic void 35347ec681f3Smrgbi_opt_post_ra(bi_context *ctx) 35357ec681f3Smrg{ 35367ec681f3Smrg bi_foreach_instr_global_safe(ctx, ins) { 35377ec681f3Smrg if (ins->op == BI_OPCODE_MOV_I32 && bi_is_equiv(ins->dest[0], ins->src[0])) 35387ec681f3Smrg bi_remove_instruction(ins); 35397ec681f3Smrg } 35407ec681f3Smrg} 35417ec681f3Smrg 35427ec681f3Smrgstatic bool 35437ec681f3Smrgbifrost_nir_lower_store_component(struct nir_builder *b, 35447ec681f3Smrg nir_instr *instr, void *data) 35457ec681f3Smrg{ 35467ec681f3Smrg if (instr->type != nir_instr_type_intrinsic) 35477ec681f3Smrg return false; 35487ec681f3Smrg 35497ec681f3Smrg nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); 35507ec681f3Smrg 35517ec681f3Smrg if (intr->intrinsic != nir_intrinsic_store_output) 35527ec681f3Smrg return false; 35537ec681f3Smrg 35547ec681f3Smrg struct hash_table_u64 *slots = data; 35557ec681f3Smrg unsigned component = nir_intrinsic_component(intr); 35567ec681f3Smrg nir_src *slot_src = nir_get_io_offset_src(intr); 35577ec681f3Smrg uint64_t slot = nir_src_as_uint(*slot_src) + nir_intrinsic_base(intr); 35587ec681f3Smrg 35597ec681f3Smrg nir_intrinsic_instr *prev = _mesa_hash_table_u64_search(slots, slot); 35607ec681f3Smrg unsigned mask = (prev ? nir_intrinsic_write_mask(prev) : 0); 35617ec681f3Smrg 35627ec681f3Smrg nir_ssa_def *value = intr->src[0].ssa; 35637ec681f3Smrg b->cursor = nir_before_instr(&intr->instr); 35647ec681f3Smrg 35657ec681f3Smrg nir_ssa_def *undef = nir_ssa_undef(b, 1, value->bit_size); 35667ec681f3Smrg nir_ssa_def *channels[4] = { undef, undef, undef, undef }; 35677ec681f3Smrg 35687ec681f3Smrg /* Copy old */ 35697ec681f3Smrg u_foreach_bit(i, mask) { 35707ec681f3Smrg assert(prev != NULL); 35717ec681f3Smrg nir_ssa_def *prev_ssa = prev->src[0].ssa; 35727ec681f3Smrg channels[i] = nir_channel(b, prev_ssa, i); 35737ec681f3Smrg } 35747ec681f3Smrg 35757ec681f3Smrg /* Copy new */ 35767ec681f3Smrg unsigned new_mask = nir_intrinsic_write_mask(intr); 35777ec681f3Smrg mask |= (new_mask << component); 35787ec681f3Smrg 35797ec681f3Smrg u_foreach_bit(i, new_mask) { 35807ec681f3Smrg assert(component + i < 4); 35817ec681f3Smrg channels[component + i] = nir_channel(b, value, i); 35827ec681f3Smrg } 35837ec681f3Smrg 35847ec681f3Smrg intr->num_components = util_last_bit(mask); 35857ec681f3Smrg nir_instr_rewrite_src_ssa(instr, &intr->src[0], 35867ec681f3Smrg nir_vec(b, channels, intr->num_components)); 35877ec681f3Smrg 35887ec681f3Smrg nir_intrinsic_set_component(intr, 0); 35897ec681f3Smrg nir_intrinsic_set_write_mask(intr, mask); 35907ec681f3Smrg 35917ec681f3Smrg if (prev) { 35927ec681f3Smrg _mesa_hash_table_u64_remove(slots, slot); 35937ec681f3Smrg nir_instr_remove(&prev->instr); 35947ec681f3Smrg } 35957ec681f3Smrg 35967ec681f3Smrg _mesa_hash_table_u64_insert(slots, slot, intr); 35977ec681f3Smrg return false; 35987ec681f3Smrg} 35997ec681f3Smrg 36007ec681f3Smrg/* Dead code elimination for branches at the end of a block - only one branch 36017ec681f3Smrg * per block is legal semantically, but unreachable jumps can be generated. 36027ec681f3Smrg * Likewise we can generate jumps to the terminal block which need to be 36037ec681f3Smrg * lowered away to a jump to #0x0, which induces successful termination. */ 36047ec681f3Smrg 36057ec681f3Smrgstatic void 36067ec681f3Smrgbi_lower_branch(bi_block *block) 36077ec681f3Smrg{ 36087ec681f3Smrg bool branched = false; 36097ec681f3Smrg ASSERTED bool was_jump = false; 36107ec681f3Smrg 36117ec681f3Smrg bi_foreach_instr_in_block_safe(block, ins) { 36127ec681f3Smrg if (!ins->branch_target) continue; 36137ec681f3Smrg 36147ec681f3Smrg if (branched) { 36157ec681f3Smrg assert(was_jump && (ins->op == BI_OPCODE_JUMP)); 36167ec681f3Smrg bi_remove_instruction(ins); 36177ec681f3Smrg continue; 36187ec681f3Smrg } 36197ec681f3Smrg 36207ec681f3Smrg branched = true; 36217ec681f3Smrg was_jump = ins->op == BI_OPCODE_JUMP; 36227ec681f3Smrg 36237ec681f3Smrg if (bi_is_terminal_block(ins->branch_target)) 36247ec681f3Smrg ins->branch_target = NULL; 36257ec681f3Smrg } 36267ec681f3Smrg} 36277ec681f3Smrg 36287ec681f3Smrgstatic void 36297ec681f3Smrgbi_pack_clauses(bi_context *ctx, struct util_dynarray *binary) 36307ec681f3Smrg{ 36317ec681f3Smrg unsigned final_clause = bi_pack(ctx, binary); 36327ec681f3Smrg 36337ec681f3Smrg /* If we need to wait for ATEST or BLEND in the first clause, pass the 36347ec681f3Smrg * corresponding bits through to the renderer state descriptor */ 36357ec681f3Smrg bi_block *first_block = list_first_entry(&ctx->blocks, bi_block, link); 36367ec681f3Smrg bi_clause *first_clause = bi_next_clause(ctx, first_block, NULL); 36377ec681f3Smrg 36387ec681f3Smrg unsigned first_deps = first_clause ? first_clause->dependencies : 0; 36397ec681f3Smrg ctx->info->bifrost.wait_6 = (first_deps & (1 << 6)); 36407ec681f3Smrg ctx->info->bifrost.wait_7 = (first_deps & (1 << 7)); 36417ec681f3Smrg 36427ec681f3Smrg /* Pad the shader with enough zero bytes to trick the prefetcher, 36437ec681f3Smrg * unless we're compiling an empty shader (in which case we don't pad 36447ec681f3Smrg * so the size remains 0) */ 36457ec681f3Smrg unsigned prefetch_size = BIFROST_SHADER_PREFETCH - final_clause; 36467ec681f3Smrg 36477ec681f3Smrg if (binary->size) { 36487ec681f3Smrg memset(util_dynarray_grow(binary, uint8_t, prefetch_size), 36497ec681f3Smrg 0, prefetch_size); 36507ec681f3Smrg } 36517ec681f3Smrg} 36527ec681f3Smrg 36537ec681f3Smrgvoid 36547ec681f3Smrgbifrost_compile_shader_nir(nir_shader *nir, 36557ec681f3Smrg const struct panfrost_compile_inputs *inputs, 36567ec681f3Smrg struct util_dynarray *binary, 36577ec681f3Smrg struct pan_shader_info *info) 36587ec681f3Smrg{ 36597ec681f3Smrg bifrost_debug = debug_get_option_bifrost_debug(); 36607ec681f3Smrg 36617ec681f3Smrg bi_context *ctx = rzalloc(NULL, bi_context); 36627ec681f3Smrg ctx->sysval_to_id = panfrost_init_sysvals(&info->sysvals, ctx); 36637ec681f3Smrg 36647ec681f3Smrg ctx->inputs = inputs; 36657ec681f3Smrg ctx->nir = nir; 36667ec681f3Smrg ctx->info = info; 36677ec681f3Smrg ctx->stage = nir->info.stage; 36687ec681f3Smrg ctx->quirks = bifrost_get_quirks(inputs->gpu_id); 36697ec681f3Smrg ctx->arch = inputs->gpu_id >> 12; 36707ec681f3Smrg 36717ec681f3Smrg /* If nothing is pushed, all UBOs need to be uploaded */ 36727ec681f3Smrg ctx->ubo_mask = ~0; 36737ec681f3Smrg 36747ec681f3Smrg list_inithead(&ctx->blocks); 36757ec681f3Smrg 36767ec681f3Smrg /* Lower gl_Position pre-optimisation, but after lowering vars to ssa 36777ec681f3Smrg * (so we don't accidentally duplicate the epilogue since mesa/st has 36787ec681f3Smrg * messed with our I/O quite a bit already) */ 36797ec681f3Smrg 36807ec681f3Smrg NIR_PASS_V(nir, nir_lower_vars_to_ssa); 36817ec681f3Smrg 36827ec681f3Smrg if (ctx->stage == MESA_SHADER_VERTEX) { 36837ec681f3Smrg NIR_PASS_V(nir, nir_lower_viewport_transform); 36847ec681f3Smrg NIR_PASS_V(nir, nir_lower_point_size, 1.0, 1024.0); 36857ec681f3Smrg } 36867ec681f3Smrg 36877ec681f3Smrg /* Lower large arrays to scratch and small arrays to bcsel (TODO: tune 36887ec681f3Smrg * threshold, but not until addresses / csel is optimized better) */ 36897ec681f3Smrg NIR_PASS_V(nir, nir_lower_vars_to_scratch, nir_var_function_temp, 16, 36907ec681f3Smrg glsl_get_natural_size_align_bytes); 36917ec681f3Smrg NIR_PASS_V(nir, nir_lower_indirect_derefs, nir_var_function_temp, ~0); 36927ec681f3Smrg 36937ec681f3Smrg NIR_PASS_V(nir, nir_split_var_copies); 36947ec681f3Smrg NIR_PASS_V(nir, nir_lower_global_vars_to_local); 36957ec681f3Smrg NIR_PASS_V(nir, nir_lower_var_copies); 36967ec681f3Smrg NIR_PASS_V(nir, nir_lower_vars_to_ssa); 36977ec681f3Smrg NIR_PASS_V(nir, nir_lower_io, nir_var_shader_in | nir_var_shader_out, 36987ec681f3Smrg glsl_type_size, 0); 36997ec681f3Smrg 37007ec681f3Smrg if (ctx->stage == MESA_SHADER_FRAGMENT) { 37017ec681f3Smrg NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, 37027ec681f3Smrg ~0, false); 37037ec681f3Smrg } else { 37047ec681f3Smrg struct hash_table_u64 *stores = _mesa_hash_table_u64_create(ctx); 37057ec681f3Smrg NIR_PASS_V(nir, nir_shader_instructions_pass, 37067ec681f3Smrg bifrost_nir_lower_store_component, 37077ec681f3Smrg nir_metadata_block_index | 37087ec681f3Smrg nir_metadata_dominance, stores); 37097ec681f3Smrg _mesa_hash_table_u64_destroy(stores); 37107ec681f3Smrg } 37117ec681f3Smrg 37127ec681f3Smrg NIR_PASS_V(nir, nir_lower_ssbo); 37137ec681f3Smrg NIR_PASS_V(nir, pan_nir_lower_zs_store); 37147ec681f3Smrg NIR_PASS_V(nir, pan_lower_sample_pos); 37157ec681f3Smrg NIR_PASS_V(nir, nir_lower_bit_size, bi_lower_bit_size, NULL); 37167ec681f3Smrg 37177ec681f3Smrg if (nir->info.stage == MESA_SHADER_FRAGMENT) { 37187ec681f3Smrg NIR_PASS_V(nir, nir_shader_instructions_pass, 37197ec681f3Smrg bifrost_nir_lower_i8_frag, 37207ec681f3Smrg nir_metadata_block_index | nir_metadata_dominance, 37217ec681f3Smrg NULL); 37227ec681f3Smrg } 37237ec681f3Smrg 37247ec681f3Smrg bi_optimize_nir(nir, ctx->inputs->gpu_id, ctx->inputs->is_blend); 37257ec681f3Smrg 37267ec681f3Smrg NIR_PASS_V(nir, pan_nir_reorder_writeout); 37277ec681f3Smrg 37287ec681f3Smrg bool skip_internal = nir->info.internal; 37297ec681f3Smrg skip_internal &= !(bifrost_debug & BIFROST_DBG_INTERNAL); 37307ec681f3Smrg 37317ec681f3Smrg if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { 37327ec681f3Smrg nir_print_shader(nir, stdout); 37337ec681f3Smrg } 37347ec681f3Smrg 37357ec681f3Smrg info->tls_size = nir->scratch_size; 37367ec681f3Smrg 37377ec681f3Smrg nir_foreach_function(func, nir) { 37387ec681f3Smrg if (!func->impl) 37397ec681f3Smrg continue; 37407ec681f3Smrg 37417ec681f3Smrg ctx->ssa_alloc += func->impl->ssa_alloc; 37427ec681f3Smrg ctx->reg_alloc += func->impl->reg_alloc; 37437ec681f3Smrg 37447ec681f3Smrg emit_cf_list(ctx, &func->impl->body); 37457ec681f3Smrg break; /* TODO: Multi-function shaders */ 37467ec681f3Smrg } 37477ec681f3Smrg 37487ec681f3Smrg unsigned block_source_count = 0; 37497ec681f3Smrg 37507ec681f3Smrg bi_foreach_block(ctx, block) { 37517ec681f3Smrg /* Name blocks now that we're done emitting so the order is 37527ec681f3Smrg * consistent */ 37537ec681f3Smrg block->name = block_source_count++; 37547ec681f3Smrg } 37557ec681f3Smrg 37567ec681f3Smrg bi_validate(ctx, "NIR -> BIR"); 37577ec681f3Smrg 37587ec681f3Smrg /* If the shader doesn't write any colour or depth outputs, it may 37597ec681f3Smrg * still need an ATEST at the very end! */ 37607ec681f3Smrg bool need_dummy_atest = 37617ec681f3Smrg (ctx->stage == MESA_SHADER_FRAGMENT) && 37627ec681f3Smrg !ctx->emitted_atest && 37637ec681f3Smrg !bi_skip_atest(ctx, false); 37647ec681f3Smrg 37657ec681f3Smrg if (need_dummy_atest) { 37667ec681f3Smrg bi_block *end = list_last_entry(&ctx->blocks, bi_block, link); 37677ec681f3Smrg bi_builder b = bi_init_builder(ctx, bi_after_block(end)); 37687ec681f3Smrg bi_emit_atest(&b, bi_zero()); 37697ec681f3Smrg } 37707ec681f3Smrg 37717ec681f3Smrg bool optimize = !(bifrost_debug & BIFROST_DBG_NOOPT); 37727ec681f3Smrg 37737ec681f3Smrg /* Runs before constant folding */ 37747ec681f3Smrg bi_lower_swizzle(ctx); 37757ec681f3Smrg bi_validate(ctx, "Early lowering"); 37767ec681f3Smrg 37777ec681f3Smrg /* Runs before copy prop */ 37787ec681f3Smrg if (optimize && !ctx->inputs->no_ubo_to_push) { 37797ec681f3Smrg bi_opt_push_ubo(ctx); 37807ec681f3Smrg } 37817ec681f3Smrg 37827ec681f3Smrg if (likely(optimize)) { 37837ec681f3Smrg bi_opt_copy_prop(ctx); 37847ec681f3Smrg bi_opt_constant_fold(ctx); 37857ec681f3Smrg bi_opt_copy_prop(ctx); 37867ec681f3Smrg bi_opt_mod_prop_forward(ctx); 37877ec681f3Smrg bi_opt_mod_prop_backward(ctx); 37887ec681f3Smrg bi_opt_dead_code_eliminate(ctx); 37897ec681f3Smrg bi_opt_cse(ctx); 37907ec681f3Smrg bi_opt_dead_code_eliminate(ctx); 37917ec681f3Smrg bi_validate(ctx, "Optimization passes"); 37927ec681f3Smrg } 37937ec681f3Smrg 37947ec681f3Smrg bi_foreach_instr_global(ctx, I) { 37957ec681f3Smrg bi_lower_opt_instruction(I); 37967ec681f3Smrg } 37977ec681f3Smrg 37987ec681f3Smrg bi_foreach_block(ctx, block) { 37997ec681f3Smrg bi_lower_branch(block); 38007ec681f3Smrg } 38017ec681f3Smrg 38027ec681f3Smrg if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 38037ec681f3Smrg bi_print_shader(ctx, stdout); 38047ec681f3Smrg bi_lower_fau(ctx); 38057ec681f3Smrg 38067ec681f3Smrg /* Analyze before register allocation to avoid false dependencies. The 38077ec681f3Smrg * skip bit is a function of only the data flow graph and is invariant 38087ec681f3Smrg * under valid scheduling. */ 38097ec681f3Smrg bi_analyze_helper_requirements(ctx); 38107ec681f3Smrg bi_validate(ctx, "Late lowering"); 38117ec681f3Smrg 38127ec681f3Smrg bi_register_allocate(ctx); 38137ec681f3Smrg 38147ec681f3Smrg if (likely(optimize)) 38157ec681f3Smrg bi_opt_post_ra(ctx); 38167ec681f3Smrg 38177ec681f3Smrg if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 38187ec681f3Smrg bi_print_shader(ctx, stdout); 38197ec681f3Smrg 38207ec681f3Smrg if (ctx->arch <= 8) { 38217ec681f3Smrg bi_schedule(ctx); 38227ec681f3Smrg bi_assign_scoreboard(ctx); 38237ec681f3Smrg } 38247ec681f3Smrg 38257ec681f3Smrg /* Analyze after scheduling since we depend on instruction order. */ 38267ec681f3Smrg bi_analyze_helper_terminate(ctx); 38277ec681f3Smrg 38287ec681f3Smrg if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) 38297ec681f3Smrg bi_print_shader(ctx, stdout); 38307ec681f3Smrg 38317ec681f3Smrg if (ctx->arch <= 8) { 38327ec681f3Smrg bi_pack_clauses(ctx, binary); 38337ec681f3Smrg } else { 38347ec681f3Smrg /* TODO: pack flat */ 38357ec681f3Smrg } 38367ec681f3Smrg 38377ec681f3Smrg info->ubo_mask = ctx->ubo_mask & BITSET_MASK(ctx->nir->info.num_ubos); 38387ec681f3Smrg 38397ec681f3Smrg if (bifrost_debug & BIFROST_DBG_SHADERS && !skip_internal) { 38407ec681f3Smrg disassemble_bifrost(stdout, binary->data, binary->size, 38417ec681f3Smrg bifrost_debug & BIFROST_DBG_VERBOSE); 38427ec681f3Smrg fflush(stdout); 38437ec681f3Smrg } 38447ec681f3Smrg 38457ec681f3Smrg if ((bifrost_debug & BIFROST_DBG_SHADERDB || inputs->shaderdb) && 38467ec681f3Smrg !skip_internal) { 38477ec681f3Smrg bi_print_stats(ctx, binary->size, stderr); 38487ec681f3Smrg } 38497ec681f3Smrg 38507ec681f3Smrg _mesa_hash_table_u64_destroy(ctx->sysval_to_id); 38517ec681f3Smrg ralloc_free(ctx); 38527ec681f3Smrg} 3853