101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2016 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 2101e04c3fSmrg * IN THE SOFTWARE. 2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg#include "nir.h" 2501e04c3fSmrg#include "nir_builder.h" 2601e04c3fSmrg 277ec681f3Smrg#define COND_LOWER_OP(b, name, ...) \ 287ec681f3Smrg (b->shader->options->lower_int64_options & \ 297ec681f3Smrg nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \ 307ec681f3Smrg lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__) 317ec681f3Smrg 327ec681f3Smrg#define COND_LOWER_CMP(b, name, ...) 
\ 337ec681f3Smrg (b->shader->options->lower_int64_options & \ 347ec681f3Smrg nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \ 357ec681f3Smrg lower_int64_compare(b, nir_op_##name, __VA_ARGS__) : \ 367ec681f3Smrg nir_##name(b, __VA_ARGS__) 377ec681f3Smrg 387ec681f3Smrg#define COND_LOWER_CAST(b, name, ...) \ 397ec681f3Smrg (b->shader->options->lower_int64_options & \ 407ec681f3Smrg nir_lower_int64_op_to_options_mask(nir_op_##name)) ? \ 417ec681f3Smrg lower_##name(b, __VA_ARGS__) : \ 427ec681f3Smrg nir_##name(b, __VA_ARGS__) 437ec681f3Smrg 447e102996Smayastatic nir_ssa_def * 457e102996Smayalower_b2i64(nir_builder *b, nir_ssa_def *x) 467e102996Smaya{ 477e102996Smaya return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0)); 487e102996Smaya} 497e102996Smaya 507e102996Smayastatic nir_ssa_def * 517e102996Smayalower_i2b(nir_builder *b, nir_ssa_def *x) 527e102996Smaya{ 537e102996Smaya return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x), 547e102996Smaya nir_unpack_64_2x32_split_y(b, x)), 557e102996Smaya nir_imm_int(b, 0)); 567e102996Smaya} 577e102996Smaya 587e102996Smayastatic nir_ssa_def * 597e102996Smayalower_i2i8(nir_builder *b, nir_ssa_def *x) 607e102996Smaya{ 617e102996Smaya return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x)); 627e102996Smaya} 637e102996Smaya 647e102996Smayastatic nir_ssa_def * 657e102996Smayalower_i2i16(nir_builder *b, nir_ssa_def *x) 667e102996Smaya{ 677e102996Smaya return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x)); 687e102996Smaya} 697e102996Smaya 707e102996Smaya 717e102996Smayastatic nir_ssa_def * 727e102996Smayalower_i2i32(nir_builder *b, nir_ssa_def *x) 737e102996Smaya{ 747e102996Smaya return nir_unpack_64_2x32_split_x(b, x); 757e102996Smaya} 767e102996Smaya 777e102996Smayastatic nir_ssa_def * 787e102996Smayalower_i2i64(nir_builder *b, nir_ssa_def *x) 797e102996Smaya{ 807e102996Smaya nir_ssa_def *x32 = x->bit_size == 32 ? 
x : nir_i2i32(b, x); 817ec681f3Smrg return nir_pack_64_2x32_split(b, x32, nir_ishr_imm(b, x32, 31)); 827e102996Smaya} 837e102996Smaya 847e102996Smayastatic nir_ssa_def * 857e102996Smayalower_u2u8(nir_builder *b, nir_ssa_def *x) 867e102996Smaya{ 877e102996Smaya return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x)); 887e102996Smaya} 897e102996Smaya 907e102996Smayastatic nir_ssa_def * 917e102996Smayalower_u2u16(nir_builder *b, nir_ssa_def *x) 927e102996Smaya{ 937e102996Smaya return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x)); 947e102996Smaya} 957e102996Smaya 967e102996Smayastatic nir_ssa_def * 977e102996Smayalower_u2u32(nir_builder *b, nir_ssa_def *x) 987e102996Smaya{ 997e102996Smaya return nir_unpack_64_2x32_split_x(b, x); 1007e102996Smaya} 1017e102996Smaya 1027e102996Smayastatic nir_ssa_def * 1037e102996Smayalower_u2u64(nir_builder *b, nir_ssa_def *x) 1047e102996Smaya{ 1057e102996Smaya nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x); 1067e102996Smaya return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0)); 1077e102996Smaya} 1087e102996Smaya 1097e102996Smayastatic nir_ssa_def * 1107e102996Smayalower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y) 1117e102996Smaya{ 1127e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1137e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1147e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 1157e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 1167e102996Smaya 1177e102996Smaya return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo), 1187e102996Smaya nir_bcsel(b, cond, x_hi, y_hi)); 1197e102996Smaya} 1207e102996Smaya 1217e102996Smayastatic nir_ssa_def * 1227e102996Smayalower_inot64(nir_builder *b, nir_ssa_def *x) 1237e102996Smaya{ 1247e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1257e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1267e102996Smaya 1277e102996Smaya return 
nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi)); 1287e102996Smaya} 1297e102996Smaya 1307e102996Smayastatic nir_ssa_def * 1317e102996Smayalower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 1327e102996Smaya{ 1337e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1347e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1357e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 1367e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 1377e102996Smaya 1387e102996Smaya return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo), 1397e102996Smaya nir_iand(b, x_hi, y_hi)); 1407e102996Smaya} 1417e102996Smaya 1427e102996Smayastatic nir_ssa_def * 1437e102996Smayalower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 1447e102996Smaya{ 1457e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1467e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1477e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 1487e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 1497e102996Smaya 1507e102996Smaya return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo), 1517e102996Smaya nir_ior(b, x_hi, y_hi)); 1527e102996Smaya} 1537e102996Smaya 1547e102996Smayastatic nir_ssa_def * 1557e102996Smayalower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 1567e102996Smaya{ 1577e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1587e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1597e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 1607e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 1617e102996Smaya 1627e102996Smaya return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo), 1637e102996Smaya nir_ixor(b, x_hi, y_hi)); 1647e102996Smaya} 1657e102996Smaya 1667e102996Smayastatic nir_ssa_def * 1677e102996Smayalower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 1687e102996Smaya{ 
1697e102996Smaya /* Implemented as 1707e102996Smaya * 1717e102996Smaya * uint64_t lshift(uint64_t x, int c) 1727e102996Smaya * { 1737e102996Smaya * if (c == 0) return x; 1747e102996Smaya * 1757e102996Smaya * uint32_t lo = LO(x), hi = HI(x); 1767e102996Smaya * 1777e102996Smaya * if (c < 32) { 1787e102996Smaya * uint32_t lo_shifted = lo << c; 1797e102996Smaya * uint32_t hi_shifted = hi << c; 1807e102996Smaya * uint32_t lo_shifted_hi = lo >> abs(32 - c); 1817e102996Smaya * return pack_64(lo_shifted, hi_shifted | lo_shifted_hi); 1827e102996Smaya * } else { 1837e102996Smaya * uint32_t lo_shifted_hi = lo << abs(32 - c); 1847e102996Smaya * return pack_64(0, lo_shifted_hi); 1857e102996Smaya * } 1867e102996Smaya * } 1877e102996Smaya */ 1887e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 1897e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 1907e102996Smaya 1917e102996Smaya nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32))); 1927e102996Smaya nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y); 1937e102996Smaya nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y); 1947e102996Smaya nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count); 1957e102996Smaya 1967e102996Smaya nir_ssa_def *res_if_lt_32 = 1977e102996Smaya nir_pack_64_2x32_split(b, lo_shifted, 1987e102996Smaya nir_ior(b, hi_shifted, lo_shifted_hi)); 1997e102996Smaya nir_ssa_def *res_if_ge_32 = 2007e102996Smaya nir_pack_64_2x32_split(b, nir_imm_int(b, 0), 2017e102996Smaya nir_ishl(b, x_lo, reverse_count)); 2027e102996Smaya 2037ec681f3Smrg return nir_bcsel(b, nir_ieq_imm(b, y, 0), x, 2047e102996Smaya nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), 2057e102996Smaya res_if_ge_32, res_if_lt_32)); 2067e102996Smaya} 2077e102996Smaya 2087e102996Smayastatic nir_ssa_def * 2097e102996Smayalower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 2107e102996Smaya{ 2117e102996Smaya /* Implemented as 2127e102996Smaya * 2137e102996Smaya * uint64_t arshift(uint64_t x, int 
c) 2147e102996Smaya * { 2157e102996Smaya * if (c == 0) return x; 2167e102996Smaya * 2177e102996Smaya * uint32_t lo = LO(x); 2187e102996Smaya * int32_t hi = HI(x); 2197e102996Smaya * 2207e102996Smaya * if (c < 32) { 2217e102996Smaya * uint32_t lo_shifted = lo >> c; 2227e102996Smaya * uint32_t hi_shifted = hi >> c; 2237e102996Smaya * uint32_t hi_shifted_lo = hi << abs(32 - c); 2247e102996Smaya * return pack_64(hi_shifted, hi_shifted_lo | lo_shifted); 2257e102996Smaya * } else { 2267e102996Smaya * uint32_t hi_shifted = hi >> 31; 2277e102996Smaya * uint32_t hi_shifted_lo = hi >> abs(32 - c); 2287e102996Smaya * return pack_64(hi_shifted, hi_shifted_lo); 2297e102996Smaya * } 2307e102996Smaya * } 2317e102996Smaya */ 2327e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 2337e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 2347e102996Smaya 2357e102996Smaya nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32))); 2367e102996Smaya nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y); 2377e102996Smaya nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y); 2387e102996Smaya nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count); 2397e102996Smaya 2407e102996Smaya nir_ssa_def *res_if_lt_32 = 2417e102996Smaya nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo), 2427e102996Smaya hi_shifted); 2437e102996Smaya nir_ssa_def *res_if_ge_32 = 2447e102996Smaya nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count), 2457e102996Smaya nir_ishr(b, x_hi, nir_imm_int(b, 31))); 2467e102996Smaya 2477ec681f3Smrg return nir_bcsel(b, nir_ieq_imm(b, y, 0), x, 2487e102996Smaya nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), 2497e102996Smaya res_if_ge_32, res_if_lt_32)); 2507e102996Smaya} 2517e102996Smaya 2527e102996Smayastatic nir_ssa_def * 2537e102996Smayalower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 2547e102996Smaya{ 2557e102996Smaya /* Implemented as 2567e102996Smaya * 2577e102996Smaya * uint64_t rshift(uint64_t x, int 
c) 2587e102996Smaya * { 2597e102996Smaya * if (c == 0) return x; 2607e102996Smaya * 2617e102996Smaya * uint32_t lo = LO(x), hi = HI(x); 2627e102996Smaya * 2637e102996Smaya * if (c < 32) { 2647e102996Smaya * uint32_t lo_shifted = lo >> c; 2657e102996Smaya * uint32_t hi_shifted = hi >> c; 2667e102996Smaya * uint32_t hi_shifted_lo = hi << abs(32 - c); 2677e102996Smaya * return pack_64(hi_shifted, hi_shifted_lo | lo_shifted); 2687e102996Smaya * } else { 2697e102996Smaya * uint32_t hi_shifted_lo = hi >> abs(32 - c); 2707e102996Smaya * return pack_64(0, hi_shifted_lo); 2717e102996Smaya * } 2727e102996Smaya * } 2737e102996Smaya */ 2747e102996Smaya 2757e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 2767e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 2777e102996Smaya 2787e102996Smaya nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32))); 2797e102996Smaya nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y); 2807e102996Smaya nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y); 2817e102996Smaya nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count); 2827e102996Smaya 2837e102996Smaya nir_ssa_def *res_if_lt_32 = 2847e102996Smaya nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo), 2857e102996Smaya hi_shifted); 2867e102996Smaya nir_ssa_def *res_if_ge_32 = 2877e102996Smaya nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count), 2887e102996Smaya nir_imm_int(b, 0)); 2897e102996Smaya 2907ec681f3Smrg return nir_bcsel(b, nir_ieq_imm(b, y, 0), x, 2917e102996Smaya nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)), 2927e102996Smaya res_if_ge_32, res_if_lt_32)); 2937e102996Smaya} 2947e102996Smaya 2957e102996Smayastatic nir_ssa_def * 2967e102996Smayalower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 2977e102996Smaya{ 2987e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 2997e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 3007e102996Smaya nir_ssa_def *y_lo = 
nir_unpack_64_2x32_split_x(b, y); 3017e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 3027e102996Smaya 3037e102996Smaya nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo); 3047e102996Smaya nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo)); 3057e102996Smaya nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi)); 3067e102996Smaya 3077e102996Smaya return nir_pack_64_2x32_split(b, res_lo, res_hi); 3087e102996Smaya} 3097e102996Smaya 3107e102996Smayastatic nir_ssa_def * 3117e102996Smayalower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 3127e102996Smaya{ 3137e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 3147e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 3157e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 3167e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 3177e102996Smaya 3187e102996Smaya nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo); 3197e102996Smaya nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo))); 3207e102996Smaya nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow); 3217e102996Smaya 3227e102996Smaya return nir_pack_64_2x32_split(b, res_lo, res_hi); 3237e102996Smaya} 3247e102996Smaya 3257e102996Smayastatic nir_ssa_def * 3267e102996Smayalower_ineg64(nir_builder *b, nir_ssa_def *x) 3277e102996Smaya{ 3287e102996Smaya /* Since isub is the same number of instructions (with better dependencies) 3297e102996Smaya * as iadd, subtraction is actually more efficient for ineg than the usual 3307e102996Smaya * 2's complement "flip the bits and add one". 
3317e102996Smaya */ 3327e102996Smaya return lower_isub64(b, nir_imm_int64(b, 0), x); 3337e102996Smaya} 3347e102996Smaya 3357e102996Smayastatic nir_ssa_def * 3367e102996Smayalower_iabs64(nir_builder *b, nir_ssa_def *x) 3377e102996Smaya{ 3387e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 3397e102996Smaya nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0)); 3407e102996Smaya return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x); 3417e102996Smaya} 3427e102996Smaya 3437e102996Smayastatic nir_ssa_def * 3447e102996Smayalower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y) 3457e102996Smaya{ 3467e102996Smaya nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 3477e102996Smaya nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 3487e102996Smaya nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 3497e102996Smaya nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 3507e102996Smaya 3517e102996Smaya switch (op) { 3527e102996Smaya case nir_op_ieq: 3537e102996Smaya return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo)); 3547e102996Smaya case nir_op_ine: 3557e102996Smaya return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo)); 3567e102996Smaya case nir_op_ult: 3577e102996Smaya return nir_ior(b, nir_ult(b, x_hi, y_hi), 3587e102996Smaya nir_iand(b, nir_ieq(b, x_hi, y_hi), 3597e102996Smaya nir_ult(b, x_lo, y_lo))); 3607e102996Smaya case nir_op_ilt: 3617e102996Smaya return nir_ior(b, nir_ilt(b, x_hi, y_hi), 3627e102996Smaya nir_iand(b, nir_ieq(b, x_hi, y_hi), 3637e102996Smaya nir_ult(b, x_lo, y_lo))); 3647e102996Smaya break; 3657e102996Smaya case nir_op_uge: 3667e102996Smaya /* Lower as !(x < y) in the hopes of better CSE */ 3677e102996Smaya return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y)); 3687e102996Smaya case nir_op_ige: 3697e102996Smaya /* Lower as !(x < y) in the hopes of better CSE */ 3707e102996Smaya return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y)); 3717e102996Smaya default: 
3727e102996Smaya unreachable("Invalid comparison"); 3737e102996Smaya } 3747e102996Smaya} 3757e102996Smaya 3767e102996Smayastatic nir_ssa_def * 3777e102996Smayalower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 3787e102996Smaya{ 3797e102996Smaya return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x); 3807e102996Smaya} 3817e102996Smaya 3827e102996Smayastatic nir_ssa_def * 3837e102996Smayalower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 3847e102996Smaya{ 3857e102996Smaya return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x); 3867e102996Smaya} 3877e102996Smaya 3887e102996Smayastatic nir_ssa_def * 3897e102996Smayalower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 3907e102996Smaya{ 3917e102996Smaya return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y); 3927e102996Smaya} 3937e102996Smaya 3947e102996Smayastatic nir_ssa_def * 3957e102996Smayalower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 3967e102996Smaya{ 3977e102996Smaya return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y); 3987e102996Smaya} 3997e102996Smaya 4007e102996Smayastatic nir_ssa_def * 4017e102996Smayalower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, 4027e102996Smaya bool sign_extend) 4037e102996Smaya{ 4047e102996Smaya nir_ssa_def *res_hi = sign_extend ? 
nir_imul_high(b, x, y) 4057e102996Smaya : nir_umul_high(b, x, y); 4067e102996Smaya 4077e102996Smaya return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi); 4087e102996Smaya} 4097e102996Smaya 41001e04c3fSmrgstatic nir_ssa_def * 41101e04c3fSmrglower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y) 41201e04c3fSmrg{ 41301e04c3fSmrg nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 41401e04c3fSmrg nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 41501e04c3fSmrg nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y); 41601e04c3fSmrg nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y); 41701e04c3fSmrg 4187e102996Smaya nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo); 4197e102996Smaya nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo), 42001e04c3fSmrg nir_iadd(b, nir_imul(b, x_lo, y_hi), 42101e04c3fSmrg nir_imul(b, x_hi, y_lo))); 42201e04c3fSmrg 4237e102996Smaya return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo), 4247e102996Smaya res_hi); 4257e102996Smaya} 4267e102996Smaya 4277e102996Smayastatic nir_ssa_def * 4287e102996Smayalower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y, 4297e102996Smaya bool sign_extend) 4307e102996Smaya{ 4317e102996Smaya nir_ssa_def *x32[4], *y32[4]; 4327e102996Smaya x32[0] = nir_unpack_64_2x32_split_x(b, x); 4337e102996Smaya x32[1] = nir_unpack_64_2x32_split_y(b, x); 4347e102996Smaya if (sign_extend) { 4357ec681f3Smrg x32[2] = x32[3] = nir_ishr_imm(b, x32[1], 31); 4367e102996Smaya } else { 4377e102996Smaya x32[2] = x32[3] = nir_imm_int(b, 0); 4387e102996Smaya } 4397e102996Smaya 4407e102996Smaya y32[0] = nir_unpack_64_2x32_split_x(b, y); 4417e102996Smaya y32[1] = nir_unpack_64_2x32_split_y(b, y); 4427e102996Smaya if (sign_extend) { 4437ec681f3Smrg y32[2] = y32[3] = nir_ishr_imm(b, y32[1], 31); 4447e102996Smaya } else { 4457e102996Smaya y32[2] = y32[3] = nir_imm_int(b, 0); 4467e102996Smaya } 4477e102996Smaya 4487e102996Smaya nir_ssa_def *res[8] = { NULL, }; 4497e102996Smaya 
4507e102996Smaya /* Yes, the following generates a pile of code. However, we throw res[0] 4517e102996Smaya * and res[1] away in the end and, if we're in the umul case, four of our 4527e102996Smaya * eight dword operands will be constant zero and opt_algebraic will clean 4537e102996Smaya * this up nicely. 4547e102996Smaya */ 4557e102996Smaya for (unsigned i = 0; i < 4; i++) { 4567e102996Smaya nir_ssa_def *carry = NULL; 4577e102996Smaya for (unsigned j = 0; j < 4; j++) { 4587e102996Smaya /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the 4597e102996Smaya * maximum value of tmp is UINT32_MAX * UINT32_MAX. The maximum 4607e102996Smaya * value that will fit in tmp is 4617e102996Smaya * 4627e102996Smaya * UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX 4637e102996Smaya * = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX 4647e102996Smaya * = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX 4657e102996Smaya * 4667e102996Smaya * so we're guaranteed that we can add in two more 32-bit values 4677e102996Smaya * without overflowing tmp. 
4687e102996Smaya */ 4697e102996Smaya nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[i]); 4707e102996Smaya 4717e102996Smaya if (res[i + j]) 4727e102996Smaya tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j])); 4737e102996Smaya if (carry) 4747e102996Smaya tmp = nir_iadd(b, tmp, carry); 4757e102996Smaya res[i + j] = nir_u2u32(b, tmp); 4767ec681f3Smrg carry = nir_ushr_imm(b, tmp, 32); 4777e102996Smaya } 4787e102996Smaya res[i + 4] = nir_u2u32(b, carry); 4797e102996Smaya } 4807e102996Smaya 4817e102996Smaya return nir_pack_64_2x32_split(b, res[2], res[3]); 48201e04c3fSmrg} 48301e04c3fSmrg 48401e04c3fSmrgstatic nir_ssa_def * 48501e04c3fSmrglower_isign64(nir_builder *b, nir_ssa_def *x) 48601e04c3fSmrg{ 48701e04c3fSmrg nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x); 48801e04c3fSmrg nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x); 48901e04c3fSmrg 49001e04c3fSmrg nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi)); 4917ec681f3Smrg nir_ssa_def *res_hi = nir_ishr_imm(b, x_hi, 31); 4927e102996Smaya nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero)); 49301e04c3fSmrg 49401e04c3fSmrg return nir_pack_64_2x32_split(b, res_lo, res_hi); 49501e04c3fSmrg} 49601e04c3fSmrg 49701e04c3fSmrgstatic void 49801e04c3fSmrglower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d, 49901e04c3fSmrg nir_ssa_def **q, nir_ssa_def **r) 50001e04c3fSmrg{ 50101e04c3fSmrg /* TODO: We should specially handle the case where the denominator is a 50201e04c3fSmrg * constant. In that case, we should be able to reduce it to a multiply by 50301e04c3fSmrg * a constant, some shifts, and an add. 
50401e04c3fSmrg */ 50501e04c3fSmrg nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n); 50601e04c3fSmrg nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n); 50701e04c3fSmrg nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d); 50801e04c3fSmrg nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d); 50901e04c3fSmrg 5107e102996Smaya nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32); 5117e102996Smaya nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32); 51201e04c3fSmrg 51301e04c3fSmrg nir_ssa_def *n_hi_before_if = n_hi; 51401e04c3fSmrg nir_ssa_def *q_hi_before_if = q_hi; 51501e04c3fSmrg 51601e04c3fSmrg /* If the upper 32 bits of denom are non-zero, it is impossible for shifts 51701e04c3fSmrg * greater than 32 bits to occur. If the upper 32 bits of the numerator 51801e04c3fSmrg * are zero, it is impossible for (denom << [63, 32]) <= numer unless 51901e04c3fSmrg * denom == 0. 52001e04c3fSmrg */ 52101e04c3fSmrg nir_ssa_def *need_high_div = 5227ec681f3Smrg nir_iand(b, nir_ieq_imm(b, d_hi, 0), nir_uge(b, n_hi, d_lo)); 52301e04c3fSmrg nir_push_if(b, nir_bany(b, need_high_div)); 52401e04c3fSmrg { 52501e04c3fSmrg /* If we only have one component, then the bany above goes away and 52601e04c3fSmrg * this is always true within the if statement. 
52701e04c3fSmrg */ 52801e04c3fSmrg if (n->num_components == 1) 52901e04c3fSmrg need_high_div = nir_imm_true(b); 53001e04c3fSmrg 53101e04c3fSmrg nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo); 53201e04c3fSmrg 53301e04c3fSmrg for (int i = 31; i >= 0; i--) { 53401e04c3fSmrg /* if ((d.x << i) <= n.y) { 53501e04c3fSmrg * n.y -= d.x << i; 53601e04c3fSmrg * quot.y |= 1U << i; 53701e04c3fSmrg * } 53801e04c3fSmrg */ 53901e04c3fSmrg nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i)); 54001e04c3fSmrg nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift); 54101e04c3fSmrg nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i)); 54201e04c3fSmrg nir_ssa_def *cond = nir_iand(b, need_high_div, 54301e04c3fSmrg nir_uge(b, n_hi, d_shift)); 54401e04c3fSmrg if (i != 0) { 54501e04c3fSmrg /* log2_d_lo is always <= 31, so we don't need to bother with it 54601e04c3fSmrg * in the last iteration. 54701e04c3fSmrg */ 54801e04c3fSmrg cond = nir_iand(b, cond, 54901e04c3fSmrg nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo)); 55001e04c3fSmrg } 55101e04c3fSmrg n_hi = nir_bcsel(b, cond, new_n_hi, n_hi); 55201e04c3fSmrg q_hi = nir_bcsel(b, cond, new_q_hi, q_hi); 55301e04c3fSmrg } 55401e04c3fSmrg } 55501e04c3fSmrg nir_pop_if(b, NULL); 55601e04c3fSmrg n_hi = nir_if_phi(b, n_hi, n_hi_before_if); 55701e04c3fSmrg q_hi = nir_if_phi(b, q_hi, q_hi_before_if); 55801e04c3fSmrg 55901e04c3fSmrg nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi); 56001e04c3fSmrg 56101e04c3fSmrg n = nir_pack_64_2x32_split(b, n_lo, n_hi); 56201e04c3fSmrg d = nir_pack_64_2x32_split(b, d_lo, d_hi); 56301e04c3fSmrg for (int i = 31; i >= 0; i--) { 56401e04c3fSmrg /* if ((d64 << i) <= n64) { 56501e04c3fSmrg * n64 -= d64 << i; 56601e04c3fSmrg * quot.x |= 1U << i; 56701e04c3fSmrg * } 56801e04c3fSmrg */ 56901e04c3fSmrg nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i)); 57001e04c3fSmrg nir_ssa_def *new_n = nir_isub(b, n, d_shift); 57101e04c3fSmrg nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i)); 
57201e04c3fSmrg nir_ssa_def *cond = nir_uge(b, n, d_shift); 57301e04c3fSmrg if (i != 0) { 57401e04c3fSmrg /* log2_denom is always <= 31, so we don't need to bother with it 57501e04c3fSmrg * in the last iteration. 57601e04c3fSmrg */ 57701e04c3fSmrg cond = nir_iand(b, cond, 57801e04c3fSmrg nir_ige(b, nir_imm_int(b, 31 - i), log2_denom)); 57901e04c3fSmrg } 58001e04c3fSmrg n = nir_bcsel(b, cond, new_n, n); 58101e04c3fSmrg q_lo = nir_bcsel(b, cond, new_q_lo, q_lo); 58201e04c3fSmrg } 58301e04c3fSmrg 58401e04c3fSmrg *q = nir_pack_64_2x32_split(b, q_lo, q_hi); 58501e04c3fSmrg *r = n; 58601e04c3fSmrg} 58701e04c3fSmrg 58801e04c3fSmrgstatic nir_ssa_def * 58901e04c3fSmrglower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d) 59001e04c3fSmrg{ 59101e04c3fSmrg nir_ssa_def *q, *r; 59201e04c3fSmrg lower_udiv64_mod64(b, n, d, &q, &r); 59301e04c3fSmrg return q; 59401e04c3fSmrg} 59501e04c3fSmrg 59601e04c3fSmrgstatic nir_ssa_def * 59701e04c3fSmrglower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d) 59801e04c3fSmrg{ 59901e04c3fSmrg nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n); 60001e04c3fSmrg nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d); 60101e04c3fSmrg 60201e04c3fSmrg nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)), 60301e04c3fSmrg nir_ilt(b, d_hi, nir_imm_int(b, 0))); 60401e04c3fSmrg nir_ssa_def *q, *r; 60501e04c3fSmrg lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r); 60601e04c3fSmrg return nir_bcsel(b, negate, nir_ineg(b, q), q); 60701e04c3fSmrg} 60801e04c3fSmrg 60901e04c3fSmrgstatic nir_ssa_def * 61001e04c3fSmrglower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d) 61101e04c3fSmrg{ 61201e04c3fSmrg nir_ssa_def *q, *r; 61301e04c3fSmrg lower_udiv64_mod64(b, n, d, &q, &r); 61401e04c3fSmrg return r; 61501e04c3fSmrg} 61601e04c3fSmrg 61701e04c3fSmrgstatic nir_ssa_def * 61801e04c3fSmrglower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d) 61901e04c3fSmrg{ 62001e04c3fSmrg nir_ssa_def *n_hi = 
                                     nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   /* The sign of a two's-complement 64-bit value lives in the high dword. */
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   /* Remainder of the magnitudes, given the sign of the numerator. */
   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   /* imod (Euclidean-style modulo): the result takes the sign of the
    * divisor.  When the operands' signs differ and the remainder is
    * non-zero, shift the C-style remainder by one divisor.
    */
   return nir_bcsel(b, nir_ieq_imm(b, r, 0), nir_imm_int64(b, 0),
                    nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                              nir_iadd(b, rem, d)));
}

/* Lower a 64-bit irem (C-style '%'): the result takes the sign of the
 * numerator.  Computed as the unsigned remainder of the magnitudes,
 * negated when the numerator is negative.
 */
static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

/* Lower a 64-bit extract_[ui](8|16): pick the 32-bit half that contains the
 * requested chunk, do a 32-bit extract there, then sign- or zero-extend the
 * result back to 64 bits.  The chunk index source 'c' must be a constant
 * (nir_src_as_uint asserts that).
 */
static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      /* Chunk lies in the low dword. */
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                nir_imm_int(b, chunk),
                                NULL, NULL);
   } else {
      /* Chunk lies in the high dword; rebase the chunk index. */
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                nir_imm_int(b, chunk - num_chunks_in_32),
                                NULL, NULL);
   }

   /* Signed extracts sign-extend to 64 bits, unsigned ones zero-extend. */
   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

/* Lower a 64-bit ufind_msb: if any bit of the high dword is set, the answer
 * is 32 + ufind_msb(high); otherwise it is ufind_msb(low).  Note that when
 * x == 0 both halves report -1 and the bcsel picks lo_count, so the overall
 * result is -1, matching 32-bit ufind_msb semantics.
 */
static nir_ssa_def *
lower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
}

/* Lower [iu]64 -> f16/f32: extract a correctly-rounded significand of the
 * appropriate width from the 64-bit integer, convert it exactly, and scale
 * by 2^discard where 'discard' is the number of low bits that did not fit.
 * Signed sources are converted via their absolute value and the sign is
 * re-applied as a final multiply by +-1.0.
 *
 * The 64-bit helper ops used along the way are themselves conditionally
 * lowered (COND_LOWER_*) in case the backend also lacks them.
 */
static nir_ssa_def *
lower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
         bool src_is_signed)
{
   nir_ssa_def *x_sign = NULL;

   if (src_is_signed) {
      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
                         nir_imm_floatN_t(b, -1, dest_bit_size),
                         nir_imm_floatN_t(b, 1, dest_bit_size));
      x = COND_LOWER_OP(b, iabs, x);
   }

   /* Index of the most significant set bit.  NOTE(review): for x == 0 this
    * is -1; discard then clamps to 0 and the significand comes out 0, so the
    * result is 0.0 as expected — but confirm against ufind_msb semantics.
    */
   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
   unsigned significand_bits;

   switch (dest_bit_size) {
   case 32:
      significand_bits = 23;
      break;
   case 16:
      significand_bits = 10;
      break;
   default:
      unreachable("Invalid dest_bit_size");
   }

   /* Number of low-order bits that cannot be represented exactly. */
   nir_ssa_def *discard =
      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits)),
               nir_imm_int(b, 0));
   nir_ssa_def *significand =
      COND_LOWER_CAST(b, u2u32, COND_LOWER_OP(b, ushr, x, discard));

   /* Round-to-nearest-even implementation:
    * - if the non-representable part of the significand is higher than half
    *   the minimum representable significand, we round-up
    * - if the non-representable part of the significand is equal to half the
    *   minimum representable significand and the representable part of the
    *   significand is odd, we round-up
    * - in any other case, we round-down
    */
   nir_ssa_def *lsb_mask = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
   nir_ssa_def *rem_mask = COND_LOWER_OP(b, isub, lsb_mask, nir_imm_int64(b, 1));
   nir_ssa_def *half = COND_LOWER_OP(b, ishr, lsb_mask, nir_imm_int(b, 1));
   nir_ssa_def *rem = COND_LOWER_OP(b, iand, x, rem_mask);
   /* 'halfway' requires discard != 0, otherwise rem == half == 0 would
    * spuriously trigger the round-to-even path.
    */
   nir_ssa_def *halfway = nir_iand(b, COND_LOWER_CMP(b, ieq, rem, half),
                                   nir_ine(b, discard, nir_imm_int(b, 0)));
   nir_ssa_def *is_odd = nir_i2b(b, nir_iand(b, significand, nir_imm_int(b, 1)));
   nir_ssa_def *round_up = nir_ior(b, COND_LOWER_CMP(b, ilt, half, rem),
                                   nir_iand(b, halfway, is_odd));
   significand = nir_iadd(b, significand, nir_b2i32(b, round_up));

   nir_ssa_def *res;

   /* Scale the (exact) 32-bit significand by 2^discard.  'discard' is
    * non-negative here thanks to the imax above, so the u2f conversion of
    * it is safe.
    */
   if (dest_bit_size == 32)
      res = nir_fmul(b, nir_u2f32(b, significand),
                     nir_fexp2(b, nir_u2f32(b, discard)));
   else
      res = nir_fmul(b, nir_u2f16(b, significand),
                     nir_fexp2(b, nir_u2f16(b, discard)));

   if (src_is_signed)
      res = nir_fmul(b, res, x_sign);

   return res;
}

/* Lower f16/f32 -> [iu]64: truncate toward zero, clamp into the destination
 * range, then split into high and low 32-bit halves by dividing by 2^32 and
 * taking the remainder.  Signed destinations are handled by converting the
 * absolute value and negating at the end based on the captured sign.
 */
static nir_ssa_def *
lower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
{
   assert(x->bit_size == 16 || x->bit_size == 32);
   nir_ssa_def *x_sign = NULL;

   if (dst_is_signed)
      x_sign = nir_fsign(b, x);
   else
      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));

   x = nir_ftrunc(b, x);

   if (dst_is_signed) {
      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
      x = nir_fabs(b, x);
   }

   /* Split the magnitude: high dword = x / 2^32, low dword = x mod 2^32. */
   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);

   if (dst_is_signed)
      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_floatN_t(b, 0, x->bit_size)),
                      nir_ineg(b, res), res);

   return res;
}

/* Lower a 64-bit bit_count: popcount of a 64-bit value is simply the sum of
 * the popcounts of its two 32-bit halves.
 */
static nir_ssa_def *
lower_bit_count64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *lo_count = nir_bit_count(b, x_lo);
   nir_ssa_def *hi_count = nir_bit_count(b, x_hi);
   return nir_iadd(b, lo_count, hi_count);
}

/* Map an ALU opcode to the nir_lower_int64_options bit that controls whether
 * its 64-bit form gets lowered by this pass.  Returns 0 for opcodes this
 * pass does not handle.
 */
nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
   case nir_op_amul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   /* All conversions and bcsel share the nir_lower_mov64 bit. */
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
   case nir_op_f2i64:
   case nir_op_f2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   case nir_op_ufind_msb:
      return nir_lower_ufind_msb64;
   case nir_op_bit_count:
      return nir_lower_bit_count64;
   default:
      return 0;
   }
}

/* Build the lowered replacement for a single 64-bit ALU instruction and
 * return its SSA value.  Callers must only pass opcodes for which
 * should_lower_int64_alu_instr() returned true.
 */
static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
   case nir_op_amul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   case nir_op_ufind_msb:
      return lower_ufind_msb64(b, src[0]);
   case nir_op_bit_count:
      return lower_bit_count64(b, src[0]);
   case nir_op_i2f64:
   case nir_op_i2f32:
   case nir_op_i2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
   case nir_op_u2f64:
   case nir_op_u2f32:
   case nir_op_u2f16:
      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
   case nir_op_f2i64:
   case nir_op_f2u64:
      /* We don't support f64toi64 (yet?). */
      if (src[0]->bit_size > 32)
         return false; /* i.e. NULL: leave the instruction unlowered */

      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

/* Decide whether a given ALU instruction must be lowered by this pass.
 * The per-opcode cases check the relevant bit size (source for narrowing
 * conversions and comparisons, destination otherwise); the final check
 * consults the backend's lower_int64_options mask.
 */
static bool
should_lower_int64_alu_instr(const nir_alu_instr *alu,
                             const nir_shader_compiler_options *options)
{
   switch (alu->op) {
   /* Narrowing conversions: the 64-bit operand is the source. */
   case nir_op_i2b1:
   case nir_op_i2i8:
   case nir_op_i2i16:
   case nir_op_i2i32:
   case nir_op_u2u8:
   case nir_op_u2u16:
   case nir_op_u2u32:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   /* bcsel's data operands are src[1]/src[2]; src[0] is the 1-bit cond. */
   case nir_op_bcsel:
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[2].src.is_ssa);
      assert(alu->src[1].src.ssa->bit_size ==
             alu->src[2].src.ssa->bit_size);
      if (alu->src[1].src.ssa->bit_size != 64)
         return false;
      break;
   /* Comparisons produce a 1-bit result; check the operand size. */
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      assert(alu->src[0].src.is_ssa);
      assert(alu->src[1].src.is_ssa);
      assert(alu->src[0].src.ssa->bit_size ==
             alu->src[1].src.ssa->bit_size);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   /* These produce a 32-bit result from a 64-bit source. */
   case nir_op_ufind_msb:
   case nir_op_bit_count:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   /* amul may be implemented with a 24-bit multiply instead. */
   case nir_op_amul:
      assert(alu->dest.dest.is_ssa);
      if (options->has_imul24)
         return false;
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   /* int64 -> float conversions: the 64-bit operand is the source. */
   case nir_op_i2f64:
   case nir_op_u2f64:
   case nir_op_i2f32:
   case nir_op_u2f32:
   case nir_op_i2f16:
   case nir_op_u2f16:
      assert(alu->src[0].src.is_ssa);
      if (alu->src[0].src.ssa->bit_size != 64)
         return false;
      break;
   case nir_op_f2u64:
   case nir_op_f2i64:
      FALLTHROUGH;
   default:
      /* Everything else is judged by its 64-bit destination. */
      assert(alu->dest.dest.is_ssa);
      if (alu->dest.dest.ssa.bit_size != 64)
         return false;
      break;
   }

   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
   return (options->lower_int64_options & mask) != 0;
}

/* Lower a 64-bit subgroup intrinsic by cloning it once per 32-bit half of
 * its first source and repacking the two 32-bit results into 64 bits.
 */
static nir_ssa_def *
split_64bit_subgroup_op(nir_builder *b, const nir_intrinsic_instr *intrin)
{
   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];

   /* This works on subgroup ops with a single 64-bit source which can be
    * trivially lowered by doing the exact same op on both halves.
    */
   assert(intrin->src[0].is_ssa && intrin->src[0].ssa->bit_size == 64);
   nir_ssa_def *split_src0[2] = {
      nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa),
      nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa),
   };

   assert(info->has_dest && intrin->dest.is_ssa &&
          intrin->dest.ssa.bit_size == 64);

   nir_ssa_def *res[2];
   for (unsigned i = 0; i < 2; i++) {
      nir_intrinsic_instr *split =
         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
      split->num_components = intrin->num_components;
      split->src[0] = nir_src_for_ssa(split_src0[i]);

      /* Other sources must be less than 64 bits and get copied directly */
      for (unsigned j = 1; j < info->num_srcs; j++) {
         assert(intrin->src[j].is_ssa && intrin->src[j].ssa->bit_size < 64);
         split->src[j] = nir_src_for_ssa(intrin->src[j].ssa);
      }

      /* Copy const indices, if any */
      memcpy(split->const_index, intrin->const_index,
             sizeof(intrin->const_index));

      nir_ssa_dest_init(&split->instr, &split->dest,
                        intrin->dest.ssa.num_components, 32, NULL);
      nir_builder_instr_insert(b, &split->instr);

      res[i] = &split->dest.ssa;
   }

   return nir_pack_64_2x32_split(b, res[0], res[1]);
}

/* Emit a vote_ieq intrinsic on a 32-bit value and return its 1-bit result. */
static nir_ssa_def *
build_vote_ieq(nir_builder *b, nir_ssa_def *x)
{
   nir_intrinsic_instr *vote =
      nir_intrinsic_instr_create(b->shader, nir_intrinsic_vote_ieq);
   vote->src[0] = nir_src_for_ssa(x);
   vote->num_components = x->num_components;
   nir_ssa_dest_init(&vote->instr, &vote->dest, 1, 1, NULL);
   nir_builder_instr_insert(b, &vote->instr);
   return &vote->dest.ssa;
}

/* A 64-bit vote_ieq holds exactly when it holds on both 32-bit halves. */
static nir_ssa_def *
lower_vote_ieq(nir_builder *b, nir_ssa_def *x)
{
   return nir_iand(b, build_vote_ieq(b, nir_unpack_64_2x32_split_x(b, x)),
                   build_vote_ieq(b, nir_unpack_64_2x32_split_y(b, x)));
}

/* Emit one reduce/scan intrinsic over 'val' with the given reduction op.
 * cluster_size is only meaningful for nir_intrinsic_reduce.
 */
static nir_ssa_def *
build_scan_intrinsic(nir_builder *b, nir_intrinsic_op scan_op,
                     nir_op reduction_op, unsigned cluster_size,
                     nir_ssa_def *val)
{
   nir_intrinsic_instr *scan =
      nir_intrinsic_instr_create(b->shader, scan_op);
   scan->num_components = val->num_components;
   scan->src[0] = nir_src_for_ssa(val);
   nir_intrinsic_set_reduction_op(scan, reduction_op);
   if (scan_op == nir_intrinsic_reduce)
      nir_intrinsic_set_cluster_size(scan, cluster_size);
   nir_ssa_dest_init(&scan->instr, &scan->dest,
                     val->num_components, val->bit_size, NULL);
   nir_builder_instr_insert(b, &scan->instr);
   return &scan->dest.ssa;
}

/* Lower a 64-bit iadd reduce/scan by scanning three 24-bit-or-less pieces
 * in 32-bit arithmetic and recombining the partial sums.
 */
static nir_ssa_def *
lower_scan_iadd64(nir_builder *b, const nir_intrinsic_instr *intrin)
{
   unsigned cluster_size =
      intrin->intrinsic == nir_intrinsic_reduce ?
      nir_intrinsic_cluster_size(intrin) : 0;

   /* Split it into three chunks of no more than 24 bits each.  With 8 bits
    * of headroom, we're guaranteed that there will never be overflow in the
    * individual subgroup operations.  (Assuming, of course, a subgroup size
    * no larger than 256 which seems reasonable.)  We can then scan on each of
    * the chunks and add them back together at the end.
    */
   assert(intrin->src[0].is_ssa);
   nir_ssa_def *x = intrin->src[0].ssa;
   nir_ssa_def *x_low =
      nir_u2u32(b, nir_iand_imm(b, x, 0xffffff));
   nir_ssa_def *x_mid =
      nir_u2u32(b, nir_iand_imm(b, nir_ushr(b, x, nir_imm_int(b, 24)),
                                0xffffff));
   nir_ssa_def *x_hi =
      nir_u2u32(b, nir_ushr(b, x, nir_imm_int(b, 48)));

   nir_ssa_def *scan_low =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_low);
   nir_ssa_def *scan_mid =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_mid);
   nir_ssa_def *scan_hi =
      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
                           cluster_size, x_hi);

   /* Shift each partial sum back to its bit position and add.  Any overflow
    * past bit 63 is discarded, matching 64-bit wrapping addition.
    */
   scan_low = nir_u2u64(b, scan_low);
   scan_mid = nir_ishl(b, nir_u2u64(b, scan_mid), nir_imm_int(b, 24));
   scan_hi = nir_ishl(b, nir_u2u64(b, scan_hi), nir_imm_int(b, 48));

   return nir_iadd(b, scan_hi, nir_iadd(b, scan_mid, scan_low));
}

/* Decide whether a given intrinsic must be lowered by this pass, based on
 * its bit size and the backend's lower_int64_options mask.
 */
static bool
should_lower_int64_intrinsic(const nir_intrinsic_instr *intrin,
                             const nir_shader_compiler_options *options)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      assert(intrin->dest.is_ssa);
      return intrin->dest.ssa.bit_size == 64 &&
             (options->lower_int64_options & nir_lower_subgroup_shuffle64);

   case nir_intrinsic_vote_ieq:
      assert(intrin->src[0].is_ssa);
      return intrin->src[0].ssa->bit_size == 64 &&
             (options->lower_int64_options & nir_lower_vote_ieq64);

   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      assert(intrin->dest.is_ssa);
      if (intrin->dest.ssa.bit_size != 64)
         return false;

      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_iadd:
         return options->lower_int64_options & nir_lower_scan_reduce_iadd64;
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
         return options->lower_int64_options & nir_lower_scan_reduce_bitwise64;
      default:
         return false;
      }
      break;

   default:
      return false;
   }
}

/* Build the lowered replacement for a 64-bit intrinsic that
 * should_lower_int64_intrinsic() accepted, and return its SSA value.
 */
static nir_ssa_def *
lower_int64_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin)
{
   switch (intrin->intrinsic) {
   case nir_intrinsic_read_invocation:
   case nir_intrinsic_read_first_invocation:
   case nir_intrinsic_shuffle:
   case nir_intrinsic_shuffle_xor:
   case nir_intrinsic_shuffle_up:
   case nir_intrinsic_shuffle_down:
   case nir_intrinsic_quad_broadcast:
   case nir_intrinsic_quad_swap_horizontal:
   case nir_intrinsic_quad_swap_vertical:
   case nir_intrinsic_quad_swap_diagonal:
      return split_64bit_subgroup_op(b, intrin);

   case nir_intrinsic_vote_ieq:
      assert(intrin->src[0].is_ssa);
      return lower_vote_ieq(b, intrin->src[0].ssa);

   case nir_intrinsic_reduce:
   case nir_intrinsic_inclusive_scan:
   case nir_intrinsic_exclusive_scan:
      switch (nir_intrinsic_reduction_op(intrin)) {
      case nir_op_iadd:
         return lower_scan_iadd64(b, intrin);
      case nir_op_iand:
      case nir_op_ior:
      case nir_op_ixor:
         /* Bitwise reductions operate on the halves independently. */
         return split_64bit_subgroup_op(b, intrin);
      default:
         unreachable("Unsupported subgroup scan/reduce op");
      }
      break;

   default:
      unreachable("Unsupported intrinsic");
   }
}

/* nir_shader_lower_instructions() filter callback; _options is the
 * shader's nir_shader_compiler_options.
 */
static bool
should_lower_int64_instr(const nir_instr *instr, const void *_options)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return should_lower_int64_alu_instr(nir_instr_as_alu(instr), _options);
   case nir_instr_type_intrinsic:
      return should_lower_int64_intrinsic(nir_instr_as_intrinsic(instr),
                                          _options);
   default:
      return false;
   }
}

/* nir_shader_lower_instructions() lowering callback: returns the replacement
 * SSA value, or NULL to leave the instruction untouched.
 */
static nir_ssa_def *
lower_int64_instr(nir_builder *b, nir_instr *instr, void *_options)
{
   switch (instr->type) {
   case nir_instr_type_alu:
      return lower_int64_alu_instr(b, nir_instr_as_alu(instr));
   case nir_instr_type_intrinsic:
      return lower_int64_intrinsic(b, nir_instr_as_intrinsic(instr));
   default:
      return NULL;
   }
}

/* Public entry point: lower all 64-bit integer ALU ops and intrinsics
 * selected by shader->options->lower_int64_options.  Returns true if any
 * instruction was changed.
 */
bool
nir_lower_int64(nir_shader *shader)
{
   return nir_shader_lower_instructions(shader, should_lower_int64_instr,
                                        lower_int64_instr,
                                        (void *)shader->options);
}