101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2016 Intel Corporation
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
2101e04c3fSmrg * IN THE SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#include "nir.h"
2501e04c3fSmrg#include "nir_builder.h"
2601e04c3fSmrg
/* Emit the 64-bit lowering helper lower_<name>64() when the shader's
 * lower_int64_options mask selects nir_op_<name>; otherwise emit the plain
 * nir_<name>() builder call.
 *
 * The whole expansion (and the builder argument) is parenthesized so the
 * conditional cannot absorb operands when the macro is used inside a larger
 * expression.
 */
#define COND_LOWER_OP(b, name, ...)                                   \
        (((b)->shader->options->lower_int64_options &                 \
          nir_lower_int64_op_to_options_mask(nir_op_##name)) ?        \
         lower_##name##64(b, __VA_ARGS__) : nir_##name(b, __VA_ARGS__))
317ec681f3Smrg
/* Emit lower_int64_compare() for comparison nir_op_<name> when the shader's
 * lower_int64_options mask selects it; otherwise emit the plain nir_<name>()
 * builder call.
 *
 * The whole expansion (and the builder argument) is parenthesized so the
 * conditional cannot absorb operands when the macro is used inside a larger
 * expression.
 */
#define COND_LOWER_CMP(b, name, ...)                                  \
        (((b)->shader->options->lower_int64_options &                 \
          nir_lower_int64_op_to_options_mask(nir_op_##name)) ?        \
         lower_int64_compare(b, nir_op_##name, __VA_ARGS__) :         \
         nir_##name(b, __VA_ARGS__))
377ec681f3Smrg
/* Emit the conversion helper lower_<name>() when the shader's
 * lower_int64_options mask selects nir_op_<name>; otherwise emit the plain
 * nir_<name>() builder call.
 *
 * The whole expansion (and the builder argument) is parenthesized so the
 * conditional cannot absorb operands when the macro is used inside a larger
 * expression.
 */
#define COND_LOWER_CAST(b, name, ...)                                 \
        (((b)->shader->options->lower_int64_options &                 \
          nir_lower_int64_op_to_options_mask(nir_op_##name)) ?        \
         lower_##name(b, __VA_ARGS__) :                               \
         nir_##name(b, __VA_ARGS__))
437ec681f3Smrg
447e102996Smayastatic nir_ssa_def *
457e102996Smayalower_b2i64(nir_builder *b, nir_ssa_def *x)
467e102996Smaya{
477e102996Smaya   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
487e102996Smaya}
497e102996Smaya
507e102996Smayastatic nir_ssa_def *
517e102996Smayalower_i2b(nir_builder *b, nir_ssa_def *x)
527e102996Smaya{
537e102996Smaya   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
547e102996Smaya                                nir_unpack_64_2x32_split_y(b, x)),
557e102996Smaya                     nir_imm_int(b, 0));
567e102996Smaya}
577e102996Smaya
587e102996Smayastatic nir_ssa_def *
597e102996Smayalower_i2i8(nir_builder *b, nir_ssa_def *x)
607e102996Smaya{
617e102996Smaya   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
627e102996Smaya}
637e102996Smaya
647e102996Smayastatic nir_ssa_def *
657e102996Smayalower_i2i16(nir_builder *b, nir_ssa_def *x)
667e102996Smaya{
677e102996Smaya   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
687e102996Smaya}
697e102996Smaya
707e102996Smaya
717e102996Smayastatic nir_ssa_def *
727e102996Smayalower_i2i32(nir_builder *b, nir_ssa_def *x)
737e102996Smaya{
747e102996Smaya   return nir_unpack_64_2x32_split_x(b, x);
757e102996Smaya}
767e102996Smaya
777e102996Smayastatic nir_ssa_def *
787e102996Smayalower_i2i64(nir_builder *b, nir_ssa_def *x)
797e102996Smaya{
807e102996Smaya   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
817ec681f3Smrg   return nir_pack_64_2x32_split(b, x32, nir_ishr_imm(b, x32, 31));
827e102996Smaya}
837e102996Smaya
847e102996Smayastatic nir_ssa_def *
857e102996Smayalower_u2u8(nir_builder *b, nir_ssa_def *x)
867e102996Smaya{
877e102996Smaya   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
887e102996Smaya}
897e102996Smaya
907e102996Smayastatic nir_ssa_def *
917e102996Smayalower_u2u16(nir_builder *b, nir_ssa_def *x)
927e102996Smaya{
937e102996Smaya   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
947e102996Smaya}
957e102996Smaya
967e102996Smayastatic nir_ssa_def *
977e102996Smayalower_u2u32(nir_builder *b, nir_ssa_def *x)
987e102996Smaya{
997e102996Smaya   return nir_unpack_64_2x32_split_x(b, x);
1007e102996Smaya}
1017e102996Smaya
1027e102996Smayastatic nir_ssa_def *
1037e102996Smayalower_u2u64(nir_builder *b, nir_ssa_def *x)
1047e102996Smaya{
1057e102996Smaya   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
1067e102996Smaya   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
1077e102996Smaya}
1087e102996Smaya
1097e102996Smayastatic nir_ssa_def *
1107e102996Smayalower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
1117e102996Smaya{
1127e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1137e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1147e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
1157e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
1167e102996Smaya
1177e102996Smaya   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
1187e102996Smaya                                    nir_bcsel(b, cond, x_hi, y_hi));
1197e102996Smaya}
1207e102996Smaya
1217e102996Smayastatic nir_ssa_def *
1227e102996Smayalower_inot64(nir_builder *b, nir_ssa_def *x)
1237e102996Smaya{
1247e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1257e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1267e102996Smaya
1277e102996Smaya   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
1287e102996Smaya}
1297e102996Smaya
1307e102996Smayastatic nir_ssa_def *
1317e102996Smayalower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
1327e102996Smaya{
1337e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1347e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1357e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
1367e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
1377e102996Smaya
1387e102996Smaya   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
1397e102996Smaya                                    nir_iand(b, x_hi, y_hi));
1407e102996Smaya}
1417e102996Smaya
1427e102996Smayastatic nir_ssa_def *
1437e102996Smayalower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
1447e102996Smaya{
1457e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1467e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1477e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
1487e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
1497e102996Smaya
1507e102996Smaya   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
1517e102996Smaya                                    nir_ior(b, x_hi, y_hi));
1527e102996Smaya}
1537e102996Smaya
1547e102996Smayastatic nir_ssa_def *
1557e102996Smayalower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
1567e102996Smaya{
1577e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1587e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1597e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
1607e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
1617e102996Smaya
1627e102996Smaya   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
1637e102996Smaya                                    nir_ixor(b, x_hi, y_hi));
1647e102996Smaya}
1657e102996Smaya
1667e102996Smayastatic nir_ssa_def *
1677e102996Smayalower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
1687e102996Smaya{
1697e102996Smaya   /* Implemented as
1707e102996Smaya    *
1717e102996Smaya    * uint64_t lshift(uint64_t x, int c)
1727e102996Smaya    * {
1737e102996Smaya    *    if (c == 0) return x;
1747e102996Smaya    *
1757e102996Smaya    *    uint32_t lo = LO(x), hi = HI(x);
1767e102996Smaya    *
1777e102996Smaya    *    if (c < 32) {
1787e102996Smaya    *       uint32_t lo_shifted = lo << c;
1797e102996Smaya    *       uint32_t hi_shifted = hi << c;
1807e102996Smaya    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
1817e102996Smaya    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
1827e102996Smaya    *    } else {
1837e102996Smaya    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
1847e102996Smaya    *       return pack_64(0, lo_shifted_hi);
1857e102996Smaya    *    }
1867e102996Smaya    * }
1877e102996Smaya    */
1887e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
1897e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
1907e102996Smaya
1917e102996Smaya   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
1927e102996Smaya   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
1937e102996Smaya   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
1947e102996Smaya   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);
1957e102996Smaya
1967e102996Smaya   nir_ssa_def *res_if_lt_32 =
1977e102996Smaya      nir_pack_64_2x32_split(b, lo_shifted,
1987e102996Smaya                                nir_ior(b, hi_shifted, lo_shifted_hi));
1997e102996Smaya   nir_ssa_def *res_if_ge_32 =
2007e102996Smaya      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
2017e102996Smaya                                nir_ishl(b, x_lo, reverse_count));
2027e102996Smaya
2037ec681f3Smrg   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
2047e102996Smaya                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
2057e102996Smaya                                 res_if_ge_32, res_if_lt_32));
2067e102996Smaya}
2077e102996Smaya
2087e102996Smayastatic nir_ssa_def *
2097e102996Smayalower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
2107e102996Smaya{
2117e102996Smaya   /* Implemented as
2127e102996Smaya    *
2137e102996Smaya    * uint64_t arshift(uint64_t x, int c)
2147e102996Smaya    * {
2157e102996Smaya    *    if (c == 0) return x;
2167e102996Smaya    *
2177e102996Smaya    *    uint32_t lo = LO(x);
2187e102996Smaya    *    int32_t  hi = HI(x);
2197e102996Smaya    *
2207e102996Smaya    *    if (c < 32) {
2217e102996Smaya    *       uint32_t lo_shifted = lo >> c;
2227e102996Smaya    *       uint32_t hi_shifted = hi >> c;
2237e102996Smaya    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
2247e102996Smaya    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
2257e102996Smaya    *    } else {
2267e102996Smaya    *       uint32_t hi_shifted = hi >> 31;
2277e102996Smaya    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
2287e102996Smaya    *       return pack_64(hi_shifted, hi_shifted_lo);
2297e102996Smaya    *    }
2307e102996Smaya    * }
2317e102996Smaya    */
2327e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
2337e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
2347e102996Smaya
2357e102996Smaya   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
2367e102996Smaya   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
2377e102996Smaya   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
2387e102996Smaya   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);
2397e102996Smaya
2407e102996Smaya   nir_ssa_def *res_if_lt_32 =
2417e102996Smaya      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
2427e102996Smaya                                hi_shifted);
2437e102996Smaya   nir_ssa_def *res_if_ge_32 =
2447e102996Smaya      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
2457e102996Smaya                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));
2467e102996Smaya
2477ec681f3Smrg   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
2487e102996Smaya                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
2497e102996Smaya                                 res_if_ge_32, res_if_lt_32));
2507e102996Smaya}
2517e102996Smaya
2527e102996Smayastatic nir_ssa_def *
2537e102996Smayalower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
2547e102996Smaya{
2557e102996Smaya   /* Implemented as
2567e102996Smaya    *
2577e102996Smaya    * uint64_t rshift(uint64_t x, int c)
2587e102996Smaya    * {
2597e102996Smaya    *    if (c == 0) return x;
2607e102996Smaya    *
2617e102996Smaya    *    uint32_t lo = LO(x), hi = HI(x);
2627e102996Smaya    *
2637e102996Smaya    *    if (c < 32) {
2647e102996Smaya    *       uint32_t lo_shifted = lo >> c;
2657e102996Smaya    *       uint32_t hi_shifted = hi >> c;
2667e102996Smaya    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
2677e102996Smaya    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
2687e102996Smaya    *    } else {
2697e102996Smaya    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
2707e102996Smaya    *       return pack_64(0, hi_shifted_lo);
2717e102996Smaya    *    }
2727e102996Smaya    * }
2737e102996Smaya    */
2747e102996Smaya
2757e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
2767e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
2777e102996Smaya
2787e102996Smaya   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
2797e102996Smaya   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
2807e102996Smaya   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
2817e102996Smaya   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);
2827e102996Smaya
2837e102996Smaya   nir_ssa_def *res_if_lt_32 =
2847e102996Smaya      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
2857e102996Smaya                                hi_shifted);
2867e102996Smaya   nir_ssa_def *res_if_ge_32 =
2877e102996Smaya      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
2887e102996Smaya                                nir_imm_int(b, 0));
2897e102996Smaya
2907ec681f3Smrg   return nir_bcsel(b, nir_ieq_imm(b, y, 0), x,
2917e102996Smaya                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
2927e102996Smaya                                 res_if_ge_32, res_if_lt_32));
2937e102996Smaya}
2947e102996Smaya
2957e102996Smayastatic nir_ssa_def *
2967e102996Smayalower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
2977e102996Smaya{
2987e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
2997e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
3007e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
3017e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
3027e102996Smaya
3037e102996Smaya   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
3047e102996Smaya   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
3057e102996Smaya   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));
3067e102996Smaya
3077e102996Smaya   return nir_pack_64_2x32_split(b, res_lo, res_hi);
3087e102996Smaya}
3097e102996Smaya
3107e102996Smayastatic nir_ssa_def *
3117e102996Smayalower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
3127e102996Smaya{
3137e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
3147e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
3157e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
3167e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
3177e102996Smaya
3187e102996Smaya   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
3197e102996Smaya   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
3207e102996Smaya   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);
3217e102996Smaya
3227e102996Smaya   return nir_pack_64_2x32_split(b, res_lo, res_hi);
3237e102996Smaya}
3247e102996Smaya
3257e102996Smayastatic nir_ssa_def *
3267e102996Smayalower_ineg64(nir_builder *b, nir_ssa_def *x)
3277e102996Smaya{
3287e102996Smaya   /* Since isub is the same number of instructions (with better dependencies)
3297e102996Smaya    * as iadd, subtraction is actually more efficient for ineg than the usual
3307e102996Smaya    * 2's complement "flip the bits and add one".
3317e102996Smaya    */
3327e102996Smaya   return lower_isub64(b, nir_imm_int64(b, 0), x);
3337e102996Smaya}
3347e102996Smaya
3357e102996Smayastatic nir_ssa_def *
3367e102996Smayalower_iabs64(nir_builder *b, nir_ssa_def *x)
3377e102996Smaya{
3387e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
3397e102996Smaya   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
3407e102996Smaya   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
3417e102996Smaya}
3427e102996Smaya
3437e102996Smayastatic nir_ssa_def *
3447e102996Smayalower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
3457e102996Smaya{
3467e102996Smaya   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
3477e102996Smaya   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
3487e102996Smaya   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
3497e102996Smaya   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
3507e102996Smaya
3517e102996Smaya   switch (op) {
3527e102996Smaya   case nir_op_ieq:
3537e102996Smaya      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
3547e102996Smaya   case nir_op_ine:
3557e102996Smaya      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
3567e102996Smaya   case nir_op_ult:
3577e102996Smaya      return nir_ior(b, nir_ult(b, x_hi, y_hi),
3587e102996Smaya                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
3597e102996Smaya                                    nir_ult(b, x_lo, y_lo)));
3607e102996Smaya   case nir_op_ilt:
3617e102996Smaya      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
3627e102996Smaya                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
3637e102996Smaya                                    nir_ult(b, x_lo, y_lo)));
3647e102996Smaya      break;
3657e102996Smaya   case nir_op_uge:
3667e102996Smaya      /* Lower as !(x < y) in the hopes of better CSE */
3677e102996Smaya      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
3687e102996Smaya   case nir_op_ige:
3697e102996Smaya      /* Lower as !(x < y) in the hopes of better CSE */
3707e102996Smaya      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
3717e102996Smaya   default:
3727e102996Smaya      unreachable("Invalid comparison");
3737e102996Smaya   }
3747e102996Smaya}
3757e102996Smaya
3767e102996Smayastatic nir_ssa_def *
3777e102996Smayalower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
3787e102996Smaya{
3797e102996Smaya   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
3807e102996Smaya}
3817e102996Smaya
3827e102996Smayastatic nir_ssa_def *
3837e102996Smayalower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
3847e102996Smaya{
3857e102996Smaya   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
3867e102996Smaya}
3877e102996Smaya
3887e102996Smayastatic nir_ssa_def *
3897e102996Smayalower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
3907e102996Smaya{
3917e102996Smaya   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
3927e102996Smaya}
3937e102996Smaya
3947e102996Smayastatic nir_ssa_def *
3957e102996Smayalower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
3967e102996Smaya{
3977e102996Smaya   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
3987e102996Smaya}
3997e102996Smaya
4007e102996Smayastatic nir_ssa_def *
4017e102996Smayalower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
4027e102996Smaya                  bool sign_extend)
4037e102996Smaya{
4047e102996Smaya   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
4057e102996Smaya                                     : nir_umul_high(b, x, y);
4067e102996Smaya
4077e102996Smaya   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
4087e102996Smaya}
4097e102996Smaya
41001e04c3fSmrgstatic nir_ssa_def *
41101e04c3fSmrglower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
41201e04c3fSmrg{
41301e04c3fSmrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
41401e04c3fSmrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
41501e04c3fSmrg   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
41601e04c3fSmrg   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);
41701e04c3fSmrg
4187e102996Smaya   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
4197e102996Smaya   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
42001e04c3fSmrg                         nir_iadd(b, nir_imul(b, x_lo, y_hi),
42101e04c3fSmrg                                     nir_imul(b, x_hi, y_lo)));
42201e04c3fSmrg
4237e102996Smaya   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
4247e102996Smaya                                 res_hi);
4257e102996Smaya}
4267e102996Smaya
4277e102996Smayastatic nir_ssa_def *
4287e102996Smayalower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
4297e102996Smaya                 bool sign_extend)
4307e102996Smaya{
4317e102996Smaya   nir_ssa_def *x32[4], *y32[4];
4327e102996Smaya   x32[0] = nir_unpack_64_2x32_split_x(b, x);
4337e102996Smaya   x32[1] = nir_unpack_64_2x32_split_y(b, x);
4347e102996Smaya   if (sign_extend) {
4357ec681f3Smrg      x32[2] = x32[3] = nir_ishr_imm(b, x32[1], 31);
4367e102996Smaya   } else {
4377e102996Smaya      x32[2] = x32[3] = nir_imm_int(b, 0);
4387e102996Smaya   }
4397e102996Smaya
4407e102996Smaya   y32[0] = nir_unpack_64_2x32_split_x(b, y);
4417e102996Smaya   y32[1] = nir_unpack_64_2x32_split_y(b, y);
4427e102996Smaya   if (sign_extend) {
4437ec681f3Smrg      y32[2] = y32[3] = nir_ishr_imm(b, y32[1], 31);
4447e102996Smaya   } else {
4457e102996Smaya      y32[2] = y32[3] = nir_imm_int(b, 0);
4467e102996Smaya   }
4477e102996Smaya
4487e102996Smaya   nir_ssa_def *res[8] = { NULL, };
4497e102996Smaya
4507e102996Smaya   /* Yes, the following generates a pile of code.  However, we throw res[0]
4517e102996Smaya    * and res[1] away in the end and, if we're in the umul case, four of our
4527e102996Smaya    * eight dword operands will be constant zero and opt_algebraic will clean
4537e102996Smaya    * this up nicely.
4547e102996Smaya    */
4557e102996Smaya   for (unsigned i = 0; i < 4; i++) {
4567e102996Smaya      nir_ssa_def *carry = NULL;
4577e102996Smaya      for (unsigned j = 0; j < 4; j++) {
4587e102996Smaya         /* The maximum values of x32[i] and y32[i] are UINT32_MAX so the
4597e102996Smaya          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
4607e102996Smaya          * value that will fit in tmp is
4617e102996Smaya          *
4627e102996Smaya          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
4637e102996Smaya          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
4647e102996Smaya          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
4657e102996Smaya          *
4667e102996Smaya          * so we're guaranteed that we can add in two more 32-bit values
4677e102996Smaya          * without overflowing tmp.
4687e102996Smaya          */
4697e102996Smaya         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[i]);
4707e102996Smaya
4717e102996Smaya         if (res[i + j])
4727e102996Smaya            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
4737e102996Smaya         if (carry)
4747e102996Smaya            tmp = nir_iadd(b, tmp, carry);
4757e102996Smaya         res[i + j] = nir_u2u32(b, tmp);
4767ec681f3Smrg         carry = nir_ushr_imm(b, tmp, 32);
4777e102996Smaya      }
4787e102996Smaya      res[i + 4] = nir_u2u32(b, carry);
4797e102996Smaya   }
4807e102996Smaya
4817e102996Smaya   return nir_pack_64_2x32_split(b, res[2], res[3]);
48201e04c3fSmrg}
48301e04c3fSmrg
48401e04c3fSmrgstatic nir_ssa_def *
48501e04c3fSmrglower_isign64(nir_builder *b, nir_ssa_def *x)
48601e04c3fSmrg{
48701e04c3fSmrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
48801e04c3fSmrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
48901e04c3fSmrg
49001e04c3fSmrg   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
4917ec681f3Smrg   nir_ssa_def *res_hi = nir_ishr_imm(b, x_hi, 31);
4927e102996Smaya   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));
49301e04c3fSmrg
49401e04c3fSmrg   return nir_pack_64_2x32_split(b, res_lo, res_hi);
49501e04c3fSmrg}
49601e04c3fSmrg
49701e04c3fSmrgstatic void
49801e04c3fSmrglower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
49901e04c3fSmrg                   nir_ssa_def **q, nir_ssa_def **r)
50001e04c3fSmrg{
50101e04c3fSmrg   /* TODO: We should specially handle the case where the denominator is a
50201e04c3fSmrg    * constant.  In that case, we should be able to reduce it to a multiply by
50301e04c3fSmrg    * a constant, some shifts, and an add.
50401e04c3fSmrg    */
50501e04c3fSmrg   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
50601e04c3fSmrg   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
50701e04c3fSmrg   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
50801e04c3fSmrg   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
50901e04c3fSmrg
5107e102996Smaya   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
5117e102996Smaya   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);
51201e04c3fSmrg
51301e04c3fSmrg   nir_ssa_def *n_hi_before_if = n_hi;
51401e04c3fSmrg   nir_ssa_def *q_hi_before_if = q_hi;
51501e04c3fSmrg
51601e04c3fSmrg   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
51701e04c3fSmrg    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
51801e04c3fSmrg    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
51901e04c3fSmrg    * denom == 0.
52001e04c3fSmrg    */
52101e04c3fSmrg   nir_ssa_def *need_high_div =
5227ec681f3Smrg      nir_iand(b, nir_ieq_imm(b, d_hi, 0), nir_uge(b, n_hi, d_lo));
52301e04c3fSmrg   nir_push_if(b, nir_bany(b, need_high_div));
52401e04c3fSmrg   {
52501e04c3fSmrg      /* If we only have one component, then the bany above goes away and
52601e04c3fSmrg       * this is always true within the if statement.
52701e04c3fSmrg       */
52801e04c3fSmrg      if (n->num_components == 1)
52901e04c3fSmrg         need_high_div = nir_imm_true(b);
53001e04c3fSmrg
53101e04c3fSmrg      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);
53201e04c3fSmrg
53301e04c3fSmrg      for (int i = 31; i >= 0; i--) {
53401e04c3fSmrg         /* if ((d.x << i) <= n.y) {
53501e04c3fSmrg          *    n.y -= d.x << i;
53601e04c3fSmrg          *    quot.y |= 1U << i;
53701e04c3fSmrg          * }
53801e04c3fSmrg          */
53901e04c3fSmrg         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
54001e04c3fSmrg         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
54101e04c3fSmrg         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
54201e04c3fSmrg         nir_ssa_def *cond = nir_iand(b, need_high_div,
54301e04c3fSmrg                                         nir_uge(b, n_hi, d_shift));
54401e04c3fSmrg         if (i != 0) {
54501e04c3fSmrg            /* log2_d_lo is always <= 31, so we don't need to bother with it
54601e04c3fSmrg             * in the last iteration.
54701e04c3fSmrg             */
54801e04c3fSmrg            cond = nir_iand(b, cond,
54901e04c3fSmrg                               nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
55001e04c3fSmrg         }
55101e04c3fSmrg         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
55201e04c3fSmrg         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
55301e04c3fSmrg      }
55401e04c3fSmrg   }
55501e04c3fSmrg   nir_pop_if(b, NULL);
55601e04c3fSmrg   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
55701e04c3fSmrg   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);
55801e04c3fSmrg
55901e04c3fSmrg   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);
56001e04c3fSmrg
56101e04c3fSmrg   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
56201e04c3fSmrg   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
56301e04c3fSmrg   for (int i = 31; i >= 0; i--) {
56401e04c3fSmrg      /* if ((d64 << i) <= n64) {
56501e04c3fSmrg       *    n64 -= d64 << i;
56601e04c3fSmrg       *    quot.x |= 1U << i;
56701e04c3fSmrg       * }
56801e04c3fSmrg       */
56901e04c3fSmrg      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
57001e04c3fSmrg      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
57101e04c3fSmrg      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
57201e04c3fSmrg      nir_ssa_def *cond = nir_uge(b, n, d_shift);
57301e04c3fSmrg      if (i != 0) {
57401e04c3fSmrg         /* log2_denom is always <= 31, so we don't need to bother with it
57501e04c3fSmrg          * in the last iteration.
57601e04c3fSmrg          */
57701e04c3fSmrg         cond = nir_iand(b, cond,
57801e04c3fSmrg                            nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
57901e04c3fSmrg      }
58001e04c3fSmrg      n = nir_bcsel(b, cond, new_n, n);
58101e04c3fSmrg      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
58201e04c3fSmrg   }
58301e04c3fSmrg
58401e04c3fSmrg   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
58501e04c3fSmrg   *r = n;
58601e04c3fSmrg}
58701e04c3fSmrg
58801e04c3fSmrgstatic nir_ssa_def *
58901e04c3fSmrglower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
59001e04c3fSmrg{
59101e04c3fSmrg   nir_ssa_def *q, *r;
59201e04c3fSmrg   lower_udiv64_mod64(b, n, d, &q, &r);
59301e04c3fSmrg   return q;
59401e04c3fSmrg}
59501e04c3fSmrg
59601e04c3fSmrgstatic nir_ssa_def *
59701e04c3fSmrglower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
59801e04c3fSmrg{
59901e04c3fSmrg   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
60001e04c3fSmrg   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
60101e04c3fSmrg
60201e04c3fSmrg   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
60301e04c3fSmrg                                    nir_ilt(b, d_hi, nir_imm_int(b, 0)));
60401e04c3fSmrg   nir_ssa_def *q, *r;
60501e04c3fSmrg   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
60601e04c3fSmrg   return nir_bcsel(b, negate, nir_ineg(b, q), q);
60701e04c3fSmrg}
60801e04c3fSmrg
60901e04c3fSmrgstatic nir_ssa_def *
61001e04c3fSmrglower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
61101e04c3fSmrg{
61201e04c3fSmrg   nir_ssa_def *q, *r;
61301e04c3fSmrg   lower_udiv64_mod64(b, n, d, &q, &r);
61401e04c3fSmrg   return r;
61501e04c3fSmrg}
61601e04c3fSmrg
61701e04c3fSmrgstatic nir_ssa_def *
61801e04c3fSmrglower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
61901e04c3fSmrg{
62001e04c3fSmrg   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
62101e04c3fSmrg   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
62201e04c3fSmrg   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
62301e04c3fSmrg   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));
62401e04c3fSmrg
62501e04c3fSmrg   nir_ssa_def *q, *r;
62601e04c3fSmrg   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
62701e04c3fSmrg
62801e04c3fSmrg   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
62901e04c3fSmrg
6307ec681f3Smrg   return nir_bcsel(b, nir_ieq_imm(b, r, 0), nir_imm_int64(b, 0),
63101e04c3fSmrg          nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
63201e04c3fSmrg                       nir_iadd(b, rem, d)));
63301e04c3fSmrg}
63401e04c3fSmrg
63501e04c3fSmrgstatic nir_ssa_def *
63601e04c3fSmrglower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
63701e04c3fSmrg{
63801e04c3fSmrg   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
63901e04c3fSmrg   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
64001e04c3fSmrg
64101e04c3fSmrg   nir_ssa_def *q, *r;
64201e04c3fSmrg   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
64301e04c3fSmrg   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
64401e04c3fSmrg}
64501e04c3fSmrg
6467e102996Smayastatic nir_ssa_def *
6477e102996Smayalower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
6487e102996Smaya{
6497e102996Smaya   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
6507e102996Smaya          op == nir_op_extract_u16 || op == nir_op_extract_i16);
6517e102996Smaya
6527e102996Smaya   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
6537e102996Smaya   const int chunk_bits =
6547e102996Smaya      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
6557e102996Smaya   const int num_chunks_in_32 = 32 / chunk_bits;
6567e102996Smaya
6577e102996Smaya   nir_ssa_def *extract32;
6587e102996Smaya   if (chunk < num_chunks_in_32) {
6597e102996Smaya      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
6607e102996Smaya                                   nir_imm_int(b, chunk),
6617e102996Smaya                                   NULL, NULL);
6627e102996Smaya   } else {
6637e102996Smaya      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
6647e102996Smaya                                   nir_imm_int(b, chunk - num_chunks_in_32),
6657e102996Smaya                                   NULL, NULL);
6667e102996Smaya   }
6677e102996Smaya
6687e102996Smaya   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
6697e102996Smaya      return lower_i2i64(b, extract32);
6707e102996Smaya   else
6717e102996Smaya      return lower_u2u64(b, extract32);
6727e102996Smaya}
6737e102996Smaya
6747ec681f3Smrgstatic nir_ssa_def *
6757ec681f3Smrglower_ufind_msb64(nir_builder *b, nir_ssa_def *x)
6767ec681f3Smrg{
6777ec681f3Smrg
6787ec681f3Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
6797ec681f3Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
6807ec681f3Smrg   nir_ssa_def *lo_count = nir_ufind_msb(b, x_lo);
6817ec681f3Smrg   nir_ssa_def *hi_count = nir_ufind_msb(b, x_hi);
6827ec681f3Smrg   nir_ssa_def *valid_hi_bits = nir_ine(b, x_hi, nir_imm_int(b, 0));
6837ec681f3Smrg   nir_ssa_def *hi_res = nir_iadd(b, nir_imm_intN_t(b, 32, 32), hi_count);
6847ec681f3Smrg   return nir_bcsel(b, valid_hi_bits, hi_res, lo_count);
6857ec681f3Smrg}
6867ec681f3Smrg
6877ec681f3Smrgstatic nir_ssa_def *
6887ec681f3Smrglower_2f(nir_builder *b, nir_ssa_def *x, unsigned dest_bit_size,
6897ec681f3Smrg         bool src_is_signed)
6907ec681f3Smrg{
6917ec681f3Smrg   nir_ssa_def *x_sign = NULL;
6927ec681f3Smrg
6937ec681f3Smrg   if (src_is_signed) {
6947ec681f3Smrg      x_sign = nir_bcsel(b, COND_LOWER_CMP(b, ilt, x, nir_imm_int64(b, 0)),
6957ec681f3Smrg                         nir_imm_floatN_t(b, -1, dest_bit_size),
6967ec681f3Smrg                         nir_imm_floatN_t(b, 1, dest_bit_size));
6977ec681f3Smrg      x = COND_LOWER_OP(b, iabs, x);
6987ec681f3Smrg   }
6997ec681f3Smrg
7007ec681f3Smrg   nir_ssa_def *exp = COND_LOWER_OP(b, ufind_msb, x);
7017ec681f3Smrg   unsigned significand_bits;
7027ec681f3Smrg
7037ec681f3Smrg   switch (dest_bit_size) {
7047ec681f3Smrg   case 32:
7057ec681f3Smrg      significand_bits = 23;
7067ec681f3Smrg      break;
7077ec681f3Smrg   case 16:
7087ec681f3Smrg      significand_bits = 10;
7097ec681f3Smrg      break;
7107ec681f3Smrg   default:
7117ec681f3Smrg      unreachable("Invalid dest_bit_size");
7127ec681f3Smrg   }
7137ec681f3Smrg
7147ec681f3Smrg   nir_ssa_def *discard =
7157ec681f3Smrg      nir_imax(b, nir_isub(b, exp, nir_imm_int(b, significand_bits)),
7167ec681f3Smrg                  nir_imm_int(b, 0));
7177ec681f3Smrg   nir_ssa_def *significand =
7187ec681f3Smrg      COND_LOWER_CAST(b, u2u32, COND_LOWER_OP(b, ushr, x, discard));
7197ec681f3Smrg
7207ec681f3Smrg   /* Round-to-nearest-even implementation:
7217ec681f3Smrg    * - if the non-representable part of the significand is higher than half
7227ec681f3Smrg    *   the minimum representable significand, we round-up
7237ec681f3Smrg    * - if the non-representable part of the significand is equal to half the
7247ec681f3Smrg    *   minimum representable significand and the representable part of the
7257ec681f3Smrg    *   significand is odd, we round-up
7267ec681f3Smrg    * - in any other case, we round-down
7277ec681f3Smrg    */
7287ec681f3Smrg   nir_ssa_def *lsb_mask = COND_LOWER_OP(b, ishl, nir_imm_int64(b, 1), discard);
7297ec681f3Smrg   nir_ssa_def *rem_mask = COND_LOWER_OP(b, isub, lsb_mask, nir_imm_int64(b, 1));
7307ec681f3Smrg   nir_ssa_def *half = COND_LOWER_OP(b, ishr, lsb_mask, nir_imm_int(b, 1));
7317ec681f3Smrg   nir_ssa_def *rem = COND_LOWER_OP(b, iand, x, rem_mask);
7327ec681f3Smrg   nir_ssa_def *halfway = nir_iand(b, COND_LOWER_CMP(b, ieq, rem, half),
7337ec681f3Smrg                                   nir_ine(b, discard, nir_imm_int(b, 0)));
7347ec681f3Smrg   nir_ssa_def *is_odd = nir_i2b(b, nir_iand(b, significand, nir_imm_int(b, 1)));
7357ec681f3Smrg   nir_ssa_def *round_up = nir_ior(b, COND_LOWER_CMP(b, ilt, half, rem),
7367ec681f3Smrg                                   nir_iand(b, halfway, is_odd));
7377ec681f3Smrg   significand = nir_iadd(b, significand, nir_b2i32(b, round_up));
7387ec681f3Smrg
7397ec681f3Smrg   nir_ssa_def *res;
7407ec681f3Smrg
7417ec681f3Smrg   if (dest_bit_size == 32)
7427ec681f3Smrg      res = nir_fmul(b, nir_u2f32(b, significand),
7437ec681f3Smrg                     nir_fexp2(b, nir_u2f32(b, discard)));
7447ec681f3Smrg   else
7457ec681f3Smrg      res = nir_fmul(b, nir_u2f16(b, significand),
7467ec681f3Smrg                     nir_fexp2(b, nir_u2f16(b, discard)));
7477ec681f3Smrg
7487ec681f3Smrg   if (src_is_signed)
7497ec681f3Smrg      res = nir_fmul(b, res, x_sign);
7507ec681f3Smrg
7517ec681f3Smrg   return res;
7527ec681f3Smrg}
7537ec681f3Smrg
7547ec681f3Smrgstatic nir_ssa_def *
7557ec681f3Smrglower_f2(nir_builder *b, nir_ssa_def *x, bool dst_is_signed)
7567ec681f3Smrg{
7577ec681f3Smrg   assert(x->bit_size == 16 || x->bit_size == 32);
7587ec681f3Smrg   nir_ssa_def *x_sign = NULL;
7597ec681f3Smrg
7607ec681f3Smrg   if (dst_is_signed)
7617ec681f3Smrg      x_sign = nir_fsign(b, x);
7627ec681f3Smrg   else
7637ec681f3Smrg      x = nir_fmin(b, x, nir_imm_floatN_t(b, UINT64_MAX, x->bit_size));
7647ec681f3Smrg
7657ec681f3Smrg   x = nir_ftrunc(b, x);
7667ec681f3Smrg
7677ec681f3Smrg   if (dst_is_signed) {
7687ec681f3Smrg      x = nir_fmin(b, x, nir_imm_floatN_t(b, INT64_MAX, x->bit_size));
7697ec681f3Smrg      x = nir_fmax(b, x, nir_imm_floatN_t(b, INT64_MIN, x->bit_size));
7707ec681f3Smrg      x = nir_fabs(b, x);
7717ec681f3Smrg   }
7727ec681f3Smrg
7737ec681f3Smrg   nir_ssa_def *div = nir_imm_floatN_t(b, 1ULL << 32, x->bit_size);
7747ec681f3Smrg   nir_ssa_def *res_hi = nir_f2u32(b, nir_fdiv(b, x, div));
7757ec681f3Smrg   nir_ssa_def *res_lo = nir_f2u32(b, nir_frem(b, x, div));
7767ec681f3Smrg   nir_ssa_def *res = nir_pack_64_2x32_split(b, res_lo, res_hi);
7777ec681f3Smrg
7787ec681f3Smrg   if (dst_is_signed)
7797ec681f3Smrg      res = nir_bcsel(b, nir_flt(b, x_sign, nir_imm_floatN_t(b, 0, x->bit_size)),
7807ec681f3Smrg                      nir_ineg(b, res), res);
7817ec681f3Smrg
7827ec681f3Smrg   return res;
7837ec681f3Smrg}
7847ec681f3Smrg
7857ec681f3Smrgstatic nir_ssa_def *
7867ec681f3Smrglower_bit_count64(nir_builder *b, nir_ssa_def *x)
7877ec681f3Smrg{
7887ec681f3Smrg   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
7897ec681f3Smrg   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
7907ec681f3Smrg   nir_ssa_def *lo_count = nir_bit_count(b, x_lo);
7917ec681f3Smrg   nir_ssa_def *hi_count = nir_bit_count(b, x_hi);
7927ec681f3Smrg   return nir_iadd(b, lo_count, hi_count);
7937ec681f3Smrg}
7947ec681f3Smrg
7957e102996Smayanir_lower_int64_options
7967e102996Smayanir_lower_int64_op_to_options_mask(nir_op opcode)
79701e04c3fSmrg{
79801e04c3fSmrg   switch (opcode) {
79901e04c3fSmrg   case nir_op_imul:
8007ec681f3Smrg   case nir_op_amul:
80101e04c3fSmrg      return nir_lower_imul64;
8027e102996Smaya   case nir_op_imul_2x32_64:
8037e102996Smaya   case nir_op_umul_2x32_64:
8047e102996Smaya      return nir_lower_imul_2x32_64;
8057e102996Smaya   case nir_op_imul_high:
8067e102996Smaya   case nir_op_umul_high:
8077e102996Smaya      return nir_lower_imul_high64;
80801e04c3fSmrg   case nir_op_isign:
80901e04c3fSmrg      return nir_lower_isign64;
81001e04c3fSmrg   case nir_op_udiv:
81101e04c3fSmrg   case nir_op_idiv:
81201e04c3fSmrg   case nir_op_umod:
81301e04c3fSmrg   case nir_op_imod:
81401e04c3fSmrg   case nir_op_irem:
81501e04c3fSmrg      return nir_lower_divmod64;
8167e102996Smaya   case nir_op_b2i64:
8177e102996Smaya   case nir_op_i2b1:
8187ec681f3Smrg   case nir_op_i2i8:
8197ec681f3Smrg   case nir_op_i2i16:
8207e102996Smaya   case nir_op_i2i32:
8217e102996Smaya   case nir_op_i2i64:
8227ec681f3Smrg   case nir_op_u2u8:
8237ec681f3Smrg   case nir_op_u2u16:
8247e102996Smaya   case nir_op_u2u32:
8257e102996Smaya   case nir_op_u2u64:
8267ec681f3Smrg   case nir_op_i2f32:
8277ec681f3Smrg   case nir_op_u2f32:
8287ec681f3Smrg   case nir_op_i2f16:
8297ec681f3Smrg   case nir_op_u2f16:
8307ec681f3Smrg   case nir_op_f2i64:
8317ec681f3Smrg   case nir_op_f2u64:
8327e102996Smaya   case nir_op_bcsel:
8337e102996Smaya      return nir_lower_mov64;
8347e102996Smaya   case nir_op_ieq:
8357e102996Smaya   case nir_op_ine:
8367e102996Smaya   case nir_op_ult:
8377e102996Smaya   case nir_op_ilt:
8387e102996Smaya   case nir_op_uge:
8397e102996Smaya   case nir_op_ige:
8407e102996Smaya      return nir_lower_icmp64;
8417e102996Smaya   case nir_op_iadd:
8427e102996Smaya   case nir_op_isub:
8437e102996Smaya      return nir_lower_iadd64;
8447e102996Smaya   case nir_op_imin:
8457e102996Smaya   case nir_op_imax:
8467e102996Smaya   case nir_op_umin:
8477e102996Smaya   case nir_op_umax:
8487e102996Smaya      return nir_lower_minmax64;
8497e102996Smaya   case nir_op_iabs:
8507e102996Smaya      return nir_lower_iabs64;
8517e102996Smaya   case nir_op_ineg:
8527e102996Smaya      return nir_lower_ineg64;
8537e102996Smaya   case nir_op_iand:
8547e102996Smaya   case nir_op_ior:
8557e102996Smaya   case nir_op_ixor:
8567e102996Smaya   case nir_op_inot:
8577e102996Smaya      return nir_lower_logic64;
8587e102996Smaya   case nir_op_ishl:
8597e102996Smaya   case nir_op_ishr:
8607e102996Smaya   case nir_op_ushr:
8617e102996Smaya      return nir_lower_shift64;
8627e102996Smaya   case nir_op_extract_u8:
8637e102996Smaya   case nir_op_extract_i8:
8647e102996Smaya   case nir_op_extract_u16:
8657e102996Smaya   case nir_op_extract_i16:
8667e102996Smaya      return nir_lower_extract64;
8677ec681f3Smrg   case nir_op_ufind_msb:
8687ec681f3Smrg      return nir_lower_ufind_msb64;
8697ec681f3Smrg   case nir_op_bit_count:
8707ec681f3Smrg      return nir_lower_bit_count64;
87101e04c3fSmrg   default:
87201e04c3fSmrg      return 0;
87301e04c3fSmrg   }
87401e04c3fSmrg}
87501e04c3fSmrg
87601e04c3fSmrgstatic nir_ssa_def *
87701e04c3fSmrglower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
87801e04c3fSmrg{
87901e04c3fSmrg   nir_ssa_def *src[4];
88001e04c3fSmrg   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
88101e04c3fSmrg      src[i] = nir_ssa_for_alu_src(b, alu, i);
88201e04c3fSmrg
88301e04c3fSmrg   switch (alu->op) {
88401e04c3fSmrg   case nir_op_imul:
8857ec681f3Smrg   case nir_op_amul:
88601e04c3fSmrg      return lower_imul64(b, src[0], src[1]);
8877e102996Smaya   case nir_op_imul_2x32_64:
8887e102996Smaya      return lower_mul_2x32_64(b, src[0], src[1], true);
8897e102996Smaya   case nir_op_umul_2x32_64:
8907e102996Smaya      return lower_mul_2x32_64(b, src[0], src[1], false);
8917e102996Smaya   case nir_op_imul_high:
8927e102996Smaya      return lower_mul_high64(b, src[0], src[1], true);
8937e102996Smaya   case nir_op_umul_high:
8947e102996Smaya      return lower_mul_high64(b, src[0], src[1], false);
89501e04c3fSmrg   case nir_op_isign:
89601e04c3fSmrg      return lower_isign64(b, src[0]);
89701e04c3fSmrg   case nir_op_udiv:
89801e04c3fSmrg      return lower_udiv64(b, src[0], src[1]);
89901e04c3fSmrg   case nir_op_idiv:
90001e04c3fSmrg      return lower_idiv64(b, src[0], src[1]);
90101e04c3fSmrg   case nir_op_umod:
90201e04c3fSmrg      return lower_umod64(b, src[0], src[1]);
90301e04c3fSmrg   case nir_op_imod:
90401e04c3fSmrg      return lower_imod64(b, src[0], src[1]);
90501e04c3fSmrg   case nir_op_irem:
90601e04c3fSmrg      return lower_irem64(b, src[0], src[1]);
9077e102996Smaya   case nir_op_b2i64:
9087e102996Smaya      return lower_b2i64(b, src[0]);
9097e102996Smaya   case nir_op_i2b1:
9107e102996Smaya      return lower_i2b(b, src[0]);
9117e102996Smaya   case nir_op_i2i8:
9127e102996Smaya      return lower_i2i8(b, src[0]);
9137e102996Smaya   case nir_op_i2i16:
9147e102996Smaya      return lower_i2i16(b, src[0]);
9157e102996Smaya   case nir_op_i2i32:
9167e102996Smaya      return lower_i2i32(b, src[0]);
9177e102996Smaya   case nir_op_i2i64:
9187e102996Smaya      return lower_i2i64(b, src[0]);
9197e102996Smaya   case nir_op_u2u8:
9207e102996Smaya      return lower_u2u8(b, src[0]);
9217e102996Smaya   case nir_op_u2u16:
9227e102996Smaya      return lower_u2u16(b, src[0]);
9237e102996Smaya   case nir_op_u2u32:
9247e102996Smaya      return lower_u2u32(b, src[0]);
9257e102996Smaya   case nir_op_u2u64:
9267e102996Smaya      return lower_u2u64(b, src[0]);
9277e102996Smaya   case nir_op_bcsel:
9287e102996Smaya      return lower_bcsel64(b, src[0], src[1], src[2]);
9297e102996Smaya   case nir_op_ieq:
9307e102996Smaya   case nir_op_ine:
9317e102996Smaya   case nir_op_ult:
9327e102996Smaya   case nir_op_ilt:
9337e102996Smaya   case nir_op_uge:
9347e102996Smaya   case nir_op_ige:
9357e102996Smaya      return lower_int64_compare(b, alu->op, src[0], src[1]);
9367e102996Smaya   case nir_op_iadd:
9377e102996Smaya      return lower_iadd64(b, src[0], src[1]);
9387e102996Smaya   case nir_op_isub:
9397e102996Smaya      return lower_isub64(b, src[0], src[1]);
9407e102996Smaya   case nir_op_imin:
9417e102996Smaya      return lower_imin64(b, src[0], src[1]);
9427e102996Smaya   case nir_op_imax:
9437e102996Smaya      return lower_imax64(b, src[0], src[1]);
9447e102996Smaya   case nir_op_umin:
9457e102996Smaya      return lower_umin64(b, src[0], src[1]);
9467e102996Smaya   case nir_op_umax:
9477e102996Smaya      return lower_umax64(b, src[0], src[1]);
9487e102996Smaya   case nir_op_iabs:
9497e102996Smaya      return lower_iabs64(b, src[0]);
9507e102996Smaya   case nir_op_ineg:
9517e102996Smaya      return lower_ineg64(b, src[0]);
9527e102996Smaya   case nir_op_iand:
9537e102996Smaya      return lower_iand64(b, src[0], src[1]);
9547e102996Smaya   case nir_op_ior:
9557e102996Smaya      return lower_ior64(b, src[0], src[1]);
9567e102996Smaya   case nir_op_ixor:
9577e102996Smaya      return lower_ixor64(b, src[0], src[1]);
9587e102996Smaya   case nir_op_inot:
9597e102996Smaya      return lower_inot64(b, src[0]);
9607e102996Smaya   case nir_op_ishl:
9617e102996Smaya      return lower_ishl64(b, src[0], src[1]);
9627e102996Smaya   case nir_op_ishr:
9637e102996Smaya      return lower_ishr64(b, src[0], src[1]);
9647e102996Smaya   case nir_op_ushr:
9657e102996Smaya      return lower_ushr64(b, src[0], src[1]);
9667e102996Smaya   case nir_op_extract_u8:
9677e102996Smaya   case nir_op_extract_i8:
9687e102996Smaya   case nir_op_extract_u16:
9697e102996Smaya   case nir_op_extract_i16:
9707e102996Smaya      return lower_extract(b, alu->op, src[0], src[1]);
9717ec681f3Smrg   case nir_op_ufind_msb:
9727ec681f3Smrg      return lower_ufind_msb64(b, src[0]);
9737ec681f3Smrg   case nir_op_bit_count:
9747ec681f3Smrg      return lower_bit_count64(b, src[0]);
9757ec681f3Smrg   case nir_op_i2f64:
9767ec681f3Smrg   case nir_op_i2f32:
9777ec681f3Smrg   case nir_op_i2f16:
9787ec681f3Smrg      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), true);
9797ec681f3Smrg   case nir_op_u2f64:
9807ec681f3Smrg   case nir_op_u2f32:
9817ec681f3Smrg   case nir_op_u2f16:
9827ec681f3Smrg      return lower_2f(b, src[0], nir_dest_bit_size(alu->dest.dest), false);
9837ec681f3Smrg   case nir_op_f2i64:
9847ec681f3Smrg   case nir_op_f2u64:
9857ec681f3Smrg      /* We don't support f64toi64 (yet?). */
9867ec681f3Smrg      if (src[0]->bit_size > 32)
9877ec681f3Smrg         return false;
9887ec681f3Smrg
9897ec681f3Smrg      return lower_f2(b, src[0], alu->op == nir_op_f2i64);
99001e04c3fSmrg   default:
99101e04c3fSmrg      unreachable("Invalid ALU opcode to lower");
99201e04c3fSmrg   }
99301e04c3fSmrg}
99401e04c3fSmrg
99501e04c3fSmrgstatic bool
9967ec681f3Smrgshould_lower_int64_alu_instr(const nir_alu_instr *alu,
9977ec681f3Smrg                             const nir_shader_compiler_options *options)
9987ec681f3Smrg{
9997ec681f3Smrg   switch (alu->op) {
10007ec681f3Smrg   case nir_op_i2b1:
10017ec681f3Smrg   case nir_op_i2i8:
10027ec681f3Smrg   case nir_op_i2i16:
10037ec681f3Smrg   case nir_op_i2i32:
10047ec681f3Smrg   case nir_op_u2u8:
10057ec681f3Smrg   case nir_op_u2u16:
10067ec681f3Smrg   case nir_op_u2u32:
10077ec681f3Smrg      assert(alu->src[0].src.is_ssa);
10087ec681f3Smrg      if (alu->src[0].src.ssa->bit_size != 64)
10097ec681f3Smrg         return false;
10107ec681f3Smrg      break;
10117ec681f3Smrg   case nir_op_bcsel:
10127ec681f3Smrg      assert(alu->src[1].src.is_ssa);
10137ec681f3Smrg      assert(alu->src[2].src.is_ssa);
10147ec681f3Smrg      assert(alu->src[1].src.ssa->bit_size ==
10157ec681f3Smrg             alu->src[2].src.ssa->bit_size);
10167ec681f3Smrg      if (alu->src[1].src.ssa->bit_size != 64)
10177ec681f3Smrg         return false;
10187ec681f3Smrg      break;
10197ec681f3Smrg   case nir_op_ieq:
10207ec681f3Smrg   case nir_op_ine:
10217ec681f3Smrg   case nir_op_ult:
10227ec681f3Smrg   case nir_op_ilt:
10237ec681f3Smrg   case nir_op_uge:
10247ec681f3Smrg   case nir_op_ige:
10257ec681f3Smrg      assert(alu->src[0].src.is_ssa);
10267ec681f3Smrg      assert(alu->src[1].src.is_ssa);
10277ec681f3Smrg      assert(alu->src[0].src.ssa->bit_size ==
10287ec681f3Smrg             alu->src[1].src.ssa->bit_size);
10297ec681f3Smrg      if (alu->src[0].src.ssa->bit_size != 64)
10307ec681f3Smrg         return false;
10317ec681f3Smrg      break;
10327ec681f3Smrg   case nir_op_ufind_msb:
10337ec681f3Smrg   case nir_op_bit_count:
10347ec681f3Smrg      assert(alu->src[0].src.is_ssa);
10357ec681f3Smrg      if (alu->src[0].src.ssa->bit_size != 64)
10367ec681f3Smrg         return false;
10377ec681f3Smrg      break;
10387ec681f3Smrg   case nir_op_amul:
10397ec681f3Smrg      assert(alu->dest.dest.is_ssa);
10407ec681f3Smrg      if (options->has_imul24)
10417ec681f3Smrg         return false;
10427ec681f3Smrg      if (alu->dest.dest.ssa.bit_size != 64)
10437ec681f3Smrg         return false;
10447ec681f3Smrg      break;
10457ec681f3Smrg   case nir_op_i2f64:
10467ec681f3Smrg   case nir_op_u2f64:
10477ec681f3Smrg   case nir_op_i2f32:
10487ec681f3Smrg   case nir_op_u2f32:
10497ec681f3Smrg   case nir_op_i2f16:
10507ec681f3Smrg   case nir_op_u2f16:
10517ec681f3Smrg      assert(alu->src[0].src.is_ssa);
10527ec681f3Smrg      if (alu->src[0].src.ssa->bit_size != 64)
10537ec681f3Smrg         return false;
10547ec681f3Smrg      break;
10557ec681f3Smrg   case nir_op_f2u64:
10567ec681f3Smrg   case nir_op_f2i64:
10577ec681f3Smrg      FALLTHROUGH;
10587ec681f3Smrg   default:
10597ec681f3Smrg      assert(alu->dest.dest.is_ssa);
10607ec681f3Smrg      if (alu->dest.dest.ssa.bit_size != 64)
10617ec681f3Smrg         return false;
10627ec681f3Smrg      break;
10637ec681f3Smrg   }
106401e04c3fSmrg
10657ec681f3Smrg   unsigned mask = nir_lower_int64_op_to_options_mask(alu->op);
10667ec681f3Smrg   return (options->lower_int64_options & mask) != 0;
10677ec681f3Smrg}
106801e04c3fSmrg
10697ec681f3Smrgstatic nir_ssa_def *
10707ec681f3Smrgsplit_64bit_subgroup_op(nir_builder *b, const nir_intrinsic_instr *intrin)
10717ec681f3Smrg{
10727ec681f3Smrg   const nir_intrinsic_info *info = &nir_intrinsic_infos[intrin->intrinsic];
107301e04c3fSmrg
10747ec681f3Smrg   /* This works on subgroup ops with a single 64-bit source which can be
10757ec681f3Smrg    * trivially lowered by doing the exact same op on both halves.
10767ec681f3Smrg    */
10777ec681f3Smrg   assert(intrin->src[0].is_ssa && intrin->src[0].ssa->bit_size == 64);
10787ec681f3Smrg   nir_ssa_def *split_src0[2] = {
10797ec681f3Smrg      nir_unpack_64_2x32_split_x(b, intrin->src[0].ssa),
10807ec681f3Smrg      nir_unpack_64_2x32_split_y(b, intrin->src[0].ssa),
10817ec681f3Smrg   };
10827ec681f3Smrg
10837ec681f3Smrg   assert(info->has_dest && intrin->dest.is_ssa &&
10847ec681f3Smrg          intrin->dest.ssa.bit_size == 64);
10857ec681f3Smrg
10867ec681f3Smrg   nir_ssa_def *res[2];
10877ec681f3Smrg   for (unsigned i = 0; i < 2; i++) {
10887ec681f3Smrg      nir_intrinsic_instr *split =
10897ec681f3Smrg         nir_intrinsic_instr_create(b->shader, intrin->intrinsic);
10907ec681f3Smrg      split->num_components = intrin->num_components;
10917ec681f3Smrg      split->src[0] = nir_src_for_ssa(split_src0[i]);
10927ec681f3Smrg
10937ec681f3Smrg      /* Other sources must be less than 64 bits and get copied directly */
10947ec681f3Smrg      for (unsigned j = 1; j < info->num_srcs; j++) {
10957ec681f3Smrg         assert(intrin->src[j].is_ssa && intrin->src[j].ssa->bit_size < 64);
10967ec681f3Smrg         split->src[j] = nir_src_for_ssa(intrin->src[j].ssa);
109701e04c3fSmrg      }
10987ec681f3Smrg
10997ec681f3Smrg      /* Copy const indices, if any */
11007ec681f3Smrg      memcpy(split->const_index, intrin->const_index,
11017ec681f3Smrg             sizeof(intrin->const_index));
11027ec681f3Smrg
11037ec681f3Smrg      nir_ssa_dest_init(&split->instr, &split->dest,
11047ec681f3Smrg                        intrin->dest.ssa.num_components, 32, NULL);
11057ec681f3Smrg      nir_builder_instr_insert(b, &split->instr);
11067ec681f3Smrg
11077ec681f3Smrg      res[i] = &split->dest.ssa;
110801e04c3fSmrg   }
110901e04c3fSmrg
11107ec681f3Smrg   return nir_pack_64_2x32_split(b, res[0], res[1]);
11117ec681f3Smrg}
11127ec681f3Smrg
11137ec681f3Smrgstatic nir_ssa_def *
11147ec681f3Smrgbuild_vote_ieq(nir_builder *b, nir_ssa_def *x)
11157ec681f3Smrg{
11167ec681f3Smrg   nir_intrinsic_instr *vote =
11177ec681f3Smrg      nir_intrinsic_instr_create(b->shader, nir_intrinsic_vote_ieq);
11187ec681f3Smrg   vote->src[0] = nir_src_for_ssa(x);
11197ec681f3Smrg   vote->num_components = x->num_components;
11207ec681f3Smrg   nir_ssa_dest_init(&vote->instr, &vote->dest, 1, 1, NULL);
11217ec681f3Smrg   nir_builder_instr_insert(b, &vote->instr);
11227ec681f3Smrg   return &vote->dest.ssa;
11237ec681f3Smrg}
11247ec681f3Smrg
11257ec681f3Smrgstatic nir_ssa_def *
11267ec681f3Smrglower_vote_ieq(nir_builder *b, nir_ssa_def *x)
11277ec681f3Smrg{
11287ec681f3Smrg   return nir_iand(b, build_vote_ieq(b, nir_unpack_64_2x32_split_x(b, x)),
11297ec681f3Smrg                      build_vote_ieq(b, nir_unpack_64_2x32_split_y(b, x)));
11307ec681f3Smrg}
11317ec681f3Smrg
11327ec681f3Smrgstatic nir_ssa_def *
11337ec681f3Smrgbuild_scan_intrinsic(nir_builder *b, nir_intrinsic_op scan_op,
11347ec681f3Smrg                     nir_op reduction_op, unsigned cluster_size,
11357ec681f3Smrg                     nir_ssa_def *val)
11367ec681f3Smrg{
11377ec681f3Smrg   nir_intrinsic_instr *scan =
11387ec681f3Smrg      nir_intrinsic_instr_create(b->shader, scan_op);
11397ec681f3Smrg   scan->num_components = val->num_components;
11407ec681f3Smrg   scan->src[0] = nir_src_for_ssa(val);
11417ec681f3Smrg   nir_intrinsic_set_reduction_op(scan, reduction_op);
11427ec681f3Smrg   if (scan_op == nir_intrinsic_reduce)
11437ec681f3Smrg      nir_intrinsic_set_cluster_size(scan, cluster_size);
11447ec681f3Smrg   nir_ssa_dest_init(&scan->instr, &scan->dest,
11457ec681f3Smrg                     val->num_components, val->bit_size, NULL);
11467ec681f3Smrg   nir_builder_instr_insert(b, &scan->instr);
11477ec681f3Smrg   return &scan->dest.ssa;
11487ec681f3Smrg}
11497ec681f3Smrg
11507ec681f3Smrgstatic nir_ssa_def *
11517ec681f3Smrglower_scan_iadd64(nir_builder *b, const nir_intrinsic_instr *intrin)
11527ec681f3Smrg{
11537ec681f3Smrg   unsigned cluster_size =
11547ec681f3Smrg      intrin->intrinsic == nir_intrinsic_reduce ?
11557ec681f3Smrg      nir_intrinsic_cluster_size(intrin) : 0;
11567ec681f3Smrg
11577ec681f3Smrg   /* Split it into three chunks of no more than 24 bits each.  With 8 bits
11587ec681f3Smrg    * of headroom, we're guaranteed that there will never be overflow in the
11597ec681f3Smrg    * individual subgroup operations.  (Assuming, of course, a subgroup size
11607ec681f3Smrg    * no larger than 256 which seems reasonable.)  We can then scan on each of
11617ec681f3Smrg    * the chunks and add them back together at the end.
11627ec681f3Smrg    */
11637ec681f3Smrg   assert(intrin->src[0].is_ssa);
11647ec681f3Smrg   nir_ssa_def *x = intrin->src[0].ssa;
11657ec681f3Smrg   nir_ssa_def *x_low =
11667ec681f3Smrg      nir_u2u32(b, nir_iand_imm(b, x, 0xffffff));
11677ec681f3Smrg   nir_ssa_def *x_mid =
11687ec681f3Smrg      nir_u2u32(b, nir_iand_imm(b, nir_ushr(b, x, nir_imm_int(b, 24)),
11697ec681f3Smrg                                   0xffffff));
11707ec681f3Smrg   nir_ssa_def *x_hi =
11717ec681f3Smrg      nir_u2u32(b, nir_ushr(b, x, nir_imm_int(b, 48)));
11727ec681f3Smrg
11737ec681f3Smrg   nir_ssa_def *scan_low =
11747ec681f3Smrg      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
11757ec681f3Smrg                              cluster_size, x_low);
11767ec681f3Smrg   nir_ssa_def *scan_mid =
11777ec681f3Smrg      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
11787ec681f3Smrg                              cluster_size, x_mid);
11797ec681f3Smrg   nir_ssa_def *scan_hi =
11807ec681f3Smrg      build_scan_intrinsic(b, intrin->intrinsic, nir_op_iadd,
11817ec681f3Smrg                              cluster_size, x_hi);
11827ec681f3Smrg
11837ec681f3Smrg   scan_low = nir_u2u64(b, scan_low);
11847ec681f3Smrg   scan_mid = nir_ishl(b, nir_u2u64(b, scan_mid), nir_imm_int(b, 24));
11857ec681f3Smrg   scan_hi = nir_ishl(b, nir_u2u64(b, scan_hi), nir_imm_int(b, 48));
11867ec681f3Smrg
11877ec681f3Smrg   return nir_iadd(b, scan_hi, nir_iadd(b, scan_mid, scan_low));
11887ec681f3Smrg}
11897ec681f3Smrg
11907ec681f3Smrgstatic bool
11917ec681f3Smrgshould_lower_int64_intrinsic(const nir_intrinsic_instr *intrin,
11927ec681f3Smrg                             const nir_shader_compiler_options *options)
11937ec681f3Smrg{
11947ec681f3Smrg   switch (intrin->intrinsic) {
11957ec681f3Smrg   case nir_intrinsic_read_invocation:
11967ec681f3Smrg   case nir_intrinsic_read_first_invocation:
11977ec681f3Smrg   case nir_intrinsic_shuffle:
11987ec681f3Smrg   case nir_intrinsic_shuffle_xor:
11997ec681f3Smrg   case nir_intrinsic_shuffle_up:
12007ec681f3Smrg   case nir_intrinsic_shuffle_down:
12017ec681f3Smrg   case nir_intrinsic_quad_broadcast:
12027ec681f3Smrg   case nir_intrinsic_quad_swap_horizontal:
12037ec681f3Smrg   case nir_intrinsic_quad_swap_vertical:
12047ec681f3Smrg   case nir_intrinsic_quad_swap_diagonal:
12057ec681f3Smrg      assert(intrin->dest.is_ssa);
12067ec681f3Smrg      return intrin->dest.ssa.bit_size == 64 &&
12077ec681f3Smrg             (options->lower_int64_options & nir_lower_subgroup_shuffle64);
12087ec681f3Smrg
12097ec681f3Smrg   case nir_intrinsic_vote_ieq:
12107ec681f3Smrg      assert(intrin->src[0].is_ssa);
12117ec681f3Smrg      return intrin->src[0].ssa->bit_size == 64 &&
12127ec681f3Smrg             (options->lower_int64_options & nir_lower_vote_ieq64);
12137ec681f3Smrg
12147ec681f3Smrg   case nir_intrinsic_reduce:
12157ec681f3Smrg   case nir_intrinsic_inclusive_scan:
12167ec681f3Smrg   case nir_intrinsic_exclusive_scan:
12177ec681f3Smrg      assert(intrin->dest.is_ssa);
12187ec681f3Smrg      if (intrin->dest.ssa.bit_size != 64)
12197ec681f3Smrg         return false;
12207ec681f3Smrg
12217ec681f3Smrg      switch (nir_intrinsic_reduction_op(intrin)) {
12227ec681f3Smrg      case nir_op_iadd:
12237ec681f3Smrg         return options->lower_int64_options & nir_lower_scan_reduce_iadd64;
12247ec681f3Smrg      case nir_op_iand:
12257ec681f3Smrg      case nir_op_ior:
12267ec681f3Smrg      case nir_op_ixor:
12277ec681f3Smrg         return options->lower_int64_options & nir_lower_scan_reduce_bitwise64;
12287ec681f3Smrg      default:
12297ec681f3Smrg         return false;
12307ec681f3Smrg      }
12317ec681f3Smrg      break;
12327ec681f3Smrg
12337ec681f3Smrg   default:
12347ec681f3Smrg      return false;
12357e102996Smaya   }
12367ec681f3Smrg}
123701e04c3fSmrg
12387ec681f3Smrgstatic nir_ssa_def *
12397ec681f3Smrglower_int64_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin)
12407ec681f3Smrg{
12417ec681f3Smrg   switch (intrin->intrinsic) {
12427ec681f3Smrg   case nir_intrinsic_read_invocation:
12437ec681f3Smrg   case nir_intrinsic_read_first_invocation:
12447ec681f3Smrg   case nir_intrinsic_shuffle:
12457ec681f3Smrg   case nir_intrinsic_shuffle_xor:
12467ec681f3Smrg   case nir_intrinsic_shuffle_up:
12477ec681f3Smrg   case nir_intrinsic_shuffle_down:
12487ec681f3Smrg   case nir_intrinsic_quad_broadcast:
12497ec681f3Smrg   case nir_intrinsic_quad_swap_horizontal:
12507ec681f3Smrg   case nir_intrinsic_quad_swap_vertical:
12517ec681f3Smrg   case nir_intrinsic_quad_swap_diagonal:
12527ec681f3Smrg      return split_64bit_subgroup_op(b, intrin);
12537ec681f3Smrg
12547ec681f3Smrg   case nir_intrinsic_vote_ieq:
12557ec681f3Smrg      assert(intrin->src[0].is_ssa);
12567ec681f3Smrg      return lower_vote_ieq(b, intrin->src[0].ssa);
12577ec681f3Smrg
12587ec681f3Smrg   case nir_intrinsic_reduce:
12597ec681f3Smrg   case nir_intrinsic_inclusive_scan:
12607ec681f3Smrg   case nir_intrinsic_exclusive_scan:
12617ec681f3Smrg      switch (nir_intrinsic_reduction_op(intrin)) {
12627ec681f3Smrg      case nir_op_iadd:
12637ec681f3Smrg         return lower_scan_iadd64(b, intrin);
12647ec681f3Smrg      case nir_op_iand:
12657ec681f3Smrg      case nir_op_ior:
12667ec681f3Smrg      case nir_op_ixor:
12677ec681f3Smrg         return split_64bit_subgroup_op(b, intrin);
12687ec681f3Smrg      default:
12697ec681f3Smrg         unreachable("Unsupported subgroup scan/reduce op");
12707ec681f3Smrg      }
12717ec681f3Smrg      break;
12727ec681f3Smrg
12737ec681f3Smrg   default:
12747ec681f3Smrg      unreachable("Unsupported intrinsic");
12757ec681f3Smrg   }
127601e04c3fSmrg}
127701e04c3fSmrg
12787ec681f3Smrgstatic bool
12797ec681f3Smrgshould_lower_int64_instr(const nir_instr *instr, const void *_options)
128001e04c3fSmrg{
12817ec681f3Smrg   switch (instr->type) {
12827ec681f3Smrg   case nir_instr_type_alu:
12837ec681f3Smrg      return should_lower_int64_alu_instr(nir_instr_as_alu(instr), _options);
12847ec681f3Smrg   case nir_instr_type_intrinsic:
12857ec681f3Smrg      return should_lower_int64_intrinsic(nir_instr_as_intrinsic(instr),
12867ec681f3Smrg                                          _options);
12877ec681f3Smrg   default:
12887ec681f3Smrg      return false;
12897ec681f3Smrg   }
12907ec681f3Smrg}
129101e04c3fSmrg
12927ec681f3Smrgstatic nir_ssa_def *
12937ec681f3Smrglower_int64_instr(nir_builder *b, nir_instr *instr, void *_options)
12947ec681f3Smrg{
12957ec681f3Smrg   switch (instr->type) {
12967ec681f3Smrg   case nir_instr_type_alu:
12977ec681f3Smrg      return lower_int64_alu_instr(b, nir_instr_as_alu(instr));
12987ec681f3Smrg   case nir_instr_type_intrinsic:
12997ec681f3Smrg      return lower_int64_intrinsic(b, nir_instr_as_intrinsic(instr));
13007ec681f3Smrg   default:
13017ec681f3Smrg      return NULL;
130201e04c3fSmrg   }
13037ec681f3Smrg}
130401e04c3fSmrg
13057ec681f3Smrgbool
13067ec681f3Smrgnir_lower_int64(nir_shader *shader)
13077ec681f3Smrg{
13087ec681f3Smrg   return nir_shader_lower_instructions(shader, should_lower_int64_instr,
13097ec681f3Smrg                                        lower_int64_instr,
13107ec681f3Smrg                                        (void *)shader->options);
131101e04c3fSmrg}
1312