/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "nir.h"
#include "nir_builder.h"

static nir_ssa_def *
lower_b2i64(nir_builder *b, nir_ssa_def *x)
{
   return nir_pack_64_2x32_split(b, nir_b2i32(b, x), nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2b(nir_builder *b, nir_ssa_def *x)
{
   return nir_ine(b, nir_ior(b, nir_unpack_64_2x32_split_x(b, x),
                                nir_unpack_64_2x32_split_y(b, x)),
                     nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_i2i8(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i16(nir_builder *b, nir_ssa_def *x)
{
   return nir_i2i16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_i2i32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_i2i64(nir_builder *b, nir_ssa_def *x)
{
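   /* Convert to 32 bits first if necessary, then sign-extend into the upper
    * dword by replicating the 32-bit value's sign bit.
    */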
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_i2i32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_ishr(b, x32, nir_imm_int(b, 31)));
}

static nir_ssa_def *
lower_u2u8(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u8(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u16(nir_builder *b, nir_ssa_def *x)
{
   return nir_u2u16(b, nir_unpack_64_2x32_split_x(b, x));
}

static nir_ssa_def *
lower_u2u32(nir_builder *b, nir_ssa_def *x)
{
   return nir_unpack_64_2x32_split_x(b, x);
}

static nir_ssa_def *
lower_u2u64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x32 = x->bit_size == 32 ? x : nir_u2u32(b, x);
   return nir_pack_64_2x32_split(b, x32, nir_imm_int(b, 0));
}

static nir_ssa_def *
lower_bcsel64(nir_builder *b, nir_ssa_def *cond, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_bcsel(b, cond, x_lo, y_lo),
                                    nir_bcsel(b, cond, x_hi, y_hi));
}

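/* The bitwise operations below (inot, iand, ior, ixor) have no interaction
 * between bits, so they lower by applying the 32-bit opcode to the low and
 * high dwords separately and repacking the two results.
 */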
static nir_ssa_def *
lower_inot64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   return nir_pack_64_2x32_split(b, nir_inot(b, x_lo), nir_inot(b, x_hi));
}

static nir_ssa_def *
lower_iand64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_iand(b, x_lo, y_lo),
                                    nir_iand(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ior64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ior(b, x_lo, y_lo),
                                    nir_ior(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ixor64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   return nir_pack_64_2x32_split(b, nir_ixor(b, x_lo, y_lo),
                                    nir_ixor(b, x_hi, y_hi));
}

static nir_ssa_def *
lower_ishl64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t lshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo << c;
    *       uint32_t hi_shifted = hi << c;
    *       uint32_t lo_shifted_hi = lo >> abs(32 - c);
    *       return pack_64(lo_shifted, hi_shifted | lo_shifted_hi);
    *    } else {
    *       uint32_t lo_shifted_hi = lo << abs(32 - c);
    *       return pack_64(0, lo_shifted_hi);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ishl(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishl(b, x_hi, y);
   nir_ssa_def *lo_shifted_hi = nir_ushr(b, x_lo, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, lo_shifted,
                                nir_ior(b, hi_shifted, lo_shifted_hi));
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_imm_int(b, 0),
                                nir_ishl(b, x_lo, reverse_count));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ishr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t arshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x);
    *    int32_t  hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted = hi >> 31;
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo);
    *    }
    * }
    */
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ishr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ishr(b, x_hi, reverse_count),
                                nir_ishr(b, x_hi, nir_imm_int(b, 31)));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_ushr64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   /* Implemented as
    *
    * uint64_t rshift(uint64_t x, int c)
    * {
    *    if (c == 0) return x;
    *
    *    uint32_t lo = LO(x), hi = HI(x);
    *
    *    if (c < 32) {
    *       uint32_t lo_shifted = lo >> c;
    *       uint32_t hi_shifted = hi >> c;
    *       uint32_t hi_shifted_lo = hi << abs(32 - c);
    *       return pack_64(hi_shifted, hi_shifted_lo | lo_shifted);
    *    } else {
    *       uint32_t hi_shifted_lo = hi >> abs(32 - c);
    *       return pack_64(0, hi_shifted_lo);
    *    }
    * }
    */

   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *reverse_count = nir_iabs(b, nir_iadd(b, y, nir_imm_int(b, -32)));
   nir_ssa_def *lo_shifted = nir_ushr(b, x_lo, y);
   nir_ssa_def *hi_shifted = nir_ushr(b, x_hi, y);
   nir_ssa_def *hi_shifted_lo = nir_ishl(b, x_hi, reverse_count);

   nir_ssa_def *res_if_lt_32 =
      nir_pack_64_2x32_split(b, nir_ior(b, lo_shifted, hi_shifted_lo),
                                hi_shifted);
   nir_ssa_def *res_if_ge_32 =
      nir_pack_64_2x32_split(b, nir_ushr(b, x_hi, reverse_count),
                                nir_imm_int(b, 0));

   return nir_bcsel(b,
                    nir_ieq(b, y, nir_imm_int(b, 0)), x,
                    nir_bcsel(b, nir_uge(b, y, nir_imm_int(b, 32)),
                              res_if_ge_32, res_if_lt_32));
}

static nir_ssa_def *
lower_iadd64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

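   /* Add the low dwords first; the addition carried iff the 32-bit result
    * wrapped around, i.e. ended up unsigned-less-than one of the operands.
    */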
   nir_ssa_def *res_lo = nir_iadd(b, x_lo, y_lo);
   nir_ssa_def *carry = nir_b2i32(b, nir_ult(b, res_lo, x_lo));
   nir_ssa_def *res_hi = nir_iadd(b, carry, nir_iadd(b, x_hi, y_hi));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_isub64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

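   /* Mirror image of the add: a borrow out of the low dwords occurred iff
    * x_lo < y_lo, and negating the boolean-as-integer gives the -1 that then
    * gets added into the high dword.
    */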
   nir_ssa_def *res_lo = nir_isub(b, x_lo, y_lo);
   nir_ssa_def *borrow = nir_ineg(b, nir_b2i32(b, nir_ult(b, x_lo, y_lo)));
   nir_ssa_def *res_hi = nir_iadd(b, nir_isub(b, x_hi, y_hi), borrow);

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static nir_ssa_def *
lower_ineg64(nir_builder *b, nir_ssa_def *x)
{
   /* Since isub is the same number of instructions (with better dependencies)
    * as iadd, subtraction is actually more efficient for ineg than the usual
    * 2's complement "flip the bits and add one".
    */
   return lower_isub64(b, nir_imm_int64(b, 0), x);
}

static nir_ssa_def *
lower_iabs64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *x_is_neg = nir_ilt(b, x_hi, nir_imm_int(b, 0));
   return nir_bcsel(b, x_is_neg, nir_ineg(b, x), x);
}

static nir_ssa_def *
lower_int64_compare(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *y)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   switch (op) {
   case nir_op_ieq:
      return nir_iand(b, nir_ieq(b, x_hi, y_hi), nir_ieq(b, x_lo, y_lo));
   case nir_op_ine:
      return nir_ior(b, nir_ine(b, x_hi, y_hi), nir_ine(b, x_lo, y_lo));
   case nir_op_ult:
      return nir_ior(b, nir_ult(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
   case nir_op_ilt:
      return nir_ior(b, nir_ilt(b, x_hi, y_hi),
                        nir_iand(b, nir_ieq(b, x_hi, y_hi),
                                    nir_ult(b, x_lo, y_lo)));
      break;
   case nir_op_uge:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ult, x, y));
   case nir_op_ige:
      /* Lower as !(x < y) in the hopes of better CSE */
      return nir_inot(b, lower_int64_compare(b, nir_op_ilt, x, y));
   default:
      unreachable("Invalid comparison");
   }
}

static nir_ssa_def *
lower_umax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), y, x);
}

static nir_ssa_def *
lower_imax64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), y, x);
}

static nir_ssa_def *
lower_umin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ult, x, y), x, y);
}

static nir_ssa_def *
lower_imin64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
   return nir_bcsel(b, lower_int64_compare(b, nir_op_ilt, x, y), x, y);
}

static nir_ssa_def *
lower_mul_2x32_64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                  bool sign_extend)
{
   nir_ssa_def *res_hi = sign_extend ? nir_imul_high(b, x, y)
                                     : nir_umul_high(b, x, y);

   return nir_pack_64_2x32_split(b, nir_imul(b, x, y), res_hi);
}

static nir_ssa_def *
lower_imul64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y)
{
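   /* Schoolbook multiplication on dwords: the full 64-bit product of the two
    * low dwords supplies the low result dword and a partial high dword; the
    * cross terms x_lo * y_hi and x_hi * y_lo only contribute to the high
    * dword, and x_hi * y_hi falls outside the 64-bit result entirely.
    */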
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);
   nir_ssa_def *y_lo = nir_unpack_64_2x32_split_x(b, y);
   nir_ssa_def *y_hi = nir_unpack_64_2x32_split_y(b, y);

   nir_ssa_def *mul_lo = nir_umul_2x32_64(b, x_lo, y_lo);
   nir_ssa_def *res_hi = nir_iadd(b, nir_unpack_64_2x32_split_y(b, mul_lo),
                                  nir_iadd(b, nir_imul(b, x_lo, y_hi),
                                              nir_imul(b, x_hi, y_lo)));

   return nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, mul_lo),
                                    res_hi);
}

static nir_ssa_def *
lower_mul_high64(nir_builder *b, nir_ssa_def *x, nir_ssa_def *y,
                 bool sign_extend)
{
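   /* Sign- or zero-extend both operands to four 32-bit chunks, multiply
    * chunk-by-chunk with explicit carry propagation (schoolbook style), and
    * return chunks 2 and 3 of the result, i.e. bits [64, 127] of the full
    * product, which is exactly the high half that [iu]mul_high asks for.
    */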
   nir_ssa_def *x32[4], *y32[4];
   x32[0] = nir_unpack_64_2x32_split_x(b, x);
   x32[1] = nir_unpack_64_2x32_split_y(b, x);
   if (sign_extend) {
      x32[2] = x32[3] = nir_ishr(b, x32[1], nir_imm_int(b, 31));
   } else {
      x32[2] = x32[3] = nir_imm_int(b, 0);
   }

   y32[0] = nir_unpack_64_2x32_split_x(b, y);
   y32[1] = nir_unpack_64_2x32_split_y(b, y);
   if (sign_extend) {
      y32[2] = y32[3] = nir_ishr(b, y32[1], nir_imm_int(b, 31));
   } else {
      y32[2] = y32[3] = nir_imm_int(b, 0);
   }

   nir_ssa_def *res[8] = { NULL, };

   /* Yes, the following generates a pile of code.  However, we throw res[0]
    * and res[1] away in the end and, if we're in the umul case, four of our
    * eight dword operands will be constant zero and opt_algebraic will clean
    * this up nicely.
    */
   for (unsigned i = 0; i < 4; i++) {
      nir_ssa_def *carry = NULL;
      for (unsigned j = 0; j < 4; j++) {
         /* The maximum values of x32[i] and y32[j] are UINT32_MAX so the
          * maximum value of tmp is UINT32_MAX * UINT32_MAX.  The maximum
          * value that will fit in tmp is
          *
          *    UINT64_MAX = UINT32_MAX << 32 + UINT32_MAX
          *               = UINT32_MAX * (UINT32_MAX + 1) + UINT32_MAX
          *               = UINT32_MAX * UINT32_MAX + 2 * UINT32_MAX
          *
          * so we're guaranteed that we can add in two more 32-bit values
          * without overflowing tmp.
          */
         nir_ssa_def *tmp = nir_umul_2x32_64(b, x32[i], y32[j]);

         if (res[i + j])
            tmp = nir_iadd(b, tmp, nir_u2u64(b, res[i + j]));
         if (carry)
            tmp = nir_iadd(b, tmp, carry);
         res[i + j] = nir_u2u32(b, tmp);
         carry = nir_ushr(b, tmp, nir_imm_int(b, 32));
      }
      res[i + 4] = nir_u2u32(b, carry);
   }

   return nir_pack_64_2x32_split(b, res[2], res[3]);
}

static nir_ssa_def *
lower_isign64(nir_builder *b, nir_ssa_def *x)
{
   nir_ssa_def *x_lo = nir_unpack_64_2x32_split_x(b, x);
   nir_ssa_def *x_hi = nir_unpack_64_2x32_split_y(b, x);

   nir_ssa_def *is_non_zero = nir_i2b(b, nir_ior(b, x_lo, x_hi));
   nir_ssa_def *res_hi = nir_ishr(b, x_hi, nir_imm_int(b, 31));
   nir_ssa_def *res_lo = nir_ior(b, res_hi, nir_b2i32(b, is_non_zero));

   return nir_pack_64_2x32_split(b, res_lo, res_hi);
}

static void
lower_udiv64_mod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d,
                   nir_ssa_def **q, nir_ssa_def **r)
{
   /* TODO: We should specially handle the case where the denominator is a
    * constant.  In that case, we should be able to reduce it to a multiply by
    * a constant, some shifts, and an add.
    */
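   /* Divide bit-by-bit, one dword of quotient at a time.  The first phase
    * below only runs when the denominator fits in 32 bits and the numerator's
    * high dword is at least as large as it; only then can the quotient have
    * bits at positions 32..63, and those are found by long division of n_hi
    * by d_lo.
    */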
   nir_ssa_def *n_lo = nir_unpack_64_2x32_split_x(b, n);
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_lo = nir_unpack_64_2x32_split_x(b, d);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *q_lo = nir_imm_zero(b, n->num_components, 32);
   nir_ssa_def *q_hi = nir_imm_zero(b, n->num_components, 32);

   nir_ssa_def *n_hi_before_if = n_hi;
   nir_ssa_def *q_hi_before_if = q_hi;

   /* If the upper 32 bits of denom are non-zero, it is impossible for shifts
    * greater than 32 bits to occur.  If the upper 32 bits of the numerator
    * are zero, it is impossible for (denom << [63, 32]) <= numer unless
    * denom == 0.
    */
   nir_ssa_def *need_high_div =
      nir_iand(b, nir_ieq(b, d_hi, nir_imm_int(b, 0)), nir_uge(b, n_hi, d_lo));
   nir_push_if(b, nir_bany(b, need_high_div));
   {
      /* If we only have one component, then the bany above goes away and
       * this is always true within the if statement.
       */
      if (n->num_components == 1)
         need_high_div = nir_imm_true(b);

      nir_ssa_def *log2_d_lo = nir_ufind_msb(b, d_lo);

      for (int i = 31; i >= 0; i--) {
         /* if ((d.x << i) <= n.y) {
          *    n.y -= d.x << i;
          *    quot.y |= 1U << i;
          * }
          */
         nir_ssa_def *d_shift = nir_ishl(b, d_lo, nir_imm_int(b, i));
         nir_ssa_def *new_n_hi = nir_isub(b, n_hi, d_shift);
         nir_ssa_def *new_q_hi = nir_ior(b, q_hi, nir_imm_int(b, 1u << i));
         nir_ssa_def *cond = nir_iand(b, need_high_div,
                                      nir_uge(b, n_hi, d_shift));
         if (i != 0) {
            /* log2_d_lo is always <= 31, so we don't need to bother with it
             * in the last iteration.
             */
            cond = nir_iand(b, cond,
                            nir_ige(b, nir_imm_int(b, 31 - i), log2_d_lo));
         }
         n_hi = nir_bcsel(b, cond, new_n_hi, n_hi);
         q_hi = nir_bcsel(b, cond, new_q_hi, q_hi);
      }
   }
   nir_pop_if(b, NULL);
   n_hi = nir_if_phi(b, n_hi, n_hi_before_if);
   q_hi = nir_if_phi(b, q_hi, q_hi_before_if);

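   /* At this point any quotient bits above position 31 have been folded into
    * q_hi and subtracted out of n_hi, so the remaining quotient fits in 32
    * bits.  The same bit-by-bit loop on the full 64-bit values produces q_lo
    * and leaves the remainder in n.
    */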
   nir_ssa_def *log2_denom = nir_ufind_msb(b, d_hi);

   n = nir_pack_64_2x32_split(b, n_lo, n_hi);
   d = nir_pack_64_2x32_split(b, d_lo, d_hi);
   for (int i = 31; i >= 0; i--) {
      /* if ((d64 << i) <= n64) {
       *    n64 -= d64 << i;
       *    quot.x |= 1U << i;
       * }
       */
      nir_ssa_def *d_shift = nir_ishl(b, d, nir_imm_int(b, i));
      nir_ssa_def *new_n = nir_isub(b, n, d_shift);
      nir_ssa_def *new_q_lo = nir_ior(b, q_lo, nir_imm_int(b, 1u << i));
      nir_ssa_def *cond = nir_uge(b, n, d_shift);
      if (i != 0) {
         /* log2_denom is always <= 31, so we don't need to bother with it
          * in the last iteration.
          */
         cond = nir_iand(b, cond,
                         nir_ige(b, nir_imm_int(b, 31 - i), log2_denom));
      }
      n = nir_bcsel(b, cond, new_n, n);
      q_lo = nir_bcsel(b, cond, new_q_lo, q_lo);
   }

   *q = nir_pack_64_2x32_split(b, q_lo, q_hi);
   *r = n;
}

static nir_ssa_def *
lower_udiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return q;
}

static nir_ssa_def *
lower_idiv64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);

   nir_ssa_def *negate = nir_ine(b, nir_ilt(b, n_hi, nir_imm_int(b, 0)),
                                 nir_ilt(b, d_hi, nir_imm_int(b, 0)));
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, negate, nir_ineg(b, q), q);
}

static nir_ssa_def *
lower_umod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, n, d, &q, &r);
   return r;
}

static nir_ssa_def *
lower_imod64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *d_hi = nir_unpack_64_2x32_split_y(b, d);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));
   nir_ssa_def *d_is_neg = nir_ilt(b, d_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);

   nir_ssa_def *rem = nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);

   return nir_bcsel(b, nir_ieq(b, r, nir_imm_int64(b, 0)), nir_imm_int64(b, 0),
                       nir_bcsel(b, nir_ieq(b, n_is_neg, d_is_neg), rem,
                                    nir_iadd(b, rem, d)));
}

static nir_ssa_def *
lower_irem64(nir_builder *b, nir_ssa_def *n, nir_ssa_def *d)
{
   nir_ssa_def *n_hi = nir_unpack_64_2x32_split_y(b, n);
   nir_ssa_def *n_is_neg = nir_ilt(b, n_hi, nir_imm_int(b, 0));

   nir_ssa_def *q, *r;
   lower_udiv64_mod64(b, nir_iabs(b, n), nir_iabs(b, d), &q, &r);
   return nir_bcsel(b, n_is_neg, nir_ineg(b, r), r);
}

static nir_ssa_def *
lower_extract(nir_builder *b, nir_op op, nir_ssa_def *x, nir_ssa_def *c)
{
   assert(op == nir_op_extract_u8 || op == nir_op_extract_i8 ||
          op == nir_op_extract_u16 || op == nir_op_extract_i16);

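   /* The chunk index selects an 8- or 16-bit slice of the 64-bit value.
    * Slices in the low dword are extracted directly; slices in the high dword
    * use the same opcode with the index rebased, and the 32-bit result is
    * then sign- or zero-extended back to 64 bits.
    */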
   const int chunk = nir_src_as_uint(nir_src_for_ssa(c));
   const int chunk_bits =
      (op == nir_op_extract_u8 || op == nir_op_extract_i8) ? 8 : 16;
   const int num_chunks_in_32 = 32 / chunk_bits;

   nir_ssa_def *extract32;
   if (chunk < num_chunks_in_32) {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_x(b, x),
                                nir_imm_int(b, chunk),
                                NULL, NULL);
   } else {
      extract32 = nir_build_alu(b, op, nir_unpack_64_2x32_split_y(b, x),
                                nir_imm_int(b, chunk - num_chunks_in_32),
                                NULL, NULL);
   }

   if (op == nir_op_extract_i8 || op == nir_op_extract_i16)
      return lower_i2i64(b, extract32);
   else
      return lower_u2u64(b, extract32);
}

nir_lower_int64_options
nir_lower_int64_op_to_options_mask(nir_op opcode)
{
   switch (opcode) {
   case nir_op_imul:
      return nir_lower_imul64;
   case nir_op_imul_2x32_64:
   case nir_op_umul_2x32_64:
      return nir_lower_imul_2x32_64;
   case nir_op_imul_high:
   case nir_op_umul_high:
      return nir_lower_imul_high64;
   case nir_op_isign:
      return nir_lower_isign64;
   case nir_op_udiv:
   case nir_op_idiv:
   case nir_op_umod:
   case nir_op_imod:
   case nir_op_irem:
      return nir_lower_divmod64;
   case nir_op_b2i64:
   case nir_op_i2b1:
   case nir_op_i2i32:
   case nir_op_i2i64:
   case nir_op_u2u32:
   case nir_op_u2u64:
   case nir_op_bcsel:
      return nir_lower_mov64;
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return nir_lower_icmp64;
   case nir_op_iadd:
   case nir_op_isub:
      return nir_lower_iadd64;
   case nir_op_imin:
   case nir_op_imax:
   case nir_op_umin:
   case nir_op_umax:
      return nir_lower_minmax64;
   case nir_op_iabs:
      return nir_lower_iabs64;
   case nir_op_ineg:
      return nir_lower_ineg64;
   case nir_op_iand:
   case nir_op_ior:
   case nir_op_ixor:
   case nir_op_inot:
      return nir_lower_logic64;
   case nir_op_ishl:
   case nir_op_ishr:
   case nir_op_ushr:
      return nir_lower_shift64;
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return nir_lower_extract64;
   default:
      return 0;
   }
}

static nir_ssa_def *
lower_int64_alu_instr(nir_builder *b, nir_alu_instr *alu)
{
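   /* Fetch the SSA values for every source, then emit the 32-bit expansion
    * for this opcode.  The caller rewrites all uses of the original
    * destination with the value we return and removes the old instruction.
    */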
   nir_ssa_def *src[4];
   for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++)
      src[i] = nir_ssa_for_alu_src(b, alu, i);

   switch (alu->op) {
   case nir_op_imul:
      return lower_imul64(b, src[0], src[1]);
   case nir_op_imul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], true);
   case nir_op_umul_2x32_64:
      return lower_mul_2x32_64(b, src[0], src[1], false);
   case nir_op_imul_high:
      return lower_mul_high64(b, src[0], src[1], true);
   case nir_op_umul_high:
      return lower_mul_high64(b, src[0], src[1], false);
   case nir_op_isign:
      return lower_isign64(b, src[0]);
   case nir_op_udiv:
      return lower_udiv64(b, src[0], src[1]);
   case nir_op_idiv:
      return lower_idiv64(b, src[0], src[1]);
   case nir_op_umod:
      return lower_umod64(b, src[0], src[1]);
   case nir_op_imod:
      return lower_imod64(b, src[0], src[1]);
   case nir_op_irem:
      return lower_irem64(b, src[0], src[1]);
   case nir_op_b2i64:
      return lower_b2i64(b, src[0]);
   case nir_op_i2b1:
      return lower_i2b(b, src[0]);
   case nir_op_i2i8:
      return lower_i2i8(b, src[0]);
   case nir_op_i2i16:
      return lower_i2i16(b, src[0]);
   case nir_op_i2i32:
      return lower_i2i32(b, src[0]);
   case nir_op_i2i64:
      return lower_i2i64(b, src[0]);
   case nir_op_u2u8:
      return lower_u2u8(b, src[0]);
   case nir_op_u2u16:
      return lower_u2u16(b, src[0]);
   case nir_op_u2u32:
      return lower_u2u32(b, src[0]);
   case nir_op_u2u64:
      return lower_u2u64(b, src[0]);
   case nir_op_bcsel:
      return lower_bcsel64(b, src[0], src[1], src[2]);
   case nir_op_ieq:
   case nir_op_ine:
   case nir_op_ult:
   case nir_op_ilt:
   case nir_op_uge:
   case nir_op_ige:
      return lower_int64_compare(b, alu->op, src[0], src[1]);
   case nir_op_iadd:
      return lower_iadd64(b, src[0], src[1]);
   case nir_op_isub:
      return lower_isub64(b, src[0], src[1]);
   case nir_op_imin:
      return lower_imin64(b, src[0], src[1]);
   case nir_op_imax:
      return lower_imax64(b, src[0], src[1]);
   case nir_op_umin:
      return lower_umin64(b, src[0], src[1]);
   case nir_op_umax:
      return lower_umax64(b, src[0], src[1]);
   case nir_op_iabs:
      return lower_iabs64(b, src[0]);
   case nir_op_ineg:
      return lower_ineg64(b, src[0]);
   case nir_op_iand:
      return lower_iand64(b, src[0], src[1]);
   case nir_op_ior:
      return lower_ior64(b, src[0], src[1]);
   case nir_op_ixor:
      return lower_ixor64(b, src[0], src[1]);
   case nir_op_inot:
      return lower_inot64(b, src[0]);
   case nir_op_ishl:
      return lower_ishl64(b, src[0], src[1]);
   case nir_op_ishr:
      return lower_ishr64(b, src[0], src[1]);
   case nir_op_ushr:
      return lower_ushr64(b, src[0], src[1]);
   case nir_op_extract_u8:
   case nir_op_extract_i8:
   case nir_op_extract_u16:
   case nir_op_extract_i16:
      return lower_extract(b, alu->op, src[0], src[1]);
   default:
      unreachable("Invalid ALU opcode to lower");
   }
}

static bool
lower_int64_impl(nir_function_impl *impl, nir_lower_int64_options options)
{
   nir_builder b;
   nir_builder_init(&b, impl);

   bool progress = false;
   nir_foreach_block(block, impl) {
      nir_foreach_instr_safe(instr, block) {
         if (instr->type != nir_instr_type_alu)
            continue;

         nir_alu_instr *alu = nir_instr_as_alu(instr);
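         /* Conversions and comparisons are filtered on the bit size of their
          * sources (their destinations are narrower than 64 bits), bcsel on
          * the bit size of the values it selects between, and every other
          * opcode on its destination bit size.
          */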
         switch (alu->op) {
         case nir_op_i2b1:
         case nir_op_i2i32:
         case nir_op_u2u32:
            assert(alu->src[0].src.is_ssa);
            if (alu->src[0].src.ssa->bit_size != 64)
               continue;
            break;
         case nir_op_bcsel:
            assert(alu->src[1].src.is_ssa);
            assert(alu->src[2].src.is_ssa);
            assert(alu->src[1].src.ssa->bit_size ==
                   alu->src[2].src.ssa->bit_size);
            if (alu->src[1].src.ssa->bit_size != 64)
               continue;
            break;
         case nir_op_ieq:
         case nir_op_ine:
         case nir_op_ult:
         case nir_op_ilt:
         case nir_op_uge:
         case nir_op_ige:
            assert(alu->src[0].src.is_ssa);
            assert(alu->src[1].src.is_ssa);
            assert(alu->src[0].src.ssa->bit_size ==
                   alu->src[1].src.ssa->bit_size);
            if (alu->src[0].src.ssa->bit_size != 64)
               continue;
            break;
         default:
            assert(alu->dest.dest.is_ssa);
            if (alu->dest.dest.ssa.bit_size != 64)
               continue;
            break;
         }

         if (!(options & nir_lower_int64_op_to_options_mask(alu->op)))
            continue;

         b.cursor = nir_before_instr(instr);

         nir_ssa_def *lowered = lower_int64_alu_instr(&b, alu);
         nir_ssa_def_rewrite_uses(&alu->dest.dest.ssa,
                                  nir_src_for_ssa(lowered));
         nir_instr_remove(&alu->instr);
         progress = true;
      }
   }

   if (progress) {
      nir_metadata_preserve(impl, nir_metadata_none);
   } else {
#ifndef NDEBUG
      impl->valid_metadata &= ~nir_metadata_not_properly_reset;
#endif
   }

   return progress;
}

bool
nir_lower_int64(nir_shader *shader, nir_lower_int64_options options)
{
   bool progress = false;

   nir_foreach_function(function, shader) {
      if (function->impl)
         progress |= lower_int64_impl(function->impl, options);
   }

   return progress;
}