1/* 2 * Copyright © 2015 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 21 * IN THE SOFTWARE. 22 * 23 */ 24 25#include "nir.h" 26#include "nir_builder.h" 27#include "c99_math.h" 28 29/* 30 * Lowers some unsupported double operations, using only: 31 * 32 * - pack/unpackDouble2x32 33 * - conversion to/from single-precision 34 * - double add, mul, and fma 35 * - conditional select 36 * - 32-bit integer and floating point arithmetic 37 */ 38 39/* Creates a double with the exponent bits set to a given integer value */ 40static nir_ssa_def * 41set_exponent(nir_builder *b, nir_ssa_def *src, nir_ssa_def *exp) 42{ 43 /* Split into bits 0-31 and 32-63 */ 44 nir_ssa_def *lo = nir_unpack_64_2x32_split_x(b, src); 45 nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src); 46 47 /* The exponent is bits 52-62, or 20-30 of the high word, so set the exponent 48 * to 1023 49 */ 50 nir_ssa_def *new_hi = nir_bfi(b, nir_imm_int(b, 0x7ff00000), exp, hi); 51 /* recombine */ 52 return nir_pack_64_2x32_split(b, lo, new_hi); 53} 54 55static nir_ssa_def * 56get_exponent(nir_builder *b, nir_ssa_def *src) 57{ 58 /* get bits 32-63 */ 59 nir_ssa_def *hi = nir_unpack_64_2x32_split_y(b, src); 60 61 /* extract bits 20-30 of the high word */ 62 return nir_ubitfield_extract(b, hi, nir_imm_int(b, 20), nir_imm_int(b, 11)); 63} 64 65/* Return infinity with the sign of the given source which is +/-0 */ 66 67static nir_ssa_def * 68get_signed_inf(nir_builder *b, nir_ssa_def *zero) 69{ 70 nir_ssa_def *zero_hi = nir_unpack_64_2x32_split_y(b, zero); 71 72 /* The bit pattern for infinity is 0x7ff0000000000000, where the sign bit 73 * is the highest bit. Only the sign bit can be non-zero in the passed in 74 * source. So we essentially need to OR the infinity and the zero, except 75 * the low 32 bits are always 0 so we can construct the correct high 32 76 * bits and then pack it together with zero low 32 bits. 77 */ 78 nir_ssa_def *inf_hi = nir_ior(b, nir_imm_int(b, 0x7ff00000), zero_hi); 79 return nir_pack_64_2x32_split(b, nir_imm_int(b, 0), inf_hi); 80} 81 82/* 83 * Generates the correctly-signed infinity if the source was zero, and flushes 84 * the result to 0 if the source was infinity or the calculated exponent was 85 * too small to be representable. 86 */ 87 88static nir_ssa_def * 89fix_inv_result(nir_builder *b, nir_ssa_def *res, nir_ssa_def *src, 90 nir_ssa_def *exp) 91{ 92 /* If the exponent is too small or the original input was infinity/NaN, 93 * force the result to 0 (flush denorms) to avoid the work of handling 94 * denorms properly. Note that this doesn't preserve positive/negative 95 * zeros, but GLSL doesn't require it. 96 */ 97 res = nir_bcsel(b, nir_ior(b, nir_ige(b, nir_imm_int(b, 0), exp), 98 nir_feq(b, nir_fabs(b, src), 99 nir_imm_double(b, INFINITY))), 100 nir_imm_double(b, 0.0f), res); 101 102 /* If the original input was 0, generate the correctly-signed infinity */ 103 res = nir_bcsel(b, nir_fne(b, src, nir_imm_double(b, 0.0f)), 104 res, get_signed_inf(b, src)); 105 106 return res; 107 108} 109 110static nir_ssa_def * 111lower_rcp(nir_builder *b, nir_ssa_def *src) 112{ 113 /* normalize the input to avoid range issues */ 114 nir_ssa_def *src_norm = set_exponent(b, src, nir_imm_int(b, 1023)); 115 116 /* cast to float, do an rcp, and then cast back to get an approximate 117 * result 118 */ 119 nir_ssa_def *ra = nir_f2f64(b, nir_frcp(b, nir_f2f32(b, src_norm))); 120 121 /* Fixup the exponent of the result - note that we check if this is too 122 * small below. 123 */ 124 nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), 125 nir_isub(b, get_exponent(b, src), 126 nir_imm_int(b, 1023))); 127 128 ra = set_exponent(b, ra, new_exp); 129 130 /* Do a few Newton-Raphson steps to improve precision. 131 * 132 * Each step doubles the precision, and we started off with around 24 bits, 133 * so we only need to do 2 steps to get to full precision. The step is: 134 * 135 * x_new = x * (2 - x*src) 136 * 137 * But we can re-arrange this to improve precision by using another fused 138 * multiply-add: 139 * 140 * x_new = x + x * (1 - x*src) 141 * 142 * See https://en.wikipedia.org/wiki/Division_algorithm for more details. 143 */ 144 145 ra = nir_ffma(b, ra, nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra); 146 ra = nir_ffma(b, ra, nir_ffma(b, ra, src, nir_imm_double(b, -1)), ra); 147 148 return fix_inv_result(b, ra, src, new_exp); 149} 150 151static nir_ssa_def * 152lower_sqrt_rsq(nir_builder *b, nir_ssa_def *src, bool sqrt) 153{ 154 /* We want to compute: 155 * 156 * 1/sqrt(m * 2^e) 157 * 158 * When the exponent is even, this is equivalent to: 159 * 160 * 1/sqrt(m) * 2^(-e/2) 161 * 162 * and then the exponent is odd, this is equal to: 163 * 164 * 1/sqrt(m * 2) * 2^(-(e - 1)/2) 165 * 166 * where the m * 2 is absorbed into the exponent. So we want the exponent 167 * inside the square root to be 1 if e is odd and 0 if e is even, and we 168 * want to subtract off e/2 from the final exponent, rounded to negative 169 * infinity. We can do the former by first computing the unbiased exponent, 170 * and then AND'ing it with 1 to get 0 or 1, and we can do the latter by 171 * shifting right by 1. 172 */ 173 174 nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src), 175 nir_imm_int(b, 1023)); 176 nir_ssa_def *even = nir_iand(b, unbiased_exp, nir_imm_int(b, 1)); 177 nir_ssa_def *half = nir_ishr(b, unbiased_exp, nir_imm_int(b, 1)); 178 179 nir_ssa_def *src_norm = set_exponent(b, src, 180 nir_iadd(b, nir_imm_int(b, 1023), 181 even)); 182 183 nir_ssa_def *ra = nir_f2f64(b, nir_frsq(b, nir_f2f32(b, src_norm))); 184 nir_ssa_def *new_exp = nir_isub(b, get_exponent(b, ra), half); 185 ra = set_exponent(b, ra, new_exp); 186 187 /* 188 * The following implements an iterative algorithm that's very similar 189 * between sqrt and rsqrt. We start with an iteration of Goldschmit's 190 * algorithm, which looks like: 191 * 192 * a = the source 193 * y_0 = initial (single-precision) rsqrt estimate 194 * 195 * h_0 = .5 * y_0 196 * g_0 = a * y_0 197 * r_0 = .5 - h_0 * g_0 198 * g_1 = g_0 * r_0 + g_0 199 * h_1 = h_0 * r_0 + h_0 200 * 201 * Now g_1 ~= sqrt(a), and h_1 ~= 1/(2 * sqrt(a)). We could continue 202 * applying another round of Goldschmit, but since we would never refer 203 * back to a (the original source), we would add too much rounding error. 204 * So instead, we do one last round of Newton-Raphson, which has better 205 * rounding characteristics, to get the final rounding correct. This is 206 * split into two cases: 207 * 208 * 1. sqrt 209 * 210 * Normally, doing a round of Newton-Raphson for sqrt involves taking a 211 * reciprocal of the original estimate, which is slow since it isn't 212 * supported in HW. But we can take advantage of the fact that we already 213 * computed a good estimate of 1/(2 * g_1) by rearranging it like so: 214 * 215 * g_2 = .5 * (g_1 + a / g_1) 216 * = g_1 + .5 * (a / g_1 - g_1) 217 * = g_1 + (.5 / g_1) * (a - g_1^2) 218 * = g_1 + h_1 * (a - g_1^2) 219 * 220 * The second term represents the error, and by splitting it out we can get 221 * better precision by computing it as part of a fused multiply-add. Since 222 * both Newton-Raphson and Goldschmit approximately double the precision of 223 * the result, these two steps should be enough. 224 * 225 * 2. rsqrt 226 * 227 * First off, note that the first round of the Goldschmit algorithm is 228 * really just a Newton-Raphson step in disguise: 229 * 230 * h_1 = h_0 * (.5 - h_0 * g_0) + h_0 231 * = h_0 * (1.5 - h_0 * g_0) 232 * = h_0 * (1.5 - .5 * a * y_0^2) 233 * = (.5 * y_0) * (1.5 - .5 * a * y_0^2) 234 * 235 * which is the standard formula multiplied by .5. Unlike in the sqrt case, 236 * we don't need the inverse to do a Newton-Raphson step; we just need h_1, 237 * so we can skip the calculation of g_1. Instead, we simply do another 238 * Newton-Raphson step: 239 * 240 * y_1 = 2 * h_1 241 * r_1 = .5 - h_1 * y_1 * a 242 * y_2 = y_1 * r_1 + y_1 243 * 244 * Where the difference from Goldschmit is that we calculate y_1 * a 245 * instead of using g_1. Doing it this way should be as fast as computing 246 * y_1 up front instead of h_1, and it lets us share the code for the 247 * initial Goldschmit step with the sqrt case. 248 * 249 * Putting it together, the computations are: 250 * 251 * h_0 = .5 * y_0 252 * g_0 = a * y_0 253 * r_0 = .5 - h_0 * g_0 254 * h_1 = h_0 * r_0 + h_0 255 * if sqrt: 256 * g_1 = g_0 * r_0 + g_0 257 * r_1 = a - g_1 * g_1 258 * g_2 = h_1 * r_1 + g_1 259 * else: 260 * y_1 = 2 * h_1 261 * r_1 = .5 - y_1 * (h_1 * a) 262 * y_2 = y_1 * r_1 + y_1 263 * 264 * For more on the ideas behind this, see "Software Division and Square 265 * Root Using Goldschmit's Algorithms" by Markstein and the Wikipedia page 266 * on square roots 267 * (https://en.wikipedia.org/wiki/Methods_of_computing_square_roots). 268 */ 269 270 nir_ssa_def *one_half = nir_imm_double(b, 0.5); 271 nir_ssa_def *h_0 = nir_fmul(b, one_half, ra); 272 nir_ssa_def *g_0 = nir_fmul(b, src, ra); 273 nir_ssa_def *r_0 = nir_ffma(b, nir_fneg(b, h_0), g_0, one_half); 274 nir_ssa_def *h_1 = nir_ffma(b, h_0, r_0, h_0); 275 nir_ssa_def *res; 276 if (sqrt) { 277 nir_ssa_def *g_1 = nir_ffma(b, g_0, r_0, g_0); 278 nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, g_1), g_1, src); 279 res = nir_ffma(b, h_1, r_1, g_1); 280 } else { 281 nir_ssa_def *y_1 = nir_fmul(b, nir_imm_double(b, 2.0), h_1); 282 nir_ssa_def *r_1 = nir_ffma(b, nir_fneg(b, y_1), nir_fmul(b, h_1, src), 283 one_half); 284 res = nir_ffma(b, y_1, r_1, y_1); 285 } 286 287 if (sqrt) { 288 /* Here, the special cases we need to handle are 289 * 0 -> 0 and 290 * +inf -> +inf 291 */ 292 res = nir_bcsel(b, nir_ior(b, nir_feq(b, src, nir_imm_double(b, 0.0)), 293 nir_feq(b, src, nir_imm_double(b, INFINITY))), 294 src, res); 295 } else { 296 res = fix_inv_result(b, res, src, new_exp); 297 } 298 299 return res; 300} 301 302static nir_ssa_def * 303lower_trunc(nir_builder *b, nir_ssa_def *src) 304{ 305 nir_ssa_def *unbiased_exp = nir_isub(b, get_exponent(b, src), 306 nir_imm_int(b, 1023)); 307 308 nir_ssa_def *frac_bits = nir_isub(b, nir_imm_int(b, 52), unbiased_exp); 309 310 /* 311 * Decide the operation to apply depending on the unbiased exponent: 312 * 313 * if (unbiased_exp < 0) 314 * return 0 315 * else if (unbiased_exp > 52) 316 * return src 317 * else 318 * return src & (~0 << frac_bits) 319 * 320 * Notice that the else branch is a 64-bit integer operation that we need 321 * to implement in terms of 32-bit integer arithmetics (at least until we 322 * support 64-bit integer arithmetics). 323 */ 324 325 /* Compute "~0 << frac_bits" in terms of hi/lo 32-bit integer math */ 326 nir_ssa_def *mask_lo = 327 nir_bcsel(b, 328 nir_ige(b, frac_bits, nir_imm_int(b, 32)), 329 nir_imm_int(b, 0), 330 nir_ishl(b, nir_imm_int(b, ~0), frac_bits)); 331 332 nir_ssa_def *mask_hi = 333 nir_bcsel(b, 334 nir_ilt(b, frac_bits, nir_imm_int(b, 33)), 335 nir_imm_int(b, ~0), 336 nir_ishl(b, 337 nir_imm_int(b, ~0), 338 nir_isub(b, frac_bits, nir_imm_int(b, 32)))); 339 340 nir_ssa_def *src_lo = nir_unpack_64_2x32_split_x(b, src); 341 nir_ssa_def *src_hi = nir_unpack_64_2x32_split_y(b, src); 342 343 return 344 nir_bcsel(b, 345 nir_ilt(b, unbiased_exp, nir_imm_int(b, 0)), 346 nir_imm_double(b, 0.0), 347 nir_bcsel(b, nir_ige(b, unbiased_exp, nir_imm_int(b, 53)), 348 src, 349 nir_pack_64_2x32_split(b, 350 nir_iand(b, mask_lo, src_lo), 351 nir_iand(b, mask_hi, src_hi)))); 352} 353 354static nir_ssa_def * 355lower_floor(nir_builder *b, nir_ssa_def *src) 356{ 357 /* 358 * For x >= 0, floor(x) = trunc(x) 359 * For x < 0, 360 * - if x is integer, floor(x) = x 361 * - otherwise, floor(x) = trunc(x) - 1 362 */ 363 nir_ssa_def *tr = nir_ftrunc(b, src); 364 nir_ssa_def *positive = nir_fge(b, src, nir_imm_double(b, 0.0)); 365 return nir_bcsel(b, 366 nir_ior(b, positive, nir_feq(b, src, tr)), 367 tr, 368 nir_fsub(b, tr, nir_imm_double(b, 1.0))); 369} 370 371static nir_ssa_def * 372lower_ceil(nir_builder *b, nir_ssa_def *src) 373{ 374 /* if x < 0, ceil(x) = trunc(x) 375 * else if (x - trunc(x) == 0), ceil(x) = x 376 * else, ceil(x) = trunc(x) + 1 377 */ 378 nir_ssa_def *tr = nir_ftrunc(b, src); 379 nir_ssa_def *negative = nir_flt(b, src, nir_imm_double(b, 0.0)); 380 return nir_bcsel(b, 381 nir_ior(b, negative, nir_feq(b, src, tr)), 382 tr, 383 nir_fadd(b, tr, nir_imm_double(b, 1.0))); 384} 385 386static nir_ssa_def * 387lower_fract(nir_builder *b, nir_ssa_def *src) 388{ 389 return nir_fsub(b, src, nir_ffloor(b, src)); 390} 391 392static nir_ssa_def * 393lower_round_even(nir_builder *b, nir_ssa_def *src) 394{ 395 /* Add and subtract 2**52 to round off any fractional bits. */ 396 nir_ssa_def *two52 = nir_imm_double(b, (double)(1ull << 52)); 397 nir_ssa_def *sign = nir_iand(b, nir_unpack_64_2x32_split_y(b, src), 398 nir_imm_int(b, 1ull << 31)); 399 400 b->exact = true; 401 nir_ssa_def *res = nir_fsub(b, nir_fadd(b, nir_fabs(b, src), two52), two52); 402 b->exact = false; 403 404 return nir_bcsel(b, nir_flt(b, nir_fabs(b, src), two52), 405 nir_pack_64_2x32_split(b, nir_unpack_64_2x32_split_x(b, res), 406 nir_ior(b, nir_unpack_64_2x32_split_y(b, res), sign)), src); 407} 408 409static nir_ssa_def * 410lower_mod(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1) 411{ 412 /* mod(x,y) = x - y * floor(x/y) 413 * 414 * If the division is lowered, it could add some rounding errors that make 415 * floor() to return the quotient minus one when x = N * y. If this is the 416 * case, we return zero because mod(x, y) output value is [0, y). 417 */ 418 nir_ssa_def *floor = nir_ffloor(b, nir_fdiv(b, src0, src1)); 419 nir_ssa_def *mod = nir_fsub(b, src0, nir_fmul(b, src1, floor)); 420 421 return nir_bcsel(b, 422 nir_fne(b, mod, src1), 423 mod, 424 nir_imm_double(b, 0.0)); 425} 426 427static bool 428lower_doubles_instr_to_soft(nir_builder *b, nir_alu_instr *instr, 429 const nir_shader *softfp64, 430 nir_lower_doubles_options options) 431{ 432 if (!(options & nir_lower_fp64_full_software)) 433 return false; 434 435 assert(instr->dest.dest.is_ssa); 436 437 const char *name; 438 const struct glsl_type *return_type = glsl_uint64_t_type(); 439 440 switch (instr->op) { 441 case nir_op_f2i64: 442 if (instr->src[0].src.ssa->bit_size == 64) 443 name = "__fp64_to_int64"; 444 else 445 name = "__fp32_to_int64"; 446 return_type = glsl_int64_t_type(); 447 break; 448 case nir_op_f2u64: 449 if (instr->src[0].src.ssa->bit_size == 64) 450 name = "__fp64_to_uint64"; 451 else 452 name = "__fp32_to_uint64"; 453 break; 454 case nir_op_f2f64: 455 name = "__fp32_to_fp64"; 456 break; 457 case nir_op_f2f32: 458 name = "__fp64_to_fp32"; 459 return_type = glsl_float_type(); 460 break; 461 case nir_op_f2i32: 462 name = "__fp64_to_int"; 463 return_type = glsl_int_type(); 464 break; 465 case nir_op_f2u32: 466 name = "__fp64_to_uint"; 467 return_type = glsl_uint_type(); 468 break; 469 case nir_op_f2b1: 470 case nir_op_f2b32: 471 name = "__fp64_to_bool"; 472 return_type = glsl_bool_type(); 473 break; 474 case nir_op_b2f64: 475 name = "__bool_to_fp64"; 476 break; 477 case nir_op_i2f32: 478 if (instr->src[0].src.ssa->bit_size != 64) 479 return false; 480 name = "__int64_to_fp32"; 481 return_type = glsl_float_type(); 482 break; 483 case nir_op_u2f32: 484 if (instr->src[0].src.ssa->bit_size != 64) 485 return false; 486 name = "__uint64_to_fp32"; 487 return_type = glsl_float_type(); 488 break; 489 case nir_op_i2f64: 490 if (instr->src[0].src.ssa->bit_size == 64) 491 name = "__int64_to_fp64"; 492 else 493 name = "__int_to_fp64"; 494 break; 495 case nir_op_u2f64: 496 if (instr->src[0].src.ssa->bit_size == 64) 497 name = "__uint64_to_fp64"; 498 else 499 name = "__uint_to_fp64"; 500 break; 501 case nir_op_fabs: 502 name = "__fabs64"; 503 break; 504 case nir_op_fneg: 505 name = "__fneg64"; 506 break; 507 case nir_op_fround_even: 508 name = "__fround64"; 509 break; 510 case nir_op_ftrunc: 511 name = "__ftrunc64"; 512 break; 513 case nir_op_ffloor: 514 name = "__ffloor64"; 515 break; 516 case nir_op_ffract: 517 name = "__ffract64"; 518 break; 519 case nir_op_fsign: 520 name = "__fsign64"; 521 break; 522 case nir_op_feq: 523 name = "__feq64"; 524 return_type = glsl_bool_type(); 525 break; 526 case nir_op_fne: 527 name = "__fne64"; 528 return_type = glsl_bool_type(); 529 break; 530 case nir_op_flt: 531 name = "__flt64"; 532 return_type = glsl_bool_type(); 533 break; 534 case nir_op_fge: 535 name = "__fge64"; 536 return_type = glsl_bool_type(); 537 break; 538 case nir_op_fmin: 539 name = "__fmin64"; 540 break; 541 case nir_op_fmax: 542 name = "__fmax64"; 543 break; 544 case nir_op_fadd: 545 name = "__fadd64"; 546 break; 547 case nir_op_fmul: 548 name = "__fmul64"; 549 break; 550 case nir_op_ffma: 551 name = "__ffma64"; 552 break; 553 default: 554 return false; 555 } 556 557 nir_function *func = NULL; 558 nir_foreach_function(function, softfp64) { 559 if (strcmp(function->name, name) == 0) { 560 func = function; 561 break; 562 } 563 } 564 if (!func || !func->impl) { 565 fprintf(stderr, "Cannot find function \"%s\"\n", name); 566 assert(func); 567 } 568 569 b->cursor = nir_before_instr(&instr->instr); 570 571 nir_ssa_def *params[4] = { NULL, }; 572 573 nir_variable *ret_tmp = 574 nir_local_variable_create(b->impl, return_type, "return_tmp"); 575 nir_deref_instr *ret_deref = nir_build_deref_var(b, ret_tmp); 576 params[0] = &ret_deref->dest.ssa; 577 578 assert(nir_op_infos[instr->op].num_inputs + 1 == func->num_params); 579 for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { 580 assert(i + 1 < ARRAY_SIZE(params)); 581 params[i + 1] = nir_imov_alu(b, instr->src[i], 1); 582 } 583 584 nir_inline_function_impl(b, func->impl, params); 585 586 nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, 587 nir_src_for_ssa(nir_load_deref(b, ret_deref))); 588 nir_instr_remove(&instr->instr); 589 return true; 590} 591 592nir_lower_doubles_options 593nir_lower_doubles_op_to_options_mask(nir_op opcode) 594{ 595 switch (opcode) { 596 case nir_op_frcp: return nir_lower_drcp; 597 case nir_op_fsqrt: return nir_lower_dsqrt; 598 case nir_op_frsq: return nir_lower_drsq; 599 case nir_op_ftrunc: return nir_lower_dtrunc; 600 case nir_op_ffloor: return nir_lower_dfloor; 601 case nir_op_fceil: return nir_lower_dceil; 602 case nir_op_ffract: return nir_lower_dfract; 603 case nir_op_fround_even: return nir_lower_dround_even; 604 case nir_op_fmod: return nir_lower_dmod; 605 default: return 0; 606 } 607} 608 609static bool 610lower_doubles_instr(nir_builder *b, nir_alu_instr *instr, 611 const nir_shader *softfp64, 612 nir_lower_doubles_options options) 613{ 614 assert(instr->dest.dest.is_ssa); 615 bool is_64 = instr->dest.dest.ssa.bit_size == 64; 616 617 unsigned num_srcs = nir_op_infos[instr->op].num_inputs; 618 for (unsigned i = 0; i < num_srcs; i++) { 619 is_64 |= (nir_src_bit_size(instr->src[i].src) == 64); 620 } 621 622 if (!is_64) 623 return false; 624 625 if (lower_doubles_instr_to_soft(b, instr, softfp64, options)) 626 return true; 627 628 if (!(options & nir_lower_doubles_op_to_options_mask(instr->op))) 629 return false; 630 631 b->cursor = nir_before_instr(&instr->instr); 632 633 nir_ssa_def *src = nir_fmov_alu(b, instr->src[0], 634 instr->dest.dest.ssa.num_components); 635 636 nir_ssa_def *result; 637 638 switch (instr->op) { 639 case nir_op_frcp: 640 result = lower_rcp(b, src); 641 break; 642 case nir_op_fsqrt: 643 result = lower_sqrt_rsq(b, src, true); 644 break; 645 case nir_op_frsq: 646 result = lower_sqrt_rsq(b, src, false); 647 break; 648 case nir_op_ftrunc: 649 result = lower_trunc(b, src); 650 break; 651 case nir_op_ffloor: 652 result = lower_floor(b, src); 653 break; 654 case nir_op_fceil: 655 result = lower_ceil(b, src); 656 break; 657 case nir_op_ffract: 658 result = lower_fract(b, src); 659 break; 660 case nir_op_fround_even: 661 result = lower_round_even(b, src); 662 break; 663 664 case nir_op_fmod: { 665 nir_ssa_def *src1 = nir_fmov_alu(b, instr->src[1], 666 instr->dest.dest.ssa.num_components); 667 result = lower_mod(b, src, src1); 668 } 669 break; 670 default: 671 unreachable("unhandled opcode"); 672 } 673 674 nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(result)); 675 nir_instr_remove(&instr->instr); 676 return true; 677} 678 679static bool 680nir_lower_doubles_impl(nir_function_impl *impl, 681 const nir_shader *softfp64, 682 nir_lower_doubles_options options) 683{ 684 bool progress = false; 685 686 nir_builder b; 687 nir_builder_init(&b, impl); 688 689 nir_foreach_block_safe(block, impl) { 690 nir_foreach_instr_safe(instr, block) { 691 if (instr->type == nir_instr_type_alu) 692 progress |= lower_doubles_instr(&b, nir_instr_as_alu(instr), 693 softfp64, options); 694 } 695 } 696 697 if (progress) { 698 if (options & nir_lower_fp64_full_software) { 699 /* SSA and register indices are completely messed up now */ 700 nir_index_ssa_defs(impl); 701 nir_index_local_regs(impl); 702 703 nir_metadata_preserve(impl, nir_metadata_none); 704 705 /* And we have deref casts we need to clean up thanks to function 706 * inlining. 707 */ 708 nir_opt_deref_impl(impl); 709 } else { 710 nir_metadata_preserve(impl, nir_metadata_block_index | 711 nir_metadata_dominance); 712 } 713 } else { 714#ifndef NDEBUG 715 impl->valid_metadata &= ~nir_metadata_not_properly_reset; 716#endif 717 } 718 719 return progress; 720} 721 722bool 723nir_lower_doubles(nir_shader *shader, 724 const nir_shader *softfp64, 725 nir_lower_doubles_options options) 726{ 727 bool progress = false; 728 729 nir_foreach_function(function, shader) { 730 if (function->impl) { 731 progress |= nir_lower_doubles_impl(function->impl, softfp64, options); 732 } 733 } 734 735 return progress; 736} 737