lower_instructions.cpp revision 7ec681f3
1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24/** 25 * \file lower_instructions.cpp 26 * 27 * Many GPUs lack native instructions for certain expression operations, and 28 * must replace them with some other expression tree. This pass lowers some 29 * of the most common cases, allowing the lowering code to be implemented once 30 * rather than in each driver backend. 
 *
 * Currently supported transformations:
 * - SUB_TO_ADD_NEG
 * - DIV_TO_MUL_RCP
 * - INT_DIV_TO_MUL_RCP
 * - EXP_TO_EXP2
 * - POW_TO_EXP2
 * - LOG_TO_LOG2
 * - MOD_TO_FLOOR
 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
 * - CARRY_TO_ARITH
 * - BORROW_TO_ARITH
 * - SAT_TO_CLAMP
 * - DOPS_TO_DFRAC
 *
 * SUB_TO_ADD_NEG:
 * ---------------
 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
 *
 * This simplifies expression reassociation, and for many backends
 * there is no subtract operation separate from adding the negation.
 * For backends with native subtract operations, they will probably
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
 * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
 * ---------------------------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
 * but they do have an RCP instruction to compute an approximate
 * reciprocal.  By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
 * FDIV_TO_MUL_RCP lowers single-precision and half-precision
 * floating point division;
 * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
 * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
 * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
 * point so that RCP is possible.
 *
 * EXP_TO_EXP2 and LOG_TO_LOG2:
 * ----------------------------
 * Many GPUs don't have a base e log or exponent instruction, but they
 * do have base 2 versions, so this pass converts exp and log to exp2
 * and log2 operations.
 *
 * POW_TO_EXP2:
 * ------------
 * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
 * x**y to 2**(y * log2(x)).
83 * 84 * MOD_TO_FLOOR: 85 * ------------- 86 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1)) 87 * 88 * Many GPUs don't have a MOD instruction (945 and 965 included), and 89 * if we have to break it down like this anyway, it gives an 90 * opportunity to do things like constant fold the (1.0 / op1) easily. 91 * 92 * Note: before we used to implement this as op1 * fract(op / op1) but this 93 * implementation had significant precision errors. 94 * 95 * LDEXP_TO_ARITH: 96 * ------------- 97 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources. 98 * 99 * DFREXP_DLDEXP_TO_ARITH: 100 * --------------- 101 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to 102 * arithmetic and bit ops for double arguments. 103 * 104 * CARRY_TO_ARITH: 105 * --------------- 106 * Converts ir_carry into (x + y) < x. 107 * 108 * BORROW_TO_ARITH: 109 * ---------------- 110 * Converts ir_borrow into (x < y). 111 * 112 * SAT_TO_CLAMP: 113 * ------------- 114 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0) 115 * 116 * DOPS_TO_DFRAC: 117 * -------------- 118 * Converts double trunc, ceil, floor, round to fract 119 */ 120 121#include "c99_math.h" 122#include "program/prog_instruction.h" /* for swizzle */ 123#include "compiler/glsl_types.h" 124#include "ir.h" 125#include "ir_builder.h" 126#include "ir_optimization.h" 127#include "util/half_float.h" 128 129using namespace ir_builder; 130 131namespace { 132 133class lower_instructions_visitor : public ir_hierarchical_visitor { 134public: 135 lower_instructions_visitor(unsigned lower) 136 : progress(false), lower(lower) { } 137 138 ir_visitor_status visit_leave(ir_expression *); 139 140 bool progress; 141 142private: 143 unsigned lower; /** Bitfield of which operations to lower */ 144 145 void sub_to_add_neg(ir_expression *); 146 void div_to_mul_rcp(ir_expression *); 147 void int_div_to_mul_rcp(ir_expression *); 148 void mod_to_floor(ir_expression *); 149 void 
exp_to_exp2(ir_expression *); 150 void pow_to_exp2(ir_expression *); 151 void log_to_log2(ir_expression *); 152 void ldexp_to_arith(ir_expression *); 153 void dldexp_to_arith(ir_expression *); 154 void dfrexp_sig_to_arith(ir_expression *); 155 void dfrexp_exp_to_arith(ir_expression *); 156 void carry_to_arith(ir_expression *); 157 void borrow_to_arith(ir_expression *); 158 void sat_to_clamp(ir_expression *); 159 void double_dot_to_fma(ir_expression *); 160 void double_lrp(ir_expression *); 161 void dceil_to_dfrac(ir_expression *); 162 void dfloor_to_dfrac(ir_expression *); 163 void dround_even_to_dfrac(ir_expression *); 164 void dtrunc_to_dfrac(ir_expression *); 165 void dsign_to_csel(ir_expression *); 166 void bit_count_to_math(ir_expression *); 167 void extract_to_shifts(ir_expression *); 168 void insert_to_shifts(ir_expression *); 169 void reverse_to_shifts(ir_expression *ir); 170 void find_lsb_to_float_cast(ir_expression *ir); 171 void find_msb_to_float_cast(ir_expression *ir); 172 void imul_high_to_mul(ir_expression *ir); 173 void sqrt_to_abs_sqrt(ir_expression *ir); 174 void mul64_to_mul_and_mul_high(ir_expression *ir); 175 176 ir_expression *_carry(operand a, operand b); 177 178 static ir_constant *_imm_fp(void *mem_ctx, 179 const glsl_type *type, 180 double f, 181 unsigned vector_elements=1); 182}; 183 184} /* anonymous namespace */ 185 186/** 187 * Determine if a particular type of lowering should occur 188 */ 189#define lowering(x) (this->lower & x) 190 191bool 192lower_instructions(exec_list *instructions, unsigned what_to_lower) 193{ 194 lower_instructions_visitor v(what_to_lower); 195 196 visit_list_elements(&v, instructions); 197 return v.progress; 198} 199 200void 201lower_instructions_visitor::sub_to_add_neg(ir_expression *ir) 202{ 203 ir->operation = ir_binop_add; 204 ir->init_num_operands(); 205 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type, 206 ir->operands[1], NULL); 207 this->progress = true; 208} 209 210void 
211lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir) 212{ 213 assert(ir->operands[1]->type->is_float_16_32_64()); 214 215 /* New expression for the 1.0 / op1 */ 216 ir_rvalue *expr; 217 expr = new(ir) ir_expression(ir_unop_rcp, 218 ir->operands[1]->type, 219 ir->operands[1]); 220 221 /* op0 / op1 -> op0 * (1.0 / op1) */ 222 ir->operation = ir_binop_mul; 223 ir->init_num_operands(); 224 ir->operands[1] = expr; 225 226 this->progress = true; 227} 228 229void 230lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir) 231{ 232 assert(ir->operands[1]->type->is_integer_32()); 233 234 /* Be careful with integer division -- we need to do it as a 235 * float and re-truncate, since rcp(n > 1) of an integer would 236 * just be 0. 237 */ 238 ir_rvalue *op0, *op1; 239 const struct glsl_type *vec_type; 240 241 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 242 ir->operands[1]->type->vector_elements, 243 ir->operands[1]->type->matrix_columns); 244 245 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) 246 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL); 247 else 248 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL); 249 250 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL); 251 252 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 253 ir->operands[0]->type->vector_elements, 254 ir->operands[0]->type->matrix_columns); 255 256 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) 257 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL); 258 else 259 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL); 260 261 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 262 ir->type->vector_elements, 263 ir->type->matrix_columns); 264 265 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1); 266 267 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) { 268 ir->operation = ir_unop_f2i; 269 ir->operands[0] = op0; 270 } else { 271 ir->operation 
= ir_unop_i2u; 272 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0); 273 } 274 ir->init_num_operands(); 275 ir->operands[1] = NULL; 276 277 this->progress = true; 278} 279 280void 281lower_instructions_visitor::exp_to_exp2(ir_expression *ir) 282{ 283 ir_constant *log2_e = _imm_fp(ir, ir->type, M_LOG2E); 284 285 ir->operation = ir_unop_exp2; 286 ir->init_num_operands(); 287 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type, 288 ir->operands[0], log2_e); 289 this->progress = true; 290} 291 292void 293lower_instructions_visitor::pow_to_exp2(ir_expression *ir) 294{ 295 ir_expression *const log2_x = 296 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 297 ir->operands[0]); 298 299 ir->operation = ir_unop_exp2; 300 ir->init_num_operands(); 301 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type, 302 ir->operands[1], log2_x); 303 ir->operands[1] = NULL; 304 this->progress = true; 305} 306 307void 308lower_instructions_visitor::log_to_log2(ir_expression *ir) 309{ 310 ir->operation = ir_binop_mul; 311 ir->init_num_operands(); 312 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 313 ir->operands[0], NULL); 314 ir->operands[1] = _imm_fp(ir, ir->operands[0]->type, 1.0 / M_LOG2E); 315 this->progress = true; 316} 317 318void 319lower_instructions_visitor::mod_to_floor(ir_expression *ir) 320{ 321 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x", 322 ir_var_temporary); 323 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y", 324 ir_var_temporary); 325 this->base_ir->insert_before(x); 326 this->base_ir->insert_before(y); 327 328 ir_assignment *const assign_x = 329 new(ir) ir_assignment(new(ir) ir_dereference_variable(x), 330 ir->operands[0]); 331 ir_assignment *const assign_y = 332 new(ir) ir_assignment(new(ir) ir_dereference_variable(y), 333 ir->operands[1]); 334 335 this->base_ir->insert_before(assign_x); 336 this->base_ir->insert_before(assign_y); 
337 338 ir_expression *const div_expr = 339 new(ir) ir_expression(ir_binop_div, x->type, 340 new(ir) ir_dereference_variable(x), 341 new(ir) ir_dereference_variable(y)); 342 343 /* Don't generate new IR that would need to be lowered in an additional 344 * pass. 345 */ 346 if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float_16_32()) || 347 (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double())) 348 div_to_mul_rcp(div_expr); 349 350 ir_expression *const floor_expr = 351 new(ir) ir_expression(ir_unop_floor, x->type, div_expr); 352 353 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 354 dfloor_to_dfrac(floor_expr); 355 356 ir_expression *const mul_expr = 357 new(ir) ir_expression(ir_binop_mul, 358 new(ir) ir_dereference_variable(y), 359 floor_expr); 360 361 ir->operation = ir_binop_sub; 362 ir->init_num_operands(); 363 ir->operands[0] = new(ir) ir_dereference_variable(x); 364 ir->operands[1] = mul_expr; 365 this->progress = true; 366} 367 368void 369lower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 370{ 371 /* Translates 372 * ir_binop_ldexp x exp 373 * into 374 * 375 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 376 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 377 * 378 * if (extracted_biased_exp >= 255) 379 * return x; // +/-inf, NaN 380 * 381 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 382 * 383 * if (min(resulting_biased_exp, extracted_biased_exp) < 1) 384 * resulting_biased_exp = 0; 385 * if (resulting_biased_exp >= 255 || 386 * min(resulting_biased_exp, extracted_biased_exp) < 1) { 387 * sign_mantissa &= sign_mask; 388 * } 389 * 390 * return bitcast_u2f(sign_mantissa | 391 * lshift(i2u(resulting_biased_exp), exp_shift)); 392 * 393 * which we can't actually implement as such, since the GLSL IR doesn't 394 * have vectorized if-statements. 
We actually implement it without branches 395 * using conditional-select: 396 * 397 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 398 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 399 * 400 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 401 * 402 * flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0); 403 * resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp) 404 * zero_mantissa = logic_or(flush_to_zero, 405 * gequal(resulting_biased_exp, 255)); 406 * sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa); 407 * 408 * result = sign_mantissa | 409 * lshift(i2u(resulting_biased_exp), exp_shift)); 410 * 411 * return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result)); 412 * 413 * The definition of ldexp in the GLSL spec says: 414 * 415 * "If this product is too large to be represented in the 416 * floating-point type, the result is undefined." 417 * 418 * However, the definition of ldexp in the GLSL ES spec does not contain 419 * this sentence, so we do need to handle overflow correctly. 420 * 421 * There is additional language limiting the defined range of exp, but this 422 * is merely to allow implementations that store 2^exp in a temporary 423 * variable. 
424 */ 425 426 const unsigned vec_elem = ir->type->vector_elements; 427 428 /* Types */ 429 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 430 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 431 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 432 433 /* Temporary variables */ 434 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 435 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 436 ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary); 437 438 ir_variable *extracted_biased_exp = 439 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 440 ir_variable *resulting_biased_exp = 441 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 442 443 ir_variable *sign_mantissa = 444 new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary); 445 446 ir_variable *flush_to_zero = 447 new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary); 448 ir_variable *zero_mantissa = 449 new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary); 450 451 ir_instruction &i = *base_ir; 452 453 /* Copy <x> and <exp> arguments. */ 454 i.insert_before(x); 455 i.insert_before(assign(x, ir->operands[0])); 456 i.insert_before(exp); 457 i.insert_before(assign(exp, ir->operands[1])); 458 459 /* Extract the biased exponent from <x>. */ 460 i.insert_before(extracted_biased_exp); 461 i.insert_before(assign(extracted_biased_exp, 462 rshift(bitcast_f2i(abs(x)), 463 new(ir) ir_constant(23, vec_elem)))); 464 465 /* The definition of ldexp in the GLSL 4.60 spec says: 466 * 467 * "If exp is greater than +128 (single-precision) or +1024 468 * (double-precision), the value returned is undefined. If exp is less 469 * than -126 (single-precision) or -1022 (double-precision), the value 470 * returned may be flushed to zero." 
471 * 472 * So we do not have to guard against the possibility of addition overflow, 473 * which could happen when exp is close to INT_MAX. Addition underflow 474 * cannot happen (the worst case is 0 + (-INT_MAX)). 475 */ 476 i.insert_before(resulting_biased_exp); 477 i.insert_before(assign(resulting_biased_exp, 478 min2(add(extracted_biased_exp, exp), 479 new(ir) ir_constant(255, vec_elem)))); 480 481 i.insert_before(sign_mantissa); 482 i.insert_before(assign(sign_mantissa, 483 bit_and(bitcast_f2u(x), 484 new(ir) ir_constant(0x807fffffu, vec_elem)))); 485 486 /* We flush to zero if the original or resulting biased exponent is 0, 487 * indicating a +/-0.0 or subnormal input or output. 488 * 489 * The mantissa is set to 0 if the resulting biased exponent is 255, since 490 * an overflow should produce a +/-inf result. 491 * 492 * Note that NaN inputs are handled separately. 493 */ 494 i.insert_before(flush_to_zero); 495 i.insert_before(assign(flush_to_zero, 496 lequal(min2(resulting_biased_exp, 497 extracted_biased_exp), 498 ir_constant::zero(ir, ivec)))); 499 i.insert_before(assign(resulting_biased_exp, 500 csel(flush_to_zero, 501 ir_constant::zero(ir, ivec), 502 resulting_biased_exp))); 503 504 i.insert_before(zero_mantissa); 505 i.insert_before(assign(zero_mantissa, 506 logic_or(flush_to_zero, 507 equal(resulting_biased_exp, 508 new(ir) ir_constant(255, vec_elem))))); 509 i.insert_before(assign(sign_mantissa, 510 csel(zero_mantissa, 511 bit_and(sign_mantissa, 512 new(ir) ir_constant(0x80000000u, vec_elem)), 513 sign_mantissa))); 514 515 /* Don't generate new IR that would need to be lowered in an additional 516 * pass. 
517 */ 518 i.insert_before(result); 519 if (!lowering(INSERT_TO_SHIFTS)) { 520 i.insert_before(assign(result, 521 bitfield_insert(sign_mantissa, 522 i2u(resulting_biased_exp), 523 new(ir) ir_constant(23u, vec_elem), 524 new(ir) ir_constant(8u, vec_elem)))); 525 } else { 526 i.insert_before(assign(result, 527 bit_or(sign_mantissa, 528 lshift(i2u(resulting_biased_exp), 529 new(ir) ir_constant(23, vec_elem))))); 530 } 531 532 ir->operation = ir_triop_csel; 533 ir->init_num_operands(); 534 ir->operands[0] = gequal(extracted_biased_exp, 535 new(ir) ir_constant(255, vec_elem)); 536 ir->operands[1] = new(ir) ir_dereference_variable(x); 537 ir->operands[2] = bitcast_u2f(result); 538 539 this->progress = true; 540} 541 542void 543lower_instructions_visitor::dldexp_to_arith(ir_expression *ir) 544{ 545 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent 546 * from the significand. 547 */ 548 549 const unsigned vec_elem = ir->type->vector_elements; 550 551 /* Types */ 552 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 553 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 554 555 /* Constants */ 556 ir_constant *zeroi = ir_constant::zero(ir, ivec); 557 558 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u); 559 560 ir_constant *exp_shift = new(ir) ir_constant(20u); 561 ir_constant *exp_width = new(ir) ir_constant(11u); 562 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem); 563 564 /* Temporary variables */ 565 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 566 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 567 568 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x", 569 ir_var_temporary); 570 571 ir_variable *extracted_biased_exp = 572 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 573 ir_variable *resulting_biased_exp = 574 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 
575 576 ir_variable *is_not_zero_or_underflow = 577 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary); 578 579 ir_instruction &i = *base_ir; 580 581 /* Copy <x> and <exp> arguments. */ 582 i.insert_before(x); 583 i.insert_before(assign(x, ir->operands[0])); 584 i.insert_before(exp); 585 i.insert_before(assign(exp, ir->operands[1])); 586 587 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x); 588 if (lowering(DFREXP_DLDEXP_TO_ARITH)) 589 dfrexp_exp_to_arith(frexp_exp); 590 591 /* Extract the biased exponent from <x>. */ 592 i.insert_before(extracted_biased_exp); 593 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias))); 594 595 i.insert_before(resulting_biased_exp); 596 i.insert_before(assign(resulting_biased_exp, 597 add(extracted_biased_exp, exp))); 598 599 /* Test if result is ±0.0, subnormal, or underflow by checking if the 600 * resulting biased exponent would be less than 0x1. If so, the result is 601 * 0.0 with the sign of x. (Actually, invert the conditions so that 602 * immediate values are the second arguments, which is better for i965) 603 * TODO: Implement in a vector fashion. 
604 */ 605 i.insert_before(zero_sign_x); 606 for (unsigned elem = 0; elem < vec_elem; elem++) { 607 ir_variable *unpacked = 608 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 609 i.insert_before(unpacked); 610 i.insert_before( 611 assign(unpacked, 612 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 613 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)), 614 WRITEMASK_Y)); 615 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X)); 616 i.insert_before(assign(zero_sign_x, 617 expr(ir_unop_pack_double_2x32, unpacked), 618 1 << elem)); 619 } 620 i.insert_before(is_not_zero_or_underflow); 621 i.insert_before(assign(is_not_zero_or_underflow, 622 gequal(resulting_biased_exp, 623 new(ir) ir_constant(0x1, vec_elem)))); 624 i.insert_before(assign(x, csel(is_not_zero_or_underflow, 625 x, zero_sign_x))); 626 i.insert_before(assign(resulting_biased_exp, 627 csel(is_not_zero_or_underflow, 628 resulting_biased_exp, zeroi))); 629 630 /* We could test for overflows by checking if the resulting biased exponent 631 * would be greater than 0xFE. Turns out we don't need to because the GLSL 632 * spec says: 633 * 634 * "If this product is too large to be represented in the 635 * floating-point type, the result is undefined." 
636 */ 637 638 ir_rvalue *results[4] = {NULL}; 639 for (unsigned elem = 0; elem < vec_elem; elem++) { 640 ir_variable *unpacked = 641 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 642 i.insert_before(unpacked); 643 i.insert_before( 644 assign(unpacked, 645 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 646 647 ir_expression *bfi = bitfield_insert( 648 swizzle_y(unpacked), 649 i2u(swizzle(resulting_biased_exp, elem, 1)), 650 exp_shift->clone(ir, NULL), 651 exp_width->clone(ir, NULL)); 652 653 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y)); 654 655 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 656 } 657 658 ir->operation = ir_quadop_vector; 659 ir->init_num_operands(); 660 ir->operands[0] = results[0]; 661 ir->operands[1] = results[1]; 662 ir->operands[2] = results[2]; 663 ir->operands[3] = results[3]; 664 665 /* Don't generate new IR that would need to be lowered in an additional 666 * pass. 667 */ 668 669 this->progress = true; 670} 671 672void 673lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir) 674{ 675 const unsigned vec_elem = ir->type->vector_elements; 676 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 677 678 /* Double-precision floating-point values are stored as 679 * 1 sign bit; 680 * 11 exponent bits; 681 * 52 mantissa bits. 682 * 683 * We're just extracting the significand here, so we only need to modify 684 * the upper 32-bit uint. Unfortunately we must extract each double 685 * independently as there is no vector version of unpackDouble. 
686 */ 687 688 ir_instruction &i = *base_ir; 689 690 ir_variable *is_not_zero = 691 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 692 ir_rvalue *results[4] = {NULL}; 693 694 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 695 i.insert_before(is_not_zero); 696 i.insert_before( 697 assign(is_not_zero, 698 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero))); 699 700 /* TODO: Remake this as more vector-friendly when int64 support is 701 * available. 702 */ 703 for (unsigned elem = 0; elem < vec_elem; elem++) { 704 ir_constant *zero = new(ir) ir_constant(0u, 1); 705 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1); 706 707 /* Exponent of double floating-point values in the range [0.5, 1.0). */ 708 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1); 709 710 ir_variable *bits = 711 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary); 712 ir_variable *unpacked = 713 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 714 715 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1); 716 717 i.insert_before(bits); 718 i.insert_before(unpacked); 719 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x))); 720 721 /* Manipulate the high uint to remove the exponent and replace it with 722 * either the default exponent or zero. 
723 */ 724 i.insert_before(assign(bits, swizzle_y(unpacked))); 725 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask))); 726 i.insert_before(assign(bits, bit_or(bits, 727 csel(swizzle(is_not_zero, elem, 1), 728 exponent_value, 729 zero)))); 730 i.insert_before(assign(unpacked, bits, WRITEMASK_Y)); 731 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 732 } 733 734 /* Put the dvec back together */ 735 ir->operation = ir_quadop_vector; 736 ir->init_num_operands(); 737 ir->operands[0] = results[0]; 738 ir->operands[1] = results[1]; 739 ir->operands[2] = results[2]; 740 ir->operands[3] = results[3]; 741 742 this->progress = true; 743} 744 745void 746lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir) 747{ 748 const unsigned vec_elem = ir->type->vector_elements; 749 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 750 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 751 752 /* Double-precision floating-point values are stored as 753 * 1 sign bit; 754 * 11 exponent bits; 755 * 52 mantissa bits. 756 * 757 * We're just extracting the exponent here, so we only care about the upper 758 * 32-bit uint. 759 */ 760 761 ir_instruction &i = *base_ir; 762 763 ir_variable *is_not_zero = 764 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 765 ir_variable *high_words = 766 new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 767 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 768 ir_constant *izero = new(ir) ir_constant(0, vec_elem); 769 770 ir_rvalue *absval = abs(ir->operands[0]); 771 772 i.insert_before(is_not_zero); 773 i.insert_before(high_words); 774 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 775 776 /* Extract all of the upper uints. 
*/ 777 for (unsigned elem = 0; elem < vec_elem; elem++) { 778 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 779 780 i.insert_before(assign(high_words, 781 swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 782 1 << elem)); 783 784 } 785 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 786 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 787 788 /* For non-zero inputs, shift the exponent down and apply bias. */ 789 ir->operation = ir_triop_csel; 790 ir->init_num_operands(); 791 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 792 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 793 ir->operands[2] = izero; 794 795 this->progress = true; 796} 797 798void 799lower_instructions_visitor::carry_to_arith(ir_expression *ir) 800{ 801 /* Translates 802 * ir_binop_carry x y 803 * into 804 * sum = ir_binop_add x y 805 * bcarry = ir_binop_less sum x 806 * carry = ir_unop_b2i bcarry 807 */ 808 809 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 810 ir->operation = ir_unop_i2u; 811 ir->init_num_operands(); 812 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 813 ir->operands[1] = NULL; 814 815 this->progress = true; 816} 817 818void 819lower_instructions_visitor::borrow_to_arith(ir_expression *ir) 820{ 821 /* Translates 822 * ir_binop_borrow x y 823 * into 824 * bcarry = ir_binop_less x y 825 * carry = ir_unop_b2i bcarry 826 */ 827 828 ir->operation = ir_unop_i2u; 829 ir->init_num_operands(); 830 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 831 ir->operands[1] = NULL; 832 833 this->progress = true; 834} 835 836void 837lower_instructions_visitor::sat_to_clamp(ir_expression *ir) 838{ 839 /* Translates 840 * ir_unop_saturate x 841 * into 842 * ir_binop_min (ir_binop_max(x, 0.0), 1.0) 843 */ 844 845 ir->operation = ir_binop_min; 846 ir->init_num_operands(); 847 848 ir_constant *zero = _imm_fp(ir, ir->operands[0]->type, 0.0); 849 ir->operands[0] 
= new(ir) ir_expression(ir_binop_max, ir->operands[0]->type, 850 ir->operands[0], zero); 851 ir->operands[1] = _imm_fp(ir, ir->operands[0]->type, 1.0); 852 853 this->progress = true; 854} 855 856void 857lower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 858{ 859 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 860 ir_var_temporary); 861 this->base_ir->insert_before(temp); 862 863 int nc = ir->operands[0]->type->components(); 864 for (int i = nc - 1; i >= 1; i--) { 865 ir_assignment *assig; 866 if (i == (nc - 1)) { 867 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 868 swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 869 } else { 870 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 871 swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 872 temp)); 873 } 874 this->base_ir->insert_before(assig); 875 } 876 877 ir->operation = ir_triop_fma; 878 ir->init_num_operands(); 879 ir->operands[0] = swizzle(ir->operands[0], 0, 1); 880 ir->operands[1] = swizzle(ir->operands[1], 0, 1); 881 ir->operands[2] = new(ir) ir_dereference_variable(temp); 882 883 this->progress = true; 884 885} 886 887void 888lower_instructions_visitor::double_lrp(ir_expression *ir) 889{ 890 int swizval; 891 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 892 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 893 894 switch (op2->type->vector_elements) { 895 case 1: 896 swizval = SWIZZLE_XXXX; 897 break; 898 default: 899 assert(op0->type->vector_elements == op2->type->vector_elements); 900 swizval = SWIZZLE_XYZW; 901 break; 902 } 903 904 ir->operation = ir_triop_fma; 905 ir->init_num_operands(); 906 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 907 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 908 909 this->progress = true; 910} 911 912void 913lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 914{ 915 /* 916 * frtemp = 
frac(x); 917 * temp = sub(x, frtemp); 918 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0); 919 */ 920 ir_instruction &i = *base_ir; 921 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 922 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 923 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 924 ir_var_temporary); 925 926 i.insert_before(frtemp); 927 i.insert_before(assign(frtemp, fract(ir->operands[0]))); 928 929 ir->operation = ir_binop_add; 930 ir->init_num_operands(); 931 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 932 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 933 934 this->progress = true; 935} 936 937void 938lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 939{ 940 /* 941 * frtemp = frac(x); 942 * result = sub(x, frtemp); 943 */ 944 ir->operation = ir_binop_sub; 945 ir->init_num_operands(); 946 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 947 948 this->progress = true; 949} 950void 951lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 952{ 953 /* 954 * insane but works 955 * temp = x + 0.5; 956 * frtemp = frac(temp); 957 * t2 = sub(temp, frtemp); 958 * if (frac(x) == 0.5) 959 * result = frac(t2 * 0.5) == 0 ? 
t2 : t2 - 1; 960 * else 961 * result = t2; 962 963 */ 964 ir_instruction &i = *base_ir; 965 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 966 ir_var_temporary); 967 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 968 ir_var_temporary); 969 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 970 ir_var_temporary); 971 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 972 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 973 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 974 975 i.insert_before(temp); 976 i.insert_before(assign(temp, add(ir->operands[0], p5))); 977 978 i.insert_before(frtemp); 979 i.insert_before(assign(frtemp, fract(temp))); 980 981 i.insert_before(t2); 982 i.insert_before(assign(t2, sub(temp, frtemp))); 983 984 ir->operation = ir_triop_csel; 985 ir->init_num_operands(); 986 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 987 p5->clone(ir, NULL)); 988 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 989 zero), 990 t2, 991 sub(t2, one)); 992 ir->operands[2] = new(ir) ir_dereference_variable(t2); 993 994 this->progress = true; 995} 996 997void 998lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 999{ 1000 /* 1001 * frtemp = frac(x); 1002 * temp = sub(x, frtemp); 1003 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 
0 : 1; 1004 */ 1005 ir_rvalue *arg = ir->operands[0]; 1006 ir_instruction &i = *base_ir; 1007 1008 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1009 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1010 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 1011 ir_var_temporary); 1012 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 1013 ir_var_temporary); 1014 1015 i.insert_before(frtemp); 1016 i.insert_before(assign(frtemp, fract(arg))); 1017 i.insert_before(temp); 1018 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 1019 1020 ir->operation = ir_triop_csel; 1021 ir->init_num_operands(); 1022 ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 1023 ir->operands[1] = new (ir) ir_dereference_variable(temp); 1024 ir->operands[2] = add(temp, 1025 csel(equal(frtemp, zero->clone(ir, NULL)), 1026 zero->clone(ir, NULL), 1027 one)); 1028 1029 this->progress = true; 1030} 1031 1032void 1033lower_instructions_visitor::dsign_to_csel(ir_expression *ir) 1034{ 1035 /* 1036 * temp = x > 0.0 ? 1.0 : 0.0; 1037 * result = x < 0.0 ? 
-1.0 : temp; 1038 */ 1039 ir_rvalue *arg = ir->operands[0]; 1040 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1041 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1042 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 1043 1044 ir->operation = ir_triop_csel; 1045 ir->init_num_operands(); 1046 ir->operands[0] = less(arg->clone(ir, NULL), 1047 zero->clone(ir, NULL)); 1048 ir->operands[1] = neg_one; 1049 ir->operands[2] = csel(greater(arg, zero), 1050 one, 1051 zero->clone(ir, NULL)); 1052 1053 this->progress = true; 1054} 1055 1056void 1057lower_instructions_visitor::bit_count_to_math(ir_expression *ir) 1058{ 1059 /* For more details, see: 1060 * 1061 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel 1062 */ 1063 const unsigned elements = ir->operands[0]->type->vector_elements; 1064 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 1065 ir_var_temporary); 1066 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 1067 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 1068 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 1069 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 1070 ir_constant *c1 = new(ir) ir_constant(1u); 1071 ir_constant *c2 = new(ir) ir_constant(2u); 1072 ir_constant *c4 = new(ir) ir_constant(4u); 1073 ir_constant *c24 = new(ir) ir_constant(24u); 1074 1075 base_ir->insert_before(temp); 1076 1077 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1078 base_ir->insert_before(assign(temp, ir->operands[0])); 1079 } else { 1080 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1081 base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 1082 } 1083 1084 /* temp = temp - ((temp >> 1) & 0x55555555u); */ 1085 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 1086 c55555555)))); 1087 1088 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 
1089 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 1090 bit_and(rshift(temp, c2), 1091 c33333333->clone(ir, NULL))))); 1092 1093 /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */ 1094 ir->operation = ir_unop_u2i; 1095 ir->init_num_operands(); 1096 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 1097 c01010101), 1098 c24); 1099 1100 this->progress = true; 1101} 1102 1103void 1104lower_instructions_visitor::extract_to_shifts(ir_expression *ir) 1105{ 1106 ir_variable *bits = 1107 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1108 1109 base_ir->insert_before(bits); 1110 base_ir->insert_before(assign(bits, ir->operands[2])); 1111 1112 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1113 ir_constant *c1 = 1114 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1115 ir_constant *c32 = 1116 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1117 ir_constant *cFFFFFFFF = 1118 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1119 1120 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1121 * we'd get a mask of 0 when bits is 32. Special case it. 1122 * 1123 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 1124 */ 1125 ir_expression *mask = csel(equal(bits, c32), 1126 cFFFFFFFF, 1127 sub(lshift(c1, bits), c1->clone(ir, NULL))); 1128 1129 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1130 * 1131 * If bits is zero, the result will be zero. 1132 * 1133 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 1134 * select as in the signed integer case. 
1135 * 1136 * (value >> offset) & mask; 1137 */ 1138 ir->operation = ir_binop_bit_and; 1139 ir->init_num_operands(); 1140 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 1141 ir->operands[1] = mask; 1142 ir->operands[2] = NULL; 1143 } else { 1144 ir_constant *c0 = 1145 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 1146 ir_constant *c32 = 1147 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1148 ir_variable *temp = 1149 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 1150 1151 /* temp = 32 - bits; */ 1152 base_ir->insert_before(temp); 1153 base_ir->insert_before(assign(temp, sub(c32, bits))); 1154 1155 /* expr = value << (temp - offset)) >> temp; */ 1156 ir_expression *expr = 1157 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 1158 1159 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1160 * 1161 * If bits is zero, the result will be zero. 1162 * 1163 * Due to the (x << (y%32)) behavior mentioned before, the (value << 1164 * (32-0)) doesn't "erase" all of the data as we would like, so finish 1165 * up with: 1166 * 1167 * (bits == 0) ? 
0 : e; 1168 */ 1169 ir->operation = ir_triop_csel; 1170 ir->init_num_operands(); 1171 ir->operands[0] = equal(c0, bits); 1172 ir->operands[1] = c0->clone(ir, NULL); 1173 ir->operands[2] = expr; 1174 } 1175 1176 this->progress = true; 1177} 1178 1179void 1180lower_instructions_visitor::insert_to_shifts(ir_expression *ir) 1181{ 1182 ir_constant *c1; 1183 ir_constant *c32; 1184 ir_constant *cFFFFFFFF; 1185 ir_variable *offset = 1186 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 1187 ir_variable *bits = 1188 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1189 ir_variable *mask = 1190 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 1191 1192 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1193 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 1194 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1195 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 1196 } else { 1197 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1198 1199 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1200 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1201 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1202 } 1203 1204 base_ir->insert_before(offset); 1205 base_ir->insert_before(assign(offset, ir->operands[2])); 1206 1207 base_ir->insert_before(bits); 1208 base_ir->insert_before(assign(bits, ir->operands[3])); 1209 1210 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1211 * we'd get a mask of 0 when bits is 32. Special case it. 1212 * 1213 * mask = (bits == 32 ? 
0xffffffff : (1u << bits) - 1u) << offset; 1214 * 1215 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1216 * 1217 * The result will be undefined if offset or bits is negative, or if the 1218 * sum of offset and bits is greater than the number of bits used to 1219 * store the operand. 1220 * 1221 * Since it's undefined, there are a couple other ways this could be 1222 * implemented. The other way that was considered was to put the csel 1223 * around the whole thing: 1224 * 1225 * final_result = bits == 32 ? insert : ... ; 1226 */ 1227 base_ir->insert_before(mask); 1228 1229 base_ir->insert_before(assign(mask, csel(equal(bits, c32), 1230 cFFFFFFFF, 1231 lshift(sub(lshift(c1, bits), 1232 c1->clone(ir, NULL)), 1233 offset)))); 1234 1235 /* (base & ~mask) | ((insert << offset) & mask) */ 1236 ir->operation = ir_binop_bit_or; 1237 ir->init_num_operands(); 1238 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 1239 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 1240 ir->operands[2] = NULL; 1241 ir->operands[3] = NULL; 1242 1243 this->progress = true; 1244} 1245 1246void 1247lower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 1248{ 1249 /* For more details, see: 1250 * 1251 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 1252 */ 1253 ir_constant *c1 = 1254 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1255 ir_constant *c2 = 1256 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 1257 ir_constant *c4 = 1258 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 1259 ir_constant *c8 = 1260 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 1261 ir_constant *c16 = 1262 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 1263 ir_constant *c33333333 = 1264 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 1265 ir_constant *c55555555 = 1266 new(ir) ir_constant(0x55555555u, 
ir->operands[0]->type->vector_elements); 1267 ir_constant *c0F0F0F0F = 1268 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 1269 ir_constant *c00FF00FF = 1270 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 1271 ir_variable *temp = 1272 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 1273 "temp", ir_var_temporary); 1274 ir_instruction &i = *base_ir; 1275 1276 i.insert_before(temp); 1277 1278 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1279 i.insert_before(assign(temp, ir->operands[0])); 1280 } else { 1281 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1282 i.insert_before(assign(temp, i2u(ir->operands[0]))); 1283 } 1284 1285 /* Swap odd and even bits. 1286 * 1287 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 1288 */ 1289 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 1290 lshift(bit_and(temp, c55555555->clone(ir, NULL)), 1291 c1->clone(ir, NULL))))); 1292 /* Swap consecutive pairs. 1293 * 1294 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 1295 */ 1296 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 1297 lshift(bit_and(temp, c33333333->clone(ir, NULL)), 1298 c2->clone(ir, NULL))))); 1299 1300 /* Swap nibbles. 1301 * 1302 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 1303 */ 1304 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 1305 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 1306 c4->clone(ir, NULL))))); 1307 1308 /* The last step is, basically, bswap. Swap the bytes, then swap the 1309 * words. When this code is run through GCC on x86, it does generate a 1310 * bswap instruction. 
1311 * 1312 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 1313 * temp = ( temp >> 16 ) | ( temp << 16); 1314 */ 1315 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 1316 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 1317 c8->clone(ir, NULL))))); 1318 1319 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1320 ir->operation = ir_binop_bit_or; 1321 ir->init_num_operands(); 1322 ir->operands[0] = rshift(temp, c16); 1323 ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 1324 } else { 1325 ir->operation = ir_unop_u2i; 1326 ir->init_num_operands(); 1327 ir->operands[0] = bit_or(rshift(temp, c16), 1328 lshift(temp, c16->clone(ir, NULL))); 1329 } 1330 1331 this->progress = true; 1332} 1333 1334void 1335lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 1336{ 1337 /* For more details, see: 1338 * 1339 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1340 */ 1341 const unsigned elements = ir->operands[0]->type->vector_elements; 1342 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements); 1343 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1344 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1345 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1346 ir_variable *temp = 1347 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 1348 ir_variable *lsb_only = 1349 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 1350 ir_variable *as_float = 1351 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1352 ir_variable *lsb = 1353 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 1354 1355 ir_instruction &i = *base_ir; 1356 1357 i.insert_before(temp); 1358 1359 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1360 i.insert_before(assign(temp, ir->operands[0])); 1361 } else { 1362 assert(ir->operands[0]->type->base_type == 
GLSL_TYPE_UINT); 1363 i.insert_before(assign(temp, u2i(ir->operands[0]))); 1364 } 1365 1366 /* The int-to-float conversion is lossless because (value & -value) is 1367 * either a power of two or zero. We don't use the result in the zero 1368 * case. The uint() cast is necessary so that 0x80000000 does not 1369 * generate a negative value. 1370 * 1371 * uint lsb_only = uint(value & -value); 1372 * float as_float = float(lsb_only); 1373 */ 1374 i.insert_before(lsb_only); 1375 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 1376 1377 i.insert_before(as_float); 1378 i.insert_before(assign(as_float, u2f(lsb_only))); 1379 1380 /* This is basically an open-coded frexp. Implementations that have a 1381 * native frexp instruction would be better served by that. This is 1382 * optimized versus a full-featured open-coded implementation in two ways: 1383 * 1384 * - We don't care about a correct result from subnormal numbers (including 1385 * 0.0), so the raw exponent can always be safely unbiased. 1386 * 1387 * - The value cannot be negative, so it does not need to be masked off to 1388 * extract the exponent. 1389 * 1390 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1391 */ 1392 i.insert_before(lsb); 1393 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1394 1395 /* Use lsb_only in the comparison instead of temp so that the & (far above) 1396 * can possibly generate the result without an explicit comparison. 1397 * 1398 * (lsb_only == 0) ? -1 : lsb; 1399 * 1400 * Since our input values are all integers, the unbiased exponent must not 1401 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 1402 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 1403 * better is likely GPU dependent. Either way, the difference should be 1404 * small. 
1405 */ 1406 ir->operation = ir_triop_csel; 1407 ir->init_num_operands(); 1408 ir->operands[0] = equal(lsb_only, c0); 1409 ir->operands[1] = cminus1; 1410 ir->operands[2] = new(ir) ir_dereference_variable(lsb); 1411 1412 this->progress = true; 1413} 1414 1415void 1416lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 1417{ 1418 /* For more details, see: 1419 * 1420 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1421 */ 1422 const unsigned elements = ir->operands[0]->type->vector_elements; 1423 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1424 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1425 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1426 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1427 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 1428 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 1429 ir_variable *temp = 1430 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 1431 ir_variable *as_float = 1432 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1433 ir_variable *msb = 1434 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 1435 1436 ir_instruction &i = *base_ir; 1437 1438 i.insert_before(temp); 1439 1440 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1441 i.insert_before(assign(temp, ir->operands[0])); 1442 } else { 1443 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1444 1445 /* findMSB(uint(abs(some_int))) almost always does the right thing. 1446 * There are two problem values: 1447 * 1448 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 1449 * 31. However, findMSB(int(0x80000000)) == 30. 1450 * 1451 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 1452 * 31. 
Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1453 * 1454 * For a value of zero or negative one, -1 will be returned. 1455 * 1456 * For all negative number cases, including 0x80000000 and 0xffffffff, 1457 * the correct value is obtained from findMSB if instead of negating the 1458 * (already negative) value the logical-not is used. A conditonal 1459 * logical-not can be achieved in two instructions. 1460 */ 1461 ir_variable *as_int = 1462 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 1463 ir_constant *c31 = new(ir) ir_constant(int(31), elements); 1464 1465 i.insert_before(as_int); 1466 i.insert_before(assign(as_int, ir->operands[0])); 1467 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 1468 as_int, 1469 rshift(as_int, c31))))); 1470 } 1471 1472 /* The int-to-float conversion is lossless because bits are conditionally 1473 * masked off the bottom of temp to ensure the value has at most 24 bits of 1474 * data or is zero. We don't use the result in the zero case. The uint() 1475 * cast is necessary so that 0x80000000 does not generate a negative value. 1476 * 1477 * float as_float = float(temp > 255 ? temp & ~255 : temp); 1478 */ 1479 i.insert_before(as_float); 1480 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 1481 bit_and(temp, cFFFFFF00), 1482 temp)))); 1483 1484 /* This is basically an open-coded frexp. Implementations that have a 1485 * native frexp instruction would be better served by that. This is 1486 * optimized versus a full-featured open-coded implementation in two ways: 1487 * 1488 * - We don't care about a correct result from subnormal numbers (including 1489 * 0.0), so the raw exponent can always be safely unbiased. 1490 * 1491 * - The value cannot be negative, so it does not need to be masked off to 1492 * extract the exponent. 
1493 * 1494 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1495 */ 1496 i.insert_before(msb); 1497 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1498 1499 /* Use msb in the comparison instead of temp so that the subtract can 1500 * possibly generate the result without an explicit comparison. 1501 * 1502 * (msb < 0) ? -1 : msb; 1503 * 1504 * Since our input values are all integers, the unbiased exponent must not 1505 * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 1506 */ 1507 ir->operation = ir_triop_csel; 1508 ir->init_num_operands(); 1509 ir->operands[0] = less(msb, c0); 1510 ir->operands[1] = cminus1; 1511 ir->operands[2] = new(ir) ir_dereference_variable(msb); 1512 1513 this->progress = true; 1514} 1515 1516ir_expression * 1517lower_instructions_visitor::_carry(operand a, operand b) 1518{ 1519 if (lowering(CARRY_TO_ARITH)) 1520 return i2u(b2i(less(add(a, b), 1521 a.val->clone(ralloc_parent(a.val), NULL)))); 1522 else 1523 return carry(a, b); 1524} 1525 1526ir_constant * 1527lower_instructions_visitor::_imm_fp(void *mem_ctx, 1528 const glsl_type *type, 1529 double f, 1530 unsigned vector_elements) 1531{ 1532 switch (type->base_type) { 1533 case GLSL_TYPE_FLOAT: 1534 return new(mem_ctx) ir_constant((float) f, vector_elements); 1535 case GLSL_TYPE_DOUBLE: 1536 return new(mem_ctx) ir_constant((double) f, vector_elements); 1537 case GLSL_TYPE_FLOAT16: 1538 return new(mem_ctx) ir_constant(float16_t(f), vector_elements); 1539 default: 1540 assert(!"unknown float type for immediate"); 1541 return NULL; 1542 } 1543} 1544 1545void 1546lower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 1547{ 1548 /* ABCD 1549 * * EFGH 1550 * ====== 1551 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1552 * 1553 * In GLSL, (a * b) becomes 1554 * 1555 * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 1556 * uint m2 = (a & 0x0000ffffu) * (b >> 16); 1557 * uint m3 = (a >> 16) * (b & 0x0000ffffu); 1558 
* uint m4 = (a >> 16) * (b >> 16); 1559 * 1560 * uint c1; 1561 * uint c2; 1562 * uint lo_result; 1563 * uint hi_result; 1564 * 1565 * lo_result = uaddCarry(m1, m2 << 16, c1); 1566 * hi_result = m4 + c1; 1567 * lo_result = uaddCarry(lo_result, m3 << 16, c2); 1568 * hi_result = hi_result + c2; 1569 * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 1570 */ 1571 const unsigned elements = ir->operands[0]->type->vector_elements; 1572 ir_variable *src1 = 1573 new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary); 1574 ir_variable *src1h = 1575 new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 1576 ir_variable *src1l = 1577 new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 1578 ir_variable *src2 = 1579 new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 1580 ir_variable *src2h = 1581 new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 1582 ir_variable *src2l = 1583 new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 1584 ir_variable *t1 = 1585 new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 1586 ir_variable *t2 = 1587 new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 1588 ir_variable *lo = 1589 new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 1590 ir_variable *hi = 1591 new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary); 1592 ir_variable *different_signs = NULL; 1593 ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 1594 ir_constant *c16 = new(ir) ir_constant(16u, elements); 1595 1596 ir_instruction &i = *base_ir; 1597 1598 i.insert_before(src1); 1599 i.insert_before(src2); 1600 i.insert_before(src1h); 1601 i.insert_before(src2h); 1602 i.insert_before(src1l); 1603 i.insert_before(src2l); 1604 1605 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1606 i.insert_before(assign(src1, ir->operands[0])); 1607 
i.insert_before(assign(src2, ir->operands[1])); 1608 } else { 1609 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1610 1611 ir_variable *itmp1 = 1612 new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 1613 ir_variable *itmp2 = 1614 new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 1615 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1616 1617 i.insert_before(itmp1); 1618 i.insert_before(itmp2); 1619 i.insert_before(assign(itmp1, ir->operands[0])); 1620 i.insert_before(assign(itmp2, ir->operands[1])); 1621 1622 different_signs = 1623 new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 1624 ir_var_temporary); 1625 1626 i.insert_before(different_signs); 1627 i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 1628 less(itmp1, c0), 1629 less(itmp2, c0->clone(ir, NULL))))); 1630 1631 i.insert_before(assign(src1, i2u(abs(itmp1)))); 1632 i.insert_before(assign(src2, i2u(abs(itmp2)))); 1633 } 1634 1635 i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 1636 i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 1637 i.insert_before(assign(src1h, rshift(src1, c16))); 1638 i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 1639 1640 i.insert_before(lo); 1641 i.insert_before(hi); 1642 i.insert_before(t1); 1643 i.insert_before(t2); 1644 1645 i.insert_before(assign(lo, mul(src1l, src2l))); 1646 i.insert_before(assign(t1, mul(src1l, src2h))); 1647 i.insert_before(assign(t2, mul(src1h, src2l))); 1648 i.insert_before(assign(hi, mul(src1h, src2h))); 1649 1650 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 1651 i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 1652 1653 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 1654 i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 1655 1656 if (different_signs == NULL) { 1657 
assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1658 1659 ir->operation = ir_binop_add; 1660 ir->init_num_operands(); 1661 ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 1662 ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 1663 } else { 1664 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1665 1666 i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 1667 rshift(t2, c16->clone(ir, NULL))))); 1668 1669 /* For channels where different_signs is set we have to perform a 64-bit 1670 * negation. This is *not* the same as just negating the high 32-bits. 1671 * Consider -3 * 2. The high 32-bits is 0, but the desired result is 1672 * -1, not -0! Recall -x == ~x + 1. 1673 */ 1674 ir_variable *neg_hi = 1675 new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 1676 ir_constant *c1 = new(ir) ir_constant(1u, elements); 1677 1678 i.insert_before(neg_hi); 1679 i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 1680 u2i(_carry(bit_not(lo), c1))))); 1681 1682 ir->operation = ir_triop_csel; 1683 ir->init_num_operands(); 1684 ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 1685 ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 1686 ir->operands[2] = u2i(hi); 1687 } 1688} 1689 1690void 1691lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 1692{ 1693 ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 1694 this->progress = true; 1695} 1696 1697void 1698lower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir) 1699{ 1700 /* Lower 32x32-> 64 to 1701 * msb = imul_high(x_lo, y_lo) 1702 * lsb = mul(x_lo, y_lo) 1703 */ 1704 const unsigned elements = ir->operands[0]->type->vector_elements; 1705 1706 const ir_expression_operation operation = 1707 ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32 1708 : ir_unop_pack_int_2x32; 1709 1710 const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64 1711 ? 
glsl_type::uvec(elements) 1712 : glsl_type::ivec(elements); 1713 1714 const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64 1715 ? glsl_type::uvec2_type 1716 : glsl_type::ivec2_type; 1717 1718 ir_instruction &i = *base_ir; 1719 1720 ir_variable *msb = 1721 new(ir) ir_variable(var_type, "msb", ir_var_temporary); 1722 ir_variable *lsb = 1723 new(ir) ir_variable(var_type, "lsb", ir_var_temporary); 1724 ir_variable *x = 1725 new(ir) ir_variable(var_type, "x", ir_var_temporary); 1726 ir_variable *y = 1727 new(ir) ir_variable(var_type, "y", ir_var_temporary); 1728 1729 i.insert_before(x); 1730 i.insert_before(assign(x, ir->operands[0])); 1731 i.insert_before(y); 1732 i.insert_before(assign(y, ir->operands[1])); 1733 i.insert_before(msb); 1734 i.insert_before(lsb); 1735 1736 i.insert_before(assign(msb, imul_high(x, y))); 1737 i.insert_before(assign(lsb, mul(x, y))); 1738 1739 ir_rvalue *result[4] = {NULL}; 1740 for (unsigned elem = 0; elem < elements; elem++) { 1741 ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type, 1742 swizzle(lsb, elem, 1), 1743 swizzle(msb, elem, 1), NULL, NULL); 1744 result[elem] = expr(operation, val); 1745 } 1746 1747 ir->operation = ir_quadop_vector; 1748 ir->init_num_operands(); 1749 ir->operands[0] = result[0]; 1750 ir->operands[1] = result[1]; 1751 ir->operands[2] = result[2]; 1752 ir->operands[3] = result[3]; 1753 1754 this->progress = true; 1755} 1756 1757ir_visitor_status 1758lower_instructions_visitor::visit_leave(ir_expression *ir) 1759{ 1760 switch (ir->operation) { 1761 case ir_binop_dot: 1762 if (ir->operands[0]->type->is_double()) 1763 double_dot_to_fma(ir); 1764 break; 1765 case ir_triop_lrp: 1766 if (ir->operands[0]->type->is_double()) 1767 double_lrp(ir); 1768 break; 1769 case ir_binop_sub: 1770 if (lowering(SUB_TO_ADD_NEG)) 1771 sub_to_add_neg(ir); 1772 break; 1773 1774 case ir_binop_div: 1775 if (ir->operands[1]->type->is_integer_32() && lowering(INT_DIV_TO_MUL_RCP)) 1776 int_div_to_mul_rcp(ir); 1777 
else if ((ir->operands[1]->type->is_float_16_32() && lowering(FDIV_TO_MUL_RCP)) || 1778 (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP))) 1779 div_to_mul_rcp(ir); 1780 break; 1781 1782 case ir_unop_exp: 1783 if (lowering(EXP_TO_EXP2)) 1784 exp_to_exp2(ir); 1785 break; 1786 1787 case ir_unop_log: 1788 if (lowering(LOG_TO_LOG2)) 1789 log_to_log2(ir); 1790 break; 1791 1792 case ir_binop_mod: 1793 if (lowering(MOD_TO_FLOOR) && ir->type->is_float_16_32_64()) 1794 mod_to_floor(ir); 1795 break; 1796 1797 case ir_binop_pow: 1798 if (lowering(POW_TO_EXP2)) 1799 pow_to_exp2(ir); 1800 break; 1801 1802 case ir_binop_ldexp: 1803 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 1804 ldexp_to_arith(ir); 1805 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 1806 dldexp_to_arith(ir); 1807 break; 1808 1809 case ir_unop_frexp_exp: 1810 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1811 dfrexp_exp_to_arith(ir); 1812 break; 1813 1814 case ir_unop_frexp_sig: 1815 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1816 dfrexp_sig_to_arith(ir); 1817 break; 1818 1819 case ir_binop_carry: 1820 if (lowering(CARRY_TO_ARITH)) 1821 carry_to_arith(ir); 1822 break; 1823 1824 case ir_binop_borrow: 1825 if (lowering(BORROW_TO_ARITH)) 1826 borrow_to_arith(ir); 1827 break; 1828 1829 case ir_unop_saturate: 1830 if (lowering(SAT_TO_CLAMP)) 1831 sat_to_clamp(ir); 1832 break; 1833 1834 case ir_unop_trunc: 1835 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1836 dtrunc_to_dfrac(ir); 1837 break; 1838 1839 case ir_unop_ceil: 1840 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1841 dceil_to_dfrac(ir); 1842 break; 1843 1844 case ir_unop_floor: 1845 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1846 dfloor_to_dfrac(ir); 1847 break; 1848 1849 case ir_unop_round_even: 1850 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1851 dround_even_to_dfrac(ir); 1852 break; 1853 1854 case ir_unop_sign: 1855 
if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1856 dsign_to_csel(ir); 1857 break; 1858 1859 case ir_unop_bit_count: 1860 if (lowering(BIT_COUNT_TO_MATH)) 1861 bit_count_to_math(ir); 1862 break; 1863 1864 case ir_triop_bitfield_extract: 1865 if (lowering(EXTRACT_TO_SHIFTS)) 1866 extract_to_shifts(ir); 1867 break; 1868 1869 case ir_quadop_bitfield_insert: 1870 if (lowering(INSERT_TO_SHIFTS)) 1871 insert_to_shifts(ir); 1872 break; 1873 1874 case ir_unop_bitfield_reverse: 1875 if (lowering(REVERSE_TO_SHIFTS)) 1876 reverse_to_shifts(ir); 1877 break; 1878 1879 case ir_unop_find_lsb: 1880 if (lowering(FIND_LSB_TO_FLOAT_CAST)) 1881 find_lsb_to_float_cast(ir); 1882 break; 1883 1884 case ir_unop_find_msb: 1885 if (lowering(FIND_MSB_TO_FLOAT_CAST)) 1886 find_msb_to_float_cast(ir); 1887 break; 1888 1889 case ir_binop_imul_high: 1890 if (lowering(IMUL_HIGH_TO_MUL)) 1891 imul_high_to_mul(ir); 1892 break; 1893 1894 case ir_binop_mul: 1895 if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) && 1896 (ir->type->base_type == GLSL_TYPE_INT64 || 1897 ir->type->base_type == GLSL_TYPE_UINT64) && 1898 (ir->operands[0]->type->base_type == GLSL_TYPE_INT || 1899 ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) 1900 mul64_to_mul_and_mul_high(ir); 1901 break; 1902 1903 case ir_unop_rsq: 1904 case ir_unop_sqrt: 1905 if (lowering(SQRT_TO_ABS_SQRT)) 1906 sqrt_to_abs_sqrt(ir); 1907 break; 1908 1909 default: 1910 return visit_continue; 1911 } 1912 1913 return visit_continue; 1914} 1915