1/* 2 * Copyright © 2010 Intel Corporation 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining a 5 * copy of this software and associated documentation files (the "Software"), 6 * to deal in the Software without restriction, including without limitation 7 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8 * and/or sell copies of the Software, and to permit persons to whom the 9 * Software is furnished to do so, subject to the following conditions: 10 * 11 * The above copyright notice and this permission notice (including the next 12 * paragraph) shall be included in all copies or substantial portions of the 13 * Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21 * DEALINGS IN THE SOFTWARE. 22 */ 23 24/** 25 * \file lower_instructions.cpp 26 * 27 * Many GPUs lack native instructions for certain expression operations, and 28 * must replace them with some other expression tree. This pass lowers some 29 * of the most common cases, allowing the lowering code to be implemented once 30 * rather than in each driver backend. 31 * 32 * Currently supported transformations: 33 * - SUB_TO_ADD_NEG 34 * - DIV_TO_MUL_RCP 35 * - INT_DIV_TO_MUL_RCP 36 * - EXP_TO_EXP2 37 * - POW_TO_EXP2 38 * - LOG_TO_LOG2 39 * - MOD_TO_FLOOR 40 * - LDEXP_TO_ARITH 41 * - DFREXP_TO_ARITH 42 * - CARRY_TO_ARITH 43 * - BORROW_TO_ARITH 44 * - SAT_TO_CLAMP 45 * - DOPS_TO_DFRAC 46 * 47 * SUB_TO_ADD_NEG: 48 * --------------- 49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1)) 50 * 51 * This simplifies expression reassociation, and for many backends 52 * there is no subtract operation separate from adding the negation. 53 * For backends with native subtract operations, they will probably 54 * want to recognize add(op0, neg(op1)) or the other way around to 55 * produce a subtract anyway. 56 * 57 * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP: 58 * --------------------------------------------------------- 59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)). 60 * 61 * Many GPUs don't have a divide instruction (945 and 965 included), 62 * but they do have an RCP instruction to compute an approximate 63 * reciprocal. By breaking the operation down, constant reciprocals 64 * can get constant folded. 65 * 66 * FDIV_TO_MUL_RCP only lowers single-precision floating point division; 67 * DDIV_TO_MUL_RCP only lowers double-precision floating point division. 68 * DIV_TO_MUL_RCP is a convenience macro that sets both flags. 69 * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating 70 * point so that RCP is possible. 71 * 72 * EXP_TO_EXP2 and LOG_TO_LOG2: 73 * ---------------------------- 74 * Many GPUs don't have a base e log or exponent instruction, but they 75 * do have base 2 versions, so this pass converts exp and log to exp2 76 * and log2 operations. 77 * 78 * POW_TO_EXP2: 79 * ----------- 80 * Many older GPUs don't have an x**y instruction. For these GPUs, convert 81 * x**y to 2**(y * log2(x)). 82 * 83 * MOD_TO_FLOOR: 84 * ------------- 85 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1)) 86 * 87 * Many GPUs don't have a MOD instruction (945 and 965 included), and 88 * if we have to break it down like this anyway, it gives an 89 * opportunity to do things like constant fold the (1.0 / op1) easily. 90 * 91 * Note: before we used to implement this as op1 * fract(op / op1) but this 92 * implementation had significant precision errors. 93 * 94 * LDEXP_TO_ARITH: 95 * ------------- 96 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources. 97 * 98 * DFREXP_DLDEXP_TO_ARITH: 99 * --------------- 100 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to 101 * arithmetic and bit ops for double arguments. 102 * 103 * CARRY_TO_ARITH: 104 * --------------- 105 * Converts ir_carry into (x + y) < x. 106 * 107 * BORROW_TO_ARITH: 108 * ---------------- 109 * Converts ir_borrow into (x < y). 110 * 111 * SAT_TO_CLAMP: 112 * ------------- 113 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0) 114 * 115 * DOPS_TO_DFRAC: 116 * -------------- 117 * Converts double trunc, ceil, floor, round to fract 118 */ 119 120#include "c99_math.h" 121#include "program/prog_instruction.h" /* for swizzle */ 122#include "compiler/glsl_types.h" 123#include "ir.h" 124#include "ir_builder.h" 125#include "ir_optimization.h" 126 127using namespace ir_builder; 128 129namespace { 130 131class lower_instructions_visitor : public ir_hierarchical_visitor { 132public: 133 lower_instructions_visitor(unsigned lower) 134 : progress(false), lower(lower) { } 135 136 ir_visitor_status visit_leave(ir_expression *); 137 138 bool progress; 139 140private: 141 unsigned lower; /** Bitfield of which operations to lower */ 142 143 void sub_to_add_neg(ir_expression *); 144 void div_to_mul_rcp(ir_expression *); 145 void int_div_to_mul_rcp(ir_expression *); 146 void mod_to_floor(ir_expression *); 147 void exp_to_exp2(ir_expression *); 148 void pow_to_exp2(ir_expression *); 149 void log_to_log2(ir_expression *); 150 void ldexp_to_arith(ir_expression *); 151 void dldexp_to_arith(ir_expression *); 152 void dfrexp_sig_to_arith(ir_expression *); 153 void dfrexp_exp_to_arith(ir_expression *); 154 void carry_to_arith(ir_expression *); 155 void borrow_to_arith(ir_expression *); 156 void sat_to_clamp(ir_expression *); 157 void double_dot_to_fma(ir_expression *); 158 void double_lrp(ir_expression *); 159 void dceil_to_dfrac(ir_expression *); 160 void dfloor_to_dfrac(ir_expression *); 161 void dround_even_to_dfrac(ir_expression *); 162 void dtrunc_to_dfrac(ir_expression *); 163 void dsign_to_csel(ir_expression *); 164 void bit_count_to_math(ir_expression *); 165 void extract_to_shifts(ir_expression *); 166 void insert_to_shifts(ir_expression *); 167 void reverse_to_shifts(ir_expression *ir); 168 void find_lsb_to_float_cast(ir_expression *ir); 169 void find_msb_to_float_cast(ir_expression *ir); 170 void imul_high_to_mul(ir_expression *ir); 171 void sqrt_to_abs_sqrt(ir_expression *ir); 172 void mul64_to_mul_and_mul_high(ir_expression *ir); 173 174 ir_expression *_carry(operand a, operand b); 175}; 176 177} /* anonymous namespace */ 178 179/** 180 * Determine if a particular type of lowering should occur 181 */ 182#define lowering(x) (this->lower & x) 183 184bool 185lower_instructions(exec_list *instructions, unsigned what_to_lower) 186{ 187 lower_instructions_visitor v(what_to_lower); 188 189 visit_list_elements(&v, instructions); 190 return v.progress; 191} 192 193void 194lower_instructions_visitor::sub_to_add_neg(ir_expression *ir) 195{ 196 ir->operation = ir_binop_add; 197 ir->init_num_operands(); 198 ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type, 199 ir->operands[1], NULL); 200 this->progress = true; 201} 202 203void 204lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir) 205{ 206 assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double()); 207 208 /* New expression for the 1.0 / op1 */ 209 ir_rvalue *expr; 210 expr = new(ir) ir_expression(ir_unop_rcp, 211 ir->operands[1]->type, 212 ir->operands[1]); 213 214 /* op0 / op1 -> op0 * (1.0 / op1) */ 215 ir->operation = ir_binop_mul; 216 ir->init_num_operands(); 217 ir->operands[1] = expr; 218 219 this->progress = true; 220} 221 222void 223lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir) 224{ 225 assert(ir->operands[1]->type->is_integer()); 226 227 /* Be careful with integer division -- we need to do it as a 228 * float and re-truncate, since rcp(n > 1) of an integer would 229 * just be 0. 230 */ 231 ir_rvalue *op0, *op1; 232 const struct glsl_type *vec_type; 233 234 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 235 ir->operands[1]->type->vector_elements, 236 ir->operands[1]->type->matrix_columns); 237 238 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) 239 op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL); 240 else 241 op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL); 242 243 op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL); 244 245 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 246 ir->operands[0]->type->vector_elements, 247 ir->operands[0]->type->matrix_columns); 248 249 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) 250 op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL); 251 else 252 op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL); 253 254 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 255 ir->type->vector_elements, 256 ir->type->matrix_columns); 257 258 op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1); 259 260 if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) { 261 ir->operation = ir_unop_f2i; 262 ir->operands[0] = op0; 263 } else { 264 ir->operation = ir_unop_i2u; 265 ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0); 266 } 267 ir->init_num_operands(); 268 ir->operands[1] = NULL; 269 270 this->progress = true; 271} 272 273void 274lower_instructions_visitor::exp_to_exp2(ir_expression *ir) 275{ 276 ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E)); 277 278 ir->operation = ir_unop_exp2; 279 ir->init_num_operands(); 280 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type, 281 ir->operands[0], log2_e); 282 this->progress = true; 283} 284 285void 286lower_instructions_visitor::pow_to_exp2(ir_expression *ir) 287{ 288 ir_expression *const log2_x = 289 new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 290 ir->operands[0]); 291 292 ir->operation = ir_unop_exp2; 293 ir->init_num_operands(); 294 ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type, 295 ir->operands[1], log2_x); 296 ir->operands[1] = NULL; 297 this->progress = true; 298} 299 300void 301lower_instructions_visitor::log_to_log2(ir_expression *ir) 302{ 303 ir->operation = ir_binop_mul; 304 ir->init_num_operands(); 305 ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 306 ir->operands[0], NULL); 307 ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E)); 308 this->progress = true; 309} 310 311void 312lower_instructions_visitor::mod_to_floor(ir_expression *ir) 313{ 314 ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x", 315 ir_var_temporary); 316 ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y", 317 ir_var_temporary); 318 this->base_ir->insert_before(x); 319 this->base_ir->insert_before(y); 320 321 ir_assignment *const assign_x = 322 new(ir) ir_assignment(new(ir) ir_dereference_variable(x), 323 ir->operands[0]); 324 ir_assignment *const assign_y = 325 new(ir) ir_assignment(new(ir) ir_dereference_variable(y), 326 ir->operands[1]); 327 328 this->base_ir->insert_before(assign_x); 329 this->base_ir->insert_before(assign_y); 330 331 ir_expression *const div_expr = 332 new(ir) ir_expression(ir_binop_div, x->type, 333 new(ir) ir_dereference_variable(x), 334 new(ir) ir_dereference_variable(y)); 335 336 /* Don't generate new IR that would need to be lowered in an additional 337 * pass. 338 */ 339 if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) || 340 (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double())) 341 div_to_mul_rcp(div_expr); 342 343 ir_expression *const floor_expr = 344 new(ir) ir_expression(ir_unop_floor, x->type, div_expr); 345 346 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 347 dfloor_to_dfrac(floor_expr); 348 349 ir_expression *const mul_expr = 350 new(ir) ir_expression(ir_binop_mul, 351 new(ir) ir_dereference_variable(y), 352 floor_expr); 353 354 ir->operation = ir_binop_sub; 355 ir->init_num_operands(); 356 ir->operands[0] = new(ir) ir_dereference_variable(x); 357 ir->operands[1] = mul_expr; 358 this->progress = true; 359} 360 361void 362lower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 363{ 364 /* Translates 365 * ir_binop_ldexp x exp 366 * into 367 * 368 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 369 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 370 * 371 * if (extracted_biased_exp >= 255) 372 * return x; // +/-inf, NaN 373 * 374 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 375 * 376 * if (min(resulting_biased_exp, extracted_biased_exp) < 1) 377 * resulting_biased_exp = 0; 378 * if (resulting_biased_exp >= 255 || 379 * min(resulting_biased_exp, extracted_biased_exp) < 1) { 380 * sign_mantissa &= sign_mask; 381 * } 382 * 383 * return bitcast_u2f(sign_mantissa | 384 * lshift(i2u(resulting_biased_exp), exp_shift)); 385 * 386 * which we can't actually implement as such, since the GLSL IR doesn't 387 * have vectorized if-statements. We actually implement it without branches 388 * using conditional-select: 389 * 390 * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 391 * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 392 * 393 * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 394 * 395 * flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0); 396 * resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp) 397 * zero_mantissa = logic_or(flush_to_zero, 398 * gequal(resulting_biased_exp, 255)); 399 * sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa); 400 * 401 * result = sign_mantissa | 402 * lshift(i2u(resulting_biased_exp), exp_shift)); 403 * 404 * return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result)); 405 * 406 * The definition of ldexp in the GLSL spec says: 407 * 408 * "If this product is too large to be represented in the 409 * floating-point type, the result is undefined." 410 * 411 * However, the definition of ldexp in the GLSL ES spec does not contain 412 * this sentence, so we do need to handle overflow correctly. 413 * 414 * There is additional language limiting the defined range of exp, but this 415 * is merely to allow implementations that store 2^exp in a temporary 416 * variable. 417 */ 418 419 const unsigned vec_elem = ir->type->vector_elements; 420 421 /* Types */ 422 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 423 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 424 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 425 426 /* Temporary variables */ 427 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 428 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 429 ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary); 430 431 ir_variable *extracted_biased_exp = 432 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 433 ir_variable *resulting_biased_exp = 434 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 435 436 ir_variable *sign_mantissa = 437 new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary); 438 439 ir_variable *flush_to_zero = 440 new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary); 441 ir_variable *zero_mantissa = 442 new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary); 443 444 ir_instruction &i = *base_ir; 445 446 /* Copy <x> and <exp> arguments. */ 447 i.insert_before(x); 448 i.insert_before(assign(x, ir->operands[0])); 449 i.insert_before(exp); 450 i.insert_before(assign(exp, ir->operands[1])); 451 452 /* Extract the biased exponent from <x>. */ 453 i.insert_before(extracted_biased_exp); 454 i.insert_before(assign(extracted_biased_exp, 455 rshift(bitcast_f2i(abs(x)), 456 new(ir) ir_constant(23, vec_elem)))); 457 458 /* The definition of ldexp in the GLSL 4.60 spec says: 459 * 460 * "If exp is greater than +128 (single-precision) or +1024 461 * (double-precision), the value returned is undefined. If exp is less 462 * than -126 (single-precision) or -1022 (double-precision), the value 463 * returned may be flushed to zero." 464 * 465 * So we do not have to guard against the possibility of addition overflow, 466 * which could happen when exp is close to INT_MAX. Addition underflow 467 * cannot happen (the worst case is 0 + (-INT_MAX)). 468 */ 469 i.insert_before(resulting_biased_exp); 470 i.insert_before(assign(resulting_biased_exp, 471 min2(add(extracted_biased_exp, exp), 472 new(ir) ir_constant(255, vec_elem)))); 473 474 i.insert_before(sign_mantissa); 475 i.insert_before(assign(sign_mantissa, 476 bit_and(bitcast_f2u(x), 477 new(ir) ir_constant(0x807fffffu, vec_elem)))); 478 479 /* We flush to zero if the original or resulting biased exponent is 0, 480 * indicating a +/-0.0 or subnormal input or output. 481 * 482 * The mantissa is set to 0 if the resulting biased exponent is 255, since 483 * an overflow should produce a +/-inf result. 484 * 485 * Note that NaN inputs are handled separately. 486 */ 487 i.insert_before(flush_to_zero); 488 i.insert_before(assign(flush_to_zero, 489 lequal(min2(resulting_biased_exp, 490 extracted_biased_exp), 491 ir_constant::zero(ir, ivec)))); 492 i.insert_before(assign(resulting_biased_exp, 493 csel(flush_to_zero, 494 ir_constant::zero(ir, ivec), 495 resulting_biased_exp))); 496 497 i.insert_before(zero_mantissa); 498 i.insert_before(assign(zero_mantissa, 499 logic_or(flush_to_zero, 500 equal(resulting_biased_exp, 501 new(ir) ir_constant(255, vec_elem))))); 502 i.insert_before(assign(sign_mantissa, 503 csel(zero_mantissa, 504 bit_and(sign_mantissa, 505 new(ir) ir_constant(0x80000000u, vec_elem)), 506 sign_mantissa))); 507 508 /* Don't generate new IR that would need to be lowered in an additional 509 * pass. 510 */ 511 i.insert_before(result); 512 if (!lowering(INSERT_TO_SHIFTS)) { 513 i.insert_before(assign(result, 514 bitfield_insert(sign_mantissa, 515 i2u(resulting_biased_exp), 516 new(ir) ir_constant(23u, vec_elem), 517 new(ir) ir_constant(8u, vec_elem)))); 518 } else { 519 i.insert_before(assign(result, 520 bit_or(sign_mantissa, 521 lshift(i2u(resulting_biased_exp), 522 new(ir) ir_constant(23, vec_elem))))); 523 } 524 525 ir->operation = ir_triop_csel; 526 ir->init_num_operands(); 527 ir->operands[0] = gequal(extracted_biased_exp, 528 new(ir) ir_constant(255, vec_elem)); 529 ir->operands[1] = new(ir) ir_dereference_variable(x); 530 ir->operands[2] = bitcast_u2f(result); 531 532 this->progress = true; 533} 534 535void 536lower_instructions_visitor::dldexp_to_arith(ir_expression *ir) 537{ 538 /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent 539 * from the significand. 540 */ 541 542 const unsigned vec_elem = ir->type->vector_elements; 543 544 /* Types */ 545 const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 546 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 547 548 /* Constants */ 549 ir_constant *zeroi = ir_constant::zero(ir, ivec); 550 551 ir_constant *sign_mask = new(ir) ir_constant(0x80000000u); 552 553 ir_constant *exp_shift = new(ir) ir_constant(20u); 554 ir_constant *exp_width = new(ir) ir_constant(11u); 555 ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem); 556 557 /* Temporary variables */ 558 ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 559 ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 560 561 ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x", 562 ir_var_temporary); 563 564 ir_variable *extracted_biased_exp = 565 new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 566 ir_variable *resulting_biased_exp = 567 new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 568 569 ir_variable *is_not_zero_or_underflow = 570 new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary); 571 572 ir_instruction &i = *base_ir; 573 574 /* Copy <x> and <exp> arguments. */ 575 i.insert_before(x); 576 i.insert_before(assign(x, ir->operands[0])); 577 i.insert_before(exp); 578 i.insert_before(assign(exp, ir->operands[1])); 579 580 ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x); 581 if (lowering(DFREXP_DLDEXP_TO_ARITH)) 582 dfrexp_exp_to_arith(frexp_exp); 583 584 /* Extract the biased exponent from <x>. */ 585 i.insert_before(extracted_biased_exp); 586 i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias))); 587 588 i.insert_before(resulting_biased_exp); 589 i.insert_before(assign(resulting_biased_exp, 590 add(extracted_biased_exp, exp))); 591 592 /* Test if result is ±0.0, subnormal, or underflow by checking if the 593 * resulting biased exponent would be less than 0x1. If so, the result is 594 * 0.0 with the sign of x. (Actually, invert the conditions so that 595 * immediate values are the second arguments, which is better for i965) 596 * TODO: Implement in a vector fashion. 597 */ 598 i.insert_before(zero_sign_x); 599 for (unsigned elem = 0; elem < vec_elem; elem++) { 600 ir_variable *unpacked = 601 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 602 i.insert_before(unpacked); 603 i.insert_before( 604 assign(unpacked, 605 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 606 i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)), 607 WRITEMASK_Y)); 608 i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X)); 609 i.insert_before(assign(zero_sign_x, 610 expr(ir_unop_pack_double_2x32, unpacked), 611 1 << elem)); 612 } 613 i.insert_before(is_not_zero_or_underflow); 614 i.insert_before(assign(is_not_zero_or_underflow, 615 gequal(resulting_biased_exp, 616 new(ir) ir_constant(0x1, vec_elem)))); 617 i.insert_before(assign(x, csel(is_not_zero_or_underflow, 618 x, zero_sign_x))); 619 i.insert_before(assign(resulting_biased_exp, 620 csel(is_not_zero_or_underflow, 621 resulting_biased_exp, zeroi))); 622 623 /* We could test for overflows by checking if the resulting biased exponent 624 * would be greater than 0xFE. Turns out we don't need to because the GLSL 625 * spec says: 626 * 627 * "If this product is too large to be represented in the 628 * floating-point type, the result is undefined." 629 */ 630 631 ir_rvalue *results[4] = {NULL}; 632 for (unsigned elem = 0; elem < vec_elem; elem++) { 633 ir_variable *unpacked = 634 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 635 i.insert_before(unpacked); 636 i.insert_before( 637 assign(unpacked, 638 expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 639 640 ir_expression *bfi = bitfield_insert( 641 swizzle_y(unpacked), 642 i2u(swizzle(resulting_biased_exp, elem, 1)), 643 exp_shift->clone(ir, NULL), 644 exp_width->clone(ir, NULL)); 645 646 i.insert_before(assign(unpacked, bfi, WRITEMASK_Y)); 647 648 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 649 } 650 651 ir->operation = ir_quadop_vector; 652 ir->init_num_operands(); 653 ir->operands[0] = results[0]; 654 ir->operands[1] = results[1]; 655 ir->operands[2] = results[2]; 656 ir->operands[3] = results[3]; 657 658 /* Don't generate new IR that would need to be lowered in an additional 659 * pass. 660 */ 661 662 this->progress = true; 663} 664 665void 666lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir) 667{ 668 const unsigned vec_elem = ir->type->vector_elements; 669 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 670 671 /* Double-precision floating-point values are stored as 672 * 1 sign bit; 673 * 11 exponent bits; 674 * 52 mantissa bits. 675 * 676 * We're just extracting the significand here, so we only need to modify 677 * the upper 32-bit uint. Unfortunately we must extract each double 678 * independently as there is no vector version of unpackDouble. 679 */ 680 681 ir_instruction &i = *base_ir; 682 683 ir_variable *is_not_zero = 684 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 685 ir_rvalue *results[4] = {NULL}; 686 687 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 688 i.insert_before(is_not_zero); 689 i.insert_before( 690 assign(is_not_zero, 691 nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero))); 692 693 /* TODO: Remake this as more vector-friendly when int64 support is 694 * available. 695 */ 696 for (unsigned elem = 0; elem < vec_elem; elem++) { 697 ir_constant *zero = new(ir) ir_constant(0u, 1); 698 ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1); 699 700 /* Exponent of double floating-point values in the range [0.5, 1.0). */ 701 ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1); 702 703 ir_variable *bits = 704 new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary); 705 ir_variable *unpacked = 706 new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 707 708 ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1); 709 710 i.insert_before(bits); 711 i.insert_before(unpacked); 712 i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x))); 713 714 /* Manipulate the high uint to remove the exponent and replace it with 715 * either the default exponent or zero. 716 */ 717 i.insert_before(assign(bits, swizzle_y(unpacked))); 718 i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask))); 719 i.insert_before(assign(bits, bit_or(bits, 720 csel(swizzle(is_not_zero, elem, 1), 721 exponent_value, 722 zero)))); 723 i.insert_before(assign(unpacked, bits, WRITEMASK_Y)); 724 results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 725 } 726 727 /* Put the dvec back together */ 728 ir->operation = ir_quadop_vector; 729 ir->init_num_operands(); 730 ir->operands[0] = results[0]; 731 ir->operands[1] = results[1]; 732 ir->operands[2] = results[2]; 733 ir->operands[3] = results[3]; 734 735 this->progress = true; 736} 737 738void 739lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir) 740{ 741 const unsigned vec_elem = ir->type->vector_elements; 742 const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 743 const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 744 745 /* Double-precision floating-point values are stored as 746 * 1 sign bit; 747 * 11 exponent bits; 748 * 52 mantissa bits. 749 * 750 * We're just extracting the exponent here, so we only care about the upper 751 * 32-bit uint. 752 */ 753 754 ir_instruction &i = *base_ir; 755 756 ir_variable *is_not_zero = 757 new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 758 ir_variable *high_words = 759 new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 760 ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 761 ir_constant *izero = new(ir) ir_constant(0, vec_elem); 762 763 ir_rvalue *absval = abs(ir->operands[0]); 764 765 i.insert_before(is_not_zero); 766 i.insert_before(high_words); 767 i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 768 769 /* Extract all of the upper uints. */ 770 for (unsigned elem = 0; elem < vec_elem; elem++) { 771 ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 772 773 i.insert_before(assign(high_words, 774 swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 775 1 << elem)); 776 777 } 778 ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 779 ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 780 781 /* For non-zero inputs, shift the exponent down and apply bias. */ 782 ir->operation = ir_triop_csel; 783 ir->init_num_operands(); 784 ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 785 ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 786 ir->operands[2] = izero; 787 788 this->progress = true; 789} 790 791void 792lower_instructions_visitor::carry_to_arith(ir_expression *ir) 793{ 794 /* Translates 795 * ir_binop_carry x y 796 * into 797 * sum = ir_binop_add x y 798 * bcarry = ir_binop_less sum x 799 * carry = ir_unop_b2i bcarry 800 */ 801 802 ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 803 ir->operation = ir_unop_i2u; 804 ir->init_num_operands(); 805 ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 806 ir->operands[1] = NULL; 807 808 this->progress = true; 809} 810 811void 812lower_instructions_visitor::borrow_to_arith(ir_expression *ir) 813{ 814 /* Translates 815 * ir_binop_borrow x y 816 * into 817 * bcarry = ir_binop_less x y 818 * carry = ir_unop_b2i bcarry 819 */ 820 821 ir->operation = ir_unop_i2u; 822 ir->init_num_operands(); 823 ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 824 ir->operands[1] = NULL; 825 826 this->progress = true; 827} 828 829void 830lower_instructions_visitor::sat_to_clamp(ir_expression *ir) 831{ 832 /* Translates 833 * ir_unop_saturate x 834 * into 835 * ir_binop_min (ir_binop_max(x, 0.0), 1.0) 836 */ 837 838 ir->operation = ir_binop_min; 839 ir->init_num_operands(); 840 ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type, 841 ir->operands[0], 842 new(ir) ir_constant(0.0f)); 843 ir->operands[1] = new(ir) ir_constant(1.0f); 844 845 this->progress = true; 846} 847 848void 849lower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 850{ 851 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 852 ir_var_temporary); 853 this->base_ir->insert_before(temp); 854 855 int nc = ir->operands[0]->type->components(); 856 for (int i = nc - 1; i >= 1; i--) { 857 ir_assignment *assig; 858 if (i == (nc - 1)) { 859 assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 860 swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 861 } else { 862 assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 863 swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 864 temp)); 865 } 866 this->base_ir->insert_before(assig); 867 } 868 869 ir->operation = ir_triop_fma; 870 ir->init_num_operands(); 871 ir->operands[0] = swizzle(ir->operands[0], 0, 1); 872 ir->operands[1] = swizzle(ir->operands[1], 0, 1); 873 ir->operands[2] = new(ir) ir_dereference_variable(temp); 874 875 this->progress = true; 876 877} 878 879void 880lower_instructions_visitor::double_lrp(ir_expression *ir) 881{ 882 int swizval; 883 ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 884 ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 885 886 switch (op2->type->vector_elements) { 887 case 1: 888 swizval = SWIZZLE_XXXX; 889 break; 890 default: 891 assert(op0->type->vector_elements == op2->type->vector_elements); 892 swizval = SWIZZLE_XYZW; 893 break; 894 } 895 896 ir->operation = ir_triop_fma; 897 ir->init_num_operands(); 898 ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 899 ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 900 901 this->progress = true; 902} 903 904void 905lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 906{ 907 /* 908 * frtemp = frac(x); 909 * temp = sub(x, frtemp); 910 * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0); 911 */ 912 ir_instruction &i = *base_ir; 913 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 914 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 915 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 916 ir_var_temporary); 917 918 i.insert_before(frtemp); 919 i.insert_before(assign(frtemp, fract(ir->operands[0]))); 920 921 ir->operation = ir_binop_add; 922 ir->init_num_operands(); 923 ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 924 ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 925 926 this->progress = true; 927} 928 929void 930lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 931{ 932 /* 933 * frtemp = frac(x); 934 * result = sub(x, frtemp); 935 */ 936 ir->operation = ir_binop_sub; 937 ir->init_num_operands(); 938 ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 939 940 this->progress = true; 941} 942void 943lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 944{ 945 /* 946 * insane but works 947 * temp = x + 0.5; 948 * frtemp = frac(temp); 949 * t2 = sub(temp, frtemp); 950 * if (frac(x) == 0.5) 951 * result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1; 952 * else 953 * result = t2; 954 955 */ 956 ir_instruction &i = *base_ir; 957 ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 958 ir_var_temporary); 959 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 960 ir_var_temporary); 961 ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 962 ir_var_temporary); 963 ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 964 ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 965 ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 966 967 i.insert_before(temp); 968 i.insert_before(assign(temp, add(ir->operands[0], p5))); 969 970 i.insert_before(frtemp); 971 i.insert_before(assign(frtemp, fract(temp))); 972 973 i.insert_before(t2); 974 i.insert_before(assign(t2, sub(temp, frtemp))); 975 976 ir->operation = ir_triop_csel; 977 ir->init_num_operands(); 978 ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 979 p5->clone(ir, NULL)); 980 ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 981 zero), 982 t2, 983 sub(t2, one)); 984 ir->operands[2] = new(ir) ir_dereference_variable(t2); 985 986 this->progress = true; 987} 988 989void 990lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 991{ 992 /* 993 * frtemp = frac(x); 994 * temp = sub(x, frtemp); 995 * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1; 996 */ 997 ir_rvalue *arg = ir->operands[0]; 998 ir_instruction &i = *base_ir; 999 1000 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1001 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1002 ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 1003 ir_var_temporary); 1004 ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 1005 ir_var_temporary); 1006 1007 i.insert_before(frtemp); 1008 i.insert_before(assign(frtemp, fract(arg))); 1009 i.insert_before(temp); 1010 i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 1011 1012 ir->operation = ir_triop_csel; 1013 ir->init_num_operands(); 1014 ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 1015 ir->operands[1] = new (ir) ir_dereference_variable(temp); 1016 ir->operands[2] = add(temp, 1017 csel(equal(frtemp, zero->clone(ir, NULL)), 1018 zero->clone(ir, NULL), 1019 one)); 1020 1021 this->progress = true; 1022} 1023 1024void 1025lower_instructions_visitor::dsign_to_csel(ir_expression *ir) 1026{ 1027 /* 1028 * temp = x > 0.0 ? 1.0 : 0.0; 1029 * result = x < 0.0 ? -1.0 : temp; 1030 */ 1031 ir_rvalue *arg = ir->operands[0]; 1032 ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1033 ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1034 ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 1035 1036 ir->operation = ir_triop_csel; 1037 ir->init_num_operands(); 1038 ir->operands[0] = less(arg->clone(ir, NULL), 1039 zero->clone(ir, NULL)); 1040 ir->operands[1] = neg_one; 1041 ir->operands[2] = csel(greater(arg, zero), 1042 one, 1043 zero->clone(ir, NULL)); 1044 1045 this->progress = true; 1046} 1047 1048void 1049lower_instructions_visitor::bit_count_to_math(ir_expression *ir) 1050{ 1051 /* For more details, see: 1052 * 1053 * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel 1054 */ 1055 const unsigned elements = ir->operands[0]->type->vector_elements; 1056 ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 1057 ir_var_temporary); 1058 ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 1059 ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 1060 ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 1061 ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 1062 ir_constant *c1 = new(ir) ir_constant(1u); 1063 ir_constant *c2 = new(ir) ir_constant(2u); 1064 ir_constant *c4 = new(ir) ir_constant(4u); 1065 ir_constant *c24 = new(ir) ir_constant(24u); 1066 1067 base_ir->insert_before(temp); 1068 1069 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1070 base_ir->insert_before(assign(temp, ir->operands[0])); 1071 } else { 1072 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1073 base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 1074 } 1075 1076 /* temp = temp - ((temp >> 1) & 0x55555555u); */ 1077 base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 1078 c55555555)))); 1079 1080 /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 1081 base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 1082 bit_and(rshift(temp, c2), 1083 c33333333->clone(ir, NULL))))); 1084 1085 /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */ 1086 ir->operation = ir_unop_u2i; 1087 ir->init_num_operands(); 1088 ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 1089 c01010101), 1090 c24); 1091 1092 this->progress = true; 1093} 1094 1095void 1096lower_instructions_visitor::extract_to_shifts(ir_expression *ir) 1097{ 1098 ir_variable *bits = 1099 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1100 1101 base_ir->insert_before(bits); 1102 base_ir->insert_before(assign(bits, ir->operands[2])); 1103 1104 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1105 ir_constant *c1 = 1106 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1107 ir_constant *c32 = 1108 new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1109 ir_constant *cFFFFFFFF = 1110 new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1111 1112 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1113 * we'd get a mask of 0 when bits is 32. Special case it. 1114 * 1115 * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 1116 */ 1117 ir_expression *mask = csel(equal(bits, c32), 1118 cFFFFFFFF, 1119 sub(lshift(c1, bits), c1->clone(ir, NULL))); 1120 1121 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1122 * 1123 * If bits is zero, the result will be zero. 1124 * 1125 * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 1126 * select as in the signed integer case. 1127 * 1128 * (value >> offset) & mask; 1129 */ 1130 ir->operation = ir_binop_bit_and; 1131 ir->init_num_operands(); 1132 ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 1133 ir->operands[1] = mask; 1134 ir->operands[2] = NULL; 1135 } else { 1136 ir_constant *c0 = 1137 new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 1138 ir_constant *c32 = 1139 new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1140 ir_variable *temp = 1141 new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 1142 1143 /* temp = 32 - bits; */ 1144 base_ir->insert_before(temp); 1145 base_ir->insert_before(assign(temp, sub(c32, bits))); 1146 1147 /* expr = value << (temp - offset)) >> temp; */ 1148 ir_expression *expr = 1149 rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 1150 1151 /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1152 * 1153 * If bits is zero, the result will be zero. 1154 * 1155 * Due to the (x << (y%32)) behavior mentioned before, the (value << 1156 * (32-0)) doesn't "erase" all of the data as we would like, so finish 1157 * up with: 1158 * 1159 * (bits == 0) ? 0 : e; 1160 */ 1161 ir->operation = ir_triop_csel; 1162 ir->init_num_operands(); 1163 ir->operands[0] = equal(c0, bits); 1164 ir->operands[1] = c0->clone(ir, NULL); 1165 ir->operands[2] = expr; 1166 } 1167 1168 this->progress = true; 1169} 1170 1171void 1172lower_instructions_visitor::insert_to_shifts(ir_expression *ir) 1173{ 1174 ir_constant *c1; 1175 ir_constant *c32; 1176 ir_constant *cFFFFFFFF; 1177 ir_variable *offset = 1178 new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 1179 ir_variable *bits = 1180 new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1181 ir_variable *mask = 1182 new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 1183 1184 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1185 c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 1186 c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1187 cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 1188 } else { 1189 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1190 1191 c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1192 c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1193 cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1194 } 1195 1196 base_ir->insert_before(offset); 1197 base_ir->insert_before(assign(offset, ir->operands[2])); 1198 1199 base_ir->insert_before(bits); 1200 base_ir->insert_before(assign(bits, ir->operands[3])); 1201 1202 /* At least some hardware treats (x << y) as (x << (y%32)). This means 1203 * we'd get a mask of 0 when bits is 32. Special case it. 1204 * 1205 * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset; 1206 * 1207 * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1208 * 1209 * The result will be undefined if offset or bits is negative, or if the 1210 * sum of offset and bits is greater than the number of bits used to 1211 * store the operand. 1212 * 1213 * Since it's undefined, there are a couple other ways this could be 1214 * implemented. The other way that was considered was to put the csel 1215 * around the whole thing: 1216 * 1217 * final_result = bits == 32 ? insert : ... ; 1218 */ 1219 base_ir->insert_before(mask); 1220 1221 base_ir->insert_before(assign(mask, csel(equal(bits, c32), 1222 cFFFFFFFF, 1223 lshift(sub(lshift(c1, bits), 1224 c1->clone(ir, NULL)), 1225 offset)))); 1226 1227 /* (base & ~mask) | ((insert << offset) & mask) */ 1228 ir->operation = ir_binop_bit_or; 1229 ir->init_num_operands(); 1230 ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 1231 ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 1232 ir->operands[2] = NULL; 1233 ir->operands[3] = NULL; 1234 1235 this->progress = true; 1236} 1237 1238void 1239lower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 1240{ 1241 /* For more details, see: 1242 * 1243 * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 1244 */ 1245 ir_constant *c1 = 1246 new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1247 ir_constant *c2 = 1248 new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 1249 ir_constant *c4 = 1250 new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 1251 ir_constant *c8 = 1252 new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 1253 ir_constant *c16 = 1254 new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 1255 ir_constant *c33333333 = 1256 new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 1257 ir_constant *c55555555 = 1258 new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements); 1259 ir_constant *c0F0F0F0F = 1260 new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 1261 ir_constant *c00FF00FF = 1262 new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 1263 ir_variable *temp = 1264 new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 1265 "temp", ir_var_temporary); 1266 ir_instruction &i = *base_ir; 1267 1268 i.insert_before(temp); 1269 1270 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1271 i.insert_before(assign(temp, ir->operands[0])); 1272 } else { 1273 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1274 i.insert_before(assign(temp, i2u(ir->operands[0]))); 1275 } 1276 1277 /* Swap odd and even bits. 1278 * 1279 * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 1280 */ 1281 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 1282 lshift(bit_and(temp, c55555555->clone(ir, NULL)), 1283 c1->clone(ir, NULL))))); 1284 /* Swap consecutive pairs. 1285 * 1286 * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 1287 */ 1288 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 1289 lshift(bit_and(temp, c33333333->clone(ir, NULL)), 1290 c2->clone(ir, NULL))))); 1291 1292 /* Swap nibbles. 1293 * 1294 * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 1295 */ 1296 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 1297 lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 1298 c4->clone(ir, NULL))))); 1299 1300 /* The last step is, basically, bswap. Swap the bytes, then swap the 1301 * words. When this code is run through GCC on x86, it does generate a 1302 * bswap instruction. 1303 * 1304 * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 1305 * temp = ( temp >> 16 ) | ( temp << 16); 1306 */ 1307 i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 1308 lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 1309 c8->clone(ir, NULL))))); 1310 1311 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1312 ir->operation = ir_binop_bit_or; 1313 ir->init_num_operands(); 1314 ir->operands[0] = rshift(temp, c16); 1315 ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 1316 } else { 1317 ir->operation = ir_unop_u2i; 1318 ir->init_num_operands(); 1319 ir->operands[0] = bit_or(rshift(temp, c16), 1320 lshift(temp, c16->clone(ir, NULL))); 1321 } 1322 1323 this->progress = true; 1324} 1325 1326void 1327lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 1328{ 1329 /* For more details, see: 1330 * 1331 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1332 */ 1333 const unsigned elements = ir->operands[0]->type->vector_elements; 1334 ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements); 1335 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1336 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1337 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1338 ir_variable *temp = 1339 new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 1340 ir_variable *lsb_only = 1341 new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 1342 ir_variable *as_float = 1343 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1344 ir_variable *lsb = 1345 new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 1346 1347 ir_instruction &i = *base_ir; 1348 1349 i.insert_before(temp); 1350 1351 if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1352 i.insert_before(assign(temp, ir->operands[0])); 1353 } else { 1354 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1355 i.insert_before(assign(temp, u2i(ir->operands[0]))); 1356 } 1357 1358 /* The int-to-float conversion is lossless because (value & -value) is 1359 * either a power of two or zero. We don't use the result in the zero 1360 * case. The uint() cast is necessary so that 0x80000000 does not 1361 * generate a negative value. 1362 * 1363 * uint lsb_only = uint(value & -value); 1364 * float as_float = float(lsb_only); 1365 */ 1366 i.insert_before(lsb_only); 1367 i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 1368 1369 i.insert_before(as_float); 1370 i.insert_before(assign(as_float, u2f(lsb_only))); 1371 1372 /* This is basically an open-coded frexp. Implementations that have a 1373 * native frexp instruction would be better served by that. This is 1374 * optimized versus a full-featured open-coded implementation in two ways: 1375 * 1376 * - We don't care about a correct result from subnormal numbers (including 1377 * 0.0), so the raw exponent can always be safely unbiased. 1378 * 1379 * - The value cannot be negative, so it does not need to be masked off to 1380 * extract the exponent. 1381 * 1382 * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1383 */ 1384 i.insert_before(lsb); 1385 i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1386 1387 /* Use lsb_only in the comparison instead of temp so that the & (far above) 1388 * can possibly generate the result without an explicit comparison. 1389 * 1390 * (lsb_only == 0) ? -1 : lsb; 1391 * 1392 * Since our input values are all integers, the unbiased exponent must not 1393 * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 1394 * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 1395 * better is likely GPU dependent. Either way, the difference should be 1396 * small. 1397 */ 1398 ir->operation = ir_triop_csel; 1399 ir->init_num_operands(); 1400 ir->operands[0] = equal(lsb_only, c0); 1401 ir->operands[1] = cminus1; 1402 ir->operands[2] = new(ir) ir_dereference_variable(lsb); 1403 1404 this->progress = true; 1405} 1406 1407void 1408lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 1409{ 1410 /* For more details, see: 1411 * 1412 * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1413 */ 1414 const unsigned elements = ir->operands[0]->type->vector_elements; 1415 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1416 ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1417 ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1418 ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1419 ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 1420 ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 1421 ir_variable *temp = 1422 new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 1423 ir_variable *as_float = 1424 new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1425 ir_variable *msb = 1426 new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 1427 1428 ir_instruction &i = *base_ir; 1429 1430 i.insert_before(temp); 1431 1432 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1433 i.insert_before(assign(temp, ir->operands[0])); 1434 } else { 1435 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1436 1437 /* findMSB(uint(abs(some_int))) almost always does the right thing. 1438 * There are two problem values: 1439 * 1440 * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 1441 * 31. However, findMSB(int(0x80000000)) == 30. 1442 * 1443 * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 1444 * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1445 * 1446 * For a value of zero or negative one, -1 will be returned. 1447 * 1448 * For all negative number cases, including 0x80000000 and 0xffffffff, 1449 * the correct value is obtained from findMSB if instead of negating the 1450 * (already negative) value the logical-not is used. A conditonal 1451 * logical-not can be achieved in two instructions. 1452 */ 1453 ir_variable *as_int = 1454 new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 1455 ir_constant *c31 = new(ir) ir_constant(int(31), elements); 1456 1457 i.insert_before(as_int); 1458 i.insert_before(assign(as_int, ir->operands[0])); 1459 i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 1460 as_int, 1461 rshift(as_int, c31))))); 1462 } 1463 1464 /* The int-to-float conversion is lossless because bits are conditionally 1465 * masked off the bottom of temp to ensure the value has at most 24 bits of 1466 * data or is zero. We don't use the result in the zero case. The uint() 1467 * cast is necessary so that 0x80000000 does not generate a negative value. 1468 * 1469 * float as_float = float(temp > 255 ? temp & ~255 : temp); 1470 */ 1471 i.insert_before(as_float); 1472 i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 1473 bit_and(temp, cFFFFFF00), 1474 temp)))); 1475 1476 /* This is basically an open-coded frexp. Implementations that have a 1477 * native frexp instruction would be better served by that. This is 1478 * optimized versus a full-featured open-coded implementation in two ways: 1479 * 1480 * - We don't care about a correct result from subnormal numbers (including 1481 * 0.0), so the raw exponent can always be safely unbiased. 1482 * 1483 * - The value cannot be negative, so it does not need to be masked off to 1484 * extract the exponent. 1485 * 1486 * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1487 */ 1488 i.insert_before(msb); 1489 i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1490 1491 /* Use msb in the comparison instead of temp so that the subtract can 1492 * possibly generate the result without an explicit comparison. 1493 * 1494 * (msb < 0) ? -1 : msb; 1495 * 1496 * Since our input values are all integers, the unbiased exponent must not 1497 * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 1498 */ 1499 ir->operation = ir_triop_csel; 1500 ir->init_num_operands(); 1501 ir->operands[0] = less(msb, c0); 1502 ir->operands[1] = cminus1; 1503 ir->operands[2] = new(ir) ir_dereference_variable(msb); 1504 1505 this->progress = true; 1506} 1507 1508ir_expression * 1509lower_instructions_visitor::_carry(operand a, operand b) 1510{ 1511 if (lowering(CARRY_TO_ARITH)) 1512 return i2u(b2i(less(add(a, b), 1513 a.val->clone(ralloc_parent(a.val), NULL)))); 1514 else 1515 return carry(a, b); 1516} 1517 1518void 1519lower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 1520{ 1521 /* ABCD 1522 * * EFGH 1523 * ====== 1524 * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1525 * 1526 * In GLSL, (a * b) becomes 1527 * 1528 * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 1529 * uint m2 = (a & 0x0000ffffu) * (b >> 16); 1530 * uint m3 = (a >> 16) * (b & 0x0000ffffu); 1531 * uint m4 = (a >> 16) * (b >> 16); 1532 * 1533 * uint c1; 1534 * uint c2; 1535 * uint lo_result; 1536 * uint hi_result; 1537 * 1538 * lo_result = uaddCarry(m1, m2 << 16, c1); 1539 * hi_result = m4 + c1; 1540 * lo_result = uaddCarry(lo_result, m3 << 16, c2); 1541 * hi_result = hi_result + c2; 1542 * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 1543 */ 1544 const unsigned elements = ir->operands[0]->type->vector_elements; 1545 ir_variable *src1 = 1546 new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary); 1547 ir_variable *src1h = 1548 new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 1549 ir_variable *src1l = 1550 new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 1551 ir_variable *src2 = 1552 new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 1553 ir_variable *src2h = 1554 new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 1555 ir_variable *src2l = 1556 new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 1557 ir_variable *t1 = 1558 new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 1559 ir_variable *t2 = 1560 new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 1561 ir_variable *lo = 1562 new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 1563 ir_variable *hi = 1564 new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary); 1565 ir_variable *different_signs = NULL; 1566 ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 1567 ir_constant *c16 = new(ir) ir_constant(16u, elements); 1568 1569 ir_instruction &i = *base_ir; 1570 1571 i.insert_before(src1); 1572 i.insert_before(src2); 1573 i.insert_before(src1h); 1574 i.insert_before(src2h); 1575 i.insert_before(src1l); 1576 i.insert_before(src2l); 1577 1578 if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1579 i.insert_before(assign(src1, ir->operands[0])); 1580 i.insert_before(assign(src2, ir->operands[1])); 1581 } else { 1582 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1583 1584 ir_variable *itmp1 = 1585 new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 1586 ir_variable *itmp2 = 1587 new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 1588 ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1589 1590 i.insert_before(itmp1); 1591 i.insert_before(itmp2); 1592 i.insert_before(assign(itmp1, ir->operands[0])); 1593 i.insert_before(assign(itmp2, ir->operands[1])); 1594 1595 different_signs = 1596 new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 1597 ir_var_temporary); 1598 1599 i.insert_before(different_signs); 1600 i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 1601 less(itmp1, c0), 1602 less(itmp2, c0->clone(ir, NULL))))); 1603 1604 i.insert_before(assign(src1, i2u(abs(itmp1)))); 1605 i.insert_before(assign(src2, i2u(abs(itmp2)))); 1606 } 1607 1608 i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 1609 i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 1610 i.insert_before(assign(src1h, rshift(src1, c16))); 1611 i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 1612 1613 i.insert_before(lo); 1614 i.insert_before(hi); 1615 i.insert_before(t1); 1616 i.insert_before(t2); 1617 1618 i.insert_before(assign(lo, mul(src1l, src2l))); 1619 i.insert_before(assign(t1, mul(src1l, src2h))); 1620 i.insert_before(assign(t2, mul(src1h, src2l))); 1621 i.insert_before(assign(hi, mul(src1h, src2h))); 1622 1623 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 1624 i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 1625 1626 i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 1627 i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 1628 1629 if (different_signs == NULL) { 1630 assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1631 1632 ir->operation = ir_binop_add; 1633 ir->init_num_operands(); 1634 ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 1635 ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 1636 } else { 1637 assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1638 1639 i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 1640 rshift(t2, c16->clone(ir, NULL))))); 1641 1642 /* For channels where different_signs is set we have to perform a 64-bit 1643 * negation. This is *not* the same as just negating the high 32-bits. 1644 * Consider -3 * 2. The high 32-bits is 0, but the desired result is 1645 * -1, not -0! Recall -x == ~x + 1. 1646 */ 1647 ir_variable *neg_hi = 1648 new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 1649 ir_constant *c1 = new(ir) ir_constant(1u, elements); 1650 1651 i.insert_before(neg_hi); 1652 i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 1653 u2i(_carry(bit_not(lo), c1))))); 1654 1655 ir->operation = ir_triop_csel; 1656 ir->init_num_operands(); 1657 ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 1658 ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 1659 ir->operands[2] = u2i(hi); 1660 } 1661} 1662 1663void 1664lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 1665{ 1666 ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 1667 this->progress = true; 1668} 1669 1670void 1671lower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir) 1672{ 1673 /* Lower 32x32-> 64 to 1674 * msb = imul_high(x_lo, y_lo) 1675 * lsb = mul(x_lo, y_lo) 1676 */ 1677 const unsigned elements = ir->operands[0]->type->vector_elements; 1678 1679 const ir_expression_operation operation = 1680 ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32 1681 : ir_unop_pack_int_2x32; 1682 1683 const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64 1684 ? glsl_type::uvec(elements) 1685 : glsl_type::ivec(elements); 1686 1687 const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64 1688 ? glsl_type::uvec2_type 1689 : glsl_type::ivec2_type; 1690 1691 ir_instruction &i = *base_ir; 1692 1693 ir_variable *msb = 1694 new(ir) ir_variable(var_type, "msb", ir_var_temporary); 1695 ir_variable *lsb = 1696 new(ir) ir_variable(var_type, "lsb", ir_var_temporary); 1697 ir_variable *x = 1698 new(ir) ir_variable(var_type, "x", ir_var_temporary); 1699 ir_variable *y = 1700 new(ir) ir_variable(var_type, "y", ir_var_temporary); 1701 1702 i.insert_before(x); 1703 i.insert_before(assign(x, ir->operands[0])); 1704 i.insert_before(y); 1705 i.insert_before(assign(y, ir->operands[1])); 1706 i.insert_before(msb); 1707 i.insert_before(lsb); 1708 1709 i.insert_before(assign(msb, imul_high(x, y))); 1710 i.insert_before(assign(lsb, mul(x, y))); 1711 1712 ir_rvalue *result[4] = {NULL}; 1713 for (unsigned elem = 0; elem < elements; elem++) { 1714 ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type, 1715 swizzle(lsb, elem, 1), 1716 swizzle(msb, elem, 1), NULL, NULL); 1717 result[elem] = expr(operation, val); 1718 } 1719 1720 ir->operation = ir_quadop_vector; 1721 ir->init_num_operands(); 1722 ir->operands[0] = result[0]; 1723 ir->operands[1] = result[1]; 1724 ir->operands[2] = result[2]; 1725 ir->operands[3] = result[3]; 1726 1727 this->progress = true; 1728} 1729 1730ir_visitor_status 1731lower_instructions_visitor::visit_leave(ir_expression *ir) 1732{ 1733 switch (ir->operation) { 1734 case ir_binop_dot: 1735 if (ir->operands[0]->type->is_double()) 1736 double_dot_to_fma(ir); 1737 break; 1738 case ir_triop_lrp: 1739 if (ir->operands[0]->type->is_double()) 1740 double_lrp(ir); 1741 break; 1742 case ir_binop_sub: 1743 if (lowering(SUB_TO_ADD_NEG)) 1744 sub_to_add_neg(ir); 1745 break; 1746 1747 case ir_binop_div: 1748 if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP)) 1749 int_div_to_mul_rcp(ir); 1750 else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) || 1751 (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP))) 1752 div_to_mul_rcp(ir); 1753 break; 1754 1755 case ir_unop_exp: 1756 if (lowering(EXP_TO_EXP2)) 1757 exp_to_exp2(ir); 1758 break; 1759 1760 case ir_unop_log: 1761 if (lowering(LOG_TO_LOG2)) 1762 log_to_log2(ir); 1763 break; 1764 1765 case ir_binop_mod: 1766 if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double())) 1767 mod_to_floor(ir); 1768 break; 1769 1770 case ir_binop_pow: 1771 if (lowering(POW_TO_EXP2)) 1772 pow_to_exp2(ir); 1773 break; 1774 1775 case ir_binop_ldexp: 1776 if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 1777 ldexp_to_arith(ir); 1778 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 1779 dldexp_to_arith(ir); 1780 break; 1781 1782 case ir_unop_frexp_exp: 1783 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1784 dfrexp_exp_to_arith(ir); 1785 break; 1786 1787 case ir_unop_frexp_sig: 1788 if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1789 dfrexp_sig_to_arith(ir); 1790 break; 1791 1792 case ir_binop_carry: 1793 if (lowering(CARRY_TO_ARITH)) 1794 carry_to_arith(ir); 1795 break; 1796 1797 case ir_binop_borrow: 1798 if (lowering(BORROW_TO_ARITH)) 1799 borrow_to_arith(ir); 1800 break; 1801 1802 case ir_unop_saturate: 1803 if (lowering(SAT_TO_CLAMP)) 1804 sat_to_clamp(ir); 1805 break; 1806 1807 case ir_unop_trunc: 1808 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1809 dtrunc_to_dfrac(ir); 1810 break; 1811 1812 case ir_unop_ceil: 1813 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1814 dceil_to_dfrac(ir); 1815 break; 1816 1817 case ir_unop_floor: 1818 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1819 dfloor_to_dfrac(ir); 1820 break; 1821 1822 case ir_unop_round_even: 1823 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1824 dround_even_to_dfrac(ir); 1825 break; 1826 1827 case ir_unop_sign: 1828 if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1829 dsign_to_csel(ir); 1830 break; 1831 1832 case ir_unop_bit_count: 1833 if (lowering(BIT_COUNT_TO_MATH)) 1834 bit_count_to_math(ir); 1835 break; 1836 1837 case ir_triop_bitfield_extract: 1838 if (lowering(EXTRACT_TO_SHIFTS)) 1839 extract_to_shifts(ir); 1840 break; 1841 1842 case ir_quadop_bitfield_insert: 1843 if (lowering(INSERT_TO_SHIFTS)) 1844 insert_to_shifts(ir); 1845 break; 1846 1847 case ir_unop_bitfield_reverse: 1848 if (lowering(REVERSE_TO_SHIFTS)) 1849 reverse_to_shifts(ir); 1850 break; 1851 1852 case ir_unop_find_lsb: 1853 if (lowering(FIND_LSB_TO_FLOAT_CAST)) 1854 find_lsb_to_float_cast(ir); 1855 break; 1856 1857 case ir_unop_find_msb: 1858 if (lowering(FIND_MSB_TO_FLOAT_CAST)) 1859 find_msb_to_float_cast(ir); 1860 break; 1861 1862 case ir_binop_imul_high: 1863 if (lowering(IMUL_HIGH_TO_MUL)) 1864 imul_high_to_mul(ir); 1865 break; 1866 1867 case ir_binop_mul: 1868 if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) && 1869 (ir->type->base_type == GLSL_TYPE_INT64 || 1870 ir->type->base_type == GLSL_TYPE_UINT64) && 1871 (ir->operands[0]->type->base_type == GLSL_TYPE_INT || 1872 ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) 1873 mul64_to_mul_and_mul_high(ir); 1874 break; 1875 1876 case ir_unop_rsq: 1877 case ir_unop_sqrt: 1878 if (lowering(SQRT_TO_ABS_SQRT)) 1879 sqrt_to_abs_sqrt(ir); 1880 break; 1881 1882 default: 1883 return visit_continue; 1884 } 1885 1886 return visit_continue; 1887} 1888