1b8e80941Smrg/* 2b8e80941Smrg * Copyright © 2010 Intel Corporation 3b8e80941Smrg * 4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a 5b8e80941Smrg * copy of this software and associated documentation files (the "Software"), 6b8e80941Smrg * to deal in the Software without restriction, including without limitation 7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the 9b8e80941Smrg * Software is furnished to do so, subject to the following conditions: 10b8e80941Smrg * 11b8e80941Smrg * The above copyright notice and this permission notice (including the next 12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the 13b8e80941Smrg * Software. 14b8e80941Smrg * 15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 21b8e80941Smrg * DEALINGS IN THE SOFTWARE. 22b8e80941Smrg */ 23b8e80941Smrg 24b8e80941Smrg/** 25b8e80941Smrg * \file lower_instructions.cpp 26b8e80941Smrg * 27b8e80941Smrg * Many GPUs lack native instructions for certain expression operations, and 28b8e80941Smrg * must replace them with some other expression tree. This pass lowers some 29b8e80941Smrg * of the most common cases, allowing the lowering code to be implemented once 30b8e80941Smrg * rather than in each driver backend. 
 *
 * Currently supported transformations:
 * - SUB_TO_ADD_NEG
 * - DIV_TO_MUL_RCP
 * - INT_DIV_TO_MUL_RCP
 * - EXP_TO_EXP2
 * - POW_TO_EXP2
 * - LOG_TO_LOG2
 * - MOD_TO_FLOOR
 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
 * - CARRY_TO_ARITH
 * - BORROW_TO_ARITH
 * - SAT_TO_CLAMP
 * - DOPS_TO_DFRAC
 *
 * SUB_TO_ADD_NEG:
 * ---------------
 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
 *
 * This simplifies expression reassociation, and for many backends
 * there is no subtract operation separate from adding the negation.
 * For backends with native subtract operations, they will probably
 * want to recognize add(op0, neg(op1)) or the other way around to
 * produce a subtract anyway.
 *
 * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
 * ---------------------------------------------------------
 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
 *
 * Many GPUs don't have a divide instruction (945 and 965 included),
 * but they do have an RCP instruction to compute an approximate
 * reciprocal. By breaking the operation down, constant reciprocals
 * can get constant folded.
 *
 * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
 * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
 * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
69b8e80941Smrg * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating 70b8e80941Smrg * point so that RCP is possible. 71b8e80941Smrg * 72b8e80941Smrg * EXP_TO_EXP2 and LOG_TO_LOG2: 73b8e80941Smrg * ---------------------------- 74b8e80941Smrg * Many GPUs don't have a base e log or exponent instruction, but they 75b8e80941Smrg * do have base 2 versions, so this pass converts exp and log to exp2 76b8e80941Smrg * and log2 operations. 77b8e80941Smrg * 78b8e80941Smrg * POW_TO_EXP2: 79b8e80941Smrg * ----------- 80b8e80941Smrg * Many older GPUs don't have an x**y instruction. For these GPUs, convert 81b8e80941Smrg * x**y to 2**(y * log2(x)). 82b8e80941Smrg * 83b8e80941Smrg * MOD_TO_FLOOR: 84b8e80941Smrg * ------------- 85b8e80941Smrg * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1)) 86b8e80941Smrg * 87b8e80941Smrg * Many GPUs don't have a MOD instruction (945 and 965 included), and 88b8e80941Smrg * if we have to break it down like this anyway, it gives an 89b8e80941Smrg * opportunity to do things like constant fold the (1.0 / op1) easily. 90b8e80941Smrg * 91b8e80941Smrg * Note: before we used to implement this as op1 * fract(op / op1) but this 92b8e80941Smrg * implementation had significant precision errors. 93b8e80941Smrg * 94b8e80941Smrg * LDEXP_TO_ARITH: 95b8e80941Smrg * ------------- 96b8e80941Smrg * Converts ir_binop_ldexp to arithmetic and bit operations for float sources. 97b8e80941Smrg * 98b8e80941Smrg * DFREXP_DLDEXP_TO_ARITH: 99b8e80941Smrg * --------------- 100b8e80941Smrg * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to 101b8e80941Smrg * arithmetic and bit ops for double arguments. 102b8e80941Smrg * 103b8e80941Smrg * CARRY_TO_ARITH: 104b8e80941Smrg * --------------- 105b8e80941Smrg * Converts ir_carry into (x + y) < x. 106b8e80941Smrg * 107b8e80941Smrg * BORROW_TO_ARITH: 108b8e80941Smrg * ---------------- 109b8e80941Smrg * Converts ir_borrow into (x < y). 
 *
 * SAT_TO_CLAMP:
 * -------------
 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
 *
 * DOPS_TO_DFRAC:
 * --------------
 * Converts double trunc, ceil, floor, round to fract
 */

#include "c99_math.h"
#include "program/prog_instruction.h" /* for swizzle */
#include "compiler/glsl_types.h"
#include "ir.h"
#include "ir_builder.h"
#include "ir_optimization.h"

using namespace ir_builder;

namespace {

/**
 * Hierarchical visitor that rewrites each expression selected by the
 * \c lower bitfield into an equivalent tree of simpler operations.
 * Sets \c progress whenever at least one rewrite happened.
 */
class lower_instructions_visitor : public ir_hierarchical_visitor {
public:
   lower_instructions_visitor(unsigned lower)
      : progress(false), lower(lower) { }

   ir_visitor_status visit_leave(ir_expression *);

   bool progress;

private:
   unsigned lower; /**< Bitfield of which operations to lower */

   /* One lowering helper per supported transformation.  Each rewrites the
    * given expression in place (mutating its operation and operands). */
   void sub_to_add_neg(ir_expression *);
   void div_to_mul_rcp(ir_expression *);
   void int_div_to_mul_rcp(ir_expression *);
   void mod_to_floor(ir_expression *);
   void exp_to_exp2(ir_expression *);
   void pow_to_exp2(ir_expression *);
   void log_to_log2(ir_expression *);
   void ldexp_to_arith(ir_expression *);
   void dldexp_to_arith(ir_expression *);
   void dfrexp_sig_to_arith(ir_expression *);
   void dfrexp_exp_to_arith(ir_expression *);
   void carry_to_arith(ir_expression *);
   void borrow_to_arith(ir_expression *);
   void sat_to_clamp(ir_expression *);
   void double_dot_to_fma(ir_expression *);
   void double_lrp(ir_expression *);
   void dceil_to_dfrac(ir_expression *);
   void dfloor_to_dfrac(ir_expression *);
   void dround_even_to_dfrac(ir_expression *);
   void dtrunc_to_dfrac(ir_expression *);
   void dsign_to_csel(ir_expression *);
   void bit_count_to_math(ir_expression *);
   void extract_to_shifts(ir_expression *);
   void insert_to_shifts(ir_expression *);
   void reverse_to_shifts(ir_expression *ir);
   void find_lsb_to_float_cast(ir_expression *ir);
   void find_msb_to_float_cast(ir_expression *ir);
   void imul_high_to_mul(ir_expression *ir);
   void sqrt_to_abs_sqrt(ir_expression *ir);
   void mul64_to_mul_and_mul_high(ir_expression *ir);

   ir_expression *_carry(operand a, operand b);
};

} /* anonymous namespace */

/**
 * Determine if a particular type of lowering should occur
 */
#define lowering(x) (this->lower & x)

/**
 * Entry point: lower every expression in \p instructions according to the
 * \p what_to_lower bitfield.
 *
 * \return true if any instruction was rewritten.
 */
bool
lower_instructions(exec_list *instructions, unsigned what_to_lower)
{
   lower_instructions_visitor v(what_to_lower);

   visit_list_elements(&v, instructions);
   return v.progress;
}

/* SUB_TO_ADD_NEG: rewrite sub(op0, op1) as add(op0, neg(op1)) in place. */
void
lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
{
   ir->operation = ir_binop_add;
   ir->init_num_operands();
   ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
                                           ir->operands[1], NULL);
   this->progress = true;
}

void
lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
{
   /* FDIV/DDIV_TO_MUL_RCP: rewrite a floating-point divide in place. */
   assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());

   /* New expression for the 1.0 / op1 */
   ir_rvalue *expr;
   expr = new(ir) ir_expression(ir_unop_rcp,
                                ir->operands[1]->type,
                                ir->operands[1]);

   /* op0 / op1 -> op0 * (1.0 / op1) */
   ir->operation = ir_binop_mul;
   ir->init_num_operands();
   ir->operands[1] = expr;

   this->progress = true;
}

/* INT_DIV_TO_MUL_RCP: lower integer division by converting both operands to
 * float, multiplying by the reciprocal, and truncating back to the original
 * integer type. */
void
lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
{
   assert(ir->operands[1]->type->is_integer());

   /* Be careful with integer division -- we need to do it as a
    * float and re-truncate, since rcp(n > 1) of an integer would
    * just be 0.
    */
   ir_rvalue *op0, *op1;
   const struct glsl_type *vec_type;

   /* Float vector type with the same shape as op1. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[1]->type->vector_elements,
                                      ir->operands[1]->type->matrix_columns);

   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
      op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
   else
      op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);

   op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);

   /* Float vector type with the same shape as op0. */
   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->operands[0]->type->vector_elements,
                                      ir->operands[0]->type->matrix_columns);

   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
      op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
   else
      op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);

   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                      ir->type->vector_elements,
                                      ir->type->matrix_columns);

   op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);

   /* Truncate the float product back to the result type.  The unsigned case
    * goes through f2i and then i2u. */
   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
      ir->operation = ir_unop_f2i;
      ir->operands[0] = op0;
   } else {
      ir->operation = ir_unop_i2u;
      ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
   }
   ir->init_num_operands();
   ir->operands[1] = NULL;

   this->progress = true;
}

void
274b8e80941Smrglower_instructions_visitor::exp_to_exp2(ir_expression *ir) 275b8e80941Smrg{ 276b8e80941Smrg ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E)); 277b8e80941Smrg 278b8e80941Smrg ir->operation = ir_unop_exp2; 279b8e80941Smrg ir->init_num_operands(); 280b8e80941Smrg ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type, 281b8e80941Smrg ir->operands[0], log2_e); 282b8e80941Smrg this->progress = true; 283b8e80941Smrg} 284b8e80941Smrg 285b8e80941Smrgvoid 286b8e80941Smrglower_instructions_visitor::pow_to_exp2(ir_expression *ir) 287b8e80941Smrg{ 288b8e80941Smrg ir_expression *const log2_x = 289b8e80941Smrg new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 290b8e80941Smrg ir->operands[0]); 291b8e80941Smrg 292b8e80941Smrg ir->operation = ir_unop_exp2; 293b8e80941Smrg ir->init_num_operands(); 294b8e80941Smrg ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type, 295b8e80941Smrg ir->operands[1], log2_x); 296b8e80941Smrg ir->operands[1] = NULL; 297b8e80941Smrg this->progress = true; 298b8e80941Smrg} 299b8e80941Smrg 300b8e80941Smrgvoid 301b8e80941Smrglower_instructions_visitor::log_to_log2(ir_expression *ir) 302b8e80941Smrg{ 303b8e80941Smrg ir->operation = ir_binop_mul; 304b8e80941Smrg ir->init_num_operands(); 305b8e80941Smrg ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 306b8e80941Smrg ir->operands[0], NULL); 307b8e80941Smrg ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E)); 308b8e80941Smrg this->progress = true; 309b8e80941Smrg} 310b8e80941Smrg 311b8e80941Smrgvoid 312b8e80941Smrglower_instructions_visitor::mod_to_floor(ir_expression *ir) 313b8e80941Smrg{ 314b8e80941Smrg ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x", 315b8e80941Smrg ir_var_temporary); 316b8e80941Smrg ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y", 317b8e80941Smrg ir_var_temporary); 318b8e80941Smrg this->base_ir->insert_before(x); 319b8e80941Smrg 
this->base_ir->insert_before(y); 320b8e80941Smrg 321b8e80941Smrg ir_assignment *const assign_x = 322b8e80941Smrg new(ir) ir_assignment(new(ir) ir_dereference_variable(x), 323b8e80941Smrg ir->operands[0]); 324b8e80941Smrg ir_assignment *const assign_y = 325b8e80941Smrg new(ir) ir_assignment(new(ir) ir_dereference_variable(y), 326b8e80941Smrg ir->operands[1]); 327b8e80941Smrg 328b8e80941Smrg this->base_ir->insert_before(assign_x); 329b8e80941Smrg this->base_ir->insert_before(assign_y); 330b8e80941Smrg 331b8e80941Smrg ir_expression *const div_expr = 332b8e80941Smrg new(ir) ir_expression(ir_binop_div, x->type, 333b8e80941Smrg new(ir) ir_dereference_variable(x), 334b8e80941Smrg new(ir) ir_dereference_variable(y)); 335b8e80941Smrg 336b8e80941Smrg /* Don't generate new IR that would need to be lowered in an additional 337b8e80941Smrg * pass. 338b8e80941Smrg */ 339b8e80941Smrg if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) || 340b8e80941Smrg (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double())) 341b8e80941Smrg div_to_mul_rcp(div_expr); 342b8e80941Smrg 343b8e80941Smrg ir_expression *const floor_expr = 344b8e80941Smrg new(ir) ir_expression(ir_unop_floor, x->type, div_expr); 345b8e80941Smrg 346b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 347b8e80941Smrg dfloor_to_dfrac(floor_expr); 348b8e80941Smrg 349b8e80941Smrg ir_expression *const mul_expr = 350b8e80941Smrg new(ir) ir_expression(ir_binop_mul, 351b8e80941Smrg new(ir) ir_dereference_variable(y), 352b8e80941Smrg floor_expr); 353b8e80941Smrg 354b8e80941Smrg ir->operation = ir_binop_sub; 355b8e80941Smrg ir->init_num_operands(); 356b8e80941Smrg ir->operands[0] = new(ir) ir_dereference_variable(x); 357b8e80941Smrg ir->operands[1] = mul_expr; 358b8e80941Smrg this->progress = true; 359b8e80941Smrg} 360b8e80941Smrg 361b8e80941Smrgvoid 362b8e80941Smrglower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 363b8e80941Smrg{ 364b8e80941Smrg /* Translates 365b8e80941Smrg * ir_binop_ldexp x exp 
    * into
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
    *
    *    if (extracted_biased_exp >= 255)
    *       return x; // +/-inf, NaN
    *
    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
    *
    *    if (min(resulting_biased_exp, extracted_biased_exp) < 1)
    *       resulting_biased_exp = 0;
    *    if (resulting_biased_exp >= 255 ||
    *        min(resulting_biased_exp, extracted_biased_exp) < 1) {
    *       sign_mantissa &= sign_mask;
    *    }
    *
    *    return bitcast_u2f(sign_mantissa |
    *                       lshift(i2u(resulting_biased_exp), exp_shift));
    *
    * which we can't actually implement as such, since the GLSL IR doesn't
    * have vectorized if-statements.  We actually implement it without branches
    * using conditional-select:
    *
    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
    *
    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
    *
    *    flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0);
    *    resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp)
    *    zero_mantissa = logic_or(flush_to_zero,
    *                             gequal(resulting_biased_exp, 255));
    *    sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa);
    *
    *    result = sign_mantissa |
    *             lshift(i2u(resulting_biased_exp), exp_shift));
    *
    *    return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result));
    *
    * The definition of ldexp in the GLSL spec says:
    *
    *    "If this product is too large to be represented in the
    *     floating-point type, the result is undefined."
    *
    * However, the definition of ldexp in the GLSL ES spec does not contain
    * this sentence, so we do need to handle overflow correctly.
    *
    * There is additional language limiting the defined range of exp, but this
    * is merely to allow implementations that store 2^exp in a temporary
    * variable.
    */

   const unsigned vec_elem = ir->type->vector_elements;

   /* Types */
   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
   const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Temporary variables */
   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
   ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary);

   ir_variable *extracted_biased_exp =
      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
   ir_variable *resulting_biased_exp =
      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);

   ir_variable *sign_mantissa =
      new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary);

   ir_variable *flush_to_zero =
      new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary);
   ir_variable *zero_mantissa =
      new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary);

   /* Insertion point: the statement currently being visited. */
   ir_instruction &i = *base_ir;

   /* Copy <x> and <exp> arguments. */
   i.insert_before(x);
   i.insert_before(assign(x, ir->operands[0]));
   i.insert_before(exp);
   i.insert_before(assign(exp, ir->operands[1]));

   /* Extract the biased exponent from <x> (bits 23..30 of the IEEE-754
    * single-precision representation). */
   i.insert_before(extracted_biased_exp);
   i.insert_before(assign(extracted_biased_exp,
                          rshift(bitcast_f2i(abs(x)),
                                 new(ir) ir_constant(23, vec_elem))));

   /* The definition of ldexp in the GLSL 4.60 spec says:
    *
    *    "If exp is greater than +128 (single-precision) or +1024
    *     (double-precision), the value returned is undefined. If exp is less
    *     than -126 (single-precision) or -1022 (double-precision), the value
    *     returned may be flushed to zero."
    *
    * So we do not have to guard against the possibility of addition overflow,
    * which could happen when exp is close to INT_MAX. Addition underflow
    * cannot happen (the worst case is 0 + (-INT_MAX)).
    */
   i.insert_before(resulting_biased_exp);
   i.insert_before(assign(resulting_biased_exp,
                          min2(add(extracted_biased_exp, exp),
                               new(ir) ir_constant(255, vec_elem))));

   i.insert_before(sign_mantissa);
   i.insert_before(assign(sign_mantissa,
                          bit_and(bitcast_f2u(x),
                                  new(ir) ir_constant(0x807fffffu, vec_elem))));

   /* We flush to zero if the original or resulting biased exponent is 0,
    * indicating a +/-0.0 or subnormal input or output.
    *
    * The mantissa is set to 0 if the resulting biased exponent is 255, since
    * an overflow should produce a +/-inf result.
    *
    * Note that NaN inputs are handled separately.
    */
   i.insert_before(flush_to_zero);
   i.insert_before(assign(flush_to_zero,
                          lequal(min2(resulting_biased_exp,
                                      extracted_biased_exp),
                                 ir_constant::zero(ir, ivec))));
   i.insert_before(assign(resulting_biased_exp,
                          csel(flush_to_zero,
                               ir_constant::zero(ir, ivec),
                               resulting_biased_exp)));

   i.insert_before(zero_mantissa);
   i.insert_before(assign(zero_mantissa,
                          logic_or(flush_to_zero,
                                   equal(resulting_biased_exp,
                                         new(ir) ir_constant(255, vec_elem)))));
   i.insert_before(assign(sign_mantissa,
                          csel(zero_mantissa,
                               bit_and(sign_mantissa,
                                       new(ir) ir_constant(0x80000000u, vec_elem)),
                               sign_mantissa)));

   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */
   i.insert_before(result);
   if (!lowering(INSERT_TO_SHIFTS)) {
      i.insert_before(assign(result,
                             bitfield_insert(sign_mantissa,
                                             i2u(resulting_biased_exp),
                                             new(ir) ir_constant(23u, vec_elem),
                                             new(ir) ir_constant(8u, vec_elem))));
   } else {
      i.insert_before(assign(result,
                             bit_or(sign_mantissa,
                                    lshift(i2u(resulting_biased_exp),
                                           new(ir) ir_constant(23, vec_elem)))));
   }

   /* Pass +/-inf and NaN inputs (biased exponent 255) through unchanged. */
   ir->operation = ir_triop_csel;
   ir->init_num_operands();
   ir->operands[0] = gequal(extracted_biased_exp,
                            new(ir) ir_constant(255, vec_elem));
   ir->operands[1] = new(ir) ir_dereference_variable(x);
   ir->operands[2] = bitcast_u2f(result);

   this->progress = true;
}

void
lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
{
   /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
    * from the significand.
    */

   const unsigned vec_elem = ir->type->vector_elements;

   /* Types */
   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Constants */
   ir_constant *zeroi = ir_constant::zero(ir, ivec);

   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);

   /* Exponent field of the high dword of a double: bits 20..30. */
   ir_constant *exp_shift = new(ir) ir_constant(20u);
   ir_constant *exp_width = new(ir) ir_constant(11u);
   ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);

   /* Temporary variables */
   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);

   ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
                                                  ir_var_temporary);

   ir_variable *extracted_biased_exp =
      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
   ir_variable *resulting_biased_exp =
      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);

   ir_variable *is_not_zero_or_underflow =
      new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);

   /* Insertion point: the statement currently being visited. */
   ir_instruction &i = *base_ir;

   /* Copy <x> and <exp> arguments. */
   i.insert_before(x);
   i.insert_before(assign(x, ir->operands[0]));
   i.insert_before(exp);
   i.insert_before(assign(exp, ir->operands[1]));

   ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
   if (lowering(DFREXP_DLDEXP_TO_ARITH))
      dfrexp_exp_to_arith(frexp_exp);

   /* Extract the biased exponent from <x>. */
   i.insert_before(extracted_biased_exp);
   i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));

   i.insert_before(resulting_biased_exp);
   i.insert_before(assign(resulting_biased_exp,
                          add(extracted_biased_exp, exp)));

   /* Test if result is ±0.0, subnormal, or underflow by checking if the
    * resulting biased exponent would be less than 0x1. If so, the result is
    * 0.0 with the sign of x. (Actually, invert the conditions so that
    * immediate values are the second arguments, which is better for i965)
    * TODO: Implement in a vector fashion.
    */
   i.insert_before(zero_sign_x);
   for (unsigned elem = 0; elem < vec_elem; elem++) {
      ir_variable *unpacked =
         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
      i.insert_before(unpacked);
      i.insert_before(
            assign(unpacked,
                   expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
      /* Keep only the sign bit in the high dword and clear the low dword,
       * producing 0.0 with the sign of this element of x. */
      i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
                             WRITEMASK_Y));
      i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
      i.insert_before(assign(zero_sign_x,
                             expr(ir_unop_pack_double_2x32, unpacked),
                             1 << elem));
   }
   i.insert_before(is_not_zero_or_underflow);
   i.insert_before(assign(is_not_zero_or_underflow,
                          gequal(resulting_biased_exp,
                                 new(ir) ir_constant(0x1, vec_elem))));
   i.insert_before(assign(x, csel(is_not_zero_or_underflow,
                                  x, zero_sign_x)));
   i.insert_before(assign(resulting_biased_exp,
                          csel(is_not_zero_or_underflow,
                               resulting_biased_exp, zeroi)));

   /* We could test for overflows by checking if the resulting biased exponent
    * would be greater than 0xFE. Turns out we don't need to because the GLSL
    * spec says:
    *
    *    "If this product is too large to be represented in the
    *     floating-point type, the result is undefined."
    */

   /* Splice the new biased exponent into the high dword of each element. */
   ir_rvalue *results[4] = {NULL};
   for (unsigned elem = 0; elem < vec_elem; elem++) {
      ir_variable *unpacked =
         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
      i.insert_before(unpacked);
      i.insert_before(
            assign(unpacked,
                   expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));

      ir_expression *bfi = bitfield_insert(
            swizzle_y(unpacked),
            i2u(swizzle(resulting_biased_exp, elem, 1)),
            exp_shift->clone(ir, NULL),
            exp_width->clone(ir, NULL));

      i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));

      results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
   }

   /* Put the dvec back together. */
   ir->operation = ir_quadop_vector;
   ir->init_num_operands();
   ir->operands[0] = results[0];
   ir->operands[1] = results[1];
   ir->operands[2] = results[2];
   ir->operands[3] = results[3];

   /* Don't generate new IR that would need to be lowered in an additional
    * pass.
    */

   this->progress = true;
}

void
lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
{
   const unsigned vec_elem = ir->type->vector_elements;
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);

   /* Double-precision floating-point values are stored as
    * 1 sign bit;
    * 11 exponent bits;
    * 52 mantissa bits.
    *
    * We're just extracting the significand here, so we only need to modify
    * the upper 32-bit uint. Unfortunately we must extract each double
    * independently as there is no vector version of unpackDouble.
    */

   /* Insertion point: the statement currently being visited. */
   ir_instruction &i = *base_ir;

   ir_variable *is_not_zero =
      new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
   ir_rvalue *results[4] = {NULL};

   ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
   i.insert_before(is_not_zero);
   i.insert_before(
         assign(is_not_zero,
                nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));

   /* TODO: Remake this as more vector-friendly when int64 support is
    * available.
    */
   for (unsigned elem = 0; elem < vec_elem; elem++) {
      ir_constant *zero = new(ir) ir_constant(0u, 1);
      ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);

      /* Exponent of double floating-point values in the range [0.5, 1.0). */
      ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);

      ir_variable *bits =
         new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
      ir_variable *unpacked =
         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);

      ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);

      i.insert_before(bits);
      i.insert_before(unpacked);
      i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));

      /* Manipulate the high uint to remove the exponent and replace it with
       * either the default exponent or zero.
       */
      i.insert_before(assign(bits, swizzle_y(unpacked)));
      i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
      i.insert_before(assign(bits, bit_or(bits,
                                          csel(swizzle(is_not_zero, elem, 1),
                                               exponent_value,
                                               zero))));
      i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
      results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
   }

   /* Put the dvec back together */
   ir->operation = ir_quadop_vector;
   ir->init_num_operands();
   ir->operands[0] = results[0];
   ir->operands[1] = results[1];
   ir->operands[2] = results[2];
   ir->operands[3] = results[3];

   this->progress = true;
}

/* Lower ir_unop_frexp_exp on doubles to integer arithmetic on the high
 * dword of each element. */
void
lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
{
   const unsigned vec_elem = ir->type->vector_elements;
   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL,
vec_elem, 1); 743b8e80941Smrg const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 744b8e80941Smrg 745b8e80941Smrg /* Double-precision floating-point values are stored as 746b8e80941Smrg * 1 sign bit; 747b8e80941Smrg * 11 exponent bits; 748b8e80941Smrg * 52 mantissa bits. 749b8e80941Smrg * 750b8e80941Smrg * We're just extracting the exponent here, so we only care about the upper 751b8e80941Smrg * 32-bit uint. 752b8e80941Smrg */ 753b8e80941Smrg 754b8e80941Smrg ir_instruction &i = *base_ir; 755b8e80941Smrg 756b8e80941Smrg ir_variable *is_not_zero = 757b8e80941Smrg new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 758b8e80941Smrg ir_variable *high_words = 759b8e80941Smrg new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 760b8e80941Smrg ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 761b8e80941Smrg ir_constant *izero = new(ir) ir_constant(0, vec_elem); 762b8e80941Smrg 763b8e80941Smrg ir_rvalue *absval = abs(ir->operands[0]); 764b8e80941Smrg 765b8e80941Smrg i.insert_before(is_not_zero); 766b8e80941Smrg i.insert_before(high_words); 767b8e80941Smrg i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 768b8e80941Smrg 769b8e80941Smrg /* Extract all of the upper uints. */ 770b8e80941Smrg for (unsigned elem = 0; elem < vec_elem; elem++) { 771b8e80941Smrg ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 772b8e80941Smrg 773b8e80941Smrg i.insert_before(assign(high_words, 774b8e80941Smrg swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 775b8e80941Smrg 1 << elem)); 776b8e80941Smrg 777b8e80941Smrg } 778b8e80941Smrg ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 779b8e80941Smrg ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 780b8e80941Smrg 781b8e80941Smrg /* For non-zero inputs, shift the exponent down and apply bias. 
*/ 782b8e80941Smrg ir->operation = ir_triop_csel; 783b8e80941Smrg ir->init_num_operands(); 784b8e80941Smrg ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 785b8e80941Smrg ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 786b8e80941Smrg ir->operands[2] = izero; 787b8e80941Smrg 788b8e80941Smrg this->progress = true; 789b8e80941Smrg} 790b8e80941Smrg 791b8e80941Smrgvoid 792b8e80941Smrglower_instructions_visitor::carry_to_arith(ir_expression *ir) 793b8e80941Smrg{ 794b8e80941Smrg /* Translates 795b8e80941Smrg * ir_binop_carry x y 796b8e80941Smrg * into 797b8e80941Smrg * sum = ir_binop_add x y 798b8e80941Smrg * bcarry = ir_binop_less sum x 799b8e80941Smrg * carry = ir_unop_b2i bcarry 800b8e80941Smrg */ 801b8e80941Smrg 802b8e80941Smrg ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 803b8e80941Smrg ir->operation = ir_unop_i2u; 804b8e80941Smrg ir->init_num_operands(); 805b8e80941Smrg ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 806b8e80941Smrg ir->operands[1] = NULL; 807b8e80941Smrg 808b8e80941Smrg this->progress = true; 809b8e80941Smrg} 810b8e80941Smrg 811b8e80941Smrgvoid 812b8e80941Smrglower_instructions_visitor::borrow_to_arith(ir_expression *ir) 813b8e80941Smrg{ 814b8e80941Smrg /* Translates 815b8e80941Smrg * ir_binop_borrow x y 816b8e80941Smrg * into 817b8e80941Smrg * bcarry = ir_binop_less x y 818b8e80941Smrg * carry = ir_unop_b2i bcarry 819b8e80941Smrg */ 820b8e80941Smrg 821b8e80941Smrg ir->operation = ir_unop_i2u; 822b8e80941Smrg ir->init_num_operands(); 823b8e80941Smrg ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 824b8e80941Smrg ir->operands[1] = NULL; 825b8e80941Smrg 826b8e80941Smrg this->progress = true; 827b8e80941Smrg} 828b8e80941Smrg 829b8e80941Smrgvoid 830b8e80941Smrglower_instructions_visitor::sat_to_clamp(ir_expression *ir) 831b8e80941Smrg{ 832b8e80941Smrg /* Translates 833b8e80941Smrg * ir_unop_saturate x 834b8e80941Smrg * into 835b8e80941Smrg * 
ir_binop_min (ir_binop_max(x, 0.0), 1.0) 836b8e80941Smrg */ 837b8e80941Smrg 838b8e80941Smrg ir->operation = ir_binop_min; 839b8e80941Smrg ir->init_num_operands(); 840b8e80941Smrg ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type, 841b8e80941Smrg ir->operands[0], 842b8e80941Smrg new(ir) ir_constant(0.0f)); 843b8e80941Smrg ir->operands[1] = new(ir) ir_constant(1.0f); 844b8e80941Smrg 845b8e80941Smrg this->progress = true; 846b8e80941Smrg} 847b8e80941Smrg 848b8e80941Smrgvoid 849b8e80941Smrglower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 850b8e80941Smrg{ 851b8e80941Smrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 852b8e80941Smrg ir_var_temporary); 853b8e80941Smrg this->base_ir->insert_before(temp); 854b8e80941Smrg 855b8e80941Smrg int nc = ir->operands[0]->type->components(); 856b8e80941Smrg for (int i = nc - 1; i >= 1; i--) { 857b8e80941Smrg ir_assignment *assig; 858b8e80941Smrg if (i == (nc - 1)) { 859b8e80941Smrg assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 860b8e80941Smrg swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 861b8e80941Smrg } else { 862b8e80941Smrg assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 863b8e80941Smrg swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 864b8e80941Smrg temp)); 865b8e80941Smrg } 866b8e80941Smrg this->base_ir->insert_before(assig); 867b8e80941Smrg } 868b8e80941Smrg 869b8e80941Smrg ir->operation = ir_triop_fma; 870b8e80941Smrg ir->init_num_operands(); 871b8e80941Smrg ir->operands[0] = swizzle(ir->operands[0], 0, 1); 872b8e80941Smrg ir->operands[1] = swizzle(ir->operands[1], 0, 1); 873b8e80941Smrg ir->operands[2] = new(ir) ir_dereference_variable(temp); 874b8e80941Smrg 875b8e80941Smrg this->progress = true; 876b8e80941Smrg 877b8e80941Smrg} 878b8e80941Smrg 879b8e80941Smrgvoid 880b8e80941Smrglower_instructions_visitor::double_lrp(ir_expression *ir) 881b8e80941Smrg{ 882b8e80941Smrg int swizval; 
883b8e80941Smrg ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 884b8e80941Smrg ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 885b8e80941Smrg 886b8e80941Smrg switch (op2->type->vector_elements) { 887b8e80941Smrg case 1: 888b8e80941Smrg swizval = SWIZZLE_XXXX; 889b8e80941Smrg break; 890b8e80941Smrg default: 891b8e80941Smrg assert(op0->type->vector_elements == op2->type->vector_elements); 892b8e80941Smrg swizval = SWIZZLE_XYZW; 893b8e80941Smrg break; 894b8e80941Smrg } 895b8e80941Smrg 896b8e80941Smrg ir->operation = ir_triop_fma; 897b8e80941Smrg ir->init_num_operands(); 898b8e80941Smrg ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 899b8e80941Smrg ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 900b8e80941Smrg 901b8e80941Smrg this->progress = true; 902b8e80941Smrg} 903b8e80941Smrg 904b8e80941Smrgvoid 905b8e80941Smrglower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 906b8e80941Smrg{ 907b8e80941Smrg /* 908b8e80941Smrg * frtemp = frac(x); 909b8e80941Smrg * temp = sub(x, frtemp); 910b8e80941Smrg * result = temp + ((frtemp != 0.0) ? 
1.0 : 0.0); 911b8e80941Smrg */ 912b8e80941Smrg ir_instruction &i = *base_ir; 913b8e80941Smrg ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 914b8e80941Smrg ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 915b8e80941Smrg ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 916b8e80941Smrg ir_var_temporary); 917b8e80941Smrg 918b8e80941Smrg i.insert_before(frtemp); 919b8e80941Smrg i.insert_before(assign(frtemp, fract(ir->operands[0]))); 920b8e80941Smrg 921b8e80941Smrg ir->operation = ir_binop_add; 922b8e80941Smrg ir->init_num_operands(); 923b8e80941Smrg ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 924b8e80941Smrg ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 925b8e80941Smrg 926b8e80941Smrg this->progress = true; 927b8e80941Smrg} 928b8e80941Smrg 929b8e80941Smrgvoid 930b8e80941Smrglower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 931b8e80941Smrg{ 932b8e80941Smrg /* 933b8e80941Smrg * frtemp = frac(x); 934b8e80941Smrg * result = sub(x, frtemp); 935b8e80941Smrg */ 936b8e80941Smrg ir->operation = ir_binop_sub; 937b8e80941Smrg ir->init_num_operands(); 938b8e80941Smrg ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 939b8e80941Smrg 940b8e80941Smrg this->progress = true; 941b8e80941Smrg} 942b8e80941Smrgvoid 943b8e80941Smrglower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 944b8e80941Smrg{ 945b8e80941Smrg /* 946b8e80941Smrg * insane but works 947b8e80941Smrg * temp = x + 0.5; 948b8e80941Smrg * frtemp = frac(temp); 949b8e80941Smrg * t2 = sub(temp, frtemp); 950b8e80941Smrg * if (frac(x) == 0.5) 951b8e80941Smrg * result = frac(t2 * 0.5) == 0 ? 
t2 : t2 - 1; 952b8e80941Smrg * else 953b8e80941Smrg * result = t2; 954b8e80941Smrg 955b8e80941Smrg */ 956b8e80941Smrg ir_instruction &i = *base_ir; 957b8e80941Smrg ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 958b8e80941Smrg ir_var_temporary); 959b8e80941Smrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 960b8e80941Smrg ir_var_temporary); 961b8e80941Smrg ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 962b8e80941Smrg ir_var_temporary); 963b8e80941Smrg ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 964b8e80941Smrg ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 965b8e80941Smrg ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 966b8e80941Smrg 967b8e80941Smrg i.insert_before(temp); 968b8e80941Smrg i.insert_before(assign(temp, add(ir->operands[0], p5))); 969b8e80941Smrg 970b8e80941Smrg i.insert_before(frtemp); 971b8e80941Smrg i.insert_before(assign(frtemp, fract(temp))); 972b8e80941Smrg 973b8e80941Smrg i.insert_before(t2); 974b8e80941Smrg i.insert_before(assign(t2, sub(temp, frtemp))); 975b8e80941Smrg 976b8e80941Smrg ir->operation = ir_triop_csel; 977b8e80941Smrg ir->init_num_operands(); 978b8e80941Smrg ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 979b8e80941Smrg p5->clone(ir, NULL)); 980b8e80941Smrg ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 981b8e80941Smrg zero), 982b8e80941Smrg t2, 983b8e80941Smrg sub(t2, one)); 984b8e80941Smrg ir->operands[2] = new(ir) ir_dereference_variable(t2); 985b8e80941Smrg 986b8e80941Smrg this->progress = true; 987b8e80941Smrg} 988b8e80941Smrg 989b8e80941Smrgvoid 990b8e80941Smrglower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 991b8e80941Smrg{ 992b8e80941Smrg /* 993b8e80941Smrg * frtemp = frac(x); 994b8e80941Smrg * temp = sub(x, frtemp); 995b8e80941Smrg * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 
0 : 1; 996b8e80941Smrg */ 997b8e80941Smrg ir_rvalue *arg = ir->operands[0]; 998b8e80941Smrg ir_instruction &i = *base_ir; 999b8e80941Smrg 1000b8e80941Smrg ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1001b8e80941Smrg ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1002b8e80941Smrg ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 1003b8e80941Smrg ir_var_temporary); 1004b8e80941Smrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 1005b8e80941Smrg ir_var_temporary); 1006b8e80941Smrg 1007b8e80941Smrg i.insert_before(frtemp); 1008b8e80941Smrg i.insert_before(assign(frtemp, fract(arg))); 1009b8e80941Smrg i.insert_before(temp); 1010b8e80941Smrg i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 1011b8e80941Smrg 1012b8e80941Smrg ir->operation = ir_triop_csel; 1013b8e80941Smrg ir->init_num_operands(); 1014b8e80941Smrg ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 1015b8e80941Smrg ir->operands[1] = new (ir) ir_dereference_variable(temp); 1016b8e80941Smrg ir->operands[2] = add(temp, 1017b8e80941Smrg csel(equal(frtemp, zero->clone(ir, NULL)), 1018b8e80941Smrg zero->clone(ir, NULL), 1019b8e80941Smrg one)); 1020b8e80941Smrg 1021b8e80941Smrg this->progress = true; 1022b8e80941Smrg} 1023b8e80941Smrg 1024b8e80941Smrgvoid 1025b8e80941Smrglower_instructions_visitor::dsign_to_csel(ir_expression *ir) 1026b8e80941Smrg{ 1027b8e80941Smrg /* 1028b8e80941Smrg * temp = x > 0.0 ? 1.0 : 0.0; 1029b8e80941Smrg * result = x < 0.0 ? 
-1.0 : temp; 1030b8e80941Smrg */ 1031b8e80941Smrg ir_rvalue *arg = ir->operands[0]; 1032b8e80941Smrg ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 1033b8e80941Smrg ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 1034b8e80941Smrg ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 1035b8e80941Smrg 1036b8e80941Smrg ir->operation = ir_triop_csel; 1037b8e80941Smrg ir->init_num_operands(); 1038b8e80941Smrg ir->operands[0] = less(arg->clone(ir, NULL), 1039b8e80941Smrg zero->clone(ir, NULL)); 1040b8e80941Smrg ir->operands[1] = neg_one; 1041b8e80941Smrg ir->operands[2] = csel(greater(arg, zero), 1042b8e80941Smrg one, 1043b8e80941Smrg zero->clone(ir, NULL)); 1044b8e80941Smrg 1045b8e80941Smrg this->progress = true; 1046b8e80941Smrg} 1047b8e80941Smrg 1048b8e80941Smrgvoid 1049b8e80941Smrglower_instructions_visitor::bit_count_to_math(ir_expression *ir) 1050b8e80941Smrg{ 1051b8e80941Smrg /* For more details, see: 1052b8e80941Smrg * 1053b8e80941Smrg * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel 1054b8e80941Smrg */ 1055b8e80941Smrg const unsigned elements = ir->operands[0]->type->vector_elements; 1056b8e80941Smrg ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 1057b8e80941Smrg ir_var_temporary); 1058b8e80941Smrg ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 1059b8e80941Smrg ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 1060b8e80941Smrg ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 1061b8e80941Smrg ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 1062b8e80941Smrg ir_constant *c1 = new(ir) ir_constant(1u); 1063b8e80941Smrg ir_constant *c2 = new(ir) ir_constant(2u); 1064b8e80941Smrg ir_constant *c4 = new(ir) ir_constant(4u); 1065b8e80941Smrg ir_constant *c24 = new(ir) ir_constant(24u); 1066b8e80941Smrg 1067b8e80941Smrg base_ir->insert_before(temp); 1068b8e80941Smrg 1069b8e80941Smrg if 
(ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1070b8e80941Smrg base_ir->insert_before(assign(temp, ir->operands[0])); 1071b8e80941Smrg } else { 1072b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1073b8e80941Smrg base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 1074b8e80941Smrg } 1075b8e80941Smrg 1076b8e80941Smrg /* temp = temp - ((temp >> 1) & 0x55555555u); */ 1077b8e80941Smrg base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 1078b8e80941Smrg c55555555)))); 1079b8e80941Smrg 1080b8e80941Smrg /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 1081b8e80941Smrg base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 1082b8e80941Smrg bit_and(rshift(temp, c2), 1083b8e80941Smrg c33333333->clone(ir, NULL))))); 1084b8e80941Smrg 1085b8e80941Smrg /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */ 1086b8e80941Smrg ir->operation = ir_unop_u2i; 1087b8e80941Smrg ir->init_num_operands(); 1088b8e80941Smrg ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 1089b8e80941Smrg c01010101), 1090b8e80941Smrg c24); 1091b8e80941Smrg 1092b8e80941Smrg this->progress = true; 1093b8e80941Smrg} 1094b8e80941Smrg 1095b8e80941Smrgvoid 1096b8e80941Smrglower_instructions_visitor::extract_to_shifts(ir_expression *ir) 1097b8e80941Smrg{ 1098b8e80941Smrg ir_variable *bits = 1099b8e80941Smrg new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1100b8e80941Smrg 1101b8e80941Smrg base_ir->insert_before(bits); 1102b8e80941Smrg base_ir->insert_before(assign(bits, ir->operands[2])); 1103b8e80941Smrg 1104b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1105b8e80941Smrg ir_constant *c1 = 1106b8e80941Smrg new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1107b8e80941Smrg ir_constant *c32 = 1108b8e80941Smrg new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1109b8e80941Smrg ir_constant *cFFFFFFFF = 1110b8e80941Smrg 
new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1111b8e80941Smrg 1112b8e80941Smrg /* At least some hardware treats (x << y) as (x << (y%32)). This means 1113b8e80941Smrg * we'd get a mask of 0 when bits is 32. Special case it. 1114b8e80941Smrg * 1115b8e80941Smrg * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 1116b8e80941Smrg */ 1117b8e80941Smrg ir_expression *mask = csel(equal(bits, c32), 1118b8e80941Smrg cFFFFFFFF, 1119b8e80941Smrg sub(lshift(c1, bits), c1->clone(ir, NULL))); 1120b8e80941Smrg 1121b8e80941Smrg /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1122b8e80941Smrg * 1123b8e80941Smrg * If bits is zero, the result will be zero. 1124b8e80941Smrg * 1125b8e80941Smrg * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 1126b8e80941Smrg * select as in the signed integer case. 1127b8e80941Smrg * 1128b8e80941Smrg * (value >> offset) & mask; 1129b8e80941Smrg */ 1130b8e80941Smrg ir->operation = ir_binop_bit_and; 1131b8e80941Smrg ir->init_num_operands(); 1132b8e80941Smrg ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 1133b8e80941Smrg ir->operands[1] = mask; 1134b8e80941Smrg ir->operands[2] = NULL; 1135b8e80941Smrg } else { 1136b8e80941Smrg ir_constant *c0 = 1137b8e80941Smrg new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 1138b8e80941Smrg ir_constant *c32 = 1139b8e80941Smrg new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1140b8e80941Smrg ir_variable *temp = 1141b8e80941Smrg new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 1142b8e80941Smrg 1143b8e80941Smrg /* temp = 32 - bits; */ 1144b8e80941Smrg base_ir->insert_before(temp); 1145b8e80941Smrg base_ir->insert_before(assign(temp, sub(c32, bits))); 1146b8e80941Smrg 1147b8e80941Smrg /* expr = value << (temp - offset)) >> temp; */ 1148b8e80941Smrg ir_expression *expr = 1149b8e80941Smrg rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 1150b8e80941Smrg 
1151b8e80941Smrg /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1152b8e80941Smrg * 1153b8e80941Smrg * If bits is zero, the result will be zero. 1154b8e80941Smrg * 1155b8e80941Smrg * Due to the (x << (y%32)) behavior mentioned before, the (value << 1156b8e80941Smrg * (32-0)) doesn't "erase" all of the data as we would like, so finish 1157b8e80941Smrg * up with: 1158b8e80941Smrg * 1159b8e80941Smrg * (bits == 0) ? 0 : e; 1160b8e80941Smrg */ 1161b8e80941Smrg ir->operation = ir_triop_csel; 1162b8e80941Smrg ir->init_num_operands(); 1163b8e80941Smrg ir->operands[0] = equal(c0, bits); 1164b8e80941Smrg ir->operands[1] = c0->clone(ir, NULL); 1165b8e80941Smrg ir->operands[2] = expr; 1166b8e80941Smrg } 1167b8e80941Smrg 1168b8e80941Smrg this->progress = true; 1169b8e80941Smrg} 1170b8e80941Smrg 1171b8e80941Smrgvoid 1172b8e80941Smrglower_instructions_visitor::insert_to_shifts(ir_expression *ir) 1173b8e80941Smrg{ 1174b8e80941Smrg ir_constant *c1; 1175b8e80941Smrg ir_constant *c32; 1176b8e80941Smrg ir_constant *cFFFFFFFF; 1177b8e80941Smrg ir_variable *offset = 1178b8e80941Smrg new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 1179b8e80941Smrg ir_variable *bits = 1180b8e80941Smrg new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 1181b8e80941Smrg ir_variable *mask = 1182b8e80941Smrg new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 1183b8e80941Smrg 1184b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1185b8e80941Smrg c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 1186b8e80941Smrg c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 1187b8e80941Smrg cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 1188b8e80941Smrg } else { 1189b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1190b8e80941Smrg 1191b8e80941Smrg c1 = new(ir) ir_constant(1u, 
ir->operands[0]->type->vector_elements); 1192b8e80941Smrg c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 1193b8e80941Smrg cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 1194b8e80941Smrg } 1195b8e80941Smrg 1196b8e80941Smrg base_ir->insert_before(offset); 1197b8e80941Smrg base_ir->insert_before(assign(offset, ir->operands[2])); 1198b8e80941Smrg 1199b8e80941Smrg base_ir->insert_before(bits); 1200b8e80941Smrg base_ir->insert_before(assign(bits, ir->operands[3])); 1201b8e80941Smrg 1202b8e80941Smrg /* At least some hardware treats (x << y) as (x << (y%32)). This means 1203b8e80941Smrg * we'd get a mask of 0 when bits is 32. Special case it. 1204b8e80941Smrg * 1205b8e80941Smrg * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset; 1206b8e80941Smrg * 1207b8e80941Smrg * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1208b8e80941Smrg * 1209b8e80941Smrg * The result will be undefined if offset or bits is negative, or if the 1210b8e80941Smrg * sum of offset and bits is greater than the number of bits used to 1211b8e80941Smrg * store the operand. 1212b8e80941Smrg * 1213b8e80941Smrg * Since it's undefined, there are a couple other ways this could be 1214b8e80941Smrg * implemented. The other way that was considered was to put the csel 1215b8e80941Smrg * around the whole thing: 1216b8e80941Smrg * 1217b8e80941Smrg * final_result = bits == 32 ? insert : ... 
; 1218b8e80941Smrg */ 1219b8e80941Smrg base_ir->insert_before(mask); 1220b8e80941Smrg 1221b8e80941Smrg base_ir->insert_before(assign(mask, csel(equal(bits, c32), 1222b8e80941Smrg cFFFFFFFF, 1223b8e80941Smrg lshift(sub(lshift(c1, bits), 1224b8e80941Smrg c1->clone(ir, NULL)), 1225b8e80941Smrg offset)))); 1226b8e80941Smrg 1227b8e80941Smrg /* (base & ~mask) | ((insert << offset) & mask) */ 1228b8e80941Smrg ir->operation = ir_binop_bit_or; 1229b8e80941Smrg ir->init_num_operands(); 1230b8e80941Smrg ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 1231b8e80941Smrg ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 1232b8e80941Smrg ir->operands[2] = NULL; 1233b8e80941Smrg ir->operands[3] = NULL; 1234b8e80941Smrg 1235b8e80941Smrg this->progress = true; 1236b8e80941Smrg} 1237b8e80941Smrg 1238b8e80941Smrgvoid 1239b8e80941Smrglower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 1240b8e80941Smrg{ 1241b8e80941Smrg /* For more details, see: 1242b8e80941Smrg * 1243b8e80941Smrg * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 1244b8e80941Smrg */ 1245b8e80941Smrg ir_constant *c1 = 1246b8e80941Smrg new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 1247b8e80941Smrg ir_constant *c2 = 1248b8e80941Smrg new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 1249b8e80941Smrg ir_constant *c4 = 1250b8e80941Smrg new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 1251b8e80941Smrg ir_constant *c8 = 1252b8e80941Smrg new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 1253b8e80941Smrg ir_constant *c16 = 1254b8e80941Smrg new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 1255b8e80941Smrg ir_constant *c33333333 = 1256b8e80941Smrg new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 1257b8e80941Smrg ir_constant *c55555555 = 1258b8e80941Smrg new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements); 1259b8e80941Smrg ir_constant *c0F0F0F0F = 
1260b8e80941Smrg new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 1261b8e80941Smrg ir_constant *c00FF00FF = 1262b8e80941Smrg new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 1263b8e80941Smrg ir_variable *temp = 1264b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 1265b8e80941Smrg "temp", ir_var_temporary); 1266b8e80941Smrg ir_instruction &i = *base_ir; 1267b8e80941Smrg 1268b8e80941Smrg i.insert_before(temp); 1269b8e80941Smrg 1270b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1271b8e80941Smrg i.insert_before(assign(temp, ir->operands[0])); 1272b8e80941Smrg } else { 1273b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1274b8e80941Smrg i.insert_before(assign(temp, i2u(ir->operands[0]))); 1275b8e80941Smrg } 1276b8e80941Smrg 1277b8e80941Smrg /* Swap odd and even bits. 1278b8e80941Smrg * 1279b8e80941Smrg * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 1280b8e80941Smrg */ 1281b8e80941Smrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 1282b8e80941Smrg lshift(bit_and(temp, c55555555->clone(ir, NULL)), 1283b8e80941Smrg c1->clone(ir, NULL))))); 1284b8e80941Smrg /* Swap consecutive pairs. 1285b8e80941Smrg * 1286b8e80941Smrg * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 1287b8e80941Smrg */ 1288b8e80941Smrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 1289b8e80941Smrg lshift(bit_and(temp, c33333333->clone(ir, NULL)), 1290b8e80941Smrg c2->clone(ir, NULL))))); 1291b8e80941Smrg 1292b8e80941Smrg /* Swap nibbles. 
1293b8e80941Smrg * 1294b8e80941Smrg * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 1295b8e80941Smrg */ 1296b8e80941Smrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 1297b8e80941Smrg lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 1298b8e80941Smrg c4->clone(ir, NULL))))); 1299b8e80941Smrg 1300b8e80941Smrg /* The last step is, basically, bswap. Swap the bytes, then swap the 1301b8e80941Smrg * words. When this code is run through GCC on x86, it does generate a 1302b8e80941Smrg * bswap instruction. 1303b8e80941Smrg * 1304b8e80941Smrg * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 1305b8e80941Smrg * temp = ( temp >> 16 ) | ( temp << 16); 1306b8e80941Smrg */ 1307b8e80941Smrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 1308b8e80941Smrg lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 1309b8e80941Smrg c8->clone(ir, NULL))))); 1310b8e80941Smrg 1311b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1312b8e80941Smrg ir->operation = ir_binop_bit_or; 1313b8e80941Smrg ir->init_num_operands(); 1314b8e80941Smrg ir->operands[0] = rshift(temp, c16); 1315b8e80941Smrg ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 1316b8e80941Smrg } else { 1317b8e80941Smrg ir->operation = ir_unop_u2i; 1318b8e80941Smrg ir->init_num_operands(); 1319b8e80941Smrg ir->operands[0] = bit_or(rshift(temp, c16), 1320b8e80941Smrg lshift(temp, c16->clone(ir, NULL))); 1321b8e80941Smrg } 1322b8e80941Smrg 1323b8e80941Smrg this->progress = true; 1324b8e80941Smrg} 1325b8e80941Smrg 1326b8e80941Smrgvoid 1327b8e80941Smrglower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 1328b8e80941Smrg{ 1329b8e80941Smrg /* For more details, see: 1330b8e80941Smrg * 1331b8e80941Smrg * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1332b8e80941Smrg */ 1333b8e80941Smrg const unsigned elements = ir->operands[0]->type->vector_elements; 1334b8e80941Smrg ir_constant *c0 
= new(ir) ir_constant(unsigned(0), elements); 1335b8e80941Smrg ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1336b8e80941Smrg ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1337b8e80941Smrg ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1338b8e80941Smrg ir_variable *temp = 1339b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 1340b8e80941Smrg ir_variable *lsb_only = 1341b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 1342b8e80941Smrg ir_variable *as_float = 1343b8e80941Smrg new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1344b8e80941Smrg ir_variable *lsb = 1345b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 1346b8e80941Smrg 1347b8e80941Smrg ir_instruction &i = *base_ir; 1348b8e80941Smrg 1349b8e80941Smrg i.insert_before(temp); 1350b8e80941Smrg 1351b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 1352b8e80941Smrg i.insert_before(assign(temp, ir->operands[0])); 1353b8e80941Smrg } else { 1354b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1355b8e80941Smrg i.insert_before(assign(temp, u2i(ir->operands[0]))); 1356b8e80941Smrg } 1357b8e80941Smrg 1358b8e80941Smrg /* The int-to-float conversion is lossless because (value & -value) is 1359b8e80941Smrg * either a power of two or zero. We don't use the result in the zero 1360b8e80941Smrg * case. The uint() cast is necessary so that 0x80000000 does not 1361b8e80941Smrg * generate a negative value. 
1362b8e80941Smrg * 1363b8e80941Smrg * uint lsb_only = uint(value & -value); 1364b8e80941Smrg * float as_float = float(lsb_only); 1365b8e80941Smrg */ 1366b8e80941Smrg i.insert_before(lsb_only); 1367b8e80941Smrg i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 1368b8e80941Smrg 1369b8e80941Smrg i.insert_before(as_float); 1370b8e80941Smrg i.insert_before(assign(as_float, u2f(lsb_only))); 1371b8e80941Smrg 1372b8e80941Smrg /* This is basically an open-coded frexp. Implementations that have a 1373b8e80941Smrg * native frexp instruction would be better served by that. This is 1374b8e80941Smrg * optimized versus a full-featured open-coded implementation in two ways: 1375b8e80941Smrg * 1376b8e80941Smrg * - We don't care about a correct result from subnormal numbers (including 1377b8e80941Smrg * 0.0), so the raw exponent can always be safely unbiased. 1378b8e80941Smrg * 1379b8e80941Smrg * - The value cannot be negative, so it does not need to be masked off to 1380b8e80941Smrg * extract the exponent. 1381b8e80941Smrg * 1382b8e80941Smrg * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1383b8e80941Smrg */ 1384b8e80941Smrg i.insert_before(lsb); 1385b8e80941Smrg i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1386b8e80941Smrg 1387b8e80941Smrg /* Use lsb_only in the comparison instead of temp so that the & (far above) 1388b8e80941Smrg * can possibly generate the result without an explicit comparison. 1389b8e80941Smrg * 1390b8e80941Smrg * (lsb_only == 0) ? -1 : lsb; 1391b8e80941Smrg * 1392b8e80941Smrg * Since our input values are all integers, the unbiased exponent must not 1393b8e80941Smrg * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 1394b8e80941Smrg * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 1395b8e80941Smrg * better is likely GPU dependent. Either way, the difference should be 1396b8e80941Smrg * small. 
1397b8e80941Smrg */ 1398b8e80941Smrg ir->operation = ir_triop_csel; 1399b8e80941Smrg ir->init_num_operands(); 1400b8e80941Smrg ir->operands[0] = equal(lsb_only, c0); 1401b8e80941Smrg ir->operands[1] = cminus1; 1402b8e80941Smrg ir->operands[2] = new(ir) ir_dereference_variable(lsb); 1403b8e80941Smrg 1404b8e80941Smrg this->progress = true; 1405b8e80941Smrg} 1406b8e80941Smrg 1407b8e80941Smrgvoid 1408b8e80941Smrglower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 1409b8e80941Smrg{ 1410b8e80941Smrg /* For more details, see: 1411b8e80941Smrg * 1412b8e80941Smrg * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 1413b8e80941Smrg */ 1414b8e80941Smrg const unsigned elements = ir->operands[0]->type->vector_elements; 1415b8e80941Smrg ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1416b8e80941Smrg ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 1417b8e80941Smrg ir_constant *c23 = new(ir) ir_constant(int(23), elements); 1418b8e80941Smrg ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 1419b8e80941Smrg ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 1420b8e80941Smrg ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 1421b8e80941Smrg ir_variable *temp = 1422b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 1423b8e80941Smrg ir_variable *as_float = 1424b8e80941Smrg new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 1425b8e80941Smrg ir_variable *msb = 1426b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 1427b8e80941Smrg 1428b8e80941Smrg ir_instruction &i = *base_ir; 1429b8e80941Smrg 1430b8e80941Smrg i.insert_before(temp); 1431b8e80941Smrg 1432b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1433b8e80941Smrg i.insert_before(assign(temp, ir->operands[0])); 1434b8e80941Smrg } else { 1435b8e80941Smrg assert(ir->operands[0]->type->base_type == 
GLSL_TYPE_INT); 1436b8e80941Smrg 1437b8e80941Smrg /* findMSB(uint(abs(some_int))) almost always does the right thing. 1438b8e80941Smrg * There are two problem values: 1439b8e80941Smrg * 1440b8e80941Smrg * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 1441b8e80941Smrg * 31. However, findMSB(int(0x80000000)) == 30. 1442b8e80941Smrg * 1443b8e80941Smrg * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 1444b8e80941Smrg * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 1445b8e80941Smrg * 1446b8e80941Smrg * For a value of zero or negative one, -1 will be returned. 1447b8e80941Smrg * 1448b8e80941Smrg * For all negative number cases, including 0x80000000 and 0xffffffff, 1449b8e80941Smrg * the correct value is obtained from findMSB if instead of negating the 1450b8e80941Smrg * (already negative) value the logical-not is used. A conditonal 1451b8e80941Smrg * logical-not can be achieved in two instructions. 1452b8e80941Smrg */ 1453b8e80941Smrg ir_variable *as_int = 1454b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 1455b8e80941Smrg ir_constant *c31 = new(ir) ir_constant(int(31), elements); 1456b8e80941Smrg 1457b8e80941Smrg i.insert_before(as_int); 1458b8e80941Smrg i.insert_before(assign(as_int, ir->operands[0])); 1459b8e80941Smrg i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 1460b8e80941Smrg as_int, 1461b8e80941Smrg rshift(as_int, c31))))); 1462b8e80941Smrg } 1463b8e80941Smrg 1464b8e80941Smrg /* The int-to-float conversion is lossless because bits are conditionally 1465b8e80941Smrg * masked off the bottom of temp to ensure the value has at most 24 bits of 1466b8e80941Smrg * data or is zero. We don't use the result in the zero case. The uint() 1467b8e80941Smrg * cast is necessary so that 0x80000000 does not generate a negative value. 1468b8e80941Smrg * 1469b8e80941Smrg * float as_float = float(temp > 255 ? 
temp & ~255 : temp); 1470b8e80941Smrg */ 1471b8e80941Smrg i.insert_before(as_float); 1472b8e80941Smrg i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 1473b8e80941Smrg bit_and(temp, cFFFFFF00), 1474b8e80941Smrg temp)))); 1475b8e80941Smrg 1476b8e80941Smrg /* This is basically an open-coded frexp. Implementations that have a 1477b8e80941Smrg * native frexp instruction would be better served by that. This is 1478b8e80941Smrg * optimized versus a full-featured open-coded implementation in two ways: 1479b8e80941Smrg * 1480b8e80941Smrg * - We don't care about a correct result from subnormal numbers (including 1481b8e80941Smrg * 0.0), so the raw exponent can always be safely unbiased. 1482b8e80941Smrg * 1483b8e80941Smrg * - The value cannot be negative, so it does not need to be masked off to 1484b8e80941Smrg * extract the exponent. 1485b8e80941Smrg * 1486b8e80941Smrg * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 1487b8e80941Smrg */ 1488b8e80941Smrg i.insert_before(msb); 1489b8e80941Smrg i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 1490b8e80941Smrg 1491b8e80941Smrg /* Use msb in the comparison instead of temp so that the subtract can 1492b8e80941Smrg * possibly generate the result without an explicit comparison. 1493b8e80941Smrg * 1494b8e80941Smrg * (msb < 0) ? -1 : msb; 1495b8e80941Smrg * 1496b8e80941Smrg * Since our input values are all integers, the unbiased exponent must not 1497b8e80941Smrg * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 
1498b8e80941Smrg */ 1499b8e80941Smrg ir->operation = ir_triop_csel; 1500b8e80941Smrg ir->init_num_operands(); 1501b8e80941Smrg ir->operands[0] = less(msb, c0); 1502b8e80941Smrg ir->operands[1] = cminus1; 1503b8e80941Smrg ir->operands[2] = new(ir) ir_dereference_variable(msb); 1504b8e80941Smrg 1505b8e80941Smrg this->progress = true; 1506b8e80941Smrg} 1507b8e80941Smrg 1508b8e80941Smrgir_expression * 1509b8e80941Smrglower_instructions_visitor::_carry(operand a, operand b) 1510b8e80941Smrg{ 1511b8e80941Smrg if (lowering(CARRY_TO_ARITH)) 1512b8e80941Smrg return i2u(b2i(less(add(a, b), 1513b8e80941Smrg a.val->clone(ralloc_parent(a.val), NULL)))); 1514b8e80941Smrg else 1515b8e80941Smrg return carry(a, b); 1516b8e80941Smrg} 1517b8e80941Smrg 1518b8e80941Smrgvoid 1519b8e80941Smrglower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 1520b8e80941Smrg{ 1521b8e80941Smrg /* ABCD 1522b8e80941Smrg * * EFGH 1523b8e80941Smrg * ====== 1524b8e80941Smrg * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 1525b8e80941Smrg * 1526b8e80941Smrg * In GLSL, (a * b) becomes 1527b8e80941Smrg * 1528b8e80941Smrg * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 1529b8e80941Smrg * uint m2 = (a & 0x0000ffffu) * (b >> 16); 1530b8e80941Smrg * uint m3 = (a >> 16) * (b & 0x0000ffffu); 1531b8e80941Smrg * uint m4 = (a >> 16) * (b >> 16); 1532b8e80941Smrg * 1533b8e80941Smrg * uint c1; 1534b8e80941Smrg * uint c2; 1535b8e80941Smrg * uint lo_result; 1536b8e80941Smrg * uint hi_result; 1537b8e80941Smrg * 1538b8e80941Smrg * lo_result = uaddCarry(m1, m2 << 16, c1); 1539b8e80941Smrg * hi_result = m4 + c1; 1540b8e80941Smrg * lo_result = uaddCarry(lo_result, m3 << 16, c2); 1541b8e80941Smrg * hi_result = hi_result + c2; 1542b8e80941Smrg * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 1543b8e80941Smrg */ 1544b8e80941Smrg const unsigned elements = ir->operands[0]->type->vector_elements; 1545b8e80941Smrg ir_variable *src1 = 1546b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), 
"src1", ir_var_temporary); 1547b8e80941Smrg ir_variable *src1h = 1548b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 1549b8e80941Smrg ir_variable *src1l = 1550b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 1551b8e80941Smrg ir_variable *src2 = 1552b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 1553b8e80941Smrg ir_variable *src2h = 1554b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 1555b8e80941Smrg ir_variable *src2l = 1556b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 1557b8e80941Smrg ir_variable *t1 = 1558b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 1559b8e80941Smrg ir_variable *t2 = 1560b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 1561b8e80941Smrg ir_variable *lo = 1562b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 1563b8e80941Smrg ir_variable *hi = 1564b8e80941Smrg new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary); 1565b8e80941Smrg ir_variable *different_signs = NULL; 1566b8e80941Smrg ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 1567b8e80941Smrg ir_constant *c16 = new(ir) ir_constant(16u, elements); 1568b8e80941Smrg 1569b8e80941Smrg ir_instruction &i = *base_ir; 1570b8e80941Smrg 1571b8e80941Smrg i.insert_before(src1); 1572b8e80941Smrg i.insert_before(src2); 1573b8e80941Smrg i.insert_before(src1h); 1574b8e80941Smrg i.insert_before(src2h); 1575b8e80941Smrg i.insert_before(src1l); 1576b8e80941Smrg i.insert_before(src2l); 1577b8e80941Smrg 1578b8e80941Smrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 1579b8e80941Smrg i.insert_before(assign(src1, ir->operands[0])); 1580b8e80941Smrg i.insert_before(assign(src2, ir->operands[1])); 1581b8e80941Smrg } else { 1582b8e80941Smrg 
assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1583b8e80941Smrg 1584b8e80941Smrg ir_variable *itmp1 = 1585b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 1586b8e80941Smrg ir_variable *itmp2 = 1587b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 1588b8e80941Smrg ir_constant *c0 = new(ir) ir_constant(int(0), elements); 1589b8e80941Smrg 1590b8e80941Smrg i.insert_before(itmp1); 1591b8e80941Smrg i.insert_before(itmp2); 1592b8e80941Smrg i.insert_before(assign(itmp1, ir->operands[0])); 1593b8e80941Smrg i.insert_before(assign(itmp2, ir->operands[1])); 1594b8e80941Smrg 1595b8e80941Smrg different_signs = 1596b8e80941Smrg new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 1597b8e80941Smrg ir_var_temporary); 1598b8e80941Smrg 1599b8e80941Smrg i.insert_before(different_signs); 1600b8e80941Smrg i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 1601b8e80941Smrg less(itmp1, c0), 1602b8e80941Smrg less(itmp2, c0->clone(ir, NULL))))); 1603b8e80941Smrg 1604b8e80941Smrg i.insert_before(assign(src1, i2u(abs(itmp1)))); 1605b8e80941Smrg i.insert_before(assign(src2, i2u(abs(itmp2)))); 1606b8e80941Smrg } 1607b8e80941Smrg 1608b8e80941Smrg i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 1609b8e80941Smrg i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 1610b8e80941Smrg i.insert_before(assign(src1h, rshift(src1, c16))); 1611b8e80941Smrg i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 1612b8e80941Smrg 1613b8e80941Smrg i.insert_before(lo); 1614b8e80941Smrg i.insert_before(hi); 1615b8e80941Smrg i.insert_before(t1); 1616b8e80941Smrg i.insert_before(t2); 1617b8e80941Smrg 1618b8e80941Smrg i.insert_before(assign(lo, mul(src1l, src2l))); 1619b8e80941Smrg i.insert_before(assign(t1, mul(src1l, src2h))); 1620b8e80941Smrg i.insert_before(assign(t2, mul(src1h, src2l))); 1621b8e80941Smrg i.insert_before(assign(hi, mul(src1h, src2h))); 
1622b8e80941Smrg 1623b8e80941Smrg i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 1624b8e80941Smrg i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 1625b8e80941Smrg 1626b8e80941Smrg i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 1627b8e80941Smrg i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 1628b8e80941Smrg 1629b8e80941Smrg if (different_signs == NULL) { 1630b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 1631b8e80941Smrg 1632b8e80941Smrg ir->operation = ir_binop_add; 1633b8e80941Smrg ir->init_num_operands(); 1634b8e80941Smrg ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 1635b8e80941Smrg ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 1636b8e80941Smrg } else { 1637b8e80941Smrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 1638b8e80941Smrg 1639b8e80941Smrg i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 1640b8e80941Smrg rshift(t2, c16->clone(ir, NULL))))); 1641b8e80941Smrg 1642b8e80941Smrg /* For channels where different_signs is set we have to perform a 64-bit 1643b8e80941Smrg * negation. This is *not* the same as just negating the high 32-bits. 1644b8e80941Smrg * Consider -3 * 2. The high 32-bits is 0, but the desired result is 1645b8e80941Smrg * -1, not -0! Recall -x == ~x + 1. 
1646b8e80941Smrg */ 1647b8e80941Smrg ir_variable *neg_hi = 1648b8e80941Smrg new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 1649b8e80941Smrg ir_constant *c1 = new(ir) ir_constant(1u, elements); 1650b8e80941Smrg 1651b8e80941Smrg i.insert_before(neg_hi); 1652b8e80941Smrg i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 1653b8e80941Smrg u2i(_carry(bit_not(lo), c1))))); 1654b8e80941Smrg 1655b8e80941Smrg ir->operation = ir_triop_csel; 1656b8e80941Smrg ir->init_num_operands(); 1657b8e80941Smrg ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 1658b8e80941Smrg ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 1659b8e80941Smrg ir->operands[2] = u2i(hi); 1660b8e80941Smrg } 1661b8e80941Smrg} 1662b8e80941Smrg 1663b8e80941Smrgvoid 1664b8e80941Smrglower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 1665b8e80941Smrg{ 1666b8e80941Smrg ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 1667b8e80941Smrg this->progress = true; 1668b8e80941Smrg} 1669b8e80941Smrg 1670b8e80941Smrgvoid 1671b8e80941Smrglower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir) 1672b8e80941Smrg{ 1673b8e80941Smrg /* Lower 32x32-> 64 to 1674b8e80941Smrg * msb = imul_high(x_lo, y_lo) 1675b8e80941Smrg * lsb = mul(x_lo, y_lo) 1676b8e80941Smrg */ 1677b8e80941Smrg const unsigned elements = ir->operands[0]->type->vector_elements; 1678b8e80941Smrg 1679b8e80941Smrg const ir_expression_operation operation = 1680b8e80941Smrg ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32 1681b8e80941Smrg : ir_unop_pack_int_2x32; 1682b8e80941Smrg 1683b8e80941Smrg const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64 1684b8e80941Smrg ? glsl_type::uvec(elements) 1685b8e80941Smrg : glsl_type::ivec(elements); 1686b8e80941Smrg 1687b8e80941Smrg const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64 1688b8e80941Smrg ? 
glsl_type::uvec2_type 1689b8e80941Smrg : glsl_type::ivec2_type; 1690b8e80941Smrg 1691b8e80941Smrg ir_instruction &i = *base_ir; 1692b8e80941Smrg 1693b8e80941Smrg ir_variable *msb = 1694b8e80941Smrg new(ir) ir_variable(var_type, "msb", ir_var_temporary); 1695b8e80941Smrg ir_variable *lsb = 1696b8e80941Smrg new(ir) ir_variable(var_type, "lsb", ir_var_temporary); 1697b8e80941Smrg ir_variable *x = 1698b8e80941Smrg new(ir) ir_variable(var_type, "x", ir_var_temporary); 1699b8e80941Smrg ir_variable *y = 1700b8e80941Smrg new(ir) ir_variable(var_type, "y", ir_var_temporary); 1701b8e80941Smrg 1702b8e80941Smrg i.insert_before(x); 1703b8e80941Smrg i.insert_before(assign(x, ir->operands[0])); 1704b8e80941Smrg i.insert_before(y); 1705b8e80941Smrg i.insert_before(assign(y, ir->operands[1])); 1706b8e80941Smrg i.insert_before(msb); 1707b8e80941Smrg i.insert_before(lsb); 1708b8e80941Smrg 1709b8e80941Smrg i.insert_before(assign(msb, imul_high(x, y))); 1710b8e80941Smrg i.insert_before(assign(lsb, mul(x, y))); 1711b8e80941Smrg 1712b8e80941Smrg ir_rvalue *result[4] = {NULL}; 1713b8e80941Smrg for (unsigned elem = 0; elem < elements; elem++) { 1714b8e80941Smrg ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type, 1715b8e80941Smrg swizzle(lsb, elem, 1), 1716b8e80941Smrg swizzle(msb, elem, 1), NULL, NULL); 1717b8e80941Smrg result[elem] = expr(operation, val); 1718b8e80941Smrg } 1719b8e80941Smrg 1720b8e80941Smrg ir->operation = ir_quadop_vector; 1721b8e80941Smrg ir->init_num_operands(); 1722b8e80941Smrg ir->operands[0] = result[0]; 1723b8e80941Smrg ir->operands[1] = result[1]; 1724b8e80941Smrg ir->operands[2] = result[2]; 1725b8e80941Smrg ir->operands[3] = result[3]; 1726b8e80941Smrg 1727b8e80941Smrg this->progress = true; 1728b8e80941Smrg} 1729b8e80941Smrg 1730b8e80941Smrgir_visitor_status 1731b8e80941Smrglower_instructions_visitor::visit_leave(ir_expression *ir) 1732b8e80941Smrg{ 1733b8e80941Smrg switch (ir->operation) { 1734b8e80941Smrg case ir_binop_dot: 1735b8e80941Smrg if 
(ir->operands[0]->type->is_double()) 1736b8e80941Smrg double_dot_to_fma(ir); 1737b8e80941Smrg break; 1738b8e80941Smrg case ir_triop_lrp: 1739b8e80941Smrg if (ir->operands[0]->type->is_double()) 1740b8e80941Smrg double_lrp(ir); 1741b8e80941Smrg break; 1742b8e80941Smrg case ir_binop_sub: 1743b8e80941Smrg if (lowering(SUB_TO_ADD_NEG)) 1744b8e80941Smrg sub_to_add_neg(ir); 1745b8e80941Smrg break; 1746b8e80941Smrg 1747b8e80941Smrg case ir_binop_div: 1748b8e80941Smrg if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP)) 1749b8e80941Smrg int_div_to_mul_rcp(ir); 1750b8e80941Smrg else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) || 1751b8e80941Smrg (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP))) 1752b8e80941Smrg div_to_mul_rcp(ir); 1753b8e80941Smrg break; 1754b8e80941Smrg 1755b8e80941Smrg case ir_unop_exp: 1756b8e80941Smrg if (lowering(EXP_TO_EXP2)) 1757b8e80941Smrg exp_to_exp2(ir); 1758b8e80941Smrg break; 1759b8e80941Smrg 1760b8e80941Smrg case ir_unop_log: 1761b8e80941Smrg if (lowering(LOG_TO_LOG2)) 1762b8e80941Smrg log_to_log2(ir); 1763b8e80941Smrg break; 1764b8e80941Smrg 1765b8e80941Smrg case ir_binop_mod: 1766b8e80941Smrg if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double())) 1767b8e80941Smrg mod_to_floor(ir); 1768b8e80941Smrg break; 1769b8e80941Smrg 1770b8e80941Smrg case ir_binop_pow: 1771b8e80941Smrg if (lowering(POW_TO_EXP2)) 1772b8e80941Smrg pow_to_exp2(ir); 1773b8e80941Smrg break; 1774b8e80941Smrg 1775b8e80941Smrg case ir_binop_ldexp: 1776b8e80941Smrg if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 1777b8e80941Smrg ldexp_to_arith(ir); 1778b8e80941Smrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 1779b8e80941Smrg dldexp_to_arith(ir); 1780b8e80941Smrg break; 1781b8e80941Smrg 1782b8e80941Smrg case ir_unop_frexp_exp: 1783b8e80941Smrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1784b8e80941Smrg dfrexp_exp_to_arith(ir); 
1785b8e80941Smrg break; 1786b8e80941Smrg 1787b8e80941Smrg case ir_unop_frexp_sig: 1788b8e80941Smrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 1789b8e80941Smrg dfrexp_sig_to_arith(ir); 1790b8e80941Smrg break; 1791b8e80941Smrg 1792b8e80941Smrg case ir_binop_carry: 1793b8e80941Smrg if (lowering(CARRY_TO_ARITH)) 1794b8e80941Smrg carry_to_arith(ir); 1795b8e80941Smrg break; 1796b8e80941Smrg 1797b8e80941Smrg case ir_binop_borrow: 1798b8e80941Smrg if (lowering(BORROW_TO_ARITH)) 1799b8e80941Smrg borrow_to_arith(ir); 1800b8e80941Smrg break; 1801b8e80941Smrg 1802b8e80941Smrg case ir_unop_saturate: 1803b8e80941Smrg if (lowering(SAT_TO_CLAMP)) 1804b8e80941Smrg sat_to_clamp(ir); 1805b8e80941Smrg break; 1806b8e80941Smrg 1807b8e80941Smrg case ir_unop_trunc: 1808b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1809b8e80941Smrg dtrunc_to_dfrac(ir); 1810b8e80941Smrg break; 1811b8e80941Smrg 1812b8e80941Smrg case ir_unop_ceil: 1813b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1814b8e80941Smrg dceil_to_dfrac(ir); 1815b8e80941Smrg break; 1816b8e80941Smrg 1817b8e80941Smrg case ir_unop_floor: 1818b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1819b8e80941Smrg dfloor_to_dfrac(ir); 1820b8e80941Smrg break; 1821b8e80941Smrg 1822b8e80941Smrg case ir_unop_round_even: 1823b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1824b8e80941Smrg dround_even_to_dfrac(ir); 1825b8e80941Smrg break; 1826b8e80941Smrg 1827b8e80941Smrg case ir_unop_sign: 1828b8e80941Smrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 1829b8e80941Smrg dsign_to_csel(ir); 1830b8e80941Smrg break; 1831b8e80941Smrg 1832b8e80941Smrg case ir_unop_bit_count: 1833b8e80941Smrg if (lowering(BIT_COUNT_TO_MATH)) 1834b8e80941Smrg bit_count_to_math(ir); 1835b8e80941Smrg break; 1836b8e80941Smrg 1837b8e80941Smrg case ir_triop_bitfield_extract: 1838b8e80941Smrg if (lowering(EXTRACT_TO_SHIFTS)) 1839b8e80941Smrg extract_to_shifts(ir); 
1840b8e80941Smrg break; 1841b8e80941Smrg 1842b8e80941Smrg case ir_quadop_bitfield_insert: 1843b8e80941Smrg if (lowering(INSERT_TO_SHIFTS)) 1844b8e80941Smrg insert_to_shifts(ir); 1845b8e80941Smrg break; 1846b8e80941Smrg 1847b8e80941Smrg case ir_unop_bitfield_reverse: 1848b8e80941Smrg if (lowering(REVERSE_TO_SHIFTS)) 1849b8e80941Smrg reverse_to_shifts(ir); 1850b8e80941Smrg break; 1851b8e80941Smrg 1852b8e80941Smrg case ir_unop_find_lsb: 1853b8e80941Smrg if (lowering(FIND_LSB_TO_FLOAT_CAST)) 1854b8e80941Smrg find_lsb_to_float_cast(ir); 1855b8e80941Smrg break; 1856b8e80941Smrg 1857b8e80941Smrg case ir_unop_find_msb: 1858b8e80941Smrg if (lowering(FIND_MSB_TO_FLOAT_CAST)) 1859b8e80941Smrg find_msb_to_float_cast(ir); 1860b8e80941Smrg break; 1861b8e80941Smrg 1862b8e80941Smrg case ir_binop_imul_high: 1863b8e80941Smrg if (lowering(IMUL_HIGH_TO_MUL)) 1864b8e80941Smrg imul_high_to_mul(ir); 1865b8e80941Smrg break; 1866b8e80941Smrg 1867b8e80941Smrg case ir_binop_mul: 1868b8e80941Smrg if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) && 1869b8e80941Smrg (ir->type->base_type == GLSL_TYPE_INT64 || 1870b8e80941Smrg ir->type->base_type == GLSL_TYPE_UINT64) && 1871b8e80941Smrg (ir->operands[0]->type->base_type == GLSL_TYPE_INT || 1872b8e80941Smrg ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) 1873b8e80941Smrg mul64_to_mul_and_mul_high(ir); 1874b8e80941Smrg break; 1875b8e80941Smrg 1876b8e80941Smrg case ir_unop_rsq: 1877b8e80941Smrg case ir_unop_sqrt: 1878b8e80941Smrg if (lowering(SQRT_TO_ABS_SQRT)) 1879b8e80941Smrg sqrt_to_abs_sqrt(ir); 1880b8e80941Smrg break; 1881b8e80941Smrg 1882b8e80941Smrg default: 1883b8e80941Smrg return visit_continue; 1884b8e80941Smrg } 1885b8e80941Smrg 1886b8e80941Smrg return visit_continue; 1887b8e80941Smrg} 1888