101e04c3fSmrg/* 201e04c3fSmrg * Copyright © 2010 Intel Corporation 301e04c3fSmrg * 401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a 501e04c3fSmrg * copy of this software and associated documentation files (the "Software"), 601e04c3fSmrg * to deal in the Software without restriction, including without limitation 701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense, 801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the 901e04c3fSmrg * Software is furnished to do so, subject to the following conditions: 1001e04c3fSmrg * 1101e04c3fSmrg * The above copyright notice and this permission notice (including the next 1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the 1301e04c3fSmrg * Software. 1401e04c3fSmrg * 1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 2101e04c3fSmrg * DEALINGS IN THE SOFTWARE. 2201e04c3fSmrg */ 2301e04c3fSmrg 2401e04c3fSmrg/** 2501e04c3fSmrg * \file lower_instructions.cpp 2601e04c3fSmrg * 2701e04c3fSmrg * Many GPUs lack native instructions for certain expression operations, and 2801e04c3fSmrg * must replace them with some other expression tree. This pass lowers some 2901e04c3fSmrg * of the most common cases, allowing the lowering code to be implemented once 3001e04c3fSmrg * rather than in each driver backend. 
3101e04c3fSmrg * 3201e04c3fSmrg * Currently supported transformations: 3301e04c3fSmrg * - SUB_TO_ADD_NEG 3401e04c3fSmrg * - DIV_TO_MUL_RCP 3501e04c3fSmrg * - INT_DIV_TO_MUL_RCP 3601e04c3fSmrg * - EXP_TO_EXP2 3701e04c3fSmrg * - POW_TO_EXP2 3801e04c3fSmrg * - LOG_TO_LOG2 3901e04c3fSmrg * - MOD_TO_FLOOR 4001e04c3fSmrg * - LDEXP_TO_ARITH 4101e04c3fSmrg * - DFREXP_TO_ARITH 4201e04c3fSmrg * - CARRY_TO_ARITH 4301e04c3fSmrg * - BORROW_TO_ARITH 4401e04c3fSmrg * - SAT_TO_CLAMP 4501e04c3fSmrg * - DOPS_TO_DFRAC 4601e04c3fSmrg * 4701e04c3fSmrg * SUB_TO_ADD_NEG: 4801e04c3fSmrg * --------------- 4901e04c3fSmrg * Breaks an ir_binop_sub expression down to add(op0, neg(op1)) 5001e04c3fSmrg * 5101e04c3fSmrg * This simplifies expression reassociation, and for many backends 5201e04c3fSmrg * there is no subtract operation separate from adding the negation. 5301e04c3fSmrg * For backends with native subtract operations, they will probably 5401e04c3fSmrg * want to recognize add(op0, neg(op1)) or the other way around to 5501e04c3fSmrg * produce a subtract anyway. 5601e04c3fSmrg * 5701e04c3fSmrg * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP: 5801e04c3fSmrg * --------------------------------------------------------- 5901e04c3fSmrg * Breaks an ir_binop_div expression down to op0 * (rcp(op1)). 6001e04c3fSmrg * 6101e04c3fSmrg * Many GPUs don't have a divide instruction (945 and 965 included), 6201e04c3fSmrg * but they do have an RCP instruction to compute an approximate 6301e04c3fSmrg * reciprocal. By breaking the operation down, constant reciprocals 6401e04c3fSmrg * can get constant folded. 6501e04c3fSmrg * 667ec681f3Smrg * FDIV_TO_MUL_RCP lowers single-precision and half-precision 677ec681f3Smrg * floating point division; 6801e04c3fSmrg * DDIV_TO_MUL_RCP only lowers double-precision floating point division. 6901e04c3fSmrg * DIV_TO_MUL_RCP is a convenience macro that sets both flags. 
7001e04c3fSmrg * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating 7101e04c3fSmrg * point so that RCP is possible. 7201e04c3fSmrg * 7301e04c3fSmrg * EXP_TO_EXP2 and LOG_TO_LOG2: 7401e04c3fSmrg * ---------------------------- 7501e04c3fSmrg * Many GPUs don't have a base e log or exponent instruction, but they 7601e04c3fSmrg * do have base 2 versions, so this pass converts exp and log to exp2 7701e04c3fSmrg * and log2 operations. 7801e04c3fSmrg * 7901e04c3fSmrg * POW_TO_EXP2: 8001e04c3fSmrg * ----------- 8101e04c3fSmrg * Many older GPUs don't have an x**y instruction. For these GPUs, convert 8201e04c3fSmrg * x**y to 2**(y * log2(x)). 8301e04c3fSmrg * 8401e04c3fSmrg * MOD_TO_FLOOR: 8501e04c3fSmrg * ------------- 8601e04c3fSmrg * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1)) 8701e04c3fSmrg * 8801e04c3fSmrg * Many GPUs don't have a MOD instruction (945 and 965 included), and 8901e04c3fSmrg * if we have to break it down like this anyway, it gives an 9001e04c3fSmrg * opportunity to do things like constant fold the (1.0 / op1) easily. 9101e04c3fSmrg * 9201e04c3fSmrg * Note: before we used to implement this as op1 * fract(op / op1) but this 9301e04c3fSmrg * implementation had significant precision errors. 9401e04c3fSmrg * 9501e04c3fSmrg * LDEXP_TO_ARITH: 9601e04c3fSmrg * ------------- 9701e04c3fSmrg * Converts ir_binop_ldexp to arithmetic and bit operations for float sources. 9801e04c3fSmrg * 9901e04c3fSmrg * DFREXP_DLDEXP_TO_ARITH: 10001e04c3fSmrg * --------------- 10101e04c3fSmrg * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to 10201e04c3fSmrg * arithmetic and bit ops for double arguments. 10301e04c3fSmrg * 10401e04c3fSmrg * CARRY_TO_ARITH: 10501e04c3fSmrg * --------------- 10601e04c3fSmrg * Converts ir_carry into (x + y) < x. 10701e04c3fSmrg * 10801e04c3fSmrg * BORROW_TO_ARITH: 10901e04c3fSmrg * ---------------- 11001e04c3fSmrg * Converts ir_borrow into (x < y). 
11101e04c3fSmrg * 11201e04c3fSmrg * SAT_TO_CLAMP: 11301e04c3fSmrg * ------------- 11401e04c3fSmrg * Converts ir_unop_saturate into min(max(x, 0.0), 1.0) 11501e04c3fSmrg * 11601e04c3fSmrg * DOPS_TO_DFRAC: 11701e04c3fSmrg * -------------- 11801e04c3fSmrg * Converts double trunc, ceil, floor, round to fract 11901e04c3fSmrg */ 12001e04c3fSmrg 12101e04c3fSmrg#include "c99_math.h" 12201e04c3fSmrg#include "program/prog_instruction.h" /* for swizzle */ 12301e04c3fSmrg#include "compiler/glsl_types.h" 12401e04c3fSmrg#include "ir.h" 12501e04c3fSmrg#include "ir_builder.h" 12601e04c3fSmrg#include "ir_optimization.h" 1277ec681f3Smrg#include "util/half_float.h" 12801e04c3fSmrg 12901e04c3fSmrgusing namespace ir_builder; 13001e04c3fSmrg 13101e04c3fSmrgnamespace { 13201e04c3fSmrg 13301e04c3fSmrgclass lower_instructions_visitor : public ir_hierarchical_visitor { 13401e04c3fSmrgpublic: 13501e04c3fSmrg lower_instructions_visitor(unsigned lower) 13601e04c3fSmrg : progress(false), lower(lower) { } 13701e04c3fSmrg 13801e04c3fSmrg ir_visitor_status visit_leave(ir_expression *); 13901e04c3fSmrg 14001e04c3fSmrg bool progress; 14101e04c3fSmrg 14201e04c3fSmrgprivate: 14301e04c3fSmrg unsigned lower; /** Bitfield of which operations to lower */ 14401e04c3fSmrg 14501e04c3fSmrg void sub_to_add_neg(ir_expression *); 14601e04c3fSmrg void div_to_mul_rcp(ir_expression *); 14701e04c3fSmrg void int_div_to_mul_rcp(ir_expression *); 14801e04c3fSmrg void mod_to_floor(ir_expression *); 14901e04c3fSmrg void exp_to_exp2(ir_expression *); 15001e04c3fSmrg void pow_to_exp2(ir_expression *); 15101e04c3fSmrg void log_to_log2(ir_expression *); 15201e04c3fSmrg void ldexp_to_arith(ir_expression *); 15301e04c3fSmrg void dldexp_to_arith(ir_expression *); 15401e04c3fSmrg void dfrexp_sig_to_arith(ir_expression *); 15501e04c3fSmrg void dfrexp_exp_to_arith(ir_expression *); 15601e04c3fSmrg void carry_to_arith(ir_expression *); 15701e04c3fSmrg void borrow_to_arith(ir_expression *); 15801e04c3fSmrg void 
sat_to_clamp(ir_expression *); 15901e04c3fSmrg void double_dot_to_fma(ir_expression *); 16001e04c3fSmrg void double_lrp(ir_expression *); 16101e04c3fSmrg void dceil_to_dfrac(ir_expression *); 16201e04c3fSmrg void dfloor_to_dfrac(ir_expression *); 16301e04c3fSmrg void dround_even_to_dfrac(ir_expression *); 16401e04c3fSmrg void dtrunc_to_dfrac(ir_expression *); 16501e04c3fSmrg void dsign_to_csel(ir_expression *); 16601e04c3fSmrg void bit_count_to_math(ir_expression *); 16701e04c3fSmrg void extract_to_shifts(ir_expression *); 16801e04c3fSmrg void insert_to_shifts(ir_expression *); 16901e04c3fSmrg void reverse_to_shifts(ir_expression *ir); 17001e04c3fSmrg void find_lsb_to_float_cast(ir_expression *ir); 17101e04c3fSmrg void find_msb_to_float_cast(ir_expression *ir); 17201e04c3fSmrg void imul_high_to_mul(ir_expression *ir); 17301e04c3fSmrg void sqrt_to_abs_sqrt(ir_expression *ir); 1747e102996Smaya void mul64_to_mul_and_mul_high(ir_expression *ir); 17501e04c3fSmrg 17601e04c3fSmrg ir_expression *_carry(operand a, operand b); 1777ec681f3Smrg 1787ec681f3Smrg static ir_constant *_imm_fp(void *mem_ctx, 1797ec681f3Smrg const glsl_type *type, 1807ec681f3Smrg double f, 1817ec681f3Smrg unsigned vector_elements=1); 18201e04c3fSmrg}; 18301e04c3fSmrg 18401e04c3fSmrg} /* anonymous namespace */ 18501e04c3fSmrg 18601e04c3fSmrg/** 18701e04c3fSmrg * Determine if a particular type of lowering should occur 18801e04c3fSmrg */ 18901e04c3fSmrg#define lowering(x) (this->lower & x) 19001e04c3fSmrg 19101e04c3fSmrgbool 19201e04c3fSmrglower_instructions(exec_list *instructions, unsigned what_to_lower) 19301e04c3fSmrg{ 19401e04c3fSmrg lower_instructions_visitor v(what_to_lower); 19501e04c3fSmrg 19601e04c3fSmrg visit_list_elements(&v, instructions); 19701e04c3fSmrg return v.progress; 19801e04c3fSmrg} 19901e04c3fSmrg 20001e04c3fSmrgvoid 20101e04c3fSmrglower_instructions_visitor::sub_to_add_neg(ir_expression *ir) 20201e04c3fSmrg{ 20301e04c3fSmrg ir->operation = ir_binop_add; 20401e04c3fSmrg 
ir->init_num_operands(); 20501e04c3fSmrg ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type, 20601e04c3fSmrg ir->operands[1], NULL); 20701e04c3fSmrg this->progress = true; 20801e04c3fSmrg} 20901e04c3fSmrg 21001e04c3fSmrgvoid 21101e04c3fSmrglower_instructions_visitor::div_to_mul_rcp(ir_expression *ir) 21201e04c3fSmrg{ 2137ec681f3Smrg assert(ir->operands[1]->type->is_float_16_32_64()); 21401e04c3fSmrg 21501e04c3fSmrg /* New expression for the 1.0 / op1 */ 21601e04c3fSmrg ir_rvalue *expr; 21701e04c3fSmrg expr = new(ir) ir_expression(ir_unop_rcp, 21801e04c3fSmrg ir->operands[1]->type, 21901e04c3fSmrg ir->operands[1]); 22001e04c3fSmrg 22101e04c3fSmrg /* op0 / op1 -> op0 * (1.0 / op1) */ 22201e04c3fSmrg ir->operation = ir_binop_mul; 22301e04c3fSmrg ir->init_num_operands(); 22401e04c3fSmrg ir->operands[1] = expr; 22501e04c3fSmrg 22601e04c3fSmrg this->progress = true; 22701e04c3fSmrg} 22801e04c3fSmrg 22901e04c3fSmrgvoid 23001e04c3fSmrglower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir) 23101e04c3fSmrg{ 2327ec681f3Smrg assert(ir->operands[1]->type->is_integer_32()); 23301e04c3fSmrg 23401e04c3fSmrg /* Be careful with integer division -- we need to do it as a 23501e04c3fSmrg * float and re-truncate, since rcp(n > 1) of an integer would 23601e04c3fSmrg * just be 0. 
23701e04c3fSmrg */ 23801e04c3fSmrg ir_rvalue *op0, *op1; 23901e04c3fSmrg const struct glsl_type *vec_type; 24001e04c3fSmrg 24101e04c3fSmrg vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 24201e04c3fSmrg ir->operands[1]->type->vector_elements, 24301e04c3fSmrg ir->operands[1]->type->matrix_columns); 24401e04c3fSmrg 24501e04c3fSmrg if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) 24601e04c3fSmrg op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL); 24701e04c3fSmrg else 24801e04c3fSmrg op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL); 24901e04c3fSmrg 25001e04c3fSmrg op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL); 25101e04c3fSmrg 25201e04c3fSmrg vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 25301e04c3fSmrg ir->operands[0]->type->vector_elements, 25401e04c3fSmrg ir->operands[0]->type->matrix_columns); 25501e04c3fSmrg 25601e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) 25701e04c3fSmrg op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL); 25801e04c3fSmrg else 25901e04c3fSmrg op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL); 26001e04c3fSmrg 26101e04c3fSmrg vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT, 26201e04c3fSmrg ir->type->vector_elements, 26301e04c3fSmrg ir->type->matrix_columns); 26401e04c3fSmrg 26501e04c3fSmrg op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1); 26601e04c3fSmrg 26701e04c3fSmrg if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) { 26801e04c3fSmrg ir->operation = ir_unop_f2i; 26901e04c3fSmrg ir->operands[0] = op0; 27001e04c3fSmrg } else { 27101e04c3fSmrg ir->operation = ir_unop_i2u; 27201e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0); 27301e04c3fSmrg } 27401e04c3fSmrg ir->init_num_operands(); 27501e04c3fSmrg ir->operands[1] = NULL; 27601e04c3fSmrg 27701e04c3fSmrg this->progress = true; 27801e04c3fSmrg} 27901e04c3fSmrg 28001e04c3fSmrgvoid 
28101e04c3fSmrglower_instructions_visitor::exp_to_exp2(ir_expression *ir) 28201e04c3fSmrg{ 2837ec681f3Smrg ir_constant *log2_e = _imm_fp(ir, ir->type, M_LOG2E); 28401e04c3fSmrg 28501e04c3fSmrg ir->operation = ir_unop_exp2; 28601e04c3fSmrg ir->init_num_operands(); 28701e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type, 28801e04c3fSmrg ir->operands[0], log2_e); 28901e04c3fSmrg this->progress = true; 29001e04c3fSmrg} 29101e04c3fSmrg 29201e04c3fSmrgvoid 29301e04c3fSmrglower_instructions_visitor::pow_to_exp2(ir_expression *ir) 29401e04c3fSmrg{ 29501e04c3fSmrg ir_expression *const log2_x = 29601e04c3fSmrg new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 29701e04c3fSmrg ir->operands[0]); 29801e04c3fSmrg 29901e04c3fSmrg ir->operation = ir_unop_exp2; 30001e04c3fSmrg ir->init_num_operands(); 30101e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type, 30201e04c3fSmrg ir->operands[1], log2_x); 30301e04c3fSmrg ir->operands[1] = NULL; 30401e04c3fSmrg this->progress = true; 30501e04c3fSmrg} 30601e04c3fSmrg 30701e04c3fSmrgvoid 30801e04c3fSmrglower_instructions_visitor::log_to_log2(ir_expression *ir) 30901e04c3fSmrg{ 31001e04c3fSmrg ir->operation = ir_binop_mul; 31101e04c3fSmrg ir->init_num_operands(); 31201e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type, 31301e04c3fSmrg ir->operands[0], NULL); 3147ec681f3Smrg ir->operands[1] = _imm_fp(ir, ir->operands[0]->type, 1.0 / M_LOG2E); 31501e04c3fSmrg this->progress = true; 31601e04c3fSmrg} 31701e04c3fSmrg 31801e04c3fSmrgvoid 31901e04c3fSmrglower_instructions_visitor::mod_to_floor(ir_expression *ir) 32001e04c3fSmrg{ 32101e04c3fSmrg ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x", 32201e04c3fSmrg ir_var_temporary); 32301e04c3fSmrg ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y", 32401e04c3fSmrg ir_var_temporary); 32501e04c3fSmrg this->base_ir->insert_before(x); 32601e04c3fSmrg 
this->base_ir->insert_before(y); 32701e04c3fSmrg 32801e04c3fSmrg ir_assignment *const assign_x = 32901e04c3fSmrg new(ir) ir_assignment(new(ir) ir_dereference_variable(x), 33001e04c3fSmrg ir->operands[0]); 33101e04c3fSmrg ir_assignment *const assign_y = 33201e04c3fSmrg new(ir) ir_assignment(new(ir) ir_dereference_variable(y), 33301e04c3fSmrg ir->operands[1]); 33401e04c3fSmrg 33501e04c3fSmrg this->base_ir->insert_before(assign_x); 33601e04c3fSmrg this->base_ir->insert_before(assign_y); 33701e04c3fSmrg 33801e04c3fSmrg ir_expression *const div_expr = 33901e04c3fSmrg new(ir) ir_expression(ir_binop_div, x->type, 34001e04c3fSmrg new(ir) ir_dereference_variable(x), 34101e04c3fSmrg new(ir) ir_dereference_variable(y)); 34201e04c3fSmrg 34301e04c3fSmrg /* Don't generate new IR that would need to be lowered in an additional 34401e04c3fSmrg * pass. 34501e04c3fSmrg */ 3467ec681f3Smrg if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float_16_32()) || 34701e04c3fSmrg (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double())) 34801e04c3fSmrg div_to_mul_rcp(div_expr); 34901e04c3fSmrg 35001e04c3fSmrg ir_expression *const floor_expr = 35101e04c3fSmrg new(ir) ir_expression(ir_unop_floor, x->type, div_expr); 35201e04c3fSmrg 35301e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 35401e04c3fSmrg dfloor_to_dfrac(floor_expr); 35501e04c3fSmrg 35601e04c3fSmrg ir_expression *const mul_expr = 35701e04c3fSmrg new(ir) ir_expression(ir_binop_mul, 35801e04c3fSmrg new(ir) ir_dereference_variable(y), 35901e04c3fSmrg floor_expr); 36001e04c3fSmrg 36101e04c3fSmrg ir->operation = ir_binop_sub; 36201e04c3fSmrg ir->init_num_operands(); 36301e04c3fSmrg ir->operands[0] = new(ir) ir_dereference_variable(x); 36401e04c3fSmrg ir->operands[1] = mul_expr; 36501e04c3fSmrg this->progress = true; 36601e04c3fSmrg} 36701e04c3fSmrg 36801e04c3fSmrgvoid 36901e04c3fSmrglower_instructions_visitor::ldexp_to_arith(ir_expression *ir) 37001e04c3fSmrg{ 37101e04c3fSmrg /* Translates 37201e04c3fSmrg * ir_binop_ldexp x exp 
37301e04c3fSmrg * into 37401e04c3fSmrg * 37501e04c3fSmrg * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 37601e04c3fSmrg * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 37701e04c3fSmrg * 37801e04c3fSmrg * if (extracted_biased_exp >= 255) 37901e04c3fSmrg * return x; // +/-inf, NaN 38001e04c3fSmrg * 38101e04c3fSmrg * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 38201e04c3fSmrg * 38301e04c3fSmrg * if (min(resulting_biased_exp, extracted_biased_exp) < 1) 38401e04c3fSmrg * resulting_biased_exp = 0; 38501e04c3fSmrg * if (resulting_biased_exp >= 255 || 38601e04c3fSmrg * min(resulting_biased_exp, extracted_biased_exp) < 1) { 38701e04c3fSmrg * sign_mantissa &= sign_mask; 38801e04c3fSmrg * } 38901e04c3fSmrg * 39001e04c3fSmrg * return bitcast_u2f(sign_mantissa | 39101e04c3fSmrg * lshift(i2u(resulting_biased_exp), exp_shift)); 39201e04c3fSmrg * 39301e04c3fSmrg * which we can't actually implement as such, since the GLSL IR doesn't 39401e04c3fSmrg * have vectorized if-statements. 
We actually implement it without branches 39501e04c3fSmrg * using conditional-select: 39601e04c3fSmrg * 39701e04c3fSmrg * extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift); 39801e04c3fSmrg * resulting_biased_exp = min(extracted_biased_exp + exp, 255); 39901e04c3fSmrg * 40001e04c3fSmrg * sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask; 40101e04c3fSmrg * 40201e04c3fSmrg * flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0); 40301e04c3fSmrg * resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp) 40401e04c3fSmrg * zero_mantissa = logic_or(flush_to_zero, 40501e04c3fSmrg * gequal(resulting_biased_exp, 255)); 40601e04c3fSmrg * sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa); 40701e04c3fSmrg * 40801e04c3fSmrg * result = sign_mantissa | 40901e04c3fSmrg * lshift(i2u(resulting_biased_exp), exp_shift)); 41001e04c3fSmrg * 41101e04c3fSmrg * return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result)); 41201e04c3fSmrg * 41301e04c3fSmrg * The definition of ldexp in the GLSL spec says: 41401e04c3fSmrg * 41501e04c3fSmrg * "If this product is too large to be represented in the 41601e04c3fSmrg * floating-point type, the result is undefined." 41701e04c3fSmrg * 41801e04c3fSmrg * However, the definition of ldexp in the GLSL ES spec does not contain 41901e04c3fSmrg * this sentence, so we do need to handle overflow correctly. 42001e04c3fSmrg * 42101e04c3fSmrg * There is additional language limiting the defined range of exp, but this 42201e04c3fSmrg * is merely to allow implementations that store 2^exp in a temporary 42301e04c3fSmrg * variable. 
42401e04c3fSmrg */ 42501e04c3fSmrg 42601e04c3fSmrg const unsigned vec_elem = ir->type->vector_elements; 42701e04c3fSmrg 42801e04c3fSmrg /* Types */ 42901e04c3fSmrg const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 43001e04c3fSmrg const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 43101e04c3fSmrg const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 43201e04c3fSmrg 43301e04c3fSmrg /* Temporary variables */ 43401e04c3fSmrg ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 43501e04c3fSmrg ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 43601e04c3fSmrg ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary); 43701e04c3fSmrg 43801e04c3fSmrg ir_variable *extracted_biased_exp = 43901e04c3fSmrg new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 44001e04c3fSmrg ir_variable *resulting_biased_exp = 44101e04c3fSmrg new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 44201e04c3fSmrg 44301e04c3fSmrg ir_variable *sign_mantissa = 44401e04c3fSmrg new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary); 44501e04c3fSmrg 44601e04c3fSmrg ir_variable *flush_to_zero = 44701e04c3fSmrg new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary); 44801e04c3fSmrg ir_variable *zero_mantissa = 44901e04c3fSmrg new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary); 45001e04c3fSmrg 45101e04c3fSmrg ir_instruction &i = *base_ir; 45201e04c3fSmrg 45301e04c3fSmrg /* Copy <x> and <exp> arguments. */ 45401e04c3fSmrg i.insert_before(x); 45501e04c3fSmrg i.insert_before(assign(x, ir->operands[0])); 45601e04c3fSmrg i.insert_before(exp); 45701e04c3fSmrg i.insert_before(assign(exp, ir->operands[1])); 45801e04c3fSmrg 45901e04c3fSmrg /* Extract the biased exponent from <x>. 
*/ 46001e04c3fSmrg i.insert_before(extracted_biased_exp); 46101e04c3fSmrg i.insert_before(assign(extracted_biased_exp, 46201e04c3fSmrg rshift(bitcast_f2i(abs(x)), 46301e04c3fSmrg new(ir) ir_constant(23, vec_elem)))); 46401e04c3fSmrg 46501e04c3fSmrg /* The definition of ldexp in the GLSL 4.60 spec says: 46601e04c3fSmrg * 46701e04c3fSmrg * "If exp is greater than +128 (single-precision) or +1024 46801e04c3fSmrg * (double-precision), the value returned is undefined. If exp is less 46901e04c3fSmrg * than -126 (single-precision) or -1022 (double-precision), the value 47001e04c3fSmrg * returned may be flushed to zero." 47101e04c3fSmrg * 47201e04c3fSmrg * So we do not have to guard against the possibility of addition overflow, 47301e04c3fSmrg * which could happen when exp is close to INT_MAX. Addition underflow 47401e04c3fSmrg * cannot happen (the worst case is 0 + (-INT_MAX)). 47501e04c3fSmrg */ 47601e04c3fSmrg i.insert_before(resulting_biased_exp); 47701e04c3fSmrg i.insert_before(assign(resulting_biased_exp, 47801e04c3fSmrg min2(add(extracted_biased_exp, exp), 47901e04c3fSmrg new(ir) ir_constant(255, vec_elem)))); 48001e04c3fSmrg 48101e04c3fSmrg i.insert_before(sign_mantissa); 48201e04c3fSmrg i.insert_before(assign(sign_mantissa, 48301e04c3fSmrg bit_and(bitcast_f2u(x), 48401e04c3fSmrg new(ir) ir_constant(0x807fffffu, vec_elem)))); 48501e04c3fSmrg 48601e04c3fSmrg /* We flush to zero if the original or resulting biased exponent is 0, 48701e04c3fSmrg * indicating a +/-0.0 or subnormal input or output. 48801e04c3fSmrg * 48901e04c3fSmrg * The mantissa is set to 0 if the resulting biased exponent is 255, since 49001e04c3fSmrg * an overflow should produce a +/-inf result. 49101e04c3fSmrg * 49201e04c3fSmrg * Note that NaN inputs are handled separately. 
49301e04c3fSmrg */ 49401e04c3fSmrg i.insert_before(flush_to_zero); 49501e04c3fSmrg i.insert_before(assign(flush_to_zero, 49601e04c3fSmrg lequal(min2(resulting_biased_exp, 49701e04c3fSmrg extracted_biased_exp), 49801e04c3fSmrg ir_constant::zero(ir, ivec)))); 49901e04c3fSmrg i.insert_before(assign(resulting_biased_exp, 50001e04c3fSmrg csel(flush_to_zero, 50101e04c3fSmrg ir_constant::zero(ir, ivec), 50201e04c3fSmrg resulting_biased_exp))); 50301e04c3fSmrg 50401e04c3fSmrg i.insert_before(zero_mantissa); 50501e04c3fSmrg i.insert_before(assign(zero_mantissa, 50601e04c3fSmrg logic_or(flush_to_zero, 50701e04c3fSmrg equal(resulting_biased_exp, 50801e04c3fSmrg new(ir) ir_constant(255, vec_elem))))); 50901e04c3fSmrg i.insert_before(assign(sign_mantissa, 51001e04c3fSmrg csel(zero_mantissa, 51101e04c3fSmrg bit_and(sign_mantissa, 51201e04c3fSmrg new(ir) ir_constant(0x80000000u, vec_elem)), 51301e04c3fSmrg sign_mantissa))); 51401e04c3fSmrg 51501e04c3fSmrg /* Don't generate new IR that would need to be lowered in an additional 51601e04c3fSmrg * pass. 
51701e04c3fSmrg */ 51801e04c3fSmrg i.insert_before(result); 51901e04c3fSmrg if (!lowering(INSERT_TO_SHIFTS)) { 52001e04c3fSmrg i.insert_before(assign(result, 52101e04c3fSmrg bitfield_insert(sign_mantissa, 52201e04c3fSmrg i2u(resulting_biased_exp), 52301e04c3fSmrg new(ir) ir_constant(23u, vec_elem), 52401e04c3fSmrg new(ir) ir_constant(8u, vec_elem)))); 52501e04c3fSmrg } else { 52601e04c3fSmrg i.insert_before(assign(result, 52701e04c3fSmrg bit_or(sign_mantissa, 52801e04c3fSmrg lshift(i2u(resulting_biased_exp), 52901e04c3fSmrg new(ir) ir_constant(23, vec_elem))))); 53001e04c3fSmrg } 53101e04c3fSmrg 53201e04c3fSmrg ir->operation = ir_triop_csel; 53301e04c3fSmrg ir->init_num_operands(); 53401e04c3fSmrg ir->operands[0] = gequal(extracted_biased_exp, 53501e04c3fSmrg new(ir) ir_constant(255, vec_elem)); 53601e04c3fSmrg ir->operands[1] = new(ir) ir_dereference_variable(x); 53701e04c3fSmrg ir->operands[2] = bitcast_u2f(result); 53801e04c3fSmrg 53901e04c3fSmrg this->progress = true; 54001e04c3fSmrg} 54101e04c3fSmrg 54201e04c3fSmrgvoid 54301e04c3fSmrglower_instructions_visitor::dldexp_to_arith(ir_expression *ir) 54401e04c3fSmrg{ 54501e04c3fSmrg /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent 54601e04c3fSmrg * from the significand. 
54701e04c3fSmrg */ 54801e04c3fSmrg 54901e04c3fSmrg const unsigned vec_elem = ir->type->vector_elements; 55001e04c3fSmrg 55101e04c3fSmrg /* Types */ 55201e04c3fSmrg const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1); 55301e04c3fSmrg const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 55401e04c3fSmrg 55501e04c3fSmrg /* Constants */ 55601e04c3fSmrg ir_constant *zeroi = ir_constant::zero(ir, ivec); 55701e04c3fSmrg 55801e04c3fSmrg ir_constant *sign_mask = new(ir) ir_constant(0x80000000u); 55901e04c3fSmrg 56001e04c3fSmrg ir_constant *exp_shift = new(ir) ir_constant(20u); 56101e04c3fSmrg ir_constant *exp_width = new(ir) ir_constant(11u); 56201e04c3fSmrg ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem); 56301e04c3fSmrg 56401e04c3fSmrg /* Temporary variables */ 56501e04c3fSmrg ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary); 56601e04c3fSmrg ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary); 56701e04c3fSmrg 56801e04c3fSmrg ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x", 56901e04c3fSmrg ir_var_temporary); 57001e04c3fSmrg 57101e04c3fSmrg ir_variable *extracted_biased_exp = 57201e04c3fSmrg new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary); 57301e04c3fSmrg ir_variable *resulting_biased_exp = 57401e04c3fSmrg new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary); 57501e04c3fSmrg 57601e04c3fSmrg ir_variable *is_not_zero_or_underflow = 57701e04c3fSmrg new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary); 57801e04c3fSmrg 57901e04c3fSmrg ir_instruction &i = *base_ir; 58001e04c3fSmrg 58101e04c3fSmrg /* Copy <x> and <exp> arguments. 
*/ 58201e04c3fSmrg i.insert_before(x); 58301e04c3fSmrg i.insert_before(assign(x, ir->operands[0])); 58401e04c3fSmrg i.insert_before(exp); 58501e04c3fSmrg i.insert_before(assign(exp, ir->operands[1])); 58601e04c3fSmrg 58701e04c3fSmrg ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x); 58801e04c3fSmrg if (lowering(DFREXP_DLDEXP_TO_ARITH)) 58901e04c3fSmrg dfrexp_exp_to_arith(frexp_exp); 59001e04c3fSmrg 59101e04c3fSmrg /* Extract the biased exponent from <x>. */ 59201e04c3fSmrg i.insert_before(extracted_biased_exp); 59301e04c3fSmrg i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias))); 59401e04c3fSmrg 59501e04c3fSmrg i.insert_before(resulting_biased_exp); 59601e04c3fSmrg i.insert_before(assign(resulting_biased_exp, 59701e04c3fSmrg add(extracted_biased_exp, exp))); 59801e04c3fSmrg 59901e04c3fSmrg /* Test if result is ±0.0, subnormal, or underflow by checking if the 60001e04c3fSmrg * resulting biased exponent would be less than 0x1. If so, the result is 60101e04c3fSmrg * 0.0 with the sign of x. (Actually, invert the conditions so that 60201e04c3fSmrg * immediate values are the second arguments, which is better for i965) 60301e04c3fSmrg * TODO: Implement in a vector fashion. 
60401e04c3fSmrg */ 60501e04c3fSmrg i.insert_before(zero_sign_x); 60601e04c3fSmrg for (unsigned elem = 0; elem < vec_elem; elem++) { 60701e04c3fSmrg ir_variable *unpacked = 60801e04c3fSmrg new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 60901e04c3fSmrg i.insert_before(unpacked); 61001e04c3fSmrg i.insert_before( 61101e04c3fSmrg assign(unpacked, 61201e04c3fSmrg expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 61301e04c3fSmrg i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)), 61401e04c3fSmrg WRITEMASK_Y)); 61501e04c3fSmrg i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X)); 61601e04c3fSmrg i.insert_before(assign(zero_sign_x, 61701e04c3fSmrg expr(ir_unop_pack_double_2x32, unpacked), 61801e04c3fSmrg 1 << elem)); 61901e04c3fSmrg } 62001e04c3fSmrg i.insert_before(is_not_zero_or_underflow); 62101e04c3fSmrg i.insert_before(assign(is_not_zero_or_underflow, 62201e04c3fSmrg gequal(resulting_biased_exp, 62301e04c3fSmrg new(ir) ir_constant(0x1, vec_elem)))); 62401e04c3fSmrg i.insert_before(assign(x, csel(is_not_zero_or_underflow, 62501e04c3fSmrg x, zero_sign_x))); 62601e04c3fSmrg i.insert_before(assign(resulting_biased_exp, 62701e04c3fSmrg csel(is_not_zero_or_underflow, 62801e04c3fSmrg resulting_biased_exp, zeroi))); 62901e04c3fSmrg 63001e04c3fSmrg /* We could test for overflows by checking if the resulting biased exponent 63101e04c3fSmrg * would be greater than 0xFE. Turns out we don't need to because the GLSL 63201e04c3fSmrg * spec says: 63301e04c3fSmrg * 63401e04c3fSmrg * "If this product is too large to be represented in the 63501e04c3fSmrg * floating-point type, the result is undefined." 
63601e04c3fSmrg */ 63701e04c3fSmrg 63801e04c3fSmrg ir_rvalue *results[4] = {NULL}; 63901e04c3fSmrg for (unsigned elem = 0; elem < vec_elem; elem++) { 64001e04c3fSmrg ir_variable *unpacked = 64101e04c3fSmrg new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 64201e04c3fSmrg i.insert_before(unpacked); 64301e04c3fSmrg i.insert_before( 64401e04c3fSmrg assign(unpacked, 64501e04c3fSmrg expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1)))); 64601e04c3fSmrg 64701e04c3fSmrg ir_expression *bfi = bitfield_insert( 64801e04c3fSmrg swizzle_y(unpacked), 64901e04c3fSmrg i2u(swizzle(resulting_biased_exp, elem, 1)), 65001e04c3fSmrg exp_shift->clone(ir, NULL), 65101e04c3fSmrg exp_width->clone(ir, NULL)); 65201e04c3fSmrg 65301e04c3fSmrg i.insert_before(assign(unpacked, bfi, WRITEMASK_Y)); 65401e04c3fSmrg 65501e04c3fSmrg results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 65601e04c3fSmrg } 65701e04c3fSmrg 65801e04c3fSmrg ir->operation = ir_quadop_vector; 65901e04c3fSmrg ir->init_num_operands(); 66001e04c3fSmrg ir->operands[0] = results[0]; 66101e04c3fSmrg ir->operands[1] = results[1]; 66201e04c3fSmrg ir->operands[2] = results[2]; 66301e04c3fSmrg ir->operands[3] = results[3]; 66401e04c3fSmrg 66501e04c3fSmrg /* Don't generate new IR that would need to be lowered in an additional 66601e04c3fSmrg * pass. 66701e04c3fSmrg */ 66801e04c3fSmrg 66901e04c3fSmrg this->progress = true; 67001e04c3fSmrg} 67101e04c3fSmrg 67201e04c3fSmrgvoid 67301e04c3fSmrglower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir) 67401e04c3fSmrg{ 67501e04c3fSmrg const unsigned vec_elem = ir->type->vector_elements; 67601e04c3fSmrg const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1); 67701e04c3fSmrg 67801e04c3fSmrg /* Double-precision floating-point values are stored as 67901e04c3fSmrg * 1 sign bit; 68001e04c3fSmrg * 11 exponent bits; 68101e04c3fSmrg * 52 mantissa bits. 
68201e04c3fSmrg * 68301e04c3fSmrg * We're just extracting the significand here, so we only need to modify 68401e04c3fSmrg * the upper 32-bit uint. Unfortunately we must extract each double 68501e04c3fSmrg * independently as there is no vector version of unpackDouble. 68601e04c3fSmrg */ 68701e04c3fSmrg 68801e04c3fSmrg ir_instruction &i = *base_ir; 68901e04c3fSmrg 69001e04c3fSmrg ir_variable *is_not_zero = 69101e04c3fSmrg new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 69201e04c3fSmrg ir_rvalue *results[4] = {NULL}; 69301e04c3fSmrg 69401e04c3fSmrg ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 69501e04c3fSmrg i.insert_before(is_not_zero); 69601e04c3fSmrg i.insert_before( 69701e04c3fSmrg assign(is_not_zero, 69801e04c3fSmrg nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero))); 69901e04c3fSmrg 70001e04c3fSmrg /* TODO: Remake this as more vector-friendly when int64 support is 70101e04c3fSmrg * available. 70201e04c3fSmrg */ 70301e04c3fSmrg for (unsigned elem = 0; elem < vec_elem; elem++) { 70401e04c3fSmrg ir_constant *zero = new(ir) ir_constant(0u, 1); 70501e04c3fSmrg ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1); 70601e04c3fSmrg 70701e04c3fSmrg /* Exponent of double floating-point values in the range [0.5, 1.0). 
*/ 70801e04c3fSmrg ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1); 70901e04c3fSmrg 71001e04c3fSmrg ir_variable *bits = 71101e04c3fSmrg new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary); 71201e04c3fSmrg ir_variable *unpacked = 71301e04c3fSmrg new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary); 71401e04c3fSmrg 71501e04c3fSmrg ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1); 71601e04c3fSmrg 71701e04c3fSmrg i.insert_before(bits); 71801e04c3fSmrg i.insert_before(unpacked); 71901e04c3fSmrg i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x))); 72001e04c3fSmrg 72101e04c3fSmrg /* Manipulate the high uint to remove the exponent and replace it with 72201e04c3fSmrg * either the default exponent or zero. 72301e04c3fSmrg */ 72401e04c3fSmrg i.insert_before(assign(bits, swizzle_y(unpacked))); 72501e04c3fSmrg i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask))); 72601e04c3fSmrg i.insert_before(assign(bits, bit_or(bits, 72701e04c3fSmrg csel(swizzle(is_not_zero, elem, 1), 72801e04c3fSmrg exponent_value, 72901e04c3fSmrg zero)))); 73001e04c3fSmrg i.insert_before(assign(unpacked, bits, WRITEMASK_Y)); 73101e04c3fSmrg results[elem] = expr(ir_unop_pack_double_2x32, unpacked); 73201e04c3fSmrg } 73301e04c3fSmrg 73401e04c3fSmrg /* Put the dvec back together */ 73501e04c3fSmrg ir->operation = ir_quadop_vector; 73601e04c3fSmrg ir->init_num_operands(); 73701e04c3fSmrg ir->operands[0] = results[0]; 73801e04c3fSmrg ir->operands[1] = results[1]; 73901e04c3fSmrg ir->operands[2] = results[2]; 74001e04c3fSmrg ir->operands[3] = results[3]; 74101e04c3fSmrg 74201e04c3fSmrg this->progress = true; 74301e04c3fSmrg} 74401e04c3fSmrg 74501e04c3fSmrgvoid 74601e04c3fSmrglower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir) 74701e04c3fSmrg{ 74801e04c3fSmrg const unsigned vec_elem = ir->type->vector_elements; 74901e04c3fSmrg const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, 
vec_elem, 1); 75001e04c3fSmrg const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1); 75101e04c3fSmrg 75201e04c3fSmrg /* Double-precision floating-point values are stored as 75301e04c3fSmrg * 1 sign bit; 75401e04c3fSmrg * 11 exponent bits; 75501e04c3fSmrg * 52 mantissa bits. 75601e04c3fSmrg * 75701e04c3fSmrg * We're just extracting the exponent here, so we only care about the upper 75801e04c3fSmrg * 32-bit uint. 75901e04c3fSmrg */ 76001e04c3fSmrg 76101e04c3fSmrg ir_instruction &i = *base_ir; 76201e04c3fSmrg 76301e04c3fSmrg ir_variable *is_not_zero = 76401e04c3fSmrg new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary); 76501e04c3fSmrg ir_variable *high_words = 76601e04c3fSmrg new(ir) ir_variable(uvec, "high_words", ir_var_temporary); 76701e04c3fSmrg ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem); 76801e04c3fSmrg ir_constant *izero = new(ir) ir_constant(0, vec_elem); 76901e04c3fSmrg 77001e04c3fSmrg ir_rvalue *absval = abs(ir->operands[0]); 77101e04c3fSmrg 77201e04c3fSmrg i.insert_before(is_not_zero); 77301e04c3fSmrg i.insert_before(high_words); 77401e04c3fSmrg i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero))); 77501e04c3fSmrg 77601e04c3fSmrg /* Extract all of the upper uints. */ 77701e04c3fSmrg for (unsigned elem = 0; elem < vec_elem; elem++) { 77801e04c3fSmrg ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1); 77901e04c3fSmrg 78001e04c3fSmrg i.insert_before(assign(high_words, 78101e04c3fSmrg swizzle_y(expr(ir_unop_unpack_double_2x32, x)), 78201e04c3fSmrg 1 << elem)); 78301e04c3fSmrg 78401e04c3fSmrg } 78501e04c3fSmrg ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem); 78601e04c3fSmrg ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem); 78701e04c3fSmrg 78801e04c3fSmrg /* For non-zero inputs, shift the exponent down and apply bias. 
*/ 78901e04c3fSmrg ir->operation = ir_triop_csel; 79001e04c3fSmrg ir->init_num_operands(); 79101e04c3fSmrg ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero); 79201e04c3fSmrg ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift))); 79301e04c3fSmrg ir->operands[2] = izero; 79401e04c3fSmrg 79501e04c3fSmrg this->progress = true; 79601e04c3fSmrg} 79701e04c3fSmrg 79801e04c3fSmrgvoid 79901e04c3fSmrglower_instructions_visitor::carry_to_arith(ir_expression *ir) 80001e04c3fSmrg{ 80101e04c3fSmrg /* Translates 80201e04c3fSmrg * ir_binop_carry x y 80301e04c3fSmrg * into 80401e04c3fSmrg * sum = ir_binop_add x y 80501e04c3fSmrg * bcarry = ir_binop_less sum x 80601e04c3fSmrg * carry = ir_unop_i2u (ir_unop_b2i bcarry); x is cloned because it is used in both the add and the comparison 80701e04c3fSmrg */ 80801e04c3fSmrg 80901e04c3fSmrg ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL); 81001e04c3fSmrg ir->operation = ir_unop_i2u; 81101e04c3fSmrg ir->init_num_operands(); 81201e04c3fSmrg ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone)); 81301e04c3fSmrg ir->operands[1] = NULL; 81401e04c3fSmrg 81501e04c3fSmrg this->progress = true; 81601e04c3fSmrg} 81701e04c3fSmrg 81801e04c3fSmrgvoid 81901e04c3fSmrglower_instructions_visitor::borrow_to_arith(ir_expression *ir) 82001e04c3fSmrg{ 82101e04c3fSmrg /* Translates 82201e04c3fSmrg * ir_binop_borrow x y 82301e04c3fSmrg * into 82401e04c3fSmrg * bcarry = ir_binop_less x y 82501e04c3fSmrg * borrow = ir_unop_i2u (ir_unop_b2i bcarry) 82601e04c3fSmrg */ 82701e04c3fSmrg 82801e04c3fSmrg ir->operation = ir_unop_i2u; 82901e04c3fSmrg ir->init_num_operands(); 83001e04c3fSmrg ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1])); 83101e04c3fSmrg ir->operands[1] = NULL; 83201e04c3fSmrg 83301e04c3fSmrg this->progress = true; 83401e04c3fSmrg} 83501e04c3fSmrg 83601e04c3fSmrgvoid 83701e04c3fSmrglower_instructions_visitor::sat_to_clamp(ir_expression *ir) 83801e04c3fSmrg{ 83901e04c3fSmrg /* Translates 84001e04c3fSmrg * ir_unop_saturate x 84101e04c3fSmrg * into 84201e04c3fSmrg * 
ir_binop_min (ir_binop_max(x, 0.0), 1.0) 84301e04c3fSmrg */ 84401e04c3fSmrg 84501e04c3fSmrg ir->operation = ir_binop_min; 84601e04c3fSmrg ir->init_num_operands(); 8477ec681f3Smrg 8487ec681f3Smrg ir_constant *zero = _imm_fp(ir, ir->operands[0]->type, 0.0); 84901e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type, 8507ec681f3Smrg ir->operands[0], zero); 8517ec681f3Smrg ir->operands[1] = _imm_fp(ir, ir->operands[0]->type, 1.0); 85201e04c3fSmrg 85301e04c3fSmrg this->progress = true; 85401e04c3fSmrg} 85501e04c3fSmrg 85601e04c3fSmrgvoid 85701e04c3fSmrglower_instructions_visitor::double_dot_to_fma(ir_expression *ir) 85801e04c3fSmrg{ /* Rewrites the two-operand expression as a chain of multiply-adds: components nc-1 down to 1 are accumulated into the temporary dot_res (a mul for component nc-1, then an fma for each lower one), and the expression itself becomes fma(op0.x, op1.x, dot_res). */ 85901e04c3fSmrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res", 86001e04c3fSmrg ir_var_temporary); 86101e04c3fSmrg this->base_ir->insert_before(temp); 86201e04c3fSmrg 86301e04c3fSmrg int nc = ir->operands[0]->type->components(); 86401e04c3fSmrg for (int i = nc - 1; i >= 1; i--) { 86501e04c3fSmrg ir_assignment *assig; 86601e04c3fSmrg if (i == (nc - 1)) { 86701e04c3fSmrg assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 86801e04c3fSmrg swizzle(ir->operands[1]->clone(ir, NULL), i, 1))); 86901e04c3fSmrg } else { 87001e04c3fSmrg assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1), 87101e04c3fSmrg swizzle(ir->operands[1]->clone(ir, NULL), i, 1), 87201e04c3fSmrg temp)); 87301e04c3fSmrg } 87401e04c3fSmrg this->base_ir->insert_before(assig); 87501e04c3fSmrg } 87601e04c3fSmrg 87701e04c3fSmrg ir->operation = ir_triop_fma; 87801e04c3fSmrg ir->init_num_operands(); 87901e04c3fSmrg ir->operands[0] = swizzle(ir->operands[0], 0, 1); 88001e04c3fSmrg ir->operands[1] = swizzle(ir->operands[1], 0, 1); 88101e04c3fSmrg ir->operands[2] = new(ir) ir_dereference_variable(temp); 88201e04c3fSmrg 88301e04c3fSmrg this->progress = true; 88401e04c3fSmrg 88501e04c3fSmrg} 88601e04c3fSmrg 88701e04c3fSmrgvoid 
88801e04c3fSmrglower_instructions_visitor::double_lrp(ir_expression *ir) 88901e04c3fSmrg{ /* Rewrites the three-operand expression (op0, op1, op2) -- presumably ir_triop_lrp, going by the function name; confirm against the caller -- as fma(op2, op1, (1.0 - op2) * op0). operands[1] is left in place. A one-component op2 is broadcast to op0's width with an XXXX swizzle; otherwise the widths must already match. */ 89001e04c3fSmrg int swizval; 89101e04c3fSmrg ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2]; 89201e04c3fSmrg ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements); 89301e04c3fSmrg 89401e04c3fSmrg switch (op2->type->vector_elements) { 89501e04c3fSmrg case 1: 89601e04c3fSmrg swizval = SWIZZLE_XXXX; 89701e04c3fSmrg break; 89801e04c3fSmrg default: 89901e04c3fSmrg assert(op0->type->vector_elements == op2->type->vector_elements); 90001e04c3fSmrg swizval = SWIZZLE_XYZW; 90101e04c3fSmrg break; 90201e04c3fSmrg } 90301e04c3fSmrg 90401e04c3fSmrg ir->operation = ir_triop_fma; 90501e04c3fSmrg ir->init_num_operands(); 90601e04c3fSmrg ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements); 90701e04c3fSmrg ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0); 90801e04c3fSmrg 90901e04c3fSmrg this->progress = true; 91001e04c3fSmrg} 91101e04c3fSmrg 91201e04c3fSmrgvoid 91301e04c3fSmrglower_instructions_visitor::dceil_to_dfrac(ir_expression *ir) 91401e04c3fSmrg{ 91501e04c3fSmrg /* 91601e04c3fSmrg * frtemp = frac(x); 91701e04c3fSmrg * temp = sub(x, frtemp); 91801e04c3fSmrg * result = temp + ((frtemp != 0.0) ? 
1.0 : 0.0); 91901e04c3fSmrg */ 92001e04c3fSmrg ir_instruction &i = *base_ir; 92101e04c3fSmrg ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 92201e04c3fSmrg ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 92301e04c3fSmrg ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 92401e04c3fSmrg ir_var_temporary); 92501e04c3fSmrg 92601e04c3fSmrg i.insert_before(frtemp); 92701e04c3fSmrg i.insert_before(assign(frtemp, fract(ir->operands[0]))); 92801e04c3fSmrg 92901e04c3fSmrg ir->operation = ir_binop_add; 93001e04c3fSmrg ir->init_num_operands(); 93101e04c3fSmrg ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp); 93201e04c3fSmrg ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL)); 93301e04c3fSmrg 93401e04c3fSmrg this->progress = true; 93501e04c3fSmrg} 93601e04c3fSmrg 93701e04c3fSmrgvoid 93801e04c3fSmrglower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir) 93901e04c3fSmrg{ 94001e04c3fSmrg /* 94101e04c3fSmrg * frtemp = frac(x); 94201e04c3fSmrg * result = sub(x, frtemp); 94301e04c3fSmrg */ 94401e04c3fSmrg ir->operation = ir_binop_sub; 94501e04c3fSmrg ir->init_num_operands(); 94601e04c3fSmrg ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL)); 94701e04c3fSmrg 94801e04c3fSmrg this->progress = true; 94901e04c3fSmrg} 95001e04c3fSmrgvoid 95101e04c3fSmrglower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir) 95201e04c3fSmrg{ 95301e04c3fSmrg /* 95401e04c3fSmrg * insane but works 95501e04c3fSmrg * temp = x + 0.5; 95601e04c3fSmrg * frtemp = frac(temp); 95701e04c3fSmrg * t2 = sub(temp, frtemp); 95801e04c3fSmrg * if (frac(x) == 0.5) 95901e04c3fSmrg * result = frac(t2 * 0.5) == 0 ? 
t2 : t2 - 1; 96001e04c3fSmrg * else 96101e04c3fSmrg * result = t2; 96201e04c3fSmrg 96301e04c3fSmrg */ 96401e04c3fSmrg ir_instruction &i = *base_ir; 96501e04c3fSmrg ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp", 96601e04c3fSmrg ir_var_temporary); 96701e04c3fSmrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 96801e04c3fSmrg ir_var_temporary); 96901e04c3fSmrg ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2", 97001e04c3fSmrg ir_var_temporary); 97101e04c3fSmrg ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements); 97201e04c3fSmrg ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements); 97301e04c3fSmrg ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements); 97401e04c3fSmrg 97501e04c3fSmrg i.insert_before(temp); 97601e04c3fSmrg i.insert_before(assign(temp, add(ir->operands[0], p5))); 97701e04c3fSmrg 97801e04c3fSmrg i.insert_before(frtemp); 97901e04c3fSmrg i.insert_before(assign(frtemp, fract(temp))); 98001e04c3fSmrg 98101e04c3fSmrg i.insert_before(t2); 98201e04c3fSmrg i.insert_before(assign(t2, sub(temp, frtemp))); 98301e04c3fSmrg 98401e04c3fSmrg ir->operation = ir_triop_csel; 98501e04c3fSmrg ir->init_num_operands(); 98601e04c3fSmrg ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)), 98701e04c3fSmrg p5->clone(ir, NULL)); 98801e04c3fSmrg ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))), 98901e04c3fSmrg zero), 99001e04c3fSmrg t2, 99101e04c3fSmrg sub(t2, one)); 99201e04c3fSmrg ir->operands[2] = new(ir) ir_dereference_variable(t2); 99301e04c3fSmrg 99401e04c3fSmrg this->progress = true; 99501e04c3fSmrg} 99601e04c3fSmrg 99701e04c3fSmrgvoid 99801e04c3fSmrglower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir) 99901e04c3fSmrg{ 100001e04c3fSmrg /* 100101e04c3fSmrg * frtemp = frac(x); 100201e04c3fSmrg * temp = sub(x, frtemp); 100301e04c3fSmrg * result = x >= 0 ? 
temp : (temp + ((frtemp == 0.0) ? 0 : 1)); 100401e04c3fSmrg */ 100501e04c3fSmrg ir_rvalue *arg = ir->operands[0]; 100601e04c3fSmrg ir_instruction &i = *base_ir; 100701e04c3fSmrg 100801e04c3fSmrg ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 100901e04c3fSmrg ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 101001e04c3fSmrg ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp", 101101e04c3fSmrg ir_var_temporary); 101201e04c3fSmrg ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp", 101301e04c3fSmrg ir_var_temporary); 101401e04c3fSmrg 101501e04c3fSmrg i.insert_before(frtemp); 101601e04c3fSmrg i.insert_before(assign(frtemp, fract(arg))); 101701e04c3fSmrg i.insert_before(temp); 101801e04c3fSmrg i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp))); 101901e04c3fSmrg 102001e04c3fSmrg ir->operation = ir_triop_csel; 102101e04c3fSmrg ir->init_num_operands(); 102201e04c3fSmrg ir->operands[0] = gequal(arg->clone(ir, NULL), zero); 102301e04c3fSmrg ir->operands[1] = new (ir) ir_dereference_variable(temp); 102401e04c3fSmrg ir->operands[2] = add(temp, 102501e04c3fSmrg csel(equal(frtemp, zero->clone(ir, NULL)), 102601e04c3fSmrg zero->clone(ir, NULL), 102701e04c3fSmrg one)); 102801e04c3fSmrg 102901e04c3fSmrg this->progress = true; 103001e04c3fSmrg} 103101e04c3fSmrg 103201e04c3fSmrgvoid 103301e04c3fSmrglower_instructions_visitor::dsign_to_csel(ir_expression *ir) 103401e04c3fSmrg{ 103501e04c3fSmrg /* 103601e04c3fSmrg * temp = x > 0.0 ? 1.0 : 0.0; 103701e04c3fSmrg * result = x < 0.0 ? 
-1.0 : temp; 103801e04c3fSmrg */ 103901e04c3fSmrg ir_rvalue *arg = ir->operands[0]; 104001e04c3fSmrg ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements); 104101e04c3fSmrg ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements); 104201e04c3fSmrg ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements); 104301e04c3fSmrg 104401e04c3fSmrg ir->operation = ir_triop_csel; 104501e04c3fSmrg ir->init_num_operands(); 104601e04c3fSmrg ir->operands[0] = less(arg->clone(ir, NULL), 104701e04c3fSmrg zero->clone(ir, NULL)); 104801e04c3fSmrg ir->operands[1] = neg_one; 104901e04c3fSmrg ir->operands[2] = csel(greater(arg, zero), 105001e04c3fSmrg one, 105101e04c3fSmrg zero->clone(ir, NULL)); 105201e04c3fSmrg 105301e04c3fSmrg this->progress = true; 105401e04c3fSmrg} 105501e04c3fSmrg 105601e04c3fSmrgvoid 105701e04c3fSmrglower_instructions_visitor::bit_count_to_math(ir_expression *ir) 105801e04c3fSmrg{ 105901e04c3fSmrg /* Lowers bit counting to shift, mask and multiply arithmetic on an unsigned temporary. For more details, see: 106001e04c3fSmrg * 106101e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel 106201e04c3fSmrg */ 106301e04c3fSmrg const unsigned elements = ir->operands[0]->type->vector_elements; 106401e04c3fSmrg ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp", 106501e04c3fSmrg ir_var_temporary); 106601e04c3fSmrg ir_constant *c55555555 = new(ir) ir_constant(0x55555555u); 106701e04c3fSmrg ir_constant *c33333333 = new(ir) ir_constant(0x33333333u); 106801e04c3fSmrg ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu); 106901e04c3fSmrg ir_constant *c01010101 = new(ir) ir_constant(0x01010101u); 107001e04c3fSmrg ir_constant *c1 = new(ir) ir_constant(1u); 107101e04c3fSmrg ir_constant *c2 = new(ir) ir_constant(2u); 107201e04c3fSmrg ir_constant *c4 = new(ir) ir_constant(4u); 107301e04c3fSmrg ir_constant *c24 = new(ir) ir_constant(24u); 107401e04c3fSmrg 107501e04c3fSmrg base_ir->insert_before(temp); 107601e04c3fSmrg 107701e04c3fSmrg if 
(ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 107801e04c3fSmrg base_ir->insert_before(assign(temp, ir->operands[0])); 107901e04c3fSmrg } else { 108001e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 108101e04c3fSmrg base_ir->insert_before(assign(temp, i2u(ir->operands[0]))); 108201e04c3fSmrg } 108301e04c3fSmrg 108401e04c3fSmrg /* temp = temp - ((temp >> 1) & 0x55555555u); */ 108501e04c3fSmrg base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1), 108601e04c3fSmrg c55555555)))); 108701e04c3fSmrg 108801e04c3fSmrg /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */ 108901e04c3fSmrg base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333), 109001e04c3fSmrg bit_and(rshift(temp, c2), 109101e04c3fSmrg c33333333->clone(ir, NULL))))); 109201e04c3fSmrg 109301e04c3fSmrg /* int((((temp + (temp >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24); */ 109401e04c3fSmrg ir->operation = ir_unop_u2i; 109501e04c3fSmrg ir->init_num_operands(); 109601e04c3fSmrg ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F), 109701e04c3fSmrg c01010101), 109801e04c3fSmrg c24); 109901e04c3fSmrg 110001e04c3fSmrg this->progress = true; 110101e04c3fSmrg} 110201e04c3fSmrg 110301e04c3fSmrgvoid 110401e04c3fSmrglower_instructions_visitor::extract_to_shifts(ir_expression *ir) 110501e04c3fSmrg{ /* Lowers a three-operand bitfield extract (operands: value, offset, bits) to shifts and masks. The bits operand is first copied into a temporary. */ 110601e04c3fSmrg ir_variable *bits = 110701e04c3fSmrg new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 110801e04c3fSmrg 110901e04c3fSmrg base_ir->insert_before(bits); 111001e04c3fSmrg base_ir->insert_before(assign(bits, ir->operands[2])); 111101e04c3fSmrg 111201e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 111301e04c3fSmrg ir_constant *c1 = 111401e04c3fSmrg new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 111501e04c3fSmrg ir_constant *c32 = 111601e04c3fSmrg new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 111701e04c3fSmrg ir_constant *cFFFFFFFF = 111801e04c3fSmrg 
new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 111901e04c3fSmrg 112001e04c3fSmrg /* At least some hardware treats (x << y) as (x << (y%32)). This means 112101e04c3fSmrg * we'd get a mask of 0 when bits is 32. Special case it. 112201e04c3fSmrg * 112301e04c3fSmrg * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u; 112401e04c3fSmrg */ 112501e04c3fSmrg ir_expression *mask = csel(equal(bits, c32), 112601e04c3fSmrg cFFFFFFFF, 112701e04c3fSmrg sub(lshift(c1, bits), c1->clone(ir, NULL))); 112801e04c3fSmrg 112901e04c3fSmrg /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 113001e04c3fSmrg * 113101e04c3fSmrg * If bits is zero, the result will be zero. 113201e04c3fSmrg * 113301e04c3fSmrg * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional 113401e04c3fSmrg * select as in the signed integer case. 113501e04c3fSmrg * 113601e04c3fSmrg * (value >> offset) & mask; 113701e04c3fSmrg */ 113801e04c3fSmrg ir->operation = ir_binop_bit_and; 113901e04c3fSmrg ir->init_num_operands(); 114001e04c3fSmrg ir->operands[0] = rshift(ir->operands[0], ir->operands[1]); 114101e04c3fSmrg ir->operands[1] = mask; 114201e04c3fSmrg ir->operands[2] = NULL; 114301e04c3fSmrg } else { 114401e04c3fSmrg ir_constant *c0 = 114501e04c3fSmrg new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements); 114601e04c3fSmrg ir_constant *c32 = 114701e04c3fSmrg new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 114801e04c3fSmrg ir_variable *temp = 114901e04c3fSmrg new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary); 115001e04c3fSmrg 115101e04c3fSmrg /* temp = 32 - bits; */ 115201e04c3fSmrg base_ir->insert_before(temp); 115301e04c3fSmrg base_ir->insert_before(assign(temp, sub(c32, bits))); 115401e04c3fSmrg 115501e04c3fSmrg /* expr = (value << (temp - offset)) >> temp; */ 115601e04c3fSmrg ir_expression *expr = 115701e04c3fSmrg rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp); 115801e04c3fSmrg 
115901e04c3fSmrg /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 116001e04c3fSmrg * 116101e04c3fSmrg * If bits is zero, the result will be zero. 116201e04c3fSmrg * 116301e04c3fSmrg * Due to the (x << (y%32)) behavior mentioned before, the (value << 116401e04c3fSmrg * (32-0)) doesn't "erase" all of the data as we would like, so finish 116501e04c3fSmrg * up with: 116601e04c3fSmrg * 116701e04c3fSmrg * (bits == 0) ? 0 : expr; 116801e04c3fSmrg */ 116901e04c3fSmrg ir->operation = ir_triop_csel; 117001e04c3fSmrg ir->init_num_operands(); 117101e04c3fSmrg ir->operands[0] = equal(c0, bits); 117201e04c3fSmrg ir->operands[1] = c0->clone(ir, NULL); 117301e04c3fSmrg ir->operands[2] = expr; 117401e04c3fSmrg } 117501e04c3fSmrg 117601e04c3fSmrg this->progress = true; 117701e04c3fSmrg} 117801e04c3fSmrg 117901e04c3fSmrgvoid 118001e04c3fSmrglower_instructions_visitor::insert_to_shifts(ir_expression *ir) 118101e04c3fSmrg{ 118201e04c3fSmrg ir_constant *c1; 118301e04c3fSmrg ir_constant *c32; 118401e04c3fSmrg ir_constant *cFFFFFFFF; 118501e04c3fSmrg ir_variable *offset = 118601e04c3fSmrg new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary); 118701e04c3fSmrg ir_variable *bits = 118801e04c3fSmrg new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary); 118901e04c3fSmrg ir_variable *mask = 119001e04c3fSmrg new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary); 119101e04c3fSmrg 119201e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 119301e04c3fSmrg c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements); 119401e04c3fSmrg c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements); 119501e04c3fSmrg cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements); 119601e04c3fSmrg } else { 119701e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 119801e04c3fSmrg 119901e04c3fSmrg c1 = new(ir) ir_constant(1u, 
ir->operands[0]->type->vector_elements); 120001e04c3fSmrg c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements); 120101e04c3fSmrg cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements); 120201e04c3fSmrg } 120301e04c3fSmrg 120401e04c3fSmrg base_ir->insert_before(offset); 120501e04c3fSmrg base_ir->insert_before(assign(offset, ir->operands[2])); 120601e04c3fSmrg 120701e04c3fSmrg base_ir->insert_before(bits); 120801e04c3fSmrg base_ir->insert_before(assign(bits, ir->operands[3])); 120901e04c3fSmrg 121001e04c3fSmrg /* At least some hardware treats (x << y) as (x << (y%32)). This means 121101e04c3fSmrg * we'd get a mask of 0 when bits is 32. Special case it. 121201e04c3fSmrg * 121301e04c3fSmrg * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset; 121401e04c3fSmrg * 121501e04c3fSmrg * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 121601e04c3fSmrg * 121701e04c3fSmrg * The result will be undefined if offset or bits is negative, or if the 121801e04c3fSmrg * sum of offset and bits is greater than the number of bits used to 121901e04c3fSmrg * store the operand. 122001e04c3fSmrg * 122101e04c3fSmrg * Since it's undefined, there are a couple other ways this could be 122201e04c3fSmrg * implemented. The other way that was considered was to put the csel 122301e04c3fSmrg * around the whole thing: 122401e04c3fSmrg * 122501e04c3fSmrg * final_result = bits == 32 ? insert : ... 
; 122601e04c3fSmrg */ 122701e04c3fSmrg base_ir->insert_before(mask); 122801e04c3fSmrg 122901e04c3fSmrg base_ir->insert_before(assign(mask, csel(equal(bits, c32), 123001e04c3fSmrg cFFFFFFFF, 123101e04c3fSmrg lshift(sub(lshift(c1, bits), 123201e04c3fSmrg c1->clone(ir, NULL)), 123301e04c3fSmrg offset)))); 123401e04c3fSmrg 123501e04c3fSmrg /* (base & ~mask) | ((insert << offset) & mask) */ 123601e04c3fSmrg ir->operation = ir_binop_bit_or; 123701e04c3fSmrg ir->init_num_operands(); 123801e04c3fSmrg ir->operands[0] = bit_and(ir->operands[0], bit_not(mask)); 123901e04c3fSmrg ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask); 124001e04c3fSmrg ir->operands[2] = NULL; 124101e04c3fSmrg ir->operands[3] = NULL; 124201e04c3fSmrg 124301e04c3fSmrg this->progress = true; 124401e04c3fSmrg} 124501e04c3fSmrg 124601e04c3fSmrgvoid 124701e04c3fSmrglower_instructions_visitor::reverse_to_shifts(ir_expression *ir) 124801e04c3fSmrg{ 124901e04c3fSmrg /* For more details, see: 125001e04c3fSmrg * 125101e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel 125201e04c3fSmrg */ 125301e04c3fSmrg ir_constant *c1 = 125401e04c3fSmrg new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements); 125501e04c3fSmrg ir_constant *c2 = 125601e04c3fSmrg new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements); 125701e04c3fSmrg ir_constant *c4 = 125801e04c3fSmrg new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements); 125901e04c3fSmrg ir_constant *c8 = 126001e04c3fSmrg new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements); 126101e04c3fSmrg ir_constant *c16 = 126201e04c3fSmrg new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements); 126301e04c3fSmrg ir_constant *c33333333 = 126401e04c3fSmrg new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements); 126501e04c3fSmrg ir_constant *c55555555 = 126601e04c3fSmrg new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements); 126701e04c3fSmrg ir_constant *c0F0F0F0F = 
126801e04c3fSmrg new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements); 126901e04c3fSmrg ir_constant *c00FF00FF = 127001e04c3fSmrg new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements); 127101e04c3fSmrg ir_variable *temp = 127201e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements), 127301e04c3fSmrg "temp", ir_var_temporary); 127401e04c3fSmrg ir_instruction &i = *base_ir; 127501e04c3fSmrg 127601e04c3fSmrg i.insert_before(temp); 127701e04c3fSmrg 127801e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 127901e04c3fSmrg i.insert_before(assign(temp, ir->operands[0])); 128001e04c3fSmrg } else { 128101e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 128201e04c3fSmrg i.insert_before(assign(temp, i2u(ir->operands[0]))); 128301e04c3fSmrg } 128401e04c3fSmrg 128501e04c3fSmrg /* Swap odd and even bits. 128601e04c3fSmrg * 128701e04c3fSmrg * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1); 128801e04c3fSmrg */ 128901e04c3fSmrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555), 129001e04c3fSmrg lshift(bit_and(temp, c55555555->clone(ir, NULL)), 129101e04c3fSmrg c1->clone(ir, NULL))))); 129201e04c3fSmrg /* Swap consecutive pairs. 129301e04c3fSmrg * 129401e04c3fSmrg * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2); 129501e04c3fSmrg */ 129601e04c3fSmrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333), 129701e04c3fSmrg lshift(bit_and(temp, c33333333->clone(ir, NULL)), 129801e04c3fSmrg c2->clone(ir, NULL))))); 129901e04c3fSmrg 130001e04c3fSmrg /* Swap nibbles. 
130101e04c3fSmrg * 130201e04c3fSmrg * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4); 130301e04c3fSmrg */ 130401e04c3fSmrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F), 130501e04c3fSmrg lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)), 130601e04c3fSmrg c4->clone(ir, NULL))))); 130701e04c3fSmrg 130801e04c3fSmrg /* The last step is, basically, bswap. Swap the bytes, then swap the 130901e04c3fSmrg * words. When this code is run through GCC on x86, it does generate a 131001e04c3fSmrg * bswap instruction. 131101e04c3fSmrg * 131201e04c3fSmrg * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8); 131301e04c3fSmrg * temp = ( temp >> 16 ) | ( temp << 16); 131401e04c3fSmrg */ 131501e04c3fSmrg i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF), 131601e04c3fSmrg lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)), 131701e04c3fSmrg c8->clone(ir, NULL))))); 131801e04c3fSmrg 131901e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 132001e04c3fSmrg ir->operation = ir_binop_bit_or; 132101e04c3fSmrg ir->init_num_operands(); 132201e04c3fSmrg ir->operands[0] = rshift(temp, c16); 132301e04c3fSmrg ir->operands[1] = lshift(temp, c16->clone(ir, NULL)); 132401e04c3fSmrg } else { 132501e04c3fSmrg ir->operation = ir_unop_u2i; 132601e04c3fSmrg ir->init_num_operands(); 132701e04c3fSmrg ir->operands[0] = bit_or(rshift(temp, c16), 132801e04c3fSmrg lshift(temp, c16->clone(ir, NULL))); 132901e04c3fSmrg } 133001e04c3fSmrg 133101e04c3fSmrg this->progress = true; 133201e04c3fSmrg} 133301e04c3fSmrg 133401e04c3fSmrgvoid 133501e04c3fSmrglower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir) 133601e04c3fSmrg{ 133701e04c3fSmrg /* For more details, see: 133801e04c3fSmrg * 133901e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 134001e04c3fSmrg */ 134101e04c3fSmrg const unsigned elements = ir->operands[0]->type->vector_elements; 134201e04c3fSmrg ir_constant *c0 
= new(ir) ir_constant(unsigned(0), elements); 134301e04c3fSmrg ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 134401e04c3fSmrg ir_constant *c23 = new(ir) ir_constant(int(23), elements); 134501e04c3fSmrg ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 134601e04c3fSmrg ir_variable *temp = 134701e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary); 134801e04c3fSmrg ir_variable *lsb_only = 134901e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary); 135001e04c3fSmrg ir_variable *as_float = 135101e04c3fSmrg new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 135201e04c3fSmrg ir_variable *lsb = 135301e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary); 135401e04c3fSmrg 135501e04c3fSmrg ir_instruction &i = *base_ir; 135601e04c3fSmrg 135701e04c3fSmrg i.insert_before(temp); 135801e04c3fSmrg 135901e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) { 136001e04c3fSmrg i.insert_before(assign(temp, ir->operands[0])); 136101e04c3fSmrg } else { 136201e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 136301e04c3fSmrg i.insert_before(assign(temp, u2i(ir->operands[0]))); 136401e04c3fSmrg } 136501e04c3fSmrg 136601e04c3fSmrg /* The int-to-float conversion is lossless because (value & -value) is 136701e04c3fSmrg * either a power of two or zero. We don't use the result in the zero 136801e04c3fSmrg * case. The uint() cast is necessary so that 0x80000000 does not 136901e04c3fSmrg * generate a negative value. 
137001e04c3fSmrg * 137101e04c3fSmrg * uint lsb_only = uint(value & -value); 137201e04c3fSmrg * float as_float = float(lsb_only); 137301e04c3fSmrg */ 137401e04c3fSmrg i.insert_before(lsb_only); 137501e04c3fSmrg i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp))))); 137601e04c3fSmrg 137701e04c3fSmrg i.insert_before(as_float); 137801e04c3fSmrg i.insert_before(assign(as_float, u2f(lsb_only))); 137901e04c3fSmrg 138001e04c3fSmrg /* This is basically an open-coded frexp. Implementations that have a 138101e04c3fSmrg * native frexp instruction would be better served by that. This is 138201e04c3fSmrg * optimized versus a full-featured open-coded implementation in two ways: 138301e04c3fSmrg * 138401e04c3fSmrg * - We don't care about a correct result from subnormal numbers (including 138501e04c3fSmrg * 0.0), so the raw exponent can always be safely unbiased. 138601e04c3fSmrg * 138701e04c3fSmrg * - The value cannot be negative, so it does not need to be masked off to 138801e04c3fSmrg * extract the exponent. 138901e04c3fSmrg * 139001e04c3fSmrg * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f; 139101e04c3fSmrg */ 139201e04c3fSmrg i.insert_before(lsb); 139301e04c3fSmrg i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 139401e04c3fSmrg 139501e04c3fSmrg /* Use lsb_only in the comparison instead of temp so that the & (far above) 139601e04c3fSmrg * can possibly generate the result without an explicit comparison. 139701e04c3fSmrg * 139801e04c3fSmrg * (lsb_only == 0) ? -1 : lsb; 139901e04c3fSmrg * 140001e04c3fSmrg * Since our input values are all integers, the unbiased exponent must not 140101e04c3fSmrg * be negative. It will only be negative (-0x7f, in fact) if lsb_only is 140201e04c3fSmrg * 0. Instead of using (lsb_only == 0), we could use (lsb >= 0). Which is 140301e04c3fSmrg * better is likely GPU dependent. Either way, the difference should be 140401e04c3fSmrg * small. 
140501e04c3fSmrg */ 140601e04c3fSmrg ir->operation = ir_triop_csel; 140701e04c3fSmrg ir->init_num_operands(); 140801e04c3fSmrg ir->operands[0] = equal(lsb_only, c0); 140901e04c3fSmrg ir->operands[1] = cminus1; 141001e04c3fSmrg ir->operands[2] = new(ir) ir_dereference_variable(lsb); 141101e04c3fSmrg 141201e04c3fSmrg this->progress = true; 141301e04c3fSmrg} 141401e04c3fSmrg 141501e04c3fSmrgvoid 141601e04c3fSmrglower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir) 141701e04c3fSmrg{ 141801e04c3fSmrg /* For more details, see: 141901e04c3fSmrg * 142001e04c3fSmrg * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast 142101e04c3fSmrg */ 142201e04c3fSmrg const unsigned elements = ir->operands[0]->type->vector_elements; 142301e04c3fSmrg ir_constant *c0 = new(ir) ir_constant(int(0), elements); 142401e04c3fSmrg ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements); 142501e04c3fSmrg ir_constant *c23 = new(ir) ir_constant(int(23), elements); 142601e04c3fSmrg ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements); 142701e04c3fSmrg ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements); 142801e04c3fSmrg ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements); 142901e04c3fSmrg ir_variable *temp = 143001e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary); 143101e04c3fSmrg ir_variable *as_float = 143201e04c3fSmrg new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary); 143301e04c3fSmrg ir_variable *msb = 143401e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary); 143501e04c3fSmrg 143601e04c3fSmrg ir_instruction &i = *base_ir; 143701e04c3fSmrg 143801e04c3fSmrg i.insert_before(temp); 143901e04c3fSmrg 144001e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 144101e04c3fSmrg i.insert_before(assign(temp, ir->operands[0])); 144201e04c3fSmrg } else { 144301e04c3fSmrg assert(ir->operands[0]->type->base_type == 
GLSL_TYPE_INT); 144401e04c3fSmrg 144501e04c3fSmrg /* findMSB(uint(abs(some_int))) almost always does the right thing. 144601e04c3fSmrg * There are two problem values: 144701e04c3fSmrg * 144801e04c3fSmrg * * 0x80000000. Since abs(0x80000000) == 0x80000000, findMSB returns 144901e04c3fSmrg * 31. However, findMSB(int(0x80000000)) == 30. 145001e04c3fSmrg * 145101e04c3fSmrg * * 0xffffffff. Since abs(0xffffffff) == 1, findMSB returns 145201e04c3fSmrg * 31. Section 8.8 (Integer Functions) of the GLSL 4.50 spec says: 145301e04c3fSmrg * 145401e04c3fSmrg * For a value of zero or negative one, -1 will be returned. 145501e04c3fSmrg * 145601e04c3fSmrg * For all negative number cases, including 0x80000000 and 0xffffffff, 145701e04c3fSmrg * the correct value is obtained from findMSB if instead of negating the 145801e04c3fSmrg * (already negative) value the logical-not is used. A conditonal 145901e04c3fSmrg * logical-not can be achieved in two instructions. 146001e04c3fSmrg */ 146101e04c3fSmrg ir_variable *as_int = 146201e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary); 146301e04c3fSmrg ir_constant *c31 = new(ir) ir_constant(int(31), elements); 146401e04c3fSmrg 146501e04c3fSmrg i.insert_before(as_int); 146601e04c3fSmrg i.insert_before(assign(as_int, ir->operands[0])); 146701e04c3fSmrg i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor, 146801e04c3fSmrg as_int, 146901e04c3fSmrg rshift(as_int, c31))))); 147001e04c3fSmrg } 147101e04c3fSmrg 147201e04c3fSmrg /* The int-to-float conversion is lossless because bits are conditionally 147301e04c3fSmrg * masked off the bottom of temp to ensure the value has at most 24 bits of 147401e04c3fSmrg * data or is zero. We don't use the result in the zero case. The uint() 147501e04c3fSmrg * cast is necessary so that 0x80000000 does not generate a negative value. 147601e04c3fSmrg * 147701e04c3fSmrg * float as_float = float(temp > 255 ? 
temp & ~255 : temp); 147801e04c3fSmrg */ 147901e04c3fSmrg i.insert_before(as_float); 148001e04c3fSmrg i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF), 148101e04c3fSmrg bit_and(temp, cFFFFFF00), 148201e04c3fSmrg temp)))); 148301e04c3fSmrg 148401e04c3fSmrg /* This is basically an open-coded frexp. Implementations that have a 148501e04c3fSmrg * native frexp instruction would be better served by that. This is 148601e04c3fSmrg * optimized versus a full-featured open-coded implementation in two ways: 148701e04c3fSmrg * 148801e04c3fSmrg * - We don't care about a correct result from subnormal numbers (including 148901e04c3fSmrg * 0.0), so the raw exponent can always be safely unbiased. 149001e04c3fSmrg * 149101e04c3fSmrg * - The value cannot be negative, so it does not need to be masked off to 149201e04c3fSmrg * extract the exponent. 149301e04c3fSmrg * 149401e04c3fSmrg * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f; 149501e04c3fSmrg */ 149601e04c3fSmrg i.insert_before(msb); 149701e04c3fSmrg i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F))); 149801e04c3fSmrg 149901e04c3fSmrg /* Use msb in the comparison instead of temp so that the subtract can 150001e04c3fSmrg * possibly generate the result without an explicit comparison. 150101e04c3fSmrg * 150201e04c3fSmrg * (msb < 0) ? -1 : msb; 150301e04c3fSmrg * 150401e04c3fSmrg * Since our input values are all integers, the unbiased exponent must not 150501e04c3fSmrg * be negative. It will only be negative (-0x7f, in fact) if temp is 0. 
150601e04c3fSmrg */ 150701e04c3fSmrg ir->operation = ir_triop_csel; 150801e04c3fSmrg ir->init_num_operands(); 150901e04c3fSmrg ir->operands[0] = less(msb, c0); 151001e04c3fSmrg ir->operands[1] = cminus1; 151101e04c3fSmrg ir->operands[2] = new(ir) ir_dereference_variable(msb); 151201e04c3fSmrg 151301e04c3fSmrg this->progress = true; 151401e04c3fSmrg} 151501e04c3fSmrg 151601e04c3fSmrgir_expression * 151701e04c3fSmrglower_instructions_visitor::_carry(operand a, operand b) 151801e04c3fSmrg{ 151901e04c3fSmrg if (lowering(CARRY_TO_ARITH)) 152001e04c3fSmrg return i2u(b2i(less(add(a, b), 152101e04c3fSmrg a.val->clone(ralloc_parent(a.val), NULL)))); 152201e04c3fSmrg else 152301e04c3fSmrg return carry(a, b); 152401e04c3fSmrg} 152501e04c3fSmrg 15267ec681f3Smrgir_constant * 15277ec681f3Smrglower_instructions_visitor::_imm_fp(void *mem_ctx, 15287ec681f3Smrg const glsl_type *type, 15297ec681f3Smrg double f, 15307ec681f3Smrg unsigned vector_elements) 15317ec681f3Smrg{ 15327ec681f3Smrg switch (type->base_type) { 15337ec681f3Smrg case GLSL_TYPE_FLOAT: 15347ec681f3Smrg return new(mem_ctx) ir_constant((float) f, vector_elements); 15357ec681f3Smrg case GLSL_TYPE_DOUBLE: 15367ec681f3Smrg return new(mem_ctx) ir_constant((double) f, vector_elements); 15377ec681f3Smrg case GLSL_TYPE_FLOAT16: 15387ec681f3Smrg return new(mem_ctx) ir_constant(float16_t(f), vector_elements); 15397ec681f3Smrg default: 15407ec681f3Smrg assert(!"unknown float type for immediate"); 15417ec681f3Smrg return NULL; 15427ec681f3Smrg } 15437ec681f3Smrg} 15447ec681f3Smrg 154501e04c3fSmrgvoid 154601e04c3fSmrglower_instructions_visitor::imul_high_to_mul(ir_expression *ir) 154701e04c3fSmrg{ 154801e04c3fSmrg /* ABCD 154901e04c3fSmrg * * EFGH 155001e04c3fSmrg * ====== 155101e04c3fSmrg * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32 155201e04c3fSmrg * 155301e04c3fSmrg * In GLSL, (a * b) becomes 155401e04c3fSmrg * 155501e04c3fSmrg * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu); 155601e04c3fSmrg * uint m2 = 
(a & 0x0000ffffu) * (b >> 16); 155701e04c3fSmrg * uint m3 = (a >> 16) * (b & 0x0000ffffu); 155801e04c3fSmrg * uint m4 = (a >> 16) * (b >> 16); 155901e04c3fSmrg * 156001e04c3fSmrg * uint c1; 156101e04c3fSmrg * uint c2; 156201e04c3fSmrg * uint lo_result; 156301e04c3fSmrg * uint hi_result; 156401e04c3fSmrg * 156501e04c3fSmrg * lo_result = uaddCarry(m1, m2 << 16, c1); 156601e04c3fSmrg * hi_result = m4 + c1; 156701e04c3fSmrg * lo_result = uaddCarry(lo_result, m3 << 16, c2); 156801e04c3fSmrg * hi_result = hi_result + c2; 156901e04c3fSmrg * hi_result = hi_result + (m2 >> 16) + (m3 >> 16); 157001e04c3fSmrg */ 157101e04c3fSmrg const unsigned elements = ir->operands[0]->type->vector_elements; 157201e04c3fSmrg ir_variable *src1 = 157301e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary); 157401e04c3fSmrg ir_variable *src1h = 157501e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary); 157601e04c3fSmrg ir_variable *src1l = 157701e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary); 157801e04c3fSmrg ir_variable *src2 = 157901e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary); 158001e04c3fSmrg ir_variable *src2h = 158101e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary); 158201e04c3fSmrg ir_variable *src2l = 158301e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary); 158401e04c3fSmrg ir_variable *t1 = 158501e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary); 158601e04c3fSmrg ir_variable *t2 = 158701e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary); 158801e04c3fSmrg ir_variable *lo = 158901e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary); 159001e04c3fSmrg ir_variable *hi = 159101e04c3fSmrg new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary); 159201e04c3fSmrg ir_variable 
*different_signs = NULL; 159301e04c3fSmrg ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements); 159401e04c3fSmrg ir_constant *c16 = new(ir) ir_constant(16u, elements); 159501e04c3fSmrg 159601e04c3fSmrg ir_instruction &i = *base_ir; 159701e04c3fSmrg 159801e04c3fSmrg i.insert_before(src1); 159901e04c3fSmrg i.insert_before(src2); 160001e04c3fSmrg i.insert_before(src1h); 160101e04c3fSmrg i.insert_before(src2h); 160201e04c3fSmrg i.insert_before(src1l); 160301e04c3fSmrg i.insert_before(src2l); 160401e04c3fSmrg 160501e04c3fSmrg if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) { 160601e04c3fSmrg i.insert_before(assign(src1, ir->operands[0])); 160701e04c3fSmrg i.insert_before(assign(src2, ir->operands[1])); 160801e04c3fSmrg } else { 160901e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 161001e04c3fSmrg 161101e04c3fSmrg ir_variable *itmp1 = 161201e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary); 161301e04c3fSmrg ir_variable *itmp2 = 161401e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary); 161501e04c3fSmrg ir_constant *c0 = new(ir) ir_constant(int(0), elements); 161601e04c3fSmrg 161701e04c3fSmrg i.insert_before(itmp1); 161801e04c3fSmrg i.insert_before(itmp2); 161901e04c3fSmrg i.insert_before(assign(itmp1, ir->operands[0])); 162001e04c3fSmrg i.insert_before(assign(itmp2, ir->operands[1])); 162101e04c3fSmrg 162201e04c3fSmrg different_signs = 162301e04c3fSmrg new(ir) ir_variable(glsl_type::bvec(elements), "different_signs", 162401e04c3fSmrg ir_var_temporary); 162501e04c3fSmrg 162601e04c3fSmrg i.insert_before(different_signs); 162701e04c3fSmrg i.insert_before(assign(different_signs, expr(ir_binop_logic_xor, 162801e04c3fSmrg less(itmp1, c0), 162901e04c3fSmrg less(itmp2, c0->clone(ir, NULL))))); 163001e04c3fSmrg 163101e04c3fSmrg i.insert_before(assign(src1, i2u(abs(itmp1)))); 163201e04c3fSmrg i.insert_before(assign(src2, i2u(abs(itmp2)))); 163301e04c3fSmrg } 
163401e04c3fSmrg 163501e04c3fSmrg i.insert_before(assign(src1l, bit_and(src1, c0000FFFF))); 163601e04c3fSmrg i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL)))); 163701e04c3fSmrg i.insert_before(assign(src1h, rshift(src1, c16))); 163801e04c3fSmrg i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL)))); 163901e04c3fSmrg 164001e04c3fSmrg i.insert_before(lo); 164101e04c3fSmrg i.insert_before(hi); 164201e04c3fSmrg i.insert_before(t1); 164301e04c3fSmrg i.insert_before(t2); 164401e04c3fSmrg 164501e04c3fSmrg i.insert_before(assign(lo, mul(src1l, src2l))); 164601e04c3fSmrg i.insert_before(assign(t1, mul(src1l, src2h))); 164701e04c3fSmrg i.insert_before(assign(t2, mul(src1h, src2l))); 164801e04c3fSmrg i.insert_before(assign(hi, mul(src1h, src2h))); 164901e04c3fSmrg 165001e04c3fSmrg i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL)))))); 165101e04c3fSmrg i.insert_before(assign(lo, add(lo, lshift(t1, c16->clone(ir, NULL))))); 165201e04c3fSmrg 165301e04c3fSmrg i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL)))))); 165401e04c3fSmrg i.insert_before(assign(lo, add(lo, lshift(t2, c16->clone(ir, NULL))))); 165501e04c3fSmrg 165601e04c3fSmrg if (different_signs == NULL) { 165701e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT); 165801e04c3fSmrg 165901e04c3fSmrg ir->operation = ir_binop_add; 166001e04c3fSmrg ir->init_num_operands(); 166101e04c3fSmrg ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL))); 166201e04c3fSmrg ir->operands[1] = rshift(t2, c16->clone(ir, NULL)); 166301e04c3fSmrg } else { 166401e04c3fSmrg assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT); 166501e04c3fSmrg 166601e04c3fSmrg i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))), 166701e04c3fSmrg rshift(t2, c16->clone(ir, NULL))))); 166801e04c3fSmrg 166901e04c3fSmrg /* For channels where different_signs is set we have to perform a 64-bit 167001e04c3fSmrg * negation. 
This is *not* the same as just negating the high 32-bits. 167101e04c3fSmrg * Consider -3 * 2. The high 32-bits is 0, but the desired result is 167201e04c3fSmrg * -1, not -0! Recall -x == ~x + 1. 167301e04c3fSmrg */ 167401e04c3fSmrg ir_variable *neg_hi = 167501e04c3fSmrg new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary); 167601e04c3fSmrg ir_constant *c1 = new(ir) ir_constant(1u, elements); 167701e04c3fSmrg 167801e04c3fSmrg i.insert_before(neg_hi); 167901e04c3fSmrg i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)), 168001e04c3fSmrg u2i(_carry(bit_not(lo), c1))))); 168101e04c3fSmrg 168201e04c3fSmrg ir->operation = ir_triop_csel; 168301e04c3fSmrg ir->init_num_operands(); 168401e04c3fSmrg ir->operands[0] = new(ir) ir_dereference_variable(different_signs); 168501e04c3fSmrg ir->operands[1] = new(ir) ir_dereference_variable(neg_hi); 168601e04c3fSmrg ir->operands[2] = u2i(hi); 168701e04c3fSmrg } 168801e04c3fSmrg} 168901e04c3fSmrg 169001e04c3fSmrgvoid 169101e04c3fSmrglower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir) 169201e04c3fSmrg{ 169301e04c3fSmrg ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]); 169401e04c3fSmrg this->progress = true; 169501e04c3fSmrg} 169601e04c3fSmrg 16977e102996Smayavoid 16987e102996Smayalower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir) 16997e102996Smaya{ 17007e102996Smaya /* Lower 32x32-> 64 to 17017e102996Smaya * msb = imul_high(x_lo, y_lo) 17027e102996Smaya * lsb = mul(x_lo, y_lo) 17037e102996Smaya */ 17047e102996Smaya const unsigned elements = ir->operands[0]->type->vector_elements; 17057e102996Smaya 17067e102996Smaya const ir_expression_operation operation = 17077e102996Smaya ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32 17087e102996Smaya : ir_unop_pack_int_2x32; 17097e102996Smaya 17107e102996Smaya const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64 17117e102996Smaya ? 
glsl_type::uvec(elements) 17127e102996Smaya : glsl_type::ivec(elements); 17137e102996Smaya 17147e102996Smaya const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64 17157e102996Smaya ? glsl_type::uvec2_type 17167e102996Smaya : glsl_type::ivec2_type; 17177e102996Smaya 17187e102996Smaya ir_instruction &i = *base_ir; 17197e102996Smaya 17207e102996Smaya ir_variable *msb = 17217e102996Smaya new(ir) ir_variable(var_type, "msb", ir_var_temporary); 17227e102996Smaya ir_variable *lsb = 17237e102996Smaya new(ir) ir_variable(var_type, "lsb", ir_var_temporary); 17247e102996Smaya ir_variable *x = 17257e102996Smaya new(ir) ir_variable(var_type, "x", ir_var_temporary); 17267e102996Smaya ir_variable *y = 17277e102996Smaya new(ir) ir_variable(var_type, "y", ir_var_temporary); 17287e102996Smaya 17297e102996Smaya i.insert_before(x); 17307e102996Smaya i.insert_before(assign(x, ir->operands[0])); 17317e102996Smaya i.insert_before(y); 17327e102996Smaya i.insert_before(assign(y, ir->operands[1])); 17337e102996Smaya i.insert_before(msb); 17347e102996Smaya i.insert_before(lsb); 17357e102996Smaya 17367e102996Smaya i.insert_before(assign(msb, imul_high(x, y))); 17377e102996Smaya i.insert_before(assign(lsb, mul(x, y))); 17387e102996Smaya 17397e102996Smaya ir_rvalue *result[4] = {NULL}; 17407e102996Smaya for (unsigned elem = 0; elem < elements; elem++) { 17417e102996Smaya ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type, 17427e102996Smaya swizzle(lsb, elem, 1), 17437e102996Smaya swizzle(msb, elem, 1), NULL, NULL); 17447e102996Smaya result[elem] = expr(operation, val); 17457e102996Smaya } 17467e102996Smaya 17477e102996Smaya ir->operation = ir_quadop_vector; 17487e102996Smaya ir->init_num_operands(); 17497e102996Smaya ir->operands[0] = result[0]; 17507e102996Smaya ir->operands[1] = result[1]; 17517e102996Smaya ir->operands[2] = result[2]; 17527e102996Smaya ir->operands[3] = result[3]; 17537e102996Smaya 17547e102996Smaya this->progress = true; 17557e102996Smaya} 
17567e102996Smaya 175701e04c3fSmrgir_visitor_status 175801e04c3fSmrglower_instructions_visitor::visit_leave(ir_expression *ir) 175901e04c3fSmrg{ 176001e04c3fSmrg switch (ir->operation) { 176101e04c3fSmrg case ir_binop_dot: 176201e04c3fSmrg if (ir->operands[0]->type->is_double()) 176301e04c3fSmrg double_dot_to_fma(ir); 176401e04c3fSmrg break; 176501e04c3fSmrg case ir_triop_lrp: 176601e04c3fSmrg if (ir->operands[0]->type->is_double()) 176701e04c3fSmrg double_lrp(ir); 176801e04c3fSmrg break; 176901e04c3fSmrg case ir_binop_sub: 177001e04c3fSmrg if (lowering(SUB_TO_ADD_NEG)) 177101e04c3fSmrg sub_to_add_neg(ir); 177201e04c3fSmrg break; 177301e04c3fSmrg 177401e04c3fSmrg case ir_binop_div: 17757ec681f3Smrg if (ir->operands[1]->type->is_integer_32() && lowering(INT_DIV_TO_MUL_RCP)) 177601e04c3fSmrg int_div_to_mul_rcp(ir); 17777ec681f3Smrg else if ((ir->operands[1]->type->is_float_16_32() && lowering(FDIV_TO_MUL_RCP)) || 177801e04c3fSmrg (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP))) 177901e04c3fSmrg div_to_mul_rcp(ir); 178001e04c3fSmrg break; 178101e04c3fSmrg 178201e04c3fSmrg case ir_unop_exp: 178301e04c3fSmrg if (lowering(EXP_TO_EXP2)) 178401e04c3fSmrg exp_to_exp2(ir); 178501e04c3fSmrg break; 178601e04c3fSmrg 178701e04c3fSmrg case ir_unop_log: 178801e04c3fSmrg if (lowering(LOG_TO_LOG2)) 178901e04c3fSmrg log_to_log2(ir); 179001e04c3fSmrg break; 179101e04c3fSmrg 179201e04c3fSmrg case ir_binop_mod: 17937ec681f3Smrg if (lowering(MOD_TO_FLOOR) && ir->type->is_float_16_32_64()) 179401e04c3fSmrg mod_to_floor(ir); 179501e04c3fSmrg break; 179601e04c3fSmrg 179701e04c3fSmrg case ir_binop_pow: 179801e04c3fSmrg if (lowering(POW_TO_EXP2)) 179901e04c3fSmrg pow_to_exp2(ir); 180001e04c3fSmrg break; 180101e04c3fSmrg 180201e04c3fSmrg case ir_binop_ldexp: 180301e04c3fSmrg if (lowering(LDEXP_TO_ARITH) && ir->type->is_float()) 180401e04c3fSmrg ldexp_to_arith(ir); 180501e04c3fSmrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double()) 180601e04c3fSmrg 
dldexp_to_arith(ir); 180701e04c3fSmrg break; 180801e04c3fSmrg 180901e04c3fSmrg case ir_unop_frexp_exp: 181001e04c3fSmrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 181101e04c3fSmrg dfrexp_exp_to_arith(ir); 181201e04c3fSmrg break; 181301e04c3fSmrg 181401e04c3fSmrg case ir_unop_frexp_sig: 181501e04c3fSmrg if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double()) 181601e04c3fSmrg dfrexp_sig_to_arith(ir); 181701e04c3fSmrg break; 181801e04c3fSmrg 181901e04c3fSmrg case ir_binop_carry: 182001e04c3fSmrg if (lowering(CARRY_TO_ARITH)) 182101e04c3fSmrg carry_to_arith(ir); 182201e04c3fSmrg break; 182301e04c3fSmrg 182401e04c3fSmrg case ir_binop_borrow: 182501e04c3fSmrg if (lowering(BORROW_TO_ARITH)) 182601e04c3fSmrg borrow_to_arith(ir); 182701e04c3fSmrg break; 182801e04c3fSmrg 182901e04c3fSmrg case ir_unop_saturate: 183001e04c3fSmrg if (lowering(SAT_TO_CLAMP)) 183101e04c3fSmrg sat_to_clamp(ir); 183201e04c3fSmrg break; 183301e04c3fSmrg 183401e04c3fSmrg case ir_unop_trunc: 183501e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 183601e04c3fSmrg dtrunc_to_dfrac(ir); 183701e04c3fSmrg break; 183801e04c3fSmrg 183901e04c3fSmrg case ir_unop_ceil: 184001e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 184101e04c3fSmrg dceil_to_dfrac(ir); 184201e04c3fSmrg break; 184301e04c3fSmrg 184401e04c3fSmrg case ir_unop_floor: 184501e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 184601e04c3fSmrg dfloor_to_dfrac(ir); 184701e04c3fSmrg break; 184801e04c3fSmrg 184901e04c3fSmrg case ir_unop_round_even: 185001e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 185101e04c3fSmrg dround_even_to_dfrac(ir); 185201e04c3fSmrg break; 185301e04c3fSmrg 185401e04c3fSmrg case ir_unop_sign: 185501e04c3fSmrg if (lowering(DOPS_TO_DFRAC) && ir->type->is_double()) 185601e04c3fSmrg dsign_to_csel(ir); 185701e04c3fSmrg break; 185801e04c3fSmrg 185901e04c3fSmrg case ir_unop_bit_count: 186001e04c3fSmrg if 
(lowering(BIT_COUNT_TO_MATH)) 186101e04c3fSmrg bit_count_to_math(ir); 186201e04c3fSmrg break; 186301e04c3fSmrg 186401e04c3fSmrg case ir_triop_bitfield_extract: 186501e04c3fSmrg if (lowering(EXTRACT_TO_SHIFTS)) 186601e04c3fSmrg extract_to_shifts(ir); 186701e04c3fSmrg break; 186801e04c3fSmrg 186901e04c3fSmrg case ir_quadop_bitfield_insert: 187001e04c3fSmrg if (lowering(INSERT_TO_SHIFTS)) 187101e04c3fSmrg insert_to_shifts(ir); 187201e04c3fSmrg break; 187301e04c3fSmrg 187401e04c3fSmrg case ir_unop_bitfield_reverse: 187501e04c3fSmrg if (lowering(REVERSE_TO_SHIFTS)) 187601e04c3fSmrg reverse_to_shifts(ir); 187701e04c3fSmrg break; 187801e04c3fSmrg 187901e04c3fSmrg case ir_unop_find_lsb: 188001e04c3fSmrg if (lowering(FIND_LSB_TO_FLOAT_CAST)) 188101e04c3fSmrg find_lsb_to_float_cast(ir); 188201e04c3fSmrg break; 188301e04c3fSmrg 188401e04c3fSmrg case ir_unop_find_msb: 188501e04c3fSmrg if (lowering(FIND_MSB_TO_FLOAT_CAST)) 188601e04c3fSmrg find_msb_to_float_cast(ir); 188701e04c3fSmrg break; 188801e04c3fSmrg 188901e04c3fSmrg case ir_binop_imul_high: 189001e04c3fSmrg if (lowering(IMUL_HIGH_TO_MUL)) 189101e04c3fSmrg imul_high_to_mul(ir); 189201e04c3fSmrg break; 189301e04c3fSmrg 18947e102996Smaya case ir_binop_mul: 18957e102996Smaya if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) && 18967e102996Smaya (ir->type->base_type == GLSL_TYPE_INT64 || 18977e102996Smaya ir->type->base_type == GLSL_TYPE_UINT64) && 18987e102996Smaya (ir->operands[0]->type->base_type == GLSL_TYPE_INT || 18997e102996Smaya ir->operands[1]->type->base_type == GLSL_TYPE_UINT)) 19007e102996Smaya mul64_to_mul_and_mul_high(ir); 19017e102996Smaya break; 19027e102996Smaya 190301e04c3fSmrg case ir_unop_rsq: 190401e04c3fSmrg case ir_unop_sqrt: 190501e04c3fSmrg if (lowering(SQRT_TO_ABS_SQRT)) 190601e04c3fSmrg sqrt_to_abs_sqrt(ir); 190701e04c3fSmrg break; 190801e04c3fSmrg 190901e04c3fSmrg default: 191001e04c3fSmrg return visit_continue; 191101e04c3fSmrg } 191201e04c3fSmrg 191301e04c3fSmrg return visit_continue; 191401e04c3fSmrg} 
1915