1/*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21 * DEALINGS IN THE SOFTWARE.
22 */
23
24/**
25 * \file lower_instructions.cpp
26 *
27 * Many GPUs lack native instructions for certain expression operations, and
28 * must replace them with some other expression tree.  This pass lowers some
29 * of the most common cases, allowing the lowering code to be implemented once
30 * rather than in each driver backend.
31 *
32 * Currently supported transformations:
33 * - SUB_TO_ADD_NEG
34 * - DIV_TO_MUL_RCP
35 * - INT_DIV_TO_MUL_RCP
36 * - EXP_TO_EXP2
37 * - POW_TO_EXP2
38 * - LOG_TO_LOG2
39 * - MOD_TO_FLOOR
40 * - LDEXP_TO_ARITH
 * - DFREXP_DLDEXP_TO_ARITH
42 * - CARRY_TO_ARITH
43 * - BORROW_TO_ARITH
44 * - SAT_TO_CLAMP
45 * - DOPS_TO_DFRAC
46 *
47 * SUB_TO_ADD_NEG:
48 * ---------------
49 * Breaks an ir_binop_sub expression down to add(op0, neg(op1))
50 *
51 * This simplifies expression reassociation, and for many backends
52 * there is no subtract operation separate from adding the negation.
53 * For backends with native subtract operations, they will probably
54 * want to recognize add(op0, neg(op1)) or the other way around to
55 * produce a subtract anyway.
56 *
57 * FDIV_TO_MUL_RCP, DDIV_TO_MUL_RCP, and INT_DIV_TO_MUL_RCP:
58 * ---------------------------------------------------------
59 * Breaks an ir_binop_div expression down to op0 * (rcp(op1)).
60 *
61 * Many GPUs don't have a divide instruction (945 and 965 included),
62 * but they do have an RCP instruction to compute an approximate
63 * reciprocal.  By breaking the operation down, constant reciprocals
64 * can get constant folded.
65 *
66 * FDIV_TO_MUL_RCP only lowers single-precision floating point division;
67 * DDIV_TO_MUL_RCP only lowers double-precision floating point division.
68 * DIV_TO_MUL_RCP is a convenience macro that sets both flags.
69 * INT_DIV_TO_MUL_RCP handles the integer case, converting to and from floating
70 * point so that RCP is possible.
71 *
72 * EXP_TO_EXP2 and LOG_TO_LOG2:
73 * ----------------------------
74 * Many GPUs don't have a base e log or exponent instruction, but they
75 * do have base 2 versions, so this pass converts exp and log to exp2
76 * and log2 operations.
77 *
78 * POW_TO_EXP2:
79 * -----------
80 * Many older GPUs don't have an x**y instruction.  For these GPUs, convert
81 * x**y to 2**(y * log2(x)).
82 *
83 * MOD_TO_FLOOR:
84 * -------------
85 * Breaks an ir_binop_mod expression down to (op0 - op1 * floor(op0 / op1))
86 *
87 * Many GPUs don't have a MOD instruction (945 and 965 included), and
88 * if we have to break it down like this anyway, it gives an
89 * opportunity to do things like constant fold the (1.0 / op1) easily.
90 *
 * Note: this used to be implemented as op1 * fract(op0 / op1), but that
 * implementation had significant precision errors.
93 *
94 * LDEXP_TO_ARITH:
95 * -------------
96 * Converts ir_binop_ldexp to arithmetic and bit operations for float sources.
97 *
98 * DFREXP_DLDEXP_TO_ARITH:
99 * ---------------
100 * Converts ir_binop_ldexp, ir_unop_frexp_sig, and ir_unop_frexp_exp to
101 * arithmetic and bit ops for double arguments.
102 *
103 * CARRY_TO_ARITH:
104 * ---------------
 * Converts ir_binop_carry into (x + y) < x.
106 *
107 * BORROW_TO_ARITH:
108 * ----------------
 * Converts ir_binop_borrow into (x < y).
110 *
111 * SAT_TO_CLAMP:
112 * -------------
113 * Converts ir_unop_saturate into min(max(x, 0.0), 1.0)
114 *
115 * DOPS_TO_DFRAC:
116 * --------------
117 * Converts double trunc, ceil, floor, round to fract
118 */
119
120#include "c99_math.h"
121#include "program/prog_instruction.h" /* for swizzle */
122#include "compiler/glsl_types.h"
123#include "ir.h"
124#include "ir_builder.h"
125#include "ir_optimization.h"
126
127using namespace ir_builder;
128
129namespace {
130
131class lower_instructions_visitor : public ir_hierarchical_visitor {
132public:
133   lower_instructions_visitor(unsigned lower)
134      : progress(false), lower(lower) { }
135
136   ir_visitor_status visit_leave(ir_expression *);
137
138   bool progress;
139
140private:
141   unsigned lower; /** Bitfield of which operations to lower */
142
143   void sub_to_add_neg(ir_expression *);
144   void div_to_mul_rcp(ir_expression *);
145   void int_div_to_mul_rcp(ir_expression *);
146   void mod_to_floor(ir_expression *);
147   void exp_to_exp2(ir_expression *);
148   void pow_to_exp2(ir_expression *);
149   void log_to_log2(ir_expression *);
150   void ldexp_to_arith(ir_expression *);
151   void dldexp_to_arith(ir_expression *);
152   void dfrexp_sig_to_arith(ir_expression *);
153   void dfrexp_exp_to_arith(ir_expression *);
154   void carry_to_arith(ir_expression *);
155   void borrow_to_arith(ir_expression *);
156   void sat_to_clamp(ir_expression *);
157   void double_dot_to_fma(ir_expression *);
158   void double_lrp(ir_expression *);
159   void dceil_to_dfrac(ir_expression *);
160   void dfloor_to_dfrac(ir_expression *);
161   void dround_even_to_dfrac(ir_expression *);
162   void dtrunc_to_dfrac(ir_expression *);
163   void dsign_to_csel(ir_expression *);
164   void bit_count_to_math(ir_expression *);
165   void extract_to_shifts(ir_expression *);
166   void insert_to_shifts(ir_expression *);
167   void reverse_to_shifts(ir_expression *ir);
168   void find_lsb_to_float_cast(ir_expression *ir);
169   void find_msb_to_float_cast(ir_expression *ir);
170   void imul_high_to_mul(ir_expression *ir);
171   void sqrt_to_abs_sqrt(ir_expression *ir);
172   void mul64_to_mul_and_mul_high(ir_expression *ir);
173
174   ir_expression *_carry(operand a, operand b);
175};
176
177} /* anonymous namespace */
178
/**
 * Determine if a particular type of lowering should occur
 *
 * The argument is parenthesized so that a compound mask such as (A | B) is
 * tested as a whole; without the parentheses, operator precedence would turn
 * it into ((this->lower & A) | B).
 *
 * Only usable where \c this has a \c lower member.
 */
#define lowering(x) (this->lower & (x))
183
184bool
185lower_instructions(exec_list *instructions, unsigned what_to_lower)
186{
187   lower_instructions_visitor v(what_to_lower);
188
189   visit_list_elements(&v, instructions);
190   return v.progress;
191}
192
193void
194lower_instructions_visitor::sub_to_add_neg(ir_expression *ir)
195{
196   ir->operation = ir_binop_add;
197   ir->init_num_operands();
198   ir->operands[1] = new(ir) ir_expression(ir_unop_neg, ir->operands[1]->type,
199					   ir->operands[1], NULL);
200   this->progress = true;
201}
202
203void
204lower_instructions_visitor::div_to_mul_rcp(ir_expression *ir)
205{
206   assert(ir->operands[1]->type->is_float() || ir->operands[1]->type->is_double());
207
208   /* New expression for the 1.0 / op1 */
209   ir_rvalue *expr;
210   expr = new(ir) ir_expression(ir_unop_rcp,
211				ir->operands[1]->type,
212				ir->operands[1]);
213
214   /* op0 / op1 -> op0 * (1.0 / op1) */
215   ir->operation = ir_binop_mul;
216   ir->init_num_operands();
217   ir->operands[1] = expr;
218
219   this->progress = true;
220}
221
222void
223lower_instructions_visitor::int_div_to_mul_rcp(ir_expression *ir)
224{
225   assert(ir->operands[1]->type->is_integer());
226
227   /* Be careful with integer division -- we need to do it as a
228    * float and re-truncate, since rcp(n > 1) of an integer would
229    * just be 0.
230    */
231   ir_rvalue *op0, *op1;
232   const struct glsl_type *vec_type;
233
234   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
235				      ir->operands[1]->type->vector_elements,
236				      ir->operands[1]->type->matrix_columns);
237
238   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT)
239      op1 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[1], NULL);
240   else
241      op1 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[1], NULL);
242
243   op1 = new(ir) ir_expression(ir_unop_rcp, op1->type, op1, NULL);
244
245   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
246				      ir->operands[0]->type->vector_elements,
247				      ir->operands[0]->type->matrix_columns);
248
249   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT)
250      op0 = new(ir) ir_expression(ir_unop_i2f, vec_type, ir->operands[0], NULL);
251   else
252      op0 = new(ir) ir_expression(ir_unop_u2f, vec_type, ir->operands[0], NULL);
253
254   vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
255				      ir->type->vector_elements,
256				      ir->type->matrix_columns);
257
258   op0 = new(ir) ir_expression(ir_binop_mul, vec_type, op0, op1);
259
260   if (ir->operands[1]->type->base_type == GLSL_TYPE_INT) {
261      ir->operation = ir_unop_f2i;
262      ir->operands[0] = op0;
263   } else {
264      ir->operation = ir_unop_i2u;
265      ir->operands[0] = new(ir) ir_expression(ir_unop_f2i, op0);
266   }
267   ir->init_num_operands();
268   ir->operands[1] = NULL;
269
270   this->progress = true;
271}
272
273void
274lower_instructions_visitor::exp_to_exp2(ir_expression *ir)
275{
276   ir_constant *log2_e = new(ir) ir_constant(float(M_LOG2E));
277
278   ir->operation = ir_unop_exp2;
279   ir->init_num_operands();
280   ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[0]->type,
281					   ir->operands[0], log2_e);
282   this->progress = true;
283}
284
285void
286lower_instructions_visitor::pow_to_exp2(ir_expression *ir)
287{
288   ir_expression *const log2_x =
289      new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
290			    ir->operands[0]);
291
292   ir->operation = ir_unop_exp2;
293   ir->init_num_operands();
294   ir->operands[0] = new(ir) ir_expression(ir_binop_mul, ir->operands[1]->type,
295					   ir->operands[1], log2_x);
296   ir->operands[1] = NULL;
297   this->progress = true;
298}
299
300void
301lower_instructions_visitor::log_to_log2(ir_expression *ir)
302{
303   ir->operation = ir_binop_mul;
304   ir->init_num_operands();
305   ir->operands[0] = new(ir) ir_expression(ir_unop_log2, ir->operands[0]->type,
306					   ir->operands[0], NULL);
307   ir->operands[1] = new(ir) ir_constant(float(1.0 / M_LOG2E));
308   this->progress = true;
309}
310
311void
312lower_instructions_visitor::mod_to_floor(ir_expression *ir)
313{
314   ir_variable *x = new(ir) ir_variable(ir->operands[0]->type, "mod_x",
315                                         ir_var_temporary);
316   ir_variable *y = new(ir) ir_variable(ir->operands[1]->type, "mod_y",
317                                         ir_var_temporary);
318   this->base_ir->insert_before(x);
319   this->base_ir->insert_before(y);
320
321   ir_assignment *const assign_x =
322      new(ir) ir_assignment(new(ir) ir_dereference_variable(x),
323                            ir->operands[0]);
324   ir_assignment *const assign_y =
325      new(ir) ir_assignment(new(ir) ir_dereference_variable(y),
326                            ir->operands[1]);
327
328   this->base_ir->insert_before(assign_x);
329   this->base_ir->insert_before(assign_y);
330
331   ir_expression *const div_expr =
332      new(ir) ir_expression(ir_binop_div, x->type,
333                            new(ir) ir_dereference_variable(x),
334                            new(ir) ir_dereference_variable(y));
335
336   /* Don't generate new IR that would need to be lowered in an additional
337    * pass.
338    */
339   if ((lowering(FDIV_TO_MUL_RCP) && ir->type->is_float()) ||
340       (lowering(DDIV_TO_MUL_RCP) && ir->type->is_double()))
341      div_to_mul_rcp(div_expr);
342
343   ir_expression *const floor_expr =
344      new(ir) ir_expression(ir_unop_floor, x->type, div_expr);
345
346   if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
347      dfloor_to_dfrac(floor_expr);
348
349   ir_expression *const mul_expr =
350      new(ir) ir_expression(ir_binop_mul,
351                            new(ir) ir_dereference_variable(y),
352                            floor_expr);
353
354   ir->operation = ir_binop_sub;
355   ir->init_num_operands();
356   ir->operands[0] = new(ir) ir_dereference_variable(x);
357   ir->operands[1] = mul_expr;
358   this->progress = true;
359}
360
361void
362lower_instructions_visitor::ldexp_to_arith(ir_expression *ir)
363{
364   /* Translates
365    *    ir_binop_ldexp x exp
366    * into
367    *
368    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
369    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
370    *
371    *    if (extracted_biased_exp >= 255)
372    *       return x; // +/-inf, NaN
373    *
374    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
375    *
376    *    if (min(resulting_biased_exp, extracted_biased_exp) < 1)
377    *       resulting_biased_exp = 0;
378    *    if (resulting_biased_exp >= 255 ||
379    *        min(resulting_biased_exp, extracted_biased_exp) < 1) {
380    *       sign_mantissa &= sign_mask;
381    *    }
382    *
383    *    return bitcast_u2f(sign_mantissa |
384    *                       lshift(i2u(resulting_biased_exp), exp_shift));
385    *
386    * which we can't actually implement as such, since the GLSL IR doesn't
387    * have vectorized if-statements. We actually implement it without branches
388    * using conditional-select:
389    *
390    *    extracted_biased_exp = rshift(bitcast_f2i(abs(x)), exp_shift);
391    *    resulting_biased_exp = min(extracted_biased_exp + exp, 255);
392    *
393    *    sign_mantissa = bitcast_f2u(x) & sign_mantissa_mask;
394    *
395    *    flush_to_zero = lequal(min(resulting_biased_exp, extracted_biased_exp), 0);
396    *    resulting_biased_exp = csel(flush_to_zero, 0, resulting_biased_exp)
397    *    zero_mantissa = logic_or(flush_to_zero,
398    *                             gequal(resulting_biased_exp, 255));
399    *    sign_mantissa = csel(zero_mantissa, sign_mantissa & sign_mask, sign_mantissa);
400    *
401    *    result = sign_mantissa |
402    *             lshift(i2u(resulting_biased_exp), exp_shift));
403    *
404    *    return csel(extracted_biased_exp >= 255, x, bitcast_u2f(result));
405    *
406    * The definition of ldexp in the GLSL spec says:
407    *
408    *    "If this product is too large to be represented in the
409    *     floating-point type, the result is undefined."
410    *
411    * However, the definition of ldexp in the GLSL ES spec does not contain
412    * this sentence, so we do need to handle overflow correctly.
413    *
414    * There is additional language limiting the defined range of exp, but this
415    * is merely to allow implementations that store 2^exp in a temporary
416    * variable.
417    */
418
419   const unsigned vec_elem = ir->type->vector_elements;
420
421   /* Types */
422   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
423   const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
424   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
425
426   /* Temporary variables */
427   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
428   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
429   ir_variable *result = new(ir) ir_variable(uvec, "result", ir_var_temporary);
430
431   ir_variable *extracted_biased_exp =
432      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
433   ir_variable *resulting_biased_exp =
434      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
435
436   ir_variable *sign_mantissa =
437      new(ir) ir_variable(uvec, "sign_mantissa", ir_var_temporary);
438
439   ir_variable *flush_to_zero =
440      new(ir) ir_variable(bvec, "flush_to_zero", ir_var_temporary);
441   ir_variable *zero_mantissa =
442      new(ir) ir_variable(bvec, "zero_mantissa", ir_var_temporary);
443
444   ir_instruction &i = *base_ir;
445
446   /* Copy <x> and <exp> arguments. */
447   i.insert_before(x);
448   i.insert_before(assign(x, ir->operands[0]));
449   i.insert_before(exp);
450   i.insert_before(assign(exp, ir->operands[1]));
451
452   /* Extract the biased exponent from <x>. */
453   i.insert_before(extracted_biased_exp);
454   i.insert_before(assign(extracted_biased_exp,
455                          rshift(bitcast_f2i(abs(x)),
456                                 new(ir) ir_constant(23, vec_elem))));
457
458   /* The definition of ldexp in the GLSL 4.60 spec says:
459    *
460    *    "If exp is greater than +128 (single-precision) or +1024
461    *     (double-precision), the value returned is undefined. If exp is less
462    *     than -126 (single-precision) or -1022 (double-precision), the value
463    *     returned may be flushed to zero."
464    *
465    * So we do not have to guard against the possibility of addition overflow,
466    * which could happen when exp is close to INT_MAX. Addition underflow
467    * cannot happen (the worst case is 0 + (-INT_MAX)).
468    */
469   i.insert_before(resulting_biased_exp);
470   i.insert_before(assign(resulting_biased_exp,
471                          min2(add(extracted_biased_exp, exp),
472                               new(ir) ir_constant(255, vec_elem))));
473
474   i.insert_before(sign_mantissa);
475   i.insert_before(assign(sign_mantissa,
476                          bit_and(bitcast_f2u(x),
477                                  new(ir) ir_constant(0x807fffffu, vec_elem))));
478
479   /* We flush to zero if the original or resulting biased exponent is 0,
480    * indicating a +/-0.0 or subnormal input or output.
481    *
482    * The mantissa is set to 0 if the resulting biased exponent is 255, since
483    * an overflow should produce a +/-inf result.
484    *
485    * Note that NaN inputs are handled separately.
486    */
487   i.insert_before(flush_to_zero);
488   i.insert_before(assign(flush_to_zero,
489                          lequal(min2(resulting_biased_exp,
490                                      extracted_biased_exp),
491                                 ir_constant::zero(ir, ivec))));
492   i.insert_before(assign(resulting_biased_exp,
493                          csel(flush_to_zero,
494                               ir_constant::zero(ir, ivec),
495                               resulting_biased_exp)));
496
497   i.insert_before(zero_mantissa);
498   i.insert_before(assign(zero_mantissa,
499                          logic_or(flush_to_zero,
500                                   equal(resulting_biased_exp,
501                                         new(ir) ir_constant(255, vec_elem)))));
502   i.insert_before(assign(sign_mantissa,
503                          csel(zero_mantissa,
504                               bit_and(sign_mantissa,
505                                       new(ir) ir_constant(0x80000000u, vec_elem)),
506                               sign_mantissa)));
507
508   /* Don't generate new IR that would need to be lowered in an additional
509    * pass.
510    */
511   i.insert_before(result);
512   if (!lowering(INSERT_TO_SHIFTS)) {
513      i.insert_before(assign(result,
514                             bitfield_insert(sign_mantissa,
515                                             i2u(resulting_biased_exp),
516                                             new(ir) ir_constant(23u, vec_elem),
517                                             new(ir) ir_constant(8u, vec_elem))));
518   } else {
519      i.insert_before(assign(result,
520                             bit_or(sign_mantissa,
521                                    lshift(i2u(resulting_biased_exp),
522                                           new(ir) ir_constant(23, vec_elem)))));
523   }
524
525   ir->operation = ir_triop_csel;
526   ir->init_num_operands();
527   ir->operands[0] = gequal(extracted_biased_exp,
528                            new(ir) ir_constant(255, vec_elem));
529   ir->operands[1] = new(ir) ir_dereference_variable(x);
530   ir->operands[2] = bitcast_u2f(result);
531
532   this->progress = true;
533}
534
535void
536lower_instructions_visitor::dldexp_to_arith(ir_expression *ir)
537{
538   /* See ldexp_to_arith for structure. Uses frexp_exp to extract the exponent
539    * from the significand.
540    */
541
542   const unsigned vec_elem = ir->type->vector_elements;
543
544   /* Types */
545   const glsl_type *ivec = glsl_type::get_instance(GLSL_TYPE_INT, vec_elem, 1);
546   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
547
548   /* Constants */
549   ir_constant *zeroi = ir_constant::zero(ir, ivec);
550
551   ir_constant *sign_mask = new(ir) ir_constant(0x80000000u);
552
553   ir_constant *exp_shift = new(ir) ir_constant(20u);
554   ir_constant *exp_width = new(ir) ir_constant(11u);
555   ir_constant *exp_bias = new(ir) ir_constant(1022, vec_elem);
556
557   /* Temporary variables */
558   ir_variable *x = new(ir) ir_variable(ir->type, "x", ir_var_temporary);
559   ir_variable *exp = new(ir) ir_variable(ivec, "exp", ir_var_temporary);
560
561   ir_variable *zero_sign_x = new(ir) ir_variable(ir->type, "zero_sign_x",
562                                                  ir_var_temporary);
563
564   ir_variable *extracted_biased_exp =
565      new(ir) ir_variable(ivec, "extracted_biased_exp", ir_var_temporary);
566   ir_variable *resulting_biased_exp =
567      new(ir) ir_variable(ivec, "resulting_biased_exp", ir_var_temporary);
568
569   ir_variable *is_not_zero_or_underflow =
570      new(ir) ir_variable(bvec, "is_not_zero_or_underflow", ir_var_temporary);
571
572   ir_instruction &i = *base_ir;
573
574   /* Copy <x> and <exp> arguments. */
575   i.insert_before(x);
576   i.insert_before(assign(x, ir->operands[0]));
577   i.insert_before(exp);
578   i.insert_before(assign(exp, ir->operands[1]));
579
580   ir_expression *frexp_exp = expr(ir_unop_frexp_exp, x);
581   if (lowering(DFREXP_DLDEXP_TO_ARITH))
582      dfrexp_exp_to_arith(frexp_exp);
583
584   /* Extract the biased exponent from <x>. */
585   i.insert_before(extracted_biased_exp);
586   i.insert_before(assign(extracted_biased_exp, add(frexp_exp, exp_bias)));
587
588   i.insert_before(resulting_biased_exp);
589   i.insert_before(assign(resulting_biased_exp,
590                          add(extracted_biased_exp, exp)));
591
592   /* Test if result is ±0.0, subnormal, or underflow by checking if the
593    * resulting biased exponent would be less than 0x1. If so, the result is
594    * 0.0 with the sign of x. (Actually, invert the conditions so that
595    * immediate values are the second arguments, which is better for i965)
596    * TODO: Implement in a vector fashion.
597    */
598   i.insert_before(zero_sign_x);
599   for (unsigned elem = 0; elem < vec_elem; elem++) {
600      ir_variable *unpacked =
601         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
602      i.insert_before(unpacked);
603      i.insert_before(
604            assign(unpacked,
605                   expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
606      i.insert_before(assign(unpacked, bit_and(swizzle_y(unpacked), sign_mask->clone(ir, NULL)),
607                             WRITEMASK_Y));
608      i.insert_before(assign(unpacked, ir_constant::zero(ir, glsl_type::uint_type), WRITEMASK_X));
609      i.insert_before(assign(zero_sign_x,
610                             expr(ir_unop_pack_double_2x32, unpacked),
611                             1 << elem));
612   }
613   i.insert_before(is_not_zero_or_underflow);
614   i.insert_before(assign(is_not_zero_or_underflow,
615                          gequal(resulting_biased_exp,
616                                  new(ir) ir_constant(0x1, vec_elem))));
617   i.insert_before(assign(x, csel(is_not_zero_or_underflow,
618                                  x, zero_sign_x)));
619   i.insert_before(assign(resulting_biased_exp,
620                          csel(is_not_zero_or_underflow,
621                               resulting_biased_exp, zeroi)));
622
623   /* We could test for overflows by checking if the resulting biased exponent
624    * would be greater than 0xFE. Turns out we don't need to because the GLSL
625    * spec says:
626    *
627    *    "If this product is too large to be represented in the
628    *     floating-point type, the result is undefined."
629    */
630
631   ir_rvalue *results[4] = {NULL};
632   for (unsigned elem = 0; elem < vec_elem; elem++) {
633      ir_variable *unpacked =
634         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
635      i.insert_before(unpacked);
636      i.insert_before(
637            assign(unpacked,
638                   expr(ir_unop_unpack_double_2x32, swizzle(x, elem, 1))));
639
640      ir_expression *bfi = bitfield_insert(
641            swizzle_y(unpacked),
642            i2u(swizzle(resulting_biased_exp, elem, 1)),
643            exp_shift->clone(ir, NULL),
644            exp_width->clone(ir, NULL));
645
646      i.insert_before(assign(unpacked, bfi, WRITEMASK_Y));
647
648      results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
649   }
650
651   ir->operation = ir_quadop_vector;
652   ir->init_num_operands();
653   ir->operands[0] = results[0];
654   ir->operands[1] = results[1];
655   ir->operands[2] = results[2];
656   ir->operands[3] = results[3];
657
658   /* Don't generate new IR that would need to be lowered in an additional
659    * pass.
660    */
661
662   this->progress = true;
663}
664
665void
666lower_instructions_visitor::dfrexp_sig_to_arith(ir_expression *ir)
667{
668   const unsigned vec_elem = ir->type->vector_elements;
669   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
670
671   /* Double-precision floating-point values are stored as
672    *   1 sign bit;
673    *   11 exponent bits;
674    *   52 mantissa bits.
675    *
676    * We're just extracting the significand here, so we only need to modify
677    * the upper 32-bit uint. Unfortunately we must extract each double
678    * independently as there is no vector version of unpackDouble.
679    */
680
681   ir_instruction &i = *base_ir;
682
683   ir_variable *is_not_zero =
684      new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
685   ir_rvalue *results[4] = {NULL};
686
687   ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
688   i.insert_before(is_not_zero);
689   i.insert_before(
690         assign(is_not_zero,
691                nequal(abs(ir->operands[0]->clone(ir, NULL)), dzero)));
692
693   /* TODO: Remake this as more vector-friendly when int64 support is
694    * available.
695    */
696   for (unsigned elem = 0; elem < vec_elem; elem++) {
697      ir_constant *zero = new(ir) ir_constant(0u, 1);
698      ir_constant *sign_mantissa_mask = new(ir) ir_constant(0x800fffffu, 1);
699
700      /* Exponent of double floating-point values in the range [0.5, 1.0). */
701      ir_constant *exponent_value = new(ir) ir_constant(0x3fe00000u, 1);
702
703      ir_variable *bits =
704         new(ir) ir_variable(glsl_type::uint_type, "bits", ir_var_temporary);
705      ir_variable *unpacked =
706         new(ir) ir_variable(glsl_type::uvec2_type, "unpacked", ir_var_temporary);
707
708      ir_rvalue *x = swizzle(ir->operands[0]->clone(ir, NULL), elem, 1);
709
710      i.insert_before(bits);
711      i.insert_before(unpacked);
712      i.insert_before(assign(unpacked, expr(ir_unop_unpack_double_2x32, x)));
713
714      /* Manipulate the high uint to remove the exponent and replace it with
715       * either the default exponent or zero.
716       */
717      i.insert_before(assign(bits, swizzle_y(unpacked)));
718      i.insert_before(assign(bits, bit_and(bits, sign_mantissa_mask)));
719      i.insert_before(assign(bits, bit_or(bits,
720                                          csel(swizzle(is_not_zero, elem, 1),
721                                               exponent_value,
722                                               zero))));
723      i.insert_before(assign(unpacked, bits, WRITEMASK_Y));
724      results[elem] = expr(ir_unop_pack_double_2x32, unpacked);
725   }
726
727   /* Put the dvec back together */
728   ir->operation = ir_quadop_vector;
729   ir->init_num_operands();
730   ir->operands[0] = results[0];
731   ir->operands[1] = results[1];
732   ir->operands[2] = results[2];
733   ir->operands[3] = results[3];
734
735   this->progress = true;
736}
737
738void
739lower_instructions_visitor::dfrexp_exp_to_arith(ir_expression *ir)
740{
741   const unsigned vec_elem = ir->type->vector_elements;
742   const glsl_type *bvec = glsl_type::get_instance(GLSL_TYPE_BOOL, vec_elem, 1);
743   const glsl_type *uvec = glsl_type::get_instance(GLSL_TYPE_UINT, vec_elem, 1);
744
745   /* Double-precision floating-point values are stored as
746    *   1 sign bit;
747    *   11 exponent bits;
748    *   52 mantissa bits.
749    *
750    * We're just extracting the exponent here, so we only care about the upper
751    * 32-bit uint.
752    */
753
754   ir_instruction &i = *base_ir;
755
756   ir_variable *is_not_zero =
757      new(ir) ir_variable(bvec, "is_not_zero", ir_var_temporary);
758   ir_variable *high_words =
759      new(ir) ir_variable(uvec, "high_words", ir_var_temporary);
760   ir_constant *dzero = new(ir) ir_constant(0.0, vec_elem);
761   ir_constant *izero = new(ir) ir_constant(0, vec_elem);
762
763   ir_rvalue *absval = abs(ir->operands[0]);
764
765   i.insert_before(is_not_zero);
766   i.insert_before(high_words);
767   i.insert_before(assign(is_not_zero, nequal(absval->clone(ir, NULL), dzero)));
768
769   /* Extract all of the upper uints. */
770   for (unsigned elem = 0; elem < vec_elem; elem++) {
771      ir_rvalue *x = swizzle(absval->clone(ir, NULL), elem, 1);
772
773      i.insert_before(assign(high_words,
774                             swizzle_y(expr(ir_unop_unpack_double_2x32, x)),
775                             1 << elem));
776
777   }
778   ir_constant *exponent_shift = new(ir) ir_constant(20, vec_elem);
779   ir_constant *exponent_bias = new(ir) ir_constant(-1022, vec_elem);
780
781   /* For non-zero inputs, shift the exponent down and apply bias. */
782   ir->operation = ir_triop_csel;
783   ir->init_num_operands();
784   ir->operands[0] = new(ir) ir_dereference_variable(is_not_zero);
785   ir->operands[1] = add(exponent_bias, u2i(rshift(high_words, exponent_shift)));
786   ir->operands[2] = izero;
787
788   this->progress = true;
789}
790
791void
792lower_instructions_visitor::carry_to_arith(ir_expression *ir)
793{
794   /* Translates
795    *   ir_binop_carry x y
796    * into
797    *   sum = ir_binop_add x y
798    *   bcarry = ir_binop_less sum x
799    *   carry = ir_unop_b2i bcarry
800    */
801
802   ir_rvalue *x_clone = ir->operands[0]->clone(ir, NULL);
803   ir->operation = ir_unop_i2u;
804   ir->init_num_operands();
805   ir->operands[0] = b2i(less(add(ir->operands[0], ir->operands[1]), x_clone));
806   ir->operands[1] = NULL;
807
808   this->progress = true;
809}
810
811void
812lower_instructions_visitor::borrow_to_arith(ir_expression *ir)
813{
814   /* Translates
815    *   ir_binop_borrow x y
816    * into
817    *   bcarry = ir_binop_less x y
818    *   carry = ir_unop_b2i bcarry
819    */
820
821   ir->operation = ir_unop_i2u;
822   ir->init_num_operands();
823   ir->operands[0] = b2i(less(ir->operands[0], ir->operands[1]));
824   ir->operands[1] = NULL;
825
826   this->progress = true;
827}
828
829void
830lower_instructions_visitor::sat_to_clamp(ir_expression *ir)
831{
832   /* Translates
833    *   ir_unop_saturate x
834    * into
835    *   ir_binop_min (ir_binop_max(x, 0.0), 1.0)
836    */
837
838   ir->operation = ir_binop_min;
839   ir->init_num_operands();
840   ir->operands[0] = new(ir) ir_expression(ir_binop_max, ir->operands[0]->type,
841                                           ir->operands[0],
842                                           new(ir) ir_constant(0.0f));
843   ir->operands[1] = new(ir) ir_constant(1.0f);
844
845   this->progress = true;
846}
847
848void
849lower_instructions_visitor::double_dot_to_fma(ir_expression *ir)
850{
851   ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type->get_base_type(), "dot_res",
852					   ir_var_temporary);
853   this->base_ir->insert_before(temp);
854
855   int nc = ir->operands[0]->type->components();
856   for (int i = nc - 1; i >= 1; i--) {
857      ir_assignment *assig;
858      if (i == (nc - 1)) {
859         assig = assign(temp, mul(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
860                                  swizzle(ir->operands[1]->clone(ir, NULL), i, 1)));
861      } else {
862         assig = assign(temp, fma(swizzle(ir->operands[0]->clone(ir, NULL), i, 1),
863                                  swizzle(ir->operands[1]->clone(ir, NULL), i, 1),
864                                  temp));
865      }
866      this->base_ir->insert_before(assig);
867   }
868
869   ir->operation = ir_triop_fma;
870   ir->init_num_operands();
871   ir->operands[0] = swizzle(ir->operands[0], 0, 1);
872   ir->operands[1] = swizzle(ir->operands[1], 0, 1);
873   ir->operands[2] = new(ir) ir_dereference_variable(temp);
874
875   this->progress = true;
876
877}
878
879void
880lower_instructions_visitor::double_lrp(ir_expression *ir)
881{
882   int swizval;
883   ir_rvalue *op0 = ir->operands[0], *op2 = ir->operands[2];
884   ir_constant *one = new(ir) ir_constant(1.0, op2->type->vector_elements);
885
886   switch (op2->type->vector_elements) {
887   case 1:
888      swizval = SWIZZLE_XXXX;
889      break;
890   default:
891      assert(op0->type->vector_elements == op2->type->vector_elements);
892      swizval = SWIZZLE_XYZW;
893      break;
894   }
895
896   ir->operation = ir_triop_fma;
897   ir->init_num_operands();
898   ir->operands[0] = swizzle(op2, swizval, op0->type->vector_elements);
899   ir->operands[2] = mul(sub(one, op2->clone(ir, NULL)), op0);
900
901   this->progress = true;
902}
903
904void
905lower_instructions_visitor::dceil_to_dfrac(ir_expression *ir)
906{
907   /*
908    * frtemp = frac(x);
909    * temp = sub(x, frtemp);
910    * result = temp + ((frtemp != 0.0) ? 1.0 : 0.0);
911    */
912   ir_instruction &i = *base_ir;
913   ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
914   ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
915   ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
916                                             ir_var_temporary);
917
918   i.insert_before(frtemp);
919   i.insert_before(assign(frtemp, fract(ir->operands[0])));
920
921   ir->operation = ir_binop_add;
922   ir->init_num_operands();
923   ir->operands[0] = sub(ir->operands[0]->clone(ir, NULL), frtemp);
924   ir->operands[1] = csel(nequal(frtemp, zero), one, zero->clone(ir, NULL));
925
926   this->progress = true;
927}
928
929void
930lower_instructions_visitor::dfloor_to_dfrac(ir_expression *ir)
931{
932   /*
933    * frtemp = frac(x);
934    * result = sub(x, frtemp);
935    */
936   ir->operation = ir_binop_sub;
937   ir->init_num_operands();
938   ir->operands[1] = fract(ir->operands[0]->clone(ir, NULL));
939
940   this->progress = true;
941}
942void
943lower_instructions_visitor::dround_even_to_dfrac(ir_expression *ir)
944{
945   /*
946    * insane but works
947    * temp = x + 0.5;
948    * frtemp = frac(temp);
949    * t2 = sub(temp, frtemp);
950    * if (frac(x) == 0.5)
951    *     result = frac(t2 * 0.5) == 0 ? t2 : t2 - 1;
952    *  else
953    *     result = t2;
954
955    */
956   ir_instruction &i = *base_ir;
957   ir_variable *frtemp = new(ir) ir_variable(ir->operands[0]->type, "frtemp",
958                                             ir_var_temporary);
959   ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
960                                           ir_var_temporary);
961   ir_variable *t2 = new(ir) ir_variable(ir->operands[0]->type, "t2",
962                                           ir_var_temporary);
963   ir_constant *p5 = new(ir) ir_constant(0.5, ir->operands[0]->type->vector_elements);
964   ir_constant *one = new(ir) ir_constant(1.0, ir->operands[0]->type->vector_elements);
965   ir_constant *zero = new(ir) ir_constant(0.0, ir->operands[0]->type->vector_elements);
966
967   i.insert_before(temp);
968   i.insert_before(assign(temp, add(ir->operands[0], p5)));
969
970   i.insert_before(frtemp);
971   i.insert_before(assign(frtemp, fract(temp)));
972
973   i.insert_before(t2);
974   i.insert_before(assign(t2, sub(temp, frtemp)));
975
976   ir->operation = ir_triop_csel;
977   ir->init_num_operands();
978   ir->operands[0] = equal(fract(ir->operands[0]->clone(ir, NULL)),
979                           p5->clone(ir, NULL));
980   ir->operands[1] = csel(equal(fract(mul(t2, p5->clone(ir, NULL))),
981                                zero),
982                          t2,
983                          sub(t2, one));
984   ir->operands[2] = new(ir) ir_dereference_variable(t2);
985
986   this->progress = true;
987}
988
989void
990lower_instructions_visitor::dtrunc_to_dfrac(ir_expression *ir)
991{
992   /*
993    * frtemp = frac(x);
994    * temp = sub(x, frtemp);
995    * result = x >= 0 ? temp : temp + (frtemp == 0.0) ? 0 : 1;
996    */
997   ir_rvalue *arg = ir->operands[0];
998   ir_instruction &i = *base_ir;
999
1000   ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
1001   ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
1002   ir_variable *frtemp = new(ir) ir_variable(arg->type, "frtemp",
1003                                             ir_var_temporary);
1004   ir_variable *temp = new(ir) ir_variable(ir->operands[0]->type, "temp",
1005                                           ir_var_temporary);
1006
1007   i.insert_before(frtemp);
1008   i.insert_before(assign(frtemp, fract(arg)));
1009   i.insert_before(temp);
1010   i.insert_before(assign(temp, sub(arg->clone(ir, NULL), frtemp)));
1011
1012   ir->operation = ir_triop_csel;
1013   ir->init_num_operands();
1014   ir->operands[0] = gequal(arg->clone(ir, NULL), zero);
1015   ir->operands[1] = new (ir) ir_dereference_variable(temp);
1016   ir->operands[2] = add(temp,
1017                         csel(equal(frtemp, zero->clone(ir, NULL)),
1018                              zero->clone(ir, NULL),
1019                              one));
1020
1021   this->progress = true;
1022}
1023
1024void
1025lower_instructions_visitor::dsign_to_csel(ir_expression *ir)
1026{
1027   /*
1028    * temp = x > 0.0 ? 1.0 : 0.0;
1029    * result = x < 0.0 ? -1.0 : temp;
1030    */
1031   ir_rvalue *arg = ir->operands[0];
1032   ir_constant *zero = new(ir) ir_constant(0.0, arg->type->vector_elements);
1033   ir_constant *one = new(ir) ir_constant(1.0, arg->type->vector_elements);
1034   ir_constant *neg_one = new(ir) ir_constant(-1.0, arg->type->vector_elements);
1035
1036   ir->operation = ir_triop_csel;
1037   ir->init_num_operands();
1038   ir->operands[0] = less(arg->clone(ir, NULL),
1039                          zero->clone(ir, NULL));
1040   ir->operands[1] = neg_one;
1041   ir->operands[2] = csel(greater(arg, zero),
1042                          one,
1043                          zero->clone(ir, NULL));
1044
1045   this->progress = true;
1046}
1047
1048void
1049lower_instructions_visitor::bit_count_to_math(ir_expression *ir)
1050{
1051   /* For more details, see:
1052    *
1053    * http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetPaallel
1054    */
1055   const unsigned elements = ir->operands[0]->type->vector_elements;
1056   ir_variable *temp = new(ir) ir_variable(glsl_type::uvec(elements), "temp",
1057                                           ir_var_temporary);
1058   ir_constant *c55555555 = new(ir) ir_constant(0x55555555u);
1059   ir_constant *c33333333 = new(ir) ir_constant(0x33333333u);
1060   ir_constant *c0F0F0F0F = new(ir) ir_constant(0x0F0F0F0Fu);
1061   ir_constant *c01010101 = new(ir) ir_constant(0x01010101u);
1062   ir_constant *c1 = new(ir) ir_constant(1u);
1063   ir_constant *c2 = new(ir) ir_constant(2u);
1064   ir_constant *c4 = new(ir) ir_constant(4u);
1065   ir_constant *c24 = new(ir) ir_constant(24u);
1066
1067   base_ir->insert_before(temp);
1068
1069   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1070      base_ir->insert_before(assign(temp, ir->operands[0]));
1071   } else {
1072      assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1073      base_ir->insert_before(assign(temp, i2u(ir->operands[0])));
1074   }
1075
1076   /* temp = temp - ((temp >> 1) & 0x55555555u); */
1077   base_ir->insert_before(assign(temp, sub(temp, bit_and(rshift(temp, c1),
1078                                                         c55555555))));
1079
1080   /* temp = (temp & 0x33333333u) + ((temp >> 2) & 0x33333333u); */
1081   base_ir->insert_before(assign(temp, add(bit_and(temp, c33333333),
1082                                           bit_and(rshift(temp, c2),
1083                                                   c33333333->clone(ir, NULL)))));
1084
1085   /* int(((temp + (temp >> 4) & 0xF0F0F0Fu) * 0x1010101u) >> 24); */
1086   ir->operation = ir_unop_u2i;
1087   ir->init_num_operands();
1088   ir->operands[0] = rshift(mul(bit_and(add(temp, rshift(temp, c4)), c0F0F0F0F),
1089                                c01010101),
1090                            c24);
1091
1092   this->progress = true;
1093}
1094
1095void
1096lower_instructions_visitor::extract_to_shifts(ir_expression *ir)
1097{
1098   ir_variable *bits =
1099      new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1100
1101   base_ir->insert_before(bits);
1102   base_ir->insert_before(assign(bits, ir->operands[2]));
1103
1104   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1105      ir_constant *c1 =
1106         new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1107      ir_constant *c32 =
1108         new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1109      ir_constant *cFFFFFFFF =
1110         new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1111
1112      /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1113       * we'd get a mask of 0 when bits is 32.  Special case it.
1114       *
1115       * mask = bits == 32 ? 0xffffffff : (1u << bits) - 1u;
1116       */
1117      ir_expression *mask = csel(equal(bits, c32),
1118                                 cFFFFFFFF,
1119                                 sub(lshift(c1, bits), c1->clone(ir, NULL)));
1120
1121      /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1122       *
1123       *    If bits is zero, the result will be zero.
1124       *
1125       * Since (1 << 0) - 1 == 0, we don't need to bother with the conditional
1126       * select as in the signed integer case.
1127       *
1128       * (value >> offset) & mask;
1129       */
1130      ir->operation = ir_binop_bit_and;
1131      ir->init_num_operands();
1132      ir->operands[0] = rshift(ir->operands[0], ir->operands[1]);
1133      ir->operands[1] = mask;
1134      ir->operands[2] = NULL;
1135   } else {
1136      ir_constant *c0 =
1137         new(ir) ir_constant(int(0), ir->operands[0]->type->vector_elements);
1138      ir_constant *c32 =
1139         new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1140      ir_variable *temp =
1141         new(ir) ir_variable(ir->operands[0]->type, "temp", ir_var_temporary);
1142
1143      /* temp = 32 - bits; */
1144      base_ir->insert_before(temp);
1145      base_ir->insert_before(assign(temp, sub(c32, bits)));
1146
1147      /* expr = value << (temp - offset)) >> temp; */
1148      ir_expression *expr =
1149         rshift(lshift(ir->operands[0], sub(temp, ir->operands[1])), temp);
1150
1151      /* Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1152       *
1153       *    If bits is zero, the result will be zero.
1154       *
1155       * Due to the (x << (y%32)) behavior mentioned before, the (value <<
1156       * (32-0)) doesn't "erase" all of the data as we would like, so finish
1157       * up with:
1158       *
1159       * (bits == 0) ? 0 : e;
1160       */
1161      ir->operation = ir_triop_csel;
1162      ir->init_num_operands();
1163      ir->operands[0] = equal(c0, bits);
1164      ir->operands[1] = c0->clone(ir, NULL);
1165      ir->operands[2] = expr;
1166   }
1167
1168   this->progress = true;
1169}
1170
1171void
1172lower_instructions_visitor::insert_to_shifts(ir_expression *ir)
1173{
1174   ir_constant *c1;
1175   ir_constant *c32;
1176   ir_constant *cFFFFFFFF;
1177   ir_variable *offset =
1178      new(ir) ir_variable(ir->operands[0]->type, "offset", ir_var_temporary);
1179   ir_variable *bits =
1180      new(ir) ir_variable(ir->operands[0]->type, "bits", ir_var_temporary);
1181   ir_variable *mask =
1182      new(ir) ir_variable(ir->operands[0]->type, "mask", ir_var_temporary);
1183
1184   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1185      c1 = new(ir) ir_constant(int(1), ir->operands[0]->type->vector_elements);
1186      c32 = new(ir) ir_constant(int(32), ir->operands[0]->type->vector_elements);
1187      cFFFFFFFF = new(ir) ir_constant(int(0xFFFFFFFF), ir->operands[0]->type->vector_elements);
1188   } else {
1189      assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1190
1191      c1 = new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1192      c32 = new(ir) ir_constant(32u, ir->operands[0]->type->vector_elements);
1193      cFFFFFFFF = new(ir) ir_constant(0xFFFFFFFFu, ir->operands[0]->type->vector_elements);
1194   }
1195
1196   base_ir->insert_before(offset);
1197   base_ir->insert_before(assign(offset, ir->operands[2]));
1198
1199   base_ir->insert_before(bits);
1200   base_ir->insert_before(assign(bits, ir->operands[3]));
1201
1202   /* At least some hardware treats (x << y) as (x << (y%32)).  This means
1203    * we'd get a mask of 0 when bits is 32.  Special case it.
1204    *
1205    * mask = (bits == 32 ? 0xffffffff : (1u << bits) - 1u) << offset;
1206    *
1207    * Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1208    *
1209    *    The result will be undefined if offset or bits is negative, or if the
1210    *    sum of offset and bits is greater than the number of bits used to
1211    *    store the operand.
1212    *
1213    * Since it's undefined, there are a couple other ways this could be
1214    * implemented.  The other way that was considered was to put the csel
1215    * around the whole thing:
1216    *
1217    *    final_result = bits == 32 ? insert : ... ;
1218    */
1219   base_ir->insert_before(mask);
1220
1221   base_ir->insert_before(assign(mask, csel(equal(bits, c32),
1222                                            cFFFFFFFF,
1223                                            lshift(sub(lshift(c1, bits),
1224                                                       c1->clone(ir, NULL)),
1225                                                   offset))));
1226
1227   /* (base & ~mask) | ((insert << offset) & mask) */
1228   ir->operation = ir_binop_bit_or;
1229   ir->init_num_operands();
1230   ir->operands[0] = bit_and(ir->operands[0], bit_not(mask));
1231   ir->operands[1] = bit_and(lshift(ir->operands[1], offset), mask);
1232   ir->operands[2] = NULL;
1233   ir->operands[3] = NULL;
1234
1235   this->progress = true;
1236}
1237
1238void
1239lower_instructions_visitor::reverse_to_shifts(ir_expression *ir)
1240{
1241   /* For more details, see:
1242    *
1243    * http://graphics.stanford.edu/~seander/bithacks.html#ReverseParallel
1244    */
1245   ir_constant *c1 =
1246      new(ir) ir_constant(1u, ir->operands[0]->type->vector_elements);
1247   ir_constant *c2 =
1248      new(ir) ir_constant(2u, ir->operands[0]->type->vector_elements);
1249   ir_constant *c4 =
1250      new(ir) ir_constant(4u, ir->operands[0]->type->vector_elements);
1251   ir_constant *c8 =
1252      new(ir) ir_constant(8u, ir->operands[0]->type->vector_elements);
1253   ir_constant *c16 =
1254      new(ir) ir_constant(16u, ir->operands[0]->type->vector_elements);
1255   ir_constant *c33333333 =
1256      new(ir) ir_constant(0x33333333u, ir->operands[0]->type->vector_elements);
1257   ir_constant *c55555555 =
1258      new(ir) ir_constant(0x55555555u, ir->operands[0]->type->vector_elements);
1259   ir_constant *c0F0F0F0F =
1260      new(ir) ir_constant(0x0F0F0F0Fu, ir->operands[0]->type->vector_elements);
1261   ir_constant *c00FF00FF =
1262      new(ir) ir_constant(0x00FF00FFu, ir->operands[0]->type->vector_elements);
1263   ir_variable *temp =
1264      new(ir) ir_variable(glsl_type::uvec(ir->operands[0]->type->vector_elements),
1265                          "temp", ir_var_temporary);
1266   ir_instruction &i = *base_ir;
1267
1268   i.insert_before(temp);
1269
1270   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1271      i.insert_before(assign(temp, ir->operands[0]));
1272   } else {
1273      assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1274      i.insert_before(assign(temp, i2u(ir->operands[0])));
1275   }
1276
1277   /* Swap odd and even bits.
1278    *
1279    * temp = ((temp >> 1) & 0x55555555u) | ((temp & 0x55555555u) << 1);
1280    */
1281   i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c1), c55555555),
1282                                       lshift(bit_and(temp, c55555555->clone(ir, NULL)),
1283                                              c1->clone(ir, NULL)))));
1284   /* Swap consecutive pairs.
1285    *
1286    * temp = ((temp >> 2) & 0x33333333u) | ((temp & 0x33333333u) << 2);
1287    */
1288   i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c2), c33333333),
1289                                       lshift(bit_and(temp, c33333333->clone(ir, NULL)),
1290                                              c2->clone(ir, NULL)))));
1291
1292   /* Swap nibbles.
1293    *
1294    * temp = ((temp >> 4) & 0x0F0F0F0Fu) | ((temp & 0x0F0F0F0Fu) << 4);
1295    */
1296   i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c4), c0F0F0F0F),
1297                                       lshift(bit_and(temp, c0F0F0F0F->clone(ir, NULL)),
1298                                              c4->clone(ir, NULL)))));
1299
1300   /* The last step is, basically, bswap.  Swap the bytes, then swap the
1301    * words.  When this code is run through GCC on x86, it does generate a
1302    * bswap instruction.
1303    *
1304    * temp = ((temp >> 8) & 0x00FF00FFu) | ((temp & 0x00FF00FFu) << 8);
1305    * temp = ( temp >> 16              ) | ( temp                << 16);
1306    */
1307   i.insert_before(assign(temp, bit_or(bit_and(rshift(temp, c8), c00FF00FF),
1308                                       lshift(bit_and(temp, c00FF00FF->clone(ir, NULL)),
1309                                              c8->clone(ir, NULL)))));
1310
1311   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1312      ir->operation = ir_binop_bit_or;
1313      ir->init_num_operands();
1314      ir->operands[0] = rshift(temp, c16);
1315      ir->operands[1] = lshift(temp, c16->clone(ir, NULL));
1316   } else {
1317      ir->operation = ir_unop_u2i;
1318      ir->init_num_operands();
1319      ir->operands[0] = bit_or(rshift(temp, c16),
1320                               lshift(temp, c16->clone(ir, NULL)));
1321   }
1322
1323   this->progress = true;
1324}
1325
1326void
1327lower_instructions_visitor::find_lsb_to_float_cast(ir_expression *ir)
1328{
1329   /* For more details, see:
1330    *
1331    * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1332    */
1333   const unsigned elements = ir->operands[0]->type->vector_elements;
1334   ir_constant *c0 = new(ir) ir_constant(unsigned(0), elements);
1335   ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1336   ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1337   ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1338   ir_variable *temp =
1339      new(ir) ir_variable(glsl_type::ivec(elements), "temp", ir_var_temporary);
1340   ir_variable *lsb_only =
1341      new(ir) ir_variable(glsl_type::uvec(elements), "lsb_only", ir_var_temporary);
1342   ir_variable *as_float =
1343      new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1344   ir_variable *lsb =
1345      new(ir) ir_variable(glsl_type::ivec(elements), "lsb", ir_var_temporary);
1346
1347   ir_instruction &i = *base_ir;
1348
1349   i.insert_before(temp);
1350
1351   if (ir->operands[0]->type->base_type == GLSL_TYPE_INT) {
1352      i.insert_before(assign(temp, ir->operands[0]));
1353   } else {
1354      assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1355      i.insert_before(assign(temp, u2i(ir->operands[0])));
1356   }
1357
1358   /* The int-to-float conversion is lossless because (value & -value) is
1359    * either a power of two or zero.  We don't use the result in the zero
1360    * case.  The uint() cast is necessary so that 0x80000000 does not
1361    * generate a negative value.
1362    *
1363    * uint lsb_only = uint(value & -value);
1364    * float as_float = float(lsb_only);
1365    */
1366   i.insert_before(lsb_only);
1367   i.insert_before(assign(lsb_only, i2u(bit_and(temp, neg(temp)))));
1368
1369   i.insert_before(as_float);
1370   i.insert_before(assign(as_float, u2f(lsb_only)));
1371
1372   /* This is basically an open-coded frexp.  Implementations that have a
1373    * native frexp instruction would be better served by that.  This is
1374    * optimized versus a full-featured open-coded implementation in two ways:
1375    *
1376    * - We don't care about a correct result from subnormal numbers (including
1377    *   0.0), so the raw exponent can always be safely unbiased.
1378    *
1379    * - The value cannot be negative, so it does not need to be masked off to
1380    *   extract the exponent.
1381    *
1382    * int lsb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1383    */
1384   i.insert_before(lsb);
1385   i.insert_before(assign(lsb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1386
1387   /* Use lsb_only in the comparison instead of temp so that the & (far above)
1388    * can possibly generate the result without an explicit comparison.
1389    *
1390    * (lsb_only == 0) ? -1 : lsb;
1391    *
1392    * Since our input values are all integers, the unbiased exponent must not
1393    * be negative.  It will only be negative (-0x7f, in fact) if lsb_only is
1394    * 0.  Instead of using (lsb_only == 0), we could use (lsb >= 0).  Which is
1395    * better is likely GPU dependent.  Either way, the difference should be
1396    * small.
1397    */
1398   ir->operation = ir_triop_csel;
1399   ir->init_num_operands();
1400   ir->operands[0] = equal(lsb_only, c0);
1401   ir->operands[1] = cminus1;
1402   ir->operands[2] = new(ir) ir_dereference_variable(lsb);
1403
1404   this->progress = true;
1405}
1406
1407void
1408lower_instructions_visitor::find_msb_to_float_cast(ir_expression *ir)
1409{
1410   /* For more details, see:
1411    *
1412    * http://graphics.stanford.edu/~seander/bithacks.html#ZerosOnRightFloatCast
1413    */
1414   const unsigned elements = ir->operands[0]->type->vector_elements;
1415   ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1416   ir_constant *cminus1 = new(ir) ir_constant(int(-1), elements);
1417   ir_constant *c23 = new(ir) ir_constant(int(23), elements);
1418   ir_constant *c7F = new(ir) ir_constant(int(0x7F), elements);
1419   ir_constant *c000000FF = new(ir) ir_constant(0x000000FFu, elements);
1420   ir_constant *cFFFFFF00 = new(ir) ir_constant(0xFFFFFF00u, elements);
1421   ir_variable *temp =
1422      new(ir) ir_variable(glsl_type::uvec(elements), "temp", ir_var_temporary);
1423   ir_variable *as_float =
1424      new(ir) ir_variable(glsl_type::vec(elements), "as_float", ir_var_temporary);
1425   ir_variable *msb =
1426      new(ir) ir_variable(glsl_type::ivec(elements), "msb", ir_var_temporary);
1427
1428   ir_instruction &i = *base_ir;
1429
1430   i.insert_before(temp);
1431
1432   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1433      i.insert_before(assign(temp, ir->operands[0]));
1434   } else {
1435      assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1436
1437      /* findMSB(uint(abs(some_int))) almost always does the right thing.
1438       * There are two problem values:
1439       *
1440       * * 0x80000000.  Since abs(0x80000000) == 0x80000000, findMSB returns
1441       *   31.  However, findMSB(int(0x80000000)) == 30.
1442       *
1443       * * 0xffffffff.  Since abs(0xffffffff) == 1, findMSB returns
1444       *   31.  Section 8.8 (Integer Functions) of the GLSL 4.50 spec says:
1445       *
1446       *    For a value of zero or negative one, -1 will be returned.
1447       *
1448       * For all negative number cases, including 0x80000000 and 0xffffffff,
1449       * the correct value is obtained from findMSB if instead of negating the
1450       * (already negative) value the logical-not is used.  A conditonal
1451       * logical-not can be achieved in two instructions.
1452       */
1453      ir_variable *as_int =
1454         new(ir) ir_variable(glsl_type::ivec(elements), "as_int", ir_var_temporary);
1455      ir_constant *c31 = new(ir) ir_constant(int(31), elements);
1456
1457      i.insert_before(as_int);
1458      i.insert_before(assign(as_int, ir->operands[0]));
1459      i.insert_before(assign(temp, i2u(expr(ir_binop_bit_xor,
1460                                            as_int,
1461                                            rshift(as_int, c31)))));
1462   }
1463
1464   /* The int-to-float conversion is lossless because bits are conditionally
1465    * masked off the bottom of temp to ensure the value has at most 24 bits of
1466    * data or is zero.  We don't use the result in the zero case.  The uint()
1467    * cast is necessary so that 0x80000000 does not generate a negative value.
1468    *
1469    * float as_float = float(temp > 255 ? temp & ~255 : temp);
1470    */
1471   i.insert_before(as_float);
1472   i.insert_before(assign(as_float, u2f(csel(greater(temp, c000000FF),
1473                                             bit_and(temp, cFFFFFF00),
1474                                             temp))));
1475
1476   /* This is basically an open-coded frexp.  Implementations that have a
1477    * native frexp instruction would be better served by that.  This is
1478    * optimized versus a full-featured open-coded implementation in two ways:
1479    *
1480    * - We don't care about a correct result from subnormal numbers (including
1481    *   0.0), so the raw exponent can always be safely unbiased.
1482    *
1483    * - The value cannot be negative, so it does not need to be masked off to
1484    *   extract the exponent.
1485    *
1486    * int msb = (floatBitsToInt(as_float) >> 23) - 0x7f;
1487    */
1488   i.insert_before(msb);
1489   i.insert_before(assign(msb, sub(rshift(bitcast_f2i(as_float), c23), c7F)));
1490
1491   /* Use msb in the comparison instead of temp so that the subtract can
1492    * possibly generate the result without an explicit comparison.
1493    *
1494    * (msb < 0) ? -1 : msb;
1495    *
1496    * Since our input values are all integers, the unbiased exponent must not
1497    * be negative.  It will only be negative (-0x7f, in fact) if temp is 0.
1498    */
1499   ir->operation = ir_triop_csel;
1500   ir->init_num_operands();
1501   ir->operands[0] = less(msb, c0);
1502   ir->operands[1] = cminus1;
1503   ir->operands[2] = new(ir) ir_dereference_variable(msb);
1504
1505   this->progress = true;
1506}
1507
1508ir_expression *
1509lower_instructions_visitor::_carry(operand a, operand b)
1510{
1511   if (lowering(CARRY_TO_ARITH))
1512      return i2u(b2i(less(add(a, b),
1513                          a.val->clone(ralloc_parent(a.val), NULL))));
1514   else
1515      return carry(a, b);
1516}
1517
1518void
1519lower_instructions_visitor::imul_high_to_mul(ir_expression *ir)
1520{
1521   /*   ABCD
1522    * * EFGH
1523    * ======
1524    * (GH * CD) + (GH * AB) << 16 + (EF * CD) << 16 + (EF * AB) << 32
1525    *
1526    * In GLSL, (a * b) becomes
1527    *
1528    * uint m1 = (a & 0x0000ffffu) * (b & 0x0000ffffu);
1529    * uint m2 = (a & 0x0000ffffu) * (b >> 16);
1530    * uint m3 = (a >> 16)         * (b & 0x0000ffffu);
1531    * uint m4 = (a >> 16)         * (b >> 16);
1532    *
1533    * uint c1;
1534    * uint c2;
1535    * uint lo_result;
1536    * uint hi_result;
1537    *
1538    * lo_result = uaddCarry(m1, m2 << 16, c1);
1539    * hi_result = m4 + c1;
1540    * lo_result = uaddCarry(lo_result, m3 << 16, c2);
1541    * hi_result = hi_result + c2;
1542    * hi_result = hi_result + (m2 >> 16) + (m3 >> 16);
1543    */
1544   const unsigned elements = ir->operands[0]->type->vector_elements;
1545   ir_variable *src1 =
1546      new(ir) ir_variable(glsl_type::uvec(elements), "src1", ir_var_temporary);
1547   ir_variable *src1h =
1548      new(ir) ir_variable(glsl_type::uvec(elements), "src1h", ir_var_temporary);
1549   ir_variable *src1l =
1550      new(ir) ir_variable(glsl_type::uvec(elements), "src1l", ir_var_temporary);
1551   ir_variable *src2 =
1552      new(ir) ir_variable(glsl_type::uvec(elements), "src2", ir_var_temporary);
1553   ir_variable *src2h =
1554      new(ir) ir_variable(glsl_type::uvec(elements), "src2h", ir_var_temporary);
1555   ir_variable *src2l =
1556      new(ir) ir_variable(glsl_type::uvec(elements), "src2l", ir_var_temporary);
1557   ir_variable *t1 =
1558      new(ir) ir_variable(glsl_type::uvec(elements), "t1", ir_var_temporary);
1559   ir_variable *t2 =
1560      new(ir) ir_variable(glsl_type::uvec(elements), "t2", ir_var_temporary);
1561   ir_variable *lo =
1562      new(ir) ir_variable(glsl_type::uvec(elements), "lo", ir_var_temporary);
1563   ir_variable *hi =
1564      new(ir) ir_variable(glsl_type::uvec(elements), "hi", ir_var_temporary);
1565   ir_variable *different_signs = NULL;
1566   ir_constant *c0000FFFF = new(ir) ir_constant(0x0000FFFFu, elements);
1567   ir_constant *c16 = new(ir) ir_constant(16u, elements);
1568
1569   ir_instruction &i = *base_ir;
1570
1571   i.insert_before(src1);
1572   i.insert_before(src2);
1573   i.insert_before(src1h);
1574   i.insert_before(src2h);
1575   i.insert_before(src1l);
1576   i.insert_before(src2l);
1577
1578   if (ir->operands[0]->type->base_type == GLSL_TYPE_UINT) {
1579      i.insert_before(assign(src1, ir->operands[0]));
1580      i.insert_before(assign(src2, ir->operands[1]));
1581   } else {
1582      assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1583
1584      ir_variable *itmp1 =
1585         new(ir) ir_variable(glsl_type::ivec(elements), "itmp1", ir_var_temporary);
1586      ir_variable *itmp2 =
1587         new(ir) ir_variable(glsl_type::ivec(elements), "itmp2", ir_var_temporary);
1588      ir_constant *c0 = new(ir) ir_constant(int(0), elements);
1589
1590      i.insert_before(itmp1);
1591      i.insert_before(itmp2);
1592      i.insert_before(assign(itmp1, ir->operands[0]));
1593      i.insert_before(assign(itmp2, ir->operands[1]));
1594
1595      different_signs =
1596         new(ir) ir_variable(glsl_type::bvec(elements), "different_signs",
1597                             ir_var_temporary);
1598
1599      i.insert_before(different_signs);
1600      i.insert_before(assign(different_signs, expr(ir_binop_logic_xor,
1601                                                   less(itmp1, c0),
1602                                                   less(itmp2, c0->clone(ir, NULL)))));
1603
1604      i.insert_before(assign(src1, i2u(abs(itmp1))));
1605      i.insert_before(assign(src2, i2u(abs(itmp2))));
1606   }
1607
1608   i.insert_before(assign(src1l, bit_and(src1, c0000FFFF)));
1609   i.insert_before(assign(src2l, bit_and(src2, c0000FFFF->clone(ir, NULL))));
1610   i.insert_before(assign(src1h, rshift(src1, c16)));
1611   i.insert_before(assign(src2h, rshift(src2, c16->clone(ir, NULL))));
1612
1613   i.insert_before(lo);
1614   i.insert_before(hi);
1615   i.insert_before(t1);
1616   i.insert_before(t2);
1617
1618   i.insert_before(assign(lo, mul(src1l, src2l)));
1619   i.insert_before(assign(t1, mul(src1l, src2h)));
1620   i.insert_before(assign(t2, mul(src1h, src2l)));
1621   i.insert_before(assign(hi, mul(src1h, src2h)));
1622
1623   i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t1, c16->clone(ir, NULL))))));
1624   i.insert_before(assign(lo,            add(lo, lshift(t1, c16->clone(ir, NULL)))));
1625
1626   i.insert_before(assign(hi, add(hi, _carry(lo, lshift(t2, c16->clone(ir, NULL))))));
1627   i.insert_before(assign(lo,            add(lo, lshift(t2, c16->clone(ir, NULL)))));
1628
1629   if (different_signs == NULL) {
1630      assert(ir->operands[0]->type->base_type == GLSL_TYPE_UINT);
1631
1632      ir->operation = ir_binop_add;
1633      ir->init_num_operands();
1634      ir->operands[0] = add(hi, rshift(t1, c16->clone(ir, NULL)));
1635      ir->operands[1] = rshift(t2, c16->clone(ir, NULL));
1636   } else {
1637      assert(ir->operands[0]->type->base_type == GLSL_TYPE_INT);
1638
1639      i.insert_before(assign(hi, add(add(hi, rshift(t1, c16->clone(ir, NULL))),
1640                                     rshift(t2, c16->clone(ir, NULL)))));
1641
1642      /* For channels where different_signs is set we have to perform a 64-bit
1643       * negation.  This is *not* the same as just negating the high 32-bits.
1644       * Consider -3 * 2.  The high 32-bits is 0, but the desired result is
1645       * -1, not -0!  Recall -x == ~x + 1.
1646       */
1647      ir_variable *neg_hi =
1648         new(ir) ir_variable(glsl_type::ivec(elements), "neg_hi", ir_var_temporary);
1649      ir_constant *c1 = new(ir) ir_constant(1u, elements);
1650
1651      i.insert_before(neg_hi);
1652      i.insert_before(assign(neg_hi, add(bit_not(u2i(hi)),
1653                                         u2i(_carry(bit_not(lo), c1)))));
1654
1655      ir->operation = ir_triop_csel;
1656      ir->init_num_operands();
1657      ir->operands[0] = new(ir) ir_dereference_variable(different_signs);
1658      ir->operands[1] = new(ir) ir_dereference_variable(neg_hi);
1659      ir->operands[2] = u2i(hi);
1660   }
1661}
1662
1663void
1664lower_instructions_visitor::sqrt_to_abs_sqrt(ir_expression *ir)
1665{
1666   ir->operands[0] = new(ir) ir_expression(ir_unop_abs, ir->operands[0]);
1667   this->progress = true;
1668}
1669
1670void
1671lower_instructions_visitor::mul64_to_mul_and_mul_high(ir_expression *ir)
1672{
1673   /* Lower 32x32-> 64 to
1674    *    msb = imul_high(x_lo, y_lo)
1675    *    lsb = mul(x_lo, y_lo)
1676    */
1677   const unsigned elements = ir->operands[0]->type->vector_elements;
1678
1679   const ir_expression_operation operation =
1680      ir->type->base_type == GLSL_TYPE_UINT64 ? ir_unop_pack_uint_2x32
1681                                              : ir_unop_pack_int_2x32;
1682
1683   const glsl_type *var_type = ir->type->base_type == GLSL_TYPE_UINT64
1684                               ? glsl_type::uvec(elements)
1685                               : glsl_type::ivec(elements);
1686
1687   const glsl_type *ret_type = ir->type->base_type == GLSL_TYPE_UINT64
1688                               ? glsl_type::uvec2_type
1689                               : glsl_type::ivec2_type;
1690
1691   ir_instruction &i = *base_ir;
1692
1693   ir_variable *msb =
1694      new(ir) ir_variable(var_type, "msb", ir_var_temporary);
1695   ir_variable *lsb =
1696      new(ir) ir_variable(var_type, "lsb", ir_var_temporary);
1697   ir_variable *x =
1698      new(ir) ir_variable(var_type, "x", ir_var_temporary);
1699   ir_variable *y =
1700      new(ir) ir_variable(var_type, "y", ir_var_temporary);
1701
1702   i.insert_before(x);
1703   i.insert_before(assign(x, ir->operands[0]));
1704   i.insert_before(y);
1705   i.insert_before(assign(y, ir->operands[1]));
1706   i.insert_before(msb);
1707   i.insert_before(lsb);
1708
1709   i.insert_before(assign(msb, imul_high(x, y)));
1710   i.insert_before(assign(lsb, mul(x, y)));
1711
1712   ir_rvalue *result[4] = {NULL};
1713   for (unsigned elem = 0; elem < elements; elem++) {
1714      ir_rvalue *val = new(ir) ir_expression(ir_quadop_vector, ret_type,
1715                                             swizzle(lsb, elem, 1),
1716                                             swizzle(msb, elem, 1), NULL, NULL);
1717      result[elem] = expr(operation, val);
1718   }
1719
1720   ir->operation = ir_quadop_vector;
1721   ir->init_num_operands();
1722   ir->operands[0] = result[0];
1723   ir->operands[1] = result[1];
1724   ir->operands[2] = result[2];
1725   ir->operands[3] = result[3];
1726
1727   this->progress = true;
1728}
1729
1730ir_visitor_status
1731lower_instructions_visitor::visit_leave(ir_expression *ir)
1732{
1733   switch (ir->operation) {
1734   case ir_binop_dot:
1735      if (ir->operands[0]->type->is_double())
1736         double_dot_to_fma(ir);
1737      break;
1738   case ir_triop_lrp:
1739      if (ir->operands[0]->type->is_double())
1740         double_lrp(ir);
1741      break;
1742   case ir_binop_sub:
1743      if (lowering(SUB_TO_ADD_NEG))
1744	 sub_to_add_neg(ir);
1745      break;
1746
1747   case ir_binop_div:
1748      if (ir->operands[1]->type->is_integer() && lowering(INT_DIV_TO_MUL_RCP))
1749	 int_div_to_mul_rcp(ir);
1750      else if ((ir->operands[1]->type->is_float() && lowering(FDIV_TO_MUL_RCP)) ||
1751               (ir->operands[1]->type->is_double() && lowering(DDIV_TO_MUL_RCP)))
1752	 div_to_mul_rcp(ir);
1753      break;
1754
1755   case ir_unop_exp:
1756      if (lowering(EXP_TO_EXP2))
1757	 exp_to_exp2(ir);
1758      break;
1759
1760   case ir_unop_log:
1761      if (lowering(LOG_TO_LOG2))
1762	 log_to_log2(ir);
1763      break;
1764
1765   case ir_binop_mod:
1766      if (lowering(MOD_TO_FLOOR) && (ir->type->is_float() || ir->type->is_double()))
1767	 mod_to_floor(ir);
1768      break;
1769
1770   case ir_binop_pow:
1771      if (lowering(POW_TO_EXP2))
1772	 pow_to_exp2(ir);
1773      break;
1774
1775   case ir_binop_ldexp:
1776      if (lowering(LDEXP_TO_ARITH) && ir->type->is_float())
1777         ldexp_to_arith(ir);
1778      if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->type->is_double())
1779         dldexp_to_arith(ir);
1780      break;
1781
1782   case ir_unop_frexp_exp:
1783      if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1784         dfrexp_exp_to_arith(ir);
1785      break;
1786
1787   case ir_unop_frexp_sig:
1788      if (lowering(DFREXP_DLDEXP_TO_ARITH) && ir->operands[0]->type->is_double())
1789         dfrexp_sig_to_arith(ir);
1790      break;
1791
1792   case ir_binop_carry:
1793      if (lowering(CARRY_TO_ARITH))
1794         carry_to_arith(ir);
1795      break;
1796
1797   case ir_binop_borrow:
1798      if (lowering(BORROW_TO_ARITH))
1799         borrow_to_arith(ir);
1800      break;
1801
1802   case ir_unop_saturate:
1803      if (lowering(SAT_TO_CLAMP))
1804         sat_to_clamp(ir);
1805      break;
1806
1807   case ir_unop_trunc:
1808      if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1809         dtrunc_to_dfrac(ir);
1810      break;
1811
1812   case ir_unop_ceil:
1813      if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1814         dceil_to_dfrac(ir);
1815      break;
1816
1817   case ir_unop_floor:
1818      if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1819         dfloor_to_dfrac(ir);
1820      break;
1821
1822   case ir_unop_round_even:
1823      if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1824         dround_even_to_dfrac(ir);
1825      break;
1826
1827   case ir_unop_sign:
1828      if (lowering(DOPS_TO_DFRAC) && ir->type->is_double())
1829         dsign_to_csel(ir);
1830      break;
1831
1832   case ir_unop_bit_count:
1833      if (lowering(BIT_COUNT_TO_MATH))
1834         bit_count_to_math(ir);
1835      break;
1836
1837   case ir_triop_bitfield_extract:
1838      if (lowering(EXTRACT_TO_SHIFTS))
1839         extract_to_shifts(ir);
1840      break;
1841
1842   case ir_quadop_bitfield_insert:
1843      if (lowering(INSERT_TO_SHIFTS))
1844         insert_to_shifts(ir);
1845      break;
1846
1847   case ir_unop_bitfield_reverse:
1848      if (lowering(REVERSE_TO_SHIFTS))
1849         reverse_to_shifts(ir);
1850      break;
1851
1852   case ir_unop_find_lsb:
1853      if (lowering(FIND_LSB_TO_FLOAT_CAST))
1854         find_lsb_to_float_cast(ir);
1855      break;
1856
1857   case ir_unop_find_msb:
1858      if (lowering(FIND_MSB_TO_FLOAT_CAST))
1859         find_msb_to_float_cast(ir);
1860      break;
1861
1862   case ir_binop_imul_high:
1863      if (lowering(IMUL_HIGH_TO_MUL))
1864         imul_high_to_mul(ir);
1865      break;
1866
1867   case ir_binop_mul:
1868      if (lowering(MUL64_TO_MUL_AND_MUL_HIGH) &&
1869          (ir->type->base_type == GLSL_TYPE_INT64 ||
1870           ir->type->base_type == GLSL_TYPE_UINT64) &&
1871          (ir->operands[0]->type->base_type == GLSL_TYPE_INT ||
1872           ir->operands[1]->type->base_type == GLSL_TYPE_UINT))
1873         mul64_to_mul_and_mul_high(ir);
1874      break;
1875
1876   case ir_unop_rsq:
1877   case ir_unop_sqrt:
1878      if (lowering(SQRT_TO_ABS_SQRT))
1879         sqrt_to_abs_sqrt(ir);
1880      break;
1881
1882   default:
1883      return visit_continue;
1884   }
1885
1886   return visit_continue;
1887}
1888