101e04c3fSmrg/*
201e04c3fSmrg * Copyright © 2012 Intel Corporation
301e04c3fSmrg *
401e04c3fSmrg * Permission is hereby granted, free of charge, to any person obtaining a
501e04c3fSmrg * copy of this software and associated documentation files (the "Software"),
601e04c3fSmrg * to deal in the Software without restriction, including without limitation
701e04c3fSmrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
801e04c3fSmrg * and/or sell copies of the Software, and to permit persons to whom the
901e04c3fSmrg * Software is furnished to do so, subject to the following conditions:
1001e04c3fSmrg *
1101e04c3fSmrg * The above copyright notice and this permission notice (including the next
1201e04c3fSmrg * paragraph) shall be included in all copies or substantial portions of the
1301e04c3fSmrg * Software.
1401e04c3fSmrg *
1501e04c3fSmrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1601e04c3fSmrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1701e04c3fSmrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
1801e04c3fSmrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
1901e04c3fSmrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
2001e04c3fSmrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
2101e04c3fSmrg * DEALINGS IN THE SOFTWARE.
2201e04c3fSmrg */
2301e04c3fSmrg
2401e04c3fSmrg#include "ir.h"
2501e04c3fSmrg#include "ir_builder.h"
2601e04c3fSmrg#include "ir_optimization.h"
2701e04c3fSmrg#include "ir_rvalue_visitor.h"
2801e04c3fSmrg
2901e04c3fSmrgnamespace {
3001e04c3fSmrg
3101e04c3fSmrgusing namespace ir_builder;
3201e04c3fSmrg
3301e04c3fSmrg/**
 * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
3601e04c3fSmrg */
3701e04c3fSmrgclass lower_packing_builtins_visitor : public ir_rvalue_visitor {
3801e04c3fSmrgpublic:
3901e04c3fSmrg   /**
4001e04c3fSmrg    * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
4101e04c3fSmrg    */
4201e04c3fSmrg   explicit lower_packing_builtins_visitor(int op_mask)
4301e04c3fSmrg      : op_mask(op_mask),
4401e04c3fSmrg        progress(false)
4501e04c3fSmrg   {
4601e04c3fSmrg      factory.instructions = &factory_instructions;
4701e04c3fSmrg   }
4801e04c3fSmrg
4901e04c3fSmrg   virtual ~lower_packing_builtins_visitor()
5001e04c3fSmrg   {
5101e04c3fSmrg      assert(factory_instructions.is_empty());
5201e04c3fSmrg   }
5301e04c3fSmrg
5401e04c3fSmrg   bool get_progress() { return progress; }
5501e04c3fSmrg
5601e04c3fSmrg   void handle_rvalue(ir_rvalue **rvalue)
5701e04c3fSmrg   {
5801e04c3fSmrg      if (!*rvalue)
5901e04c3fSmrg	 return;
6001e04c3fSmrg
6101e04c3fSmrg      ir_expression *expr = (*rvalue)->as_expression();
6201e04c3fSmrg      if (!expr)
6301e04c3fSmrg	 return;
6401e04c3fSmrg
6501e04c3fSmrg      enum lower_packing_builtins_op lowering_op =
6601e04c3fSmrg         choose_lowering_op(expr->operation);
6701e04c3fSmrg
6801e04c3fSmrg      if (lowering_op == LOWER_PACK_UNPACK_NONE)
6901e04c3fSmrg         return;
7001e04c3fSmrg
7101e04c3fSmrg      setup_factory(ralloc_parent(expr));
7201e04c3fSmrg
7301e04c3fSmrg      ir_rvalue *op0 = expr->operands[0];
7401e04c3fSmrg      ralloc_steal(factory.mem_ctx, op0);
7501e04c3fSmrg
7601e04c3fSmrg      switch (lowering_op) {
7701e04c3fSmrg      case LOWER_PACK_SNORM_2x16:
7801e04c3fSmrg         *rvalue = lower_pack_snorm_2x16(op0);
7901e04c3fSmrg         break;
8001e04c3fSmrg      case LOWER_PACK_SNORM_4x8:
8101e04c3fSmrg         *rvalue = lower_pack_snorm_4x8(op0);
8201e04c3fSmrg         break;
8301e04c3fSmrg      case LOWER_PACK_UNORM_2x16:
8401e04c3fSmrg         *rvalue = lower_pack_unorm_2x16(op0);
8501e04c3fSmrg         break;
8601e04c3fSmrg      case LOWER_PACK_UNORM_4x8:
8701e04c3fSmrg         *rvalue = lower_pack_unorm_4x8(op0);
8801e04c3fSmrg         break;
8901e04c3fSmrg      case LOWER_PACK_HALF_2x16:
9001e04c3fSmrg         *rvalue = lower_pack_half_2x16(op0);
9101e04c3fSmrg         break;
9201e04c3fSmrg      case LOWER_UNPACK_SNORM_2x16:
9301e04c3fSmrg         *rvalue = lower_unpack_snorm_2x16(op0);
9401e04c3fSmrg         break;
9501e04c3fSmrg      case LOWER_UNPACK_SNORM_4x8:
9601e04c3fSmrg         *rvalue = lower_unpack_snorm_4x8(op0);
9701e04c3fSmrg         break;
9801e04c3fSmrg      case LOWER_UNPACK_UNORM_2x16:
9901e04c3fSmrg         *rvalue = lower_unpack_unorm_2x16(op0);
10001e04c3fSmrg         break;
10101e04c3fSmrg      case LOWER_UNPACK_UNORM_4x8:
10201e04c3fSmrg         *rvalue = lower_unpack_unorm_4x8(op0);
10301e04c3fSmrg         break;
10401e04c3fSmrg      case LOWER_UNPACK_HALF_2x16:
10501e04c3fSmrg         *rvalue = lower_unpack_half_2x16(op0);
10601e04c3fSmrg         break;
10701e04c3fSmrg      case LOWER_PACK_UNPACK_NONE:
10801e04c3fSmrg      case LOWER_PACK_USE_BFI:
10901e04c3fSmrg      case LOWER_PACK_USE_BFE:
11001e04c3fSmrg         assert(!"not reached");
11101e04c3fSmrg         break;
11201e04c3fSmrg      }
11301e04c3fSmrg
11401e04c3fSmrg      teardown_factory();
11501e04c3fSmrg      progress = true;
11601e04c3fSmrg   }
11701e04c3fSmrg
11801e04c3fSmrgprivate:
   /* Bitmask of `enum lower_packing_builtins_op` values passed to the
    * constructor. It selects which built-ins are lowered and is also tested
    * for LOWER_PACK_USE_BFI / LOWER_PACK_USE_BFE by the helper routines.
    */
   const int op_mask;
   /* Set to true by handle_rvalue() once an expression has been lowered. */
   bool progress;
   /* IR builder used while lowering; the constructor points its
    * `instructions` member at factory_instructions.
    */
   ir_factory factory;
   /* Receives what the factory emits until teardown_factory() inserts it
    * before base_ir; asserted to be empty between lowerings.
    */
   exec_list factory_instructions;
12301e04c3fSmrg
12401e04c3fSmrg   /**
12501e04c3fSmrg    * Determine the needed lowering operation by filtering \a expr_op
12601e04c3fSmrg    * through \ref op_mask.
12701e04c3fSmrg    */
12801e04c3fSmrg   enum lower_packing_builtins_op
12901e04c3fSmrg   choose_lowering_op(ir_expression_operation expr_op)
13001e04c3fSmrg   {
13101e04c3fSmrg      /* C++ regards int and enum as fundamentally different types.
13201e04c3fSmrg       * So, we can't simply return from each case; we must cast the return
13301e04c3fSmrg       * value.
13401e04c3fSmrg       */
13501e04c3fSmrg      int result;
13601e04c3fSmrg
13701e04c3fSmrg      switch (expr_op) {
13801e04c3fSmrg      case ir_unop_pack_snorm_2x16:
13901e04c3fSmrg         result = op_mask & LOWER_PACK_SNORM_2x16;
14001e04c3fSmrg         break;
14101e04c3fSmrg      case ir_unop_pack_snorm_4x8:
14201e04c3fSmrg         result = op_mask & LOWER_PACK_SNORM_4x8;
14301e04c3fSmrg         break;
14401e04c3fSmrg      case ir_unop_pack_unorm_2x16:
14501e04c3fSmrg         result = op_mask & LOWER_PACK_UNORM_2x16;
14601e04c3fSmrg         break;
14701e04c3fSmrg      case ir_unop_pack_unorm_4x8:
14801e04c3fSmrg         result = op_mask & LOWER_PACK_UNORM_4x8;
14901e04c3fSmrg         break;
15001e04c3fSmrg      case ir_unop_pack_half_2x16:
15101e04c3fSmrg         result = op_mask & LOWER_PACK_HALF_2x16;
15201e04c3fSmrg         break;
15301e04c3fSmrg      case ir_unop_unpack_snorm_2x16:
15401e04c3fSmrg         result = op_mask & LOWER_UNPACK_SNORM_2x16;
15501e04c3fSmrg         break;
15601e04c3fSmrg      case ir_unop_unpack_snorm_4x8:
15701e04c3fSmrg         result = op_mask & LOWER_UNPACK_SNORM_4x8;
15801e04c3fSmrg         break;
15901e04c3fSmrg      case ir_unop_unpack_unorm_2x16:
16001e04c3fSmrg         result = op_mask & LOWER_UNPACK_UNORM_2x16;
16101e04c3fSmrg         break;
16201e04c3fSmrg      case ir_unop_unpack_unorm_4x8:
16301e04c3fSmrg         result = op_mask & LOWER_UNPACK_UNORM_4x8;
16401e04c3fSmrg         break;
16501e04c3fSmrg      case ir_unop_unpack_half_2x16:
16601e04c3fSmrg         result = op_mask & LOWER_UNPACK_HALF_2x16;
16701e04c3fSmrg         break;
16801e04c3fSmrg      default:
16901e04c3fSmrg         result = LOWER_PACK_UNPACK_NONE;
17001e04c3fSmrg         break;
17101e04c3fSmrg      }
17201e04c3fSmrg
17301e04c3fSmrg      return static_cast<enum lower_packing_builtins_op>(result);
17401e04c3fSmrg   }
17501e04c3fSmrg
17601e04c3fSmrg   void
17701e04c3fSmrg   setup_factory(void *mem_ctx)
17801e04c3fSmrg   {
17901e04c3fSmrg      assert(factory.mem_ctx == NULL);
18001e04c3fSmrg      assert(factory.instructions->is_empty());
18101e04c3fSmrg
18201e04c3fSmrg      factory.mem_ctx = mem_ctx;
18301e04c3fSmrg   }
18401e04c3fSmrg
18501e04c3fSmrg   void
18601e04c3fSmrg   teardown_factory()
18701e04c3fSmrg   {
18801e04c3fSmrg      base_ir->insert_before(factory.instructions);
18901e04c3fSmrg      assert(factory.instructions->is_empty());
19001e04c3fSmrg      factory.mem_ctx = NULL;
19101e04c3fSmrg   }
19201e04c3fSmrg
19301e04c3fSmrg   template <typename T>
19401e04c3fSmrg   ir_constant*
19501e04c3fSmrg   constant(T x)
19601e04c3fSmrg   {
19701e04c3fSmrg      return factory.constant(x);
19801e04c3fSmrg   }
19901e04c3fSmrg
20001e04c3fSmrg   /**
20101e04c3fSmrg    * \brief Pack two uint16's into a single uint32.
20201e04c3fSmrg    *
20301e04c3fSmrg    * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
20401e04c3fSmrg    * where the least significant bits specify the first element of the pair.
20501e04c3fSmrg    * Return the uint32.
20601e04c3fSmrg    */
20701e04c3fSmrg   ir_rvalue*
20801e04c3fSmrg   pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
20901e04c3fSmrg   {
21001e04c3fSmrg      assert(uvec2_rval->type == glsl_type::uvec2_type);
21101e04c3fSmrg
21201e04c3fSmrg      /* uvec2 u = UVEC2_RVAL; */
21301e04c3fSmrg      ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
21401e04c3fSmrg                                         "tmp_pack_uvec2_to_uint");
21501e04c3fSmrg      factory.emit(assign(u, uvec2_rval));
21601e04c3fSmrg
21701e04c3fSmrg      if (op_mask & LOWER_PACK_USE_BFI) {
21801e04c3fSmrg         return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
21901e04c3fSmrg                                swizzle_y(u),
22001e04c3fSmrg                                constant(16u),
22101e04c3fSmrg                                constant(16u));
22201e04c3fSmrg      }
22301e04c3fSmrg
22401e04c3fSmrg      /* return (u.y << 16) | (u.x & 0xffff); */
22501e04c3fSmrg      return bit_or(lshift(swizzle_y(u), constant(16u)),
22601e04c3fSmrg                    bit_and(swizzle_x(u), constant(0xffffu)));
22701e04c3fSmrg   }
22801e04c3fSmrg
22901e04c3fSmrg   /**
23001e04c3fSmrg    * \brief Pack four uint8's into a single uint32.
23101e04c3fSmrg    *
23201e04c3fSmrg    * Interpret the given uvec4 as a uint32 4-typle. Pack the 4-tuple into a
23301e04c3fSmrg    * uint32 where the least significant bits specify the first element of the
23401e04c3fSmrg    * 4-tuple. Return the uint32.
23501e04c3fSmrg    */
23601e04c3fSmrg   ir_rvalue*
23701e04c3fSmrg   pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
23801e04c3fSmrg   {
23901e04c3fSmrg      assert(uvec4_rval->type == glsl_type::uvec4_type);
24001e04c3fSmrg
24101e04c3fSmrg      ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
24201e04c3fSmrg                                         "tmp_pack_uvec4_to_uint");
24301e04c3fSmrg
24401e04c3fSmrg      if (op_mask & LOWER_PACK_USE_BFI) {
24501e04c3fSmrg         /* uvec4 u = UVEC4_RVAL; */
24601e04c3fSmrg         factory.emit(assign(u, uvec4_rval));
24701e04c3fSmrg
24801e04c3fSmrg         return bitfield_insert(bitfield_insert(
24901e04c3fSmrg                                   bitfield_insert(
25001e04c3fSmrg                                      bit_and(swizzle_x(u), constant(0xffu)),
25101e04c3fSmrg                                      swizzle_y(u), constant(8u), constant(8u)),
25201e04c3fSmrg                                   swizzle_z(u), constant(16u), constant(8u)),
25301e04c3fSmrg                                swizzle_w(u), constant(24u), constant(8u));
25401e04c3fSmrg      }
25501e04c3fSmrg
25601e04c3fSmrg      /* uvec4 u = UVEC4_RVAL & 0xff */
25701e04c3fSmrg      factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
25801e04c3fSmrg
25901e04c3fSmrg      /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
26001e04c3fSmrg      return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
26101e04c3fSmrg                           lshift(swizzle_z(u), constant(16u))),
26201e04c3fSmrg                    bit_or(lshift(swizzle_y(u), constant(8u)),
26301e04c3fSmrg                           swizzle_x(u)));
26401e04c3fSmrg   }
26501e04c3fSmrg
26601e04c3fSmrg   /**
26701e04c3fSmrg    * \brief Unpack a uint32 into two uint16's.
26801e04c3fSmrg    *
26901e04c3fSmrg    * Interpret the given uint32 as a uint16 pair where the uint32's least
27001e04c3fSmrg    * significant bits specify the pair's first element. Return the uint16
27101e04c3fSmrg    * pair as a uvec2.
27201e04c3fSmrg    */
27301e04c3fSmrg   ir_rvalue*
27401e04c3fSmrg   unpack_uint_to_uvec2(ir_rvalue *uint_rval)
27501e04c3fSmrg   {
27601e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
27701e04c3fSmrg
27801e04c3fSmrg      /* uint u = UINT_RVAL; */
27901e04c3fSmrg      ir_variable *u = factory.make_temp(glsl_type::uint_type,
28001e04c3fSmrg                                          "tmp_unpack_uint_to_uvec2_u");
28101e04c3fSmrg      factory.emit(assign(u, uint_rval));
28201e04c3fSmrg
28301e04c3fSmrg      /* uvec2 u2; */
28401e04c3fSmrg      ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
28501e04c3fSmrg                                           "tmp_unpack_uint_to_uvec2_u2");
28601e04c3fSmrg
28701e04c3fSmrg      /* u2.x = u & 0xffffu; */
28801e04c3fSmrg      factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
28901e04c3fSmrg
29001e04c3fSmrg      /* u2.y = u >> 16u; */
29101e04c3fSmrg      factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
29201e04c3fSmrg
29301e04c3fSmrg      return deref(u2).val;
29401e04c3fSmrg   }
29501e04c3fSmrg
29601e04c3fSmrg   /**
29701e04c3fSmrg    * \brief Unpack a uint32 into two int16's.
29801e04c3fSmrg    *
29901e04c3fSmrg    * Specifically each 16-bit value is sign-extended to the full width of an
30001e04c3fSmrg    * int32 on return.
30101e04c3fSmrg    */
30201e04c3fSmrg   ir_rvalue *
30301e04c3fSmrg   unpack_uint_to_ivec2(ir_rvalue *uint_rval)
30401e04c3fSmrg   {
30501e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
30601e04c3fSmrg
30701e04c3fSmrg      if (!(op_mask & LOWER_PACK_USE_BFE)) {
30801e04c3fSmrg         return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
30901e04c3fSmrg                              constant(16u)),
31001e04c3fSmrg                       constant(16u));
31101e04c3fSmrg      }
31201e04c3fSmrg
31301e04c3fSmrg      ir_variable *i = factory.make_temp(glsl_type::int_type,
31401e04c3fSmrg                                         "tmp_unpack_uint_to_ivec2_i");
31501e04c3fSmrg      factory.emit(assign(i, u2i(uint_rval)));
31601e04c3fSmrg
31701e04c3fSmrg      /* ivec2 i2; */
31801e04c3fSmrg      ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
31901e04c3fSmrg                                          "tmp_unpack_uint_to_ivec2_i2");
32001e04c3fSmrg
32101e04c3fSmrg      factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
32201e04c3fSmrg                          WRITEMASK_X));
32301e04c3fSmrg      factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
32401e04c3fSmrg                          WRITEMASK_Y));
32501e04c3fSmrg
32601e04c3fSmrg      return deref(i2).val;
32701e04c3fSmrg   }
32801e04c3fSmrg
32901e04c3fSmrg   /**
33001e04c3fSmrg    * \brief Unpack a uint32 into four uint8's.
33101e04c3fSmrg    *
33201e04c3fSmrg    * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
33301e04c3fSmrg    * significant bits specify the 4-tuple's first element. Return the uint8
33401e04c3fSmrg    * 4-tuple as a uvec4.
33501e04c3fSmrg    */
33601e04c3fSmrg   ir_rvalue*
33701e04c3fSmrg   unpack_uint_to_uvec4(ir_rvalue *uint_rval)
33801e04c3fSmrg   {
33901e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
34001e04c3fSmrg
34101e04c3fSmrg      /* uint u = UINT_RVAL; */
34201e04c3fSmrg      ir_variable *u = factory.make_temp(glsl_type::uint_type,
34301e04c3fSmrg                                          "tmp_unpack_uint_to_uvec4_u");
34401e04c3fSmrg      factory.emit(assign(u, uint_rval));
34501e04c3fSmrg
34601e04c3fSmrg      /* uvec4 u4; */
34701e04c3fSmrg      ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
34801e04c3fSmrg                                           "tmp_unpack_uint_to_uvec4_u4");
34901e04c3fSmrg
35001e04c3fSmrg      /* u4.x = u & 0xffu; */
35101e04c3fSmrg      factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
35201e04c3fSmrg
35301e04c3fSmrg      if (op_mask & LOWER_PACK_USE_BFE) {
35401e04c3fSmrg         /* u4.y = bitfield_extract(u, 8, 8); */
35501e04c3fSmrg         factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
35601e04c3fSmrg                             WRITEMASK_Y));
35701e04c3fSmrg
35801e04c3fSmrg         /* u4.z = bitfield_extract(u, 16, 8); */
35901e04c3fSmrg         factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
36001e04c3fSmrg                             WRITEMASK_Z));
36101e04c3fSmrg      } else {
36201e04c3fSmrg         /* u4.y = (u >> 8u) & 0xffu; */
36301e04c3fSmrg         factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
36401e04c3fSmrg                                         constant(0xffu)), WRITEMASK_Y));
36501e04c3fSmrg
36601e04c3fSmrg         /* u4.z = (u >> 16u) & 0xffu; */
36701e04c3fSmrg         factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
36801e04c3fSmrg                                         constant(0xffu)), WRITEMASK_Z));
36901e04c3fSmrg      }
37001e04c3fSmrg
37101e04c3fSmrg      /* u4.w = (u >> 24u) */
37201e04c3fSmrg      factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
37301e04c3fSmrg
37401e04c3fSmrg      return deref(u4).val;
37501e04c3fSmrg   }
37601e04c3fSmrg
37701e04c3fSmrg   /**
37801e04c3fSmrg    * \brief Unpack a uint32 into four int8's.
37901e04c3fSmrg    *
38001e04c3fSmrg    * Specifically each 8-bit value is sign-extended to the full width of an
38101e04c3fSmrg    * int32 on return.
38201e04c3fSmrg    */
38301e04c3fSmrg   ir_rvalue *
38401e04c3fSmrg   unpack_uint_to_ivec4(ir_rvalue *uint_rval)
38501e04c3fSmrg   {
38601e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
38701e04c3fSmrg
38801e04c3fSmrg      if (!(op_mask & LOWER_PACK_USE_BFE)) {
38901e04c3fSmrg         return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
39001e04c3fSmrg                              constant(24u)),
39101e04c3fSmrg                       constant(24u));
39201e04c3fSmrg      }
39301e04c3fSmrg
39401e04c3fSmrg      ir_variable *i = factory.make_temp(glsl_type::int_type,
39501e04c3fSmrg                                         "tmp_unpack_uint_to_ivec4_i");
39601e04c3fSmrg      factory.emit(assign(i, u2i(uint_rval)));
39701e04c3fSmrg
39801e04c3fSmrg      /* ivec4 i4; */
39901e04c3fSmrg      ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
40001e04c3fSmrg                                          "tmp_unpack_uint_to_ivec4_i4");
40101e04c3fSmrg
40201e04c3fSmrg      factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
40301e04c3fSmrg                          WRITEMASK_X));
40401e04c3fSmrg      factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
40501e04c3fSmrg                          WRITEMASK_Y));
40601e04c3fSmrg      factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
40701e04c3fSmrg                          WRITEMASK_Z));
40801e04c3fSmrg      factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
40901e04c3fSmrg                          WRITEMASK_W));
41001e04c3fSmrg
41101e04c3fSmrg      return deref(i4).val;
41201e04c3fSmrg   }
41301e04c3fSmrg
41401e04c3fSmrg   /**
41501e04c3fSmrg    * \brief Lower a packSnorm2x16 expression.
41601e04c3fSmrg    *
41701e04c3fSmrg    * \param vec2_rval is packSnorm2x16's input
41801e04c3fSmrg    * \return packSnorm2x16's output as a uint rvalue
41901e04c3fSmrg    */
42001e04c3fSmrg   ir_rvalue*
42101e04c3fSmrg   lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
42201e04c3fSmrg   {
42301e04c3fSmrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
42401e04c3fSmrg       *
42501e04c3fSmrg       *    highp uint packSnorm2x16(vec2 v)
42601e04c3fSmrg       *    --------------------------------
42701e04c3fSmrg       *    First, converts each component of the normalized floating-point value
42801e04c3fSmrg       *    v into 16-bit integer values. Then, the results are packed into the
42901e04c3fSmrg       *    returned 32-bit unsigned integer.
43001e04c3fSmrg       *
43101e04c3fSmrg       *    The conversion for component c of v to fixed point is done as
43201e04c3fSmrg       *    follows:
43301e04c3fSmrg       *
43401e04c3fSmrg       *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
43501e04c3fSmrg       *
43601e04c3fSmrg       *    The first component of the vector will be written to the least
43701e04c3fSmrg       *    significant bits of the output; the last component will be written to
43801e04c3fSmrg       *    the most significant bits.
43901e04c3fSmrg       *
44001e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
44101e04c3fSmrg       *
44201e04c3fSmrg       *     return pack_uvec2_to_uint(
44301e04c3fSmrg       *         uvec2(ivec2(
44401e04c3fSmrg       *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
44501e04c3fSmrg       *
44601e04c3fSmrg       * It is necessary to first convert the vec2 to ivec2 rather than directly
44701e04c3fSmrg       * converting vec2 to uvec2 because the latter conversion is undefined.
44801e04c3fSmrg       * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
44901e04c3fSmrg       * convert a negative floating point value to an uint".
45001e04c3fSmrg       */
45101e04c3fSmrg      assert(vec2_rval->type == glsl_type::vec2_type);
45201e04c3fSmrg
45301e04c3fSmrg      ir_rvalue *result = pack_uvec2_to_uint(
45401e04c3fSmrg            i2u(f2i(round_even(mul(clamp(vec2_rval,
45501e04c3fSmrg                                         constant(-1.0f),
45601e04c3fSmrg                                         constant(1.0f)),
45701e04c3fSmrg                                   constant(32767.0f))))));
45801e04c3fSmrg
45901e04c3fSmrg      assert(result->type == glsl_type::uint_type);
46001e04c3fSmrg      return result;
46101e04c3fSmrg   }
46201e04c3fSmrg
46301e04c3fSmrg   /**
46401e04c3fSmrg    * \brief Lower a packSnorm4x8 expression.
46501e04c3fSmrg    *
46601e04c3fSmrg    * \param vec4_rval is packSnorm4x8's input
46701e04c3fSmrg    * \return packSnorm4x8's output as a uint rvalue
46801e04c3fSmrg    */
46901e04c3fSmrg   ir_rvalue*
47001e04c3fSmrg   lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
47101e04c3fSmrg   {
47201e04c3fSmrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
47301e04c3fSmrg       *
47401e04c3fSmrg       *    highp uint packSnorm4x8(vec4 v)
47501e04c3fSmrg       *    -------------------------------
47601e04c3fSmrg       *    First, converts each component of the normalized floating-point value
47701e04c3fSmrg       *    v into 8-bit integer values. Then, the results are packed into the
47801e04c3fSmrg       *    returned 32-bit unsigned integer.
47901e04c3fSmrg       *
48001e04c3fSmrg       *    The conversion for component c of v to fixed point is done as
48101e04c3fSmrg       *    follows:
48201e04c3fSmrg       *
48301e04c3fSmrg       *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
48401e04c3fSmrg       *
48501e04c3fSmrg       *    The first component of the vector will be written to the least
48601e04c3fSmrg       *    significant bits of the output; the last component will be written to
48701e04c3fSmrg       *    the most significant bits.
48801e04c3fSmrg       *
48901e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
49001e04c3fSmrg       *
49101e04c3fSmrg       *     return pack_uvec4_to_uint(
49201e04c3fSmrg       *         uvec4(ivec4(
49301e04c3fSmrg       *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
49401e04c3fSmrg       *
49501e04c3fSmrg       * It is necessary to first convert the vec4 to ivec4 rather than directly
49601e04c3fSmrg       * converting vec4 to uvec4 because the latter conversion is undefined.
49701e04c3fSmrg       * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
49801e04c3fSmrg       * convert a negative floating point value to an uint".
49901e04c3fSmrg       */
50001e04c3fSmrg      assert(vec4_rval->type == glsl_type::vec4_type);
50101e04c3fSmrg
50201e04c3fSmrg      ir_rvalue *result = pack_uvec4_to_uint(
50301e04c3fSmrg            i2u(f2i(round_even(mul(clamp(vec4_rval,
50401e04c3fSmrg                                         constant(-1.0f),
50501e04c3fSmrg                                         constant(1.0f)),
50601e04c3fSmrg                                   constant(127.0f))))));
50701e04c3fSmrg
50801e04c3fSmrg      assert(result->type == glsl_type::uint_type);
50901e04c3fSmrg      return result;
51001e04c3fSmrg   }
51101e04c3fSmrg
51201e04c3fSmrg   /**
51301e04c3fSmrg    * \brief Lower an unpackSnorm2x16 expression.
51401e04c3fSmrg    *
51501e04c3fSmrg    * \param uint_rval is unpackSnorm2x16's input
51601e04c3fSmrg    * \return unpackSnorm2x16's output as a vec2 rvalue
51701e04c3fSmrg    */
51801e04c3fSmrg   ir_rvalue*
51901e04c3fSmrg   lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
52001e04c3fSmrg   {
52101e04c3fSmrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
52201e04c3fSmrg       *
52301e04c3fSmrg       *    highp vec2 unpackSnorm2x16 (highp uint p)
52401e04c3fSmrg       *    -----------------------------------------
52501e04c3fSmrg       *    First, unpacks a single 32-bit unsigned integer p into a pair of
52601e04c3fSmrg       *    16-bit unsigned integers. Then, each component is converted to
52701e04c3fSmrg       *    a normalized floating-point value to generate the returned
52801e04c3fSmrg       *    two-component vector.
52901e04c3fSmrg       *
53001e04c3fSmrg       *    The conversion for unpacked fixed-point value f to floating point is
53101e04c3fSmrg       *    done as follows:
53201e04c3fSmrg       *
53301e04c3fSmrg       *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
53401e04c3fSmrg       *
53501e04c3fSmrg       *    The first component of the returned vector will be extracted from the
53601e04c3fSmrg       *    least significant bits of the input; the last component will be
53701e04c3fSmrg       *    extracted from the most significant bits.
53801e04c3fSmrg       *
53901e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
54001e04c3fSmrg       *
54101e04c3fSmrg       *    return clamp(
54201e04c3fSmrg       *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
54301e04c3fSmrg       *       -1.0f, 1.0f);
54401e04c3fSmrg       *
54501e04c3fSmrg       * The above IR may appear unnecessarily complex, but the intermediate
54601e04c3fSmrg       * conversion to ivec2 and the bit shifts are necessary to correctly unpack
54701e04c3fSmrg       * negative floats.
54801e04c3fSmrg       *
54901e04c3fSmrg       * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
55001e04c3fSmrg       * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
55101e04c3fSmrg       * place that int16 into an int32, which results in the *positive* integer
55201e04c3fSmrg       * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
55301e04c3fSmrg       * unimportant bit 16. We must now extend the int16's sign bit into bits
55401e04c3fSmrg       * 17-32, which is accomplished by left-shifting then right-shifting.
55501e04c3fSmrg       */
55601e04c3fSmrg
55701e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
55801e04c3fSmrg
55901e04c3fSmrg      ir_rvalue *result =
56001e04c3fSmrg        clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
56101e04c3fSmrg                  constant(32767.0f)),
56201e04c3fSmrg              constant(-1.0f),
56301e04c3fSmrg              constant(1.0f));
56401e04c3fSmrg
56501e04c3fSmrg      assert(result->type == glsl_type::vec2_type);
56601e04c3fSmrg      return result;
56701e04c3fSmrg   }
56801e04c3fSmrg
56901e04c3fSmrg   /**
57001e04c3fSmrg    * \brief Lower an unpackSnorm4x8 expression.
57101e04c3fSmrg    *
57201e04c3fSmrg    * \param uint_rval is unpackSnorm4x8's input
57301e04c3fSmrg    * \return unpackSnorm4x8's output as a vec4 rvalue
57401e04c3fSmrg    */
57501e04c3fSmrg   ir_rvalue*
57601e04c3fSmrg   lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
57701e04c3fSmrg   {
57801e04c3fSmrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
57901e04c3fSmrg       *
58001e04c3fSmrg       *    highp vec4 unpackSnorm4x8 (highp uint p)
58101e04c3fSmrg       *    ----------------------------------------
58201e04c3fSmrg       *    First, unpacks a single 32-bit unsigned integer p into four
58301e04c3fSmrg       *    8-bit unsigned integers. Then, each component is converted to
58401e04c3fSmrg       *    a normalized floating-point value to generate the returned
58501e04c3fSmrg       *    four-component vector.
58601e04c3fSmrg       *
58701e04c3fSmrg       *    The conversion for unpacked fixed-point value f to floating point is
58801e04c3fSmrg       *    done as follows:
58901e04c3fSmrg       *
59001e04c3fSmrg       *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
59101e04c3fSmrg       *
59201e04c3fSmrg       *    The first component of the returned vector will be extracted from the
59301e04c3fSmrg       *    least significant bits of the input; the last component will be
59401e04c3fSmrg       *    extracted from the most significant bits.
59501e04c3fSmrg       *
59601e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
59701e04c3fSmrg       *
59801e04c3fSmrg       *    return clamp(
59901e04c3fSmrg       *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
60001e04c3fSmrg       *       -1.0f, 1.0f);
60101e04c3fSmrg       *
60201e04c3fSmrg       * The above IR may appear unnecessarily complex, but the intermediate
60301e04c3fSmrg       * conversion to ivec4 and the bit shifts are necessary to correctly unpack
60401e04c3fSmrg       * negative floats.
60501e04c3fSmrg       *
60601e04c3fSmrg       * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
60701e04c3fSmrg       * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
60801e04c3fSmrg       * place that int8 into an int32, which results in the *positive* integer
60901e04c3fSmrg       * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
61001e04c3fSmrg       * unimportant bit 8. We must now extend the int8's sign bit into bits
61101e04c3fSmrg       * 9-32, which is accomplished by left-shifting then right-shifting.
61201e04c3fSmrg       */
61301e04c3fSmrg
61401e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
61501e04c3fSmrg
61601e04c3fSmrg      ir_rvalue *result =
61701e04c3fSmrg        clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
61801e04c3fSmrg                  constant(127.0f)),
61901e04c3fSmrg              constant(-1.0f),
62001e04c3fSmrg              constant(1.0f));
62101e04c3fSmrg
62201e04c3fSmrg      assert(result->type == glsl_type::vec4_type);
62301e04c3fSmrg      return result;
62401e04c3fSmrg   }
62501e04c3fSmrg
62601e04c3fSmrg   /**
62701e04c3fSmrg    * \brief Lower a packUnorm2x16 expression.
62801e04c3fSmrg    *
62901e04c3fSmrg    * \param vec2_rval is packUnorm2x16's input
63001e04c3fSmrg    * \return packUnorm2x16's output as a uint rvalue
63101e04c3fSmrg    */
63201e04c3fSmrg   ir_rvalue*
63301e04c3fSmrg   lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
63401e04c3fSmrg   {
63501e04c3fSmrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
63601e04c3fSmrg       *
63701e04c3fSmrg       *    highp uint packUnorm2x16 (vec2 v)
63801e04c3fSmrg       *    ---------------------------------
63901e04c3fSmrg       *    First, converts each component of the normalized floating-point value
64001e04c3fSmrg       *    v into 16-bit integer values. Then, the results are packed into the
64101e04c3fSmrg       *    returned 32-bit unsigned integer.
64201e04c3fSmrg       *
64301e04c3fSmrg       *    The conversion for component c of v to fixed point is done as
64401e04c3fSmrg       *    follows:
64501e04c3fSmrg       *
64601e04c3fSmrg       *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
64701e04c3fSmrg       *
64801e04c3fSmrg       *    The first component of the vector will be written to the least
64901e04c3fSmrg       *    significant bits of the output; the last component will be written to
65001e04c3fSmrg       *    the most significant bits.
65101e04c3fSmrg       *
65201e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
65301e04c3fSmrg       *
65401e04c3fSmrg       *     return pack_uvec2_to_uint(uvec2(
65501e04c3fSmrg       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
65601e04c3fSmrg       *
65701e04c3fSmrg       * Here it is safe to directly convert the vec2 to uvec2 because the vec2
65801e04c3fSmrg       * has been clamped to a non-negative range.
65901e04c3fSmrg       */
66001e04c3fSmrg
66101e04c3fSmrg      assert(vec2_rval->type == glsl_type::vec2_type);
66201e04c3fSmrg
66301e04c3fSmrg      ir_rvalue *result = pack_uvec2_to_uint(
66401e04c3fSmrg         f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
66501e04c3fSmrg
66601e04c3fSmrg      assert(result->type == glsl_type::uint_type);
66701e04c3fSmrg      return result;
66801e04c3fSmrg   }
66901e04c3fSmrg
67001e04c3fSmrg   /**
67101e04c3fSmrg    * \brief Lower a packUnorm4x8 expression.
67201e04c3fSmrg    *
67301e04c3fSmrg    * \param vec4_rval is packUnorm4x8's input
67401e04c3fSmrg    * \return packUnorm4x8's output as a uint rvalue
67501e04c3fSmrg    */
67601e04c3fSmrg   ir_rvalue*
67701e04c3fSmrg   lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
67801e04c3fSmrg   {
67901e04c3fSmrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
68001e04c3fSmrg       *
68101e04c3fSmrg       *    highp uint packUnorm4x8 (vec4 v)
68201e04c3fSmrg       *    --------------------------------
68301e04c3fSmrg       *    First, converts each component of the normalized floating-point value
68401e04c3fSmrg       *    v into 8-bit integer values. Then, the results are packed into the
68501e04c3fSmrg       *    returned 32-bit unsigned integer.
68601e04c3fSmrg       *
68701e04c3fSmrg       *    The conversion for component c of v to fixed point is done as
68801e04c3fSmrg       *    follows:
68901e04c3fSmrg       *
69001e04c3fSmrg       *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
69101e04c3fSmrg       *
69201e04c3fSmrg       *    The first component of the vector will be written to the least
69301e04c3fSmrg       *    significant bits of the output; the last component will be written to
69401e04c3fSmrg       *    the most significant bits.
69501e04c3fSmrg       *
69601e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
69701e04c3fSmrg       *
69801e04c3fSmrg       *     return pack_uvec4_to_uint(uvec4(
69901e04c3fSmrg       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
70001e04c3fSmrg       *
70101e04c3fSmrg       * Here it is safe to directly convert the vec4 to uvec4 because the vec4
70201e04c3fSmrg       * has been clamped to a non-negative range.
70301e04c3fSmrg       */
70401e04c3fSmrg
70501e04c3fSmrg      assert(vec4_rval->type == glsl_type::vec4_type);
70601e04c3fSmrg
70701e04c3fSmrg      ir_rvalue *result = pack_uvec4_to_uint(
70801e04c3fSmrg         f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
70901e04c3fSmrg
71001e04c3fSmrg      assert(result->type == glsl_type::uint_type);
71101e04c3fSmrg      return result;
71201e04c3fSmrg   }
71301e04c3fSmrg
71401e04c3fSmrg   /**
71501e04c3fSmrg    * \brief Lower an unpackUnorm2x16 expression.
71601e04c3fSmrg    *
71701e04c3fSmrg    * \param uint_rval is unpackUnorm2x16's input
71801e04c3fSmrg    * \return unpackUnorm2x16's output as a vec2 rvalue
71901e04c3fSmrg    */
72001e04c3fSmrg   ir_rvalue*
72101e04c3fSmrg   lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
72201e04c3fSmrg   {
72301e04c3fSmrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
72401e04c3fSmrg       *
72501e04c3fSmrg       *    highp vec2 unpackUnorm2x16 (highp uint p)
72601e04c3fSmrg       *    -----------------------------------------
72701e04c3fSmrg       *    First, unpacks a single 32-bit unsigned integer p into a pair of
72801e04c3fSmrg       *    16-bit unsigned integers. Then, each component is converted to
72901e04c3fSmrg       *    a normalized floating-point value to generate the returned
73001e04c3fSmrg       *    two-component vector.
73101e04c3fSmrg       *
73201e04c3fSmrg       *    The conversion for unpacked fixed-point value f to floating point is
73301e04c3fSmrg       *    done as follows:
73401e04c3fSmrg       *
73501e04c3fSmrg       *       unpackUnorm2x16: f / 65535.0
73601e04c3fSmrg       *
73701e04c3fSmrg       *    The first component of the returned vector will be extracted from the
73801e04c3fSmrg       *    least significant bits of the input; the last component will be
73901e04c3fSmrg       *    extracted from the most significant bits.
74001e04c3fSmrg       *
74101e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
74201e04c3fSmrg       *
74301e04c3fSmrg       *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
74401e04c3fSmrg       */
74501e04c3fSmrg
74601e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
74701e04c3fSmrg
74801e04c3fSmrg      ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
74901e04c3fSmrg                              constant(65535.0f));
75001e04c3fSmrg
75101e04c3fSmrg      assert(result->type == glsl_type::vec2_type);
75201e04c3fSmrg      return result;
75301e04c3fSmrg   }
75401e04c3fSmrg
75501e04c3fSmrg   /**
75601e04c3fSmrg    * \brief Lower an unpackUnorm4x8 expression.
75701e04c3fSmrg    *
75801e04c3fSmrg    * \param uint_rval is unpackUnorm4x8's input
75901e04c3fSmrg    * \return unpackUnorm4x8's output as a vec4 rvalue
76001e04c3fSmrg    */
76101e04c3fSmrg   ir_rvalue*
76201e04c3fSmrg   lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
76301e04c3fSmrg   {
76401e04c3fSmrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
76501e04c3fSmrg       *
76601e04c3fSmrg       *    highp vec4 unpackUnorm4x8 (highp uint p)
76701e04c3fSmrg       *    ----------------------------------------
76801e04c3fSmrg       *    First, unpacks a single 32-bit unsigned integer p into four
76901e04c3fSmrg       *    8-bit unsigned integers. Then, each component is converted to
77001e04c3fSmrg       *    a normalized floating-point value to generate the returned
77101e04c3fSmrg       *    two-component vector.
77201e04c3fSmrg       *
77301e04c3fSmrg       *    The conversion for unpacked fixed-point value f to floating point is
77401e04c3fSmrg       *    done as follows:
77501e04c3fSmrg       *
77601e04c3fSmrg       *       unpackUnorm4x8: f / 255.0
77701e04c3fSmrg       *
77801e04c3fSmrg       *    The first component of the returned vector will be extracted from the
77901e04c3fSmrg       *    least significant bits of the input; the last component will be
78001e04c3fSmrg       *    extracted from the most significant bits.
78101e04c3fSmrg       *
78201e04c3fSmrg       * This function generates IR that approximates the following pseudo-GLSL:
78301e04c3fSmrg       *
78401e04c3fSmrg       *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
78501e04c3fSmrg       */
78601e04c3fSmrg
78701e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
78801e04c3fSmrg
78901e04c3fSmrg      ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
79001e04c3fSmrg                              constant(255.0f));
79101e04c3fSmrg
79201e04c3fSmrg      assert(result->type == glsl_type::vec4_type);
79301e04c3fSmrg      return result;
79401e04c3fSmrg   }
79501e04c3fSmrg
79601e04c3fSmrg   /**
79701e04c3fSmrg    * \brief Lower the component-wise calculation of packHalf2x16.
79801e04c3fSmrg    *
79901e04c3fSmrg    * \param f_rval is one component of packHalf2x16's input
80001e04c3fSmrg    * \param e_rval is the unshifted exponent bits of f_rval
80101e04c3fSmrg    * \param m_rval is the unshifted mantissa bits of f_rval
80201e04c3fSmrg    *
80301e04c3fSmrg    * \return a uint rvalue that encodes a float16 in its lower 16 bits
80401e04c3fSmrg    */
80501e04c3fSmrg   ir_rvalue*
80601e04c3fSmrg   pack_half_1x16_nosign(ir_rvalue *f_rval,
80701e04c3fSmrg                         ir_rvalue *e_rval,
80801e04c3fSmrg                         ir_rvalue *m_rval)
80901e04c3fSmrg   {
81001e04c3fSmrg      assert(e_rval->type == glsl_type::uint_type);
81101e04c3fSmrg      assert(m_rval->type == glsl_type::uint_type);
81201e04c3fSmrg
81301e04c3fSmrg      /* uint u16; */
81401e04c3fSmrg      ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
81501e04c3fSmrg                                           "tmp_pack_half_1x16_u16");
81601e04c3fSmrg
81701e04c3fSmrg      /* float f = FLOAT_RVAL; */
81801e04c3fSmrg      ir_variable *f = factory.make_temp(glsl_type::float_type,
81901e04c3fSmrg                                          "tmp_pack_half_1x16_f");
82001e04c3fSmrg      factory.emit(assign(f, f_rval));
82101e04c3fSmrg
82201e04c3fSmrg      /* uint e = E_RVAL; */
82301e04c3fSmrg      ir_variable *e = factory.make_temp(glsl_type::uint_type,
82401e04c3fSmrg                                          "tmp_pack_half_1x16_e");
82501e04c3fSmrg      factory.emit(assign(e, e_rval));
82601e04c3fSmrg
82701e04c3fSmrg      /* uint m = M_RVAL; */
82801e04c3fSmrg      ir_variable *m = factory.make_temp(glsl_type::uint_type,
82901e04c3fSmrg                                          "tmp_pack_half_1x16_m");
83001e04c3fSmrg      factory.emit(assign(m, m_rval));
83101e04c3fSmrg
83201e04c3fSmrg      /* Preliminaries
83301e04c3fSmrg       * -------------
83401e04c3fSmrg       *
83501e04c3fSmrg       * For a float16, the bit layout is:
83601e04c3fSmrg       *
83701e04c3fSmrg       *   sign:     15
83801e04c3fSmrg       *   exponent: 10:14
83901e04c3fSmrg       *   mantissa: 0:9
84001e04c3fSmrg       *
84101e04c3fSmrg       * Let f16 be a float16 value. The sign, exponent, and mantissa
84201e04c3fSmrg       * determine its value thus:
84301e04c3fSmrg       *
84401e04c3fSmrg       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
84501e04c3fSmrg       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
84601e04c3fSmrg       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
84701e04c3fSmrg       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
84801e04c3fSmrg       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
84901e04c3fSmrg       *
85001e04c3fSmrg       * where 0 <= m16 < 2^10.
85101e04c3fSmrg       *
85201e04c3fSmrg       * For a float32, the bit layout is:
85301e04c3fSmrg       *
85401e04c3fSmrg       *   sign:     31
85501e04c3fSmrg       *   exponent: 23:30
85601e04c3fSmrg       *   mantissa: 0:22
85701e04c3fSmrg       *
85801e04c3fSmrg       * Let f32 be a float32 value. The sign, exponent, and mantissa
85901e04c3fSmrg       * determine its value thus:
86001e04c3fSmrg       *
86101e04c3fSmrg       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
86201e04c3fSmrg       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
86301e04c3fSmrg       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
86401e04c3fSmrg       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
86501e04c3fSmrg       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
86601e04c3fSmrg       *
86701e04c3fSmrg       * where 0 <= m32 < 2^23.
86801e04c3fSmrg       *
86901e04c3fSmrg       * The minimum and maximum normal float16 values are
87001e04c3fSmrg       *
87101e04c3fSmrg       *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
87201e04c3fSmrg       *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
87301e04c3fSmrg       *
87401e04c3fSmrg       * The step at max_norm16 is
87501e04c3fSmrg       *
87601e04c3fSmrg       *   max_step16 = 2^5                                     (22)
87701e04c3fSmrg       *
87801e04c3fSmrg       * Observe that the float16 boundary values in equations 20-21 lie in the
87901e04c3fSmrg       * range of normal float32 values.
88001e04c3fSmrg       *
88101e04c3fSmrg       *
88201e04c3fSmrg       * Rounding Behavior
88301e04c3fSmrg       * -----------------
88401e04c3fSmrg       * Not all float32 values can be exactly represented as a float16. We
88501e04c3fSmrg       * round all such intermediate float32 values to the nearest float16; if
88601e04c3fSmrg       * the float32 is exactly between to float16 values, we round to the one
88701e04c3fSmrg       * with an even mantissa. This rounding behavior has several benefits:
88801e04c3fSmrg       *
88901e04c3fSmrg       *   - It has no sign bias.
89001e04c3fSmrg       *
89101e04c3fSmrg       *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
89201e04c3fSmrg       *     GPU ISA.
89301e04c3fSmrg       *
89401e04c3fSmrg       *   - By reproducing the behavior of the GPU (at least on Intel hardware),
89501e04c3fSmrg       *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
89601e04c3fSmrg       *     result in the same value as if the expression were executed on the
89701e04c3fSmrg       *     GPU.
89801e04c3fSmrg       *
89901e04c3fSmrg       * Calculation
90001e04c3fSmrg       * -----------
90101e04c3fSmrg       * Our task is to compute s16, e16, m16 given f32.  Since this function
90201e04c3fSmrg       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
90301e04c3fSmrg       * cases consider.
90401e04c3fSmrg       */
90501e04c3fSmrg
90601e04c3fSmrg      factory.emit(
90701e04c3fSmrg
90801e04c3fSmrg         /* Case 1) f32 is NaN
90901e04c3fSmrg          *
91001e04c3fSmrg          *   The resultant f16 will also be NaN.
91101e04c3fSmrg          */
91201e04c3fSmrg
91301e04c3fSmrg         /* if (e32 == 255 && m32 != 0) { */
91401e04c3fSmrg         if_tree(logic_and(equal(e, constant(0xffu << 23u)),
91501e04c3fSmrg                           logic_not(equal(m, constant(0u)))),
91601e04c3fSmrg
91701e04c3fSmrg            assign(u16, constant(0x7fffu)),
91801e04c3fSmrg
91901e04c3fSmrg         /* Case 2) f32 lies in the range [0, min_norm16).
92001e04c3fSmrg          *
92101e04c3fSmrg          *   The resultant float16 will be either zero, subnormal, or normal.
92201e04c3fSmrg          *
92301e04c3fSmrg          *   Solving
92401e04c3fSmrg          *
92501e04c3fSmrg          *     f32 = min_norm16       (30)
92601e04c3fSmrg          *
92701e04c3fSmrg          *   gives
92801e04c3fSmrg          *
92901e04c3fSmrg          *     e32 = 113 and m32 = 0  (31)
93001e04c3fSmrg          *
93101e04c3fSmrg          *   Therefore this case occurs if and only if
93201e04c3fSmrg          *
93301e04c3fSmrg          *     e32 < 113              (32)
93401e04c3fSmrg          */
93501e04c3fSmrg
93601e04c3fSmrg         /* } else if (e32 < 113) { */
93701e04c3fSmrg         if_tree(less(e, constant(113u << 23u)),
93801e04c3fSmrg
93901e04c3fSmrg            /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
94001e04c3fSmrg            assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
94101e04c3fSmrg                                           constant((float) (1 << 24)))))),
94201e04c3fSmrg
94301e04c3fSmrg         /* Case 3) f32 lies in the range
94401e04c3fSmrg          *         [min_norm16, max_norm16 + max_step16).
94501e04c3fSmrg          *
94601e04c3fSmrg          *   The resultant float16 will be either normal or infinite.
94701e04c3fSmrg          *
94801e04c3fSmrg          *   Solving
94901e04c3fSmrg          *
95001e04c3fSmrg          *     f32 = max_norm16 + max_step16           (40)
95101e04c3fSmrg          *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
95201e04c3fSmrg          *         = 2^16                              (42)
95301e04c3fSmrg          *   gives
95401e04c3fSmrg          *
95501e04c3fSmrg          *     e32 = 143 and m32 = 0                   (43)
95601e04c3fSmrg          *
95701e04c3fSmrg          *   We already solved the boundary condition f32 = min_norm16 above
95801e04c3fSmrg          *   in equation 31. Therefore this case occurs if and only if
95901e04c3fSmrg          *
96001e04c3fSmrg          *     113 <= e32 and e32 < 143
96101e04c3fSmrg          */
96201e04c3fSmrg
96301e04c3fSmrg         /* } else if (e32 < 143) { */
96401e04c3fSmrg         if_tree(less(e, constant(143u << 23u)),
96501e04c3fSmrg
96601e04c3fSmrg            /* The addition below handles the case where the mantissa rounds
96701e04c3fSmrg             * up to 1024 and bumps the exponent.
96801e04c3fSmrg             *
96901e04c3fSmrg             * u16 = ((e - (112u << 23u)) >> 13u)
97001e04c3fSmrg             *     + round_to_even((float(m) / (1u << 13u));
97101e04c3fSmrg             */
97201e04c3fSmrg            assign(u16, add(rshift(sub(e, constant(112u << 23u)),
97301e04c3fSmrg                                   constant(13u)),
97401e04c3fSmrg                            f2u(round_even(
97501e04c3fSmrg                                  div(u2f(m), constant((float) (1 << 13))))))),
97601e04c3fSmrg
97701e04c3fSmrg         /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
97801e04c3fSmrg          *
97901e04c3fSmrg          *   The resultant float16 will be infinite.
98001e04c3fSmrg          *
98101e04c3fSmrg          *   The cases above caught all float32 values in the range
98201e04c3fSmrg          *   [0, max_norm16 + max_step16), so this is the fall-through case.
98301e04c3fSmrg          */
98401e04c3fSmrg
98501e04c3fSmrg         /* } else { */
98601e04c3fSmrg
98701e04c3fSmrg            assign(u16, constant(31u << 10u))))));
98801e04c3fSmrg
98901e04c3fSmrg         /* } */
99001e04c3fSmrg
99101e04c3fSmrg       return deref(u16).val;
99201e04c3fSmrg   }
99301e04c3fSmrg
99401e04c3fSmrg   /**
99501e04c3fSmrg    * \brief Lower a packHalf2x16 expression.
99601e04c3fSmrg    *
99701e04c3fSmrg    * \param vec2_rval is packHalf2x16's input
99801e04c3fSmrg    * \return packHalf2x16's output as a uint rvalue
99901e04c3fSmrg    */
100001e04c3fSmrg   ir_rvalue*
100101e04c3fSmrg   lower_pack_half_2x16(ir_rvalue *vec2_rval)
100201e04c3fSmrg   {
100301e04c3fSmrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
100401e04c3fSmrg       *
100501e04c3fSmrg       *    highp uint packHalf2x16 (mediump vec2 v)
100601e04c3fSmrg       *    ----------------------------------------
100701e04c3fSmrg       *    Returns an unsigned integer obtained by converting the components of
100801e04c3fSmrg       *    a two-component floating-point vector to the 16-bit floating-point
100901e04c3fSmrg       *    representation found in the OpenGL ES Specification, and then packing
101001e04c3fSmrg       *    these two 16-bit integers into a 32-bit unsigned integer.
101101e04c3fSmrg       *
101201e04c3fSmrg       *    The first vector component specifies the 16 least- significant bits
101301e04c3fSmrg       *    of the result; the second component specifies the 16 most-significant
101401e04c3fSmrg       *    bits.
101501e04c3fSmrg       */
101601e04c3fSmrg
101701e04c3fSmrg      assert(vec2_rval->type == glsl_type::vec2_type);
101801e04c3fSmrg
101901e04c3fSmrg      /* vec2 f = VEC2_RVAL; */
102001e04c3fSmrg      ir_variable *f = factory.make_temp(glsl_type::vec2_type,
102101e04c3fSmrg                                         "tmp_pack_half_2x16_f");
102201e04c3fSmrg      factory.emit(assign(f, vec2_rval));
102301e04c3fSmrg
102401e04c3fSmrg      /* uvec2 f32 = bitcast_f2u(f); */
102501e04c3fSmrg      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
102601e04c3fSmrg                                            "tmp_pack_half_2x16_f32");
102701e04c3fSmrg      factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
102801e04c3fSmrg
102901e04c3fSmrg      /* uvec2 f16; */
103001e04c3fSmrg      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
103101e04c3fSmrg                                        "tmp_pack_half_2x16_f16");
103201e04c3fSmrg
103301e04c3fSmrg      /* Get f32's unshifted exponent bits.
103401e04c3fSmrg       *
103501e04c3fSmrg       *   uvec2 e = f32 & 0x7f800000u;
103601e04c3fSmrg       */
103701e04c3fSmrg      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
103801e04c3fSmrg                                          "tmp_pack_half_2x16_e");
103901e04c3fSmrg      factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
104001e04c3fSmrg
104101e04c3fSmrg      /* Get f32's unshifted mantissa bits.
104201e04c3fSmrg       *
104301e04c3fSmrg       *   uvec2 m = f32 & 0x007fffffu;
104401e04c3fSmrg       */
104501e04c3fSmrg      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
104601e04c3fSmrg                                          "tmp_pack_half_2x16_m");
104701e04c3fSmrg      factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
104801e04c3fSmrg
104901e04c3fSmrg      /* Set f16's exponent and mantissa bits.
105001e04c3fSmrg       *
105101e04c3fSmrg       *   f16.x = pack_half_1x16_nosign(e.x, m.x);
105201e04c3fSmrg       *   f16.y = pack_half_1y16_nosign(e.y, m.y);
105301e04c3fSmrg       */
105401e04c3fSmrg      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
105501e04c3fSmrg                                                     swizzle_x(e),
105601e04c3fSmrg                                                     swizzle_x(m)),
105701e04c3fSmrg                           WRITEMASK_X));
105801e04c3fSmrg      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
105901e04c3fSmrg                                                     swizzle_y(e),
106001e04c3fSmrg                                                     swizzle_y(m)),
106101e04c3fSmrg                           WRITEMASK_Y));
106201e04c3fSmrg
106301e04c3fSmrg      /* Set f16's sign bits.
106401e04c3fSmrg       *
106501e04c3fSmrg       *   f16 |= (f32 & (1u << 31u) >> 16u;
106601e04c3fSmrg       */
106701e04c3fSmrg      factory.emit(
106801e04c3fSmrg         assign(f16, bit_or(f16,
106901e04c3fSmrg                            rshift(bit_and(f32, constant(1u << 31u)),
107001e04c3fSmrg                                   constant(16u)))));
107101e04c3fSmrg
107201e04c3fSmrg
107301e04c3fSmrg      /* return (f16.y << 16u) | f16.x; */
107401e04c3fSmrg      ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
107501e04c3fSmrg                                        constant(16u)),
107601e04c3fSmrg                                 swizzle_x(f16));
107701e04c3fSmrg
107801e04c3fSmrg      assert(result->type == glsl_type::uint_type);
107901e04c3fSmrg      return result;
108001e04c3fSmrg   }
108101e04c3fSmrg
108201e04c3fSmrg   /**
108301e04c3fSmrg    * \brief Lower the component-wise calculation of unpackHalf2x16.
108401e04c3fSmrg    *
108501e04c3fSmrg    * Given a uint that encodes a float16 in its lower 16 bits, this function
108601e04c3fSmrg    * returns a uint that encodes a float32 with the same value. The sign bit
108701e04c3fSmrg    * of the float16 is ignored.
108801e04c3fSmrg    *
108901e04c3fSmrg    * \param e_rval is the unshifted exponent bits of a float16
109001e04c3fSmrg    * \param m_rval is the unshifted mantissa bits of a float16
109101e04c3fSmrg    * \return a uint rvalue that encodes a float32
109201e04c3fSmrg    */
109301e04c3fSmrg   ir_rvalue*
109401e04c3fSmrg   unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
109501e04c3fSmrg   {
109601e04c3fSmrg      assert(e_rval->type == glsl_type::uint_type);
109701e04c3fSmrg      assert(m_rval->type == glsl_type::uint_type);
109801e04c3fSmrg
109901e04c3fSmrg      /* uint u32; */
110001e04c3fSmrg      ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
110101e04c3fSmrg                                           "tmp_unpack_half_1x16_u32");
110201e04c3fSmrg
110301e04c3fSmrg      /* uint e = E_RVAL; */
110401e04c3fSmrg      ir_variable *e = factory.make_temp(glsl_type::uint_type,
110501e04c3fSmrg                                          "tmp_unpack_half_1x16_e");
110601e04c3fSmrg      factory.emit(assign(e, e_rval));
110701e04c3fSmrg
110801e04c3fSmrg      /* uint m = M_RVAL; */
110901e04c3fSmrg      ir_variable *m = factory.make_temp(glsl_type::uint_type,
111001e04c3fSmrg                                          "tmp_unpack_half_1x16_m");
111101e04c3fSmrg      factory.emit(assign(m, m_rval));
111201e04c3fSmrg
111301e04c3fSmrg      /* Preliminaries
111401e04c3fSmrg       * -------------
111501e04c3fSmrg       *
111601e04c3fSmrg       * For a float16, the bit layout is:
111701e04c3fSmrg       *
111801e04c3fSmrg       *   sign:     15
111901e04c3fSmrg       *   exponent: 10:14
112001e04c3fSmrg       *   mantissa: 0:9
112101e04c3fSmrg       *
112201e04c3fSmrg       * Let f16 be a float16 value. The sign, exponent, and mantissa
112301e04c3fSmrg       * determine its value thus:
112401e04c3fSmrg       *
112501e04c3fSmrg       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
112601e04c3fSmrg       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
112701e04c3fSmrg       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
112801e04c3fSmrg       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
112901e04c3fSmrg       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
113001e04c3fSmrg       *
113101e04c3fSmrg       * where 0 <= m16 < 2^10.
113201e04c3fSmrg       *
113301e04c3fSmrg       * For a float32, the bit layout is:
113401e04c3fSmrg       *
113501e04c3fSmrg       *   sign: 31
113601e04c3fSmrg       *   exponent: 23:30
113701e04c3fSmrg       *   mantissa: 0:22
113801e04c3fSmrg       *
113901e04c3fSmrg       * Let f32 be a float32 value. The sign, exponent, and mantissa
114001e04c3fSmrg       * determine its value thus:
114101e04c3fSmrg       *
114201e04c3fSmrg       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
114301e04c3fSmrg       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
114401e04c3fSmrg       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
114501e04c3fSmrg       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
114601e04c3fSmrg       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
114701e04c3fSmrg       *
114801e04c3fSmrg       * where 0 <= m32 < 2^23.
114901e04c3fSmrg       *
115001e04c3fSmrg       * Calculation
115101e04c3fSmrg       * -----------
115201e04c3fSmrg       * Our task is to compute s32, e32, m32 given f16.  Since this function
115301e04c3fSmrg       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
115401e04c3fSmrg       * cases consider.
115501e04c3fSmrg       */
115601e04c3fSmrg
115701e04c3fSmrg      factory.emit(
115801e04c3fSmrg
115901e04c3fSmrg         /* Case 1) f16 is zero or subnormal.
116001e04c3fSmrg          *
116101e04c3fSmrg          *   The simplest method of calcuating f32 in this case is
116201e04c3fSmrg          *
116301e04c3fSmrg          *     f32 = f16                       (20)
116401e04c3fSmrg          *         = 2^(-14) * (m16 / 2^10)    (21)
116501e04c3fSmrg          *         = m16 / 2^(-24)             (22)
116601e04c3fSmrg          */
116701e04c3fSmrg
116801e04c3fSmrg         /* if (e16 == 0) { */
116901e04c3fSmrg         if_tree(equal(e, constant(0u)),
117001e04c3fSmrg
117101e04c3fSmrg            /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
117201e04c3fSmrg            assign(u32, expr(ir_unop_bitcast_f2u,
117301e04c3fSmrg                                div(u2f(m), constant((float)(1 << 24))))),
117401e04c3fSmrg
117501e04c3fSmrg         /* Case 2) f16 is normal.
117601e04c3fSmrg          *
117701e04c3fSmrg          *   The equation
117801e04c3fSmrg          *
117901e04c3fSmrg          *     f32 = f16                              (30)
118001e04c3fSmrg          *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
118101e04c3fSmrg          *       2^(e16 - 15) * (1 + m16 / 2^10)
118201e04c3fSmrg          *
118301e04c3fSmrg          *   can be decomposed into two
118401e04c3fSmrg          *
118501e04c3fSmrg          *     2^(e32 - 127) = 2^(e16 - 15)           (32)
118601e04c3fSmrg          *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
118701e04c3fSmrg          *
118801e04c3fSmrg          *   which solve to
118901e04c3fSmrg          *
119001e04c3fSmrg          *     e32 = e16 + 112                        (34)
119101e04c3fSmrg          *     m32 = m16 * 2^13                       (35)
119201e04c3fSmrg          */
119301e04c3fSmrg
119401e04c3fSmrg         /* } else if (e16 < 31)) { */
119501e04c3fSmrg         if_tree(less(e, constant(31u << 10u)),
119601e04c3fSmrg
119701e04c3fSmrg              /* u32 = ((e + (112 << 10)) | m) << 13;
119801e04c3fSmrg               */
119901e04c3fSmrg              assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
120001e04c3fSmrg                                 constant(13u))),
120101e04c3fSmrg
120201e04c3fSmrg
120301e04c3fSmrg         /* Case 3) f16 is infinite. */
120401e04c3fSmrg         if_tree(equal(m, constant(0u)),
120501e04c3fSmrg
120601e04c3fSmrg                 assign(u32, constant(255u << 23u)),
120701e04c3fSmrg
120801e04c3fSmrg         /* Case 4) f16 is NaN. */
120901e04c3fSmrg         /* } else { */
121001e04c3fSmrg
121101e04c3fSmrg            assign(u32, constant(0x7fffffffu))))));
121201e04c3fSmrg
121301e04c3fSmrg         /* } */
121401e04c3fSmrg
121501e04c3fSmrg      return deref(u32).val;
121601e04c3fSmrg   }
121701e04c3fSmrg
121801e04c3fSmrg   /**
121901e04c3fSmrg    * \brief Lower an unpackHalf2x16 expression.
122001e04c3fSmrg    *
122101e04c3fSmrg    * \param uint_rval is unpackHalf2x16's input
122201e04c3fSmrg    * \return unpackHalf2x16's output as a vec2 rvalue
122301e04c3fSmrg    */
122401e04c3fSmrg   ir_rvalue*
122501e04c3fSmrg   lower_unpack_half_2x16(ir_rvalue *uint_rval)
122601e04c3fSmrg   {
122701e04c3fSmrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
122801e04c3fSmrg       *
122901e04c3fSmrg       *    mediump vec2 unpackHalf2x16 (highp uint v)
123001e04c3fSmrg       *    ------------------------------------------
123101e04c3fSmrg       *    Returns a two-component floating-point vector with components
123201e04c3fSmrg       *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
123301e04c3fSmrg       *    values, interpreting those values as 16-bit floating-point numbers
123401e04c3fSmrg       *    according to the OpenGL ES Specification, and converting them to
123501e04c3fSmrg       *    32-bit floating-point values.
123601e04c3fSmrg       *
123701e04c3fSmrg       *    The first component of the vector is obtained from the
123801e04c3fSmrg       *    16 least-significant bits of v; the second component is obtained
123901e04c3fSmrg       *    from the 16 most-significant bits of v.
124001e04c3fSmrg       */
124101e04c3fSmrg      assert(uint_rval->type == glsl_type::uint_type);
124201e04c3fSmrg
124301e04c3fSmrg      /* uint u = RVALUE;
124401e04c3fSmrg       * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
124501e04c3fSmrg       */
124601e04c3fSmrg      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
124701e04c3fSmrg                                            "tmp_unpack_half_2x16_f16");
124801e04c3fSmrg      factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
124901e04c3fSmrg
125001e04c3fSmrg      /* uvec2 f32; */
125101e04c3fSmrg      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
125201e04c3fSmrg                                            "tmp_unpack_half_2x16_f32");
125301e04c3fSmrg
125401e04c3fSmrg      /* Get f16's unshifted exponent bits.
125501e04c3fSmrg       *
125601e04c3fSmrg       *    uvec2 e = f16 & 0x7c00u;
125701e04c3fSmrg       */
125801e04c3fSmrg      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
125901e04c3fSmrg                                          "tmp_unpack_half_2x16_e");
126001e04c3fSmrg      factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
126101e04c3fSmrg
126201e04c3fSmrg      /* Get f16's unshifted mantissa bits.
126301e04c3fSmrg       *
126401e04c3fSmrg       *    uvec2 m = f16 & 0x03ffu;
126501e04c3fSmrg       */
126601e04c3fSmrg      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
126701e04c3fSmrg                                          "tmp_unpack_half_2x16_m");
126801e04c3fSmrg      factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
126901e04c3fSmrg
127001e04c3fSmrg      /* Set f32's exponent and mantissa bits.
127101e04c3fSmrg       *
127201e04c3fSmrg       *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
127301e04c3fSmrg       *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
127401e04c3fSmrg       */
127501e04c3fSmrg      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
127601e04c3fSmrg                                                       swizzle_x(m)),
127701e04c3fSmrg                           WRITEMASK_X));
127801e04c3fSmrg      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
127901e04c3fSmrg                                                       swizzle_y(m)),
128001e04c3fSmrg                           WRITEMASK_Y));
128101e04c3fSmrg
128201e04c3fSmrg      /* Set f32's sign bit.
128301e04c3fSmrg       *
128401e04c3fSmrg       *    f32 |= (f16 & 0x8000u) << 16u;
128501e04c3fSmrg       */
128601e04c3fSmrg      factory.emit(assign(f32, bit_or(f32,
128701e04c3fSmrg                                       lshift(bit_and(f16,
128801e04c3fSmrg                                                      constant(0x8000u)),
128901e04c3fSmrg                                              constant(16u)))));
129001e04c3fSmrg
129101e04c3fSmrg      /* return bitcast_u2f(f32); */
129201e04c3fSmrg      ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
129301e04c3fSmrg      assert(result->type == glsl_type::vec2_type);
129401e04c3fSmrg      return result;
129501e04c3fSmrg   }
129601e04c3fSmrg};
129701e04c3fSmrg
129801e04c3fSmrg} // namespace anonymous
129901e04c3fSmrg
130001e04c3fSmrg/**
130101e04c3fSmrg * \brief Lower the builtin packing functions.
130201e04c3fSmrg *
130301e04c3fSmrg * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
130401e04c3fSmrg */
130501e04c3fSmrgbool
130601e04c3fSmrglower_packing_builtins(exec_list *instructions, int op_mask)
130701e04c3fSmrg{
130801e04c3fSmrg   lower_packing_builtins_visitor v(op_mask);
130901e04c3fSmrg   visit_list_elements(&v, instructions, true);
131001e04c3fSmrg   return v.get_progress();
131101e04c3fSmrg}
1312