1b8e80941Smrg/*
2b8e80941Smrg * Copyright © 2012 Intel Corporation
3b8e80941Smrg *
4b8e80941Smrg * Permission is hereby granted, free of charge, to any person obtaining a
5b8e80941Smrg * copy of this software and associated documentation files (the "Software"),
6b8e80941Smrg * to deal in the Software without restriction, including without limitation
7b8e80941Smrg * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8b8e80941Smrg * and/or sell copies of the Software, and to permit persons to whom the
9b8e80941Smrg * Software is furnished to do so, subject to the following conditions:
10b8e80941Smrg *
11b8e80941Smrg * The above copyright notice and this permission notice (including the next
12b8e80941Smrg * paragraph) shall be included in all copies or substantial portions of the
13b8e80941Smrg * Software.
14b8e80941Smrg *
15b8e80941Smrg * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16b8e80941Smrg * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17b8e80941Smrg * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18b8e80941Smrg * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19b8e80941Smrg * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20b8e80941Smrg * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
21b8e80941Smrg * DEALINGS IN THE SOFTWARE.
22b8e80941Smrg */
23b8e80941Smrg
24b8e80941Smrg#include "ir.h"
25b8e80941Smrg#include "ir_builder.h"
26b8e80941Smrg#include "ir_optimization.h"
27b8e80941Smrg#include "ir_rvalue_visitor.h"
28b8e80941Smrg
29b8e80941Smrgnamespace {
30b8e80941Smrg
31b8e80941Smrgusing namespace ir_builder;
32b8e80941Smrg
/**
 * A visitor that lowers built-in floating-point pack/unpack expressions
 * such as packSnorm2x16.
 */
37b8e80941Smrgclass lower_packing_builtins_visitor : public ir_rvalue_visitor {
38b8e80941Smrgpublic:
39b8e80941Smrg   /**
40b8e80941Smrg    * \param op_mask is a bitmask of `enum lower_packing_builtins_op`
41b8e80941Smrg    */
42b8e80941Smrg   explicit lower_packing_builtins_visitor(int op_mask)
43b8e80941Smrg      : op_mask(op_mask),
44b8e80941Smrg        progress(false)
45b8e80941Smrg   {
46b8e80941Smrg      factory.instructions = &factory_instructions;
47b8e80941Smrg   }
48b8e80941Smrg
   virtual ~lower_packing_builtins_visitor()
   {
      /* Every setup_factory() must have been paired with a
       * teardown_factory(); leftover instructions here would be lost.
       */
      assert(factory_instructions.is_empty());
   }
53b8e80941Smrg
54b8e80941Smrg   bool get_progress() { return progress; }
55b8e80941Smrg
56b8e80941Smrg   void handle_rvalue(ir_rvalue **rvalue)
57b8e80941Smrg   {
58b8e80941Smrg      if (!*rvalue)
59b8e80941Smrg	 return;
60b8e80941Smrg
61b8e80941Smrg      ir_expression *expr = (*rvalue)->as_expression();
62b8e80941Smrg      if (!expr)
63b8e80941Smrg	 return;
64b8e80941Smrg
65b8e80941Smrg      enum lower_packing_builtins_op lowering_op =
66b8e80941Smrg         choose_lowering_op(expr->operation);
67b8e80941Smrg
68b8e80941Smrg      if (lowering_op == LOWER_PACK_UNPACK_NONE)
69b8e80941Smrg         return;
70b8e80941Smrg
71b8e80941Smrg      setup_factory(ralloc_parent(expr));
72b8e80941Smrg
73b8e80941Smrg      ir_rvalue *op0 = expr->operands[0];
74b8e80941Smrg      ralloc_steal(factory.mem_ctx, op0);
75b8e80941Smrg
76b8e80941Smrg      switch (lowering_op) {
77b8e80941Smrg      case LOWER_PACK_SNORM_2x16:
78b8e80941Smrg         *rvalue = lower_pack_snorm_2x16(op0);
79b8e80941Smrg         break;
80b8e80941Smrg      case LOWER_PACK_SNORM_4x8:
81b8e80941Smrg         *rvalue = lower_pack_snorm_4x8(op0);
82b8e80941Smrg         break;
83b8e80941Smrg      case LOWER_PACK_UNORM_2x16:
84b8e80941Smrg         *rvalue = lower_pack_unorm_2x16(op0);
85b8e80941Smrg         break;
86b8e80941Smrg      case LOWER_PACK_UNORM_4x8:
87b8e80941Smrg         *rvalue = lower_pack_unorm_4x8(op0);
88b8e80941Smrg         break;
89b8e80941Smrg      case LOWER_PACK_HALF_2x16:
90b8e80941Smrg         *rvalue = lower_pack_half_2x16(op0);
91b8e80941Smrg         break;
92b8e80941Smrg      case LOWER_UNPACK_SNORM_2x16:
93b8e80941Smrg         *rvalue = lower_unpack_snorm_2x16(op0);
94b8e80941Smrg         break;
95b8e80941Smrg      case LOWER_UNPACK_SNORM_4x8:
96b8e80941Smrg         *rvalue = lower_unpack_snorm_4x8(op0);
97b8e80941Smrg         break;
98b8e80941Smrg      case LOWER_UNPACK_UNORM_2x16:
99b8e80941Smrg         *rvalue = lower_unpack_unorm_2x16(op0);
100b8e80941Smrg         break;
101b8e80941Smrg      case LOWER_UNPACK_UNORM_4x8:
102b8e80941Smrg         *rvalue = lower_unpack_unorm_4x8(op0);
103b8e80941Smrg         break;
104b8e80941Smrg      case LOWER_UNPACK_HALF_2x16:
105b8e80941Smrg         *rvalue = lower_unpack_half_2x16(op0);
106b8e80941Smrg         break;
107b8e80941Smrg      case LOWER_PACK_UNPACK_NONE:
108b8e80941Smrg      case LOWER_PACK_USE_BFI:
109b8e80941Smrg      case LOWER_PACK_USE_BFE:
110b8e80941Smrg         assert(!"not reached");
111b8e80941Smrg         break;
112b8e80941Smrg      }
113b8e80941Smrg
114b8e80941Smrg      teardown_factory();
115b8e80941Smrg      progress = true;
116b8e80941Smrg   }
117b8e80941Smrg
118b8e80941Smrgprivate:
119b8e80941Smrg   const int op_mask;
120b8e80941Smrg   bool progress;
121b8e80941Smrg   ir_factory factory;
122b8e80941Smrg   exec_list factory_instructions;
123b8e80941Smrg
124b8e80941Smrg   /**
125b8e80941Smrg    * Determine the needed lowering operation by filtering \a expr_op
126b8e80941Smrg    * through \ref op_mask.
127b8e80941Smrg    */
128b8e80941Smrg   enum lower_packing_builtins_op
129b8e80941Smrg   choose_lowering_op(ir_expression_operation expr_op)
130b8e80941Smrg   {
131b8e80941Smrg      /* C++ regards int and enum as fundamentally different types.
132b8e80941Smrg       * So, we can't simply return from each case; we must cast the return
133b8e80941Smrg       * value.
134b8e80941Smrg       */
135b8e80941Smrg      int result;
136b8e80941Smrg
137b8e80941Smrg      switch (expr_op) {
138b8e80941Smrg      case ir_unop_pack_snorm_2x16:
139b8e80941Smrg         result = op_mask & LOWER_PACK_SNORM_2x16;
140b8e80941Smrg         break;
141b8e80941Smrg      case ir_unop_pack_snorm_4x8:
142b8e80941Smrg         result = op_mask & LOWER_PACK_SNORM_4x8;
143b8e80941Smrg         break;
144b8e80941Smrg      case ir_unop_pack_unorm_2x16:
145b8e80941Smrg         result = op_mask & LOWER_PACK_UNORM_2x16;
146b8e80941Smrg         break;
147b8e80941Smrg      case ir_unop_pack_unorm_4x8:
148b8e80941Smrg         result = op_mask & LOWER_PACK_UNORM_4x8;
149b8e80941Smrg         break;
150b8e80941Smrg      case ir_unop_pack_half_2x16:
151b8e80941Smrg         result = op_mask & LOWER_PACK_HALF_2x16;
152b8e80941Smrg         break;
153b8e80941Smrg      case ir_unop_unpack_snorm_2x16:
154b8e80941Smrg         result = op_mask & LOWER_UNPACK_SNORM_2x16;
155b8e80941Smrg         break;
156b8e80941Smrg      case ir_unop_unpack_snorm_4x8:
157b8e80941Smrg         result = op_mask & LOWER_UNPACK_SNORM_4x8;
158b8e80941Smrg         break;
159b8e80941Smrg      case ir_unop_unpack_unorm_2x16:
160b8e80941Smrg         result = op_mask & LOWER_UNPACK_UNORM_2x16;
161b8e80941Smrg         break;
162b8e80941Smrg      case ir_unop_unpack_unorm_4x8:
163b8e80941Smrg         result = op_mask & LOWER_UNPACK_UNORM_4x8;
164b8e80941Smrg         break;
165b8e80941Smrg      case ir_unop_unpack_half_2x16:
166b8e80941Smrg         result = op_mask & LOWER_UNPACK_HALF_2x16;
167b8e80941Smrg         break;
168b8e80941Smrg      default:
169b8e80941Smrg         result = LOWER_PACK_UNPACK_NONE;
170b8e80941Smrg         break;
171b8e80941Smrg      }
172b8e80941Smrg
173b8e80941Smrg      return static_cast<enum lower_packing_builtins_op>(result);
174b8e80941Smrg   }
175b8e80941Smrg
   /* Point the factory at a memory context for one lowering operation.
    * setup/teardown must be strictly paired: a non-NULL mem_ctx or a
    * non-empty instruction list here means a teardown_factory() was missed.
    */
   void
   setup_factory(void *mem_ctx)
   {
      assert(factory.mem_ctx == NULL);
      assert(factory.instructions->is_empty());

      factory.mem_ctx = mem_ctx;
   }
184b8e80941Smrg
   /* Flush everything the factory emitted into the IR stream, immediately
    * before the instruction currently being visited, then reset the factory
    * for the next lowering operation.
    */
   void
   teardown_factory()
   {
      base_ir->insert_before(factory.instructions);
      assert(factory.instructions->is_empty());
      factory.mem_ctx = NULL;
   }
192b8e80941Smrg
   /** Convenience wrapper: build an ir_constant through the factory. */
   template <typename T>
   ir_constant*
   constant(T x)
   {
      return factory.constant(x);
   }
199b8e80941Smrg
200b8e80941Smrg   /**
201b8e80941Smrg    * \brief Pack two uint16's into a single uint32.
202b8e80941Smrg    *
203b8e80941Smrg    * Interpret the given uvec2 as a uint16 pair. Pack the pair into a uint32
204b8e80941Smrg    * where the least significant bits specify the first element of the pair.
205b8e80941Smrg    * Return the uint32.
206b8e80941Smrg    */
207b8e80941Smrg   ir_rvalue*
208b8e80941Smrg   pack_uvec2_to_uint(ir_rvalue *uvec2_rval)
209b8e80941Smrg   {
210b8e80941Smrg      assert(uvec2_rval->type == glsl_type::uvec2_type);
211b8e80941Smrg
212b8e80941Smrg      /* uvec2 u = UVEC2_RVAL; */
213b8e80941Smrg      ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
214b8e80941Smrg                                         "tmp_pack_uvec2_to_uint");
215b8e80941Smrg      factory.emit(assign(u, uvec2_rval));
216b8e80941Smrg
217b8e80941Smrg      if (op_mask & LOWER_PACK_USE_BFI) {
218b8e80941Smrg         return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
219b8e80941Smrg                                swizzle_y(u),
220b8e80941Smrg                                constant(16u),
221b8e80941Smrg                                constant(16u));
222b8e80941Smrg      }
223b8e80941Smrg
224b8e80941Smrg      /* return (u.y << 16) | (u.x & 0xffff); */
225b8e80941Smrg      return bit_or(lshift(swizzle_y(u), constant(16u)),
226b8e80941Smrg                    bit_and(swizzle_x(u), constant(0xffffu)));
227b8e80941Smrg   }
228b8e80941Smrg
229b8e80941Smrg   /**
230b8e80941Smrg    * \brief Pack four uint8's into a single uint32.
231b8e80941Smrg    *
232b8e80941Smrg    * Interpret the given uvec4 as a uint32 4-typle. Pack the 4-tuple into a
233b8e80941Smrg    * uint32 where the least significant bits specify the first element of the
234b8e80941Smrg    * 4-tuple. Return the uint32.
235b8e80941Smrg    */
236b8e80941Smrg   ir_rvalue*
237b8e80941Smrg   pack_uvec4_to_uint(ir_rvalue *uvec4_rval)
238b8e80941Smrg   {
239b8e80941Smrg      assert(uvec4_rval->type == glsl_type::uvec4_type);
240b8e80941Smrg
241b8e80941Smrg      ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
242b8e80941Smrg                                         "tmp_pack_uvec4_to_uint");
243b8e80941Smrg
244b8e80941Smrg      if (op_mask & LOWER_PACK_USE_BFI) {
245b8e80941Smrg         /* uvec4 u = UVEC4_RVAL; */
246b8e80941Smrg         factory.emit(assign(u, uvec4_rval));
247b8e80941Smrg
248b8e80941Smrg         return bitfield_insert(bitfield_insert(
249b8e80941Smrg                                   bitfield_insert(
250b8e80941Smrg                                      bit_and(swizzle_x(u), constant(0xffu)),
251b8e80941Smrg                                      swizzle_y(u), constant(8u), constant(8u)),
252b8e80941Smrg                                   swizzle_z(u), constant(16u), constant(8u)),
253b8e80941Smrg                                swizzle_w(u), constant(24u), constant(8u));
254b8e80941Smrg      }
255b8e80941Smrg
256b8e80941Smrg      /* uvec4 u = UVEC4_RVAL & 0xff */
257b8e80941Smrg      factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
258b8e80941Smrg
259b8e80941Smrg      /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
260b8e80941Smrg      return bit_or(bit_or(lshift(swizzle_w(u), constant(24u)),
261b8e80941Smrg                           lshift(swizzle_z(u), constant(16u))),
262b8e80941Smrg                    bit_or(lshift(swizzle_y(u), constant(8u)),
263b8e80941Smrg                           swizzle_x(u)));
264b8e80941Smrg   }
265b8e80941Smrg
266b8e80941Smrg   /**
267b8e80941Smrg    * \brief Unpack a uint32 into two uint16's.
268b8e80941Smrg    *
269b8e80941Smrg    * Interpret the given uint32 as a uint16 pair where the uint32's least
270b8e80941Smrg    * significant bits specify the pair's first element. Return the uint16
271b8e80941Smrg    * pair as a uvec2.
272b8e80941Smrg    */
273b8e80941Smrg   ir_rvalue*
274b8e80941Smrg   unpack_uint_to_uvec2(ir_rvalue *uint_rval)
275b8e80941Smrg   {
276b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
277b8e80941Smrg
278b8e80941Smrg      /* uint u = UINT_RVAL; */
279b8e80941Smrg      ir_variable *u = factory.make_temp(glsl_type::uint_type,
280b8e80941Smrg                                          "tmp_unpack_uint_to_uvec2_u");
281b8e80941Smrg      factory.emit(assign(u, uint_rval));
282b8e80941Smrg
283b8e80941Smrg      /* uvec2 u2; */
284b8e80941Smrg      ir_variable *u2 = factory.make_temp(glsl_type::uvec2_type,
285b8e80941Smrg                                           "tmp_unpack_uint_to_uvec2_u2");
286b8e80941Smrg
287b8e80941Smrg      /* u2.x = u & 0xffffu; */
288b8e80941Smrg      factory.emit(assign(u2, bit_and(u, constant(0xffffu)), WRITEMASK_X));
289b8e80941Smrg
290b8e80941Smrg      /* u2.y = u >> 16u; */
291b8e80941Smrg      factory.emit(assign(u2, rshift(u, constant(16u)), WRITEMASK_Y));
292b8e80941Smrg
293b8e80941Smrg      return deref(u2).val;
294b8e80941Smrg   }
295b8e80941Smrg
296b8e80941Smrg   /**
297b8e80941Smrg    * \brief Unpack a uint32 into two int16's.
298b8e80941Smrg    *
299b8e80941Smrg    * Specifically each 16-bit value is sign-extended to the full width of an
300b8e80941Smrg    * int32 on return.
301b8e80941Smrg    */
302b8e80941Smrg   ir_rvalue *
303b8e80941Smrg   unpack_uint_to_ivec2(ir_rvalue *uint_rval)
304b8e80941Smrg   {
305b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
306b8e80941Smrg
307b8e80941Smrg      if (!(op_mask & LOWER_PACK_USE_BFE)) {
308b8e80941Smrg         return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
309b8e80941Smrg                              constant(16u)),
310b8e80941Smrg                       constant(16u));
311b8e80941Smrg      }
312b8e80941Smrg
313b8e80941Smrg      ir_variable *i = factory.make_temp(glsl_type::int_type,
314b8e80941Smrg                                         "tmp_unpack_uint_to_ivec2_i");
315b8e80941Smrg      factory.emit(assign(i, u2i(uint_rval)));
316b8e80941Smrg
317b8e80941Smrg      /* ivec2 i2; */
318b8e80941Smrg      ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
319b8e80941Smrg                                          "tmp_unpack_uint_to_ivec2_i2");
320b8e80941Smrg
321b8e80941Smrg      factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
322b8e80941Smrg                          WRITEMASK_X));
323b8e80941Smrg      factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
324b8e80941Smrg                          WRITEMASK_Y));
325b8e80941Smrg
326b8e80941Smrg      return deref(i2).val;
327b8e80941Smrg   }
328b8e80941Smrg
329b8e80941Smrg   /**
330b8e80941Smrg    * \brief Unpack a uint32 into four uint8's.
331b8e80941Smrg    *
332b8e80941Smrg    * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
333b8e80941Smrg    * significant bits specify the 4-tuple's first element. Return the uint8
334b8e80941Smrg    * 4-tuple as a uvec4.
335b8e80941Smrg    */
336b8e80941Smrg   ir_rvalue*
337b8e80941Smrg   unpack_uint_to_uvec4(ir_rvalue *uint_rval)
338b8e80941Smrg   {
339b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
340b8e80941Smrg
341b8e80941Smrg      /* uint u = UINT_RVAL; */
342b8e80941Smrg      ir_variable *u = factory.make_temp(glsl_type::uint_type,
343b8e80941Smrg                                          "tmp_unpack_uint_to_uvec4_u");
344b8e80941Smrg      factory.emit(assign(u, uint_rval));
345b8e80941Smrg
346b8e80941Smrg      /* uvec4 u4; */
347b8e80941Smrg      ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
348b8e80941Smrg                                           "tmp_unpack_uint_to_uvec4_u4");
349b8e80941Smrg
350b8e80941Smrg      /* u4.x = u & 0xffu; */
351b8e80941Smrg      factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
352b8e80941Smrg
353b8e80941Smrg      if (op_mask & LOWER_PACK_USE_BFE) {
354b8e80941Smrg         /* u4.y = bitfield_extract(u, 8, 8); */
355b8e80941Smrg         factory.emit(assign(u4, bitfield_extract(u, constant(8u), constant(8u)),
356b8e80941Smrg                             WRITEMASK_Y));
357b8e80941Smrg
358b8e80941Smrg         /* u4.z = bitfield_extract(u, 16, 8); */
359b8e80941Smrg         factory.emit(assign(u4, bitfield_extract(u, constant(16u), constant(8u)),
360b8e80941Smrg                             WRITEMASK_Z));
361b8e80941Smrg      } else {
362b8e80941Smrg         /* u4.y = (u >> 8u) & 0xffu; */
363b8e80941Smrg         factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
364b8e80941Smrg                                         constant(0xffu)), WRITEMASK_Y));
365b8e80941Smrg
366b8e80941Smrg         /* u4.z = (u >> 16u) & 0xffu; */
367b8e80941Smrg         factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
368b8e80941Smrg                                         constant(0xffu)), WRITEMASK_Z));
369b8e80941Smrg      }
370b8e80941Smrg
371b8e80941Smrg      /* u4.w = (u >> 24u) */
372b8e80941Smrg      factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
373b8e80941Smrg
374b8e80941Smrg      return deref(u4).val;
375b8e80941Smrg   }
376b8e80941Smrg
377b8e80941Smrg   /**
378b8e80941Smrg    * \brief Unpack a uint32 into four int8's.
379b8e80941Smrg    *
380b8e80941Smrg    * Specifically each 8-bit value is sign-extended to the full width of an
381b8e80941Smrg    * int32 on return.
382b8e80941Smrg    */
383b8e80941Smrg   ir_rvalue *
384b8e80941Smrg   unpack_uint_to_ivec4(ir_rvalue *uint_rval)
385b8e80941Smrg   {
386b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
387b8e80941Smrg
388b8e80941Smrg      if (!(op_mask & LOWER_PACK_USE_BFE)) {
389b8e80941Smrg         return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
390b8e80941Smrg                              constant(24u)),
391b8e80941Smrg                       constant(24u));
392b8e80941Smrg      }
393b8e80941Smrg
394b8e80941Smrg      ir_variable *i = factory.make_temp(glsl_type::int_type,
395b8e80941Smrg                                         "tmp_unpack_uint_to_ivec4_i");
396b8e80941Smrg      factory.emit(assign(i, u2i(uint_rval)));
397b8e80941Smrg
398b8e80941Smrg      /* ivec4 i4; */
399b8e80941Smrg      ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
400b8e80941Smrg                                          "tmp_unpack_uint_to_ivec4_i4");
401b8e80941Smrg
402b8e80941Smrg      factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
403b8e80941Smrg                          WRITEMASK_X));
404b8e80941Smrg      factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
405b8e80941Smrg                          WRITEMASK_Y));
406b8e80941Smrg      factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
407b8e80941Smrg                          WRITEMASK_Z));
408b8e80941Smrg      factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
409b8e80941Smrg                          WRITEMASK_W));
410b8e80941Smrg
411b8e80941Smrg      return deref(i4).val;
412b8e80941Smrg   }
413b8e80941Smrg
414b8e80941Smrg   /**
415b8e80941Smrg    * \brief Lower a packSnorm2x16 expression.
416b8e80941Smrg    *
417b8e80941Smrg    * \param vec2_rval is packSnorm2x16's input
418b8e80941Smrg    * \return packSnorm2x16's output as a uint rvalue
419b8e80941Smrg    */
420b8e80941Smrg   ir_rvalue*
421b8e80941Smrg   lower_pack_snorm_2x16(ir_rvalue *vec2_rval)
422b8e80941Smrg   {
423b8e80941Smrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
424b8e80941Smrg       *
425b8e80941Smrg       *    highp uint packSnorm2x16(vec2 v)
426b8e80941Smrg       *    --------------------------------
427b8e80941Smrg       *    First, converts each component of the normalized floating-point value
428b8e80941Smrg       *    v into 16-bit integer values. Then, the results are packed into the
429b8e80941Smrg       *    returned 32-bit unsigned integer.
430b8e80941Smrg       *
431b8e80941Smrg       *    The conversion for component c of v to fixed point is done as
432b8e80941Smrg       *    follows:
433b8e80941Smrg       *
434b8e80941Smrg       *       packSnorm2x16: round(clamp(c, -1, +1) * 32767.0)
435b8e80941Smrg       *
436b8e80941Smrg       *    The first component of the vector will be written to the least
437b8e80941Smrg       *    significant bits of the output; the last component will be written to
438b8e80941Smrg       *    the most significant bits.
439b8e80941Smrg       *
440b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
441b8e80941Smrg       *
442b8e80941Smrg       *     return pack_uvec2_to_uint(
443b8e80941Smrg       *         uvec2(ivec2(
444b8e80941Smrg       *           round(clamp(VEC2_RVALUE, -1.0f, 1.0f) * 32767.0f))));
445b8e80941Smrg       *
446b8e80941Smrg       * It is necessary to first convert the vec2 to ivec2 rather than directly
447b8e80941Smrg       * converting vec2 to uvec2 because the latter conversion is undefined.
448b8e80941Smrg       * From page 56 (62 of pdf) of the GLSL ES 3.00 spec: "It is undefined to
449b8e80941Smrg       * convert a negative floating point value to an uint".
450b8e80941Smrg       */
451b8e80941Smrg      assert(vec2_rval->type == glsl_type::vec2_type);
452b8e80941Smrg
453b8e80941Smrg      ir_rvalue *result = pack_uvec2_to_uint(
454b8e80941Smrg            i2u(f2i(round_even(mul(clamp(vec2_rval,
455b8e80941Smrg                                         constant(-1.0f),
456b8e80941Smrg                                         constant(1.0f)),
457b8e80941Smrg                                   constant(32767.0f))))));
458b8e80941Smrg
459b8e80941Smrg      assert(result->type == glsl_type::uint_type);
460b8e80941Smrg      return result;
461b8e80941Smrg   }
462b8e80941Smrg
463b8e80941Smrg   /**
464b8e80941Smrg    * \brief Lower a packSnorm4x8 expression.
465b8e80941Smrg    *
466b8e80941Smrg    * \param vec4_rval is packSnorm4x8's input
467b8e80941Smrg    * \return packSnorm4x8's output as a uint rvalue
468b8e80941Smrg    */
469b8e80941Smrg   ir_rvalue*
470b8e80941Smrg   lower_pack_snorm_4x8(ir_rvalue *vec4_rval)
471b8e80941Smrg   {
472b8e80941Smrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
473b8e80941Smrg       *
474b8e80941Smrg       *    highp uint packSnorm4x8(vec4 v)
475b8e80941Smrg       *    -------------------------------
476b8e80941Smrg       *    First, converts each component of the normalized floating-point value
477b8e80941Smrg       *    v into 8-bit integer values. Then, the results are packed into the
478b8e80941Smrg       *    returned 32-bit unsigned integer.
479b8e80941Smrg       *
480b8e80941Smrg       *    The conversion for component c of v to fixed point is done as
481b8e80941Smrg       *    follows:
482b8e80941Smrg       *
483b8e80941Smrg       *       packSnorm4x8: round(clamp(c, -1, +1) * 127.0)
484b8e80941Smrg       *
485b8e80941Smrg       *    The first component of the vector will be written to the least
486b8e80941Smrg       *    significant bits of the output; the last component will be written to
487b8e80941Smrg       *    the most significant bits.
488b8e80941Smrg       *
489b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
490b8e80941Smrg       *
491b8e80941Smrg       *     return pack_uvec4_to_uint(
492b8e80941Smrg       *         uvec4(ivec4(
493b8e80941Smrg       *           round(clamp(VEC4_RVALUE, -1.0f, 1.0f) * 127.0f))));
494b8e80941Smrg       *
495b8e80941Smrg       * It is necessary to first convert the vec4 to ivec4 rather than directly
496b8e80941Smrg       * converting vec4 to uvec4 because the latter conversion is undefined.
497b8e80941Smrg       * From page 87 (93 of pdf) of the GLSL 4.30 spec: "It is undefined to
498b8e80941Smrg       * convert a negative floating point value to an uint".
499b8e80941Smrg       */
500b8e80941Smrg      assert(vec4_rval->type == glsl_type::vec4_type);
501b8e80941Smrg
502b8e80941Smrg      ir_rvalue *result = pack_uvec4_to_uint(
503b8e80941Smrg            i2u(f2i(round_even(mul(clamp(vec4_rval,
504b8e80941Smrg                                         constant(-1.0f),
505b8e80941Smrg                                         constant(1.0f)),
506b8e80941Smrg                                   constant(127.0f))))));
507b8e80941Smrg
508b8e80941Smrg      assert(result->type == glsl_type::uint_type);
509b8e80941Smrg      return result;
510b8e80941Smrg   }
511b8e80941Smrg
512b8e80941Smrg   /**
513b8e80941Smrg    * \brief Lower an unpackSnorm2x16 expression.
514b8e80941Smrg    *
515b8e80941Smrg    * \param uint_rval is unpackSnorm2x16's input
516b8e80941Smrg    * \return unpackSnorm2x16's output as a vec2 rvalue
517b8e80941Smrg    */
518b8e80941Smrg   ir_rvalue*
519b8e80941Smrg   lower_unpack_snorm_2x16(ir_rvalue *uint_rval)
520b8e80941Smrg   {
521b8e80941Smrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
522b8e80941Smrg       *
523b8e80941Smrg       *    highp vec2 unpackSnorm2x16 (highp uint p)
524b8e80941Smrg       *    -----------------------------------------
525b8e80941Smrg       *    First, unpacks a single 32-bit unsigned integer p into a pair of
526b8e80941Smrg       *    16-bit unsigned integers. Then, each component is converted to
527b8e80941Smrg       *    a normalized floating-point value to generate the returned
528b8e80941Smrg       *    two-component vector.
529b8e80941Smrg       *
530b8e80941Smrg       *    The conversion for unpacked fixed-point value f to floating point is
531b8e80941Smrg       *    done as follows:
532b8e80941Smrg       *
533b8e80941Smrg       *       unpackSnorm2x16: clamp(f / 32767.0, -1,+1)
534b8e80941Smrg       *
535b8e80941Smrg       *    The first component of the returned vector will be extracted from the
536b8e80941Smrg       *    least significant bits of the input; the last component will be
537b8e80941Smrg       *    extracted from the most significant bits.
538b8e80941Smrg       *
539b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
540b8e80941Smrg       *
541b8e80941Smrg       *    return clamp(
542b8e80941Smrg       *       ((ivec2(unpack_uint_to_uvec2(UINT_RVALUE)) << 16) >> 16) / 32767.0f,
543b8e80941Smrg       *       -1.0f, 1.0f);
544b8e80941Smrg       *
545b8e80941Smrg       * The above IR may appear unnecessarily complex, but the intermediate
546b8e80941Smrg       * conversion to ivec2 and the bit shifts are necessary to correctly unpack
547b8e80941Smrg       * negative floats.
548b8e80941Smrg       *
549b8e80941Smrg       * To see why, consider packing and then unpacking vec2(-1.0, 0.0).
550b8e80941Smrg       * packSnorm2x16 encodes -1.0 as the int16 0xffff. During unpacking, we
551b8e80941Smrg       * place that int16 into an int32, which results in the *positive* integer
552b8e80941Smrg       * 0x0000ffff.  The int16's sign bit becomes, in the int32, the rather
553b8e80941Smrg       * unimportant bit 16. We must now extend the int16's sign bit into bits
554b8e80941Smrg       * 17-32, which is accomplished by left-shifting then right-shifting.
555b8e80941Smrg       */
556b8e80941Smrg
557b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
558b8e80941Smrg
559b8e80941Smrg      ir_rvalue *result =
560b8e80941Smrg        clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
561b8e80941Smrg                  constant(32767.0f)),
562b8e80941Smrg              constant(-1.0f),
563b8e80941Smrg              constant(1.0f));
564b8e80941Smrg
565b8e80941Smrg      assert(result->type == glsl_type::vec2_type);
566b8e80941Smrg      return result;
567b8e80941Smrg   }
568b8e80941Smrg
569b8e80941Smrg   /**
570b8e80941Smrg    * \brief Lower an unpackSnorm4x8 expression.
571b8e80941Smrg    *
572b8e80941Smrg    * \param uint_rval is unpackSnorm4x8's input
573b8e80941Smrg    * \return unpackSnorm4x8's output as a vec4 rvalue
574b8e80941Smrg    */
575b8e80941Smrg   ir_rvalue*
576b8e80941Smrg   lower_unpack_snorm_4x8(ir_rvalue *uint_rval)
577b8e80941Smrg   {
578b8e80941Smrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
579b8e80941Smrg       *
580b8e80941Smrg       *    highp vec4 unpackSnorm4x8 (highp uint p)
581b8e80941Smrg       *    ----------------------------------------
582b8e80941Smrg       *    First, unpacks a single 32-bit unsigned integer p into four
583b8e80941Smrg       *    8-bit unsigned integers. Then, each component is converted to
584b8e80941Smrg       *    a normalized floating-point value to generate the returned
585b8e80941Smrg       *    four-component vector.
586b8e80941Smrg       *
587b8e80941Smrg       *    The conversion for unpacked fixed-point value f to floating point is
588b8e80941Smrg       *    done as follows:
589b8e80941Smrg       *
590b8e80941Smrg       *       unpackSnorm4x8: clamp(f / 127.0, -1, +1)
591b8e80941Smrg       *
592b8e80941Smrg       *    The first component of the returned vector will be extracted from the
593b8e80941Smrg       *    least significant bits of the input; the last component will be
594b8e80941Smrg       *    extracted from the most significant bits.
595b8e80941Smrg       *
596b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
597b8e80941Smrg       *
598b8e80941Smrg       *    return clamp(
599b8e80941Smrg       *       ((ivec4(unpack_uint_to_uvec4(UINT_RVALUE)) << 24) >> 24) / 127.0f,
600b8e80941Smrg       *       -1.0f, 1.0f);
601b8e80941Smrg       *
602b8e80941Smrg       * The above IR may appear unnecessarily complex, but the intermediate
603b8e80941Smrg       * conversion to ivec4 and the bit shifts are necessary to correctly unpack
604b8e80941Smrg       * negative floats.
605b8e80941Smrg       *
606b8e80941Smrg       * To see why, consider packing and then unpacking vec4(-1.0, 0.0, 0.0,
607b8e80941Smrg       * 0.0). packSnorm4x8 encodes -1.0 as the int8 0xff. During unpacking, we
608b8e80941Smrg       * place that int8 into an int32, which results in the *positive* integer
609b8e80941Smrg       * 0x000000ff.  The int8's sign bit becomes, in the int32, the rather
610b8e80941Smrg       * unimportant bit 8. We must now extend the int8's sign bit into bits
611b8e80941Smrg       * 9-32, which is accomplished by left-shifting then right-shifting.
612b8e80941Smrg       */
613b8e80941Smrg
614b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
615b8e80941Smrg
616b8e80941Smrg      ir_rvalue *result =
617b8e80941Smrg        clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
618b8e80941Smrg                  constant(127.0f)),
619b8e80941Smrg              constant(-1.0f),
620b8e80941Smrg              constant(1.0f));
621b8e80941Smrg
622b8e80941Smrg      assert(result->type == glsl_type::vec4_type);
623b8e80941Smrg      return result;
624b8e80941Smrg   }
625b8e80941Smrg
626b8e80941Smrg   /**
627b8e80941Smrg    * \brief Lower a packUnorm2x16 expression.
628b8e80941Smrg    *
629b8e80941Smrg    * \param vec2_rval is packUnorm2x16's input
630b8e80941Smrg    * \return packUnorm2x16's output as a uint rvalue
631b8e80941Smrg    */
632b8e80941Smrg   ir_rvalue*
633b8e80941Smrg   lower_pack_unorm_2x16(ir_rvalue *vec2_rval)
634b8e80941Smrg   {
635b8e80941Smrg      /* From page 88 (94 of pdf) of the GLSL ES 3.00 spec:
636b8e80941Smrg       *
637b8e80941Smrg       *    highp uint packUnorm2x16 (vec2 v)
638b8e80941Smrg       *    ---------------------------------
639b8e80941Smrg       *    First, converts each component of the normalized floating-point value
640b8e80941Smrg       *    v into 16-bit integer values. Then, the results are packed into the
641b8e80941Smrg       *    returned 32-bit unsigned integer.
642b8e80941Smrg       *
643b8e80941Smrg       *    The conversion for component c of v to fixed point is done as
644b8e80941Smrg       *    follows:
645b8e80941Smrg       *
646b8e80941Smrg       *       packUnorm2x16: round(clamp(c, 0, +1) * 65535.0)
647b8e80941Smrg       *
648b8e80941Smrg       *    The first component of the vector will be written to the least
649b8e80941Smrg       *    significant bits of the output; the last component will be written to
650b8e80941Smrg       *    the most significant bits.
651b8e80941Smrg       *
652b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
653b8e80941Smrg       *
654b8e80941Smrg       *     return pack_uvec2_to_uint(uvec2(
655b8e80941Smrg       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 65535.0f)));
656b8e80941Smrg       *
657b8e80941Smrg       * Here it is safe to directly convert the vec2 to uvec2 because the vec2
658b8e80941Smrg       * has been clamped to a non-negative range.
659b8e80941Smrg       */
660b8e80941Smrg
661b8e80941Smrg      assert(vec2_rval->type == glsl_type::vec2_type);
662b8e80941Smrg
663b8e80941Smrg      ir_rvalue *result = pack_uvec2_to_uint(
664b8e80941Smrg         f2u(round_even(mul(saturate(vec2_rval), constant(65535.0f)))));
665b8e80941Smrg
666b8e80941Smrg      assert(result->type == glsl_type::uint_type);
667b8e80941Smrg      return result;
668b8e80941Smrg   }
669b8e80941Smrg
670b8e80941Smrg   /**
671b8e80941Smrg    * \brief Lower a packUnorm4x8 expression.
672b8e80941Smrg    *
673b8e80941Smrg    * \param vec4_rval is packUnorm4x8's input
674b8e80941Smrg    * \return packUnorm4x8's output as a uint rvalue
675b8e80941Smrg    */
676b8e80941Smrg   ir_rvalue*
677b8e80941Smrg   lower_pack_unorm_4x8(ir_rvalue *vec4_rval)
678b8e80941Smrg   {
679b8e80941Smrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
680b8e80941Smrg       *
681b8e80941Smrg       *    highp uint packUnorm4x8 (vec4 v)
682b8e80941Smrg       *    --------------------------------
683b8e80941Smrg       *    First, converts each component of the normalized floating-point value
684b8e80941Smrg       *    v into 8-bit integer values. Then, the results are packed into the
685b8e80941Smrg       *    returned 32-bit unsigned integer.
686b8e80941Smrg       *
687b8e80941Smrg       *    The conversion for component c of v to fixed point is done as
688b8e80941Smrg       *    follows:
689b8e80941Smrg       *
690b8e80941Smrg       *       packUnorm4x8: round(clamp(c, 0, +1) * 255.0)
691b8e80941Smrg       *
692b8e80941Smrg       *    The first component of the vector will be written to the least
693b8e80941Smrg       *    significant bits of the output; the last component will be written to
694b8e80941Smrg       *    the most significant bits.
695b8e80941Smrg       *
696b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
697b8e80941Smrg       *
698b8e80941Smrg       *     return pack_uvec4_to_uint(uvec4(
699b8e80941Smrg       *                round(clamp(VEC2_RVALUE, 0.0f, 1.0f) * 255.0f)));
700b8e80941Smrg       *
701b8e80941Smrg       * Here it is safe to directly convert the vec4 to uvec4 because the vec4
702b8e80941Smrg       * has been clamped to a non-negative range.
703b8e80941Smrg       */
704b8e80941Smrg
705b8e80941Smrg      assert(vec4_rval->type == glsl_type::vec4_type);
706b8e80941Smrg
707b8e80941Smrg      ir_rvalue *result = pack_uvec4_to_uint(
708b8e80941Smrg         f2u(round_even(mul(saturate(vec4_rval), constant(255.0f)))));
709b8e80941Smrg
710b8e80941Smrg      assert(result->type == glsl_type::uint_type);
711b8e80941Smrg      return result;
712b8e80941Smrg   }
713b8e80941Smrg
714b8e80941Smrg   /**
715b8e80941Smrg    * \brief Lower an unpackUnorm2x16 expression.
716b8e80941Smrg    *
717b8e80941Smrg    * \param uint_rval is unpackUnorm2x16's input
718b8e80941Smrg    * \return unpackUnorm2x16's output as a vec2 rvalue
719b8e80941Smrg    */
720b8e80941Smrg   ir_rvalue*
721b8e80941Smrg   lower_unpack_unorm_2x16(ir_rvalue *uint_rval)
722b8e80941Smrg   {
723b8e80941Smrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
724b8e80941Smrg       *
725b8e80941Smrg       *    highp vec2 unpackUnorm2x16 (highp uint p)
726b8e80941Smrg       *    -----------------------------------------
727b8e80941Smrg       *    First, unpacks a single 32-bit unsigned integer p into a pair of
728b8e80941Smrg       *    16-bit unsigned integers. Then, each component is converted to
729b8e80941Smrg       *    a normalized floating-point value to generate the returned
730b8e80941Smrg       *    two-component vector.
731b8e80941Smrg       *
732b8e80941Smrg       *    The conversion for unpacked fixed-point value f to floating point is
733b8e80941Smrg       *    done as follows:
734b8e80941Smrg       *
735b8e80941Smrg       *       unpackUnorm2x16: f / 65535.0
736b8e80941Smrg       *
737b8e80941Smrg       *    The first component of the returned vector will be extracted from the
738b8e80941Smrg       *    least significant bits of the input; the last component will be
739b8e80941Smrg       *    extracted from the most significant bits.
740b8e80941Smrg       *
741b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
742b8e80941Smrg       *
743b8e80941Smrg       *     return vec2(unpack_uint_to_uvec2(UINT_RVALUE)) / 65535.0;
744b8e80941Smrg       */
745b8e80941Smrg
746b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
747b8e80941Smrg
748b8e80941Smrg      ir_rvalue *result = div(u2f(unpack_uint_to_uvec2(uint_rval)),
749b8e80941Smrg                              constant(65535.0f));
750b8e80941Smrg
751b8e80941Smrg      assert(result->type == glsl_type::vec2_type);
752b8e80941Smrg      return result;
753b8e80941Smrg   }
754b8e80941Smrg
755b8e80941Smrg   /**
756b8e80941Smrg    * \brief Lower an unpackUnorm4x8 expression.
757b8e80941Smrg    *
758b8e80941Smrg    * \param uint_rval is unpackUnorm4x8's input
759b8e80941Smrg    * \return unpackUnorm4x8's output as a vec4 rvalue
760b8e80941Smrg    */
761b8e80941Smrg   ir_rvalue*
762b8e80941Smrg   lower_unpack_unorm_4x8(ir_rvalue *uint_rval)
763b8e80941Smrg   {
764b8e80941Smrg      /* From page 137 (143 of pdf) of the GLSL 4.30 spec:
765b8e80941Smrg       *
766b8e80941Smrg       *    highp vec4 unpackUnorm4x8 (highp uint p)
767b8e80941Smrg       *    ----------------------------------------
768b8e80941Smrg       *    First, unpacks a single 32-bit unsigned integer p into four
769b8e80941Smrg       *    8-bit unsigned integers. Then, each component is converted to
770b8e80941Smrg       *    a normalized floating-point value to generate the returned
771b8e80941Smrg       *    two-component vector.
772b8e80941Smrg       *
773b8e80941Smrg       *    The conversion for unpacked fixed-point value f to floating point is
774b8e80941Smrg       *    done as follows:
775b8e80941Smrg       *
776b8e80941Smrg       *       unpackUnorm4x8: f / 255.0
777b8e80941Smrg       *
778b8e80941Smrg       *    The first component of the returned vector will be extracted from the
779b8e80941Smrg       *    least significant bits of the input; the last component will be
780b8e80941Smrg       *    extracted from the most significant bits.
781b8e80941Smrg       *
782b8e80941Smrg       * This function generates IR that approximates the following pseudo-GLSL:
783b8e80941Smrg       *
784b8e80941Smrg       *     return vec4(unpack_uint_to_uvec4(UINT_RVALUE)) / 255.0;
785b8e80941Smrg       */
786b8e80941Smrg
787b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
788b8e80941Smrg
789b8e80941Smrg      ir_rvalue *result = div(u2f(unpack_uint_to_uvec4(uint_rval)),
790b8e80941Smrg                              constant(255.0f));
791b8e80941Smrg
792b8e80941Smrg      assert(result->type == glsl_type::vec4_type);
793b8e80941Smrg      return result;
794b8e80941Smrg   }
795b8e80941Smrg
   /**
    * \brief Lower the component-wise calculation of packHalf2x16.
    *
    * \param f_rval is one component of packHalf2x16's input
    * \param e_rval is the unshifted exponent bits of f_rval
    * \param m_rval is the unshifted mantissa bits of f_rval
    *
    * \return a uint rvalue that encodes a float16 in its lower 16 bits
    */
   ir_rvalue*
   pack_half_1x16_nosign(ir_rvalue *f_rval,
                         ir_rvalue *e_rval,
                         ir_rvalue *m_rval)
   {
      assert(e_rval->type == glsl_type::uint_type);
      assert(m_rval->type == glsl_type::uint_type);

      /* uint u16; */
      ir_variable *u16 = factory.make_temp(glsl_type::uint_type,
                                           "tmp_pack_half_1x16_u16");

      /* float f = FLOAT_RVAL; */
      ir_variable *f = factory.make_temp(glsl_type::float_type,
                                          "tmp_pack_half_1x16_f");
      factory.emit(assign(f, f_rval));

      /* uint e = E_RVAL; */
      ir_variable *e = factory.make_temp(glsl_type::uint_type,
                                          "tmp_pack_half_1x16_e");
      factory.emit(assign(e, e_rval));

      /* uint m = M_RVAL; */
      ir_variable *m = factory.make_temp(glsl_type::uint_type,
                                          "tmp_pack_half_1x16_m");
      factory.emit(assign(m, m_rval));

      /* Preliminaries
       * -------------
       *
       * For a float16, the bit layout is:
       *
       *   sign:     15
       *   exponent: 10:14
       *   mantissa: 0:9
       *
       * Let f16 be a float16 value. The sign, exponent, and mantissa
       * determine its value thus:
       *
       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
       *
       * where 0 <= m16 < 2^10.
       *
       * For a float32, the bit layout is:
       *
       *   sign:     31
       *   exponent: 23:30
       *   mantissa: 0:22
       *
       * Let f32 be a float32 value. The sign, exponent, and mantissa
       * determine its value thus:
       *
       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
       *
       * where 0 <= m32 < 2^23.
       *
       * The minimum and maximum normal float16 values are
       *
       *   min_norm16 = 2^(1 - 15) * (1 + 0 / 2^10) = 2^(-14)   (20)
       *   max_norm16 = 2^(30 - 15) * (1 + 1023 / 2^10)         (21)
       *
       * The step at max_norm16 is
       *
       *   max_step16 = 2^5                                     (22)
       *
       * Observe that the float16 boundary values in equations 20-21 lie in the
       * range of normal float32 values.
       *
       *
       * Rounding Behavior
       * -----------------
       * Not all float32 values can be exactly represented as a float16. We
       * round all such intermediate float32 values to the nearest float16; if
       * the float32 is exactly between two float16 values, we round to the one
       * with an even mantissa. This rounding behavior has several benefits:
       *
       *   - It has no sign bias.
       *
       *   - It reproduces the behavior of real hardware: opcode F32TO16 in Intel's
       *     GPU ISA.
       *
       *   - By reproducing the behavior of the GPU (at least on Intel hardware),
       *     compile-time evaluation of constant packHalf2x16 GLSL expressions will
       *     result in the same value as if the expression were executed on the
       *     GPU.
       *
       * Calculation
       * -----------
       * Our task is to compute s16, e16, m16 given f32.  Since this function
       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
       * cases to consider.
       */

      factory.emit(

         /* Case 1) f32 is NaN
          *
          *   The resultant f16 will also be NaN.
          */

         /* if (e32 == 255 && m32 != 0) { */
         if_tree(logic_and(equal(e, constant(0xffu << 23u)),
                           logic_not(equal(m, constant(0u)))),

            assign(u16, constant(0x7fffu)),

         /* Case 2) f32 lies in the range [0, min_norm16).
          *
          *   The resultant float16 will be either zero, subnormal, or normal.
          *
          *   Solving
          *
          *     f32 = min_norm16       (30)
          *
          *   gives
          *
          *     e32 = 113 and m32 = 0  (31)
          *
          *   Therefore this case occurs if and only if
          *
          *     e32 < 113              (32)
          */

         /* } else if (e32 < 113) { */
         if_tree(less(e, constant(113u << 23u)),

            /* u16 = uint(round_to_even(abs(f32) * float(1u << 24u))); */
            assign(u16, f2u(round_even(mul(expr(ir_unop_abs, f),
                                           constant((float) (1 << 24)))))),

         /* Case 3) f32 lies in the range
          *         [min_norm16, max_norm16 + max_step16).
          *
          *   The resultant float16 will be either normal or infinite.
          *
          *   Solving
          *
          *     f32 = max_norm16 + max_step16           (40)
          *         = 2^15 * (1 + 1023 / 2^10) + 2^5    (41)
          *         = 2^16                              (42)
          *   gives
          *
          *     e32 = 143 and m32 = 0                   (43)
          *
          *   We already solved the boundary condition f32 = min_norm16 above
          *   in equation 31. Therefore this case occurs if and only if
          *
          *     113 <= e32 and e32 < 143
          */

         /* } else if (e32 < 143) { */
         if_tree(less(e, constant(143u << 23u)),

            /* The addition below handles the case where the mantissa rounds
             * up to 1024 and bumps the exponent.
             *
             * u16 = ((e - (112u << 23u)) >> 13u)
             *     + round_to_even(float(m) / float(1u << 13u));
             */
            assign(u16, add(rshift(sub(e, constant(112u << 23u)),
                                   constant(13u)),
                            f2u(round_even(
                                  div(u2f(m), constant((float) (1 << 13))))))),

         /* Case 4) f32 lies in the range [max_norm16 + max_step16, inf].
          *
          *   The resultant float16 will be infinite.
          *
          *   The cases above caught all float32 values in the range
          *   [0, max_norm16 + max_step16), so this is the fall-through case.
          */

         /* } else { */

            assign(u16, constant(31u << 10u))))));

         /* } */

       return deref(u16).val;
   }
993b8e80941Smrg
994b8e80941Smrg   /**
995b8e80941Smrg    * \brief Lower a packHalf2x16 expression.
996b8e80941Smrg    *
997b8e80941Smrg    * \param vec2_rval is packHalf2x16's input
998b8e80941Smrg    * \return packHalf2x16's output as a uint rvalue
999b8e80941Smrg    */
1000b8e80941Smrg   ir_rvalue*
1001b8e80941Smrg   lower_pack_half_2x16(ir_rvalue *vec2_rval)
1002b8e80941Smrg   {
1003b8e80941Smrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1004b8e80941Smrg       *
1005b8e80941Smrg       *    highp uint packHalf2x16 (mediump vec2 v)
1006b8e80941Smrg       *    ----------------------------------------
1007b8e80941Smrg       *    Returns an unsigned integer obtained by converting the components of
1008b8e80941Smrg       *    a two-component floating-point vector to the 16-bit floating-point
1009b8e80941Smrg       *    representation found in the OpenGL ES Specification, and then packing
1010b8e80941Smrg       *    these two 16-bit integers into a 32-bit unsigned integer.
1011b8e80941Smrg       *
1012b8e80941Smrg       *    The first vector component specifies the 16 least- significant bits
1013b8e80941Smrg       *    of the result; the second component specifies the 16 most-significant
1014b8e80941Smrg       *    bits.
1015b8e80941Smrg       */
1016b8e80941Smrg
1017b8e80941Smrg      assert(vec2_rval->type == glsl_type::vec2_type);
1018b8e80941Smrg
1019b8e80941Smrg      /* vec2 f = VEC2_RVAL; */
1020b8e80941Smrg      ir_variable *f = factory.make_temp(glsl_type::vec2_type,
1021b8e80941Smrg                                         "tmp_pack_half_2x16_f");
1022b8e80941Smrg      factory.emit(assign(f, vec2_rval));
1023b8e80941Smrg
1024b8e80941Smrg      /* uvec2 f32 = bitcast_f2u(f); */
1025b8e80941Smrg      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1026b8e80941Smrg                                            "tmp_pack_half_2x16_f32");
1027b8e80941Smrg      factory.emit(assign(f32, expr(ir_unop_bitcast_f2u, f)));
1028b8e80941Smrg
1029b8e80941Smrg      /* uvec2 f16; */
1030b8e80941Smrg      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1031b8e80941Smrg                                        "tmp_pack_half_2x16_f16");
1032b8e80941Smrg
1033b8e80941Smrg      /* Get f32's unshifted exponent bits.
1034b8e80941Smrg       *
1035b8e80941Smrg       *   uvec2 e = f32 & 0x7f800000u;
1036b8e80941Smrg       */
1037b8e80941Smrg      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1038b8e80941Smrg                                          "tmp_pack_half_2x16_e");
1039b8e80941Smrg      factory.emit(assign(e, bit_and(f32, constant(0x7f800000u))));
1040b8e80941Smrg
1041b8e80941Smrg      /* Get f32's unshifted mantissa bits.
1042b8e80941Smrg       *
1043b8e80941Smrg       *   uvec2 m = f32 & 0x007fffffu;
1044b8e80941Smrg       */
1045b8e80941Smrg      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1046b8e80941Smrg                                          "tmp_pack_half_2x16_m");
1047b8e80941Smrg      factory.emit(assign(m, bit_and(f32, constant(0x007fffffu))));
1048b8e80941Smrg
1049b8e80941Smrg      /* Set f16's exponent and mantissa bits.
1050b8e80941Smrg       *
1051b8e80941Smrg       *   f16.x = pack_half_1x16_nosign(e.x, m.x);
1052b8e80941Smrg       *   f16.y = pack_half_1y16_nosign(e.y, m.y);
1053b8e80941Smrg       */
1054b8e80941Smrg      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_x(f),
1055b8e80941Smrg                                                     swizzle_x(e),
1056b8e80941Smrg                                                     swizzle_x(m)),
1057b8e80941Smrg                           WRITEMASK_X));
1058b8e80941Smrg      factory.emit(assign(f16, pack_half_1x16_nosign(swizzle_y(f),
1059b8e80941Smrg                                                     swizzle_y(e),
1060b8e80941Smrg                                                     swizzle_y(m)),
1061b8e80941Smrg                           WRITEMASK_Y));
1062b8e80941Smrg
1063b8e80941Smrg      /* Set f16's sign bits.
1064b8e80941Smrg       *
1065b8e80941Smrg       *   f16 |= (f32 & (1u << 31u) >> 16u;
1066b8e80941Smrg       */
1067b8e80941Smrg      factory.emit(
1068b8e80941Smrg         assign(f16, bit_or(f16,
1069b8e80941Smrg                            rshift(bit_and(f32, constant(1u << 31u)),
1070b8e80941Smrg                                   constant(16u)))));
1071b8e80941Smrg
1072b8e80941Smrg
1073b8e80941Smrg      /* return (f16.y << 16u) | f16.x; */
1074b8e80941Smrg      ir_rvalue *result = bit_or(lshift(swizzle_y(f16),
1075b8e80941Smrg                                        constant(16u)),
1076b8e80941Smrg                                 swizzle_x(f16));
1077b8e80941Smrg
1078b8e80941Smrg      assert(result->type == glsl_type::uint_type);
1079b8e80941Smrg      return result;
1080b8e80941Smrg   }
1081b8e80941Smrg
   /**
    * \brief Lower the component-wise calculation of unpackHalf2x16.
    *
    * Given a uint that encodes a float16 in its lower 16 bits, this function
    * returns a uint that encodes a float32 with the same value. The sign bit
    * of the float16 is ignored.
    *
    * \param e_rval is the unshifted exponent bits of a float16
    * \param m_rval is the unshifted mantissa bits of a float16
    * \return a uint rvalue that encodes a float32
    */
   ir_rvalue*
   unpack_half_1x16_nosign(ir_rvalue *e_rval, ir_rvalue *m_rval)
   {
      assert(e_rval->type == glsl_type::uint_type);
      assert(m_rval->type == glsl_type::uint_type);

      /* uint u32; */
      ir_variable *u32 = factory.make_temp(glsl_type::uint_type,
                                           "tmp_unpack_half_1x16_u32");

      /* uint e = E_RVAL; */
      ir_variable *e = factory.make_temp(glsl_type::uint_type,
                                          "tmp_unpack_half_1x16_e");
      factory.emit(assign(e, e_rval));

      /* uint m = M_RVAL; */
      ir_variable *m = factory.make_temp(glsl_type::uint_type,
                                          "tmp_unpack_half_1x16_m");
      factory.emit(assign(m, m_rval));

      /* Preliminaries
       * -------------
       *
       * For a float16, the bit layout is:
       *
       *   sign:     15
       *   exponent: 10:14
       *   mantissa: 0:9
       *
       * Let f16 be a float16 value. The sign, exponent, and mantissa
       * determine its value thus:
       *
       *   if e16 = 0 and m16 = 0, then zero:       (-1)^s16 * 0                               (1)
       *   if e16 = 0 and m16!= 0, then subnormal:  (-1)^s16 * 2^(e16 - 14) * (m16 / 2^10)     (2)
       *   if 0 < e16 < 31, then normal:            (-1)^s16 * 2^(e16 - 15) * (1 + m16 / 2^10) (3)
       *   if e16 = 31 and m16 = 0, then infinite:  (-1)^s16 * inf                             (4)
       *   if e16 = 31 and m16 != 0, then           NaN                                        (5)
       *
       * where 0 <= m16 < 2^10.
       *
       * For a float32, the bit layout is:
       *
       *   sign: 31
       *   exponent: 23:30
       *   mantissa: 0:22
       *
       * Let f32 be a float32 value. The sign, exponent, and mantissa
       * determine its value thus:
       *
       *   if e32 = 0 and m32 = 0, then zero:        (-1)^s * 0                                (10)
       *   if e32 = 0 and m32 != 0, then subnormal:  (-1)^s * 2^(e32 - 126) * (m32 / 2^23)     (11)
       *   if 0 < e32 < 255, then normal:            (-1)^s * 2^(e32 - 127) * (1 + m32 / 2^23) (12)
       *   if e32 = 255 and m32 = 0, then infinite:  (-1)^s * inf                              (13)
       *   if e32 = 255 and m32 != 0, then           NaN                                       (14)
       *
       * where 0 <= m32 < 2^23.
       *
       * Calculation
       * -----------
       * Our task is to compute s32, e32, m32 given f16.  Since this function
       * ignores the sign bit, assume that s32 = s16 = 0.  There are several
       * cases to consider.
       */

      factory.emit(

         /* Case 1) f16 is zero or subnormal.
          *
          *   The simplest method of calculating f32 in this case is
          *
          *     f32 = f16                       (20)
          *         = 2^(-14) * (m16 / 2^10)    (21)
          *         = m16 / 2^24                (22)
          */

         /* if (e16 == 0) { */
         if_tree(equal(e, constant(0u)),

            /* u32 = bitcast_f2u(float(m) / float(1 << 24)); */
            assign(u32, expr(ir_unop_bitcast_f2u,
                                div(u2f(m), constant((float)(1 << 24))))),

         /* Case 2) f16 is normal.
          *
          *   The equation
          *
          *     f32 = f16                              (30)
          *     2^(e32 - 127) * (1 + m32 / 2^23) =     (31)
          *       2^(e16 - 15) * (1 + m16 / 2^10)
          *
          *   can be decomposed into two
          *
          *     2^(e32 - 127) = 2^(e16 - 15)           (32)
          *     1 + m32 / 2^23 = 1 + m16 / 2^10        (33)
          *
          *   which solve to
          *
          *     e32 = e16 + 112                        (34)
          *     m32 = m16 * 2^13                       (35)
          */

         /* } else if (e16 < 31) { */
         if_tree(less(e, constant(31u << 10u)),

              /* u32 = ((e + (112 << 10)) | m) << 13;
               */
              assign(u32, lshift(bit_or(add(e, constant(112u << 10u)), m),
                                 constant(13u))),


         /* Case 3) f16 is infinite. */
         if_tree(equal(m, constant(0u)),

                 assign(u32, constant(255u << 23u)),

         /* Case 4) f16 is NaN. */
         /* } else { */

            assign(u32, constant(0x7fffffffu))))));

         /* } */

      return deref(u32).val;
   }
1217b8e80941Smrg
1218b8e80941Smrg   /**
1219b8e80941Smrg    * \brief Lower an unpackHalf2x16 expression.
1220b8e80941Smrg    *
1221b8e80941Smrg    * \param uint_rval is unpackHalf2x16's input
1222b8e80941Smrg    * \return unpackHalf2x16's output as a vec2 rvalue
1223b8e80941Smrg    */
1224b8e80941Smrg   ir_rvalue*
1225b8e80941Smrg   lower_unpack_half_2x16(ir_rvalue *uint_rval)
1226b8e80941Smrg   {
1227b8e80941Smrg      /* From page 89 (95 of pdf) of the GLSL ES 3.00 spec:
1228b8e80941Smrg       *
1229b8e80941Smrg       *    mediump vec2 unpackHalf2x16 (highp uint v)
1230b8e80941Smrg       *    ------------------------------------------
1231b8e80941Smrg       *    Returns a two-component floating-point vector with components
1232b8e80941Smrg       *    obtained by unpacking a 32-bit unsigned integer into a pair of 16-bit
1233b8e80941Smrg       *    values, interpreting those values as 16-bit floating-point numbers
1234b8e80941Smrg       *    according to the OpenGL ES Specification, and converting them to
1235b8e80941Smrg       *    32-bit floating-point values.
1236b8e80941Smrg       *
1237b8e80941Smrg       *    The first component of the vector is obtained from the
1238b8e80941Smrg       *    16 least-significant bits of v; the second component is obtained
1239b8e80941Smrg       *    from the 16 most-significant bits of v.
1240b8e80941Smrg       */
1241b8e80941Smrg      assert(uint_rval->type == glsl_type::uint_type);
1242b8e80941Smrg
1243b8e80941Smrg      /* uint u = RVALUE;
1244b8e80941Smrg       * uvec2 f16 = uvec2(u.x & 0xffff, u.y >> 16);
1245b8e80941Smrg       */
1246b8e80941Smrg      ir_variable *f16 = factory.make_temp(glsl_type::uvec2_type,
1247b8e80941Smrg                                            "tmp_unpack_half_2x16_f16");
1248b8e80941Smrg      factory.emit(assign(f16, unpack_uint_to_uvec2(uint_rval)));
1249b8e80941Smrg
1250b8e80941Smrg      /* uvec2 f32; */
1251b8e80941Smrg      ir_variable *f32 = factory.make_temp(glsl_type::uvec2_type,
1252b8e80941Smrg                                            "tmp_unpack_half_2x16_f32");
1253b8e80941Smrg
1254b8e80941Smrg      /* Get f16's unshifted exponent bits.
1255b8e80941Smrg       *
1256b8e80941Smrg       *    uvec2 e = f16 & 0x7c00u;
1257b8e80941Smrg       */
1258b8e80941Smrg      ir_variable *e = factory.make_temp(glsl_type::uvec2_type,
1259b8e80941Smrg                                          "tmp_unpack_half_2x16_e");
1260b8e80941Smrg      factory.emit(assign(e, bit_and(f16, constant(0x7c00u))));
1261b8e80941Smrg
1262b8e80941Smrg      /* Get f16's unshifted mantissa bits.
1263b8e80941Smrg       *
1264b8e80941Smrg       *    uvec2 m = f16 & 0x03ffu;
1265b8e80941Smrg       */
1266b8e80941Smrg      ir_variable *m = factory.make_temp(glsl_type::uvec2_type,
1267b8e80941Smrg                                          "tmp_unpack_half_2x16_m");
1268b8e80941Smrg      factory.emit(assign(m, bit_and(f16, constant(0x03ffu))));
1269b8e80941Smrg
1270b8e80941Smrg      /* Set f32's exponent and mantissa bits.
1271b8e80941Smrg       *
1272b8e80941Smrg       *   f32.x = unpack_half_1x16_nosign(e.x, m.x);
1273b8e80941Smrg       *   f32.y = unpack_half_1x16_nosign(e.y, m.y);
1274b8e80941Smrg       */
1275b8e80941Smrg      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_x(e),
1276b8e80941Smrg                                                       swizzle_x(m)),
1277b8e80941Smrg                           WRITEMASK_X));
1278b8e80941Smrg      factory.emit(assign(f32, unpack_half_1x16_nosign(swizzle_y(e),
1279b8e80941Smrg                                                       swizzle_y(m)),
1280b8e80941Smrg                           WRITEMASK_Y));
1281b8e80941Smrg
1282b8e80941Smrg      /* Set f32's sign bit.
1283b8e80941Smrg       *
1284b8e80941Smrg       *    f32 |= (f16 & 0x8000u) << 16u;
1285b8e80941Smrg       */
1286b8e80941Smrg      factory.emit(assign(f32, bit_or(f32,
1287b8e80941Smrg                                       lshift(bit_and(f16,
1288b8e80941Smrg                                                      constant(0x8000u)),
1289b8e80941Smrg                                              constant(16u)))));
1290b8e80941Smrg
1291b8e80941Smrg      /* return bitcast_u2f(f32); */
1292b8e80941Smrg      ir_rvalue *result = expr(ir_unop_bitcast_u2f, f32);
1293b8e80941Smrg      assert(result->type == glsl_type::vec2_type);
1294b8e80941Smrg      return result;
1295b8e80941Smrg   }
1296b8e80941Smrg};
1297b8e80941Smrg
1298b8e80941Smrg} // namespace anonymous
1299b8e80941Smrg
1300b8e80941Smrg/**
1301b8e80941Smrg * \brief Lower the builtin packing functions.
1302b8e80941Smrg *
1303b8e80941Smrg * \param op_mask is a bitmask of `enum lower_packing_builtins_op`.
1304b8e80941Smrg */
1305b8e80941Smrgbool
1306b8e80941Smrglower_packing_builtins(exec_list *instructions, int op_mask)
1307b8e80941Smrg{
1308b8e80941Smrg   lower_packing_builtins_visitor v(op_mask);
1309b8e80941Smrg   visit_list_elements(&v, instructions, true);
1310b8e80941Smrg   return v.get_progress();
1311b8e80941Smrg}
1312