nir_opcodes.py revision 01e04c3f
1#
2# Copyright (C) 2014 Connor Abbott
3#
4# Permission is hereby granted, free of charge, to any person obtaining a
5# copy of this software and associated documentation files (the "Software"),
6# to deal in the Software without restriction, including without limitation
7# the rights to use, copy, modify, merge, publish, distribute, sublicense,
8# and/or sell copies of the Software, and to permit persons to whom the
9# Software is furnished to do so, subject to the following conditions:
10#
11# The above copyright notice and this permission notice (including the next
12# paragraph) shall be included in all copies or substantial portions of the
13# Software.
14#
15# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21# IN THE SOFTWARE.
22#
23# Authors:
24#    Connor Abbott (cwabbott0@gmail.com)
25
26
27# Class that represents all the information we have about the opcode
28# NOTE: this must be kept in sync with nir_op_info
29
class Opcode(object):
   """Class that represents all the information we have about the opcode
   NOTE: this must be kept in sync with nir_op_info
   """
   def __init__(self, name, output_size, output_type, input_sizes,
                input_types, algebraic_properties, const_expr):
      """Parameters:

      - name is the name of the opcode (prepend nir_op_ for the enum name)
      - all types are strings that get nir_type_ prepended to them
      - output_size and every entry of input_sizes must lie in 0..4; when
        output_size is non-zero, no input size may be 0
      - input_sizes is a list of ints and input_types is a list of types;
        both lists must have the same length
      - algebraic_properties is a space-separated string, where nir_op_is_ is
        prepended before each entry
      - const_expr is an expression or series of statements that computes the
        constant value of the opcode given the constant values of its inputs.

      Constant expressions are formed from the variables src0, src1, ...,
      src(N-1), where N is the number of arguments.  The output of the
      expression should be stored in the dst variable.  Per-component input
      and output variables will be scalars and non-per-component input and
      output variables will be a struct with fields named x, y, z, and w
      all of the correct type.  Input and output variables can be assumed
      to already be of the correct type and need no conversion.  In
      particular, the conversion from the C bool type to/from  NIR_TRUE and
      NIR_FALSE happens automatically.

      For per-component instructions, the entire expression will be
      executed once for each component.  For non-per-component
      instructions, the expression is expected to store the correct values
      in dst.x, dst.y, etc.  If "dst" does not exist anywhere in the
      constant expression, an assignment to dst will happen automatically
      and the result will be equivalent to "dst = <expression>" for
      per-component instructions and "dst.x = dst.y = ... = <expression>"
      for non-per-component instructions.
      """
      assert isinstance(name, str)
      assert isinstance(output_size, int)
      assert isinstance(output_type, str)
      assert isinstance(input_sizes, list)
      assert isinstance(input_types, list)
      # Check every element rather than only the first one, so a bad entry
      # further down the list is caught here and an empty list does not
      # blow up with an IndexError.
      assert all(isinstance(size, int) for size in input_sizes)
      assert all(isinstance(ty, str) for ty in input_types)
      assert isinstance(algebraic_properties, str)
      assert isinstance(const_expr, str)
      assert len(input_sizes) == len(input_types)
      assert 0 <= output_size <= 4
      for size in input_sizes:
         assert 0 <= size <= 4
         # A sized output cannot be combined with a size-0 input.
         if output_size != 0:
            assert size != 0
      self.name = name
      self.num_inputs = len(input_sizes)
      self.output_size = output_size
      self.output_type = output_type
      self.input_sizes = input_sizes
      self.input_types = input_types
      self.algebraic_properties = algebraic_properties
      self.const_expr = const_expr
88
89# helper variables for strings
# Type names without an explicit bit size.
tfloat, tint, tuint = "float", "int", "uint"
# The boolean type name.
tbool = "bool32"
# Type names with an explicit bit size.
tuint16 = "uint16"
tfloat32, tint32, tuint32 = "float32", "int32", "uint32"
tint64, tuint64, tfloat64 = "int64", "uint64", "float64"

# Algebraic property tags.  Each ends in a space so that several of them can
# be joined with "+" into one space-separated string.
commutative, associative = "commutative ", "associative "

# Registry of every opcode declared in this file, keyed by opcode name.
opcodes = dict()
107
def opcode(name, output_size, output_type, input_sizes, input_types,
           algebraic_properties, const_expr):
   """Build an Opcode from the arguments and record it in ``opcodes``.

   The arguments are handed to the Opcode constructor unchanged.  A name may
   only be registered once; a second registration trips the assertion.
   """
   assert name not in opcodes
   new_op = Opcode(name, output_size, output_type, input_sizes,
                   input_types, algebraic_properties, const_expr)
   opcodes[name] = new_op
113
def unop_convert(name, out_type, in_type, const_expr):
   """Declare a one-source opcode whose output type may differ from its input.

   Output and input sizes are both 0 and no algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=out_type, input_sizes=[0],
          input_types=[in_type], algebraic_properties="",
          const_expr=const_expr)
116
def unop(name, ty, const_expr):
   """Declare a one-source opcode whose input and output share the type ty.

   Output and input sizes are both 0 and no algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=ty, input_sizes=[0],
          input_types=[ty], algebraic_properties="", const_expr=const_expr)
119
def unop_horiz(name, output_size, output_type, input_size, input_type,
               const_expr):
   """Declare a one-source opcode with explicit output and input sizes.

   No algebraic properties are set.
   """
   opcode(name, output_size, output_type,
          input_sizes=[input_size], input_types=[input_type],
          algebraic_properties="", const_expr=const_expr)
124
def unop_reduce(name, output_size, output_type, input_type, prereduce_expr,
                reduce_expr, final_expr):
   """Declare name + "2", name + "3" and name + "4" reduction opcodes.

   The three expression arguments are str.format templates:

   - prereduce_expr is applied to each source component through {src}
   - reduce_expr combines two partial results through {src0} and {src1}
   - final_expr wraps the fully reduced expression through {src}
   """
   # One parenthesised pre-reduced term per source component.
   x, y, z, w = [
      "(" + prereduce_expr.format(src="src0." + chan) + ")"
      for chan in "xyzw"
   ]

   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   unop_horiz(name + "2", output_size, output_type, 2, input_type,
              finish(combine(x, y)))
   unop_horiz(name + "3", output_size, output_type, 3, input_type,
              finish(combine(combine(x, y), z)))
   # The four-wide form reduces the two halves first, then joins them.
   unop_horiz(name + "4", output_size, output_type, 4, input_type,
              finish(combine(combine(x, y), combine(z, w))))
143
144
# fmov and imov are identical apart from which modifiers they support and
# what the negate modifier means for them.
unop("fmov", ty=tfloat, const_expr="src0")
unop("imov", ty=tint, const_expr="src0")

unop("ineg", ty=tint, const_expr="-src0")
unop("fneg", ty=tfloat, const_expr="-src0")
# inot inverts every bit of the integer.
unop("inot", ty=tint, const_expr="~src0")
unop("fnot", ty=tfloat,
     const_expr="bit_size == 64 ? ((src0 == 0.0) ? 1.0 : 0.0f) : "
                "((src0 == 0.0f) ? 1.0f : 0.0f)")
unop("fsign", ty=tfloat,
     const_expr="bit_size == 64 ? "
                "((src0 == 0.0) ? 0.0 : ((src0 > 0.0) ? 1.0 : -1.0)) : "
                "((src0 == 0.0f) ? 0.0f : ((src0 > 0.0f) ? 1.0f : -1.0f))")
unop("isign", ty=tint, const_expr="(src0 == 0) ? 0 : ((src0 > 0) ? 1 : -1)")
unop("iabs", ty=tint, const_expr="(src0 < 0) ? -src0 : src0")
unop("fabs", ty=tfloat, const_expr="fabs(src0)")
unop("fsat", ty=tfloat,
     const_expr="bit_size == 64 ? "
                "((src0 > 1.0) ? 1.0 : ((src0 <= 0.0) ? 0.0 : src0)) : "
                "((src0 > 1.0f) ? 1.0f : ((src0 <= 0.0f) ? 0.0f : src0))")
unop("frcp", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / src0 : 1.0f / src0")
unop("frsq", ty=tfloat,
     const_expr="bit_size == 64 ? 1.0 / sqrt(src0) : 1.0f / sqrtf(src0)")
unop("fsqrt", ty=tfloat,
     const_expr="bit_size == 64 ? sqrt(src0) : sqrtf(src0)")
unop("fexp2", ty=tfloat, const_expr="exp2f(src0)")
unop("flog2", ty=tfloat, const_expr="log2f(src0)")
169
# Generate the numeric conversion opcodes.  Names have the form
# <src initial>2<dst initial><bit size>, e.g. "f2i32"; the 16-bit
# float-to-float conversion additionally gets "_rtne" and "_rtz" variants.
for src_t in [tint, tuint, tfloat]:
   # Integers convert to float or to their own signedness; floats convert
   # to every numeric type.
   dst_types = [tfloat, src_t] if src_t != tfloat else [tint, tuint, tfloat]

   for dst_t in dst_types:
      # There is no 8-bit float destination.
      bit_sizes = [16, 32, 64] if dst_t == tfloat else [8, 16, 32, 64]
      for bit_size in bit_sizes:
         if src_t == tfloat and dst_t == tfloat and bit_size == 16:
            rnd_modes = ['_rtne', '_rtz', '']
            for rnd_mode in rnd_modes:
               unop_convert(
                  src_t[0] + "2" + dst_t[0] + str(bit_size) + rnd_mode,
                  dst_t + str(bit_size), src_t, "src0")
         else:
            unop_convert(src_t[0] + "2" + dst_t[0] + str(bit_size),
                         dst_t + str(bit_size), src_t, "src0")
192
# The to/from bool conversions are written out by hand.  Bool has a single
# bit size, so the size can always be inferred from the other type.
unop_convert("f2b", out_type=tbool, in_type=tfloat, const_expr="src0 != 0.0")
unop_convert("i2b", out_type=tbool, in_type=tint, const_expr="src0 != 0")
unop_convert("b2f", out_type=tfloat, in_type=tbool,
             const_expr="src0 ? 1.0 : 0.0")
unop_convert("b2i", out_type=tint, in_type=tbool, const_expr="src0 ? 1 : 0")
198unop_convert("b2i", tint, tbool, "src0 ? 1 : 0")
199
200
201# Unary floating-point rounding operations.
202
203
unop("ftrunc", ty=tfloat,
     const_expr="bit_size == 64 ? trunc(src0) : truncf(src0)")
unop("fceil", ty=tfloat,
     const_expr="bit_size == 64 ? ceil(src0) : ceilf(src0)")
unop("ffloor", ty=tfloat,
     const_expr="bit_size == 64 ? floor(src0) : floorf(src0)")
unop("ffract", ty=tfloat,
     const_expr="src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
unop("fround_even", ty=tfloat,
     const_expr="bit_size == 64 ? _mesa_roundeven(src0) : "
                "_mesa_roundevenf(src0)")

# Magnitudes below 2^-14 become a signed zero; everything else is
# round-tripped through the half-float conversion helpers.
unop("fquantize2f16", ty=tfloat,
     const_expr="(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : "
                "_mesa_half_to_float(_mesa_float_to_half(src0))")
211
212# Trigonometric operations.
213
214
unop("fsin", ty=tfloat, const_expr="bit_size == 64 ? sin(src0) : sinf(src0)")
unop("fcos", ty=tfloat, const_expr="bit_size == 64 ? cos(src0) : cosf(src0)")

# dfrexp: the exponent and the significand are produced by separate opcodes.
unop_convert("frexp_exp", out_type=tint32, in_type=tfloat64,
             const_expr="frexp(src0, &dst);")
unop_convert("frexp_sig", out_type=tfloat64, in_type=tfloat64,
             const_expr="int n; dst = frexp(src0, &n);")
221
222# Partial derivatives.
223
224
# The derivative of a constant is 0, so every variant folds to 0.0.
unop("fddx", ty=tfloat, const_expr="0.0")
unop("fddy", ty=tfloat, const_expr="0.0")
unop("fddx_fine", ty=tfloat, const_expr="0.0")
unop("fddy_fine", ty=tfloat, const_expr="0.0")
unop("fddx_coarse", ty=tfloat, const_expr="0.0")
unop("fddy_coarse", ty=tfloat, const_expr="0.0")
231
232
233# Floating point pack and unpack operations.
234
def pack_2x16(fmt):
   """Declare pack_<fmt>_2x16: two float32 components into one uint32.

   src0.x lands in the low 16 bits and src0.y in the high 16 bits.  Every
   occurrence of "fmt" in the C template is replaced by the fmt argument.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x16(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x16(src0.y)) << 16;
"""
   unop_horiz("pack_{0}_2x16".format(fmt), 1, tuint32, 2, tfloat32,
              template.replace("fmt", fmt))
240
def pack_4x8(fmt):
   """Declare pack_<fmt>_4x8: four float32 components into one uint32.

   Components x, y, z and w occupy successive bytes starting at the lowest.
   Every occurrence of "fmt" in the C template is replaced by the fmt
   argument.
   """
   template = """
dst.x = (uint32_t) pack_fmt_1x8(src0.x);
dst.x |= ((uint32_t) pack_fmt_1x8(src0.y)) << 8;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.z)) << 16;
dst.x |= ((uint32_t) pack_fmt_1x8(src0.w)) << 24;
"""
   unop_horiz("pack_{0}_4x8".format(fmt), 1, tuint32, 4, tfloat32,
              template.replace("fmt", fmt))
248
def unpack_2x16(fmt):
   """Declare unpack_<fmt>_2x16: one uint32 into two float32 components.

   dst.x is unpacked from the low 16 bits of src0.x and dst.y from the high
   16 bits.  Every occurrence of "fmt" in the C template is replaced by the
   fmt argument.
   """
   # The high half must be shifted *down* before the uint16_t cast.  Shifting
   # left by 16 and then truncating to 16 bits always yields 0.
   unop_horiz("unpack_" + fmt + "_2x16", 2, tfloat32, 1, tuint32, """
dst.x = unpack_fmt_1x16((uint16_t)(src0.x & 0xffff));
dst.y = unpack_fmt_1x16((uint16_t)(src0.x >> 16));
""".replace("fmt", fmt))
254
def unpack_4x8(fmt):
   """Declare unpack_<fmt>_4x8: one uint32 into four float32 components.

   Components x, y, z and w are unpacked from successive bytes of src0.x,
   starting at the lowest.  Every occurrence of "fmt" in the C template is
   replaced by the fmt argument.
   """
   template = """
dst.x = unpack_fmt_1x8((uint8_t)(src0.x & 0xff));
dst.y = unpack_fmt_1x8((uint8_t)((src0.x >> 8) & 0xff));
dst.z = unpack_fmt_1x8((uint8_t)((src0.x >> 16) & 0xff));
dst.w = unpack_fmt_1x8((uint8_t)(src0.x >> 24));
"""
   unop_horiz("unpack_{0}_4x8".format(fmt), 4, tfloat32, 1, tuint32,
              template.replace("fmt", fmt))
262
263
# Packing: snorm and unorm come in 2x16 and 4x8 forms, half only in 2x16.
pack_2x16(fmt="snorm")
pack_4x8(fmt="snorm")
pack_2x16(fmt="unorm")
pack_4x8(fmt="unorm")
pack_2x16(fmt="half")
# Unpacking mirrors the packing set.
unpack_2x16(fmt="snorm")
unpack_4x8(fmt="snorm")
unpack_2x16(fmt="unorm")
unpack_4x8(fmt="unorm")
unpack_2x16(fmt="half")
274
unop_horiz("pack_uvec2_to_uint", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint32, const_expr="""
dst.x = (src0.x & 0xffff) | (src0.y << 16);
""")

unop_horiz("pack_uvec4_to_uint", output_size=1, output_type=tuint32,
           input_size=4, input_type=tuint32, const_expr="""
dst.x = (src0.x <<  0) |
        (src0.y <<  8) |
        (src0.z << 16) |
        (src0.w << 24);
""")

# The wider casts below happen before the shift so that no bits are lost.
unop_horiz("pack_32_2x16", output_size=1, output_type=tuint32,
           input_size=2, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint32_t)src0.y << 16);")

unop_horiz("pack_64_2x32", output_size=1, output_type=tuint64,
           input_size=2, input_type=tuint32,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 32);")

unop_horiz("pack_64_4x16", output_size=1, output_type=tuint64,
           input_size=4, input_type=tuint16,
           const_expr="dst.x = src0.x | ((uint64_t)src0.y << 16) | "
                      "((uint64_t)src0.z << 32) | ((uint64_t)src0.w << 48);")

unop_horiz("unpack_64_2x32", output_size=2, output_type=tuint32,
           input_size=1, input_type=tuint64,
           const_expr="dst.x = src0.x; dst.y = src0.x >> 32;")
297
# Splits one 64-bit value into four 16-bit components, lowest bits first.
# The source has a single component, so every output, including dst.w, must
# be taken from src0.x.
unop_horiz("unpack_64_4x16", 4, tuint16, 1, tuint64,
           "dst.x = src0.x; dst.y = src0.x >> 16; dst.z = src0.x >> 32; dst.w = src0.x >> 48;")
300
unop_horiz("unpack_32_2x16", output_size=2, output_type=tuint16,
           input_size=1, input_type=tuint32,
           const_expr="dst.x = src0.x; dst.y = src0.x >> 16;")

# Lowered floating point unpacking operations.

unop_convert("unpack_half_2x16_split_x", out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 & 0xffff))")
unop_convert("unpack_half_2x16_split_y", out_type=tfloat32, in_type=tuint32,
             const_expr="unpack_half_1x16((uint16_t)(src0 >> 16))")

# The _x variants yield the low half of the source, the _y variants the
# high half.
unop_convert("unpack_32_2x16_split_x", out_type=tuint16, in_type=tuint32,
             const_expr="src0")
unop_convert("unpack_32_2x16_split_y", out_type=tuint16, in_type=tuint32,
             const_expr="src0 >> 16")

unop_convert("unpack_64_2x32_split_x", out_type=tuint32, in_type=tuint64,
             const_expr="src0")
unop_convert("unpack_64_2x32_split_y", out_type=tuint32, in_type=tuint64,
             const_expr="src0 >> 32")
317
318# Bit operations, part of ARB_gpu_shader5.
319
320
# Mirrors the 32 bits of the source: bit n moves to bit 31 - n.
unop("bitfield_reverse", ty=tuint32, const_expr="""
/* we're not winning any awards for speed here, but that's ok */
dst = 0;
for (unsigned bit = 0; bit < 32; bit++)
   dst |= ((src0 >> bit) & 1) << (31 - bit);
""")
# Counts the set bits among the low bit_size bits of the source.
unop_convert("bit_count", out_type=tuint32, in_type=tuint, const_expr="""
dst = 0;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1)
      dst++;
}
""")

# Index of the highest set bit, or -1 when no bit is set.
unop_convert("ufind_msb", out_type=tint32, in_type=tuint, const_expr="""
dst = -1;
for (int bit = bit_size - 1; bit >= 0; bit--) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")

# Signed variant: -1 when no bit differs from the sign.
unop("ifind_msb", ty=tint32, const_expr="""
dst = -1;
for (int bit = 31; bit >= 0; bit--) {
   /* If src0 < 0, we're looking for the first 0 bit.
    * if src0 >= 0, we're looking for the first 1 bit.
    */
   if ((((src0 >> bit) & 1) && (src0 >= 0)) ||
      (!((src0 >> bit) & 1) && (src0 < 0))) {
      dst = bit;
      break;
   }
}
""")

# Index of the lowest set bit, or -1 when no bit is set.
unop_convert("find_lsb", out_type=tint32, in_type=tint, const_expr="""
dst = -1;
for (unsigned bit = 0; bit < bit_size; bit++) {
   if ((src0 >> bit) & 1) {
      dst = bit;
      break;
   }
}
""")
368
369
# fnoise<i>_<j> for every i and j in 1..4, where i is the output size and j
# the input size.  The constant expression is always 0.0f.
for i in range(1, 5):
   for j in range(1, 5):
      unop_horiz("fnoise%d_%d" % (i, j), output_size=i, output_type=tfloat,
                 input_size=j, input_type=tfloat, const_expr="0.0f")
373
374
# AMD_gcn_shader extended instructions
#
# cube_face_coord picks two coordinates from the 3-component source based on
# which component has the largest magnitude and on that component's sign.
# dst starts out as (0, 0) so it is defined even if no test matches.
unop_horiz("cube_face_coord", output_size=2, output_type=tfloat32,
           input_size=3, input_type=tfloat32, const_expr="""
dst.x = dst.y = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = -src0.z; }
if (src0.x < 0 && absX >= absY && absX >= absZ) { dst.x = -src0.y; dst.y = src0.z; }
if (src0.y >= 0 && absY >= absX && absY >= absZ) { dst.x = src0.z; dst.y = src0.x; }
if (src0.y < 0 && absY >= absX && absY >= absZ) { dst.x = -src0.z; dst.y = src0.x; }
if (src0.z >= 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = src0.x; }
if (src0.z < 0 && absZ >= absX && absZ >= absY) { dst.x = -src0.y; dst.y = -src0.x; }
""")
388
# cube_face_index yields a face number 0..5 chosen from the component with
# the largest magnitude and its sign (0/1 for +x/-x, 2/3 for +y/-y, 4/5 for
# +z/-z).  dst.x is set to 0 first, as cube_face_coord does, because none of
# the tests below match when the source components are NaN and dst.x would
# otherwise never be assigned.
unop_horiz("cube_face_index", 1, tfloat32, 3, tfloat32, """
dst.x = 0.0;
float absX = fabs(src0.x);
float absY = fabs(src0.y);
float absZ = fabs(src0.z);
if (src0.x >= 0 && absX >= absY && absX >= absZ) dst.x = 0;
if (src0.x < 0 && absX >= absY && absX >= absZ) dst.x = 1;
if (src0.y >= 0 && absY >= absX && absY >= absZ) dst.x = 2;
if (src0.y < 0 && absY >= absX && absY >= absZ) dst.x = 3;
if (src0.z >= 0 && absZ >= absX && absZ >= absY) dst.x = 4;
if (src0.z < 0 && absZ >= absX && absZ >= absY) dst.x = 5;
""")
400
401
def binop_convert(name, out_type, in_type, alg_props, const_expr):
   """Declare a two-source opcode whose sources share in_type.

   The output type is out_type; output and input sizes are all 0.
   """
   opcode(name, output_size=0, output_type=out_type, input_sizes=[0, 0],
          input_types=[in_type, in_type], algebraic_properties=alg_props,
          const_expr=const_expr)
404
def binop(name, ty, alg_props, const_expr):
   """Declare a two-source opcode whose sources and output all have type ty."""
   binop_convert(name, out_type=ty, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
407
def binop_compare(name, ty, alg_props, const_expr):
   """Declare a two-source comparison: sources of type ty, output of tbool."""
   binop_convert(name, out_type=tbool, in_type=ty, alg_props=alg_props,
                 const_expr=const_expr)
410
def binop_horiz(name, out_size, out_type, src1_size, src1_type, src2_size,
                src2_type, const_expr):
   """Declare a two-source opcode with explicit sizes and types per source.

   No algebraic properties are set.
   """
   sizes = [src1_size, src2_size]
   types = [src1_type, src2_type]
   opcode(name, out_size, out_type, sizes, types, "", const_expr)
415
def binop_reduce(name, output_size, output_type, src_type, prereduce_expr,
                 reduce_expr, final_expr):
   """Declare name + "2", name + "3" and name + "4" two-source reductions.

   The three expression arguments are str.format templates:

   - prereduce_expr pairs up matching components through {src0} and {src1}
   - reduce_expr combines two partial results through {src0} and {src1}
   - final_expr wraps the fully reduced expression through {src}

   Every variant is registered with the commutative property.
   """
   def combine(lhs, rhs):
      return reduce_expr.format(src0=lhs, src1=rhs)

   def finish(expr):
      return final_expr.format(src="(" + expr + ")")

   # One parenthesised pre-reduced term per component pair.
   x, y, z, w = [
      "(" + prereduce_expr.format(src0="src0." + chan,
                                  src1="src1." + chan) + ")"
      for chan in "xyzw"
   ]
   # The four-wide form reduces the two halves first, then joins them.
   variants = [
      (2, combine(x, y)),
      (3, combine(combine(x, y), z)),
      (4, combine(combine(x, y), combine(z, w))),
   ]
   for width, reduced in variants:
      opcode(name + str(width), output_size, output_type,
             [width, width], [src_type, src_type], commutative,
             finish(reduced))
437
binop("fadd", tfloat, alg_props=commutative + associative,
      const_expr="src0 + src1")
binop("iadd", tint, alg_props=commutative + associative,
      const_expr="src0 + src1")
binop("fsub", tfloat, alg_props="", const_expr="src0 - src1")
binop("isub", tint, alg_props="", const_expr="src0 - src1")

binop("fmul", tfloat, alg_props=commutative + associative,
      const_expr="src0 * src1")
# low 32-bits of signed/unsigned integer multiply
binop("imul", tint, alg_props=commutative + associative,
      const_expr="src0 * src1")
# high 32-bits of signed integer multiply
binop("imul_high", tint32, alg_props=commutative,
      const_expr="(int32_t)(((int64_t) src0 * (int64_t) src1) >> 32)")
# high 32-bits of unsigned integer multiply
binop("umul_high", tuint32, alg_props=commutative,
      const_expr="(uint32_t)(((uint64_t) src0 * (uint64_t) src1) >> 32)")

# Integer division by zero folds to 0.
binop("fdiv", tfloat, alg_props="", const_expr="src0 / src1")
binop("idiv", tint, alg_props="", const_expr="src1 == 0 ? 0 : (src0 / src1)")
binop("udiv", tuint, alg_props="", const_expr="src1 == 0 ? 0 : (src0 / src1)")

# uadd_carry is the carry out of adding the two unsigned arguments: the sum
# wrapped around exactly when it is smaller than src0.
binop_convert("uadd_carry", out_type=tuint, in_type=tuint,
              alg_props=commutative, const_expr="src0 + src1 < src0")

# usub_borrow is the borrow out of subtracting the two unsigned arguments.
binop_convert("usub_borrow", out_type=tuint, in_type=tuint,
              alg_props="", const_expr="src0 < src1")

# Unsigned modulo; a zero divisor folds to 0.
binop("umod", tuint, alg_props="", const_expr="src1 == 0 ? 0 : src0 % src1")
468
# Signed integers admit several definitions of "modulus" or "remainder".
# The conventions of LLVM and SPIR-V are followed here: irem is the standard
# C/C++ signed "%" operation, while imod is the more mathematical "modulus"
# operation.  For details on the difference, see
#
# http://mathforum.org/library/drmath/view/52343.html

binop("irem", tint, alg_props="", const_expr="src1 == 0 ? 0 : src0 % src1")
# imod adds src1 back onto a non-zero C remainder when the operands' signs
# differ.
binop("imod", tint, alg_props="",
      const_expr="src1 == 0 ? 0 : ((src0 % src1 == 0 || (src0 >= 0) == (src1 >= 0)) ?"
                 "                 src0 % src1 : src0 % src1 + src1)")
binop("fmod", tfloat, alg_props="",
      const_expr="src0 - src1 * floorf(src0 / src1)")
binop("frem", tfloat, alg_props="",
      const_expr="src0 - src1 * truncf(src0 / src1)")
483
484#
485# Comparisons
486#
487
488
489# these integer-aware comparisons return a boolean (0 or ~0)
490
binop_compare("flt", tfloat, alg_props="", const_expr="src0 < src1")
binop_compare("fge", tfloat, alg_props="", const_expr="src0 >= src1")
binop_compare("feq", tfloat, alg_props=commutative, const_expr="src0 == src1")
binop_compare("fne", tfloat, alg_props=commutative, const_expr="src0 != src1")
binop_compare("ilt", tint, alg_props="", const_expr="src0 < src1")
binop_compare("ige", tint, alg_props="", const_expr="src0 >= src1")
binop_compare("ieq", tint, alg_props=commutative, const_expr="src0 == src1")
binop_compare("ine", tint, alg_props=commutative, const_expr="src0 != src1")
binop_compare("ult", tuint, alg_props="", const_expr="src0 < src1")
binop_compare("uge", tuint, alg_props="", const_expr="src0 >= src1")

# integer-aware GLSL-style comparisons that compare floats and ints

binop_reduce("ball_fequal", 1, tbool, tfloat,
             prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}", final_expr="{src}")
binop_reduce("bany_fnequal", 1, tbool, tfloat,
             prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}", final_expr="{src}")
binop_reduce("ball_iequal", 1, tbool, tint,
             prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}", final_expr="{src}")
binop_reduce("bany_inequal", 1, tbool, tint,
             prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}", final_expr="{src}")

# non-integer-aware GLSL-style comparisons that return 0.0 or 1.0

binop_reduce("fall_equal", 1, tfloat32, tfloat32,
             prereduce_expr="{src0} == {src1}",
             reduce_expr="{src0} && {src1}",
             final_expr="{src} ? 1.0f : 0.0f")
binop_reduce("fany_nequal", 1, tfloat32, tfloat32,
             prereduce_expr="{src0} != {src1}",
             reduce_expr="{src0} || {src1}",
             final_expr="{src} ? 1.0f : 0.0f")

# These comparisons for integer-less hardware return 1.0 and 0.0 for true
# and false respectively
#
# NOTE(review): sge is declared with tfloat while slt, seq and sne use
# tfloat32 -- confirm whether the difference is intentional.

# Set on Less Than
binop("slt", tfloat32, alg_props="",
      const_expr="(src0 < src1) ? 1.0f : 0.0f")
# Set on Greater or Equal
binop("sge", tfloat, alg_props="",
      const_expr="(src0 >= src1) ? 1.0f : 0.0f")
# Set on Equal
binop("seq", tfloat32, alg_props=commutative,
      const_expr="(src0 == src1) ? 1.0f : 0.0f")
# Set on Not Equal
binop("sne", tfloat32, alg_props=commutative,
      const_expr="(src0 != src1) ? 1.0f : 0.0f")
527
528
# Shifts: the value being shifted keeps its own type, while the shift count
# is always declared as uint32.
opcode("ishl", 0, tint, input_sizes=[0, 0], input_types=[tint, tuint32],
       algebraic_properties="", const_expr="src0 << src1")
opcode("ishr", 0, tint, input_sizes=[0, 0], input_types=[tint, tuint32],
       algebraic_properties="", const_expr="src0 >> src1")
opcode("ushr", 0, tuint, input_sizes=[0, 0], input_types=[tuint, tuint32],
       algebraic_properties="", const_expr="src0 >> src1")

# bitwise logic operators
#
# These are also used as boolean and, or, xor for hardware supporting
# integers.


binop("iand", tuint, alg_props=commutative + associative,
      const_expr="src0 & src1")
binop("ior", tuint, alg_props=commutative + associative,
      const_expr="src0 | src1")
binop("ixor", tuint, alg_props=commutative + associative,
      const_expr="src0 ^ src1")


# floating point logic operators
#
# These use (src != 0.0) for testing the truth of the input, and output 1.0
# for true and 0.0 for false

binop("fand", tfloat32, alg_props=commutative,
      const_expr="((src0 != 0.0f) && (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("for", tfloat32, alg_props=commutative,
      const_expr="((src0 != 0.0f) || (src1 != 0.0f)) ? 1.0f : 0.0f")
binop("fxor", tfloat32, alg_props=commutative,
      const_expr="(src0 != 0.0f && src1 == 0.0f) || "
                 "(src0 == 0.0f && src1 != 0.0f) ? 1.0f : 0.0f")

# Dot products: per-component multiply followed by a sum.  The _replicated
# forms are declared with a 4-component output.
binop_reduce("fdot", 1, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

binop_reduce("fdot_replicated", 4, tfloat, tfloat,
             prereduce_expr="{src0} * {src1}",
             reduce_expr="{src0} + {src1}", final_expr="{src}")

# fdph: a 3-component dot product plus the w component of the second source.
opcode("fdph", 1, tfloat, input_sizes=[3, 4], input_types=[tfloat, tfloat],
       algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + "
                  "src0.z * src1.z + src1.w")
opcode("fdph_replicated", 4, tfloat, input_sizes=[3, 4],
       input_types=[tfloat, tfloat], algebraic_properties="",
       const_expr="src0.x * src1.x + src0.y * src1.y + "
                  "src0.z * src1.z + src1.w")
566
binop("fmin", tfloat, alg_props="", const_expr="fminf(src0, src1)")
binop("imin", tint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop("umin", tuint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src0 : src1")
binop("fmax", tfloat, alg_props="", const_expr="fmaxf(src0, src1)")
binop("imax", tint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")
binop("umax", tuint, alg_props=commutative + associative,
      const_expr="src1 > src0 ? src1 : src0")

# The *_4x8 opcodes below treat each 32-bit source as four 8-bit lanes and
# loop over the lanes in steps of 8 bits.

# Saturated vector add for 4 8bit ints.
binop("usadd_4x8", tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2(((src0 >> i) & 0xff) + ((src1 >> i) & 0xff), 0xff) << i;
}
""")

# Saturated vector subtract for 4 8bit ints.
binop("ussub_4x8", tint32, alg_props="", const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   if (src0_chan > src1_chan)
      dst |= (src0_chan - src1_chan) << i;
}
""")

# vector min for 4 8bit ints.
binop("umin_4x8", tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MIN2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# vector max for 4 8bit ints.
binop("umax_4x8", tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   dst |= MAX2((src0 >> i) & 0xff, (src1 >> i) & 0xff) << i;
}
""")

# unorm multiply: (a * b) / 255.
binop("umul_unorm_4x8", tint32, alg_props=commutative + associative,
      const_expr="""
dst = 0;
for (int i = 0; i < 32; i += 8) {
   int src0_chan = (src0 >> i) & 0xff;
   int src1_chan = (src1 >> i) & 0xff;
   dst |= ((src0_chan * src1_chan) / 255) << i;
}
""")
618
# Use the double-precision pow() for 64-bit operands and the single-precision
# powf() otherwise, matching the other bit_size-dependent opcodes in this
# file (fsqrt, fsin, ...).
binop("fpow", tfloat, "", "bit_size == 64 ? pow(src0, src1) : powf(src0, src1)")
620
# The *_split pack opcodes put src0 in the low half of the result and src1
# in the high half.
binop_horiz("pack_half_2x16_split", 1, tuint32, 1, tfloat32, 1, tfloat32,
            const_expr="pack_half_1x16(src0.x) | "
                       "(pack_half_1x16(src1.x) << 16)")

binop_convert("pack_64_2x32_split", out_type=tuint64, in_type=tuint32,
              alg_props="", const_expr="src0 | ((uint64_t)src1 << 32)")

binop_convert("pack_32_2x16_split", out_type=tuint32, in_type=tuint16,
              alg_props="", const_expr="src0 | ((uint32_t)src1 << 16)")

# bfm implements the behavior of the first operation of the SM5 "bfi" assembly
# and that of the "bfi1" i965 instruction. That is, it has undefined behavior
# if either of its arguments are 32.
binop_convert("bfm", out_type=tuint32, in_type=tint32, alg_props="",
              const_expr="""
int bits = src0, offset = src1;
if (offset < 0 || bits < 0 || offset > 31 || bits > 31 || offset + bits > 32)
   dst = 0; /* undefined */
else
   dst = ((1u << bits) - 1) << offset;
""")

# ldexp takes a float value and an int32 exponent.  Results that are not
# normal numbers are replaced by a zero carrying the sign of src0.
opcode("ldexp", 0, tfloat, input_sizes=[0, 0], input_types=[tfloat, tint32],
       algebraic_properties="", const_expr="""
dst = (bit_size == 64) ? ldexp(src0, src1) : ldexpf(src0, src1);
/* flush denormals to zero. */
if (!isnormal(dst))
   dst = copysignf(0.0f, src0);
""")

# Combines the first component of each input to make a 2-component vector.

binop_horiz("vec2", 2, tuint, 1, tuint, 1, tuint, const_expr="""
dst.x = src0.x;
dst.y = src1.x;
""")

# Byte extraction
binop("extract_u8", tuint, alg_props="",
      const_expr="(uint8_t)(src0 >> (src1 * 8))")
binop("extract_i8", tint, alg_props="",
      const_expr="(int8_t)(src0 >> (src1 * 8))")

# Word extraction
binop("extract_u16", tuint, alg_props="",
      const_expr="(uint16_t)(src0 >> (src1 * 16))")
binop("extract_i16", tint, alg_props="",
      const_expr="(int16_t)(src0 >> (src1 * 16))")
662
663
def triop(name, ty, const_expr):
   """Declare a three-source opcode whose sources and output all have type ty.

   Output and input sizes are all 0 and no algebraic properties are set.
   """
   opcode(name, output_size=0, output_type=ty, input_sizes=[0, 0, 0],
          input_types=[ty, ty, ty], algebraic_properties="",
          const_expr=const_expr)
def triop_horiz(name, output_size, src1_size, src2_size, src3_size, const_expr):
   """Declare a three-source opcode with explicit sizes.

   The sources and the output are all of type tuint; no algebraic properties
   are set.
   """
   sizes = [src1_size, src2_size, src3_size]
   opcode(name, output_size, tuint, sizes, [tuint] * 3, "", const_expr)
670
# Fused multiply-add and linear interpolation.
triop("ffma", tfloat, const_expr="src0 * src1 + src2")

triop("flrp", tfloat, const_expr="src0 * (1 - src2) + src1 * src2")

# Conditional Select
#
# A vector conditional select instruction (like ?:, but operating per-
# component on vectors). There are two versions, one for floating point
# bools (0.0 vs 1.0) and one for integer bools (0 vs ~0).


triop("fcsel", tfloat32, const_expr="(src0 != 0.0f) ? src1 : src2")

# 3 way min/max/med
triop("fmin3", tfloat, const_expr="fminf(src0, fminf(src1, src2))")
triop("imin3", tint, const_expr="MIN2(src0, MIN2(src1, src2))")
triop("umin3", tuint, const_expr="MIN2(src0, MIN2(src1, src2))")

triop("fmax3", tfloat, const_expr="fmaxf(src0, fmaxf(src1, src2))")
triop("imax3", tint, const_expr="MAX2(src0, MAX2(src1, src2))")
triop("umax3", tuint, const_expr="MAX2(src0, MAX2(src1, src2))")

triop("fmed3", tfloat,
      const_expr="fmaxf(fminf(fmaxf(src0, src1), src2), fminf(src0, src1))")
triop("imed3", tint,
      const_expr="MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")
triop("umed3", tuint,
      const_expr="MAX2(MIN2(MAX2(src0, src1), src2), MIN2(src0, src1))")

# The integer-bool select: a tbool condition picks between two tuint values.
opcode("bcsel", 0, tuint, input_sizes=[0, 0, 0],
       input_types=[tbool, tuint, tuint], algebraic_properties="",
       const_expr="src0 ? src1 : src2")
699
# SM5 bfi assembly
#
# The insert value is shifted left until it lines up with the lowest set bit
# of the mask, then merged into base under the mask.  A zero mask yields
# base unchanged.
triop("bfi", tuint32, const_expr="""
unsigned mask = src0, insert = src1, base = src2;
if (mask == 0) {
   dst = base;
} else {
   unsigned tmp = mask;
   while (!(tmp & 1)) {
      tmp >>= 1;
      insert <<= 1;
   }
   dst = (base & ~mask) | (insert & mask);
}
""")

# SM5 ubfe/ibfe assembly
#
# Both extract "bits" bits starting at "offset" by shifting the field to the
# top of the word and back down; ibfe works on a signed base, ubfe on an
# unsigned one.
opcode("ubfe", 0, tuint32, input_sizes=[0, 0, 0],
       input_types=[tuint32, tint32, tint32], algebraic_properties="",
       const_expr="""
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
opcode("ibfe", 0, tint32, input_sizes=[0, 0, 0],
       input_types=[tint32, tint32, tint32], algebraic_properties="",
       const_expr="""
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0) {
   dst = 0; /* undefined */
} else if (offset + bits < 32) {
   dst = (base << (32 - bits - offset)) >> (32 - bits);
} else {
   dst = base >> offset;
}
""")
744
# GLSL bitfieldExtract()
#
# Unsigned form: shift the field down to bit 0 and mask off "bits" bits.
# The mask is built from a 64-bit one so that bits == 32 works.
opcode("ubitfield_extract", 0, tuint32, input_sizes=[0, 0, 0],
       input_types=[tuint32, tint32, tint32], algebraic_properties="",
       const_expr="""
unsigned base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (bits < 0 || offset < 0 || offset + bits > 32) {
   dst = 0; /* undefined per the spec */
} else {
   dst = (base >> offset) & ((1ull << bits) - 1);
}
""")
# Signed form of GLSL bitfieldExtract().  The field is first shifted up so
# that its top bit sits in bit 31, then shifted back down by (32 - bits) so
# that its lowest bit lands in bit 0 with the sign extended, exactly as ibfe
# does.  Shifting down by "offset" instead leaves the field misplaced.
opcode("ibitfield_extract", 0, tint32,
       [0, 0, 0], [tint32, tint32, tint32], "", """
int base = src0;
int offset = src1, bits = src2;
if (bits == 0) {
   dst = 0;
} else if (offset < 0 || bits < 0 || offset + bits > 32) {
   dst = 0;
} else {
   dst = (base << (32 - offset - bits)) >> (32 - bits); /* use sign-extending shift */
}
""")
770
# Combines the first component of each input to make a 3-component vector.

triop_horiz("vec3", output_size=3, src1_size=1, src2_size=1, src3_size=1,
            const_expr="""
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
""")
778
def quadop_horiz(name, output_size, src1_size, src2_size, src3_size,
                 src4_size, const_expr):
   """Declare a four-source opcode with explicit sizes.

   The sources and the output are all of type tuint; no algebraic properties
   are set.
   """
   sizes = [src1_size, src2_size, src3_size, src4_size]
   opcode(name, output_size, tuint, sizes, [tuint] * 4, "", const_expr)
785
# Replaces "bits" bits of base, starting at "offset", with the low bits of
# insert.  Zero bits leaves base unchanged; an out-of-range field yields 0.
opcode("bitfield_insert", 0, tuint32, input_sizes=[0, 0, 0, 0],
       input_types=[tuint32, tuint32, tint32, tint32],
       algebraic_properties="", const_expr="""
unsigned base = src0, insert = src1;
int offset = src2, bits = src3;
if (bits == 0) {
   dst = base;
} else if (offset < 0 || bits < 0 || bits + offset > 32) {
   dst = 0;
} else {
   unsigned mask = ((1ull << bits) - 1) << offset;
   dst = (base & ~mask) | ((insert << offset) & mask);
}
""")

# Combines the first component of each input to make a 4-component vector.
quadop_horiz("vec4", output_size=4, src1_size=1, src2_size=1, src3_size=1,
             src4_size=1, const_expr="""
dst.x = src0.x;
dst.y = src1.x;
dst.z = src2.x;
dst.w = src3.x;
""")
806
807
808